def main():
    """Invoke ``Pipeline.run`` on the test karyotype image.

    Results are saved under ``../data`` and the default inference model
    file is used.
    """
    Pipeline.run(
        image_file="../data/test/karyotype.bmp",
        save_dir="../data",
        model_path="../model/default_inference.h5",
    )
def test_detect_interesting_points(self):
    """Run interesting-point detection on the straightened test chromosomes."""
    source_path = data_dir + "/test/karyotype.bmp"
    raw_image = image_utils.read_image(source_path)
    extracted = Pipeline.extract_chromosomes(raw_image)
    straightened = Pipeline.straighten_chromosomes(extracted)
    # The return value is intentionally discarded; the call runs verbosely.
    Pipeline.detect_interesting_points(straightened, verbose=True)
def test_shell_command_exit():
    """Check that a shell command's exit status shows up in the results."""
    exit_action = TaskAction(
        "shell_command",
        name="exiter",
        commands=["exit 1"],
    )
    pipeline = Pipeline([exit_action])
    outcome = pipeline.schedule(DummySource()).get()
    # "exit 1" must surface as return code 1 under the action's name.
    assert outcome.results["exiter"].returncode == 1
def test_organize_chromosomes(self):
    """Run read -> extract -> straighten -> classify -> organize and show it."""
    source_path = data_dir + "/test/karyotype.bmp"
    raw_image = Pipeline.read_image(source_path)
    extracted = Pipeline.extract_chromosomes(raw_image)
    straightened = Pipeline.straighten_chromosomes(extracted)
    # Interesting-point detection is disabled here; None is passed instead.
    # interesting_points = Pipeline.detect_interesting_points(straightened_chromosomes)
    classified = Pipeline.classify_chromosomes(straightened, None)
    organized = Pipeline.organize_chromosomes(classified)
    image_utils.show_image(organized)
def test_pipeline_produced_expected_data() -> None:
    """Run the example pipeline and compare its output with the expected file.

    Existing outputs are deleted first, then the pipeline is run on
    ``EXAMPLE_FILE``. The output file is looked up under the storage root
    as ``<root>/<location_id>/<datastream>/<basename of EXPECTED_FILE>``.

    Raises:
        AssertionError: if the output file is missing or the datasets are
            not close according to ``xr.testing.assert_allclose``.
    """
    delete_existing_outputs(STORAGE_CONFIG)
    filename = os.path.basename(EXPECTED_FILE)

    pipeline = Pipeline(PIPELINE_CONFIG, STORAGE_CONFIG)
    pipeline.run(EXAMPLE_FILE)

    # Rebuild the path of the file the pipeline is expected to have written.
    loc_id = pipeline.config.pipeline_definition.location_id
    datastream = DSUtil.get_datastream_name(config=pipeline.config)
    root: str = pipeline.storage._root
    output_file = os.path.join(root, loc_id, datastream, filename)

    # The output must exist under the same basename as the expected file.
    assert os.path.isfile(output_file)

    # Open both datasets as context managers so the file handles are closed
    # even when the comparison fails.
    with xr.open_dataset(output_file) as ds_out, \
            xr.open_dataset(EXPECTED_FILE) as ds_exp:
        xr.testing.assert_allclose(ds_out, ds_exp)
def main():
    """Build the inputs and the ordered step list, then run the pipeline."""
    search_inputs = dict(
        channel_id=CHANNEL_ID,
        search_word='incredible',
        limit=20,
    )

    # One step per line for readability (keep the trailing comma after the
    # last entry).
    step_classes = (
        Preflight,
        GetVideoList,
        InitializeYT,
        DownloadCaptions,
        ReadCaption,
        Search,
        DownloadVideos,
        EditVideo,
        Postflight,
    )
    steps = [step_class() for step_class in step_classes]

    helpers = Utils()
    Pipeline(steps).run(search_inputs, helpers)
def test_source_acquired():
    """Test that a single shell command is executed.

    The acquired source installs flake8-diff as part of its acquisition
    instructions; the task then verifies that flake8-diff is installed.
    """
    freeze_check = TaskAction(
        "shell_command",
        name="installer",
        # workspace='python3',
        # workspace_kwargs={
        #     "delete": False
        # },
        commands=["pip freeze |grep flake8-diff"],
    )
    outcome = Pipeline([freeze_check]).schedule(DummySource()).get()
    # grep exits with 0 only when the package appears in the freeze output.
    assert outcome.results["installer"].returncode == 0
def test_straighten_chromosomes(self):
    """Straighten the chromosomes extracted from the test image in debug mode."""
    source_path = data_dir + "/test/karyotype.bmp"
    extracted = Pipeline.extract_chromosomes(image_utils.read_image(source_path))
    # The result is not inspected; the call is made with debug=True.
    Pipeline.straighten_chromosomes(extracted, debug=True)
from pipeline.pipeline import Pipeline
# from interface.interface import Interface

# Previously used invocations, kept disabled:
# Pipeline.getInstance().run_pipeline(".", img_path="/Users/bauera/work/airsurf/wheat/DFW_images/DFW_Early_19/19_05_29/DFW_Early_190529_transformed_small.png",hmap_path="/Users/bauera/work/airsurf/wheat/DFW_images/DFW_Early_19/19_05_29/DFW_Early_190529Height_Map_trans.png")
# Pipeline.getInstance().run_pipeline(".", parent_dir="/Users/bauera/work/airsurf/wheat/DFW_images/DFW_Early_19", seg_path="/Users/bauera/work/airsurf/wheat/DFW_images/DFW_Early_19/19_06_05")
# Pipeline.getInstance().run_pipeline(".", parent_dir="/Users/bauera/work/airsurf/wheat/DFW_images/DFW_Mid_19", seg_path="/Users/bauera/work/airsurf/wheat/DFW_images/DFW_Mid_19/19_05_29")

# Active run: the DFW_Late_19 directory with the 19_05_29 segmentation path.
Pipeline.getInstance().run_pipeline(
    ".",
    parent_dir="/Users/bauera/work/airsurf/wheat/DFW_images/DFW_Late_19",
    seg_path="/Users/bauera/work/airsurf/wheat/DFW_images/DFW_Late_19/19_05_29",
)

# Interface.getInstance().run()
# Parse the command line and fold the arguments into the run settings.
args = parse_args()
prepare_libraries(args)
settings = Settings()
settings.update(args)

# Configure logging: write to the configured log file when one is given,
# otherwise (unset or the literal "NONE") log to stdout.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
log_file = settings.get("log_file")
if log_file and log_file != "NONE":
    handler = logging.FileHandler(settings.get("log_file"))
else:
    handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
settings.set("logger", logger)

# Report the settings, leaving out the three entries that are not logged.
logger.info("**--**--**--**--**--**--**--**--**--**--**--**--**--**--**--**--**")
logger.info("Settings used for this run of ScaffMatch are:")
for s, v in settings.iteritems():
    if s not in ["std_dev", "ins_size", "pair_mode"]:
        logger.info(" %s -- %s" % (s, v))

# Hand the settings to the scaffolder pipeline and run it.
scaffolder = Pipeline()
scaffolder.set_settings(settings)
scaffolder.scaffold()
logger.info("Done!")
def main():
    """Parse command-line options into a ``Config`` and run the pipeline.

    Every option defaults to the matching attribute of a fresh ``Config``.
    ``parse_args(namespace=config)`` stores the parsed values back onto that
    same object, which is then passed to ``Pipeline``.

    The two threshold options that are not whole numbers (the KL divergence
    threshold and the unknown-word ratio) are parsed as ``float`` so that
    fractional values can be given on the command line.
    """
    config = Config()
    parser = argparse.ArgumentParser(
        description='Code for building the Gutenberg Dialog Dataset')

    parser.add_argument('-dg', '--dialog_gap',
                        default=config.dialog_gap,
                        help='Min. number of characters between two dialogs ' +
                        '(default: %(default)s)',
                        metavar='', type=int)
    parser.add_argument(
        '-isn', '--include_surrounding_narratives',
        default=config.include_surrounding_narratives,
        help='Whether to include surrounding narratives in the output dataset',
        action='store_true')
    parser.add_argument('-mnl', '--max_narrative_length',
                        default=config.max_narrative_length,
                        help='Max. number of words in 1 narrative ' +
                        '(default: %(default)s)',
                        metavar='', type=int)
    parser.add_argument(
        '-minl', '--min_intermediate_narrative_length',
        default=config.min_intermediate_narrative_length,
        help='Min. number of words in 1 intermediate narrative (a narrative which occurs in-line with dialog) ' +
        '(default: %(default)s)',
        metavar='', type=int)
    parser.add_argument('-mul', '--max_utterance_length',
                        default=config.max_utterance_length,
                        help='Max. number of words in 1 utterance ' +
                        '(default: %(default)s)',
                        metavar='', type=int)
    parser.add_argument('-mb', '--max_books',
                        default=config.max_books,
                        help='Limit the number of books in final dataset ' +
                        '(default: %(default)s)',
                        metavar='', type=int)
    parser.add_argument('-md', '--min_delimiters',
                        default=config.min_delimiters,
                        help='Min delimiters / 10000 words needed in a book ' +
                        '(default: %(default)s)',
                        metavar='', type=int)
    # Help text fix: the parenthesis around "romance languages" was unclosed.
    parser.add_argument('-mdd', '--min_double_delim',
                        default=config.min_double_delim,
                        help='Double delimiter threshold (romance languages) ' +
                        '(default: %(default)s)',
                        metavar='', type=int)
    # A KL divergence is real-valued, so accept fractional thresholds.
    parser.add_argument('-kl', '--kl_threshold',
                        default=config.kl_threshold,
                        help='KL divergence threshold for filtering books ' +
                        '(default: %(default)s)',
                        metavar='', type=float)
    # Help text fix: a space was missing before "(default: ...)".
    parser.add_argument('-st', '--size_threshold',
                        default=config.size_threshold,
                        help='#words threshold for filtering with KL ' +
                        '(default: %(default)s)',
                        metavar='', type=int)
    parser.add_argument('-cd', '--clean_dialogs',
                        default=config.clean_dialogs,
                        help='Whether to run pre-processing on dialogs',
                        action='store_true')
    # This option is a ratio; type=int rejected values such as "0.2".
    parser.add_argument('-vt', '--vocab_threshold',
                        default=config.vocab_threshold,
                        help='Ratio of unknown words allowed in a dialog ' +
                        '(default: %(default)s)',
                        metavar='', type=float)
    parser.add_argument('-l', '--languages',
                        default=config.languages,
                        help='Comma separated language codes ' +
                        'for which to build datasets',
                        metavar='', type=str)
    parser.add_argument('-d', '--download',
                        default=config.download,
                        help='Whether to run download step',
                        action='store_true')
    parser.add_argument('-f1', '--pre_filter',
                        default=config.pre_filter,
                        help='Whether to run pre-filter step',
                        action='store_true')
    parser.add_argument('-e', '--extract',
                        default=config.extract,
                        help='Whether to run extracting step',
                        action='store_true')
    parser.add_argument('-f2', '--post_filter',
                        default=config.post_filter,
                        help='Whether to run post filter step',
                        action='store_true')
    parser.add_argument('-c', '--create_dataset',
                        default=config.create_dataset,
                        help='Whether to run create dataset step',
                        action='store_true')
    parser.add_argument('-a', '--run_all',
                        default=config.run_all,
                        help='Whether to run all steps',
                        action='store_true')
    parser.add_argument('-dir', '--directory',
                        default=config.directory,
                        help='Directory where the language folders are',
                        metavar='', type=str)

    # Parsed values are written straight onto the existing config object.
    parser.parse_args(namespace=config)
    p = Pipeline(config)
    p.run()
sys.path.append(repo_path + '/logparser/logparser/LenMa/') #for lenma __init__.py sys.path.append(repo_path + '/logparser/logparser/LenMa/templateminer') #for lenma from pipeline.pipeline import Pipeline input_dir = repo_path + '/' # The input directory of log file output_dir = repo_path + '/' # The output directory of parsing results log_file = 'dayco_log.log' # The input log file name log_format = '<smonth> <sday> <shour> <ip> <id> <id2> <month> <day> <hour> <city> <type> <Content>' #dayco/rsyslog pipeline = Pipeline(parser_algorithm='drain', input_dir=input_dir, parser_output_dir=output_dir, log_file=log_file, parser_regex=log_format, feature_extractor='fixed_window', log_analizer_algorithm='mining_invariants', data_type='time_based', elasticsearch_index_name='deepia') para = { 'path': repo_path + '/', # directory for input data 'log_file_name': 'dayco_log.log', # filename for log data file 'log_event_mapping': 'dayco_log.logTemplateMap.csv', # filename for log-event mapping. A list of event index, where each row represents a log 'save_path': './time_windows/', # dir for saving sliding window data files to avoid splitting #'select_column':[0,4], # select the corresponding columns (label and time) in the raw log file 'select_column': [ 0, 1, 2
import json import io import csv import string from datetime import datetime from pipeline.pipeline import Pipeline from pipeline.csv_helper import CsvHelper exclude_words = ('the', 'to', 'a', 'of', 'for', 'in', 'and', 'is', '–', 'on', 'hn:', 'an', 'at', 'not', 'with', 'why', 'how', 'your', 'from', 'new', 'you', 'i', 'by', 'what', 'my', 'are', 'as', 'that', 'we', 'it', 'be', 'now', 'using', 'has') pipeline = Pipeline() csv_helper = CsvHelper() @pipeline.task() def file_to_json(): with open('hn_stories_2014.json', 'r') as file: data_dict = json.load(file) stories = data_dict['stories'] return stories @pipeline.task(depends_on=file_to_json) def filter_stories(stories): def is_popular(story): return story['points'] > 50 and story[ 'num_comments'] > 1 and not story['title'].startswith('Ask HN')
"""Run the pipeline repeatedly in each of its three read modes."""
#pylint: disable-all
import os
import sys
import inspect

# Put the parent of this script's directory on the import path so that the
# `pipeline` package can be imported below.
CURRENT_DIR = os.path.dirname(
    os.path.abspath(inspect.getfile(inspect.currentframe())))
PARENT_DIR = os.path.dirname(CURRENT_DIR)
sys.path.insert(0, PARENT_DIR)
from pipeline.pipeline import Pipeline

# Filterbank file to read.
fil_name = os.path.abspath("./pspm32.fil")
# Sample size passed to the static read and to the n-row stream read.
sample_size = 192
# Number of times each mode is run.
n_times = 10

for i in range(n_times):
    # Static read.
    Pipeline(filename=fil_name, size=sample_size)
    # Stream read, row by row.
    Pipeline(filename=fil_name, as_stream=True)
    # Stream read, n rows at a time.
    Pipeline(filename=fil_name, as_stream=True, n=sample_size)
def make_pipeline(self):
    """Return a pipeline named 'my_pipeline' with one 'returns' factor.

    The factor is ``Returns`` built with ``window_length=150``.
    """
    pipeline = Pipeline('my_pipeline')
    returns_factor = Returns(window_length=150)
    pipeline.add_factor('returns', returns_factor)
    return pipeline
def __init__(
    self, pipeline_name="default", input_file="input.mp4", runtime_config=None
):
    """Refresh the preset pipelines, then initialise from the chosen preset.

    Args:
        pipeline_name: key into ``preset_pipelines`` selecting the preset.
        input_file: forwarded to ``update_preset_pipelines``.
        runtime_config: forwarded to ``update_preset_pipelines``.
    """
    # The presets are updated first so that the lookup below sees them.
    update_preset_pipelines(runtime_config=runtime_config, input_file=input_file)
    chosen_preset = preset_pipelines[pipeline_name]
    Pipeline.__init__(self, chosen_preset)
# Plugin source that exposes the modules found under ./pipeline/modules.
plugin_base = PluginBase(package='pipeline.modules')
modules = plugin_base.make_plugin_source(searchpath=[
    './pipeline/modules',
])


def setup_workspace():
    """Create the workspace, log and output directories if they are missing."""
    os.makedirs(workspace_location, exist_ok=True)
    os.makedirs(log_location, exist_ok=True)
    os.makedirs(output_location, exist_ok=True)


def fake_pipeline():
    """Copy the lines of ``.pipeline`` into ``pipeline_file``.

    Both files are opened with ``with`` so their handles are closed and the
    written data is flushed when the function returns. The source is opened
    first, so a missing ``.pipeline`` raises before ``pipeline_file`` is
    created or truncated.
    """
    with open(".pipeline") as source, open(pipeline_file, "w+") as target:
        target.writelines(source.readlines())


def setup_docker():
    """Return a Docker client built from the environment."""
    # Imported lazily so docker is only required when a client is requested.
    import docker
    return docker.from_env()


def test():
    """Prepare the workspace directories and write the pipeline file."""
    setup_workspace()
    fake_pipeline()


if __name__ == "__main__":
    test()
    docker_client = setup_docker()
    pipeline = Pipeline(pipeline_file, docker_client, modules)
import pickle

import pandas as pd

from pipeline.pipeline import Pipeline

# Read the pickled dataframe.
df = pd.read_pickle("./datasets/h1b_2019.pkl")

# Load it into a pipeline and split on the CASE_STATUS column.
pl = Pipeline()
pl.load_data(df)
pl.train_test_split("CASE_STATUS")  # TODO: redo to set_target
from pipeline.add_timestamp import AddTimestamp
from pipeline.write_json import WriteJSON
from pipeline.write_sitemaps import WriteSitemaps

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'SourceDirectory',
        type=dir_path,
        help="location of the vcpkg folder",
    )
    parser.add_argument(
        '-o',
        type=dir_path,
        help="output of the JSON file generated",
        default="./",
    )
    args = parser.parse_args()

    # Input locations inside the source directory.
    ports_path = os.path.join(args.SourceDirectory, "ports")
    triplets_path = os.path.join(args.SourceDirectory, "triplets")
    baseline_path = os.path.join(args.SourceDirectory, "scripts/ci.baseline.txt")
    version_path = os.path.join(args.SourceDirectory, "versions")
    # Output location for the generated data.
    data_out_path = os.path.join(args.o, "data")

    # Stages are passed to Pipeline in this order.
    pipeline = Pipeline(
        ReadPackages(ports_path),
        AddUsage(ports_path),
        AddTriplets(triplets_path),
        AddStatus(baseline_path),
        AddVersion(version_path),
        AddTimestamp(),
        WriteJSON(data_out_path, "libs.json"),
        WriteSitemaps(args.o, "sitemap.txt"),
    )
    pipeline.run()
def test_generate_chromosome_cluster(self):
    """Show the test image, then the chromosome cluster generated from it."""
    source_path = data_dir + "/test/karyotype.bmp"
    raw_image = image_utils.read_image(source_path)
    image_utils.show_image(raw_image)

    cluster = Pipeline.generate_chromosome_cluster(raw_image)
    image_utils.show_image(cluster, cmap=None)
def test_read_image(self):
    """Read the test image through ``Pipeline.read_image`` and display it."""
    source_path = data_dir + "/test/karyotype.bmp"
    loaded = Pipeline.read_image(source_path)
    image_utils.show_image(loaded)
def test_extract_chromosomes(self):
    """Extract chromosomes from the test image and display each one."""
    source_path = data_dir + "/test/karyotype.bmp"
    raw_image = image_utils.read_image(source_path)
    extracted = Pipeline.extract_chromosomes(raw_image)
    for single_chromosome in extracted:
        image_utils.show_image(single_chromosome, cmap=None)
import io from urllib import request import csv import psycopg2 from pipeline.pipeline import Pipeline DATA_FILE_URL = 'https://dq-content.s3.amazonaws.com/251/storm_data.csv' DB_HOST = 'localhost' DB_NAME = '' # set database name DB_USER = '' # set database user name DB_PASSWORD = '' # set database user password pipeline = Pipeline() @pipeline.task() def create_db_connection(): """Create database connection.""" return psycopg2.connect(host=DB_HOST, database=DB_NAME, user=DB_USER, password=DB_PASSWORD) @pipeline.task(depends_on=create_db_connection) def create_db_tables(db_conn): """Create database tables for staging and final data.""" cursor = db_conn.cursor()
import csv import io import json import string from collections import Counter from datetime import datetime from pprint import pprint from pytz import timezone from pipeline.pipeline import Pipeline, build_csv from pipeline.stop_words import stop_words pipeline = Pipeline() def __get_start_end_dates(year): # Given a year, return the start end end timestamps in unix epoch utc = timezone("UTC") start = utc.localize(datetime(year, 1, 1)).timestamp() end = utc.localize(datetime(year + 1, 1, 1)).timestamp() return start, end # TODO currently we are only getting 1 page, need iterate through pages to get full dataset @pipeline.task() def get_data_from_hacker_news(year=2014): import requests url = "http://hn.algolia.com/api/v1/search_by_date" start, end = __get_start_end_dates(year) query = {