def test_pipeline_produced_expected_data() -> None:
    delete_existing_outputs(STORAGE_CONFIG)

    filename = os.path.basename(EXPECTED_FILE)
    pipeline = Pipeline(PIPELINE_CONFIG, STORAGE_CONFIG)
    pipeline.run(EXAMPLE_FILE)

    # Retrieve the output data file
    loc_id = pipeline.config.pipeline_definition.location_id
    datastream = DSUtil.get_datastream_name(config=pipeline.config)
    root: str = pipeline.storage._root
    output_file = os.path.join(root, loc_id, datastream, filename)

    # Assert that an output file with the expected file's basename was produced
    assert os.path.isfile(output_file)

    # Compare data and optionally attributes to ensure everything matches.
    ds_out: xr.Dataset = xr.open_dataset(output_file)
    ds_exp: xr.Dataset = xr.open_dataset(EXPECTED_FILE)
    xr.testing.assert_allclose(ds_out, ds_exp)
def main():
    inputs = {
        'channel_id': CHANNEL_ID,
        'search_word': 'incredible',
        'limit': 20,
    }
    steps = [
        Preflight(),
        GetVideoList(),
        # One step per line for readability (a trailing comma after the last item is recommended)
        InitializeYT(),
        DownloadCaptions(),
        ReadCaption(),
        Search(),
        DownloadVideos(),
        EditVideo(),
        Postflight(),
    ]
    utils = Utils()

    p = Pipeline(steps)
    p.run(inputs, utils)
import io
from urllib import request
import csv

import psycopg2

from pipeline.pipeline import Pipeline

DATA_FILE_URL = 'https://dq-content.s3.amazonaws.com/251/storm_data.csv'
DB_HOST = 'localhost'
DB_NAME = ''      # set database name
DB_USER = ''      # set database user name
DB_PASSWORD = ''  # set database user password

pipeline = Pipeline()


@pipeline.task()
def create_db_connection():
    """Create database connection."""
    return psycopg2.connect(host=DB_HOST, database=DB_NAME,
                            user=DB_USER, password=DB_PASSWORD)


@pipeline.task(depends_on=create_db_connection)
def create_db_tables(db_conn):
    """Create database tables for staging and final data."""
    cursor = db_conn.cursor()
args = parse_args()
prepare_libraries(args)

settings = Settings()
settings.update(args)

# Set up logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
log_file = settings.get("log_file")
if not log_file or log_file == "NONE":
    handler = logging.StreamHandler(sys.stdout)
else:
    handler = logging.FileHandler(settings.get("log_file"))
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
settings.set("logger", logger)

# Print out the settings
logger.info("**--**--**--**--**--**--**--**--**--**--**--**--**--**--**--**--**")
logger.info("Settings used for this run of ScaffMatch are:")
for s, v in settings.iteritems():
    if s in ["std_dev", "ins_size", "pair_mode"]:
        continue
    logger.info("    %s -- %s" % (s, v))

# Feed the settings to the scaffolder pipeline
scaffolder = Pipeline()
scaffolder.set_settings(settings)

# Go!
scaffolder.scaffold()
logger.info("Done!")
from pipeline.add_timestamp import AddTimestamp
from pipeline.write_json import WriteJSON
from pipeline.write_sitemaps import WriteSitemaps

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('SourceDirectory', type=dir_path,
                        help="location of the vcpkg folder")
    parser.add_argument('-o', type=dir_path, default="./",
                        help="output directory for the generated JSON file")
    args = parser.parse_args()

    ports_path = os.path.join(args.SourceDirectory, "ports")
    triplets_path = os.path.join(args.SourceDirectory, "triplets")
    baseline_path = os.path.join(args.SourceDirectory, "scripts/ci.baseline.txt")
    version_path = os.path.join(args.SourceDirectory, "versions")
    data_out_path = os.path.join(args.o, "data")

    pipeline = Pipeline(ReadPackages(ports_path),
                        AddUsage(ports_path),
                        AddTriplets(triplets_path),
                        AddStatus(baseline_path),
                        AddVersion(version_path),
                        AddTimestamp(),
                        WriteJSON(data_out_path, "libs.json"),
                        WriteSitemaps(args.o, "sitemap.txt"))
    pipeline.run()
def main():
    config = Config()
    parser = argparse.ArgumentParser(
        description='Code for building the Gutenberg Dialog Dataset')
    parser.add_argument('-dg', '--dialog_gap', default=config.dialog_gap,
                        help='Min. number of characters between two dialogs '
                             '(default: %(default)s)',
                        metavar='', type=int)
    parser.add_argument('-isn', '--include_surrounding_narratives',
                        default=config.include_surrounding_narratives,
                        help='Whether to include surrounding narratives in the output dataset',
                        action='store_true')
    parser.add_argument('-mnl', '--max_narrative_length',
                        default=config.max_narrative_length,
                        help='Max. number of words in 1 narrative '
                             '(default: %(default)s)',
                        metavar='', type=int)
    parser.add_argument('-minl', '--min_intermediate_narrative_length',
                        default=config.min_intermediate_narrative_length,
                        help='Min. number of words in 1 intermediate narrative '
                             '(a narrative which occurs in-line with dialog) '
                             '(default: %(default)s)',
                        metavar='', type=int)
    parser.add_argument('-mul', '--max_utterance_length',
                        default=config.max_utterance_length,
                        help='Max. number of words in 1 utterance '
                             '(default: %(default)s)',
                        metavar='', type=int)
    parser.add_argument('-mb', '--max_books', default=config.max_books,
                        help='Limit the number of books in final dataset '
                             '(default: %(default)s)',
                        metavar='', type=int)
    parser.add_argument('-md', '--min_delimiters', default=config.min_delimiters,
                        help='Min delimiters / 10000 words needed in a book '
                             '(default: %(default)s)',
                        metavar='', type=int)
    parser.add_argument('-mdd', '--min_double_delim',
                        default=config.min_double_delim,
                        help='Double delimiter threshold (romance languages) '
                             '(default: %(default)s)',
                        metavar='', type=int)
    parser.add_argument('-kl', '--kl_threshold', default=config.kl_threshold,
                        help='KL divergence threshold for filtering books '
                             '(default: %(default)s)',
                        metavar='', type=int)
    parser.add_argument('-st', '--size_threshold', default=config.size_threshold,
                        help='#words threshold for filtering with KL '
                             '(default: %(default)s)',
                        metavar='', type=int)
    parser.add_argument('-cd', '--clean_dialogs', default=config.clean_dialogs,
                        help='Whether to run pre-processing on dialogs',
                        action='store_true')
    parser.add_argument('-vt', '--vocab_threshold',
                        default=config.vocab_threshold,
                        help='Ratio of unknown words allowed in a dialog '
                             '(default: %(default)s)',
                        metavar='', type=int)
    parser.add_argument('-l', '--languages', default=config.languages,
                        help='Comma separated language codes '
                             'for which to build datasets',
                        metavar='', type=str)
    parser.add_argument('-d', '--download', default=config.download,
                        help='Whether to run download step', action='store_true')
    parser.add_argument('-f1', '--pre_filter', default=config.pre_filter,
                        help='Whether to run pre-filter step', action='store_true')
    parser.add_argument('-e', '--extract', default=config.extract,
                        help='Whether to run extracting step', action='store_true')
    parser.add_argument('-f2', '--post_filter', default=config.post_filter,
                        help='Whether to run post filter step', action='store_true')
    parser.add_argument('-c', '--create_dataset', default=config.create_dataset,
                        help='Whether to run create dataset step',
                        action='store_true')
    parser.add_argument('-a', '--run_all', default=config.run_all,
                        help='Whether to run all steps', action='store_true')
    parser.add_argument('-dir', '--directory', default=config.directory,
                        help='Directory where the language folders are',
                        metavar='', type=str)
    parser.parse_args(namespace=config)

    p = Pipeline(config)
    p.run()
plugin_base = PluginBase(package='pipeline.modules')
modules = plugin_base.make_plugin_source(searchpath=[
    './pipeline/modules',
])


def setup_workspace():
    os.makedirs(workspace_location, exist_ok=True)
    os.makedirs(log_location, exist_ok=True)
    os.makedirs(output_location, exist_ok=True)


def fake_pipeline():
    # Copy the local .pipeline file into the workspace pipeline file
    with open(".pipeline") as src, open(pipeline_file, "w+") as dst:
        dst.writelines(src.readlines())


def setup_docker():
    import docker
    return docker.from_env()


def test():
    setup_workspace()
    fake_pipeline()


if __name__ == "__main__":
    test()
    docker_client = setup_docker()
    pipeline = Pipeline(pipeline_file, docker_client, modules)
import pickle

import pandas as pd

from pipeline.pipeline import Pipeline

# read dataframe
df = pd.read_pickle("./datasets/h1b_2019.pkl")

# load into pipeline
pl = Pipeline()
pl.load_data(df)
pl.train_test_split("CASE_STATUS")  # TODO: redo to set_target
sys.path.append(repo_path + '/logparser/logparser/LenMa/')               # for lenma __init__.py
sys.path.append(repo_path + '/logparser/logparser/LenMa/templateminer')  # for lenma

from pipeline.pipeline import Pipeline

input_dir = repo_path + '/'   # The input directory of the log file
output_dir = repo_path + '/'  # The output directory of parsing results
log_file = 'dayco_log.log'    # The input log file name
log_format = '<smonth> <sday> <shour> <ip> <id> <id2> <month> <day> <hour> <city> <type> <Content>'  # dayco/rsyslog

pipeline = Pipeline(parser_algorithm='drain',
                    input_dir=input_dir,
                    parser_output_dir=output_dir,
                    log_file=log_file,
                    parser_regex=log_format,
                    feature_extractor='fixed_window',
                    log_analizer_algorithm='mining_invariants',
                    data_type='time_based',
                    elasticsearch_index_name='deepia')

para = {
    'path': repo_path + '/',                   # directory for input data
    'log_file_name': 'dayco_log.log',          # filename for log data file
    'log_event_mapping': 'dayco_log.logTemplateMap.csv',  # filename for log-event mapping. A list of event indices, where each row represents a log
    'save_path': './time_windows/',            # dir for saving sliding window data files to avoid splitting
    # 'select_column': [0, 4],                 # select the corresponding columns (label and time) in the raw log file
    'select_column': [
        0,
        1,
        2
""" Script for running the pipeline """ #pylint: disable-all import os, sys, inspect CURRENT_DIR = os.path.dirname( os.path.abspath(inspect.getfile(inspect.currentframe()))) PARENT_DIR = os.path.dirname(CURRENT_DIR) sys.path.insert(0, PARENT_DIR) from pipeline.pipeline import Pipeline # init filterbank filename fil_name = os.path.abspath("./pspm32.fil") # init filterbank sample size sample_size = 192 # init times the pipeline should run n_times = 10 # run the filterbank n times for i in range(n_times): # read static Pipeline(filename=fil_name, size=sample_size) # read stream, row per row Pipeline(filename=fil_name, as_stream=True) # read stream, n rows Pipeline(filename=fil_name, as_stream=True, n=sample_size)
def make_pipeline(self):
    pipe = Pipeline('my_pipeline')
    pipe.add_factor('returns', Returns(window_length=150))
    return pipe