Example #1
def test_pipeline_produced_expected_data() -> None:
    delete_existing_outputs(STORAGE_CONFIG)

    filename = os.path.basename(EXPECTED_FILE)
    pipeline = Pipeline(PIPELINE_CONFIG, STORAGE_CONFIG)
    pipeline.run(EXAMPLE_FILE)

    # Retrieve the output data file
    loc_id = pipeline.config.pipeline_definition.location_id
    datastream = DSUtil.get_datastream_name(config=pipeline.config)
    root: str = pipeline.storage._root
    output_file = os.path.join(root, loc_id, datastream, filename)

    # Assert that an output file with the expected basename was produced
    assert os.path.isfile(output_file)

    # Compare data and optionally attributes to ensure everything matches.
    ds_out: xr.Dataset = xr.open_dataset(output_file)
    ds_exp: xr.Dataset = xr.open_dataset(EXPECTED_FILE)

    xr.testing.assert_allclose(ds_out, ds_exp)
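    # Optional extra check (a sketch beyond the original snippet, assuming the
    # global attributes of both datasets are expected to match exactly):
    assert ds_out.attrs == ds_exp.attrs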
Example #2
def main():
    inputs = {
        'channel_id': CHANNEL_ID,
        'search_word': 'incredible',
        'limit': 20,
    }
    steps = [
        Preflight(),
        GetVideoList(),  # one step per line for readability (a trailing comma after the last item is recommended)
        InitializeYT(),
        DownloadCaptions(),
        ReadCaption(),
        Search(),
        DownloadVideos(),
        EditVideo(),
        Postflight(),
    ]

    utils = Utils()
    p = Pipeline(steps)
    p.run(inputs, utils)
Example #3
import io
from urllib import request
import csv
import psycopg2

from pipeline.pipeline import Pipeline

DATA_FILE_URL = 'https://dq-content.s3.amazonaws.com/251/storm_data.csv'

DB_HOST = 'localhost'
DB_NAME = ''  # set database name
DB_USER = ''  # set database user name
DB_PASSWORD = ''  # set database user password

pipeline = Pipeline()


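# Each @pipeline.task() below registers the decorated function as a pipeline step;
# depends_on= feeds a task the return value of the named upstream task (inferred
# from how create_db_tables receives db_conn in this snippet).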
@pipeline.task()
def create_db_connection():
    """Create database connection."""
    return psycopg2.connect(host=DB_HOST,
                            database=DB_NAME,
                            user=DB_USER,
                            password=DB_PASSWORD)


@pipeline.task(depends_on=create_db_connection)
def create_db_tables(db_conn):
    """Create database tables for staging and final data."""
    cursor = db_conn.cursor()
Example #4
    args = parse_args()
    prepare_libraries(args)
    settings = Settings()
    settings.update(args)
    # Set up logging
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    log_file = settings.get("log_file")
    if not log_file or log_file == "NONE":
        handler = logging.StreamHandler(sys.stdout)
    else:
        handler = logging.FileHandler(log_file)
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    settings.set("logger", logger)
    # Print out the settings
    logger.info("**--**--**--**--**--**--**--**--**--**--**--**--**--**--**--**--**")
    logger.info("Settings used for this run of ScaffMatch are:")
    for s, v in settings.iteritems():
        if s in ["std_dev", "ins_size", "pair_mode"]:
            continue
        logger.info("    %s  -- %s" % (s, v)) 
    # Feed the settings to the scaffolder pipeline
    scaffolder = Pipeline()
    scaffolder.set_settings(settings)
    # Go!
    scaffolder.scaffold()
    logger.info("Done!")
Example #5
from pipeline.add_timestamp import AddTimestamp
from pipeline.write_json import WriteJSON
from pipeline.write_sitemaps import WriteSitemaps

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('SourceDirectory',
                        type=dir_path,
                        help="location of the vcpkg folder")
    parser.add_argument('-o',
                        type=dir_path,
                        help="output of the JSON file generated",
                        default="./")

    args = parser.parse_args()

    ports_path = os.path.join(args.SourceDirectory, "ports")
    triplets_path = os.path.join(args.SourceDirectory, "triplets")
    baseline_path = os.path.join(args.SourceDirectory,
                                 "scripts/ci.baseline.txt")
    version_path = os.path.join(args.SourceDirectory, "versions")
    data_out_path = os.path.join(args.o, "data")

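    # Steps are listed in the order the package data is expected to flow through
    # them (an assumption based on the step names; each stage feeds the next).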
    pipeline = Pipeline(ReadPackages(ports_path), AddUsage(ports_path),
                        AddTriplets(triplets_path), AddStatus(baseline_path),
                        AddVersion(version_path), AddTimestamp(),
                        WriteJSON(data_out_path, "libs.json"),
                        WriteSitemaps(args.o, "sitemap.txt"))

    pipeline.run()
Example #6
def main():
    config = Config()
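    # Defaults come from Config(); parser.parse_args(namespace=config) at the end
    # writes the parsed command-line values back onto this same config object.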
    parser = argparse.ArgumentParser(
        description='Code for building the Gutenberg Dialog Dataset')
    parser.add_argument('-dg',
                        '--dialog_gap',
                        default=config.dialog_gap,
                        help='Min. number of characters between two dialogs ' +
                        '(default: %(default)s)',
                        metavar='',
                        type=int)
    parser.add_argument(
        '-isn',
        '--include_surrounding_narratives',
        default=config.include_surrounding_narratives,
        help='Whether to include surrounding narratives in the output dataset',
        action='store_true')
    parser.add_argument('-mnl',
                        '--max_narrative_length',
                        default=config.max_narrative_length,
                        help='Max. number of words in 1 narrative ' +
                        '(default: %(default)s)',
                        metavar='',
                        type=int)
    parser.add_argument(
        '-minl',
        '--min_intermediate_narrative_length',
        default=config.min_intermediate_narrative_length,
        help=
        'Min. number of words in 1 intermediate narrative (a narrative which occurs in-line with dialog) '
        + '(default: %(default)s)',
        metavar='',
        type=int)
    parser.add_argument('-mul',
                        '--max_utterance_length',
                        default=config.max_utterance_length,
                        help='Max. number of words in 1 utterance ' +
                        '(default: %(default)s)',
                        metavar='',
                        type=int)
    parser.add_argument('-mb',
                        '--max_books',
                        default=config.max_books,
                        help='Limit the number of books in final dataset ' +
                        '(default: %(default)s)',
                        metavar='',
                        type=int)
    parser.add_argument('-md',
                        '--min_delimiters',
                        default=config.min_delimiters,
                        help='Min delimiters / 10000 words needed in a book ' +
                        '(default: %(default)s)',
                        metavar='',
                        type=int)
    parser.add_argument('-mdd',
                        '--min_double_delim',
                        default=config.min_double_delim,
                        help='Double delimiter threshold (romance languages) ' +
                        '(default: %(default)s)',
                        metavar='',
                        type=int)
    parser.add_argument('-kl',
                        '--kl_threshold',
                        default=config.kl_threshold,
                        help='KL divergence threshold for filtering books ' +
                        '(default: %(default)s)',
                        metavar='',
                        type=int)
    parser.add_argument('-st',
                        '--size_threshold',
                        default=config.size_threshold,
                        help='#words threshold for filtering with KL ' +
                        '(default: %(default)s)',
                        metavar='',
                        type=int)
    parser.add_argument('-cd',
                        '--clean_dialogs',
                        default=config.clean_dialogs,
                        help='Whether to run pre-processing on dialogs',
                        action='store_true')
    parser.add_argument('-vt',
                        '--vocab_threshold',
                        default=config.vocab_threshold,
                        help='Ratio of unknown words allowed in a dialog ' +
                        '(default: %(default)s)',
                        metavar='',
                        type=int)
    parser.add_argument('-l',
                        '--languages',
                        default=config.languages,
                        help='Comma separated language codes ' +
                        'for which to build datasets',
                        metavar='',
                        type=str)
    parser.add_argument('-d',
                        '--download',
                        default=config.download,
                        help='Whether to run download step',
                        action='store_true')
    parser.add_argument('-f1',
                        '--pre_filter',
                        default=config.pre_filter,
                        help='Whether to run pre-filter step',
                        action='store_true')
    parser.add_argument('-e',
                        '--extract',
                        default=config.extract,
                        help='Whether to run extracting step',
                        action='store_true')
    parser.add_argument('-f2',
                        '--post_filter',
                        default=config.post_filter,
                        help='Whether to run post filter step',
                        action='store_true')
    parser.add_argument('-c',
                        '--create_dataset',
                        default=config.create_dataset,
                        help='Whether to run create dataset step',
                        action='store_true')
    parser.add_argument('-a',
                        '--run_all',
                        default=config.run_all,
                        help='Whether to run all steps',
                        action='store_true')
    parser.add_argument('-dir',
                        '--directory',
                        default=config.directory,
                        help='Directory where the language folders are',
                        metavar='',
                        type=str)

    parser.parse_args(namespace=config)
    p = Pipeline(config)
    p.run()
Example #7
plugin_base = PluginBase(package='pipeline.modules')
modules = plugin_base.make_plugin_source(searchpath=[
    './pipeline/modules',
])


def setup_workspace():
    os.makedirs(workspace_location, exist_ok=True)
    os.makedirs(log_location, exist_ok=True)
    os.makedirs(output_location, exist_ok=True)


def fake_pipeline():
    # Copy the repository's .pipeline file into the workspace pipeline file
    with open(".pipeline") as src, open(pipeline_file, "w+") as dst:
        dst.writelines(src.readlines())


def setup_docker():
    import docker
    return docker.from_env()


def test():
    setup_workspace()
    fake_pipeline()


if __name__ == "__main__":
    test()
    docker_client = setup_docker()
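    # Build the pipeline from the copied pipeline file, the Docker client and the
    # discovered plugin modules (this snippet only constructs it; no run call is shown).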
    pipeline = Pipeline(pipeline_file, docker_client, modules)
Example #8
import pickle

import pandas as pd

from pipeline.pipeline import Pipeline

# read dataframe
df = pd.read_pickle("./datasets/h1b_2019.pkl")

# load into pipeline
pl = Pipeline()
pl.load_data(df)
pl.train_test_split("CASE_STATUS")  # TODO: redo to set_target
Example #9
sys.path.append(repo_path +
                '/logparser/logparser/LenMa/')  #for lenma __init__.py
sys.path.append(repo_path +
                '/logparser/logparser/LenMa/templateminer')  #for lenma
from pipeline.pipeline import Pipeline

input_dir = repo_path + '/'  # The input directory of log file
output_dir = repo_path + '/'  # The output directory of parsing results
log_file = 'dayco_log.log'  # The input log file name
log_format = '<smonth> <sday> <shour> <ip> <id> <id2> <month> <day> <hour> <city> <type> <Content>'  #dayco/rsyslog

pipeline = Pipeline(parser_algorithm='drain',
                    input_dir=input_dir,
                    parser_output_dir=output_dir,
                    log_file=log_file,
                    parser_regex=log_format,
                    feature_extractor='fixed_window',
                    log_analizer_algorithm='mining_invariants',
                    data_type='time_based',
                    elasticsearch_index_name='deepia')

para = {
    'path': repo_path + '/',  # directory for input data
    'log_file_name': 'dayco_log.log',  # filename for log data file
    'log_event_mapping':
    'dayco_log.logTemplateMap.csv',  # filename for the log-event mapping: a list of event indices, one row per log line
    'save_path':
    './time_windows/',  # dir for saving sliding window data files to avoid splitting
    #'select_column':[0,4],                      # select the corresponding columns (label and time) in the raw log file
    'select_column': [
        0, 1, 2
Example #10
"""
    Script for running the pipeline
"""
#pylint: disable-all
import os, sys, inspect

CURRENT_DIR = os.path.dirname(
    os.path.abspath(inspect.getfile(inspect.currentframe())))
PARENT_DIR = os.path.dirname(CURRENT_DIR)
sys.path.insert(0, PARENT_DIR)
from pipeline.pipeline import Pipeline

# init filterbank filename
fil_name = os.path.abspath("./pspm32.fil")
# init filterbank sample size
sample_size = 192
# init times the pipeline should run
n_times = 10

# run the filterbank n times
for i in range(n_times):
    # read static
    Pipeline(filename=fil_name, size=sample_size)
    # read stream, row per row
    Pipeline(filename=fil_name, as_stream=True)
    # read stream, n rows
    Pipeline(filename=fil_name, as_stream=True, n=sample_size)
Example #11
    def make_pipeline(self):
        pipe = Pipeline('my_pipeline')
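        # Attach a Returns factor computed over a 150-bar window under the name
        # 'returns' (description inferred from the call below).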
        pipe.add_factor('returns', Returns(window_length=150))

        return pipe