def rejester_run_function(work_unit):
    """Process a single work unit with a pipeline built from its spec.

    The pipeline configuration is looked up under the
    ``'streamcorpus_pipeline'`` key of ``work_unit.spec``.  It is handed
    to ``instantiate_config`` and then used to construct a ``Pipeline``,
    which processes the work unit.

    :param work_unit: object exposing a ``spec`` mapping (read) and a
      ``data`` mapping (written: ``'start_chunk_time'`` and
      ``'start_count'`` are initialized before processing)
    :raises ConfigurationError: if the spec has no ``streamcorpus_pipeline``
      entry, or the entry is empty/falsy
    """
    pipeline_config = work_unit.spec.get('streamcorpus_pipeline')
    if not pipeline_config:
        message = 'spec is missing streamcorpus_pipeline: %r' % work_unit.spec
        raise ConfigurationError(message)

    instantiate_config(pipeline_config)
    runner = Pipeline(pipeline_config)

    # Reset the per-task bookkeeping fields right before processing starts.
    bookkeeping = work_unit.data
    bookkeeping['start_chunk_time'] = time.time()
    bookkeeping['start_count'] = 0

    runner._process_task(work_unit)
Example #2
0
def main():
    """Command-line entry point: run a Pipeline over a list of inputs.

    One or more YAML config files are given as positional arguments and
    merged by ``load_layered_configs``; when more than one file is given,
    the merged configuration is printed to stdout.  The merged config is
    passed to ``instantiate_config`` and used to construct a ``Pipeline``.

    Inputs are gathered from ``--inglob`` patterns (expanded with
    ``glob.glob``) followed by ``-i/--input`` paths.  If neither yields
    anything, input strings are read line by line from ``sys.stdin``.
    Each input string is stripped of surrounding whitespace, wrapped in a
    ``SimpleWorkUnit`` and processed by the pipeline in order.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description=Pipeline.__doc__, usage="python -m streamcorpus.pipeline.run config.yaml"
    )
    parser.add_argument("-i", "--input", action="append", help="file paths to input instead of reading from stdin")
    parser.add_argument("--inglob", action="append", default=[], help="path glob specifying input files")
    parser.add_argument(
        "config",
        metavar="config.yaml",
        nargs="+",
        help="configuration parameters for a pipeline run. many config yaml files may be specified, later values win.",
    )
    args = parser.parse_args()

    ## layered configs is a feature only of the CLI
    config = load_layered_configs(args.config)
    if len(args.config) > 1:
        # Single-argument, parenthesized print produces the same output on
        # Python 2 and is also valid syntax on Python 3.
        print("# net config:")
        print(config_to_string(config))

    instantiate_config(config)

    pipeline = Pipeline(config)

    # Collect explicit inputs: glob expansions first, then -i/--input paths.
    input_paths = []
    if args.inglob:
        for pattern in args.inglob:
            input_paths.extend(glob.glob(pattern))
    if args.input:
        input_paths.extend(args.input)
    if not input_paths:
        # Nothing given on the command line: read one input per stdin line.
        input_paths = sys.stdin

    for i_str in input_paths:
        # strip() removes the trailing newline carried by lines from stdin
        work_unit = SimpleWorkUnit(i_str.strip())
        work_unit.data["start_chunk_time"] = time.time()
        work_unit.data["start_count"] = 0
        pipeline._process_task(work_unit)