def rejester_run_function(work_unit):
    """Process a single rejester work unit with a newly built pipeline.

    The pipeline configuration is taken from the ``streamcorpus_pipeline``
    entry of ``work_unit.spec``.  It is passed to ``instantiate_config``
    and then used to construct a ``Pipeline``, whose ``_process_task``
    method is invoked on the work unit.

    Before processing, two bookkeeping entries are written into
    ``work_unit.data``: ``start_chunk_time`` (the current ``time.time()``)
    and ``start_count`` (``0``).

    :param work_unit: object exposing ``spec`` (supporting ``.get``) and a
        mutable ``data`` mapping
    :raises ConfigurationError: if the spec has no ``streamcorpus_pipeline``
        entry, or that entry is falsy
    """
    spec = work_unit.spec
    pipeline_config = spec.get('streamcorpus_pipeline')
    if not pipeline_config:
        raise ConfigurationError(
            'spec is missing streamcorpus_pipeline: %r' % spec)

    instantiate_config(pipeline_config)
    runner = Pipeline(pipeline_config)

    # Reset the per-unit progress bookkeeping before handing off.
    unit_data = work_unit.data
    unit_data['start_chunk_time'] = time.time()
    unit_data['start_count'] = 0

    runner._process_task(work_unit)
def main():
    """Command-line entry point: run a pipeline over a list of input paths.

    One or more YAML config files are given as positional arguments and
    combined with ``load_layered_configs`` (per the option help, later
    values win).  When more than one config file is given, the resulting
    net config is printed to stdout.

    Input paths are gathered from ``--inglob`` patterns (expanded with
    ``glob.glob``) followed by any ``-i/--input`` values.  If neither
    yields a path, paths are read from stdin, one per line.  Each path is
    stripped of surrounding whitespace, wrapped in a ``SimpleWorkUnit``
    and handed to ``Pipeline._process_task``; entries that are empty after
    stripping (e.g. blank stdin lines) are skipped.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description=Pipeline.__doc__,
        usage="python -m streamcorpus.pipeline.run config.yaml",
    )
    parser.add_argument(
        "-i", "--input", action="append",
        help="file paths to input instead of reading from stdin")
    parser.add_argument(
        "--inglob", action="append", default=[],
        help="path glob specifying input files")
    parser.add_argument(
        "config", metavar="config.yaml", nargs="+",
        help="configuration parameters for a pipeline run. many config yaml files may be specified, later values win.",
    )
    args = parser.parse_args()

    ## layered configs is a feature only of the CLI
    config = load_layered_configs(args.config)
    if len(args.config) > 1:
        # Single-argument print calls emit the same text under Python 2
        # and keep this module importable under Python 3.
        print("# net config:")
        print(config_to_string(config))

    instantiate_config(config)
    pipeline = Pipeline(config)

    input_paths = []
    # --inglob defaults to [], so it is always safe to iterate.
    for pattern in args.inglob:
        input_paths.extend(glob.glob(pattern))
    if args.input:
        input_paths.extend(args.input)
    if not input_paths:
        # No explicit inputs: read one path per line from stdin.
        input_paths = sys.stdin

    for i_str in input_paths:
        path = i_str.strip()
        if not path:
            # A blank line would otherwise become a work unit with an
            # empty path.
            continue
        work_unit = SimpleWorkUnit(path)
        work_unit.data["start_chunk_time"] = time.time()
        work_unit.data["start_count"] = 0
        pipeline._process_task(work_unit)