def replace_config(config, name):
    """Pick the configuration object to use, loading external stages if any.

    :param config: configuration mapping for this block; the keys
        ``external_stages_path``, ``external_stages_modules`` and
        ``root_path`` are consulted
    :param name: name of the configuration block (not used by this code)
    :return: the module-level ``streamcorpus_pipeline`` object when no
        external stages are configured, or when the external stages file
        cannot be read
    """
    # Do we have external stages?
    if ('external_stages_path' not in config and
            'external_stages_modules' not in config):
        return streamcorpus_pipeline

    stages = PipelineStages()
    if 'external_stages_path' in config:
        path = config['external_stages_path']
        # A relative path is taken relative to root_path when one is set.
        if not os.path.isabs(path) and config.get('root_path'):
            path = os.path.join(config['root_path'], path)
        try:
            # Load from the resolved path rather than the raw configured
            # value, so the root_path handling above actually takes effect.
            stages.load_external_stages(path)
        except IOError:
            return streamcorpus_pipeline  # let check_config re-raise this
    # NOTE(review): after a successful load the visible code falls through
    # and implicitly returns None -- confirm that is intended.
def replace_config(config, name):
    """Pick the configuration object to use, loading external stages if any.

    :param config: configuration mapping for this block; the keys
        ``external_stages_path``, ``external_stages_modules`` and
        ``root_path`` are consulted
    :param name: name of the configuration block (not used by this code)
    :return: the module-level ``streamcorpus_pipeline`` object when no
        external stages are configured, or when the external stages file
        cannot be read
    """
    # Do we have external stages?
    if ('external_stages_path' not in config and
            'external_stages_modules' not in config):
        return streamcorpus_pipeline

    stages = PipelineStages()
    if 'external_stages_path' in config:
        path = config['external_stages_path']
        # A relative path is taken relative to root_path when one is set.
        if not os.path.isabs(path) and config.get('root_path'):
            path = os.path.join(config['root_path'], path)
        try:
            # Load from the resolved path rather than the raw configured
            # value, so the root_path handling above actually takes effect.
            stages.load_external_stages(path)
        except IOError:
            return streamcorpus_pipeline  # let check_config re-raise this
    # NOTE(review): after a successful load the visible code falls through
    # and implicitly returns None -- confirm that is intended.
def main():
    """Command-line entry point: run the pipeline over a list of inputs.

    Input paths come from ``-i/--input`` and/or ``--in-glob``.  Passing
    ``-i -`` reads input paths from standard input, one per line, and
    cannot be combined with ``--in-glob`` or with other ``-i`` values.

    Each input path is wrapped in a ``SimpleWorkUnit`` and handed to the
    pipeline; the pipeline's ``cleanup()`` is always called afterwards,
    including when processing raises.
    """
    import argparse
    parser = argparse.ArgumentParser(
        description='process a sequence of stream items',
        usage='streamcorpus_pipeline --config config.yaml --input file.in')
    parser.add_argument('-i', '--input', action='append',
                        help='file paths to input instead of reading from stdin')
    parser.add_argument('--in-glob', action='append', default=[],
                        help='path glob specifying input files')
    parser.add_argument('--third-dir-path',
                        help='path to third-party tools directory')
    parser.add_argument('--tmp-dir-path',
                        help='path to temporary directory for scratch files, can be large')

    modules = [yakonfig, kvlayer, dblogger, streamcorpus_pipeline]
    args = yakonfig.parse_args(parser, modules)
    config = yakonfig.get_global_config()

    ## this modifies the global config, passed by reference
    instantiate_config(config)

    input_paths = []
    if args.in_glob:
        for pattern in args.in_glob:
            input_paths.extend(glob.glob(pattern))
    if args.input:
        if '-' in args.input:
            if args.in_glob:
                sys.exit('cannot use "-i -" and --in-glob together')
            if len(args.input) > 1:
                sys.exit('cannot use "-i -" with multiple inputs')
            # Iterate over stdin lines; each line is stripped below.
            input_paths = sys.stdin
        else:
            input_paths.extend(args.input)

    scp_config = config['streamcorpus_pipeline']
    stages = PipelineStages()
    if 'external_stages_path' in scp_config:
        stages.load_external_stages(scp_config['external_stages_path'])
    if 'external_stages_modules' in scp_config:
        for mod in scp_config['external_stages_modules']:
            stages.load_module_stages(mod)
    factory = PipelineFactory(stages)
    pipeline = factory(scp_config)

    try:
        for i_str in input_paths:
            work_unit = SimpleWorkUnit(i_str.strip())
            work_unit.data['start_chunk_time'] = time.time()
            work_unit.data['start_count'] = 0
            pipeline._process_task(work_unit)
    finally:
        ## explicitly call cleanup, which is idempotent; doing so in
        ## ``finally`` means a failing work unit does not skip it
        pipeline.cleanup()
def check_config(config, name):
    """Validate the configuration block, raising on problems.

    :param config: configuration mapping for this block
    :param name: name of the block, used in error messages
    :raises ConfigurationError: if ``tmp_dir_path`` is missing, or if
        loading the file named by ``external_stages_path`` raises
        :exc:`IOError`
    """
    if 'tmp_dir_path' not in config:
        missing_msg = '{} requires tmp_dir_path setting'.format(name)
        raise ConfigurationError(missing_msg)

    # Checking stages:
    stages = PipelineStages()

    # (1) Push in the external stages;
    if 'external_stages_path' in config:
        stages_path = config['external_stages_path']
        try:
            stages.load_external_stages(stages_path)
        except IOError as exc:
            invalid_msg = 'invalid {} external_stages_path {}'.format(
                name, stages_path)
            # The underlying IOError is passed along as a second argument.
            raise ConfigurationError(invalid_msg, exc)
def check_config(config, name):
    """Validate the configuration block, raising on problems.

    :param config: configuration mapping for this block
    :param name: name of the block, used in error messages
    :raises ConfigurationError: if ``tmp_dir_path`` is missing, or if
        loading the file named by ``external_stages_path`` raises
        :exc:`IOError`
    """
    if 'tmp_dir_path' not in config:
        missing_msg = '{} requires tmp_dir_path setting'.format(name)
        raise ConfigurationError(missing_msg)

    # Checking stages:
    stages = PipelineStages()

    # (1) Push in the external stages;
    if 'external_stages_path' in config:
        stages_path = config['external_stages_path']
        try:
            stages.load_external_stages(stages_path)
        except IOError as exc:
            invalid_msg = 'invalid {} external_stages_path {}'.format(
                name, stages_path)
            # The underlying IOError is passed along as a second argument.
            raise ConfigurationError(invalid_msg, exc)
def rejester_run_function(work_unit):
    """Build a pipeline from the work unit's configuration and run it.

    :param work_unit: object whose ``spec`` mapping may carry a ``config``
        entry (defaults to an empty dict) and whose ``data`` mapping is
        updated with ``start_chunk_time`` and ``start_count`` before the
        unit is processed
    """
    with yakonfig.defaulted_config([dblogger, kvlayer, streamcorpus_pipeline],
                                   config=work_unit.spec.get('config', {})):
        scp_config = yakonfig.get_global_config('streamcorpus_pipeline')

        stages = PipelineStages()
        if 'external_stages_path' in scp_config:
            stages.load_external_stages(scp_config['external_stages_path'])
        if 'external_stages_modules' in scp_config:
            for mod in scp_config['external_stages_modules']:
                stages.load_module_stages(mod)
        factory = PipelineFactory(stages)
        pipeline = factory(scp_config)

        try:
            work_unit.data['start_chunk_time'] = time.time()
            work_unit.data['start_count'] = 0
            pipeline._process_task(work_unit)
        finally:
            ## always invoke the pipeline's cleanup hook, even when
            ## processing the work unit raises
            pipeline.cleanup()
def rejester_run_function(work_unit):
    """Build a pipeline from the work unit's configuration and run it.

    :param work_unit: object whose ``spec`` mapping may carry a ``config``
        entry (defaults to an empty dict) and whose ``data`` mapping is
        updated with ``start_chunk_time`` and ``start_count`` before the
        unit is processed
    """
    with yakonfig.defaulted_config([kvlayer, streamcorpus_pipeline],
                                   config=work_unit.spec.get('config', {})):
        scp_config = yakonfig.get_global_config('streamcorpus_pipeline')

        stages = PipelineStages()
        if 'external_stages_path' in scp_config:
            stages.load_external_stages(scp_config['external_stages_path'])
        if 'external_stages_modules' in scp_config:
            for mod in scp_config['external_stages_modules']:
                stages.load_module_stages(mod)
        factory = PipelineFactory(stages)
        pipeline = factory(scp_config)

        try:
            work_unit.data['start_chunk_time'] = time.time()
            work_unit.data['start_count'] = 0
            pipeline._process_task(work_unit)
        finally:
            ## explicitly call cleanup, which is idempotent and might not
            ## get called by atexit if we are running under
            ## multiprocessing; ``finally`` covers the failure path too
            pipeline.cleanup()
def rejester_run_function(work_unit):
    """Build a pipeline from the work unit's configuration and run it.

    :param work_unit: object whose ``spec`` mapping may carry a ``config``
        entry (defaults to an empty dict) and whose ``data`` mapping is
        updated with ``start_chunk_time`` and ``start_count`` before the
        unit is processed
    """
    with yakonfig.defaulted_config([kvlayer, streamcorpus_pipeline],
                                   config=work_unit.spec.get('config', {})):
        scp_config = yakonfig.get_global_config('streamcorpus_pipeline')

        stages = PipelineStages()
        if 'external_stages_path' in scp_config:
            stages.load_external_stages(scp_config['external_stages_path'])
        if 'external_stages_modules' in scp_config:
            for mod in scp_config['external_stages_modules']:
                stages.load_module_stages(mod)
        factory = PipelineFactory(stages)
        pipeline = factory(scp_config)

        try:
            work_unit.data['start_chunk_time'] = time.time()
            work_unit.data['start_count'] = 0
            pipeline._process_task(work_unit)
        finally:
            ## explicitly call cleanup, which is idempotent and might not
            ## get called by atexit if we are running under
            ## multiprocessing; ``finally`` covers the failure path too
            pipeline.cleanup()