def replace_config(config, name):
    # Do we have external stages?
    if ('external_stages_path' not in config and
            'external_stages_modules' not in config):
        return streamcorpus_pipeline
    stages = PipelineStages()
    if 'external_stages_path' in config:
        path = config['external_stages_path']
        if not os.path.isabs(path) and config.get('root_path'):
            path = os.path.join(config['root_path'], path)
        try:
            # load the resolved path (not the raw config value) so that
            # root_path-relative paths work
            stages.load_external_stages(path)
        except IOError:
            return streamcorpus_pipeline  # let check_config re-raise this


def rejester_run_function(work_unit):
    with yakonfig.defaulted_config([dblogger, kvlayer, streamcorpus_pipeline],
                                   config=work_unit.spec.get('config', {})):
        scp_config = yakonfig.get_global_config('streamcorpus_pipeline')
        stages = PipelineStages()
        if 'external_stages_path' in scp_config:
            stages.load_external_stages(scp_config['external_stages_path'])
        if 'external_stages_modules' in scp_config:
            for mod in scp_config['external_stages_modules']:
                stages.load_module_stages(mod)
        factory = PipelineFactory(stages)
        pipeline = factory(scp_config)

        work_unit.data['start_chunk_time'] = time.time()
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)


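# A sketch of driving rejester_run_function above without a rejester
# installation.  This stand-in work unit is hypothetical; it mimics only
# the attributes the function actually touches (.spec for the config dict
# and .data for bookkeeping), plus the .key string that _process_task()
# reads from SimpleWorkUnit elsewhere in this file.
class FakeWorkUnit(object):
    def __init__(self, key, config=None):
        self.key = key                        # input path or i_str
        self.spec = {'config': config or {}}
        self.data = {}

# rejester_run_function(FakeWorkUnit('/tmp/input.sc'))

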
def check_config(config, name):
    if 'tmp_dir_path' not in config:
        raise ConfigurationError(
            '{} requires tmp_dir_path setting'.format(name))
    # Checking stages:
    stages = PipelineStages()
    # (1) Push in the external stages;
    if 'external_stages_path' in config:
        try:
            stages.load_external_stages(config['external_stages_path'])
        except IOError as e:
            raise ConfigurationError(
                'invalid {} external_stages_path {}'.format(
                    name, config['external_stages_path']), e)


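# A minimal configuration block that satisfies check_config above; the
# paths are placeholders.  tmp_dir_path is mandatory, and if
# external_stages_path is supplied it must name a loadable Python file,
# since an IOError there is converted into a ConfigurationError.
scp_config = {
    'tmp_dir_path': '/tmp/scratch',
    'external_stages_path': 'my_stages.py',
}
# check_config(scp_config, 'streamcorpus_pipeline')

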
def test_pipeline(request, test_data_dir):
    filename = str(request.fspath.dirpath('test_dedup_chunk_counts.yaml'))
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   filename=filename):
        ## config says read from stdin, so make that have what we want
        stdin = sys.stdin
        sys.stdin = StringIO(get_test_chunk_path(test_data_dir))

        ## run the pipeline
        stages = PipelineStages()
        pf = PipelineFactory(stages)
        p = pf(yakonfig.get_global_config('streamcorpus_pipeline'))

        from streamcorpus_pipeline.run import SimpleWorkUnit
        work_unit = SimpleWorkUnit('long string indicating source of text')
        work_unit.data['start_chunk_time'] = time.time()
        work_unit.data['start_count'] = 0
        g = gevent.spawn(p._process_task, work_unit)

        gevent.sleep(5)

        with pytest.raises(SystemExit):  # pylint: disable=E1101
            p.shutdown(sig=signal.SIGTERM)

        logger.debug('now joining...')
        timeout = gevent.Timeout(1)
        g.join(timeout=timeout)


def test_dedup_chunk_counts(request, test_data_dir, tmpdir):
    filename = str(request.fspath.dirpath('test_dedup_chunk_counts.yaml'))
    with yakonfig.defaulted_config(
            [streamcorpus_pipeline], filename=filename,
            config={'tmp_dir_path': str(tmpdir)}) as config:
        ## run the pipeline
        pf = PipelineFactory(PipelineStages())
        p = pf(config['streamcorpus_pipeline'])
        p.run(get_test_chunk_path(test_data_dir))


def rejester_run_function(work_unit):
    with yakonfig.defaulted_config([kvlayer, streamcorpus_pipeline],
                                   config=work_unit.spec.get('config', {})):
        scp_config = yakonfig.get_global_config('streamcorpus_pipeline')
        stages = PipelineStages()
        if 'external_stages_path' in scp_config:
            stages.load_external_stages(scp_config['external_stages_path'])
        if 'external_stages_modules' in scp_config:
            for mod in scp_config['external_stages_modules']:
                stages.load_module_stages(mod)
        factory = PipelineFactory(stages)
        pipeline = factory(scp_config)

        work_unit.data['start_chunk_time'] = time.time()
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

        ## explicitly call cleanup, which is idempotent and might not
        ## get called by atexit if we are running under
        ## multiprocessing
        pipeline.cleanup()


def test_spinn3r_pipeline(filename, urls, pipeline_config, output_file):
    """minimal end-to-end test, with a fixed pipeline"""
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))

        work_unit = SimpleWorkUnit(filename)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls


def test_spinn3r_pipeline_bogus_prefetched(filename, pipeline_config):
    """supply known-bad prefetched data"""
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))

        key = filename
        from_spinn3r_feed._prefetched[key] = 'bogus data, dude!'
        work_unit = SimpleWorkUnit(key)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        with pytest.raises(DecodeError):
            pipeline._process_task(work_unit)


def test_spinn3r_pipeline_unprefetched(urls, pipeline_config):
    """minimal end-to-end test, missing prefetched data"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'use_prefetched': True,
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))

        key = 'test_file.bin'
        work_unit = SimpleWorkUnit(key)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        with pytest.raises(ConfigurationError):
            pipeline._process_task(work_unit)


def test_spinn3r_pipeline_filter_no_matches(filename, pipeline_config,
                                            output_file):
    """set a publisher_type filter that matches nothing in the feed"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'publisher_type': 'MICROBLOG',
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))

        work_unit = SimpleWorkUnit(filename)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

        # no chunks means the output file won't actually get written
        assert not os.path.exists(output_file)


def test_spinn3r_pipeline_filter_matches(filename, urls, pipeline_config,
                                         output_file):
    """set a publisher_type filter that matches everything in the feed"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'publisher_type': 'WEBLOG',
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))

        work_unit = SimpleWorkUnit(filename)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls


def main():
    import argparse
    parser = argparse.ArgumentParser(
        description='process a sequence of stream items',
        usage='streamcorpus_pipeline --config config.yaml --input file.in')
    parser.add_argument('-i', '--input', action='append',
                        help='file paths to input instead of reading '
                        'from stdin')
    parser.add_argument('--in-glob', action='append', default=[],
                        help='path glob specifying input files')
    parser.add_argument('--third-dir-path',
                        help='path to third-party tools directory')
    parser.add_argument('--tmp-dir-path',
                        help='path to temporary directory for scratch '
                        'files, can be large')
    modules = [yakonfig, kvlayer, dblogger, streamcorpus_pipeline]
    args = yakonfig.parse_args(parser, modules)
    config = yakonfig.get_global_config()

    ## this modifies the global config, passed by reference
    instantiate_config(config)

    input_paths = []
    if args.in_glob:
        for pattern in args.in_glob:
            input_paths.extend(glob.glob(pattern))
    if args.input:
        if '-' in args.input:
            if args.in_glob:
                sys.exit('cannot use "-i -" and --in-glob together')
            if len(args.input) > 1:
                sys.exit('cannot use "-i -" with multiple inputs')
            input_paths = sys.stdin
        else:
            input_paths.extend(args.input)

    scp_config = config['streamcorpus_pipeline']
    stages = PipelineStages()
    if 'external_stages_path' in scp_config:
        stages.load_external_stages(scp_config['external_stages_path'])
    if 'external_stages_modules' in scp_config:
        for mod in scp_config['external_stages_modules']:
            stages.load_module_stages(mod)
    factory = PipelineFactory(stages)
    pipeline = factory(scp_config)

    for i_str in input_paths:
        work_unit = SimpleWorkUnit(i_str.strip())
        work_unit.data['start_chunk_time'] = time.time()
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

    ## explicitly call cleanup, which is idempotent
    pipeline.cleanup()


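# Example invocations of the entry point above (file names and globs are
# placeholders): read named inputs, expand a glob, or stream paths on stdin.
#
#   streamcorpus_pipeline --config config.yaml --input file.in
#   streamcorpus_pipeline --config config.yaml --in-glob 'chunks/*.sc'
#   echo file.in | streamcorpus_pipeline --config config.yaml -i -

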
def test_spinn3r_pipeline_ignore_prefetched(filename, urls, pipeline_config,
                                            output_file):
    """configuration explicitly ignores bad prefetched data"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'use_prefetched': False,
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))

        key = filename
        from_spinn3r_feed._prefetched[key] = 'bogus data, dude!'
        work_unit = SimpleWorkUnit(key)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)
        del from_spinn3r_feed._prefetched[key]

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls


def test_spinn3r_pipeline_prefetched(filename, urls, pipeline_config,
                                     output_file):
    """minimal end-to-end test, preloading data in the loader"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'use_prefetched': True,
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))

        key = 'test_file.bin'
        with open(filename, 'rb') as f:
            from_spinn3r_feed._prefetched[key] = f.read()
        work_unit = SimpleWorkUnit(key)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)
        del from_spinn3r_feed._prefetched[key]

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls


default_config = {
    'output_chunk_max_count': 500,
    'rate_log_interval': 100,
    'incremental_transforms': [],
    'batch_transforms': [],
    'post_batch_incremental_transforms': [],
    'cleanup_tmp_files': True,
    'assert_single_source': True,
    'reader': 'from_local_chunks',
    'writers': ['to_local_chunks'],
}
runtime_keys = {
    'tmp_dir_path': 'tmp_dir_path',
    'third_dir_path': 'third_dir_path',
}
sub_modules = set(stage for stage in PipelineStages().itervalues()
                  if hasattr(stage, 'config_name'))


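# A sketch of a file that external_stages_path could point at, assuming
# the convention that load_external_stages() imports the file and reads a
# module-level Stages dict mapping stage names to stage constructors.
# The stage below is hypothetical, and the exact transform signature
# should be verified against the installed streamcorpus_pipeline version.
def pass_through(config):
    '''incremental transform that returns every StreamItem unchanged'''
    def transform(si, context):
        return si
    return transform

Stages = {'pass_through': pass_through}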