# Imports assumed by the test and driver functions below.  The originals
# live in separate modules, so this consolidated header is a best guess:
import argparse
import os
import signal
import sys
import time
from StringIO import StringIO   # Python 2 era code (see iteritems below)

import gevent
import pytest
import yaml
import yakonfig
import rejester
import kvlayer
import dblogger

import streamcorpus_pipeline
# Also assumed in scope, from project modules whose exact paths are not
# shown in this excerpt: Pipeline, PipelineFactory, PipelineStages,
# SimpleWorkUnit, Chunk, from_spinn3r_feed, get_test_chunk_path,
# rejester_run_function, DirectoryConfig, DecodeError, ConfigurationError,
# logger.


def test_pipeline(request, test_data_dir):
    filename = str(request.fspath.dirpath('test_dedup_chunk_counts.yaml'))
    with yakonfig.defaulted_config([streamcorpus_pipeline], filename=filename):
        ## config says read from stdin, so make that have what we want
        stdin = sys.stdin
        sys.stdin = StringIO(get_test_chunk_path(test_data_dir))
        try:
            ## run the pipeline
            stages = PipelineStages()
            pf = PipelineFactory(stages)
            p = pf(yakonfig.get_global_config('streamcorpus_pipeline'))

            from streamcorpus_pipeline.run import SimpleWorkUnit
            work_unit = SimpleWorkUnit('long string indicating source of text')
            work_unit.data['start_chunk_time'] = time.time()
            work_unit.data['start_count'] = 0
            g = gevent.spawn(p._process_task, work_unit)

            gevent.sleep(5)
            with pytest.raises(SystemExit):  # pylint: disable=E1101
                p.shutdown(sig=signal.SIGTERM)
            logger.debug('now joining...')
            timeout = gevent.Timeout(1)
            g.join(timeout=timeout)
        finally:
            ## restore stdin so later tests are unaffected (the original
            ## saved it but never put it back)
            sys.stdin = stdin
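# test_pipeline above assumes a 'test_data_dir' fixture and a
# get_test_chunk_path() helper from the surrounding test package; neither
# is shown in this excerpt.  A minimal sketch under an assumed file layout
# (both bodies, and the chunk file name, are guesses; the default argument
# lets the older variant below call it with no arguments):
@pytest.fixture
def test_data_dir():
    # directory holding checked-in test chunk files
    return os.path.join(os.path.dirname(__file__), 'test_data')

def get_test_chunk_path(test_data_dir=None):
    # path to a small streamcorpus chunk file used as canned input
    d = test_data_dir or os.path.join(os.path.dirname(__file__), 'test_data')
    return os.path.join(d, 'example-chunk.sc')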
# Older variant of the test above: builds Pipeline directly from the YAML
# config and monkeypatches sys.exit instead of expecting SystemExit.  (It
# would shadow the test_pipeline above if kept in the same module.)
def test_pipeline(monkeypatch):
    def mockexit(status=0):
        log(' sys.exit(%d) ' % status)
        raise SuccessfulExit()
    monkeypatch.setattr(sys, 'exit', mockexit)

    path = os.path.dirname(__file__)
    with open(os.path.join(path, 'test_dedup_chunk_counts.yaml')) as f:
        config = yaml.safe_load(f)  # safe_load, and the file gets closed

    ## config says read from stdin, so make that have what we want
    stdin = sys.stdin
    sys.stdin = StringIO(get_test_chunk_path())
    try:
        ## run the pipeline
        p = Pipeline(config)

        from streamcorpus_pipeline.run import SimpleWorkUnit
        work_unit = SimpleWorkUnit('long string indicating source of text')
        work_unit.data['start_chunk_time'] = time.time()
        work_unit.data['start_count'] = 0
        g = gevent.spawn(p._process_task, work_unit)

        gevent.sleep(5)
        with pytest.raises(SuccessfulExit):  # pylint: disable=E1101
            p.shutdown(sig=signal.SIGTERM)
        log('now joining...')
        timeout = gevent.Timeout(1)
        g.join(timeout=timeout)
    finally:
        sys.stdin = stdin
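# The variant above references two project-local helpers that are not shown
# in this excerpt.  Minimal sketches of what they plausibly look like; the
# names come from the test itself, but the bodies are assumptions:
class SuccessfulExit(Exception):
    """Raised by the mocked sys.exit so the test can catch a clean shutdown."""
    pass

def log(msg):
    # stand-in for the project's logging helper (assumed signature)
    sys.stderr.write('%s\n' % msg)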
def test_spinn3r_pipeline(filename, urls, pipeline_config, output_file):
    """minimal end-to-end test, with a fixed pipeline"""
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))

        work_unit = SimpleWorkUnit(filename)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls
def test_spinn3r_pipeline_bogus_prefetched(filename, pipeline_config):
    """supply known-bad prefetched data"""
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))

        key = filename
        from_spinn3r_feed._prefetched[key] = 'bogus data, dude!'
        work_unit = SimpleWorkUnit(key)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        with pytest.raises(DecodeError):
            pipeline._process_task(work_unit)
def main():
    parser = argparse.ArgumentParser(
        conflict_handler='resolve',
        description='process entire directories using streamcorpus_pipeline')
    parser.add_argument('directories', nargs='+', metavar='directory',
                        help='directory name(s) to process')
    args = yakonfig.parse_args(parser, [yakonfig, rejester, kvlayer, dblogger,
                                        streamcorpus_pipeline, DirectoryConfig])
    gconfig = yakonfig.get_global_config()
    scdconfig = gconfig['streamcorpus_directory']

    work_spec = {
        'name': scdconfig.get('name', 'streamcorpus_directory'),
        'desc': 'read files from a directory',
        'min_gb': 8,
        'config': gconfig,
        'module': 'streamcorpus_pipeline._rejester',
        'run_function': 'rejester_run_function',
        'terminate_function': 'rejester_terminate_function',
    }

    def get_filenames():
        # 'mode' selects how the positional arguments are interpreted:
        # literal file names, files listing one name per line, or
        # directories to walk recursively.
        for d in args.directories:
            if scdconfig['mode'] == 'files':
                yield d
            elif scdconfig['mode'] == 'file-lists':
                with open(d, 'r') as f:
                    for line in f:
                        yield line.strip()
            elif scdconfig['mode'] == 'directories':
                for dirpath, dirnames, filenames in os.walk(d):
                    for filename in filenames:
                        yield os.path.abspath(os.path.join(dirpath, filename))

    work_units = {filename: {'start_count': 0}
                  for filename in get_filenames()}

    if scdconfig['engine'] == 'rejester':
        # submit the whole bundle to the distributed task queue
        tm = rejester.TaskMaster(gconfig['rejester'])
        tm.update_bundle(work_spec, work_units)
    elif scdconfig['engine'] == 'standalone':
        # run each unit in-process, one at a time (Python 2 iteritems)
        for k, v in work_units.iteritems():
            u = SimpleWorkUnit(k)
            u.spec = work_spec
            u.data = v
            rejester_run_function(u)
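# A hypothetical invocation of main() above; yakonfig.parse_args supplies a
# --config option for the YAML file, and the engine/mode knobs live in its
# 'streamcorpus_directory' block.  The script and file names here are made
# up for illustration:
#
#   python streamcorpus_directory.py --config directory.yaml /data/corpus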
def test_spinn3r_pipeline_unprefetched(urls, pipeline_config):
    """minimal end-to-end test, missing prefetched data"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'use_prefetched': True,
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))

        key = 'test_file.bin'
        work_unit = SimpleWorkUnit(key)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        with pytest.raises(ConfigurationError):
            pipeline._process_task(work_unit)
def test_spinn3r_pipeline_filter_matches(filename, urls, pipeline_config,
                                         output_file):
    """set a publisher_type filter that matches everything in the feed"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'publisher_type': 'WEBLOG',
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))

        work_unit = SimpleWorkUnit(filename)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls
def test_spinn3r_pipeline_filter_no_matches(filename, pipeline_config,
                                            output_file):
    """set a publisher_type filter that matches nothing in the feed"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'publisher_type': 'MICROBLOG',
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))

        work_unit = SimpleWorkUnit(filename)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

        # no chunks means the output file won't actually get written
        assert not os.path.exists(output_file)
def test_spinn3r_pipeline_ignore_prefetched(filename, urls, pipeline_config,
                                            output_file):
    """configuration explicitly ignores bad prefetched data"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'use_prefetched': False,
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))

        key = filename
        from_spinn3r_feed._prefetched[key] = 'bogus data, dude!'
        work_unit = SimpleWorkUnit(key)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)
        del from_spinn3r_feed._prefetched[key]

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls
def test_spinn3r_pipeline_prefetched(filename, urls, pipeline_config,
                                     output_file):
    """minimal end-to-end test, preloading data in the loader"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'use_prefetched': True,
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))

        key = 'test_file.bin'
        with open(filename, 'rb') as f:
            from_spinn3r_feed._prefetched[key] = f.read()
        work_unit = SimpleWorkUnit(key)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)
        del from_spinn3r_feed._prefetched[key]

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls
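# The spinn3r tests above rely on pytest fixtures defined elsewhere
# (presumably a conftest.py).  A heavily abridged sketch of what they
# plausibly provide: the fixture names are real (taken from the test
# signatures), but the bodies, the feed file name, the URLs, and the exact
# config keys are assumptions, not the project's actual fixtures:
@pytest.fixture
def filename():
    # path to a captured spinn3r protobuf feed checked into the test data
    return os.path.join(os.path.dirname(__file__), 'spinn3r_feed.bin')

@pytest.fixture
def urls():
    # abs_url values expected, in order, from the feed above (made up here)
    return ['http://example.com/post/1', 'http://example.com/post/2']

@pytest.fixture
def output_file(tmpdir):
    return str(tmpdir.join('output.sc'))

@pytest.fixture
def pipeline_config(filename, output_file):
    # minimal pipeline: read the spinn3r feed, write one local chunk file
    return {
        'streamcorpus_pipeline': {
            'reader': 'from_spinn3r_feed',
            'incremental_transforms': [],
            'batch_transforms': [],
            'writers': ['to_local_chunks'],
            'to_local_chunks': {
                'output_type': 'otherdir',
                'output_path': os.path.dirname(output_file),
                'output_name': os.path.splitext(
                    os.path.basename(output_file))[0],
            },
        },
    }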