def test_check_toplevel():
    '''check_toplevel_config() should work in check_config() implementation'''
    with yakonfig.defaulted_config([ConfigurableArgs(), Dependent],
                                   {'key': 'k'}) as config:
        assert sorted(iterkeys(config)) == ['config', 'dependent']
        assert config['config']['k'] == 'k'
    with pytest.raises(yakonfig.ConfigurationError):
        with yakonfig.defaulted_config([ConfigurableArgs(), Dependent],
                                       {'key': 'key'}):
            pass

def test_normalize():
    with yakonfig.defaulted_config([Normalized]):
        assert yakonfig.get_global_config('normalized')['k'] == 'v'
    with yakonfig.defaulted_config([Normalized], {'k': 'foo'}):
        assert yakonfig.get_global_config('normalized')['k'] == 'f'
    with yakonfig.defaulted_config([Normalized], yaml='''
normalized:
  k: zoom!
'''):
        assert yakonfig.get_global_config('normalized')['k'] == 'z'

def test_discovery():
    with yakonfig.defaulted_config([Discovers]):
        # discovered overwrites default
        assert yakonfig.get_global_config('discovers', 'a') == 'foo'
        # discovered provides missing value
        assert yakonfig.get_global_config('discovers', 'b') == 'bar'
        # undiscovered uses default value
        assert yakonfig.get_global_config('discovers', 'c') == 'three'
    with yakonfig.defaulted_config([Discovers], {'a': 'alpha'}):
        # command-line value overrules discovery
        assert yakonfig.get_global_config('discovers', 'a') == 'alpha'
        assert yakonfig.get_global_config('discovers', 'b') == 'bar'
        assert yakonfig.get_global_config('discovers', 'c') == 'three'

def test_pipeline(request, test_data_dir):
    filename = str(request.fspath.dirpath('test_dedup_chunk_counts.yaml'))
    with yakonfig.defaulted_config([streamcorpus_pipeline], filename=filename):
        ## config says read from stdin, so make that have what we want
        stdin = sys.stdin
        sys.stdin = StringIO(get_test_chunk_path(test_data_dir))

        ## run the pipeline
        stages = PipelineStages()
        pf = PipelineFactory(stages)
        p = pf(yakonfig.get_global_config('streamcorpus_pipeline'))

        from streamcorpus_pipeline.run import SimpleWorkUnit
        work_unit = SimpleWorkUnit('long string indicating source of text')
        work_unit.data['start_chunk_time'] = time.time()
        work_unit.data['start_count'] = 0
        g = gevent.spawn(p._process_task, work_unit)
        gevent.sleep(5)
        with pytest.raises(SystemExit):  # pylint: disable=E1101
            p.shutdown(sig=signal.SIGTERM)
        logger.debug('now joining...')
        timeout = gevent.Timeout(1)
        g.join(timeout=timeout)

def main():
    p = argparse.ArgumentParser(
        'simple debugging tool for watching the linker and OpenQuery')
    p.add_argument('action', help='either `run` or `cache` or `delete`')
    p.add_argument('folder', help='folder name')
    p.add_argument('subfolder', help='subfolder name')
    args = yakonfig.parse_args(p, [kvlayer, yakonfig])
    config = yakonfig.get_global_config()
    key = cbor.dumps((args.folder.replace(' ', '_'),
                      args.subfolder.replace(' ', '_')))
    if args.action == 'run':
        web_conf = Config()
        with yakonfig.defaulted_config([kvlayer, dblogger, web_conf],
                                       config=config):
            traverse_extract_fetch(web_conf, key, stop_after_extraction=True)
    elif args.action == 'delete':
        kvlclient = kvlayer.client()
        kvlclient.setup_namespace({'openquery': (str,)})
        kvlclient.delete('openquery', (key,))
        print('deleted %r' % key)
    elif args.action == 'cache':
        kvlclient = kvlayer.client()
        kvlclient.setup_namespace({'openquery': (str,)})
        count = 0
        for rec in kvlclient.scan('openquery'):
            count += 1
            if rec[0][0] == key:
                print(rec)
        print('%d cached queries' % count)

def config_local(elastic_address, namespace_string):
    config = {
        'kvlayer': {
            'storage_type': 'local',
            'app_name': 'diffeo',
            'namespace': 'memex_dossier.web.tests',
        },
        'memex_dossier.store': {
            'hosts': [elastic_address],
            'namespace': 'tests',
            'type': namespace_string,
            'shards': 1,
            'replicas': 0,
            'feature_indexes': [{
                'foo': {
                    'feature_names': ['foo'],
                    'es_index_type': 'string',
                },
            }, {
                'bar': {
                    'feature_names': ['bar'],
                    'es_index_type': 'string',
                },
            }],
        },
    }
    modules = [kvlayer]
    with yakonfig.defaulted_config(modules, config=config) as config:
        yield config

def highlights_worker(work_unit):
    '''coordinate worker wrapper around :func:`maybe_create_highlights`
    '''
    if 'config' not in work_unit.spec:
        raise coordinate.exceptions.ProgrammerError(
            'could not run `create_highlights` without global config')

    web_conf = Config()
    unitconf = work_unit.spec['config']
    with yakonfig.defaulted_config([coordinate, kvlayer, dblogger, web_conf],
                                   config=unitconf):
        file_id = make_file_id(work_unit.key)
        web_conf.kvlclient.setup_namespace(highlights_kvlayer_tables)
        payload_strs = list(web_conf.kvlclient.get('files', file_id))
        if payload_strs and payload_strs[0][1]:
            payload_str = payload_strs[0][1]
            try:
                data = json.loads(payload_str)
                # now create the response payload
                maybe_store_highlights(file_id, data, web_conf.tfidf,
                                       web_conf.kvlclient)
            except Exception as exc:
                logger.critical('failed to decode data out of %r',
                                payload_str, exc_info=True)
                payload = {
                    'state': ERROR,
                    'error': {
                        'code': 7,
                        'message': 'failed to generate stored results:\n%s'
                                   % traceback.format_exc(),
                    },
                }
                payload_str = json.dumps(payload)
                web_conf.kvlclient.put('highlights', (file_id, payload_str))

def client(backend, request, tmpdir, namespace_string):
    config_path = str(request.fspath.dirpath('config_{0}.yaml'.format(backend)))
    statsfile = StringIO.StringIO()
    params = dict(
        app_name='kvlayer',
        namespace=namespace_string,
        log_stats=statsfile,
        log_stats_interval_ops=1,
        blagh='hoo haa',
    )
    # this is hacky but must go somewhere
    if backend == 'filestorage':
        local = tmpdir.join('local')
        with local.open('w') as f:
            pass
        params['kvlayer_filename'] = str(local)
    if backend == 'redis':
        params['storage_addresses'] = [redis_address(request)]
    with yakonfig.defaulted_config([kvlayer], filename=config_path,
                                   params=params):
        client = kvlayer.client()
        client.delete_namespace()
        yield client
        client.delete_namespace()

def run_all_perftests(redis_address=None, clientlist=None):
    '''Run all of the performance tests, on every backend.

    This is intended to be a fully automated answer for
    :program:`kvlayer_test`.

    '''
    rc = 0
    for name in STORAGE_CLIENTS.iterkeys():
        if clientlist is not None:
            if name not in clientlist:
                logger.info('skipping backend %r', name)
                continue
        elif name in ['cassandra', 'accumulo', 'postgres', 'cborproxy']:
            logger.info('skipping backend %r', name)
            continue
        config = os.path.join(os.path.dirname(__file__),
                              'config_{0}.yaml'.format(name))
        if not os.path.exists(config):
            continue
        params = {'app_name': 'kvlayer_performance',
                  'namespace': 'ns' + uuid.uuid1().hex}
        if name == 'filestorage':
            params['kvlayer_filename'] = os.tmpnam()
        if name == 'redis' and redis_address is not None:
            params['storage_addresses'] = [redis_address]
        with yakonfig.defaulted_config([kvlayer], filename=config,
                                       params=params):
            ret = run_perftests()
            if rc == 0:
                rc = ret
        if name == 'filestorage':
            os.unlink(params['kvlayer_filename'])
    return rc

def __init__(self, storage_client=None, table_name="log",
             storage_config=None):
    """Create a new database log handler.

    You must either pass in ``storage_client``, an actual kvlayer
    client object, or ``storage_config``, a dictionary which will be
    passed to ``kvlayer.client()``.  Log messages will be stored in
    the table ``table_name``.

    :param storage_client: existing storage client
    :type storage_client: :class:`kvlayer.AbstractStorage`
    :param str table_name: virtual table name
    :param dict storage_config: configuration for new storage client

    """
    super(DatabaseLogHandler, self).__init__()
    if storage_client is None:
        if storage_config is None:
            raise RuntimeError('must pass either storage_client or '
                               'storage_config')
        with yakonfig.defaulted_config(
                [kvlayer], config=dict(kvlayer=storage_config)):
            storage_client = kvlayer.client()
    self.storage = storage_client
    self.table_name = table_name
    storage_client.setup_namespace({table_name: 1})
    self.sequence_number = 0

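# A minimal usage sketch for the handler above. It assumes the class is
# importable as DatabaseLogHandler and reuses the local in-memory kvlayer
# backend seen in the fixtures elsewhere in this corpus; the exact config
# keys below are illustrative assumptions, not an authoritative API.
import logging

storage_config = {
    'storage_type': 'local',
    'storage_addresses': [],
    'app_name': 'demo',
    'namespace': 'demo_logs',
}
log = logging.getLogger('demo')
# let the handler construct its own kvlayer client from the config dict
log.addHandler(DatabaseLogHandler(storage_config=storage_config))
log.error('this message lands in the "log" kvlayer table')
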
def config_local(elastic_address, namespace_string):
    config = {
        'kvlayer': {
            'storage_type': 'local',
            'app_name': 'diffeo',
            'namespace': 'dossier.web.tests',
        },
        'dossier.store': {
            'hosts': [elastic_address],
            'namespace': 'tests',
            'type': namespace_string,
            'shards': 1,
            'replicas': 0,
            'feature_indexes': [{
                'foo': {
                    'feature_names': ['foo'],
                    'es_index_type': 'string',
                },
            }, {
                'bar': {
                    'feature_names': ['bar'],
                    'es_index_type': 'string',
                },
            }],
        },
    }
    modules = [ElasticStoreSync, kvlayer]
    with yakonfig.defaulted_config(modules, config=config) as config:
        yield config

def xconfig(jobqueue_conf):
    with yakonfig.defaulted_config([coordinate], config={
            'coordinate': {
                'job_queue': jobqueue_conf,
            },
    }) as config:
        yield config

def rejester_run(work_unit):
    '''Rejester entry point to run the elasticsearch load.

    This uses the work unit key as the input filename string for the
    reader specified in the work unit.  If the work unit data includes
    the key ``output`` then that value is passed as the matching output
    filename string.

    :param work_unit: work unit to run
    :type work_unit: :class:`rejester.WorkUnit`

    '''
    if 'config' not in work_unit.spec:
        raise rejester.exceptions.ProgrammerError(
            'could not run without global config')

    with yakonfig.defaulted_config([rejester, kvlayer, dblogger],
                                   config=work_unit.spec['config']):
        ## Set up the elasticsearch client
        ## http://elasticsearch-py.readthedocs.org/en/master/api.html#elasticsearch
        es = elasticsearch.Elasticsearch(
            work_unit.spec['config']['elasticsearch']['cluster'])

        ## Set up the kvlayer client
        kvl = kvlayer.client()
        kvl.setup_namespace({'stream_items': 2})

        ## Get the data associated with the work_unit
        key, data = kvl.get('stream_items', work_unit.key).next()

        ## Index an individual stream_item
        elasticsearch_loader.index_stream_item(es, kvl, data)

def test_config_from_env(redis_address, monkeypatch, namespace_string):
    host, port = redis_address.split(':', 2)
    monkeypatch.setenv('REDIS_PORT_6379_TCP_ADDR', host)
    monkeypatch.setenv('REDIS_PORT_6379_TCP_PORT', port)
    with yakonfig.defaulted_config(
            [rejester],
            config={'rejester': {'namespace': namespace_string}}):
        pass  # should not throw a configuration error

def test_replaces_proxy():
    with yakonfig.defaulted_config([ConfigurableLikeTop]):
        c = yakonfig.get_global_config()
        assert sorted(iterkeys(c)) == ['top']
        c = c['top']
        assert 'bottom' in c
        c = c['bottom']
        assert c['zzz'] == '-32768'

def actions(namespace_string):
    with yakonfig.defaulted_config(
            [kvlayer],
            params={"app_name": "diffeo",
                    "namespace": namespace_string,
                    "storage_type": "local",
                    "storage_addresses": []},
    ):
        a = Actions(stdout=StringIO())
        yield a
        a.client.delete_namespace()

def client(namespace_string, request):
    config_path = str(request.fspath.dirpath('config_cassandra.yaml'))
    with yakonfig.defaulted_config([kvlayer], filename=config_path,
                                   params={'namespace': namespace_string,
                                           'app_name': 'kvltest'}):
        client = kvlayer.client()
        yield client
        client.delete_namespace()

def test_check_dependent():
    # give an invalid configuration for args; but it's not the "real"
    # application so it shouldn't be checked
    with yakonfig.defaulted_config([ConfigurableArgs(), ConfigurableLike],
                                   {'key': 'value'}) as config:
        assert sorted(iterkeys(config)) == ['config', 'configurable']
        with pytest.raises(yakonfig.ConfigurationError):
            yakonfig.check_toplevel_config(ConfigurableArgs(), 'test')
        with pytest.raises(yakonfig.ConfigurationError):
            yakonfig.check_toplevel_config(ConfigurableTop, 'test')

def test_dedup_chunk_counts(request, test_data_dir, tmpdir):
    filename = str(request.fspath.dirpath('test_dedup_chunk_counts.yaml'))
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   filename=filename,
                                   config={'tmp_dir_path': str(tmpdir)}
                                   ) as config:
        ## run the pipeline
        pf = PipelineFactory(PipelineStages())
        p = pf(config['streamcorpus_pipeline'])
        p.run(get_test_chunk_path(test_data_dir))

def kvl():
    config = {
        'storage_type': 'local',
        'app_name': 'diffeo',
        'namespace': 'dossier.store.test',
    }
    with yakonfig.defaulted_config([kvlayer], params=config) as config:
        client = kvlayer.client()
        yield client
        client.delete_namespace()

def local_storage():
    config_yaml = """
kvlayer:
    storage_type: local
    storage_addresses: []
    namespace: test
    app_name: test
"""
    with yakonfig.defaulted_config([kvlayer], yaml=config_yaml):
        local_storage = LocalStorage()
        yield local_storage
        local_storage.delete_namespace()

def test_proxy_two_level():
    with yakonfig.defaulted_config(
            [yakonfig.ProxyConfigurable(ConfigurableTop)],
            {'aaa': 'a', 'bbb': 'b', 'zzz': 'z'}):
        c = yakonfig.get_global_config()
        assert sorted(iterkeys(c)) == ['top']
        c = c['top']
        assert sorted(iterkeys(c)) == ['aaa', 'bottom']
        assert c['aaa'] == 'a'
        c = c['bottom']
        assert sorted(iterkeys(c)) == ['zzz']
        assert c['zzz'] == 'z'

def test_external_stage_default(tmpdir):
    with yakonfig.defaulted_config([streamcorpus_pipeline], config={
            'streamcorpus_pipeline': {
                'external_stages_path': __file__,
                'reader': 'from_local_chunks',
                'writers': ['to_local_chunks'],
                'tmp_dir_path': str(tmpdir),
            },
    }):
        config = yakonfig.get_global_config('streamcorpus_pipeline',
                                            'external_stage')
        stage = ExternalStage(config=config)
        assert stage.get_message() == 'default message'

def test_external_stage_unregistered(tmpdir):
    with yakonfig.defaulted_config([streamcorpus_pipeline], config={
            'streamcorpus_pipeline': {
                'tmp_dir_path': str(tmpdir),
                'external_stage': {
                    'message': 'configured message',
                },
            },
    }):
        config = yakonfig.get_global_config('streamcorpus_pipeline',
                                            'external_stage')
        stage = ExternalStage(config=config)
        assert stage.get_message() == 'configured message'

def test_spinn3r_pipeline(filename, urls, pipeline_config, output_file):
    """minimal end-to-end test, with a fixed pipeline"""
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))

        work_unit = SimpleWorkUnit(filename)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls

def test_spinn3r_pipeline_bogus_prefetched(filename, pipeline_config):
    """supply known-bad prefetched data"""
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))

        key = filename
        from_spinn3r_feed._prefetched[key] = 'bogus data, dude!'
        work_unit = SimpleWorkUnit(key)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        with pytest.raises(DecodeError):
            pipeline._process_task(work_unit)

def direct(config_path, namespace_string):
    with yakonfig.defaulted_config(
            [kvlayer], filename=config_path,
            params={"app_name": "kvlayer",
                    "namespace": namespace_string},
    ):
        config = yakonfig.get_global_config("kvlayer")
        conn = Accumulo(host="test-accumulo-1.diffeo.com", port=50096,
                        user=config["username"],
                        password=config["password"])
        yield conn

        tables = conn.list_tables()
        for table in tables:
            if re.search(namespace_string, table):
                conn.delete_table(table)

def worker(work_unit):
    '''Expects a WorkUnit from coordinate, obtains a config, and runs
    traverse_extract_fetch

    '''
    if 'config' not in work_unit.spec:
        raise coordinate.exceptions.ProgrammerError(
            'could not run extraction without global config')

    web_conf = Config()
    unitconf = work_unit.spec['config']
    #logger.info(unitconf)
    with yakonfig.defaulted_config([coordinate, kvlayer, dblogger, web_conf],
                                   config=unitconf):
        traverse_extract_fetch(web_conf, work_unit.key)

def test_spinn3r_pipeline_unprefetched(urls, pipeline_config):
    """minimal end-to-end test, missing prefetched data"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'use_prefetched': True,
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))

        key = 'test_file.bin'
        work_unit = SimpleWorkUnit(key)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        with pytest.raises(ConfigurationError):
            pipeline._process_task(work_unit)

def rejester_run_function(work_unit):
    with yakonfig.defaulted_config([dblogger, kvlayer, streamcorpus_pipeline],
                                   config=work_unit.spec.get('config', {})):
        scp_config = yakonfig.get_global_config('streamcorpus_pipeline')
        stages = PipelineStages()
        if 'external_stages_path' in scp_config:
            stages.load_external_stages(scp_config['external_stages_path'])
        if 'external_stages_modules' in scp_config:
            for mod in scp_config['external_stages_modules']:
                stages.load_module_stages(mod)
        factory = PipelineFactory(stages)
        pipeline = factory(scp_config)

        work_unit.data['start_chunk_time'] = time.time()
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

def test_external_module_registered(tmpdir):
    with yakonfig.defaulted_config([streamcorpus_pipeline], config={
            'streamcorpus_pipeline': {
                'external_stages_modules': [
                    'streamcorpus_pipeline.tests.test_pipeline'
                ],
                'external_stage': {
                    'message': 'configured message',
                },
                'reader': 'from_local_chunks',
                'writers': ['to_local_chunks'],
                'tmp_dir_path': str(tmpdir),
            },
    }):
        config = yakonfig.get_global_config('streamcorpus_pipeline',
                                            'external_stage')
        stage = ExternalStage(config=config)
        assert stage.get_message() == 'configured message'

def test_spinn3r_pipeline_filter_no_matches(filename, pipeline_config,
                                            output_file):
    """set a publisher_type filter that matches nothing in the feed"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'publisher_type': 'MICROBLOG',
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))

        work_unit = SimpleWorkUnit(filename)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

        # no chunks means the output file won't actually get written
        assert not os.path.exists(output_file)

def test_spinn3r_pipeline_filter_matches(filename, urls, pipeline_config,
                                         output_file):
    """set a publisher_type filter that matches everything in the feed"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'publisher_type': 'WEBLOG',
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))

        work_unit = SimpleWorkUnit(filename)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls

def test_spinn3r_pipeline_ignore_prefetched(filename, urls, pipeline_config,
                                            output_file):
    """configuration explicitly ignores bad prefetched data"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'use_prefetched': False,
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))

        key = filename
        from_spinn3r_feed._prefetched[key] = 'bogus data, dude!'
        work_unit = SimpleWorkUnit(key)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)
        del from_spinn3r_feed._prefetched[key]

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls

def rejester_run_function(work_unit):
    with yakonfig.defaulted_config([kvlayer, streamcorpus_pipeline],
                                   config=work_unit.spec.get('config', {})):
        scp_config = yakonfig.get_global_config('streamcorpus_pipeline')
        stages = PipelineStages()
        if 'external_stages_path' in scp_config:
            stages.load_external_stages(scp_config['external_stages_path'])
        if 'external_stages_modules' in scp_config:
            for mod in scp_config['external_stages_modules']:
                stages.load_module_stages(mod)
        factory = PipelineFactory(stages)
        pipeline = factory(scp_config)

        work_unit.data['start_chunk_time'] = time.time()
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

        ## explicitly call cleanup, which is idempotent and might not
        ## get called by atexit if we are running under
        ## multiprocessing
        pipeline.cleanup()

def test_spinn3r_pipeline_prefetched(filename, urls, pipeline_config,
                                     output_file):
    """minimal end-to-end test, preloading data in the loader"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'use_prefetched': True,
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))

        key = 'test_file.bin'
        with open(filename, 'rb') as f:
            from_spinn3r_feed._prefetched[key] = f.read()
        work_unit = SimpleWorkUnit(key)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)
        del from_spinn3r_feed._prefetched[key]

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls

def make_config(overlay={}):
    config = yakonfig.merge.overlay_config(base_config, overlay)
    with yakonfig.defaulted_config([kvlayer, MiniScp], config=config):
        yield
        client = kvlayer.client()
        client.delete_namespace()

def make_config(overlay={}):
    config = yakonfig.merge.overlay_config(base_config, overlay)
    with yakonfig.defaulted_config([], config=config):
        yield

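# Both make_config() fixtures above lay test-specific settings over a shared
# base_config via yakonfig.merge.overlay_config(). A minimal sketch of that
# pattern, under the assumption that overlay_config() performs a recursive
# dictionary merge in which the overlay's values win (the exact semantics
# belong to yakonfig); base_config and overlay here are toy values:
import yakonfig.merge

base_config = {
    'kvlayer': {
        'storage_type': 'local',
        'app_name': 'demo',
        'namespace': 'demo_tests',
    },
}
overlay = {'kvlayer': {'namespace': 'other_tests'}}
merged = yakonfig.merge.overlay_config(base_config, overlay)
# only the overlaid key changes; sibling keys survive the merge
assert merged['kvlayer']['namespace'] == 'other_tests'
assert merged['kvlayer']['storage_type'] == 'local'
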
def worker(work_unit, max_sample=1000):
    '''Expects a coordinate WorkUnit for DragNet and runs the following
    steps:

    1. scans all dossiers at the *folder* level and assembles feature
       vectors for each folder -- see `make_feature`

    2. trains a multinomial naive Bayes classifier that treats each
       *folder* as a classifier target.

    3. samples the corpus by scanning up to `max_sample` items and
       applying the classifier to each item to get an approximate
       "size" of the folder

    4. bootstraps by treating those classifier predictions as truth
       data and extracts the learned features that are predictive as
       new query strings.

    5. puts the data in kvlayer for the webservice endpoint to return
       to the polling client -- see dossier.models.routes

    '''
    if 'config' not in work_unit.spec:
        raise coordinate.exceptions.ProgrammerError(
            'could not run dragnet without global config')

    web_conf = Config()
    unitconf = work_unit.spec['config']
    with yakonfig.defaulted_config([coordinate, kvlayer, dblogger, web_conf],
                                   config=unitconf):
        labels = []
        D = list()
        label2fid = dict()
        rejects = set()
        keepers = set()

        # 1. make a classifier target for each *folder*, ignoring
        # subfolder structure
        FT = Folders(web_conf.kvlclient)
        for idx, fid in enumerate(FT.folders()):
            label2fid[idx] = fid
            for sid in FT.subfolders(fid):
                for cid, subtopic_id in FT.items(fid, sid):
                    fc = web_conf.store.get(cid)
                    if not fc:
                        # nothing to featurize for this item
                        continue
                    # NB: first call to make_feature
                    feat, _rejects, _keepers = make_feature(fc)
                    D.append(feat)
                    labels.append(idx)
                    rejects.update(_rejects)
                    keepers.update(_keepers)
                    logger.info('fid=%r, observation: %r', fid, cid)

        # 2. Convert the StringCounters into an sklearn format and
        # train MultinomialNB
        logger.info('transforming...')
        v = DictVectorizer(sparse=False)
        X = v.fit_transform(D)
        logger.info('transform fit done.')
        labels = np.array(labels)

        # Fit the sklearn multinomial naive Bayes classifier
        clf = MultinomialNB()
        clf.fit(X, labels)
        logger.info('fit MultinomialNB')

        # 3. Scan the corpus up to max_sample, putting the items into
        # each target to get an approximate "size" of the folder
        counts = Counter()
        for cid, fc in islice(web_conf.store.scan(), max_sample):
            # build the same feature vector as the training process
            feat, _rejects, _keepers = make_feature(fc)
            X = v.transform([feat])
            # predict which folder it belongs in
            target = clf.predict(X[0])[0]
            # count the effective size of that folder in this sample
            counts[label2fid[target]] += 1
        logger.info('counts done')

        # 4. Bootstrap by treating those classifier predictions as
        # truth data and extract the learned features that are
        # predictive as new query strings.
        clusters = []
        for idx in sorted(set(labels)):
            logger.debug('considering cluster: %d', idx)
            try:
                all_features = v.inverse_transform(
                    clf.feature_log_prob_[idx])[0]
            except Exception:
                logger.warn('beyond edge on cluster %d', idx)
                continue
            words = Counter(all_features)
            ordered = sorted(words.items(), key=operator.itemgetter(1),
                             reverse=True)
            filtered = []
            for it in ordered:
                if is_bad_token(it[0]):
                    continue
                if is_username(it[0]):
                    logger.debug('%r is_username', it[0])
                #else:
                #    continue
                filtered.append(it)
                if len(filtered) > 100:
                    # hard cutoff
                    break

            # normalize cluster size exponentially
            biggest = exp(filtered[0][1])
            # rescale all by biggest
            filtered = [(key, int(round(counts[label2fid[idx]]
                                        * exp(w) / biggest)))
                        for key, w in filtered]

            # describe what we just figured out
            logger.info('%s --> %r', label2fid[idx],
                        ['%s: %d' % it for it in filtered[:10]])

            # build the JSON-serializable format for the DragNet UI
            # embedded inside SortingDesk
            cluster = []
            cluster.append({'caption': label2fid[idx],
                            'weight': counts[label2fid[idx]],
                            'folder_id': None,
                            })
            cluster += [{'caption': caption,
                         'weight': weight,
                         'folder_id': label2fid[idx]}
                        for caption, weight in filtered
                        if weight > 0]
            clusters.append(cluster)

        # 5. Put the data in kvlayer for the webservice endpoint to
        # return to the polling client
        web_conf.kvlclient.setup_namespace({'dragnet': (str,)})
        web_conf.kvlclient.put(
            'dragnet',
            (('dragnet',), json.dumps({'clusters': clusters})))
        return dict(counts)

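# Steps 2 and 3 of worker() compress to a small, self-contained pattern:
# vectorize dict-of-weight features with DictVectorizer, fit a multinomial
# naive Bayes model, then push new feature dicts through the same fitted
# vectorizer before predicting. A minimal sketch with toy data (the feature
# names and weights below are made up for illustration):
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB

train = [{'cat': 3, 'leopard': 1},      # folder 0
         {'rocket': 2, 'engine': 2}]    # folder 1
train_labels = [0, 1]

vec = DictVectorizer(sparse=False)
X_train = vec.fit_transform(train)      # rows align with `train`
model = MultinomialNB()
model.fit(X_train, train_labels)

# transform() reuses the fitted vocabulary, so a new item maps into the
# same column space before prediction
print(model.predict(vec.transform([{'engine': 1, 'rocket': 1}]))[0])  # -> 1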