Example #1
def test_check_toplevel():
    '''check_toplevel_config() should work in a check_config() implementation'''
    with yakonfig.defaulted_config([ConfigurableArgs(), Dependent],
                                   {'key': 'k'}) as config:
        assert sorted(iterkeys(config)) == ['config', 'dependent']
        assert config['config']['k'] == 'k'
    with pytest.raises(yakonfig.ConfigurationError):
        with yakonfig.defaulted_config([ConfigurableArgs(), Dependent],
                                       {'key': 'key'}):
            pass
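A minimal sketch, not taken from the real test fixtures, of the protocol that defaulted_config() relies on in the example above: any module or object exposing config_name and default_config can appear in the list. The class and key names below are illustrative only.

import yakonfig

class MinimalConfigurable(object):
    config_name = 'config'
    default_config = {'k': 'k'}

with yakonfig.defaulted_config([MinimalConfigurable()]) as config:
    # defaults are filled in when no parameters are supplied
    assert config['config']['k'] == 'k'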
Example #2
def test_normalize():
    with yakonfig.defaulted_config([Normalized]):
        assert yakonfig.get_global_config('normalized')['k'] == 'v'
    with yakonfig.defaulted_config([Normalized], { 'k': 'foo' }):
        assert yakonfig.get_global_config('normalized')['k'] == 'f'
    with yakonfig.defaulted_config([Normalized],
                                   yaml='''
normalized:
  k: zoom!
'''):
        assert yakonfig.get_global_config('normalized')['k'] == 'z'
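The assertions above suggest that Normalized truncates 'k' to its first character. A hypothetical reconstruction follows, assuming normalize_config() receives this module's own config block and may edit it in place; only the observed behaviour ('foo' -> 'f', 'zoom!' -> 'z') comes from the test itself.

class Normalized(object):
    config_name = 'normalized'
    default_config = {'k': 'v'}

    @staticmethod
    def normalize_config(config):
        # keep only the first character of 'k' (assumed hook signature)
        if 'k' in config:
            config['k'] = config['k'][:1]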
Example #3
def test_discovery():
    with yakonfig.defaulted_config([Discovers]):
        # discovered overwrites default
        assert yakonfig.get_global_config('discovers', 'a') == 'foo'
        # discovered provides missing value
        assert yakonfig.get_global_config('discovers', 'b') == 'bar'
        # undiscovered uses default value
        assert yakonfig.get_global_config('discovers', 'c') == 'three'
    with yakonfig.defaulted_config([Discovers], {'a': 'alpha'}):
        # command-line value overrules discovery
        assert yakonfig.get_global_config('discovers', 'a') == 'alpha'
        assert yakonfig.get_global_config('discovers', 'b') == 'bar'
        assert yakonfig.get_global_config('discovers', 'c') == 'three'
def test_pipeline(request, test_data_dir):
    filename = str(request.fspath.dirpath('test_dedup_chunk_counts.yaml'))
    with yakonfig.defaulted_config([streamcorpus_pipeline], filename=filename):
        ## config says read from stdin, so make that have what we want
        stdin = sys.stdin
        sys.stdin = StringIO(get_test_chunk_path(test_data_dir))

        ## run the pipeline
        stages = PipelineStages()
        pf = PipelineFactory(stages)
        p = pf(yakonfig.get_global_config('streamcorpus_pipeline'))

        from streamcorpus_pipeline.run import SimpleWorkUnit
        work_unit = SimpleWorkUnit('long string indicating source of text')
        work_unit.data['start_chunk_time'] = time.time()
        work_unit.data['start_count'] = 0
        g = gevent.spawn(p._process_task, work_unit)

        gevent.sleep(5)

        with pytest.raises(SystemExit):  # pylint: disable=E1101
            p.shutdown(sig=signal.SIGTERM)

        logger.debug('now joining...')
        timeout = gevent.Timeout(1)
        g.join(timeout=timeout)
Example #5
def main():
    p = argparse.ArgumentParser('simple debugging tool for watching the linker and OpenQuery')
    p.add_argument('action', help='either `run` or `cache` or `delete`')
    p.add_argument('folder', help='folder name')
    p.add_argument('subfolder', help='subfolder name')
    args = yakonfig.parse_args(p, [kvlayer, yakonfig])

    config = yakonfig.get_global_config()

    key = cbor.dumps((args.folder.replace(' ', '_'), args.subfolder.replace(' ', '_')))

    if args.action == 'run':
        web_conf = Config()
        with yakonfig.defaulted_config([kvlayer, dblogger, web_conf], config=config):
            traverse_extract_fetch(web_conf, key, stop_after_extraction=True)

    elif args.action == 'delete':
        kvlclient = kvlayer.client()
        kvlclient.setup_namespace({'openquery': (str,)})
        kvlclient.delete('openquery', (key,))
        print('deleted %r' % key)

    elif args.action == 'cache':

        kvlclient = kvlayer.client()
        kvlclient.setup_namespace({'openquery': (str,)})
        count = 0
        for rec in kvlclient.scan('openquery'):
            count += 1
            if rec[0][0] == key:
                print(rec)
        print('%d cached queries' % count)
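For reference, a made-up example of the kvlayer settings a tool like the one above needs at runtime; compare the redis parameters in Examples #8 and #9. Addresses and namespace are placeholders, not values from the source.

example_kvlayer_config = {
    'kvlayer': {
        'storage_type': 'redis',
        'storage_addresses': ['redis.example.com:6379'],  # placeholder
        'app_name': 'diffeo',
        'namespace': 'openquery_debug',                    # placeholder
    },
}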
Example #6
def config_local(elastic_address, namespace_string):
    config = {
        'kvlayer': {
            'storage_type': 'local',
            'app_name': 'diffeo',
            'namespace': 'memex_dossier.web.tests',
        },
        'memex_dossier.store': {
            'hosts': [elastic_address],
            'namespace': 'tests',
            'type': namespace_string,
            'shards': 1,
            'replicas': 0,
            'feature_indexes': [{
                'foo': {
                    'feature_names': ['foo'],
                    'es_index_type': 'string',
                },
            }, {
                'bar': {
                    'feature_names': ['bar'],
                    'es_index_type': 'string',
                },
            }],
        },
    }
    modules = [kvlayer]
    with yakonfig.defaulted_config(modules, config=config) as config:
        yield config
Example #7
def highlights_worker(work_unit):
    '''coordinate worker wrapper around :func:`maybe_create_highlights`
    '''
    if 'config' not in work_unit.spec:
        raise coordinate.exceptions.ProgrammerError(
            'could not run `create_highlights` without global config')

    web_conf = Config()
    unitconf = work_unit.spec['config']
    with yakonfig.defaulted_config([coordinate, kvlayer, dblogger, web_conf],
                                   config=unitconf):
        file_id = make_file_id(work_unit.key)
        web_conf.kvlclient.setup_namespace(highlights_kvlayer_tables)
        payload_strs = list(web_conf.kvlclient.get('files', file_id))
        if payload_strs and payload_strs[0][1]:
            payload_str = payload_strs[0][1]
            try:
                data = json.loads(payload_str)
                # now create the response payload
                maybe_store_highlights(file_id, data, web_conf.tfidf,
                                       web_conf.kvlclient)
            except Exception:
                logger.critical('failed to decode data out of %r',
                                payload_str,
                                exc_info=True)
                payload = {
                    'state': ERROR,
                    'error': {
                        'code': 7,
                        'message': 'failed to generate stored results:\n%s'
                        % traceback.format_exc(),
                    },
                }
                payload_str = json.dumps(payload)
                # the client hangs off web_conf; a bare kvlclient is undefined here
                web_conf.kvlclient.put('highlights', (file_id, payload_str))
Example #8
def client(backend, request, tmpdir, namespace_string):
    config_path = str(request.fspath.dirpath('config_{0}.yaml'.format(backend)))
    statsfile = StringIO.StringIO()
    params = dict(
        app_name='kvlayer',
        namespace=namespace_string,
        log_stats=statsfile,
        log_stats_interval_ops=1,
        blagh='hoo haa',
    )

    # this is hacky but must go somewhere
    if backend == 'filestorage':
        local = tmpdir.join('local')
        with local.open('w') as f: pass
        params['kvlayer_filename'] = str(local)

    if backend == 'redis':
        params['storage_addresses'] = [ redis_address(request) ]

    with yakonfig.defaulted_config([kvlayer], filename=config_path,
                                   params=params):
        client = kvlayer.client()
        client.delete_namespace()
        yield client
        client.delete_namespace()
Example #9
def run_all_perftests(redis_address=None, clientlist=None):
    '''Run all of the performance tests, on every backend.

    This is intended to be a fully-automated answer for
    :program:`kvlayer_test`.

    '''
    rc = 0
    for name in STORAGE_CLIENTS.iterkeys():
        if clientlist is not None:
            if name not in clientlist:
                logger.info('skipping backend %r', name)
                continue
        elif name in ['cassandra', 'accumulo', 'postgres', 'cborproxy']:
            logger.info('skipping backend %r', name)
            continue
        config = os.path.join(os.path.dirname(__file__),
                              'config_{0}.yaml'.format(name))
        if not os.path.exists(config):
            continue
        params = {'app_name': 'kvlayer_performance',
                  'namespace': 'ns' + uuid.uuid1().hex}
        if name == 'filestorage':
            params['kvlayer_filename'] = os.tmpnam()
        if name == 'redis' and redis_address is not None:
            params['storage_addresses'] = [redis_address]
        with yakonfig.defaulted_config(
                [kvlayer], filename=config, params=params):
            ret = run_perftests()
            if rc == 0:
                rc = ret
        if name == 'filestorage':
            os.unlink(params['kvlayer_filename'])
    return rc
Example #10
    def __init__(self, storage_client=None, table_name="log",
                 storage_config=None):
        """Create a new database log handler.

        You must either pass in ``storage_client``, an actual kvlayer
        client object, or ``storage_config``, a dictionary which will
        be passed to ``kvlayer.client()``.  Log messages
        will be stored in the table ``table_name``.

        :param storage_client: existing storage client
        :type storage_client: :class:`kvlayer.AbstractStorage`
        :param str table_name: virtual table name
        :param dict storage_config: configuration for new storage client

        """
        super(DatabaseLogHandler, self).__init__()

        if storage_client is None:
            if storage_config is None:
                raise RuntimeError('must pass either storage_client or '
                                   'storage_config')
            with yakonfig.defaulted_config(
                    [kvlayer], config=dict(kvlayer=storage_config)):
                storage_client = kvlayer.client()

        self.storage = storage_client
        self.table_name = table_name
        storage_client.setup_namespace({table_name: 1})
        self.sequence_number = 0
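A hedged usage sketch for the handler above: per its docstring, either an existing kvlayer client or a storage_config dict will do, so the snippet below builds one from a config dict and attaches it to the standard logging machinery (assuming the handler is a logging.Handler subclass, as its name and super() call suggest). The import path and config values are assumptions.

import logging

from dblogger import DatabaseLogHandler  # assumed import path

handler = DatabaseLogHandler(storage_config={
    'storage_type': 'local',   # in-memory backend, as in other examples here
    'app_name': 'diffeo',
    'namespace': 'dblogger.test',
})
logging.getLogger().addHandler(handler)
logging.getLogger(__name__).info('this record lands in the "log" table')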
Example #12
def config_local(elastic_address, namespace_string):
    config = {
        'kvlayer': {
            'storage_type': 'local',
            'app_name': 'diffeo',
            'namespace': 'dossier.web.tests',
        },
        'dossier.store': {
            'hosts': [elastic_address],
            'namespace': 'tests',
            'type': namespace_string,
            'shards': 1,
            'replicas': 0,
            'feature_indexes': [{
                'foo': {
                    'feature_names': ['foo'],
                    'es_index_type': 'string',
                },
            }, {
                'bar': {
                    'feature_names': ['bar'],
                    'es_index_type': 'string',
                },
            }],
        },
    }
    modules = [ElasticStoreSync, kvlayer]
    with yakonfig.defaulted_config(modules, config=config) as config:
        yield config
Example #13
def highlights_worker(work_unit):
    '''coordinate worker wrapper around :func:`maybe_create_highlights`
    '''
    if 'config' not in work_unit.spec:
        raise coordinate.exceptions.ProgrammerError(
            'could not run `create_highlights` without global config')

    web_conf = Config()
    unitconf = work_unit.spec['config']
    with yakonfig.defaulted_config([coordinate, kvlayer, dblogger, web_conf],
                                   config=unitconf):
        file_id = make_file_id(work_unit.key)
        web_conf.kvlclient.setup_namespace(highlights_kvlayer_tables)
        payload_strs = list(web_conf.kvlclient.get('files', file_id))
        if payload_strs and payload_strs[0][1]:
            payload_str = payload_strs[0][1]
            try:
                data = json.loads(payload_str)
                # now create the response payload
                maybe_store_highlights(file_id, data, web_conf.tfidf, web_conf.kvlclient)
            except Exception:
                logger.critical('failed to decode data out of %r',
                                payload_str, exc_info=True)
                payload = {
                    'state': ERROR,
                    'error': {
                        'code': 7,
                        'message': 'failed to generate stored results:\n%s'
                        % traceback.format_exc(),
                    },
                }
                payload_str = json.dumps(payload)
                # the client hangs off web_conf; a bare kvlclient is undefined here
                web_conf.kvlclient.put('highlights', (file_id, payload_str))
Example #14
def xconfig(jobqueue_conf):
    with yakonfig.defaulted_config([coordinate], config={
            'coordinate': {
                'job_queue': jobqueue_conf
            }
    }) as config:
        yield config
Example #15
def main():
    p = argparse.ArgumentParser(
        'simple debugging tool for watching the linker and OpenQuery')
    p.add_argument('action', help='either `run` or `cache` or `delete`')
    p.add_argument('folder', help='folder name')
    p.add_argument('subfolder', help='subfolder name')
    args = yakonfig.parse_args(p, [kvlayer, yakonfig])

    config = yakonfig.get_global_config()

    key = cbor.dumps(
        (args.folder.replace(' ', '_'), args.subfolder.replace(' ', '_')))

    if args.action == 'run':
        web_conf = Config()
        with yakonfig.defaulted_config([kvlayer, dblogger, web_conf],
                                       config=config):
            traverse_extract_fetch(web_conf, key, stop_after_extraction=True)

    elif args.action == 'delete':
        kvlclient = kvlayer.client()
        kvlclient.setup_namespace({'openquery': (str, )})
        kvlclient.delete('openquery', (key, ))
        print('deleted %r' % key)

    elif args.action == 'cache':

        kvlclient = kvlayer.client()
        kvlclient.setup_namespace({'openquery': (str, )})
        count = 0
        for rec in kvlclient.scan('openquery'):
            count += 1
            if rec[0][0] == key:
                print(rec)
        print('%d cached queries' % count)
def rejester_run(work_unit):
    '''Rejester entry point to run the elasticsearch load.

    This uses the work unit key as the input filename string for the
    reader specified in the work unit.  If the work unit data
    includes the key ``output`` then that value is passed as the matching
    output filename string.

    :param work_unit: work unit to run
    :type work_unit: :class:`rejester.WorkUnit`

    '''
    if 'config' not in work_unit.spec:
        raise rejester.exceptions.ProgrammerError(
            'could not run without global config')

    with yakonfig.defaulted_config([rejester, kvlayer, dblogger],
                                   config=work_unit.spec['config']):

        ## Setup elasticsearch client
        ## http://elasticsearch-py.readthedocs.org/en/master/api.html#elasticsearch
        es = elasticsearch.Elasticsearch(work_unit.spec['config']['elasticsearch']['cluster'])

        ## Setup kvlayer client
        kvl = kvlayer.client()
        kvl.setup_namespace({'stream_items': 2})

        ## Get data associate with work_unit
        key, data = kvl.get('stream_items', work_unit.key).next()

        ## Index an individual stream_item
        elasticsearch_loader.index_stream_item(es, kvl, data)
Example #17
def test_config_from_env(redis_address, monkeypatch, namespace_string):
    host, port = redis_address.split(':', 2)
    monkeypatch.setenv('REDIS_PORT_6379_TCP_ADDR', host)
    monkeypatch.setenv('REDIS_PORT_6379_TCP_PORT', port)
    with yakonfig.defaulted_config(
            [rejester],
            config={'rejester': {'namespace': namespace_string}}):
        pass  # should not throw a configuration error
Example #18
def test_replaces_proxy():
    with yakonfig.defaulted_config([ConfigurableLikeTop]):
        c = yakonfig.get_global_config()
        assert sorted(iterkeys(c)) == ['top']
        c = c['top']
        assert 'bottom' in c
        c = c['bottom']
        assert c['zzz'] == '-32768'
Example #19
def actions(namespace_string):
    with yakonfig.defaulted_config(
        [kvlayer],
        params={"app_name": "diffeo", "namespace": namespace_string, "storage_type": "local", "storage_addresses": []},
    ):
        a = Actions(stdout=StringIO())
        yield a
        a.client.delete_namespace()
def client(namespace_string, request):
    config_path = str(request.fspath.dirpath('config_cassandra.yaml'))
    with yakonfig.defaulted_config([kvlayer], filename=config_path,
                                   params={'namespace': namespace_string,
                                           'app_name': 'kvltest'}):
        client = kvlayer.client()
        yield client
        client.delete_namespace()
Example #21
def test_check_dependent():
    # give an invalid configuration for args; but it's not the "real"
    # application so it shouldn't be checked
    with yakonfig.defaulted_config([ConfigurableArgs(), ConfigurableLike],
                                   {'key': 'value'}) as config:
        assert sorted(iterkeys(config)) == ['config', 'configurable']
        with pytest.raises(yakonfig.ConfigurationError):
            yakonfig.check_toplevel_config(ConfigurableArgs(), 'test')
        with pytest.raises(yakonfig.ConfigurationError):
            yakonfig.check_toplevel_config(ConfigurableTop, 'test')
def test_dedup_chunk_counts(request, test_data_dir, tmpdir):
    filename = str(request.fspath.dirpath('test_dedup_chunk_counts.yaml'))
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   filename=filename,
                                   config={'tmp_dir_path': str(tmpdir)}
    ) as config:
        ## run the pipeline
        pf = PipelineFactory(PipelineStages())
        p = pf(config['streamcorpus_pipeline'])
        p.run(get_test_chunk_path(test_data_dir))
Example #23
def kvl():
    config = {
        'storage_type': 'local',
        'app_name': 'diffeo',
        'namespace': 'dossier.store.test',
    }
    with yakonfig.defaulted_config([kvlayer], params=config) as config:
        client = kvlayer.client()
        yield client
        client.delete_namespace()
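A sketch of how a yield fixture like kvl is typically consumed; the test below is not from the source, and the (key-tuple, value) layout simply mirrors the put/scan calls visible elsewhere on this page.

def test_put_and_scan(kvl):
    # one string key component per row, as in the (str,) tables above
    kvl.setup_namespace({'t': (str,)})
    kvl.put('t', (('k1',), 'value'))
    assert list(kvl.scan('t')) == [(('k1',), 'value')]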
Example #24
def test_dedup_chunk_counts(request, test_data_dir, tmpdir):
    filename = str(request.fspath.dirpath('test_dedup_chunk_counts.yaml'))
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   filename=filename,
                                   config={'tmp_dir_path':
                                           str(tmpdir)}) as config:
        ## run the pipeline
        pf = PipelineFactory(PipelineStages())
        p = pf(config['streamcorpus_pipeline'])
        p.run(get_test_chunk_path(test_data_dir))
Example #25
def local_storage():
    config_yaml = """
kvlayer:
    storage_type: local
    storage_addresses: []
    namespace: test
    app_name: test
"""
    with yakonfig.defaulted_config([kvlayer], yaml=config_yaml):
        local_storage = LocalStorage()
        yield local_storage
        local_storage.delete_namespace()
Example #26
def test_proxy_two_level():
    with yakonfig.defaulted_config(
            [yakonfig.ProxyConfigurable(ConfigurableTop)],
            { 'aaa': 'a', 'bbb': 'b', 'zzz': 'z' }):
        c = yakonfig.get_global_config()
        assert sorted(iterkeys(c)) == ['top']
        c = c['top']
        assert sorted(iterkeys(c)) == ['aaa', 'bottom']
        assert c['aaa'] == 'a'
        c = c['bottom']
        assert sorted(iterkeys(c)) == ['zzz']
        assert c['zzz'] == 'z'
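The ConfigurableTop and ConfigurableBottom fixtures are not shown on this page; a hypothetical reconstruction follows, assuming yakonfig's sub_modules and runtime_keys attributes behave as their names suggest (nesting the bottom block under the top one and routing the 'aaa' and 'zzz' parameters). The defaults are invented.

class ConfigurableBottom(object):
    config_name = 'bottom'
    default_config = {'zzz': 'default-z'}   # invented default
    runtime_keys = {'zzz': 'zzz'}           # assumed parameter routing

class ConfigurableTop(object):
    config_name = 'top'
    default_config = {'aaa': 'default-a'}   # invented default
    runtime_keys = {'aaa': 'aaa'}
    sub_modules = [ConfigurableBottom]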
def test_external_stage_default(tmpdir):
    with yakonfig.defaulted_config([streamcorpus_pipeline], config={
            'streamcorpus_pipeline': {
                'external_stages_path': __file__,
                'reader': 'from_local_chunks',
                'writers': ['to_local_chunks'],
                'tmp_dir_path': str(tmpdir),
            },
    }):
        config = yakonfig.get_global_config('streamcorpus_pipeline',
                                             'external_stage')
        stage = ExternalStage(config=config)
        assert stage.get_message() == 'default message'
def test_external_stage_unregistered(tmpdir):
    with yakonfig.defaulted_config([streamcorpus_pipeline], config={
            'streamcorpus_pipeline': {
                'tmp_dir_path': str(tmpdir),
                'external_stage': {
                    'message': 'configured message',
                },
            },
    }):
        config = yakonfig.get_global_config('streamcorpus_pipeline',
                                             'external_stage')
        stage = ExternalStage(config=config)
        assert stage.get_message() == 'configured message'
def test_spinn3r_pipeline(filename, urls, pipeline_config, output_file):
    """minimal end-to-end test, with a fixed pipeline"""
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        work_unit = SimpleWorkUnit(filename)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls
def test_spinn3r_pipeline_bogus_prefetched(filename, pipeline_config):
    """supply known-bad prefetched data"""
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        key = filename
        from_spinn3r_feed._prefetched[key] = 'bogus data, dude!'
        work_unit = SimpleWorkUnit(key)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        with pytest.raises(DecodeError):
            pipeline._process_task(work_unit)
Example #33
def direct(config_path, namespace_string):
    with yakonfig.defaulted_config(
        [kvlayer], filename=config_path, params={"app_name": "kvlayer", "namespace": namespace_string}
    ):
        config = yakonfig.get_global_config("kvlayer")
        conn = Accumulo(
            host="test-accumulo-1.diffeo.com", port=50096, user=config["username"], password=config["password"]
        )

        yield conn

        tables = conn.list_tables()
        for table in tables:
            if re.search(namespace_string, table):
                conn.delete_table(table)
Example #34
def worker(work_unit):
    '''Expects a WorkUnit from coordinated, obtains a config, and runs
    traverse_extract_fetch

    '''
    if 'config' not in work_unit.spec:
        raise coordinate.exceptions.ProgrammerError(
            'could not run extraction without global config')

    web_conf = Config()
    unitconf = work_unit.spec['config']
    #logger.info(unitconf)
    with yakonfig.defaulted_config([coordinate, kvlayer, dblogger, web_conf],
                                   config=unitconf):
        traverse_extract_fetch(web_conf, work_unit.key)
def test_external_stage_default(tmpdir):
    with yakonfig.defaulted_config(
        [streamcorpus_pipeline],
            config={
                'streamcorpus_pipeline': {
                    'external_stages_path': __file__,
                    'reader': 'from_local_chunks',
                    'writers': ['to_local_chunks'],
                    'tmp_dir_path': str(tmpdir),
                },
            }):
        config = yakonfig.get_global_config('streamcorpus_pipeline',
                                            'external_stage')
        stage = ExternalStage(config=config)
        assert stage.get_message() == 'default message'
def test_external_stage_unregistered(tmpdir):
    with yakonfig.defaulted_config(
        [streamcorpus_pipeline],
            config={
                'streamcorpus_pipeline': {
                    'tmp_dir_path': str(tmpdir),
                    'external_stage': {
                        'message': 'configured message',
                    },
                },
            }):
        config = yakonfig.get_global_config('streamcorpus_pipeline',
                                            'external_stage')
        stage = ExternalStage(config=config)
        assert stage.get_message() == 'configured message'
def test_spinn3r_pipeline_unprefetched(urls, pipeline_config):
    """minimal end-to-end test, missing prefetched data"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'use_prefetched': True
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        key = 'test_file.bin'
        work_unit = SimpleWorkUnit(key)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        with pytest.raises(ConfigurationError):
            pipeline._process_task(work_unit)
def rejester_run_function(work_unit):
    with yakonfig.defaulted_config([dblogger, kvlayer, streamcorpus_pipeline],
                                   config=work_unit.spec.get('config', {})):
        scp_config = yakonfig.get_global_config('streamcorpus_pipeline')
        stages = PipelineStages()
        if 'external_stages_path' in scp_config:
            stages.load_external_stages(scp_config['external_stages_path'])
        if 'external_stages_modules' in scp_config:
            for mod in scp_config['external_stages_modules']:
                stages.load_module_stages(mod)
        factory = PipelineFactory(stages)
        pipeline = factory(scp_config)

        work_unit.data['start_chunk_time'] = time.time()
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)
def test_external_module_registered(tmpdir):
    with yakonfig.defaulted_config([streamcorpus_pipeline], config={
            'streamcorpus_pipeline': {
                'external_stages_modules':
                    ['streamcorpus_pipeline.tests.test_pipeline'],
                'external_stage': {
                    'message': 'configured message',
                },
                'reader': 'from_local_chunks',
                'writers': ['to_local_chunks'],
                'tmp_dir_path': str(tmpdir),
            },
    }):
        config = yakonfig.get_global_config('streamcorpus_pipeline',
                                             'external_stage')
        stage = ExternalStage(config=config)
        assert stage.get_message() == 'configured message'
def test_spinn3r_pipeline_filter_no_matches(filename, pipeline_config,
                                            output_file):
    """set a publisher_type filter that matches nothing in the feed"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'publisher_type': 'MICROBLOG'
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        work_unit = SimpleWorkUnit(filename)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

        # no chunks means the output file won't actually get written
        assert not os.path.exists(output_file)
def test_spinn3r_pipeline_filter_matches(filename, urls, pipeline_config,
                                         output_file):
    """set a publisher_type filter that matches everything in the feed"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'publisher_type': 'WEBLOG'
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        work_unit = SimpleWorkUnit(filename)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls
def test_spinn3r_pipeline_ignore_prefetched(filename, urls, pipeline_config,
                                            output_file):
    """configuration explicitly ignores bad prefetched data"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'use_prefetched': False
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        key = filename
        from_spinn3r_feed._prefetched[key] = 'bogus data, dude!'
        work_unit = SimpleWorkUnit(key)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)
        del from_spinn3r_feed._prefetched[key]

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls
def rejester_run_function(work_unit):
    with yakonfig.defaulted_config([kvlayer, streamcorpus_pipeline],
                                   config=work_unit.spec.get('config', {})):
        scp_config = yakonfig.get_global_config('streamcorpus_pipeline')
        stages = PipelineStages()
        if 'external_stages_path' in scp_config:
            stages.load_external_stages(scp_config['external_stages_path'])
        if 'external_stages_modules' in scp_config:
            for mod in scp_config['external_stages_modules']:
                stages.load_module_stages(mod)
        factory = PipelineFactory(stages)
        pipeline = factory(scp_config)

        work_unit.data['start_chunk_time'] = time.time()
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

        ## explicitly call cleanup, which is idempotent and might not
        ## get called by atexit if we are running under
        ## multiprocessing
        pipeline.cleanup()
def test_external_module_registered(tmpdir):
    with yakonfig.defaulted_config(
        [streamcorpus_pipeline],
            config={
                'streamcorpus_pipeline': {
                    'external_stages_modules': [
                        'streamcorpus_pipeline.tests.test_pipeline'],
                    'external_stage': {
                        'message': 'configured message',
                    },
                    'reader': 'from_local_chunks',
                    'writers': ['to_local_chunks'],
                    'tmp_dir_path': str(tmpdir),
                },
            }):
        config = yakonfig.get_global_config('streamcorpus_pipeline',
                                            'external_stage')
        stage = ExternalStage(config=config)
        assert stage.get_message() == 'configured message'
def test_spinn3r_pipeline_prefetched(filename, urls, pipeline_config,
                                     output_file):
    """minimal end-to-end test, preloading data in the loader"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'use_prefetched': True
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        key = 'test_file.bin'
        with open(filename, 'rb') as f:
            from_spinn3r_feed._prefetched[key] = f.read()
        work_unit = SimpleWorkUnit(key)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)
        del from_spinn3r_feed._prefetched[key]

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls
Example #47
def make_config(overlay={}):
    config = yakonfig.merge.overlay_config(base_config, overlay)
    with yakonfig.defaulted_config([kvlayer, MiniScp], config=config):
        yield
        client = kvlayer.client()
        client.delete_namespace()
Example #48
def make_config(overlay={}):
    config = yakonfig.merge.overlay_config(base_config, overlay)
    with yakonfig.defaulted_config([], config=config):
        yield
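A hedged illustration of yakonfig.merge.overlay_config() as the two fixtures above appear to use it, assuming a recursive merge in which overlay values win; base_config here is made up.

import yakonfig.merge

base_config = {'kvlayer': {'storage_type': 'local', 'namespace': 'base'}}
merged = yakonfig.merge.overlay_config(
    base_config, {'kvlayer': {'namespace': 'overlaid'}})
assert merged == {'kvlayer': {'storage_type': 'local',
                              'namespace': 'overlaid'}}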
Example #49
def worker(work_unit, max_sample=1000):
    '''Expects a coordinate WorkUnit for DragNet and runs the following
    steps:

    1. scans all dossiers at the *folder* level and assembles feature
    vectors for each folder -- see `make_feature`

    2. trains a multinomial naive Bayes classifier that treats each
    *folder* as a classifier target.

    3. sample the corpus by scanning up to `max_sample` and applying
    the classifier to each item to get an approx "size" of the Folder

    4. Bootstrap by treating those classifier predictions as truth
    data and extract the learned features that are predictive as new
    query strings.

    5. Put the data in kvlayer for webservice end point to return to
    polling client -- see dossier.models.routes

    '''
    if 'config' not in work_unit.spec:
        raise coordinate.exceptions.ProgrammerError(
            'could not run dragnet without global config')

    web_conf = Config()
    unitconf = work_unit.spec['config']
    with yakonfig.defaulted_config([coordinate, kvlayer, dblogger, web_conf],
                                   config=unitconf):

        labels = []
        D = list()

        label2fid = dict()

        rejects = set()
        keepers = set()

        # 1. make a classifier target for each *folder*, ignoring
        # subfolder structure
        FT = Folders(web_conf.kvlclient)
        for idx, fid in enumerate(FT.folders()):
            label2fid[idx] = fid
            for sid in FT.subfolders(fid):
                for cid, subtopic_id in FT.items(fid, sid):
                    fc = web_conf.store.get(cid)
                    if fc:
                        # NB: first call to make_feature
                        feat, _rejects, _keepers = make_feature(fc)
                    else:
                        # ensure feat is defined even when the store has no
                        # feature collection for this item
                        feat = {}
                        _rejects = {}
                        _keepers = {}
                    D.append(feat)
                    labels.append(idx)
                    rejects.update(_rejects)
                    keepers.update(_keepers)
                    logger.info('fid=%r, observation: %r', fid, cid)

        # 2. Convert the StringCounters into an sklearn format and
        # train MultinomialNB
        logger.info('transforming...')
        v = DictVectorizer(sparse=False)
        X = v.fit_transform(D)
        logger.info('transform fit done.')

        labels = np.array(labels)

        # Fit the sklearn Bernoulli Naive Bayes classifer
        clf = MultinomialNB()
        clf.fit(X, labels)
        logger.info('fit MultinomialNB')

        # 3. Scan the corpus up to max_sample putting the items into
        # each target to get an approx "size" of the Folder
        counts = Counter()
        for cid, fc in islice(web_conf.store.scan(), max_sample):
            # build the same feature vector as the training process
            feat, _rejects, _keepers = make_feature(fc)
            X = v.transform([feat])
            # predict which folder it belongs in
            target = clf.predict(X[0])[0]
            # count the effective size of that folder in this sample
            counts[label2fid[target]] += 1

        logger.info('counts done')

        ## 4. Bootstrap by treating those classifier predictions as
        ## truth data and extract the learned features that are
        ## predictive as new query strings.
        clusters = []
        for idx in sorted(set(labels)):
            logger.debug('considering cluster: %d', idx)
            try:
                all_features = v.inverse_transform(clf.feature_log_prob_[idx])[0]
            except Exception:
                logger.warn('beyond edge on cluster %d', idx)
                continue
            words = Counter(all_features)
            ordered = sorted(words.items(),
                             key=operator.itemgetter(1), reverse=True)
            filtered = []
            for it in ordered:
                if is_bad_token(it[0]): continue

                if is_username(it[0]):
                    logger.debug('%r is_username', it[0])
                #else:
                #    continue
                filtered.append(it)
                if len(filtered) > 100: # hard cutoff
                    break

            # normalize cluster size exponentially
            biggest = exp(filtered[0][1])
            # rescale all by biggest
            filtered = [(key, int(round(counts[label2fid[idx]] * exp(w) / biggest))) for key, w in filtered]
            # describe what we just figured out
            logger.info('%s --> %r', label2fid[idx], ['%s: %d' % it for it in filtered[:10]])

            # return build the JSON-serializable format for the
            # DragNet UI embedded inside SortingDesk
            cluster = []
            cluster.append({'caption': label2fid[idx],
                            'weight': counts[label2fid[idx]],
                            'folder_id': None,
                            })
            cluster += [{'caption': caption, 'weight': weight, 'folder_id': label2fid[idx]} for caption, weight in filtered if weight > 0]
            clusters.append(cluster)

        # 5. Put the data in kvlayer for webservice end point to
        # return to polling client
        web_conf.kvlclient.setup_namespace({'dragnet': (str,)})
        web_conf.kvlclient.put('dragnet', (('dragnet',), json.dumps({'clusters': clusters})))
        return dict(counts)
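A tiny self-contained illustration of steps 2 and 3 from the docstring above, using the same sklearn calls (DictVectorizer on dict feature vectors, then MultinomialNB); the data is made up and unrelated to the dossier corpus.

import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB

D = [{'alpha': 3, 'beta': 1}, {'gamma': 2}]   # two tiny "folders"
labels = np.array([0, 1])
v = DictVectorizer(sparse=False)
X = v.fit_transform(D)
clf = MultinomialNB()
clf.fit(X, labels)
# classify a new item into one of the folders; 'alpha' points at folder 0
print(clf.predict(v.transform([{'alpha': 1}]))[0])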