def test_pipeline(request, test_data_dir):
    filename = str(request.fspath.dirpath('test_dedup_chunk_counts.yaml'))
    with yakonfig.defaulted_config([streamcorpus_pipeline], filename=filename):
        ## config says read from stdin, so make that have what we want
        stdin = sys.stdin  # save the real stdin so it can be restored afterwards
        sys.stdin = StringIO(get_test_chunk_path(test_data_dir))

        ## run the pipeline
        stages = PipelineStages()
        pf = PipelineFactory(stages)
        p = pf(yakonfig.get_global_config('streamcorpus_pipeline'))

        from streamcorpus_pipeline.run import SimpleWorkUnit
        work_unit = SimpleWorkUnit('long string indicating source of text')
        work_unit.data['start_chunk_time'] = time.time()
        work_unit.data['start_count'] = 0
        g = gevent.spawn(p._process_task, work_unit)

        gevent.sleep(5)

        with pytest.raises(SystemExit):  # pylint: disable=E1101
            p.shutdown(sig=signal.SIGTERM)

        logger.debug('now joining...')
        timeout = gevent.Timeout(1)
        g.join(timeout=timeout)
        sys.stdin = stdin  # put the real stdin back
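
The manual save of `sys.stdin` above is only undone if the test runs to completion. A minimal sketch of the same swap using pytest's built-in `monkeypatch` fixture, which reverts the patch automatically even when an assertion fails part-way through (assuming the same `get_test_chunk_path` helper used by this test module):

import sys
from StringIO import StringIO  # io.StringIO on Python 3

def test_pipeline(monkeypatch, test_data_dir):
    # monkeypatch restores the real sys.stdin at teardown, no matter
    # how the test exits
    monkeypatch.setattr(sys, 'stdin',
                        StringIO(get_test_chunk_path(test_data_dir)))
    # ... rest of the test body unchanged ...
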
def test_pipeline(monkeypatch):
    def mockexit(status=0):
        log('sys.exit(%d)' % status)
        raise SuccessfulExit()
    monkeypatch.setattr(sys, 'exit', mockexit)
    path = os.path.dirname(__file__)
    with open(os.path.join(path, 'test_dedup_chunk_counts.yaml')) as f:
        config = yaml.safe_load(f)

    ## config says read from stdin, so make that have what we want
    stdin = sys.stdin  # save the real stdin so it can be restored afterwards
    sys.stdin = StringIO(get_test_chunk_path())

    ## run the pipeline
    p = Pipeline(config)

    from streamcorpus_pipeline.run import SimpleWorkUnit
    work_unit = SimpleWorkUnit('long string indicating source of text')
    work_unit.data['start_chunk_time'] = time.time()
    work_unit.data['start_count'] = 0
    g = gevent.spawn(p._process_task, work_unit)

    gevent.sleep(5)

    with pytest.raises(SuccessfulExit):  # pylint: disable=E1101
        p.shutdown(sig=signal.SIGTERM)

    log('now joining...')
    timeout = gevent.Timeout(1)
    g.join(timeout=timeout)
    sys.stdin = stdin  # put the real stdin back
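
This older variant leans on a `SuccessfulExit` exception and a `log` helper defined elsewhere in its test module. A hypothetical sketch of minimal definitions consistent with how they are used above:

class SuccessfulExit(Exception):
    """Raised by the mocked sys.exit so the test can assert a clean shutdown."""
    pass

def log(msg):
    # the real helper may route through the logging module instead
    print(msg)
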
def test_spinn3r_pipeline(filename, urls, pipeline_config, output_file):
    """minimal end-to-end test, with a fixed pipeline"""
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        work_unit = SimpleWorkUnit(filename)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls
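
The spinn3r tests in this listing all depend on `filename`, `urls`, `pipeline_config`, and `output_file` fixtures defined in the surrounding test module. A hypothetical sketch of what `pipeline_config` and `output_file` might provide; the stage names follow streamcorpus_pipeline conventions, but the exact keys here are assumptions, not the project's actual fixtures:

import os
import pytest

@pytest.fixture
def output_file(tmpdir):
    # where the to_local_chunks writer is pointed in this sketch
    return str(tmpdir.join('output.sc'))

@pytest.fixture
def pipeline_config(output_file, tmpdir):
    # minimal assumed config: read a spinn3r feed, write one local chunk file
    return {
        'streamcorpus_pipeline': {
            'tmp_dir_path': str(tmpdir.mkdir('tmp')),
            'reader': 'from_spinn3r_feed',
            'writers': ['to_local_chunks'],
            'to_local_chunks': {
                'output_type': 'otherdir',
                'output_path': os.path.dirname(output_file),
                'output_name': 'output',
            },
        },
    }
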
def test_spinn3r_pipeline_bogus_prefetched(filename, pipeline_config):
    """supply known-bad prefetched data"""
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        key = filename
        from_spinn3r_feed._prefetched[key] = 'bogus data, dude!'
        work_unit = SimpleWorkUnit(key)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        with pytest.raises(DecodeError):
            pipeline._process_task(work_unit)
def main():
    parser = argparse.ArgumentParser(
        conflict_handler='resolve',
        description='process entire directories using streamcorpus_pipeline')
    parser.add_argument('directories',
                        nargs='+',
                        metavar='directory',
                        help='directory name(s) to process')
    args = yakonfig.parse_args(parser, [
        yakonfig, rejester, kvlayer, dblogger, streamcorpus_pipeline,
        DirectoryConfig
    ])
    gconfig = yakonfig.get_global_config()
    scdconfig = gconfig['streamcorpus_directory']

    work_spec = {
        'name': scdconfig.get('name', 'streamcorpus_directory'),
        'desc': 'read files from a directory',
        'min_gb': 8,
        'config': gconfig,
        'module': 'streamcorpus_pipeline._rejester',
        'run_function': 'rejester_run_function',
        'terminate_function': 'rejester_terminate_function',
    }

    def get_filenames():
        for d in args.directories:
            if scdconfig['mode'] == 'files':
                yield d
            elif scdconfig['mode'] == 'file-lists':
                with open(d, 'r') as f:
                    for line in f:
                        yield line.strip()
            elif scdconfig['mode'] == 'directories':
                for dirpath, dirnames, filenames in os.walk(d):
                    for filename in filenames:
                        yield os.path.abspath(os.path.join(dirpath, filename))

    work_units = {filename: {'start_count': 0} for filename in get_filenames()}

    if scdconfig['engine'] == 'rejester':
        tm = rejester.TaskMaster(gconfig['rejester'])
        tm.update_bundle(work_spec, work_units)
    elif scdconfig['engine'] == 'standalone':
        for k, v in work_units.iteritems():
            u = SimpleWorkUnit(k)
            u.spec = work_spec
            u.data = v
            rejester_run_function(u)
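
main() drives everything from the streamcorpus_directory block of the global config; the only keys it actually consults are `mode`, `engine`, and the optional `name`. A hypothetical minimal block, expressed as the dict main() would see:

# everything else in the global config rides along inside work_spec['config']
streamcorpus_directory = {
    'name': 'my_ingest_job',   # work spec name; defaults to 'streamcorpus_directory'
    'mode': 'directories',     # or 'files', or 'file-lists'
    'engine': 'rejester',      # or 'standalone' to run each unit in-process
}
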
def test_spinn3r_pipeline_unprefetched(urls, pipeline_config):
    """minimal end-to-end test, missing prefetched data"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'use_prefetched': True
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        key = 'test_file.bin'
        work_unit = SimpleWorkUnit(key)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        with pytest.raises(ConfigurationError):
            pipeline._process_task(work_unit)
def test_spinn3r_pipeline_filter_matches(filename, urls, pipeline_config,
                                         output_file):
    """set a publisher_type filter that matches everything in the feed"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'publisher_type': 'WEBLOG'
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        work_unit = SimpleWorkUnit(filename)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls
def test_spinn3r_pipeline_filter_no_matches(filename, pipeline_config,
                                            output_file):
    """set a publisher_type filter that matches nothing in the feed"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'publisher_type': 'MICROBLOG'
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        work_unit = SimpleWorkUnit(filename)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

        # no chunks means the output file won't actually get written
        assert not os.path.exists(output_file)
def test_spinn3r_pipeline_ignore_prefetched(filename, urls, pipeline_config,
                                            output_file):
    """configuration explicitly ignores bad prefetched data"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'use_prefetched': False
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        key = filename
        from_spinn3r_feed._prefetched[key] = 'bogus data, dude!'
        work_unit = SimpleWorkUnit(key)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)
        del from_spinn3r_feed._prefetched[key]

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls
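
The `del from_spinn3r_feed._prefetched[key]` cleanup above is skipped if `_process_task` raises, leaving the bogus entry behind for later tests. A sketch of the same body with exception-safe cleanup (same names as the test above):

from_spinn3r_feed._prefetched[key] = 'bogus data, dude!'
try:
    pipeline._process_task(work_unit)
finally:
    # clear the module-level cache even if the pipeline fails
    del from_spinn3r_feed._prefetched[key]
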
def test_spinn3r_pipeline_prefetched(filename, urls, pipeline_config,
                                     output_file):
    """minimal end-to-end test, preloading data in the loader"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'use_prefetched': True
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        key = 'test_file.bin'
        with open(filename, 'rb') as f:
            from_spinn3r_feed._prefetched[key] = f.read()
        work_unit = SimpleWorkUnit(key)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)
        del from_spinn3r_feed._prefetched[key]

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls