Exemple #1
0
def replace_config(config, name):
    """Try to load configured external stages for this component.

    Returns ``streamcorpus_pipeline`` unchanged when neither
    ``external_stages_path`` nor ``external_stages_modules`` is present
    in `config`, or when loading ``external_stages_path`` raises
    :exc:`IOError`.

    :param config: mapping holding this component's settings
    :param name: not used by the visible part of this function

    """
    # Do we have external stages?
    if ('external_stages_path' not in config
            and 'external_stages_modules' not in config):
        return streamcorpus_pipeline
    stages = PipelineStages()
    if 'external_stages_path' in config:
        path = config['external_stages_path']
        # A relative path is resolved against root_path when one is set.
        if not os.path.isabs(path) and config.get('root_path'):
            path = os.path.join(config['root_path'], path)
        try:
            # Load from the resolved path; the raw config value would
            # ignore the root_path handling just above.
            stages.load_external_stages(path)
        except IOError:
            return streamcorpus_pipeline  # let check_config re-raise this
    # NOTE(review): this excerpt ends here, so the function falls through
    # and returns None; 'external_stages_modules' handling and the final
    # return value are presumably in a part that is not shown -- confirm.
def replace_config(config, name):
    """Try to load configured external stages for this component.

    Returns ``streamcorpus_pipeline`` unchanged when neither
    ``external_stages_path`` nor ``external_stages_modules`` is present
    in `config`, or when loading ``external_stages_path`` raises
    :exc:`IOError`.

    :param config: mapping holding this component's settings
    :param name: not used by the visible part of this function

    """
    # Do we have external stages?
    has_external = ('external_stages_path' in config or
                    'external_stages_modules' in config)
    if not has_external:
        return streamcorpus_pipeline
    stages = PipelineStages()
    if 'external_stages_path' in config:
        path = config['external_stages_path']
        # A relative path is resolved against root_path when one is set.
        if not os.path.isabs(path) and config.get('root_path'):
            path = os.path.join(config['root_path'], path)
        try:
            # Load from the resolved path; the raw config value would
            # ignore the root_path handling just above.
            stages.load_external_stages(path)
        except IOError:
            return streamcorpus_pipeline  # let check_config re-raise this
    # NOTE(review): this excerpt ends here, so the function falls through
    # and returns None; 'external_stages_modules' handling and the final
    # return value are presumably in a part that is not shown -- confirm.
Exemple #3
0
def main():
    """Command-line entry point: run the pipeline over each input path.

    Input paths come from ``--in-glob`` patterns and/or ``-i`` arguments.
    A lone ``-i -`` reads one path per line from stdin instead; it cannot
    be combined with ``--in-glob`` or with other ``-i`` values.  Each
    path becomes a ``SimpleWorkUnit`` that is handed to the pipeline.

    Exits via :func:`sys.exit` with a message on conflicting input
    options.

    """
    import argparse
    parser = argparse.ArgumentParser(
        description='process a sequence of stream items',
        usage='streamcorpus_pipeline --config config.yaml --input file.in')
    parser.add_argument('-i', '--input', action='append',
                        help='file paths to input instead of reading from stdin')
    parser.add_argument('--in-glob', action='append', default=[], help='path glob specifying input files')
    parser.add_argument('--third-dir-path', help='path to third-party tools directory')
    parser.add_argument('--tmp-dir-path', help='path to temporary directory for scratch files, can be large')

    modules = [yakonfig, kvlayer, dblogger, streamcorpus_pipeline]
    args = yakonfig.parse_args(parser, modules)
    config = yakonfig.get_global_config()

    ## this modifies the global config, passed by reference
    instantiate_config(config)

    input_paths = []
    if args.in_glob:
        for pattern in args.in_glob:
            input_paths.extend(glob.glob(pattern))
    if args.input:
        if '-' in args.input:
            if args.in_glob:
                sys.exit('cannot use "-i -" and --in-glob together')
            if len(args.input) > 1:
                sys.exit('cannot use "-i -" with multiple inputs')
            ## iterate stdin lazily, one path per line
            input_paths = sys.stdin
        else:
            input_paths.extend(args.input)

    scp_config = config['streamcorpus_pipeline']
    stages = PipelineStages()
    if 'external_stages_path' in scp_config:
        stages.load_external_stages(scp_config['external_stages_path'])
    if 'external_stages_modules' in scp_config:
        for mod in scp_config['external_stages_modules']:
            stages.load_module_stages(mod)
    factory = PipelineFactory(stages)
    pipeline = factory(scp_config)

    try:
        for i_str in input_paths:
            ## strip() drops the trailing newline on lines read from stdin
            work_unit = SimpleWorkUnit(i_str.strip())
            work_unit.data['start_chunk_time'] = time.time()
            work_unit.data['start_count'] = 0
            pipeline._process_task(work_unit)
    finally:
        ## explicitly call cleanup, which is idempotent; doing it in
        ## ``finally`` means a failing task cannot skip it
        pipeline.cleanup()
Exemple #4
0
def check_config(config, name):
    """Validate the settings in `config`.

    :param config: mapping holding this component's settings
    :param name: label for this configuration block, used in the
      error messages
    :raises ConfigurationError: if ``tmp_dir_path`` is missing, or if
      loading ``external_stages_path`` raises :exc:`IOError`

    """
    if 'tmp_dir_path' not in config:
        raise ConfigurationError(
            '{} requires tmp_dir_path setting'.format(name))

    # Checking stages:
    stages = PipelineStages()

    # (1) Push in the external stages;
    if 'external_stages_path' in config:
        try:
            stages.load_external_stages(config['external_stages_path'])
        except IOError as e:
            # The original IOError is passed along as a second argument
            # so the underlying cause is not lost.
            raise ConfigurationError(
                'invalid {} external_stages_path {}'.format(
                    name, config['external_stages_path']), e)
def check_config(config, name):
    """Validate the settings in `config`.

    :param config: mapping holding this component's settings
    :param name: label for this configuration block, used in the
      error messages
    :raises ConfigurationError: if ``tmp_dir_path`` is missing, or if
      loading ``external_stages_path`` raises :exc:`IOError`

    """
    if 'tmp_dir_path' not in config:
        raise ConfigurationError('{} requires tmp_dir_path setting'
                                 .format(name))

    # Checking stages:
    stages = PipelineStages()

    # (1) Push in the external stages;
    if 'external_stages_path' in config:
        try:
            stages.load_external_stages(config['external_stages_path'])
        except IOError as e:
            # The original IOError is passed along as a second argument
            # so the underlying cause is not lost.
            raise ConfigurationError(
                'invalid {} external_stages_path {}'
                .format(name, config['external_stages_path']), e)
def rejester_run_function(work_unit):
    """Run a single work unit through a newly built pipeline.

    The global configuration is set up from ``work_unit.spec['config']``
    (an empty dict if absent), any configured external stages are
    loaded, and the work unit is handed to the pipeline.

    :param work_unit: object providing a ``spec`` mapping and a mutable
      ``data`` mapping; ``start_chunk_time`` and ``start_count`` are
      written into ``data`` before processing

    """
    with yakonfig.defaulted_config([dblogger, kvlayer, streamcorpus_pipeline],
                                   config=work_unit.spec.get('config', {})):
        scp_config = yakonfig.get_global_config('streamcorpus_pipeline')
        stages = PipelineStages()
        if 'external_stages_path' in scp_config:
            stages.load_external_stages(scp_config['external_stages_path'])
        if 'external_stages_modules' in scp_config:
            for mod in scp_config['external_stages_modules']:
                stages.load_module_stages(mod)
        factory = PipelineFactory(stages)
        pipeline = factory(scp_config)

        work_unit.data['start_chunk_time'] = time.time()
        work_unit.data['start_count'] = 0
        try:
            pipeline._process_task(work_unit)
        finally:
            ## explicitly release the pipeline's resources even when
            ## the task fails, rather than relying on interpreter exit
            pipeline.cleanup()
def rejester_run_function(work_unit):
    """Run a single work unit through a newly built pipeline.

    The global configuration is set up from ``work_unit.spec['config']``
    (an empty dict if absent), any configured external stages are
    loaded, and the work unit is handed to the pipeline.

    :param work_unit: object providing a ``spec`` mapping and a mutable
      ``data`` mapping; ``start_chunk_time`` and ``start_count`` are
      written into ``data`` before processing

    """
    with yakonfig.defaulted_config([kvlayer, streamcorpus_pipeline],
                                   config=work_unit.spec.get('config', {})):
        scp_config = yakonfig.get_global_config('streamcorpus_pipeline')
        stages = PipelineStages()
        if 'external_stages_path' in scp_config:
            stages.load_external_stages(scp_config['external_stages_path'])
        if 'external_stages_modules' in scp_config:
            for mod in scp_config['external_stages_modules']:
                stages.load_module_stages(mod)
        factory = PipelineFactory(stages)
        pipeline = factory(scp_config)

        work_unit.data['start_chunk_time'] = time.time()
        work_unit.data['start_count'] = 0
        try:
            pipeline._process_task(work_unit)
        finally:
            ## explicitly call cleanup, which is idempotent and might not
            ## get called by atexit if we are running under
            ## multiprocessing; ``finally`` keeps a failing task from
            ## skipping it
            pipeline.cleanup()
def rejester_run_function(work_unit):
    """Run a single work unit through a newly built pipeline.

    The global configuration is set up from ``work_unit.spec['config']``
    (an empty dict if absent), any configured external stages are
    loaded, and the work unit is handed to the pipeline.

    :param work_unit: object providing a ``spec`` mapping and a mutable
      ``data`` mapping; ``start_chunk_time`` and ``start_count`` are
      written into ``data`` before processing

    """
    with yakonfig.defaulted_config([kvlayer, streamcorpus_pipeline],
                                   config=work_unit.spec.get('config', {})):
        scp_config = yakonfig.get_global_config('streamcorpus_pipeline')
        stages = PipelineStages()
        if 'external_stages_path' in scp_config:
            stages.load_external_stages(scp_config['external_stages_path'])
        if 'external_stages_modules' in scp_config:
            for mod in scp_config['external_stages_modules']:
                stages.load_module_stages(mod)
        factory = PipelineFactory(stages)
        pipeline = factory(scp_config)

        work_unit.data['start_chunk_time'] = time.time()
        work_unit.data['start_count'] = 0
        try:
            pipeline._process_task(work_unit)
        finally:
            ## explicitly call cleanup, which is idempotent and might not
            ## get called by atexit if we are running under
            ## multiprocessing; ``finally`` keeps a failing task from
            ## skipping it
            pipeline.cleanup()