Example #1
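A unit test for get_pipeline_fit_args: it builds a minimal configuration pointing at two small vector files under tests/resources, then checks which entries end up in the returned vector_source depending on what entries_of refers to.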
def test_get_pipeline_fit_args():
    # imports are omitted from the original listing; get_pipeline_fit_args
    # presumably lives in the same eval.pipeline package referenced below
    conf = {
        'feature_selection': {
            'must_be_in_thesaurus': True
        },
        'vectorizer': {
            'decode_token_handler': 'eval.pipeline.feature_handlers.SignifiedOnlyFeatureHandler',
            'random_neighbour_thesaurus': False,
        },
        'feature_extraction': {
            'train_time_opts': {},
            'decode_time_opts': {}
        },
        'vector_sources': {
            'neighbours_file': ['tests/resources/twos.vectors.txt'],
            'entries_of': 'tests/resources/ones.vectors.txt',
            'clusters_file': '',
            'is_thesaurus': False,
            'dummy_thesaurus': False,
        }
    }

    # entries_of points at ones.vectors.txt here, so only its four
    # entries should survive in the combined vector source
    res = get_pipeline_fit_args(conf)
    v = res['vector_source']
    assert len(v) == 4
    assert set(v.keys()) == set('a/N b/V c/J d/N'.split())
    assert v.get_vector('a/N').A.ravel().tolist() == [1, 0, 0, 0]
    assert v.get_vector('c/J').A.ravel().tolist() == [0, 0, 1, 0]

    # point entries_of at the neighbours file itself: all eight entries
    # of twos.vectors.txt should now be retained
    conf['vector_sources']['entries_of'] = conf['vector_sources']['neighbours_file'][0]
    res = get_pipeline_fit_args(conf)
    v = res['vector_source']
    assert len(v) == 8
    assert set(v.keys()) == set('a/N b/V c/J d/N e/N f/V g/J h/N'.split())
    assert v.get_vector('a/N').A.ravel().tolist() == [1, 0, 0, 0]
    assert v.get_vector('c/J').A.ravel().tolist() == [0, 0, 1, 0]
    assert v.get_vector('g/J').A.ravel().tolist() == [0, 0, 1.1, 0]
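The two scenarios above pin down the semantics of the vector_sources settings: neighbour vectors are read from neighbours_file, and the entry set is restricted to whatever entries_of lists (the four entries of ones.vectors.txt in the first case, all eight entries of twos.vectors.txt in the second). A minimal sketch of that restriction, assuming a hypothetical read_vectors helper that maps each entry to its vector; the project's real loader is not part of this listing:

# Sketch only, not the project's actual loader.
def load_vector_source(neighbours_file, entries_of):
    neighbours = read_vectors(neighbours_file)  # e.g. 8 entries in twos.vectors.txt
    allowed = set(read_vectors(entries_of))     # e.g. 4 entries in ones.vectors.txt
    return {entry: vec for entry, vec in neighbours.items() if entry in allowed}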
Example #2
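Builds the preprocessing pipeline for one cross-validation fold (cv_i) and assembles the fit_args dictionary of per-step fit parameters, which scikit-learn later routes to the individual steps through its step__param naming convention.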
def _build_pipeline(conf, cv_i):
    """
    Builds a pipeline consisting of
        - feature extractor
        - optional feature selection
        - optional dimensionality reduction
        - classifier
    """
    exp_name = conf['name']
    debug_level = conf['debug_level']

    init_args = {}
    pipeline_list = []

    _build_vectorizer(init_args, conf, pipeline_list)

    _build_feature_selector(init_args, conf['feature_selection'], pipeline_list)

    # put the optional dumper after feature selection/dim. reduction
    if debug_level > 0:
        logging.info('Will perform post-vectorizer data dump')
        pipeline_list.append(('dumper', FeatureVectorsCsvDumper(exp_name, cv_i, conf['output_dir'])))

    # the vectorizer returns a matrix (as usual) plus some metadata used by
    # the feature dumper/selector; strip the metadata before the data
    # reaches the classifier
    pipeline_list.append(('stripper', MetadataStripper()))

    fit_args = {}
    if debug_level > 0:
        fit_args['vect__stats_hdf_file'] = 'statistics/stats-%s' % exp_name

    pipeline = PicklingPipeline(pipeline_list, exp_name) if debug_level > 1 else Pipeline(pipeline_list)
    shared_fit_args = get_pipeline_fit_args(conf)
    for step_name, _ in pipeline.steps:
        for param_name, param_val in shared_fit_args.items():
            fit_args['%s__%s' % (step_name, param_name)] = param_val
        fit_args['%s__cv_fold' % step_name] = cv_i
    fit_args['stripper__strategy'] = conf['vector_sources']['neighbour_strategy']
    # ask the vector source for more neighbours than will strictly be
    # needed (8x the configured k)
    fit_args['stripper__k'] = int(conf['vectorizer']['k'] * 8)
    pipeline.set_params(**init_args)
    logging.debug('Pipeline is:\n %s', pipeline)
    return pipeline, fit_args
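For context, a hedged sketch of how the returned pair is presumably consumed; X_train, y_train and the surrounding experiment loop are not part of this listing:

pipeline, fit_args = _build_pipeline(conf, cv_i=0)
# scikit-learn routes each 'step__param' entry in fit_args to the
# matching pipeline step's fit() method
pipeline.fit(X_train, y_train, **fit_args)
predictions = pipeline.predict(X_test)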