def test_get_pipeline_fit_args():
    """
    Check that get_pipeline_fit_args builds the expected vector source.

    First pass: entries are restricted by a separate 'entries_of' file, so
    only the 4 entries shared with it survive. Second pass: 'entries_of'
    points at the neighbours file itself, so all 8 entries are kept.
    """
    conf = {
        'feature_selection': {'must_be_in_thesaurus': True},
        'vectorizer': {
            'decode_token_handler': 'eval.pipeline.feature_handlers.SignifiedOnlyFeatureHandler',
            'random_neighbour_thesaurus': False,
        },
        'feature_extraction': {'train_time_opts': {}, 'decode_time_opts': {}},
        'vector_sources': {
            'neighbours_file': ['tests/resources/twos.vectors.txt'],
            'entries_of': 'tests/resources/ones.vectors.txt',
            'clusters_file': '',
            'is_thesaurus': False,
            'dummy_thesaurus': False,
        },
    }

    res = get_pipeline_fit_args(conf)
    v = res['vector_source']
    assert len(v) == 4
    assert set(v.keys()) == set('a/N b/V c/J d/N'.split())
    assert v.get_vector('a/N').A.ravel().tolist() == [1, 0, 0, 0]
    assert v.get_vector('c/J').A.ravel().tolist() == [0, 0, 1, 0]

    # Point 'entries_of' at the neighbours file itself: no entries filtered.
    conf['vector_sources']['entries_of'] = conf['vector_sources']['neighbours_file'][0]
    res = get_pipeline_fit_args(conf)
    v = res['vector_source']
    assert len(v) == 8
    assert set(v.keys()) == set('a/N b/V c/J d/N e/N f/V g/J h/N'.split())
    assert v.get_vector('a/N').A.ravel().tolist() == [1, 0, 0, 0]
    assert v.get_vector('c/J').A.ravel().tolist() == [0, 0, 1, 0]
    assert v.get_vector('g/J').A.ravel().tolist() == [0, 0, 1.1, 0]
def _build_pipeline(conf, cv_i):
    """
    Assemble the classification pipeline for one cross-validation fold.

    The pipeline is: vectorizer -> (optional) feature selector ->
    (optional) debug CSV dumper -> metadata stripper. Returns the pipeline
    object together with the keyword arguments that must accompany its
    ``fit`` call.
    """
    exp_name = conf['name']
    debug_level = conf['debug_level']

    constructor_params = {}
    steps = []
    _build_vectorizer(constructor_params, conf, steps)
    _build_feature_selector(constructor_params, conf['feature_selection'], steps)

    # The optional dumper sits after feature selection/dim. reduction so it
    # records the final feature space.
    if debug_level > 0:
        logging.info('Will perform post-vectorizer data dump')
        steps.append(('dumper',
                      FeatureVectorsCsvDumper(exp_name, cv_i, conf['output_dir'])))

    # The vectorizer emits a matrix plus metadata intended for the feature
    # dumper/selector; strip that metadata before the classifier sees it.
    steps.append(('stripper', MetadataStripper()))

    fit_params = {}
    if debug_level > 0:
        fit_params['vect__stats_hdf_file'] = 'statistics/stats-%s' % exp_name

    # A pickling pipeline is only needed at the highest debug level.
    if debug_level > 1:
        pipeline = PicklingPipeline(steps, exp_name)
    else:
        pipeline = Pipeline(steps)

    # Fan the shared fit arguments out to every step, plus the fold index.
    shared = get_pipeline_fit_args(conf)
    for name, _ in pipeline.steps:
        for key, value in shared.items():
            fit_params['%s__%s' % (name, key)] = value
        fit_params['%s__cv_fold' % name] = cv_i

    fit_params['stripper__strategy'] = conf['vector_sources']['neighbour_strategy']
    # Ask the vector source for a few more neighbours than strictly required.
    fit_params['stripper__k'] = int(conf['vectorizer']['k'] * 8)

    pipeline.set_params(**constructor_params)
    logging.debug('Pipeline is:\n %s', pipeline)
    return pipeline, fit_params