Example 1
def test_nondistributional_baseline(conf):
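    # non-distributional baseline: no neighbour expansion, so no thesaurus is needed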
    conf['vectorizer']['decode_token_handler'] = base_handler
    conf['vector_sources']['neighbours_file'] = []
    conf['feature_selection']['must_be_in_thesaurus'] = False

    for debug_level in [0, 1, 2]:
        conf['debug_level'] = debug_level
        run_experiment(conf)
Example 2
def test_nondistributional_baseline_improperly_configured(conf):
    conf['vectorizer']['decode_token_handler'] = base_handler
    conf['vector_sources']['neighbours_file'] = []

    # we require features to be in the thesaurus, but do not provide one
    conf['feature_selection']['run'] = True
    conf['feature_selection']['must_be_in_thesaurus'] = True
    with pytest.raises(ValueError):
        run_experiment(conf)
Example 3
def test_nondistributional_baseline_test_on_training_data(conf):
    conf['vectorizer']['decode_token_handler'] = base_handler
    conf['vector_sources']['neighbours_file'] = []
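    # 'oracle' cross-validation evaluates on the training data itself, so a perfect score is expected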
    conf['crossvalidation']['type'] = 'oracle'
    conf['test_data'] = None

    run_experiment(conf)

    df = pd.read_csv(output_file, header=0)
    assert set(df.score) == {1.0}, 'Must achieve perfect accuracy'
Example 4
def test_distributional_with_vector_clusters(conf, tmpdir):
    # generate random vectors for the appropriate features and cluster them first
    x_tr, _, _, _ = get_tokenized_data(conf['training_data'], conf['tokenizer'])
    feats = FeatureExtractor().extract_features_from_tree_list([foo[0] for foo in x_tr])
    vectors = np.random.random((len(feats), 10))
    v = DenseVectors(pd.DataFrame(vectors, index=feats))
    tmpfile = str(tmpdir.join('tmp_random_vectors'))
    v.to_tsv(tmpfile, dense_hd5=True)

    tmpclusters = str(tmpdir.join('tmp_random_clusters'))
    cluster_vectors(tmpfile, tmpclusters, n_clusters=5, n_jobs=1)

    conf['vector_sources']['neighbours_file'] = []
    conf['vectorizer']['class'] = 'eval.pipeline.multivectors.KmeansVectorizer'
    conf['vector_sources']['clusters_file'] = tmpclusters
    # the features of the document are cluster ids, not phrases
    # no point in checking if they are in the thesaurus
    conf['feature_selection']['must_be_in_thesaurus'] = False

    for debug_level in [0, 1, 2]:
        conf['debug_level'] = debug_level
        run_experiment(conf)
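Note: `cluster_vectors` is a project-specific helper. As a rough illustration of what the clustering step produces, the hypothetical sketch below assigns each feature a k-means cluster id with scikit-learn and writes the feature-to-cluster mapping to a TSV file; the column names and file layout here are assumptions, not the project's actual format.

# Illustrative only: approximates the clustering step with scikit-learn's KMeans.
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

def cluster_random_vectors(feats, n_clusters=5, n_dims=10, outfile='clusters.tsv'):
    vectors = np.random.random((len(feats), n_dims))  # one random vector per feature
    labels = KMeans(n_clusters=n_clusters, random_state=0).fit_predict(vectors)
    mapping = pd.DataFrame({'feature': feats, 'cluster': labels})
    mapping.to_csv(outfile, sep='\t', index=False)  # feature -> cluster id on disk
    return mapping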
Example 5
def test_std_feature_expansion(conf):
    conf['vectorizer']['decode_token_handler'] = hybrid_handler
    run_experiment(conf)
Example 6
def test_extreme_feature_expansion(conf):
    conf['vectorizer']['decode_token_handler'] = extreme_handler
    run_experiment(conf)
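All of the examples above receive a `conf` fixture. Its exact contents are project-specific, but judging from the keys the tests touch it behaves like a nested dict of experiment settings. The sketch below is a minimal, hypothetical stand-in built only from those keys; the concrete values are assumptions, and the real fixture presumably loads a full experiment configuration from the project's files.

# Hypothetical sketch of the conf fixture these tests assume (values are placeholders).
import pytest

@pytest.fixture
def conf():
    return {
        'training_data': 'tests/resources/training_data',  # assumed path
        'test_data': 'tests/resources/test_data',           # assumed path
        'debug_level': 0,
        'tokenizer': {},
        'crossvalidation': {'type': 'kfold'},                # default type assumed
        'vectorizer': {'class': None, 'decode_token_handler': None},
        'vector_sources': {'neighbours_file': [], 'clusters_file': None},
        'feature_selection': {'run': False, 'must_be_in_thesaurus': False},
    }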