Code example #1
0
def test_baseline_use_all_features_with_signified_random(data, conf):
    """With the signified-only handler and a dummy (random) thesaurus,
    the full vocabulary is kept and every looked-up token contributes
    a similarity of 1, so the test-document feature mass sums to 11.
    """
    conf['feature_selection']['must_be_in_thesaurus'] = False
    conf['vectorizer']['decode_token_handler'] = \
        'eval.pipeline.feature_handlers.SignifiedOnlyFeatureHandler'
    conf['vectorizer']['k'] = 1

    x1, x2, voc = _vectorize_data(data, conf, dummy=True)

    # no feature selection -> vocabulary is unchanged
    assert full_vocab == strip(voc)

    assert isinstance(x1, sp.spmatrix)
    t.assert_array_equal(
        x1.toarray(),
        training_matrix
    )

    t.assert_array_almost_equal(
        x2.toarray(),
        np.array(
            [
                [0, 11.0, 0, 0, 0, 0],
            ]
        )
    )
    # the thesaurus will always say the neighbour for something is
    # b/N with a similarity of 1, and we look up 11 tokens overall in
    # the test document
    x1, x2, voc = _vectorize_data(data, conf, dummy=True)
    # BUG FIX: the original `assert x2.sum(), 11.0` parsed as
    # `assert <expr>, <message>` -- 11.0 was only the failure message,
    # so equality was never checked. Compare explicitly instead.
    t.assert_almost_equal(x2.sum(), 11.0)
    # the random handler should spread mass non-uniformly
    assert std(x2.todense()) > 0
Code example #2
0
def test_nondistributional_baseline_without_feature_selection(data, conf):
    """Without feature selection the vocabulary stays complete and the
    train/test matrices contain the raw (non-distributional) counts.
    """
    train_x, test_x, vocabulary = _vectorize_data(data, conf)

    # vocabulary must be untouched when no selection is applied
    assert strip(vocabulary) == full_vocab
    assert isinstance(train_x, sp.spmatrix)

    t.assert_array_equal(train_x.toarray(), training_matrix)

    expected_test_counts = np.array([[4, 1, 2, 0, 0, 0]])
    t.assert_array_equal(test_x.toarray(), expected_test_counts)
Code example #3
0
def test_baseline_ignore_nonthesaurus_features_with_signifier_signified(data, conf):
    """With thesaurus-based feature selection and the combined
    signifier+signified handler, the vocabulary and training matrix are
    pruned, and the signified contribution adds 0.1 to the test vector.
    """
    conf['feature_selection']['must_be_in_thesaurus'] = True
    conf['vectorizer']['decode_token_handler'] = \
        'eval.pipeline.feature_handlers.SignifierSignifiedFeatureHandler'
    conf['vectorizer']['k'] = 1

    train_x, test_x, vocabulary = _vectorize_data(data, conf)

    # features absent from the thesaurus must have been dropped
    assert strip(vocabulary) == pruned_vocab
    assert isinstance(train_x, sp.spmatrix)

    t.assert_array_equal(train_x.toarray(), pruned_training_matrix)

    expected_test_vector = np.array([[4, 1, 2.1]])
    t.assert_array_almost_equal(test_x.toarray(), expected_test_vector)
Code example #4
0
def test_baseline_use_all_features_with_signified(data, conf):
    """With the signified-only handler (no feature selection, k=1),
    the full vocabulary is kept and the test vector contains only the
    replacement-neighbour mass (4.4 in the fourth slot).
    """
    conf['feature_selection']['must_be_in_thesaurus'] = False
    conf['vectorizer']['decode_token_handler'] = \
        'eval.pipeline.feature_handlers.SignifiedOnlyFeatureHandler'
    conf['vectorizer']['k'] = 1  # equivalent to max

    train_x, test_x, vocabulary = _vectorize_data(data, conf)

    # no selection applied -> vocabulary unchanged
    assert strip(vocabulary) == full_vocab

    assert isinstance(train_x, sp.spmatrix)
    t.assert_array_equal(train_x.toarray(), training_matrix)

    expected_test_vector = np.array([[0, 0, 0, 4.4, 0, 0]])
    t.assert_array_almost_equal(test_x.toarray(), expected_test_vector)