def test_baseline_use_all_features_with_signified_random(data, conf):
    """Check SignifiedOnlyFeatureHandler with k=1 when vectorising with ``dummy=True``.

    Thesaurus-based feature selection is switched off, so the vocabulary is
    expected to match ``full_vocab`` and the training matrix to match
    ``training_matrix``. The test document is expected to put its whole mass
    (11.0) into a single column.

    :param data: fixture passed straight through to ``_vectorize_data``
    :param conf: configuration fixture; modified in place by this test
    """
    conf['feature_selection']['must_be_in_thesaurus'] = False
    conf['vectorizer']['decode_token_handler'] = \
        'eval.pipeline.feature_handlers.SignifiedOnlyFeatureHandler'
    conf['vectorizer']['k'] = 1

    x1, x2, voc = _vectorize_data(data, conf, dummy=True)

    assert full_vocab == strip(voc)
    assert isinstance(x1, sp.spmatrix)
    t.assert_array_equal(x1.toarray(), training_matrix)
    t.assert_array_almost_equal(
        x2.toarray(),
        np.array([[0, 11.0, 0, 0, 0, 0]]),
    )

    # the thesaurus will always say the neighbour for something is
    # b/N with a similarity of 1, and we look up 11 tokens overall in
    # the test document
    _, x2, _ = _vectorize_data(data, conf, dummy=True)
    # This used to read `assert x2.sum(), 11.0`, which treats 11.0 as the
    # assertion *message* and therefore passes for any non-zero sum.
    assert np.isclose(x2.sum(), 11.0)
    # The mass must not be spread evenly over all columns.
    assert std(x2.todense()) > 0
def test_nondistributional_baseline_without_feature_selection(data, conf):
    """Vectorise with the unmodified ``conf`` and check the raw outputs.

    The vocabulary is expected to equal ``full_vocab``, the training matrix
    to equal ``training_matrix``, and the test document to yield the counts
    ``[4, 1, 2, 0, 0, 0]``.

    :param data: fixture passed straight through to ``_vectorize_data``
    :param conf: configuration fixture, used as-is
    """
    train_counts, test_counts, vocabulary = _vectorize_data(data, conf)

    assert full_vocab == strip(vocabulary)
    assert isinstance(train_counts, sp.spmatrix)

    t.assert_array_equal(train_counts.toarray(), training_matrix)

    dense_test_counts = test_counts.toarray()
    expected_test_counts = np.array([[4, 1, 2, 0, 0, 0]])
    t.assert_array_equal(dense_test_counts, expected_test_counts)
def test_baseline_ignore_nonthesaurus_features_with_signifier_signified(data, conf):
    """Check SignifierSignifiedFeatureHandler with k=1 and thesaurus-only features.

    With ``must_be_in_thesaurus`` enabled, the vocabulary is expected to
    equal ``pruned_vocab`` and the training matrix to equal
    ``pruned_training_matrix``. The test document is expected to produce
    approximately ``[4, 1, 2.1]``.

    :param data: fixture passed straight through to ``_vectorize_data``
    :param conf: configuration fixture; modified in place by this test
    """
    handler_path = (
        'eval.pipeline.feature_handlers.SignifierSignifiedFeatureHandler'
    )
    conf['feature_selection']['must_be_in_thesaurus'] = True
    conf['vectorizer']['decode_token_handler'] = handler_path
    conf['vectorizer']['k'] = 1

    train_counts, test_counts, vocabulary = _vectorize_data(data, conf)

    assert pruned_vocab == strip(vocabulary)
    assert isinstance(train_counts, sp.spmatrix)

    t.assert_array_equal(train_counts.toarray(), pruned_training_matrix)

    dense_test_counts = test_counts.toarray()
    expected_test_counts = np.array([[4, 1, 2.1]])
    t.assert_array_almost_equal(dense_test_counts, expected_test_counts)
def test_baseline_use_all_features_with_signified(data, conf):
    """Check SignifiedOnlyFeatureHandler with k=1 and no thesaurus filtering.

    With ``must_be_in_thesaurus`` disabled, the vocabulary is expected to
    equal ``full_vocab`` and the training matrix to equal
    ``training_matrix``. The test document is expected to produce
    approximately ``[0, 0, 0, 4.4, 0, 0]``.

    :param data: fixture passed straight through to ``_vectorize_data``
    :param conf: configuration fixture; modified in place by this test
    """
    handler_path = 'eval.pipeline.feature_handlers.SignifiedOnlyFeatureHandler'
    conf['feature_selection']['must_be_in_thesaurus'] = False
    conf['vectorizer']['decode_token_handler'] = handler_path
    conf['vectorizer']['k'] = 1  # equivalent to max

    train_counts, test_counts, vocabulary = _vectorize_data(data, conf)

    assert full_vocab == strip(vocabulary)
    assert isinstance(train_counts, sp.spmatrix)

    t.assert_array_equal(train_counts.toarray(), training_matrix)

    dense_test_counts = test_counts.toarray()
    expected_test_counts = np.array([[0, 0, 0, 4.4, 0, 0]])
    t.assert_array_almost_equal(dense_test_counts, expected_test_counts)