Exemple #1
0
def test_multi_column_tfidf_vectorizer_one_column_zero_output_tokens(kwargs, output_shape):
    """Check the output shape when one column carries the same sentence in every row.

    The first column holds four distinct sentences while the second column
    repeats a single sentence, so this exercises the case where only one
    column breaks and a TF-IDF document-term matrix should still be returned.

    Args:
        kwargs: Keyword arguments forwarded to ``MultiColumnTfidfVectorizer``
            (supplied from outside this function).
        output_shape: Shape the transformed output is expected to have.
    """
    varied_sentences = [
        "Cats eat rats.",
        "Dogs chase cats.",
        "People like dogs.",
        "People hate rats.",
    ]
    repeated_sentence = "Rats are mammals."
    # One row per document: [varied sentence, repeated sentence].
    documents = np.array([[sentence, repeated_sentence] for sentence in varied_sentences])

    output = MultiColumnTfidfVectorizer(**kwargs).fit_transform(documents)
    assert output.shape == output_shape
Exemple #2
0
def test_multi_column_tfidf_vectorizer():
    """Check that the multi-column vectorizer matches per-column ``TfidfVectorizer`` output.

    Fits a default ``MultiColumnTfidfVectorizer`` on the module-level ``corpus``
    and compares the result with the horizontal concatenation of an independent
    ``TfidfVectorizer`` fitted on column 0 and on column 1.
    """
    vec = MultiColumnTfidfVectorizer()
    output = vec.fit_transform(corpus)

    # Use the public ``coo_matrix`` name; the ``scipy.sparse.coo`` submodule is
    # a deprecated private namespace.
    assert isinstance(output, sp.coo_matrix)

    # ``toarray`` yields plain ndarrays instead of the discouraged ``np.matrix``.
    observed = output.toarray()
    expected = np.hstack(
        [
            TfidfVectorizer().fit_transform(corpus[:, 0]).toarray(),
            TfidfVectorizer().fit_transform(corpus[:, 1]).toarray(),
        ]
    )

    np.testing.assert_array_equal(observed, expected)
Exemple #3
0
def test_multi_column_tfidf_vectorizer_zero_output_tokens_ignore_zero_vocab_on(kwargs, data, shape):
    """Check the shape of the (empty) matrix produced when pruning leaves no terms.

    Args:
        kwargs: Keyword arguments forwarded to ``MultiColumnTfidfVectorizer``
            (supplied from outside this function).
        data: Input passed to ``fit_transform``.
        shape: Shape the transformed output is expected to have.
    """
    transformed = MultiColumnTfidfVectorizer(**kwargs).fit_transform(data)
    assert transformed.shape == shape
Exemple #4
0
def test_multi_column_tfidf_vectorizer_vocabulary_sizes_small():
    """Check that per-column vocabulary sizes smaller than the full vocabulary are honoured.

    For every column of the module-level ``corpus`` the requested size is one
    less than the number of features a default ``TfidfVectorizer`` produces for
    that column. The transformed output must have exactly as many columns as
    the requested sizes sum to, and must be sparse.
    """
    n_columns = corpus.shape[1]
    vocabulary_sizes = []
    for col_idx in range(n_columns):
        full_vocab_size = TfidfVectorizer().fit_transform(corpus[:, col_idx]).shape[1]
        # Ask for one term fewer than the column would produce unconstrained.
        vocabulary_sizes.append(full_vocab_size - 1)

    observed = MultiColumnTfidfVectorizer(vocabulary_sizes=vocabulary_sizes).fit_transform(corpus)

    assert observed.shape[1] == sum(vocabulary_sizes)
    assert sp.issparse(observed)
Exemple #5
0
def test_multi_column_tfidf_vectorizer_zero_output_tokens_ignore_zero_vocab_off(kwargs, data):
    """Check that ``fit_transform`` raises ``ValueError`` when pruning leaves no terms.

    The cases are expected to be ones where no terms remain after pruning and
    ``ignore_overpruned_columns=False``.

    Args:
        kwargs: Keyword arguments forwarded to ``MultiColumnTfidfVectorizer``
            (supplied from outside this function).
        data: Input passed to ``fit_transform``.
    """
    # Construct outside the ``pytest.raises`` block so only ``fit_transform``
    # may raise; a ValueError from construction would otherwise let the test
    # pass for the wrong reason.
    vec = MultiColumnTfidfVectorizer(**kwargs)
    with pytest.raises(ValueError):
        vec.fit_transform(data)