Example #1
import scipy.sparse
from hypothesis import note  # presumably hypothesis's note(), used to log the vocab difference on failure

# WordVectorizer, test_text_example, and the parametrized fixtures come from
# the library under test and its conftest; they are not shown in this excerpt.

def test_wordvectorizer_basic(tokenizer, token_contractor, vectorizer,
                              normalize, dedupe_sentences, test_text_info):
    test_text, vocabulary = test_text_info
    model = WordVectorizer(
        tokenizer=tokenizer,
        token_contractor=token_contractor,
        vectorizer=vectorizer,
        normalize=normalize,
        dedupe_sentences=dedupe_sentences,
    )
    result = model.fit_transform(test_text)

    if test_text == test_text_example:
        if vectorizer in ["before", "after", "symmetric"]:
            assert result.shape == (7, 7)
        if vectorizer == "directional":
            assert result.shape == (7, 14)
    else:
        if token_contractor is None:
            # str.removeprefix (Python 3.9+) strips the whole marker; the original
            # lstrip("pre_") would instead strip any leading p/r/e/_ characters
            # and mangle tokens
            output_vocab = {
                x.removeprefix("pre_").removeprefix("post_")
                for x in model.column_label_dictionary_
            }
            lower_vocabulary = set([x.lower() for x in vocabulary] + [" "])
            note(output_vocab.difference(lower_vocabulary))
            assert result.shape[0] <= len(lower_vocabulary)
            # assert output_vocab.issubset(lower_vocabulary)
    # scipy.sparse.csr is a deprecated private module; use the public class
    assert isinstance(result, scipy.sparse.csr_matrix)
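
Reading the shape assertions: the "before", "after", and "symmetric" modes yield a square token-by-token matrix, while "directional" concatenates a pre_ block and a post_ block and so doubles the column count. Below is a minimal sketch of the same invariant on a toy corpus; the import path is an assumption (the tests never show it), not a confirmed API:

# Sketch only: WordVectorizer's import path below is assumed, not confirmed.
from textmap import WordVectorizer

corpus = ["foo bar baz", "bar baz foo"]
sym = WordVectorizer(vectorizer="symmetric").fit_transform(corpus)
dir_ = WordVectorizer(vectorizer="directional").fit_transform(corpus)
assert sym.shape[0] == sym.shape[1]        # square: token x token
assert dir_.shape[1] == 2 * dir_.shape[0]  # pre_ block + post_ block
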
Example #2
def test_wordvectorizer_todataframe(test_text_info):
    test_text, vocabulary = test_text_info
    model = WordVectorizer().fit(test_text)
    df = model.to_DataFrame()
    if test_text == test_text_example:
        assert df.shape == (7, 14)
    else:
        assert df.shape[0] <= len(vocabulary)
        assert df.shape[1] == df.shape[0] * 2
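
The DataFrame form keeps the same invariant: one row per retained token and two columns per token (the pre_/post_ split visible in Example #1's column labels). A hedged sketch, reusing the assumed import from above:

# Sketch only: import path assumed.
from textmap import WordVectorizer

model = WordVectorizer().fit(["foo bar baz", "bar baz foo"])
df = model.to_DataFrame()
assert df.shape[1] == 2 * df.shape[0]
print(list(df.columns)[:4])  # expected: pre_*/post_* style labels (inferred from Example #1)
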
Example #3
def test_multitokencooccurrencevectorizer():
    model = WordVectorizer(
        vectorizer=MultiTokenCooccurrenceVectorizer,
        vectorizer_kwds=_MULTITOKEN_COOCCURRENCE_VECTORIZERS["flat_1_5"]["kwds"],
    ).fit(test_text_example)
    assert model.representation_.shape == (7, 28)
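
In this test, vectorizer is a class (MultiTokenCooccurrenceVectorizer) rather than a string preset, and vectorizer_kwds supplies its keyword arguments from the _MULTITOKEN_COOCCURRENCE_VECTORIZERS registry. The asserted width also fits a simple pattern: 28 = 7 tokens x 4 column blocks, double the 2 blocks of the plain "flat" preset in Example #4 below. The arithmetic spelled out, with the interpretation flagged as an assumption:

# Worked shape arithmetic for the assertions in Examples #3 and #4 (7 tokens).
n_tokens = 7
assert 2 * n_tokens == 14  # "flat": one pre/post block pair per token
assert 4 * n_tokens == 28  # "flat_1_5": assumed two window scales x pre/post pair
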
Example #4
import scipy.sparse  # for the sparse-format assertion below

def test_wordvectorizer_basic(tokenizer, token_contractor, vectorizer,
                              normalize, dedupe_sentences):
    model = WordVectorizer(
        tokenizer=tokenizer,
        token_contractor=token_contractor,
        vectorizer=vectorizer,
        normalize=normalize,
        dedupe_sentences=dedupe_sentences,
    )
    result = model.fit_transform(test_text)

    if vectorizer == "flat":
        assert result.shape == (7, 14)
    if vectorizer == "flat_1_5":
        assert result.shape == (7, 28)
    assert isinstance(result, scipy.sparse.csr_matrix)
Example #5
import scipy.sparse  # for the sparse-format assertion below

def test_wordvectorizer_basic(tokenizer, token_contractor, vectorizer,
                              normalize, dedupe_sentences):
    model = WordVectorizer(
        tokenizer=tokenizer,
        token_contractor=token_contractor,
        vectorizer=vectorizer,
        normalize=normalize,
        dedupe_sentences=dedupe_sentences,
    )
    result = model.fit_transform(test_text)

    if vectorizer in ["before", "after", "symmetric"]:
        assert result.shape == (7, 7)
    if vectorizer == "directional":
        assert result.shape == (7, 14)
    assert isinstance(result, scipy.sparse.csr_matrix)
Example #6
def test_wordvectorizer_vocabulary(test_text_info):
    test_text, vocabulary = test_text_info
    if test_text == test_text_example:
        vocab = ["foo", "bar"]
    else:
        vocab = test_text[0].split()[:2]
    model = WordVectorizer(token_dictionary=vocab).fit(test_text)
    assert model.representation_.shape == (2, 4)
    assert model.token_dictionary == vocab
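
Passing token_dictionary pins the fitted vocabulary: two tokens give a 2-row representation with two directional columns per token, regardless of the corpus. A hedged sketch, again under the assumed import:

# Sketch only: import path assumed.
from textmap import WordVectorizer

model = WordVectorizer(token_dictionary=["foo", "bar"]).fit(
    ["foo bar foo baz", "bar foo bar"]
)
assert model.representation_.shape == (2, 4)  # 2 tokens x (pre_, post_) columns
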
Example #7
def test_wordvectorizer_todataframe():
    model = WordVectorizer().fit(test_text)
    df = model.to_DataFrame()
    assert df.shape == (7, 14)
Example #8
def test_wordvectorizer_vocabulary():
    model = WordVectorizer(token_dictionary=["foo", "bar"]).fit(test_text)
    assert model.representation_.shape == (2, 4)