def test_wordvectorizer_basic(
    tokenizer, token_contractor, vectorizer, normalize, dedupe_sentences, test_text_info
):
    test_text, vocabulary = test_text_info
    model = WordVectorizer(
        tokenizer=tokenizer,
        token_contractor=token_contractor,
        vectorizer=vectorizer,
        normalize=normalize,
        dedupe_sentences=dedupe_sentences,
    )
    result = model.fit_transform(test_text)
    if test_text == test_text_example:
        if vectorizer in ["before", "after", "symmetric"]:
            assert result.shape == (7, 7)
        if vectorizer == "directional":
            assert result.shape == (7, 14)
    else:
        if token_contractor is None:
            # Strip the directional "pre_"/"post_" prefixes from the column
            # labels to recover the underlying tokens.
            output_vocab = set(
                label[4:] if label.startswith("pre_")
                else label[5:] if label.startswith("post_")
                else label
                for label in model.column_label_dictionary_.keys()
            )
            lower_vocabulary = set([x.lower() for x in vocabulary] + [" "])
            note(output_vocab.difference(lower_vocabulary))
            assert result.shape[0] <= len(lower_vocabulary)
            # assert output_vocab.issubset(lower_vocabulary)
    assert isinstance(result, scipy.sparse.csr_matrix)

def test_wordvectorizer_todataframe(test_text_info):
    test_text, vocabulary = test_text_info
    model = WordVectorizer().fit(test_text)
    df = model.to_DataFrame()
    if test_text == test_text_example:
        assert df.shape == (7, 14)
    else:
        assert df.shape[0] <= len(vocabulary)
        assert df.shape[1] == df.shape[0] * 2

def test_multitokencooccurrencevectorizer():
    model = WordVectorizer(
        vectorizer=MultiTokenCooccurrenceVectorizer,
        vectorizer_kwds=_MULTITOKEN_COOCCURRENCE_VECTORIZERS["flat_1_5"]["kwds"],
    ).fit(test_text_example)
    assert model.representation_.shape == (7, 28)

def test_wordvectorizer_basic(
    tokenizer, token_contractor, vectorizer, normalize, dedupe_sentences
):
    model = WordVectorizer(
        tokenizer=tokenizer,
        token_contractor=token_contractor,
        vectorizer=vectorizer,
        normalize=normalize,
        dedupe_sentences=dedupe_sentences,
    )
    result = model.fit_transform(test_text)
    if vectorizer == "flat":
        assert result.shape == (7, 14)
    if vectorizer == "flat_1_5":
        assert result.shape == (7, 28)
    assert isinstance(result, scipy.sparse.csr_matrix)

def test_wordvectorizer_basic(
    tokenizer, token_contractor, vectorizer, normalize, dedupe_sentences
):
    model = WordVectorizer(
        tokenizer=tokenizer,
        token_contractor=token_contractor,
        vectorizer=vectorizer,
        normalize=normalize,
        dedupe_sentences=dedupe_sentences,
    )
    result = model.fit_transform(test_text)
    if vectorizer in ["before", "after", "symmetric"]:
        assert result.shape == (7, 7)
    if vectorizer == "directional":
        assert result.shape == (7, 14)
    assert isinstance(result, scipy.sparse.csr_matrix)

def test_wordvectorizer_vocabulary(test_text_info):
    test_text, vocabulary = test_text_info
    if test_text == test_text_example:
        vocab = ["foo", "bar"]
    else:
        vocab = test_text[0].split()[:2]
    model = WordVectorizer(token_dictionary=vocab).fit(test_text)
    assert model.representation_.shape == (2, 4)
    assert model.token_dictionary == vocab

def test_wordvectorizer_todataframe():
    model = WordVectorizer().fit(test_text)
    df = model.to_DataFrame()
    assert df.shape == (7, 14)

def test_wordvectorizer_vocabulary():
    model = WordVectorizer(token_dictionary=["foo", "bar"]).fit(test_text)
    assert model.representation_.shape == (2, 4)
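
def example_wordvectorizer_usage():
    # Illustrative sketch, not one of the original tests: a hedged example of
    # the WordVectorizer workflow exercised above, assuming only the API
    # surface these tests use (fit, to_DataFrame, representation_). The toy
    # corpus and variable names here are hypothetical.
    toy_corpus = [
        "the cat sat on the mat",
        "the dog sat on the log",
    ]
    model = WordVectorizer().fit(toy_corpus)
    frame = model.to_DataFrame()
    # With the default vectorizer the tests above expect twice as many
    # columns (pre_/post_ contexts) as vocabulary rows.
    assert frame.shape[1] == 2 * frame.shape[0]
    return frame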