def test_count_matrix_specials_indexes():
    specials = (UNK(), PAD())
    vocab = Vocab(specials=specials)
    for example in DATA:
        vocab += example.split(" ")
    vocab.finalize()

    count_vectorizer = CountVectorizer(vocab=vocab)
    count_vectorizer._init_special_indexes()

    assert len(count_vectorizer._special_indexes) == 2
    for special in specials:
        assert vocab.stoi[special] in count_vectorizer._special_indexes


def test_specials_indexes():
    specials = (UNK(), PAD())
    vocab = Vocab(specials=specials)
    for example in DATA:
        vocab += example.split(" ")
    vocab.finalize()

    tfidf = TfIdfVectorizer(vocab=vocab)
    tfidf._init_special_indexes()

    assert len(tfidf._special_indexes) == 2
    for special in specials:
        assert vocab.stoi[special] in tfidf._special_indexes


def test_build_count_matrix_custom_specials_vocab_with_specials():
    vocab = Vocab(specials=(UNK(), PAD()))
    vocab_words = ["this", "is", "the", "first", "document"]
    vocab += vocab_words
    vocab.finalize()

    tfidf = TfIdfVectorizer(vocab=vocab, specials=[PAD(), "this", "first"])
    tfidf._init_special_indexes()

    numericalized_data = get_numericalized_data(data=DATA, vocab=vocab)
    count_matrix = tfidf._build_count_matrix(
        data=numericalized_data, unpack_data=tfidf._get_tensor_values
    )
    expected = np.array([[0, 1, 1, 1], [1, 1, 1, 2], [3, 1, 1, 0], [0, 1, 1, 1]])
    assert np.all(count_matrix == expected)


def test_label_field():
    vocab = Vocab(specials=())
    labels = ["label_1", "label_2", "label_3"]
    vocab += labels
    vocab.finalize()

    label_field = LabelField("test_label_field", numericalizer=vocab)
    preprocessed = [label_field.preprocess(label) for label in labels]
    for example in preprocessed:
        # preprocess yields a tuple of (field_name, (raw, tokenized)) pairs
        _, raw_tokenized = example[0]
        _, tokenized = raw_tokenized
        assert label_field.numericalize(raw_tokenized) == vocab.stoi[tokenized]


def test_build_count_matrix_custom_specials_vocab_without_specials():
    vocab = Vocab(specials=())
    for example in DATA:
        vocab += example.split(" ")
    vocab.finalize()

    tfidf = TfIdfVectorizer(
        vocab=vocab, specials=["the", "first", "second", "one", "third", "and"]
    )
    tfidf._init_special_indexes()

    numericalized_data = get_numericalized_data(data=DATA, vocab=vocab)
    count_matrix = tfidf._build_count_matrix(
        data=numericalized_data, unpack_data=tfidf._get_tensor_values
    )
    expected = np.array([[1, 1, 1], [1, 1, 2], [1, 1, 0], [1, 1, 1]])
    assert np.all(count_matrix == expected)


def test_count_vectorizer_transform_tokens_tensor():
    vocab = Vocab(specials=())
    for example in DATA:
        vocab += example.split(" ")
    vocab.finalize()

    count_vectorizer = CountVectorizer(vocab=vocab)
    count_vectorizer.fit(dataset=None, field=None)

    numericalized_data = get_numericalized_data(data=DATA, vocab=vocab)
    bow = count_vectorizer.transform(numericalized_data).todense()
    expected = np.array(
        [
            [1, 1, 1, 1, 1, 0, 0, 0, 0],
            [1, 1, 1, 2, 0, 1, 0, 0, 0],
            [1, 1, 1, 0, 0, 0, 1, 1, 1],
            [1, 1, 1, 1, 1, 0, 0, 0, 0],
        ]
    )
    assert np.allclose(a=bow, b=expected, rtol=0, atol=1.0e-6)


def test_build_count_matrix_from_tensor_with_specials():
    vocab = Vocab(specials=(UNK(), PAD()))
    for example in DATA:
        vocab += example.split(" ")
    vocab.finalize()

    tfidf = TfIdfVectorizer(vocab=vocab)
    tfidf._init_special_indexes()

    numericalized_data = get_numericalized_data(data=DATA, vocab=vocab)
    count_matrix = tfidf._build_count_matrix(
        data=numericalized_data, unpack_data=tfidf._get_tensor_values
    )
    expected = np.array(
        [
            [1, 1, 1, 1, 1, 0, 0, 0, 0],
            [1, 1, 1, 2, 0, 1, 0, 0, 0],
            [1, 1, 1, 0, 0, 0, 1, 1, 1],
            [1, 1, 1, 1, 1, 0, 0, 0, 0],
        ]
    )
    assert np.all(count_matrix == expected)