Example #1
def test_count_matrix_specials_indexes():
    specials = (UNK(), PAD())
    vocab = Vocab(specials=specials)
    for document in DATA:
        vocab += document.split(" ")
    vocab.finalize()

    count_vectorizer = CountVectorizer(vocab=vocab)
    count_vectorizer._init_special_indexes()

    # Both special tokens should be registered as special indexes.
    assert len(count_vectorizer._special_indexes) == 2
    for special in specials:
        assert vocab.stoi[special] in count_vectorizer._special_indexes
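All of these excerpts rely on the same module-level fixtures, which the snippets omit. The sketch below is an assumption-laden reconstruction, not verbatim library code: the import paths follow TakeLab's podium library but may differ between versions, the DATA corpus is inferred from the expected count matrices in Examples #6 and #7 below, and get_numericalized_data is a hypothetical helper built on Vocab.numericalize.

import numpy as np

# Assumed import paths (podium); adjust to the installed version.
from podium import Vocab, LabelField
from podium.vocab import UNK, PAD
from podium.vectorizers import CountVectorizer, TfIdfVectorizer

# Corpus inferred from the expected count matrices below.
DATA = [
    "this is the first document",
    "this document is the second document",
    "and this is the third one",
    "is this the first document",
]


def get_numericalized_data(data, vocab):
    # Hypothetical helper: map each document to an array of token indexes.
    return [vocab.numericalize(document.split(" ")) for document in data]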
Example #2
def test_specials_indexes():
    specials = (UNK(), PAD())
    vocab = Vocab(specials=specials)
    for document in DATA:
        vocab += document.split(" ")
    vocab.finalize()

    tfidf = TfIdfVectorizer(vocab=vocab)
    tfidf._init_special_indexes()

    # TfIdfVectorizer registers special indexes the same way CountVectorizer does.
    assert len(tfidf._special_indexes) == 2
    for special in specials:
        assert vocab.stoi[special] in tfidf._special_indexes
Example #3
def test_build_count_matrix_custom_specials_vocab_with_specials():
    vocab = Vocab(specials=(UNK(), PAD()))
    vocab_words = ["this", "is", "the", "first", "document"]
    vocab += vocab_words
    vocab.finalize()
    # Custom specials override the vocab's own: PAD plus two ordinary words.
    tfidf = TfIdfVectorizer(vocab=vocab, specials=[PAD(), "this", "first"])
    tfidf._init_special_indexes()

    numericalized_data = get_numericalized_data(data=DATA, vocab=vocab)
    count_matrix = tfidf._build_count_matrix(
        data=numericalized_data, unpack_data=tfidf._get_tensor_values)
    expected = np.array([[0, 1, 1, 1], [1, 1, 1, 2], [3, 1, 1, 0],
                         [0, 1, 1, 1]])
    assert np.all(count_matrix == expected)
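Why four columns: the finalized vocab holds seven entries (UNK, PAD, "this", "is", "the", "first", "document"), and the custom specials mask the PAD, "this", and "first" columns, leaving UNK, "is", "the", and "document". Because the vocab was built from only five words, every out-of-vocabulary token in DATA ("second", "and", "third", "one") is numericalized to UNK, which is where the 3 in the third row comes from.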
Example #4
def test_label_field():
    vocab = Vocab(specials=())
    labels = ["label_1", "label_2", "label_3"]

    vocab += labels
    vocab.finalize()

    label_field = LabelField("test_label_field", numericalizer=vocab)

    preprocessed_data = [label_field.preprocess(label) for label in labels]

    for x in preprocessed_data:
        # preprocess yields (field_name, (raw, tokenized)) pairs.
        _, raw_tokenized = x[0]
        _, tokenized = raw_tokenized
        assert label_field.numericalize(raw_tokenized) == vocab.stoi[tokenized]
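For orientation: the unpacking above implies that a podium-style preprocess call returns a tuple of (field_name, (raw, tokenized)) pairs, and that for a LabelField with no tokenizer the tokenized value is simply the label string itself. That reading is inferred from the test's own structure, not from library documentation.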
Example #5
def test_build_count_matrix_custom_specials_vocab_without_specials():
    vocab = Vocab(specials=())
    for document in DATA:
        vocab += document.split(" ")
    vocab.finalize()
    # The vocab itself has no specials; six ordinary words are masked instead.
    tfidf = TfIdfVectorizer(
        vocab=vocab,
        specials=["the", "first", "second", "one", "third", "and"])
    tfidf._init_special_indexes()

    numericalized_data = get_numericalized_data(data=DATA, vocab=vocab)
    count_matrix = tfidf._build_count_matrix(
        data=numericalized_data, unpack_data=tfidf._get_tensor_values)
    expected = np.array([[1, 1, 1], [1, 1, 2], [1, 1, 0], [1, 1, 1]])
    assert np.all(count_matrix == expected)
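Only three columns survive here: masking "the", "first", "second", "one", "third", and "and" leaves "this", "is", and "document". The 2 in the second row counts the two occurrences of "document" in the second document of DATA.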
Example #6
def test_count_vectorizer_transform_tokens_tensor():
    vocab = Vocab(specials=())
    for document in DATA:
        vocab += document.split(" ")
    vocab.finalize()
    count_vectorizer = CountVectorizer(vocab=vocab)
    # The vocab already defines the feature space, so no dataset or field is needed.
    count_vectorizer.fit(dataset=None, field=None)

    numericalized_data = get_numericalized_data(data=DATA, vocab=vocab)
    bow = count_vectorizer.transform(numericalized_data).todense()
    expected = np.array([
        [1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 2, 0, 1, 0, 0, 0],
        [1, 1, 1, 0, 0, 0, 1, 1, 1],
        [1, 1, 1, 1, 1, 0, 0, 0, 0],
    ])
    assert np.allclose(a=bow, b=expected, rtol=0, atol=1.0e-6)
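The .todense() call indicates that transform returns a scipy.sparse matrix rather than a dense ndarray, so it is densified before comparison; with rtol=0 and a tiny atol, the np.allclose check amounts to an exact integer comparison.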
Example #7
def test_build_count_matrix_from_tensor_with_specials():
    vocab = Vocab(specials=(UNK(), PAD()))
    for document in DATA:
        vocab += document.split(" ")
    vocab.finalize()
    tfidf = TfIdfVectorizer(vocab=vocab)
    tfidf._init_special_indexes()

    numericalized_data = get_numericalized_data(data=DATA, vocab=vocab)
    count_matrix = tfidf._build_count_matrix(
        data=numericalized_data, unpack_data=tfidf._get_tensor_values)
    # One row per document; special-token columns are excluded.
    expected = np.array([
        [1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 2, 0, 1, 0, 0, 0],
        [1, 1, 1, 0, 0, 0, 1, 1, 1],
        [1, 1, 1, 1, 1, 0, 0, 0, 0],
    ])
    assert np.all(count_matrix == expected)
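Although this vocab does contain UNK and PAD, the expected matrix has the same nine columns as the specials-free CountVectorizer test in Example #6: _build_count_matrix apparently drops the columns registered in _special_indexes, so the special tokens never contribute counts.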