Ejemplo n.º 1
0
def test_field_custom_numericalization_vocab_non_string():
    """Check that a tokenizer-less Field numericalizes integer tokens like its Vocab.

    A ``Vocab`` created with an empty ``specials`` tuple is used as the
    numericalizer of a ``Field`` whose tokenizer is ``None``. Several integer
    sequences are preprocessed, the field is finalized, and the field's
    numericalization of each preprocessed item must equal what the vocab
    returns for the same integer sequence.
    """
    vocab = Vocab(specials=())
    int_field = Field("bla", numericalizer=vocab, tokenizer=None)

    samples = ((1, 2, 3), (3, 2, 1), (3, 4, 5, 6), (2, 3, 6))

    # Every sample is preprocessed before finalize() is called; only the
    # data part of the first (name, data) pair is kept for later comparison.
    preprocessed = []
    for sample in samples:
        _, data = int_field.preprocess(list(sample))[0]
        preprocessed.append(data)

    int_field.finalize()

    for sample, data in zip(samples, preprocessed):
        field_ids = int_field.numericalize(data)
        vocab_ids = vocab.numericalize(list(sample))
        assert np.all(field_ids == vocab_ids)
Ejemplo n.º 2
0
def test_field_vocab_no_tokenization():
    """Check numericalization of pretokenized input through a tokenizer-less Field.

    Four pretokenized word lists are preprocessed by a ``Field`` that has no
    tokenizer and uses a ``Vocab(eager=True)`` as numericalizer. After the
    field is finalized, both the vocab (applied to the stored tokens) and the
    field (applied to the stored data) must produce the expected indices.

    NOTE(review): the expected indices start at 2, presumably because the
    vocab's default special tokens take indices 0 and 1 -- confirm against
    the Vocab implementation.
    """
    vocab = Vocab(eager=True)
    field = Field("test_field", tokenizer=None, numericalizer=vocab)

    # (pretokenized input, expected indices) pairs, in preprocessing order.
    cases = (
        (["word", "words", "uttering"], [2, 3, 4]),
        (["word", "words"], [2, 3]),
        (["word"], [2]),
        (["word", "uttering"], [2, 4]),
    )

    # All inputs are preprocessed before the field is finalized.
    stored = []
    for tokens, _expected in cases:
        _, data = field.preprocess(tokens)[0]
        stored.append(data)

    field.finalize()

    for (_tokens, expected), data in zip(cases, stored):
        expected_ids = np.array(expected)
        # The stored data is a pair; its second element holds the tokens.
        _, stored_tokens = data
        assert np.all(vocab.numericalize(stored_tokens) == expected_ids)
        assert np.all(field.numericalize(data) == expected_ids)