def test_field_preprocess_eager():
    vocab = MockVocab(eager=True)
    f = Field(name="F", numericalizer=vocab)
    f.preprocess("some text")

    # vocab was updated
    assert len(vocab.values) > 0

def test_field_applies_specials():
    bos, eos = BOS(), EOS()

    vocab = Vocab(specials=(bos, eos))
    f = Field(name="F", tokenizer="split", numericalizer=vocab, keep_raw=True)

    _, received = f.preprocess("asd 123 BLA")[0]
    expected = ("asd 123 BLA", [bos, "asd", "123", "BLA", eos])

    assert received == expected

    # Test with empty specials
    vocab = Vocab(specials=())
    f = Field(name="F", tokenizer="split", numericalizer=vocab, keep_raw=True)

    _, received = f.preprocess("asd 123 BLA")[0]
    expected = ("asd 123 BLA", ["asd", "123", "BLA"])

    assert received == expected

    # Test core specials are a no-op
    vocab = Vocab(specials=(PAD(), UNK()))
    f = Field(name="F", tokenizer="split", numericalizer=vocab, keep_raw=True)

    _, received = f.preprocess("asd 123 BLA")[0]
    expected = ("asd 123 BLA", ["asd", "123", "BLA"])

    assert received == expected

def test_field_custom_numericalization_no_tokenization():
    tfield = Field("bla", numericalizer=lambda x: x, tokenizer=None)

    _, data1 = tfield.preprocess([1, 2, 3])[0]
    _, data2 = tfield.preprocess([3, 2, 1])[0]
    _, data3 = tfield.preprocess([3, 4, 5, 6])[0]
    _, data4 = tfield.preprocess([2, 3, 6])[0]

    tfield.finalize()

    assert np.all(tfield.numericalize(data1) == np.array([1, 2, 3]))
    assert np.all(tfield.numericalize(data2) == np.array([3, 2, 1]))
    assert np.all(tfield.numericalize(data3) == np.array([3, 4, 5, 6]))
    assert np.all(tfield.numericalize(data4) == np.array([2, 3, 6]))

def test_field_custom_numericalization_vocab_non_string():
    vocab = Vocab(specials=())
    tfield = Field("bla", numericalizer=vocab, tokenizer=None)

    _, data1 = tfield.preprocess([1, 2, 3])[0]
    _, data2 = tfield.preprocess([3, 2, 1])[0]
    _, data3 = tfield.preprocess([3, 4, 5, 6])[0]
    _, data4 = tfield.preprocess([2, 3, 6])[0]

    tfield.finalize()

    assert np.all(tfield.numericalize(data1) == vocab.numericalize([1, 2, 3]))
    assert np.all(tfield.numericalize(data2) == vocab.numericalize([3, 2, 1]))
    assert np.all(tfield.numericalize(data3) == vocab.numericalize([3, 4, 5, 6]))
    assert np.all(tfield.numericalize(data4) == vocab.numericalize([2, 3, 6]))

def test_field_custom_numericalization_no_tokenization_2():
    label_indexer = {"one": 1, "two": 2, "three": 3, "four": 4}

    tfield = Field("bla", numericalizer=label_indexer.get, tokenizer=None)

    _, data1 = tfield.preprocess(["one", "two", "three"])[0]
    _, data2 = tfield.preprocess(["three", "two", "one"])[0]
    _, data3 = tfield.preprocess(["three", "four", "four", "two"])[0]
    _, data4 = tfield.preprocess(["two", "three", "one"])[0]

    tfield.finalize()

    assert np.all(tfield.numericalize(data1) == np.array([1, 2, 3]))
    assert np.all(tfield.numericalize(data2) == np.array([3, 2, 1]))
    assert np.all(tfield.numericalize(data3) == np.array([3, 4, 4, 2]))
    assert np.all(tfield.numericalize(data4) == np.array([2, 3, 1]))

def test_field_repeated_hooks():
    def replace_tag_hook(raw, tokenized):
        replaced_tags = map(lambda s: s.replace("<tag>", "ABC"), tokenized)
        return raw, replaced_tags

    def to_lower_hook(raw, tokenized):
        # keep track of the function call count
        to_lower_hook.call_count += 1
        tokenized = map(str.lower, tokenized)
        return raw, tokenized

    to_lower_hook.call_count = 0

    f = Field(name="F", tokenizer="split", numericalizer=float, keep_raw=True)

    # TAG -> tag
    f.add_posttokenize_hook(to_lower_hook)
    # <tag> -> ABC
    f.add_posttokenize_hook(replace_tag_hook)
    # ABC -> abc
    f.add_posttokenize_hook(to_lower_hook)

    _, received = f.preprocess("BLA <TAG> bla")[0]
    expected = ("BLA <TAG> bla", ["bla", "abc", "bla"])

    assert received == expected

    # check that the hook that was added twice was also called twice
    assert to_lower_hook.call_count == 2

def test_field_posttokenize_hooks_detach():
    f = Field(name="F", tokenizer="split", numericalizer=float, keep_raw=True)

    def remove_tags_hook(raw, tokenized):
        raw = raw.replace("<tag>", "")
        tokenized = map(lambda x: x.replace("<tag>", ""), tokenized)
        return raw, tokenized

    def to_upper_hook(raw, tokenized):
        raw = raw.upper()
        tokenized = map(str.upper, tokenized)
        return raw, tokenized

    f.add_posttokenize_hook(remove_tags_hook)
    f.add_posttokenize_hook(to_upper_hook)

    # detaching the hooks
    f.remove_posttokenize_hooks()

    _, received = f.preprocess("asd 123<tag> B<tag>LA")[0]
    expected = ("asd 123<tag> B<tag>LA", ["asd", "123<tag>", "B<tag>LA"])

    assert received == expected

def test_hook_returning_iterable():
    data = "1,2,3,4"
    expected_tokens = [3, 5, 7, 9]

    field = Field(
        "Iterator_hook_test_field",
        tokenizer=lambda raw: [int(x) for x in raw.split(",")],
        numericalizer=id,
        keep_raw=True,
    )

    def multiply_by_two_hook(raw, tokens):
        return raw, (i * 2 for i in tokens)

    def add_one_hook(raw, tokens):
        assert not isinstance(tokens, (list, tuple))
        return raw, (i + 1 for i in tokens)

    field.add_posttokenize_hook(multiply_by_two_hook)
    field.add_posttokenize_hook(add_one_hook)

    _, (raw, tokens) = field.preprocess(data)[0]

    assert raw == data
    assert isinstance(tokens, (list, tuple))
    assert tokens == expected_tokens

def test_missing_symbol_index_vocab():
    vocab = Vocab()
    fld = Field(
        name="test_field",
        tokenizer="split",
        keep_raw=False,
        numericalizer=vocab,
        allow_missing_data=True,
    )

    fld.preprocess("a b c d")
    ((_, data),) = fld.preprocess(None)
    assert data == (None, None)

    fld.finalize()
    assert fld.numericalize((None, None)) is None
    assert fld.get_default_value() == -1

def test_field_get_tokenizer_spacy_ok():
    mp = patch.dict("sys.modules", spacy=MockSpacy())
    mp.start()

    f = Field(name="F", numericalizer=MockVocab(), tokenizer="spacy")
    _, data = f.preprocess("bla blu")[0]
    assert data == (None, ["bla", "blu"])

    mp.stop()

def test_missing_values_default_sequential():
    fld = Field(
        name="bla",
        keep_raw=False,
        tokenizer="split",
        numericalizer=hash,
        allow_missing_data=True,
    )

    _, data_missing = fld.preprocess(None)[0]
    _, data_exists = fld.preprocess("data_string")[0]

    assert data_missing == (None, None)
    assert data_exists == (None, ["data_string"])

    fld.finalize()
    assert fld.numericalize(data_missing) is None
    assert np.all(fld.numericalize(data_exists) == np.array([hash("data_string")]))

def test_field_get_tokenizer_callable():
    vocab = MockVocab()

    def my_tokenizer(string):
        return [string[0], string[1:]]

    f = Field(name="F", numericalizer=vocab, tokenizer=my_tokenizer)
    _, data = f.preprocess("asd dsa")[0]
    assert data == (None, ["a", "sd dsa"])

def test_missing_values_custom_numericalize():
    fld = Field(
        name="test_field",
        keep_raw=True,
        tokenizer=None,
        numericalizer=int,
        allow_missing_data=True,
    )

    _, data_missing = fld.preprocess(None)[0]
    _, data_exists = fld.preprocess("404")[0]

    assert data_missing == (None, None)
    assert data_exists == ("404", "404")

    fld.finalize()
    assert fld.numericalize(data_missing) is None
    assert np.all(fld.numericalize(data_exists) == np.array([404]))

def test_field_preprocess_raw_sequential(
    value, store_raw, tokenize, expected_raw_value, expected_tokenized_value
):
    tokenizer = "split" if tokenize else None
    f = Field(name="F", keep_raw=store_raw, tokenizer=tokenizer)

    ((_, (received_raw_value, received_tokenized_value)),) = f.preprocess(value)

    assert received_raw_value == expected_raw_value
    assert received_tokenized_value == expected_tokenized_value

def test_field_pretokenize_hooks():
    f = Field(name="F", tokenizer="split", keep_raw=True)

    f.add_pretokenize_hook(str.lower)
    f.add_pretokenize_hook(lambda x: x.replace("bla", "blu"))
    f.add_pretokenize_hook(lambda x: x.replace(";", " "))
    f.add_pretokenize_hook(lambda x: x.replace(",", " "))

    raw_str = "asd;123,BLA"

    _, received = f.preprocess(raw_str)[0]
    expected = ("asd 123 blu", ["asd", "123", "blu"])

    assert received == expected

def test_field_vocab_no_tokenization():
    vocab = Vocab(eager=True)
    pretokenized_input1 = ["word", "words", "uttering"]
    pretokenized_input2 = ["word", "words"]
    pretokenized_input3 = ["word"]
    pretokenized_input4 = ["word", "uttering"]

    tokenized_field = Field("test_field", tokenizer=None, numericalizer=vocab)

    _, data1 = tokenized_field.preprocess(pretokenized_input1)[0]
    _, data2 = tokenized_field.preprocess(pretokenized_input2)[0]
    _, data3 = tokenized_field.preprocess(pretokenized_input3)[0]
    _, data4 = tokenized_field.preprocess(pretokenized_input4)[0]

    tokenized_field.finalize()

    expected_numericalization_1 = np.array([2, 3, 4])
    _, tok1 = data1
    assert np.all(vocab.numericalize(tok1) == expected_numericalization_1)
    assert np.all(tokenized_field.numericalize(data1) == expected_numericalization_1)

    expected_numericalization_2 = np.array([2, 3])
    _, tok2 = data2
    assert np.all(vocab.numericalize(tok2) == expected_numericalization_2)
    assert np.all(tokenized_field.numericalize(data2) == expected_numericalization_2)

    expected_numericalization_3 = np.array([2])
    _, tok3 = data3
    assert np.all(vocab.numericalize(tok3) == expected_numericalization_3)
    assert np.all(tokenized_field.numericalize(data3) == expected_numericalization_3)

    expected_numericalization_4 = np.array([2, 4])
    _, tok4 = data4
    assert np.all(vocab.numericalize(tok4) == expected_numericalization_4)
    assert np.all(tokenized_field.numericalize(data4) == expected_numericalization_4)

def test_field_pretokenize_hooks_detach():
    f = Field(name="F", tokenizer="split", keep_raw=True)

    f.add_pretokenize_hook(str.lower)
    f.add_pretokenize_hook(lambda x: x.replace(";", " "))
    f.add_pretokenize_hook(lambda x: x.replace(",", " "))

    # detaching
    f.remove_pretokenize_hooks()

    raw_str = "asd;123,BLA"

    _, received = f.preprocess(raw_str)[0]
    expected = (raw_str, [raw_str])

    assert received == expected

def test_field_pickle_spacy_tokenizer(tmpdir):
    mp = patch.dict("sys.modules", spacy=MockSpacy())
    mp.start()

    fld = Field(name="F", numericalizer=None, tokenizer="spacy")
    _, data = fld.preprocess("bla blu")[0]
    assert data == (None, ["bla", "blu"])

    field_file = os.path.join(tmpdir, "field.pkl")

    with open(field_file, "wb") as fdata:
        dill.dump(fld, fdata)

    with open(field_file, "rb") as fdata:
        loaded_fld = dill.load(fdata)

    assert loaded_fld._tokenizer_arg_string == "spacy"

    _, data = loaded_fld.preprocess("bla blu")[0]
    assert data == (None, ["bla", "blu"])

    mp.stop()

def test_field_posttokenize_hooks():
    f = Field(name="F", tokenizer="split", keep_raw=True)

    def remove_tags_hook(raw, tokenized):
        raw = raw.replace("<tag>", "")
        tokenized = map(lambda x: x.replace("<tag>", ""), tokenized)
        return raw, tokenized

    def to_upper_hook(raw, tokenized):
        raw = raw.upper()
        tokenized = map(str.upper, tokenized)
        return raw, tokenized

    f.add_posttokenize_hook(remove_tags_hook)
    f.add_posttokenize_hook(to_upper_hook)

    _, received = f.preprocess("asd 123<tag> B<tag>LA")[0]
    expected = ("ASD 123 BLA", ["ASD", "123", "BLA"])

    assert received == expected

def test_posttokenize_hooks_in_field_no_tokenization_single_execution(mocker):
    f = Field(name="F", tokenizer=None)

    def hk(data, tokenized):
        def caseness(token):
            if token.islower():
                return "lowercase"
            else:
                return "uppercase"

        return data, [caseness(token) for token in tokenized]

    patched_hook = mocker.spy(hk, "__call__")

    f.add_posttokenize_hook(patched_hook)

    raw_str = ["Upper", "lower"]

    _, received = f.preprocess(raw_str)[0]
    expected = (None, ["uppercase", "lowercase"])

    assert received == expected
    patched_hook.assert_called_once()

def test_field_pickle_tokenized(
    value, store_raw, tokenize, expected_raw_value, expected_tokenized_value, tmpdir
):
    tokenizer = "split" if tokenize else None
    fld = Field(name="F", keep_raw=store_raw, tokenizer=tokenizer)

    ((_, (received_raw_value, received_tokenized_value)),) = fld.preprocess(value)

    assert received_raw_value == expected_raw_value
    assert received_tokenized_value == expected_tokenized_value

    field_file = os.path.join(tmpdir, "field.pkl")

    with open(field_file, "wb") as fdata:
        dill.dump(fld, fdata)

    with open(field_file, "rb") as fdata:
        loaded_fld = dill.load(fdata)

    ((_, (raw_value, tokenized_value)),) = loaded_fld.preprocess(value)

    assert raw_value == expected_raw_value
    assert tokenized_value == expected_tokenized_value
    assert loaded_fld.name == "F"
    assert loaded_fld._keep_raw == store_raw

def test_field_get_tokenizer_default():
    f = Field(name="F", numericalizer=MockVocab())
    _, data = f.preprocess("asd dsa")[0]
    assert data == (None, ["asd", "dsa"])

def test_missing_values_fail():
    fld = Field(name="bla", keep_raw=True, tokenizer=None, numericalizer=hash)

    with pytest.raises(ValueError):
        fld.preprocess(None)