def test_field_posttokenize_hooks_detach():
    """After removing all posttokenize hooks, preprocessing is untouched."""
    field = Field(name="F", tokenizer="split", numericalizer=float, keep_raw=True)

    def strip_tags(raw, tokenized):
        # Drop the literal "<tag>" marker from the raw text and every token.
        stripped_raw = raw.replace("<tag>", "")
        stripped_tokens = map(lambda tok: tok.replace("<tag>", ""), tokenized)
        return stripped_raw, stripped_tokens

    def uppercase(raw, tokenized):
        return raw.upper(), map(str.upper, tokenized)

    field.add_posttokenize_hook(strip_tags)
    field.add_posttokenize_hook(uppercase)

    # detaching the hooks
    field.remove_posttokenize_hooks()

    _, received = field.preprocess("asd 123<tag> B<tag>LA")[0]
    expected = ("asd 123<tag> B<tag>LA", ["asd", "123<tag>", "B<tag>LA"])
    assert received == expected
def test_hook_returning_iterable():
    """Hooks may return lazy iterables; the field materializes the final tokens."""
    data = "1,2,3,4"
    expected_tokens = [3, 5, 7, 9]

    field = Field(
        "Iterator_hook_test_field",
        tokenizer=lambda raw: [int(x) for x in raw.split(",")],
        numericalizer=id,
        keep_raw=True,
    )

    def double_hook(raw, tokens):
        return raw, (t * 2 for t in tokens)

    def increment_hook(raw, tokens):
        # The previous hook's generator must arrive unmaterialized.
        assert not isinstance(tokens, (list, tuple))
        return raw, (t + 1 for t in tokens)

    field.add_posttokenize_hook(double_hook)
    field.add_posttokenize_hook(increment_hook)

    _, (raw, tokens) = field.preprocess(data)[0]

    assert raw == data
    assert isinstance(tokens, (list, tuple))
    assert tokens == expected_tokens
def test_remove_stopwords():
    """The stopword-removal hook drops English stopwords from the token list."""
    raw_text = "I'll tell you a joke"
    field = Field(name="data")
    field.add_posttokenize_hook(remove_stopwords("en"))

    example = ExampleFactory([field]).from_list([raw_text])
    tokens = example["data"][1]

    assert "you" not in tokens
    assert "a" not in tokens
def test_keyword_extractor(alg, alg_pkg_name):
    """Every extracted keyword must originate from the source text."""
    # Skip the test entirely when the backing algorithm package is absent.
    pytest.importorskip(alg_pkg_name)

    field = Field(name="data", tokenizer=None, keep_raw=True)
    field.add_posttokenize_hook(KeywordExtractor(alg))
    example = ExampleFactory([field]).from_list([TEXT])

    # make sure all the keywords originate from the raw data
    lowered_text = TEXT.lower()
    keywords = example["data"][1]
    assert all(
        word in lowered_text for phrase in keywords for word in phrase.lower().split()
    )
def test_lemmatization_and_stemming(hook):
    """A stemming/lemmatization hook must actually modify the tokens.

    ``hook`` is parametrized (decorator outside this view); it may be a
    factory function whose construction is postponed until test run time.
    """
    # we need this to postpone initialization
    # in pytest.mark.parametrize
    if inspect.isfunction(hook):
        hook = hook()

    data = "stemming playing books"
    field = Field(name="data")
    field.add_posttokenize_hook(hook)
    example = ExampleFactory([field]).from_list([data])

    # we don't check the exact results,
    # instead we expect some modifications
    # BUG FIX: the original asserted `data != example["data"][1]`, comparing
    # the raw *string* against the token *list* — a str never equals a list,
    # so the assertion was vacuously true and could never fail. Compare the
    # untouched token list (default "split" tokenization) against the hooked
    # tokens so the hook's effect is actually verified.
    assert data.split() != example["data"][1]
def test_hook_conversion():
    """A pretokenize hook can be converted into a posttokenize hook."""
    field = Field(name="data", tokenizer="split", keep_raw=True)

    cleanup_hook = TextCleanUp(replace_url="<URL>")
    assert cleanup_hook.__hook_type__ == HookType.PRETOKENIZE

    # A pretokenize hook must be rejected when attached as posttokenize.
    with pytest.raises(ValueError):
        field.add_posttokenize_hook(cleanup_hook)

    # After conversion the very same hook is accepted.
    cleanup_hook = as_posttokenize_hook(cleanup_hook)
    assert cleanup_hook.__hook_type__ == HookType.POSTTOKENIZE
    field.add_posttokenize_hook(cleanup_hook)

    data = "url to github is https://github.com"
    example = ExampleFactory([field]).from_list([data])
    assert example["data"][1] == ["url", "to", "github", "is", "<URL>"]
def test_multioutput_field_posttokenization():
    """Each output field of a MultioutputField applies its own hooks."""
    upper_field = Field("uppercase_field", keep_raw=True)
    lower_field = Field("lowercase_field", keep_raw=True)

    def uppercase_hook(raw, tokenized):
        return raw, [token.upper() for token in tokenized]

    def lowercase_hook(raw, tokenized):
        return raw, [token.lower() for token in tokenized]

    upper_field.add_posttokenize_hook(uppercase_hook)
    lower_field.add_posttokenize_hook(lowercase_hook)

    mo_field = MultioutputField((upper_field, lower_field), tokenizer="split")
    result1, result2 = mo_field.preprocess("mOcK TeXt")

    assert result1 == (upper_field.name, ("mOcK TeXt", ["MOCK", "TEXT"]))
    assert result2 == (lower_field.name, ("mOcK TeXt", ["mock", "text"]))
def test_field_repeated_hooks():
    """A hook attached twice must be executed twice, in attachment order."""

    def replace_tag_hook(raw, tokenized):
        return raw, map(lambda s: s.replace("<tag>", "ABC"), tokenized)

    def to_lower_hook(raw, tokenized):
        # keep track of the function call count
        to_lower_hook.call_count += 1
        return raw, map(str.lower, tokenized)

    to_lower_hook.call_count = 0

    field = Field(name="F", tokenizer="split", numericalizer=float, keep_raw=True)
    field.add_posttokenize_hook(to_lower_hook)  # TAG -> tag
    field.add_posttokenize_hook(replace_tag_hook)  # <tag> -> ABC
    field.add_posttokenize_hook(to_lower_hook)  # ABC -> abc

    _, received = field.preprocess("BLA <TAG> bla")[0]
    assert received == ("BLA <TAG> bla", ["bla", "abc", "bla"])

    # check that the hook that was added twice was also called twice
    assert to_lower_hook.call_count == 2
def test_field_posttokenize_hooks():
    """Posttokenize hooks run in attachment order on both raw and tokens."""
    field = Field(name="F", tokenizer="split", keep_raw=True)

    def strip_tags(raw, tokenized):
        # Drop the literal "<tag>" marker everywhere.
        return raw.replace("<tag>", ""), map(
            lambda tok: tok.replace("<tag>", ""), tokenized
        )

    def uppercase(raw, tokenized):
        return raw.upper(), map(str.upper, tokenized)

    field.add_posttokenize_hook(strip_tags)
    field.add_posttokenize_hook(uppercase)

    _, received = field.preprocess("asd 123<tag> B<tag>LA")[0]
    assert received == ("ASD 123 BLA", ["ASD", "123", "BLA"])
def test_posttokenize_hooks_in_field_no_tokenization_single_execution(mocker):
    """With tokenizer=None, an attached posttokenize hook runs exactly once."""
    field = Field(name="F", tokenizer=None)

    def hk(data, tokenized):
        def caseness(token):
            return "lowercase" if token.islower() else "uppercase"

        return data, [caseness(token) for token in tokenized]

    # Spy on the hook so its call count can be asserted afterwards.
    patched_hook = mocker.spy(hk, "__call__")
    field.add_posttokenize_hook(patched_hook)

    _, received = field.preprocess(["Upper", "lower"])[0]
    assert received == (None, ["uppercase", "lowercase"])
    patched_hook.assert_called_once()
def test_multioutput_field_remove_pretokenization():
    """Pretokenize hooks on a MultioutputField run before per-field hooks."""
    field_one = Field("test_field_1")
    field_two = Field("test_field_2")

    def first_lower(raw, tokenized):
        # Lowercase only the first character of each (non-empty) token.
        def decapitalize(token):
            return token[0].lower() + token[1:] if len(token) != 0 else ""

        return raw, [decapitalize(token) for token in tokenized]

    field_two.add_posttokenize_hook(first_lower)

    mo_field = MultioutputField((field_one, field_two))
    mo_field.add_pretokenize_hook(str.upper)

    (_, (_, tokenized_1)), (_, (_, tokenized_2)) = mo_field.preprocess(
        "this is a test sentence"
    )

    assert tokenized_1 == ["THIS", "IS", "A", "TEST", "SENTENCE"]
    assert tokenized_2 == ["tHIS", "iS", "a", "tEST", "sENTENCE"]