Code example #1
File: test_field.py (Project: TakeLab/podium)
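Verifies that remove_posttokenize_hooks() detaches all previously added posttokenize hooks: once detached, preprocess returns the raw string and tokens unchanged, tags and all.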
def test_field_posttokenize_hooks_detach():
    f = Field(name="F", tokenizer="split", numericalizer=float, keep_raw=True)

    def remove_tags_hook(raw, tokenized):
        raw = raw.replace("<tag>", "")
        tokenized = map(lambda x: x.replace("<tag>", ""), tokenized)

        return raw, tokenized

    def to_upper_hook(raw, tokenized):
        raw = raw.upper()
        tokenized = map(str.upper, tokenized)

        return raw, tokenized

    f.add_posttokenize_hook(remove_tags_hook)
    f.add_posttokenize_hook(to_upper_hook)

    # detaching the hooks
    f.remove_posttokenize_hooks()

    _, received = f.preprocess("asd 123<tag> B<tag>LA")[0]
    expected = ("asd 123<tag> B<tag>LA", ["asd", "123<tag>", "B<tag>LA"])

    assert received == expected
Code example #2
File: test_field.py (Project: TakeLab/podium)
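Shows that a posttokenize hook may return any iterable, not just a list: both hooks here return generators, and the field materializes the final tokens into a list.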
def test_hook_returning_iterable():
    data = "1,2,3,4"
    expected_tokens = [3, 5, 7, 9]

    field = Field(
        "Iterator_hook_test_field",
        tokenizer=lambda raw: [int(x) for x in raw.split(",")],
        numericalizer=id,
        keep_raw=True,
    )

    def multiply_by_two_hook(raw, tokens):
        return raw, (i * 2 for i in tokens)

    def add_one_hook(raw, tokens):
        assert not isinstance(tokens, (list, tuple))
        return raw, (i + 1 for i in tokens)

    field.add_posttokenize_hook(multiply_by_two_hook)
    field.add_posttokenize_hook(add_one_hook)

    _, (raw, tokens) = field.preprocess(data)[0]

    assert raw == data
    assert isinstance(tokens, (list, tuple))
    assert tokens == expected_tokens
Code example #3
File: test_hooks.py (Project: TakeLab/podium)
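Applies the built-in remove_stopwords hook to drop English stop words from the tokenized data.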
def test_remove_stopwords():
    data = "I'll tell you a joke"
    field = Field(name="data")
    field.add_posttokenize_hook(remove_stopwords("en"))
    example = ExampleFactory([field]).from_list([data])

    assert "you" not in example["data"][1]
    assert "a" not in example["data"][1]
Code example #4
File: test_hooks.py (Project: TakeLab/podium)
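Exercises the KeywordExtractor hook, parametrized over extraction algorithms (skipped via pytest.importorskip when the backing package is missing), and checks that every extracted keyword occurs in the source text. TEXT is presumably a constant defined elsewhere in the test module.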
def test_keyword_extractor(alg, alg_pkg_name):
    pytest.importorskip(alg_pkg_name)

    field = Field(name="data", tokenizer=None, keep_raw=True)
    field.add_posttokenize_hook(KeywordExtractor(alg))
    example = ExampleFactory([field]).from_list([TEXT])

    # make sure all the keywords originate from the raw data
    text_ = TEXT.lower()
    assert all(kw in text_ for kws in example["data"][1] for kw in kws.lower().split())
Code example #5
File: test_hooks.py (Project: TakeLab/podium)
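Runs a parametrized lemmatization or stemming hook, constructing it lazily when a factory function is passed, and asserts only that the tokens were modified rather than checking exact output.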
def test_lemmatization_and_stemming(hook):
    # we need this to postpone initialization
    # in pytest.mark.parametrize
    if inspect.isfunction(hook):
        hook = hook()

    data = "stemming playing books"
    field = Field(name="data")
    field.add_posttokenize_hook(hook)
    example = ExampleFactory([field]).from_list([data])

    # we don't check the exact results,
    # instead we expect some modifications
    assert data != example["data"][1]
Code example #6
File: test_hooks.py (Project: TakeLab/podium)
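Demonstrates hook-type conversion: attaching the pretokenize TextCleanUp hook as a posttokenize hook raises ValueError, but wrapping it with as_posttokenize_hook makes it accepted and applied after tokenization.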
def test_hook_conversion():
    field = Field(name="data", tokenizer="split", keep_raw=True)
    text_clean_up_hook = TextCleanUp(replace_url="<URL>")

    assert text_clean_up_hook.__hook_type__ == HookType.PRETOKENIZE
    with pytest.raises(ValueError):
        field.add_posttokenize_hook(text_clean_up_hook)

    text_clean_up_hook = as_posttokenize_hook(text_clean_up_hook)
    assert text_clean_up_hook.__hook_type__ == HookType.POSTTOKENIZE

    field.add_posttokenize_hook(text_clean_up_hook)

    data = "url to github is https://github.com"
    example = ExampleFactory([field]).from_list([data])

    assert example["data"][1] == ["url", "to", "github", "is", "<URL>"]
Code example #7
File: test_field.py (Project: TakeLab/podium)
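Shows a MultioutputField tokenizing the input once and routing the result to two output fields, each applying its own posttokenize hook.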
def test_multioutput_field_posttokenization():
    uppercase_field = Field("uppercase_field", keep_raw=True)
    lowercase_field = Field("lowercase_field", keep_raw=True)

    def post_tokenization_all_upper(raw, tokenized):
        return raw, [token.upper() for token in tokenized]

    def post_tokenization_all_lower(raw, tokenized):
        return raw, [token.lower() for token in tokenized]

    uppercase_field.add_posttokenize_hook(post_tokenization_all_upper)
    lowercase_field.add_posttokenize_hook(post_tokenization_all_lower)

    output_fields = uppercase_field, lowercase_field
    mo_field = MultioutputField(output_fields, tokenizer="split")

    result1, result2 = mo_field.preprocess("mOcK TeXt")

    assert result1 == (uppercase_field.name, ("mOcK TeXt", ["MOCK", "TEXT"]))
    assert result2 == (lowercase_field.name, ("mOcK TeXt", ["mock", "text"]))
Code example #8
File: test_field.py (Project: TakeLab/podium)
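Confirms that attaching the same hook twice runs it twice, with hooks executed in the order they were added.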
def test_field_repeated_hooks():
    def replace_tag_hook(raw, tokenized):
        replaced_tags = map(lambda s: s.replace("<tag>", "ABC"), tokenized)

        return raw, replaced_tags

    def to_lower_hook(raw, tokenized):
        # keep track of the function call count
        to_lower_hook.call_count += 1

        tokenized = map(str.lower, tokenized)

        return raw, tokenized

    to_lower_hook.call_count = 0

    f = Field(name="F", tokenizer="split", numericalizer=float, keep_raw=True)

    # TAG -> tag
    f.add_posttokenize_hook(to_lower_hook)

    # <tag> -> ABC
    f.add_posttokenize_hook(replace_tag_hook)

    # ABC -> abc
    f.add_posttokenize_hook(to_lower_hook)

    _, received = f.preprocess("BLA <TAG> bla")[0]

    expected = ("BLA <TAG> bla", ["bla", "abc", "bla"])

    assert received == expected

    # check that the hook that was added twice was also called twice
    assert to_lower_hook.call_count == 2
Code example #9
File: test_field.py (Project: TakeLab/podium)
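Chains two posttokenize hooks, tag removal followed by uppercasing, and checks that both the raw string and the tokens reflect the changes.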
def test_field_posttokenize_hooks():
    f = Field(name="F", tokenizer="split", keep_raw=True)

    def remove_tags_hook(raw, tokenized):
        raw = raw.replace("<tag>", "")
        tokenized = map(lambda x: x.replace("<tag>", ""), tokenized)

        return raw, tokenized

    def to_upper_hook(raw, tokenized):
        raw = raw.upper()
        tokenized = map(str.upper, tokenized)

        return raw, tokenized

    f.add_posttokenize_hook(remove_tags_hook)
    f.add_posttokenize_hook(to_upper_hook)

    _, received = f.preprocess("asd 123<tag> B<tag>LA")[0]
    expected = ("ASD 123 BLA", ["ASD", "123", "BLA"])

    assert received == expected
Code example #10
File: test_field.py (Project: TakeLab/podium)
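With tokenizer=None, the input is treated as already tokenized; the test spies on the hook to confirm a single invocation, and the raw part is None because keep_raw defaults to False.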
def test_posttokenize_hooks_in_field_no_tokenization_single_execution(mocker):
    f = Field(name="F", tokenizer=None)

    def hk(data, tokenized):
        def caseness(token):
            if token.islower():
                return "lowercase"
            else:
                return "uppercase"

        return data, [caseness(token) for token in tokenized]

    patched_hook = mocker.spy(hk, "__call__")

    f.add_posttokenize_hook(patched_hook)

    raw_str = ["Upper", "lower"]

    _, received = f.preprocess(raw_str)[0]
    expected = (None, ["uppercase", "lowercase"])

    assert received == expected
    patched_hook.assert_called_once()
Code example #11
File: test_field.py (Project: TakeLab/podium)
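Shows that a pretokenize hook added to a MultioutputField (here str.upper) runs before tokenization and thus affects every output field, while posttokenize hooks remain per-field.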
def test_multioutput_field_remove_pretokenization():
    output_field_1 = Field("test_field_1")
    output_field_2 = Field("test_field_2")

    def first_lower(raw, tokenized):
        def f(token):
            if len(token) == 0:
                return ""
            else:
                return token[0].lower() + token[1:]

        return raw, [f(token) for token in tokenized]

    output_field_2.add_posttokenize_hook(first_lower)

    mo_field = MultioutputField((output_field_1, output_field_2))
    mo_field.add_pretokenize_hook(str.upper)

    (_, (raw_1, tokenized_1)), (_, (raw_2, tokenized_2)) = mo_field.preprocess(
        "this is a test sentence"
    )

    assert tokenized_1 == ["THIS", "IS", "A", "TEST", "SENTENCE"]
    assert tokenized_2 == ["tHIS", "iS", "a", "tEST", "sENTENCE"]
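
Taken together, these tests pin down the posttokenize hook contract: a hook is any callable that accepts (raw, tokenized) and returns the (possibly modified) pair, where tokenized may be any iterable. Below is a minimal standalone sketch of that contract; the strip_punctuation_hook name is illustrative, and the top-level `from podium import Field` import is assumed (the exact module path may differ between podium versions).

from podium import Field  # assumed import path; may vary across podium versions


def strip_punctuation_hook(raw, tokenized):
    # A posttokenize hook receives the raw string and an iterable of tokens
    # and returns the (raw, tokenized) pair; returning a generator is fine,
    # since the field materializes the final tokens into a list (see #2).
    return raw, (token.strip(".,!?") for token in tokenized)


field = Field(name="text", tokenizer="split", keep_raw=True)
field.add_posttokenize_hook(strip_punctuation_hook)

# preprocess returns (field_name, (raw, tokens)) entries, as the tests
# above unpack with `_, (raw, tokens) = field.preprocess(data)[0]`
_, (raw, tokens) = field.preprocess("Hello, hooks!")[0]

assert raw == "Hello, hooks!"
assert tokens == ["Hello", "hooks"]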