Beispiel #1
0
    def test_normalize(self):
        """A Lowercase normalizer should fold the whole input to lower case."""
        tokenizer = Tokenizer(BPE())
        # Vocabulary content is irrelevant to normalization; added only to
        # mirror the original fixture setup.
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
        tokenizer.normalizer = Lowercase()

        normalized = tokenizer.normalize("My Name Is John")
        assert normalized == "my name is john"
Beispiel #2
0
    def test_strip_accents(self):
        """BertNormalizer with only strip_accents enabled removes diacritics."""
        tokenizer = Tokenizer(BPE.empty())
        normalizer = BertNormalizer(
            strip_accents=True,
            lowercase=False,
            handle_chinese_chars=False,
            clean_text=False,
        )
        tokenizer.normalizer = normalizer

        # Accented characters are reduced to their base letters; case is kept.
        assert tokenizer.normalize("Héllò") == "Hello"
Beispiel #3
0
    def test_clean_text(self):
        """BertNormalizer's clean_text option strips control/format characters."""
        tokenizer = Tokenizer(BPE())
        normalizer = BertNormalizer(
            strip_accents=False,
            lowercase=False,
            handle_chinese_chars=False,
            clean_text=True,
        )
        tokenizer.normalizer = normalizer

        # The leading BOM (U+FEFF) is removed by text cleaning.
        assert tokenizer.normalize("\ufeffHello") == "Hello"
Beispiel #4
0
'''
Separate Chinese and English text in a sentence using huggingface/tokenizers.

See:
https://github.com/huggingface/tokenizers/blob/master/bindings/python/tests/bindings/test_normalizers.py
'''

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import BertNormalizer

# Mixed Chinese/English sample sentence (about Schrödinger's cat).
text = "薛定谔的猫(英文名称:Erwin Schrödinger's Cat)是奥地利著名物理学家薛定谔"

tokenizer = Tokenizer(BPE())
# handle_chinese_chars=True makes BertNormalizer insert spaces around CJK
# characters, which is what separates Chinese from English in the output.
tokenizer.normalizer = BertNormalizer(strip_accents=False,
                                      lowercase=False,
                                      handle_chinese_chars=True,
                                      clean_text=False)

# BUG FIX: the original called tokenizer.normalize(txt), but the variable
# defined above is `text` — that raised NameError at runtime.
output = tokenizer.normalize(text)
print(output)
Beispiel #5
0
    def test_full_strip(self):
        """Strip with both sides enabled removes leading and trailing spaces."""
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.normalizer = Strip(left=True, right=True)

        result = tokenizer.normalize("  hello  ")
        assert result == "hello"
Beispiel #6
0
    def test_lowercase(self):
        """Lowercase normalizer turns an all-caps input into lower case."""
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.normalizer = Lowercase()

        result = tokenizer.normalize("HELLO")
        assert result == "hello"
Beispiel #7
0
    def test_can_make_sequences(self):
        """A Sequence normalizer applies its members in order (lowercase, then strip)."""
        tokenizer = Tokenizer(BPE.empty())
        pipeline = Sequence([Lowercase(), Strip()])
        tokenizer.normalizer = pipeline

        # Both transformations are applied: case folded and whitespace trimmed.
        assert tokenizer.normalize("  HELLO  ") == "hello"
Beispiel #8
0
    def test_right_strip(self):
        """Strip with right=True only removes trailing — not leading — spaces."""
        tokenizer = Tokenizer(BPE())
        tokenizer.normalizer = Strip(left=False, right=True)

        result = tokenizer.normalize("  hello  ")
        # Leading whitespace must survive; only the right side is stripped.
        assert result == "  hello"