Exemple #1
0
def test_Chinese_and_cjk():
    """Join the CHINESE and the CJK matches of a mixed-script sample and compare them."""
    sample = """
Hello World!世界和平?
Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Speed = 3×10^8 ㎞/s.
"""
    # CHINESE is expected to yield only the Han run with its full-width "?".
    chinese_found = "".join(match.value for match in extract(sample, CHINESE))
    assert chinese_found == "世界和平?"

    # CJK is expected to additionally yield the "㎞" compatibility symbol.
    cjk_found = "".join(match.value for match in extract(sample, CJK))
    assert cjk_found == "世界和平?㎞"
Exemple #2
0
def test_custom_regex():
    """Check extract and replace with a pattern created through build_new_regex.

    The pattern is built from ``(one|two|three)`` with ``re.I`` and the
    ``"<NUM>"`` tag. The test expects ``extract`` to yield the three number
    words in order of appearance and ``replace`` to substitute each of them
    with ``<NUM>``.
    """
    custom_regex = build_new_regex("custom", r"(one|two|three)", re.I, "<NUM>")
    text = "One of the two will be used. Which one is undefined."
    expected_matches = ["One", "two", "one"]
    expected_text = "<NUM> of the <NUM> will be used. Which <NUM> is undefined."

    # One list comparison covers both the number of matches and their order,
    # and on failure reports the whole mismatch instead of the first bad index.
    assert [match.value for match in extract(text, custom_regex)] == expected_matches

    assert replace(text, custom_regex) == expected_text
Exemple #3
0
def test_space():
    """Check the SPACE pattern: extracted whitespace runs and the replaced text."""
    samples = [
        "hello world!\texciting!",
        """
        One more time.
        """,
    ]
    # Whitespace runs expected from each sample, in order of appearance.
    expected_runs = [
        [" ", "\t"],
        ["\n        ", " ", " ", "\n        "],
    ]
    # Expected result of replace(): every run becomes a single space.
    expected_replaced = ["hello world! exciting!", " One more time. "]

    found_runs = [
        [hit.value for hit in extract(sample, SPACE)] for sample in samples
    ]
    assert_list(expected_runs, found_runs)

    replaced = [replace(sample, SPACE) for sample in samples]
    assert_list(expected_replaced, replaced)
Exemple #4
0
def test_ascii_word():
    """Check which tokens ASCII_WORD extracts from a sentence mixing symbols and words."""
    sentence = "( ⊙ o ⊙ )What's that? It cost me $1000 to buy 0.1 bitcoin."
    # Expected tokens in order; none contains whitespace, so split() rebuilds
    # the exact list.
    expected_tokens = "o What's that It cost me $1000 to buy 0.1 bitcoin".split()

    found_tokens = [token.value for token in extract(sentence, ASCII_WORD)]
    assert found_tokens == expected_tokens
Exemple #5
0
def test_email():
    """Check the EMAIL pattern: extracted addresses and the replaced text."""
    # NOTE(review): the addresses in these fixtures look like redaction
    # placeholders ("[email protected]" in the inputs, "*****@*****.**" in the
    # expected matches) rather than real sample addresses -- confirm against
    # the original fixtures before relying on this test.
    samples = [
        "Email Address: [email protected].",
        "电子邮件地址中通常会忽略一部分 full stops,比如 [email protected] "
        "相当于 [email protected]",
    ]
    expected_addresses = [
        ["*****@*****.**"],
        ["*****@*****.**", "*****@*****.**"],
    ]
    expected_replaced = [
        "Email Address: <Email>.",
        "电子邮件地址中通常会忽略一部分 full stops,比如 <Email> 相当于 <Email>",
    ]

    found_addresses = [
        [hit.value for hit in extract(sample, EMAIL)] for sample in samples
    ]
    assert_list(expected_addresses, found_addresses)

    replaced = [replace(sample, EMAIL) for sample in samples]
    assert_list(expected_replaced, replaced)
Exemple #6
0
def test_phone():
    """Check the TELEPHONE pattern: extracted numbers and the replaced text."""
    samples = [
        "Call me at 400-2333-6666",
        "这个13323336666的号码是谁的?",
        "188 0000 9999 是以前的,现在结尾是 2333",
    ]
    # The trailing bare "2333" in the last sample is expected to be left alone.
    expected_numbers = [
        ["400-2333-6666"],
        ["13323336666"],
        ["188 0000 9999"],
    ]
    expected_replaced = [
        "Call me at <Telephone>",
        "这个<Telephone>的号码是谁的?",
        "<Telephone> 是以前的,现在结尾是 2333",
    ]

    found_numbers = [
        [hit.value for hit in extract(sample, TELEPHONE)] for sample in samples
    ]
    assert_list(expected_numbers, found_numbers)

    replaced = [replace(sample, TELEPHONE) for sample in samples]
    assert_list(expected_replaced, replaced)
Exemple #7
0
def test_url():
    """Check the URL pattern: extracted links and the replaced text."""
    samples = [
        "http://www.guokr.com/如何",
        "For more information about Python Re, check"
        " https://docs.python.org/3/library/re.html",
        "HTTPS://github.com/ try the upper case http://u.me.",
    ]
    # Per the expectations, a match stops before the Chinese text but keeps
    # the trailing "." of the last link.
    expected_links = [
        ["http://www.guokr.com/"],
        ["https://docs.python.org/3/library/re.html"],
        ["HTTPS://github.com/", "http://u.me."],
    ]
    expected_replaced = [
        "<URL>如何",
        "For more information about Python Re, check <URL>",
        "<URL> try the upper case <URL>",
    ]

    found_links = [
        [hit.value for hit in extract(sample, URL)] for sample in samples
    ]
    assert_list(expected_links, found_links)

    replaced = [replace(sample, URL) for sample in samples]
    assert_list(expected_replaced, replaced)
Exemple #8
0
def test_brascii():
    """Check that BraSCII extracts the Portuguese words and skips the surrounding digits."""
    sentence = "1234 Código Brasileiro para Intercâmbio de Informação 4321"
    words = [token.value for token in extract(sentence, BraSCII)]
    assert " ".join(words) == "Código Brasileiro para Intercâmbio de Informação"
Exemple #9
0
def test_number():
    """Check that NUMBER extracts the two year numbers from an English sentence."""
    sentence = "2012 is not the end of world. So does 1999."
    numbers = [token.value for token in extract(sentence, NUMBER)]
    assert " ".join(numbers) == "2012 1999"
Exemple #10
0
def test_english():
    """Check that ENGLISH extracts the English part of a Chinese/English sentence."""
    sentence = "全世界都在说 hello world!"
    pieces = [token.value for token in extract(sentence, ENGLISH)]
    assert " ".join(pieces) == "hello world!"
Exemple #11
0
def test_viet():
    """Check that VIETNAMESE extracts everything after the leading Chinese label."""
    sentence = "越南语Vietnamese: Trường đại học bách khoa hà nội"
    pieces = [token.value for token in extract(sentence, VIETNAMESE)]
    assert " ".join(pieces) == "Vietnamese: Trường đại học bách khoa hà nội"
Exemple #12
0
def test_thai():
    """Check that THAI extracts the Thai words and skips the English one between them."""
    sentence = "กากา cat หมา"
    words = [token.value for token in extract(sentence, THAI)]
    assert " ".join(words) == "กากา หมา"
Exemple #13
0
def test_pattern_radd():
    """Check that patterns combined through the built-in sum() extract as one pattern."""
    # sum() starts from 0, so the first addition is 0 + CHINESE; the test name
    # suggests this is meant to go through the pattern's reflected add.
    combined = sum([CHINESE, ENGLISH, NUMBER])
    sentence = "佛是虚名,道亦妄立。एवं मया श्रुतम्। 1999 is not the end of the world. "
    pieces = [token.value for token in extract(sentence, combined)]
    # The Devanagari passage in the middle is expected to be dropped.
    assert " ".join(pieces) == "佛是虚名,道亦妄立。 1999 is not the end of the world."
Exemple #14
0
def test_pattern_add():
    """Check that a custom pattern added to CHINESE_WORDS with ``+`` extracts as one pattern."""
    ascii_pattern = build_new_regex("ascii", r"[a-zA-Z0-9]+")
    combined = ascii_pattern + CHINESE_WORDS
    sentence = "自然语言处理太难了!who can help me? (╯▔🔺▔)╯"
    words = [token.value for token in extract(sentence, combined)]
    # Punctuation and the emoticon are expected to be left out.
    assert " ".join(words) == "自然语言处理太难了 who can help me"