Beispiel #1
0
def test_bo_nonbo():
    string = 'བཀྲ་་ཤིས་བདེ་ལེགས། 23PIEIUZLDVéjoldvép«»("«»%='
    cb = ChunkFramework(string)
    chunks = cb.chunk_bo_chars()

    output = cb.get_readable(chunks)
    assert output == [
        ("BO", "བཀྲ་་ཤིས་བདེ་ལེགས། "),
        ("OTHER", '23PIEIUZLDVéjoldvép«»("«»%='),
    ]
Beispiel #2
0
def test_other():
    string = "བཀྲ་ཤིས་བདེ་ལེགས กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ"
    cb = ChunkFramework(string)
    chunks = cb.chunk_bo_chars()

    output = cb.get_readable(chunks)
    assert output == [
        ("BO", "བཀྲ་ཤིས་བདེ་ལེགས "),
        ("OTHER", "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ"),
    ]
Beispiel #3
0
def test_full_example():
    # Follows the order implemented in Chunks
    string = (
        '༆ བཀྲ་ཤིས་བདེ་ལེགས།། །། 23PIEIUZLDVéjoldvép«»("«»%= ༪༫༝༜༛༚༇༆ ༡༢༣༠༩༨ '
        "This is a test. 这是  什么 กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ")
    cb = ChunkFramework(string)

    # BO / OTHER
    chunks = cb.chunk_bo_chars()
    chunks = cb.clean_chunks(chunks)
    output = cb.get_readable(chunks)
    assert output == [
        ("BO", "༆ བཀྲ་ཤིས་བདེ་ལེགས།། །། "),
        ("OTHER", '23PIEIUZLDVéjoldvép«»("«»%='),
        ("BO", " ༪༫༝༜༛༚༇༆ ༡༢༣༠༩༨ "),
        ("OTHER", "This is a test. 这是  什么 กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ"),
    ]

    # BO / PUNCT
    chunks = cb.pipe_chunk(chunks, cb.chunk_punct, c.BO.value, c.PUNCT.value)
    chunks = cb.clean_chunks(chunks)
    output = cb.get_readable(chunks)
    assert output == [
        ("PUNCT", "༆ "),
        ("BO", "བཀྲ་ཤིས་བདེ་ལེགས"),
        ("PUNCT", "།། །། "),  # NEW
        ("OTHER", '23PIEIUZLDVéjoldvép«»("«»%= '),
        ("BO", "༪༫༝༜༛༚"),
        ("PUNCT", "༇༆ "),  # NEW
        ("BO", "༡༢༣༠༩༨ "),
        ("OTHER", "This is a test. 这是  什么 กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ"),
    ]

    # BO / NUM
    chunks = cb.pipe_chunk(chunks, cb.chunk_number, c.BO.value, c.NUM.value)
    chunks = cb.clean_chunks(chunks)
    output = cb.get_readable(chunks)
    assert output == [
        ("PUNCT", "༆ "),
        ("BO", "བཀྲ་ཤིས་བདེ་ལེགས"),
        ("PUNCT", "།། །། "),
        ("OTHER", '23PIEIUZLDVéjoldvép«»("«»%= '),
        ("BO", "༪༫༝༜༛༚"),
        ("PUNCT", "༇༆ "),
        ("NUM", "༡༢༣༠༩༨ "),  # NEW
        ("OTHER", "This is a test. 这是  什么 กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ"),
    ]

    # BO / SYM
    chunks = cb.pipe_chunk(chunks, cb.chunk_symbol, c.BO.value, c.SYM.value)
    chunks = cb.clean_chunks(chunks)
    output = cb.get_readable(chunks)
    assert output == [
        ("PUNCT", "༆ "),
        ("BO", "བཀྲ་ཤིས་བདེ་ལེགས"),
        ("PUNCT", "།། །། "),
        ("OTHER", '23PIEIUZLDVéjoldvép«»("«»%= '),
        ("SYM", "༪༫༝༜༛༚"),  # NEW
        ("PUNCT", "༇༆ "),
        ("NUM", "༡༢༣༠༩༨ "),
        ("OTHER", "This is a test. 这是  什么 กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ"),
    ]

    # TEXT
    chunks = cb.pipe_chunk(chunks, cb.syllabify, c.BO.value, c.TEXT.value)
    chunks = cb.clean_chunks(chunks)
    output = cb.get_readable(chunks)
    assert output == [
        ("PUNCT", "༆ "),
        ("TEXT", "བཀྲ་"),  # NEW
        ("TEXT", "ཤིས་"),  # NEW
        ("TEXT", "བདེ་"),  # NEW
        ("TEXT", "ལེགས"),  # NEW
        ("PUNCT", "།། །། "),
        ("OTHER", '23PIEIUZLDVéjoldvép«»("«»%= '),
        ("SYM", "༪༫༝༜༛༚"),
        ("PUNCT", "༇༆ "),
        ("NUM", "༡༢༣༠༩༨ "),
        ("OTHER", "This is a test. 这是  什么 กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ"),
    ]

    # OTHER / CJK
    chunks = cb.pipe_chunk(chunks, cb.chunk_cjk, c.OTHER.value, c.CJK.value)
    chunks = cb.clean_chunks(chunks)
    output = cb.get_readable(chunks)
    assert output == [
        ("PUNCT", "༆ "),
        ("TEXT", "བཀྲ་"),
        ("TEXT", "ཤིས་"),
        ("TEXT", "བདེ་"),
        ("TEXT", "ལེགས"),
        ("PUNCT", "།། །། "),
        ("OTHER", '23PIEIUZLDVéjoldvép«»("«»%= '),
        ("SYM", "༪༫༝༜༛༚"),
        ("PUNCT", "༇༆ "),
        ("NUM", "༡༢༣༠༩༨ "),
        ("OTHER", "This is a test."),
        ("CJK", " 这是  什么 "),  # NEW
        ("OTHER", "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ"),
    ]

    # OTHER / LATIN
    chunks = cb.pipe_chunk(chunks, cb.chunk_latin, c.OTHER.value,
                           c.LATIN.value)
    chunks = cb.clean_chunks(chunks)
    output = cb.get_readable(chunks)
    assert output == [
        ("PUNCT", "༆ "),
        ("TEXT", "བཀྲ་"),
        ("TEXT", "ཤིས་"),
        ("TEXT", "བདེ་"),
        ("TEXT", "ལེགས"),
        ("PUNCT", "།། །། "),
        ("LATIN", '23PIEIUZLDVéjoldvép«»("«»%= '),  # NEW
        ("SYM", "༪༫༝༜༛༚"),
        ("PUNCT", "༇༆ "),
        ("NUM", "༡༢༣༠༩༨ "),
        ("LATIN", "This is a test."),  # NEW
        ("CJK", " 这是  什么 "),
        ("OTHER", "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ"),
    ]