Example #1
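The snippets below are pytest-style tests of the bistring library; they omit
their imports, so a minimal shared preamble (our assumption, matching the
names used throughout) would be:

import unicodedata

import pytest

from bistring import Alignment, BistrBuilder, Token, Tokenization, Tokenizer, bistr
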
def test_tokenization():
    text = bistr('The quick, brown fox jumps over the lazy dog')
    text = text.replace(',', '')

    tokens = Tokenization(text, [
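        # Token.slice(text, i, j) wraps text[i:j] (modified coordinates) as a Token.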
        Token.slice(text, 0, 3),
        Token.slice(text, 4, 9),
        Token.slice(text, 10, 15),
        Token.slice(text, 16, 19),
        Token.slice(text, 20, 25),
        Token.slice(text, 26, 30),
        Token.slice(text, 31, 34),
        Token.slice(text, 35, 39),
        Token.slice(text, 40, 43),
    ])

    tokens = tokens[1:-1]
    assert tokens.text.original == 'quick, brown fox jumps over the lazy'
    assert tokens.text.modified == 'quick brown fox jumps over the lazy'
    assert tokens.text_bounds(1, 3) == (6, 15)
    assert tokens.original_bounds(1, 3) == (7, 16)
    assert tokens.bounds_for_text(8, 14) == (1, 3)
    assert tokens.bounds_for_original(9, 15) == (1, 3)
    assert tokens.slice_by_text(8, 14).text == bistr('brown fox')
    assert tokens.slice_by_original(9, 15).text == bistr('brown fox')
    assert tokens.snap_text_bounds(8, 14) == (6, 15)
    assert tokens.snap_original_bounds(9, 15) == (7, 16)
Example #2
def test_alternative_regex():
    import regex

    bs = bistr('The quick, brown 🦊 jumps over the lazy 🐶')
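    # \pS matches any symbol character, including emoji; this \p{...} syntax
    # comes from the third-party regex module (stdlib re lacks it), and
    # bistr.sub() accepts its compiled patterns too.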
    bs = bs.sub(regex.compile(r'\pS'), lambda m: unicodedata.name(m.group()))
    assert bs[17:25] == bistr('🦊', 'FOX FACE')
    assert bs[46:] == bistr('🐶', 'DOG FACE')
Example #3
def test_tokenization():
    text = bistr('  The quick, brown fox jumps over the lazy dog  ')
    text = text.replace(',', '')
    text = text.sub(r'^ +| +$', '')

    tokens = Tokenization(text, [
        Token.slice(text, 0, 3),
        Token.slice(text, 4, 9),
        Token.slice(text, 10, 15),
        Token.slice(text, 16, 19),
        Token.slice(text, 20, 25),
        Token.slice(text, 26, 30),
        Token.slice(text, 31, 34),
        Token.slice(text, 35, 39),
        Token.slice(text, 40, 43),
    ])
    assert tokens.text == text
    assert tokens.text_bounds(1, 3) == (4, 15)
    assert tokens.original_bounds(1, 3) == (6, 18)
    assert tokens.bounds_for_text(0, 13) == (0, 3)
    assert tokens.bounds_for_original(0, 13) == (0, 2)
    assert tokens.slice_by_text(34, 43).substring() == bistr('lazy dog')
    assert tokens.slice_by_original(36, 48).substring() == bistr('the lazy dog')
    assert tokens.snap_text_bounds(2, 13) == (0, 15)
    assert tokens.snap_original_bounds(36, 47) == (34, 46)
Example #4
def test_title():
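    # Title-casing is locale-sensitive: tr_TR capitalizes i to dotted İ.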
    bs = bistr('istanbul').title('en_US')
    assert bs.original == 'istanbul'
    assert bs.modified == 'Istanbul'

    bs = bistr('istanbul').title('tr_TR')
    assert bs.original == 'istanbul'
    assert bs.modified == 'İstanbul'
Example #5
def test_append():
    builder = BistrBuilder('hello WORLD')
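    # peek(n) looks at the next n unprocessed characters of the original;
    # append() consumes them with a replacement bistr, and skip() copies
    # characters through unchanged.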
    builder.append(bistr(builder.peek(5)).upper('en_US'))
    builder.skip(1)
    builder.append(bistr(builder.peek(5)).lower('en_US'))

    bs = builder.build()
    assert bs[1:4] == bistr('ell', 'ELL', Alignment.identity(3))
    assert bs[7:10] == bistr('ORL', 'orl', Alignment.identity(3))
Example #6
def test_strip():
    bs = bistr('  Hello  world!  ')
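    # A bistr built from a single string starts out as the identity transform.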
    assert bs.original == '  Hello  world!  '
    assert bs.modified == '  Hello  world!  '

    bs = bs.strip()
    assert bs.original == '  Hello  world!  '
    assert bs.modified == 'Hello  world!'

    bs = bistr('    ').strip()
    assert bs.modified == ''
    assert bs.original == '    '
Example #7
def test_upper():
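    # ß uppercases to SS, so the modified string grows; slicing still maps back to ß.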
    bs = bistr('straße').upper('de_DE')
    assert bs.original == 'straße'
    assert bs.modified == 'STRASSE'
    assert bs[4:6].original == 'ß'
    assert bs[4:6].modified == 'SS'

    bs = bistr('Diyarbakır').upper('tr_TR')
    assert bs.original == 'Diyarbakır'
    assert bs.modified == 'DİYARBAKIR'

    # Odysseus
    bs = bistr('Ὀδυσσεύς').upper('und')
    assert bs.original == 'Ὀδυσσεύς'
    assert bs.modified == 'ὈΔΥΣΣΕΎΣ'
Example #8
def test_capitalize():
    bs = bistr('hello WORLD').capitalize('en_US')
    assert bs.original == 'hello WORLD'
    assert bs.modified == 'Hello world'
    assert bs.alignment == Alignment.identity(11)

    bs = bistr('τελικός').capitalize('el_GR')
    assert bs.original == 'τελικός'
    assert bs.modified == 'Τελικός'
    assert bs.alignment == Alignment.identity(7)

    bs = bistr('ἴΣ').capitalize('el_GR')
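    # Word-final Σ lowercases to final sigma ς.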
    assert bs.original == 'ἴΣ'
    assert bs.modified == 'Ἴς'
    assert bs.alignment == Alignment.identity(2)
Example #9
def test_infer():
    text = 'the quick, brown fox'
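    # Tokenization.infer() locates each token, in order, within text;
    # a token that can't be found raises ValueError.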
    tokens = Tokenization.infer(text, ['the', 'quick', 'brown', 'fox'])
    assert tokens.substring(1, 3) == bistr('quick, brown')

    pytest.raises(ValueError, Tokenization.infer, text,
                  ['the', 'quick', 'red', 'fox'])
Example #10
def test_swapcase():
    bs = bistr('hello WORLD').swapcase('en_US')
    assert bs.original == 'hello WORLD'
    assert bs.modified == 'HELLO world'
    assert bs.alignment == Alignment.identity(11)

    # Ligatures/digraphs in title case don't have a swapped form
    bs = bistr('Ljepòta').swapcase('hr_HR')
    assert bs.original == 'Ljepòta'
    assert bs.modified == 'LjEPÒTA'
    assert bs.alignment == Alignment.identity(6)

    bs = bistr('Ljepòta').normalize('NFKC').swapcase('hr_HR')
    assert bs.original == 'Ljepòta'
    assert bs.modified == 'lJEPÒTA'
    assert bs[0:2] == bistr('Lj', 'lJ')
Example #11
def test_concat():
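    # Mixed concatenation: plain str chunks are identity bistrs, while
    # bistr('  ', ' ') records two spaces collapsing into one.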
    bs = bistr('  ', '')
    bs += 'Hello'
    bs += bistr('  ', ' ')
    bs += 'world!'
    bs += bistr('  ', '')

    assert bs.original == '  Hello  world!  '
    assert bs.modified == 'Hello world!'

    bs = bs[4:7]
    assert bs.original == 'o  w'
    assert bs.modified == 'o w'

    bs = bs[1:2]
    assert bs.original == '  '
    assert bs.modified == ' '
Example #12
def test_lower():
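    # In en_US, dotted İ lowercases to i plus combining dot above (U+0307);
    # tr_TR maps İ -> i and dotless I -> ı instead.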
    bs = bistr('DİYARBAKIR').lower('en_US')
    assert bs.original == 'DİYARBAKIR'
    assert bs.modified == 'di̇yarbakir'

    bs = bistr('DİYARBAKIR').lower('tr_TR')
    assert bs.original == 'DİYARBAKIR'
    assert bs.modified == 'diyarbakır'

    # Odysseus
    bs = bistr('ὈΔΥΣΣΕΎΣ').lower('el_GR')
    assert bs.original == 'ὈΔΥΣΣΕΎΣ'
    assert bs.modified == 'ὀδυσσεύς'

    # Examples from The Unicode Standard, Version 12.0, Chapter 3.13
    bs = bistr('ᾼΣͅ').lower('el_GR')
    assert bs.original == 'ᾼΣͅ'
    assert bs.modified == 'ᾳςͅ'

    bs = bistr('ͅΣͅ').lower('el_GR')
    assert bs.original == 'ͅΣͅ'
    assert bs.modified == 'ͅσͅ'

    bs = bistr('ᾼΣᾼ').lower('el_GR')
    assert bs.original == 'ᾼΣᾼ'
    assert bs.modified == 'ᾳσᾳ'

    bs = bistr('Σ').lower('el_GR')
    assert bs.original == 'Σ'
    assert bs.modified == 'σ'
Example #13
def test_casefold():
    # 'Híffi'
    # í has a combining acute accent, ffi is a ligature
    bs = bistr('Hi\u0301\uFB03').casefold()
    assert bs.original == 'Hi\u0301\uFB03'
    assert bs.modified == 'hi\u0301ffi'
    assert bs.modified == bs.original.casefold()

    assert bs[:3].original == 'Hi\u0301'
    assert bs[:3].modified == 'hi\u0301'

    assert bs[4:5].original == '\uFB03'
    assert bs[4:5].modified == 'f'

    # Odysseus
    bs = bistr('Ὀδυσσεύς').casefold()
    assert bs.original == 'Ὀδυσσεύς'
    assert bs.modified == 'ὀδυσσεύσ'
Example #14
def test_readme():
    bs = bistr('𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 🦊 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 🐶')
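    # NFKD decomposes the mathematical bold Fraktur letters to plain ASCII.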
    bs = bs.normalize('NFKD')
    bs = bs.casefold()
    bs = bs.replace('🦊', 'fox')
    bs = bs.replace('🐶', 'dog')
    bs = bs.sub(r'[^\w\s]+', '')
    bs = bs[:19]
    assert bs.modified == 'the quick brown fox'
    assert bs.original == '𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 🦊'
Example #15
def test_expandtabs():
    bs = bistr(' \tHello\t\tworld!\n\tGoodbye \tworld!')
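    # With the default tabsize of 8, each tab becomes however many spaces
    # are needed to reach the next multiple-of-8 column.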
    bs = bs.expandtabs()

    assert bs.modified == bs.original.expandtabs()
    assert bs[0:1] == bistr(' ')
    assert bs[1:8] == bistr('\t', '       ')
    assert bs[8:13] == bistr('Hello')
    assert bs[13:16] == bistr('\t', '   ')
    assert bs[16:24] == bistr('\t', '        ')
    assert bs[24:30] == bistr('world!')
    assert bs[30:31] == bistr('\n')
Example #16
def test_equality():
    bs1 = bistr('  Hello world  ').strip().casefold()
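    # Each Alignment pair is (original index, modified index); the stripped
    # whitespace maps to zero-width spans at either end.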
    bs2 = bistr('  Hello world  ', 'hello world', Alignment([
        (0, 0),
        (2, 0),
        (3, 1),
        (4, 2),
        (5, 3),
        (6, 4),
        (7, 5),
        (8, 6),
        (9, 7),
        (10, 8),
        (11, 9),
        (12, 10),
        (13, 11),
        (15, 11),
    ]))
    assert bs1 == bs2
Example #17
def test_character_tokenizer():
    from bistring import CharacterTokenizer

    text = bistr(' 𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 𝖋𝖔𝖝 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 𝖉𝖔𝖌 ')

    tokenizer = CharacterTokenizer('en_US')
    assert isinstance(tokenizer, Tokenizer)

    tokens = tokenizer.tokenize(text)
    assert tokens.text == text
    assert all(token.text == text[i:i+1] for i, token in enumerate(tokens))
Example #18
def test_starts_ends_with():
    bs = bistr('Beginning, middle, ending')

    assert bs.startswith('Begin')
    assert bs.endswith('ing')

    assert not bs.startswith('ending')
    assert not bs.endswith('Beginning')

    assert bs.startswith(('Begin', 'End'))
    assert bs.endswith(('beginning', 'ending'))
Example #19
def test_sentence_tokenizer():
    from bistring import SentenceTokenizer

    text = bistr('The following sentence is true.  The preceding sentence, surprisingly, is false.')

    tokenizer = SentenceTokenizer('en_US')
    assert isinstance(tokenizer, Tokenizer)

    tokens = tokenizer.tokenize(text)
    assert tokens.text == text
    assert len(tokens) == 2
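    # The first sentence token includes its trailing spaces, so the boundary falls at index 33.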
    assert tokens[0].text == text[:33]
    assert tokens[1].text == text[33:]
Example #20
def test_find_index():
    bs = bistr('dysfunction')

    assert bs.find('dis') == -1
    assert bs.find('fun') == 3
    assert bs.find_bounds('dis') == (-1, -1)
    assert bs.find_bounds('fun') == (3, 6)

    pytest.raises(ValueError, bs.index, 'dis')
    pytest.raises(ValueError, bs.index_bounds, 'dis')

    assert bs.index('fun') == 3
    assert bs.index_bounds('fun') == (3, 6)
Example #21
def test_justify():
    bs = bistr('Hello world!')
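    # Padding has an empty original, so center/ljust/rjust never change .original.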

    assert bs.center(5) == bs
    assert bs.center(20) == bistr('', '    ') + bs + bistr('', '    ')
    assert bs.center(21) == bistr('', '    ') + bs + bistr('', '     ')

    assert bs.ljust(5) == bs
    assert bs.ljust(16) == bs + bistr('', '    ')

    assert bs.rjust(5) == bs
    assert bs.rjust(16) == bistr('', '    ') + bs
Example #22
def test_word_tokenizer():
    from bistring import WordTokenizer

    text = bistr(' 𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 𝖋𝖔𝖝 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 𝖉𝖔𝖌 ')

    tokenizer = WordTokenizer('en_US')
    assert isinstance(tokenizer, Tokenizer)

    tokens = tokenizer.tokenize(text)
    assert tokens.text == text
    assert len(tokens) == 9
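    # Punctuation and whitespace aren't word tokens: tokens 0-1 cover the first two words and stop before the comma.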
    assert tokens.text_bounds(0, 2) == (1, 10)
    assert tokens[0:2].text == text[1:10]
    assert len(tokens.slice_by_text(5, 10)) == 1
    assert len(tokens.slice_by_text(5, 11)) == 1
    assert len(tokens.slice_by_text(3, 13)) == 3
Example #23
def test_infer():
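    # bistr.infer() heuristically aligns the two strings, matching even across
    # confusable characters, diacritics, and emoji (see below).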
    bs = bistr.infer('test', 'test')
    assert bs == bistr('test', 'test', Alignment.identity(4))

    bs = bistr.infer('color', 'colour')
    assert bs[3:5].original == 'o'

    assert bs.inverse() == bistr.infer('colour', 'color')

    bs = bistr.infer(
        '🅃🄷🄴 🅀🅄🄸🄲🄺, 🄱🅁🄾🅆🄽 🦊 🄹🅄🄼🄿🅂 🄾🅅🄴🅁 🅃🄷🄴 🄻🄰🅉🅈 🐶',
        'the quick brown fox jumps over the lazy dog',
    )
    assert bs[0:3] == bistr('🅃🄷🄴', 'the', Alignment.identity(3))
    assert bs[4:9] == bistr('🅀🅄🄸🄲🄺', 'quick', Alignment.identity(5))
    assert bs[10:15] == bistr('🄱🅁🄾🅆🄽', 'brown', Alignment.identity(5))
    assert bs[16:19].original == '🦊'
    assert bs[16:19].modified == 'fox'
    assert bs[20:25] == bistr('🄹🅄🄼🄿🅂', 'jumps', Alignment.identity(5))
    assert bs[40:43].original == '🐶'
    assert bs[40:43].modified == 'dog'

    bs = bistr.infer(
        'Ṫḧë qüïċḳ, ḅṛöẅṅ 🦊 jüṁṗṡ öṿëṛ ẗḧë ḷäżÿ 🐶',
        'the quick brown fox jumps over the lazy dog',
    )
    assert bs[0:3] == bistr('Ṫḧë', 'the', Alignment.identity(3))
    assert bs[4:9] == bistr('qüïċḳ', 'quick', Alignment.identity(5))
    assert bs[10:15] == bistr('ḅṛöẅṅ', 'brown', Alignment.identity(5))
    assert bs[16:19].original == '🦊'
    assert bs[16:19].modified == 'fox'
    assert bs[20:25] == bistr('jüṁṗṡ', 'jumps', Alignment.identity(5))
    assert bs[40:43].original == '🐶'
    assert bs[40:43].modified == 'dog'

    bs = bistr.infer(
        'Z̴̡̪̫̖̥̔̿̃̈̏̎͠͝á̸̪̠̖̻̬̖̪̞͙͇̮̠͎̆͋́̐͌̒͆̓l̶͉̭̳̤̬̮̩͎̟̯̜͇̥̠̘͑͐̌͂̄́̀̂̌̈͛̊̄̚͜ģ̸̬̼̞̙͇͕͎̌̾̒̐̿̎̆̿̌̃̏̌́̾̈͘͜o̶̢̭͕͔̩͐ ̴̡̡̜̥̗͔̘̦͉̣̲͚͙̐̈́t̵͈̰̉̀͒̎̈̿̔̄̽͑͝͠ẹ̵̫̲̫̄͜͜x̵͕̳͈̝̤̭̼̼̻͓̿̌̽̂̆̀̀̍̒͐́̈̀̚͝t̸̡̨̥̺̣̟͎̝̬̘̪͔͆́̄̅̚',
        'Zalgo text')
    for i, c in enumerate(bs):
        assert bs[i:i + 1].original.startswith(c)
Example #24
def test_normalize():
    # é is composed but ö has a combining diaeresis
    bs = bistr('H\u00E9llo\u0308')

    bs = bs.normalize('NFC')
    assert bs.original == 'H\u00E9llo\u0308'
    assert bs.modified == 'H\u00E9ll\u00F6'
    assert bs.modified == unicodedata.normalize('NFC', bs.original)
    assert bs[4:5].original == 'o\u0308'
    assert bs[4:5].modified == '\u00F6'

    bs = bs.normalize('NFD')
    assert bs.original == 'H\u00E9llo\u0308'
    assert bs.modified == 'He\u0301llo\u0308'
    assert bs.modified == unicodedata.normalize('NFD', bs.original)
    assert bs[1:3].original == '\u00E9'
    assert bs[1:3].modified == 'e\u0301'
Example #25
def test_splitting_tokenizer():
    from bistring import SplittingTokenizer

    text = bistr(' 𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 𝖋𝖔𝖝 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 𝖉𝖔𝖌 ')
    text = text.normalize('NFKD')
    text = text.casefold()

    tokenizer = SplittingTokenizer(r'\s+')
    assert isinstance(tokenizer, Tokenizer)

    tokens = tokenizer.tokenize(text)
    assert tokens.text == text
    assert len(tokens) == 9
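    # Splitting on \s+ keeps punctuation attached: tokens 0-1 cover 'the quick,' including the comma.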
    assert tokens.text_bounds(0, 2) == (1, 11)
    assert tokens[0:2].text == text[1:11]
    assert len(tokens.slice_by_text(5, 10)) == 1
    assert len(tokens.slice_by_text(5, 11)) == 1
    assert len(tokens.slice_by_text(3, 13)) == 3
Example #26
def test_rfind_rindex():
    bs = bistr('dysfunction')

    assert bs.rfind('dis') == -1
    assert bs.rfind('fun') == 3
    assert bs.rfind('n') == 10
    assert bs.rfind('n', None, 9) == 5

    assert bs.rfind_bounds('dis') == (-1, -1)
    assert bs.rfind_bounds('fun') == (3, 6)
    assert bs.rfind_bounds('n') == (10, 11)
    assert bs.rfind_bounds('n', None, 9) == (5, 6)

    pytest.raises(ValueError, bs.rindex, 'dis')
    pytest.raises(ValueError, bs.rindex_bounds, 'dis')

    assert bs.rindex('fun') == 3
    assert bs.rindex_bounds('fun') == (3, 6)
    assert bs.rindex_bounds('n') == (10, 11)
    assert bs.rindex_bounds('n', None, 9) == (5, 6)
Example #27
def test_normalize():
    # "Héllö" -- é is composed but ö has a combining diaeresis
    bs = bistr('H\u00E9llo\u0308').normalize('NFC')
    assert bs.original == 'H\u00E9llo\u0308'
    assert bs.modified == 'H\u00E9ll\u00F6'
    assert bs.modified == unicodedata.normalize('NFC', bs.original)
    assert bs[1:2] == bistr('\u00E9')
    assert bs[4:5] == bistr('o\u0308', '\u00F6')

    bs = bistr('H\u00E9llo\u0308').normalize('NFD')
    assert bs.original == 'H\u00E9llo\u0308'
    assert bs.modified == 'He\u0301llo\u0308'
    assert bs.modified == unicodedata.normalize('NFD', bs.original)
    assert bs[1:3] == bistr('\u00E9', 'e\u0301')
    assert bs[5:7] == bistr('o\u0308')