def test_tokenization():
    """Slicing a Tokenization narrows both the token list and its text."""
    txt = bistr('The quick, brown fox jumps over the lazy dog')
    txt = txt.replace(',', '')

    # Token boundaries in the modified (comma-free) text.
    spans = [
        (0, 3), (4, 9), (10, 15), (16, 19), (20, 25),
        (26, 30), (31, 34), (35, 39), (40, 43),
    ]
    toks = Tokenization(txt, [Token.slice(txt, i, j) for i, j in spans])
    toks = toks[1:-1]

    assert toks.text.original == 'quick, brown fox jumps over the lazy'
    assert toks.text.modified == 'quick brown fox jumps over the lazy'

    assert toks.text_bounds(1, 3) == (6, 15)
    assert toks.original_bounds(1, 3) == (7, 16)

    assert toks.bounds_for_text(8, 14) == (1, 3)
    assert toks.bounds_for_original(9, 15) == (1, 3)

    assert toks.slice_by_text(8, 14).text == bistr('brown fox')
    assert toks.slice_by_original(9, 15).text == bistr('brown fox')

    assert toks.snap_text_bounds(8, 14) == (6, 15)
    assert toks.snap_original_bounds(9, 15) == (7, 16)
def test_alternative_regex():
    """bistr.sub() accepts patterns compiled with the third-party `regex` module."""
    import regex

    s = bistr('The quick, brown fox jumps over the lazy dog'.replace('fox', '🦊').replace('dog', '🐶'))
    # \pS (symbol category) is a regex-module feature not supported by stdlib re.
    s = bistr('The quick, brown 🦊 jumps over the lazy 🐶')
    s = s.sub(regex.compile(r'\pS'), lambda m: unicodedata.name(m.group()))

    assert s[17:25] == bistr('🦊', 'FOX FACE')
    assert s[46:] == bistr('🐶', 'DOG FACE')
def test_tokenization():
    """Coordinate mapping between tokenized modified text and the original.

    The original string carries two spaces of padding on each side (the
    collapsed single-space literal contradicted the asserted original
    offsets, e.g. original_bounds(1, 3) == (6, 18) needs 'q' at index 6).
    """
    text = bistr('  The quick, brown fox jumps over the lazy dog  ')
    text = text.replace(',', '')
    text = text.sub(r'^ +| +$', '')

    tokens = Tokenization(text, [
        Token.slice(text, 0, 3),    # The
        Token.slice(text, 4, 9),    # quick
        Token.slice(text, 10, 15),  # brown
        Token.slice(text, 16, 19),  # fox
        Token.slice(text, 20, 25),  # jumps
        Token.slice(text, 26, 30),  # over
        Token.slice(text, 31, 34),  # the
        Token.slice(text, 35, 39),  # lazy
        Token.slice(text, 40, 43),  # dog
    ])
    assert tokens.text == text

    assert tokens.text_bounds(1, 3) == (4, 15)
    # Original offsets are shifted by the stripped padding and the comma.
    assert tokens.original_bounds(1, 3) == (6, 18)

    assert tokens.bounds_for_text(0, 13) == (0, 3)
    assert tokens.bounds_for_original(0, 13) == (0, 2)

    assert tokens.slice_by_text(34, 43).substring() == bistr('lazy dog')
    assert tokens.slice_by_original(36, 48).substring() == bistr('the lazy dog')

    assert tokens.snap_text_bounds(2, 13) == (0, 15)
    assert tokens.snap_original_bounds(36, 47) == (34, 46)
def test_title():
    """title() is locale-sensitive: Turkish capitalizes i as dotted İ."""
    for locale, expected in [('en_US', 'Istanbul'), ('tr_TR', 'İstanbul')]:
        s = bistr('istanbul').title(locale)
        assert s.original == 'istanbul'
        assert s.modified == expected
def test_lower():
    """lower() is locale-sensitive: Turkish maps İ to a plain dotless-free i."""
    for locale, expected in [('en_US', 'di̇yarbakir'), ('tr_TR', 'diyarbakır')]:
        s = bistr('DİYARBAKIR').lower(locale)
        assert s.original == 'DİYARBAKIR'
        assert s.modified == expected
def test_append():
    """BistrBuilder.append() composes the alignments of bistr pieces."""
    b = BistrBuilder('hello WORLD')
    b.append(bistr(b.peek(5)).upper('en_US'))
    b.skip(1)  # the space passes through unchanged
    b.append(bistr(b.peek(5)).lower('en_US'))

    result = b.build()
    assert result[1:4] == bistr('ell', 'ELL', Alignment.identity(3))
    assert result[7:10] == bistr('ORL', 'orl', Alignment.identity(3))
def test_upper():
    """upper() tracks one-to-many expansions and locale tailoring."""
    s = bistr('straße').upper('de_DE')
    assert s.original == 'straße'
    assert s.modified == 'STRASSE'
    # ß expands to two characters, and the alignment records it.
    assert s[4:6].original == 'ß'
    assert s[4:6].modified == 'SS'

    s = bistr('Diyarbakır').upper('tr_TR')
    assert s.original == 'Diyarbakır'
    assert s.modified == 'DİYARBAKIR'
def test_strip():
    """strip() trims only the modified string; the original is preserved."""
    s = bistr(' Hello world! ')
    assert s.original == ' Hello world! '
    assert s.modified == ' Hello world! '

    stripped = s.strip()
    assert stripped.original == ' Hello world! '
    assert stripped.modified == 'Hello world!'

    # Stripping an all-whitespace string leaves an empty modified side.
    blank = bistr(' ').strip()
    assert blank.modified == ''
    assert blank.original == ' '
def test_upper():
    """upper() handles expansions, locale tailoring, and Greek accents."""
    s = bistr('straße').upper('de_DE')
    assert s.original == 'straße'
    assert s.modified == 'STRASSE'
    # ß expands to two characters.
    assert s[4:6].original == 'ß'
    assert s[4:6].modified == 'SS'

    s = bistr('Diyarbakır').upper('tr_TR')
    assert s.original == 'Diyarbakır'
    assert s.modified == 'DİYARBAKIR'

    # Odysseus
    s = bistr('Ὀδυσσεύς').upper('und')
    assert s.original == 'Ὀδυσσεύς'
    assert s.modified == 'ὈΔΥΣΣΕΎΣ'
def test_capitalize():
    """capitalize() keeps an identity alignment when lengths are unchanged."""
    cases = [
        ('hello WORLD', 'en_US', 'Hello world'),
        ('τελικός', 'el_GR', 'Τελικός'),
        ('ἴΣ', 'el_GR', 'Ἴς'),  # final sigma in lowercased tail
    ]
    for original, locale, expected in cases:
        s = bistr(original).capitalize(locale)
        assert s.original == original
        assert s.modified == expected
        assert s.alignment == Alignment.identity(len(original))
def test_infer():
    """Tokenization.infer() locates tokens in text, or raises on a mismatch."""
    text = 'the quick, brown fox'
    toks = Tokenization.infer(text, ['the', 'quick', 'brown', 'fox'])
    assert toks.substring(1, 3) == bistr('quick, brown')

    with pytest.raises(ValueError):
        Tokenization.infer(text, ['the', 'quick', 'red', 'fox'])
def test_swapcase():
    """swapcase() keeps identity alignment, modulo titlecase digraph quirks."""
    s = bistr('hello WORLD').swapcase('en_US')
    assert s.original == 'hello WORLD'
    assert s.modified == 'HELLO world'
    assert s.alignment == Alignment.identity(11)

    # Ligatures/digraphs in title case don't have a swapped form
    s = bistr('Ljepòta').swapcase('hr_HR')
    assert s.original == 'Ljepòta'
    assert s.modified == 'LjEPÒTA'
    assert s.alignment == Alignment.identity(6)

    # After NFKC the digraph decomposes into two letters, which do swap.
    s = bistr('Ljepòta').normalize('NFKC').swapcase('hr_HR')
    assert s.original == 'Ljepòta'
    assert s.modified == 'lJEPÒTA'
    assert s[0:2] == bistr('Lj', 'lJ')
def test_concat():
    """In-place concatenation stitches the pieces' alignments together."""
    s = bistr(' ', '')
    s += 'Hello'
    s += bistr(' ', ' ')
    s += 'world!'
    s += bistr(' ', '')

    assert s.original == ' Hello world! '
    assert s.modified == 'Hello world!'

    # Slicing the concatenation maps back through the combined alignment.
    s = s[4:7]
    assert s.original == 'o w'
    assert s.modified == 'o w'

    s = s[1:2]
    assert s.original == ' '
    assert s.modified == ' '
def test_infer():
    """bistr.infer() aligns similar strings; inverse() flips the mapping."""
    s = bistr.infer('test', 'test')
    assert s == bistr('test', 'test', Alignment.identity(4))

    s = bistr.infer('color', 'colour')
    # The inserted 'u' is attached to the preceding 'o'.
    assert s[3:5].original == 'o'
    assert s.inverse() == bistr.infer('colour', 'color')
def test_lower():
    """lower() handles locale tailoring and Greek final-sigma rules."""
    s = bistr('DİYARBAKIR').lower('en_US')
    assert s.original == 'DİYARBAKIR'
    assert s.modified == 'di̇yarbakir'

    s = bistr('DİYARBAKIR').lower('tr_TR')
    assert s.original == 'DİYARBAKIR'
    assert s.modified == 'diyarbakır'

    # Odysseus
    s = bistr('ὈΔΥΣΣΕΎΣ').lower('el_GR')
    assert s.original == 'ὈΔΥΣΣΕΎΣ'
    assert s.modified == 'ὀδυσσεύς'

    # Examples from The Unicode Standard, Version 12.0, Chapter 3.13
    for upper, lower in [('ᾼΣͅ', 'ᾳςͅ'), ('ͅΣͅ', 'ͅσͅ'), ('ᾼΣᾼ', 'ᾳσᾳ'), ('Σ', 'σ')]:
        s = bistr(upper).lower('el_GR')
        assert s.original == upper
        assert s.modified == lower
def test_casefold():
    """casefold() expands ligatures and final sigma while keeping alignment."""
    # 'Híffi': í uses a combining acute accent, ffi is a single ligature char
    s = bistr('Hi\u0301\uFB03').casefold()
    assert s.original == 'Hi\u0301\uFB03'
    assert s.modified == 'hi\u0301ffi'
    assert s.modified == s.original.casefold()

    assert s[:3].original == 'Hi\u0301'
    assert s[:3].modified == 'hi\u0301'

    # Each letter of the expanded ligature maps back to the one source char.
    assert s[4:5].original == '\uFB03'
    assert s[4:5].modified == 'f'

    # Odysseus
    s = bistr('Ὀδυσσεύς').casefold()
    assert s.original == 'Ὀδυσσεύς'
    assert s.modified == 'ὀδυσσεύσ'
def test_readme():
    """The transformation pipeline example from the README."""
    s = bistr('𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 🦊 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 🐶')
    s = s.normalize('NFKD').casefold()
    s = s.replace('🦊', 'fox').replace('🐶', 'dog')
    s = s.sub(r'[^\w\s]+', '')
    s = s[:19]

    # Every edit is tracked, so the slice maps back to the styled original.
    assert s.modified == 'the quick brown fox'
    assert s.original == '𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 🦊'
def test_expandtabs():
    """expandtabs() aligns each tab with the run of spaces it becomes.

    The space-run literals had been collapsed to single spaces, contradicting
    the asserted slice widths; with the default tab size of 8 the tab at
    column 1 expands to 7 spaces, at column 13 to 3, and at column 16 to 8.
    """
    s = bistr(' \tHello\t\tworld!\n\tGoodbye \tworld!')
    s = s.expandtabs()
    assert s.modified == s.original.expandtabs()

    assert s[0:1] == bistr(' ')
    assert s[1:8] == bistr('\t', ' ' * 7)
    assert s[8:13] == bistr('Hello')
    assert s[13:16] == bistr('\t', ' ' * 3)
    assert s[16:24] == bistr('\t', ' ' * 8)
    assert s[24:30] == bistr('world!')
    assert s[30:31] == bistr('\n')
def test_equality():
    """Equality compares original, modified, and alignment together.

    The original string has two spaces of padding on each side (length 15) —
    required by the alignment pairs (2, 0) and (15, 11), which collapse the
    stripped whitespace runs [0, 2) and [13, 15) to empty modified ranges.
    The collapsed single-space literal (length 13) contradicted those pairs.
    """
    bs1 = bistr('  Hello world  ').strip().casefold()
    bs2 = bistr('  Hello world  ', 'hello world', Alignment([
        (0, 0), (2, 0),
        (3, 1), (4, 2), (5, 3), (6, 4), (7, 5), (8, 6),
        (9, 7), (10, 8), (11, 9), (12, 10), (13, 11),
        (15, 11),
    ]))
    assert bs1 == bs2
def test_character_tokenizer():
    """CharacterTokenizer yields exactly one token per character, in order."""
    from bistring import CharacterTokenizer

    text = bistr(' 𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 𝖋𝖔𝖝 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 𝖉𝖔𝖌 ')
    tok = CharacterTokenizer('en_US')
    assert isinstance(tok, Tokenizer)

    toks = tok.tokenize(text)
    assert toks.text == text
    for i, token in enumerate(toks):
        assert token.text == text[i:i + 1]
def test_starts_ends_with():
    """startswith()/endswith() accept single strings and tuples of options."""
    s = bistr('Beginning, middle, ending')

    assert s.startswith('Begin')
    assert not s.startswith('ending')
    assert s.startswith(('Begin', 'End'))

    assert s.endswith('ing')
    assert not s.endswith('Beginning')
    assert s.endswith(('beginning', 'ending'))
def test_sentence_tokenizer():
    """SentenceTokenizer splits on sentence boundaries, whitespace included.

    There are two spaces after the first period: '.' sits at index 30, so the
    second sentence starts at index 33, matching the [:33]/[33:] asserts.
    With the (collapsed) single space, 'The' would start at 32 and the
    asserts would split mid-sentence.
    """
    from bistring import SentenceTokenizer

    text = bistr('The following sentence is true.  The preceeding sentence, surprisingly, is false.')
    tokenizer = SentenceTokenizer('en_US')
    assert isinstance(tokenizer, Tokenizer)

    tokens = tokenizer.tokenize(text)
    assert tokens.text == text
    assert len(tokens) == 2
    # Trailing whitespace attaches to the sentence it follows.
    assert tokens[0].text == text[:33]
    assert tokens[1].text == text[33:]
def test_find_index():
    """find() returns -1 on a miss; index() raises ValueError instead."""
    s = bistr('dysfunction')

    # Misses
    assert s.find('dis') == -1
    assert s.find_bounds('dis') == (-1, -1)
    pytest.raises(ValueError, s.index, 'dis')
    pytest.raises(ValueError, s.index_bounds, 'dis')

    # Hits
    assert s.find('fun') == 3
    assert s.find_bounds('fun') == (3, 6)
    assert s.index('fun') == 3
    assert s.index_bounds('fun') == (3, 6)
def test_justify():
    """center/ljust/rjust pad the modified string with empty-original spaces.

    The padding literals had been whitespace-collapsed; widths restored from
    str semantics for a 12-char string: center(20) adds 4+4, center(21) adds
    5+4 (CPython puts the odd column on the left when width and margin are
    both odd), ljust(16)/rjust(16) add 4.
    """
    bs = bistr('Hello world!')  # 12 characters

    assert bs.center(5) == bs  # narrower than the string: unchanged
    assert bs.center(20) == bistr('', ' ' * 4) + bs + bistr('', ' ' * 4)
    assert bs.center(21) == bistr('', ' ' * 5) + bs + bistr('', ' ' * 4)

    assert bs.ljust(5) == bs
    assert bs.ljust(16) == bs + bistr('', ' ' * 4)

    assert bs.rjust(5) == bs
    assert bs.rjust(16) == bistr('', ' ' * 4) + bs
def test_word_tokenizer():
    """WordTokenizer finds word tokens, skipping whitespace and punctuation."""
    from bistring import WordTokenizer

    text = bistr(' 𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 𝖋𝖔𝖝 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 𝖉𝖔𝖌 ')
    tok = WordTokenizer('en_US')
    assert isinstance(tok, Tokenizer)

    toks = tok.tokenize(text)
    assert toks.text == text
    assert len(toks) == 9

    # The comma is not part of any token.
    assert toks.text_bounds(0, 2) == (1, 10)
    assert toks[0:2].text == text[1:10]

    assert len(toks.slice_by_text(5, 10)) == 1
    assert len(toks.slice_by_text(5, 11)) == 1
    assert len(toks.slice_by_text(3, 13)) == 3
def test_infer():
    """bistr.infer() recovers a character alignment between related strings."""
    bs = bistr.infer('test', 'test')
    assert bs == bistr('test', 'test', Alignment.identity(4))

    bs = bistr.infer('color', 'colour')
    # The inserted 'u' stays attached to the surrounding letters.
    assert bs[3:5].original == 'o'
    assert bs.inverse() == bistr.infer('colour', 'color')

    # Squared-letter emoji align letter-for-letter; 🦊/🐶 align to whole words.
    bs = bistr.infer(
        '🅃🄷🄴 🅀🅄🄸🄲🄺, 🄱🅁🄾🅆🄽 🦊 🄹🅄🄼🄿🅂 🄾🅅🄴🅁 🅃🄷🄴 🄻🄰🅉🅈 🐶',
        'the quick brown fox jumps over the lazy dog',
    )
    assert bs[0:3] == bistr('🅃🄷🄴', 'the', Alignment.identity(3))
    assert bs[4:9] == bistr('🅀🅄🄸🄲🄺', 'quick', Alignment.identity(5))
    assert bs[10:15] == bistr('🄱🅁🄾🅆🄽', 'brown', Alignment.identity(5))
    assert bs[16:19].original == '🦊'
    assert bs[16:19].modified == 'fox'
    assert bs[20:25] == bistr('🄹🅄🄼🄿🅂', 'jumps', Alignment.identity(5))
    assert bs[40:43].original == '🐶'
    assert bs[40:43].modified == 'dog'

    # Accented Latin letters pair up with their plain ASCII counterparts.
    bs = bistr.infer(
        'Ṫḧë qüïċḳ, ḅṛöẅṅ 🦊 jüṁṗṡ öṿëṛ ẗḧë ḷäżÿ 🐶',
        'the quick brown fox jumps over the lazy dog',
    )
    assert bs[0:3] == bistr('Ṫḧë', 'the', Alignment.identity(3))
    assert bs[4:9] == bistr('qüïċḳ', 'quick', Alignment.identity(5))
    assert bs[10:15] == bistr('ḅṛöẅṅ', 'brown', Alignment.identity(5))
    assert bs[16:19].original == '🦊'
    assert bs[16:19].modified == 'fox'
    assert bs[20:25] == bistr('jüṁṗṡ', 'jumps', Alignment.identity(5))
    assert bs[40:43].original == '🐶'
    assert bs[40:43].modified == 'dog'

    # Zalgo text: each base letter keeps its cloud of combining marks, so
    # every modified character starts the original cluster it maps to.
    bs = bistr.infer(
        'Z̴̡̪̫̖̥̔̿̃̈̏̎͠͝á̸̪̠̖̻̬̖̪̞͙͇̮̠͎̆͋́̐͌̒͆̓l̶͉̭̳̤̬̮̩͎̟̯̜͇̥̠̘͑͐̌͂̄́̀̂̌̈͛̊̄̚͜ģ̸̬̼̞̙͇͕͎̌̾̒̐̿̎̆̿̌̃̏̌́̾̈͘͜o̶̢̭͕͔̩͐ ̴̡̡̜̥̗͔̘̦͉̣̲͚͙̐̈́t̵͈̰̉̀͒̎̈̿̔̄̽͑͝͠ẹ̵̫̲̫̄͜͜x̵͕̳͈̝̤̭̼̼̻͓̿̌̽̂̆̀̀̍̒͐́̈̀̚͝t̸̡̨̥̺̣̟͎̝̬̘̪͔͆́̄̅̚',
        'Zalgo text')
    for i, c in enumerate(bs):
        assert bs[i:i + 1].original.startswith(c)
def test_normalize():
    """normalize() aligns composed and decomposed forms against the original."""
    # é is composed but ö has a combining diaeresis
    nfc = bistr('H\u00E9llo\u0308').normalize('NFC')
    assert nfc.original == 'H\u00E9llo\u0308'
    assert nfc.modified == 'H\u00E9ll\u00F6'
    assert nfc.modified == unicodedata.normalize('NFC', nfc.original)
    assert nfc[4:5].original == 'o\u0308'
    assert nfc[4:5].modified == '\u00F6'

    # Re-normalizing the NFC result still maps back to the same original.
    nfd = nfc.normalize('NFD')
    assert nfd.original == 'H\u00E9llo\u0308'
    assert nfd.modified == 'He\u0301llo\u0308'
    assert nfd.modified == unicodedata.normalize('NFD', nfd.original)
    assert nfd[1:3].original == '\u00E9'
    assert nfd[1:3].modified == 'e\u0301'
def test_splitting_tokenizer():
    """SplittingTokenizer tokenizes by a regex matching the separators."""
    from bistring import SplittingTokenizer

    text = bistr(' 𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 𝖋𝖔𝖝 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 𝖉𝖔𝖌 ')
    text = text.normalize('NFKD').casefold()

    tok = SplittingTokenizer(r'\s+')
    assert isinstance(tok, Tokenizer)

    toks = tok.tokenize(text)
    assert toks.text == text
    assert len(toks) == 9

    # Unlike WordTokenizer, the comma stays attached to 'quick,'.
    assert toks.text_bounds(0, 2) == (1, 11)
    assert toks[0:2].text == text[1:11]

    assert len(toks.slice_by_text(5, 10)) == 1
    assert len(toks.slice_by_text(5, 11)) == 1
    assert len(toks.slice_by_text(3, 13)) == 3
def test_rfind_rindex():
    """rfind() returns -1 on a miss; rindex() raises; both search from the right.

    Fixed a copy-paste from test_find_index: the miss cases called bs.index /
    bs.index_bounds, so rindex's error path was never exercised.
    """
    s = bistr('dysfunction')

    assert s.rfind('dis') == -1
    assert s.rfind('fun') == 3
    assert s.rfind('n') == 10
    assert s.rfind('n', None, 9) == 5  # end bound excludes the final 'n'

    assert s.rfind_bounds('dis') == (-1, -1)
    assert s.rfind_bounds('fun') == (3, 6)
    assert s.rfind_bounds('n') == (10, 11)
    assert s.rfind_bounds('n', None, 9) == (5, 6)

    pytest.raises(ValueError, s.rindex, 'dis')
    pytest.raises(ValueError, s.rindex_bounds, 'dis')

    assert s.rindex('fun') == 3
    assert s.rindex_bounds('fun') == (3, 6)
    assert s.rindex_bounds('n') == (10, 11)
    assert s.rindex_bounds('n', None, 9) == (5, 6)
def test_normalize():
    """NFC and NFD both keep the alignment back to the un-normalized original."""
    # "Héllö" -- é is composed but ö has a combining diaeresis
    nfc = bistr('H\u00E9llo\u0308').normalize('NFC')
    assert nfc.original == 'H\u00E9llo\u0308'
    assert nfc.modified == 'H\u00E9ll\u00F6'
    assert nfc.modified == unicodedata.normalize('NFC', nfc.original)
    assert nfc[1:2] == bistr('\u00E9')
    assert nfc[4:5] == bistr('o\u0308', '\u00F6')

    nfd = bistr('H\u00E9llo\u0308').normalize('NFD')
    assert nfd.original == 'H\u00E9llo\u0308'
    assert nfd.modified == 'He\u0301llo\u0308'
    assert nfd.modified == unicodedata.normalize('NFD', nfd.original)
    assert nfd[1:3] == bistr('\u00E9', 'e\u0301')
    assert nfd[5:7] == bistr('o\u0308')