def test_slice_start_from_space(self):
    """A full-range slice of a TextMap built from space-prefixed text must be
    an exact copy: same length, same spans, same text, same char→token mapping."""
    pad = 20
    txt = ' ' * pad + '''основании Устава, с одной стороны, и Фонд «Благо»'''
    tm = TextMap(txt)

    print(tm.map[0])
    print(tm.tokens[11])
    print(tm.map[11])
    print(f'[{tm.text}]')
    print(len(tm))

    # Slice the entire token range — nothing should change.
    sliced = tm.slice(slice(0, len(tm)))
    print('span-0')
    print(tm.map[0])
    print(sliced.map[0])

    self.assertEqual(len(tm), len(sliced))
    self.assertEqual(tm.map[0], sliced.map[0])

    # Every char position inside the first token must resolve to token 0
    # in both the original and the sliced map.
    for ch in range(len(tm.tokens[0])):
        print(ch)
        self.assertEqual(0, tm.token_index_by_char(ch))
        self.assertEqual(0, sliced.token_index_by_char(ch))

    self.assertEqual(tm.text, sliced.text)
    self.assertEqual(0, tm.token_index_by_char(0))
def test_token_indices_by_char_range_sliced(self):
    """Char-range → token-index lookups must work on a sliced TextMap
    using slice-local (re-based) character coordinates."""
    text = 'm йe qwert'
    base = TextMap(text)  # tokenization happens here
    tm = base.slice(slice(1, 2))  # keep only the middle token

    self.assertEqual('йe', tm.tokens[0])
    self.assertEqual('йe', tm.text)

    # Round-trip: token range -> char range -> token range.
    span = tm.char_range([0, 1])
    indices = tm.token_indices_by_char_range(span)
    self.assertEqual(0, indices[0])
    self.assertEqual(1, indices[1])

    # A partial char range inside the token still maps to token 0.
    indices = tm.token_indices_by_char_range([1, 2])
    self.assertEqual(0, indices[0])
def test_concat_then_slice(self):
    """Concatenating TextMaps with += must yield correct combined text,
    token ranges, and slices across the concatenation boundary."""
    text1 = 'этилен мама'
    text2 = 'этилен папа'

    combined = TextMap('')  # start from an empty map
    first = TextMap(text1)
    second = TextMap(text2)
    combined += first
    combined += second

    print(first.tokens)

    self.assertEqual(text1 + text2, combined.text)
    # Tokens 1..2 straddle the join point between the two source texts.
    self.assertEqual('мамаэтилен', combined.text_range([1, 3]))

    middle = combined.slice(slice(1, 3))
    self.assertEqual('мамаэтилен', middle.text)
def test_slice(self):
    """Slicing a TextMap (and re-slicing the result) must preserve token
    text, char→token lookup, and text_range behavior, including out-of-range
    upper bounds being clamped."""
    text = 'этилен мама ಶ್ರೀರಾಮ'
    tm = TextMap(text)

    sub: TextMap = tm.slice(slice(1, 2))
    self.assertEqual(sub[0], 'мама')
    self.assertEqual(sub.text, 'мама')

    # Slicing a slice must behave the same way.
    sub2 = sub.slice(slice(0, 1))
    self.assertEqual(sub2[0], 'мама')

    # Char index 1 resolves to token 0 at every slicing depth.
    self.assertEqual(0, tm.token_index_by_char(1))
    self.assertEqual(0, sub.token_index_by_char(1))
    self.assertEqual(0, sub2.token_index_by_char(1))

    self.assertEqual('мама', sub2.text)
    self.assertEqual('мама', sub2.text_range([0, 1]))
    # Upper bound past the end is tolerated.
    self.assertEqual('мама', sub2.text_range([0, 2]))