def read(self) -> list:
    """Read the configured file and split it into ``TextPunctuation`` words.

    The file at ``self._file_path`` is consumed one character at a time.
    Characters found in ``spaces`` separate words; every other character
    is passed through ``to_buffer`` and the returned text and sign items
    are accumulated for the word currently being built.

    Returns:
        The list of ``TextPunctuation`` objects in reading order.
    """
    collected = []
    chars = []
    signs = []
    # One-element list handed to to_buffer on every call; presumably
    # to_buffer sets dash[0] when the character is a dash -- TODO confirm.
    dash = [False]
    seen_text = False
    with open(self._file_path, encoding=self._encoding) as source:
        # iter() with a sentinel stops as soon as read(1) returns ''.
        for sym in iter(lambda: source.read(1), ''):
            if sym not in spaces:
                seen_text = True
                dash[0] = False
                new_chars, new_signs = to_buffer(sym, dash)
                chars.extend(new_chars)
                signs.extend(new_signs)
                continue
            # Separators before any text, or with nothing buffered, are
            # simply skipped.
            if not seen_text or not chars:
                continue
            if dash[0] and sym != ' ':
                # Dash flag set and the separator is not a plain space:
                # mark the last sign as a hyphen and keep building the
                # same word.
                signs[-1] = constants.HYPHEN
                continue
            collected.append(TextPunctuation(''.join(chars), signs))
            chars = []
            signs = []
        # End of file: flush whatever is still buffered.
        if chars:
            collected.append(TextPunctuation(''.join(chars), signs))
    return collected
def test_union_with_foll_upper(self):
    """An upper-case one-letter word is merged into the following word.

    'З' followed by 'собою' is expected to come out as the single
    lower-cased ``Text('зсобою')``.
    """
    one_letter = TextPunctuation('З', [None])
    following = TextPunctuation('собою', [None] * 5)
    result = run_through_module([one_letter, following, End()])
    self.assertEqual(result, [Text('зсобою'), End()])
def get_attachment(self, word: TextPunctuation) -> dict:
    """Bundle a word's text, signs and attachment setting into a dict.

    The attachment value is looked up by the word's text in
    ``self.get_data().zero_syll_words``; a text missing from that mapping
    makes the lookup raise.

    Returns:
        A dict with the keys ``'attachment'``, ``'text'`` and ``'sign'``.
    """
    text = word.get_text()
    sign = word.get_punctuation()
    return {
        'attachment': self.get_data().zero_syll_words[text],
        'text': text,
        'sign': sign,
    }
def test_union_with_prec_upper(self):
    """An upper-case one-letter word is merged into the preceding word.

    'українського' followed by 'Ж' is expected to come out as the single
    lower-cased ``Text('українськогож')``.
    """
    preceding = TextPunctuation('українського', [None] * 12)
    one_letter = TextPunctuation('Ж', [None])
    result = run_through_module([preceding, one_letter, End()])
    self.assertEqual(result, [Text('українськогож'), End()])
def test_hypnen_at_the_end(self):
    """A word whose last sign is a hyphen is dropped; only End() remains."""
    signs = [None] * 3 + [constants.HYPHEN]
    dangling = TextPunctuation('сло-', signs)
    result = run_through_module([dangling, End()])
    self.assertEqual(result, [End()])
def test_punctuation_within(self):
    """A word with punctuation between its letters is dropped."""
    signs = [None] * 3 + [constants.PUNCT] + [None] * 2
    broken = TextPunctuation('сло!во', signs)
    result = run_through_module([broken, End()])
    self.assertEqual(result, [End()])
def test_non_alphabet(self):
    """The Latin-script word 'foreign' is dropped; only End() remains."""
    latin_word = TextPunctuation('foreign', [None] * 7)
    result = run_through_module([latin_word, End()])
    self.assertEqual(result, [End()])
def test_hyphen(self):
    """A hyphen-marked character inside a word is removed: 'сло-во' -> 'слово'."""
    signs = [None] * 3 + [constants.HYPHEN] + [None] * 2
    hyphenated = TextPunctuation('сло-во', signs)
    result = run_through_module([hyphenated, End()])
    self.assertEqual(result, [Text('слово'), End()])
def test_quotation(self):
    """Surrounding quotation marks are stripped: '«слово»' -> 'слово'."""
    signs = [constants.PUNCT] + [None] * 5 + [constants.PUNCT]
    quoted = TextPunctuation('«слово»', signs)
    result = run_through_module([quoted, End()])
    self.assertEqual(result, [Text('слово'), End()])
def test_punctuation_only(self):
    """A token consisting of a single dash yields no Text at all."""
    lone_dash = TextPunctuation('—', [constants.PUNCT])
    result = run_through_module([lone_dash, End()])
    self.assertEqual(result, [End()])
def test_capital(self):
    """A capitalised word is lower-cased: 'Київ' -> 'київ'."""
    capitalised = TextPunctuation('Київ', [None] * 4)
    result = run_through_module([capitalised, End()])
    self.assertEqual(result, [Text('київ'), End()])
def test_apostrophe_U02BC(self):
    """A word containing the apostrophe character passes through unchanged."""
    with_apostrophe = TextPunctuation('вʼю', [None] * 3)
    result = run_through_module([with_apostrophe, End()])
    self.assertEqual(result, [Text('вʼю'), End()])
def test_union_with_foll_last(self):
    """A one-letter word directly before End() is kept as its own Text."""
    last_word = TextPunctuation('з', [None])
    result = run_through_module([last_word, End()])
    self.assertEqual(result, [Text('з'), End()])
def clean(self, words: list) -> list:
    """Clean the current word and, where configured, merge it with its neighbour.

    Args:
        words: A sequence whose first item is the current word (a
            ``TextPunctuation``) and whose second item is the element that
            follows it (another ``TextPunctuation`` or some other object).

    Returns:
        A two-element list ``[cleaned, following]``:

        * ``cleaned`` is a new lower-cased ``TextPunctuation`` without
          hyphen-marked characters and without leading/trailing
          punctuation, or ``None`` when the word is rejected or has been
          merged into the following word.
        * ``following`` is the (possibly modified) following element, or
          ``None`` when it has been merged into the current word.
    """
    curr, foll = words[0], words[1]
    text = curr.get_text()

    # Multi-character tokens written entirely in upper case are rejected.
    if len(text) != 1 and text.isupper():
        return [None, foll]

    signs = curr.get_punctuation()
    buffer_text, buffer_signs = [], []
    for i, sym in enumerate(text):
        sign = signs[i]

        # A dot followed by a word that is not title-cased: reject.
        if (sym == '.' and isinstance(foll, TextPunctuation)
                and not foll.get_text().istitle()):
            return [None, foll]

        # Punctuation before the first kept letter.
        if not buffer_text and sign == constants.PUNCT:
            if sym in punctuation_to_erase:
                continue
            if sym in dashes:
                return [None, foll]

        if sign == constants.HYPHEN:
            # A hyphen mark on the very last character rejects the word;
            # anywhere else the marked character is simply skipped.
            if len(text) == i + 1:
                return [None, foll]
            continue

        if sign == constants.PUNCT and sym not in hyphen_dashes:
            # From here on only punctuation may follow; anything else
            # means punctuation sits inside the word.
            only_punct_left = all(
                signs[j] == constants.PUNCT
                for j in range(i + 1, len(text))
            )
            if not only_punct_left:
                return [None, foll]
            return [
                TextPunctuation(''.join(buffer_text), buffer_signs), foll
            ]

        sym_low = sym.lower()
        if sym_low not in self.get_data().letters:
            return [None, foll]
        buffer_text.append(sym_low)
        buffer_signs.append(sign)

    curr = TextPunctuation(''.join(buffer_text), buffer_signs)
    if isinstance(foll, TextPunctuation) and len(foll.get_text()) == 1:
        foll_low = TextPunctuation(foll.get_text().lower(),
                                   foll.get_punctuation())
        if self.is_zero_syll(foll_low.get_text()):
            buffer_attach = self.get_attachment(foll_low)
            if buffer_attach['attachment'] == 'to_preceding':
                buffer_text.append(foll_low.get_text())
                # extend, not append: get_punctuation() returns a list and
                # the sign buffer must stay flat (one entry per character),
                # as in the 'to_following' branch below.
                buffer_signs.extend(foll_low.get_punctuation())
                foll = None
    elif (isinstance(foll, TextPunctuation) and len(curr.get_text()) == 1
          and self.is_zero_syll(curr.get_text())):
        buffer_attach = self.get_attachment(curr)
        if buffer_attach['attachment'] == 'to_following':
            # Prepend the current one-letter word to the following one.
            foll.set_text(curr.get_text() + foll.get_text())
            foll.set_punctuation(curr.get_punctuation()
                                 + foll.get_punctuation())
            return [None, foll]
    return [TextPunctuation(''.join(buffer_text), buffer_signs), foll]
# Module-level fixture for exercising ReadModule on the Belarusian sample file.
#
# * file_path / encoding / data: the input text, its encoding ('utf-8-sig')
#   and the ConfigData built from conf_be_cyr.json.
# * pipe_out: a Pipe wrapping a queue.Queue and a threading.Condition; it is
#   the only pipe handed to ReadModule.
# * expected_result: the TextPunctuation items, terminated by End(), that the
#   output is presumably compared against. In every entry below the sign list
#   has one item per character of the text: None, constants.PUNCT or
#   constants.HYPHEN.
# * get_from_module(): runs the module, then repeatedly takes items from
#   pipe_out (waiting on it when it is empty) and appends them to `result`.
#   NOTE(review): as visible here the loop has no break and the function has
#   no return -- the definition looks truncated; confirm against the full file.
import constants from config_data import ConfigData from end import End from pipe import * from read_module import ReadModule from word import TextPunctuation file_path = '../test_files/belarusian/test_belarusian.txt' encoding = 'utf-8-sig' data = ConfigData('../../../py_scripts/configs/conf_be_cyr.json') pipe_out = Pipe(queue.Queue(), threading.Condition()) module = ReadModule([pipe_out], file_path, encoding, data) expected_result = [TextPunctuation('У', [None]), TextPunctuation('беларускай', [None, None, None, None, None, None, None, None, None, None]), TextPunctuation('мове', [None, None, None, None]), TextPunctuation('зычныя', [None, None, None, None, None, None]), TextPunctuation('могуць', [None, None, None, None, None, None]), TextPunctuation('адрознівацца', [None, None, None, None, None, None, None, None, None, None, None, None]), TextPunctuation('даўжынёй', [None, None, None, None, None, None, None, None]), TextPunctuation('гучання,', [None, None, None, None, None, None, None, constants.PUNCT]), TextPunctuation('якая', [None, None, None, None]), TextPunctuation('пака-звае', [None, None, None, None, constants.HYPHEN, None, None, None, None]), TextPunctuation('на', [None, None]), TextPunctuation('стык', [None, None, None, None]), TextPunctuation('марфем...', [None, None, None, None, None, None, constants.PUNCT, constants.PUNCT, constants.PUNCT]), TextPunctuation('Пераважная', [None, None, None, None, None, None, None, None, None, None]), TextPunctuation('‚колькасць‘', [constants.PUNCT, None, None, None, None, None, None, None, None, None, constants.PUNCT]), TextPunctuation('гукаў', [None, None, None, None, None]), TextPunctuation('утвараюцца', [None, None, None, None, None, None, None, None, None, None]), TextPunctuation('ў', [None]), TextPunctuation('цэнтры', [None, None, None, None, None, None]), TextPunctuation('ротавай', [None, None, None, None, None, None, None]), TextPunctuation('поласці', [None, None, None, None, None, None, None]), 
TextPunctuation('пры', [None, None, None]), TextPunctuation('высокім', [None, None, None, None, None, None, None]), TextPunctuation('агульным', [None, None, None, None, None, None, None, None]), TextPunctuation('пад’ёме', [None, None, None, None, None, None, None]), TextPunctuation('языка.', [None, None, None, None, None, constants.PUNCT]), TextPunctuation('Вялікае', [None, None, None, None, None, None, None]), TextPunctuation('Ducatus', [None, None, None, None, None, None, None]), TextPunctuation('Lithuaniae', [None, None, None, None, None, None, None, None, None, None]), TextPunctuation('знаходзілася', [None, None, None, None, None, None, None, None, None, None, None, None]), TextPunctuation('ў', [None]), TextPunctuation('дынастычнай', [None, None, None, None, None, None, None, None, None, None, None]), TextPunctuation('уніі', [None, None, None, None]), TextPunctuation('—', [constants.PUNCT]), TextPunctuation('з', [None]), TextPunctuation('Польскім', [None, None, None, None, None, None, None, None]), TextPunctuation('кара-леўствам!', [None, None, None, None, constants.HYPHEN, None, None, None, None, None, None, None, None, constants.PUNCT]), End()] def get_from_module(): module.run() result = [] while True: pipe_out.acquire() if pipe_out.empty(): pipe_out.wait() cleaned_word = pipe_out.get() result.append(cleaned_word) pipe_out.release()
def is_title(self, word: TextPunctuation) -> bool:
    """Tell whether *word* counts as title-cased.

    A word qualifies when its text satisfies ``str.istitle()`` or when its
    first character is listed in ``first_letter_exceptions``.

    Args:
        word: The word whose text is inspected.

    Returns:
        ``True`` when the word qualifies, ``False`` otherwise. A word with
        empty text never qualifies.
    """
    text = word.get_text()
    if not text:
        # ''.istitle() is False and text[0] would raise IndexError, so an
        # empty word is answered here instead of crashing.
        return False
    return text.istitle() or text[0] in first_letter_exceptions