Esempio n. 1
0
    def read(self) -> list:
        """Split the input file into ``TextPunctuation`` words.

        The file at ``self._file_path`` is opened with ``self._encoding`` and
        consumed one character at a time.  Characters found in ``spaces``
        terminate the current word; every other character is passed through
        ``to_buffer`` and the text/sign pieces it returns are accumulated.

        Special cases:
            * whitespace before the first non-space character is skipped;
            * when the ``dash`` flag is set and the separator is not a plain
              ``' '``, the last collected sign is replaced by
              ``constants.HYPHEN`` and the word keeps growing instead of
              being emitted.

        Returns:
            The list of ``TextPunctuation`` objects in reading order.
        """
        words = []
        chars, signs = [], []
        # One-element list so the flag can be changed in place by
        # ``to_buffer`` (presumably set when ``sym`` is a dash -- confirm).
        dash = [False]
        seen_text = False

        with open(self._file_path, encoding=self._encoding) as source:
            # ``read(1)`` yields '' at end of file, which ends the iteration.
            for sym in iter(lambda: source.read(1), ''):
                if sym not in spaces:
                    seen_text = True
                    dash[0] = False
                    new_chars, new_signs = to_buffer(sym, dash)
                    chars.extend(new_chars)
                    signs.extend(new_signs)
                    continue

                # Separator before any text was seen: nothing to close.
                if not seen_text:
                    continue

                if chars:
                    if dash[0] and sym != ' ':
                        # Word is broken across a non-space separator right
                        # after a dash: mark it and keep collecting.
                        signs[-1] = constants.HYPHEN
                        continue
                    words.append(TextPunctuation(''.join(chars), signs))

                # Start fresh buffers; emitted words keep the old lists.
                chars, signs = [], []

            # Flush a word that runs up to the end of the file.
            if chars:
                words.append(TextPunctuation(''.join(chars), signs))

        return words
Esempio n. 2
0
    def test_union_with_foll_upper(self):
        """Upper-case 'З' followed by 'собою' must yield the single text 'зсобою'."""
        actual = run_through_module([
            TextPunctuation('З', [None]),
            TextPunctuation('собою', [None] * 5),
            End(),
        ])
        expected = [Text('зсобою'), End()]

        self.assertEqual(actual, expected)
Esempio n. 3
0
    def test_union_with_prec_upper(self):
        """'українського' followed by upper-case 'Ж' must yield 'українськогож'."""
        actual = run_through_module([
            TextPunctuation('українського', [None] * 12),
            TextPunctuation('Ж', [None]),
            End(),
        ])
        expected = [Text('українськогож'), End()]

        self.assertEqual(actual, expected)
Esempio n. 4
0
    def test_hypnen_at_the_end(self):
        """A word whose last sign is a hyphen must produce no text, only End."""
        signs = [None] * 3 + [constants.HYPHEN]
        actual = run_through_module([TextPunctuation('сло-', signs), End()])
        expected = [End()]

        self.assertEqual(actual, expected)
Esempio n. 5
0
    def test_punctuation_within(self):
        """A word with punctuation in the middle must produce no text, only End."""
        signs = [None] * 3 + [constants.PUNCT] + [None] * 2
        actual = run_through_module([TextPunctuation('сло!во', signs), End()])
        expected = [End()]

        self.assertEqual(actual, expected)
Esempio n. 6
0
    def test_non_alphabet(self):
        """The Latin-script word 'foreign' must produce no text, only End."""
        actual = run_through_module([
            TextPunctuation('foreign', [None] * 7),
            End(),
        ])
        expected = [End()]

        self.assertEqual(actual, expected)
Esempio n. 7
0
    def test_hyphen(self):
        """A hyphen inside a word must be removed: 'сло-во' becomes 'слово'."""
        signs = [None] * 3 + [constants.HYPHEN] + [None] * 2
        actual = run_through_module([TextPunctuation('сло-во', signs), End()])
        expected = [Text('слово'), End()]

        self.assertEqual(actual, expected)
Esempio n. 8
0
    def test_quotation(self):
        """Surrounding quotation marks must be stripped: '«слово»' becomes 'слово'."""
        signs = [constants.PUNCT] + [None] * 5 + [constants.PUNCT]
        actual = run_through_module([TextPunctuation('«слово»', signs), End()])
        expected = [Text('слово'), End()]

        self.assertEqual(actual, expected)
Esempio n. 9
0
    def test_punctuation_only(self):
        """A lone dash marked as punctuation must produce no text, only End."""
        dash_only = TextPunctuation('—', [constants.PUNCT])
        actual = run_through_module([dash_only, End()])
        expected = [End()]

        self.assertEqual(actual, expected)
Esempio n. 10
0
    def test_capital(self):
        """A capitalised word must come out lower-cased: 'Київ' becomes 'київ'."""
        capitalised = TextPunctuation('Київ', [None] * 4)
        actual = run_through_module([capitalised, End()])
        expected = [Text('київ'), End()]

        self.assertEqual(actual, expected)
Esempio n. 11
0
    def test_apostrophe_U02BC(self):
        """The U+02BC apostrophe inside 'вʼю' must be kept in the output text."""
        with_apostrophe = TextPunctuation('вʼю', [None] * 3)
        actual = run_through_module([with_apostrophe, End()])
        expected = [Text('вʼю'), End()]

        self.assertEqual(actual, expected)
Esempio n. 12
0
    def test_union_with_foll_last(self):
        """A single-letter 'з' directly before End must be emitted on its own."""
        single_letter = TextPunctuation('з', [None])
        actual = run_through_module([single_letter, End()])
        expected = [Text('з'), End()]

        self.assertEqual(actual, expected)
Esempio n. 13
0
    def clean(self, words: list) -> list:
        """Clean ``words[0]`` and merge one-letter zero-syllable words.

        ``words`` is a two-item window: ``words[0]`` is the word being
        cleaned and ``words[1]`` is the item that follows it (a
        ``TextPunctuation`` or something else, e.g. an end marker).

        The current word is dropped (``None`` is returned in its place) when:
            * it is longer than one character and entirely upper-case;
            * it contains a ``'.'`` and the following word is not title-cased
              (presumably an abbreviation -- confirm);
            * it starts with a punctuation character from ``dashes``;
            * its last character is marked ``constants.HYPHEN``;
            * punctuation (other than ``hyphen_dashes``) is followed by a
              non-punctuation character;
            * a lower-cased character is not in ``self.get_data().letters``.

        Otherwise leading characters from ``punctuation_to_erase`` and
        hyphen-marked characters are removed, the text is lower-cased, and
        trailing punctuation is cut off (in that case the word is returned
        immediately, without the merge step below).

        Merge step: a following one-letter zero-syllable word whose
        attachment is ``'to_preceding'`` is appended to the current word;
        otherwise a current one-letter zero-syllable word whose attachment is
        ``'to_following'`` is prepended to the following word (mutating it).

        Args:
            words: ``[current, following]``.

        Returns:
            ``[cleaned, following]`` where ``cleaned`` is a new
            ``TextPunctuation`` or ``None``, and ``following`` is ``None``
            when it was merged into ``cleaned``.
        """
        curr, foll = words[0], words[1]
        text = curr.get_text()
        foll_is_word = isinstance(foll, TextPunctuation)
        buffer_text, buffer_signs = [], []

        if len(text) != 1 and text.isupper():
            return [None, foll]

        signs = curr.get_punctuation()
        last_index = len(text) - 1

        for i, sym in enumerate(text):
            sign = signs[i]

            if sym == '.' and foll_is_word and not foll.get_text().istitle():
                return [None, foll]

            # Punctuation before any letter has been collected.
            if not buffer_text and sign == constants.PUNCT:
                if sym in punctuation_to_erase:
                    continue
                if sym in dashes:
                    return [None, foll]

            if sign == constants.HYPHEN:
                if i == last_index:
                    return [None, foll]
                # A hyphen inside the word is simply left out.
                continue

            if sign == constants.PUNCT and sym not in hyphen_dashes:
                # Only a purely punctuational tail may follow; anything else
                # means punctuation sits inside the word.
                if any(signs[j] != constants.PUNCT
                       for j in range(i + 1, len(text))):
                    return [None, foll]
                return [
                    TextPunctuation(''.join(buffer_text), buffer_signs), foll
                ]

            sym_low = sym.lower()
            if sym_low not in self.get_data().letters:
                return [None, foll]

            buffer_text.append(sym_low)
            buffer_signs.append(sign)

        curr = TextPunctuation(''.join(buffer_text), buffer_signs)

        if foll_is_word and len(foll.get_text()) == 1:
            foll_low = TextPunctuation(foll.get_text().lower(),
                                       foll.get_punctuation())
            if self.is_zero_syll(foll_low.get_text()):
                buffer_attach = self.get_attachment(foll_low)
                if buffer_attach['attachment'] == 'to_preceding':
                    buffer_text.append(foll_low.get_text())
                    # get_punctuation() is a list of signs: extend keeps the
                    # punctuation flat, one sign per character (append would
                    # nest the whole list as a single element).
                    buffer_signs.extend(foll_low.get_punctuation())
                    foll = None
        elif (foll_is_word and len(curr.get_text()) == 1
              and self.is_zero_syll(curr.get_text())):
            buffer_attach = self.get_attachment(curr)
            if buffer_attach['attachment'] == 'to_following':
                foll.set_text(curr.get_text() + foll.get_text())
                foll.set_punctuation(curr.get_punctuation() +
                                     foll.get_punctuation())
                return [None, foll]

        return [TextPunctuation(''.join(buffer_text), buffer_signs), foll]
Esempio n. 14
0
import constants
from config_data import ConfigData
from end import End
from pipe import *
from read_module import ReadModule
from word import TextPunctuation


# Text fixture to read and the codec used to open it
# ('utf-8-sig' is UTF-8 that skips a leading byte-order mark when decoding).
file_path = '../test_files/belarusian/test_belarusian.txt'
encoding = 'utf-8-sig'
# Configuration object built from the conf_be_cyr.json file.
data = ConfigData('../../../py_scripts/configs/conf_be_cyr.json')

# Output pipe given to the module; get_from_module() collects items from it.
# NOTE(review): `queue` and `threading` are not imported by name in this
# file -- presumably they come in through `from pipe import *`; confirm.
pipe_out = Pipe(queue.Queue(), threading.Condition())
module = ReadModule([pipe_out], file_path, encoding, data)

# Reference output for the fixture: one TextPunctuation per word, each paired
# with a per-character sign list (None, constants.PUNCT or constants.HYPHEN),
# terminated by End().  Presumably compared with what get_from_module()
# collects from pipe_out -- confirm against the rest of the script.
expected_result = [TextPunctuation('У', [None]), TextPunctuation('беларускай', [None, None, None, None, None, None, None, None, None, None]), TextPunctuation('мове', [None, None, None, None]), TextPunctuation('зычныя', [None, None, None, None, None, None]), TextPunctuation('могуць', [None, None, None, None, None, None]), TextPunctuation('адрознівацца', [None, None, None, None, None, None, None, None, None, None, None, None]), TextPunctuation('даўжынёй', [None, None, None, None, None, None, None, None]), TextPunctuation('гучання,', [None, None, None, None, None, None, None, constants.PUNCT]), TextPunctuation('якая', [None, None, None, None]), TextPunctuation('пака-звае', [None, None, None, None, constants.HYPHEN, None, None, None, None]), TextPunctuation('на', [None, None]), TextPunctuation('стык', [None, None, None, None]), TextPunctuation('марфем...', [None, None, None, None, None, None, constants.PUNCT, constants.PUNCT, constants.PUNCT]), TextPunctuation('Пераважная', [None, None, None, None, None, None, None, None, None, None]), TextPunctuation('‚колькасць‘', [constants.PUNCT, None, None, None, None, None, None, None, None, None, constants.PUNCT]), TextPunctuation('гукаў', [None, None, None, None, None]), TextPunctuation('утвараюцца', [None, None, None, None, None, None, None, None, None, None]), TextPunctuation('ў', [None]), TextPunctuation('цэнтры', [None, None, None, None, None, None]), TextPunctuation('ротавай', [None, None, None, None, None, None, None]), TextPunctuation('поласці', [None, None, None, None, None, None, None]), TextPunctuation('пры', [None, None, None]), TextPunctuation('высокім', [None, None, None, None, None, None, None]), TextPunctuation('агульным', [None, None, None, None, None, None, None, None]), TextPunctuation('пад’ёме', [None, None, None, None, None, None, None]), TextPunctuation('языка.', [None, None, None, None, None, constants.PUNCT]), TextPunctuation('Вялікае', [None, None, None, None, None, None, None]), 
TextPunctuation('Ducatus', [None, None, None, None, None, None, None]), TextPunctuation('Lithuaniae', [None, None, None, None, None, None, None, None, None, None]), TextPunctuation('знаходзілася', [None, None, None, None, None, None, None, None, None, None, None, None]), TextPunctuation('ў', [None]), TextPunctuation('дынастычнай', [None, None, None, None, None, None, None, None, None, None, None]), TextPunctuation('уніі', [None, None, None, None]), TextPunctuation('—', [constants.PUNCT]), TextPunctuation('з', [None]), TextPunctuation('Польскім', [None, None, None, None, None, None, None, None]), TextPunctuation('кара-леўствам!', [None, None, None, None, constants.HYPHEN, None, None, None, None, None, None, None, None, constants.PUNCT]), End()]


def get_from_module():

    module.run()
    result = []

    while True:
        pipe_out.acquire()
        if pipe_out.empty():
            pipe_out.wait()
        cleaned_word = pipe_out.get()
        result.append(cleaned_word)
        pipe_out.release()