Beispiel #1
0
 def test_from_to(self):
     """Check that MARKUP_EXAMPLE is reproduced after XML and JSON round trips."""
     # Serialize the example to XML and load it back into a fresh markup.
     restored_from_xml = Markup().from_xml(MARKUP_EXAMPLE.to_xml())
     self.assertEqual(MARKUP_EXAMPLE, restored_from_xml)

     # Same round trip through the JSON representation.
     restored_from_json = Markup().from_json(MARKUP_EXAMPLE.to_json())
     self.assertEqual(MARKUP_EXAMPLE, restored_from_json)
Beispiel #2
0
    def read_vocabulary(path: str):
        """
        Read a vocabulary.

        Every line of every vocabulary file is split on tabs: the first field
        is parsed with ``Markup().from_raw`` and the second field is converted
        to ``int``.

        :param path: path to the vocabulary.
        :return: yields pairs of a word and its index.
        """
        for vocab_path in Reader.get_paths(path, FileType.VOCAB.value):
            with open(vocab_path, "r", encoding="utf-8") as vocab_file:
                for record in vocab_file:
                    columns = record.strip().split('\t')
                    # The word is the first word of the first line of the parsed markup.
                    word = Markup().from_raw(columns[0]).lines[0].words[0]
                    yield word, int(columns[1])
Beispiel #3
0
    def read_markups(path: str, source_type: FileType, is_processed: bool,
                     stress_predictor: StressPredictor=None) -> Iterator[Markup]:
        """
        Read markups (including markups built from raw texts).

        For already processed sources the markups are loaded according to the
        file type: XML files are iterated over their 'markup' elements, JSON
        files over the entries of the top-level 'items' list, and RAW files are
        split into chunks after every third blank line. For unprocessed sources
        the texts are read with ``Reader.read_texts`` and marked up using the
        given stress predictor.

        :param path: path to a file or a folder.
        :param source_type: type of the files.
        :param is_processed: whether the texts are already marked up.
        :param stress_predictor: stress classifier (required for texts that are not marked up).
        :return: iterator over the markups that were read.
        """
        paths = Reader.get_paths(path, source_type.value)
        for filename in paths:
            with open(filename, "r", encoding="utf-8") as file:
                if is_processed:
                    if source_type == FileType.XML:
                        for elem in Reader.__xml_iter(file, 'markup'):
                            yield Markup().from_xml(etree.tostring(elem, encoding='utf-8', method='xml'))
                    elif source_type == FileType.JSON:
                        j = json.load(file)
                        for item in j['items']:
                            yield Markup().from_dict(item)
                    elif source_type == FileType.RAW:
                        separator_count = 0
                        chunk_lines = []
                        for line in file:
                            if line == "\n":
                                separator_count += 1
                            else:
                                chunk_lines.append(line)
                            if separator_count == 3:
                                separator_count = 0
                                yield Markup().from_raw("".join(chunk_lines))
                                # Start a fresh buffer: without this every later markup
                                # would also contain the text of all previous ones, and
                                # the trailing check below would emit it once more.
                                chunk_lines = []
                        # Emit whatever is left after the last separator group.
                        if chunk_lines:
                            yield Markup().from_raw("".join(chunk_lines))
                else:
                    # Unprocessed texts cannot be marked up without a predictor.
                    assert stress_predictor is not None
                    for text in Reader.read_texts(filename, source_type):
                        yield Reader.__markup_text(text, stress_predictor)
Beispiel #4
0
from rupo.main.markup import Markup, Line, Word, Syllable

# Hand-written example markup of a two-line text, built from nested
# Markup -> Line -> Word -> Syllable objects.
# NOTE(review): the leading integers of Line/Word look like begin/end character
# offsets into the full text, and those of Syllable like begin/end offsets inside
# the word followed by the syllable number; the optional last Syllable argument
# is presumably a stress position - confirm against the class definitions.
MARKUP_EXAMPLE = Markup("Соломка король себя.\n Пора виться майкой в.", [
            # First line of the text.
            Line(0, 20, "Соломка король себя.", [
                Word(0, 7, "Соломка",
                     [Syllable(0, 2, 0, "Со"),
                      Syllable(2, 5, 1, "лом", 3),
                      Syllable(5, 7, 2, "ка")]),
                Word(8, 14, "король",
                     [Syllable(0, 2, 0, "ко"),
                      Syllable(2, 6, 1, "роль", 3)]),
                Word(15, 19, "себя",
                     [Syllable(0, 2, 0, "се"),
                      Syllable(2, 4, 1, "бя", 3)])]),
            # Second line of the text; its string keeps the leading space.
            Line(21, 43, " Пора виться майкой в.",[
                Word(22, 26, "Пора",
                     [Syllable(0, 2, 0, "По", 1),
                      Syllable(2, 4, 1, "ра", 3)]),
                Word(27, 33, "виться",
                     [Syllable(0, 2, 0, "ви", 1),
                      Syllable(2, 6, 1, "ться")]),
                Word(34, 40, "майкой",
                     [Syllable(0, 3, 0, "май", 1),
                      Syllable(3, 6, 1, "кой")]),
                # A word with an empty syllable list.
                Word(41, 42, "в", [])
                ])])