def test_write(self):
        """
        Check that a markup written by Writer can be read back by Reader.

        The example markup is written to a temporary XML file and must compare
        equal after being read back. It is then written to a temporary RAW
        file, where the first item read back only has to be a Markup instance.
        Temporary files are removed and reader generators are closed even when
        an assertion or an I/O step fails, so a failing run leaves nothing
        behind in EXAMPLES_DIR.
        """
        markup = MARKUP_EXAMPLE

        temp_file = os.path.join(EXAMPLES_DIR, "temp.xml")
        try:
            Writer.write_markups(FileType.XML, [markup], temp_file)
            processed_xml = Reader.read_markups(temp_file,
                                                FileType.XML,
                                                is_processed=True)
            try:
                self.assertEqual(next(processed_xml), markup)
            finally:
                processed_xml.close()
        finally:
            # The file may not exist if writing failed before creating it.
            if os.path.exists(temp_file):
                os.remove(temp_file)

        temp_file = os.path.join(EXAMPLES_DIR, "temp.txt")
        try:
            Writer.write_markups(FileType.RAW, [markup], temp_file)
            processed_raw = Reader.read_markups(temp_file,
                                                FileType.RAW,
                                                is_processed=True)
            try:
                self.assertIsInstance(next(processed_raw), Markup)
            finally:
                processed_raw.close()
        finally:
            if os.path.exists(temp_file):
                os.remove(temp_file)
Exemple #2
0
    def generate_markups(self, input_path: str, input_type: FileType,
                         output_path: str, output_type: FileType) -> None:
        """
        Generate markups for texts and write them to a file.

        :param input_path: path to the folder/file with the text.
        :param input_type: type of the text files.
        :param output_path: path to the file for the resulting markups.
        :param output_type: type of the resulting file.
        """
        markups = Reader.read_markups(input_path, input_type, False,
                                      self.get_stress_predictor())
        writer = Writer(output_type, output_path)
        writer.open()
        try:
            for markup in markups:
                writer.write_markup(markup)
        finally:
            # Close the writer even if reading or writing a markup fails.
            writer.close()
Exemple #3
0
def _contains_foreign_letter(line):
    """Return True if the line has a basic Latin letter (a-z, A-Z) or 'Ј'."""
    return any("a" <= ch <= "z" or "A" <= ch <= "Z" or ch == "Ј" for ch in line)


def _clean_poem_text(text):
    """
    Drop empty lines from the text and strip the remaining ones.

    :param text: newline-separated text of a single block.
    :return: tuple (clean_text, skip); skip is True when any non-empty line
        contains a basic Latin letter or 'Ј'.
    """
    lines = [line for line in text.split("\n") if line != ""]
    skip = any(_contains_foreign_letter(line) for line in lines)
    clean_text = "".join(line.strip() + "\n" for line in lines)
    return clean_text, skip


def run(stress_model_path, g2p_model_path, grapheme_set, g2p_dict_path, aligner_dump_path, raw_stress_dict_path,
        stress_trie_path, zalyzniak_dict, ru_wiki_dict, cmu_dict, texts_path="/media/data/stihi_ru_clean"):
    """
    Build improved markups for the texts found under texts_path and dump them in RAW format.

    Every file returned by get_paths(texts_path, "") is scanned line by line.
    Lines between a line containing "<div" and a line containing "</div>" are
    collected as one text. A text containing basic Latin letters or 'Ј' is
    skipped; any other text is passed to Engine.get_improved_markup and the
    resulting markup is written to MARKUPS_DUMP_RAW_PATH.

    The model and dictionary arguments are forwarded unchanged to Engine.load.

    :param texts_path: root passed to get_paths to list the input files.
    """
    import sys

    raw_writer = Writer(FileType.RAW, MARKUPS_DUMP_RAW_PATH)
    raw_writer.open()
    try:
        processed = 0
        paths = get_paths(texts_path, "")
        engine = Engine(language="ru")
        engine.load(stress_model_path, g2p_model_path, grapheme_set, g2p_dict_path, aligner_dump_path,
                    raw_stress_dict_path, stress_trie_path, zalyzniak_dict, ru_wiki_dict, cmu_dict)
        for filename in paths:
            with open(filename, "r", encoding="utf-8") as file:
                collected = []
                is_text = False
                try:
                    for file_line in file:
                        if "<div" in file_line:
                            is_text = True
                        elif "</div>" in file_line:
                            is_text = False
                            clean_text, skip = _clean_poem_text("".join(collected))
                            if not skip:
                                print(clean_text.split("\n")[:2])
                                markup, _ = engine.get_improved_markup(clean_text)
                                raw_writer.write_markup(markup)
                            else:
                                print("Skipped")
                            processed += 1
                            print(processed)
                            collected = []
                        elif is_text:
                            collected.append(file_line.strip() + "\n")
                except Exception as e:
                    # Best-effort: a broken file must not stop the whole dump,
                    # but the failure is reported instead of silently dropped.
                    print("Failed to process {}: {!r}".format(filename, e), file=sys.stderr)
    finally:
        # Close the dump even if listing, loading or opening a file fails.
        raw_writer.close()
Exemple #4
0
    def handle(self, *args, **options):
        """
        Export markups for a slice of poems to XML and/or RAW files.

        Options read from ``options``:

        - ``from`` / ``to``: slice bounds over ``Poem.objects.all()``; ``from``
          defaults to 0 and ``to`` to the number of poems when not given.
        - ``xml`` / ``raw``: output paths relative to BASE_DIR; a writer is
          opened only for the paths that are given.
        - ``db``: when truthy, automatically generated markups are also saved
          as ModelMarkup rows.
        - ``author``: when it contains "Automatic", markups are generated from
          the poem text; otherwise the poem's existing markup by that author
          is exported.

        Existing ModelMarkup rows of the markup version named after ``author``
        are deleted before the export starts.
        """
        engine = Engine(language="ru")
        begin = int(options.get('from') or 0)
        to_option = options.get('to')
        # count() asks the database for the number of rows instead of loading
        # every poem just to measure the queryset.
        end = int(to_option) if to_option is not None else Poem.objects.count()
        poems = Poem.objects.all()[begin:end]

        xml_path = str(
            options.get('xml')) if options.get('xml') is not None else None
        raw_path = str(
            options.get('raw')) if options.get('raw') is not None else None

        db = options.get('db')
        author = options.get("author")
        markup_version = MarkupVersion.objects.get_or_create(name=author)[0]
        ModelMarkup.objects.filter(markup_version=markup_version).delete()

        xml_writer = None
        raw_writer = None
        opened_writers = []
        try:
            if xml_path is not None:
                xml_path = os.path.join(BASE_DIR, xml_path)
                xml_writer = Writer(FileType.XML, xml_path)
                xml_writer.open()
                opened_writers.append(xml_writer)
            if raw_path is not None:
                raw_path = os.path.join(BASE_DIR, raw_path)
                raw_writer = Writer(FileType.RAW, raw_path)
                raw_writer.open()
                opened_writers.append(raw_writer)

            stress_predictor = engine.get_stress_predictor(
                stress_model_path=STRESS_MODEL,
                zalyzniak_dict=ZALYZNYAK_DICT,
                stress_trie_path=TRIE_PATH,
                raw_stress_dict_path=RAW_DICT_PATH)
            # The author does not change between poems, so decide the mode once.
            is_automatic = "Automatic" in author
            for i, p in enumerate(poems, start=1):
                if is_automatic:
                    markup = Markup.process_text(p.text, stress_predictor)
                    markup, result = MetreClassifier.improve_markup(markup)
                    if xml_writer is not None:
                        xml_writer.write_markup(markup)
                    if raw_writer is not None:
                        raw_writer.write_markup(markup)
                    if db:
                        ModelMarkup.objects.create(poem=p,
                                                   text=markup.to_json(),
                                                   author="Automatic2",
                                                   additional=result.to_json(),
                                                   markup_version=markup_version)
                else:
                    markup = p.markups.filter(author=author)[0]
                    if xml_writer is not None:
                        xml_writer.write_markup(markup.get_markup())
                    if raw_writer is not None:
                        raw_writer.write_markup(markup.get_markup())
                print(i)
        finally:
            # Close only the writers that were actually opened, in reverse
            # order of opening (raw first, then xml), even when a poem fails.
            for writer in reversed(opened_writers):
                writer.close()