def test_write(self):
    """Check that a markup written by Writer can be read back by Reader.

    The XML round trip must reproduce the original markup exactly; the RAW
    round trip is only required to yield a ``Markup`` instance.
    """
    markup = MARKUP_EXAMPLE

    def round_trip(file_name, file_type):
        """Write ``markup`` to a temp file, return the first markup read back."""
        temp_file = os.path.join(EXAMPLES_DIR, file_name)
        try:
            Writer.write_markups(file_type, [markup], temp_file)
            reader = Reader.read_markups(temp_file, file_type, is_processed=True)
            try:
                return next(reader)
            finally:
                # Close the reader before the file is deleted.
                reader.close()
        finally:
            # Clean up even when writing or reading fails, so a broken run
            # does not leave a stale temp file in the examples directory.
            if os.path.exists(temp_file):
                os.remove(temp_file)

    self.assertEqual(round_trip("temp.xml", FileType.XML), markup)
    self.assertIsInstance(round_trip("temp.txt", FileType.RAW), Markup)
def generate_markups(self, input_path: str, input_type: FileType, output_path: str, output_type: FileType) -> None:
    """
    Generate markups for texts and write them to a file.

    :param input_path: path to the folder/file with the text.
    :param input_type: type of the text files.
    :param output_path: path to the file for the resulting markups.
    :param output_type: type of the resulting file.
    """
    # NOTE(review): the third positional argument is presumably
    # ``is_processed`` (raw texts that still need markup) - confirm against
    # Reader.read_markups.
    markups = Reader.read_markups(input_path, input_type, False, self.get_stress_predictor())
    writer = Writer(output_type, output_path)
    writer.open()
    try:
        for markup in markups:
            writer.write_markup(markup)
    finally:
        # Always release the output file, even if reading or writing fails.
        writer.close()
def _has_foreign_letters(line):
    """Return True if ``line`` contains a basic Latin letter or the character "Ј"."""
    # Bounds are inclusive so that "a", "z", "A" and "Z" themselves count.
    return any("a" <= ch <= "z" or "A" <= ch <= "Z" or ch == "Ј" for ch in line)


def run(stress_model_path, g2p_model_path, grapheme_set, g2p_dict_path, aligner_dump_path, raw_stress_dict_path,
        stress_trie_path, zalyzniak_dict, ru_wiki_dict, cmu_dict, corpus_path="/media/data/stihi_ru_clean"):
    """Build improved markups for every text in a corpus and dump them in RAW format.

    Each corpus file is scanned line by line. Lines between a line containing
    ``<div`` and the next line containing ``</div>`` form one text. Texts that
    contain Latin letters (or "Ј") are skipped; all others are passed to
    ``Engine.get_improved_markup`` and the resulting markup is written to
    ``MARKUPS_DUMP_RAW_PATH``. Progress is printed to stdout.

    All positional parameters are forwarded unchanged to ``Engine.load``.

    :param corpus_path: root path handed to ``get_paths`` to list corpus files.
    """
    import logging

    logger = logging.getLogger(__name__)

    raw_writer = Writer(FileType.RAW, MARKUPS_DUMP_RAW_PATH)
    raw_writer.open()
    try:
        processed = 0
        paths = get_paths(corpus_path, "")
        engine = Engine(language="ru")
        engine.load(stress_model_path, g2p_model_path, grapheme_set, g2p_dict_path, aligner_dump_path,
                    raw_stress_dict_path, stress_trie_path, zalyzniak_dict, ru_wiki_dict, cmu_dict)
        for filename in paths:
            with open(filename, "r", encoding="utf-8") as file:
                text_lines = []
                is_text = False
                try:
                    for file_line in file:
                        if "<div" in file_line:
                            is_text = True
                        elif "</div>" in file_line:
                            is_text = False
                            poem_lines = [line for line in text_lines if line]
                            text_lines = []
                            if any(_has_foreign_letters(line) for line in poem_lines):
                                print("Skipped")
                            else:
                                clean_text = "".join(line + "\n" for line in poem_lines)
                                print(clean_text.split("\n")[:2])
                                markup, _ = engine.get_improved_markup(clean_text)
                                raw_writer.write_markup(markup)
                            processed += 1
                            print(processed)
                        elif is_text:
                            text_lines.append(file_line.strip())
                except Exception:
                    # Best effort: a broken file (bad encoding, markup failure)
                    # must not stop the whole dump, but the failure is recorded
                    # instead of being silently discarded.
                    logger.exception("Failed to process %s; skipping the rest of the file", filename)
    finally:
        # Close the dump even if listing files or loading the engine fails.
        raw_writer.close()
def handle(self, *args, **options):
    """Export (and optionally regenerate) markups for a range of poems.

    Options read from ``options``:

    * ``from`` / ``to`` - slice bounds over ``Poem.objects.all()``; ``to``
      defaults to the total number of poems.
    * ``xml`` / ``raw`` - output paths relative to ``BASE_DIR``; a writer is
      opened only for the paths that are given.
    * ``db`` - when truthy, automatically generated markups are also saved
      as ``ModelMarkup`` rows.
    * ``author`` - markup version name. If it contains ``"Automatic"`` the
      markup is generated from the poem text, otherwise the poem's stored
      markup by that author is exported.
    """
    engine = Engine(language="ru")

    begin = int(options.get('from'))
    to_option = options.get('to')
    # count() asks the database for the number of rows instead of loading
    # every poem into memory just to measure the length.
    end = int(to_option) if to_option is not None else Poem.objects.count()
    poems = Poem.objects.all()[begin:end]

    xml_option = options.get('xml')
    xml_path = str(xml_option) if xml_option is not None else None
    raw_option = options.get('raw')
    raw_path = str(raw_option) if raw_option is not None else None
    db = options.get('db')
    author = options.get("author")

    markup_version = MarkupVersion.objects.get_or_create(name=author)[0]
    # NOTE(review): markups of this version are deleted even when the 'db'
    # option is falsy or the author is not automatic - confirm this is intended.
    ModelMarkup.objects.filter(markup_version=markup_version).delete()

    xml_writer = None
    raw_writer = None
    if xml_path is not None:
        xml_path = os.path.join(BASE_DIR, xml_path)
        xml_writer = Writer(FileType.XML, xml_path)
        xml_writer.open()
    if raw_path is not None:
        raw_path = os.path.join(BASE_DIR, raw_path)
        raw_writer = Writer(FileType.RAW, raw_path)
        raw_writer.open()

    try:
        stress_predictor = engine.get_stress_predictor(
            stress_model_path=STRESS_MODEL,
            zalyzniak_dict=ZALYZNYAK_DICT,
            stress_trie_path=TRIE_PATH,
            raw_stress_dict_path=RAW_DICT_PATH)
        for i, p in enumerate(poems, start=1):
            if "Automatic" in author:
                markup = Markup.process_text(p.text, stress_predictor)
                markup, result = MetreClassifier.improve_markup(markup)
                if xml_writer is not None:
                    xml_writer.write_markup(markup)
                if raw_writer is not None:
                    raw_writer.write_markup(markup)
                if db:
                    ModelMarkup.objects.create(poem=p, text=markup.to_json(), author="Automatic2",
                                               additional=result.to_json(), markup_version=markup_version)
            else:
                stored = p.markups.filter(author=author)[0]
                if xml_writer is not None or raw_writer is not None:
                    # Decode the stored markup once and reuse it for both writers.
                    markup = stored.get_markup()
                    if xml_writer is not None:
                        xml_writer.write_markup(markup)
                    if raw_writer is not None:
                        raw_writer.write_markup(markup)
            print(i)
    finally:
        # Release output files even if processing a poem fails midway.
        if raw_writer is not None:
            raw_writer.close()
        if xml_writer is not None:
            xml_writer.close()