def __init__(self, dump_filename: str, markup_path: str = None, from_voc: bool = False) -> None:
    """
    Load the vocabulary from its dump, or build it from markups and save it.

    :param dump_filename: file the vocabulary is saved to.
    :param markup_path: file/folder with markups.
    :param from_voc: when true, markup_path is read with Reader.read_vocabulary
        instead of being read as processed XML markups.
    """
    self.dump_filename = dump_filename
    self.word_to_index = {}  # type: Dict[StressedWord, int]
    self.index_to_word = {}  # type: Dict[int, StressedWord]

    # An existing dump always wins over rebuilding from markup_path.
    if os.path.isfile(self.dump_filename):
        self.load()
        return

    # No dump and no source: leave the vocabulary empty.
    if markup_path is None:
        return

    if from_voc:
        for word, index in Reader.read_vocabulary(markup_path):
            self.add_word(word.to_stressed_word(), index)
    else:
        for markup in Reader.read_markups(markup_path, FileType.XML, is_processed=True):
            self.add_markup(markup)

    # Persist the freshly built vocabulary.
    self.save()
def parse(self, markup_path: str, from_voc: bool = False):
    """
    Fill the vocabulary from a vocabulary file or from processed XML markups.

    :param markup_path: path handed to the reader.
    :param from_voc: when true, markup_path is read with Reader.read_vocabulary
        and each (word, index) pair is added; otherwise it is read as
        processed XML markups and each markup is added.
    """
    if not from_voc:
        for markup in Reader.read_markups(markup_path, FileType.XML, is_processed=True):
            self.add_markup(markup)
        return

    for word, index in Reader.read_vocabulary(markup_path):
        self.add_word(word.to_stressed_word(), index)
def __init__(self, dump_filename: str, vocabulary: StressVocabulary, markup_dump_path: str = None,
             n_poems: int = None, n_grams: int = 2):
    """
    Load the model from its dump, or build it from markups and save it.

    :param dump_filename: file the model dump is loaded from / saved to.
    :param vocabulary: vocabulary stored on the instance.
    :param markup_dump_path: path to processed XML markups used when no dump exists.
    :param n_poems: stop after this many markups have been added (None means no limit).
    :param n_grams: stored as self.n_grams.
    """
    self.n_grams = n_grams
    self.transitions = defaultdict(Counter)  # type: Dict[Tuple, Counter]
    self.vocabulary = vocabulary
    self.dump_filename = dump_filename

    # The model is dumped to speed up subsequent loading.
    if os.path.isfile(self.dump_filename):
        self.load()
        return

    markups = Reader.read_markups(markup_dump_path, FileType.XML, is_processed=True)
    for count, markup in enumerate(markups, start=1):
        self.add_markup(markup)
        if n_poems is not None and count == n_poems:
            break
        # Progress report every 500 markups.
        if count % 500 == 0:
            print(count)
    self.save()
def test_read(self):
    """Check the first markup read from each example source."""
    # Already processed XML markup.
    xml_markups = Reader.read_markups(MARKUP_XML_EXAMPLE, FileType.XML, is_processed=True)
    first_from_xml = next(xml_markups)
    self.__assert_markup_is_correct(first_from_xml)

    # Raw XML text: the markup is produced with the stress predictor.
    text_markups = Reader.read_markups(
        TEXT_XML_EXAMPLE,
        FileType.XML,
        is_processed=False,
        stress_predictor=self.stress_predictor,
    )
    first_from_text = next(text_markups)
    self.__assert_markup_is_correct(first_from_text)

    # Already processed JSON markup.
    json_markups = Reader.read_markups(MARKUP_JSON_EXAMPLE, FileType.JSON, is_processed=True)
    first_from_json = next(json_markups)
    self.__assert_markup_is_correct(first_from_json)
def test_write(self):
    """
    Write the example markup to temporary XML and RAW files and read it back.

    The reader is closed and the temporary file removed even when an
    assertion fails, so a failing run leaves nothing behind in EXAMPLES_DIR.
    """
    markup = MARKUP_EXAMPLE

    # XML round trip: the markup read back must equal the one written.
    temp_file = os.path.join(EXAMPLES_DIR, "temp.xml")
    try:
        Writer.write_markups(FileType.XML, [markup], temp_file)
        processed_xml = Reader.read_markups(temp_file, FileType.XML, is_processed=True)
        try:
            self.assertEqual(next(processed_xml), markup)
        finally:
            # Close the reader before the file is removed.
            processed_xml.close()
    finally:
        if os.path.exists(temp_file):
            os.remove(temp_file)

    # RAW round trip: only the type of the result is checked.
    temp_file = os.path.join(EXAMPLES_DIR, "temp.txt")
    try:
        Writer.write_markups(FileType.RAW, [markup], temp_file)
        processed_raw = Reader.read_markups(temp_file, FileType.RAW, is_processed=True)
        try:
            self.assertIsInstance((next(processed_raw)), Markup)
        finally:
            processed_raw.close()
    finally:
        if os.path.exists(temp_file):
            os.remove(temp_file)
def generate_markups(self, input_path: str, input_type: FileType,
                     output_path: str, output_type: FileType) -> None:
    """
    Generate markups from texts.

    :param input_path: path to the folder/file with the text.
    :param input_type: type of the text files.
    :param output_path: path to the file with the resulting markups.
    :param output_type: type of the resulting file.
    """
    markups = Reader.read_markups(input_path, input_type, False, self.get_stress_predictor())
    writer = Writer(output_type, output_path)
    writer.open()
    try:
        for markup in markups:
            writer.write_markup(markup)
    finally:
        # Close the writer even if reading or writing a markup fails.
        writer.close()