def process_text(text: str, stress_predictor) -> 'Markup':
    """
    Build the initial syllable-and-stress markup for a text.

    The text is split on newlines. For every line the WORD tokens produced by
    ``Tokenizer.tokenize`` are turned into ``Word`` objects whose positions
    are shifted by the offset of the line inside ``text``.

    :param text: text to mark up.
    :param stress_predictor: object whose ``predict(word_text)`` result is
        handed to ``Word.set_stresses``.
    :return: markup built from ``text`` and the collected lines.
    """
    from rupo.g2p.graphemes import Graphemes

    markup_lines = []
    # Offset of the current line's first character inside ``text``.
    offset = 0
    for raw_line in text.split("\n"):
        word_tokens = list(filter(
            lambda candidate: candidate.token_type == Token.TokenType.WORD,
            Tokenizer.tokenize(raw_line)))
        line_words = []
        for token in word_tokens:
            word_begin = offset + token.begin
            word_end = offset + token.end
            word = Word(word_begin, word_end, token.text,
                        Graphemes.get_syllables(token.text))
            # Stresses are predicted for every word ...
            predicted = stress_predictor.predict(token.text)
            # ... but only attached when the word has more than one syllable.
            if len(word.syllables) > 1:
                word.set_stresses(predicted)
            line_words.append(word)
        line_end = offset + len(raw_line)
        markup_lines.append(Line(offset, line_end, raw_line, line_words))
        # Skip the newline character that separated this line from the next.
        offset = line_end + 1
    return Markup(text, markup_lines)
def inflate_vocab(self, top_n=None) -> None:
    """
    Build and save a vocabulary of stressed words from this object's word forms.

    Every processed word form is syllabified, given the stresses returned by
    a ``CombinedStressPredictor`` and added to a ``Vocabulary`` created from
    ``GENERATOR_VOCAB_PATH`` under its index in the processed sequence.

    :param top_n: how many leading word forms to take; ``None`` takes all.
    """
    vocab = Vocabulary(GENERATOR_VOCAB_PATH)
    stress_predictor = CombinedStressPredictor()
    forms = self.word_forms
    if top_n is not None:
        forms = forms[:top_n]
    # tqdm wraps the sequence itself rather than ``enumerate(...)``: an
    # enumerate object has no length, so the progress bar would otherwise
    # be unable to show a total or an ETA.
    for index, word_form in enumerate(tqdm(forms, desc="Accenting words")):
        text = word_form.text
        stresses = stress_predictor.predict(text)
        # The word is not tied to any text here, hence the -1 positions.
        word = Word(-1, -1, text, Graphemes.get_syllables(text))
        word.set_stresses(stresses)
        vocab.add_word(word, index)
    vocab.save()
def from_raw(self, text: str) -> 'Markup':
    """
    Import markup from raw text in which every word ends with its stress.

    Each non-empty line is split on spaces. The trailing run of digits and
    ``-`` characters of a token is parsed as an integer stress, and the rest
    of the token is the word. ``self.text`` is rebuilt from the stripped
    words (words joined by spaces, lines joined by newlines), and all
    positions refer to that rebuilt text.

    Empty tokens (produced by repeated, leading or trailing spaces) and lines
    that contain no tokens are skipped.

    :param text: raw text with a stress suffix on every word.
    :return: this object, with ``text`` and ``lines`` replaced.
    :raises ValueError: if a token has no stress suffix, has no word part
        in front of the suffix, or the suffix is not a valid integer.
    """
    pos = 0
    lines = []
    for raw_line in text.split("\n"):
        if raw_line == "":
            continue
        line_tokens = []
        for raw_token in raw_line.split(" "):
            if not raw_token:
                # Repeated/leading/trailing spaces yield empty tokens.
                continue
            # Walk back over the stress suffix without running off the
            # start of the token.
            cut = len(raw_token)
            while cut > 0 and (raw_token[cut - 1].isdigit() or raw_token[cut - 1] == "-"):
                cut -= 1
            token, mark = raw_token[:cut], raw_token[cut:]
            if not mark:
                raise ValueError("no stress suffix in token {!r}".format(raw_token))
            if not token:
                raise ValueError("no word before stress suffix in token {!r}".format(raw_token))
            line_tokens.append((token, int(mark)))
        if not line_tokens:
            # A whitespace-only line carries no words; treat it like an empty line.
            continue
        from rupo.g2p.graphemes import Graphemes
        words = []
        line_begin = pos
        for token, stress in line_tokens:
            syllables = Graphemes.get_syllables(token)
            # Syllable borders are shifted by the word's position in the rebuilt text.
            for syllable in syllables:
                syllable.begin += pos
                syllable.end += pos
            word = Word(pos, pos + len(token), token, syllables)
            word.set_stresses([stress])
            words.append(word)
            # +1 accounts for the separator (space or newline) after the word.
            pos += len(token) + 1
        # NOTE(review): ``pos`` already includes the separator after the last
        # word, so the line end is one past the length of the joined text —
        # kept as in the original; confirm consumers expect this.
        lines.append(Line(line_begin, pos, " ".join(token for token, _ in line_tokens), words))
    self.text = "\n".join(line.text for line in lines)
    self.lines = lines
    return self
def inflate_vocab(self, dump_path, top_n=None) -> None:
    """
    Build and save a vocabulary of stressed words from this object's word forms.

    Every processed word form is syllabified, given the stresses returned by
    a ``CombinedStressPredictor`` and added to a ``Vocabulary`` created from
    ``dump_path`` under its index in the processed sequence.

    :param dump_path: path passed to ``Vocabulary``, where the result is saved.
    :param top_n: how many leading word forms to take; ``None`` takes all.
    """
    from rupo.main.vocabulary import Vocabulary
    from rupo.stress.predictor import CombinedStressPredictor

    vocab = Vocabulary(dump_path)
    stress_predictor = CombinedStressPredictor()
    forms = self.word_forms
    if top_n is not None:
        forms = forms[:top_n]
    # tqdm wraps the sequence itself rather than ``enumerate(...)``: an
    # enumerate object has no length, so the progress bar would otherwise
    # be unable to show a total or an ETA.
    for index, word_form in enumerate(tqdm(forms, desc="Accenting words")):
        text = word_form.text
        stresses = stress_predictor.predict(text)
        # The word is not tied to any text here, hence the -1 positions.
        word = Word(-1, -1, text, Graphemes.get_syllables(text))
        word.set_stresses(stresses)
        vocab.add_word(word, index)
    vocab.save()
def count_syllables(word: str) -> int:
    """
    Count the syllables of a word.

    :param word: the word.
    :return: number of syllables ``Graphemes.get_syllables`` finds in it.
    """
    syllables = Graphemes.get_syllables(word)
    return len(syllables)
def get_word_syllables(word: str) -> List[str]:
    """
    Split a word into the texts of its syllables.

    :param word: the word.
    :return: ``text`` of every syllable returned by ``Graphemes.get_syllables``,
        in the same order.
    """
    syllable_texts = []
    for syllable in Graphemes.get_syllables(word):
        syllable_texts.append(syllable.text)
    return syllable_texts
def test_syllables(self):
    """
    Check ``Graphemes.get_syllables`` against a table of expected syllables.

    Each key is an input word; each value is the exact list of ``Syllable``
    objects the splitter is expected to return for it.
    """
    expected_by_word = {
        'я': [Syllable(0, 1, 0, 'я')],
        'в': [],
        'лдж': [],
        'кронв': [Syllable(0, 5, 0, 'кронв')],
        'он': [Syllable(0, 2, 0, 'он')],
        'когда': [Syllable(0, 2, 0, 'ко'), Syllable(2, 5, 1, 'гда')],
        'майка': [Syllable(0, 3, 0, 'май'), Syllable(3, 5, 1, 'ка')],
        'сонька': [Syllable(0, 4, 0, 'сонь'), Syllable(4, 6, 1, 'ка')],
        'соломка': [
            Syllable(0, 2, 0, 'со'),
            Syllable(2, 5, 1, 'лом'),
            Syllable(5, 7, 2, 'ка'),
        ],
        'изжить': [Syllable(0, 1, 0, 'и'), Syllable(1, 6, 1, 'зжить')],
        'виться': [Syllable(0, 2, 0, 'ви'), Syllable(2, 6, 1, 'ться')],
        'данный': [Syllable(0, 2, 0, 'да'), Syllable(2, 6, 1, 'нный')],
        'марка': [Syllable(0, 3, 0, 'мар'), Syllable(3, 5, 1, 'ка')],
        'зорька': [Syllable(0, 4, 0, 'зорь'), Syllable(4, 6, 1, 'ка')],
        'банка': [Syllable(0, 3, 0, 'бан'), Syllable(3, 5, 1, 'ка')],
        'банька': [Syllable(0, 4, 0, 'бань'), Syllable(4, 6, 1, 'ка')],
        'лайка': [Syllable(0, 3, 0, 'лай'), Syllable(3, 5, 1, 'ка')],
        'оттечь': [Syllable(0, 1, 0, 'о'), Syllable(1, 6, 1, 'ттечь')],
        'дяденька': [
            Syllable(0, 2, 0, 'дя'),
            Syllable(2, 6, 1, 'день'),
            Syllable(6, 8, 2, 'ка'),
        ],
        'подъезд': [Syllable(0, 2, 0, 'по'), Syllable(2, 7, 1, 'дъезд')],
        'морские': [
            Syllable(0, 3, 0, 'мор'),
            Syllable(3, 6, 1, 'ски'),
            Syllable(6, 7, 2, 'е'),
        ],
        'мерзкие': [
            Syllable(0, 3, 0, 'мер'),
            Syllable(3, 6, 1, 'зки'),
            Syllable(6, 7, 2, 'е'),
        ],
        'полный': [Syllable(0, 2, 0, 'по'), Syllable(2, 6, 1, 'лный')],
        'зародыш': [
            Syllable(0, 2, 0, 'за'),
            Syllable(2, 4, 1, 'ро'),
            Syllable(4, 7, 2, 'дыш'),
        ],
        'война': [Syllable(0, 3, 0, 'вой'), Syllable(3, 5, 1, 'на')],
        'когда-нибудь': [
            Syllable(0, 2, 0, 'ко'),
            Syllable(2, 5, 1, 'гда'),
            Syllable(6, 8, 2, 'ни'),
            Syllable(8, 12, 3, 'будь'),
        ],
    }
    for word in expected_by_word:
        actual = Graphemes.get_syllables(word)
        self.assertEqual(actual, expected_by_word[word])
def __init__(self, text: str, stresses: Set[Stress]) -> None:
    """
    Store the word text and its stresses, then split the text into syllables.

    :param text: text of the word.
    :param stresses: stresses of the word.
    """
    self.stresses = stresses
    self.text = text
    syllables = Graphemes.get_syllables(text)
    self.syllables = syllables
    # Runs last: all three attributes above are already set at this point.
    self.__accent_syllables()