def g2p_en():
    """Build and train the English G2P model.

    Creates an ``RNNG2PModel`` from ``EN_G2P_DICT_PATH`` with an LSTM cell,
    256 units in the first layer and a dropout of 0.5, builds it, and then
    calls ``train`` with ``G2P_CURRENT_MODEL_DIR`` and checkpoints enabled.
    Nothing is returned.
    """
    # The second positional argument (40) is passed through unchanged; its
    # meaning is defined by RNNG2PModel's constructor.
    hyperparameters = {
        "language": "en",
        "rnn": LSTM,
        "units1": 256,
        "dropout": 0.5,
    }
    model = RNNG2PModel(EN_G2P_DICT_PATH, 40, **hyperparameters)
    model.build()
    model.train(G2P_CURRENT_MODEL_DIR, enable_checkpoints=True)
def get_g2p_model(self, language="ru", model_path=None):
    """Return the cached G2P model for ``language``, loading it on first use.

    Args:
        language: Language code; only ``"ru"`` and ``"en"`` are supported.
        model_path: Optional path of the model to load. When ``None`` the
            language's default model path is used. It is only consulted the
            first time a language is requested; later calls return the
            cached model.

    Returns:
        The loaded ``RNNG2PModel`` for ``language``, or ``None`` when the
        language is not supported.
    """
    cached = self.g2p_models.get(language)
    if cached is not None:
        return cached

    if language == "ru":
        default_path = RU_G2P_DEFAULT_MODEL
    elif language == "en":
        default_path = EN_G2P_DEFAULT_MODEL
    else:
        # Unsupported language: report it without leaving anything in the
        # cache, so a later call cannot hand out a model that was never loaded.
        return None

    # Honour an explicitly supplied path; fall back to the default otherwise.
    path = default_path if model_path is None else model_path

    model = RNNG2PModel(language=language)
    model.load(path)
    # Cache only after a successful load so a failed load is retried next time.
    self.g2p_models[language] = model
    return model
def g2p_ru():
    """Build and train the Russian G2P model.

    Creates an ``RNNG2PModel`` from ``RU_G2P_DICT_PATH`` with an LSTM cell,
    two layers of 512 units, a dropout of 0.4, a batch size of 128 and an
    embedding dimension of 50, builds it, and then calls ``train`` with
    ``G2P_CURRENT_MODEL_DIR`` and checkpoints enabled. Nothing is returned.
    """
    # The second positional argument (30) is passed through unchanged; its
    # meaning is defined by RNNG2PModel's constructor.
    hyperparameters = {
        "language": "ru",
        "rnn": LSTM,
        "units1": 512,
        "units2": 512,
        "dropout": 0.4,
        "batch_size": 128,
        "emb_dimension": 50,
    }
    model = RNNG2PModel(RU_G2P_DICT_PATH, 30, **hyperparameters)
    model.build()
    model.train(G2P_CURRENT_MODEL_DIR, enable_checkpoints=True)
def __init__(self, language: str = "ru", g2p_model_path: str = None):
    """Set up the object with a loaded G2P model for ``language``.

    Args:
        language: ``"ru"`` or ``"en"``.
        g2p_model_path: Path of the G2P model to load; may be ``None``.

    Raises:
        RuntimeError: If ``language`` is neither ``"ru"`` nor ``"en"``, or
            if ``self.g2p_model_path`` does not exist on disk after the
            language defaults have been applied.
    """
    self.language = language
    self.g2p_model_path = g2p_model_path

    # Select the per-language default first, then apply it in one place.
    if language == "ru":
        default_g2p_model = RU_G2P_DEFAULT_MODEL
    elif language == "en":
        default_g2p_model = EN_G2P_DEFAULT_MODEL
    else:
        raise RuntimeError("Wrong language")
    # NOTE(review): presumably fills in self.g2p_model_path when it was not
    # given - confirm against __init_language_defaults.
    self.__init_language_defaults(default_g2p_model)

    model_is_missing = not os.path.exists(self.g2p_model_path)
    if model_is_missing:
        raise RuntimeError("No g2p model available (or wrong path)")

    self.g2p_model = RNNG2PModel(language=language)
    self.g2p_model.load(self.g2p_model_path)
def convert_to_phoneme_stress(source_file, destination_file, g2p_dict_path, g2p_model):
    """Write a phoneme-level stress dictionary derived from ``source_file``.

    The source dictionary is first converted to an accent-only file named
    ``ru_grapheme_stress.txt`` placed next to ``source_file``. Every word of
    that dictionary is transcribed with the G2P model, aligned with its
    transcription, and its primary and secondary stress positions are mapped
    through ``ZalyzniakDict.align_stresses``. A word is written only if every
    mapped position points at a phoneme contained in ``Phonemes.VOWELS``;
    otherwise the offending alignment is printed and the word is skipped.

    Each output line is ``phonemes<TAB>primary<TAB>secondary`` where the
    stress fields are comma-separated positions. The running count of written
    words is printed after every 1000th one.

    Args:
        source_file: Path of the source dictionary.
        destination_file: Path of the file to write (UTF-8, overwritten).
        g2p_dict_path: Passed as the first argument to ``RNNG2PModel``.
        g2p_model: Passed to ``RNNG2PModel.load``.
    """
    from rupo.g2p.rnn import RNNG2PModel
    from rupo.g2p.aligner import Aligner
    from rupo.stress.dict import StressDict

    predictor = RNNG2PModel(g2p_dict_path)
    predictor.load(g2p_model)
    aligner = Aligner()

    # The intermediate accent-only dictionary lives beside the source file.
    source_dir = os.path.dirname(os.path.abspath(source_file))
    accent_only_path = os.path.join(source_dir, "ru_grapheme_stress.txt")
    ZalyzniakDict.convert_to_accent_only(source_file, accent_only_path)
    stress_dict = StressDict(raw_dict_path=accent_only_path)
    vowel_set = set(Phonemes.VOWELS)

    written = 0
    with open(destination_file, 'w', encoding='utf-8') as out:
        for word, accents in stress_dict.get_all():
            # Each accent entry is indexed as (position, stress type).
            dict_primary = [
                int(entry[0]) for entry in accents
                if entry[1] == StressDict.StressType.PRIMARY
            ]
            dict_secondary = [
                int(entry[0]) for entry in accents
                if entry[1] == StressDict.StressType.SECONDARY
            ]

            phonemes = predictor.predict([word])[0]
            g, p = aligner.align(word, phonemes)
            primary = ZalyzniakDict.align_stresses(g, p, dict_primary)
            secondary = ZalyzniakDict.align_stresses(g, p, dict_secondary)

            # Report every stress that does not land on a vowel, not only
            # the first one, before deciding whether to skip the word.
            all_on_vowels = True
            for position in primary + secondary:
                phoneme = p[position]
                if phoneme in vowel_set:
                    continue
                print(g, p, position, phoneme)
                all_on_vowels = False
            if not all_on_vowels:
                continue

            fields = (
                phonemes,
                ",".join(map(str, primary)),
                ",".join(map(str, secondary)),
            )
            out.write("\t".join(fields) + "\n")
            written += 1
            if written % 1000 == 0:
                print(written)
def convert_to_g2p_only(dict_file, g2p_dict_path, g2p_model):
    """Write a ``word<TAB>transcription`` file for the words in ``dict_file``.

    For every line of ``dict_file`` the text after the first ``#`` is split
    on commas; each piece is stripped of surrounding whitespace and of the
    characters ``'`` and `````. All resulting words are transcribed in one
    ``predict`` call and written, one pair per line, to ``g2p_dict_path``.

    Args:
        dict_file: Path of the UTF-8 source dictionary.
        g2p_dict_path: Path of the UTF-8 output file (overwritten).
        g2p_model: Passed to ``RNNG2PModel.load``.

    Raises:
        IndexError: If a line of ``dict_file`` contains no ``#``.
    """
    from rupo.g2p.rnn import RNNG2PModel

    predictor = RNNG2PModel()
    predictor.load(g2p_model)

    with open(dict_file, 'r', encoding='utf-8') as source:
        lines = source.readlines()

    # NOTE(review): "'" and "`" look like stress marks - confirm.
    dropped_chars = ("'", "`")

    with open(g2p_dict_path, 'w', encoding='utf-8') as target:
        words = []
        for line in lines:
            word_forms = line.split("#")[1].split(",")
            words.extend(
                "".join(ch for ch in form.strip() if ch not in dropped_chars)
                for form in word_forms
            )

        transcriptions = predictor.predict(words)
        # Index into the predictions so a length mismatch still raises.
        for index, word in enumerate(words):
            target.write(word + "\t" + transcriptions[index] + "\n")
def __init__(self, language: str = "ru", stress_model_path: str = None,
             g2p_model_path: str = None, grapheme_set=RU_GRAPHEME_SET,
             g2p_dict_path=None, aligner_dump_path=None,
             ru_wiki_dict=RU_WIKI_DICT, cmu_dict=CMU_DICT):
    """Set up the object with loaded stress and G2P models and an aligner.

    Args:
        language: ``"ru"`` or ``"en"``.
        stress_model_path: Path of the stress model to load; may be ``None``.
        g2p_model_path: Path of the G2P model to load; may be ``None``.
        grapheme_set: Forwarded to ``Aligner``.
        g2p_dict_path: Forwarded to ``Aligner``.
        aligner_dump_path: Forwarded to ``Aligner``.
        ru_wiki_dict: Forwarded to ``Aligner`` as ``ru_wiki_dict``.
        cmu_dict: Forwarded to ``Aligner`` as ``cmu_dict``.

    Raises:
        RuntimeError: If ``language`` is neither ``"ru"`` nor ``"en"``, or
            if either model path does not exist on disk after the language
            defaults have been applied.
    """
    self.language = language
    self.stress_model_path = stress_model_path
    self.g2p_model_path = g2p_model_path

    # Select the per-language defaults first, then apply them in one place.
    if language == "ru":
        default_models = (RU_STRESS_DEFAULT_MODEL, RU_G2P_DEFAULT_MODEL)
    elif language == "en":
        default_models = (EN_STRESS_DEFAULT_MODEL, EN_G2P_DEFAULT_MODEL)
    else:
        raise RuntimeError("Wrong language")
    # NOTE(review): presumably fills in the model paths that were not given -
    # confirm against __init_language_defaults.
    self.__init_language_defaults(*default_models)

    both_models_exist = (os.path.exists(self.stress_model_path)
                         and os.path.exists(self.g2p_model_path))
    if not both_models_exist:
        raise RuntimeError(
            "No stress or g2p models available (or wrong paths)")

    self.stress_model = RNNStressModel(language=language)
    self.stress_model.load(self.stress_model_path)

    self.g2p_model = RNNG2PModel(language=language)
    self.g2p_model.load(self.g2p_model_path)

    self.aligner = Aligner(
        language,
        grapheme_set,
        g2p_dict_path,
        aligner_dump_path,
        ru_wiki_dict=ru_wiki_dict,
        cmu_dict=cmu_dict,
    )