def test_aligner(self):
    """Smoke-test ``Aligner.align`` on the first 50 lines of the G2P dictionary.

    Every line read from ``RU_G2P_DICT_PATH`` is stripped and split on tabs
    into a (grapheme, phoneme) pair, and the alignment of each pair is
    printed. Nothing is asserted: the test only fails if reading the file,
    unpacking a pair, or aligning raises.
    """
    # Function-scope import keeps the block self-contained.
    from itertools import islice

    aligner = Aligner()
    with open(RU_G2P_DICT_PATH, 'r', encoding='utf-8') as r:
        # Stop after 50 lines instead of loading the whole dictionary with
        # readlines() only to throw almost all of it away.
        pairs = [tuple(line.strip().split("\t")) for line in islice(r, 50)]
    for g, p in pairs:
        print(aligner.align(g, p))
def convert_to_phoneme_stress(source_file, destination_file, g2p_dict_path,
                              g2p_model):
    """Build a phoneme-level stress file from a grapheme-level stress source.

    Steps, in order:
      1. Create an ``RNNG2PModel`` from ``g2p_dict_path`` and load ``g2p_model``.
      2. Call ``ZalyzniakDict.convert_to_accent_only`` with ``source_file`` and
         a path named ``ru_grapheme_stress.txt`` located next to
         ``source_file`` (presumably it writes the accent-only dictionary
         there -- confirm against that method).
      3. Load that path as a ``StressDict`` and, for every (word, accents)
         entry, predict phonemes, align graphemes with phonemes and map the
         primary and secondary stress positions through
         ``ZalyzniakDict.align_stresses``.
      4. Write one line per accepted word to ``destination_file``:
         ``phonemes<TAB>primary positions<TAB>secondary positions``, with
         positions comma-separated.

    A word is rejected when any mapped stress position indexes a phoneme that
    is not in ``Phonemes.VOWELS``; each offending position is printed.
    The running count of written lines is printed every 1000 lines.

    Args:
        source_file: Path of the source dictionary.
        destination_file: Path of the file to write (UTF-8, overwritten).
        g2p_dict_path: Argument passed to the ``RNNG2PModel`` constructor.
        g2p_model: Argument passed to ``RNNG2PModel.load``.
    """
    from rupo.g2p.rnn import RNNG2PModel
    from rupo.g2p.aligner import Aligner
    from rupo.stress.dict import StressDict

    g2p_predictor = RNNG2PModel(g2p_dict_path)
    g2p_predictor.load(g2p_model)
    aligner = Aligner()

    source_dir = os.path.dirname(os.path.abspath(source_file))
    grapheme_stress_dict_path = os.path.join(source_dir,
                                             "ru_grapheme_stress.txt")
    ZalyzniakDict.convert_to_accent_only(source_file,
                                         grapheme_stress_dict_path)
    stress_dict = StressDict(raw_dict_path=grapheme_stress_dict_path)
    vowels = set(Phonemes.VOWELS)

    def positions_of(accents, stress_type):
        # Each accent is indexed as (position, type); keep positions of one type.
        return [int(accent[0]) for accent in accents
                if accent[1] == stress_type]

    with open(destination_file, 'w', encoding='utf-8') as out:
        written = 0
        for word, accents in stress_dict.get_all():
            primary_in_dict = positions_of(accents,
                                           StressDict.StressType.PRIMARY)
            secondary_in_dict = positions_of(accents,
                                             StressDict.StressType.SECONDARY)

            phonemes = g2p_predictor.predict([word])[0]
            g, p = aligner.align(word, phonemes)
            primary = ZalyzniakDict.align_stresses(g, p, primary_in_dict)
            secondary = ZalyzniakDict.align_stresses(g, p, secondary_in_dict)

            # Report every stress that does not land on a vowel; no early
            # exit, so all offending positions of a word are printed.
            all_on_vowels = True
            for position in primary + secondary:
                if p[position] in vowels:
                    continue
                print(g, p, position, p[position])
                all_on_vowels = False
            if not all_on_vowels:
                continue

            fields = (
                phonemes,
                ",".join(str(i) for i in primary),
                ",".join(str(i) for i in secondary),
            )
            out.write("\t".join(fields) + "\n")
            written += 1
            if written % 1000 == 0:
                print(written)
def __init__(self,
             language: str = "ru",
             stress_model_path: str = None,
             g2p_model_path: str = None,
             grapheme_set=RU_GRAPHEME_SET,
             g2p_dict_path=None,
             aligner_dump_path=None,
             ru_wiki_dict=RU_WIKI_DICT,
             cmu_dict=CMU_DICT):
    """Load the stress model, the G2P model and the aligner for a language.

    Args:
        language: ``"ru"`` or ``"en"``.
        stress_model_path: Stress model path; when ``None`` the language's
            default stress model path is used.
        g2p_model_path: G2P model path; when ``None`` the language's default
            G2P model path is used.
        grapheme_set, g2p_dict_path, aligner_dump_path, ru_wiki_dict,
        cmu_dict: Forwarded unchanged to the ``Aligner`` constructor.

    Raises:
        RuntimeError: If ``language`` is neither ``"ru"`` nor ``"en"``, or if
            either resolved model path does not exist.
    """
    self.language = language
    self.stress_model_path = stress_model_path
    self.g2p_model_path = g2p_model_path

    # Reject unsupported languages before resolving any defaults.
    if language not in ("ru", "en"):
        raise RuntimeError("Wrong language")
    if language == "ru":
        default_paths = (RU_STRESS_DEFAULT_MODEL, RU_G2P_DEFAULT_MODEL)
    else:
        default_paths = (EN_STRESS_DEFAULT_MODEL, EN_G2P_DEFAULT_MODEL)
    self.__init_language_defaults(*default_paths)

    # all() stops at the first missing path, so the stress model path is
    # checked first and the G2P path only if the first one exists.
    resolved_paths = (self.stress_model_path, self.g2p_model_path)
    if not all(os.path.exists(path) for path in resolved_paths):
        raise RuntimeError(
            "No stress or g2p models available (or wrong paths)")

    self.stress_model = RNNStressModel(language=language)
    self.stress_model.load(self.stress_model_path)

    self.g2p_model = RNNG2PModel(language=language)
    self.g2p_model.load(self.g2p_model_path)

    self.aligner = Aligner(language,
                           grapheme_set,
                           g2p_dict_path,
                           aligner_dump_path,
                           ru_wiki_dict=ru_wiki_dict,
                           cmu_dict=cmu_dict)
def test_aligner(self):
    """Check ``Aligner.align`` on known pairs, then smoke-test dictionary lines.

    First, four fixed (grapheme, phoneme) pairs are aligned and compared with
    their expected aligned forms. Each pair runs in its own ``subTest`` so a
    failure on one pair does not hide the results of the others.

    Then the first 50 lines of ``RU_G2P_DICT_PATH`` are split on tabs into
    pairs and aligned; the results are only logged at DEBUG level, so this
    part fails only if reading, unpacking, or aligning raises.
    """
    # Function-scope import keeps the block self-contained.
    from itertools import islice

    aligner = Aligner()

    # (graphemes, phonemes, expected result of align)
    cases = [
        ('абазия', 'ɐbɐzʲijə', ('абаз и я', 'ɐbɐzʲijə')),
        ('аахенец', 'aəxʲɪnʲɪʦ', ('аах ен ец', 'aəxʲɪnʲɪʦ')),
        ('абатский', 'ɐbaʦkʲɪj', ('абатск ий', 'ɐbaʦ kʲɪj')),
        ('абазинско-русский', 'ɐbɐzʲinskəruskʲɪj',
         ('абаз инско-русск ий', 'ɐbɐzʲinskə rus kʲɪj')),
    ]
    for graphemes, phonemes, expected in cases:
        with self.subTest(graphemes=graphemes):
            self.assertEqual(aligner.align(graphemes, phonemes), expected)

    with open(RU_G2P_DICT_PATH, 'r', encoding='utf-8') as r:
        # Stop after 50 lines instead of loading the whole dictionary with
        # readlines() only to throw almost all of it away.
        pairs = [tuple(line.strip().split("\t")) for line in islice(r, 50)]
    for g, p in pairs:
        logging.debug(aligner.align(g, p))
class RNNStressPredictor(StressPredictor):
    """Stress predictor that chains a G2P model, a stress model and an aligner.

    ``predict`` converts a word to phonemes, asks the stress model for
    per-position labels, and maps the selected positions back onto the word
    through the aligner.
    """

    def __init__(self,
                 language: str = "ru",
                 stress_model_path: str = None,
                 g2p_model_path: str = None,
                 grapheme_set=RU_GRAPHEME_SET,
                 g2p_dict_path=None,
                 aligner_dump_path=None,
                 ru_wiki_dict=RU_WIKI_DICT,
                 cmu_dict=CMU_DICT):
        """Load the stress model, the G2P model and the aligner.

        Args:
            language: ``"ru"`` or ``"en"``.
            stress_model_path: Stress model path; ``None`` selects the
                language's default.
            g2p_model_path: G2P model path; ``None`` selects the language's
                default.
            grapheme_set, g2p_dict_path, aligner_dump_path, ru_wiki_dict,
            cmu_dict: Forwarded unchanged to the ``Aligner`` constructor.

        Raises:
            RuntimeError: If ``language`` is neither ``"ru"`` nor ``"en"``,
                or if either resolved model path does not exist.
        """
        self.language = language
        self.stress_model_path = stress_model_path
        self.g2p_model_path = g2p_model_path

        # Reject unsupported languages before resolving any defaults.
        if language not in ("ru", "en"):
            raise RuntimeError("Wrong language")
        if language == "ru":
            default_paths = (RU_STRESS_DEFAULT_MODEL, RU_G2P_DEFAULT_MODEL)
        else:
            default_paths = (EN_STRESS_DEFAULT_MODEL, EN_G2P_DEFAULT_MODEL)
        self.__init_language_defaults(*default_paths)

        # all() stops at the first missing path: stress model first, then G2P.
        resolved_paths = (self.stress_model_path, self.g2p_model_path)
        if not all(os.path.exists(path) for path in resolved_paths):
            raise RuntimeError(
                "No stress or g2p models available (or wrong paths)")

        self.stress_model = RNNStressModel(language=language)
        self.stress_model.load(self.stress_model_path)

        self.g2p_model = RNNG2PModel(language=language)
        self.g2p_model.load(self.g2p_model_path)

        self.aligner = Aligner(language,
                               grapheme_set,
                               g2p_dict_path,
                               aligner_dump_path,
                               ru_wiki_dict=ru_wiki_dict,
                               cmu_dict=cmu_dict)

    def __init_language_defaults(self, stress_model_path, g2p_model_path):
        """Fill in model paths that the caller left as ``None``."""
        # Explicitly supplied paths always win over the language defaults.
        for attribute, fallback in (("stress_model_path", stress_model_path),
                                    ("g2p_model_path", g2p_model_path)):
            if getattr(self, attribute) is None:
                setattr(self, attribute, fallback)

    def predict(self, word: str) -> List[int]:
        """Return the predicted stress positions for ``word``.

        The word is lower-cased, converted to phonemes (spaces removed), and
        labelled by the stress model; positions labelled 1 or 2 are kept
        (presumably primary and secondary stress -- confirm against the
        stress model). These positions are passed to
        ``Aligner.align_stresses`` with ``is_grapheme=False``, then shifted
        left by the number of spaces preceding them in the aligned grapheme
        string, and finally positions not below ``len(word)`` are dropped.
        """
        word = word.lower()
        raw_phonemes = self.g2p_model.predict([word])[0]
        phonemes = raw_phonemes.replace(" ", "")

        labels = self.stress_model.predict([phonemes])[0]
        labelled_positions = [
            position for position, label in enumerate(labels)
            if label in (1, 2)
        ]

        g, p = self.aligner.align(word, phonemes)
        aligned_positions = self.aligner.align_stresses(g,
                                                        p,
                                                        labelled_positions,
                                                        is_grapheme=False)

        # The aligned grapheme string contains padding spaces; subtract the
        # ones before each position to index into the unpadded word.
        word_positions = [
            position - g[:position].count(" ")
            for position in aligned_positions
        ]
        return [
            position for position in word_positions if position < len(word)
        ]