Beispiel #1
0
 def test_aligner(self):
     """Smoke-run the aligner on the first 50 entries of the G2P dictionary.

     Each line of the file at ``RU_G2P_DICT_PATH`` must hold exactly two
     tab-separated fields (graphemes, phonemes); any other shape raises
     ``ValueError`` on unpacking. Nothing is asserted: the alignment of
     every pair is only printed.
     """
     from itertools import islice

     aligner = Aligner()
     with open(RU_G2P_DICT_PATH, 'r', encoding='utf-8') as r:
         # islice stops after 50 lines, so the remainder of the dictionary
         # is never read into memory.
         for line in islice(r, 50):
             g, p = line.strip().split("\t")
             print(aligner.align(g, p))
Beispiel #2
0
 def convert_to_phoneme_stress(source_file, destination_file, g2p_dict_path,
                               g2p_model):
     """Convert a stress dictionary into a phoneme-level stress file.

     The source dictionary is first reduced to an accent-only file named
     ``ru_grapheme_stress.txt`` placed next to ``source_file``. Every word of
     that file is then transcribed by the G2P model, aligned with its
     transcription, and its primary/secondary stress positions are mapped
     through ``ZalyzniakDict.align_stresses``.

     One line is written to ``destination_file`` per accepted word:
     ``phonemes<TAB>primary<TAB>secondary``, the two stress fields being
     comma-separated positions. A word is accepted only when every mapped
     position points at a vowel phoneme of the aligned transcription;
     offending positions are printed. A running word count is printed every
     1000 words.

     :param source_file: path of the source stress dictionary.
     :param destination_file: path of the file to write.
     :param g2p_dict_path: passed to the ``RNNG2PModel`` constructor.
     :param g2p_model: passed to ``RNNG2PModel.load``.
     """
     from rupo.g2p.rnn import RNNG2PModel
     from rupo.g2p.aligner import Aligner
     from rupo.stress.dict import StressDict

     predictor = RNNG2PModel(g2p_dict_path)
     predictor.load(g2p_model)
     word_aligner = Aligner()

     # The intermediate accent-only dictionary lives beside the source file.
     source_dir = os.path.dirname(os.path.abspath(source_file))
     accent_dict_path = os.path.join(source_dir, "ru_grapheme_stress.txt")
     ZalyzniakDict.convert_to_accent_only(source_file, accent_dict_path)
     stress_dict = StressDict(raw_dict_path=accent_dict_path)
     vowel_phonemes = set(Phonemes.VOWELS)

     with open(destination_file, 'w', encoding='utf-8') as out:
         for count, (word, accents) in enumerate(stress_dict.get_all(),
                                                 start=1):
             dict_primary = [
                 int(accent[0]) for accent in accents
                 if accent[1] == StressDict.StressType.PRIMARY
             ]
             dict_secondary = [
                 int(accent[0]) for accent in accents
                 if accent[1] == StressDict.StressType.SECONDARY
             ]

             phonemes = predictor.predict([word])[0]
             g, p = word_aligner.align(word, phonemes)
             primary = ZalyzniakDict.align_stresses(g, p, dict_primary)
             secondary = ZalyzniakDict.align_stresses(g, p, dict_secondary)

             # Report every position that misses a vowel, not just the first.
             all_on_vowels = True
             for position in primary + secondary:
                 if p[position] in vowel_phonemes:
                     continue
                 print(g, p, position, p[position])
                 all_on_vowels = False

             if all_on_vowels:
                 # NOTE(review): positions were validated against the aligned
                 # string `p`, but the unaligned `phonemes` is what is
                 # written -- confirm the two index spaces agree.
                 fields = (
                     phonemes,
                     ",".join(map(str, primary)),
                     ",".join(map(str, secondary)),
                 )
                 out.write("\t".join(fields) + "\n")

             if count % 1000 == 0:
                 print(count)
Beispiel #3
0
    def __init__(self,
                 language: str = "ru",
                 stress_model_path: str = None,
                 g2p_model_path: str = None,
                 grapheme_set=RU_GRAPHEME_SET,
                 g2p_dict_path=None,
                 aligner_dump_path=None,
                 ru_wiki_dict=RU_WIKI_DICT,
                 cmu_dict=CMU_DICT):
        """Load the stress model, the G2P model and the aligner.

        :param language: "ru" or "en"; anything else raises RuntimeError.
        :param stress_model_path: stress model file; when None the default
            for ``language`` is used.
        :param g2p_model_path: G2P model file; when None the default for
            ``language`` is used.
        :param grapheme_set: forwarded to ``Aligner``.
        :param g2p_dict_path: forwarded to ``Aligner``.
        :param aligner_dump_path: forwarded to ``Aligner``.
        :param ru_wiki_dict: forwarded to ``Aligner``.
        :param cmu_dict: forwarded to ``Aligner``.
        :raises RuntimeError: on an unsupported language or when either
            model path does not exist.
        """
        self.language = language
        self.stress_model_path = stress_model_path
        self.g2p_model_path = g2p_model_path

        # Pick the per-language fallbacks first, then apply them once.
        if language == "ru":
            fallback_paths = (RU_STRESS_DEFAULT_MODEL, RU_G2P_DEFAULT_MODEL)
        elif language == "en":
            fallback_paths = (EN_STRESS_DEFAULT_MODEL, EN_G2P_DEFAULT_MODEL)
        else:
            raise RuntimeError("Wrong language")
        self.__init_language_defaults(*fallback_paths)

        both_present = (os.path.exists(self.stress_model_path)
                        and os.path.exists(self.g2p_model_path))
        if not both_present:
            raise RuntimeError(
                "No stress or g2p models available (or wrong paths)")

        self.stress_model = RNNStressModel(language=language)
        self.stress_model.load(self.stress_model_path)

        self.g2p_model = RNNG2PModel(language=language)
        self.g2p_model.load(self.g2p_model_path)

        self.aligner = Aligner(
            language,
            grapheme_set,
            g2p_dict_path,
            aligner_dump_path,
            ru_wiki_dict=ru_wiki_dict,
            cmu_dict=cmu_dict,
        )
Beispiel #4
0
 def test_aligner(self):
     """Check the aligner on fixed word/transcription pairs, then smoke-run it.

     The first part asserts the exact ``(graphemes, phonemes)`` tuple
     returned by ``Aligner.align`` for four known inputs. The second part
     aligns the first 50 entries of the file at ``RU_G2P_DICT_PATH`` and
     only logs the results at debug level; each of those lines must hold
     exactly two tab-separated fields or unpacking raises ``ValueError``.
     """
     from itertools import islice

     aligner = Aligner()

     # (graphemes, phonemes, expected alignment)
     cases = (
         ('абазия', 'ɐbɐzʲijə', ('абаз и я', 'ɐbɐzʲijə')),
         ('аахенец', 'aəxʲɪnʲɪʦ', ('аах ен ец', 'aəxʲɪnʲɪʦ')),
         ('абатский', 'ɐbaʦkʲɪj', ('абатск ий', 'ɐbaʦ kʲɪj')),
         ('абазинско-русский', 'ɐbɐzʲinskəruskʲɪj',
          ('абаз инско-русск ий', 'ɐbɐzʲinskə rus kʲɪj')),
     )
     for graphemes, phonemes, expected in cases:
         self.assertEqual(aligner.align(graphemes, phonemes), expected)

     with open(RU_G2P_DICT_PATH, 'r', encoding='utf-8') as r:
         # islice stops after 50 lines, so the remainder of the dictionary
         # is never read into memory.
         for line in islice(r, 50):
             g, p = line.strip().split("\t")
             logging.debug(aligner.align(g, p))
Beispiel #5
0
class RNNStressPredictor(StressPredictor):
    """Stress predictor chaining a G2P model, a stress model and an aligner.

    A word is transcribed to phonemes, stress labels are predicted on the
    phoneme string, and the stressed phoneme positions are mapped back onto
    the word through the aligner.
    """

    def __init__(self,
                 language: str = "ru",
                 stress_model_path: str = None,
                 g2p_model_path: str = None,
                 grapheme_set=RU_GRAPHEME_SET,
                 g2p_dict_path=None,
                 aligner_dump_path=None,
                 ru_wiki_dict=RU_WIKI_DICT,
                 cmu_dict=CMU_DICT):
        """Load both models and build the aligner.

        :param language: "ru" or "en"; anything else raises RuntimeError.
        :param stress_model_path: stress model file; when None the default
            for ``language`` is used.
        :param g2p_model_path: G2P model file; when None the default for
            ``language`` is used.
        :param grapheme_set: forwarded to ``Aligner``.
        :param g2p_dict_path: forwarded to ``Aligner``.
        :param aligner_dump_path: forwarded to ``Aligner``.
        :param ru_wiki_dict: forwarded to ``Aligner``.
        :param cmu_dict: forwarded to ``Aligner``.
        :raises RuntimeError: on an unsupported language or when either
            model path does not exist.
        """
        self.language = language
        self.stress_model_path = stress_model_path
        self.g2p_model_path = g2p_model_path

        # Resolve the per-language fallbacks, then fill in missing paths.
        if language == "ru":
            fallback_paths = (RU_STRESS_DEFAULT_MODEL, RU_G2P_DEFAULT_MODEL)
        elif language == "en":
            fallback_paths = (EN_STRESS_DEFAULT_MODEL, EN_G2P_DEFAULT_MODEL)
        else:
            raise RuntimeError("Wrong language")
        self.__init_language_defaults(*fallback_paths)

        # The stress model is checked first; the first missing file aborts.
        for model_path in (self.stress_model_path, self.g2p_model_path):
            if not os.path.exists(model_path):
                raise RuntimeError(
                    "No stress or g2p models available (or wrong paths)")

        self.stress_model = RNNStressModel(language=language)
        self.stress_model.load(self.stress_model_path)

        self.g2p_model = RNNG2PModel(language=language)
        self.g2p_model.load(self.g2p_model_path)

        self.aligner = Aligner(
            language,
            grapheme_set,
            g2p_dict_path,
            aligner_dump_path,
            ru_wiki_dict=ru_wiki_dict,
            cmu_dict=cmu_dict,
        )

    def __init_language_defaults(self, stress_model_path, g2p_model_path):
        """Fill in any model path that was left as None.

        :param stress_model_path: fallback for ``self.stress_model_path``.
        :param g2p_model_path: fallback for ``self.g2p_model_path``.
        """
        if self.stress_model_path is None:
            self.stress_model_path = stress_model_path
        if self.g2p_model_path is None:
            self.g2p_model_path = g2p_model_path

    def predict(self, word: str) -> List[int]:
        """Return the stressed positions of ``word``.

        :param word: the word to analyse; it is lower-cased before use.
        :return: positions produced by the aligner, shifted left by the
            number of spaces preceding them in the aligned grapheme string
            and restricted to values below ``len(word)``.
        """
        word = word.lower()
        word_length = len(word)

        # The stress model receives the transcription without spaces.
        transcription = self.g2p_model.predict([word])[0]
        phonemes = transcription.replace(" ", "")

        # Keep positions labelled 1 or 2 (presumably primary and secondary
        # stress -- confirm against the stress model).
        labels = self.stress_model.predict([phonemes])[0]
        phoneme_positions = [
            index for index, label in enumerate(labels) if label in (1, 2)
        ]

        g, p = self.aligner.align(word, phonemes)
        aligned = self.aligner.align_stresses(g,
                                              p,
                                              phoneme_positions,
                                              is_grapheme=False)

        # Subtract the spaces found in `g` before each position; the list
        # returned by the aligner is updated in place.
        for slot, position in enumerate(aligned):
            aligned[slot] -= g[:position].count(" ")

        return [position for position in aligned if position < word_length]