def main():
    """Check the first 100 VoxForge-Ru prompts for accentuation problems.

    Usage: <script> <input_directory_with_voxforge_ru>

    For each prompt, every sub-sentence is morphologically tagged and
    accented.  A prompt is reported when it cannot be unambiguously accented,
    and every multi-vowel word left without a '+' accent mark is reported as
    unknown.  Finally the number of ambiguous prompts is printed.
    """
    if len(sys.argv) > 1:
        init_dir_name = os.path.normpath(sys.argv[1])
        assert os.path.isdir(init_dir_name), \
            'Directory `{0}` does not exist!'.format(init_dir_name)
        all_prompts = sorted(get_all_prompts(init_dir_name))
        accentor = Accentor()
        morpho_predictor = RNNMorphPredictor()
        ambiguous_counter = 0
        for cur_prompt in all_prompts[:100]:
            trouble = False
            unknown_words = []
            for cur_subsentence in select_subsentences(cur_prompt):
                morphotags = [
                    '{0} {1}'.format(cur_morpho.pos, cur_morpho.tag)
                    for cur_morpho in
                    morpho_predictor.predict_sentence_tags(cur_subsentence)
                ]
                accent_variants = accentor.do_accents(cur_subsentence,
                                                      morphotags)
                if len(accent_variants) > 1:
                    trouble = True
                else:
                    accented_phrase = accent_variants[0]
                    for cur_word in accented_phrase:
                        # A word with several vowels but no '+' mark was not
                        # found in the accent dictionary.
                        vowels_counter = sum(
                            1 for cur_char in cur_word.lower()
                            if cur_char in VOWEL_LETTERS)
                        if '+' not in cur_word and vowels_counter > 1:
                            unknown_words.append(cur_word)
            if trouble:
                print('`{0}`: this phrase cannot be unambiguously accented!'.format(cur_prompt))
                ambiguous_counter += 1
            if unknown_words:
                # sorted() makes the report order deterministic (a bare set
                # iteration is not); the duplicated "this this" typo in the
                # message is also fixed here.
                for unknown_word in sorted(set(unknown_words)):
                    print('`{0}`: word `{1}` in this phrase is unknown!'.format(cur_prompt, unknown_word))
        print(ambiguous_counter)
    else:
        print("Usage: input_directory_with_voxforge_ru")
def transcribe_words(source_words_list):
    """Build phonetic transcriptions for a list of Russian words.

    Each word is stripped, lowercased and validated, accented for every
    morphological variant returned by pymorphy2, and each accented variant is
    converted to phonemes.  Progress is printed in roughly 1% chunks.

    Args:
        source_words_list: list of Russian words (str), hyphens allowed.

    Returns:
        A tuple ``(transcriptions, bad_words)``: ``transcriptions`` is a list
        of ``(word, phoneme_string)`` pairs, with alternative transcriptions
        of the same word suffixed like ``"word(2)"``; ``bad_words`` lists the
        words that could not be transcribed.

    Raises:
        AssertionError: if a word is empty or contains inadmissible
            characters.
    """
    n_words = len(source_words_list)
    # Report progress in (roughly) 1% chunks of the input.
    n_parts = 100
    part_size = n_words // n_parts
    while (part_size * n_parts) < n_words:
        part_size += 1
    transcriptions = []
    bad_words = []
    to_ud2 = converters.converter('opencorpora-int', 'ud20')
    morph = pymorphy2.MorphAnalyzer()
    accentor = Accentor(exception_for_unknown=True, use_wiki=False)
    g2p = Grapheme2Phoneme(exception_for_nonaccented=True)
    russian_letters = set(
        'АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя')
    russian_consonants = set('БбВвГгДдЖжЗзЙйКкЛлМмНнПпРрСсТтФфХхЦцЧчШшЩщЪъЬь')
    part_counter = 0
    # enumerate() instead of range(len(...)): same indices, clearer intent.
    for word_idx, source_word in enumerate(source_words_list):
        cur_word = source_word.strip().lower()
        err_msg = 'Word {0} is wrong!'.format(word_idx)
        assert len(cur_word) > 0, err_msg + ' It is empty!'
        assert set(cur_word) <= (russian_letters | {'-'}), \
            err_msg + ' "{0}" contains an inadmissible characters.'.format(cur_word)
        assert set(cur_word) != {'-'}, err_msg + ' It is empty!'
        if (len(cur_word) > 1) and (set(cur_word) <= russian_consonants):
            # A multi-letter consonant-only token cannot carry an accent.
            bad_words.append(cur_word)
        else:
            morpho_variants = set(
                [to_ud2(str(it.tag)) for it in morph.parse(cur_word)])
            try:
                accentuation_variants = []
                for it in morpho_variants:
                    accentuation_variants += accentor.do_accents(
                        [[cur_word, it]])[0]
                # NOTE(review): set() makes the variant order nondeterministic
                # across runs, so which transcription becomes the primary one
                # may vary — confirm whether callers rely on the order.
                variants_of_transcriptions = list(
                    set(
                        filter(
                            lambda it2: len(it2) > 0,
                            map(lambda it: tuple(g2p.word_to_phonemes(it)),
                                accentuation_variants))))
                if len(variants_of_transcriptions) > 0:
                    transcriptions.append(
                        (cur_word, ' '.join(variants_of_transcriptions[0])))
                    if len(variants_of_transcriptions) > 1:
                        for variant_idx in range(
                                1, len(variants_of_transcriptions)):
                            transcriptions.append(
                                ('{0}({1})'.format(cur_word, variant_idx + 1),
                                 ' '.join(
                                     variants_of_transcriptions[variant_idx])))
                else:
                    bad_words.append(cur_word)
            except Exception:
                # Was a bare `except:`; narrowed so SystemExit and
                # KeyboardInterrupt are no longer swallowed.  Any failure in
                # accenting/G2P marks the word as bad (deliberate best-effort).
                bad_words.append(cur_word)
        if ((word_idx + 1) % part_size) == 0:
            part_counter += 1
            print('{0:.2%} of words have been processed...'.format(
                part_counter / float(n_parts)))
    if part_counter < n_parts:
        print('100.00% of words have been processed...')
    return transcriptions, bad_words
def __init__(self, raise_exceptions: bool = False, batch_size: int = 64,
             verbose: bool = False, use_wiki: bool = False):
    """Wire up the three pipeline stages: preprocessing, accenting, G2P.

    Args:
        raise_exceptions: forwarded to the accentor
            (``exception_for_unknown``) and the G2P converter
            (``exception_for_nonaccented``).
        batch_size: batch size forwarded to the preprocessor.
        verbose: when True, progress messages are printed.
        use_wiki: forwarded to the accentor.
    """
    self.verbose = verbose
    self.__g2p = Grapheme2Phoneme(exception_for_nonaccented=raise_exceptions)
    self.__accentor = Accentor(exception_for_unknown=raise_exceptions,
                               use_wiki=use_wiki)
    self.__preprocessor = Preprocessor(batch_size=batch_size)
class Transcription:
    """Text-to-phonemes pipeline: preprocessing, accenting and G2P.

    A text that fails accenting or phoneme conversion contributes an empty
    list to the output instead of aborting the whole batch.
    """

    def __init__(self, raise_exceptions: bool = False, batch_size: int = 64,
                 verbose: bool = False, use_wiki: bool = False):
        """Create the pipeline components.

        Args:
            raise_exceptions: forwarded to the accentor
                (``exception_for_unknown``) and the G2P converter
                (``exception_for_nonaccented``).
            batch_size: batch size forwarded to the preprocessor.
            verbose: when True, progress messages are printed.
            use_wiki: forwarded to the accentor.
        """
        self.__preprocessor = Preprocessor(batch_size=batch_size)
        self.__accentor = Accentor(exception_for_unknown=raise_exceptions,
                                   use_wiki=use_wiki)
        self.__g2p = Grapheme2Phoneme(
            exception_for_nonaccented=raise_exceptions)
        self.verbose = verbose

    def transcribe(self, texts: list):
        """Transcribe a list of texts to phoneme sequences.

        Args:
            texts: list of source texts.

        Returns:
            A list with one entry per text; each entry is a list of phoneme
            sequences (one per phonetic word between ``<sil>`` markers), or
            an empty list if the text could not be processed.
        """
        all_words_and_tags = self.__preprocessor.preprocessing(texts)
        if self.verbose:
            print('All texts have been preprocessed...')
        n_texts = len(texts)
        # Progress is reported in (roughly) 1% chunks.
        n_data_parts = 100
        part_size = n_texts // n_data_parts
        while (part_size * n_data_parts) < n_texts:
            part_size += 1
        data_counter = 0
        part_counter = 0
        total_result = []
        for cur_words_and_tags in all_words_and_tags:
            try:
                accented_text = self.__accentor.do_accents(cur_words_and_tags)
            except Exception:
                # Was a bare `except:`; narrowed so SystemExit and
                # KeyboardInterrupt propagate.  Failure -> empty result.
                accented_text = []
            if len(accented_text) > 0:
                tmp = ' '.join(accented_text[0])
                tmp = ' ' + tmp
                # '<sil>' marks pauses; split the phrase on them.
                phonetic_words = tmp.split(' <sil>')
                try:
                    result = []
                    for phonetic_word in phonetic_words:
                        if len(phonetic_word) != 0:
                            phonemes = self.__g2p.phrase_to_phonemes(
                                phonetic_word)
                            result.append(phonemes)
                except Exception:
                    # Was a bare `except:` — same narrowing as above.
                    result = []
            else:
                result = []
            total_result.append(result)
            data_counter += 1
            if (part_size > 0) and self.verbose:
                if (data_counter % part_size) == 0:
                    part_counter += 1
                    print('{0}% of texts have been processed...'.format(
                        part_counter))
        if (part_counter < n_data_parts) and self.verbose:
            print('100% of texts have been processed...')
        return total_result
def __init__(self, raise_exceptions: bool = False, batch_size: int = 64,
             verbose: bool = False, use_wiki: bool = False):
    """Create the transcription pipeline components.

    Args:
        raise_exceptions (bool, optional): if True, the accentor raises on
            unknown words (``exception_for_unknown``) and the G2P converter
            raises on non-accented input (``exception_for_nonaccented``)
            instead of handling them silently. Defaults to False.
        batch_size (int, optional): batch size forwarded to the text
            preprocessor. Defaults to 64.
        verbose (bool, optional): if True, progress messages are printed by
            the methods that use this flag. Defaults to False.
        use_wiki (bool, optional): forwarded to the accentor — presumably
            enables an online/Wiktionary lookup for unknown words; confirm
            against the Accentor implementation. Defaults to False.
    """
    self.__preprocessor = Preprocessor(batch_size=batch_size)
    self.__accentor = Accentor(exception_for_unknown=raise_exceptions,
                               use_wiki=use_wiki)
    self.__g2p = Grapheme2Phoneme(
        exception_for_nonaccented=raise_exceptions)
    self.verbose = verbose
def transcribe(self, text: str):
    """Convert a single text into a list of phoneme sequences.

    The text is preprocessed into words with morphotags, accented, split on
    the ``<sil>`` pause marker, and each non-empty phonetic word is converted
    to phonemes.

    Args:
        text: source text to transcribe.

    Returns:
        A list with one phoneme sequence per phonetic word.
    """
    # NOTE(review): this method ignores `self` and builds fresh pipeline
    # objects per call — presumably intentional for a stateless helper;
    # confirm whether the instance's own components should be used instead.
    words_and_tags = Preprocessor().preprocessing(text)
    accented_text = Accentor().do_accents(words_and_tags)
    tmp = ' ' + ' '.join(accented_text[0])
    phonetic_words = tmp.split(' <sil>')
    # Hoisted out of the loop: the original constructed a brand-new
    # Grapheme2Phoneme converter for every single phonetic word.
    g2p = Grapheme2Phoneme()
    result = []
    for phonetic_word in phonetic_words:
        if len(phonetic_word) != 0:
            result.append(g2p.phrase_to_phonemes(phonetic_word))
    return result
class TestRussianAccentor2(unittest.TestCase):
    """Tests for the accentor in 'many' mode (all accent variants)."""

    def setUp(self):
        self.__accentor = Accentor(mode='many')

    def tearDown(self):
        del self.__accentor

    def test_do_accents_positive01(self):
        """An ambiguous word yields every admissible accent variant."""
        phrase = [['оружие'], ['для'], ['кубы']]
        expected = [['ору+жие', 'для', 'ку+бы'],
                    ['ору+жие', 'для', 'кубы+']]
        self.assertEqual(expected, self.__accentor.do_accents(phrase))
def __init__(self, path_to_w2v='modelphonemes.model',
             path_to_annoy='annoy_index.ann', path_to_dict='data.pkl'):
    """Load the phoneme word2vec model, the accent dictionary and the
    Annoy index, and create the transcription/accenting helpers.

    Raises:
        IOError: if any of the three files does not exist.
    """
    self.your_transcriptor = Transcription()
    self.your_accentor = Accentor()
    # Guard clauses instead of if/else: fail fast on a missing file.
    if not os.path.isfile(path_to_w2v):
        raise IOError("File {} does not exist!".format(path_to_w2v))
    self.model = gensim.models.Word2Vec.load(path_to_w2v)
    if not os.path.isfile(path_to_dict):
        raise IOError("File {} does not exist!".format(path_to_dict))
    with open(path_to_dict, 'rb') as dict_file:
        self.dict_of_acc = pickle.load(dict_file)
    self.accents = list(self.dict_of_acc.keys())
    # Every accent vector shares one length; it fixes the Annoy item size.
    vector_len = len(self.accents[0])
    self.t = AnnoyIndex(vector_len, 'hamming')
    if not os.path.isfile(path_to_annoy):
        raise IOError("File {} does not exist!".format(path_to_annoy))
    self.t.load(path_to_annoy)
def test_do_accents_positive11(self):
    """A word written with 'е' in place of 'ё' is still accented
    (the accented form restores the 'ё')."""
    strict_accentor = Accentor(exception_for_unknown=True)
    phrase = [['зеленого'], ['камня']]
    expected = [['зелё+ного', 'ка+мня']]
    self.assertEqual(expected, strict_accentor.do_accents(phrase))
def setUp(self):
    # A fresh default-mode Accentor for every test case, so no state can
    # leak between tests.  The name-mangled attribute is shared with the
    # sibling test methods of the enclosing TestCase.
    self.__accentor = Accentor()
def test_do_accents_positive08(self):
    """With exception_for_unknown=True an unknown word raises ValueError."""
    strict_accentor = Accentor(exception_for_unknown=True)
    phrase = [['хракозябр'], ['впулил'], ['куздру']]
    with self.assertRaises(ValueError):
        strict_accentor.do_accents(phrase)
class TestRussianAccentor1(unittest.TestCase):
    """Tests for the default (single-best) mode of the Russian Accentor.

    Covers accenting with and without morphotags, error reporting for
    malformed input, and the word-form/morphotag helper predicates.
    """

    def setUp(self):
        # Fresh default-mode accentor for every test case.
        self.__accentor = Accentor()

    def tearDown(self):
        del self.__accentor

    def test_do_accents_positive01(self):
        # Unambiguous words are accented without any morphotags.
        source_phrase = [['мама'], ['мыла'], ['раму']]
        target_variants = [['ма+ма', 'мы+ла', 'ра+му']]
        real_variants = self.__accentor.do_accents(source_phrase)
        self.assertEqual(target_variants, real_variants)

    def test_do_accents_positive02(self):
        # Morphotags disambiguate the homograph 'кума' (Nom Fem here).
        source_phrase_n_morphotags = [[
            'привет', 'NOUN Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing'
        ], ['кума', 'NOUN Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing']]
        target_variants = [['приве+т', 'кума+']]
        real_variants = self.__accentor.do_accents(source_phrase_n_morphotags)
        self.assertEqual(target_variants, real_variants)

    def test_do_accents_positive03(self):
        # The same homograph with Gen Masc morphology gets another accent.
        source_phrase_n_morphotags = [
            ['подарок', 'NOUN Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing'],
            ['для', 'ADP _'],
            ['кума', 'NOUN Animacy=Anim|Case=Gen|Gender=Masc|Number=Sing']
        ]
        target_variants = [['пода+рок', 'для', 'ку+ма']]
        real_variants = self.__accentor.do_accents(source_phrase_n_morphotags)
        self.assertEqual(target_variants, real_variants)

    def test_do_accents_positive04(self):
        # Without morphotags an ambiguous word ('кубы') is left unaccented.
        source_phrase = [['оружие'], ['для'], ['кубы']]
        target_variants = [['ору+жие', 'для', 'кубы']]
        real_variants = self.__accentor.do_accents(source_phrase)
        self.assertEqual(target_variants, real_variants)

    def test_do_accents_positive05(self):
        source_phrase = [['машинисты'], ['любят'], ['кофе']]
        target_variants = [['машини+сты', 'лю+бят', 'ко+фе']]
        real_variants = self.__accentor.do_accents(source_phrase)
        self.assertEqual(target_variants, real_variants)

    def test_do_accents_positive06(self):
        # Hyphenated words are handled as single tokens.
        source_phrase = [['во-первых'], ['кто-то'], ['вот-вот']]
        target_variants = [['во-пе+рвых', 'кто+-то', 'вот-во+т']]
        real_variants = self.__accentor.do_accents(source_phrase)
        self.assertEqual(target_variants, real_variants)

    def test_do_accents_positive07(self):
        # Unknown words pass through unchanged in the default mode.
        source_phrase = [['хракозябр'], ['впулил'], ['куздру']]
        target_variants = [['хракозябр', 'впулил', 'куздру']]
        real_variants = self.__accentor.do_accents(source_phrase)
        self.assertEqual(target_variants, real_variants)

    def test_do_accents_positive08(self):
        # With exception_for_unknown=True the same phrase raises ValueError.
        accentor = Accentor(exception_for_unknown=True)
        source_phrase = [['хракозябр'], ['впулил'], ['куздру']]
        with self.assertRaises(ValueError):
            _ = accentor.do_accents(source_phrase)

    def test_do_accents_positive09(self):
        # Each part of a hyphenated compound receives its own accent.
        source_phrase = [['серебристо-белый'], ['цвет']]
        target_variants = [['серебри+сто-бе+лый', 'цве+т']]
        real_variants = self.__accentor.do_accents(source_phrase)
        self.assertEqual(target_variants, real_variants)

    def test_do_accents_positive10(self):
        source_phrase = [['озеро'], ['так'], ['серебристо'], ['в'], ['свете'],
                         ['солнца']]
        target_variants = [[
            'о+зеро', 'та+к', 'серебри+сто', 'в', 'све+те', 'со+лнца'
        ]]
        real_variants = self.__accentor.do_accents(source_phrase)
        self.assertEqual(target_variants, real_variants)

    def test_do_accents_positive11(self):
        # 'е' written in place of 'ё' is restored in the accented form.
        accentor = Accentor(exception_for_unknown=True)
        source_phrase = [['зеленого'], ['камня']]
        target_variants = [['зелё+ного', 'ка+мня']]
        real_variants = accentor.do_accents(source_phrase)
        self.assertEqual(target_variants, real_variants)

    def test_do_accents_negative01(self):
        # A word without its morphotag triggers an AssertionError.
        source_phrase_n_morphotags = [[
            'подарок', 'NOUN Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing'
        ], ['для', 'NOUN Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing'],
                                      ['кума']]
        target_err_msg = re.escape(
            '`подарок для кума`: morphotags do not correspond to words!')
        with self.assertRaisesRegex(AssertionError, target_err_msg):
            self.__accentor.do_accents(source_phrase_n_morphotags)

    def test_do_accents_negative02(self):
        # An empty word inside the phrase is rejected.
        source_phrase = [['подарок'], [''], ['кума']]
        target_err_msg = re.escape(
            '`(\'подарок\', \'\', \'кума\')`: this phrase is wrong!')
        with self.assertRaisesRegex(AssertionError, target_err_msg):
            self.__accentor.do_accents(source_phrase)

    def test_do_accents_negative03(self):
        # An empty phrase is rejected.
        source_phrase = []
        target_err_msg = re.escape('Source phrase is empty!')
        with self.assertRaisesRegex(AssertionError, target_err_msg):
            self.__accentor.do_accents(source_phrase)

    def test_do_accents_negative04(self):
        # In strict mode the error message names the unknown word.
        source_phrase = [['а-зе']]
        target_err_msg = re.escape('Word `а-зе` is unknown!')
        accentor = Accentor(exception_for_unknown=True, use_wiki=False)
        with self.assertRaisesRegex(ValueError, target_err_msg):
            accentor.do_accents(source_phrase)

    def test_check_source_wordform_positive01(self):
        # Valid source word-forms: Cyrillic letters, single inner hyphens;
        # no leading hyphen, no doubled hyphen, no Latin letters or digits.
        self.assertTrue(self.__accentor.check_source_wordform('абвг'))
        self.assertTrue(self.__accentor.check_source_wordform('аб-вг'))
        self.assertFalse(self.__accentor.check_source_wordform('-'))
        self.assertFalse(self.__accentor.check_source_wordform(''))
        self.assertFalse(self.__accentor.check_source_wordform('-абвг'))
        self.assertFalse(self.__accentor.check_source_wordform('аб--вг'))
        self.assertFalse(self.__accentor.check_source_wordform('abcабвг'))
        self.assertFalse(self.__accentor.check_source_wordform('abc'))
        self.assertFalse(self.__accentor.check_source_wordform('abcабвг123'))

    def test_check_accented_wordform_positive01(self):
        # Same rules as source word-forms, plus optional '+' accent marks
        # after vowels; a bare '+' or '+-' is invalid.
        self.assertTrue(self.__accentor.check_accented_wordform('абвг'))
        self.assertTrue(self.__accentor.check_accented_wordform('аб-вг'))
        self.assertFalse(self.__accentor.check_accented_wordform('-'))
        self.assertFalse(self.__accentor.check_accented_wordform(''))
        self.assertFalse(self.__accentor.check_accented_wordform('-абвг'))
        self.assertFalse(self.__accentor.check_accented_wordform('аб--вг'))
        self.assertFalse(self.__accentor.check_accented_wordform('abcабвг'))
        self.assertFalse(self.__accentor.check_accented_wordform('abc'))
        self.assertFalse(self.__accentor.check_accented_wordform('abcабвг123'))
        self.assertTrue(self.__accentor.check_accented_wordform('а+бвг'))
        self.assertTrue(self.__accentor.check_accented_wordform('а+бвгде+жз'))
        self.assertTrue(self.__accentor.check_accented_wordform('а+б-вг'))
        self.assertFalse(self.__accentor.check_accented_wordform('-'))
        self.assertFalse(self.__accentor.check_accented_wordform('+-'))
        self.assertFalse(self.__accentor.check_accented_wordform('+'))
        self.assertFalse(self.__accentor.check_accented_wordform(''))
        self.assertFalse(self.__accentor.check_accented_wordform('-а+бвг'))
        self.assertFalse(self.__accentor.check_accented_wordform('а+б--вг'))
        self.assertFalse(self.__accentor.check_accented_wordform('a+bcа+бвг'))
        self.assertFalse(self.__accentor.check_accented_wordform('a+bc'))
        self.assertFalse(
            self.__accentor.check_accented_wordform('a+bcа+бвг123'))

    def test_check_morphotag_positive01(self):
        # Morphotags: comma- or pipe-separated groups, an optional '(N)'
        # suffix only at the very end, no digits/punctuation inside groups.
        self.assertTrue(self.__accentor.check_morphotag('a,b c,d,e'))
        self.assertTrue(self.__accentor.check_morphotag('12'))
        self.assertTrue(self.__accentor.check_morphotag('a,b c,d,e(2)'))
        self.assertFalse(self.__accentor.check_morphotag('a,b c,d,e()'))
        self.assertFalse(self.__accentor.check_morphotag('a,b(1) c,d,e(2)'))
        self.assertFalse(self.__accentor.check_morphotag('a,1,b c,d,e'))
        self.assertFalse(self.__accentor.check_morphotag('a,&,b c,d,e'))
        self.assertTrue(self.__accentor.check_morphotag('a|b c|d|e'))
        self.assertTrue(self.__accentor.check_morphotag('a|b c|d|e(2)'))
        self.assertFalse(self.__accentor.check_morphotag('a|b c|d|e()'))
        self.assertFalse(self.__accentor.check_morphotag('a|b(1) c|d|e(2)'))
        self.assertFalse(self.__accentor.check_morphotag('a|1|b c|d|e'))
        self.assertFalse(self.__accentor.check_morphotag('a|&|b c|d|e'))
        self.assertTrue(
            self.__accentor.check_morphotag(
                'VERB Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Tense=Fut|VerbForm=Fin'
            ))

    def test_prepare_morphotag_positive01(self):
        # prepare_morphotag strips a trailing '(N)' variant index.
        self.assertEqual('a,b c,d,e',
                         self.__accentor.prepare_morphotag('a,b c,d,e(2)'))
        self.assertEqual('a,b c,d,e',
                         self.__accentor.prepare_morphotag('a,b c,d,e'))
        self.assertNotEqual('a,b c,d,e',
                            self.__accentor.prepare_morphotag('a,b c,d(2)'))
        self.assertNotEqual('a c,d,e',
                            self.__accentor.prepare_morphotag('a,b c,d,e(2)'))

    def test_calculate_morpho_similarity_positive01(self):
        # Similarity looks like a Jaccard overlap of tag components:
        # identical -> 1.0, disjoint -> 0.0, and {b,d} shared out of 8
        # distinct components -> 0.25.  Confirm against the implementation.
        self.assertAlmostEqual(self.__accentor.calculate_morpho_similarity(
            '1', 'a,b'), 0.0, places=7)
        self.assertAlmostEqual(self.__accentor.calculate_morpho_similarity(
            'a,b c,d,e', 'a,b c,d,e'), 1.0, places=7)
        self.assertAlmostEqual(self.__accentor.calculate_morpho_similarity(
            'a,b c,d,e', 'f,g h,i,j'), 0.0, places=7)
        self.assertAlmostEqual(self.__accentor.calculate_morpho_similarity(
            'a,b c,d,e', 'f,b h,d,j'), 0.25, places=7)
        self.assertAlmostEqual(self.__accentor.calculate_morpho_similarity(
            '1', 'a|b'), 0.0, places=7)
        self.assertAlmostEqual(self.__accentor.calculate_morpho_similarity(
            'a|b c|d|e', 'a|b c|d|e'), 1.0, places=7)
        self.assertAlmostEqual(self.__accentor.calculate_morpho_similarity(
            'a|b c|d|e', 'f|g h|i|j'), 0.0, places=7)
        self.assertAlmostEqual(self.__accentor.calculate_morpho_similarity(
            'a|b c|d|e', 'f|b h|d|j'), 0.25, places=7)
def setUp(self):
    # mode='many' asks the accentor for every admissible accent variant
    # (presumably, versus the default single-best mode — confirm against
    # the Accentor implementation).  Fresh instance per test case.
    self.__accentor = Accentor(mode='many')
def test_do_accents_negative04(self):
    """In strict mode the ValueError message names the unknown word."""
    strict_accentor = Accentor(exception_for_unknown=True, use_wiki=False)
    expected_pattern = re.escape('Word `а-зе` is unknown!')
    with self.assertRaisesRegex(ValueError, expected_pattern):
        strict_accentor.do_accents([['а-зе']])
class PhoneticIndex(object):
    """Nearest-neighbour index over sentences, keyed by accent patterns and
    phoneme embeddings.

    Candidate sentences are first retrieved by their binary accent vector
    through an Annoy hamming index, then re-ranked by the distance between
    flattened phoneme-embedding matrices.
    """

    def __init__(self, path_to_w2v='modelphonemes.model',
                 path_to_annoy='annoy_index.ann', path_to_dict='data.pkl'):
        """Load the phoneme word2vec model, the accent-vector -> sentences
        dictionary and the Annoy index.

        Raises:
            IOError: if any of the three files does not exist.
        """
        self.your_transcriptor = Transcription()
        self.your_accentor = Accentor()
        if os.path.isfile(path_to_w2v):
            self.model = gensim.models.Word2Vec.load(path_to_w2v)
        else:
            raise IOError("File {} does not exist!".format(path_to_w2v))
        if os.path.isfile(path_to_dict):
            with open(path_to_dict, 'rb') as f:
                self.dict_of_acc = pickle.load(f)
        else:
            raise IOError("File {} does not exist!".format(path_to_dict))
        self.accents = list(self.dict_of_acc.keys())
        # Every accent vector shares one length; it fixes the Annoy item size.
        f = len(self.accents[0])
        self.t = AnnoyIndex(f, 'hamming')
        if os.path.isfile(path_to_annoy):
            self.t.load(path_to_annoy)
        else:
            raise IOError("File {} does not exist!".format(path_to_annoy))

    def transform(self, sentence, acc_number=10, sent_number=1):
        """Return the ``sent_number`` sentences phonetically closest to
        ``sentence``, searching among ``acc_number`` accent-pattern
        neighbours."""
        assert acc_number >= sent_number, "number of variants for nearest neighbors should be bigger than number of nearest sentences"
        phonemes = self.get_phonemes(sentence)
        accents = self.get_accents(sentence)
        closest_vectors = self.get_closest_vecs(accents, number=acc_number)
        closest_sentences = self.get_embeddings(closest_vectors, phonemes,
                                                number=sent_number)
        return closest_sentences

    def get_phonemes(self, sentence):
        """Build a zero-padded (100, 100) phoneme-embedding matrix for the
        sentence; an empty transcription yields an all-zero matrix."""
        # Transcribe the sentence, silencing library warnings.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            new_sentence = self.transcriptor(sentence)
        text = []
        for string in new_sentence[0]:
            for phoneme in string:
                text.append(phoneme)
        if len(text) != 0:
            try:
                # Batch lookup of all phoneme embeddings at once.
                phoneme_sent = self.model[text]
            except Exception:
                # Was a bare `except:` (narrowed to Exception).  Some symbol
                # is missing from the embedding vocabulary: fall back to
                # element-wise lookup, replacing each unknown symbol with a
                # 100-dimensional zero vector.
                phoneme_sent = []
                for word in text:
                    try:
                        phoneme_word = self.model[word]
                    except Exception:
                        print("unknown word", word)
                        phoneme_word = np.zeros(100)
                    phoneme_sent.append(phoneme_word)
                phoneme_sent = np.array(phoneme_sent)
            if len(phoneme_sent) < 100:
                # Left-pad with zero rows to the fixed height of 100.
                difference = 100 - len(phoneme_sent)
                part = np.zeros((difference, 100))
                phoneme_sent = np.concatenate((part, phoneme_sent))
            assert len(phoneme_sent
                       ) == 100, "len of vector is inappropriate: {}".format(
                           sentence)
        else:
            phoneme_sent = np.zeros((100, 100))
        return phoneme_sent

    def get_accents(self, sentence):
        """Return the sentence's accent pattern as a binary tuple of length
        29: one element per vowel, 1 for a stressed vowel (followed by '+'),
        0 otherwise, left-padded with zeros."""
        vector = []
        # Strip punctuation before word-level accenting.
        sentence = sentence.translate(
            sentence.maketrans(
                '', '', '!&?\./(){}[]"$%^*+=@№<>|–—_€£±•`≠…§~«»₽,:;')).lower()
        for word in sentence.split():
            # Accent the word; an unknown word is kept without an accent mark.
            try:
                with warnings.catch_warnings():
                    warnings.simplefilter('ignore')
                    accents = self.accentor(word)
            except Exception:
                # Was a bare `except:` — narrowed to Exception.
                accents = [[word]]
            s = accents[0][0]
            vowels = "эоуиаеёюыяЭОУАЕИЁЫЮЯ"
            # A vowel immediately followed by '+' is the stressed one.
            for letter, next_letter in zip(s, s[1:] + " "):
                if letter in vowels:
                    if next_letter == "+":
                        vector.append(1)
                    else:
                        vector.append(0)
        if len(vector) < 29:
            # Left-pad to the standard length of 29.
            difference = 29 - len(vector)
            part = [0 for n in range(difference)]
            vector = part + vector
        assert len(vector) == 29, "len of vector is inappropriate: {}".format(
            sentence)
        return tuple(vector)

    def get_closest_vecs(self, vector, number=10):
        """Return the ``number`` indexed accent vectors nearest to ``vector``
        under the hamming metric."""
        closest = [
            self.t.get_item_vector(x)
            for x in self.t.get_nns_by_vector(vector, number)
        ]
        # Annoy returns floats; the accent dictionary is keyed by int tuples.
        closest_int = [[int(x) for x in vector] for vector in closest]
        return closest_int

    def get_embeddings(self, vectors, source_embedding, number=1):
        """Among sentences matching the given accent vectors, return the
        ``number`` whose phoneme embeddings are nearest to
        ``source_embedding``."""
        possible_sentences = []
        for vector in vectors:
            possible_sentences += self.dict_of_acc[tuple(vector)]
        possible_embs = []
        embs_sentences = {}
        for sentence in possible_sentences:
            emb_sentence = self.get_phonemes(sentence)
            full_emb = np.concatenate(tuple(emb_sentence))
            possible_embs.append(full_emb)
            # Group sentences by their (hashable) flattened embedding;
            # setdefault replaces the duplicated if/else append branches.
            embs_sentences.setdefault(tuple(full_emb), []).append(sentence)
        assert len(
            possible_embs
        ) >= number, "Number of nearest neighbors should be less than number of possible neighbors"
        source_embedding = np.concatenate(tuple(source_embedding))
        final_sentences = []
        # scikit-learn >= 1.1 requires n_neighbors as a keyword argument;
        # the old positional call NearestNeighbors(number) is an error there.
        neigh = NearestNeighbors(n_neighbors=number)
        neigh.fit(possible_embs)
        nearest_neighbors = neigh.kneighbors([source_embedding],
                                             return_distance=False).tolist()
        for element in nearest_neighbors[0]:
            for sentence in embs_sentences[tuple(possible_embs[element])]:
                final_sentences.append(sentence.replace('\xa0', ' '))
        return final_sentences

    @functools.lru_cache(maxsize=None)
    def accentor(self, word):
        # NOTE(review): lru_cache on an instance method keeps `self` alive
        # for the cache's lifetime (ruff B019); acceptable only because the
        # index object is long-lived.
        return self.your_accentor.do_accents([[word]])

    @functools.lru_cache(maxsize=None)
    def transcriptor(self, sentence):
        # Same B019 caveat as accentor() above.
        return self.your_transcriptor.transcribe([sentence])
class Transcription:
    """High-level text-to-phonemes pipeline.

    Combines the preprocessor (tokenization + morphotags), the accentor and
    the grapheme-to-phoneme converter.  A text that fails accenting or G2P
    contributes an empty list to the output instead of aborting the batch.
    """

    def __init__(self, raise_exceptions: bool = False, batch_size: int = 64,
                 verbose: bool = False, use_wiki: bool = False):
        """Create the pipeline components.

        Args:
            raise_exceptions (bool, optional): if True, the accentor raises
                on unknown words and the G2P converter raises on
                non-accented input. Defaults to False.
            batch_size (int, optional): batch size forwarded to the
                preprocessor. Defaults to 64.
            verbose (bool, optional): if True, progress messages are
                printed. Defaults to False.
            use_wiki (bool, optional): forwarded to the accentor.
                Defaults to False.
        """
        self.__preprocessor = Preprocessor(batch_size=batch_size)
        self.__accentor = Accentor(exception_for_unknown=raise_exceptions,
                                   use_wiki=use_wiki)
        self.__g2p = Grapheme2Phoneme(
            exception_for_nonaccented=raise_exceptions)
        self.verbose = verbose

    def __call__(self, texts: list):
        """Alias for :meth:`transcribe`."""
        return self.transcribe(texts)

    def transcribe(self, texts: list) -> list:
        """Transcribe a list of texts to phoneme sequences.

        Args:
            texts (list): source texts.

        Returns:
            list: one entry per text; each entry is a list of phoneme
            sequences (one per phonetic word between ``<sil>`` markers), or
            an empty list if the text could not be processed.
        """
        all_words_and_tags = self.__preprocessor.preprocessing(texts)
        if self.verbose:
            print('All texts have been preprocessed...')
        n_texts = len(texts)
        # Progress is reported in (roughly) 1% chunks.
        n_data_parts = 100
        part_size = n_texts // n_data_parts
        while (part_size * n_data_parts) < n_texts:
            part_size += 1
        data_counter = 0
        part_counter = 0
        total_result = []
        for cur_words_and_tags in all_words_and_tags:
            try:
                accented_text = self.__accentor.do_accents(cur_words_and_tags)
            except Exception:
                # Was a bare `except:`; narrowed so SystemExit and
                # KeyboardInterrupt propagate.  Failure -> empty result.
                accented_text = []
            if len(accented_text) > 0:
                tmp = ' '.join(accented_text[0])
                tmp = ' ' + tmp
                # '<sil>' marks pauses; split the phrase on them.
                phonetic_words = tmp.split(' <sil>')
                try:
                    result = []
                    for phonetic_word in phonetic_words:
                        if len(phonetic_word) != 0:
                            phonemes = self.__g2p.phrase_to_phonemes(
                                phonetic_word)
                            result.append(phonemes)
                except Exception:
                    # Was a bare `except:` — same narrowing as above.
                    result = []
            else:
                result = []
            total_result.append(result)
            data_counter += 1
            if (part_size > 0) and self.verbose:
                if (data_counter % part_size) == 0:
                    part_counter += 1
                    print(f'{part_counter}% of texts have been processed...')
        if (part_counter < n_data_parts) and self.verbose:
            print('100% of texts have been processed...')
        return total_result

    def bad_words(self):
        """Return the sorted, de-duplicated words the accentor failed on."""
        # sorted() accepts any iterable: the intermediate list() was redundant.
        return sorted(set(self.__accentor.get_bad_words()))