def main():
    """Check accentor coverage on prompts from a VoxForge-Ru directory.

    Usage: ``script.py <input_directory_with_voxforge_ru>``

    For the first 100 prompts, reports phrases that cannot be unambiguously
    accented and multi-vowel words that received no accent mark, then prints
    the number of ambiguous phrases.
    """
    if len(sys.argv) > 1:
        init_dir_name = os.path.normpath(sys.argv[1])
        assert os.path.isdir(init_dir_name), \
            'Directory `{0}` does not exist!'.format(init_dir_name)
        all_prompts = sorted(get_all_prompts(init_dir_name))
        accentor = Accentor()
        morpho_predictor = RNNMorphPredictor()
        n_ambiguous = 0
        for cur_prompt in all_prompts[:100]:
            trouble = False
            unknown_words = []
            for cur_subsentence in select_subsentences(cur_prompt):
                morphotags = ['{0} {1}'.format(cur_morpho.pos, cur_morpho.tag)
                              for cur_morpho in morpho_predictor.predict_sentence_tags(cur_subsentence)]
                accent_variants = accentor.do_accents(cur_subsentence, morphotags)
                if len(accent_variants) > 1:
                    trouble = True
                else:
                    accented_phrase = accent_variants[0]
                    for cur_word in accented_phrase:
                        # A word with more than one vowel must carry an explicit
                        # `+` accent mark; otherwise the accentor did not know it.
                        vowels_counter = sum(
                            1 for cur_char in cur_word.lower()
                            if cur_char in VOWEL_LETTERS)
                        if '+' not in cur_word and vowels_counter > 1:
                            unknown_words.append(cur_word)
            if trouble:
                print('`{0}`: this phrase cannot be unambiguously accented!'.format(cur_prompt))
                n_ambiguous += 1
            if unknown_words:
                # sorted() makes the report order deterministic across runs.
                for unknown_word in sorted(set(unknown_words)):
                    print('`{0}`: word `{1}` in this phrase is unknown!'.format(cur_prompt, unknown_word))
        print(n_ambiguous)
    else:
        print("Usage: input_directory_with_voxforge_ru")
Example #2
0
def transcribe_words(source_words_list):
    """Transcribe a list of Russian words into phoneme strings.

    Args:
        source_words_list: list of words (Russian letters and hyphens only);
            each word is stripped and lower-cased before processing.

    Returns:
        A 2-tuple ``(transcriptions, bad_words)``: ``transcriptions`` is a list
        of ``(word, 'PHONEME PHONEME ...')`` pairs, where extra transcription
        variants of an ambiguous word are registered as ``word(2)``,
        ``word(3)``, ...; ``bad_words`` lists words that could not be
        transcribed.

    Raises:
        AssertionError: if a word is empty or contains inadmissible characters.
    """
    n_words = len(source_words_list)
    n_parts = 100
    # Ceiling division: progress is reported in (at most) n_parts steps.
    part_size = -(-n_words // n_parts)
    transcriptions = []
    bad_words = []
    to_ud2 = converters.converter('opencorpora-int', 'ud20')
    morph = pymorphy2.MorphAnalyzer()
    accentor = Accentor(exception_for_unknown=True, use_wiki=False)
    g2p = Grapheme2Phoneme(exception_for_nonaccented=True)
    russian_letters = set(
        'АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя')
    russian_consonants = set('БбВвГгДдЖжЗзЙйКкЛлМмНнПпРрСсТтФфХхЦцЧчШшЩщЪъЬь')
    part_counter = 0
    for word_idx, source_word in enumerate(source_words_list):
        cur_word = source_word.strip().lower()
        err_msg = 'Word {0} is wrong!'.format(word_idx)
        assert len(cur_word) > 0, err_msg + ' It is empty!'
        assert set(cur_word) <= (russian_letters | {'-'}), \
            err_msg + ' "{0}" contains inadmissible characters.'.format(cur_word)
        assert set(cur_word) != {'-'}, err_msg + ' It is empty!'
        if (len(cur_word) > 1) and (set(cur_word) <= russian_consonants):
            # A multi-letter word made of consonants only cannot be pronounced.
            bad_words.append(cur_word)
        else:
            morpho_variants = {to_ud2(str(it.tag))
                               for it in morph.parse(cur_word)}
            try:
                accentuation_variants = []
                for morphotag in morpho_variants:
                    accentuation_variants += accentor.do_accents(
                        [[cur_word, morphotag]])[0]
                variants_of_transcriptions = list(
                    set(
                        filter(
                            lambda it2: len(it2) > 0,
                            map(lambda it: tuple(g2p.word_to_phonemes(it)),
                                accentuation_variants))))
                if len(variants_of_transcriptions) > 0:
                    transcriptions.append(
                        (cur_word, ' '.join(variants_of_transcriptions[0])))
                    # Extra variants are registered as word(2), word(3), ...
                    for variant_idx in range(1, len(variants_of_transcriptions)):
                        transcriptions.append(
                            ('{0}({1})'.format(cur_word, variant_idx + 1),
                             ' '.join(
                                 variants_of_transcriptions[variant_idx])))
                else:
                    bad_words.append(cur_word)
            except Exception:
                # The accentor and g2p raise on unknown / non-accented words;
                # narrowed from a bare `except:` so KeyboardInterrupt escapes.
                bad_words.append(cur_word)
        if part_size > 0 and ((word_idx + 1) % part_size) == 0:
            part_counter += 1
            print('{0:.2%} of words have been processed...'.format(
                part_counter / float(n_parts)))
    if part_counter < n_parts:
        print('100.00% of words have been processed...')
    return transcriptions, bad_words
 def __init__(self,
              raise_exceptions: bool = False,
              batch_size: int = 64,
              verbose: bool = False,
              use_wiki: bool = False):
     """Build the preprocessing, accentuation and g2p pipeline components."""
     self.verbose = verbose
     self.__preprocessor = Preprocessor(batch_size=batch_size)
     self.__accentor = Accentor(
         exception_for_unknown=raise_exceptions, use_wiki=use_wiki)
     self.__g2p = Grapheme2Phoneme(exception_for_nonaccented=raise_exceptions)
Example #4
0
class Transcription:
    """End-to-end Russian text-to-phonemes pipeline.

    Combines a morphological preprocessor, an accentor and a
    grapheme-to-phoneme converter.
    """

    def __init__(self,
                 raise_exceptions: bool = False,
                 batch_size: int = 64,
                 verbose: bool = False,
                 use_wiki: bool = False):
        """Create the pipeline components.

        Args:
            raise_exceptions: make the accentor and the g2p converter raise on
                unknown / non-accented words instead of working best-effort.
            batch_size: batch size of the morphological preprocessor.
            verbose: print progress messages during :meth:`transcribe`.
            use_wiki: let the accentor consult Wiktionary for unknown words.
        """
        self.__preprocessor = Preprocessor(batch_size=batch_size)
        self.__accentor = Accentor(exception_for_unknown=raise_exceptions,
                                   use_wiki=use_wiki)
        self.__g2p = Grapheme2Phoneme(
            exception_for_nonaccented=raise_exceptions)
        self.verbose = verbose

    def transcribe(self, texts: list):
        """Transcribe each text into a list of per-word phoneme lists.

        Args:
            texts: list of source texts (strings).

        Returns:
            A list with one entry per text; a text that cannot be accented or
            converted yields an empty list.
        """
        all_words_and_tags = self.__preprocessor.preprocessing(texts)
        if self.verbose:
            print('All texts have been preprocessed...')
        n_texts = len(texts)
        n_data_parts = 100
        # Ceiling division: progress is reported in (at most) 100 parts.
        part_size = -(-n_texts // n_data_parts)
        data_counter = 0
        part_counter = 0
        total_result = []
        for cur_words_and_tags in all_words_and_tags:
            try:
                accented_text = self.__accentor.do_accents(cur_words_and_tags)
            except Exception:
                # Narrowed from a bare `except:` so that KeyboardInterrupt
                # and SystemExit are not swallowed.
                accented_text = []
            if len(accented_text) > 0:
                # The accentor marks pauses with `<sil>`; split on them.
                tmp = ' ' + ' '.join(accented_text[0])
                phonetic_words = tmp.split(' <sil>')
                try:
                    result = []
                    for phonetic_word in phonetic_words:
                        if len(phonetic_word) != 0:
                            result.append(
                                self.__g2p.phrase_to_phonemes(phonetic_word))
                except Exception:
                    result = []
            else:
                result = []
            total_result.append(result)
            data_counter += 1
            if (part_size > 0) and self.verbose:
                if (data_counter % part_size) == 0:
                    part_counter += 1
                    print('{0}% of texts have been processed...'.format(
                        part_counter))
        if (part_counter < n_data_parts) and self.verbose:
            print('100% of texts have been processed...')
        return total_result
    def __init__(self,
                 raise_exceptions: bool = False,
                 batch_size: int = 64,
                 verbose: bool = False,
                 use_wiki: bool = False):
        """Set up the preprocessor, accentor and grapheme-to-phoneme converter.

        Args:
            raise_exceptions (bool, optional): propagate accentor/g2p errors.
                Defaults to False.
            batch_size (int, optional): preprocessor batch size. Defaults to 64.
            verbose (bool, optional): print progress messages. Defaults to False.
            use_wiki (bool, optional): allow the accentor to consult Wiktionary.
                Defaults to False.
        """
        self.verbose = verbose
        self.__preprocessor = Preprocessor(batch_size=batch_size)
        self.__accentor = Accentor(
            exception_for_unknown=raise_exceptions, use_wiki=use_wiki)
        self.__g2p = Grapheme2Phoneme(exception_for_nonaccented=raise_exceptions)
 def transcribe(self, text: str):
     """Transcribe a single text into per-word phoneme lists.

     Args:
         text: the source text.

     Returns:
         A list with one phoneme list per phonetic word (words are split on
         the `<sil>` pause marker inserted by the accentor).
     """
     words_and_tags = Preprocessor().preprocessing(text)
     accented_text = Accentor().do_accents(words_and_tags)
     tmp = ' ' + ' '.join(accented_text[0])
     phonetic_words = tmp.split(' <sil>')
     # Build the converter once instead of once per word.
     g2p = Grapheme2Phoneme()
     result = []
     for phonetic_word in phonetic_words:
         if len(phonetic_word) != 0:
             result.append(g2p.phrase_to_phonemes(phonetic_word))
     return result
Example #7
0
class TestRussianAccentor2(unittest.TestCase):
    """Tests for the accentor in `many` mode (all accent variants returned)."""

    def setUp(self):
        self.__accentor = Accentor(mode='many')

    def tearDown(self):
        del self.__accentor

    def test_do_accents_positive01(self):
        """An ambiguous word (`кубы`) must yield every accent variant."""
        phrase = [['оружие'], ['для'], ['кубы']]
        expected = [['ору+жие', 'для', 'ку+бы'],
                    ['ору+жие', 'для', 'кубы+']]
        observed = self.__accentor.do_accents(phrase)
        self.assertEqual(expected, observed)
Example #8
0
    def __init__(self,
                 path_to_w2v='modelphonemes.model',
                 path_to_annoy='annoy_index.ann',
                 path_to_dict='data.pkl'):
        """Load the word2vec phoneme model, the accent dictionary and the
        Annoy index; raise IOError for any missing file."""
        self.your_transcriptor = Transcription()
        self.your_accentor = Accentor()
        if not os.path.isfile(path_to_w2v):
            raise IOError("File {} does not exist!".format(path_to_w2v))
        self.model = gensim.models.Word2Vec.load(path_to_w2v)
        if not os.path.isfile(path_to_dict):
            raise IOError("File {} does not exist!".format(path_to_dict))
        with open(path_to_dict, 'rb') as dict_file:
            self.dict_of_acc = pickle.load(dict_file)
        self.accents = list(self.dict_of_acc.keys())

        # Annoy index dimensionality equals the accent-vector length.
        vector_len = len(self.accents[0])
        self.t = AnnoyIndex(vector_len, 'hamming')
        if not os.path.isfile(path_to_annoy):
            raise IOError("File {} does not exist!".format(path_to_annoy))
        self.t.load(path_to_annoy)
Example #9
0
 def test_do_accents_positive11(self):
     """A `ё`-word spelled with `е` (`зеленого`) is still accented correctly."""
     accentor = Accentor(exception_for_unknown=True)
     expected = [['зелё+ного', 'ка+мня']]
     observed = accentor.do_accents([['зеленого'], ['камня']])
     self.assertEqual(expected, observed)
Example #10
0
 def setUp(self):
     # Fresh default-mode accentor for each test case.
     self.__accentor = Accentor()
Example #11
0
 def test_do_accents_positive08(self):
     """With `exception_for_unknown=True`, unknown words raise ValueError."""
     accentor = Accentor(exception_for_unknown=True)
     nonsense_phrase = [['хракозябр'], ['впулил'], ['куздру']]
     with self.assertRaises(ValueError):
         _ = accentor.do_accents(nonsense_phrase)
Example #12
0
class TestRussianAccentor1(unittest.TestCase):
    """Tests for the default-mode Accentor: accent placement (`+` after the
    stressed vowel), morphotag-based disambiguation, and the public
    validation/similarity helpers."""

    def setUp(self):
        self.__accentor = Accentor()

    def tearDown(self):
        del self.__accentor

    def test_do_accents_positive01(self):
        """Unambiguous words get exactly one accent variant."""
        source_phrase = [['мама'], ['мыла'], ['раму']]
        target_variants = [['ма+ма', 'мы+ла', 'ра+му']]
        real_variants = self.__accentor.do_accents(source_phrase)
        self.assertEqual(target_variants, real_variants)

    def test_do_accents_positive02(self):
        """Morphotags disambiguate the homograph `кума` (Nom Fem -> кума+)."""
        source_phrase_n_morphotags = [[
            'привет', 'NOUN Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing'
        ], ['кума', 'NOUN Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing']]
        target_variants = [['приве+т', 'кума+']]
        real_variants = self.__accentor.do_accents(source_phrase_n_morphotags)
        self.assertEqual(target_variants, real_variants)

    def test_do_accents_positive03(self):
        """Morphotags disambiguate the homograph `кума` (Gen Masc -> ку+ма)."""
        source_phrase_n_morphotags = [
            ['подарок', 'NOUN Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing'],
            ['для', 'ADP _'],
            ['кума', 'NOUN Animacy=Anim|Case=Gen|Gender=Masc|Number=Sing']
        ]
        target_variants = [['пода+рок', 'для', 'ку+ма']]
        real_variants = self.__accentor.do_accents(source_phrase_n_morphotags)
        self.assertEqual(target_variants, real_variants)

    def test_do_accents_positive04(self):
        """Without morphotags, an ambiguous word (`кубы`) stays unaccented."""
        source_phrase = [['оружие'], ['для'], ['кубы']]
        target_variants = [['ору+жие', 'для', 'кубы']]
        real_variants = self.__accentor.do_accents(source_phrase)
        self.assertEqual(target_variants, real_variants)

    def test_do_accents_positive05(self):
        """Ordinary dictionary words are accented."""
        source_phrase = [['машинисты'], ['любят'], ['кофе']]
        target_variants = [['машини+сты', 'лю+бят', 'ко+фе']]
        real_variants = self.__accentor.do_accents(source_phrase)
        self.assertEqual(target_variants, real_variants)

    def test_do_accents_positive06(self):
        """Hyphenated function words are handled."""
        source_phrase = [['во-первых'], ['кто-то'], ['вот-вот']]
        target_variants = [['во-пе+рвых', 'кто+-то', 'вот-во+т']]
        real_variants = self.__accentor.do_accents(source_phrase)
        self.assertEqual(target_variants, real_variants)

    def test_do_accents_positive07(self):
        """By default, unknown words pass through without accents."""
        source_phrase = [['хракозябр'], ['впулил'], ['куздру']]
        target_variants = [['хракозябр', 'впулил', 'куздру']]
        real_variants = self.__accentor.do_accents(source_phrase)
        self.assertEqual(target_variants, real_variants)

    def test_do_accents_positive08(self):
        """With `exception_for_unknown=True`, unknown words raise ValueError."""
        accentor = Accentor(exception_for_unknown=True)
        source_phrase = [['хракозябр'], ['впулил'], ['куздру']]
        with self.assertRaises(ValueError):
            _ = accentor.do_accents(source_phrase)

    def test_do_accents_positive09(self):
        """Both parts of a hyphenated compound may carry an accent."""
        source_phrase = [['серебристо-белый'], ['цвет']]
        target_variants = [['серебри+сто-бе+лый', 'цве+т']]
        real_variants = self.__accentor.do_accents(source_phrase)
        self.assertEqual(target_variants, real_variants)

    def test_do_accents_positive10(self):
        """Single-vowel words (`в`) are left without an accent mark."""
        source_phrase = [['озеро'], ['так'], ['серебристо'], ['в'], ['свете'],
                         ['солнца']]
        target_variants = [[
            'о+зеро', 'та+к', 'серебри+сто', 'в', 'све+те', 'со+лнца'
        ]]
        real_variants = self.__accentor.do_accents(source_phrase)
        self.assertEqual(target_variants, real_variants)

    def test_do_accents_positive11(self):
        """A `ё`-word spelled with `е` (`зеленого`) is restored and accented."""
        accentor = Accentor(exception_for_unknown=True)
        source_phrase = [['зеленого'], ['камня']]
        target_variants = [['зелё+ного', 'ка+мня']]
        real_variants = accentor.do_accents(source_phrase)
        self.assertEqual(target_variants, real_variants)

    def test_do_accents_negative01(self):
        """A word/morphotag count mismatch raises AssertionError."""
        source_phrase_n_morphotags = [[
            'подарок', 'NOUN Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing'
        ], ['для', 'NOUN Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing'],
                                      ['кума']]
        target_err_msg = re.escape(
            '`подарок для кума`: morphotags do not correspond to words!')
        with self.assertRaisesRegex(AssertionError, target_err_msg):
            self.__accentor.do_accents(source_phrase_n_morphotags)

    def test_do_accents_negative02(self):
        """An empty word inside the phrase raises AssertionError."""
        source_phrase = [['подарок'], [''], ['кума']]
        target_err_msg = re.escape(
            '`(\'подарок\', \'\', \'кума\')`: this phrase is wrong!')
        with self.assertRaisesRegex(AssertionError, target_err_msg):
            self.__accentor.do_accents(source_phrase)

    def test_do_accents_negative03(self):
        """An empty phrase raises AssertionError."""
        source_phrase = []
        target_err_msg = re.escape('Source phrase is empty!')
        with self.assertRaisesRegex(AssertionError, target_err_msg):
            self.__accentor.do_accents(source_phrase)

    def test_do_accents_negative04(self):
        """With exceptions enabled, an unknown word raises ValueError naming it."""
        source_phrase = [['а-зе']]
        target_err_msg = re.escape('Word `а-зе` is unknown!')
        accentor = Accentor(exception_for_unknown=True, use_wiki=False)
        with self.assertRaisesRegex(ValueError, target_err_msg):
            accentor.do_accents(source_phrase)

    def test_check_source_wordform_positive01(self):
        """Valid source wordforms: Cyrillic letters, single inner hyphens."""
        self.assertTrue(self.__accentor.check_source_wordform('абвг'))
        self.assertTrue(self.__accentor.check_source_wordform('аб-вг'))
        self.assertFalse(self.__accentor.check_source_wordform('-'))
        self.assertFalse(self.__accentor.check_source_wordform(''))
        self.assertFalse(self.__accentor.check_source_wordform('-абвг'))
        self.assertFalse(self.__accentor.check_source_wordform('аб--вг'))
        self.assertFalse(self.__accentor.check_source_wordform('abcабвг'))
        self.assertFalse(self.__accentor.check_source_wordform('abc'))
        self.assertFalse(self.__accentor.check_source_wordform('abcабвг123'))

    def test_check_accented_wordform_positive01(self):
        """Accented wordforms additionally allow `+` after a vowel."""
        self.assertTrue(self.__accentor.check_accented_wordform('абвг'))
        self.assertTrue(self.__accentor.check_accented_wordform('аб-вг'))
        self.assertFalse(self.__accentor.check_accented_wordform('-'))
        self.assertFalse(self.__accentor.check_accented_wordform(''))
        self.assertFalse(self.__accentor.check_accented_wordform('-абвг'))
        self.assertFalse(self.__accentor.check_accented_wordform('аб--вг'))
        self.assertFalse(self.__accentor.check_accented_wordform('abcабвг'))
        self.assertFalse(self.__accentor.check_accented_wordform('abc'))
        self.assertFalse(self.__accentor.check_accented_wordform('abcабвг123'))
        self.assertTrue(self.__accentor.check_accented_wordform('а+бвг'))
        self.assertTrue(self.__accentor.check_accented_wordform('а+бвгде+жз'))
        self.assertTrue(self.__accentor.check_accented_wordform('а+б-вг'))
        self.assertFalse(self.__accentor.check_accented_wordform('-'))
        self.assertFalse(self.__accentor.check_accented_wordform('+-'))
        self.assertFalse(self.__accentor.check_accented_wordform('+'))
        self.assertFalse(self.__accentor.check_accented_wordform(''))
        self.assertFalse(self.__accentor.check_accented_wordform('-а+бвг'))
        self.assertFalse(self.__accentor.check_accented_wordform('а+б--вг'))
        self.assertFalse(self.__accentor.check_accented_wordform('a+bcа+бвг'))
        self.assertFalse(self.__accentor.check_accented_wordform('a+bc'))
        self.assertFalse(
            self.__accentor.check_accented_wordform('a+bcа+бвг123'))

    def test_check_morphotag_positive01(self):
        """Morphotags use `,` or `|` separators; one optional `(N)` suffix."""
        self.assertTrue(self.__accentor.check_morphotag('a,b c,d,e'))
        self.assertTrue(self.__accentor.check_morphotag('12'))
        self.assertTrue(self.__accentor.check_morphotag('a,b c,d,e(2)'))
        self.assertFalse(self.__accentor.check_morphotag('a,b c,d,e()'))
        self.assertFalse(self.__accentor.check_morphotag('a,b(1) c,d,e(2)'))
        self.assertFalse(self.__accentor.check_morphotag('a,1,b c,d,e'))
        self.assertFalse(self.__accentor.check_morphotag('a,&,b c,d,e'))
        self.assertTrue(self.__accentor.check_morphotag('a|b c|d|e'))
        self.assertTrue(self.__accentor.check_morphotag('a|b c|d|e(2)'))
        self.assertFalse(self.__accentor.check_morphotag('a|b c|d|e()'))
        self.assertFalse(self.__accentor.check_morphotag('a|b(1) c|d|e(2)'))
        self.assertFalse(self.__accentor.check_morphotag('a|1|b c|d|e'))
        self.assertFalse(self.__accentor.check_morphotag('a|&|b c|d|e'))
        self.assertTrue(
            self.__accentor.check_morphotag(
                'VERB Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Tense=Fut|VerbForm=Fin'
            ))

    def test_prepare_morphotag_positive01(self):
        """`prepare_morphotag` strips the trailing `(N)` suffix if present."""
        self.assertEqual('a,b c,d,e',
                         self.__accentor.prepare_morphotag('a,b c,d,e(2)'))
        self.assertEqual('a,b c,d,e',
                         self.__accentor.prepare_morphotag('a,b c,d,e'))
        self.assertNotEqual('a,b c,d,e',
                            self.__accentor.prepare_morphotag('a,b c,d(2)'))
        self.assertNotEqual('a c,d,e',
                            self.__accentor.prepare_morphotag('a,b c,d,e(2)'))

    def test_calculate_morpho_similarity_positive01(self):
        """Similarity is 0.0 for disjoint tag sets, 1.0 for identical ones,
        and fractional (e.g. 0.25) for partial overlap."""
        self.assertAlmostEqual(self.__accentor.calculate_morpho_similarity(
            '1', 'a,b'),
                               0.0,
                               places=7)
        self.assertAlmostEqual(self.__accentor.calculate_morpho_similarity(
            'a,b c,d,e', 'a,b c,d,e'),
                               1.0,
                               places=7)
        self.assertAlmostEqual(self.__accentor.calculate_morpho_similarity(
            'a,b c,d,e', 'f,g h,i,j'),
                               0.0,
                               places=7)
        self.assertAlmostEqual(self.__accentor.calculate_morpho_similarity(
            'a,b c,d,e', 'f,b h,d,j'),
                               0.25,
                               places=7)
        self.assertAlmostEqual(self.__accentor.calculate_morpho_similarity(
            '1', 'a|b'),
                               0.0,
                               places=7)
        self.assertAlmostEqual(self.__accentor.calculate_morpho_similarity(
            'a|b c|d|e', 'a|b c|d|e'),
                               1.0,
                               places=7)
        self.assertAlmostEqual(self.__accentor.calculate_morpho_similarity(
            'a|b c|d|e', 'f|g h|i|j'),
                               0.0,
                               places=7)
        self.assertAlmostEqual(self.__accentor.calculate_morpho_similarity(
            'a|b c|d|e', 'f|b h|d|j'),
                               0.25,
                               places=7)
Example #13
0
 def setUp(self):
     # Accentor in `many` mode returns every accent variant of a phrase.
     self.__accentor = Accentor(mode='many')
Example #14
0
 def test_do_accents_negative04(self):
     """An unknown hyphenated word must raise ValueError naming the word."""
     accentor = Accentor(exception_for_unknown=True, use_wiki=False)
     expected_message = re.escape('Word `а-зе` is unknown!')
     with self.assertRaisesRegex(ValueError, expected_message):
         accentor.do_accents([['а-зе']])
Example #15
0
class PhoneticIndex(object):
    """Phonetic similarity index over a corpus of sentences.

    Given a query sentence, finds stored sentences whose accent pattern
    (via an Annoy index over binary accent vectors) and phoneme embeddings
    (via a gensim word2vec model) are closest to the query.
    """

    def __init__(self,
                 path_to_w2v='modelphonemes.model',
                 path_to_annoy='annoy_index.ann',
                 path_to_dict='data.pkl'):
        """Load the word2vec phoneme model, the pickled accent dictionary and
        the Annoy index.

        Raises:
            IOError: if any of the three files does not exist.
        """
        self.your_transcriptor = Transcription()
        self.your_accentor = Accentor()
        if os.path.isfile(path_to_w2v):
            self.model = gensim.models.Word2Vec.load(path_to_w2v)
        else:
            raise IOError("File {} does not exist!".format(path_to_w2v))
        if os.path.isfile(path_to_dict):
            with open(path_to_dict, 'rb') as f:
                self.dict_of_acc = pickle.load(f)
        else:
            raise IOError("File {} does not exist!".format(path_to_dict))
        # Keys of dict_of_acc are accent vectors (tuples of 0/1, see
        # get_accents); values are lists of sentences with that pattern.
        self.accents = list(self.dict_of_acc.keys())

        # Annoy index dimensionality equals the accent-vector length.
        f = len(self.accents[0])
        self.t = AnnoyIndex(f, 'hamming')
        if os.path.isfile(path_to_annoy):
            self.t.load(path_to_annoy)
        else:
            raise IOError("File {} does not exist!".format(path_to_annoy))

    def transform(self, sentence, acc_number=10, sent_number=1):
        """Return up to `sent_number` stored sentences closest to `sentence`,
        searching among the `acc_number` nearest accent patterns."""
        assert acc_number >= sent_number, "number of variants for nearest neighbors should be bigger than number of nearest sentences"

        phonemes = self.get_phonemes(sentence)

        accents = self.get_accents(sentence)

        closest_vectors = self.get_closest_vecs(accents, number=acc_number)

        closest_sentences = self.get_embeddings(closest_vectors,
                                                phonemes,
                                                number=sent_number)

        return closest_sentences

    def get_phonemes(self, sentence):
        # Produce the phoneme-embedding matrix of the sentence,
        # padded/defaulted to shape (100, 100).
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            new_sentence = self.transcriptor(sentence)

        text = []

        # Flatten the per-word phoneme lists of the first transcription.
        for string in new_sentence[0]:
            for phoneme in string:
                text.append(phoneme)

        if len(text) != 0:
            try:
                # Build embeddings for all phonemes in one batch.
                # NOTE(review): `self.model[text]` is the old gensim API;
                # newer gensim requires `self.model.wv[text]` — verify version.
                phoneme_sent = self.model[text]

            except:
                # If a symbol is missing from the embedding vocabulary, build
                # element-wise, replacing unknown symbols with a zero vector
                # of length 100.
                phoneme_sent = []
                for word in text:
                    try:
                        phoneme_word = self.model[word]
                    except:
                        print("unknown word", word)
                        phoneme_word = np.zeros(100)
                    phoneme_sent.append(phoneme_word)
                phoneme_sent = np.array(phoneme_sent)

            if len(phoneme_sent) < 100:
                # Pad on the left with zero rows to the common length of 100.
                difference = 100 - len(phoneme_sent)
                part = np.zeros((difference, 100))
                phoneme_sent = np.concatenate((part, phoneme_sent))

            # NOTE(review): sentences with more than 100 phonemes would fail
            # this assert; no truncation is performed here.
            assert len(phoneme_sent
                       ) == 100, "len of vector is inappropriate: {}".format(
                           sentence)
        else:
            phoneme_sent = np.zeros((100, 100))

        return phoneme_sent

    def get_accents(self, sentence):
        # Produce a 0/1 vector of vowel stresses for the sentence
        # (stressed vowel -> 1, unstressed -> 0), left-padded to length 29.
        vector = []
        sentence = sentence.translate(
            sentence.maketrans(
                '', '', '!&?\./(){}[]"$%^*+=@№<>|–—_€£±•`≠…§~«»₽,:;')).lower()
        for word in sentence.split():
            # Accent the word; if it is unknown, it comes back unaccented.
            try:
                with warnings.catch_warnings():
                    warnings.simplefilter('ignore')
                    accents = self.accentor(word)

            except:
                #print("unknown accent word: ", word)
                accents = [[word]]

            s = accents[0][0]
            vowels = "эоуиаеёюыяЭОУАЕИЁЫЮЯ"
            for letter, next_letter in zip(s, s[1:] + " "):
                # A vowel is stressed iff it is followed by the `+` marker.
                if letter in vowels:
                    if next_letter == "+":
                        vector.append(1)
                    else:
                        vector.append(0)

        if len(vector) < 29:
            # Left-pad vectors to the standard length of 29.
            difference = 29 - len(vector)
            part = [0 for n in range(difference)]
            vector = part + vector

        # NOTE(review): sentences with more than 29 vowels would fail here.
        assert len(vector) == 29, "len of vector is inappropriate: {}".format(
            sentence)
        return tuple(vector)

    def get_closest_vecs(self, vector, number=10):
        # Return the `number` nearest accent vectors from the Annoy index,
        # as lists of ints.
        closest = [
            self.t.get_item_vector(x)
            for x in self.t.get_nns_by_vector(vector, number)
        ]

        closest_int = [[int(x) for x in vector] for vector in closest]

        return closest_int

    def get_embeddings(self, vectors, source_embedding, number=1):
        # Return the `number` stored sentences whose flattened phoneme
        # embeddings are nearest to `source_embedding`.

        possible_sentences = []
        for vector in vectors:
            possible_sentences += self.dict_of_acc[tuple(vector)]
        possible_embs = []
        embs_sentences = {}
        for sentence in possible_sentences:
            emb_sentence = self.get_phonemes(sentence)
            # Flatten the (100, 100) matrix into a single vector.
            full_emb = np.concatenate(tuple(emb_sentence))
            possible_embs.append(full_emb)
            full_emb = tuple(full_emb)
            if full_emb not in embs_sentences:
                embs_sentences[full_emb] = list()
                embs_sentences[full_emb].append(sentence)
            else:
                embs_sentences[full_emb].append(sentence)

        assert len(
            possible_embs
        ) >= number, "Number of nearest neighbors should be less than number of possible neighbors"
        source_embedding = np.concatenate(tuple(source_embedding))
        final_sentences = []
        # NOTE(review): positional argument; recent scikit-learn requires
        # NearestNeighbors(n_neighbors=number) — verify installed version.
        neigh = NearestNeighbors(number)
        neigh.fit(possible_embs)
        nearest_neighbors = neigh.kneighbors([source_embedding],
                                             return_distance=False).tolist()
        for element in nearest_neighbors[0]:
            for sentence in embs_sentences[tuple(possible_embs[element])]:
                final_sentences.append(sentence.replace('\xa0', ' '))
        return final_sentences

    # NOTE(review): lru_cache on an instance method keys on `self` and keeps
    # the instance alive for the cache's lifetime (ruff B019).
    @functools.lru_cache(maxsize=None)
    def accentor(self, word):
        # Cached accentuation of a single word.
        return self.your_accentor.do_accents([[word]])

    @functools.lru_cache(maxsize=None)
    def transcriptor(self, sentence):
        # Cached transcription of a single sentence.
        return self.your_transcriptor.transcribe([sentence])
class Transcription:
    """End-to-end Russian text-to-phonemes pipeline.

    Combines a morphological preprocessor, an accentor and a
    grapheme-to-phoneme converter.
    """
    def __init__(self,
                 raise_exceptions: bool = False,
                 batch_size: int = 64,
                 verbose: bool = False,
                 use_wiki: bool = False):
        """Create the pipeline components.

        Args:
            raise_exceptions (bool, optional): make the accentor and the g2p
                converter raise on unknown / non-accented words instead of
                working best-effort. Defaults to False.
            batch_size (int, optional): preprocessor batch size. Defaults to 64.
            verbose (bool, optional): print progress messages. Defaults to False.
            use_wiki (bool, optional): let the accentor consult Wiktionary for
                unknown words. Defaults to False.
        """

        self.__preprocessor = Preprocessor(batch_size=batch_size)
        self.__accentor = Accentor(exception_for_unknown=raise_exceptions,
                                   use_wiki=use_wiki)
        self.__g2p = Grapheme2Phoneme(
            exception_for_nonaccented=raise_exceptions)
        self.verbose = verbose

    def __call__(self, texts: list):
        """Shorthand for :meth:`transcribe`.

        Args:
            texts (list): source texts.

        Returns:
            list: see :meth:`transcribe`.
        """

        return self.transcribe(texts)

    def transcribe(self, texts: list) -> list:
        """Transcribe each text into a list of per-word phoneme lists.

        Args:
            texts (list): source texts.

        Returns:
            list: one entry per text; a text that cannot be accented or
            converted yields an empty list.
        """

        all_words_and_tags = self.__preprocessor.preprocessing(texts)
        if self.verbose:
            print('All texts have been preprocessed...')
        n_texts = len(texts)
        n_data_parts = 100
        # Ceiling division: progress is reported in (at most) 100 parts.
        part_size = -(-n_texts // n_data_parts)
        data_counter = 0
        part_counter = 0
        total_result = []
        for cur_words_and_tags in all_words_and_tags:
            try:
                accented_text = self.__accentor.do_accents(cur_words_and_tags)
            except Exception:
                # Narrowed from a bare `except:` so that KeyboardInterrupt
                # and SystemExit are not swallowed.
                accented_text = []
            if len(accented_text) > 0:
                # The accentor marks pauses with `<sil>`; split on them.
                tmp = ' ' + ' '.join(accented_text[0])
                phonetic_words = tmp.split(' <sil>')
                try:
                    result = []
                    for phonetic_word in phonetic_words:
                        if len(phonetic_word) != 0:
                            result.append(
                                self.__g2p.phrase_to_phonemes(phonetic_word))
                except Exception:
                    result = []
            else:
                result = []
            total_result.append(result)
            data_counter += 1
            if (part_size > 0) and self.verbose:
                if (data_counter % part_size) == 0:
                    part_counter += 1
                    print(f'{part_counter}% of texts have been processed...')
        if (part_counter < n_data_parts) and self.verbose:
            print('100% of texts have been processed...')
        return total_result

    def bad_words(self):
        """Return the sorted, de-duplicated words the accentor failed on."""
        return sorted(set(self.__accentor.get_bad_words()))