Example #1
import unittest

from russian_g2p.Preprocessor import Preprocessor  # import path assumed


class TestPrep(unittest.TestCase):
    def setUp(self):
        self.__prep = Preprocessor()

    def tearDown(self):
        del self.__prep

    def test_tags(self):
        source_phrase = ' - Нет, - сказал он (звали его Андреем Николаевичем).'
        target_variants = [
            ['<sil>', 'SIL _'], ['нет', 'PART _'], ['<sil>', 'SIL _'],
            [
                'сказал',
                'VERB Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act'
            ], ['он', 'PRON Case=Nom|Gender=Masc|Number=Sing|Person=3'],
            ['<sil>', 'SIL _'],
            [
                'звали',
                'VERB Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin|Voice=Act'
            ], ['его', 'PRON Case=Acc|Gender=Masc|Number=Sing|Person=3'],
            ['андреем', 'NOUN Case=Ins|Gender=Masc|Number=Sing'],
            ['николаевичем', 'NOUN Case=Ins|Gender=Masc|Number=Sing'],
            ['<sil>', 'SIL _']
        ]
        real_variants = self.__prep.preprocessing([source_phrase])[0]
        self.assertEqual(target_variants, real_variants)

    def test_hyphen(self):
        source_phrase = 'Кто-нибудь выучил фразео-, нео- и прочие измы? Я - нет.'
        target_variants = [
            ['<sil>', 'SIL _'],
            ['кто-нибудь', 'PRON Case=Nom|Gender=Masc|Number=Sing'],
            [
                'выучил',
                'VERB Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act'
            ], ['фразео-', 'ADV Degree=Cmp'], ['<sil>', 'SIL _'],
            ['нео-', 'ADV Degree=Pos'], ['и', 'CONJ _'],
            ['прочие', 'DET Case=Acc|Number=Plur'],
            ['измы', 'NOUN Case=Acc|Gender=Masc|Number=Plur'],
            ['<sil>', 'SIL _'], ['я', 'PRON Case=Nom|Number=Sing|Person=1'],
            ['<sil>', 'SIL _'],
            [
                'нет',
                'VERB Mood=Ind|Number=Sing|Person=3|Tense=Notpast|VerbForm=Fin'
            ], ['<sil>', 'SIL _']
        ]

        real_variants = self.__prep.preprocessing([source_phrase])[0]
        self.assertEqual(target_variants, real_variants)

    def test_nothing(self):
        source_phrase = '...'
        target_variants = [['<sil>', 'SIL _']]
        real_variants = self.__prep.preprocessing([source_phrase])[0]
        self.assertEqual(target_variants, real_variants)
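
A minimal standalone sketch of the call these tests exercise (the import path is an assumption): preprocessing() takes a list of texts and returns, for each one, a list of [token, tag] pairs, with '<sil>' entries marking pauses.

from russian_g2p.Preprocessor import Preprocessor  # import path assumed

prep = Preprocessor()
# one token/tag list is returned per input text
words_and_tags = prep.preprocessing(['Я - нет.'])[0]
for token, tag in words_and_tags:
    print(token, tag)  # e.g. 'я PRON Case=Nom|Number=Sing|Person=1'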

Example #2

    def __init__(self,
                 raise_exceptions: bool = False,
                 batch_size: int = 64,
                 verbose: bool = False,
                 use_wiki: bool = False):
        self.__preprocessor = Preprocessor(batch_size=batch_size)
        self.__accentor = Accentor(exception_for_unknown=raise_exceptions,
                                   use_wiki=use_wiki)
        self.__g2p = Grapheme2Phoneme(
            exception_for_nonaccented=raise_exceptions)
        self.verbose = verbose
Example #3
from russian_g2p.Accentor import Accentor  # import paths assumed
from russian_g2p.Grapheme2Phoneme import Grapheme2Phoneme
from russian_g2p.Preprocessor import Preprocessor


class Transcription:
    def __init__(self,
                 raise_exceptions: bool = False,
                 batch_size: int = 64,
                 verbose: bool = False,
                 use_wiki: bool = False):
        self.__preprocessor = Preprocessor(batch_size=batch_size)
        self.__accentor = Accentor(exception_for_unknown=raise_exceptions,
                                   use_wiki=use_wiki)
        self.__g2p = Grapheme2Phoneme(
            exception_for_nonaccented=raise_exceptions)
        self.verbose = verbose

    def transcribe(self, texts: list):
        all_words_and_tags = self.__preprocessor.preprocessing(texts)
        if self.verbose:
            print('All texts have been preprocessed...')
        n_texts = len(texts)
        n_data_parts = 100
        part_size = n_texts // n_data_parts
        while (part_size * n_data_parts) < n_texts:
            part_size += 1
        data_counter = 0
        part_counter = 0
        total_result = []
        for cur_words_and_tags in all_words_and_tags:
            try:
                accented_text = self.__accentor.do_accents(cur_words_and_tags)
            except Exception:
                # accentuation failed for this text, so it gets an empty result
                accented_text = []
            if len(accented_text) > 0:
                tmp = ' '.join(accented_text[0])
                tmp = ' ' + tmp
                phonetic_words = tmp.split(' <sil>')
                try:
                    result = []
                    for phonetic_word in phonetic_words:
                        if len(phonetic_word) != 0:
                            phonemes = self.__g2p.phrase_to_phonemes(
                                phonetic_word)
                            result.append(phonemes)
                except Exception:
                    # grapheme-to-phoneme conversion failed; skip this text
                    result = []
            else:
                result = []
            total_result.append(result)
            data_counter += 1
            if (part_size > 0) and self.verbose:
                if (data_counter % part_size) == 0:
                    part_counter += 1
                    print('{0}% of texts have been processed...'.format(
                        part_counter))
        if (part_counter < n_data_parts) and self.verbose:
            print('100% of texts have been processed...')
        return total_result
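
A minimal usage sketch for the class above (the import path is an assumption): transcribe() takes a list of texts and returns, for each text, a list with one phoneme sequence per '<sil>'-delimited chunk, or an empty list if the text could not be processed.

from russian_g2p.Transcription import Transcription  # import path assumed

transcriptor = Transcription(raise_exceptions=False, batch_size=64, verbose=True)
results = transcriptor.transcribe([' - Нет, - сказал он.', 'Я - нет.'])
for text_result in results:
    print(text_result)  # phoneme sequences for one input text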

Example #4

    def __init__(self,
                 raise_exceptions: bool = False,
                 batch_size: int = 64,
                 verbose: bool = False,
                 use_wiki: bool = False):
        """Create the preprocessing, accentuation and grapheme-to-phoneme components.

        Args:
            raise_exceptions (bool, optional): Raise exceptions for unknown or
                non-accented words instead of skipping them silently. Defaults to False.
            batch_size (int, optional): Batch size used by the preprocessor. Defaults to 64.
            verbose (bool, optional): Print progress messages during transcription. Defaults to False.
            use_wiki (bool, optional): Let the accentor use its wiki-based lookup for unknown words. Defaults to False.
        """

        self.__preprocessor = Preprocessor(batch_size=batch_size)
        self.__accentor = Accentor(exception_for_unknown=raise_exceptions,
                                   use_wiki=use_wiki)
        self.__g2p = Grapheme2Phoneme(
            exception_for_nonaccented=raise_exceptions)
        self.verbose = verbose

Example #5

    def transcribe(self, text: str):
        # preprocessing() expects a list of texts (see the other examples),
        # so wrap the single text and take the first result
        words_and_tags = Preprocessor().preprocessing([text])[0]
        accented_text = Accentor().do_accents(words_and_tags)
        # join the accented tokens and split the phrase on '<sil>' pause markers
        tmp = ' '.join(accented_text[0])
        tmp = ' ' + tmp
        phonetic_words = tmp.split(' <sil>')
        result = []
        g2p = Grapheme2Phoneme()  # create the converter once instead of per word
        for phonetic_word in phonetic_words:
            if len(phonetic_word) != 0:
                phonemes = g2p.phrase_to_phonemes(phonetic_word)
                result.append(phonemes)
        return result
Example #6
    def setUp(self):
        self.__prep = Preprocessor()

Example #7

class Transcription:
    """Text-to-phonemes pipeline: preprocessing, accentuation and
    grapheme-to-phoneme conversion for Russian texts.
    """
    def __init__(self,
                 raise_exceptions: bool = False,
                 batch_size: int = 64,
                 verbose: bool = False,
                 use_wiki: bool = False):
        """Create the preprocessing, accentuation and grapheme-to-phoneme components.

        Args:
            raise_exceptions (bool, optional): Raise exceptions for unknown or
                non-accented words instead of skipping them silently. Defaults to False.
            batch_size (int, optional): Batch size used by the preprocessor. Defaults to 64.
            verbose (bool, optional): Print progress messages during transcription. Defaults to False.
            use_wiki (bool, optional): Let the accentor use its wiki-based lookup for unknown words. Defaults to False.
        """

        self.__preprocessor = Preprocessor(batch_size=batch_size)
        self.__accentor = Accentor(exception_for_unknown=raise_exceptions,
                                   use_wiki=use_wiki)
        self.__g2p = Grapheme2Phoneme(
            exception_for_nonaccented=raise_exceptions)
        self.verbose = verbose

    def __call__(self, texts: list):
        """Make the object callable as a shortcut for transcribe().

        Args:
            texts (list): Texts to transcribe.

        Returns:
            list: One list of phoneme sequences per input text.
        """

        return self.transcribe(texts)

    def transcribe(self, texts: list) -> list:
        """Transcribe a list of texts into phoneme sequences.

        Args:
            texts (list): Texts to transcribe.

        Returns:
            list: For each input text, a list with one phoneme sequence per
                pause-delimited chunk, or an empty list if the text could not
                be processed.
        """

        all_words_and_tags = self.__preprocessor.preprocessing(texts)
        if self.verbose:
            print('All texts have been preprocessed...')
        n_texts = len(texts)
        n_data_parts = 100
        part_size = n_texts // n_data_parts
        while (part_size * n_data_parts) < n_texts:
            part_size += 1
        data_counter = 0
        part_counter = 0
        total_result = []
        for cur_words_and_tags in all_words_and_tags:
            try:
                accented_text = self.__accentor.do_accents(cur_words_and_tags)
            except Exception:
                # accentuation failed for this text, so it gets an empty result
                accented_text = []
            if len(accented_text) > 0:
                # join the accented tokens and split the phrase on '<sil>' pause markers
                tmp = ' '.join(accented_text[0])
                tmp = ' ' + tmp
                phonetic_words = tmp.split(' <sil>')
                try:
                    result = []
                    for phonetic_word in phonetic_words:
                        if len(phonetic_word) != 0:
                            phonemes = self.__g2p.phrase_to_phonemes(
                                phonetic_word)
                            result.append(phonemes)
                except Exception:
                    # grapheme-to-phoneme conversion failed; skip this text
                    result = []
            else:
                result = []
            total_result.append(result)
            data_counter += 1
            if (part_size > 0) and self.verbose:
                if (data_counter % part_size) == 0:
                    part_counter += 1
                    print(f'{part_counter}% of texts have been processed...')
        if (part_counter < n_data_parts) and self.verbose:
            print('100% of texts have been processed...')
        return total_result

    def bad_words(self):
        """Return the sorted list of unique words the accentor reported as problematic."""
        return sorted(set(self.__accentor.get_bad_words()))
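
A minimal usage sketch for this variant (import path assumed): the instance is callable through __call__, and bad_words() lists the words the accentor reported as problematic.

from russian_g2p.Transcription import Transcription  # import path assumed

transcriptor = Transcription(use_wiki=False)
phonemes = transcriptor(['Кто-нибудь выучил фразео-, нео- и прочие измы?'])
print(phonemes[0])               # phoneme sequences for the chunks of the first text
print(transcriptor.bad_words())  # words the accentor could not handle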