コード例 #1
0
ファイル: Tokenisers.py プロジェクト: arnacadia/Oscar
 def safetext_token(self, instring):
     """Return the safetext form of a single token.

     The terminal marker ``c.TERMINAL`` is passed through unchanged. Any
     other token is converted with ``naive_util.safetext``; it is
     lowercased first when ``self.lowercase_safetext`` equals the string
     ``'True'``.
     """
     # The terminal token is special-cased and never converted.
     if instring == c.TERMINAL:
         return c.TERMINAL
     # NOTE: the flag is compared against the string 'True', not a boolean.
     text = instring.lower() if self.lowercase_safetext == 'True' else instring
     return naive_util.safetext(text)
コード例 #2
0
 def get_custom_segments(self, word):
     """Split ``word`` into safetext-encoded two-letter segments.

     Meant for source text that is written in phones rather than letters.
     The lowercased word is cut into consecutive, non-overlapping pairs;
     each letter of a pair is converted with ``naive_util.safetext`` and the
     two results are concatenated.

     A trailing unpaired letter (odd-length word) is dropped, because
     ``zip`` stops at the shorter of the two sequences.

     NOTE(review): relies on the ``unicode`` builtin, i.e. Python 2.
     """
     lowered = word.lower()
     first_halves = lowered[0::2]   # letters at even positions
     second_halves = lowered[1::2]  # letters at odd positions
     return [
         unicode(naive_util.safetext(left).encode("utf-8") +
                 naive_util.safetext(right).encode("utf-8"))
         for left, right in zip(first_halves, second_halves)
     ]
コード例 #3
0
    def safetext_token(self, instring):
        """
        Represent a unicode string as a unique English string, to make
        later processing easier.

        :param instring: unicode string
        :return: English string (``c.TERMINAL`` is returned unchanged)
        """
        # The terminal token is handled specially: it is never converted.
        if instring == c.TERMINAL:
            return c.TERMINAL

        # Lowercase only when the setting holds the string 'True'.
        source = instring
        if self.lowercase_safetext == 'True':
            source = instring.lower()
        return naive_util.safetext(source)
コード例 #4
0
ファイル: Phonetisers.py プロジェクト: DungLuongTuan/Ossian
 def get_phonetic_segments(self, word):
     """Segment ``word`` into underscore-wrapped phonetic units.

     The lowercased word is scanned left to right, greedily growing the
     pending chunk while ``chunk + next_char`` is still listed in
     ``self.vi_consonants``. When the chunk can no longer grow it is
     flushed:

     * a chunk found in ``self.vi_consonants`` is replaced by the
       safetext of each item in the matching ``self.vi_cons_phone`` entry
       (the two sequences are indexed in parallel);
     * otherwise a chunk found in ``self.name_reps`` uses that replacement;
     * anything else goes through ``self.get_safetext``.

     A lone ``'g'`` followed by an "i" (plain or with one of the listed
     diacritics) is flushed as ``'gi'``; the following character is still
     processed on its own afterwards.

     If the last flushed chunk is a consonant entry, its segment is
     prefixed with ``"END"``. Every segment has spaces removed and is
     wrapped in underscores.

     :param word: the word to segment
     :return: list of segment strings (empty for an empty word)
     """
     i_variants = [
         'i', '_LATINSMALLLETTERIWITHGRAVE_',
         '_LATINSMALLLETTERIWITHACUTE_',
         '_LATINSMALLLETTERIWITHHOOKABOVE_',
         '_LATINSMALLLETTERIWITHTILDE_',
         '_LATINSMALLLETTERIWITHDOTBELOW_'
     ]
     safe_segments = []
     raw_segments = []
     # '@' is a sentinel on both ends: the leading one becomes a dummy first
     # segment (sliced off at the end), the trailing one forces the final
     # real chunk to be flushed.
     pending = '@'
     for char in word.lower() + "@":
         if pending + char in self.vi_consonants:
             pending += char
             continue

         # The chunk cannot grow any further: flush it.
         if pending == 'g' and naive_util.safetext(char) in i_variants:
             pending += 'i'
         if pending in self.vi_consonants:
             phones = self.vi_cons_phone[self.vi_consonants.index(pending)]
             safe = ''.join(self.get_safetext(phone) for phone in phones)
         elif pending in self.name_reps:
             safe = self.name_reps[pending]
         else:
             safe = self.get_safetext(pending)
         safe_segments.append(safe)
         raw_segments.append(pending)
         pending = char

     # Mark a word-final consonant.
     if raw_segments[-1] in self.vi_consonants:
         safe_segments[-1] = "END" + safe_segments[-1]

     wrapped = ["_" + seg.replace(" ", "") + "_" for seg in safe_segments]
     # Drop the dummy segment produced by the leading sentinel.
     return wrapped[1:]
コード例 #5
0
    def process_utterance(self, utt):
        """Attach a pronunciation to every target node of ``utt``.

        For each node selected by the ``self.target_nodes`` XPath:

        1. read the word from ``self.input_attribute``;
        2. latinise it with ``latinise_indian_script_string`` and convert
           each lowercased letter with ``safetext``;
        3. drop letters that are not in ``self.phone_inventory`` and join
           the rest with ``self.phone_delimiter`` to form the lookup key;
        4. take the pronunciation from ``self.entries`` if the key is
           present, else from ``self.get_oov_pronunciation``, else fall
           back to ``self.backoff_pronunciation``.

        The pronunciation is written to ``self.output_attribute`` and its
        origin to the ``phones_from`` attribute (``'lex'``, ``'lts'`` or
        ``'default'``).

        :param utt: utterance object exposing ``xpath``
        """
        for node in utt.xpath(self.target_nodes):
            assert node.has_attribute(self.input_attribute)
            raw_word = node.get(self.input_attribute)

            ## for now, do indic->latin conversion within lexicon:--
            latin_word = latinise_indian_script_string(raw_word)
            letters = [safetext(lett.lower()) for lett in latin_word]

            ## handle OOV phones:-- letters outside the inventory are dropped
            known_letters = [
                letter for letter in letters if letter in self.phone_inventory
            ]
            word = self.phone_delimiter.join(known_letters)

            if word in self.entries:
                pronunciation = self.entries[word]
                phones_from = 'lex'
            else:
                pronunciation = self.get_oov_pronunciation(word)
                phones_from = 'lts'
                # Identity test: `== None` would invoke a custom __eq__.
                if pronunciation is None:
                    pronunciation = self.backoff_pronunciation
                    phones_from = 'default'

            node.set('phones_from', phones_from)
            node.set(self.output_attribute, pronunciation)
コード例 #6
0
ファイル: Phonetisers.py プロジェクト: DungLuongTuan/Ossian
    def get_phonetic_segments(self, word):
        """Segment ``word`` into letters / consonant clusters and map the
        clusters to their phone representation.

        Each letter of the lowercased word is converted with
        ``naive_util.safetext``. Adjacent letters are merged while the
        merged string is listed in ``self.vi_consonants``. A segment that
        is exactly ``'g'`` and is followed by an "i" (plain or with one of
        the listed diacritics) becomes ``'gi'``; the "i" itself is still
        emitted as the next segment.

        Finally every segment found in ``self.vi_consonants`` is replaced
        by the entry at the same index of ``self.vi_cons_phone``.

        :param word: the word to segment
        :return: list of segments; ``[]`` for an empty word
        """
        i_variants = [
            'i', '_LATINSMALLLETTERIWITHGRAVE_',
            '_LATINSMALLLETTERIWITHACUTE_',
            '_LATINSMALLLETTERIWITHHOOKABOVE_',
            '_LATINSMALLLETTERIWITHTILDE_',
            '_LATINSMALLLETTERIWITHDOTBELOW_'
        ]

        letters = [naive_util.safetext(l) for l in word.lower()]
        # An empty word has no first letter to seed the merge loop with.
        if not letters:
            return []

        segments = [letters[0]]
        for letter in letters[1:]:
            # Greedily extend the current consonant cluster.
            if segments[-1] + letter in self.vi_consonants:
                segments[-1] += letter
                continue
            if segments[-1] == 'g' and letter in i_variants:
                segments[-1] += 'i'
            segments.append(letter)

        # vi_consonants and vi_cons_phone are indexed in parallel.
        return [
            self.vi_cons_phone[self.vi_consonants.index(segment)]
            if segment in self.vi_consonants else segment
            for segment in segments
        ]
コード例 #7
0
 def get_phonetic_segments(self, word):
     """
     Get the pronunciation representation of the word ``word``.

     When ``self.use_pinyin`` is set, the result of ``pinyin.look_up`` for
     the word is returned as is. Otherwise each letter of the lowercased
     word is converted with ``naive_util.safetext``.

     :param word: a single word
     :return: pronunciation representation
     """
     if self.use_pinyin:
         return pinyin.look_up(word)
     return [naive_util.safetext(letter) for letter in word.lower()]
コード例 #8
0
 def word_2_safetext(self, word):
     """Convert ``word`` to its safetext form.

     Words made up only of lowercase ASCII letters and the digits 1-6 are
     handled locally: each digit is spelled out (``"1"`` -> ``"ONE"`` ...
     ``"6"`` -> ``"SIX"``) and the result is wrapped in underscores. Any
     word containing another character is delegated to ``safetext``.

     :param word: the word to convert
     :return: the safetext string
     """
     simple_chars = "qwertyuiopasdfghjklmnbvcxz123456"
     for char in word:
         if char not in simple_chars:
             return safetext(word)

     # The replacement names contain no digits, so the order is irrelevant.
     digit_names = (
         ("1", "ONE"),
         ("2", "TWO"),
         ("3", "THREE"),
         ("4", "FOUR"),
         ("5", "FIVE"),
         ("6", "SIX"),
     )
     spelled = word
     for digit, name in digit_names:
         spelled = spelled.replace(digit, name)
     return "_" + spelled + "_"
コード例 #9
0
 def process_utterance(self, utt):
     """Add one ``segment`` child per phone to every target node of ``utt``.

     Each node selected by the ``self.target_nodes`` XPath is handled
     according to the class stored in ``self.class_attribute``:

     * class in ``self.word_classes``: the word is read from
       ``self.target_attribute``. Its lowercased form is looked up in
       ``self.entries`` (``phones_from='lex'``); if absent,
       ``self.get_oov_pronunciation`` is used (``'lts'``); if that returns
       ``None``, ``self.backoff_pronunciation`` is used (``'default'``).
       The pronunciation is split on single spaces, and one ``segment``
       child carrying ``pronunciation`` and ``phones_from`` is added per
       phone. For ``'lts'`` output each phone is mapped through
       ``ipa2sampa`` when its UTF-8 encoding is a key there.
     * class in ``self.probable_pause_classes`` /
       ``self.possible_pause_classes``: a single ``segment`` child with
       ``c.PROB_PAUSE`` / ``c.POSS_PAUSE`` is added.

     :param utt: utterance object exposing ``xpath``
     :raises ValueError: if a node's class is in none of the three class
         sets. Previously this case fell through and either hit an unbound
         ``pronunciation`` or silently reused the previous node's value.
     """
     for node in utt.xpath(self.target_nodes):
         assert node.has_attribute(self.target_attribute)
         current_class = node.attrib[self.class_attribute]

         if current_class in self.word_classes:
             word = node.attrib[self.target_attribute]
             key = word.lower()
             if key in self.entries:
                 phones_from = 'lex'
                 pronunciation = self.entries[key]
             else:
                 pronunciation = self.get_oov_pronunciation(word)
                 phones_from = 'lts'
                 if pronunciation is None:
                     pronunciation = self.backoff_pronunciation
                     phones_from = 'default'
         elif current_class in self.probable_pause_classes:
             child = Element('segment')
             child.set('pronunciation', c.PROB_PAUSE)
             node.add_child(child)
             continue
         elif current_class in self.possible_pause_classes:
             child = Element('segment')
             child.set('pronunciation', c.POSS_PAUSE)
             node.add_child(child)
             continue
         else:
             raise ValueError(
                 'Class "%s" is not in word_classes, probable_pause_classes '
                 'or possible_pause_classes' % (current_class,))

         phones = pronunciation.split(' ')
         if phones_from == 'lts':
             # Only letter-to-sound output is converted; phones without a
             # mapping are kept unchanged.
             phones = [ipa2sampa.get(x.encode('utf8'), x) for x in phones]

         for phone in phones:
             child = Element('segment')
             child.set('pronunciation', phone)
             child.set('phones_from', phones_from)
             node.add_child(child)
コード例 #10
0
ファイル: Phonetisers.py プロジェクト: DungLuongTuan/Ossian
 def get_phonetic_segments(self, word):
     """Return the safetext form of each letter of the lowercased ``word``.

     :param word: the word to split into letters
     :return: list with one ``naive_util.safetext`` result per letter
     """
     return [naive_util.safetext(letter) for letter in word.lower()]