Example #1
0
def breakdownWord(word, recursive=False):
    """Break a Latin- or Cyrillic-script word into a list of phoneme codes.

    The word is lower-cased and translated one letter at a time.  Most
    letters map straight to one code; the Latin digraphs ``ch``, ``sh`` and
    ``zh`` and the vowels ``i``/``u`` after ``j`` look at a neighbouring
    letter, and a few Cyrillic letters expand to two codes.  A Latin ``c``
    that is not followed by ``h`` produces nothing.  Consecutive duplicate
    codes are collapsed before returning.

    Args:
        word: The text to convert; spaces are skipped.
        recursive: Set on the internal call made for the ``hammer()``
            fallback so that the fallback is tried only once per letter.

    Returns:
        list: Phoneme code strings, e.g. ``['SH', 'CH', 'AO0']``.
    """
    word = word.lower()

    # One letter -> one phoneme.  Only small letters are listed: the word
    # has just been lower-cased, so a capital key could never be matched.
    simple_convert = {
        'a': 'AA0',
        'b': 'B',
        'v': 'V',
        'g': 'G',
        'd': 'D',
        'e': 'EH0',
        'j': 'Y',
        'y': 'IH0',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'n': 'N',
        'o': 'AO0',
        'p': 'P',
        'r': 'R',
        't': 'T',
        'f': 'F',
        'x': 'HH',  # use 'Y' ?? 'K'??
        u'\N{LATIN SMALL LETTER S WITH CARON}': 'SH',
        u'\N{LATIN SMALL LETTER Z WITH CARON}': 'ZH',
        # Cyrillic
        u'\N{CYRILLIC SMALL LETTER A}': 'AA0',
        u'\N{CYRILLIC SMALL LETTER BE}': 'B',
        u'\N{CYRILLIC SMALL LETTER VE}': 'V',
        u'\N{CYRILLIC SMALL LETTER GHE}': 'G',
        u'\N{CYRILLIC SMALL LETTER GHE WITH UPTURN}': 'G',
        u'\N{CYRILLIC SMALL LETTER DE}': 'D',
        u'\N{CYRILLIC SMALL LETTER IE}': 'EH0',
        u'\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}': 'IH0',
        u'\N{CYRILLIC SMALL LETTER ZHE}': 'ZH',
        u'\N{CYRILLIC SMALL LETTER ZE}': 'Z',
        u'\N{CYRILLIC SMALL LETTER SHORT I}': 'IY0',  # 'Y' ?
        u'\N{CYRILLIC SMALL LETTER I}': 'IY0',
        u'\N{CYRILLIC SMALL LETTER KA}': 'K',
        u'\N{CYRILLIC SMALL LETTER EL}': 'L',
        u'\N{CYRILLIC SMALL LETTER EM}': 'M',
        u'\N{CYRILLIC SMALL LETTER EN}': 'N',
        u'\N{CYRILLIC SMALL LETTER O}': 'AO0',
        u'\N{CYRILLIC SMALL LETTER PE}': 'P',
        u'\N{CYRILLIC SMALL LETTER ER}': 'R',
        u'\N{CYRILLIC SMALL LETTER ES}': 'S',
        u'\N{CYRILLIC SMALL LETTER TE}': 'T',
        u'\N{CYRILLIC SMALL LETTER U}': 'UH0',
        u'\N{CYRILLIC SMALL LETTER EF}': 'F',
        u'\N{CYRILLIC SMALL LETTER HA}': 'HH',  # 'Y'?? 'K'??
        u'\N{CYRILLIC SMALL LETTER CHE}': 'CH',
        u'\N{CYRILLIC SMALL LETTER SHA}': 'SH',
    }

    # One letter -> two phonemes.
    double_convert = {
        u'\N{CYRILLIC SMALL LETTER SHCHA}': ('SH', 'CH'),
        u'\N{CYRILLIC SMALL LETTER TSE}': ('T', 'S'),
        u'\N{CYRILLIC SMALL LETTER YA}': ('Y', 'AO0'),  # not if unstressed?
        u'\N{CYRILLIC SMALL LETTER YU}': ('Y', 'UW0'),
        u'\N{CYRILLIC SMALL LETTER YI}': ('Y', 'IY0'),
        u'\N{CYRILLIC SMALL LETTER UKRAINIAN IE}': ('Y', 'EH0'),
    }

    phonemes = []
    previous = ' '
    for pos, letter in enumerate(word):
        # Slicing yields '' at the end of the word instead of raising.
        before_h = word[pos + 1:pos + 2] == 'h'

        if letter == 'c':
            # Only the digraph "ch" is voiced; a lone 'c' is dropped.
            if before_h:
                phonemes.append('CH')
        elif letter == 'i':
            phonemes.append('IY0' if previous == 'j' else 'IH0')
        elif letter == 'h':
            # After z/s/c the 'h' was already consumed as part of a digraph.
            if previous not in ('z', 's', 'c'):
                phonemes.append('HH')
        elif letter == 's':
            phonemes.append('SH' if before_h else 'S')
        elif letter == 'u':
            phonemes.append('UW0' if previous == 'j' else 'UH0')
        elif letter == 'z':
            phonemes.append('ZH' if before_h else 'Z')
        elif letter in double_convert:
            phonemes.extend(double_convert[letter])
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1:
            # Unhandled character.  hammer() is defined elsewhere in the
            # project (presumably it reduces the character to a plainer
            # equivalent -- confirm).  When it yields a single character,
            # translate that once and keep only its first phoneme.
            if not recursive:
                phon = breakdownWord(hammer(letter), True)
                if phon:
                    phonemes.append(phon[0])
        previous = letter

    # Collapse runs of identical neighbouring phonemes.
    collapsed = []
    for phoneme in phonemes:
        if not collapsed or collapsed[-1] != phoneme:
            collapsed.append(phoneme)
    return collapsed
Example #2
0
def breakdownWord(word, recursive=False):
    """Break a word into phoneme codes using umlaut/sharp-s spelling rules.

    The word is lower-cased and walked letter by letter.  Each letter is
    translated according to its neighbours: vowel pairs such as ``ai``,
    ``au``, ``ei``, ``eu``, ``ie`` and ``ee`` give one code, a vowel in
    front of a doubled consonant or a sharp s gets its short variant,
    ``b``/``d``/``g`` are devoiced at the end of the word or before
    ``s``/``t``, and a repeated non-vowel letter is emitted only once.
    Consecutive duplicate codes are collapsed before returning.
    NOTE(review): the rules look like German orthography -- confirm.

    Args:
        word: The text to convert; spaces are skipped.
        recursive: Set on the internal call made for the ``hammer()``
            fallback so that the fallback is tried only once per letter.

    Returns:
        list: Phoneme code strings, e.g. ``['HH', 'AW0', 'S']``.
    """
    word = word.lower()
    vowels = frozenset('aeiouäöü')
    a_umlaut = u'\N{LATIN SMALL LETTER A WITH DIAERESIS}'
    o_umlaut = u'\N{LATIN SMALL LETTER O WITH DIAERESIS}'
    u_umlaut = u'\N{LATIN SMALL LETTER U WITH DIAERESIS}'
    sharp_s = u'\N{LATIN SMALL LETTER SHARP S}'

    # Letters whose phoneme does not depend on context.
    simple_convert = {
        'f': 'F',
        'j': 'Y',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'p': 'P',
        'q': 'K',
        'r': 'R',  # use AH0 or ER0 for final letter in word ??
        sharp_s: 'S',
        't': 'T',
        'v': 'F',  # non-native loan-words, 'V'
        'w': 'V',
        'y': 'IH0',  # actual pronunciation varies with word origin
    }

    word_length = len(word)
    phonemes = []
    previous = ' '
    for pos, letter in enumerate(word):
        # Lookahead; slicing yields '' past the end of the word.
        nxt = word[pos + 1:pos + 2]
        nxt2 = word[pos + 2:pos + 3]
        is_last = pos + 1 == word_length
        # A doubled consonant or a sharp s right after a vowel selects the
        # vowel's short variant.
        short_context = (
            (nxt2 != '' and nxt == nxt2 and nxt not in vowels)
            or nxt == sharp_s
        )

        if letter == previous and letter not in vowels:
            pass  # repeated non-vowel: already emitted
        elif letter == 'a':
            if nxt == 'i':  # ai
                phonemes.append('AY0')
            elif nxt == 'u':  # au
                phonemes.append('AW0')
            elif previous != 'a':  # second 'a' of "aa" is silent
                phonemes.append('AH0' if short_context else 'AA0')
        elif letter == a_umlaut:
            if nxt == 'u':  # aeu
                phonemes.append('OY0')
            else:
                phonemes.append('EH0' if short_context else 'EY0')
        elif letter == 'b':
            devoiced = is_last or nxt in ('s', 't')
            phonemes.append('P' if devoiced else 'B')
        elif letter == 'c':
            if nxt != 'h':
                phonemes.append('K')
            elif previous == 's':  # sch
                phonemes.append('SH')
            else:  # ch
                phonemes.append('HH')  # use 'K'??
        elif letter == 'd':
            devoiced = is_last or nxt in ('s', 't')
            phonemes.append('T' if devoiced else 'D')
        elif letter == 'e':
            if previous == 'i':
                pass  # covered under 'i'
            elif pos + 2 == word_length and nxt in ('l', 'n', 'r'):
                phonemes.append('EH0')  # -en, -er, -el
            elif nxt == 'i':  # ei
                phonemes.append('AY0')
            elif nxt == 'u':  # eu
                phonemes.append('OY0')
            elif nxt == 'e':  # ee
                phonemes.append('EY0')
            elif previous == 'e':
                pass  # second 'e' of "ee"
            elif short_context or (is_last and previous not in vowels):
                phonemes.append('EH0')
            else:
                phonemes.append('EY0')
        elif letter == 'g':
            if previous == 'n':  # ng
                phonemes.append('NG')
            elif is_last:
                phonemes.append('HH' if previous == 'i' else 'K')
            elif nxt in ('s', 't'):
                phonemes.append('K')
            else:
                phonemes.append('G')
        elif letter == 'h':
            # Silent after a vowel; after 'c' it was handled with the 'c'.
            if previous not in vowels and previous != 'c':
                phonemes.append('HH')
        elif letter == 'i':
            if previous in ('a', 'e'):
                pass  # covered under other vowel
            elif nxt == 'e':  # ie
                phonemes.append('IY0')
            elif is_last and previous not in vowels:
                phonemes.append('IY0')
            else:
                phonemes.append('IH0')
        elif letter == 'n':
            if nxt != 'g':  # "ng" is emitted when the 'g' is reached
                phonemes.append('N')
        elif letter == 'o':
            if previous != 'o':  # second 'o' of "oo" is silent
                phonemes.append('AO0')
        elif letter == o_umlaut:
            phonemes.append('ER0')
        elif letter == 's':
            if pos == 0 and nxt in ('p', 't'):  # initial sp-, st-
                phonemes.append('SH')
            elif nxt == 'c' and nxt2 == 'h':
                pass  # covered under 'c'
            elif pos == 0:
                phonemes.append('Z')  # at beginning of word
            else:
                phonemes.append('S')
        elif letter == 'u':
            if previous in ('a', a_umlaut, 'e'):
                pass  # second half of au / aeu / eu
            elif previous == 'q':  # qu
                phonemes.append('V')
            else:
                phonemes.append('UH0' if short_context else 'UW0')
        elif letter == u_umlaut:
            phonemes.append('UW0')
        elif letter == 'x':
            phonemes.extend(('K', 'S'))
        elif letter == 'z':
            phonemes.extend(('T', 'S'))
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1:
            # Unhandled character.  hammer() is defined elsewhere in the
            # project (presumably it reduces the character to a plainer
            # equivalent -- confirm).  Translate its result once and keep
            # only the first phoneme.
            if not recursive:
                fallback = breakdownWord(hammer(letter), True)
                if fallback:
                    phonemes.append(fallback[0])
        previous = letter

    # Collapse runs of identical neighbouring phonemes.
    collapsed = []
    for phoneme in phonemes:
        if not collapsed or collapsed[-1] != phoneme:
            collapsed.append(phoneme)
    return collapsed
Example #3
0
def breakdownWord(word, recursive=False):
    """Break a word into phoneme codes using cs/gy/ly/sz/zs digraph rules.

    The word is lower-cased and walked letter by letter.  Vowels ``a``,
    ``a-acute`` and ``o`` merge with a following ``i``/``j``/``y`` (or
    ``ly``) into one code, and the consonant groups ``cs``, ``gy``, ``ly``,
    ``sz``, ``zs``, ``dz`` and their doubled forms are resolved by looking
    at the previous and the next letters.  Consecutive duplicate codes are
    collapsed before returning.
    NOTE(review): the rules look like Hungarian orthography -- confirm.

    Args:
        word: The text to convert; spaces are skipped.
        recursive: Set on the internal call made for the ``hammer()``
            fallback so that the fallback is tried only once per letter.

    Returns:
        list: Phoneme code strings, e.g. ``['CH', 'AO0', 'K']``.
    """
    word = word.lower()
    a_acute = u'\N{LATIN SMALL LETTER A WITH ACUTE}'

    # Letters whose phoneme does not depend on context.
    simple_convert = {
        'b': 'B',
        u'\N{LATIN SMALL LETTER E WITH ACUTE}': 'EY0',
        'f': 'F',
        'h': 'HH',
        u'\N{LATIN SMALL LETTER I WITH ACUTE}': 'IY0',
        'k': 'K',
        'm': 'M',
        'n': 'N',
        u'\N{LATIN SMALL LETTER O WITH ACUTE}': 'OW0',  # ER0 ? AO0 ?
        'p': 'P',
        'r': 'R',
        't': 'T',
        'u': 'UW0',
        u'\N{LATIN SMALL LETTER U WITH ACUTE}': 'UW0',
        u'\N{LATIN SMALL LETTER O WITH DIAERESIS}': 'ER0',
        u'\N{LATIN SMALL LETTER O WITH DOUBLE ACUTE}': 'ER0',
        u'\N{LATIN SMALL LETTER U WITH DIAERESIS}': 'UW0',  # IH0?
        u'\N{LATIN SMALL LETTER U WITH DOUBLE ACUTE}': 'UW0',
        'v': 'V',
        'w': 'V',
    }
    # Vowels that absorb a following i / j / y into a single code.
    absorbing_vowels = ('a', 'o', a_acute)

    phonemes = []
    previous = ' '
    for pos, letter in enumerate(word):
        # Lookahead; slicing yields '' past the end of the word.
        nxt = word[pos + 1:pos + 2]
        nxt2 = word[pos + 2:pos + 3]
        before_ly = nxt == 'l' and nxt2 == 'y'
        glides = nxt in ('i', 'j', 'y') or before_ly

        if letter == 'a':
            phonemes.append('OY0' if glides else 'AO0')
        elif letter == a_acute:
            phonemes.append('AY0' if glides else 'AA0')
        elif letter == 'c':
            if nxt == 's':  # cs
                phonemes.append('CH')
            else:
                phonemes.extend(('T', 'S'))
        elif letter == 'd':
            if nxt != 's':  # skipped before 's' (original note: handle under 'z')
                phonemes.append('D')
        elif letter == 'e':
            if previous == 'e':
                pass  # second 'e' of "ee"
            elif nxt == 'e' or before_ly:
                phonemes.append('EY0')
            else:
                phonemes.append('EH0')
        elif letter == 'g':
            phonemes.append('JH' if nxt == 'y' else 'G')  # gy
        elif letter == 'i':
            if previous not in absorbing_vowels:
                phonemes.append('IH0')  # IY0?
        elif letter == 'j':
            if previous not in absorbing_vowels:
                phonemes.append('Y')
        elif letter == 'l':
            if nxt != 'y':  # ly is close enough to just IY, see 'y'
                phonemes.append('L')
        elif letter == 'o':
            phonemes.append('OY0' if glides else 'AO0')
        elif letter == 'q':  # loan words
            phonemes.extend(('K', 'W'))
        elif letter == 's':
            if previous == 'c':
                pass  # handled under c
            elif nxt == 's' and nxt2 == 'z':  # first s of ssz
                pass
            elif nxt == 'z':
                if previous == 's':  # second s of ssz
                    phonemes.extend(('S', 'S'))
                else:  # sz
                    phonemes.append('S')
            else:
                phonemes.append('SH')
        elif letter == 'x':  # loan words only
            phonemes.extend(('K', 'S'))
        elif letter == 'y':
            if previous in absorbing_vowels or previous in ('g', 'n'):
                # merged into the vowel, handled under g, or -- after n --
                # close enough to just n
                pass
            elif previous == 't':
                phonemes.append('Y')
            else:
                phonemes.append('IY0')
        elif letter == 'z':
            if nxt == 's':
                if previous == 'd':  # dzs
                    phonemes.append('JH')
                elif previous == 'z':  # zzs
                    phonemes.extend(('ZH', 'ZH'))
                else:  # zs
                    phonemes.append('ZH')
            elif nxt == 'z' and nxt2 == 's':  # probably zzs
                pass
            elif previous == 'd':  # dz
                phonemes.extend(('D', 'S'))
            elif previous in ('s', 'c'):
                pass  # handled under s / c
            else:
                phonemes.append('Z')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == " ":
            pass
        elif len(hammer(letter)) == 1:
            # Unhandled character.  hammer() is defined elsewhere in the
            # project (presumably it reduces the character to a plainer
            # equivalent -- confirm).  Translate its result once and keep
            # only the first phoneme.
            if not recursive:
                fallback = breakdownWord(hammer(letter), True)
                if fallback:
                    phonemes.append(fallback[0])
        previous = letter

    # Collapse runs of identical neighbouring phonemes.
    collapsed = []
    for phoneme in phonemes:
        if not collapsed or collapsed[-1] != phoneme:
            collapsed.append(phoneme)
    return collapsed
Example #4
0
def breakdownWord(input_word, recursive=False):
    """Break a Spanish word into a list of phoneme codes.

    The word is lower-cased and walked letter by letter; each letter is
    translated according to its position and its neighbours.  Letters with
    no rule of their own are looked up in ``unconditional_conversions`` and
    otherwise passed through ``hammer()`` (both defined elsewhere in the
    project).  The raw codes are handed to ``stressSpanishWord()`` and
    consecutive duplicates in its result are collapsed.

    Args:
        input_word: The text to convert.
        recursive: Set on the internal call made for the ``hammer()``
            fallback so that the fallback is tried only once per letter.

    Returns:
        list: Phoneme code strings.
    """
    word = input_word.lower()
    vowels = (u'a', u'e', u'i', u'o', u'u')
    final_index = len(word) - 1

    breakdown_word = []
    previous = u''
    for word_index, letter in enumerate(word):
        # Lookahead; slicing yields an empty string on the last letter.
        following = word[word_index + 1:word_index + 2]
        at_start = word_index == 0
        at_end = word_index == final_index

        if letter == u'b' or letter == u'v':
            # b and v share one rule: hard at word start or after m / n.
            hard = at_start or previous in (u'm', u'n')
            breakdown_word.append('B' if hard else 'V')
        elif letter == u'c':
            if following == u'h':
                breakdown_word.append('CH')
            elif previous == u'c':
                breakdown_word.append('S')
            elif following == u's':
                pass
            elif following in (u'e', u'i'):
                # should this be SH before 'e', S before 'i' ??
                # South American, Castilian Spanish uses 'TH'
                breakdown_word.append('S')
            else:
                breakdown_word.append('K')
        elif letter == u'd':
            hard = at_start or previous in (u'l', u'n')
            breakdown_word.append('D' if hard else 'DH')
        elif letter == u'e':
            open_e = at_end or following in vowels
            breakdown_word.append('EY0' if open_e else 'EH0')
        elif letter == u'\N{LATIN SMALL LETTER E WITH ACUTE}':
            open_e = at_end or following in vowels
            breakdown_word.append('EY1' if open_e else 'EH1')
        elif letter == u'g':
            if following == u'\N{LATIN SMALL LETTER U WITH DIAERESIS}':
                breakdown_word.append('V')
            elif following in (u'e', u'i'):
                breakdown_word.append('HH')
            else:
                breakdown_word.append('G')
        elif letter == u'h':
            pass  # always silent
        elif letter == u'l':
            if following == u'l':
                pass  # first l of "ll"; the second one emits the sound
            elif previous == u'l':
                breakdown_word.append('Y')
            else:
                breakdown_word.append('L')
        elif letter == u'n':
            breakdown_word.append('M' if following == u'v' else 'N')
        elif letter == u'\N{LATIN SMALL LETTER N WITH TILDE}':
            breakdown_word.extend(('N', 'Y'))
        elif letter == u'o':
            before_consonant = not at_end and following not in vowels
            breakdown_word.append('AO0' if before_consonant else 'OW0')
        elif letter == u'\N{LATIN SMALL LETTER O WITH ACUTE}':
            before_consonant = not at_end and following not in vowels
            breakdown_word.append('AO1' if before_consonant else 'OW1')
        elif letter == u'p':
            if not at_end:  # a word-final p is dropped
                breakdown_word.append('P')
        elif letter == u'r':
            if previous != u'r':  # "rr" emits a single R
                breakdown_word.append('R')
        elif letter == u's':
            voiced = following in (u'd', u'g', u'l', u'm', u'n')
            breakdown_word.append('Z' if voiced else 'S')
        elif letter == u'u' or letter == u'\N{LATIN SMALL LETTER U WITH ACUTE}':
            # NOTE(review): the silent-u test after 'g' looks for a
            # following 'u' or 'i'; Spanish "gue"/"gui" would suggest
            # 'e' or 'i' -- confirm before changing.
            silent = previous == u'q' or (
                previous == u'g' and following in (u'u', u'i'))
            if not silent:
                breakdown_word.append('UW0' if letter == u'u' else 'UW1')
        elif letter == u'x':
            if previous in vowels and following in vowels:
                breakdown_word.extend(('K', 'S'))
            else:
                breakdown_word.append('S')
        elif letter == u'y':
            if len(word) == 1:
                breakdown_word.append('IY1')
            elif at_end:
                breakdown_word.append('IY0')
            else:
                breakdown_word.append('Y')
        elif letter in unconditional_conversions:
            breakdown_word.append(unconditional_conversions[letter])
        elif len(hammer(letter)) == 1:
            # Unhandled character: translate hammer()'s single-character
            # result once and keep only its first phoneme.
            if not recursive:
                fallback = breakdownWord(hammer(letter), True)
                if fallback:
                    breakdown_word.append(fallback[0])
        previous = letter

    stressed = stressSpanishWord(breakdown_word)

    # Collapse runs of identical neighbouring phonemes.
    temp_phonemes = []
    last_seen = " "
    for phoneme in stressed:
        if phoneme != last_seen:
            temp_phonemes.append(phoneme)
        last_seen = phoneme
    return temp_phonemes
Example #5
0
def breakdownSwedishSyllable(word, recursive=False, phonetic=False):
    """Break a Swedish word into a list of CMU-style phoneme codes.

    The word is lowercased and scanned one letter at a time. Each letter is
    mapped to zero or more phonemes depending on its neighbours, its position
    in the word and, for a handful of entries, the whole word. Runs of
    identical adjacent phonemes are collapsed before returning.

    Args:
        word: Text to convert.
        recursive: Set on the internal retry for a letter that no rule
            matched; when true such letters are skipped instead of retried.
        phonetic: When true, 'e' always yields 'EH0' and a-with-diaeresis
            always yields 'AE0', regardless of a following 'r'.

    Returns:
        A list of phoneme strings.
    """
    word = word.lower()
    a_diaeresis = u'\N{LATIN SMALL LETTER A WITH DIAERESIS}'
    o_diaeresis = u'\N{LATIN SMALL LETTER O WITH DIAERESIS}'
    a_ring = u'\N{LATIN SMALL LETTER A WITH RING ABOVE}'
    # Tuples (not strings) so that the empty "no such letter" marker used
    # below is never reported as a member.
    soft_vowels = ('e', 'i', 'y', a_diaeresis, o_diaeresis)
    hard_vowels = ('a', 'o', 'u', a_ring)
    # Letters whose phoneme does not depend on context.
    simple_convert = {
        u'\N{LATIN SMALL LETTER E WITH ACUTE}': 'EY0',
        'a': 'AH0',  # not exact - AO0 ??
        'b': 'B',
        'f': 'F',
        'm': 'M',
        'o': 'UH0',  # compromise, actually UW0 or AA0 (not), sometimes AO0
        'q': 'K',
        'v': 'V',
        'w': 'V',
        'z': 'S',
        a_ring: 'AO0',  # not exact
        o_diaeresis: 'ER0',
    }
    # Spelled with escapes so the comparison depends neither on the source
    # file encoding nor on a module-level encoding setting.
    sh_for_k_words = (
        u'm' + a_diaeresis + u'nniska',
        u'm' + a_diaeresis + u'nniskor',
    )
    phonemes = []
    length = len(word)
    previous = ' '
    for pos, letter in enumerate(word):
        # '' stands for "no letter there"; it never equals a real letter.
        following = word[pos + 1] if pos + 1 < length else ''
        after_next = word[pos + 2] if pos + 2 < length else ''
        is_last = pos + 1 == length
        next_is_last = pos + 2 == length
        if letter == 'c':
            if following == 'c':
                pass  # cc, handled when the second c is reached
            elif previous == 'c' and following in soft_vowels:
                phonemes.append('K')
                phonemes.append('S')
            elif following in soft_vowels:
                phonemes.append('S')
            elif following == 'h':
                phonemes.append('SH')
            else:
                phonemes.append('K')
        elif letter == 'd':
            if pos == 0 and following == 'j':  # dj at beginning of word
                pass  # same as j alone
            else:
                phonemes.append('D')
        elif letter == 'e':
            if not phonetic and next_is_last and following == 'r':  # ends in er
                phonemes.append('AE0')
            else:
                phonemes.append('EH0')  # sometimes 'IY0', sometimes 'EY0'
        elif letter == 'g':
            if previous in ('l', 'r'):
                phonemes.append('Y')
            elif following == 'i' and after_next == o_diaeresis:
                phonemes.append('SH')
            elif following == 'n' and previous in ('a', 'o', 'u', 'e', 'i', 'y',
                                                   a_diaeresis, o_diaeresis):
                phonemes.append('NG')
            elif previous == 'n':  # ng
                phonemes.append('NG')
            elif following == 'j':  # gj
                pass  # same as 'j' alone
            elif next_is_last and following == 'e':  # ends in 'ge' - French loan-word such as garage ?
                phonemes.append('SH')
            elif pos == 0 and following in soft_vowels:
                # ??? if e is unstressed (how to tell?), pronounce as 'G'
                phonemes.append('Y')
            elif pos == 0 and following in hard_vowels:
                phonemes.append('G')
            elif previous == 'g':
                pass
            else:
                phonemes.append('G')
        elif letter == 'h':
            if previous == 'c':
                pass  # handled under c
            elif following == 'j':
                pass  # same as 'j' alone
            elif pos == 1 and previous == 's':  # probably a foreign loan-word
                phonemes.append('SH')
            else:
                phonemes.append('HH')
        elif letter == 'i':
            if previous == 'g' and following == o_diaeresis:
                pass
            elif previous == 's' and following == 'o':  # sio e.g mission
                phonemes.append('UH0')
            else:
                phonemes.append('IY0')  # sometimes 'IH0'
        elif letter == 'k':  # needs to be handled before j to handle skj sound
            if pos == 0 and word in (u'kefir', u'kex', u'kille', u'kis', u'kissa', u'kisse'):
                phonemes.append('K')
            elif pos == 0 and following in soft_vowels:
                phonemes.append('CH')
            elif word in sh_for_k_words:
                phonemes.append('SH')
            elif is_last and previous == 's':  # ends in SK
                phonemes.append('S')
                phonemes.append('K')
            elif following == 'j':
                phonemes.append('CH')  # more Finnish-Swedish than Swedish ???
            elif is_last and previous == 'c':
                pass
            elif previous == 's' and following in hard_vowels:
                phonemes.append('S')
                phonemes.append('K')
            elif previous == 's' and pos == 1:  # sk at beginning of word
                phonemes.append('SH')
            else:
                phonemes.append('K')
        elif letter == 't':  # needs to be handled before j to handle stj sound
            if previous == 's' and following == 'j':
                phonemes.append('SH')
            elif previous == 't' and is_last:
                pass
            elif following == 'j':  # tj
                pass  # handled under j
            else:
                phonemes.append('T')
        elif letter == 'j':
            if previous == 's':
                phonemes.append('SH')
            elif previous == 't':
                # Only look two letters back when they exist; a negative
                # index would wrap round to the end of the word and make a
                # word-initial "tj" look like "stj" whenever the word ends
                # in 's'.
                if pos >= 2 and word[pos - 2] == 's':  # stj, handled under 't'
                    pass
                else:
                    phonemes.append('CH')
            elif previous == 'k':
                pass  # handled under k
            else:
                phonemes.append('Y')
        elif letter == 'l':
            if following == 'j':
                pass  # same as 'j' alone
            else:
                phonemes.append('L')
        elif letter == 'n':
            if following == 'g':  # ng
                pass  # handled under 'g'
            elif following == 'k':  # nk
                phonemes.append('NG')
            else:
                phonemes.append('N')
        elif letter == 'p':
            if previous != 'p':
                phonemes.append('P')
        elif letter == 'r':
            if following == 's':
                pass  # handled under s
            else:
                phonemes.append('R')
        elif letter == 's':
            if following == 'c' and after_next == 'h':
                pass  # handled under 'c'
            elif following == 't' and after_next == 'j':
                pass  # handled under 't'
            elif following == 'k':
                pass  # handled under 'k'
            elif following == 'j':
                pass  # handled under 'j'
            elif following == 's':
                pass
            elif following == 'i' and after_next == 'o':  # might need more breakdown
                phonemes.append('SH')
            elif pos == 0 and following == 'h':
                pass  # handled under 'h'
            elif previous == 'r':
                phonemes.append('SH')  # not entirely accurate, use HH ??
            else:
                phonemes.append('S')
        elif letter == 'u':
            if previous == 'q':
                phonemes.append('V')
            else:
                phonemes.append('UW0')  # inaccurate, no accurate CMU equivalent
        elif letter == 'x':
            phonemes.append('K')
            phonemes.append('S')
        elif letter == 'y':
            if word in (u'yoga', u'yoghurt'):
                phonemes.append('Y')
            elif word == u'fyrtio':
                phonemes.append('ER0')
            else:
                phonemes.append('UW0')  # not exact
        elif letter == a_diaeresis:
            if phonetic or following == 'r':
                phonemes.append('AE0')  # not exact, and skips exceptions
            else:
                phonemes.append('EH0')  # not exact, and skips exceptions
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == " ":
            pass
        elif len(hammer(letter)) == 1:
            # hammer() is defined elsewhere; presumably it reduces the letter
            # to a plainer equivalent - TODO confirm. Retry once with its
            # result; the recursive flag stops a second retry.
            if not recursive:
                phon = " ".join(breakdownSwedishSyllable(hammer(letter), True, phonetic))
                if phon:
                    phonemes.append(phon)
        previous = letter
    # Collapse runs of the same phoneme into a single entry.
    temp_phonemes = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            temp_phonemes.append(phoneme)
        previous_phoneme = phoneme
    return temp_phonemes
Example #6
0
def syllablesToPhonemes(syllables, recursive=False):
    """Convert a word, given as a sequence of syllables, to phoneme codes.

    Each syllable is scanned letter by letter. Look-ahead and look-behind
    never cross a syllable boundary: the "previous letter" is reset to a
    blank at the start of every syllable. Runs of identical adjacent
    phonemes are collapsed before returning.

    NOTE(review): the digraph rules (ij, oe, ui, eu, trema vowels) look like
    Dutch spelling rules - confirm against the rest of the module.

    Args:
        syllables: Sequence of syllable strings making up one word. It must
            support len() and negative indexing-free iteration; a plain
            string is treated as one-letter syllables.
        recursive: Set on the internal retry for a letter that no rule
            matched; when true such letters are skipped instead of retried.

    Returns:
        A list of phoneme strings.
    """
    phonemes = []
    simple_convert = {
        'b': 'B',
        'd': 'D',
        'f': 'F',
        'h': 'HH',
        'j': 'Y',  # SH in some words borrowed from French
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'n': 'N',
        'p': 'P',
        'r': 'R',
        's': 'S',
        't': 'T',
        'v': 'F',  # English F mixed with English V
        'w': 'V',  # closer to soft English V than the English W - pronounced back in mouth, not with pursed lips
        'z': 'Z'
    }
    # Trema vowels (looks like German Umlaut, but different meaning), plus
    # the two accented y forms; each maps to a fixed phoneme.
    trema_convert = {
        u'\N{LATIN SMALL LETTER A WITH DIAERESIS}': 'AH0',  # like English short u (cut, hut)
        u'\N{LATIN SMALL LETTER E WITH DIAERESIS}': 'EH0',  # closer to a (bad B AH D) than English short EH0 (bed = B EH D)
        u'\N{LATIN SMALL LETTER I WITH DIAERESIS}': 'IH0',
        u'\N{LATIN SMALL LETTER O WITH DIAERESIS}': 'AO0',
        u'\N{LATIN SMALL LETTER U WITH DIAERESIS}': 'ER0',  # not accurate but the nearest phoneme in CMU? (use AH instead?)
        u'\N{LATIN SMALL LETTER Y WITH DIAERESIS}': 'AY0',
        u'\N{LATIN SMALL LETTER Y WITH ACUTE}': 'AY0',
    }
    syllable_count = len(syllables)
    for syllable_number, syllable in enumerate(syllables, 1):
        in_last_syllable = syllable_number == syllable_count
        syllable_length = len(syllable)
        previous_letter = ' '
        for index, letter in enumerate(syllable):
            is_final = index + 1 == syllable_length  # last letter of this syllable
            # '' stands for "no next letter"; it never equals a real letter.
            following = '' if is_final else syllable[index + 1]
            word_final = in_last_syllable and is_final
            # isvowel() is defined elsewhere in the module; it is only
            # consulted for a letter that repeats the previous one.
            if letter == previous_letter and not isvowel(letter):  # double consonants
                pass
            # ===================== consonants ==========================
            elif letter == 'b' and word_final:  # last letter in word
                phonemes.append('P')
            elif letter == 'd' and word_final:  # last letter in word
                phonemes.append('T')
            elif letter == 'n' and following == 'g':  # ng
                # Must inspect the letter right after this one, not a fixed
                # position in the syllable.
                pass  # handled in next case
            elif letter == 'g' and previous_letter == 'n':  # ng
                phonemes.append('NG')
            elif letter == 'g':
                phonemes.append('HH')  # not accurate, but the nearest phoneme in CMU? (use K instead? put in a G anyway?)
            elif letter == 'c' and following == 'h':  # ch
                # Applies to a syllable-final "ch" as well, matching the
                # "th" rule below.
                pass  # handled in next case
            elif letter == 'h' and previous_letter == 'c':  # ch
                phonemes.append('HH')  # not accurate, but the nearest phoneme in CMU? (use K instead? put in a G anyway?)
            elif letter == 't' and following == 'h':  # th
                pass  # handled in next case
            elif letter == 'h' and previous_letter == 't':  # th
                phonemes.append('TH')
            elif letter == 'j' and previous_letter == 'i':
                pass  # handled in vowels
            elif letter == 'w' and previous_letter == 'u':
                pass  # handled in vowels
            elif letter == 'x':  # rare, mostly borrowed words
                phonemes.append('K')
                phonemes.append('S')
            elif letter == 'q':  # rare, mostly borrowed words
                phonemes.append('K')
                phonemes.append('W')
            elif letter == 'c':
                if following in ('e', 'i'):  # c before e and i pronounce as s
                    phonemes.append('S')
                else:
                    # c before a consonant, at the end of a syllable and
                    # before a, o, u pronounce as k
                    phonemes.append('K')
            elif letter in simple_convert:
                phonemes.append(simple_convert[letter])
            # =============== vowels ================
            # ------------ A -------------------------------
            elif letter == 'a':  # short AH, long AA
                if following == 'a':  # double a
                    phonemes.append('AA0')
                elif previous_letter == 'a':  # double a handled by case above
                    pass
                elif is_final:  # long a reduced to single letter
                    phonemes.append('AA0')
                elif following == 'u':  # au
                    phonemes.append('AW0')  # occasionally as UW0 in some words borrowed from French
                else:
                    phonemes.append('AH0')  # like English short u (cut, hut)
            # ------------ E -------------------------------
            elif letter == 'e':  # e short EH long EY
                if following == 'e':  # double e
                    phonemes.append('EY0')
                elif previous_letter in ('e', 'i', 'o'):  # ee, ie, oe handled at the first letter
                    pass
                elif is_final:  # long e reduced to single letter
                    phonemes.append('EY0')
                elif following == 'u':  # eu
                    phonemes.append('ER0')  # less R than English equivalent, closer to French eu
                elif following == 'i':  # ei
                    phonemes.append('AY0')
                else:
                    phonemes.append('EH0')  # closer to a (bad B AH D) than English short EH0 (bed = B EH D)
            # ------------ I -------------------------------
            elif letter == 'i':  # i short IH long IY
                if following == 'i':  # double i
                    phonemes.append('IY0')
                elif previous_letter in ('u', 'i', 'e'):  # ui, ii, ei handled at the first letter
                    pass
                # !!!FIXME!!! aai, aaij and ieuw are not handled
                elif is_final:  # long i reduced to single letter
                    phonemes.append('IY0')
                elif following == 'j':  # ij
                    phonemes.append('AY0')
                elif following in ('u', 'e'):  # iu, ie
                    phonemes.append('IY0')
                else:
                    phonemes.append('IH0')
            # ------------ O -------------------------------
            elif letter == 'o':  # o short AA long OW
                if following == 'o':  # double o
                    phonemes.append('OW0')
                elif previous_letter == 'o':  # double o handled by case above
                    pass
                elif is_final:  # long o reduced to single letter
                    phonemes.append('OW0')
                elif following == 'e':  # oe
                    phonemes.append('UW0')
                elif following == 'u':  # ou
                    phonemes.append('AW0')
                else:
                    phonemes.append('AO0')
            # ------------ U -------------------------------
            elif letter == 'u':
                if following == 'u':  # double u
                    phonemes.append('UW0')
                elif previous_letter in ('u', 'a', 'e'):  # uu, au, eu handled at the first letter
                    pass
                elif previous_letter == 'i':  # iu
                    phonemes.append('UW0')
                elif previous_letter == 'o':  # ou handled at o stage
                    pass
                elif is_final:  # long u reduced to single letter
                    phonemes.append('UW0')
                elif following == 'w':  # uw
                    phonemes.append('UW0')  # uw = EW in English DEW IY UW ???
                elif following == 'i':  # ui
                    phonemes.append('UH0')  # not accurate but the nearest phoneme in CMU? (use UW instead?)
                else:
                    phonemes.append('ER0')  # not accurate but the nearest phoneme in CMU? (use AH instead?)
            # ------------ TREMA -------------------------------
            elif letter in trema_convert:
                phonemes.append(trema_convert[letter])
            elif len(hammer(letter)) == 1:
                # hammer() is defined elsewhere; presumably it reduces the
                # letter to a plainer equivalent - TODO confirm. Retry once
                # with its result; the recursive flag stops a second retry.
                if not recursive:
                    phon = syllablesToPhonemes(hammer(letter), True)
                    if phon:
                        phonemes.append(phon[0])
            previous_letter = letter
    # Collapse runs of the same phoneme into a single entry.
    temp_phonemes = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            temp_phonemes.append(phoneme)
        previous_phoneme = phoneme
    return temp_phonemes
def breakdownWord(word, recursive=False):
    """Break a word into a list of CMU-style phoneme codes, letter by letter.

    The word is lowercased, each letter is mapped to zero or more phonemes
    (a few rules look at the previous or next letter), and runs of identical
    adjacent phonemes are collapsed before returning.

    NOTE(review): the letters handled (dotless i, g with breve, c and s with
    cedilla) suggest Turkish spelling rules - confirm against the module.

    Args:
        word: Text to convert.
        recursive: Set on the internal retry for a letter that no rule
            matched; when true such letters are skipped instead of retried.

    Returns:
        A list of phoneme strings.
    """
    word = word.lower()
    dotless_i = u'\N{LATIN SMALL LETTER DOTLESS I}'
    o_diaeresis = u'\N{LATIN SMALL LETTER O WITH DIAERESIS}'
    u_diaeresis = u'\N{LATIN SMALL LETTER U WITH DIAERESIS}'
    a_circumflex = u'\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}'
    u_circumflex = u'\N{LATIN SMALL LETTER U WITH CIRCUMFLEX}'
    # A set membership test replaces dict.has_key, which no longer exists on
    # Python 3 and made every call fail before any letter was processed.
    vowels = frozenset(u'aeiou' + dotless_i + o_diaeresis + u_diaeresis
                       + a_circumflex + u_circumflex)
    # Letters whose phoneme does not depend on context.
    simple_convert = {
        'b': 'B',
        'c': 'JH',
        u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'CH',
        'd': 'D',
        'f': 'F',
        'g': 'G',
        'h': 'HH',
        dotless_i: 'AH0',
        'i': 'IY0',
        'j': 'ZH',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'n': 'N',
        o_diaeresis: 'ER0',
        'p': 'P',
        'r': 'R',
        's': 'S',
        u'\N{LATIN SMALL LETTER S WITH CEDILLA}': 'SH',
        't': 'T',
        u_diaeresis: 'UW0',  # IH0?
        'w': 'V',  # loan-words
        'z': 'Z',
    }
    phonemes = []
    length = len(word)
    previous = ' '
    for pos, letter in enumerate(word):
        # '' stands for "no next letter"; it never equals a real letter.
        following = word[pos + 1] if pos + 1 < length else ''
        if letter == 'a':
            # A vowel followed by 'y' becomes a diphthong; the 'y' itself is
            # then skipped in its own branch below.
            phonemes.append('AY0' if following == 'y' else 'AA0')
        elif letter == a_circumflex:
            if previous in ('g', 'k', 'l'):
                phonemes.append('IY0')
            phonemes.append('AA0')
        elif letter == 'e':
            phonemes.append('EY0' if following == 'y' else 'EH0')
        elif letter == u'\N{LATIN SMALL LETTER G WITH BREVE}':
            pass  # produces no phoneme of its own
        elif letter == 'n':
            phonemes.append('M' if following == 'b' else 'N')
        elif letter == 'o':
            phonemes.append('OY0' if following == 'y' else 'OW0')
        elif letter == 'q':  # loan-words
            phonemes.append('K')
        elif letter == u_circumflex:
            if previous in ('g', 'k', 'l'):
                phonemes.append('IY0')
            phonemes.append('UW0')
        elif letter == 'u':
            phonemes.append('IY0' if following == 'y' else 'UH0')
        elif letter == 'v':
            phonemes.append('W' if previous in vowels else 'V')
        elif letter == 'x':  # loan-words
            phonemes.append('K')
            phonemes.append('S')
        elif letter == 'y':
            # After these vowels the 'y' was already folded into the vowel's
            # phoneme (or is dropped, for o-with-diaeresis).
            if previous not in ('a', 'e', 'o', 'u', o_diaeresis):
                phonemes.append('Y')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == " ":
            pass
        elif len(hammer(letter)) == 1:
            # hammer() is defined elsewhere; presumably it reduces the letter
            # to a plainer equivalent - TODO confirm. Retry once with its
            # result and keep only the first phoneme; the recursive flag
            # stops a second retry.
            if not recursive:
                retried = breakdownWord(hammer(letter), True)
                if retried:
                    phonemes.append(retried[0])
        previous = letter
    # Collapse runs of the same phoneme into a single entry.
    temp_phonemes = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            temp_phonemes.append(phoneme)
        previous_phoneme = phoneme
    return temp_phonemes
def syllablesToPhonemes(syllables, recursive=False):
    """Convert a word, given as a sequence of syllables, into phonemes.

    Every syllable is scanned letter by letter.  Digraphs (ng, ch, th, ij,
    uw and the vowel pairs) are only recognised inside a single syllable:
    the "previous letter" is reset at each syllable boundary and the
    look-ahead never crosses into the following syllable.

    NOTE(review): the digraph rules (ij, oe, ui, eu, ...) look like Dutch
    spelling -- confirm against the caller.

    Args:
        syllables: sequence of strings, one per syllable, already
            lower-cased.  A plain string also works; each character is
            then treated as a one-letter syllable.
        recursive: True when called on the output of ``hammer``; stops an
            unknown letter from triggering another ``hammer`` round.

    Returns:
        List of phoneme strings with consecutive duplicates removed.
    """
    vowels = frozenset('aeiou')
    phonemes = []
    simple_convert = {
        'b': 'B',
        'd': 'D',
        'f': 'F',
        'h': 'HH',
        'j': 'Y',  # SH in some words borrowed from French
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'n': 'N',
        'p': 'P',
        'r': 'R',
        's': 'S',
        't': 'T',
        'v': 'F',  # English F mixed with English V
        'w': 'V',  # closer to soft English V than the English W - pronounced back in mouth, not with pursed lips
        'z': 'Z'
    }
    syllable_count = len(syllables)
    for syllable_number, syllable in enumerate(syllables, 1):
        previous_letter = ' '  # digraphs never span a syllable boundary
        syllable_length = len(syllable)
        for letter_number, letter in enumerate(syllable, 1):
            # letter_number is 1-based, so it is also the 0-based index of
            # the letter that follows within the same syllable.
            if letter_number < syllable_length:
                next_letter = syllable[letter_number]
            else:
                next_letter = None
            ends_syllable = letter_number == syllable_length
            ends_word = ends_syllable and syllable_number == syllable_count

            if letter == previous_letter and letter not in vowels:  # double consonants
                pass
            # ===================== consonants ==========================
            elif letter == 'b' and ends_word:  # final b is devoiced
                phonemes.append("P")
            elif letter == 'd' and ends_word:  # final d is devoiced
                phonemes.append("T")
            elif letter == 'n' and next_letter == 'g':  # ng
                pass  # handled in next case
            elif letter == 'g' and previous_letter == 'n':  # ng
                phonemes.append("NG")
            elif letter == 'g':
                phonemes.append(
                    "HH")  # not accurate, but the nearest phoneme in CMU? (use K instead? put in a G anyway?)
            elif letter == 'c' and next_letter == 'h':  # ch
                pass  # handled in next case
            elif letter == 'h' and previous_letter == 'c':  # ch
                phonemes.append(
                    "HH")  # not accurate, but the nearest phoneme in CMU? (use K instead? put in a G anyway?)
            elif letter == 't' and next_letter == 'h':  # th
                pass  # handled in next case
            elif letter == 'h' and previous_letter == 't':  # th
                phonemes.append("TH")
            elif letter == 'j' and previous_letter == 'i':
                pass  # handled in vowels
            elif letter == 'w' and previous_letter == 'u':
                pass  # handled in vowels
            elif letter == 'x':  # rare, mostly borrowed words
                phonemes.append("K")
                phonemes.append("S")
            elif letter == 'q':  # rare, mostly borrowed words
                phonemes.append("K")
                phonemes.append("W")
            elif letter == 'c':
                if next_letter in ('e', 'i'):  # c before e and i is pronounced as s
                    phonemes.append("S")
                else:
                    # c before a consonant, at the end of a syllable and
                    # before a, o, u is pronounced as k
                    phonemes.append("K")
            elif letter in simple_convert:
                phonemes.append(simple_convert[letter])
            # =============== vowels ================
            # ------------ A -------------------------------
            elif letter == 'a':  # short AH, long AA
                if next_letter == 'a':  # double a
                    phonemes.append("AA0")
                elif previous_letter == 'a':  # double a handled by case above
                    pass
                elif ends_syllable:  # long a reduced to single letter
                    phonemes.append("AA0")
                elif next_letter == 'u':  # au
                    phonemes.append("AW0")  # occasionally as UW0 in some words borrowed from French
                else:
                    phonemes.append('AH0')  # like English short u (cut, hut)
            # ------------ E -------------------------------
            elif letter == 'e':  # e short EH long EY
                if next_letter == 'e':  # double e
                    phonemes.append("EY0")
                elif previous_letter == 'e':  # double e handled by case above
                    pass
                elif previous_letter == 'i':  # ie handled at i stage
                    pass
                elif previous_letter == 'o':  # oe handled at o stage
                    pass
                elif ends_syllable:  # long e reduced to single letter
                    phonemes.append("EY0")
                elif next_letter == 'u':  # eu
                    phonemes.append("ER0")  # less R than English equivalent, closer to French eu
                elif next_letter == 'i':  # ei
                    phonemes.append("AY0")
                else:
                    phonemes.append('EH0')  # closer to a (bad B AH D) than English short EH0 (bed = B EH D)
            # ------------ I -------------------------------
            elif letter == 'i':  # i short IH long IY
                if next_letter == 'i':  # double i
                    phonemes.append("IY0")
                elif previous_letter == 'u':  # ui handled at u stage
                    pass
                elif previous_letter == 'i':  # double i handled by case above
                    pass
                elif previous_letter == 'e':  # ei handled at e stage
                    pass
                # FIXME: aai / aaij are not handled
                elif ends_syllable:  # long i reduced to single letter
                    phonemes.append("IY0")
                elif next_letter == 'j':  # ij
                    phonemes.append("AY0")
                elif next_letter == 'u':  # iu
                    phonemes.append("IY0")
                elif next_letter == 'e':  # ie
                    phonemes.append("IY0")
                # FIXME: ieuw is not handled (IY UW ???)
                else:
                    phonemes.append('IH0')
            # ------------ O -------------------------------
            elif letter == 'o':  # o short AA long OW
                if next_letter == 'o':  # double o
                    phonemes.append("OW0")
                elif previous_letter == 'o':  # double o handled by case above
                    pass
                elif ends_syllable:  # long o reduced to single letter
                    phonemes.append("OW0")
                elif next_letter == 'e':  # oe
                    phonemes.append("UW0")
                elif next_letter == 'u':  # ou
                    phonemes.append("AW0")
                else:
                    phonemes.append('AO0')
            # ------------ U -------------------------------
            elif letter == 'u':
                if next_letter == 'u':  # double u
                    phonemes.append("UW0")
                elif previous_letter == 'u':  # double u handled by case above
                    pass
                elif previous_letter == 'a':  # au handled at a stage
                    pass
                elif previous_letter == 'e':  # handled at e stage
                    pass
                elif previous_letter == 'i':
                    phonemes.append("UW0")
                elif previous_letter == 'o':  # handled at o stage
                    pass
                elif ends_syllable:  # long u reduced to single letter
                    phonemes.append("UW0")
                elif next_letter == 'w':  # uw
                    phonemes.append("UW0")  # uw = EW in English DEW IY UW ???
                elif next_letter == 'i':  # ui
                    phonemes.append("UH0")  # - not accurate but the nearest phoneme in CMU? (use UW instead?)
                else:
                    phonemes.append('ER0')  # - not accurate but the nearest phoneme in CMU? (use AH instead?)
            # ------------ TREMA (looks like German Umlaut, but different meaning) -------------------------------
            elif letter == u'\N{LATIN SMALL LETTER A WITH DIAERESIS}':  # a with trema
                phonemes.append('AH0')  # like English short u (cut, hut)
            elif letter == u'\N{LATIN SMALL LETTER E WITH DIAERESIS}':  # e with trema
                phonemes.append('EH0')  # closer to a (bad B AH D) than English short EH0 (bed = B EH D)
            elif letter == u'\N{LATIN SMALL LETTER I WITH DIAERESIS}':  # i with trema
                phonemes.append('IH0')
            elif letter == u'\N{LATIN SMALL LETTER O WITH DIAERESIS}':  # o with trema
                phonemes.append('AO0')
            elif letter == u'\N{LATIN SMALL LETTER U WITH DIAERESIS}':  # u with trema
                phonemes.append('ER0')  # - not accurate but the nearest phoneme in CMU? (use AH instead?)
            elif (letter == u'\N{LATIN SMALL LETTER Y WITH DIAERESIS}'
                  or letter == u'\N{LATIN SMALL LETTER Y WITH ACUTE}'):  # y with trema / acute
                phonemes.append("AY0")
            elif len(hammer(letter)) == 1:
                # Unknown letter: let hammer() map it to a single replacement
                # character and use the first phoneme of that, one level deep.
                if not recursive:
                    phon = syllablesToPhonemes(hammer(letter), True)
                    if phon:
                        phonemes.append(phon[0])
            previous_letter = letter
    # Collapse runs of the same phoneme into a single entry.
    temp_phonemes = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            temp_phonemes.append(phoneme)
        previous_phoneme = phoneme
    return temp_phonemes
def breakdownWord(word, recursive=False):
    """Break a single word down into a list of phonemes.

    The word is lower-cased and scanned letter by letter.  Plain and
    accented vowels map to one fixed phoneme each; ``c``, ``h``, ``s`` and
    ``x`` depend on their neighbours; the remaining consonants come from a
    lookup table.  Spaces are ignored.  Unlike the sibling breakdown
    routines, repeated phonemes are NOT collapsed here.

    Relies on the module-level ``accented_a`` .. ``accented_u`` values and
    on ``hammer``.

    Args:
        word: the word to convert.
        recursive: True when called on the output of ``hammer``; stops an
            unknown letter from triggering another ``hammer`` round.

    Returns:
        List of phoneme strings, in order.
    """
    word = word.lower()
    phonemes = []
    simple_convert = {
        'b': 'B',
        'd': 'D',
        'f': 'F',
        'g': 'G',
        'j': 'JH',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'n': 'N',
        'p': 'P',
        'q': 'K',
        'r': 'R',
        's': 'S',
        't': 'T',
        'v': 'V',
        'w': 'W',
        'y': 'Y',
        'z': 'Z',
        u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'S'  # c with cedilla
    }
    word_length = len(word)
    previous = ' '
    for pos, letter in enumerate(word):
        # Letter following the current one, or None at the end of the word.
        next_letter = word[pos + 1] if pos + 1 < word_length else None

        if letter in ['a', accented_a]:  # a
            phonemes.append('AE0')
        elif letter in ['e', accented_e]:  # e
            phonemes.append('EH0')
        elif letter in ['i', accented_i]:  # i
            phonemes.append('IH0')
        elif letter in ['o', accented_o]:  # o
            phonemes.append('AO0')
        elif letter in ['u', accented_u]:  # u
            phonemes.append('UW0')

        elif letter == 'c':
            if next_letter == 'h':  # ch
                phonemes.append('CH')
            elif next_letter is not None and next_letter in [
                    'e', 'i', 'y', accented_e, accented_i
            ]:  # ce, ci
                phonemes.append('S')
            else:  # ca, co, cu, cr, any other consonant, or end of word
                phonemes.append('K')
        elif letter == 'h':
            if previous not in ['c', 's']:  # ch / sh were emitted at the c / s
                phonemes.append('HH')  # h
        elif letter == 's':
            if next_letter == 'h':
                phonemes.append('SH')  # sh
            else:
                phonemes.append('S')  # s
        elif letter == 'x':  # x
            if next_letter is None:  # word-final x
                phonemes.append('Z')
            else:
                phonemes.append('K')
                phonemes.append('S')

        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1:
            # Unknown letter: let hammer() map it to a single replacement
            # character and use the first phoneme of that, one level deep.
            if not recursive:
                phon = breakdownWord(hammer(letter[0]), True)
                if phon:
                    phonemes.append(phon[0])
        previous = letter
    return phonemes
Example #10
0
    def ImportMultipleAudioFiles(self, audioFolderPath, textPath):
        """Import every audio file in a folder as a voice of the document.

        If ``textPath`` is given, it is read as UTF-8 text containing
        ``name = spoken text`` lines.  Each audio file in ``audioFolderPath``
        whose name ends with one of ``audioFileExtensions`` becomes (or
        reuses) a voice named after the part of the file name before the
        first dot.  When a spoken text is known for that name and differs
        from the voice's current text, the audio is opened, the text is
        stored and the breakdown is run.  A result dialogue listing the
        changed voices is shown at the end.

        Args:
            audioFolderPath: folder to scan for audio files.
            textPath: path of the ``name = text`` file, or None.
        """
        language = self.languageChoice.GetStringSelection()

        spokenTexts = {}
        # if we have a text file with spoken words then build dictionary of them
        if textPath is not None:
            textFile = codecs.open(textPath, 'r', 'utf-8', 'replace')
            try:
                lines = textFile.readlines()
            finally:
                textFile.close()  # do not leak the handle, even on a read error

            for line in lines:
                # hammer() is applied to tame Mac apostrophes, curly quotes
                # and similar annoying characters
                line = hammer(line)

                print(line)

                if "=" not in line:
                    continue

                if "#" in line:
                    # NOTE(review): this keeps the text AFTER the first '#'
                    # (up to the next one), not the text before it -- confirm
                    # that is the intended file format.
                    line = line.split("#")[1]
                    if "=" not in line:
                        # The '=' was in front of the '#'; nothing left to
                        # parse (previously an IndexError below).
                        continue

                splitted = line.split("=")

                name = hammer(splitted[0].strip())
                spokenTexts[name] = hammer(splitted[1].strip())

        print("creating new doc")

        if self.doc is None:
            self.CreateNewDoc()

        resultDialogue = MultiImportResultDialogue(self)

        print("after new doc")

        # next go through audio folder
        for fileName in os.listdir(audioFolderPath):
            for extension in audioFileExtensions:
                if not fileName.endswith(extension):
                    continue

                absAudioPath = os.path.join(audioFolderPath, fileName)
                voiceName = hammer(fileName.split(".")[0])
                voice = None

                # if there already is a voice in the document that has the
                # same name then we use that (the last match wins)
                for exisitingVoice in self.doc.voices:
                    if exisitingVoice.name == voiceName:
                        voice = exisitingVoice

                if voice is None:
                    voice = LipsyncVoice(self.doc)
                    voice.name = voiceName
                    self.doc.voices.append(voice)
                    self.voiceList.Insert(voice.name,
                                          self.voiceList.GetCount())

                # is the audio to be found in the list of names we have?
                if voiceName in spokenTexts:
                    print("found it!")
                    # has what is being said changed?
                    if voice.text != spokenTexts[voiceName]:
                        print("and it has changed")

                        voice.OpenAudio(absAudioPath)
                        voice.text = spokenTexts[voiceName]
                        voice.RunBreakdown(self, language, self.langman)
                        resultDialogue.Changed_Files_List.Insert(
                            voice.name, 0)

        if len(self.doc.voices) >= 1:
            self.doc.currentVoice = self.doc.voices[0]
            print("setting voice")
            self.SetVoice(self.doc.currentVoice)
            print("after setting voice")
            self.waveformView.UpdateDrawing()
            self.mouthView.DrawMe()

        resultDialogue.ShowModal()
        resultDialogue.Destroy()

        self.EnableGUI()
0
def breakdownWord(input_word, recursive=False):
    """Break a single word down into a list of phonemes.

    The word is lower-cased first, which reduces the number of letter
    combinations that have to be handled.  ``c``, ``g``, ``i``, ``h``,
    ``j`` and ``l`` are context dependent (ci / ce / cci, gi / gli, ch);
    every other letter is looked up in the module-level
    ``unconditional_conversions`` table.  Spaces are ignored.

    Args:
        input_word: the word to convert.
        recursive: True when called on the output of ``hammer``; stops an
            unknown letter from triggering another ``hammer`` round.

    Returns:
        List of phoneme strings, in order.  Repeated phonemes are kept.
    """
    word = input_word.lower()
    word_length = len(word)
    previous = u''
    breakdown_word = []
    for word_index, letter in enumerate(word):
        # Look-ahead letters; empty string past the end of the word so a
        # trailing c / g / l can never index out of range.
        next_letter = word[word_index + 1] if word_index + 1 < word_length else u''
        after_next = word[word_index + 2] if word_index + 2 < word_length else u''

        if letter == u'c':
            if next_letter == u'i':  # ci
                breakdown_word.append('EH0')
            elif next_letter == u'e':  # ce
                breakdown_word.append('EH0')
            elif next_letter == u'c' and after_next == u'i':  # cci
                breakdown_word.append('EH0')
            else:
                breakdown_word.append('K')
        elif letter == u'g':
            if next_letter == u'i':  # gi
                breakdown_word.append('JH')
            elif next_letter == u'l' and after_next == u'i':  # gli
                breakdown_word.append('JH')
            else:
                breakdown_word.append('G')
        elif letter == u'i':
            # ci, gi: the i was already covered when the c / g was handled
            if previous != u'c' and previous != u'g':
                breakdown_word.append('EH0')
        elif letter == u'\N{LATIN SMALL LETTER I WITH ACUTE}':
            # c + i-acute, g + i-acute: covered at the c / g
            if previous != u'c' and previous != u'g':
                breakdown_word.append('EH1')
        elif letter == u'\N{LATIN SMALL LETTER I WITH GRAVE}':
            # c + i-grave, g + i-grave: covered at the c / g
            if previous != u'c' and previous != u'g':
                breakdown_word.append('EH1')
        elif letter == u'h':
            # ch: silent, the c already produced its phoneme
            if previous != u'c':
                breakdown_word.append('HH')
        elif letter == u'j':
            if word_index > 0:  # j anywhere but at the start of the word
                breakdown_word.append('JH')
            else:
                breakdown_word.append('EH0')
        elif letter == u'l':
            # gli: the l was already covered when the g was handled
            if not (previous == u'g' and next_letter == u'i'):
                breakdown_word.append('L')
        elif letter in unconditional_conversions.keys():
            breakdown_word.append(unconditional_conversions[letter])
        elif letter == " ":
            pass
        elif len(hammer(letter)) == 1:
            # Unknown letter: let hammer() map it to a single replacement
            # character and use the first phoneme of that, one level deep.
            if not recursive:
                phon = " ".join(breakdownWord(hammer(letter), True))
                if phon:
                    breakdown_word.append(phon.split()[0])
        previous = letter
    return breakdown_word
def breakdownSwedishSyllable(word, recursive=False, phonetic=False):
    """Break a Swedish word (or syllable) down into a list of phonemes.

    The text is lower-cased and scanned letter by letter.  Most consonants
    depend on the previous and/or following letter (sj, stj, skj, tj, ng,
    rs, ...); a handful of whole-word exceptions are hard-coded.  Letters
    without special rules come from a lookup table.  Spaces are ignored.

    Args:
        word: the text to convert.
        recursive: True when called on the output of ``hammer``; stops an
            unknown letter from triggering another ``hammer`` round.
        phonetic: when True, ``e`` always yields EH0 and a-diaeresis always
            yields AE0, skipping their position-dependent rules.

    Returns:
        List of phoneme strings with consecutive duplicates removed.
    """
    word = word.lower()
    a_diaeresis = u'\N{LATIN SMALL LETTER A WITH DIAERESIS}'
    o_diaeresis = u'\N{LATIN SMALL LETTER O WITH DIAERESIS}'
    a_ring = u'\N{LATIN SMALL LETTER A WITH RING ABOVE}'
    # Vowel groups that soften / harden a preceding c, g or k.
    soft_vowels = ['e', 'i', 'y', a_diaeresis, o_diaeresis]
    hard_vowels = ['a', 'o', 'u', a_ring]
    # Whole-word exceptions for 'k', written as escapes so they do not
    # depend on the source-file encoding.
    manniska = u'm\N{LATIN SMALL LETTER A WITH DIAERESIS}nniska'
    manniskor = u'm\N{LATIN SMALL LETTER A WITH DIAERESIS}nniskor'
    phonemes = []
    simple_convert = {
        u'\N{LATIN SMALL LETTER E WITH ACUTE}': 'EY0',
        'a': 'AH0',  # not exact - AO0 ??
        'b': 'B',
        'f': 'F',
        'm': 'M',
        'o': 'UH0',  # compromise, actually UW0 or AA0 (not), sometimes AO0
        'q': 'K',
        'v': 'V',
        'w': 'V',
        'z': 'S',
        u'\N{LATIN SMALL LETTER A WITH RING ABOVE}': 'AO0',  # not exact
        u'\N{LATIN SMALL LETTER O WITH DIAERESIS}': 'ER0',
    }
    pos = 0
    previous = ' '
    for letter in word:
        # Letter following the current one, or None at the end of the word.
        following = word[pos + 1] if len(word) > pos + 1 else None

        if letter == 'c':
            if following == 'c':
                pass  # cc, handle on next case
            elif previous == 'c' and following in soft_vowels:
                phonemes.append('K')
                phonemes.append('S')
            elif following in soft_vowels:
                phonemes.append('S')
            elif following == 'h':
                phonemes.append('SH')  # sometimes 'K' as in English 'chorus', but no rule
            else:  # before a, o, u, a-ring, a consonant, or at the end
                phonemes.append('K')
        elif letter == 'd':
            if pos == 0 and following == 'j':  # dj at beginning of word
                pass  # same as j alone
            else:
                phonemes.append('D')
        elif letter == 'e':
            if phonetic:
                phonemes.append('EH0')
            elif len(word) == pos + 2 and following == 'r':  # ends in er
                phonemes.append('AE0')
            else:
                phonemes.append('EH0')  # sometimes 'IY0', sometimes 'EY0'
        elif letter == 'g':
            if previous in ['l', 'r']:
                phonemes.append('Y')
            elif len(word) > pos + 2 and word[pos + 1] == 'i' and word[pos + 2] == o_diaeresis:
                phonemes.append('SH')
            elif following == 'n' and previous in ['a', 'o', 'u', 'e', 'i', 'y', a_diaeresis, o_diaeresis]:
                phonemes.append('NG')
            elif previous == 'n':  # ng
                phonemes.append('NG')
            elif following == 'j':  # gj
                pass  # same as 'j' alone
            elif len(word) == pos + 2 and following == 'e':  # ends in 'ge' - French loan-word such as garage ?
                phonemes.append('SH')
            elif pos == 0 and following in soft_vowels:
                # ??? if e is unstressed (how to tell?), pronounce as 'G'
                phonemes.append('Y')
            elif pos == 0 and following in hard_vowels:
                phonemes.append('G')
            elif previous == 'g':
                pass
            else:
                phonemes.append('G')
        elif letter == 'h':
            if previous == 'c':
                pass  # handled under c
            elif following == 'j':
                pass  # same as 'j' alone
            elif pos == 1 and previous == 's':  # probably a foreign loan-word
                phonemes.append('SH')
            else:
                phonemes.append('HH')
        elif letter == 'i':
            if previous == 'g' and following == o_diaeresis:
                pass  # gi + o-diaeresis was emitted as SH at the g
            elif previous == 's' and following == 'o':  # sio e.g mission
                phonemes.append('UH0')
            else:
                phonemes.append('IY0')  # sometimes 'IH0'
        elif letter == 'k':  # needs to be handled before j to handle skj sound
            if pos == 0 and word in [u'kefir', u'kex', u'kille', u'kis', u'kissa', u'kisse']:
                phonemes.append('K')
            elif pos == 0 and following in soft_vowels:
                phonemes.append('CH')
            elif word == manniska:
                phonemes.append('SH')
            elif word == manniskor:
                phonemes.append('SH')
            elif len(word) == pos + 1 and previous == 's':  # ends in SK
                phonemes.append('S')
                phonemes.append('K')
            elif following == 'j':
                phonemes.append('CH')  # more Finnish-Swedish than Swedish ???
            elif len(word) == pos + 1 and previous == 'c':
                pass
            elif previous == 's' and following in hard_vowels:
                phonemes.append('S')
                phonemes.append('K')
            elif previous == 's' and pos == 1:  # sk at beginning of word
                phonemes.append('SH')
            else:
                phonemes.append('K')
        elif letter == 't':  # needs to be handled before j to handle stj sound
            if previous == 's' and following == 'j':
                phonemes.append('SH')
            elif previous == 't' and len(word) == pos + 1:
                pass
            elif following == 'j':  # tj
                pass  # handled under j
            else:
                phonemes.append('T')
        elif letter == 'j':
            if previous == 's':
                phonemes.append('SH')
            elif previous == 't':
                # Only look two letters back when they exist; at pos == 1
                # a negative index would wrap to the end of the word.
                if pos >= 2 and word[pos - 2] == 's':  # stj, handled under 't'
                    pass
                else:
                    phonemes.append('CH')
            elif previous == 'k':
                pass  # handled under k
            else:
                phonemes.append('Y')
        elif letter == 'l':
            if following == 'j':
                pass  # same as 'j' alone
            else:
                phonemes.append('L')
        elif letter == 'n':
            if following == 'g':  # ng
                pass  # handled under 'g'
            elif following == 'k':  # nk
                phonemes.append('NG')
            else:
                phonemes.append('N')
        elif letter == 'p':
            if previous == 'p':
                pass
            else:
                phonemes.append('P')
        elif letter == 'r':
            if following == 's':
                pass  # handled under s
            else:
                phonemes.append('R')
        elif letter == 's':
            if len(word) > pos + 2 and word[pos + 1] == 'c' and word[pos + 2] == 'h':
                pass  # handled under 'c'
            elif len(word) > pos + 2 and word[pos + 1] == 't' and word[pos + 2] == 'j':
                pass  # handled under 't'
            elif following == 'k':
                pass  # handled under 'k'
            elif following == 'j':
                pass  # handled under 'j'
            elif following == 's':
                pass
            elif following == 'i' and len(word) > pos + 2 and word[pos + 2] == 'o':  # might need more breakdown
                phonemes.append('SH')
            elif pos == 0 and following == 'h':
                pass  # handled under 'h'
            elif previous == 'r':
                phonemes.append('SH')  # not entirely accurate, use HH ??
            else:
                phonemes.append('S')
        elif letter == 'u':
            if previous == 'q':
                phonemes.append('V')
            else:
                phonemes.append('UW0')  # inaccurate, no accurate CMU equivalent
        elif letter == 'x':
            phonemes.append('K')
            phonemes.append('S')
        elif letter == 'y':
            if word in [u'yoga', u'yoghurt']:
                phonemes.append('Y')
            elif word == u'fyrtio':
                phonemes.append('ER0')
            else:
                phonemes.append('UW0')  # not exact
        elif letter == a_diaeresis:
            if phonetic:
                phonemes.append('AE0')
            elif following == 'r':
                phonemes.append('AE0')  # not exact, and skips exceptions
            else:
                phonemes.append('EH0')  # not exact, and skips exceptions
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == " ":
            pass
        elif len(hammer(letter)) == 1:
            # Unknown letter: let hammer() map it to a single replacement
            # character and break that down, one level deep.  The result is
            # appended as ONE space-joined entry.
            if not recursive:
                phon = " ".join(breakdownSwedishSyllable(hammer(letter), True, phonetic))
                if phon:
                    phonemes.append(phon)
        pos += 1
        previous = letter
    # Collapse runs of the same phoneme into a single entry.
    temp_phonemes = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            temp_phonemes.append(phoneme)
        previous_phoneme = phoneme
    return temp_phonemes
Example #13
0
def breakdownWord(word, recursive=False):
    """Approximate a word as a list of phoneme codes, letter by letter.

    The rules (vowel pairs ai/ei/oi/ui/yi/au/ou/uo/ie, the "ng" cluster and
    the letters a-umlaut / o-umlaut) look like Finnish orthography --
    presumably this is the Finnish breakdown; confirm against the caller.

    :param word: text to convert; it is lower-cased before processing.
    :param recursive: True when the call is itself a fallback for a single
        unknown character, which suppresses a further level of fallback.
    :returns: list of phoneme strings with consecutive duplicates collapsed.
    """
    text = word.lower()

    # Letters that always map to the same phoneme, whatever their neighbours.
    letter_to_phoneme = {
        'b': 'B',
        'c': 'K',  # S ???
        'd': 'D',
        'f': 'F',
        'h': 'HH',
        'j': 'Y',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'p': 'P',
        'q': 'K',
        'r': 'R',
        's': 'S',
        't': 'T',
        'v': 'V',
        'w': 'V',
        'z': 'Z',
        # letters found in foreign and borrowed words and names
        u'\N{LATIN SMALL LETTER S WITH CARON}': 'SH',
        u'\N{LATIN SMALL LETTER Z WITH CARON}': 'ZH',
        u'\N{LATIN SMALL LETTER A WITH ACUTE}': 'AA0',  # ???
        u'\N{LATIN SMALL LETTER A WITH GRAVE}': 'AA0',
        u'\N{LATIN SMALL LETTER AE}': 'AE0',  # Norwegian / Danish
        u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'SH',  # French, etc
        u'\N{LATIN SMALL LETTER C WITH CARON}': 'S',  # ??? - Northern Sami
        u'\N{LATIN SMALL LETTER D WITH STROKE}': 'D',  # ??? - Northern Sami
        u'\N{LATIN SMALL LETTER ETH}': 'DH',  # Icelandic
        u'\N{LATIN SMALL LETTER E WITH ACUTE}': 'EY0',
        u'\N{LATIN SMALL LETTER E WITH DIAERESIS}': 'EH0',  # ??? - scientific names
        u'\N{LATIN SMALL LETTER G WITH STROKE}': 'G',  # ??? - other Sami
        u'\N{LATIN SMALL LETTER G WITH BREVE}': 'G',  # ??? - other Sami
        u'\N{LATIN SMALL LETTER N WITH TILDE}': 'N Y',  # Spanish
        u'\N{LATIN SMALL LETTER ENG}': 'N',  # Northern Sami
        u'\N{LATIN SMALL LETTER O WITH STROKE}': 'ER0',  # ??? - Norwegian / Danish
        u'\N{LATIN SMALL LETTER O WITH TILDE}': 'ER0',  # ??? - Estonian
        u'\N{LATIN SMALL LETTER SHARP S}': 'S',  # German
        u'\N{LATIN SMALL LETTER T WITH STROKE}': 'T',  # Northern Sami
        u'\N{LATIN SMALL LETTER THORN}': 'TH',  # Icelandic
        # Never consulted: o-umlaut has its own branch in the loop below.
        u'\N{LATIN SMALL LETTER O WITH DIAERESIS}': 'ER0',  # ???
    }

    # Vowel pairs are emitted when their SECOND letter is reached; the first
    # letter of the pair is skipped on its own turn.
    pair_ending_in_i = {
        'a': 'AY0',  # ai
        'e': 'EY0',  # ei
        'o': 'OY0',  # oi
        'u': 'UW0',  # ui
        'y': 'IY0',  # yi
    }
    pair_ending_in_u = {
        'a': 'AW0',  # au
        'o': 'OW0',  # ou (AO ???)
    }
    a_umlaut = u'\N{LATIN SMALL LETTER A WITH DIAERESIS}'
    o_umlaut = u'\N{LATIN SMALL LETTER O WITH DIAERESIS}'

    raw = []
    for index, letter in enumerate(text):
        before = text[index - 1] if index else ' '
        after = text[index + 1] if index + 1 < len(text) else None

        if letter == 'a':
            if after not in ('i', 'u'):  # otherwise handled under i / u
                raw.append('AA0')
        elif letter == 'e':
            if after == 'i':
                pass  # handled under i
            elif before == 'i':  # ie
                raw.append('IY0')  # ???
            else:
                raw.append('EH0')
        elif letter == 'i':
            raw.append(pair_ending_in_i.get(before, 'IH0'))
        elif letter == 'o':
            if after in ('i', 'u'):
                pass  # handled under i / u
            elif before == 'u':  # uo
                raw.append('OW0')  # ???
            else:
                raw.append('OY0')
        elif letter == 'u':
            if after == 'i':
                pass  # handled under i
            elif before in pair_ending_in_u:
                raw.append(pair_ending_in_u[before])
            else:
                raw.append('UH0')
        elif letter == 'y':
            if after != 'i':  # yi is handled under i
                raw.append('UW0')  # ???
        elif letter == a_umlaut:
            raw.append('AE0')
        elif letter == o_umlaut:
            raw.append('ER0')  # ???
        elif letter == 'g':
            raw.append('NG' if before == 'n' else 'G')
        elif letter == 'n':
            if after != 'g':  # ng is emitted as a unit under g
                raw.append('N')
        elif letter in letter_to_phoneme:
            raw.append(letter_to_phoneme[letter])
        elif len(hammer(letter)) == 1:
            # Unknown character: `hammer` is defined elsewhere in the file and
            # presumably reduces it to a plain letter (confirm). Retry once on
            # that result and keep only its first phoneme.
            if not recursive:
                fallback = breakdownWord(hammer(letter), True)
                if fallback:
                    raw.append(fallback[0])

    # Collapse runs of the same phoneme (e.g. doubled consonants).
    return [code for earlier, code in zip([" "] + raw, raw) if code != earlier]
def breakdownWord(word, recursive=False):
    """Approximate a word as a list of phoneme codes, letter by letter.

    The digraph rules (qu/gu before e or i, sc/xc, nh, nasal m/n) and the
    example words in the comments look like Portuguese -- presumably this is
    the Portuguese breakdown; confirm against the caller.

    Relies on two names defined elsewhere in the file: ``setvowels`` (an
    iterable of vowel characters) and ``hammer`` (a per-character fallback
    helper).

    :param word: text to convert; it is lower-cased before processing.
    :param recursive: True when the call is itself a fallback for a single
        unknown character, which suppresses a further level of fallback.
    :returns: list of phoneme strings with consecutive duplicates collapsed.
    """
    word = word.lower()
    # dict.has_key exists only on Python 2; a frozenset membership test gives
    # the same answer and works on both Python 2 and Python 3.
    isvowel = frozenset(setvowels).__contains__
    phonemes = []
    simple_convert = {
        'b': 'B',
        'd': 'D',
        'f': 'F',
        'j': 'ZH',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'n': 'N',
        'p': 'P',
        'q': 'K',
        'r': 'R',
        't': 'T',
        'v': 'V',
        'w': 'W',
        'y': 'IY0',
        'z': 'Z',
        u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'S',  # c-cedilla
    }
    # e, i and their accented forms (e/i with acute and circumflex): the
    # vowels that soften a preceding c / g and silence the u in qu / gu.
    soft_vowels = (
        'e', 'i',
        u'\N{LATIN SMALL LETTER E WITH ACUTE}',
        u'\N{LATIN SMALL LETTER I WITH ACUTE}',
        u'\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}',
        u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}',
    )
    # i, o, u with acute / circumflex, plus o with tilde.
    nasal_m_vowels = (
        'i', 'o', 'u',
        u'\N{LATIN SMALL LETTER I WITH ACUTE}',
        u'\N{LATIN SMALL LETTER O WITH ACUTE}',
        u'\N{LATIN SMALL LETTER U WITH ACUTE}',
        u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}',
        u'\N{LATIN SMALL LETTER O WITH CIRCUMFLEX}',
        u'\N{LATIN SMALL LETTER O WITH TILDE}',
    )

    for pos, letter in enumerate(word):
        previous = word[pos - 1] if pos else ' '
        following = word[pos + 1] if pos + 1 < len(word) else None
        followed_by_consonant = following is not None and not isvowel(following)

        # A
        if letter in ['a', u'\N{LATIN SMALL LETTER A WITH ACUTE}', u'\N{LATIN SMALL LETTER A WITH GRAVE}',
                      u'\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}']:
            phonemes.append('AA0')
        elif letter == u'\N{LATIN SMALL LETTER A WITH TILDE}':
            phonemes.append('AE0')
        # E
        elif letter in ['e', u'\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}']:
            phonemes.append('EY0')
        elif letter == u'\N{LATIN SMALL LETTER E WITH ACUTE}':
            phonemes.append('EH0')
        # I
        elif letter in ['i', u'\N{LATIN SMALL LETTER I WITH ACUTE}', u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}']:
            phonemes.append('IY0')
        # O
        elif letter in ['o', u'\N{LATIN SMALL LETTER O WITH CIRCUMFLEX}']:
            phonemes.append('OW0')
        elif letter == u'\N{LATIN SMALL LETTER O WITH ACUTE}':
            phonemes.append('OY0')
        elif letter == u'\N{LATIN SMALL LETTER O WITH TILDE}':
            phonemes.append('AW0')
        # U
        elif letter in ['u', u'\N{LATIN SMALL LETTER U WITH ACUTE}']:
            # Consonant digraphs qu and gu followed by e or i: aquilo,
            # questão, quilo, querida, guerra, águia. The u is silent there,
            # so the consonant is re-emitted (and later de-duplicated).
            # Known gap: words where that u IS pronounced (cinquenta,
            # frequente, tranquilo, linguiça, aguentar).
            if previous in ('q', 'g') and following in soft_vowels:
                phonemes.append('K' if previous == 'q' else 'G')
            else:
                phonemes.append('UW0')
        # C
        elif letter == 'c':
            if previous in ('s', 'x'):
                # digraphs sc (ascender) and xc (excelente); the s / x itself
                # was skipped on its own turn.
                if following in soft_vowels:
                    phonemes.append('S')
                else:
                    phonemes.append('S')
                    phonemes.append('K')
            elif following in soft_vowels:  # ce, ci
                phonemes.append('S')
            else:
                phonemes.append('K')
        # G
        elif letter == 'g':
            if following in soft_vowels:  # ge, gi
                phonemes.append('ZH')
            else:
                phonemes.append('G')
        # H
        elif letter == 'h':
            # Silent, except that the digraph nh is emitted here as N.
            if previous == 'n':
                phonemes.append('N')
        # M
        elif letter == 'm':
            # Nasal vowel digraphs (im, om, um ...): the m is not sounded.
            # NOTE(review): word[-1] tests the LAST letter of the word, not
            # whether this m is last, so a medial m is also dropped when the
            # word happens to end in m. Kept as in the original -- confirm
            # the intended rule before changing it.
            if (previous in nasal_m_vowels and word[-1] == 'm') or followed_by_consonant:
                pass
            else:
                phonemes.append('M')
        # N
        elif letter == 'n':
            if following == 'h':
                pass  # nh is handled under h
            # NOTE(review): same word[-1] caveat as for m above.
            elif (isvowel(previous) and word[-1] == 'n') or followed_by_consonant:
                pass  # nasal vowel digraphs an en in on un
            else:
                phonemes.append('N')
        # S
        elif letter == 's':
            if following == 'c':
                pass  # sc is handled under c
            elif isvowel(previous) and following is not None and isvowel(following):
                phonemes.append('Z')  # s between two vowels, as in casa
            else:
                phonemes.append('S')
        # X
        elif letter == 'x':
            if following == 'c':
                pass  # xc is handled under c
            else:
                # Some words pronounce x as K S instead (e.g. táxi).
                phonemes.append('SH')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1:
            # Unknown character: retry once on whatever `hammer` returns and
            # keep only the first phoneme of that result.
            if not recursive:
                phon = breakdownWord(hammer(letter), True)
                if phon:
                    phonemes.append(phon[0])

    # Collapse runs of the same phoneme (doubled letters, re-emitted digraphs).
    temp_phonemes = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            temp_phonemes.append(phoneme)
        previous_phoneme = phoneme
    return temp_phonemes
def breakdownWord(word, recursive=False):
    """Approximate a word as a list of phoneme codes, letter by letter.

    Handles Latin transliteration (ch, sh, zh digraphs) and Cyrillic letters,
    including the Ukrainian-specific ones -- presumably this is the Ukrainian
    breakdown; confirm against the caller.

    The word is lower-cased first, so only lower-case letters need rules.
    Earlier revisions also carried capital-letter branches and table entries;
    they could never match and have been removed. NOTE(review): the removed
    capital YU / YA branches were context-sensitive (a leading Y only after a
    vowel, an apostrophe or at word start); if that was the intended rule for
    the lower-case letters too, it still needs to be implemented.

    :param word: text to convert; it is lower-cased before processing.
    :param recursive: True when the call is itself a fallback for a single
        unknown character, which suppresses a further level of fallback.
    :returns: list of phoneme strings with consecutive duplicates collapsed.
    """
    word = word.lower()
    phonemes = []
    # Letters that map to one phoneme regardless of context.
    simple_convert = {
        'a': 'AA0',
        'b': 'B',
        'v': 'V',
        'g': 'G',
        'd': 'D',
        'e': 'EH0',
        'j': 'Y',
        'y': 'IH0',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'n': 'N',
        'o': 'AO0',
        'p': 'P',
        'r': 'R',
        't': 'T',
        'f': 'F',
        'x': 'HH',  # use 'Y' ?? 'K'??
        u'\N{LATIN SMALL LETTER S WITH CARON}': 'SH',
        u'\N{LATIN SMALL LETTER Z WITH CARON}': 'ZH',
        # Cyrillic
        u'\N{CYRILLIC SMALL LETTER A}': 'AA0',
        u'\N{CYRILLIC SMALL LETTER BE}': 'B',
        u'\N{CYRILLIC SMALL LETTER VE}': 'V',
        u'\N{CYRILLIC SMALL LETTER GHE}': 'G',
        u'\N{CYRILLIC SMALL LETTER GHE WITH UPTURN}': 'G',
        u'\N{CYRILLIC SMALL LETTER DE}': 'D',
        u'\N{CYRILLIC SMALL LETTER IE}': 'EH0',
        u'\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}': 'IH0',
        u'\N{CYRILLIC SMALL LETTER ZHE}': 'ZH',
        u'\N{CYRILLIC SMALL LETTER ZE}': 'Z',
        u'\N{CYRILLIC SMALL LETTER SHORT I}': 'IY0',  # 'Y' ?
        u'\N{CYRILLIC SMALL LETTER I}': 'IY0',
        u'\N{CYRILLIC SMALL LETTER KA}': 'K',
        u'\N{CYRILLIC SMALL LETTER EL}': 'L',
        u'\N{CYRILLIC SMALL LETTER EM}': 'M',
        u'\N{CYRILLIC SMALL LETTER EN}': 'N',
        u'\N{CYRILLIC SMALL LETTER O}': 'AO0',
        u'\N{CYRILLIC SMALL LETTER PE}': 'P',
        u'\N{CYRILLIC SMALL LETTER ER}': 'R',
        u'\N{CYRILLIC SMALL LETTER ES}': 'S',
        u'\N{CYRILLIC SMALL LETTER TE}': 'T',
        u'\N{CYRILLIC SMALL LETTER U}': 'UH0',
        u'\N{CYRILLIC SMALL LETTER EF}': 'F',
        u'\N{CYRILLIC SMALL LETTER HA}': 'HH',  # 'Y'?? 'K'??
        u'\N{CYRILLIC SMALL LETTER CHE}': 'CH',
        u'\N{CYRILLIC SMALL LETTER SHA}': 'SH',
        # u'\N{CYRILLIC SMALL LETTER SOFT SIGN}': 'Y',
    }
    # Single Cyrillic letters that expand to two phonemes.
    two_phoneme_letters = {
        u'\N{CYRILLIC SMALL LETTER SHCHA}': ('SH', 'CH'),
        u'\N{CYRILLIC SMALL LETTER TSE}': ('T', 'S'),
        u'\N{CYRILLIC SMALL LETTER YA}': ('Y', 'AO0'),  # not if unstressed ?
        u'\N{CYRILLIC SMALL LETTER YU}': ('Y', 'UW0'),
        u'\N{CYRILLIC SMALL LETTER YI}': ('Y', 'IY0'),
        u'\N{CYRILLIC SMALL LETTER UKRAINIAN IE}': ('Y', 'EH0'),
    }

    for pos, letter in enumerate(word):
        previous = word[pos - 1] if pos else ' '
        followed_by_h = word[pos + 1:pos + 2] == 'h'

        if letter == 'c':
            # Only the digraph ch is sounded; a bare c produces nothing.
            if followed_by_h:
                phonemes.append('CH')
        elif letter == 'i':
            phonemes.append('IY0' if previous == 'j' else 'IH0')
        elif letter == 'h':
            # After z, s or c the h belongs to a digraph emitted (or
            # deliberately skipped) on the previous letter's turn.
            if previous not in ('z', 's', 'c'):
                phonemes.append('HH')
        elif letter == 's':
            phonemes.append('SH' if followed_by_h else 'S')
        elif letter == 'u':
            phonemes.append('UW0' if previous == 'j' else 'UH0')
        elif letter == 'z':
            phonemes.append('ZH' if followed_by_h else 'Z')
        elif letter in two_phoneme_letters:
            phonemes.extend(two_phoneme_letters[letter])
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1:
            # Unknown character: retry once on whatever `hammer` (defined
            # elsewhere in the file) returns, keeping only its first phoneme.
            if not recursive:
                phon = breakdownWord(hammer(letter), True)
                if phon:
                    phonemes.append(phon[0])

    # Collapse runs of the same phoneme.
    temp_phonemes = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            temp_phonemes.append(phoneme)
        previous_phoneme = phoneme
    return temp_phonemes
def breakdownWord(word, recursive=False):
    """Approximate a word as a list of phoneme codes, letter by letter.

    The rules (soft c/g before e and i, the clusters sc / gl / gn / gu / qu,
    doubled z) look like Italian orthography -- presumably this is the
    Italian breakdown; confirm against the caller.

    Relies on two names defined elsewhere in the file: ``vowels`` (an
    iterable of vowel characters) and ``hammer`` (a per-character fallback
    helper).

    :param word: text to convert; it is lower-cased before processing.
    :param recursive: True when the call is itself a fallback for a single
        unknown character, which suppresses a further level of fallback.
    :returns: list of phoneme strings with consecutive duplicates collapsed.
    """
    word = word.lower()
    # dict.has_key exists only on Python 2; a frozenset membership test gives
    # the same answer and works on both Python 2 and Python 3.
    isvowel = frozenset(vowels).__contains__
    phonemes = []
    simple_convert = {
        'b': 'B',
        'd': 'D',
        'f': 'F',
        'j': 'IY0',  # actual pronunciation varies with word origin
        'k': 'K',  # actual pronunciation varies with word origin
        'l': 'L',
        'm': 'M',
        'p': 'P',
        'q': 'K',
        'r': 'R',
        't': 'T',
        'v': 'V',
        'w': 'W',  # actual pronunciation varies with word origin
        'x': 'K S',  # actual pronunciation varies with word origin
        'y': 'IY0',  # actual pronunciation varies with word origin
    }
    # Plain and accented (acute, grave, circumflex) forms of each vowel.
    a_forms = (u'a', u'\N{LATIN SMALL LETTER A WITH ACUTE}',
               u'\N{LATIN SMALL LETTER A WITH GRAVE}',
               u'\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}')
    e_forms = ('e', u'\N{LATIN SMALL LETTER E WITH ACUTE}',
               u'\N{LATIN SMALL LETTER E WITH GRAVE}',
               u'\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}')
    i_forms = ('i', u'\N{LATIN SMALL LETTER I WITH ACUTE}',
               u'\N{LATIN SMALL LETTER I WITH GRAVE}',
               u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}')
    o_forms = ('o', u'\N{LATIN SMALL LETTER O WITH ACUTE}',
               u'\N{LATIN SMALL LETTER O WITH GRAVE}',
               u'\N{LATIN SMALL LETTER O WITH CIRCUMFLEX}')
    u_forms = ('u', u'\N{LATIN SMALL LETTER U WITH ACUTE}',
               u'\N{LATIN SMALL LETTER U WITH GRAVE}',
               u'\N{LATIN SMALL LETTER U WITH CIRCUMFLEX}')
    front_vowels = e_forms + i_forms  # soften a preceding c / sc / gl
    back_vowels = a_forms + o_forms + u_forms
    voiced_consonants = ('b', 'd', 'g', 'l', 'm', 'n', 'r', 'v')

    for pos, letter in enumerate(word):
        previous = word[pos - 1] if pos else ' '
        following = word[pos + 1] if pos + 1 < len(word) else None
        followed_by_vowel = following is not None and isvowel(following)

        if letter in a_forms:
            phonemes.append('AA0')
        elif letter == 'c':
            if previous == 's':
                # sc: the s was skipped on its own turn.
                if following in front_vowels:
                    phonemes.append('SH')
                else:
                    phonemes.append('S')
                    phonemes.append('K')
            elif following in front_vowels:
                phonemes.append('CH')
            else:
                phonemes.append('K')
        elif letter in ['e', u'\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}']:
            phonemes.append('EH0')  # long is "EY0"
        elif letter == u'\N{LATIN SMALL LETTER E WITH ACUTE}':
            phonemes.append('EY0')
        elif letter == u'\N{LATIN SMALL LETTER E WITH GRAVE}':
            phonemes.append('EH0')
        elif letter == 'g':
            if following in e_forms:
                phonemes.append('JH')
            elif following in i_forms:
                pass  # handled under 'i'
            elif following == 'h':
                phonemes.append('G')
            elif following == 'l':
                pass  # handled under 'l'
            elif following == 'n':
                pass  # handled under 'n'
            elif following == 'u':
                phonemes.append('G')
                phonemes.append('W')
            else:
                phonemes.append('G')
        elif letter == 'h':
            pass  # always silent on its own
        elif letter in i_forms:
            if previous == 'c' and followed_by_vowel:
                pass  # ci + vowel: the i only softens the c
            elif previous == 'g':
                if following in back_vowels:  # or isvowel(following) ??
                    phonemes.append('JH')
                else:
                    phonemes.append('G')
                    phonemes.append('IY0')
            else:
                phonemes.append('IY0')
        elif letter == 'l':
            if previous == 'g':
                # gl: the g was skipped on its own turn.
                if following in front_vowels:
                    phonemes.append('L')
                    phonemes.append('IY0')
                else:
                    # Emit in spelling order (G then L), matching how the
                    # gn branch below emits G then N.
                    phonemes.append('G')
                    phonemes.append('L')
            else:
                phonemes.append('L')
        elif letter == 'n':
            if previous == 'g':
                # gn: the g was skipped on its own turn.
                if followed_by_vowel:
                    phonemes.append('N')
                    phonemes.append('Y')
                else:
                    phonemes.append('G')
                    phonemes.append('N')
            else:
                phonemes.append('N')
        elif letter in o_forms:
            phonemes.append('OW0')  # when closed, when open as 'AO0' ?
        elif letter == 's':
            # A word-initial s used to fall into a branch that emitted
            # nothing unless a vowel or c/f/p/q/s/t followed, silently
            # dropping the letter (e.g. before b or d, or a lone "s"). The
            # rules are now flat so every s that is not part of sc is sounded.
            if following == 'c':
                pass  # handled under c
            elif isvowel(previous) and followed_by_vowel:
                phonemes.append('Z')
            elif following in voiced_consonants:
                phonemes.append('Z')
            else:
                phonemes.append('S')
        elif letter in u_forms:
            if previous == 'q':
                phonemes.append('W')
            elif previous == 'g':
                pass  # handled under 'g'
            else:
                phonemes.append('UW0')
        elif letter == 'z':
            if pos == 0:
                phonemes.append('Z')
            elif previous == 'z':
                # second z of a doubled zz
                phonemes.append('T')
                phonemes.append('S')
            elif following == 'z':
                pass  # first z of zz; emitted on the second one
            else:
                phonemes.append('Z')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1:
            # Unknown character: retry once on whatever `hammer` returns and
            # keep only the first phoneme of that result.
            if not recursive:
                phon = breakdownWord(hammer(letter), True)
                if phon:
                    phonemes.append(phon[0])

    # Collapse runs of the same phoneme.
    temp_phonemes = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            temp_phonemes.append(phoneme)
        previous_phoneme = phoneme
    return temp_phonemes
Example #17
0
def breakdownWord(word, recursive=False):
    """Convert ``word`` into a list of phoneme codes using per-letter rules.

    The word is lower-cased and scanned left to right. Each letter is
    translated by looking at the previous and the next letter (digraph rules
    such as ``sc``, ``gl``, ``gn``, ``gu``, ``qu``, ``zz``). The spelling
    rules resemble Italian orthography, but the code itself does not name a
    language -- confirm against the caller.

    Args:
        word: The text to convert.
        recursive: Internal guard. When a letter matches no rule and
            ``hammer(letter)`` is a single character, that character is
            converted by a nested call with ``recursive=True`` so the
            fallback cannot recurse a second time.

    Returns:
        A list of phoneme code strings (e.g. ``'AA0'``, ``'CH'``) with
        immediately repeated codes collapsed into one.

    Relies on two names defined outside this function: ``vowels`` (used for
    membership tests only) and ``hammer`` (presumably maps an accented
    character to a plain one -- verify).
    """
    word = word.lower()
    isvowel = dict.fromkeys(vowels)
    simple_convert = {
        'b': 'B',
        'd': 'D',
        'f': 'F',
        'j': 'IY0',  # actual pronunciation varies with word origin
        'k': 'K',  # actual pronunciation varies with word origin
        'l': 'L',
        'm': 'M',
        'p': 'P',
        'q': 'K',
        'r': 'R',
        't': 'T',
        'v': 'V',
        'w': 'W',  # actual pronunciation varies with word origin
        'x': 'K S',  # actual pronunciation varies with word origin
        'y': 'IY0',  # actual pronunciation varies with word origin
    }

    # Plain and accented spellings of each vowel, written once instead of
    # being repeated inside every rule.
    a_forms = [
        u'a', u'\N{LATIN SMALL LETTER A WITH ACUTE}',
        u'\N{LATIN SMALL LETTER A WITH GRAVE}',
        u'\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}'
    ]
    e_forms = [
        'e', u'\N{LATIN SMALL LETTER E WITH ACUTE}',
        u'\N{LATIN SMALL LETTER E WITH GRAVE}',
        u'\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}'
    ]
    i_forms = [
        'i', u'\N{LATIN SMALL LETTER I WITH ACUTE}',
        u'\N{LATIN SMALL LETTER I WITH GRAVE}',
        u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}'
    ]
    o_forms = [
        'o', u'\N{LATIN SMALL LETTER O WITH ACUTE}',
        u'\N{LATIN SMALL LETTER O WITH GRAVE}',
        u'\N{LATIN SMALL LETTER O WITH CIRCUMFLEX}'
    ]
    u_forms = [
        'u', u'\N{LATIN SMALL LETTER U WITH ACUTE}',
        u'\N{LATIN SMALL LETTER U WITH GRAVE}',
        u'\N{LATIN SMALL LETTER U WITH CIRCUMFLEX}'
    ]
    e_i_forms = e_forms + i_forms
    a_o_u_forms = a_forms + o_forms + u_forms

    phonemes = []
    previous = ' '
    last_index = len(word) - 1
    for pos, letter in enumerate(word):
        # Next letter, or None at the end of the word. None is never equal
        # to a letter and never a member of any list/dict tested below, so
        # every "next letter" rule is simply False on the final letter.
        following = word[pos + 1] if pos < last_index else None

        if letter in a_forms:
            phonemes.append('AA0')
        elif letter == 'c':
            if previous == 's':
                # The preceding 's' emitted nothing; the whole 'sc' cluster
                # is produced here.
                if following in e_i_forms:
                    phonemes.append('SH')
                else:
                    phonemes.append('S')
                    phonemes.append('K')
            elif following in e_i_forms:
                phonemes.append('CH')
            else:
                phonemes.append('K')
        elif letter in ['e', u'\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}']:
            phonemes.append('EH0')  # long is "EY0"
        elif letter == u'\N{LATIN SMALL LETTER E WITH ACUTE}':
            phonemes.append('EY0')
        elif letter == u'\N{LATIN SMALL LETTER E WITH GRAVE}':
            phonemes.append('EH0')
        elif letter == 'g':
            if following in e_forms:
                phonemes.append('JH')
            elif following in i_forms:
                pass  # handled under 'i'
            elif following == 'h':
                phonemes.append('G')
            elif following == 'l':
                pass  # handled under 'l'
            elif following == 'n':
                pass  # handled under 'n'
            elif following == 'u':
                phonemes.append('G')
                phonemes.append('W')
            else:
                phonemes.append('G')
        elif letter == 'h':
            pass
        elif letter in i_forms:
            if previous == 'c' and following in isvowel:
                pass  # 'ci' + vowel: the 'c' already produced 'CH'
            elif previous == 'g':
                if following in a_o_u_forms:  # or isvowel(word[pos+1]) ??
                    phonemes.append('JH')
                else:
                    # NOTE(review): 'gi' before a consonant or at the end of
                    # the word yields a hard 'G' here while 'ge' yields
                    # 'JH'; left unchanged -- confirm this is intended.
                    phonemes.append('G')
                    phonemes.append('IY0')
            else:
                phonemes.append('IY0')
        elif letter == 'l':
            if previous == 'g':
                # The preceding 'g' emitted nothing; the whole 'gl' cluster
                # is produced here.
                if following in e_i_forms:
                    phonemes.append('L')
                    phonemes.append('IY0')
                else:
                    # Emit the cluster in spelling order (G then L), the
                    # same way the 'gn' rule below emits G then N.
                    phonemes.append('G')
                    phonemes.append('L')
            else:
                phonemes.append('L')
        elif letter == 'n':
            if previous == 'g':
                if following in isvowel:
                    phonemes.append('N')
                    phonemes.append('Y')
                else:
                    phonemes.append('G')
                    phonemes.append('N')
            else:
                phonemes.append('N')
        elif letter in o_forms:
            phonemes.append('OW0')  # when closed, when open as 'AO0' ?
        elif letter == 's':
            if following == 'c':
                pass  # handled under c
            elif previous in isvowel and following in isvowel:
                phonemes.append('Z')
            elif following in ['b', 'd', 'g', 'l', 'm', 'n', 'r', 'v']:
                phonemes.append('Z')
            else:
                # Covers a word-initial 's' as well: there is deliberately no
                # separate pos == 0 case, so the letter always produces a
                # phoneme (a special case without a fallback would drop it).
                phonemes.append('S')
        elif letter in u_forms:
            if previous == 'q':
                phonemes.append('W')
            elif previous == 'g':
                pass  # handled under 'g'
            else:
                phonemes.append('UW0')
        elif letter == 'z':
            if pos == 0:
                phonemes.append('Z')
            elif previous == 'z':
                phonemes.append('T')
                phonemes.append('S')
            elif following == 'z':
                pass  # first z of 'zz'; the second one emits T + S
            else:
                phonemes.append('Z')
        elif letter in simple_convert:
            # A table entry may hold several space-separated codes
            # ('x' -> 'K S'); store one code per list element like every
            # other rule does.
            phonemes.extend(simple_convert[letter].split())
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1:
            if not recursive:
                phon = breakdownWord(hammer(letter), True)
                if phon:
                    phonemes.extend(phon)
                    # ~ else:
                    # ~ print "not handled", letter, word
        previous = letter

    # Collapse runs of the same phoneme (e.g. doubled consonants).
    temp_phonemes = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            temp_phonemes.append(phoneme)
        previous_phoneme = phoneme
    return temp_phonemes
Example #18
0
def breakdownWord(word, recursive=False):
    """Convert ``word`` into a list of phoneme codes using per-letter rules.

    The word is lower-cased and scanned left to right; each letter is
    translated by inspecting its neighbours. The inline comments reference
    French words ('je', 'parle', 'monsieur', 'lundi'), so the rules are
    presumably French -- the code itself does not name a language.

    Args:
        word: The text to convert.
        recursive: Internal guard. When a letter matches no rule and
            ``hammer(letter)`` is a single character, that character is
            converted by a nested call with ``recursive=True`` so the
            fallback cannot recurse a second time.

    Returns:
        A list of phoneme code strings (e.g. ``'EH0'``, ``'NG'``) with
        immediately repeated codes collapsed into one.

    Relies on names defined outside this function: ``accented_a``,
    ``accented_e``, ``accented_i``, ``accented_o``, ``accented_u`` and
    ``hammer`` (presumably maps an accented character to a plain one --
    verify).
    """
    word = word.lower()
    phonemes = []
    simple_convert = {
        'j': 'JH',
        'k': 'K',
        'q': 'K',
        'v': 'V',
        u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'S'  # ç
    }
    # Every consonant letter except h, m, n and y.
    consonants_no_mn = [
        'b', 'c', 'd', 'f', 'g', 'j', 'k', 'l', 'p', 'q', 'r', 's', 't', 'v',
        'w', 'x', 'z'
    ]
    # The same letters plus m and n.
    consonants = [
        'b', 'c', 'd', 'f', 'g', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's',
        't', 'v', 'w', 'x', 'z'
    ]

    def behind(offset):
        # Same value as word[pos - offset] (including Python's wrap-around
        # for small negative indices, which the rules below have always
        # relied on), but '' instead of IndexError when the word is too
        # short. Without this, short words such as 'le', 'la', 'tu' or 'te'
        # raised IndexError.
        try:
            return word[pos - offset]
        except IndexError:
            return ''

    def is_vowel(char):
        # Evaluated lazily so the accented_* names are only looked up when a
        # rule actually needs them.
        return char in [
            'a', 'e', 'i', 'o', 'u', accented_a, accented_e, accented_i,
            accented_o, accented_u
        ]

    previous = ' '
    for pos, letter in enumerate(word):
        # NOTE(review): the chain used to start with
        # `letter == len(word) > pos + 1 and word[pos + 1]`, which compares
        # a string with an int and therefore never matched; it was removed.
        if letter in ['b', 'd', 'g', 'p', 'x'] and pos + 1 == len(word):
            pass  # silent at end of words
        elif letter in ['a', accented_a]:
            if (len(word) > pos + 2 and word[pos + 1] in ['i', accented_i]
                ) and word[pos + 2] != 'l':  # ai
                phonemes.append('EH0')
            elif len(word) > pos + 1 and word[pos + 1] in ['u', accented_u
                                                           ]:  # au
                phonemes.append('AO0')
            else:
                phonemes.append('AE0')
        elif letter in ['e', accented_e]:
            if pos + 1 == len(word) and len(
                    word) == 2:  # takes care of words like 'je'
                phonemes.append('EH0')
            elif previous == 'u' and pos + 1 == len(word) and len(
                    word) == 3 and word[pos - 2] == 'q':  # que
                phonemes.append('EH0')
            elif pos + 1 == len(word) and len(
                    word) > 2:  # takes care of words like 'parle'
                pass
            elif previous == 'l' and word[pos + 1] == 's' and len(
                    word) == 5 and word[pos - 2] == 'l':  # elles
                pass
            elif len(word) > pos + 2 and word[pos + 1] == 'a' and word[
                    pos + 2] == 'u':  # eau: handled by the following letters
                pass
            elif previous in ['o', accented_o]:
                pass
            elif word[0] == letter and (
                    len(word) > pos + 2 and word[pos + 1] in ['m', 'n']
                    and word[pos + 2] in ['m', 'n']) and (word != 'ennemmi'):
                phonemes.append('AE0')
            # Parentheses spell out the precedence the original expression
            # already had: (previous != 'i' and final em/en) or
            # (em/en followed by a consonant).
            elif (previous != 'i' and len(word) == pos + 2
                  and word[pos + 1] in ['m', 'n']) or (
                      len(word) > pos + 2 and word[pos + 1] in ['m', 'n']
                      and word[pos + 2] in consonants_no_mn):
                phonemes.append('AE0')
            elif previous == 'f' and word[pos + 1:pos + 4] == 'mme':
                phonemes.append('AE0')
            else:
                # A former `previous == 'u' ... and pos == len(word)` branch
                # was unreachable (pos is always < len(word)); a final 'que'
                # is already covered by the word-final rules above.
                phonemes.append('EH0')
        elif letter in ['i', accented_i]:
            if previous in ['e', accented_e] and (
                (len(word) > pos + 2 and word[pos + 1] in ['m', 'n']
                 and word[pos + 2] in consonants)
                    or (len(word) == pos + 2)):
                pass
            elif previous in [
                    'f', 't', 'v', 's'
            ] and word[-1] == 'n' and len(word) > 1 and letter == word[-2]:
                phonemes.append('EH0')
            elif len(word) > pos + 2 and word[pos + 1] == 'm' and word[
                    pos + 2] in ['b', 'p']:
                phonemes.append('EH0')
            elif len(word) > pos + 2 and word[pos + 1] == 'n' and word[
                    pos + 2] in [
                        'c', 'd', 'f', 'g', 'j', 'l', 'q', 's', 't', 'v'
                    ]:
                phonemes.append('EH0')
                phonemes.append('NG')
            elif previous in [
                    'a', accented_a
            ] and len(word) > pos + 1 and word[pos + 1] != 'l':
                phonemes.append('EH0')
            elif previous in [
                    'o', accented_o
            ] and len(word) == pos + 2 and word[pos + 1] == 'n':
                phonemes.append('EH0')
            elif previous in [
                    'o', accented_o
            ] and len(word) > pos + 2 and word[pos + 1] == 'n' and word[
                    pos + 2] in consonants:
                phonemes.append('EH0')
            elif previous in ['o', accented_o] and not (
                    len(word) > pos + 2 and word[pos + 1] == 'n' and
                (word[pos + 2] in consonants or pos + 2 == len(word))):
                phonemes.append('AE0')
            else:
                phonemes.append('IH0')
        elif letter in ['o', accented_o]:
            if previous == 'm' and word[pos + 1:pos + 7] == 'nsieur':
                phonemes.append('EH0')  # monsieur
            elif len(word) > pos + 1 and word[pos + 1] == 'y':
                phonemes.append('W')
                phonemes.append('AE0')
            elif len(word) > pos + 2 and word[pos + 1] == 'i' and word[
                    pos + 2] in ['m', 'n']:
                phonemes.append('W')
                phonemes.append('EH0')
            elif len(word) > pos + 2 and word[pos + 1] == 'u' and word[
                    pos + 2] in ['i', accented_i]:  # stress vowel
                phonemes.append('W')
            elif len(word) > pos + 1 and word[pos + 1] in ['i', accented_i]:
                phonemes.append('W')
            elif len(word) > pos + 1 and word[pos + 1] in ['u', accented_u]:
                phonemes.append('UW0')
            elif len(word) > pos + 1 and word[pos + 1] in ['e', accented_e]:
                phonemes.append('EH0')
            else:
                phonemes.append('AO0')
        elif letter in ['u', accented_u]:
            if previous == 'l' and word[pos + 1:pos + 4] == 'ndi':
                phonemes.append('EH0')  # lundi
            elif previous == 'o' and len(word) > pos + 1 and word[pos + 1] in [
                    'i', accented_i
            ]:
                pass
            elif previous in [
                    'b', 'c', 'd', 'f', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'r',
                    's', 't', 'v', 'w', 'x', 'z'
            ] and len(word) > pos + 1 and word[pos + 1] == 'i':
                phonemes.append('W')
            elif (len(word) == pos + 2 and word[pos + 1] in ['m', 'n']) or (
                    len(word) > pos + 2 and word[pos + 1] in ['m', 'n']
                    and word[pos + 2] in consonants_no_mn):
                phonemes.append('EH0')
            elif previous in ['a', accented_a]:
                phonemes.append('AO0')
            elif previous in ['g', 'q']:
                # A later `previous == 'g'` + e -> 'JH' rule could never run
                # because this branch already catches every 'g'; removed.
                pass
            elif previous in ['o', accented_o]:
                phonemes.append('UW0')
            elif len(word) > pos + 1 and word[pos + 1] in ['a', accented_a]:
                phonemes.append('AE0')
            elif len(word) > pos + 1 and word[pos + 1] in ['e', accented_e]:
                phonemes.append('EH0')
            else:
                phonemes.append('UW0')
        elif letter == 'y':
            # NOTE(review): a second `elif letter == 'y'` branch further
            # down the chain ('ay' -> 'EH0') was unreachable and was
            # removed; fold that rule in here if it is wanted.
            if letter == word[0]:
                phonemes.append('Y')
            elif is_vowel(previous) and len(word) > pos + 1 and is_vowel(
                    word[pos + 1]):
                phonemes.append('Y')
            elif len(word) == pos + 2 and word[pos + 1] in ['m', 'n']:
                # Word-final ym/yn. The test used to require both
                # len(word) > pos + 2 and len(word) == pos + 2, which can
                # never hold; it now mirrors the equivalent e/u rules.
                phonemes.append('EH0')
            elif len(word) > pos + 2 and word[pos + 1] in [
                    'm', 'n'
            ] and word[pos + 2] in consonants_no_mn:
                phonemes.append('EH0')
            else:
                phonemes.append('IH0')
        elif letter == 'b':
            if len(word) > pos + 1 and word[pos + 1] in ['s', 't']:
                phonemes.append('P')
            else:
                phonemes.append('B')
        elif letter == 'c':
            if len(word) > pos + 2 and word[pos + 1] == 'q' and word[
                    pos + 2] == 'u':
                pass
            elif behind(2) == 'p' and previous in [
                    'e', accented_e
            ] and len(word) == pos + 2 and word[
                    pos + 1] == 't':  # takes care of words like 'respect'
                pass
            elif previous == 's' and len(word) > pos + 1 and word[pos + 1] in [
                    'e', 'i', 'y', accented_e, accented_i
            ]:
                pass
            elif len(word) > pos + 1 and word[pos + 1] == word[
                    -1] and word[-1] in ['e', accented_e]:
                phonemes.append('Z')
            elif len(word) > pos + 1 and (word[pos + 1] in [
                    'a', 'o', 'u', 'l', accented_a, accented_o, accented_u
            ] or word[pos + 1] in consonants):
                phonemes.append('K')
            elif len(word) > pos + 1 and word[pos + 1] in [
                    'e', 'i', 'y', accented_e, accented_i
            ]:
                phonemes.append('S')
            else:
                pass  # e.g. a word-final 'c': silent
        elif letter == 'd':
            if len(word) > pos + 1 and word[pos + 1] in ['-', '_']:
                phonemes.append('T')
            elif len(word) > pos + 1 and word[pos + 1] in ['s', 't']:
                pass
            else:
                phonemes.append('D')
        elif letter == 'f':
            if len(word) > pos + 1 and word[pos + 1] in ['-', '_']:
                phonemes.append('V')
            else:
                phonemes.append('F')
        elif letter == 'g':
            if previous == 'n':
                phonemes.append('NG')
            elif len(word) > pos + 1 and word[pos + 1] in [
                    'e', 'i', 'y', accented_e, accented_i
            ]:
                phonemes.append('JH')
            elif len(word) > pos + 1 and word[pos + 1] in ['s', 't']:
                pass
            else:
                phonemes.append('G')
        elif letter == 'h':
            if previous == 'c' and len(word) > pos + 1 and word[pos +
                                                                1] == 'r':
                phonemes.append('K')
            elif previous == 'c' and len(word) > pos + 1 and word[pos +
                                                                  1] != 'r':
                phonemes.append('SH')
            else:
                pass
        elif letter == 'l':
            if behind(2) in ['m', 'v', 'h', 'k'] and previous == 'i' and behind(
                    3) not in ['a', '']:  # mil*, vil*
                phonemes.append('L')
            elif behind(3) in ['m', 'v'] and behind(
                    2) == 'i' and previous == 'l' and behind(4) not in [
                        'a', ''
                    ]:  # mill* ,vill*
                phonemes.append('L')
            elif behind(3) == 'q' and behind(
                    2) == 'u' and previous == 'i':  # tranquil*
                phonemes.append('L')
            elif behind(3) == 'u' and behind(
                    2) == 'i' and previous == 'l' and behind(
                        4) == 'q':  # tranquill*
                phonemes.append('L')
            elif previous == 'i':
                # The original OR-ed this with two stricter variants of the
                # same test, which added nothing.
                phonemes.append('Y')
            elif behind(2) == 'i' and previous == letter:  # il, ill,ille
                phonemes.append('Y')
            else:
                phonemes.append('L')
        elif letter == 'm':
            if previous == 'a' and len(word) > pos + 1 and word[pos +
                                                                1] == 'n':
                pass
            elif word[-3:] == 'aim':
                # Slice comparison instead of word[-2]/word[-3] indexing,
                # which raised IndexError on words shorter than 3 letters.
                phonemes.append('NG')
            elif previous in [
                    'a', 'e', 'i', 'o', 'u'
            ] and (len(word) == pos + 1 or
                   (len(word) > pos + 1
                    and word[pos + 1] in consonants_no_mn)):
                phonemes.append('NG')
            else:
                phonemes.append('M')
        elif letter == 'n':
            if previous == 'o' and word[pos + 1:pos + 6] == 'sieur':
                pass  # monsieur
            elif is_vowel(previous) and (
                    len(word) == pos + 1 or
                (len(word) > pos + 1 and word[pos + 1] in [
                    'b', 'c', 'd', 'f', 'g', 'j', 'k', 'l', 'n', 'p', 'q',
                    'r', 's', 't', 'v', 'w', 'x', 'z'
                ])):  # n was forcefully added
                phonemes.append('NG')
            else:
                phonemes.append('N')
        elif letter == 'p':
            if len(word) > pos + 1 and word[pos + 1] in ['-', '_']:
                phonemes.append('P')
            elif previous == 'm' and len(word) > pos + 1 and word[
                    pos + 1] == 't':  # mpt
                pass
            elif len(word) > pos + 1 and word[pos + 1] == 'h':  # ph
                phonemes.append('F')
            else:
                phonemes.append('P')
        elif letter == 'r':
            if behind(2) == 'e' and previous == 'u':
                phonemes.append('R')
            elif pos + 1 == len(word):
                pass
            else:
                phonemes.append('R')
        elif letter == 's':
            if pos + 1 == len(word) and not (
                (behind(3) == 'i' and behind(2) == 'l' and previous == 'i') or
                (behind(3) in ['e', accented_e, 't'] and behind(2) == 'l'
                 and previous == 'a') or
                (behind(3) == 'f' and behind(2) == 'i' and previous == 'l')
                    or word == 'lis'):
                pass
            elif len(word) > pos + 2 and word[pos + 1] == 'c' and word[
                    pos + 2] == 'h':
                pass
            elif previous in ['d', 't']:
                pass
            elif previous == 'e' and pos + 2 == len(word) and len(
                    word) == 3 and word[pos + 1] == 't':  # est
                pass
            elif is_vowel(previous) and len(word) > pos + 1 and is_vowel(
                    word[pos + 1]):
                phonemes.append('Z')
            else:
                phonemes.append('S')
        elif letter == 't':
            # Parentheses spell out the precedence the original expression
            # already had: (final-t test) or (letter two back is an 'a').
            if (pos + 1 == len(word)
                    and previous not in ['i', 'c', accented_i]
                    and word != 'gadget') or behind(2) in ['a', accented_a]:
                pass
            elif len(word) > pos + 1 and word[pos + 1] == 's':
                pass
            elif previous in ['d', 'g']:
                pass
            elif behind(3) == 'p' and behind(
                    2) == 'e' and previous == 'c' and len(word) == pos + 1:
                pass
            elif len(word) > pos + 1 and word[pos + 1] in ['-', '_']:
                phonemes.append('T')
            elif word[pos + 1:pos + 4] == 'ion' or word[pos + 1:pos +
                                                        6] == 'ience':
                phonemes.append('S')  # takes care of words ending with 'ience'
            else:
                phonemes.append('T')
        elif letter == 'w':
            if len(word) > pos + 4 and word[1:] == 'agon':
                phonemes.append('V')  # wagon
            else:
                phonemes.append('W')
        elif letter == 'x':
            # A word-final 'x' never reaches this branch (it is silenced at
            # the top of the chain), so the former
            # `previous == 'u' and pos == len(word)` test was dead code.
            if len(word) > pos + 1 and word[pos + 1] in ['-', '_']:
                phonemes.append('Z')
            elif (len(word) > pos + 1 and word[pos + 1] in [
                    'b', 'c', 'd', 'f', 'g', 'j', 'k', 'l', 'm', 'n', 'p', 'q',
                    'r', 's', 't', 'v', 'w', 'x', 'y', 'z'
            ]) or (behind(2) == 't' and previous != 'a'):
                phonemes.append('K')
                phonemes.append('S')
            elif len(word) > pos + 1 and word[pos + 1] in [
                    'a', 'e', 'h', 'i', 'o', 'u', accented_a, accented_e,
                    accented_i, accented_o, accented_u
            ] and (behind(2) != 't' and previous not in ['a', accented_a]):
                phonemes.append('Z')
            else:
                phonemes.append('K')
                phonemes.append('S')
        elif letter == 'z':
            if word[-1] == letter and word[:-1] == 'berlio':
                phonemes.append('Z')
            elif word[-1] == letter and len(word) > 1:
                pass
            else:
                phonemes.append('Z')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1:
            if not recursive:
                phon = breakdownWord(hammer(letter), True)
                if phon:
                    phonemes.append(phon[0])
                    # ~ else:
                    # ~ print "not handled", letter, word
        previous = letter

    # Collapse runs of the same phoneme.
    temp_phonemes = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            temp_phonemes.append(phoneme)
        previous_phoneme = phoneme
    return temp_phonemes
Example #19
0
def breakdownWord(word, recursive=False):
    """Break a word into a list of phoneme codes using Norwegian-style rules.

    The word is lower-cased and scanned letter by letter; each letter is
    mapped to zero or more phoneme codes (e.g. 'AA0', 'SH', 'NG') depending
    on the letter itself, the previous letter and the following letter(s).
    Consecutive duplicate phonemes are collapsed before returning.

    Args:
        word: The text to convert (a unicode string).
        recursive: True when called from the fallback path below; prevents
            a second level of recursion for letters that still cannot be
            handled.

    Returns:
        A list of phoneme code strings.
    """
    word = word.lower()
    vowels = frozenset(
        u'aeiouy'
        u'\N{LATIN SMALL LETTER A WITH RING ABOVE}'
        u'\N{LATIN SMALL LETTER AE}'
        u'\N{LATIN SMALL LETTER O WITH STROKE}'
    )

    def isvowel(char):
        # Plain membership test; works on both Python 2 and 3
        # (dict.has_key no longer exists on Python 3).
        return char in vowels

    phonemes = []
    simple_convert = {
        'b': 'B',
        'c': 'S',
        'f': 'F',
        'm': 'M',
        'p': 'P',
        'r': 'R',
        't': 'T',
        'v': 'V',
        'w': 'V',
        'z': 'S',
    }
    short_vowels = {
        u'a': 'AA0',
        u'e': 'EH0',
        u'i': 'IH0',
        u'o': 'UH0',
        u'u': 'UH0',
        u'y': 'IH0',
        u'\N{LATIN SMALL LETTER AE}': 'AE0',
        u'\N{LATIN SMALL LETTER O WITH STROKE}': 'AH0',
        u'\N{LATIN SMALL LETTER A WITH RING ABOVE}': 'AA0',
    }
    long_vowels = {
        u'a': 'AA0',
        u'e': 'EY0',
        u'i': 'IY0',
        u'o': 'OW0',
        u'u': 'UW0',
        u'y': 'IY0',
        u'\N{LATIN SMALL LETTER AE}': 'AE0',
        u'\N{LATIN SMALL LETTER O WITH STROKE}': 'ER0',
        u'\N{LATIN SMALL LETTER A WITH RING ABOVE}': 'AO0',
    }
    o_stroke = u'\N{LATIN SMALL LETTER O WITH STROKE}'
    length = len(word)
    previous = ' '
    for pos, letter in enumerate(word):
        following = word[pos + 1:pos + 2]  # next letter, '' at end of word
        is_last = length == pos + 1
        if isvowel(letter):
            # Diphthongs / digraphs first, then short-vowel contexts; every
            # remaining case (including a word-final "rd") is a long vowel.
            if letter == 'a' and following == 'i':
                phonemes.append('AY0')
            elif letter == 'a' and following == 'u':
                phonemes.append('AW0')
            elif letter == 'e' and following == 'i':
                phonemes.append('AY0')
            elif letter == 'e' and following == 'r':
                phonemes.append('AE0')
            elif letter == 'o' and following == 'i':
                phonemes.append('OY0')
            # NOTE(review): the original had a second, unreachable "o" + "i"
            # branch here emitting 'UW0', 'IY0'; the intended digraph is
            # unknown, so it is omitted rather than guessed at.
            elif letter == o_stroke and following == 'y':
                phonemes.append('OW0')
                phonemes.append('IY0')
            elif length == pos + 2 and following == 'm':
                phonemes.append(short_vowels[letter])
            elif (length > pos + 2 and following == word[pos + 2]
                  and not isvowel(following)):
                # vowel before a doubled consonant is short
                phonemes.append(short_vowels[letter])
            else:
                phonemes.append(long_vowels[letter])
        elif letter == 'd':
            silent = (
                (is_last and previous == 'r')  # ends in d, e.g. jord
                or (is_last and isvowel(previous))  # vowel then d, e.g. god
                or previous in ('l', 'n')  # holde, land
            )
            if not silent:
                phonemes.append('D')
        elif letter == 'g':
            if following == 'j':  # gjær
                pass  # handled as a normal j
            elif is_last and previous == 'i':  # ærlig
                pass  # silent at end of word
            elif previous == 'n':
                pass  # handled under n
            elif following in ('i', 'y'):
                phonemes.append('Y')
            elif (length > pos + 2 and following == 'e'
                  and word[pos + 2] == 'i'):
                phonemes.append('Y')
            else:
                phonemes.append('G')
        elif letter == 'h':
            # Silent before j (hjem) and v (hver); the following letter is
            # then handled as a normal j / v.  The original used two
            # separate ifs, so "hj" wrongly still produced 'HH'.
            if following not in ('j', 'v'):
                phonemes.append('HH')
        elif letter == 'j':
            # After k or s the sound was already emitted by that letter.
            if previous not in ('k', 's'):
                phonemes.append('Y')
        elif letter == 'k':
            if previous == 's' and following in (u'j', u'i', u'y', o_stroke):
                phonemes.append('SH')  # sjkære, ski, skøyter
            elif previous == 's':
                phonemes.append('S')
                phonemes.append('K')
            elif following in (u'i', u'y'):  # kirke, kyss
                phonemes.append('HH')
            elif following == 'j':  # kjønn
                phonemes.append('HH')
            else:
                phonemes.append('K')  # kaffe
        elif letter == 'l':
            if following != 'j':  # ljug: handled as a normal j
                phonemes.append('L')
        elif letter == 'n':
            if following == 'g':  # fang
                phonemes.append('NG')
            else:
                phonemes.append('N')  # ni
        elif letter == 'q':  # foreign language loan-words?
            phonemes.append('K')
            phonemes.append('UW0')
        elif letter == 's':
            if previous == 'r':
                # Eastern Norway - norsk, person, for sent
                phonemes.append('SH')
            elif following == 'k':
                pass  # handled under k
            elif following == 'j':  # sjø
                phonemes.append('SH')
            elif following == 'l':
                phonemes.append('SH')  # informal usage
            else:
                phonemes.append('S')  # syv
        elif letter == 'x':
            phonemes.append('K')
            phonemes.append('S')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == " ":
            pass
        elif len(hammer(letter)) == 1:
            # Fallback for letters not covered above.  `hammer` is defined
            # elsewhere in this file; presumably it reduces the letter to a
            # plain equivalent -- TODO confirm.  Only one level of recursion
            # is allowed.
            if not recursive:
                phon = " ".join(breakdownWord(hammer(letter), True))
                if phon:
                    phonemes.append(phon)
        previous = letter
    # Collapse runs of identical consecutive phonemes.
    temp_phonemes = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            temp_phonemes.append(phoneme)
        previous_phoneme = phoneme
    return temp_phonemes
Example #20
0
def breakdownWord(word, recursive=False):
    """Break a word into a list of phoneme codes.

    Handles both Latin-transliterated and Cyrillic spellings (the rule
    comments, e.g. the possessive endings -ogo / -ego, suggest Russian --
    TODO confirm).  The word is lower-cased and scanned letter by letter;
    each letter yields zero or more phoneme codes depending on its position
    and its neighbours.  Consecutive duplicate phonemes are collapsed before
    returning.

    Args:
        word: The text to convert (a unicode string).
        recursive: True when called from the fallback path below; prevents
            a second level of recursion for letters that still cannot be
            handled.

    Returns:
        A flat list of phoneme code strings.  Note that the hard and soft
        signs map to the empty string, which is appended as-is.
    """
    word = word.lower()
    phonemes = []
    simple_convert = {
        'a': 'AA0',
        'b': 'B',
        'd': 'D',
        'f': 'F',
        'h': 'HH',
        'i': 'IY0',
        'j': 'IY0',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'n': 'N',
        'o': 'AO0',
        'p': 'P',
        'r': 'R',
        't': 'T',
        'u': 'UW0',
        'v': 'V',
        'x': 'HH',  # use 'Y' ?? 'K'??
        u'\N{LATIN SMALL LETTER S WITH CARON}': 'SH',  # š
        u'\N{LATIN SMALL LETTER Z WITH CARON}': 'ZH',  # ž
        u'\N{LATIN CAPITAL LETTER S WITH CARON}': 'SH',  # Š
        u'\N{LATIN CAPITAL LETTER Z WITH CARON}': 'ZH',  # Ž
        # Cyrillic
        u'\N{CYRILLIC SMALL LETTER A}': 'AA0',
        u'\N{CYRILLIC SMALL LETTER BE}': 'B',
        u'\N{CYRILLIC SMALL LETTER VE}': 'V',
        u'\N{CYRILLIC SMALL LETTER CHE}': 'CH',
        u'\N{CYRILLIC SMALL LETTER DE}': 'D',
        u'\N{CYRILLIC SMALL LETTER E}': 'EH0',
        u'\N{CYRILLIC SMALL LETTER EF}': 'F',
        u'\N{CYRILLIC SMALL LETTER GHE}': 'G',
        u'\N{CYRILLIC SMALL LETTER I}': 'IY0',
        u'\N{CYRILLIC SMALL LETTER HA}': 'HH',  # 'Y'?? 'K'??
        u'\N{CYRILLIC SMALL LETTER SHORT I}': 'IY0',
        u'\N{CYRILLIC SMALL LETTER KA}': 'K',
        u'\N{CYRILLIC SMALL LETTER EL}': 'L',
        u'\N{CYRILLIC SMALL LETTER EM}': 'M',
        u'\N{CYRILLIC SMALL LETTER EN}': 'N',
        u'\N{CYRILLIC SMALL LETTER O}': 'AO0',
        u'\N{CYRILLIC SMALL LETTER PE}': 'P',
        u'\N{CYRILLIC SMALL LETTER ER}': 'R',
        u'\N{CYRILLIC SMALL LETTER ES}': 'S',
        u'\N{CYRILLIC SMALL LETTER SHA}': 'SH',
        u'\N{CYRILLIC SMALL LETTER TE}': 'T',
        u'\N{CYRILLIC SMALL LETTER U}': 'UW0',
        u'\N{CYRILLIC SMALL LETTER HARD SIGN}': '',
        u'\N{CYRILLIC SMALL LETTER SOFT SIGN}': '',
        u'\N{CYRILLIC SMALL LETTER YERU}': 'IH0',
        u'\N{CYRILLIC SMALL LETTER ZHE}': 'ZH',
        u'\N{CYRILLIC SMALL LETTER ZE}': 'Z',
        u'\N{CYRILLIC CAPITAL LETTER A}': 'AA0',
        u'\N{CYRILLIC CAPITAL LETTER BE}': 'B',
        u'\N{CYRILLIC CAPITAL LETTER VE}': 'V',
        u'\N{CYRILLIC CAPITAL LETTER CHE}': 'CH',
        u'\N{CYRILLIC CAPITAL LETTER DE}': 'D',
        u'\N{CYRILLIC CAPITAL LETTER E}': 'EH0',
        u'\N{CYRILLIC CAPITAL LETTER EF}': 'F',
        u'\N{CYRILLIC CAPITAL LETTER GHE}': 'G',
        u'\N{CYRILLIC CAPITAL LETTER I}': 'IY0',
        u'\N{CYRILLIC CAPITAL LETTER HA}': 'HH',  # 'Y'?? 'K'??
        u'\N{CYRILLIC CAPITAL LETTER SHORT I}': 'IY0',
        u'\N{CYRILLIC CAPITAL LETTER KA}': 'K',
        u'\N{CYRILLIC CAPITAL LETTER EL}': 'L',
        u'\N{CYRILLIC CAPITAL LETTER EM}': 'M',
        u'\N{CYRILLIC CAPITAL LETTER EN}': 'N',
        u'\N{CYRILLIC CAPITAL LETTER O}': 'AO0',
        u'\N{CYRILLIC CAPITAL LETTER PE}': 'P',
        u'\N{CYRILLIC CAPITAL LETTER ER}': 'R',
        u'\N{CYRILLIC CAPITAL LETTER ES}': 'S',
        u'\N{CYRILLIC CAPITAL LETTER SHA}': 'SH',
        u'\N{CYRILLIC CAPITAL LETTER TE}': 'T',
        u'\N{CYRILLIC CAPITAL LETTER U}': 'UW0',
        u'\N{CYRILLIC CAPITAL LETTER HARD SIGN}': '',
        u'\N{CYRILLIC CAPITAL LETTER SOFT SIGN}': '',
        u'\N{CYRILLIC CAPITAL LETTER YERU}': 'IH0',
        u'\N{CYRILLIC CAPITAL LETTER ZHE}': 'ZH',
        u'\N{CYRILLIC CAPITAL LETTER ZE}': 'Z',
    }
    length = len(word)
    previous = ' '
    for pos, letter in enumerate(word):
        following = word[pos + 1:pos + 2]  # next letter, '' at end of word
        is_last = length == pos + 1
        if letter == 'c':
            if previous == 's' and following == 'h':
                phonemes.append('SH')  # as in
                phonemes.append('CH')  # freSH CHeese
            elif following == 'h':
                phonemes.append('CH')
            else:
                phonemes.append('T')
                phonemes.append('S')
        elif letter == 'b' and is_last:
            phonemes.append('P')  # word-final b -> P
        elif letter == 'd' and is_last:
            phonemes.append('T')  # word-final d -> T
        elif letter in ('e', u'\N{CYRILLIC SMALL LETTER IE}'):
            # Word-initial e gets a leading Y glide.  (The original appended
            # 'EH0' twice here and relied on the final de-duplication pass.)
            if pos == 0:
                phonemes.append('Y')
            phonemes.append('EH0')
        elif letter == '^':
            pass
        elif letter == 'g':
            if is_last:
                phonemes.append('K')
            elif (previous in ('e', 'o') and length == pos + 2
                  and following == 'o'):
                phonemes.append('V')  # possessive endings -ogo and -ego
            else:
                phonemes.append('G')
        elif letter == 'h':
            # Only meaningful as part of ch / sh / zh, handled by the
            # preceding letter.
            pass
        elif letter == 's':
            if following == 'h':
                phonemes.append('SH')
            else:
                phonemes.append('S')
        elif letter == 'v' and is_last:
            phonemes.append('F')  # word-final v -> F
        elif letter == 'y':
            if following == 'a':
                phonemes.append('Y')
            else:
                phonemes.append('IH0')
        elif letter == 'z':
            if following == 'h':
                # "zh" is SH at the very end of the word, ZH elsewhere
                phonemes.append('SH' if length == pos + 2 else 'ZH')
            else:
                # plain z is S at the end of the word, Z elsewhere
                phonemes.append('S' if is_last else 'Z')
        elif letter in (
                u'\N{CYRILLIC CAPITAL LETTER SHCHA}',
                u'\N{CYRILLIC SMALL LETTER SHCHA}',
        ):
            phonemes.append('SH')
        elif letter in (
                u'\N{CYRILLIC CAPITAL LETTER TSE}',
                u'\N{CYRILLIC SMALL LETTER TSE}',
        ):
            phonemes.append('T')
            phonemes.append('S')
        elif letter in (
                u'\N{CYRILLIC CAPITAL LETTER YA}',
                u'\N{CYRILLIC SMALL LETTER YA}',
        ):
            if pos == 0:
                phonemes.append('IY0')
            phonemes.append('AA1')
        elif letter in (
                u'\N{CYRILLIC CAPITAL LETTER YU}',
                u'\N{CYRILLIC SMALL LETTER YU}',
        ):
            if pos == 0:
                phonemes.append('Y')
            phonemes.append('UW0')
        elif letter in (
                u'\N{LATIN SMALL LETTER E WITH DIAERESIS}',
                u'\N{CYRILLIC SMALL LETTER IO}',
        ):
            if pos == 0:
                phonemes.append('Y')
            phonemes.append('AO0')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1:
            # Fallback for letters not covered above.  `hammer` is defined
            # elsewhere in this file; presumably it reduces the letter to a
            # plain equivalent -- TODO confirm.  Only one level of recursion
            # is allowed.
            if not recursive:
                phon = breakdownWord(hammer(letter), True)
                if phon:
                    # The recursive call returns a list; extend so the
                    # result stays a flat list of strings (the original
                    # appended the list itself, nesting it).
                    phonemes.extend(phon)
        previous = letter
    # Collapse runs of identical consecutive phonemes.
    temp_phonemes = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            temp_phonemes.append(phoneme)
        previous_phoneme = phoneme
    return temp_phonemes
def breakdownWord(word, recursive=False):
    word = word.lower()
    isvowel = dict.fromkeys('aàáâãäåæeèéêëiìíîïoòóôõöøœuùúûü').has_key
    phonemes = []
    simple_convert = {
        'j': 'JH',
        'k': 'K',
        'q': 'K',
        'v': 'V',
        u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'S'  # ç
    }
    easy_consonants = simple_convert.keys()
    pos = 0
    previous = ' '
    for letter in word:
        if letter == len(word) > pos + 1 and word[pos + 1]:
            phonemes.append({letter})
        elif letter in ['b', 'd', 'g', 'p', 'x'] and pos + 1 == len(word):  # silent at end of words
            pass
        elif letter in ['a', accented_a]:
            if (len(word) > pos + 2 and word[pos + 1] in ['i', accented_i]) and word[pos + 2] != 'l':  # ai
                phonemes.append('EH0')
            elif len(word) > pos + 1 and word[pos + 1] in ['u', accented_u]:  # au
                phonemes.append('AO0')
            else:
                phonemes.append('AE0')
        elif letter in ['e', accented_e]:
            if pos + 1 == len(word) and len(word) == 2:  # takes care of words like 'je'
                phonemes.append('EH0')
            elif previous == 'u' and pos + 1 == len(word) and len(word) == 3 and word[pos - 2] == 'q':  # que
                phonemes.append('EH0')
            elif pos + 1 == len(word) and len(word) > 2:  # takes care of words like 'parle'
                pass
            elif previous == 'l' and word[pos + 1] == 's' and len(word) == 5 and word[pos - 2] == 'l':  # elles
                pass
            elif len(word) > pos + 2 and word[pos + 1] == 'a' and word[pos + 2] == 'u':
                pass
            elif previous in ['o', accented_o]:
                pass
            elif word[0] == letter and (
                                len(word) > pos + 2 and word[pos + 1] in ['m', 'n'] and word[pos + 2] in ['m',
                                                                                                          'n']) and (
                        word != 'ennemmi'):
                phonemes.append('AE0')
            elif previous != 'i' and (len(word) == pos + 2 and word[pos + 1] in ['m', 'n']) or (
                                len(word) > pos + 2 and word[pos + 1] in ['m', 'n'] and word[pos + 2] in ['b', 'c', 'd',
                                                                                                          'f',
                                                                                                          'g', 'j', 'k',
                                                                                                          'l',
                                                                                                          'p', 'q', 'r',
                                                                                                          's',
                                                                                                          't', 'v', 'w',
                                                                                                          'x',
                                                                                                          'z']):
                phonemes.append('AE0')
            elif previous == 'f' and len(word) > pos + 3 and word[pos + 1] == 'm' and word[pos + 2] == 'm' and word[
                        pos + 3] == 'e':
                phonemes.append('AE0')
            elif previous == 'u' and word[pos - 2] == 'q' and pos == len(word):
                pass
            else:
                phonemes.append('EH0')
        elif letter in ['i', accented_i]:
            if previous in ['e', accented_e] and ((len(word) > pos + 2 and word[pos + 1] in ['m', 'n'] and word[
                    pos + 2] in ['b', 'c', 'd', 'f', 'g', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w',
                                 'x', 'z']) or (len(word) == pos + 2)):
                pass
            elif previous in ['f', 't', 'v', 's'] and word[-1] == 'n' and len(word) > 1 and letter == word[-2]:
                phonemes.append('EH0')
            elif len(word) > pos + 2 and word[pos + 1] == 'm' and word[pos + 2] in ['b', 'p']:
                phonemes.append('EH0')
            elif len(word) > pos + 2 and word[pos + 1] == 'n' and word[pos + 2] in ['c', 'd', 'f', 'g', 'j', 'l', 'q',
                                                                                    's', 't', 'v']:
                phonemes.append('EH0')
                phonemes.append('NG')
            elif previous in ['a', accented_a] and len(word) > pos + 1 and word[pos + 1] != 'l':
                phonemes.append('EH0')
            elif previous in ['o', accented_o] and len(word) == pos + 2 and word[pos + 1] == 'n':
                phonemes.append('EH0')
            elif previous in ['o', accented_o] and len(word) > pos + 2 and word[pos + 1] == 'n' and word[pos + 2] in [
                'b', 'c', 'd', 'f', 'g', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'x', 'z']:
                phonemes.append('EH0')
            elif previous in ['o', accented_o] and not (len(word) > pos + 2 and word[pos + 1] == 'n' and (
                            word[pos + 2] in ['b', 'c', 'd', 'f', 'g', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't',
                                              'v',
                                              'w', 'x', 'z'] or pos + 2 == len(word))):
                phonemes.append('AE0')
            else:
                phonemes.append('IH0')
        elif letter in ['o', accented_o]:
            if previous == 'm' and len(word) > pos + 6 and word[pos + 1] == 'n' and word[pos + 2] == 's' and word[
                        pos + 3] == 'i' and word[pos + 4] == 'e' and word[pos + 5] == 'u' and word[pos + 6] == 'r':
                phonemes.append('EH0')  # monsieur
            elif len(word) > pos + 1 and word[pos + 1] == 'y':
                phonemes.append('W')
                phonemes.append('AE0')
            elif len(word) > pos + 2 and word[pos + 1] == 'i' and word[pos + 2] in ['m', 'n']:
                phonemes.append('W')
                phonemes.append('EH0')
            elif len(word) > pos + 2 and word[pos + 1] == 'u' and word[pos + 2] in ['i', accented_i]:  # stress vowel
                phonemes.append('W')
            elif len(word) > pos + 1 and word[pos + 1] in ['i', accented_i]:
                phonemes.append('W')
            elif len(word) > pos + 1 and word[pos + 1] in ['u', accented_u]:
                phonemes.append('UW0')
            elif len(word) > pos + 1 and word[pos + 1] in ['e', accented_e]:
                phonemes.append('EH0')
            else:
                phonemes.append('AO0')
        elif letter in ['u', accented_u]:
            if previous == 'l' and len(word) > pos + 3 and word[pos + 1] == 'n' and word[pos + 2] == 'd' and word[
                        pos + 3] == 'i':
                phonemes.append('EH0')  # lundi
            elif previous == 'o' and len(word) > pos + 1 and word[pos + 1] in ['i', accented_i]:
                pass
            elif previous in ['b', 'c', 'd', 'f', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'r', 's', 't', 'v', 'w', 'x',
                              'z'] and len(word) > pos + 1 and word[pos + 1] == 'i':
                phonemes.append('W')
            elif (len(word) == pos + 2 and word[pos + 1] in ['m', 'n']) or (
                                len(word) > pos + 2 and word[pos + 1] in ['m', 'n'] and word[pos + 2] in ['b', 'c', 'd',
                                                                                                          'f',
                                                                                                          'g', 'j', 'k',
                                                                                                          'l',
                                                                                                          'p', 'q', 'r',
                                                                                                          's',
                                                                                                          't', 'v', 'w',
                                                                                                          'x',
                                                                                                          'z']):
                phonemes.append('EH0')
            elif previous in ['a', accented_a]:
                phonemes.append('AO0')
            elif previous in ['g', 'q']:
                pass
            elif previous in ['o', accented_o]:
                phonemes.append('UW0')
            elif len(word) > pos + 1 and word[pos + 1] in ['a', accented_a]:
                phonemes.append('AE0')
            elif len(word) > pos + 1 and word[pos + 1] in ['e', accented_e]:
                phonemes.append('EH0')
            elif previous == 'g' and len(word) > pos + 1 and word[pos + 1] in ['e', accented_e]:
                phonemes.append('JH')
            else:
                phonemes.append('UW0')
        elif letter == 'y':
            if letter == word[0]:
                phonemes.append('Y')
            elif previous in ['a', 'e', 'i', 'o', 'u', accented_a, accented_e, accented_i, accented_o,
                              accented_u] and len(word) > pos + 1 and word[pos + 1] in ['a', 'e', 'i', 'o', 'u',
                                                                                        accented_a, accented_e,
                                                                                        accented_i, accented_o,
                                                                                        accented_u]:
                phonemes.append('Y')
            elif len(word) > pos + 2 and word[pos + 1] in ['m', 'n'] and len(word) == pos + 2:
                phonemes.append('EH0')
            elif len(word) > pos + 2 and word[pos + 1] in ['m', 'n'] and word[pos + 2] in ['b', 'c', 'd', 'f', 'g', 'j',
                                                                                           'k', 'l', 'p', 'q', 'r', 's',
                                                                                           't', 'v', 'w', 'x', 'z']:
                phonemes.append('EH0')
            else:
                phonemes.append('IH0')
        elif letter == 'b':
            if len(word) > pos + 1 and word[pos + 1] in ['s', 't']:
                phonemes.append('P')
            else:
                phonemes.append('B')
        elif letter == 'c':
            if len(word) > pos + 2 and word[pos + 1] == 'q' and word[pos + 2] == 'u':
                pass
            elif word[pos - 2] == 'p' and previous in ['e', accented_e] and len(word) == pos + 2 and word[
                        pos + 1] == 't':  # takes care of words like 'respect'
                pass
            elif previous == 's' and len(word) > pos + 1 and word[pos + 1] in ['e', 'i', 'y', accented_e, accented_i]:
                pass
            elif len(word) > pos + 1 and word[pos + 1] == word[-1] and word[-1] in ['e', accented_e]:
                phonemes.append('Z')
            elif len(word) > pos + 1 and (
                            word[pos + 1] in ['a', 'o', 'u', 'l', accented_a, accented_o, accented_u] or word[
                            pos + 1] in ['b',
                                         'c',
                                         'd',
                                         'f',
                                         'g',
                                         'j',
                                         'k',
                                         'l',
                                         'm',
                                         'n',
                                         'p',
                                         'q',
                                         'r',
                                         's',
                                         't',
                                         'v',
                                         'w',
                                         'x',
                                         'z']):
                phonemes.append('K')
            elif len(word) > pos + 1 and word[pos + 1] in ['e', 'i', 'y', accented_e, accented_i]:
                phonemes.append('S')
            elif previous == 'n' and len(word) == pos + 1:
                pass
            else:
                pass
        elif letter == 'd':
            if len(word) > pos + 1 and word[pos + 1] in ['-', '_']:
                phonemes.append('T')
            elif len(word) > pos + 1 and word[pos + 1] in ['s', 't']:
                pass
            else:
                phonemes.append('D')
        elif letter == 'f':
            if len(word) > pos + 1 and word[pos + 1] in ['-', '_']:
                phonemes.append('V')
            else:
                phonemes.append('F')
        elif letter == 'g':
            if previous == 'n':
                phonemes.append('NG')
            elif len(word) > pos + 1 and word[pos + 1] in ['e', 'i', 'y', accented_e, accented_i]:
                phonemes.append('JH')
            elif len(word) > pos + 1 and word[pos + 1] in ['s', 't']:
                pass
            else:
                phonemes.append('G')
        elif letter == 'h':
            if previous == 'c' and len(word) > pos + 1 and word[pos + 1] == 'r':
                phonemes.append('K')
            elif previous == 'c' and len(word) > pos + 1 and word[pos + 1] != 'r':
                phonemes.append('SH')
            else:
                pass
        elif letter == 'l':
            if word[pos - 2] in ['m', 'v', 'h', 'k'] and previous == 'i' and word[pos - 3] not in ['a',
                                                                                                   '']:  # mil*, vil*
                phonemes.append('L')
            elif word[pos - 3] in ['m', 'v'] and word[pos - 2] == 'i' and previous == 'l' and word[pos - 4] not in ['a',
                                                                                                                    '']:  # mill* ,vill*
                phonemes.append('L')
            elif word[pos - 3] == 'q' and word[pos - 2] == 'u' and previous == 'i':  # tranquil*
                phonemes.append('L')
            elif word[pos - 3] == 'u' and word[pos - 2] == 'i' and previous == 'l' and word[
                        pos - 4] == 'q':  # tranquill*
                phonemes.append('L')
            elif ((previous == 'i' or (previous == 'i' and len(word) > pos + 1 and word[pos + 1] == letter) or (
                                    previous == 'i' and len(word) > pos + 2 and word[pos + 1] == letter and word[
                            pos + 2] == 'e'))):
                phonemes.append('Y')
            elif ((word[pos - 2] == 'i' and previous == letter) or (
                                    word[pos - 2] == 'i' and previous == letter and len(word) > pos + 1 and word[
                            pos + 1] == 'e')):  # il, ill,ille
                phonemes.append('Y')
            else:
                phonemes.append('L')
        elif letter == 'm':
            if previous == 'a' and len(word) > pos + 1 and word[pos + 1] == 'n':
                pass
            elif letter == word[-1] and word[-2] == 'i' and word[-3] == 'a':
                phonemes.append('NG')
            elif previous in ['a', 'e', 'i', 'o', 'u'] and (len(word) == pos + 1 or (
                            len(word) > pos + 1 and word[pos + 1] in ['b', 'c', 'd', 'f', 'g', 'j', 'k', 'l', 'p', 'q',
                                                                      'r',
                                                                      's', 't', 'v', 'w', 'x', 'z'])):
                phonemes.append('NG')
            else:
                phonemes.append('M')
        elif letter == 'n':
            if previous == 'o' and len(word) > pos + 5 and word[pos + 1] == 's' and word[pos + 2] == 'i' and word[
                        pos + 3] == 'e' and word[pos + 4] == 'u' and word[pos + 5] == 'r':
                pass
            elif previous in ['a', 'e', 'i', 'o', 'u', accented_a, accented_e, accented_i, accented_o, accented_u] and (
                            len(word) == pos + 1 or (
                                    len(word) > pos + 1 and word[pos + 1] in ['b', 'c', 'd', 'f', 'g', 'j', 'k', 'l',
                                                                              'n', 'p', 'q',
                                                                              'r', 's', 't', 'v', 'w', 'x',
                                                                              'z'])):  # n was forcefully added
                phonemes.append('NG')
            else:
                phonemes.append('N')
        elif letter == 'p':
            if len(word) > pos + 1 and word[pos + 1] in ['-', '_']:
                phonemes.append('P')
            elif previous == 'm' and len(word) > pos + 1 and word[pos + 1] == 't':  # mpt
                pass
            elif len(word) > pos + 1 and word[pos + 1] == 'h':  # ph
                phonemes.append('F')
            else:
                phonemes.append('P')
        elif letter == 'r':
            if word[pos - 2] == 'e' and previous == 'u':
                phonemes.append('R')
            elif pos + 1 == len(word):
                pass
            else:
                phonemes.append('R')
        elif letter == 's':
            if pos + 1 == len(word) and not ((word[pos - 3] == 'i' and word[pos - 2] == 'l' and previous == 'i') or (
                                word[pos - 3] in ['e', accented_e, 't'] and word[
                                pos - 2] == 'l' and previous == 'a') or (
                                word[pos - 3] == 'f' and word[pos - 2] == 'i' and previous == 'l') or word == 'lis'):
                pass
            elif len(word) > pos + 2 and word[pos + 1] == 'c' and word[pos + 2] == 'h':
                pass
            elif previous in ['d', 't']:
                pass
            elif previous == 'e' and pos + 2 == len(word) and len(word) == 3 and word[pos + 1] == 't':  # est
                pass
            elif previous in ['a', 'e', 'i', 'o', 'u', accented_a, accented_e, accented_i, accented_o,
                              accented_u] and len(word) > pos + 1 and word[pos + 1] in ['a', 'e', 'i', 'o', 'u',
                                                                                        accented_a, accented_e,
                                                                                        accented_i, accented_o,
                                                                                        accented_u]:
                phonemes.append('Z')
            else:
                phonemes.append('S')
        elif letter == 't':
            if pos + 1 == len(word) and previous not in ['i', 'c', accented_i] and word != 'gadget' or word[
                        pos - 2] in ['a', accented_a]:
                pass
            elif len(word) > pos + 1 and word[pos + 1] == 's':
                pass
            elif previous in ['d', 'g']:
                pass
            elif word[pos - 3] == 'p' and word[pos - 2] == 'e' and previous == 'c' and len(word) == pos + 1:
                pass
            elif len(word) > pos + 1 and word[pos + 1] in ['-', '_']:
                phonemes.append('T')
            elif len(word) > pos + 3 and word[pos + 1] == 'i' and word[pos + 2] == 'o' and word[pos + 3] == 'n' or len(
                    word) > pos + 5 and word[pos + 1] == 'i' and word[pos + 2] == 'e' and word[pos + 3] == 'n' and word[
                        pos + 4] == 'c' and word[pos + 5] == 'e':
                phonemes.append('S')  # takes care of words ending with 'ience'
            else:
                phonemes.append('T')
        elif letter == 'w':
            if len(word) > pos + 4 and word[1:] == 'agon':
                phonemes.append('V')  # wagon
            else:
                phonemes.append('W')
        elif letter == 'x':
            if previous == 'u' and pos == len(word):
                pass
            elif len(word) > pos + 1 and word[pos + 1] in ['-', '_']:
                phonemes.append('Z')
            elif (len(word) > pos + 1 and word[pos + 1] in ['b', 'c', 'd', 'f', 'g', 'j', 'k', 'l', 'm', 'n', 'p', 'q',
                                                            'r', 's', 't', 'v', 'w', 'x', 'y', 'z']) or (
                            word[pos - 2] == 't' and previous != 'a'):
                phonemes.append('K')
                phonemes.append('S')
            elif len(word) > pos + 1 and word[pos + 1] in ['a', 'e', 'h', 'i', 'o', 'u', accented_a, accented_e,
                                                           accented_i, accented_o, accented_u] and (
                            word[pos - 2] != 't' and previous not in ['a', accented_a]):
                phonemes.append('Z')
            else:
                phonemes.append('K')
                phonemes.append('S')
        elif letter == 'y':
            if previous == 'a':  # ay
                phonemes.append('EH0')
            else:
                phonemes.append('IH0')
        elif letter == 'z':
            if word[-1] == letter and word[:-1] == 'berlio':
                phonemes.append('Z')
            elif word[-1] == letter and len(word) > 1:
                pass
            else:
                phonemes.append('Z')
        elif letter in easy_consonants:
            phonemes.append(simple_convert[letter])
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1:
            if not recursive:
                phon = breakdownWord(hammer(letter[0]), True)
                if phon:
                    phonemes.append(phon[0])
                    # ~ else:
                    # ~ print "not handled", letter, word
        pos += 1
        previous = letter
    # return " ".join(phonemes)
    # return phonemes
    temp_phonemes = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            temp_phonemes.append(phoneme)
        previous_phoneme = phoneme
    return temp_phonemes
def breakdownWord(input_word, recursive=False):
    """Break a Spanish word down into a list of phoneme codes.

    The word is lower-cased and scanned letter by letter; each letter is
    turned into zero or more phoneme codes depending on the letter before
    and after it.  The raw list is then passed through
    ``stressSpanishWord`` and consecutive duplicate phonemes are collapsed.

    :param input_word: the word to convert (text string).
    :param recursive: True when this call is itself the ``hammer()``
        fallback; it stops the fallback from recursing a second time.
    :return: list of phoneme code strings.
    """
    word = input_word.lower()
    vowels = (u'a', u'e', u'i', u'o', u'u')
    last_index = len(word) - 1
    previous = u''
    breakdown_word = []
    for word_index, letter in enumerate(word):
        is_last = word_index == last_index
        # Empty string when there is no next letter, so that membership and
        # equality tests below are simply False at the end of the word.
        following = u'' if is_last else word[word_index + 1]
        if letter == u'b':
            if word_index == 0 or previous in (u'm', u'n'):
                breakdown_word.append('B')
            else:
                breakdown_word.append('V')
        elif letter == u'c':
            if following == u'h':
                breakdown_word.append('CH')
            elif previous == u'c':
                breakdown_word.append('S')
            elif following == u's':
                pass
            elif following in (u'e', u'i'):
                # should this be SH before 'e', S before 'i' ??
                breakdown_word.append('S')  # South American, Castilian Spanish uses 'TH'
            else:
                breakdown_word.append('K')
        elif letter == u'd':
            if word_index == 0 or previous in (u'l', u'n'):
                breakdown_word.append('D')
            else:
                breakdown_word.append('DH')
        elif letter == u'e':
            if is_last or following in vowels:
                breakdown_word.append('EY0')
            else:
                breakdown_word.append('EH0')
        elif letter == u'\N{LATIN SMALL LETTER E WITH ACUTE}':
            if is_last or following in vowels:
                breakdown_word.append('EY1')
            else:
                breakdown_word.append('EH1')
        elif letter == u'g':
            if following == u'\N{LATIN SMALL LETTER U WITH DIAERESIS}':
                breakdown_word.append('V')
            elif following in (u'e', u'i'):
                breakdown_word.append('HH')
            else:
                breakdown_word.append('G')
        elif letter == u'h':
            pass  # always silent here
        elif letter == u'l':
            if following == u'l':
                pass  # first half of 'll'; the second 'l' emits the sound
            elif previous == u'l':
                breakdown_word.append('Y')
            else:
                breakdown_word.append('L')
        elif letter == u'n':
            if following == u'v':
                breakdown_word.append('M')
            else:
                breakdown_word.append('N')
        elif letter == u'\N{LATIN SMALL LETTER N WITH TILDE}':
            breakdown_word.append('N')
            breakdown_word.append('Y')
        elif letter == u'o':
            if not is_last and following not in vowels:  # last bit necessary ?
                breakdown_word.append('AO0')
            else:
                breakdown_word.append('OW0')
        elif letter == u'\N{LATIN SMALL LETTER O WITH ACUTE}':
            if not is_last and following not in vowels:  # last bit necessary ?
                breakdown_word.append('AO1')
            else:
                breakdown_word.append('OW1')
        elif letter == u'p':
            if not is_last:
                breakdown_word.append('P')
        elif letter == u'r':
            # 'rr' is emitted once (by its first 'r'); single and double 'r'
            # both map to the same 'R' code.
            if previous != u'r':
                breakdown_word.append('R')
        elif letter == u's':
            if following in (u'd', u'g', u'l', u'm', u'n'):
                breakdown_word.append('Z')
            else:
                breakdown_word.append('S')
        elif letter == u'u':
            if previous == u'q':
                pass
            elif previous == u'g' and following in (u'e', u'i'):
                # 'gue' / 'gui': the u is silent.  The original tested for a
                # following 'u' instead of 'e', so 'gue' voiced the u.
                pass
            else:
                breakdown_word.append('UW0')
        elif letter == u'\N{LATIN SMALL LETTER U WITH ACUTE}':
            if previous == u'q':
                pass
            elif previous == u'g' and following in (u'u', u'i'):
                # NOTE(review): kept exactly as in the original; confirm
                # whether an accented u should ever be treated as silent.
                pass
            else:
                breakdown_word.append('UW1')
        elif letter == u'v':
            if word_index == 0 or previous in (u'm', u'n'):
                breakdown_word.append('B')
            else:
                breakdown_word.append('V')
        elif letter == u'x':
            if previous in vowels and following in vowels:
                breakdown_word.append('K')
                breakdown_word.append('S')
            else:
                breakdown_word.append('S')
        elif letter == u'y':
            if len(word) == 1:
                breakdown_word.append('IY1')
            elif is_last:
                breakdown_word.append('IY0')
            else:
                breakdown_word.append('Y')
        elif letter in unconditional_conversions:
            breakdown_word.append(unconditional_conversions[letter])
        elif len(hammer(letter)) == 1:
            # Unknown letter: retry once with whatever hammer() reduces it
            # to, and keep only the first phoneme of that result.
            if not recursive:
                phon = breakdownWord(hammer(letter), True)
                if phon:
                    breakdown_word.append(phon[0])
        previous = letter
    breakdown_word = stressSpanishWord(breakdown_word)
    # Collapse runs of identical consecutive phonemes.
    temp_phonemes = []
    previous_phoneme = " "
    for phoneme in breakdown_word:
        if phoneme != previous_phoneme:
            temp_phonemes.append(phoneme)
        previous_phoneme = phoneme
    return temp_phonemes
def breakdownWord(word, recursive=False):
    """Break a word down into a list of phoneme codes.

    The rules cover single letters plus the digraphs/trigraphs named in the
    branch comments (cs, sz, ssz, zs, zzs, dz, dzs, gy, ly, ny, ty), which
    appears to target Hungarian spelling.  The word is lower-cased, scanned
    letter by letter, and consecutive duplicate phonemes are collapsed
    before returning.

    :param word: the word to convert (text string).
    :param recursive: True when this call is itself the ``hammer()``
        fallback; it stops the fallback from recursing a second time.
    :return: list of phoneme code strings.
    """
    word = word.lower()
    a_acute = u'\N{LATIN SMALL LETTER A WITH ACUTE}'
    simple_convert = {
        'b': 'B',
        u'\N{LATIN SMALL LETTER E WITH ACUTE}': 'EY0',
        'f': 'F',
        'h': 'HH',
        u'\N{LATIN SMALL LETTER I WITH ACUTE}': 'IY0',
        'k': 'K',
        'm': 'M',
        'n': 'N',
        u'\N{LATIN SMALL LETTER O WITH ACUTE}': 'OW0',  # ER0 ? AO0 ?
        'p': 'P',
        'r': 'R',
        't': 'T',
        'u': 'UW0',
        u'\N{LATIN SMALL LETTER U WITH ACUTE}': 'UW0',
        u'\N{LATIN SMALL LETTER O WITH DIAERESIS}': 'ER0',
        u'\N{LATIN SMALL LETTER O WITH DOUBLE ACUTE}': 'ER0',
        u'\N{LATIN SMALL LETTER U WITH DIAERESIS}': 'UW0',  # IH0?
        u'\N{LATIN SMALL LETTER U WITH DOUBLE ACUTE}': 'UW0',
        'v': 'V',
        'w': 'V',
    }
    glides = ('i', 'j', 'y')
    # Vowels that absorb a following i / j / y into a single diphthong code.
    diphthong_heads = ('a', 'o', a_acute)
    length = len(word)
    phonemes = []
    previous = ' '
    for pos, letter in enumerate(word):
        # Empty string when there is no next letter, so comparisons below
        # are simply False at the end of the word.
        following = word[pos + 1] if pos + 1 < length else ''
        after_following = word[pos + 2] if pos + 2 < length else ''
        ly_follows = following == 'l' and after_following == 'y'
        if letter == 'a':
            if following in glides or ly_follows:
                phonemes.append('OY0')
            else:
                phonemes.append('AO0')
        elif letter == a_acute:
            if following in glides or ly_follows:
                phonemes.append('AY0')
            else:
                phonemes.append('AA0')
        elif letter == 'c':
            if following == 's':  # cs
                phonemes.append('CH')
            else:
                phonemes.append('T')
                phonemes.append('S')
        elif letter == 'd':
            # dz / dzs are emitted by the 'z' branch.  The original skipped
            # 'd' before 's' instead, which dropped the D of "ds" and left a
            # stray D in front of "dzs".
            if following != 'z':
                phonemes.append('D')
        elif letter == 'e':
            if previous == 'e':
                pass  # second half of 'ee', already emitted
            elif following == 'e' or ly_follows:
                phonemes.append('EY0')
            else:
                phonemes.append('EH0')
        elif letter == 'g':
            if following == 'y':  # gy
                phonemes.append('JH')
            else:
                phonemes.append('G')
        elif letter == 'i':
            if previous not in diphthong_heads:
                phonemes.append('IH0')  # IY0?
        elif letter == 'j':
            if previous not in diphthong_heads:
                phonemes.append('Y')
        elif letter == 'l':
            # ly is handled under y - close enough to just IY
            if following != 'y':
                phonemes.append('L')
        elif letter == 'o':
            if following in glides or ly_follows:
                phonemes.append('OY0')
            else:
                phonemes.append('AO0')
        elif letter == 'q':  # loan words
            phonemes.append('K')
            phonemes.append('W')
        elif letter == 's':
            if previous == 'c':
                pass  # cs, handled under c
            elif following == 's' and after_following == 'z':
                pass  # first s of ssz; the second s emits the sound
            elif following == 'z' and previous == 's':  # ssz
                phonemes.append('S')
                phonemes.append('S')
            elif following == 'z':  # sz
                phonemes.append('S')
            elif previous == 'z':
                # zs / dzs / zzs, already emitted under z.  The original
                # fell through to 'SH' here, yielding e.g. ZH SH for "zs".
                pass
            else:
                phonemes.append('SH')
        elif letter == 'x':  # loan words only
            phonemes.append('K')
            phonemes.append('S')
        elif letter == 'y':
            if previous in diphthong_heads:
                pass
            elif previous == 'g':
                pass  # handled under g
            elif previous == 't':
                phonemes.append('Y')
            elif previous == 'n':
                pass  # close enough to just n, although more like Spanish ñ
            else:
                phonemes.append('IY0')
        elif letter == 'z':
            if following == 's':
                if previous == 'd':  # dzs
                    phonemes.append('JH')
                elif previous == 'z':  # zzs
                    phonemes.append('ZH')
                    phonemes.append('ZH')
                else:  # zs
                    phonemes.append('ZH')
            elif following == 'z' and after_following == 's':
                pass  # first z of zzs; the second z emits the sound
            elif previous == 'd':  # dz
                phonemes.append('D')
                phonemes.append('S')
            elif previous in ('s', 'c'):
                pass  # handled under s / c
            else:
                phonemes.append('Z')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == " ":
            pass
        elif len(hammer(letter)) == 1:
            # Unknown letter: retry once with whatever hammer() reduces it
            # to, and keep only the first phoneme of that result.
            if not recursive:
                fallback = breakdownWord(hammer(letter), True)
                if fallback:
                    phonemes.append(fallback[0])
        previous = letter
    # Collapse runs of identical consecutive phonemes.
    temp_phonemes = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            temp_phonemes.append(phoneme)
        previous_phoneme = phoneme
    return temp_phonemes
Example #24
0
def breakdownWord(word, recursive=False):
    """Break a Turkish word down into a list of phoneme codes.

    The word is lower-cased using Turkish casing for the letter I, scanned
    letter by letter, and consecutive duplicate phonemes are collapsed
    before returning.

    :param word: the word to convert (text string).
    :param recursive: True when this call is itself the ``hammer()``
        fallback; it stops the fallback from recursing a second time.
    :return: list of phoneme code strings.
    """
    dotless_i = u'\N{LATIN SMALL LETTER DOTLESS I}'
    o_diaeresis = u'\N{LATIN SMALL LETTER O WITH DIAERESIS}'
    # str.lower() is locale-independent: it turns 'I' into dotted 'i' and
    # capital dotted I into 'i' + U+0307 (a stray combining dot).  Apply the
    # Turkish mapping (dotted capital -> i, plain capital -> dotless i)
    # before lowering the rest.
    word = word.replace(u'\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}', 'i')
    word = word.replace('I', dotless_i)
    word = word.lower()
    simple_convert = {
        'b': 'B',
        'c': 'JH',
        u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'CH',
        'd': 'D',
        'f': 'F',
        'g': 'G',
        'h': 'HH',
        dotless_i: 'AH0',
        'i': 'IY0',
        'j': 'ZH',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'n': 'N',  # shadowed by the dedicated 'n' branch below
        o_diaeresis: 'ER0',
        'p': 'P',
        'r': 'R',
        's': 'S',
        u'\N{LATIN SMALL LETTER S WITH CEDILLA}': 'SH',
        't': 'T',
        u'\N{LATIN SMALL LETTER U WITH DIAERESIS}': 'UW0',  # IH0?
        'w': 'V',  # loan-words
        'z': 'Z',
    }
    # Consonants after which a circumflexed vowel gets a leading IY0.
    palatalising = ('g', 'k', 'l')
    length = len(word)
    phonemes = []
    previous = ' '
    for pos, letter in enumerate(word):
        # Empty string when there is no next letter, so comparisons below
        # are simply False at the end of the word.
        following = word[pos + 1] if pos + 1 < length else ''
        if letter == 'a':
            phonemes.append('AY0' if following == 'y' else 'AA0')
        elif letter == u'\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}':
            if previous in palatalising:
                phonemes.append('IY0')
            phonemes.append('AA0')
        elif letter == 'e':
            phonemes.append('EY0' if following == 'y' else 'EH0')
        elif letter == u'\N{LATIN SMALL LETTER G WITH BREVE}':
            pass  # emits no phoneme of its own
        elif letter == 'n':
            phonemes.append('M' if following == 'b' else 'N')
        elif letter == 'o':
            phonemes.append('OY0' if following == 'y' else 'OW0')
        elif letter == 'q':  # loan-words
            phonemes.append('K')
        elif letter == u'\N{LATIN SMALL LETTER U WITH CIRCUMFLEX}':
            if previous in palatalising:
                phonemes.append('IY0')
            phonemes.append('UW0')
        elif letter == 'u':
            phonemes.append('IY0' if following == 'y' else 'UH0')
        elif letter == 'v':
            phonemes.append('W' if isvowel(previous) else 'V')
        elif letter == 'x':  # loan-words
            phonemes.append('K')
            phonemes.append('S')
        elif letter == 'y':
            # After these vowels the y was already folded into the vowel's
            # own code (or is simply dropped).
            if previous not in ('a', 'e', 'o', 'u', o_diaeresis):
                phonemes.append('Y')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == " ":
            pass
        elif len(hammer(letter)) == 1:
            # Unknown letter: retry once with whatever hammer() reduces it
            # to, and keep only the first phoneme of that result.
            if not recursive:
                fallback = breakdownWord(hammer(letter), True)
                if fallback:
                    phonemes.append(fallback[0])
        previous = letter
    # Collapse runs of identical consecutive phonemes.
    temp_phonemes = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            temp_phonemes.append(phoneme)
        previous_phoneme = phoneme
    return temp_phonemes
def breakdownWord(word, recursive=False):
    """Break a word down into a list of phoneme codes.

    Handles the vowels and diphthongs named in the branch comments (ai, ei,
    oi, ui, yi, au, ou, ie, uo), the 'ng' cluster, and a table of plain and
    foreign/borrowed letters; the rules appear to target Finnish.  The word
    is lower-cased, scanned letter by letter, and consecutive duplicate
    phonemes are collapsed before returning.

    :param word: the word to convert (text string).
    :param recursive: True when this call is itself the ``hammer()``
        fallback; it stops the fallback from recursing a second time.
    :return: list of phoneme code strings, one code per element.
    """
    word = word.lower()
    # Values may hold several space-separated codes (see 'N Y'); they are
    # split into individual phonemes when used.
    simple_convert = {
        'd': 'D',
        'h': 'HH',
        'j': 'Y',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'p': 'P',
        'r': 'R',
        's': 'S',
        't': 'T',
        'v': 'V',
        # in foreign and borrowed words and names
        u'\N{LATIN SMALL LETTER S WITH CARON}': 'SH',  # š
        u'\N{LATIN SMALL LETTER Z WITH CARON}': 'ZH',  # ž
        u'\N{LATIN SMALL LETTER A WITH ACUTE}': 'AA0',  # ??? # á
        u'\N{LATIN SMALL LETTER A WITH GRAVE}': 'AA0',  # à
        u'\N{LATIN SMALL LETTER AE}': 'AE0',  # æ - Norwegian / Danish
        'b': 'B',
        'c': 'K',  # S ???
        u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'SH',  # ç - French, etc
        u'\N{LATIN SMALL LETTER C WITH CARON}': 'S',  # ??? - Northern Sámi
        u'\N{LATIN SMALL LETTER D WITH STROKE}': 'D',  # ??? - Northern Sámi
        u'\N{LATIN SMALL LETTER ETH}': 'DH',  # ð - Icelandic
        u'\N{LATIN SMALL LETTER E WITH ACUTE}': 'EY0',  # é
        u'\N{LATIN SMALL LETTER E WITH DIAERESIS}': 'EH0',  # ??? # ë - scientific names
        'f': 'F',
        u'\N{LATIN SMALL LETTER G WITH STROKE}': 'G',  # ??? - other Sámi
        u'\N{LATIN SMALL LETTER G WITH BREVE}': 'G',  # ??? - other Sámi
        u'\N{LATIN SMALL LETTER N WITH TILDE}': 'N Y',  # ñ - Spanish
        u'\N{LATIN SMALL LETTER ENG}': 'N',  # - Northern Sámi
        u'\N{LATIN SMALL LETTER O WITH STROKE}': 'ER0',  # ??? # ø - Norwegian / Danish
        u'\N{LATIN SMALL LETTER O WITH TILDE}': 'ER0',  # ??? # õ - Estonian
        'q': 'K',
        u'\N{LATIN SMALL LETTER SHARP S}': 'S',  # ß - German
        u'\N{LATIN SMALL LETTER T WITH STROKE}': 'T',  # - Northern Sámi
        u'\N{LATIN SMALL LETTER THORN}': 'TH',  # Þ - Icelandic
        # NOTE(review): this key is ö, which the dedicated ö branch below
        # always handles first; the original comment said "ü - German /
        # Estonian", so the key may have been meant to be U WITH DIAERESIS.
        u'\N{LATIN SMALL LETTER O WITH DIAERESIS}': 'ER0',  # ???
        'w': 'V',
        'z': 'Z',
    }
    # Diphthong ending in i, keyed by the vowel before it.
    prev_match_i = {
        'a': 'AY0',  # ai
        'e': 'EY0',  # ei
        'o': 'OY0',  # oi
        'u': 'UW0',  # ui
        'y': 'IY0',  # yi
        # äi ???  öi ???
    }
    # Diphthong ending in u, keyed by the vowel before it.
    prev_match_u = {
        'a': 'AW0',  # au
        'o': 'OW0',  # AO??? # ou
        # eu ???  iu ???
    }
    length = len(word)
    phonemes = []
    previous = ' '
    for pos, letter in enumerate(word):
        # Empty string when there is no next letter, so comparisons below
        # are simply False at the end of the word.
        following = word[pos + 1] if pos + 1 < length else ''
        if letter == 'a':
            # before i / u the diphthong is emitted by the following letter
            if following not in ('i', 'u'):
                phonemes.append('AA0')
        elif letter == 'e':
            if following == 'i':
                pass  # handled under following letter
            elif previous == 'i':  # ie
                phonemes.append('IY0')  # ???
            else:
                phonemes.append('EH0')
        elif letter == 'i':
            phonemes.append(prev_match_i.get(previous, 'IH0'))
        elif letter == 'o':
            if following in ('i', 'u'):
                pass  # handled under following letter
            elif previous == 'u':  # uo
                phonemes.append('OW0')  # ???
            else:
                phonemes.append('OY0')
        elif letter == 'u':
            if following == 'i':
                pass  # handled under following letter
            else:
                phonemes.append(prev_match_u.get(previous, 'UH0'))
        elif letter == 'y':
            # äy ???  öy ???
            if following != 'i':  # yi is handled under i
                phonemes.append('UW0')  # ???
        elif letter == u'\N{LATIN SMALL LETTER A WITH DIAERESIS}':  # ä
            phonemes.append('AE0')
        elif letter == u'\N{LATIN SMALL LETTER O WITH DIAERESIS}':  # ö
            # yö ???
            phonemes.append('ER0')  # ???
        elif letter == 'g':
            phonemes.append('NG' if previous == 'n' else 'G')
        elif letter == 'n':
            if following != 'g':  # ng is handled under g
                phonemes.append('N')
        elif letter in simple_convert:
            # split() so a multi-code entry such as 'N Y' becomes separate
            # phonemes instead of one bogus 'N Y' token.
            phonemes.extend(simple_convert[letter].split())
        elif len(hammer(letter)) == 1:
            # Unknown letter: retry once with whatever hammer() reduces it
            # to, and keep only the first phoneme of that result.
            if not recursive:
                phon = breakdownWord(hammer(letter), True)
                if phon:
                    phonemes.append(phon[0])
        previous = letter
    # Collapse runs of identical consecutive phonemes.
    temp_phonemes = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            temp_phonemes.append(phoneme)
        previous_phoneme = phoneme
    return temp_phonemes
def breakdownWord(word, recursive=False):
    """Break a Russian word into a list of phoneme codes.

    The word is lower-cased and scanned one letter at a time.  Latin
    transliteration and Cyrillic letters are both recognised.  Letters whose
    sound depends on their neighbours or on their position in the word
    (``c``, ``e``, ``g``, ``s``, ``y``, ``z``, word-final ``b``/``d``/``v``,
    and the Cyrillic ya/yu/io/tse/shcha) are handled by explicit rules;
    everything else is looked up in ``simple_convert``.

    A letter that is still unknown is passed through ``hammer`` (defined
    elsewhere in this file; presumably it reduces a character to a plain
    ASCII equivalent -- confirm) and, if that yields a single character,
    that character is broken down with one recursive call.

    Args:
        word: the text of the word to convert.
        recursive: True when this call is the fallback for a single
            ``hammer``-ed letter; it stops the fallback from recursing again.

    Returns:
        A list of phoneme strings in which runs of identical consecutive
        phonemes have been collapsed to one entry.
    """
    word = word.lower()
    phonemes = []
    simple_convert = {
        'a': 'AA0',
        'b': 'B',
        'd': 'D',
        'f': 'F',
        'h': 'HH',
        'i': 'IY0',
        'j': 'IY0',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'n': 'N',
        'o': 'AO0',
        'p': 'P',
        'r': 'R',
        't': 'T',
        'u': 'UW0',
        'v': 'V',
        'x': 'HH',  # use 'Y' ?? 'K'??
        u'\N{LATIN SMALL LETTER S WITH CARON}': 'SH',  # š
        u'\N{LATIN SMALL LETTER Z WITH CARON}': 'ZH',  # ž
        u'\N{LATIN CAPITAL LETTER S WITH CARON}': 'SH',  # Š
        u'\N{LATIN CAPITAL LETTER Z WITH CARON}': 'ZH',  # Ž
        # Cyrillic
        u'\N{CYRILLIC SMALL LETTER A}': 'AA0',
        u'\N{CYRILLIC SMALL LETTER BE}': 'B',
        u'\N{CYRILLIC SMALL LETTER VE}': 'V',
        u'\N{CYRILLIC SMALL LETTER CHE}': 'CH',
        u'\N{CYRILLIC SMALL LETTER DE}': 'D',
        u'\N{CYRILLIC SMALL LETTER E}': 'EH0',
        u'\N{CYRILLIC SMALL LETTER EF}': 'F',
        u'\N{CYRILLIC SMALL LETTER GHE}': 'G',
        u'\N{CYRILLIC SMALL LETTER I}': 'IY0',
        u'\N{CYRILLIC SMALL LETTER HA}': 'HH',  # 'Y'?? 'K'??
        u'\N{CYRILLIC SMALL LETTER SHORT I}': 'IY0',
        u'\N{CYRILLIC SMALL LETTER KA}': 'K',
        u'\N{CYRILLIC SMALL LETTER EL}': 'L',
        u'\N{CYRILLIC SMALL LETTER EM}': 'M',
        u'\N{CYRILLIC SMALL LETTER EN}': 'N',
        u'\N{CYRILLIC SMALL LETTER O}': 'AO0',
        u'\N{CYRILLIC SMALL LETTER PE}': 'P',
        u'\N{CYRILLIC SMALL LETTER ER}': 'R',
        u'\N{CYRILLIC SMALL LETTER ES}': 'S',
        u'\N{CYRILLIC SMALL LETTER SHA}': 'SH',
        u'\N{CYRILLIC SMALL LETTER TE}': 'T',
        u'\N{CYRILLIC SMALL LETTER U}': 'UW0',
        # NOTE(review): the hard/soft signs map to an empty phoneme string,
        # which is emitted into the result as '' -- confirm callers expect it.
        u'\N{CYRILLIC SMALL LETTER HARD SIGN}': '',
        u'\N{CYRILLIC SMALL LETTER SOFT SIGN}': '',
        u'\N{CYRILLIC SMALL LETTER YERU}': 'IH0',
        u'\N{CYRILLIC SMALL LETTER ZHE}': 'ZH',
        u'\N{CYRILLIC SMALL LETTER ZE}': 'Z',

        # Capital forms are kept for completeness; the word is lower-cased
        # above, so these entries are not reached for ordinary input.
        u'\N{CYRILLIC CAPITAL LETTER A}': 'AA0',
        u'\N{CYRILLIC CAPITAL LETTER BE}': 'B',
        u'\N{CYRILLIC CAPITAL LETTER VE}': 'V',
        u'\N{CYRILLIC CAPITAL LETTER CHE}': 'CH',
        u'\N{CYRILLIC CAPITAL LETTER DE}': 'D',
        u'\N{CYRILLIC CAPITAL LETTER E}': 'EH0',
        u'\N{CYRILLIC CAPITAL LETTER EF}': 'F',
        u'\N{CYRILLIC CAPITAL LETTER GHE}': 'G',
        u'\N{CYRILLIC CAPITAL LETTER I}': 'IY0',
        u'\N{CYRILLIC CAPITAL LETTER HA}': 'HH',  # 'Y'?? 'K'??
        u'\N{CYRILLIC CAPITAL LETTER SHORT I}': 'IY0',
        u'\N{CYRILLIC CAPITAL LETTER KA}': 'K',
        u'\N{CYRILLIC CAPITAL LETTER EL}': 'L',
        u'\N{CYRILLIC CAPITAL LETTER EM}': 'M',
        u'\N{CYRILLIC CAPITAL LETTER EN}': 'N',
        u'\N{CYRILLIC CAPITAL LETTER O}': 'AO0',
        u'\N{CYRILLIC CAPITAL LETTER PE}': 'P',
        u'\N{CYRILLIC CAPITAL LETTER ER}': 'R',
        u'\N{CYRILLIC CAPITAL LETTER ES}': 'S',
        u'\N{CYRILLIC CAPITAL LETTER SHA}': 'SH',
        u'\N{CYRILLIC CAPITAL LETTER TE}': 'T',
        u'\N{CYRILLIC CAPITAL LETTER U}': 'UW0',
        u'\N{CYRILLIC CAPITAL LETTER HARD SIGN}': '',
        u'\N{CYRILLIC CAPITAL LETTER SOFT SIGN}': '',
        u'\N{CYRILLIC CAPITAL LETTER YERU}': 'IH0',
        u'\N{CYRILLIC CAPITAL LETTER ZHE}': 'ZH',
        u'\N{CYRILLIC CAPITAL LETTER ZE}': 'Z',
    }
    # Latin consonants that lose their voicing at the very end of a word.
    final_devoiced = {'b': 'P', 'd': 'T', 'v': 'F'}

    length = len(word)
    previous = ' '
    for pos, letter in enumerate(word):
        is_last = pos + 1 == length
        following = None if is_last else word[pos + 1]

        if letter == 'c':
            if following == 'h':
                if previous == 's':
                    phonemes.append('SH')  # "sch" as in freSH CHeese
                phonemes.append('CH')
            else:
                phonemes.append('T')
                phonemes.append('S')
        elif is_last and letter in final_devoiced:
            phonemes.append(final_devoiced[letter])
        elif letter in ('e', u'\N{CYRILLIC SMALL LETTER IE}'):
            # A word-initial e is iotated: "ye".
            if pos == 0:
                phonemes.append('Y')
            phonemes.append('EH0')
        elif letter in ('^', 'h'):
            # Neither produces a phoneme by itself; 'h' only matters as the
            # second letter of the ch / sh / zh digraphs handled elsewhere.
            pass
        elif letter == 'g':
            if is_last:
                phonemes.append('K')
            elif previous in ('e', 'o') and pos + 2 == length and following == 'o':
                phonemes.append('V')  # possessive endings -ogo and -ego
            else:
                phonemes.append('G')
        elif letter == 's':
            phonemes.append('SH' if following == 'h' else 'S')
        elif letter == 'y':
            phonemes.append('Y' if following == 'a' else 'IH0')
        elif letter == 'z':
            if following == 'h':
                # "zh" is devoiced to SH when it ends the word.
                phonemes.append('SH' if pos + 2 == length else 'ZH')
            else:
                phonemes.append('S' if is_last else 'Z')
        elif letter in (u'\N{CYRILLIC CAPITAL LETTER SHCHA}',
                        u'\N{CYRILLIC SMALL LETTER SHCHA}'):
            phonemes.append('SH')
        elif letter in (u'\N{CYRILLIC CAPITAL LETTER TSE}',
                        u'\N{CYRILLIC SMALL LETTER TSE}'):
            phonemes.append('T')
            phonemes.append('S')
        elif letter in (u'\N{CYRILLIC CAPITAL LETTER YA}',
                        u'\N{CYRILLIC SMALL LETTER YA}'):
            if pos == 0:
                phonemes.append('IY0')
            phonemes.append('AA1')
        elif letter in (u'\N{CYRILLIC CAPITAL LETTER YU}',
                        u'\N{CYRILLIC SMALL LETTER YU}'):
            if pos == 0:
                phonemes.append('Y')
            phonemes.append('UW0')
        elif letter in (u'\N{LATIN SMALL LETTER E WITH DIAERESIS}',
                        u'\N{CYRILLIC SMALL LETTER IO}'):
            if pos == 0:
                phonemes.append('Y')
            phonemes.append('AO0')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1 and not recursive:
            # Fallback: break down the hammer()-ed character once and keep
            # its first phoneme.  Appending the whole list here would nest a
            # list inside the result instead of adding a phoneme string.
            phon = breakdownWord(hammer(letter), True)
            if phon:
                phonemes.append(phon[0])
        previous = letter

    # Collapse runs of identical consecutive phonemes (e.g. doubled letters).
    collapsed = []
    for phoneme in phonemes:
        if not collapsed or phoneme != collapsed[-1]:
            collapsed.append(phoneme)
    return collapsed
Example #27
0
def breakdownWord(word, recursive=False):
    """Break a Portuguese word into a list of phoneme codes.

    The word is lower-cased and scanned one letter at a time.  Vowels
    (with their accented forms) and the context-sensitive consonants
    ``c``, ``g``, ``h``, ``m``, ``n``, ``s``, ``x`` plus the ``qu``/``gu``
    digraphs are handled by explicit rules; the remaining consonants are
    looked up in ``simple_convert``.

    A letter that is still unknown is passed through ``hammer`` (defined
    elsewhere in this file; presumably it reduces a character to a plain
    ASCII equivalent -- confirm) and, if that yields a single character,
    that character is broken down with one recursive call.  ``isvowel`` is
    likewise defined elsewhere in this file.

    Args:
        word: the text of the word to convert.
        recursive: True when this call is the fallback for a single
            ``hammer``-ed letter; it stops the fallback from recursing again.

    Returns:
        A list of phoneme strings in which runs of identical consecutive
        phonemes have been collapsed to one entry.
    """
    word = word.lower()
    phonemes = []
    simple_convert = {
        'b': 'B',
        'd': 'D',
        'f': 'F',
        'j': 'ZH',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'n': 'N',
        'p': 'P',
        'q': 'K',
        'r': 'R',
        't': 'T',
        'v': 'V',
        'w': 'W',
        'y': 'IY0',
        'z': 'Z',
        u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'S',  # ç
    }
    # e, i, é, í, ê, î: these soften a preceding c/g and silence the u of
    # the qu/gu digraphs.
    front_vowels = (
        'e', 'i', u'\N{LATIN SMALL LETTER E WITH ACUTE}',
        u'\N{LATIN SMALL LETTER I WITH ACUTE}',
        u'\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}',
        u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}',
    )
    # i, o, u, í, ó, ú, î, ô, õ: a word-final m after one of these is silent.
    nasal_m_vowels = (
        'i', 'o', 'u', u'\N{LATIN SMALL LETTER I WITH ACUTE}',
        u'\N{LATIN SMALL LETTER O WITH ACUTE}',
        u'\N{LATIN SMALL LETTER U WITH ACUTE}',
        u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}',
        u'\N{LATIN SMALL LETTER O WITH CIRCUMFLEX}',
        u'\N{LATIN SMALL LETTER O WITH TILDE}',
    )

    length = len(word)
    previous = ' '
    for pos, letter in enumerate(word):
        has_next = pos + 1 < length
        is_last = not has_next
        following = word[pos + 1] if has_next else None
        before_front_vowel = following in front_vowels

        # A
        if letter in (
                'a', u'\N{LATIN SMALL LETTER A WITH ACUTE}',
                u'\N{LATIN SMALL LETTER A WITH GRAVE}',
                u'\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}',
        ):
            phonemes.append('AA0')
        elif letter == u'\N{LATIN SMALL LETTER A WITH TILDE}':
            phonemes.append('AE0')
        # E
        elif letter in ('e', u'\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}'):
            phonemes.append('EY0')
        elif letter == u'\N{LATIN SMALL LETTER E WITH ACUTE}':
            phonemes.append('EH0')
        # I
        elif letter in (
                'i', u'\N{LATIN SMALL LETTER I WITH ACUTE}',
                u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}',
        ):
            phonemes.append('IY0')
        # O
        elif letter in ('o', u'\N{LATIN SMALL LETTER O WITH CIRCUMFLEX}'):
            phonemes.append('OW0')
        elif letter == u'\N{LATIN SMALL LETTER O WITH ACUTE}':
            phonemes.append('OY0')
        elif letter == u'\N{LATIN SMALL LETTER O WITH TILDE}':
            phonemes.append('AW0')
        # U
        elif letter in ('u', u'\N{LATIN SMALL LETTER U WITH ACUTE}'):
            # Digraphs qu and gu followed by e or i: the u is silent and the
            # consonant sound is repeated (the duplicate is collapsed at the
            # end): aquilo, questão, quilo, querida, guerra, águia.
            # Known gap: words where the u IS pronounced, e.g. cinquenta,
            # frequente, tranquilo, linguiça, aguentar.
            if previous == 'q' and before_front_vowel:
                phonemes.append('K')
            elif previous == 'g' and before_front_vowel:
                phonemes.append('G')
            else:
                phonemes.append('UW0')
        # C
        elif letter == 'c':
            if previous in ('s', 'x'):
                # Digraphs sc / xc (aSCender, eXCelente).  The preceding s/x
                # emitted nothing, so this branch produces the whole sound and
                # must not fall through to the plain ce/ci rule as well.
                phonemes.append('S')
                if not before_front_vowel:
                    phonemes.append('K')
            elif before_front_vowel:  # ce, ci
                phonemes.append('S')
            else:
                phonemes.append('K')
        # G
        elif letter == 'g':
            # ge, gi are soft
            phonemes.append('ZH' if before_front_vowel else 'G')
        # H
        elif letter == 'h':
            # Silent, except that it completes the digraph nh.
            if previous == 'n':
                phonemes.append('N')
        # M
        elif letter == 'm':
            # Silent when it only nasalises the preceding vowel: at the very
            # end of the word after one of nasal_m_vowels, or before a
            # consonant.  The end-of-word test is on this letter's position,
            # so an m between vowels is still pronounced in words that also
            # happen to end in m (e.g. "comum").
            silent = (previous in nasal_m_vowels and is_last) or (
                has_next and not isvowel(following))
            if not silent:
                phonemes.append('M')
        # N
        elif letter == 'n':
            if following == 'h':
                pass  # nh is handled under H
            elif (isvowel(previous) and is_last) or (
                    has_next and not isvowel(following)):
                pass  # nasalises the preceding vowel: an en in on un
            else:
                phonemes.append('N')
        # S
        elif letter == 's':
            if following == 'c':
                pass  # sc is handled under C
            elif isvowel(previous) and has_next and isvowel(following):
                phonemes.append('Z')  # s between two vowels: caSa
            else:
                phonemes.append('S')
        # X
        elif letter == 'x':
            if following == 'c':
                pass  # xc is handled under C
            else:
                # Some words pronounce x as "KS" instead (táxi = T A K S I);
                # those exceptions are not handled.
                phonemes.append('SH')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1 and not recursive:
            # Fallback: break down the hammer()-ed character once and keep
            # its first phoneme.
            phon = breakdownWord(hammer(letter), True)
            if phon:
                phonemes.append(phon[0])
        previous = letter

    # Collapse runs of identical consecutive phonemes (e.g. the K K produced
    # by "qu" + front vowel).
    collapsed = []
    for phoneme in phonemes:
        if not collapsed or phoneme != collapsed[-1]:
            collapsed.append(phoneme)
    return collapsed
Example #28
0
    def ImportMultipleAudioFiles(self, audioFolderPath, textPath):
        """Import every audio file in a folder as a voice of the current document.

        Optionally reads a UTF-8 text file of ``name = spoken text`` lines and
        uses it to set the text of the voice whose name matches the audio
        file's base name.  Voices whose text changed get their audio opened
        and their breakdown re-run, and are listed in a result dialogue that
        is shown modally at the end.

        Args:
            audioFolderPath: folder that is scanned (non-recursively, via
                ``os.listdir``) for files ending in one of
                ``audioFileExtensions``.
            textPath: path of the text file with the spoken lines, or None
                to import without any text.
        """
        language = self.languageChoice.GetStringSelection()

        spokenTexts = {}
        # if we have a text file with spoken words then build dictionary of them
        if textPath is not None:
            # NOTE(review): this file handle is never closed explicitly.
            textFile = codecs.open(textPath, "r", "utf-8", "replace")

            for line in textFile.readlines():
                # Normalise Mac-style apostrophes and other awkward characters.
                # The commented-out replacements below were superseded by hammer().
                # line = line.replace("’","'")# hammer(line)
                # line = line.replace(u".","")
                # line = line.replace(u"\"","")
                # line = line.replace(u"“","")
                # line = line.replace(u"”","")
                line = hammer(line)

                print line

                # Only "name = text" lines carry data.
                if not "=" in line:
                    continue

                # NOTE(review): this keeps the text AFTER the first '#', not
                # the text before it -- confirm that is the intended format.
                if "#" in line:
                    line = line.split("#")[1]

                splitted = line.split("=")

                # Left of the first '=' is the voice name, the next field is
                # the spoken text; anything after a second '=' is ignored.
                name = hammer(splitted[0].strip())
                spokenTexts[name] = hammer(splitted[1].strip())

        print "creating new doc"

        if self.doc is None:
            self.CreateNewDoc()

        resultDialogue = MultiImportResultDialogue(self)

        print "after new doc"

        # next go through audio folder
        for fileName in os.listdir(audioFolderPath):
            for extension in audioFileExtensions:
                if fileName.endswith(extension):
                    absAudioPath = os.path.join(audioFolderPath, fileName)
                    # Voice name is the part of the file name before the first '.'.
                    voiceName = hammer(fileName.split(".")[0])
                    voice = None

                    # if there already is a voice in the document that has the same name then we use that
                    for exisitingVoice in self.doc.voices:
                        if exisitingVoice.name == voiceName:
                            voice = exisitingVoice

                    # Otherwise create a new voice and add it to both the
                    # document and the voice list widget.
                    if voice is None:
                        voice = LipsyncVoice(self.doc)
                        voice.name = voiceName
                        self.doc.voices.append(voice)
                        self.voiceList.Insert(voice.name, self.voiceList.GetCount())

                    if voiceName in spokenTexts:  # is the audio to be found in the list of names we have?
                        print "found it!"
                        if voice.text != spokenTexts[voiceName]:  # has what is being said changed?
                            # NOTE(review): this message is printed when the
                            # text HAS changed, despite its wording.
                            print "and it is unchanged"

                            voice.OpenAudio(absAudioPath)
                            voice.text = spokenTexts[voiceName]
                            voice.RunBreakdown(self, language, self.langman)
                            resultDialogue.Changed_Files_List.Insert(voice.name, 0)

        # Select the first voice of the document and refresh the views.
        if len(self.doc.voices) >= 1:
            self.doc.currentVoice = self.doc.voices[0]
            print "setting voice"
            self.SetVoice(self.doc.currentVoice)
            print "after setting voice"
            self.waveformView.UpdateDrawing()
            self.mouthView.DrawMe()

        resultDialogue.ShowModal()
        resultDialogue.Destroy()

        self.EnableGUI()