Example #1
0
def breakdownWord(word, recursive=False):
    """Break a word into a list of ARPAbet-style phoneme codes, letter by letter.

    The rules handle the letters with cedilla, breve, circumflex and the
    dotless i (presumably Turkish orthography -- the source does not name
    the language, confirm against the caller).

    Args:
        word: Text to convert; it is lower-cased before processing.
        recursive: True when called from the fallback branch below, which
            prevents a second level of fallback for an unknown letter.

    Returns:
        A list of phoneme strings with immediately repeated phonemes
        collapsed into one.

    Letters not covered by any rule are passed to the external ``hammer``
    helper (defined elsewhere); if it yields a single character, that
    character is converted instead and its first phoneme is used.
    """
    word = word.lower()
    # A set membership test works on both Python 2 and Python 3; the former
    # ``dict.has_key`` bound method does not exist on Python 3.
    vowels = frozenset(
        u'aeiou\N{LATIN SMALL LETTER DOTLESS I}'
        u'\N{LATIN SMALL LETTER O WITH DIAERESIS}\N{LATIN SMALL LETTER U WITH DIAERESIS}'
        u'\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}\N{LATIN SMALL LETTER U WITH CIRCUMFLEX}')
    a_circumflex = u'\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}'
    u_circumflex = u'\N{LATIN SMALL LETTER U WITH CIRCUMFLEX}'
    soft_g = u'\N{LATIN SMALL LETTER G WITH BREVE}'
    o_diaeresis = u'\N{LATIN SMALL LETTER O WITH DIAERESIS}'
    phonemes = []
    # Letters whose phoneme does not depend on context.  'n' is listed but is
    # always caught by its own context-sensitive branch first.
    simple_convert = {
        'b': 'B',
        'c': 'JH',
        u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'CH',
        'd': 'D',
        'f': 'F',
        'g': 'G',
        'h': 'HH',
        u'\N{LATIN SMALL LETTER DOTLESS I}': 'AH0',
        'i': 'IY0',
        'j': 'ZH',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'n': 'N',
        u'\N{LATIN SMALL LETTER O WITH DIAERESIS}': 'ER0',
        'p': 'P',
        'r': 'R',
        's': 'S',
        u'\N{LATIN SMALL LETTER S WITH CEDILLA}': 'SH',
        't': 'T',
        u'\N{LATIN SMALL LETTER U WITH DIAERESIS}': 'UW0',  # IH0?
        'w': 'V',  # loan-words
        'z': 'Z',
    }
    length = len(word)
    previous = ' '
    for pos, letter in enumerate(word):
        # Empty string when there is no following letter, so comparisons
        # against a real letter are simply False at the end of the word.
        next_letter = word[pos + 1] if pos + 1 < length else ''
        if letter == 'a':
            phonemes.append('AY0' if next_letter == 'y' else 'AA0')
        elif letter == a_circumflex:
            # After g, k or l the circumflex inserts a leading IY0.
            if previous in ('g', 'k', 'l'):
                phonemes.append('IY0')
            phonemes.append('AA0')
        elif letter == 'e':
            phonemes.append('EY0' if next_letter == 'y' else 'EH0')
        elif letter == soft_g:
            pass  # produces no phoneme of its own
        elif letter == 'n':
            # 'n' directly before 'b' is rendered as M.
            phonemes.append('M' if next_letter == 'b' else 'N')
        elif letter == 'o':
            phonemes.append('OY0' if next_letter == 'y' else 'OW0')
        elif letter == 'q':  # loan-words
            phonemes.append('K')
        elif letter == u_circumflex:
            if previous in ('g', 'k', 'l'):
                phonemes.append('IY0')
            phonemes.append('UW0')
        elif letter == 'u':
            phonemes.append('IY0' if next_letter == 'y' else 'UH0')
        elif letter == 'v':
            phonemes.append('W' if previous in vowels else 'V')
        elif letter == 'x':  # loan-words
            phonemes.append('K')
            phonemes.append('S')
        elif letter == 'y':
            # After these vowels the 'y' was already folded into the
            # diphthong emitted for the vowel.
            if previous not in ('a', 'e', 'o', 'u', o_diaeresis):
                phonemes.append('Y')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == " ":
            pass
        elif len(hammer(letter)) == 1:
            # Unknown letter: retry once with whatever ``hammer`` maps it to.
            if not recursive:
                phon = breakdownWord(hammer(letter), True)
                if phon:
                    phonemes.append(phon[0])
        previous = letter
    # Collapse runs of the same phoneme into a single occurrence.
    collapsed = []
    for phoneme in phonemes:
        if not collapsed or phoneme != collapsed[-1]:
            collapsed.append(phoneme)
    return collapsed
Example #2
0
def breakdownWord(word, recursive=False):
    """Break a word into a list of ARPAbet-style phoneme codes, letter by letter.

    The rules cover umlauts, sharp s, and digraphs such as "sch", "ch",
    "ei", "ie", "eu" and "au" (presumably German orthography -- the source
    does not name the language, confirm against the caller).

    Args:
        word: Text to convert; it is lower-cased before processing.
        recursive: True when called from the fallback branch below, which
            prevents a second level of fallback for an unknown letter.

    Returns:
        A list of phoneme strings with immediately repeated phonemes
        collapsed into one.

    Letters not covered by any rule are passed to the external ``hammer``
    helper (defined elsewhere); if it yields a single character, that
    character is converted instead and its first phoneme is used.
    """
    word = word.lower()
    a_umlaut = u'\N{LATIN SMALL LETTER A WITH DIAERESIS}'
    o_umlaut = u'\N{LATIN SMALL LETTER O WITH DIAERESIS}'
    u_umlaut = u'\N{LATIN SMALL LETTER U WITH DIAERESIS}'
    sharp_s = u'\N{LATIN SMALL LETTER SHARP S}'
    # Built from unicode characters so the umlauts match on Python 2 as well,
    # and tested with ``in`` because ``dict.has_key`` is gone on Python 3.
    vowels = frozenset(u'aeiou' + a_umlaut + o_umlaut + u_umlaut)
    phonemes = []
    # Letters whose phoneme does not depend on context.
    simple_convert = {
        'f': 'F',
        'j': 'Y',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'p': 'P',
        'q': 'K',
        'r': 'R',  # use AH0 or ER0 for final letter in word ??
        u'\N{LATIN SMALL LETTER SHARP S}': 'S',
        't': 'T',
        'v': 'F',  # non-native loan-words, 'V'
        'w': 'V',
        'y': 'IH0',  # actual pronunciation varies with word origin
    }
    length = len(word)
    previous = ' '
    for pos, letter in enumerate(word):
        # Empty string when there is no following letter, so comparisons
        # against a real letter are simply False at the end of the word.
        next_letter = word[pos + 1] if pos + 1 < length else ''
        is_last = pos + 1 == length
        # The vowel is treated as short when a doubled consonant or a sharp s
        # follows it.
        short_vowel = (
            (pos + 2 < length and next_letter == word[pos + 2]
             and next_letter not in vowels)
            or next_letter == sharp_s)
        if letter == previous and letter not in vowels:
            pass  # second half of a doubled consonant
        elif letter == 'a':
            if next_letter == 'i':  # ai
                phonemes.append('AY0')
            elif next_letter == 'u':  # au
                phonemes.append('AW0')
            elif previous == 'a':
                pass  # second half of "aa"
            elif short_vowel:
                phonemes.append('AH0')
            else:
                phonemes.append('AA0')
        elif letter == a_umlaut:
            if next_letter == 'u':  # a-umlaut + u
                phonemes.append('OY0')
            elif short_vowel:
                phonemes.append('EH0')
            else:
                phonemes.append('EY0')
        elif letter == 'b':
            # Devoiced at the end of a word and before s or t.
            if is_last or next_letter in ('s', 't'):
                phonemes.append('P')
            else:
                phonemes.append('B')
        elif letter == 'c':
            if previous == 's' and next_letter == 'h':  # sch
                phonemes.append('SH')
            elif next_letter == 'h':  # ch
                phonemes.append('HH')  # use 'K'??
            else:
                phonemes.append('K')
        elif letter == 'd':
            if is_last or next_letter in ('s', 't'):
                phonemes.append('T')
            else:
                phonemes.append('D')
        elif letter == 'e':
            if previous == 'i':
                pass  # covered under 'i'
            elif pos + 2 == length and next_letter in ('l', 'n', 'r'):
                phonemes.append('EH0')  # -en, -er, -el
            elif next_letter == 'i':  # ei
                phonemes.append('AY0')
            elif next_letter == 'u':  # eu
                phonemes.append('OY0')
            elif next_letter == 'e':  # ee
                phonemes.append('EY0')
            elif previous == 'e':
                pass  # second half of "ee"
            elif short_vowel:
                phonemes.append('EH0')
            elif is_last and previous not in vowels:
                phonemes.append('EH0')
            else:
                phonemes.append('EY0')
        elif letter == 'g':
            if previous == 'n':  # ng
                phonemes.append('NG')
            elif is_last and previous == 'i':  # -ig
                phonemes.append('HH')
            elif is_last or next_letter in ('s', 't'):
                phonemes.append('K')
            else:
                phonemes.append('G')
        elif letter == 'h':
            if previous in vowels:
                pass  # silent
            elif previous == 'c':
                pass  # covered under 'c'
            else:
                phonemes.append('HH')
        elif letter == 'i':
            if previous in ('a', 'e'):
                pass  # covered under other vowel
            elif next_letter == 'e':  # ie
                phonemes.append('IY0')
            elif short_vowel:
                phonemes.append('IH0')
            elif is_last and previous not in vowels:
                phonemes.append('IY0')  # also use IH0 here instead?
            else:
                phonemes.append('IH0')
        elif letter == 'n':
            if next_letter == 'g':
                pass  # covered under 'g'
            else:
                phonemes.append('N')
        elif letter == 'o':
            if previous == 'o':
                pass  # second half of "oo"
            else:
                # sometimes o as in "on", not covered in CMU/USA
                phonemes.append('AO0')
        elif letter == o_umlaut:
            phonemes.append('ER0')
        elif letter == 's':
            if pos == 0 and next_letter in ('p', 't'):
                phonemes.append('SH')
            elif (pos + 2 < length and next_letter == 'c'
                  and word[pos + 2] == 'h'):
                pass  # covered under 'c'
            elif pos == 0:
                phonemes.append('Z')  # at beginning of word
            else:
                phonemes.append('S')  # default sound - or 'Z' ??
        elif letter == 'u':
            if previous in ('a', a_umlaut, 'e'):
                pass  # covered under the preceding vowel
            elif previous == 'q':
                phonemes.append('V')
            elif short_vowel:
                phonemes.append('UH0')
            else:
                phonemes.append('UW0')
        elif letter == u_umlaut:
            phonemes.append('UW0')
        elif letter == 'x':
            phonemes.append('K')
            phonemes.append('S')
        elif letter == 'z':
            phonemes.append('T')
            phonemes.append('S')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1:
            # Unknown letter: retry once with whatever ``hammer`` maps it to.
            if not recursive:
                phon = breakdownWord(hammer(letter[0]), True)
                if phon:
                    phonemes.append(phon[0])
        previous = letter
    # Collapse runs of the same phoneme into a single occurrence.
    collapsed = []
    for phoneme in phonemes:
        if not collapsed or phoneme != collapsed[-1]:
            collapsed.append(phoneme)
    return collapsed
def breakdownWord(word, recursive=False):
    """Translate a word into ARPAbet-style phoneme codes, one letter at a time.

    The rules handle acute and double-acute vowels and digraphs such as
    "cs", "sz", "zs", "gy" and "ly" (presumably Hungarian orthography -- the
    source does not name the language, confirm against the caller).

    Args:
        word: Text to convert; a lower-cased copy is processed.
        recursive: True for the nested call made by the fallback branch, so
            an unknown letter is retried at most once.

    Returns:
        A list of phoneme strings in which immediately repeated phonemes
        have been collapsed into one.

    Letters no rule covers go through the external ``hammer`` helper
    (defined elsewhere); when it returns a single character, the first
    phoneme of that character's conversion is used.
    """
    text = word.lower()
    a_acute = u'\N{LATIN SMALL LETTER A WITH ACUTE}'
    # Vowels after which a following i / j / y has already been absorbed.
    absorbing_vowels = ('a', 'o', a_acute)
    # Context-free letters.
    simple_convert = {
        'b': 'B',
        u'\N{LATIN SMALL LETTER E WITH ACUTE}': 'EY0',
        'f': 'F',
        'h': 'HH',
        u'\N{LATIN SMALL LETTER I WITH ACUTE}': 'IY0',
        'k': 'K',
        'm': 'M',
        'n': 'N',
        u'\N{LATIN SMALL LETTER O WITH ACUTE}': 'OW0',  # ER0 ? AO0 ?
        'p': 'P',
        'r': 'R',
        't': 'T',
        'u': 'UW0',
        u'\N{LATIN SMALL LETTER U WITH ACUTE}': 'UW0',
        u'\N{LATIN SMALL LETTER O WITH DIAERESIS}': 'ER0',
        u'\N{LATIN SMALL LETTER O WITH DOUBLE ACUTE}': 'ER0',
        u'\N{LATIN SMALL LETTER U WITH DIAERESIS}': 'UW0',  # IH0?
        u'\N{LATIN SMALL LETTER U WITH DOUBLE ACUTE}': 'UW0',
        'v': 'V',
        'w': 'V',
    }
    out = []
    before = ' '
    for idx, ch in enumerate(text):
        # One-character slices are '' past the end of the word, which makes
        # every look-ahead comparison below False there.
        ahead = text[idx + 1:idx + 2]
        ahead2 = text[idx + 2:idx + 3]
        # True when the vowel is followed by i, j, y or by the digraph "ly".
        glide_follows = ahead in ('i', 'j', 'y') or (ahead == 'l'
                                                     and ahead2 == 'y')
        if ch == 'a':
            out.append('OY0' if glide_follows else 'AO0')
        elif ch == a_acute:
            out.append('AY0' if glide_follows else 'AA0')
        elif ch == 'c':
            if ahead == 's':  # cs
                out.append('CH')
            else:
                out.extend(['T', 'S'])
        elif ch == 'd':
            # NOTE(review): the original comment says "handle under 'z'" but
            # the test is for a following 's'; behaviour kept as found.
            if ahead != 's':
                out.append('D')
        elif ch == 'e':
            if before == 'e':
                pass  # second half of "ee"
            elif ahead == 'e' or (ahead == 'l' and ahead2 == 'y'):
                out.append('EY0')
            else:
                out.append('EH0')
        elif ch == 'g':
            out.append('JH' if ahead == 'y' else 'G')  # gy
        elif ch == 'i':
            if before not in absorbing_vowels:
                out.append('IH0')  # IY0?
        elif ch == 'j':
            if before not in absorbing_vowels:
                out.append('Y')
        elif ch == 'l':
            # "ly" is left to the 'y' branch - close enough to just IY.
            if ahead != 'y':
                out.append('L')
        elif ch == 'o':
            out.append('OY0' if glide_follows else 'AO0')
        elif ch == 'q':  # loan words
            out.extend(['K', 'W'])
        elif ch == 's':
            if before == 'c':
                pass  # already emitted by "cs"
            elif ahead == 's' and ahead2 == 'z':  # first s of "ssz"
                pass
            elif ahead == 'z' and before == 's':  # second s of "ssz"
                out.extend(['S', 'S'])
            elif ahead == 'z':  # sz
                out.append('S')
            else:
                out.append('SH')
        elif ch == 'x':  # loan words only
            out.extend(['K', 'S'])
        elif ch == 'y':
            if before in absorbing_vowels:
                pass
            elif before == 'g':
                pass  # handled under g
            elif before == 't':
                out.append('Y')
            elif before == 'n':
                pass  # close enough to plain n
            else:
                out.append('IY0')
        elif ch == 'z':
            if ahead == 's' and before == 'd':  # dzs
                out.append('JH')
            elif before == 'z' and ahead == 's':  # second z of "zzs"
                out.extend(['ZH', 'ZH'])
            elif ahead == 's':  # zs
                out.append('ZH')
            elif ahead == 'z' and ahead2 == 's':  # first z of "zzs"
                pass
            elif before == 'd':  # dz
                out.extend(['D', 'S'])
            elif before == 's':
                pass  # handled under s
            elif before == 'c':
                pass  # handled under c
            else:
                out.append('Z')
        elif ch in simple_convert:
            out.append(simple_convert[ch])
        elif ch == " ":
            pass
        elif len(hammer(ch)) == 1:
            # Unknown letter: retry once with whatever ``hammer`` maps it to.
            if not recursive:
                joined = " ".join(breakdownWord(hammer(ch), True))
                if joined:
                    out.append(joined.split()[0])
        before = ch
    # Drop every phoneme that equals the one directly before it.
    return [
        code for i, code in enumerate(out)
        if code != (out[i - 1] if i else " ")
    ]
def breakdownWord(word, recursive=False):
    """Break a word into a list of ARPAbet-style phoneme codes, letter by letter.

    Accepts both Latin transliteration (with digraphs such as "sh", "ch",
    "zh") and Cyrillic letters (presumably Russian -- the source does not
    name the language, confirm against the caller).

    Args:
        word: Text to convert; it is lower-cased before processing.
        recursive: True when called from the fallback branch below, which
            prevents a second level of fallback for an unknown letter.

    Returns:
        A flat list of phoneme strings with immediately repeated phonemes
        collapsed into one.

    Letters not covered by any rule are passed to the external ``hammer``
    helper (defined elsewhere); if it yields a single character, that
    character is converted instead and its first phoneme is used.
    """
    word = word.lower()
    phonemes = []
    # Letters whose phoneme does not depend on context.  The capital-letter
    # entries cannot match because ``word`` is lower-cased above; they are
    # kept so the table stays complete.
    simple_convert = {
        'a': 'AA0',
        'b': 'B',
        'd': 'D',
        'f': 'F',
        'h': 'HH',
        'i': 'IY0',
        'j': 'IY0',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'n': 'N',
        'o': 'AO0',
        'p': 'P',
        'r': 'R',
        't': 'T',
        'u': 'UW0',
        'v': 'V',
        'x': 'HH',  # use 'Y' ?? 'K'??
        u'\N{LATIN SMALL LETTER S WITH CARON}': 'SH',
        u'\N{LATIN SMALL LETTER Z WITH CARON}': 'ZH',
        u'\N{LATIN CAPITAL LETTER S WITH CARON}': 'SH',
        u'\N{LATIN CAPITAL LETTER Z WITH CARON}': 'ZH',
        # Cyrillic
        u'\N{CYRILLIC SMALL LETTER A}': 'AA0',
        u'\N{CYRILLIC SMALL LETTER BE}': 'B',
        u'\N{CYRILLIC SMALL LETTER VE}': 'V',
        u'\N{CYRILLIC SMALL LETTER CHE}': 'CH',
        u'\N{CYRILLIC SMALL LETTER DE}': 'D',
        u'\N{CYRILLIC SMALL LETTER E}': 'EH0',
        u'\N{CYRILLIC SMALL LETTER EF}': 'F',
        u'\N{CYRILLIC SMALL LETTER GHE}': 'G',
        u'\N{CYRILLIC SMALL LETTER I}': 'IY0',
        u'\N{CYRILLIC SMALL LETTER HA}': 'HH',  # 'Y'?? 'K'??
        u'\N{CYRILLIC SMALL LETTER SHORT I}': 'IY0',
        u'\N{CYRILLIC SMALL LETTER KA}': 'K',
        u'\N{CYRILLIC SMALL LETTER EL}': 'L',
        u'\N{CYRILLIC SMALL LETTER EM}': 'M',
        u'\N{CYRILLIC SMALL LETTER EN}': 'N',
        u'\N{CYRILLIC SMALL LETTER O}': 'AO0',
        u'\N{CYRILLIC SMALL LETTER PE}': 'P',
        u'\N{CYRILLIC SMALL LETTER ER}': 'R',
        u'\N{CYRILLIC SMALL LETTER ES}': 'S',
        u'\N{CYRILLIC SMALL LETTER SHA}': 'SH',
        u'\N{CYRILLIC SMALL LETTER TE}': 'T',
        u'\N{CYRILLIC SMALL LETTER U}': 'UW0',
        u'\N{CYRILLIC SMALL LETTER HARD SIGN}': 'Y',
        u'\N{CYRILLIC SMALL LETTER SOFT SIGN}': 'Y',
        u'\N{CYRILLIC SMALL LETTER YERU}': 'IH0',
        u'\N{CYRILLIC SMALL LETTER ZHE}': 'ZH',
        u'\N{CYRILLIC SMALL LETTER ZE}': 'Z',

        u'\N{CYRILLIC CAPITAL LETTER A}': 'AA0',
        u'\N{CYRILLIC CAPITAL LETTER BE}': 'B',
        u'\N{CYRILLIC CAPITAL LETTER VE}': 'V',
        u'\N{CYRILLIC CAPITAL LETTER CHE}': 'CH',
        u'\N{CYRILLIC CAPITAL LETTER DE}': 'D',
        u'\N{CYRILLIC CAPITAL LETTER E}': 'EH0',
        u'\N{CYRILLIC CAPITAL LETTER EF}': 'F',
        u'\N{CYRILLIC CAPITAL LETTER GHE}': 'G',
        u'\N{CYRILLIC CAPITAL LETTER I}': 'IY0',
        u'\N{CYRILLIC CAPITAL LETTER HA}': 'HH',  # 'Y'?? 'K'??
        u'\N{CYRILLIC CAPITAL LETTER SHORT I}': 'IY0',
        u'\N{CYRILLIC CAPITAL LETTER KA}': 'K',
        u'\N{CYRILLIC CAPITAL LETTER EL}': 'L',
        u'\N{CYRILLIC CAPITAL LETTER EM}': 'M',
        u'\N{CYRILLIC CAPITAL LETTER EN}': 'N',
        u'\N{CYRILLIC CAPITAL LETTER O}': 'AO0',
        u'\N{CYRILLIC CAPITAL LETTER PE}': 'P',
        u'\N{CYRILLIC CAPITAL LETTER ER}': 'R',
        u'\N{CYRILLIC CAPITAL LETTER ES}': 'S',
        u'\N{CYRILLIC CAPITAL LETTER SHA}': 'SH',
        u'\N{CYRILLIC CAPITAL LETTER TE}': 'T',
        u'\N{CYRILLIC CAPITAL LETTER U}': 'UW0',
        u'\N{CYRILLIC CAPITAL LETTER HARD SIGN}': 'Y',
        u'\N{CYRILLIC CAPITAL LETTER SOFT SIGN}': 'Y',
        u'\N{CYRILLIC CAPITAL LETTER YERU}': 'IH0',
        u'\N{CYRILLIC CAPITAL LETTER ZHE}': 'ZH',
        u'\N{CYRILLIC CAPITAL LETTER ZE}': 'Z',
    }
    length = len(word)
    previous = ' '
    for pos, letter in enumerate(word):
        # Empty string when there is no following letter, so comparisons
        # against a real letter are simply False at the end of the word.
        next_letter = word[pos + 1] if pos + 1 < length else ''
        is_last = pos + 1 == length
        if letter == 'c':
            if previous == 's' and next_letter == 'h':  # shch
                phonemes.append('SH')  # as in
                phonemes.append('CH')  # freSH CHeese
            elif next_letter == 'h':  # ch
                phonemes.append('CH')
            else:
                phonemes.append('T')
                phonemes.append('S')
        # Word-final b, d and v are devoiced; elsewhere they fall through to
        # the simple_convert table.
        elif letter == 'b' and is_last:
            phonemes.append('P')
        elif letter == 'd' and is_last:
            phonemes.append('T')
        elif letter in ('e', u'\N{CYRILLIC SMALL LETTER IE}'):
            # Word-initial e gets a leading Y glide.
            if pos == 0:
                phonemes.append('Y')
            phonemes.append('EH0')
        elif letter == '^':
            pass
        elif letter == 'g':
            if is_last:
                phonemes.append('K')
            elif (previous in ('e', 'o') and pos + 2 == length
                  and next_letter == 'o'):
                phonemes.append('V')  # possessive endings -ogo and -ego
            else:
                phonemes.append('G')
        elif letter == 'h':
            pass  # only meaningful as the second half of a digraph
        elif letter == 's':
            phonemes.append('SH' if next_letter == 'h' else 'S')
        elif letter == 'v' and is_last:
            phonemes.append('F')
        elif letter == 'y':
            phonemes.append('Y' if next_letter == 'a' else 'IH0')
        elif letter == 'z':
            if next_letter == 'h':  # zh, devoiced when it ends the word
                phonemes.append('SH' if pos + 2 == length else 'ZH')
            else:
                phonemes.append('S' if is_last else 'Z')
        elif letter in (u'\N{CYRILLIC CAPITAL LETTER SHCHA}',
                        u'\N{CYRILLIC SMALL LETTER SHCHA}'):
            phonemes.append('SH')
        elif letter in (u'\N{CYRILLIC CAPITAL LETTER TSE}',
                        u'\N{CYRILLIC SMALL LETTER TSE}'):
            phonemes.append('T')
            phonemes.append('S')
        elif letter in (u'\N{CYRILLIC CAPITAL LETTER YA}',
                        u'\N{CYRILLIC SMALL LETTER YA}'):
            if pos == 0:
                phonemes.append('IY0')
            # NOTE(review): 'AA1' is the only stress-1 code in this table;
            # kept as found - confirm whether 'AA0' was intended.
            phonemes.append('AA1')
        elif letter in (u'\N{CYRILLIC CAPITAL LETTER YU}',
                        u'\N{CYRILLIC SMALL LETTER YU}'):
            if pos == 0:
                phonemes.append('Y')
            phonemes.append('UW0')
        elif letter in (u'\N{LATIN SMALL LETTER E WITH DIAERESIS}',
                        u'\N{CYRILLIC SMALL LETTER IO}'):
            if pos == 0:
                phonemes.append('Y')
            phonemes.append('AO0')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1:
            # Unknown letter: retry once with whatever ``hammer`` maps it to.
            if not recursive:
                phon = breakdownWord(hammer(letter), True)
                if phon:
                    # Take the first phoneme only; appending the whole list
                    # would nest a list inside the result.
                    phonemes.append(phon[0])
        previous = letter
    # Collapse runs of the same phoneme into a single occurrence.
    collapsed = []
    for phoneme in phonemes:
        if not collapsed or phoneme != collapsed[-1]:
            collapsed.append(phoneme)
    return collapsed
Example #5
0
def breakdownSwedishSyllable(word, recursive=False, phonetic=False):
    """Convert a Swedish word or syllable into a list of phoneme codes.

    The text is lower-cased and scanned one letter at a time.  Each letter
    yields zero or more codes (e.g. 'SH', 'IY0') chosen from the letter
    itself, the previous letter and up to two following letters.  Runs of
    identical consecutive codes are collapsed to one before returning.

    Args:
        word: Text to convert (unicode string).
        recursive: True when this call is the fallback for a letter that
            was already passed through ``hammer``; stops further fallback.
        phonetic: When true, 'e' always gives EH0 and a-with-diaeresis
            always gives AE0, regardless of what follows.

    Returns:
        List of phoneme code strings.  A letter resolved through the
        ``hammer`` fallback contributes a single space-joined entry.
    """
    word = word.lower()
    a_umlaut = u'\N{LATIN SMALL LETTER A WITH DIAERESIS}'
    o_umlaut = u'\N{LATIN SMALL LETTER O WITH DIAERESIS}'
    a_ring = u'\N{LATIN SMALL LETTER A WITH RING ABOVE}'
    soft_vowels = ('e', 'i', 'y', a_umlaut, o_umlaut)
    hard_vowels = ('a', 'o', 'u', a_ring)
    # Vowels after which 'gn' is rendered NG (a-ring is not in this set).
    vowels_before_gn = ('a', 'o', 'u', 'e', 'i', 'y', a_umlaut, o_umlaut)
    # Whole-word exceptions for 'k'.
    hard_k_words = (u'kefir', u'kex', u'kille', u'kis', u'kissa', u'kisse')
    # Written with escapes instead of unicode('...', input_encoding) so the
    # comparison does not depend on a module global or the file encoding.
    sh_k_words = (u'm\N{LATIN SMALL LETTER A WITH DIAERESIS}nniska',
                  u'm\N{LATIN SMALL LETTER A WITH DIAERESIS}nniskor')
    # Letters whose phoneme does not depend on context.
    simple_convert = {
        u'\N{LATIN SMALL LETTER E WITH ACUTE}': 'EY0',
        'a': 'AH0',  # not exact - AO0 ??
        'b': 'B',
        'f': 'F',
        'm': 'M',
        'o': 'UH0',  # compromise, actually UW0 or AA0 (not), sometimes AO0
        'q': 'K',
        'v': 'V',
        'w': 'V',
        'z': 'S',
        a_ring: 'AO0',  # not exact
        o_umlaut: 'ER0',
    }

    phonemes = []
    length = len(word)
    previous = ' '
    for pos, letter in enumerate(word):
        # Look-ahead letters; u'' past the end never equals a real letter
        # and is not a member of any of the tuples above.
        nxt = word[pos + 1] if pos + 1 < length else u''
        nxt2 = word[pos + 2] if pos + 2 < length else u''
        is_last = pos + 1 == length

        if letter == 'c':
            if nxt == 'c':
                pass  # cc, handled when the second c is reached
            elif previous == 'c' and nxt in soft_vowels:
                phonemes.append('K')
                phonemes.append('S')
            elif nxt in soft_vowels:
                phonemes.append('S')
            elif nxt == 'h':
                # sometimes 'K' as in English 'chorus', but no rule
                phonemes.append('SH')
            else:
                phonemes.append('K')
        elif letter == 'd':
            if pos == 0 and nxt == 'j':  # dj at beginning of word
                pass  # same as j alone
            else:
                phonemes.append('D')
        elif letter == 'e':
            if not phonetic and pos + 2 == length and nxt == 'r':  # ends in er
                phonemes.append('AE0')
            else:
                phonemes.append('EH0')  # sometimes 'IY0', sometimes 'EY0'
        elif letter == 'g':
            if previous in ('l', 'r'):
                phonemes.append('Y')
            elif nxt == 'i' and nxt2 == o_umlaut:
                phonemes.append('SH')
            elif nxt == 'n' and previous in vowels_before_gn:
                phonemes.append('NG')
            elif previous == 'n':  # ng
                phonemes.append('NG')
            elif nxt == 'j':  # gj
                pass  # same as 'j' alone
            elif pos + 2 == length and nxt == 'e':
                # ends in 'ge' - French loan-word such as garage ?
                phonemes.append('SH')
            elif pos == 0 and nxt in soft_vowels:
                # ??? if e is unstressed (how to tell?), pronounce as 'G'
                phonemes.append('Y')
            elif pos == 0 and nxt in hard_vowels:
                phonemes.append('G')
            elif previous == 'g':
                pass  # second g of a double consonant
            else:
                phonemes.append('G')
        elif letter == 'h':
            if previous == 'c':
                pass  # handled under c
            elif nxt == 'j':
                pass  # same as 'j' alone
            elif pos == 1 and previous == 's':  # probably a foreign loan-word
                phonemes.append('SH')
            else:
                phonemes.append('HH')
        elif letter == 'i':
            if previous == 'g' and nxt == o_umlaut:
                pass  # covered by the SH emitted for 'g'
            elif previous == 's' and nxt == 'o':  # sio e.g mission
                phonemes.append('UH0')
            else:
                phonemes.append('IY0')  # sometimes 'IH0'
        elif letter == 'k':
            if pos == 0 and word in hard_k_words:
                phonemes.append('K')
            elif pos == 0 and nxt in soft_vowels:
                phonemes.append('CH')
            elif word in sh_k_words:
                phonemes.append('SH')
            elif is_last and previous == 's':  # ends in SK
                phonemes.append('S')
                phonemes.append('K')
            elif nxt == 'j':
                phonemes.append('CH')  # more Finnish-Swedish than Swedish ???
            elif is_last and previous == 'c':
                pass  # final ck: the c already produced K
            elif previous == 's' and nxt in hard_vowels:
                phonemes.append('S')
                phonemes.append('K')
            elif previous == 's' and pos == 1:  # sk at beginning of word
                phonemes.append('SH')
            else:
                phonemes.append('K')
        elif letter == 't':
            if previous == 's' and nxt == 'j':  # stj
                phonemes.append('SH')
            elif previous == 't' and is_last:
                pass  # final double t
            elif nxt == 'j':  # tj
                pass  # handled under j
            else:
                phonemes.append('T')
        elif letter == 'j':
            if previous == 's':
                phonemes.append('SH')
            elif previous == 't':
                # Require pos >= 2: with pos == 1 the index pos - 2 is -1
                # and would wrap round to the last letter of the word.
                if pos >= 2 and word[pos - 2] == 's':
                    pass  # stj, handled under 't'
                else:
                    phonemes.append('CH')
            elif previous == 'k':
                pass  # handled under k
            else:
                phonemes.append('Y')
        elif letter == 'l':
            if nxt == 'j':
                pass  # same as 'j' alone
            else:
                phonemes.append('L')
        elif letter == 'n':
            if nxt == 'g':
                pass  # handled under 'g'
            elif nxt == 'k':
                phonemes.append('NG')
            else:
                phonemes.append('N')
        elif letter == 'p':
            if previous != 'p':  # second p of a double consonant is silent
                phonemes.append('P')
        elif letter == 'r':
            if nxt == 's':
                pass  # handled under s
            else:
                phonemes.append('R')
        elif letter == 's':
            if nxt == 'c' and nxt2 == 'h':
                pass  # handled under 'c'
            elif nxt == 't' and nxt2 == 'j':
                pass  # handled under 't'
            elif nxt == 'k':
                pass  # handled under 'k'
            elif nxt == 'j':
                pass  # handled under 'j'
            elif nxt == 's':
                pass  # first s of a double consonant
            elif nxt == 'i' and nxt2 == 'o':  # might need more breakdown
                phonemes.append('SH')
            elif pos == 0 and nxt == 'h':
                pass  # handled under 'h'
            elif previous == 'r':
                phonemes.append('SH')  # not entirely accurate, use HH ??
            else:
                phonemes.append('S')
        elif letter == 'u':
            if previous == 'q':
                phonemes.append('V')
            else:
                phonemes.append('UW0')  # inaccurate, no accurate CMU equivalent
        elif letter == 'x':
            phonemes.append('K')
            phonemes.append('S')
        elif letter == 'y':
            if word in (u'yoga', u'yoghurt'):
                phonemes.append('Y')
            elif word == u'fyrtio':
                phonemes.append('ER0')
            else:
                phonemes.append('UW0')  # not exact
        elif letter == a_umlaut:
            if phonetic or nxt == 'r':
                phonemes.append('AE0')  # not exact, and skips exceptions
            else:
                phonemes.append('EH0')  # not exact, and skips exceptions
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == " ":
            pass
        elif len(hammer(letter)) == 1:
            # Fallback for letters without a rule: retry once on whatever
            # single character the module's hammer() helper returns.
            if not recursive:
                phon = " ".join(
                    breakdownSwedishSyllable(hammer(letter), True, phonetic))
                if phon:
                    phonemes.append(phon)
        previous = letter

    # Collapse runs of identical consecutive phonemes.
    collapsed = []
    last_kept = " "
    for phoneme in phonemes:
        if phoneme != last_kept:
            collapsed.append(phoneme)
        last_kept = phoneme
    return collapsed
def breakdownWord(input_word, recursive=False):
    """Convert a word into a list of phoneme codes, letter by letter.

    The rules cover the digraphs/trigraphs ci, ce, cci, ch, gi and gli
    (the original comments are in Italian, so this presumably targets
    Italian text).  Every other letter is looked up in the module-level
    ``unconditional_conversions`` table; letters found in neither place
    are retried once through the module's ``hammer`` helper.

    Args:
        input_word: Text to convert (unicode string).
        recursive: True when called as the ``hammer`` fallback; prevents
            a second level of fallback.

    Returns:
        List of phoneme code strings.  Unlike a run-collapsing converter,
        repeated codes are kept as-is.
    """
    # Lower-casing every word reduces the number of combinations to handle.
    word = input_word.lower()
    accented_i = (u'\N{LATIN SMALL LETTER I WITH ACUTE}',
                  u'\N{LATIN SMALL LETTER I WITH GRAVE}')
    breakdown_word = []
    length = len(word)
    previous = u''
    for word_index, letter in enumerate(word):
        # Bounds-checked look-ahead.  The original indexed word[index + 1]
        # and word[index + 2] behind guards that were off by one, raising
        # IndexError for words ending in c, g, cc or gl.
        nxt = word[word_index + 1] if word_index + 1 < length else u''
        nxt2 = word[word_index + 2] if word_index + 2 < length else u''

        if letter == u'c':
            # ci, ce, cci
            if nxt in (u'i', u'e') or (nxt == u'c' and nxt2 == u'i'):
                breakdown_word.append('EH0')
            else:
                breakdown_word.append('K')
        elif letter == u'g':
            # gi, gli
            if nxt == u'i' or (nxt == u'l' and nxt2 == u'i'):
                breakdown_word.append('JH')
            else:
                breakdown_word.append('G')
        elif letter == u'i':
            # After c or g the i was already consumed by the ci/gi rule.
            if previous not in (u'c', u'g'):
                breakdown_word.append('EH0')
        elif letter in accented_i:
            # Same as plain i, but the emitted vowel carries stress marker 1.
            if previous not in (u'c', u'g'):
                breakdown_word.append('EH1')
        elif letter == u'h':
            # ch: the h is silent.
            if previous != u'c':
                breakdown_word.append('HH')
        elif letter == u'j':
            # The original also tested word_index < len(word), which is
            # always true inside the loop; only the position-0 test matters.
            if word_index > 0:
                breakdown_word.append('JH')
            else:
                breakdown_word.append('EH0')
        elif letter == u'l':
            # gli: the l was already consumed by the g rule.
            if not (previous == u'g' and nxt == u'i'):
                breakdown_word.append('L')
        elif letter in unconditional_conversions:
            breakdown_word.append(unconditional_conversions[letter])
        elif letter == " ":
            pass
        elif len(hammer(letter)) == 1:
            # Fallback: retry once on the single character hammer() returns
            # and keep only the first resulting phoneme.
            if not recursive:
                phon = " ".join(breakdownWord(hammer(letter), True))
                if phon:
                    breakdown_word.append(phon.split()[0])
        previous = letter
    return breakdown_word
Example #7
0
def breakdownWord(word, recursive=False):
    """Convert a word into a list of phoneme codes, letter by letter.

    The letter set (dotless i, c/s with cedilla, g with breve, o/u with
    diaeresis) suggests Turkish text -- NOTE(review): confirm.  The word is
    lower-cased, each letter is mapped using its immediate neighbours, and
    runs of identical consecutive codes are collapsed before returning.

    Args:
        word: Text to convert (unicode string).
        recursive: True when called as the ``hammer`` fallback; prevents
            a second level of fallback.

    Returns:
        List of phoneme code strings.
    """
    word = word.lower()
    # Set membership replaces dict.has_key, which no longer exists in
    # Python 3; the vowel test itself is unchanged.
    vowels = frozenset(
        u'aeiou\N{LATIN SMALL LETTER DOTLESS I}'
        u'\N{LATIN SMALL LETTER O WITH DIAERESIS}\N{LATIN SMALL LETTER U WITH DIAERESIS}'
        u'\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}\N{LATIN SMALL LETTER U WITH CIRCUMFLEX}')
    a_circumflex = u'\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}'
    u_circumflex = u'\N{LATIN SMALL LETTER U WITH CIRCUMFLEX}'
    soft_g = u'\N{LATIN SMALL LETTER G WITH BREVE}'
    # Consonants after which a circumflexed vowel gets a leading IY0.
    palatalising = ('g', 'k', 'l')
    # Vowels that already absorbed a following 'y' into their own phoneme.
    y_absorbing = ('a', 'e', 'o', 'u',
                   u'\N{LATIN SMALL LETTER O WITH DIAERESIS}')
    # Letters whose phoneme does not depend on context.  'n' is listed but
    # never looked up here because it has its own rule below.
    simple_convert = {
        'b': 'B',
        'c': 'JH',
        u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'CH',
        'd': 'D',
        'f': 'F',
        'g': 'G',
        'h': 'HH',
        u'\N{LATIN SMALL LETTER DOTLESS I}': 'AH0',
        'i': 'IY0',
        'j': 'ZH',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'n': 'N',
        u'\N{LATIN SMALL LETTER O WITH DIAERESIS}': 'ER0',
        'p': 'P',
        'r': 'R',
        's': 'S',
        u'\N{LATIN SMALL LETTER S WITH CEDILLA}': 'SH',
        't': 'T',
        u'\N{LATIN SMALL LETTER U WITH DIAERESIS}': 'UW0',  # IH0?
        'w': 'V',  # loan-words
        'z': 'Z',
    }

    phonemes = []
    length = len(word)
    previous = ' '
    for pos, letter in enumerate(word):
        # u'' past the end of the word never equals a real letter.
        nxt = word[pos + 1] if pos + 1 < length else u''

        if letter == 'a':
            phonemes.append('AY0' if nxt == 'y' else 'AA0')
        elif letter == a_circumflex:
            if previous in palatalising:
                phonemes.append('IY0')
            phonemes.append('AA0')
        elif letter == 'e':
            phonemes.append('EY0' if nxt == 'y' else 'EH0')
        elif letter == soft_g:
            # Produces no phoneme.  Earlier experiments (emitting Y before
            # front vowels, and special rules for g and l) were disabled
            # by the original author and are not reproduced here.
            pass
        elif letter == 'n':
            phonemes.append('M' if nxt == 'b' else 'N')
        elif letter == 'o':
            phonemes.append('OY0' if nxt == 'y' else 'OW0')
        elif letter == 'q':  # loan-words
            phonemes.append('K')
        elif letter == u_circumflex:
            if previous in palatalising:
                phonemes.append('IY0')
            phonemes.append('UW0')
        elif letter == 'u':
            phonemes.append('IY0' if nxt == 'y' else 'UH0')
        elif letter == 'v':
            phonemes.append('W' if previous in vowels else 'V')
        elif letter == 'x':  # loan-words
            phonemes.append('K')
            phonemes.append('S')
        elif letter == 'y':
            if previous not in y_absorbing:
                phonemes.append('Y')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == " ":
            pass
        elif len(hammer(letter)) == 1:
            # Fallback: retry once on the single character hammer() returns
            # and keep only the first resulting phoneme.
            if not recursive:
                phon = " ".join(breakdownWord(hammer(letter), True))
                if phon:
                    phonemes.append(phon.split()[0])
        previous = letter

    # Collapse runs of identical consecutive phonemes.
    collapsed = []
    last_kept = " "
    for phoneme in phonemes:
        if phoneme != last_kept:
            collapsed.append(phoneme)
        last_kept = phoneme
    return collapsed
def breakdownWord(word, recursive=False):
    """Convert a word into a list of phoneme codes, letter by letter.

    The rules and example words in the comments (jord, kirke, norsk) are
    Norwegian.  The word is lower-cased, each letter is mapped using the
    previous letter and up to two following letters, and runs of identical
    consecutive codes are collapsed before returning.

    Args:
        word: Text to convert (unicode string).
        recursive: True when called as the ``hammer`` fallback; prevents
            a second level of fallback.

    Returns:
        List of phoneme code strings.  A letter resolved through the
        ``hammer`` fallback contributes a single space-joined entry.
    """
    word = word.lower()
    ae = u'\N{LATIN SMALL LETTER AE}'
    o_stroke = u'\N{LATIN SMALL LETTER O WITH STROKE}'
    a_ring = u'\N{LATIN SMALL LETTER A WITH RING ABOVE}'
    # Set membership replaces dict.has_key, which no longer exists in
    # Python 3; the vowel test itself is unchanged.
    vowels = frozenset(u'aeiouy' + a_ring + ae + o_stroke)
    # Letters whose phoneme does not depend on context.
    simple_convert = {
        'b': 'B',
        'c': 'S',
        'f': 'F',
        'm': 'M',
        'p': 'P',
        'r': 'R',
        't': 'T',
        'v': 'V',
        'w': 'V',
        'z': 'S',
    }
    short_vowels = {
        u'a': 'AA0',
        u'e': 'EH0',
        u'i': 'IH0',
        u'o': 'UH0',
        u'u': 'UH0',
        u'y': 'IH0',
        ae: 'AE0',
        o_stroke: 'AH0',
        a_ring: 'AA0',
    }
    long_vowels = {
        u'a': 'AA0',
        u'e': 'EY0',
        u'i': 'IY0',
        u'o': 'OW0',
        u'u': 'UW0',
        u'y': 'IY0',
        ae: 'AE0',
        o_stroke: 'ER0',
        a_ring: 'AO0',
    }

    phonemes = []
    length = len(word)
    previous = ' '
    for pos, letter in enumerate(word):
        # u'' past the end of the word never equals a real letter.
        nxt = word[pos + 1] if pos + 1 < length else u''
        nxt2 = word[pos + 2] if pos + 2 < length else u''
        is_last = pos + 1 == length

        if letter in vowels:
            # NOTE(review): the original opened with a test for a final
            # "rd" that compared word[pos+1] to both 'r' and 'd' and so
            # could never match, and it had a second, unreachable o+i
            # branch emitting UW0 IY0.  Both are omitted; every reachable
            # result is unchanged.  Confirm what those rules were meant
            # to do before reinstating them.
            if letter == 'a' and nxt == 'i':
                phonemes.append('AY0')
            elif letter == 'a' and nxt == 'u':
                phonemes.append('AW0')
            elif letter == 'e' and nxt == 'i':
                phonemes.append('AY0')
            elif letter == 'e' and nxt == 'r':
                phonemes.append('AE0')
            elif letter == 'o' and nxt == 'i':
                phonemes.append('OY0')
            elif letter == o_stroke and nxt == 'y':
                phonemes.append('OW0')
                phonemes.append('IY0')
            elif pos + 2 == length and nxt == 'm':
                # Vowel followed by a final m is short.
                phonemes.append(short_vowels[letter])
            elif pos + 2 < length and nxt == nxt2 and nxt not in vowels:
                # Vowel followed by a doubled consonant is short.
                phonemes.append(short_vowels[letter])
            else:
                # Everything else is long, including a final "rd" and a
                # vowel followed by two different consonants.
                phonemes.append(long_vowels[letter])
        elif letter == 'd':
            if is_last and previous == 'r':  # ends in d, e.g. jord
                pass
            elif is_last and previous in vowels:  # long vowel then d, e.g. god
                pass
            elif previous in ('l', 'n'):  # holde, land
                pass
            else:
                phonemes.append('D')
        elif letter == 'g':
            if nxt == 'j':  # gjær
                pass  # handled as a normal j
            elif is_last and previous == 'i':  # ærlig
                pass  # silent at end of word
            elif previous == 'n':
                pass  # handled under n
            elif nxt in ('i', 'y'):
                phonemes.append('Y')
            elif nxt == 'e' and nxt2 == 'i':
                phonemes.append('Y')
            else:
                phonemes.append('G')
        elif letter == 'h':
            if nxt == 'j':  # hjem
                pass  # handled as a normal j
            # elif, not if: with a separate "if" the hj case fell through
            # to the else below and emitted HH for the silent h.
            elif nxt == 'v':  # hver
                pass  # handled as a normal v
            else:
                phonemes.append('HH')
        elif letter == 'j':
            if previous == 'k':
                pass  # handled under k
            elif previous == 's':
                pass  # handled under s
            else:
                phonemes.append('Y')
        elif letter == 'k':
            if previous == 's' and nxt in (u'j', u'i', u'y', o_stroke):
                phonemes.append('SH')  # skjære, ski, skøyter
            elif previous == 's':
                phonemes.append('S')
                phonemes.append('K')
            elif nxt in (u'i', u'y'):  # kirke, kyss
                phonemes.append('HH')
            elif nxt == 'j':  # kjønn
                phonemes.append('HH')
            else:
                phonemes.append('K')  # kaffe
        elif letter == 'l':
            if nxt == 'j':  # ljug
                pass  # handled as a normal j
            else:
                phonemes.append('L')
        elif letter == 'n':
            if nxt == 'g':  # fang
                phonemes.append('NG')
            else:
                phonemes.append('N')  # ni
        elif letter == 'q':  # foreign language loan-words?
            phonemes.append('K')
            phonemes.append('UW0')
        elif letter == 's':
            if previous == 'r':
                phonemes.append('SH')  # Eastern Norway - norsk, person, for sent
            elif nxt == 'k':
                pass  # handled under k
            elif nxt == 'j':  # sjø
                phonemes.append('SH')
            elif nxt == 'l':
                phonemes.append('SH')  # informal usage
            else:
                phonemes.append('S')  # syv
        elif letter == 'x':
            phonemes.append('K')
            phonemes.append('S')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == " ":
            pass
        elif len(hammer(letter)) == 1:
            # Fallback: retry once on the single character hammer() returns.
            if not recursive:
                phon = " ".join(breakdownWord(hammer(letter), True))
                if phon:
                    phonemes.append(phon)
        previous = letter

    # Collapse runs of identical consecutive phonemes.
    collapsed = []
    last_kept = " "
    for phoneme in phonemes:
        if phoneme != last_kept:
            collapsed.append(phoneme)
        last_kept = phoneme
    return collapsed
def breakdownWord(word, recursive=False):
    """Convert a word into a list of phoneme codes, letter by letter.

    Handles Latin transliteration (ch, sh, zh digraphs) and Cyrillic
    letters, including the Ukrainian-specific ones.  The word is
    lower-cased first; runs of identical consecutive codes are collapsed
    before returning.

    Args:
        word: Text to convert (unicode string).
        recursive: True when called as the ``hammer`` fallback; prevents
            a second level of fallback.

    Returns:
        List of phoneme code strings.
    """
    word = word.lower()

    # Vowels in lower case; the capital forms are derived with upper().
    lower_vowels = (
        'a', 'e', 'i', 'o', 'u',
        u'\N{CYRILLIC SMALL LETTER A}',  # looks like normal a
        u'\N{CYRILLIC SMALL LETTER UKRAINIAN IE}',  # like a small Euro sign
        u'\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}',  # looks like i
        u'\N{CYRILLIC SMALL LETTER I}',  # small backwards capital N
        u'\N{CYRILLIC SMALL LETTER SHORT I}',  # backwards N with breve
        u'\N{CYRILLIC SMALL LETTER O}',  # looks like normal o
        u'\N{CYRILLIC SMALL LETTER U}',  # looks like normal y
    )
    vowels = lower_vowels + tuple(v.upper() for v in lower_vowels)

    # Context-free Latin letters (lower case only).
    simple_convert = {
        'a': 'AA0', 'b': 'B', 'v': 'V', 'g': 'G', 'd': 'D', 'e': 'EH0',
        'j': 'Y', 'y': 'IH0', 'k': 'K', 'l': 'L', 'm': 'M', 'n': 'N',
        'o': 'AO0', 'p': 'P', 'r': 'R', 't': 'T', 'f': 'F',
        'x': 'HH',  # use 'Y' ?? 'K'??
    }
    # Context-free letters present in the table in both cases; the capital
    # entries are added below via upper().
    both_cases = {
        u'\N{LATIN SMALL LETTER S WITH CARON}': 'SH',
        u'\N{LATIN SMALL LETTER Z WITH CARON}': 'ZH',
        u'\N{CYRILLIC SMALL LETTER A}': 'AA0',
        u'\N{CYRILLIC SMALL LETTER BE}': 'B',
        u'\N{CYRILLIC SMALL LETTER VE}': 'V',
        u'\N{CYRILLIC SMALL LETTER GHE}': 'G',
        u'\N{CYRILLIC SMALL LETTER GHE WITH UPTURN}': 'G',
        u'\N{CYRILLIC SMALL LETTER DE}': 'D',
        u'\N{CYRILLIC SMALL LETTER IE}': 'EH0',
        u'\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}': 'IH0',
        u'\N{CYRILLIC SMALL LETTER ZHE}': 'ZH',
        u'\N{CYRILLIC SMALL LETTER ZE}': 'Z',
        u'\N{CYRILLIC SMALL LETTER SHORT I}': 'IY0',  # 'Y' ?
        u'\N{CYRILLIC SMALL LETTER I}': 'IY0',
        u'\N{CYRILLIC SMALL LETTER KA}': 'K',
        u'\N{CYRILLIC SMALL LETTER EL}': 'L',
        u'\N{CYRILLIC SMALL LETTER EM}': 'M',
        u'\N{CYRILLIC SMALL LETTER EN}': 'N',
        u'\N{CYRILLIC SMALL LETTER O}': 'AO0',
        u'\N{CYRILLIC SMALL LETTER PE}': 'P',
        u'\N{CYRILLIC SMALL LETTER ER}': 'R',
        u'\N{CYRILLIC SMALL LETTER ES}': 'S',
        u'\N{CYRILLIC SMALL LETTER TE}': 'T',
        u'\N{CYRILLIC SMALL LETTER U}': 'UH0',
        u'\N{CYRILLIC SMALL LETTER EF}': 'F',
        u'\N{CYRILLIC SMALL LETTER HA}': 'HH',  # 'Y'?? 'K'??
        u'\N{CYRILLIC SMALL LETTER CHE}': 'CH',
        u'\N{CYRILLIC SMALL LETTER SHA}': 'SH',
        u'\N{CYRILLIC SMALL LETTER SHCHA}': 'SH',
    }
    for small, code in both_cases.items():
        simple_convert[small] = code
        simple_convert[small.upper()] = code

    # Latin letters that start an "?h" digraph: (code with h, code alone).
    # A lone 'c' produces nothing.
    h_digraphs = {'c': ('CH', None), 's': ('SH', 'S'), 'z': ('ZH', 'Z')}
    # Latin vowels that change after 'j': (code after j, code otherwise).
    after_j = {'i': ('IY0', 'IH0'), 'u': ('UW0', 'UH0')}
    # Letters that always expand to the same two phonemes.
    fixed_pairs = {
        u'\N{CYRILLIC SMALL LETTER SHCHA}': ('SH', 'CH'),
        u'\N{CYRILLIC SMALL LETTER TSE}': ('T', 'S'),
        u'\N{CYRILLIC SMALL LETTER YA}': ('Y', 'AO0'),  # not if unstressed?
        u'\N{CYRILLIC SMALL LETTER YU}': ('Y', 'UW0'),
        u'\N{CYRILLIC SMALL LETTER YI}': ('Y', 'IY0'),
        u'\N{CYRILLIC SMALL LETTER UKRAINIAN IE}': ('Y', 'EH0'),
        u'\N{CYRILLIC CAPITAL LETTER SHCHA}': ('SH', 'CH'),
        u'\N{CYRILLIC CAPITAL LETTER TSE}': ('T', 'S'),
        u'\N{CYRILLIC CAPITAL LETTER YI}': ('Y', 'IY0'),
        u'\N{CYRILLIC CAPITAL LETTER UKRAINIAN IE}': ('Y', 'EH0'),
    }
    # Capital YU / YA: (pair after a vowel, apostrophe or at word start,
    # single code otherwise).
    iotated_caps = {
        u'\N{CYRILLIC CAPITAL LETTER YU}': (('Y', 'UW0'), 'UH0'),
        u'\N{CYRILLIC CAPITAL LETTER YA}': (('Y', 'AO0'), 'AA0'),
    }

    phonemes = []
    previous = ' '
    for pos, letter in enumerate(word):
        if letter in h_digraphs:
            with_h, alone = h_digraphs[letter]
            if pos + 1 < len(word) and word[pos + 1] == 'h':
                phonemes.append(with_h)
            elif alone is not None:
                phonemes.append(alone)
        elif letter in after_j:
            following_j, otherwise = after_j[letter]
            phonemes.append(following_j if previous == 'j' else otherwise)
        elif letter == 'h':
            # After z, s or c the h belongs to a digraph already emitted.
            if previous not in ('z', 's', 'c'):
                phonemes.append('HH')
        elif letter in fixed_pairs:
            phonemes.extend(fixed_pairs[letter])
        elif letter in iotated_caps:
            iotated, plain = iotated_caps[letter]
            if previous in vowels or previous == "'" or pos == 0:
                phonemes.extend(iotated)
            else:
                phonemes.append(plain)
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1:
            # Fallback: retry once on the single character hammer() returns
            # and keep only the first resulting phoneme.
            if not recursive:
                phon = breakdownWord(hammer(letter), True)
                if phon:
                    phonemes.append(phon[0])
        previous = letter

    # Collapse runs of identical consecutive phonemes.
    collapsed = []
    last_kept = " "
    for phoneme in phonemes:
        if phoneme != last_kept:
            collapsed.append(phoneme)
        last_kept = phoneme
    return collapsed
Example #10
0
def breakdownWord(word, recursive=False):
    """Break a word into a list of phoneme codes, one letter at a time.

    The word is lower-cased first, then each character is translated using
    its neighbours for context (Latin digraphs 'ch', 'sh', 'zh' and the
    'j' + vowel combinations) or a fixed lookup table (plain Latin letters
    and Cyrillic letters, including the Ukrainian-specific ones).

    Args:
        word: Text to convert.  Spaces are ignored.
        recursive: True when called from the fallback path below; prevents
            the fallback from recursing a second time.

    Returns:
        A list of phoneme code strings (e.g. ``['SH', 'CH', 'AO0']``) in which
        runs of identical consecutive codes have been collapsed to one.

    Characters that match no rule are passed to ``hammer`` (a helper defined
    elsewhere in this file; presumably it strips accents to a plain
    character - TODO confirm).  If ``hammer`` returns a single character, the
    first phoneme of that character's own breakdown is used; otherwise the
    character contributes nothing.
    """
    word = word.lower()

    # One letter -> one phoneme.  Only lower-case keys are needed because the
    # word has just been lower-cased.
    simple_convert = {
        'a': 'AA0',
        'b': 'B',
        'v': 'V',
        'g': 'G',
        'd': 'D',
        'e': 'EH0',
        'j': 'Y',
        'y': 'IH0',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'n': 'N',
        'o': 'AO0',
        'p': 'P',
        'r': 'R',
        't': 'T',
        'f': 'F',
        'x': 'HH',  # use 'Y' ?? 'K'??
        u'\N{LATIN SMALL LETTER S WITH CARON}': 'SH',
        u'\N{LATIN SMALL LETTER Z WITH CARON}': 'ZH',
        # Cyrillic
        u'\N{CYRILLIC SMALL LETTER A}': 'AA0',
        u'\N{CYRILLIC SMALL LETTER BE}': 'B',
        u'\N{CYRILLIC SMALL LETTER VE}': 'V',
        u'\N{CYRILLIC SMALL LETTER GHE}': 'G',
        u'\N{CYRILLIC SMALL LETTER GHE WITH UPTURN}': 'G',
        u'\N{CYRILLIC SMALL LETTER DE}': 'D',
        u'\N{CYRILLIC SMALL LETTER IE}': 'EH0',
        u'\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}': 'IH0',
        u'\N{CYRILLIC SMALL LETTER ZHE}': 'ZH',
        u'\N{CYRILLIC SMALL LETTER ZE}': 'Z',
        u'\N{CYRILLIC SMALL LETTER SHORT I}': 'IY0',  # 'Y' ?
        u'\N{CYRILLIC SMALL LETTER I}': 'IY0',
        u'\N{CYRILLIC SMALL LETTER KA}': 'K',
        u'\N{CYRILLIC SMALL LETTER EL}': 'L',
        u'\N{CYRILLIC SMALL LETTER EM}': 'M',
        u'\N{CYRILLIC SMALL LETTER EN}': 'N',
        u'\N{CYRILLIC SMALL LETTER O}': 'AO0',
        u'\N{CYRILLIC SMALL LETTER PE}': 'P',
        u'\N{CYRILLIC SMALL LETTER ER}': 'R',
        u'\N{CYRILLIC SMALL LETTER ES}': 'S',
        u'\N{CYRILLIC SMALL LETTER TE}': 'T',
        u'\N{CYRILLIC SMALL LETTER U}': 'UH0',
        u'\N{CYRILLIC SMALL LETTER EF}': 'F',
        u'\N{CYRILLIC SMALL LETTER HA}': 'HH',  # 'Y'?? 'K'??
        u'\N{CYRILLIC SMALL LETTER CHE}': 'CH',
        u'\N{CYRILLIC SMALL LETTER SHA}': 'SH',
        # The soft sign has no entry on purpose ('Y' was considered); it
        # falls through to the hammer() fallback.
    }

    # One letter -> two phonemes.
    # NOTE(review): YA and YU always get the leading 'Y' here.  The
    # upper-case branches that used to follow (unreachable after lower())
    # emitted a bare 'AA0' / 'UH0' unless the letter was word-initial or
    # preceded by a vowel or apostrophe.  If that rule is wanted it has to
    # be applied to these lower-case letters - confirm with a speaker.
    letter_pairs = {
        u'\N{CYRILLIC SMALL LETTER SHCHA}': ('SH', 'CH'),
        u'\N{CYRILLIC SMALL LETTER TSE}': ('T', 'S'),
        u'\N{CYRILLIC SMALL LETTER YA}': ('Y', 'AO0'),  # not if unstressed?
        u'\N{CYRILLIC SMALL LETTER YU}': ('Y', 'UW0'),
        u'\N{CYRILLIC SMALL LETTER YI}': ('Y', 'IY0'),
        u'\N{CYRILLIC SMALL LETTER UKRAINIAN IE}': ('Y', 'EH0'),
    }

    phonemes = []
    previous = ' '
    word_length = len(word)
    for pos, letter in enumerate(word):
        # '' never equals a real letter, so it is a safe "no next letter".
        following = word[pos + 1] if pos + 1 < word_length else ''

        if letter == 'c':
            # Only the digraph 'ch' produces a sound; a lone 'c' is silent.
            if following == 'h':
                phonemes.append('CH')
        elif letter == 'i':
            phonemes.append('IY0' if previous == 'j' else 'IH0')
        elif letter == 'h':
            # After z/s/c the 'h' belongs to a digraph already emitted.
            if previous not in ('z', 's', 'c'):
                phonemes.append('HH')
        elif letter == 's':
            phonemes.append('SH' if following == 'h' else 'S')
        elif letter == 'u':
            phonemes.append('UW0' if previous == 'j' else 'UH0')
        elif letter == 'z':
            phonemes.append('ZH' if following == 'h' else 'Z')
        elif letter in letter_pairs:
            phonemes.extend(letter_pairs[letter])
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1:
            # Fallback: retry once with the hammered character and keep only
            # the first phoneme it yields.
            if not recursive:
                phon = breakdownWord(hammer(letter), True)
                if phon:
                    phonemes.append(phon[0])
        previous = letter

    # Collapse runs of the same phoneme (e.g. doubled letters).
    collapsed = []
    for phoneme in phonemes:
        if not collapsed or collapsed[-1] != phoneme:
            collapsed.append(phoneme)
    return collapsed
Example #11
0
def breakdownWord(word, recursive=False):
    """Break a word into a list of phoneme codes, one letter at a time.

    The word is lower-cased, then each character is translated while looking
    at the previous letter and up to two following letters so that digraphs
    and trigraphs (cs, sz, zs, dzs, gy, ly, ny, ty, vowel + i/j/y ...) produce
    a single sound.  The letter set (acute and double-acute vowels plus
    these digraphs) matches Hungarian spelling - presumably this is the
    Hungarian breakdown; confirm against the caller.

    Args:
        word: Text to convert.  Spaces are ignored.
        recursive: True when called from the fallback path below; prevents
            the fallback from recursing a second time.

    Returns:
        A list of phoneme code strings in which runs of identical
        consecutive codes have been collapsed to one.

    Characters that match no rule are passed to ``hammer`` (a helper defined
    elsewhere in this file; presumably it strips accents to a plain
    character - TODO confirm).  If ``hammer`` returns a single character, the
    first phoneme of that character's own breakdown is used.
    """
    word = word.lower()
    a_acute = u'\N{LATIN SMALL LETTER A WITH ACUTE}'

    # One letter -> one phoneme, no context needed.
    simple_convert = {
        'b': 'B',
        u'\N{LATIN SMALL LETTER E WITH ACUTE}': 'EY0',
        'f': 'F',
        'h': 'HH',
        u'\N{LATIN SMALL LETTER I WITH ACUTE}': 'IY0',
        'k': 'K',
        'm': 'M',
        'n': 'N',
        u'\N{LATIN SMALL LETTER O WITH ACUTE}': 'OW0',  # ER0 ? AO0 ?
        'p': 'P',
        'r': 'R',
        't': 'T',
        'u': 'UW0',
        u'\N{LATIN SMALL LETTER U WITH ACUTE}': 'UW0',
        u'\N{LATIN SMALL LETTER O WITH DIAERESIS}': 'ER0',
        u'\N{LATIN SMALL LETTER O WITH DOUBLE ACUTE}': 'ER0',
        u'\N{LATIN SMALL LETTER U WITH DIAERESIS}': 'UW0',  # IH0?
        u'\N{LATIN SMALL LETTER U WITH DOUBLE ACUTE}': 'UW0',
        'v': 'V',
        'w': 'V',
    }
    # Tuples (not strings) so that the '' sentinel below is never "in" them.
    glides = ('i', 'j', 'y')
    # Vowels that already emitted a diphthong covering a following i/j/y.
    diphthong_heads = ('a', 'o', a_acute)

    phonemes = []
    previous = ' '
    length = len(word)
    for pos, letter in enumerate(word):
        # '' never equals a real letter, so it is a safe "no such letter".
        nxt = word[pos + 1] if pos + 1 < length else ''
        nxt2 = word[pos + 2] if pos + 2 < length else ''
        before_ly = nxt == 'l' and nxt2 == 'y'

        if letter == 'a':
            phonemes.append('OY0' if nxt in glides or before_ly else 'AO0')
        elif letter == a_acute:
            phonemes.append('AY0' if nxt in glides or before_ly else 'AA0')
        elif letter == 'c':
            if nxt == 's':  # cs
                phonemes.append('CH')
            else:
                phonemes.append('T')
                phonemes.append('S')
        elif letter == 'd':
            if nxt == 'z':
                # dz / dzs: the z branch emits the whole sound (it adds the
                # 'D' itself for dz and a single 'JH' for dzs), so emitting
                # 'D' here would turn dzs into 'D JH'.
                pass
            elif nxt == 's':
                # Kept exactly as before.
                # NOTE(review): confirm that dropping 'd' before 's' is
                # intended; nothing later re-emits it.
                pass
            else:
                phonemes.append('D')
        elif letter == 'e':
            if previous == 'e':
                pass  # second half of 'ee', emitted with the first
            elif nxt == 'e' or before_ly:
                phonemes.append('EY0')
            else:
                phonemes.append('EH0')
        elif letter == 'g':
            phonemes.append('JH' if nxt == 'y' else 'G')  # gy
        elif letter == 'i':
            if previous not in diphthong_heads:
                phonemes.append('IH0')  # IY0?
        elif letter == 'j':
            if previous not in diphthong_heads:
                phonemes.append('Y')
        elif letter == 'l':
            # 'ly' is left to the y branch - close enough to just IY.
            if nxt != 'y':
                phonemes.append('L')
        elif letter == 'o':
            phonemes.append('OY0' if nxt in glides or before_ly else 'AO0')
        elif letter == 'q':  # loan words
            phonemes.append('K')
            phonemes.append('W')
        elif letter == 's':
            if previous == 'c':
                pass  # second half of 'cs'
            elif nxt == 's' and nxt2 == 'z':  # first s of ssz
                pass
            elif nxt == 'z' and previous == 's':  # second s of ssz
                # The doubled code is collapsed again by the final pass.
                phonemes.append('S')
                phonemes.append('S')
            elif nxt == 'z':  # sz
                phonemes.append('S')
            elif previous == 'z':
                # Second half of 'zs' / 'dzs': every z that is followed by s
                # has already emitted 'ZH' or 'JH' for the pair, so adding
                # 'SH' here would pronounce the digraph twice.
                pass
            else:
                phonemes.append('SH')
        elif letter == 'x':  # loan words only
            phonemes.append('K')
            phonemes.append('S')
        elif letter == 'y':
            if previous in diphthong_heads:
                pass  # part of a diphthong emitted under the vowel
            elif previous == 'g':
                pass  # handled under g
            elif previous == 't':
                phonemes.append('Y')
            elif previous == 'n':
                pass  # close enough to just n
            else:
                phonemes.append('IY0')
        elif letter == 'z':
            if nxt == 's' and previous == 'd':  # dzs
                phonemes.append('JH')
            elif previous == 'z' and nxt == 's':  # second z of zzs
                # The doubled code is collapsed again by the final pass.
                phonemes.append('ZH')
                phonemes.append('ZH')
            elif nxt == 's':  # zs
                phonemes.append('ZH')
            elif nxt == 'z' and nxt2 == 's':  # first z of zzs
                pass
            elif previous == 'd':  # dz
                phonemes.append('D')
                phonemes.append('S')
            elif previous == 's':
                pass  # handled under s
            elif previous == 'c':
                pass  # handled under c
            else:
                phonemes.append('Z')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == " ":
            pass
        elif len(hammer(letter)) == 1:
            # Fallback: retry once with the hammered character and keep only
            # the first phoneme it yields.
            if not recursive:
                phon = breakdownWord(hammer(letter), True)
                if phon:
                    phonemes.append(phon[0])
        previous = letter

    # Collapse runs of the same phoneme (e.g. doubled letters).
    collapsed = []
    for phoneme in phonemes:
        if not collapsed or collapsed[-1] != phoneme:
            collapsed.append(phoneme)
    return collapsed
Example #12
0
def breakdownWord(word, recursive=False):
    word = word.lower()
    isvowel = dict.fromkeys('aàáâãäåæeèéêëiìíîïoòóôõöøœuùúûü').has_key
    phonemes = []
    simple_convert = {
        'j': 'JH',
        'k': 'K',
        'q': 'K',
        'v': 'V',
        u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'S'  # ç
    }
    easy_consonants = simple_convert.keys()
    pos = 0
    previous = ' '
    for letter in word:
        if letter == len(word) > pos + 1 and word[pos + 1]:
            phonemes.append({letter})
        elif letter in ['b', 'd', 'g', 'p', 'x'] and pos + 1 == len(word):  # silent at end of words
            pass
        elif letter in ['a', accented_a]:
            if (len(word) > pos + 2 and word[pos + 1] in ['i', accented_i]) and word[pos + 2] != 'l':  # ai
                phonemes.append('EH0')
            elif len(word) > pos + 1 and word[pos + 1] in ['u', accented_u]:  # au
                phonemes.append('AO0')
            else:
                phonemes.append('AE0')
        elif letter in ['e', accented_e]:
            if pos + 1 == len(word) and len(word) == 2:  # takes care of words like 'je'
                phonemes.append('EH0')
            elif previous == 'u' and pos + 1 == len(word) and len(word) == 3 and word[pos - 2] == 'q':  # que
                phonemes.append('EH0')
            elif pos + 1 == len(word) and len(word) > 2:  # takes care of words like 'parle'
                pass
            elif previous == 'l' and word[pos + 1] == 's' and len(word) == 5 and word[pos - 2] == 'l':  # elles
                pass
            elif len(word) > pos + 2 and word[pos + 1] == 'a' and word[pos + 2] == 'u':
                pass
            elif previous in ['o', accented_o]:
                pass
            elif word[0] == letter and (
                                len(word) > pos + 2 and word[pos + 1] in ['m', 'n'] and word[pos + 2] in ['m',
                                                                                                          'n']) and (
                        word != 'ennemmi'):
                phonemes.append('AE0')
            elif previous != 'i' and (len(word) == pos + 2 and word[pos + 1] in ['m', 'n']) or (
                                len(word) > pos + 2 and word[pos + 1] in ['m', 'n'] and word[pos + 2] in ['b', 'c', 'd',
                                                                                                          'f',
                                                                                                          'g', 'j', 'k',
                                                                                                          'l',
                                                                                                          'p', 'q', 'r',
                                                                                                          's',
                                                                                                          't', 'v', 'w',
                                                                                                          'x',
                                                                                                          'z']):
                phonemes.append('AE0')
            elif previous == 'f' and len(word) > pos + 3 and word[pos + 1] == 'm' and word[pos + 2] == 'm' and word[
                        pos + 3] == 'e':
                phonemes.append('AE0')
            elif previous == 'u' and word[pos - 2] == 'q' and pos == len(word):
                pass
            else:
                phonemes.append('EH0')
        elif letter in ['i', accented_i]:
            if previous in ['e', accented_e] and ((len(word) > pos + 2 and word[pos + 1] in ['m', 'n'] and word[
                    pos + 2] in ['b', 'c', 'd', 'f', 'g', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w',
                                 'x', 'z']) or (len(word) == pos + 2)):
                pass
            elif previous in ['f', 't', 'v', 's'] and word[-1] == 'n' and len(word) > 1 and letter == word[-2]:
                phonemes.append('EH0')
            elif len(word) > pos + 2 and word[pos + 1] == 'm' and word[pos + 2] in ['b', 'p']:
                phonemes.append('EH0')
            elif len(word) > pos + 2 and word[pos + 1] == 'n' and word[pos + 2] in ['c', 'd', 'f', 'g', 'j', 'l', 'q',
                                                                                    's', 't', 'v']:
                phonemes.append('EH0')
                phonemes.append('NG')
            elif previous in ['a', accented_a] and len(word) > pos + 1 and word[pos + 1] != 'l':
                phonemes.append('EH0')
            elif previous in ['o', accented_o] and len(word) == pos + 2 and word[pos + 1] == 'n':
                phonemes.append('EH0')
            elif previous in ['o', accented_o] and len(word) > pos + 2 and word[pos + 1] == 'n' and word[pos + 2] in [
                'b', 'c', 'd', 'f', 'g', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'x', 'z']:
                phonemes.append('EH0')
            elif previous in ['o', accented_o] and not (len(word) > pos + 2 and word[pos + 1] == 'n' and (
                            word[pos + 2] in ['b', 'c', 'd', 'f', 'g', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't',
                                              'v',
                                              'w', 'x', 'z'] or pos + 2 == len(word))):
                phonemes.append('AE0')
            else:
                phonemes.append('IH0')
        elif letter in ['o', accented_o]:
            if previous == 'm' and len(word) > pos + 6 and word[pos + 1] == 'n' and word[pos + 2] == 's' and word[
                        pos + 3] == 'i' and word[pos + 4] == 'e' and word[pos + 5] == 'u' and word[pos + 6] == 'r':
                phonemes.append('EH0')  # monsieur
            elif len(word) > pos + 1 and word[pos + 1] == 'y':
                phonemes.append('W')
                phonemes.append('AE0')
            elif len(word) > pos + 2 and word[pos + 1] == 'i' and word[pos + 2] in ['m', 'n']:
                phonemes.append('W')
                phonemes.append('EH0')
            elif len(word) > pos + 2 and word[pos + 1] == 'u' and word[pos + 2] in ['i', accented_i]:  # stress vowel
                phonemes.append('W')
            elif len(word) > pos + 1 and word[pos + 1] in ['i', accented_i]:
                phonemes.append('W')
            elif len(word) > pos + 1 and word[pos + 1] in ['u', accented_u]:
                phonemes.append('UW0')
            elif len(word) > pos + 1 and word[pos + 1] in ['e', accented_e]:
                phonemes.append('EH0')
            else:
                phonemes.append('AO0')
        elif letter in ['u', accented_u]:
            if previous == 'l' and len(word) > pos + 3 and word[pos + 1] == 'n' and word[pos + 2] == 'd' and word[
                        pos + 3] == 'i':
                phonemes.append('EH0')  # lundi
            elif previous == 'o' and len(word) > pos + 1 and word[pos + 1] in ['i', accented_i]:
                pass
            elif previous in ['b', 'c', 'd', 'f', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'r', 's', 't', 'v', 'w', 'x',
                              'z'] and len(word) > pos + 1 and word[pos + 1] == 'i':
                phonemes.append('W')
            elif (len(word) == pos + 2 and word[pos + 1] in ['m', 'n']) or (
                                len(word) > pos + 2 and word[pos + 1] in ['m', 'n'] and word[pos + 2] in ['b', 'c', 'd',
                                                                                                          'f',
                                                                                                          'g', 'j', 'k',
                                                                                                          'l',
                                                                                                          'p', 'q', 'r',
                                                                                                          's',
                                                                                                          't', 'v', 'w',
                                                                                                          'x',
                                                                                                          'z']):
                phonemes.append('EH0')
            elif previous in ['a', accented_a]:
                phonemes.append('AO0')
            elif previous in ['g', 'q']:
                pass
            elif previous in ['o', accented_o]:
                phonemes.append('UW0')
            elif len(word) > pos + 1 and word[pos + 1] in ['a', accented_a]:
                phonemes.append('AE0')
            elif len(word) > pos + 1 and word[pos + 1] in ['e', accented_e]:
                phonemes.append('EH0')
            elif previous == 'g' and len(word) > pos + 1 and word[pos + 1] in ['e', accented_e]:
                phonemes.append('JH')
            else:
                phonemes.append('UW0')
        elif letter == 'y':
            if letter == word[0]:
                phonemes.append('Y')
            elif previous in ['a', 'e', 'i', 'o', 'u', accented_a, accented_e, accented_i, accented_o,
                              accented_u] and len(word) > pos + 1 and word[pos + 1] in ['a', 'e', 'i', 'o', 'u',
                                                                                        accented_a, accented_e,
                                                                                        accented_i, accented_o,
                                                                                        accented_u]:
                phonemes.append('Y')
            elif len(word) > pos + 2 and word[pos + 1] in ['m', 'n'] and len(word) == pos + 2:
                phonemes.append('EH0')
            elif len(word) > pos + 2 and word[pos + 1] in ['m', 'n'] and word[pos + 2] in ['b', 'c', 'd', 'f', 'g', 'j',
                                                                                           'k', 'l', 'p', 'q', 'r', 's',
                                                                                           't', 'v', 'w', 'x', 'z']:
                phonemes.append('EH0')
            else:
                phonemes.append('IH0')
        elif letter == 'b':
            if len(word) > pos + 1 and word[pos + 1] in ['s', 't']:
                phonemes.append('P')
            else:
                phonemes.append('B')
        elif letter == 'c':
            if len(word) > pos + 2 and word[pos + 1] == 'q' and word[pos + 2] == 'u':
                pass
            elif word[pos - 2] == 'p' and previous in ['e', accented_e] and len(word) == pos + 2 and word[
                        pos + 1] == 't':  # takes care of words like 'respect'
                pass
            elif previous == 's' and len(word) > pos + 1 and word[pos + 1] in ['e', 'i', 'y', accented_e, accented_i]:
                pass
            elif len(word) > pos + 1 and word[pos + 1] == word[-1] and word[-1] in ['e', accented_e]:
                phonemes.append('Z')
            elif len(word) > pos + 1 and (
                            word[pos + 1] in ['a', 'o', 'u', 'l', accented_a, accented_o, accented_u] or word[
                            pos + 1] in ['b',
                                         'c',
                                         'd',
                                         'f',
                                         'g',
                                         'j',
                                         'k',
                                         'l',
                                         'm',
                                         'n',
                                         'p',
                                         'q',
                                         'r',
                                         's',
                                         't',
                                         'v',
                                         'w',
                                         'x',
                                         'z']):
                phonemes.append('K')
            elif len(word) > pos + 1 and word[pos + 1] in ['e', 'i', 'y', accented_e, accented_i]:
                phonemes.append('S')
            elif previous == 'n' and len(word) == pos + 1:
                pass
            else:
                pass
        elif letter == 'd':
            if len(word) > pos + 1 and word[pos + 1] in ['-', '_']:
                phonemes.append('T')
            elif len(word) > pos + 1 and word[pos + 1] in ['s', 't']:
                pass
            else:
                phonemes.append('D')
        elif letter == 'f':
            if len(word) > pos + 1 and word[pos + 1] in ['-', '_']:
                phonemes.append('V')
            else:
                phonemes.append('F')
        elif letter == 'g':
            if previous == 'n':
                phonemes.append('NG')
            elif len(word) > pos + 1 and word[pos + 1] in ['e', 'i', 'y', accented_e, accented_i]:
                phonemes.append('JH')
            elif len(word) > pos + 1 and word[pos + 1] in ['s', 't']:
                pass
            else:
                phonemes.append('G')
        elif letter == 'h':
            if previous == 'c' and len(word) > pos + 1 and word[pos + 1] == 'r':
                phonemes.append('K')
            elif previous == 'c' and len(word) > pos + 1 and word[pos + 1] != 'r':
                phonemes.append('SH')
            else:
                pass
        elif letter == 'l':
            if word[pos - 2] in ['m', 'v', 'h', 'k'] and previous == 'i' and word[pos - 3] not in ['a',
                                                                                                   '']:  # mil*, vil*
                phonemes.append('L')
            elif word[pos - 3] in ['m', 'v'] and word[pos - 2] == 'i' and previous == 'l' and word[pos - 4] not in ['a',
                                                                                                                    '']:  # mill* ,vill*
                phonemes.append('L')
            elif word[pos - 3] == 'q' and word[pos - 2] == 'u' and previous == 'i':  # tranquil*
                phonemes.append('L')
            elif word[pos - 3] == 'u' and word[pos - 2] == 'i' and previous == 'l' and word[
                        pos - 4] == 'q':  # tranquill*
                phonemes.append('L')
            elif ((previous == 'i' or (previous == 'i' and len(word) > pos + 1 and word[pos + 1] == letter) or (
                                    previous == 'i' and len(word) > pos + 2 and word[pos + 1] == letter and word[
                            pos + 2] == 'e'))):
                phonemes.append('Y')
            elif ((word[pos - 2] == 'i' and previous == letter) or (
                                    word[pos - 2] == 'i' and previous == letter and len(word) > pos + 1 and word[
                            pos + 1] == 'e')):  # il, ill,ille
                phonemes.append('Y')
            else:
                phonemes.append('L')
        elif letter == 'm':
            if previous == 'a' and len(word) > pos + 1 and word[pos + 1] == 'n':
                pass
            elif letter == word[-1] and word[-2] == 'i' and word[-3] == 'a':
                phonemes.append('NG')
            elif previous in ['a', 'e', 'i', 'o', 'u'] and (len(word) == pos + 1 or (
                            len(word) > pos + 1 and word[pos + 1] in ['b', 'c', 'd', 'f', 'g', 'j', 'k', 'l', 'p', 'q',
                                                                      'r',
                                                                      's', 't', 'v', 'w', 'x', 'z'])):
                phonemes.append('NG')
            else:
                phonemes.append('M')
        elif letter == 'n':
            if previous == 'o' and len(word) > pos + 5 and word[pos + 1] == 's' and word[pos + 2] == 'i' and word[
                        pos + 3] == 'e' and word[pos + 4] == 'u' and word[pos + 5] == 'r':
                pass
            elif previous in ['a', 'e', 'i', 'o', 'u', accented_a, accented_e, accented_i, accented_o, accented_u] and (
                            len(word) == pos + 1 or (
                                    len(word) > pos + 1 and word[pos + 1] in ['b', 'c', 'd', 'f', 'g', 'j', 'k', 'l',
                                                                              'n', 'p', 'q',
                                                                              'r', 's', 't', 'v', 'w', 'x',
                                                                              'z'])):  # n was forcefully added
                phonemes.append('NG')
            else:
                phonemes.append('N')
        elif letter == 'p':
            if len(word) > pos + 1 and word[pos + 1] in ['-', '_']:
                phonemes.append('P')
            elif previous == 'm' and len(word) > pos + 1 and word[pos + 1] == 't':  # mpt
                pass
            elif len(word) > pos + 1 and word[pos + 1] == 'h':  # ph
                phonemes.append('F')
            else:
                phonemes.append('P')
        elif letter == 'r':
            if word[pos - 2] == 'e' and previous == 'u':
                phonemes.append('R')
            elif pos + 1 == len(word):
                pass
            else:
                phonemes.append('R')
        elif letter == 's':
            if pos + 1 == len(word) and not ((word[pos - 3] == 'i' and word[pos - 2] == 'l' and previous == 'i') or (
                                word[pos - 3] in ['e', accented_e, 't'] and word[
                                pos - 2] == 'l' and previous == 'a') or (
                                word[pos - 3] == 'f' and word[pos - 2] == 'i' and previous == 'l') or word == 'lis'):
                pass
            elif len(word) > pos + 2 and word[pos + 1] == 'c' and word[pos + 2] == 'h':
                pass
            elif previous in ['d', 't']:
                pass
            elif previous == 'e' and pos + 2 == len(word) and len(word) == 3 and word[pos + 1] == 't':  # est
                pass
            elif previous in ['a', 'e', 'i', 'o', 'u', accented_a, accented_e, accented_i, accented_o,
                              accented_u] and len(word) > pos + 1 and word[pos + 1] in ['a', 'e', 'i', 'o', 'u',
                                                                                        accented_a, accented_e,
                                                                                        accented_i, accented_o,
                                                                                        accented_u]:
                phonemes.append('Z')
            else:
                phonemes.append('S')
        elif letter == 't':
            if pos + 1 == len(word) and previous not in ['i', 'c', accented_i] and word != 'gadget' or word[
                        pos - 2] in ['a', accented_a]:
                pass
            elif len(word) > pos + 1 and word[pos + 1] == 's':
                pass
            elif previous in ['d', 'g']:
                pass
            elif word[pos - 3] == 'p' and word[pos - 2] == 'e' and previous == 'c' and len(word) == pos + 1:
                pass
            elif len(word) > pos + 1 and word[pos + 1] in ['-', '_']:
                phonemes.append('T')
            elif len(word) > pos + 3 and word[pos + 1] == 'i' and word[pos + 2] == 'o' and word[pos + 3] == 'n' or len(
                    word) > pos + 5 and word[pos + 1] == 'i' and word[pos + 2] == 'e' and word[pos + 3] == 'n' and word[
                        pos + 4] == 'c' and word[pos + 5] == 'e':
                phonemes.append('S')  # takes care of words ending with 'ience'
            else:
                phonemes.append('T')
        elif letter == 'w':
            if len(word) > pos + 4 and word[1:] == 'agon':
                phonemes.append('V')  # wagon
            else:
                phonemes.append('W')
        elif letter == 'x':
            if previous == 'u' and pos == len(word):
                pass
            elif len(word) > pos + 1 and word[pos + 1] in ['-', '_']:
                phonemes.append('Z')
            elif (len(word) > pos + 1 and word[pos + 1] in ['b', 'c', 'd', 'f', 'g', 'j', 'k', 'l', 'm', 'n', 'p', 'q',
                                                            'r', 's', 't', 'v', 'w', 'x', 'y', 'z']) or (
                            word[pos - 2] == 't' and previous != 'a'):
                phonemes.append('K')
                phonemes.append('S')
            elif len(word) > pos + 1 and word[pos + 1] in ['a', 'e', 'h', 'i', 'o', 'u', accented_a, accented_e,
                                                           accented_i, accented_o, accented_u] and (
                            word[pos - 2] != 't' and previous not in ['a', accented_a]):
                phonemes.append('Z')
            else:
                phonemes.append('K')
                phonemes.append('S')
        elif letter == 'y':
            if previous == 'a':  # ay
                phonemes.append('EH0')
            else:
                phonemes.append('IH0')
        elif letter == 'z':
            if word[-1] == letter and word[:-1] == 'berlio':
                phonemes.append('Z')
            elif word[-1] == letter and len(word) > 1:
                pass
            else:
                phonemes.append('Z')
        elif letter in easy_consonants:
            phonemes.append(simple_convert[letter])
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1:
            if not recursive:
                phon = breakdownWord(hammer(letter[0]), True)
                if phon:
                    phonemes.append(phon[0])
                    # ~ else:
                    # ~ print "not handled", letter, word
        pos += 1
        previous = letter
    # return " ".join(phonemes)
    # return phonemes
    temp_phonemes = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            temp_phonemes.append(phoneme)
        previous_phoneme = phoneme
    return temp_phonemes
Example #13
0
def syllablesToPhonemes(syllables, recursive=False):
    """Translate a syllabified word into a list of CMU-style phoneme codes.

    Every letter is looked at together with the letter before and after it
    *inside the same syllable*; digraphs are therefore never matched across
    a syllable boundary.  The digraph rules (ij, oe, ui, uw, ...) look like
    Dutch spelling -- NOTE(review): the target language is not named in this
    function, confirm against the caller.

    Args:
        syllables: indexable sequence of syllable strings making up one word
            (a plain string is accepted too; each character is then treated
            as a one-letter syllable).
        recursive: True when the function is called on the result of
            ``hammer()``; prevents a second level of recursion.

    Returns:
        List of phoneme strings with consecutive duplicates collapsed.
    """
    # A frozenset replaces dict.fromkeys(...).has_key, which is Python 2 only.
    vowels = frozenset('aeiou')
    simple_convert = {
        'b': 'B',
        'd': 'D',
        'f': 'F',
        'h': 'HH',
        'j': 'Y',  # SH in some words borrowed from French
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'n': 'N',
        'p': 'P',
        'r': 'R',
        's': 'S',
        't': 'T',
        'v': 'F',  # English F mixed with English V
        'w': 'V',  # closer to a soft English V than to the English W
        'z': 'Z',
    }
    phonemes = []
    final_syllable = len(syllables) - 1
    for syllable_index, syllable in enumerate(syllables):
        final_letter = len(syllable) - 1
        previous_letter = ' '  # reset: neighbours are per syllable
        for letter_index, letter in enumerate(syllable):
            ends_syllable = letter_index == final_letter
            # '' when there is no next letter; never equal to a real letter.
            following = '' if ends_syllable else syllable[letter_index + 1]
            ends_word = ends_syllable and syllable_index == final_syllable

            if letter == previous_letter and letter not in vowels:
                pass  # double consonant: already emitted once
            # ===================== consonants ==========================
            elif letter == 'b' and ends_word:
                phonemes.append('P')
            elif letter == 'd' and ends_word:
                phonemes.append('T')
            elif letter == 'n' and following == 'g':
                # ng: emitted when the g is reached.  The original indexed
                # the syllable with the constant 1 instead of the current
                # position, so this rule fired on the wrong letter.
                pass
            elif letter == 'g' and previous_letter == 'n':
                phonemes.append('NG')
            elif letter == 'g':
                phonemes.append('HH')  # not accurate, nearest CMU phoneme
            elif letter == 'c' and following == 'h':
                # ch: emitted when the h is reached.  The original guard was
                # off by one and missed a syllable-final "ch".
                pass
            elif letter == 'h' and previous_letter == 'c':
                phonemes.append('HH')  # not accurate, nearest CMU phoneme
            elif letter == 't' and following == 'h':
                pass  # th: emitted when the h is reached
            elif letter == 'h' and previous_letter == 't':
                phonemes.append('TH')
            elif letter == 'j' and previous_letter == 'i':
                pass  # ij: handled in vowels
            elif letter == 'w' and previous_letter == 'u':
                pass  # uw: handled in vowels
            elif letter == 'x':  # rare, mostly borrowed words
                phonemes.append('K')
                phonemes.append('S')
            elif letter == 'q':  # rare, mostly borrowed words
                phonemes.append('K')
                phonemes.append('W')
            elif letter == 'c':
                # s before e and i; k everywhere else
                if following in ('e', 'i'):
                    phonemes.append('S')
                else:
                    phonemes.append('K')
            elif letter in simple_convert:
                phonemes.append(simple_convert[letter])
            # ======================= vowels ============================
            elif letter == 'a':  # short AH, long AA
                if following == 'a':  # double a
                    phonemes.append('AA0')
                elif previous_letter == 'a':  # second half of double a
                    pass
                elif ends_syllable:  # long a written as a single letter
                    phonemes.append('AA0')
                elif following == 'u':  # au
                    phonemes.append('AW0')
                else:
                    phonemes.append('AH0')  # like English short u (cut, hut)
            elif letter == 'e':  # short EH, long EY
                if following == 'e':  # double e
                    phonemes.append('EY0')
                elif previous_letter in ('e', 'i', 'o'):
                    pass  # ee / ie / oe were emitted at the first letter
                elif ends_syllable:  # long e written as a single letter
                    phonemes.append('EY0')
                elif following == 'u':  # eu
                    phonemes.append('ER0')
                elif following == 'i':  # ei
                    phonemes.append('AY0')
                else:
                    phonemes.append('EH0')
            elif letter == 'i':  # short IH, long IY
                if following == 'i':  # double i
                    phonemes.append('IY0')
                elif previous_letter in ('u', 'i', 'e'):
                    pass  # ui / ii / ei were emitted at the first letter
                elif ends_syllable:  # long i written as a single letter
                    phonemes.append('IY0')
                elif following == 'j':  # ij
                    phonemes.append('AY0')
                elif following in ('u', 'e'):  # iu, ie
                    phonemes.append('IY0')
                else:
                    phonemes.append('IH0')
            elif letter == 'o':  # short AO, long OW
                if following == 'o':  # double o
                    phonemes.append('OW0')
                elif previous_letter == 'o':  # second half of double o
                    pass
                elif ends_syllable:  # long o written as a single letter
                    phonemes.append('OW0')
                elif following == 'e':  # oe
                    phonemes.append('UW0')
                elif following == 'u':  # ou
                    phonemes.append('AW0')
                else:
                    phonemes.append('AO0')
            elif letter == 'u':
                if following == 'u':  # double u
                    phonemes.append('UW0')
                elif previous_letter in ('u', 'a', 'e'):
                    pass  # uu / au / eu were emitted at the first letter
                elif previous_letter == 'i':  # iu: the u is voiced too
                    phonemes.append('UW0')
                elif previous_letter == 'o':  # ou was emitted at the o
                    pass
                elif ends_syllable:  # long u written as a single letter
                    phonemes.append('UW0')
                elif following == 'w':  # uw
                    phonemes.append('UW0')
                elif following == 'i':  # ui
                    phonemes.append('UH0')  # not accurate, nearest phoneme
                else:
                    phonemes.append('ER0')  # not accurate, nearest phoneme
            # --- trema (looks like a German umlaut, different meaning) ---
            elif letter == u'\N{LATIN SMALL LETTER A WITH DIAERESIS}':
                phonemes.append('AH0')
            elif letter == u'\N{LATIN SMALL LETTER E WITH DIAERESIS}':
                phonemes.append('EH0')
            elif letter == u'\N{LATIN SMALL LETTER I WITH DIAERESIS}':
                phonemes.append('IH0')
            elif letter == u'\N{LATIN SMALL LETTER O WITH DIAERESIS}':
                phonemes.append('AO0')
            elif letter == u'\N{LATIN SMALL LETTER U WITH DIAERESIS}':
                phonemes.append('ER0')
            elif letter in (u'\N{LATIN SMALL LETTER Y WITH DIAERESIS}',
                            u'\N{LATIN SMALL LETTER Y WITH ACUTE}'):
                phonemes.append('AY0')
            elif len(hammer(letter)) == 1:
                # Unknown letter: hammer() is defined elsewhere in the file
                # (presumably it maps the letter to a plain one -- confirm).
                # Convert its result once and keep only the first phoneme.
                if not recursive:
                    phon = syllablesToPhonemes(hammer(letter), True)
                    if phon:
                        phonemes.append(phon[0])
            previous_letter = letter

    # Collapse runs of identical phonemes.
    collapsed = []
    last_kept = " "
    for phoneme in phonemes:
        if phoneme != last_kept:
            collapsed.append(phoneme)
        last_kept = phoneme
    return collapsed
Example #14
0
def breakdownWord(input_word, recursive=False):
    """Break a Spanish word down into a list of phoneme codes.

    Each letter is converted according to its neighbours, the result is
    passed through ``stressSpanishWord`` (defined elsewhere in this file),
    and finally runs of identical phonemes are collapsed.

    Args:
        input_word: the word to convert; it is lower-cased before use.
        recursive: True when the function is called on the result of
            ``hammer()``; prevents a second level of recursion.

    Returns:
        List of phoneme strings with consecutive duplicates removed.
    """
    word = input_word.lower()
    plain_vowels = (u'a', u'e', u'i', u'o', u'u')
    last_index = len(word) - 1
    previous = u''
    breakdown_word = []
    for word_index, letter in enumerate(word):
        has_next = word_index < last_index
        # u'' when there is no next letter; never equal to a real letter.
        following = word[word_index + 1] if has_next else u''

        if letter == u'b':
            if word_index == 0 or previous in (u'm', u'n'):
                breakdown_word.append('B')
            else:
                breakdown_word.append('V')
        elif letter == u'c':
            if following == u'h':
                breakdown_word.append('CH')
            elif previous == u'c':
                breakdown_word.append('S')
            elif following == u's':
                pass
            elif following in (u'e', u'i'):
                # should this be SH before 'e', S before 'i' ??
                # South American; Castilian Spanish uses 'TH'
                breakdown_word.append('S')
            else:
                breakdown_word.append('K')
        elif letter == u'd':
            if word_index == 0 or previous in (u'l', u'n'):
                breakdown_word.append('D')
            else:
                breakdown_word.append('DH')
        elif letter == u'e':
            if not has_next or following in plain_vowels:
                breakdown_word.append('EY0')
            else:
                breakdown_word.append('EH0')
        elif letter == u'\N{LATIN SMALL LETTER E WITH ACUTE}':
            if not has_next or following in plain_vowels:
                breakdown_word.append('EY1')
            else:
                breakdown_word.append('EH1')
        elif letter == u'g':
            if following == u'\N{LATIN SMALL LETTER U WITH DIAERESIS}':
                breakdown_word.append('V')
            elif following in (u'e', u'i'):
                breakdown_word.append('HH')
            else:
                breakdown_word.append('G')
        elif letter == u'h':
            pass  # never voiced on its own; "ch" is emitted at the c
        elif letter == u'l':
            if following == u'l':
                pass  # ll: emitted at the second l
            elif previous == u'l':
                breakdown_word.append('Y')
            else:
                breakdown_word.append('L')
        elif letter == u'n':
            if following == u'v':
                breakdown_word.append('M')
            else:
                breakdown_word.append('N')
        elif letter == u'\N{LATIN SMALL LETTER N WITH TILDE}':
            breakdown_word.append('N')
            breakdown_word.append('Y')
        elif letter == u'o':
            if has_next and following not in plain_vowels:
                breakdown_word.append('AO0')
            else:
                breakdown_word.append('OW0')
        elif letter == u'\N{LATIN SMALL LETTER O WITH ACUTE}':
            if has_next and following not in plain_vowels:
                breakdown_word.append('AO1')
            else:
                breakdown_word.append('OW1')
        elif letter == u'p':
            if has_next:
                breakdown_word.append('P')
            # a word-final p is dropped
        elif letter == u'r':
            # rr is emitted once, at the first r; CMU has no separate
            # phoneme for the strongly trilled variant.
            if previous != u'r':
                breakdown_word.append('R')
        elif letter == u's':
            if following in (u'd', u'g', u'l', u'm', u'n'):
                breakdown_word.append('Z')
            else:
                breakdown_word.append('S')
        elif letter == u'u':
            if previous == u'q':
                pass  # silent in qu
            elif previous == u'g' and following in (u'e', u'i'):
                # Silent in gue / gui (guerra, guitarra).  The original
                # tested for a following 'u' instead of 'e', so "gue"
                # wrongly produced UW0.
                pass
            else:
                breakdown_word.append('UW0')
        elif letter == u'\N{LATIN SMALL LETTER U WITH ACUTE}':
            if previous == u'q':
                pass
            elif previous == u'g' and following in (u'u', u'i'):
                # NOTE(review): kept exactly as found; an accented u is
                # normally voiced, so confirm whether this rule is wanted.
                pass
            else:
                breakdown_word.append('UW1')
        elif letter == u'v':
            if word_index == 0 or previous in (u'm', u'n'):
                breakdown_word.append('B')
            else:
                breakdown_word.append('V')
        elif letter == u'x':
            if previous in plain_vowels and following in plain_vowels:
                breakdown_word.append('K')
                breakdown_word.append('S')
            else:
                breakdown_word.append('S')
        elif letter == u'y':
            if len(word) == 1:
                breakdown_word.append('IY1')
            elif not has_next:
                breakdown_word.append('IY0')
            else:
                breakdown_word.append('Y')
        elif letter in unconditional_conversions:
            breakdown_word.append(unconditional_conversions[letter])
        elif len(hammer(letter)) == 1:
            # Unknown letter: hammer() is defined elsewhere in the file
            # (presumably it maps the letter to a plain one -- confirm).
            # Convert its result once and keep only the first phoneme.
            if not recursive:
                phon = breakdownWord(hammer(letter), True)
                if phon:
                    breakdown_word.append(phon[0])
        previous = letter

    breakdown_word = stressSpanishWord(breakdown_word)

    # Collapse runs of identical phonemes.
    collapsed = []
    last_kept = " "
    for phoneme in breakdown_word:
        if phoneme != last_kept:
            collapsed.append(phoneme)
        last_kept = phoneme
    return collapsed
Example #15
0
def breakdownWord(word, recursive=False):
    """Break a Norwegian word down into a list of CMU-style phoneme codes.

    Vowels are mapped to a short or long phoneme depending on what follows
    them; consonants are mapped one at a time, with silent letters and
    multi-letter combinations (sk, sj, kj, ng, hj, hv, ...) resolved by
    looking at the previous and next letter.

    Args:
        word: the word to convert; it is lower-cased before use.
        recursive: True when the function is called on the result of
            ``hammer()``; prevents a second level of recursion.

    Returns:
        List of phoneme strings with consecutive duplicates removed.
    """
    word = word.lower()
    # A frozenset replaces dict.fromkeys(...).has_key, which is Python 2 only.
    vowels = frozenset(
        u'aeiouy\N{LATIN SMALL LETTER A WITH RING ABOVE}'
        u'\N{LATIN SMALL LETTER AE}\N{LATIN SMALL LETTER O WITH STROKE}'
    )
    simple_convert = {
        'b': 'B',
        'c': 'S',
        'f': 'F',
        'm': 'M',
        'p': 'P',
        'r': 'R',
        't': 'T',
        'v': 'V',
        'w': 'V',
        'z': 'S',
    }
    short_vowels = {
        u'a': 'AA0',
        u'e': 'EH0',
        u'i': 'IH0',
        u'o': 'UH0',
        u'u': 'UH0',
        u'y': 'IH0',
        u'\N{LATIN SMALL LETTER AE}': 'AE0',
        u'\N{LATIN SMALL LETTER O WITH STROKE}': 'AH0',
        u'\N{LATIN SMALL LETTER A WITH RING ABOVE}': 'AA0',
    }
    long_vowels = {
        u'a': 'AA0',
        u'e': 'EY0',
        u'i': 'IY0',
        u'o': 'OW0',
        u'u': 'UW0',
        u'y': 'IY0',
        u'\N{LATIN SMALL LETTER AE}': 'AE0',
        u'\N{LATIN SMALL LETTER O WITH STROKE}': 'ER0',
        u'\N{LATIN SMALL LETTER A WITH RING ABOVE}': 'AO0',
    }
    # (vowel, next letter) pairs that override the short/long choice.
    # The original also had a second, unreachable 'o'+'i' rule (UW0 IY0)
    # and an unreachable "vowel + rd" rule whose condition compared the
    # same letter to both 'r' and 'd'; neither could ever fire, so they
    # are dropped without changing any result.
    vowel_pairs = {
        (u'a', u'i'): ['AY0'],
        (u'a', u'u'): ['AW0'],
        (u'e', u'i'): ['AY0'],
        (u'e', u'r'): ['AE0'],
        (u'o', u'i'): ['OY0'],
        (u'\N{LATIN SMALL LETTER O WITH STROKE}', u'y'): ['OW0', 'IY0'],
    }
    phonemes = []
    length = len(word)
    previous = ' '
    for pos, letter in enumerate(word):
        is_last = pos + 1 == length
        # '' when there is no next letter; never equal to a real letter.
        following = '' if is_last else word[pos + 1]

        if letter in vowels:
            pair = (letter, following)
            if pair in vowel_pairs:
                phonemes.extend(vowel_pairs[pair])
            elif length == pos + 2 and following == 'm':
                # vowel + word-final m
                phonemes.append(short_vowels[letter])
            elif (length > pos + 2 and following == word[pos + 2]
                    and following not in vowels):
                # vowel before a double consonant
                phonemes.append(short_vowels[letter])
            else:
                phonemes.append(long_vowels[letter])
        elif letter == 'd':
            if is_last and (previous == 'r' or previous in vowels):
                pass  # silent at the end, e.g. jord, god
            elif previous in ('l', 'n'):
                pass  # silent, e.g. holde, land
            else:
                phonemes.append('D')
        elif letter == 'g':
            if following == 'j':  # gjær
                pass  # handled as a normal j
            elif is_last and previous == 'i':  # ærlig
                pass  # silent at end of word
            elif previous == 'n':
                pass  # handled under n
            elif following in ('i', 'y'):
                phonemes.append('Y')
            elif word[pos + 1:pos + 3] == 'ei':
                phonemes.append('Y')
            else:
                phonemes.append('G')
        elif letter == 'h':
            # hjem / hver: the h is silent and the j / v is voiced normally.
            # The original used two separate `if` statements, so "hj" fell
            # into the `else` of the second one and still emitted HH.
            if following not in ('j', 'v'):
                phonemes.append('HH')
        elif letter == 'j':
            if previous not in ('k', 's'):  # kj / sj are handled under k / s
                phonemes.append('Y')
        elif letter == 'k':
            if previous == 's' and following in (
                    u'j', u'i', u'y', u'\N{LATIN SMALL LETTER O WITH STROKE}'):
                phonemes.append('SH')  # skjære, ski, skøyter
            elif previous == 's':
                phonemes.append('S')
                phonemes.append('K')
            elif following in (u'i', u'y'):  # kirke, kyss
                phonemes.append('HH')
            elif following == 'j':  # kjønn
                phonemes.append('HH')
            else:
                phonemes.append('K')  # kaffe
        elif letter == 'l':
            if following != 'j':  # ljug: handled as a normal j
                phonemes.append('L')
        elif letter == 'n':
            if following == 'g':  # fang
                phonemes.append('NG')
            else:
                phonemes.append('N')  # ni
        elif letter == 'q':  # foreign language loan-words?
            phonemes.append('K')
            phonemes.append('UW0')
        elif letter == 's':
            if previous == 'r':
                # Eastern Norway - norsk, person, for sent
                phonemes.append('SH')
            elif following == 'k':
                pass  # handled under k
            elif following == 'j':  # sjø
                phonemes.append('SH')
            elif following == 'l':
                phonemes.append('SH')  # informal usage
            else:
                phonemes.append('S')  # syv
        elif letter == 'x':
            phonemes.append('K')
            phonemes.append('S')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == " ":
            pass
        elif len(hammer(letter)) == 1:
            # Unknown letter: hammer() is defined elsewhere in the file
            # (presumably it maps the letter to a plain one -- confirm).
            # Its phonemes are appended as ONE space-joined string, exactly
            # as the original did.
            if not recursive:
                phon = " ".join(breakdownWord(hammer(letter), True))
                if phon:
                    phonemes.append(phon)
        previous = letter

    # Collapse runs of identical phonemes.
    collapsed = []
    last_kept = " "
    for phoneme in phonemes:
        if phoneme != last_kept:
            collapsed.append(phoneme)
        last_kept = phoneme
    return collapsed
Example #16
0
def breakdownWord(word, recursive=False):
    """Break a word into a list of phoneme codes.

    Accepts Cyrillic letters as well as a Latin transliteration (the rules,
    e.g. the -ogo/-ego endings and final-consonant devoicing, look like
    Russian).  Most letters map one-to-one through ``simple_convert``;
    letters whose sound depends on position or neighbours are handled by
    the explicit branches in the loop.

    Args:
        word: the word to convert; it is lower-cased before processing.
        recursive: True when this call is itself the fallback for a single
            unknown letter, which stops a second level of fallback.

    Returns:
        A list of phoneme strings in which runs of identical consecutive
        phonemes are collapsed to one.  Hard and soft signs map to an
        empty string, which is kept in the list.
    """
    word = word.lower()
    simple_convert = {
        'a': 'AA0',
        'b': 'B',
        'd': 'D',
        'f': 'F',
        'h': 'HH',
        'i': 'IY0',
        'j': 'IY0',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'n': 'N',
        'o': 'AO0',
        'p': 'P',
        'r': 'R',
        't': 'T',
        'u': 'UW0',
        'v': 'V',
        'x': 'HH',  # use 'Y' ?? 'K'??
        u'\N{LATIN SMALL LETTER S WITH CARON}': 'SH',
        u'\N{LATIN SMALL LETTER Z WITH CARON}': 'ZH',
        # Capital forms cannot occur after lower(); kept for completeness.
        u'\N{LATIN CAPITAL LETTER S WITH CARON}': 'SH',
        u'\N{LATIN CAPITAL LETTER Z WITH CARON}': 'ZH',
        # Cyrillic
        u'\N{CYRILLIC SMALL LETTER A}': 'AA0',
        u'\N{CYRILLIC SMALL LETTER BE}': 'B',
        u'\N{CYRILLIC SMALL LETTER VE}': 'V',
        u'\N{CYRILLIC SMALL LETTER CHE}': 'CH',
        u'\N{CYRILLIC SMALL LETTER DE}': 'D',
        u'\N{CYRILLIC SMALL LETTER E}': 'EH0',
        u'\N{CYRILLIC SMALL LETTER EF}': 'F',
        u'\N{CYRILLIC SMALL LETTER GHE}': 'G',
        u'\N{CYRILLIC SMALL LETTER I}': 'IY0',
        u'\N{CYRILLIC SMALL LETTER HA}': 'HH',  # 'Y'?? 'K'??
        u'\N{CYRILLIC SMALL LETTER SHORT I}': 'IY0',
        u'\N{CYRILLIC SMALL LETTER KA}': 'K',
        u'\N{CYRILLIC SMALL LETTER EL}': 'L',
        u'\N{CYRILLIC SMALL LETTER EM}': 'M',
        u'\N{CYRILLIC SMALL LETTER EN}': 'N',
        u'\N{CYRILLIC SMALL LETTER O}': 'AO0',
        u'\N{CYRILLIC SMALL LETTER PE}': 'P',
        u'\N{CYRILLIC SMALL LETTER ER}': 'R',
        u'\N{CYRILLIC SMALL LETTER ES}': 'S',
        u'\N{CYRILLIC SMALL LETTER SHA}': 'SH',
        u'\N{CYRILLIC SMALL LETTER TE}': 'T',
        u'\N{CYRILLIC SMALL LETTER U}': 'UW0',
        u'\N{CYRILLIC SMALL LETTER HARD SIGN}': '',
        u'\N{CYRILLIC SMALL LETTER SOFT SIGN}': '',
        u'\N{CYRILLIC SMALL LETTER YERU}': 'IH0',
        u'\N{CYRILLIC SMALL LETTER ZHE}': 'ZH',
        u'\N{CYRILLIC SMALL LETTER ZE}': 'Z',
        u'\N{CYRILLIC CAPITAL LETTER A}': 'AA0',
        u'\N{CYRILLIC CAPITAL LETTER BE}': 'B',
        u'\N{CYRILLIC CAPITAL LETTER VE}': 'V',
        u'\N{CYRILLIC CAPITAL LETTER CHE}': 'CH',
        u'\N{CYRILLIC CAPITAL LETTER DE}': 'D',
        u'\N{CYRILLIC CAPITAL LETTER E}': 'EH0',
        u'\N{CYRILLIC CAPITAL LETTER EF}': 'F',
        u'\N{CYRILLIC CAPITAL LETTER GHE}': 'G',
        u'\N{CYRILLIC CAPITAL LETTER I}': 'IY0',
        u'\N{CYRILLIC CAPITAL LETTER HA}': 'HH',  # 'Y'?? 'K'??
        u'\N{CYRILLIC CAPITAL LETTER SHORT I}': 'IY0',
        u'\N{CYRILLIC CAPITAL LETTER KA}': 'K',
        u'\N{CYRILLIC CAPITAL LETTER EL}': 'L',
        u'\N{CYRILLIC CAPITAL LETTER EM}': 'M',
        u'\N{CYRILLIC CAPITAL LETTER EN}': 'N',
        u'\N{CYRILLIC CAPITAL LETTER O}': 'AO0',
        u'\N{CYRILLIC CAPITAL LETTER PE}': 'P',
        u'\N{CYRILLIC CAPITAL LETTER ER}': 'R',
        u'\N{CYRILLIC CAPITAL LETTER ES}': 'S',
        u'\N{CYRILLIC CAPITAL LETTER SHA}': 'SH',
        u'\N{CYRILLIC CAPITAL LETTER TE}': 'T',
        u'\N{CYRILLIC CAPITAL LETTER U}': 'UW0',
        u'\N{CYRILLIC CAPITAL LETTER HARD SIGN}': '',
        u'\N{CYRILLIC CAPITAL LETTER SOFT SIGN}': '',
        u'\N{CYRILLIC CAPITAL LETTER YERU}': 'IH0',
        u'\N{CYRILLIC CAPITAL LETTER ZHE}': 'ZH',
        u'\N{CYRILLIC CAPITAL LETTER ZE}': 'Z',
    }
    phonemes = []
    word_length = len(word)
    previous = ' '
    for pos, letter in enumerate(word):
        is_last = pos + 1 == word_length
        # None never equals a letter, so it doubles as "no next letter".
        following = None if is_last else word[pos + 1]

        if letter == 'c':
            if previous == 's' and following == 'h':
                phonemes.append('SH')  # as in
                phonemes.append('CH')  # freSH CHeese
            elif following == 'h':
                phonemes.append('CH')
            else:
                phonemes.append('T')
                phonemes.append('S')
        elif letter == 'b' and is_last:
            phonemes.append('P')  # word-final b is devoiced
        elif letter == 'd' and is_last:
            phonemes.append('T')  # word-final d is devoiced
        elif letter in ['e', u'\N{CYRILLIC SMALL LETTER IE}']:
            # Word-initial e gets a leading Y glide; the vowel itself is
            # EH0 regardless of what follows.
            if pos == 0:
                phonemes.append('Y')
            phonemes.append('EH0')
        elif letter == '^':
            pass
        elif letter == 'g':
            if is_last:
                phonemes.append('K')
            elif (previous in ['e', 'o'] and pos + 2 == word_length
                  and following == 'o'):
                phonemes.append('V')  # possessive endings -ogo and -ego
            else:
                phonemes.append('G')
        elif letter == 'h':
            # Only meaningful as the second half of ch/sh/zh, which the
            # first letter of the pair already emitted.
            pass
        elif letter == 's':
            phonemes.append('SH' if following == 'h' else 'S')
        elif letter == 'v' and is_last:
            phonemes.append('F')  # word-final v is devoiced
        elif letter == 'y':
            phonemes.append('Y' if following == 'a' else 'IH0')
        elif letter == 'z':
            if following == 'h':
                # "zh" closing the word is devoiced to SH.
                phonemes.append('SH' if pos + 2 == word_length else 'ZH')
            elif is_last:
                phonemes.append('S')
            else:
                phonemes.append('Z')
        elif letter in [
                u'\N{CYRILLIC CAPITAL LETTER SHCHA}',
                u'\N{CYRILLIC SMALL LETTER SHCHA}'
        ]:
            phonemes.append('SH')
        elif letter in [
                u'\N{CYRILLIC CAPITAL LETTER TSE}',
                u'\N{CYRILLIC SMALL LETTER TSE}'
        ]:
            phonemes.append('T')
            phonemes.append('S')
        elif letter in [
                u'\N{CYRILLIC CAPITAL LETTER YA}',
                u'\N{CYRILLIC SMALL LETTER YA}'
        ]:
            if pos == 0:
                phonemes.append('IY0')
            phonemes.append('AA1')
        elif letter in [
                u'\N{CYRILLIC CAPITAL LETTER YU}',
                u'\N{CYRILLIC SMALL LETTER YU}'
        ]:
            if pos == 0:
                phonemes.append('Y')
            phonemes.append('UW0')
        elif letter in [
                u'\N{LATIN SMALL LETTER E WITH DIAERESIS}',
                u'\N{CYRILLIC SMALL LETTER IO}'
        ]:
            if pos == 0:
                phonemes.append('Y')
            phonemes.append('AO0')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1:
            # Unknown letter: hammer() is defined elsewhere and presumably
            # returns a simplified stand-in; convert that instead.
            if not recursive:
                phon = breakdownWord(hammer(letter), True)
                if phon:
                    # phon is a list; extend keeps the result a flat list
                    # of strings (append would nest the list).
                    phonemes.extend(phon)
        previous = letter

    # Collapse runs of identical consecutive phonemes.
    temp_phonemes = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            temp_phonemes.append(phoneme)
        previous_phoneme = phoneme
    return temp_phonemes
Example #17
0
def breakdownWord(input_word, recursive=False):
    """Break a Spanish word into a list of phoneme codes.

    Letters whose sound depends on position or neighbours are handled by
    the explicit branches below; every other letter is looked up in the
    module-level ``unconditional_conversions`` mapping.  The raw phoneme
    list is passed through ``stressSpanishWord`` (defined elsewhere) and
    then runs of identical consecutive phonemes are collapsed.

    Args:
        input_word: the word to convert; it is lower-cased first.
        recursive: True when this call is itself the fallback for a single
            unknown letter, which stops a second level of fallback.

    Returns:
        A list of phoneme strings without consecutive duplicates.
    """
    word = input_word.lower()
    plain_vowels = [u'a', u'e', u'i', u'o', u'u']
    final_index = len(word) - 1
    previous = u''
    breakdown_word = []
    for word_index, letter in enumerate(word):
        is_final = word_index == final_index
        # None never equals a letter, so it doubles as "no next letter".
        following = None if is_final else word[word_index + 1]

        if letter == u'b':
            if word_index == 0 or previous in [u'm', u'n']:
                breakdown_word.append('B')
            else:
                breakdown_word.append('V')
        elif letter == u'c':
            if following == u'h':
                breakdown_word.append('CH')
            elif previous == u'c':
                breakdown_word.append('S')
            elif following == u's':
                pass
            elif following in [u'e', u'i']:
                # should this be SH before 'e', S before 'i' ??
                # South American; Castilian Spanish uses 'TH'
                breakdown_word.append('S')
            else:
                breakdown_word.append('K')
        elif letter == u'd':
            if word_index == 0 or previous in [u'l', u'n']:
                breakdown_word.append('D')
            else:
                breakdown_word.append('DH')
        elif letter == u'e':
            if is_final or following in plain_vowels:
                breakdown_word.append('EY0')
            else:
                breakdown_word.append('EH0')
        elif letter == u'\N{LATIN SMALL LETTER E WITH ACUTE}':
            if is_final or following in plain_vowels:
                breakdown_word.append('EY1')
            else:
                breakdown_word.append('EH1')
        elif letter == u'g':
            if following == u'\N{LATIN SMALL LETTER U WITH DIAERESIS}':
                breakdown_word.append('V')
            elif following in [u'e', u'i']:
                breakdown_word.append('HH')
            else:
                breakdown_word.append('G')
        elif letter == u'h':
            pass  # silent
        elif letter == u'l':
            # "ll" is emitted once, as Y, by its second letter.
            if following == u'l':
                pass
            elif previous == u'l':
                breakdown_word.append('Y')
            else:
                breakdown_word.append('L')
        elif letter == u'n':
            if following == u'v':
                breakdown_word.append('M')
            else:
                breakdown_word.append('N')
        elif letter == u'\N{LATIN SMALL LETTER N WITH TILDE}':
            breakdown_word.append('N')
            breakdown_word.append('Y')
        elif letter == u'o':
            # last bit necessary ?
            if following is not None and following not in plain_vowels:
                breakdown_word.append('AO0')
            else:
                breakdown_word.append('OW0')
        elif letter == u'\N{LATIN SMALL LETTER O WITH ACUTE}':
            # last bit necessary ?
            if following is not None and following not in plain_vowels:
                breakdown_word.append('AO1')
            else:
                breakdown_word.append('OW1')
        elif letter == u'p':
            # A word-final p produces no phoneme.
            if not is_final:
                breakdown_word.append('P')
        elif letter == u'r':
            # "rr" is emitted once, by its first letter (trilled a lot);
            # a single r is only a little trilled.  Both map to R.
            if previous != u'r':
                breakdown_word.append('R')
        elif letter == u's':
            if following in [u'd', u'g', u'l', u'm', u'n']:
                breakdown_word.append('Z')
            else:
                breakdown_word.append('S')
        elif letter == u'u':
            if previous == u'q':
                pass  # silent in "qu"
            elif previous == u'g' and following in [u'e', u'i']:
                # Silent in "gue"/"gui" (guerra, guitarra).  The check used
                # to test for a following 'u' instead of 'e', so "gue"
                # wrongly produced UW0.
                pass
            else:
                breakdown_word.append('UW0')
        elif letter == u'\N{LATIN SMALL LETTER U WITH ACUTE}':
            if previous == u'q':
                pass
            elif previous == u'g' and following in [u'u', u'i']:
                # NOTE(review): same letter list as the old 'u' rule; left
                # unchanged because a stressed u is normally pronounced.
                pass
            else:
                breakdown_word.append('UW1')
        elif letter == u'v':
            if word_index == 0 or previous in [u'm', u'n']:
                breakdown_word.append('B')
            else:
                breakdown_word.append('V')
        elif letter == u'x':
            if previous in plain_vowels and following in plain_vowels:
                breakdown_word.append('K')
                breakdown_word.append('S')
            else:
                breakdown_word.append('S')
        elif letter == u'y':
            if len(word) == 1:
                breakdown_word.append('IY1')
            elif is_final:
                breakdown_word.append('IY0')
            else:
                breakdown_word.append('Y')
        elif letter in unconditional_conversions:
            breakdown_word.append(unconditional_conversions[letter])
        elif len(hammer(letter)) == 1:
            # Unknown letter: hammer() is defined elsewhere and presumably
            # returns a simplified stand-in; use its first phoneme.
            if not recursive:
                phon = breakdownWord(hammer(letter), True)
                if phon:
                    breakdown_word.append(phon[0])
        previous = letter

    breakdown_word = stressSpanishWord(breakdown_word)

    # Collapse runs of identical consecutive phonemes.
    temp_phonemes = []
    previous_phoneme = " "
    for phoneme in breakdown_word:
        if phoneme != previous_phoneme:
            temp_phonemes.append(phoneme)
        previous_phoneme = phoneme
    return temp_phonemes
Example #18
0
def breakdownWord(word, recursive=False):
    """Break a word into a list of phoneme codes.

    The spelling rules (sc/ci/gi/gl/gn, s voicing, zz) look like Italian
    orthography.  Letters with a fixed sound are looked up in
    ``simple_convert``; the rest are handled by the explicit branches.
    Vowel tests use the module-level ``vowels`` collection.

    Args:
        word: the word to convert; it is lower-cased first.
        recursive: True when this call is itself the fallback for a single
            unknown letter, which stops a second level of fallback.

    Returns:
        A list of phoneme strings in which runs of identical consecutive
        phonemes are collapsed to one.
    """
    word = word.lower()
    vowel_lookup = dict.fromkeys(vowels)

    def isvowel(char):
        # dict.has_key() was removed in Python 3; "in" works on 2 and 3.
        return char in vowel_lookup

    simple_convert = {
        'b': 'B',
        'd': 'D',
        'f': 'F',
        'j': 'IY0',  # actual pronunciation varies with word origin
        'k': 'K',  # actual pronunciation varies with word origin
        'l': 'L',
        'm': 'M',
        'p': 'P',
        'q': 'K',
        'r': 'R',
        't': 'T',
        'v': 'V',
        'w': 'W',  # actual pronunciation varies with word origin
        'x': 'K S',  # actual pronunciation varies with word origin
        'y': 'IY0',  # actual pronunciation varies with word origin
    }
    # Each vowel together with its acute, grave and circumflex forms.
    a_forms = [u'a', u'\N{LATIN SMALL LETTER A WITH ACUTE}',
               u'\N{LATIN SMALL LETTER A WITH GRAVE}',
               u'\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}']
    e_forms = ['e', u'\N{LATIN SMALL LETTER E WITH ACUTE}',
               u'\N{LATIN SMALL LETTER E WITH GRAVE}',
               u'\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}']
    i_forms = ['i', u'\N{LATIN SMALL LETTER I WITH ACUTE}',
               u'\N{LATIN SMALL LETTER I WITH GRAVE}',
               u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}']
    o_forms = ['o', u'\N{LATIN SMALL LETTER O WITH ACUTE}',
               u'\N{LATIN SMALL LETTER O WITH GRAVE}',
               u'\N{LATIN SMALL LETTER O WITH CIRCUMFLEX}']
    u_forms = ['u', u'\N{LATIN SMALL LETTER U WITH ACUTE}',
               u'\N{LATIN SMALL LETTER U WITH GRAVE}',
               u'\N{LATIN SMALL LETTER U WITH CIRCUMFLEX}']
    e_or_i = e_forms + i_forms
    a_o_or_u = a_forms + o_forms + u_forms

    phonemes = []
    word_length = len(word)
    previous = ' '
    for pos, letter in enumerate(word):
        # None never equals a letter, so it doubles as "no next letter".
        following = word[pos + 1] if word_length > pos + 1 else None
        vowel_follows = following is not None and isvowel(following)

        if letter in a_forms:
            phonemes.append('AA0')
        elif letter == 'c':
            if previous == 's':
                # The preceding 's' emitted nothing; "sc" is resolved here.
                if following in e_or_i:
                    phonemes.append('SH')
                else:
                    phonemes.append('S')
                    phonemes.append('K')
            elif following in e_or_i:
                phonemes.append('CH')
            else:
                phonemes.append('K')
        elif letter in ['e', u'\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}']:
            phonemes.append('EH0')  # long is "EY0"
        elif letter == u'\N{LATIN SMALL LETTER E WITH ACUTE}':
            phonemes.append('EY0')
        elif letter == u'\N{LATIN SMALL LETTER E WITH GRAVE}':
            phonemes.append('EH0')
        elif letter == 'g':
            if following in e_forms:
                phonemes.append('JH')
            elif following in i_forms:
                pass  # handled under 'i'
            elif following == 'h':
                phonemes.append('G')
            elif following == 'l':
                pass  # handled under 'l'
            elif following == 'n':
                pass  # handled under 'n'
            elif following == 'u':
                phonemes.append('G')
                phonemes.append('W')
            else:
                phonemes.append('G')
        elif letter == 'h':
            pass
        elif letter in i_forms:
            if previous == 'c' and vowel_follows:
                pass  # "ci" + vowel: the 'c' already emitted CH
            elif previous == 'g':
                if following in a_o_or_u:  # or isvowel(following) ??
                    phonemes.append('JH')
                else:
                    phonemes.append('G')
                    phonemes.append('IY0')
            else:
                phonemes.append('IY0')
        elif letter == 'l':
            if previous == 'g':
                if following in e_or_i:
                    phonemes.append('L')
                    phonemes.append('IY0')
                else:
                    # Hard "gl": G comes first, as in the "gn" branch
                    # (this used to emit L before G).
                    phonemes.append('G')
                    phonemes.append('L')
            else:
                phonemes.append('L')
        elif letter == 'n':
            if previous == 'g':
                if vowel_follows:
                    phonemes.append('N')
                    phonemes.append('Y')
                else:
                    phonemes.append('G')
                    phonemes.append('N')
            else:
                phonemes.append('N')
        elif letter in o_forms:
            phonemes.append('OW0')  # when closed, when open as 'AO0' ?
        elif letter == 's':
            if following == 'c':
                pass  # handled under c
            elif isvowel(previous) and vowel_follows:
                phonemes.append('Z')
            elif following in ['b', 'd', 'g', 'l', 'm', 'n', 'r', 'v']:
                # Voiced before a voiced consonant.  A word-initial 's'
                # used to skip this rule and the default below, so it was
                # dropped unless a vowel or c/f/p/q/s/t followed.
                phonemes.append('Z')
            else:
                phonemes.append('S')
        elif letter in u_forms:
            if previous == 'q':
                phonemes.append('W')
            elif previous == 'g':
                pass  # handled under 'g'
            else:
                phonemes.append('UW0')
        elif letter == 'z':
            if pos == 0:
                phonemes.append('Z')
            elif previous == 'z':
                phonemes.append('T')
                phonemes.append('S')
            elif following == 'z':
                pass  # "zz" is emitted by its second letter
            else:
                phonemes.append('Z')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1:
            # Unknown letter: hammer() is defined elsewhere and presumably
            # returns a simplified stand-in; use its first phoneme.
            if not recursive:
                phon = breakdownWord(hammer(letter), True)
                if phon:
                    phonemes.append(phon[0])
        previous = letter

    # Collapse runs of identical consecutive phonemes.
    temp_phonemes = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            temp_phonemes.append(phoneme)
        previous_phoneme = phoneme
    return temp_phonemes
Example #19
0
def breakdownWord(word, recursive=False):
    """Break a word into a list of phoneme codes.

    The spelling rules (sc/ci/gi/gl/gn, s voicing, zz) look like Italian
    orthography.  Letters with a fixed sound are looked up in
    ``simple_convert``; the rest are handled by the explicit branches.
    Vowel tests use the module-level ``vowels`` collection.

    Args:
        word: the word to convert; it is lower-cased first.
        recursive: True when this call is itself the fallback for a single
            unknown letter, which stops a second level of fallback.

    Returns:
        A list of phoneme strings in which runs of identical consecutive
        phonemes are collapsed to one.
    """
    word = word.lower()
    vowel_lookup = dict.fromkeys(vowels)

    def isvowel(char):
        # dict.has_key() was removed in Python 3; "in" works on 2 and 3.
        return char in vowel_lookup

    simple_convert = {
        'b': 'B',
        'd': 'D',
        'f': 'F',
        'j': 'IY0',  # actual pronunciation varies with word origin
        'k': 'K',  # actual pronunciation varies with word origin
        'l': 'L',
        'm': 'M',
        'p': 'P',
        'q': 'K',
        'r': 'R',
        't': 'T',
        'v': 'V',
        'w': 'W',  # actual pronunciation varies with word origin
        'x': 'K S',  # actual pronunciation varies with word origin
        'y': 'IY0',  # actual pronunciation varies with word origin
    }
    # Each vowel together with its acute, grave and circumflex forms.
    a_forms = [
        u'a', u'\N{LATIN SMALL LETTER A WITH ACUTE}',
        u'\N{LATIN SMALL LETTER A WITH GRAVE}',
        u'\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}'
    ]
    e_forms = [
        'e', u'\N{LATIN SMALL LETTER E WITH ACUTE}',
        u'\N{LATIN SMALL LETTER E WITH GRAVE}',
        u'\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}'
    ]
    i_forms = [
        'i', u'\N{LATIN SMALL LETTER I WITH ACUTE}',
        u'\N{LATIN SMALL LETTER I WITH GRAVE}',
        u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}'
    ]
    o_forms = [
        'o', u'\N{LATIN SMALL LETTER O WITH ACUTE}',
        u'\N{LATIN SMALL LETTER O WITH GRAVE}',
        u'\N{LATIN SMALL LETTER O WITH CIRCUMFLEX}'
    ]
    u_forms = [
        'u', u'\N{LATIN SMALL LETTER U WITH ACUTE}',
        u'\N{LATIN SMALL LETTER U WITH GRAVE}',
        u'\N{LATIN SMALL LETTER U WITH CIRCUMFLEX}'
    ]
    e_or_i = e_forms + i_forms
    a_o_or_u = a_forms + o_forms + u_forms
    voiced_consonants = ['b', 'd', 'g', 'l', 'm', 'n', 'r', 'v']

    phonemes = []
    word_length = len(word)
    previous = ' '
    for pos, letter in enumerate(word):
        # None never equals a letter, so it doubles as "no next letter".
        following = word[pos + 1] if word_length > pos + 1 else None
        vowel_follows = following is not None and isvowel(following)

        if letter in a_forms:
            phonemes.append('AA0')
        elif letter == 'c':
            if previous == 's':
                # The preceding 's' emitted nothing; "sc" is resolved here.
                if following in e_or_i:
                    phonemes.append('SH')
                else:
                    phonemes.append('S')
                    phonemes.append('K')
            elif following in e_or_i:
                phonemes.append('CH')
            else:
                phonemes.append('K')
        elif letter in ['e', u'\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}']:
            phonemes.append('EH0')  # long is "EY0"
        elif letter == u'\N{LATIN SMALL LETTER E WITH ACUTE}':
            phonemes.append('EY0')
        elif letter == u'\N{LATIN SMALL LETTER E WITH GRAVE}':
            phonemes.append('EH0')
        elif letter == 'g':
            if following in e_forms:
                phonemes.append('JH')
            elif following in i_forms:
                pass  # handled under 'i'
            elif following == 'h':
                phonemes.append('G')
            elif following == 'l':
                pass  # handled under 'l'
            elif following == 'n':
                pass  # handled under 'n'
            elif following == 'u':
                phonemes.append('G')
                phonemes.append('W')
            else:
                phonemes.append('G')
        elif letter == 'h':
            pass
        elif letter in i_forms:
            if previous == 'c' and vowel_follows:
                pass  # "ci" + vowel: the 'c' already emitted CH
            elif previous == 'g':
                if following in a_o_or_u:  # or isvowel(following) ??
                    phonemes.append('JH')
                else:
                    phonemes.append('G')
                    phonemes.append('IY0')
            else:
                phonemes.append('IY0')
        elif letter == 'l':
            if previous == 'g':
                if following in e_or_i:
                    phonemes.append('L')
                    phonemes.append('IY0')
                else:
                    # Hard "gl": G comes first, as in the "gn" branch
                    # (this used to emit L before G).
                    phonemes.append('G')
                    phonemes.append('L')
            else:
                phonemes.append('L')
        elif letter == 'n':
            if previous == 'g':
                if vowel_follows:
                    phonemes.append('N')
                    phonemes.append('Y')
                else:
                    phonemes.append('G')
                    phonemes.append('N')
            else:
                phonemes.append('N')
        elif letter in o_forms:
            phonemes.append('OW0')  # when closed, when open as 'AO0' ?
        elif letter == 's':
            if following == 'c':
                pass  # handled under c
            elif isvowel(previous) and vowel_follows:
                phonemes.append('Z')
            elif following in voiced_consonants:
                # Voiced before a voiced consonant.  A word-initial 's'
                # used to skip this rule and the default below, so it was
                # dropped unless a vowel or c/f/p/q/s/t followed.
                phonemes.append('Z')
            else:
                phonemes.append('S')
        elif letter in u_forms:
            if previous == 'q':
                phonemes.append('W')
            elif previous == 'g':
                pass  # handled under 'g'
            else:
                phonemes.append('UW0')
        elif letter == 'z':
            if pos == 0:
                phonemes.append('Z')
            elif previous == 'z':
                phonemes.append('T')
                phonemes.append('S')
            elif following == 'z':
                pass  # "zz" is emitted by its second letter
            else:
                phonemes.append('Z')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1:
            # Unknown letter: hammer() is defined elsewhere and presumably
            # returns a simplified stand-in; use its first phoneme.
            if not recursive:
                phon = breakdownWord(hammer(letter), True)
                if phon:
                    phonemes.append(phon[0])
        previous = letter

    # Collapse runs of identical consecutive phonemes.
    temp_phonemes = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            temp_phonemes.append(phoneme)
        previous_phoneme = phoneme
    return temp_phonemes
Example #20
0
def breakdownWord(word, recursive=False):
    """Break a word into a list of CMU-style phoneme codes.

    The rules look like German orthography (umlauts, sharp s, the
    sch/ei/eu/ie digraphs) -- NOTE(review): confirm the target language.

    word      -- the word to convert; it is lower-cased before processing.
    recursive -- True when called for a letter that was already reduced
                 with hammer(); prevents a second level of recursion.

    Returns a list of phoneme strings in which directly repeated phonemes
    have been collapsed into one.
    """
    word = word.lower()
    a_umlaut = u'\N{LATIN SMALL LETTER A WITH DIAERESIS}'
    o_umlaut = u'\N{LATIN SMALL LETTER O WITH DIAERESIS}'
    u_umlaut = u'\N{LATIN SMALL LETTER U WITH DIAERESIS}'
    sharp_s = u'\N{LATIN SMALL LETTER SHARP S}'
    # dict.has_key() is gone in Python 3; a set of escaped unicode letters
    # gives the same membership test on both Python 2 and Python 3 and
    # makes sure the umlauts are compared as unicode characters.
    isvowel = frozenset(u'aeiou' + a_umlaut + o_umlaut + u_umlaut).__contains__
    phonemes = []
    # letters whose phoneme does not depend on the surrounding letters
    simple_convert = {
        'f': 'F',
        'j': 'Y',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'p': 'P',
        'q': 'K',
        'r': 'R',  # use AH0 or ER0 for final letter in word ??
        sharp_s: 'S',
        't': 'T',
        'v': 'F',  # non-native loan-words, 'V'
        'w': 'V',
        'y': 'IH0',  # actual pronunciation varies with word origin
    }
    word_len = len(word)
    previous = ' '
    for pos, letter in enumerate(word):
        following = word[pos + 1] if pos + 1 < word_len else None
        is_last = following is None
        # a vowel is treated as short when a doubled consonant or a sharp s
        # comes directly after it
        before_short_marker = following == sharp_s or (
            pos + 2 < word_len and word[pos + 1] == word[pos + 2]
            and not isvowel(word[pos + 1]))
        if letter == previous and not isvowel(letter):
            pass  # doubled consonant: already emitted once
        elif letter == 'a':
            if following == 'i':  # ai
                phonemes.append('AY0')
            elif following == 'u':  # au
                phonemes.append('AW0')
            elif previous == 'a':
                pass
            elif before_short_marker:
                phonemes.append('AH0')
            else:
                phonemes.append('AA0')
        elif letter == a_umlaut:
            if following == 'u':  # äu
                phonemes.append('OY0')
            elif before_short_marker:
                phonemes.append('EH0')
            else:
                phonemes.append('EY0')
        elif letter == 'b':
            # devoiced at the end of the word and before s / t
            if is_last or following in ('s', 't'):
                phonemes.append('P')
            else:
                phonemes.append('B')
        elif letter == 'c':
            if previous == 's' and following == 'h':  # sch
                phonemes.append('SH')
            elif following == 'h':  # ch
                phonemes.append('HH')  # use 'K'??
            else:
                phonemes.append('K')
        elif letter == 'd':
            # devoiced at the end of the word and before s / t
            if is_last or following in ('s', 't'):
                phonemes.append('T')
            else:
                phonemes.append('D')
        elif letter == 'e':
            if previous == 'i':
                pass  # covered under 'i'
            elif pos + 2 == word_len and following in ('l', 'n', 'r'):  # -en, -er, -el
                phonemes.append('EH0')
            elif following == 'i':  # ei
                phonemes.append('AY0')
            elif following == 'u':  # eu
                phonemes.append('OY0')
            elif following == 'e':  # ee
                phonemes.append('EY0')
            elif previous == 'e':
                pass
            elif before_short_marker:
                phonemes.append('EH0')
            elif is_last and not isvowel(previous):
                phonemes.append('EH0')
            else:
                phonemes.append('EY0')
        elif letter == 'g':
            if previous == 'n':
                phonemes.append('NG')
            elif is_last and previous == 'i':  # final -ig
                phonemes.append('HH')
            elif is_last or following in ('s', 't'):
                phonemes.append('K')
            else:
                phonemes.append('G')
        elif letter == 'h':
            if isvowel(previous):
                pass  # silent
            elif previous == 'c':
                pass  # covered under 'c'
            else:
                phonemes.append('HH')
        elif letter == 'i':
            if previous in ('a', 'e'):
                pass  # covered under other vowel
            elif following == 'e':  # ie
                phonemes.append('IY0')
            elif before_short_marker:
                phonemes.append('IH0')
            elif is_last and not isvowel(previous):
                phonemes.append('IY0')  # also use IH0 here instead?
            else:
                phonemes.append('IH0')
        elif letter == 'n':
            if following == 'g':
                pass  # covered under 'g'
            else:
                phonemes.append('N')
        elif letter == 'o':
            if previous == 'o':
                pass
            else:
                phonemes.append('AO0')  # sometimes o in on, not covered in CMU/USA
        elif letter == o_umlaut:
            phonemes.append('ER0')
        elif letter == 's':
            if pos == 0 and following in ('p', 't'):  # sp-, st-
                phonemes.append('SH')
            elif pos + 2 < word_len and following == 'c' and word[pos + 2] == 'h':
                pass  # covered under 'c'
            elif pos == 0:
                phonemes.append('Z')  # at beginning of word
            else:
                phonemes.append('S')  # default sound - or 'Z' ??
        elif letter == 'u':
            if previous in ('a', a_umlaut, 'e'):
                pass  # au, äu, eu are emitted at the first vowel
            elif previous == 'q':
                phonemes.append('V')
            elif before_short_marker:
                phonemes.append('UH0')
            else:
                phonemes.append('UW0')
        elif letter == u_umlaut:
            phonemes.append('UW0')
        elif letter == 'x':
            phonemes.append('K')
            phonemes.append('S')
        elif letter == 'z':
            phonemes.append('T')
            phonemes.append('S')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1:
            # unknown letter: retry once with whatever hammer() reduces it to
            if not recursive:
                phon = breakdownWord(hammer(letter), True)
                if phon:
                    phonemes.append(phon[0])
        previous = letter
    # collapse directly repeated phonemes
    temp_phonemes = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            temp_phonemes.append(phoneme)
        previous_phoneme = phoneme
    return temp_phonemes
def breakdownWord(word, recursive=False):
    """Break a word into a list of coarse phoneme / mouth-shape codes.

    Vowels and several consonants are grouped into shared codes
    ('AI', 'E', 'O', 'U', 'MBP', 'FV', 'WQ', 'L'); the remaining letters
    use the simple_convert table or the digraph rules below.

    word      -- the word to convert; it is lower-cased before processing.
    recursive -- True when called for a letter that was already reduced
                 with hammer(); prevents a second level of recursion.

    Returns the list of codes, one entry per emitted sound (repeated codes
    are NOT collapsed).

    Relies on accented_a, accented_e, accented_i, accented_o, accented_u
    and hammer() being defined elsewhere in the file.
    """
    word = word.lower()
    phonemes = []
    # consonants without a context rule; several entries (m, b, p, f, v, w,
    # q, l, s) are shadowed by the grouped branches below
    simple_convert = {
        'b': 'B',
        'd': 'D',
        'f': 'F',
        'g': 'G',
        'j': 'JH',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'n': 'N',
        'p': 'P',
        'q': 'K',
        'r': 'R',
        's': 'S',
        't': 'T',
        'v': 'V',
        'w': 'W',
        'y': 'Y',
        'z': 'Z',
        u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'S'  # ç
    }
    word_len = len(word)
    previous = ' '
    for pos, letter in enumerate(word):
        following = word[pos + 1] if pos + 1 < word_len else None
        if letter in ['a', accented_a]:            # a
            phonemes.append('AI')
        elif letter in ['e', accented_e]:          # e
            phonemes.append('E')
        elif letter in ['i', accented_i]:          # i
            phonemes.append('AI')
        elif letter in ['o', accented_o]:          # o
            phonemes.append('O')
        elif letter in ['u', accented_u]:          # u
            phonemes.append('U')
        # accented_u can no longer match past this point, so the consonant
        # groups only need to test the plain letters
        elif letter in ('m', 'b', 'p'):            # closed lips
            phonemes.append('MBP')
        elif letter in ('f', 'v'):
            phonemes.append('FV')
        elif letter in ('w', 'q'):
            phonemes.append('WQ')
        elif letter == 'l':
            phonemes.append('L')
        elif letter == 'c':
            if following == 'h':  # ch
                phonemes.append('CH')
            elif following is not None and following in ['e', 'i', 'y', accented_e, accented_i]:  # ce, ci
                phonemes.append('S')
            else:  # ca, co, cu, cr and everything else
                phonemes.append('K')
        elif letter == 'h':
            if previous in ('c', 's'):
                pass  # ch / sh already emitted at the first letter
            else:
                phonemes.append('HH')    # h
        elif letter == 's':
            if following == 'h':
                phonemes.append('SH')   # sh
            else:
                phonemes.append('S')    # s
        elif letter == 'x':             # x
            if following is None:  # word-final x
                phonemes.append('Z')
            else:
                phonemes.append('K')
                phonemes.append('S')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1:
            # unknown letter: retry once with whatever hammer() reduces it to
            if not recursive:
                phon = breakdownWord(hammer(letter[0]), True)
                if phon:
                    phonemes.append(phon[0])
        previous = letter
    return phonemes
Example #22
0
def breakdownWord(word, recursive=False):
    """Turn a word into a list of CMU-style phoneme codes, letter by letter.

    Diphthongs are resolved at their second vowel: the first vowel is
    skipped when the following letter completes a known pair, and the pair
    is emitted when that following letter is processed.
    NOTE(review): the rules (ä/ö, yi, uo, ie) look Finnish -- confirm.

    word      -- the word to convert; it is lower-cased before processing.
    recursive -- True when called for a letter that was already reduced
                 with hammer(); prevents a second level of recursion.

    Returns the phoneme list with directly repeated phonemes collapsed.
    """
    word = word.lower()
    a_umlaut = u'\N{LATIN SMALL LETTER A WITH DIAERESIS}'
    o_umlaut = u'\N{LATIN SMALL LETTER O WITH DIAERESIS}'
    # letters that always map to the same phoneme
    simple_convert = {
        'd': 'D',
        'h': 'HH',
        'j': 'Y',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'p': 'P',
        'r': 'R',
        's': 'S',
        't': 'T',
        'v': 'V',
        # in foreign and borrowed words and names
        u'\N{LATIN SMALL LETTER S WITH CARON}': 'SH',  # š
        u'\N{LATIN SMALL LETTER Z WITH CARON}': 'ZH',  # ž
        u'\N{LATIN SMALL LETTER A WITH ACUTE}': 'AA0',  # ??? # á
        u'\N{LATIN SMALL LETTER A WITH GRAVE}': 'AA0',  # à
        u'\N{LATIN SMALL LETTER AE}': 'AE0',  # æ - Norwegian / Danish
        'b': 'B',
        'c': 'K',  # S ???
        u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'SH',  # ç - French, etc
        u'\N{LATIN SMALL LETTER C WITH CARON}': 'S',  # ??? - Northern Sámi
        u'\N{LATIN SMALL LETTER D WITH STROKE}': 'D',  # ??? - Northern Sámi
        u'\N{LATIN SMALL LETTER ETH}': 'DH',  # ð - Icelandic
        u'\N{LATIN SMALL LETTER E WITH ACUTE}': 'EY0',  # é
        u'\N{LATIN SMALL LETTER E WITH DIAERESIS}': 'EH0',  # ??? # ë - scientific names
        'f': 'F',
        u'\N{LATIN SMALL LETTER G WITH STROKE}': 'G',  # ??? - other Sámi
        u'\N{LATIN SMALL LETTER G WITH BREVE}': 'G',  # ??? - other Sámi
        u'\N{LATIN SMALL LETTER N WITH TILDE}': 'N Y',  # ñ - Spanish
        u'\N{LATIN SMALL LETTER ENG}': 'N',  # - Northern Sámi
        u'\N{LATIN SMALL LETTER O WITH STROKE}': 'ER0',  # ??? # ø - Norwegian / Danish
        u'\N{LATIN SMALL LETTER O WITH TILDE}': 'ER0',  # ??? # õ - Estonian
        'q': 'K',
        u'\N{LATIN SMALL LETTER SHARP S}': 'S',  # ß - German
        u'\N{LATIN SMALL LETTER T WITH STROKE}': 'T',  # - Northern Sámi
        u'\N{LATIN SMALL LETTER THORN}': 'TH',  # Þ - Icelandic
        # NOTE(review): this key is o-diaeresis although it was annotated as
        # "ü - German / Estonian"; the dedicated o-diaeresis branch below
        # runs first, so this entry is never reached.
        o_umlaut: 'ER0',
        'w': 'V',
        'z': 'Z'
    }
    # diphthongs ending in i, keyed by the vowel in front of the i
    pairs_ending_in_i = {
        'a': 'AY0',  # ai
        'e': 'EY0',  # ei
        'o': 'OY0',  # oi
        'u': 'UW0',  # ui
        'y': 'IY0'  # yi
    }
    # diphthongs ending in u, keyed by the vowel in front of the u
    pairs_ending_in_u = {
        'a': 'AW0',  # au
        'o': 'OW0'  # AO??? # ou
    }
    raw = []
    before = ' '
    for index, letter in enumerate(word):
        upcoming = word[index + 1:index + 2]  # '' for the last letter
        if letter == 'a':
            if upcoming not in ('i', 'u'):  # ai / au handled under following letter
                raw.append('AA0')
        elif letter == 'e':
            if upcoming == 'i':
                pass  # handled under following letter
            elif before == 'i':  # ie
                raw.append('IY0')  # ???
            else:
                raw.append('EH0')
        elif letter == 'i':
            raw.append(pairs_ending_in_i.get(before, 'IH0'))
        elif letter == 'o':
            if upcoming in ('i', 'u'):
                pass  # handled under following letter
            elif before == 'u':  # uo
                raw.append('OW0')  # ???
            else:
                raw.append('OY0')
        elif letter == 'u':
            if upcoming != 'i':  # ui handled under following letter
                raw.append(pairs_ending_in_u.get(before, 'UH0'))
        elif letter == 'y':
            if upcoming != 'i':  # yi handled under following letter
                raw.append('UW0')  # ???
        elif letter == a_umlaut:  # ä
            raw.append('AE0')
        elif letter == o_umlaut:  # ö
            raw.append('ER0')  # ???
        elif letter == 'g':
            raw.append('NG' if before == 'n' else 'G')
        elif letter == 'n':
            if upcoming != 'g':  # ng handled under g
                raw.append('N')
        elif letter in simple_convert:
            raw.append(simple_convert[letter])
        elif len(hammer(letter)) == 1:
            # unknown letter: retry once with whatever hammer() reduces it to
            if not recursive:
                reduced = breakdownWord(hammer(letter), True)
                if reduced:
                    raw.append(reduced[0])
        before = letter
    # collapse directly repeated phonemes
    collapsed = []
    for phoneme in raw:
        if not collapsed or collapsed[-1] != phoneme:
            collapsed.append(phoneme)
    return collapsed
Example #23
0
def syllablesToPhonemes(syllables,  recursive=False):
    """Convert a word, given as a sequence of syllables, into phoneme codes.

    Vowel length is decided per syllable: a vowel that is the last letter
    of its syllable, or that is doubled, gets the long phoneme.
    NOTE(review): the rules (ij, oe, ui, trema) look Dutch -- confirm.

    syllables -- sequence of syllable strings making up one word.
    recursive -- True when called for a letter that was already reduced
                 with hammer(); prevents a second level of recursion.

    Returns the phoneme list with directly repeated phonemes collapsed.
    """
    # dict.has_key() is gone in Python 3; set membership works everywhere
    isvowel = frozenset(u'aeiou').__contains__
    phonemes = []
    simple_convert = {
        'b': 'B',
        'd': 'D',
        'f': 'F',
        'h': 'HH',
        'j': 'Y',  # SH in some words borrowed from French
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'n': 'N',
        'p': 'P',
        'r': 'R',
        's': 'S',
        't': 'T',
        'v': 'F',  # English F mixed with English V
        'w': 'V',  # closer to soft English V than the English W - pronounced back in mouth, not with pursed lips
        'z': 'Z'
    }
    for syllable_no, syllable in enumerate(syllables, 1):
        # look-behind never crosses a syllable boundary
        previous_letter = ' '
        for letter_no, letter in enumerate(syllable, 1):
            # letter_no is 1-based, so it is also the index of the next letter
            next_letter = syllable[letter_no] if letter_no < len(syllable) else None
            last_in_syllable = next_letter is None
            last_in_word = (syllable_no == len(syllables)
                            and letter_no == len(syllables[-1]))
            if letter == previous_letter and not isvowel(letter):  # double consonants
                pass
            # ===================== consonants ==========================
            elif letter == 'b' and last_in_word:  # devoiced at end of word
                phonemes.append("P")
            elif letter == 'd' and last_in_word:  # devoiced at end of word
                phonemes.append("T")
            elif letter == 'n' and next_letter == 'g':  # ng
                # The look-ahead must use the current position; the old code
                # always inspected syllable[1], so 'ng' produced N + NG.
                pass  # handled in next case
            elif letter == 'g' and previous_letter == 'n':  # ng
                phonemes.append("NG")
            elif letter == 'g':
                phonemes.append("HH")  # not accurate, but the nearest phoneme in CMU? (use K instead? put in a G anyway?)
            elif letter == 'c' and next_letter == 'h':  # ch
                # Same bound as the 'th' rule below, so a syllable-final
                # 'ch' no longer emits an extra K in front of the HH.
                pass
            elif letter == 'h' and previous_letter == 'c':  # ch
                phonemes.append("HH")  # not accurate, but the nearest phoneme in CMU? (use K instead? put in a G anyway?)
            elif letter == 't' and next_letter == 'h':  # th
                pass  # handled in next case
            elif letter == 'h' and previous_letter == 't':  # th
                phonemes.append("TH")
            elif letter == 'j' and previous_letter == 'i':
                pass  # handled in vowels
            elif letter == 'w' and previous_letter == 'u':
                pass  # handled in vowels
            elif letter == 'x':  # rare, mostly borrowed words
                phonemes.append("K")
                phonemes.append("S")
            elif letter == 'q':  # rare, mostly borrowed words
                phonemes.append("K")
                phonemes.append("W")
            elif letter == 'c':
                if next_letter in ('e', 'i'):  # c before e and i pronounce as s
                    phonemes.append("S")
                else:
                    phonemes.append("K")  # before a consonant, at the end of a syllable and before a, o, u
            elif letter in simple_convert:
                phonemes.append(simple_convert[letter])
            # =============== vowels ================
            # ------------ A -------------------------------
            elif letter == 'a':  # short AH, long AA
                if next_letter == 'a':  # double a
                    phonemes.append("AA0")
                elif previous_letter == 'a':  # double a handled by case above
                    pass
                elif last_in_syllable:  # long a reduced to single letter
                    phonemes.append("AA0")
                elif next_letter == 'u':  # au
                    phonemes.append("AW0")  # occasionally as UW0 in some words borrowed from French
                else:
                    phonemes.append('AH0')  # like English short u (cut, hut)
            # ------------ E -------------------------------
            elif letter == 'e':  # e short EH long EY
                if next_letter == 'e':  # double e
                    phonemes.append("EY0")
                elif previous_letter in ('e', 'i', 'o'):  # ee, ie, oe handled at the first vowel
                    pass
                elif last_in_syllable:  # long e reduced to single letter
                    phonemes.append("EY0")
                elif next_letter == 'u':  # eu
                    phonemes.append("ER0")  # less R than English equivalent, closer to French eu
                elif next_letter == 'i':  # ei
                    phonemes.append("AY0")
                else:
                    phonemes.append('EH0')  # closer to a (bad B AH D) than English short EH0 (bed = B EH D)
            # ------------ I -------------------------------
            elif letter == 'i':  # i short IH long IY
                if next_letter == 'i':  # double i
                    phonemes.append("IY0")
                elif previous_letter in ('u', 'i', 'e'):  # ui, ii, ei handled at the first vowel
                    pass
                # !!!FIXME!!! aai, aaij are not handled
                elif last_in_syllable:  # long i reduced to single letter
                    phonemes.append("IY0")
                elif next_letter == 'j':  # ij
                    phonemes.append("AY0")
                elif next_letter == 'u':  # iu
                    phonemes.append("IY0")
                elif next_letter == 'e':  # ie
                    phonemes.append("IY0")
                # !!!FIXME!!! ieuw (IY UW ???) is not handled
                else:
                    phonemes.append('IH0')
            # ------------ O -------------------------------
            elif letter == 'o':  # o short AA long OW
                if next_letter == 'o':  # double o
                    phonemes.append("OW0")
                elif previous_letter == 'o':  # double o handled by case above
                    pass
                elif last_in_syllable:  # long o reduced to single letter
                    phonemes.append("OW0")
                elif next_letter == 'e':  # oe
                    phonemes.append("UW0")
                elif next_letter == 'u':  # ou
                    phonemes.append("AW0")
                else:
                    phonemes.append('AO0')
            # ------------ U -------------------------------
            elif letter == 'u':
                if next_letter == 'u':  # double u
                    phonemes.append("UW0")
                elif previous_letter in ('u', 'a', 'e'):  # uu, au, eu handled at the first vowel
                    pass
                elif previous_letter == 'i':  # iu
                    phonemes.append("UW0")
                elif previous_letter == 'o':  # ou handled at o stage
                    pass
                elif last_in_syllable:  # long u reduced to single letter
                    phonemes.append("UW0")
                elif next_letter == 'w':  # uw
                    phonemes.append("UW0")  # uw = EW in English DEW IY UW ???
                elif next_letter == 'i':  # ui
                    phonemes.append("UH0")  # - not accurate but the nearest phoneme in CMU? (use UW instead?)
                else:
                    phonemes.append('ER0')  # - not accurate but the nearest phoneme in CMU? (use AH instead?)
            # ------------ TREMA (looks like German Umlaut, but different meaning) -------------------------------
            elif letter == u'\N{LATIN SMALL LETTER A WITH DIAERESIS}':  # ä
                phonemes.append('AH0')  # like English short u (cut, hut)
            elif letter == u'\N{LATIN SMALL LETTER E WITH DIAERESIS}':  # ë
                phonemes.append('EH0')  # closer to a (bad B AH D) than English short EH0 (bed = B EH D)
            elif letter == u'\N{LATIN SMALL LETTER I WITH DIAERESIS}':  # ï
                phonemes.append('IH0')
            elif letter == u'\N{LATIN SMALL LETTER O WITH DIAERESIS}':  # ö
                phonemes.append('AO0')
            elif letter == u'\N{LATIN SMALL LETTER U WITH DIAERESIS}':  # ü
                phonemes.append('ER0')  # - not accurate but the nearest phoneme in CMU? (use AH instead?)
            elif letter in (u'\N{LATIN SMALL LETTER Y WITH DIAERESIS}',
                            u'\N{LATIN SMALL LETTER Y WITH ACUTE}'):  # ÿ or ý
                phonemes.append("AY0")
            elif len(hammer(letter)) == 1:
                # unknown letter: retry once with whatever hammer() reduces it to
                if not recursive:
                    phon = syllablesToPhonemes(hammer(letter), True)
                    if phon:
                        phonemes.append(phon[0])
            previous_letter = letter
    # collapse directly repeated phonemes
    temp_phonemes = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            temp_phonemes.append(phoneme)
        previous_phoneme = phoneme
    return temp_phonemes
Example #24
0
def breakdownWord(word, recursive=False):
    """Map each letter of a word to zero or more CMU-style phoneme codes.

    The first vowel of a recognised diphthong emits nothing; the pair is
    emitted when its second vowel is reached, based on the letter before.
    NOTE(review): the rules (ä/ö, yi, uo, ie) look Finnish -- confirm.

    word      -- the word to convert; it is lower-cased before processing.
    recursive -- True when called for a letter that was already reduced
                 with hammer(); prevents a second level of recursion.

    Returns the phoneme list with directly repeated phonemes collapsed.
    """
    word = word.lower()
    phonemes = []
    simple_convert = {
        'd': 'D',
        'h': 'HH',
        'j': 'Y',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'p': 'P',
        'r': 'R',
        's': 'S',
        't': 'T',
        'v': 'V',
        # in foreign and borrowed words and names
        u'\N{LATIN SMALL LETTER S WITH CARON}': 'SH',  # š
        u'\N{LATIN SMALL LETTER Z WITH CARON}': 'ZH',  # ž
        u'\N{LATIN SMALL LETTER A WITH ACUTE}': 'AA0',  # ??? # á
        u'\N{LATIN SMALL LETTER A WITH GRAVE}': 'AA0',  # à
        u'\N{LATIN SMALL LETTER AE}': 'AE0',  # æ - Norwegian / Danish
        'b': 'B',
        'c': 'K',  # S ???
        u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'SH',  # ç - French, etc
        u'\N{LATIN SMALL LETTER C WITH CARON}': 'S',  # ??? - Northern Sámi
        u'\N{LATIN SMALL LETTER D WITH STROKE}': 'D',  # ??? - Northern Sámi
        u'\N{LATIN SMALL LETTER ETH}': 'DH',  # ð - Icelandic
        u'\N{LATIN SMALL LETTER E WITH ACUTE}': 'EY0',  # é
        u'\N{LATIN SMALL LETTER E WITH DIAERESIS}': 'EH0',  # ??? # ë - scientific names
        'f': 'F',
        u'\N{LATIN SMALL LETTER G WITH STROKE}': 'G',  # ??? - other Sámi
        u'\N{LATIN SMALL LETTER G WITH BREVE}': 'G',  # ??? - other Sámi
        u'\N{LATIN SMALL LETTER N WITH TILDE}': 'N Y',  # ñ - Spanish
        u'\N{LATIN SMALL LETTER ENG}': 'N',  # - Northern Sámi
        u'\N{LATIN SMALL LETTER O WITH STROKE}': 'ER0',  # ??? # ø - Norwegian / Danish
        u'\N{LATIN SMALL LETTER O WITH TILDE}': 'ER0',  # ??? # õ - Estonian
        'q': 'K',
        u'\N{LATIN SMALL LETTER SHARP S}': 'S',  # ß - German
        u'\N{LATIN SMALL LETTER T WITH STROKE}': 'T',  # - Northern Sámi
        u'\N{LATIN SMALL LETTER THORN}': 'TH',  # Þ - Icelandic
        # NOTE(review): key is o-diaeresis although it was annotated as
        # "ü - German / Estonian"; unreachable because the explicit
        # o-diaeresis rule in the loop takes precedence.
        u'\N{LATIN SMALL LETTER O WITH DIAERESIS}': 'ER0',
        'w': 'V',
        'z': 'Z'
    }
    # second-vowel lookups: key is the letter in front of the i / u
    prev_match_i = {
        'a': 'AY0',  # ai
        'e': 'EY0',  # ei
        'o': 'OY0',  # oi
        'u': 'UW0',  # ui
        'y': 'IY0'  # yi
    }
    prev_match_u = {
        'a': 'AW0',  # au
        'o': 'OW0'  # AO??? # ou
    }
    nothing = ()
    previous = ' '
    for pos, letter in enumerate(word):
        nxt = word[pos + 1] if pos + 1 < len(word) else None
        if letter == 'a':
            # ai / au: handled under following letter
            codes = nothing if nxt in ('i', 'u') else ('AA0',)
        elif letter == 'e':
            if nxt == 'i':
                codes = nothing  # handled under following letter
            elif previous == 'i':  # ie
                codes = ('IY0',)  # ???
            else:
                codes = ('EH0',)
        elif letter == 'i':
            codes = (prev_match_i[previous],) if previous in prev_match_i else ('IH0',)
        elif letter == 'o':
            if nxt in ('i', 'u'):
                codes = nothing  # handled under following letter
            elif previous == 'u':  # uo
                codes = ('OW0',)  # ???
            else:
                codes = ('OY0',)
        elif letter == 'u':
            if nxt == 'i':
                codes = nothing  # handled under following letter
            elif previous in prev_match_u:
                codes = (prev_match_u[previous],)
            else:
                codes = ('UH0',)
        elif letter == 'y':
            # yi: handled under following letter
            codes = nothing if nxt == 'i' else ('UW0',)  # ???
        elif letter == u'\N{LATIN SMALL LETTER A WITH DIAERESIS}':  # ä
            codes = ('AE0',)
        elif letter == u'\N{LATIN SMALL LETTER O WITH DIAERESIS}':  # ö
            codes = ('ER0',)  # ???
        elif letter == 'g':
            codes = ('NG',) if previous == 'n' else ('G',)
        elif letter == 'n':
            # ng: handled under g
            codes = nothing if nxt == 'g' else ('N',)
        elif letter in simple_convert:
            codes = (simple_convert[letter],)
        else:
            codes = nothing
            # unknown letter: retry once with whatever hammer() reduces it to
            if len(hammer(letter)) == 1 and not recursive:
                phon = breakdownWord(hammer(letter), True)
                codes = tuple(phon[:1])
        phonemes.extend(codes)
        previous = letter
    # drop a phoneme when it equals the one directly before it
    temp_phonemes = []
    for phoneme in phonemes:
        if temp_phonemes[-1:] != [phoneme]:
            temp_phonemes.append(phoneme)
    return temp_phonemes
Example #25
0
def breakdownSwedishSyllable(word, recursive=False, phonetic=False):
    """Break a Swedish word (or syllable) into a list of CMU-style phonemes.

    The word is lower-cased and scanned letter by letter.  Each letter is
    mapped to zero or more phonemes depending on the previous letter, the
    following one or two letters, its position in the word and, for a few
    exceptions, the whole word.

    Args:
        word: The text to convert.
        recursive: True when this call is the fallback made for a single
            character returned by ``hammer``; it prevents a second level
            of fallback.
        phonetic: When True, 'e' always yields 'EH0' and a-diaeresis
            always yields 'AE0', bypassing the positional rules for those
            two letters.

    Returns:
        A list of phoneme strings in which runs of identical consecutive
        entries have been collapsed to a single entry.

    Note:
        ``hammer`` is defined elsewhere in the module; presumably it maps a
        character to a plain-ASCII approximation -- confirm against its
        definition.
    """
    a_uml = u'\N{LATIN SMALL LETTER A WITH DIAERESIS}'
    o_uml = u'\N{LATIN SMALL LETTER O WITH DIAERESIS}'
    a_ring = u'\N{LATIN SMALL LETTER A WITH RING ABOVE}'
    front_vowels = ('e', 'i', 'y', a_uml, o_uml)
    back_vowels = ('a', 'o', 'u', a_ring)
    vowels_before_gn = ('a', 'o', 'u', 'e', 'i', 'y', a_uml, o_uml)
    # Words whose initial k stays hard although a front vowel follows.
    hard_k_words = (u'kefir', u'kex', u'kille', u'kis', u'kissa', u'kisse')
    # Words whose k is pronounced SH.  Spelled with unicode escapes so the
    # comparison depends neither on the source-file encoding nor on an
    # external ``input_encoding`` global.
    sh_k_words = (u'm' + a_uml + u'nniska', u'm' + a_uml + u'nniskor')

    word = word.lower()
    length = len(word)
    phonemes = []
    simple_convert = {
        u'\N{LATIN SMALL LETTER E WITH ACUTE}': 'EY0',
        'a': 'AH0',  # not exact - AO0 ??
        'b': 'B',
        'f': 'F',
        'm': 'M',
        'o': 'UH0',  # compromise, actually UW0 or AA0 (not), sometimes AO0
        'q': 'K',
        'v': 'V',
        'w': 'V',
        'z': 'S',
        a_ring: 'AO0',  # not exact
        o_uml: 'ER0',
    }
    previous = ' '
    for pos, letter in enumerate(word):
        # Bounds-checked look-ahead; None never equals a letter.
        nxt = word[pos + 1] if pos + 1 < length else None
        nxt2 = word[pos + 2] if pos + 2 < length else None
        is_last = pos + 1 == length
        ends_after_next = pos + 2 == length

        if letter == 'c':
            if nxt == 'c':
                pass  # cc, handled on the second c
            elif previous == 'c' and nxt in front_vowels:
                phonemes.append('K')
                phonemes.append('S')
            elif nxt in front_vowels:
                phonemes.append('S')
            elif nxt == 'h':
                phonemes.append('SH')
            else:
                phonemes.append('K')
        elif letter == 'd':
            if pos == 0 and nxt == 'j':  # dj at beginning of word
                pass  # same as j alone
            else:
                phonemes.append('D')
        elif letter == 'e':
            if phonetic:
                phonemes.append('EH0')
            elif ends_after_next and nxt == 'r':  # ends in er
                phonemes.append('AE0')
            else:
                phonemes.append('EH0')  # sometimes 'IY0', sometimes 'EY0'
        elif letter == 'g':
            if previous in ('l', 'r'):
                phonemes.append('Y')
            elif nxt == 'i' and nxt2 == o_uml:
                phonemes.append('SH')
            elif nxt == 'n' and previous in vowels_before_gn:
                phonemes.append('NG')
            elif previous == 'n':  # ng
                phonemes.append('NG')
            elif nxt == 'j':  # gj
                pass  # same as 'j' alone
            elif ends_after_next and nxt == 'e':  # ends in 'ge' (garage ?)
                phonemes.append('SH')
            elif pos == 0 and nxt in front_vowels:
                # ??? if e is unstressed (how to tell?), pronounce as 'G'
                phonemes.append('Y')
            elif pos == 0 and nxt in back_vowels:
                phonemes.append('G')
            elif previous == 'g':
                pass
            else:
                phonemes.append('G')
        elif letter == 'h':
            if previous == 'c':
                pass  # handled under c
            elif nxt == 'j':
                pass  # same as 'j' alone
            elif pos == 1 and previous == 's':  # probably a foreign loan-word
                phonemes.append('SH')
            else:
                phonemes.append('HH')
        elif letter == 'i':
            if previous == 'g' and nxt == o_uml:
                pass  # the g already produced SH
            elif previous == 's' and nxt == 'o':  # sio e.g. mission
                phonemes.append('UH0')
            else:
                phonemes.append('IY0')  # sometimes 'IH0'
        elif letter == 'k':  # must precede j so that skj is handled here
            if pos == 0 and word in hard_k_words:
                phonemes.append('K')
            elif pos == 0 and nxt in front_vowels:
                phonemes.append('CH')
            elif word in sh_k_words:
                phonemes.append('SH')
            elif is_last and previous == 's':  # ends in sk
                phonemes.append('S')
                phonemes.append('K')
            elif nxt == 'j':
                phonemes.append('CH')  # more Finnish-Swedish than Swedish ???
            elif is_last and previous == 'c':
                pass
            elif previous == 's' and nxt in back_vowels:
                phonemes.append('S')
                phonemes.append('K')
            elif previous == 's' and pos == 1:  # sk at beginning of word
                phonemes.append('SH')
            else:
                phonemes.append('K')
        elif letter == 't':  # must precede j so that stj is handled here
            if previous == 's' and nxt == 'j':
                phonemes.append('SH')
            elif previous == 't' and is_last:
                pass
            elif nxt == 'j':  # tj
                pass  # handled under j
            else:
                phonemes.append('T')
        elif letter == 'j':
            if previous == 's':
                phonemes.append('SH')
            elif previous == 't':
                # Only look two letters back when they exist; a bare
                # word[pos - 2] at pos == 1 would wrap to the last letter
                # and mistake e.g. "tjus" for an "stj" cluster.
                if pos >= 2 and word[pos - 2] == 's':  # stj, handled under t
                    pass
                else:
                    phonemes.append('CH')
            elif previous == 'k':
                pass  # handled under k
            else:
                phonemes.append('Y')
        elif letter == 'l':
            if nxt == 'j':
                pass  # same as 'j' alone
            else:
                phonemes.append('L')
        elif letter == 'n':
            if nxt == 'g':  # ng
                pass  # handled under 'g'
            elif nxt == 'k':  # nk
                phonemes.append('NG')
            else:
                phonemes.append('N')
        elif letter == 'p':
            if previous == 'p':
                pass
            else:
                phonemes.append('P')
        elif letter == 'r':
            if nxt == 's':
                pass  # handled under s
            else:
                phonemes.append('R')
        elif letter == 's':
            if nxt == 'c' and nxt2 == 'h':
                pass  # handled under 'c'
            elif nxt == 't' and nxt2 == 'j':
                pass  # handled under 't'
            elif nxt == 'k':
                pass  # handled under 'k'
            elif nxt == 'j':
                pass  # handled under 'j'
            elif nxt == 's':
                pass
            elif nxt == 'i' and nxt2 == 'o':  # might need more breakdown
                phonemes.append('SH')
            elif pos == 0 and nxt == 'h':
                pass  # handled under 'h'
            elif previous == 'r':
                phonemes.append('SH')  # not entirely accurate, use HH ??
            else:
                phonemes.append('S')
        elif letter == 'u':
            if previous == 'q':
                phonemes.append('V')
            else:
                phonemes.append('UW0')  # inaccurate, no CMU equivalent
        elif letter == 'x':
            phonemes.append('K')
            phonemes.append('S')
        elif letter == 'y':
            if word in (u'yoga', u'yoghurt'):
                phonemes.append('Y')
            elif word == u'fyrtio':
                phonemes.append('ER0')
            else:
                phonemes.append('UW0')  # not exact
        elif letter == a_uml:
            if phonetic:
                phonemes.append('AE0')
            elif nxt == 'r':
                phonemes.append('AE0')  # not exact, and skips exceptions
            else:
                phonemes.append('EH0')  # not exact, and skips exceptions
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == " ":
            pass
        elif len(hammer(letter)) == 1:
            if not recursive:
                # The fallback result is appended as one space-joined string.
                phon = " ".join(
                    breakdownSwedishSyllable(hammer(letter), True, phonetic))
                if phon:
                    phonemes.append(phon)
        previous = letter

    # Collapse runs of identical consecutive phonemes.
    collapsed = []
    last_phoneme = " "
    for phoneme in phonemes:
        if phoneme != last_phoneme:
            collapsed.append(phoneme)
        last_phoneme = phoneme
    return collapsed
Example #26
0
def breakdownWord(word, recursive=False):
    """Break a Portuguese word into a list of CMU-style phonemes.

    The word is lower-cased and scanned letter by letter.  Vowels map
    directly to a phoneme; the consonants c, g, h, m, n, s and x and the
    vowel u take the previous and following letter into account so that
    digraphs (qu, gu, sc, xc, nh and the nasal vowel + m/n pairs) are
    handled.

    Args:
        word: The text to convert.
        recursive: True when this call is the fallback made for a single
            character returned by ``hammer``; it prevents a second level
            of fallback.

    Returns:
        A list of phoneme strings in which runs of identical consecutive
        entries have been collapsed to a single entry.

    Note:
        ``setvowels`` and ``hammer`` are defined elsewhere in the module.
        Presumably ``setvowels`` is an iterable of vowel characters and
        ``hammer`` maps a character to a plain-ASCII approximation --
        confirm against their definitions.
    """
    word = word.lower()
    length = len(word)
    # Plain membership test instead of dict.has_key, which no longer
    # exists on Python 3.
    vowel_set = frozenset(setvowels)

    def isvowel(char):
        return char in vowel_set

    # e / i and their acute and circumflex forms: the vowels that soften a
    # preceding c or g and silence the u of qu / gu.
    soft_vowels = (
        'e', 'i',
        u'\N{LATIN SMALL LETTER E WITH ACUTE}',
        u'\N{LATIN SMALL LETTER I WITH ACUTE}',
        u'\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}',
        u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}',
    )
    # Vowels checked in front of an m for the nasal-digraph rule.
    vowels_before_m = (
        'i', 'o', 'u',
        u'\N{LATIN SMALL LETTER I WITH ACUTE}',
        u'\N{LATIN SMALL LETTER O WITH ACUTE}',
        u'\N{LATIN SMALL LETTER U WITH ACUTE}',
        u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}',
        u'\N{LATIN SMALL LETTER O WITH CIRCUMFLEX}',
        u'\N{LATIN SMALL LETTER O WITH TILDE}',
    )
    phonemes = []
    simple_convert = {
        'b': 'B',
        'd': 'D',
        'f': 'F',
        'j': 'ZH',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'n': 'N',
        'p': 'P',
        'q': 'K',
        'r': 'R',
        't': 'T',
        'v': 'V',
        'w': 'W',
        'y': 'IY0',
        'z': 'Z',
        u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'S',  # c cedilla
    }

    previous = ' '
    for pos, letter in enumerate(word):
        # Bounds-checked look-ahead; None when this is the last letter.
        nxt = word[pos + 1] if pos + 1 < length else None
        followed_by_consonant = nxt is not None and not isvowel(nxt)

        # A
        if letter in ('a',
                      u'\N{LATIN SMALL LETTER A WITH ACUTE}',
                      u'\N{LATIN SMALL LETTER A WITH GRAVE}',
                      u'\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}'):
            phonemes.append('AA0')
        elif letter == u'\N{LATIN SMALL LETTER A WITH TILDE}':
            phonemes.append('AE0')
        # E
        elif letter in ('e', u'\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}'):
            phonemes.append('EY0')
        elif letter == u'\N{LATIN SMALL LETTER E WITH ACUTE}':
            phonemes.append('EH0')
        # I
        elif letter in ('i',
                        u'\N{LATIN SMALL LETTER I WITH ACUTE}',
                        u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}'):
            phonemes.append('IY0')
        # O
        elif letter in ('o', u'\N{LATIN SMALL LETTER O WITH CIRCUMFLEX}'):
            phonemes.append('OW0')
        elif letter == u'\N{LATIN SMALL LETTER O WITH ACUTE}':
            phonemes.append('OY0')
        elif letter == u'\N{LATIN SMALL LETTER O WITH TILDE}':
            phonemes.append('AW0')
        # U
        elif letter in ('u', u'\N{LATIN SMALL LETTER U WITH ACUTE}'):
            # Consonant digraphs qu / gu followed by e or i: the u is not
            # sounded and the consonant phoneme is repeated instead (the
            # final de-duplication merges it with the one already
            # emitted).  Exceptions where the u is pronounced (cinquenta,
            # frequente, tranquilo, linguica, aguentar) are not handled.
            if previous == 'q' and nxt in soft_vowels:
                phonemes.append('K')
            elif previous == 'g' and nxt in soft_vowels:
                phonemes.append('G')
            else:
                phonemes.append('UW0')
        # C
        elif letter == 'c':
            if previous in ('s', 'x'):
                # Digraphs sc / xc (ascender, excelente).  This branch is
                # exclusive: the plain c rules below must not run as well,
                # otherwise S or K is emitted twice.
                if nxt in soft_vowels:
                    phonemes.append('S')
                else:
                    phonemes.append('S')
                    phonemes.append('K')
            elif nxt in soft_vowels:  # ce, ci
                phonemes.append('S')
            else:
                phonemes.append('K')
        # G
        elif letter == 'g':
            if nxt in soft_vowels:  # ge, gi
                phonemes.append('ZH')
            else:
                phonemes.append('G')
        # H
        elif letter == 'h':  # silent except in the digraph nh
            if previous == 'n':
                phonemes.append('N')
        # M
        elif letter == 'm':
            # NOTE(review): word[-1] tests the last letter of the whole
            # word, not whether this m is itself the last letter.
            if (previous in vowels_before_m and word[-1] == 'm') \
                    or followed_by_consonant:
                pass  # vowel digraphs am em im om um
            else:
                phonemes.append('M')
        # N
        elif letter == 'n':
            if nxt == 'h':
                pass  # nh handled under H
            elif (isvowel(previous) and word[-1] == 'n') \
                    or followed_by_consonant:
                pass  # vowel digraphs an en in on un
            else:
                phonemes.append('N')
        # S
        elif letter == 's':
            if nxt == 'c':
                pass  # sc handled under C
            elif isvowel(previous) and nxt is not None and isvowel(nxt):
                phonemes.append('Z')  # s between two vowels (casa)
            else:
                phonemes.append('S')
        # X
        elif letter == 'x':
            if nxt == 'c':
                pass  # xc handled under C
            else:
                # Some words pronounce x as K S (taxi); not handled.
                phonemes.append('SH')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1:
            if not recursive:
                phon = breakdownWord(hammer(letter), True)
                if phon:
                    phonemes.append(phon[0])
        previous = letter

    # Collapse runs of identical consecutive phonemes.
    collapsed = []
    last_phoneme = " "
    for phoneme in phonemes:
        if phoneme != last_phoneme:
            collapsed.append(phoneme)
        last_phoneme = phoneme
    return collapsed
Example #27
0
def breakdownWord(input_word, recursive=False):
    """Break an Italian word into a list of CMU-style phonemes.

    The word is lower-cased (which reduces the number of letter
    combinations to handle) and scanned letter by letter.  The letters
    c, g, i, h, j and l are context dependent; every other letter is
    looked up in the module-level ``unconditional_conversions`` mapping
    and, failing that, passed through ``hammer`` and converted
    recursively.

    Args:
        input_word: The text to convert.
        recursive: True when this call is the fallback made for a single
            character returned by ``hammer``; it prevents a second level
            of fallback.

    Returns:
        A list of phoneme strings, one entry per emitted phoneme.  Unlike
        the sibling breakdown functions, consecutive duplicates are kept.

    Note:
        ``unconditional_conversions`` and ``hammer`` are defined elsewhere
        in the module.  Presumably ``hammer`` maps a character to a
        plain-ASCII approximation -- confirm against its definition.
    """
    word = input_word.lower()
    length = len(word)
    previous = u''
    breakdown_word = []
    for word_index, letter in enumerate(word):
        # Bounds-checked look-ahead.  Indexing word[word_index + 1]
        # directly raises IndexError when the letter is the last one in
        # the word, because word_index is always < len(word).
        nxt = word[word_index + 1] if word_index + 1 < length else None
        nxt2 = word[word_index + 2] if word_index + 2 < length else None

        if letter == u'c':
            # NOTE(review): the soft c (ci, ce, cci) emits 'EH0' here;
            # confirm this is the intended phoneme.
            if nxt == u'i':  # ci
                breakdown_word.append('EH0')
            elif nxt == u'e':  # ce
                breakdown_word.append('EH0')
            elif nxt == u'c' and nxt2 == u'i':  # cci
                breakdown_word.append('EH0')
            else:
                breakdown_word.append('K')
        elif letter == u'g':
            if nxt == u'i':  # gi
                breakdown_word.append('JH')
            elif nxt == u'l' and nxt2 == u'i':  # gli
                breakdown_word.append('JH')
            else:
                breakdown_word.append('G')
        elif letter == u'i':
            if previous == u'c' or previous == u'g':
                pass  # ci, gi: already covered by the consonant
            else:
                breakdown_word.append('EH0')
        elif letter in (u'\N{LATIN SMALL LETTER I WITH ACUTE}',
                        u'\N{LATIN SMALL LETTER I WITH GRAVE}'):
            if previous == u'c' or previous == u'g':
                pass  # accented i after c or g: covered by the consonant
            else:
                breakdown_word.append('EH1')
        elif letter == u'h':
            if previous == u'c':
                pass  # ch: the c already emitted K
            else:
                breakdown_word.append('HH')
        elif letter == u'j':
            # The original upper bound (word_index < len(word)) was always
            # true, so only a word-initial j takes the second branch.
            if word_index > 0:
                breakdown_word.append('JH')
            else:
                breakdown_word.append('EH0')
        elif letter == u'l':
            if previous == u'g' and nxt == u'i':
                pass  # gli: already covered by the g
            else:
                breakdown_word.append('L')
        elif letter in unconditional_conversions:
            breakdown_word.append(unconditional_conversions[letter])
        elif letter == " ":
            pass
        elif len(hammer(letter)) == 1:
            if not recursive:
                phon = " ".join(breakdownWord(hammer(letter), True))
                if phon:
                    breakdown_word.append(phon.split()[0])
        previous = letter
    return breakdown_word
Example #28
0
def breakdownWord(word, recursive=False):
    """Break a Portuguese word into a list of CMU-style phonemes.

    After lower-casing, every letter is translated in turn.  Vowels have
    a fixed phoneme.  The vowel u and the consonants c, g, h, m, n, s, x
    inspect the neighbouring letters to deal with the digraphs qu, gu,
    sc, xc, nh and with a vowel followed by m or n.  Remaining letters
    use a fixed lookup table, and anything still unknown is passed
    through ``hammer`` and converted by one recursive call.

    Args:
        word: The text to convert.
        recursive: Set by the internal fallback call so that the
            ``hammer`` fallback is attempted at most once per character.

    Returns:
        The phoneme list with consecutive duplicate entries merged.

    Note:
        ``setvowels`` and ``hammer`` come from elsewhere in the module.
        Presumably ``setvowels`` is an iterable of vowel characters and
        ``hammer`` returns a plain-ASCII approximation of a character --
        confirm against their definitions.
    """
    word = word.lower()
    word_length = len(word)
    # dict.has_key is gone on Python 3; a set membership test behaves the
    # same on both major versions.
    known_vowels = frozenset(setvowels)

    def isvowel(char):
        return char in known_vowels

    e_acute = u'\N{LATIN SMALL LETTER E WITH ACUTE}'
    e_circ = u'\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}'
    i_acute = u'\N{LATIN SMALL LETTER I WITH ACUTE}'
    i_circ = u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}'
    o_acute = u'\N{LATIN SMALL LETTER O WITH ACUTE}'
    o_circ = u'\N{LATIN SMALL LETTER O WITH CIRCUMFLEX}'
    o_tilde = u'\N{LATIN SMALL LETTER O WITH TILDE}'
    u_acute = u'\N{LATIN SMALL LETTER U WITH ACUTE}'

    # e / i in plain, acute and circumflex form: they soften c and g and
    # mute the u of qu / gu.
    softening = ('e', 'i', e_acute, i_acute, e_circ, i_circ)
    # Vowels tested in front of m for the nasal-digraph rule.
    nasal_m_vowels = ('i', 'o', 'u', i_acute, o_acute, u_acute,
                      i_circ, o_circ, o_tilde)

    phonemes = []
    simple_convert = {
        'b': 'B',
        'd': 'D',
        'f': 'F',
        'j': 'ZH',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'n': 'N',
        'p': 'P',
        'q': 'K',
        'r': 'R',
        't': 'T',
        'v': 'V',
        'w': 'W',
        'y': 'IY0',
        'z': 'Z',
        u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'S',  # c cedilla
    }

    previous = ' '
    for pos, letter in enumerate(word):
        # Following letter, or None at the end of the word.
        following = word[pos + 1] if pos + 1 < word_length else None
        consonant_follows = following is not None and not isvowel(following)

        # A
        if letter in ('a',
                      u'\N{LATIN SMALL LETTER A WITH ACUTE}',
                      u'\N{LATIN SMALL LETTER A WITH GRAVE}',
                      u'\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}'):
            phonemes.append('AA0')
        elif letter == u'\N{LATIN SMALL LETTER A WITH TILDE}':
            phonemes.append('AE0')
        # E
        elif letter in ('e', e_circ):
            phonemes.append('EY0')
        elif letter == e_acute:
            phonemes.append('EH0')
        # I
        elif letter in ('i', i_acute, i_circ):
            phonemes.append('IY0')
        # O
        elif letter in ('o', o_circ):
            phonemes.append('OW0')
        elif letter == o_acute:
            phonemes.append('OY0')
        elif letter == o_tilde:
            phonemes.append('AW0')
        # U
        elif letter in ('u', u_acute):
            # Digraphs qu and gu before e or i (aquilo, questao, quilo,
            # querida, guerra, aguia): the u is mute, so the consonant
            # phoneme is repeated and later merged by the de-duplication.
            # Words where the u is sounded (cinquenta, frequente,
            # tranquilo, linguica, aguentar) are not handled.
            if previous == 'q' and following in softening:
                phonemes.append('K')
            elif previous == 'g' and following in softening:
                phonemes.append('G')
            else:
                phonemes.append('UW0')
        # C
        elif letter == 'c':
            if previous in ('s', 'x'):
                # Digraphs sc (ascender) and xc (excelente).  Handled as
                # one exclusive branch so the plain c rules below cannot
                # emit S or K a second time.
                if following in softening:
                    phonemes.append('S')
                else:
                    phonemes.append('S')
                    phonemes.append('K')
            elif following in softening:  # ce, ci
                phonemes.append('S')
            else:
                phonemes.append('K')
        # G
        elif letter == 'g':
            if following in softening:  # ge, gi
                phonemes.append('ZH')
            else:
                phonemes.append('G')
        # H
        elif letter == 'h':  # silent letter, except in the digraph nh
            if previous == 'n':
                phonemes.append('N')
        # M
        elif letter == 'm':
            # NOTE(review): word[-1] looks at the final letter of the word
            # rather than at whether this m is in final position.
            if (previous in nasal_m_vowels and word[-1] == 'm') \
                    or consonant_follows:
                pass  # vowel digraphs am em im om um
            else:
                phonemes.append('M')
        # N
        elif letter == 'n':
            if following == 'h':
                pass  # nh handled under H
            elif (isvowel(previous) and word[-1] == 'n') \
                    or consonant_follows:
                pass  # vowel digraphs an en in on un
            else:
                phonemes.append('N')
        # S
        elif letter == 's':
            if following == 'c':
                pass  # sc handled under C
            elif (isvowel(previous) and following is not None
                  and isvowel(following)):
                phonemes.append('Z')  # s between two vowels, as in casa
            else:
                phonemes.append('S')
        # X
        elif letter == 'x':
            if following == 'c':
                pass  # xc handled under C
            else:
                # A few words pronounce x as K S (taxi); not handled.
                phonemes.append('SH')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1:
            if not recursive:
                phon = breakdownWord(hammer(letter), True)
                if phon:
                    phonemes.append(phon[0])
        previous = letter

    # Merge consecutive duplicate phonemes.
    merged = []
    last_seen = " "
    for phoneme in phonemes:
        if phoneme != last_seen:
            merged.append(phoneme)
        last_seen = phoneme
    return merged