def breakdownWord(word, recursive=False):
    """Break a word into a list of CMU-style phoneme codes.

    The letter rules (dotless i, c and s with cedilla, g with breve,
    circumflexed a/u) appear to target Turkish orthography.  Every letter
    is mapped to zero, one or two phonemes depending on its neighbours, and
    runs of identical consecutive phonemes are collapsed before returning.

    word      -- text to convert; it is lower-cased first.
    recursive -- True when called from the fallback branch below, so an
                 unknown character is retried at most once.

    Returns a list of phoneme strings (possibly empty).
    """
    word = word.lower()

    a_circumflex = u'\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}'
    u_circumflex = u'\N{LATIN SMALL LETTER U WITH CIRCUMFLEX}'
    o_diaeresis = u'\N{LATIN SMALL LETTER O WITH DIAERESIS}'
    u_diaeresis = u'\N{LATIN SMALL LETTER U WITH DIAERESIS}'
    dotless_i = u'\N{LATIN SMALL LETTER DOTLESS I}'

    # A frozenset membership test replaces dict.has_key, which was removed
    # in Python 3 (and works unchanged on Python 2).
    vowels = frozenset(u'aeiou' + dotless_i + o_diaeresis + u_diaeresis
                       + a_circumflex + u_circumflex)

    # Letters that always map to the same phoneme.
    simple_convert = {
        'b': 'B',
        'c': 'JH',
        u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'CH',
        'd': 'D',
        'f': 'F',
        'g': 'G',
        'h': 'HH',
        dotless_i: 'AH0',
        'i': 'IY0',
        'j': 'ZH',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'n': 'N',
        o_diaeresis: 'ER0',
        'p': 'P',
        'r': 'R',
        's': 'S',
        u'\N{LATIN SMALL LETTER S WITH CEDILLA}': 'SH',
        't': 'T',
        u_diaeresis: 'UW0',  # IH0?
        'w': 'V',  # loan-words
        'z': 'Z',
    }
    # Consonants after which a circumflexed vowel gets a leading IY0.
    palatalising = ('g', 'k', 'l')

    phonemes = []
    previous = ' '
    length = len(word)
    for pos, letter in enumerate(word):
        # Following letter, or '' at the end of the word.
        nxt = word[pos + 1] if pos + 1 < length else ''

        if letter == 'a':
            phonemes.append('AY0' if nxt == 'y' else 'AA0')
        elif letter == a_circumflex:
            if previous in palatalising:
                phonemes.append('IY0')
            phonemes.append('AA0')
        elif letter == 'e':
            phonemes.append('EY0' if nxt == 'y' else 'EH0')
        elif letter == u'\N{LATIN SMALL LETTER G WITH BREVE}':
            pass  # produces no phoneme of its own
        elif letter == 'n':
            # 'n' in front of 'b' is rendered as M.
            phonemes.append('M' if nxt == 'b' else 'N')
        elif letter == 'o':
            phonemes.append('OY0' if nxt == 'y' else 'OW0')
        elif letter == 'q':  # loan-words
            phonemes.append('K')
        elif letter == u_circumflex:
            if previous in palatalising:
                phonemes.append('IY0')
            phonemes.append('UW0')
        elif letter == 'u':
            phonemes.append('IY0' if nxt == 'y' else 'UH0')
        elif letter == 'v':
            phonemes.append('W' if previous in vowels else 'V')
        elif letter == 'x':  # loan-words
            phonemes.append('K')
            phonemes.append('S')
        elif letter == 'y':
            # After these vowels the 'y' was already folded into the
            # diphthong emitted for the vowel itself.
            if previous not in ('a', 'e', 'o', 'u', o_diaeresis):
                phonemes.append('Y')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1:
            # hammer() is defined elsewhere; presumably it reduces the
            # character to a plain equivalent -- TODO confirm.
            if not recursive:
                fallback = breakdownWord(hammer(letter), True)
                if fallback:
                    phonemes.append(fallback[0])
        previous = letter

    # Collapse runs of the same phoneme into a single entry.
    collapsed = []
    for phoneme in phonemes:
        if not collapsed or collapsed[-1] != phoneme:
            collapsed.append(phoneme)
    return collapsed
def breakdownWord(word, recursive=False):
    """Break a word into a list of CMU-style phoneme codes.

    The letter rules (umlauts, sharp s, sch, ei/eu/ie, final devoicing of
    b/d/g) appear to target German orthography.  Every letter is mapped to
    zero, one or two phonemes depending on its neighbours, and runs of
    identical consecutive phonemes are collapsed before returning.

    word      -- text to convert; it is lower-cased first.
    recursive -- True when called from the fallback branch below, so an
                 unknown character is retried at most once.

    Returns a list of phoneme strings (possibly empty).
    """
    word = word.lower()

    a_umlaut = u'\N{LATIN SMALL LETTER A WITH DIAERESIS}'
    o_umlaut = u'\N{LATIN SMALL LETTER O WITH DIAERESIS}'
    u_umlaut = u'\N{LATIN SMALL LETTER U WITH DIAERESIS}'
    sharp_s = u'\N{LATIN SMALL LETTER SHARP S}'

    # Built from unicode literals so the umlauts match unicode input on
    # Python 2 as well; set membership replaces dict.has_key, which was
    # removed in Python 3.
    vowels = frozenset(u'aeiou' + a_umlaut + o_umlaut + u_umlaut)

    # Letters that always map to the same phoneme.
    simple_convert = {
        'f': 'F',
        'j': 'Y',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'p': 'P',
        'q': 'K',
        'r': 'R',  # use AH0 or ER0 for final letter in word ??
        sharp_s: 'S',
        't': 'T',
        'v': 'F',  # non-native loan-words, 'V'
        'w': 'V',
        'y': 'IH0',  # actual pronunciation varies with word origin
    }

    phonemes = []
    previous = ' '
    length = len(word)
    for pos, letter in enumerate(word):
        # Following letter, or '' at the end of the word.
        nxt = word[pos + 1] if pos + 1 < length else ''
        is_last = pos + 1 == length
        # A doubled consonant or a sharp s right after the letter selects
        # the "short" phoneme in every vowel branch below.
        short_vowel = nxt == sharp_s or (
            pos + 2 < length and nxt == word[pos + 2] and nxt not in vowels)

        if letter == previous and letter not in vowels:
            pass  # second half of a doubled consonant
        elif letter == 'a':
            if nxt == 'i':  # ai
                phonemes.append('AY0')
            elif nxt == 'u':  # au
                phonemes.append('AW0')
            elif previous == 'a':
                pass
            elif short_vowel:
                phonemes.append('AH0')
            else:
                phonemes.append('AA0')
        elif letter == a_umlaut:
            if nxt == 'u':  # a-umlaut + u
                phonemes.append('OY0')
            elif short_vowel:
                phonemes.append('EH0')
            else:
                phonemes.append('EY0')
        elif letter == 'b':
            # Rendered as P at the end of the word or before s/t.
            phonemes.append('P' if is_last or nxt in ('s', 't') else 'B')
        elif letter == 'c':
            if previous == 's' and nxt == 'h':  # sch
                phonemes.append('SH')
            elif nxt == 'h':
                phonemes.append('HH')  # use 'K'??
            else:
                phonemes.append('K')
        elif letter == 'd':
            phonemes.append('T' if is_last or nxt in ('s', 't') else 'D')
        elif letter == 'e':
            if previous == 'i':
                pass  # covered under 'i'
            elif pos + 2 == length and nxt in ('l', 'n', 'r'):  # -en, -er, -el
                phonemes.append('EH0')
            elif nxt == 'i':  # ei
                phonemes.append('AY0')
            elif nxt == 'u':  # eu
                phonemes.append('OY0')
            elif nxt == 'e':  # ee
                phonemes.append('EY0')
            elif previous == 'e':
                pass
            elif short_vowel:
                phonemes.append('EH0')
            elif is_last and previous not in vowels:
                phonemes.append('EH0')
            else:
                phonemes.append('EY0')
        elif letter == 'g':
            if previous == 'n':
                phonemes.append('NG')
            elif is_last and previous == 'i':
                phonemes.append('HH')
            elif is_last or nxt in ('s', 't'):
                phonemes.append('K')
            else:
                phonemes.append('G')
        elif letter == 'h':
            # Silent after a vowel; after 'c' it was covered under 'c'.
            if previous not in vowels and previous != 'c':
                phonemes.append('HH')
        elif letter == 'i':
            if previous in ('a', 'e'):
                pass  # covered under other vowel
            elif nxt == 'e':  # ie
                phonemes.append('IY0')
            elif short_vowel:
                phonemes.append('IH0')
            elif is_last and previous not in vowels:
                phonemes.append('IY0')  # also use IH0 here instead?
            else:
                phonemes.append('IH0')
        elif letter == 'n':
            if nxt != 'g':  # 'ng' is covered under 'g'
                phonemes.append('N')
        elif letter == 'o':
            if previous != 'o':
                phonemes.append('AO0')  # sometimes o as in "on", not covered in CMU/USA
        elif letter == o_umlaut:
            phonemes.append('ER0')
        elif letter == 's':
            if pos == 0 and nxt in ('p', 't'):
                phonemes.append('SH')
            elif pos + 2 < length and nxt == 'c' and word[pos + 2] == 'h':
                pass  # covered under 'c'
            elif pos == 0:
                phonemes.append('Z')  # at beginning of word
            else:
                phonemes.append('S')  # default sound - or 'Z' ??
        elif letter == 'u':
            if previous in ('a', a_umlaut, 'e'):
                pass  # second half of au / a-umlaut+u / eu
            elif previous == 'q':
                phonemes.append('V')
            elif short_vowel:
                phonemes.append('UH0')
            else:
                phonemes.append('UW0')
        elif letter == u_umlaut:
            phonemes.append('UW0')
        elif letter == 'x':
            phonemes.append('K')
            phonemes.append('S')
        elif letter == 'z':
            phonemes.append('T')
            phonemes.append('S')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1:
            # hammer() is defined elsewhere; presumably it reduces the
            # character to a plain equivalent -- TODO confirm.
            if not recursive:
                fallback = breakdownWord(hammer(letter[0]), True)
                if fallback:
                    phonemes.append(fallback[0])
        previous = letter

    # Collapse runs of the same phoneme into a single entry.
    collapsed = []
    for phoneme in phonemes:
        if not collapsed or collapsed[-1] != phoneme:
            collapsed.append(phoneme)
    return collapsed
def breakdownWord(word, recursive=False):
    """Break a word into a list of CMU-style phoneme codes.

    The letter rules (cs, sz, zs, dz, dzs, gy, ly, double-acute vowels)
    appear to target Hungarian orthography.  Every letter is mapped to zero,
    one or two phonemes depending on its neighbours, and runs of identical
    consecutive phonemes are collapsed before returning.

    Digraph handling: the first letter of 'dz'/'dzs' and the second letter
    of 'cs', 'sz' and 'zs' emit nothing, because the partner letter emits
    the phoneme for the whole group.

    word      -- text to convert; it is lower-cased first.
    recursive -- True when called from the fallback branch below, so an
                 unknown character is retried at most once.

    Returns a list of phoneme strings (possibly empty).
    """
    word = word.lower()

    a_acute = u'\N{LATIN SMALL LETTER A WITH ACUTE}'

    # Letters that always map to the same phoneme.
    simple_convert = {
        'b': 'B',
        u'\N{LATIN SMALL LETTER E WITH ACUTE}': 'EY0',
        'f': 'F',
        'h': 'HH',
        u'\N{LATIN SMALL LETTER I WITH ACUTE}': 'IY0',
        'k': 'K',
        'm': 'M',
        'n': 'N',
        u'\N{LATIN SMALL LETTER O WITH ACUTE}': 'OW0',  # ER0 ? AO0 ?
        'p': 'P',
        'r': 'R',
        't': 'T',
        'u': 'UW0',
        u'\N{LATIN SMALL LETTER U WITH ACUTE}': 'UW0',
        u'\N{LATIN SMALL LETTER O WITH DIAERESIS}': 'ER0',
        u'\N{LATIN SMALL LETTER O WITH DOUBLE ACUTE}': 'ER0',
        u'\N{LATIN SMALL LETTER U WITH DIAERESIS}': 'UW0',  # IH0?
        u'\N{LATIN SMALL LETTER U WITH DOUBLE ACUTE}': 'UW0',
        'v': 'V',
        'w': 'V',
    }
    # Vowels that absorb a following i / j / y into a diphthong.
    diphthong_heads = ('a', 'o', a_acute)

    phonemes = []
    previous = ' '
    length = len(word)
    for pos, letter in enumerate(word):
        # Next two letters, or '' past the end of the word.
        nxt = word[pos + 1] if pos + 1 < length else ''
        nxt2 = word[pos + 2] if pos + 2 < length else ''
        # The vowel is followed by i, j, y or by the digraph 'ly'.
        before_glide = nxt in ('i', 'j', 'y') or (nxt == 'l' and nxt2 == 'y')

        if letter == 'a':
            phonemes.append('OY0' if before_glide else 'AO0')
        elif letter == a_acute:
            phonemes.append('AY0' if before_glide else 'AA0')
        elif letter == 'c':
            if nxt == 's':  # cs
                phonemes.append('CH')
            else:
                phonemes.append('T')
                phonemes.append('S')
        elif letter == 'd':
            # Only the 'd' of dz / dzs is handled under 'z'.  The earlier
            # test for a following 's' swallowed the D of a plain d + s
            # and never matched the digraph.
            if nxt != 'z':
                phonemes.append('D')
        elif letter == 'e':
            if previous == 'e':
                pass
            elif nxt == 'e' or (nxt == 'l' and nxt2 == 'y'):
                phonemes.append('EY0')
            else:
                phonemes.append('EH0')
        elif letter == 'g':
            phonemes.append('JH' if nxt == 'y' else 'G')  # gy
        elif letter == 'i':
            if previous not in diphthong_heads:
                phonemes.append('IH0')  # IY0?
        elif letter == 'j':
            if previous not in diphthong_heads:
                phonemes.append('Y')
        elif letter == 'l':
            if nxt != 'y':  # ly is handled under y - close enough to just IY
                phonemes.append('L')
        elif letter == 'o':
            phonemes.append('OY0' if before_glide else 'AO0')
        elif letter == 'q':  # loan words
            phonemes.append('K')
            phonemes.append('W')
        elif letter == 's':
            if previous in ('c', 'z'):
                pass  # cs is handled under c, zs / dzs under z
            elif nxt == 's' and nxt2 == 'z':  # first s of ssz
                pass
            elif nxt == 'z' and previous == 's':  # second s of ssz
                phonemes.append('S')
                phonemes.append('S')
            elif nxt == 'z':  # sz
                phonemes.append('S')
            else:
                phonemes.append('SH')
        elif letter == 'x':  # loan words only
            phonemes.append('K')
            phonemes.append('S')
        elif letter == 'y':
            if previous in diphthong_heads:
                pass
            elif previous == 'g':
                pass  # handled under g
            elif previous == 't':
                phonemes.append('Y')
            elif previous == 'n':
                pass  # close enough to just n
            else:
                phonemes.append('IY0')
        elif letter == 'z':
            if nxt == 's' and previous == 'd':  # dzs
                phonemes.append('JH')
            elif previous == 'z' and nxt == 's':  # second z of zzs
                phonemes.append('ZH')
                phonemes.append('ZH')
            elif nxt == 's':  # zs
                phonemes.append('ZH')
            elif nxt == 'z' and nxt2 == 's':  # first z of zzs
                pass
            elif previous == 'd':  # dz
                phonemes.append('D')
                phonemes.append('S')
            elif previous in ('s', 'c'):
                pass  # handled under s / c
            else:
                phonemes.append('Z')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1:
            # hammer() is defined elsewhere; presumably it reduces the
            # character to a plain equivalent -- TODO confirm.
            if not recursive:
                fallback = breakdownWord(hammer(letter), True)
                if fallback:
                    phonemes.append(fallback[0])
        previous = letter

    # Collapse runs of the same phoneme into a single entry.
    collapsed = []
    for phoneme in phonemes:
        if not collapsed or collapsed[-1] != phoneme:
            collapsed.append(phoneme)
    return collapsed
def breakdownWord(word, recursive=False):
    """Break a word into a list of CMU-style phoneme codes.

    Handles Cyrillic letters as well as a Latin transliteration (sh, zh,
    ch, ya, ...); the rules appear to target Russian.  Every letter is
    mapped to zero, one or two phonemes depending on its neighbours, and
    runs of identical consecutive phonemes are collapsed before returning.

    word      -- text to convert; it is lower-cased first.
    recursive -- True when called from the fallback branch below, so an
                 unknown character is retried at most once.

    Returns a list of phoneme strings (possibly empty).
    """
    word = word.lower()

    # Letters that always map to the same phoneme.  Only lower-case keys
    # are listed: the word is lower-cased above, so capital-letter keys
    # could never match.
    simple_convert = {
        'a': 'AA0',
        'b': 'B',
        'd': 'D',
        'f': 'F',
        'h': 'HH',
        'i': 'IY0',
        'j': 'IY0',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'n': 'N',
        'o': 'AO0',
        'p': 'P',
        'r': 'R',
        't': 'T',
        'u': 'UW0',
        'v': 'V',
        'x': 'HH',  # use 'Y' ?? 'K'??
        u'\N{LATIN SMALL LETTER S WITH CARON}': 'SH',
        u'\N{LATIN SMALL LETTER Z WITH CARON}': 'ZH',
        # Cyrillic
        u'\N{CYRILLIC SMALL LETTER A}': 'AA0',
        u'\N{CYRILLIC SMALL LETTER BE}': 'B',
        u'\N{CYRILLIC SMALL LETTER VE}': 'V',
        u'\N{CYRILLIC SMALL LETTER CHE}': 'CH',
        u'\N{CYRILLIC SMALL LETTER DE}': 'D',
        u'\N{CYRILLIC SMALL LETTER E}': 'EH0',
        u'\N{CYRILLIC SMALL LETTER EF}': 'F',
        u'\N{CYRILLIC SMALL LETTER GHE}': 'G',
        u'\N{CYRILLIC SMALL LETTER I}': 'IY0',
        u'\N{CYRILLIC SMALL LETTER HA}': 'HH',  # 'Y'?? 'K'??
        u'\N{CYRILLIC SMALL LETTER SHORT I}': 'IY0',
        u'\N{CYRILLIC SMALL LETTER KA}': 'K',
        u'\N{CYRILLIC SMALL LETTER EL}': 'L',
        u'\N{CYRILLIC SMALL LETTER EM}': 'M',
        u'\N{CYRILLIC SMALL LETTER EN}': 'N',
        u'\N{CYRILLIC SMALL LETTER O}': 'AO0',
        u'\N{CYRILLIC SMALL LETTER PE}': 'P',
        u'\N{CYRILLIC SMALL LETTER ER}': 'R',
        u'\N{CYRILLIC SMALL LETTER ES}': 'S',
        u'\N{CYRILLIC SMALL LETTER SHA}': 'SH',
        u'\N{CYRILLIC SMALL LETTER TE}': 'T',
        u'\N{CYRILLIC SMALL LETTER U}': 'UW0',
        u'\N{CYRILLIC SMALL LETTER HARD SIGN}': 'Y',
        u'\N{CYRILLIC SMALL LETTER SOFT SIGN}': 'Y',
        u'\N{CYRILLIC SMALL LETTER YERU}': 'IH0',
        u'\N{CYRILLIC SMALL LETTER ZHE}': 'ZH',
        u'\N{CYRILLIC SMALL LETTER ZE}': 'Z',
    }

    phonemes = []
    previous = ' '
    length = len(word)
    for pos, letter in enumerate(word):
        # Following letter, or '' at the end of the word.
        nxt = word[pos + 1] if pos + 1 < length else ''
        is_last = pos + 1 == length

        if letter == 'c':
            if nxt == 'h':
                if previous == 's':
                    phonemes.append('SH')  # "sch" as in "freSH CHeese"
                phonemes.append('CH')
            else:
                phonemes.append('T')
                phonemes.append('S')
        elif letter == 'b' and is_last:
            phonemes.append('P')
        elif letter == 'd' and is_last:
            phonemes.append('T')
        elif letter in ('e', u'\N{CYRILLIC SMALL LETTER IE}'):
            # A leading Y is added at the start of the word only.
            if pos == 0:
                phonemes.append('Y')
            phonemes.append('EH0')
        elif letter == '^':
            pass
        elif letter == 'g':
            if is_last:
                phonemes.append('K')
            elif previous in ('e', 'o') and pos + 2 == length and nxt == 'o':
                phonemes.append('V')  # possessive endings -ogo and -ego
            else:
                phonemes.append('G')
        elif letter == 'h':
            pass  # the letter before it (s, c, z) already looked ahead
        elif letter == 's':
            phonemes.append('SH' if nxt == 'h' else 'S')
        elif letter == 'v' and is_last:
            phonemes.append('F')
        elif letter == 'y':
            phonemes.append('Y' if nxt == 'a' else 'IH0')
        elif letter == 'z':
            if nxt == 'h':
                # "zh" closing the word is rendered as SH.
                phonemes.append('SH' if pos + 2 == length else 'ZH')
            else:
                phonemes.append('S' if is_last else 'Z')
        elif letter == u'\N{CYRILLIC SMALL LETTER SHCHA}':
            phonemes.append('SH')
        elif letter == u'\N{CYRILLIC SMALL LETTER TSE}':
            phonemes.append('T')
            phonemes.append('S')
        elif letter == u'\N{CYRILLIC SMALL LETTER YA}':
            if pos == 0:
                phonemes.append('IY0')
            phonemes.append('AA1')
        elif letter == u'\N{CYRILLIC SMALL LETTER YU}':
            if pos == 0:
                phonemes.append('Y')
            phonemes.append('UW0')
        elif letter in (u'\N{LATIN SMALL LETTER E WITH DIAERESIS}',
                        u'\N{CYRILLIC SMALL LETTER IO}'):
            if pos == 0:
                phonemes.append('Y')
            phonemes.append('AO0')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1:
            # hammer() is defined elsewhere; presumably it reduces the
            # character to a plain equivalent -- TODO confirm.
            if not recursive:
                fallback = breakdownWord(hammer(letter), True)
                if fallback:
                    # Append the first phoneme string; appending the list
                    # itself would put a nested list into the result.
                    phonemes.append(fallback[0])
        previous = letter

    # Collapse runs of the same phoneme into a single entry.
    collapsed = []
    for phoneme in phonemes:
        if not collapsed or collapsed[-1] != phoneme:
            collapsed.append(phoneme)
    return collapsed
def breakdownSwedishSyllable(word, recursive=False, phonetic=False):
    """Break a Swedish word or syllable into CMU-style phoneme codes.

    Every letter is mapped to zero, one or two phonemes depending on its
    neighbours (sk, sj, stj, tj, ng, ...), and runs of identical
    consecutive phonemes are collapsed before returning.

    word      -- text to convert; it is lower-cased first.
    recursive -- True when called from the fallback branch below, so an
                 unknown character is retried at most once.
    phonetic  -- when true, 'e' is always EH0 and a-umlaut is always AE0,
                 ignoring their position-dependent rules.

    Returns a list of phoneme strings (possibly empty).
    """
    word = word.lower()

    a_umlaut = u'\N{LATIN SMALL LETTER A WITH DIAERESIS}'
    o_umlaut = u'\N{LATIN SMALL LETTER O WITH DIAERESIS}'
    a_ring = u'\N{LATIN SMALL LETTER A WITH RING ABOVE}'

    # Letters that always map to the same phoneme.
    simple_convert = {
        u'\N{LATIN SMALL LETTER E WITH ACUTE}': 'EY0',
        'a': 'AH0',  # not exact - AO0 ??
        'b': 'B',
        'f': 'F',
        'm': 'M',
        'o': 'UH0',  # compromise, actually UW0 or AA0 (not), sometimes AO0
        'q': 'K',
        'v': 'V',
        'w': 'V',
        'z': 'S',
        a_ring: 'AO0',  # not exact
        o_umlaut: 'ER0',
    }
    soft_vowels = ('e', 'i', 'y', a_umlaut, o_umlaut)
    hard_vowels = ('a', 'o', 'u', a_ring)
    # Unicode literals instead of unicode(..., input_encoding): no reliance
    # on the Python 2 builtin, on the source-file encoding, or on a global
    # defined elsewhere.
    sh_for_k_words = (u'm' + a_umlaut + u'nniska', u'm' + a_umlaut + u'nniskor')

    phonemes = []
    previous = ' '
    length = len(word)
    for pos, letter in enumerate(word):
        # Next two letters, or '' past the end of the word.
        nxt = word[pos + 1] if pos + 1 < length else ''
        nxt2 = word[pos + 2] if pos + 2 < length else ''
        is_last = pos + 1 == length

        if letter == 'c':
            if nxt == 'c':
                pass  # cc, handled when the second c is reached
            elif previous == 'c' and nxt in soft_vowels:
                phonemes.append('K')
                phonemes.append('S')
            elif nxt in soft_vowels:
                phonemes.append('S')
            elif nxt == 'h':
                phonemes.append('SH')  # sometimes 'K' as in English 'chorus', but no rule
            else:
                phonemes.append('K')
        elif letter == 'd':
            if not (pos == 0 and nxt == 'j'):  # leading dj sounds like j alone
                phonemes.append('D')
        elif letter == 'e':
            if not phonetic and pos + 2 == length and nxt == 'r':  # ends in er
                phonemes.append('AE0')
            else:
                phonemes.append('EH0')  # sometimes 'IY0', sometimes 'EY0'
        elif letter == 'g':
            if previous in ('l', 'r'):
                phonemes.append('Y')
            elif nxt == 'i' and nxt2 == o_umlaut:
                phonemes.append('SH')
            elif nxt == 'n' and previous in ('a', 'o', 'u') + soft_vowels:
                phonemes.append('NG')
            elif previous == 'n':  # ng
                phonemes.append('NG')
            elif nxt == 'j':  # gj
                pass  # same as 'j' alone
            elif pos + 2 == length and nxt == 'e':
                # ends in 'ge' - French loan-word such as garage ?
                phonemes.append('SH')
            elif pos == 0 and nxt in soft_vowels:
                # ??? if e is unstressed (how to tell?), pronounce as 'G'
                phonemes.append('Y')
            elif pos == 0 and nxt in hard_vowels:
                phonemes.append('G')
            elif previous == 'g':
                pass
            else:
                phonemes.append('G')
        elif letter == 'h':
            if previous == 'c':
                pass  # handled under c
            elif nxt == 'j':
                pass  # same as 'j' alone
            elif pos == 1 and previous == 's':  # probably a foreign loan-word
                phonemes.append('SH')
            else:
                phonemes.append('HH')
        elif letter == 'i':
            if previous == 'g' and nxt == o_umlaut:
                pass
            elif previous == 's' and nxt == 'o':  # sio e.g. mission
                phonemes.append('UH0')
            else:
                phonemes.append('IY0')  # sometimes 'IH0'
        elif letter == 'k':  # must come before j to handle the skj sound
            if pos == 0 and word in (u'kefir', u'kex', u'kille', u'kis',
                                     u'kissa', u'kisse'):
                phonemes.append('K')
            elif pos == 0 and nxt in soft_vowels:
                phonemes.append('CH')
            elif word in sh_for_k_words:
                phonemes.append('SH')
            elif is_last and previous == 's':  # ends in sk
                phonemes.append('S')
                phonemes.append('K')
            elif nxt == 'j':
                phonemes.append('CH')  # more Finnish-Swedish than Swedish ???
            elif is_last and previous == 'c':
                pass
            elif previous == 's' and nxt in hard_vowels:
                phonemes.append('S')
                phonemes.append('K')
            elif previous == 's' and pos == 1:  # sk at beginning of word
                phonemes.append('SH')
            else:
                phonemes.append('K')
        elif letter == 't':  # must come before j to handle the stj sound
            if previous == 's' and nxt == 'j':
                phonemes.append('SH')
            if previous == 't' and is_last:
                pass
            elif nxt == 'j':  # tj
                pass  # handled under j
            else:
                phonemes.append('T')
        elif letter == 'j':
            if previous == 's':
                phonemes.append('SH')
            elif previous == 't':
                # Guard pos >= 2: when 'tj' starts the word, word[pos - 2]
                # would wrap around to the end of the word.
                if pos >= 2 and word[pos - 2] == 's':
                    pass  # stj, handled under 't'
                else:
                    phonemes.append('CH')
            elif previous == 'k':
                pass  # handled under k
            else:
                phonemes.append('Y')
        elif letter == 'l':
            if nxt != 'j':  # lj sounds like 'j' alone
                phonemes.append('L')
        elif letter == 'n':
            if nxt == 'g':
                pass  # handled under 'g'
            elif nxt == 'k':
                phonemes.append('NG')
            else:
                phonemes.append('N')
        elif letter == 'p':
            if previous != 'p':
                phonemes.append('P')
        elif letter == 'r':
            if nxt != 's':  # rs is handled under s
                phonemes.append('R')
        elif letter == 's':
            if nxt == 'c' and nxt2 == 'h':
                pass  # handled under 'c'
            elif nxt == 't' and nxt2 == 'j':
                pass  # handled under 't'
            elif nxt in ('k', 'j', 's'):
                pass  # handled under 'k' / 'j' / the following 's'
            elif nxt == 'i' and nxt2 == 'o':  # might need more breakdown
                phonemes.append('SH')
            elif pos == 0 and nxt == 'h':
                pass  # handled under 'h'
            elif previous == 'r':
                phonemes.append('SH')  # not entirely accurate, use HH ??
            else:
                phonemes.append('S')
        elif letter == 'u':
            # UW0 is inaccurate, but there is no accurate CMU equivalent.
            phonemes.append('V' if previous == 'q' else 'UW0')
        elif letter == 'x':
            phonemes.append('K')
            phonemes.append('S')
        elif letter == 'y':
            if word in (u'yoga', u'yoghurt'):
                phonemes.append('Y')
            elif word == u'fyrtio':
                phonemes.append('ER0')
            else:
                phonemes.append('UW0')  # not exact
        elif letter == a_umlaut:
            # Neither value is exact, and exceptions are skipped.
            phonemes.append('AE0' if phonetic or nxt == 'r' else 'EH0')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1:
            # hammer() is defined elsewhere; presumably it reduces the
            # character to a plain equivalent -- TODO confirm.
            if not recursive:
                phon = " ".join(
                    breakdownSwedishSyllable(hammer(letter), True, phonetic))
                if phon:
                    phonemes.append(phon)
        previous = letter

    # Collapse runs of the same phoneme into a single entry.
    collapsed = []
    for phoneme in phonemes:
        if not collapsed or collapsed[-1] != phoneme:
            collapsed.append(phoneme)
    return collapsed
def breakdownWord(input_word, recursive=False):
    """Break a word into a list of phoneme codes, one letter at a time.

    The letter groups handled here (ci, ce, cci, gi, gli, ch) and the
    original Italian comment suggest Italian orthography.  Letters without
    a context-dependent rule are looked up in the module-level
    unconditional_conversions table, which is defined elsewhere.

    input_word -- text to convert; it is lower-cased first.
    recursive  -- True when called from the fallback branch below, so an
                  unknown character is retried at most once.

    Returns a list of phoneme strings (possibly empty).  Consecutive
    duplicates are not collapsed.
    """
    # Lower-casing every word reduces the number of combinations to handle.
    word = input_word.lower()
    length = len(word)

    i_acute = u'\N{LATIN SMALL LETTER I WITH ACUTE}'
    i_grave = u'\N{LATIN SMALL LETTER I WITH GRAVE}'

    breakdown_word = []
    previous = u''
    for word_index, letter in enumerate(word):
        # Look-ahead letters, or u'' past the end of the word.  The earlier
        # guards compared word_index against len(word) and then read
        # word[word_index + 1], raising IndexError on words ending in
        # 'c', 'g' or 'gl'.
        nxt = word[word_index + 1] if word_index + 1 < length else u''
        nxt2 = word[word_index + 2] if word_index + 2 < length else u''

        if letter == u'c':
            # ci, ce, cci
            if nxt in (u'i', u'e') or (nxt == u'c' and nxt2 == u'i'):
                breakdown_word.append('EH0')
            else:
                breakdown_word.append('K')
        elif letter == u'g':
            # gi, gli
            if nxt == u'i' or (nxt == u'l' and nxt2 == u'i'):
                breakdown_word.append('JH')
            else:
                breakdown_word.append('G')
        elif letter == u'i':
            # the i of ci / gi was consumed by the preceding consonant
            if previous not in (u'c', u'g'):
                breakdown_word.append('EH0')
        elif letter in (i_acute, i_grave):
            # accented i after c / g is consumed the same way
            if previous not in (u'c', u'g'):
                breakdown_word.append('EH1')
        elif letter == u'h':
            # ch: the h is silent
            if previous != u'c':
                breakdown_word.append('HH')
        elif letter == u'j':
            breakdown_word.append('JH' if word_index > 0 else 'EH0')
        elif letter == u'l':
            # gli: the l was consumed by the preceding g
            if not (previous == u'g' and nxt == u'i'):
                breakdown_word.append('L')
        elif letter in unconditional_conversions:
            breakdown_word.append(unconditional_conversions[letter])
        elif letter == " ":
            pass
        elif len(hammer(letter)) == 1:
            # hammer() is defined elsewhere; presumably it reduces the
            # character to a plain equivalent -- TODO confirm.
            if not recursive:
                fallback = breakdownWord(hammer(letter), True)
                if fallback:
                    breakdown_word.append(fallback[0])
        previous = letter
    return breakdown_word
def breakdownWord(word, recursive=False):
    """Break a word into a list of CMU-style phoneme codes.

    The letter rules (dotless i, c and s with cedilla, g with breve,
    circumflexed a/u) appear to target Turkish orthography.  Every letter
    is mapped to zero, one or two phonemes depending on its neighbours, and
    runs of identical consecutive phonemes are collapsed before returning.

    word      -- text to convert; it is lower-cased first.
    recursive -- True when called from the fallback branch below, so an
                 unknown character is retried at most once.

    Returns a list of phoneme strings (possibly empty).
    """
    word = word.lower()

    a_circumflex = u'\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}'
    u_circumflex = u'\N{LATIN SMALL LETTER U WITH CIRCUMFLEX}'
    o_diaeresis = u'\N{LATIN SMALL LETTER O WITH DIAERESIS}'
    u_diaeresis = u'\N{LATIN SMALL LETTER U WITH DIAERESIS}'
    dotless_i = u'\N{LATIN SMALL LETTER DOTLESS I}'

    # A frozenset membership test replaces dict.has_key, which was removed
    # in Python 3 (and works unchanged on Python 2).
    vowels = frozenset(u'aeiou' + dotless_i + o_diaeresis + u_diaeresis
                       + a_circumflex + u_circumflex)

    # Letters that always map to the same phoneme.
    simple_convert = {
        'b': 'B',
        'c': 'JH',
        u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'CH',
        'd': 'D',
        'f': 'F',
        'g': 'G',
        'h': 'HH',
        dotless_i: 'AH0',
        'i': 'IY0',
        'j': 'ZH',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'n': 'N',
        o_diaeresis: 'ER0',
        'p': 'P',
        'r': 'R',
        's': 'S',
        u'\N{LATIN SMALL LETTER S WITH CEDILLA}': 'SH',
        't': 'T',
        u_diaeresis: 'UW0',  # IH0?
        'w': 'V',  # loan-words
        'z': 'Z',
    }
    # Consonants after which a circumflexed vowel gets a leading IY0.
    palatalising = ('g', 'k', 'l')

    phonemes = []
    previous = ' '
    length = len(word)
    for pos, letter in enumerate(word):
        # Following letter, or '' at the end of the word.
        nxt = word[pos + 1] if pos + 1 < length else ''

        if letter == 'a':
            phonemes.append('AY0' if nxt == 'y' else 'AA0')
        elif letter == a_circumflex:
            if previous in palatalising:
                phonemes.append('IY0')
            phonemes.append('AA0')
        elif letter == 'e':
            phonemes.append('EY0' if nxt == 'y' else 'EH0')
        elif letter == u'\N{LATIN SMALL LETTER G WITH BREVE}':
            pass  # produces no phoneme of its own
        elif letter == 'n':
            # 'n' in front of 'b' is rendered as M.
            phonemes.append('M' if nxt == 'b' else 'N')
        elif letter == 'o':
            phonemes.append('OY0' if nxt == 'y' else 'OW0')
        elif letter == 'q':  # loan-words
            phonemes.append('K')
        elif letter == u_circumflex:
            if previous in palatalising:
                phonemes.append('IY0')
            phonemes.append('UW0')
        elif letter == 'u':
            phonemes.append('IY0' if nxt == 'y' else 'UH0')
        elif letter == 'v':
            phonemes.append('W' if previous in vowels else 'V')
        elif letter == 'x':  # loan-words
            phonemes.append('K')
            phonemes.append('S')
        elif letter == 'y':
            # After these vowels the 'y' was already folded into the
            # diphthong emitted for the vowel itself.
            if previous not in ('a', 'e', 'o', 'u', o_diaeresis):
                phonemes.append('Y')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1:
            # hammer() is defined elsewhere; presumably it reduces the
            # character to a plain equivalent -- TODO confirm.
            if not recursive:
                fallback = breakdownWord(hammer(letter), True)
                if fallback:
                    phonemes.append(fallback[0])
        previous = letter

    # Collapse runs of the same phoneme into a single entry.
    collapsed = []
    for phoneme in phonemes:
        if not collapsed or collapsed[-1] != phoneme:
            collapsed.append(phoneme)
    return collapsed
def breakdownWord(word, recursive=False):
    """Break a word into a list of ARPAbet-style phoneme codes.

    The rules cover the Latin alphabet plus ae-ligature, o-with-stroke and
    a-with-ring (the letters used by Norwegian; the inline examples such as
    "hjem", "kirke" and "norsk" are Norwegian words).

    Args:
        word: The text to convert; it is lower-cased before processing.
        recursive: True when called from the fallback branch below, which
            prevents a second level of fallback recursion.

    Returns:
        A list of phoneme strings with consecutive duplicates collapsed.
    """
    word = word.lower()
    # ``dict.has_key`` was removed in Python 3; ``__contains__`` is the
    # equivalent bound method and works on both Python 2 and Python 3.
    isvowel = dict.fromkeys(
        u'aeiouy\N{LATIN SMALL LETTER A WITH RING ABOVE}'
        u'\N{LATIN SMALL LETTER AE}\N{LATIN SMALL LETTER O WITH STROKE}'
    ).__contains__
    phonemes = []
    simple_convert = {
        'b': 'B',
        'c': 'S',
        'f': 'F',
        'm': 'M',
        'p': 'P',
        'r': 'R',
        't': 'T',
        'v': 'V',
        'w': 'V',
        'z': 'S',
    }
    short_vowels = {
        u'a': 'AA0',
        u'e': 'EH0',
        u'i': 'IH0',
        u'o': 'UH0',
        u'u': 'UH0',
        u'y': 'IH0',
        u'\N{LATIN SMALL LETTER AE}': 'AE0',
        u'\N{LATIN SMALL LETTER O WITH STROKE}': 'AH0',
        u'\N{LATIN SMALL LETTER A WITH RING ABOVE}': 'AA0'
    }
    long_vowels = {
        u'a': 'AA0',
        u'e': 'EY0',
        u'i': 'IY0',
        u'o': 'OW0',
        u'u': 'UW0',
        u'y': 'IY0',
        u'\N{LATIN SMALL LETTER AE}': 'AE0',
        u'\N{LATIN SMALL LETTER O WITH STROKE}': 'ER0',
        u'\N{LATIN SMALL LETTER A WITH RING ABOVE}': 'AO0'
    }
    word_length = len(word)
    previous = ' '
    for pos, letter in enumerate(word):
        at_end = pos + 1 == word_length
        # '' stands for "no such letter" and never equals a real letter.
        next_letter = word[pos + 1] if pos + 1 < word_length else ''
        after_next = word[pos + 2] if pos + 2 < word_length else ''
        if isvowel(letter):
            # NOTE(review): the original opened with a test
            # ``word[pos+1] == 'r' and word[pos+1] == 'd'`` that could never be
            # true, and repeated the 'o'+'i' test a second time (emitting
            # UW0 IY0). Both were unreachable and are omitted; outputs are
            # unchanged by their removal.
            if letter == 'a' and next_letter == 'i':
                phonemes.append('AY0')
            elif letter == 'a' and next_letter == 'u':
                phonemes.append('AW0')
            elif letter == 'e' and next_letter == 'i':
                phonemes.append('AY0')
            elif letter == 'e' and next_letter == 'r':
                phonemes.append('AE0')
            elif letter == 'o' and next_letter == 'i':
                phonemes.append('OY0')
            elif letter == u'\N{LATIN SMALL LETTER O WITH STROKE}' and next_letter == 'y':
                phonemes.append('OW0')
                phonemes.append('IY0')
            elif pos + 2 == word_length and next_letter == 'm':
                # vowel + final 'm' is short
                phonemes.append(short_vowels[letter])
            elif after_next != '' and next_letter == after_next and not isvowel(next_letter):
                # vowel before a doubled consonant is short
                phonemes.append(short_vowels[letter])
            else:
                # every remaining case (including final "-rd") is long
                phonemes.append(long_vowels[letter])
        elif letter == 'd':
            if at_end and previous == 'r':  # ends in d, e.g. jord
                pass
            elif at_end and isvowel(previous):  # ends in long vowel then d, e.g. god
                pass
            elif previous in ('l', 'n'):  # holde, land
                pass
            else:
                phonemes.append('D')
        elif letter == 'g':
            if next_letter == 'j':  # gjaer
                pass  # handled as a normal j
            elif at_end and previous == 'i':  # aerlig
                pass  # silent at end of word
            elif previous == 'n':
                pass  # handled under n
            elif next_letter in ('i', 'y'):
                phonemes.append('Y')
            elif next_letter == 'e' and after_next == 'i':
                phonemes.append('Y')
            else:
                phonemes.append('G')
        elif letter == 'h':
            if next_letter == 'j':  # hjem
                pass  # handled as a normal j
            # This was a detached ``if`` in the original, so "hj" fell through
            # to the ``else`` and still emitted HH despite the comment above.
            elif next_letter == 'v':  # hver
                pass  # handled as a normal v
            else:
                phonemes.append('HH')
        elif letter == 'j':
            if previous == 'k':
                pass  # handled under k
            elif previous == 's':
                pass  # handled under s
            else:
                phonemes.append('Y')
        elif letter == 'k':
            if previous == 's' and next_letter in (
                    u'j', u'i', u'y', u'\N{LATIN SMALL LETTER O WITH STROKE}'):
                phonemes.append('SH')  # skjaere, ski, skoyter
            elif previous == 's':
                phonemes.append('S')
                phonemes.append('K')
            elif next_letter in (u'i', u'y'):  # kirke, kyss
                phonemes.append('HH')
            elif next_letter == 'j':  # kjonn
                phonemes.append('HH')
            else:
                phonemes.append('K')  # kaffe
        elif letter == 'l':
            if next_letter == 'j':  # ljug
                pass  # handled as a normal j
            else:
                phonemes.append('L')
        elif letter == 'n':
            if next_letter == 'g':  # fang
                phonemes.append('NG')
            else:
                phonemes.append('N')  # ni
        elif letter == 'q':  # foreign language loan-words?
            phonemes.append('K')
            phonemes.append('UW0')
        elif letter == 's':
            if previous == 'r':
                phonemes.append('SH')  # Eastern Norway - norsk, person, for sent
            elif next_letter == 'k':
                pass  # handled under k
            elif next_letter == 'j':  # sjo
                phonemes.append('SH')
            elif next_letter == 'l':
                phonemes.append('SH')  # informal usage
            else:
                phonemes.append('S')  # syv
        elif letter == 'x':
            phonemes.append('K')
            phonemes.append('S')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == " ":
            pass
        elif len(hammer(letter)) == 1:
            # Fallback for unknown letters: ``hammer`` is defined elsewhere in
            # this file (presumably it reduces a character to a plain
            # equivalent - confirm). The whole space-joined result is appended
            # as a single entry, exactly as in the original.
            if not recursive:
                phon = " ".join(breakdownWord(hammer(letter), True))
                if phon:
                    phonemes.append(phon)
        previous = letter

    # Collapse runs of identical phonemes (e.g. doubled consonants).
    temp_phonemes = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            temp_phonemes.append(phoneme)
        previous_phoneme = phoneme
    return temp_phonemes
def breakdownWord(word, recursive=False):
    """Turn a word into a list of ARPAbet-style phoneme codes.

    Handles Cyrillic letters (including the Ukrainian-specific ones) and a
    Latin transliteration in which 'ch', 'sh' and 'zh' are digraphs and 'j'
    softens a following 'i' or 'u'.

    Args:
        word: The text to convert; it is lower-cased first.
        recursive: True when invoked from the fallback branch, which stops
            the fallback from recursing a second time.

    Returns:
        The phoneme list with runs of identical neighbours collapsed to one.
    """
    text = word.lower()

    # Vowels consulted by the capital YU / YA rule; every lower-case entry is
    # paired with its upper-case form.
    lower_vowels = (
        'a', 'e', 'i', 'o', 'u',
        u'\N{CYRILLIC SMALL LETTER A}',
        u'\N{CYRILLIC SMALL LETTER UKRAINIAN IE}',
        u'\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}',
        u'\N{CYRILLIC SMALL LETTER I}',
        u'\N{CYRILLIC SMALL LETTER SHORT I}',
        u'\N{CYRILLIC SMALL LETTER O}',
        u'\N{CYRILLIC SMALL LETTER U}',
    )
    vowel_letters = lower_vowels + tuple(ch.upper() for ch in lower_vowels)

    # One-letter, one-phoneme conversions. Latin entries exist in lower case
    # only; the entries in ``cased_sounds`` are registered in both cases.
    letter_sounds = {
        'a': 'AA0', 'b': 'B', 'v': 'V', 'g': 'G', 'd': 'D', 'e': 'EH0',
        'j': 'Y', 'y': 'IH0', 'k': 'K', 'l': 'L', 'm': 'M', 'n': 'N',
        'o': 'AO0', 'p': 'P', 'r': 'R', 't': 'T', 'f': 'F',
        'x': 'HH',  # use 'Y' ?? 'K'??
    }
    cased_sounds = {
        u'\N{LATIN SMALL LETTER S WITH CARON}': 'SH',
        u'\N{LATIN SMALL LETTER Z WITH CARON}': 'ZH',
        u'\N{CYRILLIC SMALL LETTER A}': 'AA0',
        u'\N{CYRILLIC SMALL LETTER BE}': 'B',
        u'\N{CYRILLIC SMALL LETTER VE}': 'V',
        u'\N{CYRILLIC SMALL LETTER GHE}': 'G',
        u'\N{CYRILLIC SMALL LETTER GHE WITH UPTURN}': 'G',
        u'\N{CYRILLIC SMALL LETTER DE}': 'D',
        u'\N{CYRILLIC SMALL LETTER IE}': 'EH0',
        u'\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}': 'IH0',
        u'\N{CYRILLIC SMALL LETTER ZHE}': 'ZH',
        u'\N{CYRILLIC SMALL LETTER ZE}': 'Z',
        u'\N{CYRILLIC SMALL LETTER SHORT I}': 'IY0',  # 'Y' ?
        u'\N{CYRILLIC SMALL LETTER I}': 'IY0',
        u'\N{CYRILLIC SMALL LETTER KA}': 'K',
        u'\N{CYRILLIC SMALL LETTER EL}': 'L',
        u'\N{CYRILLIC SMALL LETTER EM}': 'M',
        u'\N{CYRILLIC SMALL LETTER EN}': 'N',
        u'\N{CYRILLIC SMALL LETTER O}': 'AO0',
        u'\N{CYRILLIC SMALL LETTER PE}': 'P',
        u'\N{CYRILLIC SMALL LETTER ER}': 'R',
        u'\N{CYRILLIC SMALL LETTER ES}': 'S',
        u'\N{CYRILLIC SMALL LETTER TE}': 'T',
        u'\N{CYRILLIC SMALL LETTER U}': 'UH0',
        u'\N{CYRILLIC SMALL LETTER EF}': 'F',
        u'\N{CYRILLIC SMALL LETTER HA}': 'HH',  # 'Y'?? 'K'??
        u'\N{CYRILLIC SMALL LETTER CHE}': 'CH',
        u'\N{CYRILLIC SMALL LETTER SHA}': 'SH',
        u'\N{CYRILLIC SMALL LETTER SHCHA}': 'SH',
    }
    for small, sound in cased_sounds.items():
        letter_sounds[small] = sound
        letter_sounds[small.upper()] = sound

    # Letters that always expand to a fixed pair of phonemes. These are
    # checked before ``letter_sounds``, so SHCHA yields SH + CH here.
    two_sound_letters = {
        u'\N{CYRILLIC SMALL LETTER SHCHA}': ('SH', 'CH'),
        u'\N{CYRILLIC SMALL LETTER TSE}': ('T', 'S'),
        u'\N{CYRILLIC SMALL LETTER YA}': ('Y', 'AO0'),  # not if unstressed?
        u'\N{CYRILLIC SMALL LETTER YU}': ('Y', 'UW0'),
        u'\N{CYRILLIC SMALL LETTER YI}': ('Y', 'IY0'),
        u'\N{CYRILLIC SMALL LETTER UKRAINIAN IE}': ('Y', 'EH0'),
        u'\N{CYRILLIC CAPITAL LETTER SHCHA}': ('SH', 'CH'),
        u'\N{CYRILLIC CAPITAL LETTER TSE}': ('T', 'S'),
        u'\N{CYRILLIC CAPITAL LETTER YI}': ('Y', 'IY0'),
        u'\N{CYRILLIC CAPITAL LETTER UKRAINIAN IE}': ('Y', 'EH0'),
    }
    # Capital YU / YA: (vowel used after 'Y', vowel used on its own).
    context_capitals = {
        u'\N{CYRILLIC CAPITAL LETTER YU}': ('UW0', 'UH0'),
        u'\N{CYRILLIC CAPITAL LETTER YA}': ('AO0', 'AA0'),
    }
    # Latin letters that fuse with a following 'h': (fused, standalone).
    h_pairs = {'c': ('CH', None), 's': ('SH', 'S'), 'z': ('ZH', 'Z')}
    # Latin vowels that change after 'j': (after j, otherwise).
    j_pairs = {'i': ('IY0', 'IH0'), 'u': ('UW0', 'UH0')}

    raw = []
    for index, char in enumerate(text):
        before = text[index - 1] if index else ' '
        following = text[index + 1:index + 2]  # '' at the end of the word
        if char in h_pairs:
            fused, alone = h_pairs[char]
            if following == 'h':
                raw.append(fused)
            elif alone is not None:
                raw.append(alone)
        elif char in j_pairs:
            soft, plain = j_pairs[char]
            raw.append(soft if before == 'j' else plain)
        elif char == 'h':
            # silent when it completes a 'zh' / 'sh' / 'ch' digraph
            if before not in ('z', 's', 'c'):
                raw.append('HH')
        elif char in two_sound_letters:
            raw.extend(two_sound_letters[char])
        elif char in context_capitals:
            with_glide, bare = context_capitals[char]
            if before in vowel_letters or before == "'" or index == 0:
                raw.append('Y')
                raw.append(with_glide)
            else:
                raw.append(bare)
        elif char in letter_sounds:
            raw.append(letter_sounds[char])
        elif char == ' ':
            continue
        elif len(hammer(char)) == 1:
            # Unknown letter: retry once on whatever ``hammer`` (defined
            # elsewhere in this file) returns and keep the first phoneme.
            if not recursive:
                fallback = breakdownWord(hammer(char), True)
                if fallback:
                    raw.append(fallback[0])

    # Drop immediate repeats such as doubled consonants.
    collapsed = []
    last_sound = " "
    for sound in raw:
        if sound == last_sound:
            continue
        collapsed.append(sound)
        last_sound = sound
    return collapsed
def breakdownWord(word, recursive=False):
    """Turn a word into a list of ARPAbet-style phoneme codes.

    Handles Cyrillic letters (including the Ukrainian-specific ones) and a
    Latin transliteration in which 'ch', 'sh' and 'zh' are digraphs and 'j'
    softens a following 'i' or 'u'.

    Args:
        word: The text to convert; it is lower-cased first.
        recursive: True when invoked from the fallback branch, which stops
            the fallback from recursing a second time.

    Returns:
        The phoneme list with runs of identical neighbours collapsed to one.
    """
    text = word.lower()

    # Vowels consulted by the capital YU / YA rule; every lower-case entry is
    # paired with its upper-case form.
    lower_vowels = (
        'a', 'e', 'i', 'o', 'u',
        u'\N{CYRILLIC SMALL LETTER A}',
        u'\N{CYRILLIC SMALL LETTER UKRAINIAN IE}',
        u'\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}',
        u'\N{CYRILLIC SMALL LETTER I}',
        u'\N{CYRILLIC SMALL LETTER SHORT I}',
        u'\N{CYRILLIC SMALL LETTER O}',
        u'\N{CYRILLIC SMALL LETTER U}',
    )
    vowel_letters = lower_vowels + tuple(ch.upper() for ch in lower_vowels)

    # One-letter, one-phoneme conversions. Latin entries exist in lower case
    # only; the entries in ``cased_sounds`` are registered in both cases.
    letter_sounds = {
        'a': 'AA0', 'b': 'B', 'v': 'V', 'g': 'G', 'd': 'D', 'e': 'EH0',
        'j': 'Y', 'y': 'IH0', 'k': 'K', 'l': 'L', 'm': 'M', 'n': 'N',
        'o': 'AO0', 'p': 'P', 'r': 'R', 't': 'T', 'f': 'F',
        'x': 'HH',  # use 'Y' ?? 'K'??
    }
    cased_sounds = {
        u'\N{LATIN SMALL LETTER S WITH CARON}': 'SH',
        u'\N{LATIN SMALL LETTER Z WITH CARON}': 'ZH',
        u'\N{CYRILLIC SMALL LETTER A}': 'AA0',
        u'\N{CYRILLIC SMALL LETTER BE}': 'B',
        u'\N{CYRILLIC SMALL LETTER VE}': 'V',
        u'\N{CYRILLIC SMALL LETTER GHE}': 'G',
        u'\N{CYRILLIC SMALL LETTER GHE WITH UPTURN}': 'G',
        u'\N{CYRILLIC SMALL LETTER DE}': 'D',
        u'\N{CYRILLIC SMALL LETTER IE}': 'EH0',
        u'\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}': 'IH0',
        u'\N{CYRILLIC SMALL LETTER ZHE}': 'ZH',
        u'\N{CYRILLIC SMALL LETTER ZE}': 'Z',
        u'\N{CYRILLIC SMALL LETTER SHORT I}': 'IY0',  # 'Y' ?
        u'\N{CYRILLIC SMALL LETTER I}': 'IY0',
        u'\N{CYRILLIC SMALL LETTER KA}': 'K',
        u'\N{CYRILLIC SMALL LETTER EL}': 'L',
        u'\N{CYRILLIC SMALL LETTER EM}': 'M',
        u'\N{CYRILLIC SMALL LETTER EN}': 'N',
        u'\N{CYRILLIC SMALL LETTER O}': 'AO0',
        u'\N{CYRILLIC SMALL LETTER PE}': 'P',
        u'\N{CYRILLIC SMALL LETTER ER}': 'R',
        u'\N{CYRILLIC SMALL LETTER ES}': 'S',
        u'\N{CYRILLIC SMALL LETTER TE}': 'T',
        u'\N{CYRILLIC SMALL LETTER U}': 'UH0',
        u'\N{CYRILLIC SMALL LETTER EF}': 'F',
        u'\N{CYRILLIC SMALL LETTER HA}': 'HH',  # 'Y'?? 'K'??
        u'\N{CYRILLIC SMALL LETTER CHE}': 'CH',
        u'\N{CYRILLIC SMALL LETTER SHA}': 'SH',
        u'\N{CYRILLIC SMALL LETTER SHCHA}': 'SH',
    }
    for small, sound in cased_sounds.items():
        letter_sounds[small] = sound
        letter_sounds[small.upper()] = sound

    # Letters that always expand to a fixed pair of phonemes. These are
    # checked before ``letter_sounds``, so SHCHA yields SH + CH here.
    two_sound_letters = {
        u'\N{CYRILLIC SMALL LETTER SHCHA}': ('SH', 'CH'),
        u'\N{CYRILLIC SMALL LETTER TSE}': ('T', 'S'),
        u'\N{CYRILLIC SMALL LETTER YA}': ('Y', 'AO0'),  # not if unstressed?
        u'\N{CYRILLIC SMALL LETTER YU}': ('Y', 'UW0'),
        u'\N{CYRILLIC SMALL LETTER YI}': ('Y', 'IY0'),
        u'\N{CYRILLIC SMALL LETTER UKRAINIAN IE}': ('Y', 'EH0'),
        u'\N{CYRILLIC CAPITAL LETTER SHCHA}': ('SH', 'CH'),
        u'\N{CYRILLIC CAPITAL LETTER TSE}': ('T', 'S'),
        u'\N{CYRILLIC CAPITAL LETTER YI}': ('Y', 'IY0'),
        u'\N{CYRILLIC CAPITAL LETTER UKRAINIAN IE}': ('Y', 'EH0'),
    }
    # Capital YU / YA: (vowel used after 'Y', vowel used on its own).
    context_capitals = {
        u'\N{CYRILLIC CAPITAL LETTER YU}': ('UW0', 'UH0'),
        u'\N{CYRILLIC CAPITAL LETTER YA}': ('AO0', 'AA0'),
    }
    # Latin letters that fuse with a following 'h': (fused, standalone).
    h_pairs = {'c': ('CH', None), 's': ('SH', 'S'), 'z': ('ZH', 'Z')}
    # Latin vowels that change after 'j': (after j, otherwise).
    j_pairs = {'i': ('IY0', 'IH0'), 'u': ('UW0', 'UH0')}

    raw = []
    for index, char in enumerate(text):
        before = text[index - 1] if index else ' '
        following = text[index + 1:index + 2]  # '' at the end of the word
        if char in h_pairs:
            fused, alone = h_pairs[char]
            if following == 'h':
                raw.append(fused)
            elif alone is not None:
                raw.append(alone)
        elif char in j_pairs:
            soft, plain = j_pairs[char]
            raw.append(soft if before == 'j' else plain)
        elif char == 'h':
            # silent when it completes a 'zh' / 'sh' / 'ch' digraph
            if before not in ('z', 's', 'c'):
                raw.append('HH')
        elif char in two_sound_letters:
            raw.extend(two_sound_letters[char])
        elif char in context_capitals:
            with_glide, bare = context_capitals[char]
            if before in vowel_letters or before == "'" or index == 0:
                raw.append('Y')
                raw.append(with_glide)
            else:
                raw.append(bare)
        elif char in letter_sounds:
            raw.append(letter_sounds[char])
        elif char == ' ':
            continue
        elif len(hammer(char)) == 1:
            # Unknown letter: retry once on whatever ``hammer`` (defined
            # elsewhere in this file) returns and keep the first phoneme.
            if not recursive:
                fallback = breakdownWord(hammer(char), True)
                if fallback:
                    raw.append(fallback[0])

    # Drop immediate repeats such as doubled consonants.
    collapsed = []
    last_sound = " "
    for sound in raw:
        if sound == last_sound:
            continue
        collapsed.append(sound)
        last_sound = sound
    return collapsed
def breakdownWord(word, recursive=False):
    """Break a word into a list of ARPAbet-style phoneme codes.

    The rules cover the Latin alphabet plus acute, diaeresis and double-acute
    vowels, and treat cs, sz, zs, dz, dzs, gy, ly, ny and ty as multi-letter
    units (the digraph set of Hungarian orthography).

    Args:
        word: The text to convert; it is lower-cased before processing.
        recursive: True when called from the fallback branch below, which
            prevents a second level of fallback recursion.

    Returns:
        A list of phoneme strings with consecutive duplicates collapsed.
    """
    word = word.lower()
    phonemes = []
    simple_convert = {
        'b': 'B',
        u'\N{LATIN SMALL LETTER E WITH ACUTE}': 'EY0',
        'f': 'F',
        'h': 'HH',
        u'\N{LATIN SMALL LETTER I WITH ACUTE}': 'IY0',
        'k': 'K',
        'm': 'M',
        'n': 'N',
        u'\N{LATIN SMALL LETTER O WITH ACUTE}': 'OW0',  # ER0 ? AO0 ?
        'p': 'P',
        'r': 'R',
        't': 'T',
        'u': 'UW0',
        u'\N{LATIN SMALL LETTER U WITH ACUTE}': 'UW0',
        u'\N{LATIN SMALL LETTER O WITH DIAERESIS}': 'ER0',
        u'\N{LATIN SMALL LETTER O WITH DOUBLE ACUTE}': 'ER0',
        u'\N{LATIN SMALL LETTER U WITH DIAERESIS}': 'UW0',  # IH0?
        u'\N{LATIN SMALL LETTER U WITH DOUBLE ACUTE}': 'UW0',
        'v': 'V',
        'w': 'V',
    }
    a_acute = u'\N{LATIN SMALL LETTER A WITH ACUTE}'
    word_length = len(word)
    previous = ' '
    for pos, letter in enumerate(word):
        # '' stands for "no such letter" and never equals a real letter.
        next_letter = word[pos + 1] if pos + 1 < word_length else ''
        after_next = word[pos + 2] if pos + 2 < word_length else ''
        # vowel + i/j/y, or vowel + "ly", forms a diphthong
        glide_follows = (next_letter in ('i', 'j', 'y')
                         or (next_letter == 'l' and after_next == 'y'))
        if letter == 'a':
            phonemes.append('OY0' if glide_follows else 'AO0')
        elif letter == a_acute:
            phonemes.append('AY0' if glide_follows else 'AA0')
        elif letter == 'c':
            if next_letter == 's':  # cs
                phonemes.append('CH')
            else:
                phonemes.append('T')
                phonemes.append('S')
        elif letter == 'd':
            # The original tested for a following 's' here although the 'z'
            # branch is what handles "dz"/"dzs"; that silently dropped the D
            # in "ds" and emitted a stray D in front of "dzs".
            if next_letter == 'z':
                pass  # handle under 'z'
            else:
                phonemes.append('D')
        elif letter == 'e':
            if previous == 'e':
                pass
            elif next_letter == 'e':
                phonemes.append('EY0')
            elif next_letter == 'l' and after_next == 'y':
                phonemes.append('EY0')
            else:
                phonemes.append('EH0')
        elif letter == 'g':
            if next_letter == 'y':
                phonemes.append('JH')
            else:
                phonemes.append('G')
        elif letter == 'i':
            if previous in ('a', 'o', a_acute):
                pass  # already folded into the preceding diphthong
            else:
                phonemes.append('IH0')  # IY0?
        elif letter == 'j':
            if previous in ('a', 'o', a_acute):
                pass  # already folded into the preceding diphthong
            else:
                phonemes.append('Y')
        elif letter == 'l':
            if next_letter == 'y':
                pass  # handled under y - ly is close enough to just IY
            else:
                phonemes.append('L')
        elif letter == 'o':
            phonemes.append('OY0' if glide_follows else 'AO0')
        elif letter == 'q':  # loan words
            phonemes.append('K')
            phonemes.append('W')
        elif letter == 's':
            if previous == 'c':
                pass  # handled under c
            elif next_letter == 's' and after_next == 'z':  # ssz
                pass
            elif next_letter == 'z' and previous == 's':  # ssz
                phonemes.append('S')
                phonemes.append('S')
            elif next_letter == 'z':  # sz
                phonemes.append('S')
            elif previous == 'z':
                # zs / dzs / zzs: the 'z' branch already emitted ZH or JH.
                # Without this case the original added a spurious SH.
                pass
            else:
                phonemes.append('SH')
        elif letter == 'x':  # loan words only
            phonemes.append('K')
            phonemes.append('S')
        elif letter == 'y':
            if previous in ('a', 'o', a_acute):
                pass
            elif previous == 'g':
                pass  # handled under g
            elif previous == 't':
                phonemes.append('Y')
            elif previous == 'n':
                pass  # close enough to just n, although more like Spanish n-tilde
            else:
                phonemes.append('IY0')
        elif letter == 'z':
            if next_letter == 's' and previous == 'd':  # dzs
                phonemes.append('JH')
            elif previous == 'z' and next_letter == 's':  # zzs
                phonemes.append('ZH')
                phonemes.append('ZH')
            elif next_letter == 's':  # zs
                phonemes.append('ZH')
            elif next_letter == 'z' and after_next == 's':  # probably zzs
                pass
            elif previous == 'd':  # dz
                phonemes.append('D')
                phonemes.append('S')
            elif previous == 's':
                pass  # handled under s
            elif previous == 'c':
                pass  # handled under c
            else:
                phonemes.append('Z')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == " ":
            pass
        elif len(hammer(letter)) == 1:
            # Fallback for unknown letters: ``hammer`` is defined elsewhere in
            # this file (presumably it reduces a character to a plain
            # equivalent - confirm). Only the first phoneme is kept.
            if not recursive:
                phon = " ".join(breakdownWord(hammer(letter), True))
                if phon:
                    phonemes.append(phon.split()[0])
        previous = letter

    # Collapse runs of identical phonemes (e.g. doubled consonants).
    temp_phonemes = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            temp_phonemes.append(phoneme)
        previous_phoneme = phoneme
    return temp_phonemes
def breakdownWord(word, recursive=False):
    """Break a word into a list of ARPAbet-style phoneme codes.

    The rules and inline examples ("monsieur", "lundi", "tranquille") target
    French spelling.

    Relies on ``accented_a``, ``accented_e``, ``accented_i``, ``accented_o``
    and ``accented_u``, which are not defined in this function and must be
    provided at module level, and on ``hammer`` for the fallback branch.

    NOTE(review): several rules read ``word[pos - 2]`` .. ``word[pos - 4]``.
    Near the start of a word those indices are negative and wrap around to
    the end of the word (or raise IndexError on very short words). That
    behaviour is left untouched here because existing results depend on it.

    Args:
        word: The text to convert; it is lower-cased before processing.
        recursive: True when called from the fallback branch below, which
            prevents a second level of fallback recursion.

    Returns:
        A list of phoneme strings with consecutive duplicates collapsed.
    """
    word = word.lower()
    # The original also built an ``isvowel`` lookup through ``dict.has_key``;
    # it was never used and ``has_key`` does not exist on Python 3, so every
    # call crashed there. It has been removed.
    phonemes = []
    simple_convert = {
        'j': 'JH',
        'k': 'K',
        'q': 'K',
        'v': 'V',
        u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'S'  # c-cedilla
    }
    # Consonants that close a nasal vowel (no h, m, n).
    closing_consonants = ['b', 'c', 'd', 'f', 'g', 'j', 'k', 'l', 'p', 'q', 'r', 's', 't', 'v', 'w',
                          'x', 'z']
    # Same list including m and n.
    consonants_with_mn = ['b', 'c', 'd', 'f', 'g', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't',
                          'v', 'w', 'x', 'z']
    previous = ' '
    for pos, letter in enumerate(word):
        # The original chain started with ``letter == len(word) > pos + 1 ...``,
        # which compares a string with an int and so could never be true; that
        # dead branch (it appended a set) has been dropped.
        if letter in ['b', 'd', 'g', 'p', 'x'] and pos + 1 == len(word):  # silent at end of words
            pass
        elif letter in ['a', accented_a]:
            if (len(word) > pos + 2 and word[pos + 1] in ['i', accented_i]) and word[pos + 2] != 'l':  # ai
                phonemes.append('EH0')
            elif len(word) > pos + 1 and word[pos + 1] in ['u', accented_u]:  # au
                phonemes.append('AO0')
            else:
                phonemes.append('AE0')
        elif letter in ['e', accented_e]:
            if pos + 1 == len(word) and len(word) == 2:  # takes care of words like 'je'
                phonemes.append('EH0')
            elif previous == 'u' and pos + 1 == len(word) and len(word) == 3 and word[pos - 2] == 'q':  # que
                phonemes.append('EH0')
            elif pos + 1 == len(word) and len(word) > 2:  # takes care of words like 'parle'
                pass
            elif previous == 'l' and word[pos + 1] == 's' and len(word) == 5 and word[pos - 2] == 'l':  # elles
                pass
            elif len(word) > pos + 2 and word[pos + 1] == 'a' and word[pos + 2] == 'u':
                pass
            elif previous in ['o', accented_o]:
                pass
            elif word[0] == letter and (
                    len(word) > pos + 2 and word[pos + 1] in ['m', 'n'] and word[pos + 2] in ['m', 'n']) and (
                    word != 'ennemmi'):
                phonemes.append('AE0')
            elif previous != 'i' and (len(word) == pos + 2 and word[pos + 1] in ['m', 'n']) or (
                    len(word) > pos + 2 and word[pos + 1] in ['m', 'n']
                    and word[pos + 2] in closing_consonants):
                phonemes.append('AE0')
            elif previous == 'f' and word[pos + 1:pos + 4] == 'mme':
                phonemes.append('AE0')
            elif previous == 'u' and word[pos - 2] == 'q' and pos == len(word):
                pass
            else:
                phonemes.append('EH0')
        elif letter in ['i', accented_i]:
            if previous in ['e', accented_e] and (
                    (len(word) > pos + 2 and word[pos + 1] in ['m', 'n']
                     and word[pos + 2] in consonants_with_mn)
                    or (len(word) == pos + 2)):
                pass
            elif previous in ['f', 't', 'v', 's'] and word[-1] == 'n' and len(word) > 1 and letter == word[-2]:
                phonemes.append('EH0')
            elif len(word) > pos + 2 and word[pos + 1] == 'm' and word[pos + 2] in ['b', 'p']:
                phonemes.append('EH0')
            elif len(word) > pos + 2 and word[pos + 1] == 'n' and word[pos + 2] in [
                    'c', 'd', 'f', 'g', 'j', 'l', 'q', 's', 't', 'v']:
                phonemes.append('EH0')
                phonemes.append('NG')
            elif previous in ['a', accented_a] and len(word) > pos + 1 and word[pos + 1] != 'l':
                phonemes.append('EH0')
            elif previous in ['o', accented_o] and len(word) == pos + 2 and word[pos + 1] == 'n':
                phonemes.append('EH0')
            elif previous in ['o', accented_o] and len(word) > pos + 2 and word[pos + 1] == 'n' \
                    and word[pos + 2] in consonants_with_mn:
                phonemes.append('EH0')
            elif previous in ['o', accented_o] and not (
                    len(word) > pos + 2 and word[pos + 1] == 'n' and (
                        word[pos + 2] in consonants_with_mn or pos + 2 == len(word))):
                phonemes.append('AE0')
            else:
                phonemes.append('IH0')
        elif letter in ['o', accented_o]:
            if previous == 'm' and word[pos + 1:pos + 7] == 'nsieur':
                phonemes.append('EH0')  # monsieur
            elif len(word) > pos + 1 and word[pos + 1] == 'y':
                phonemes.append('W')
                phonemes.append('AE0')
            elif len(word) > pos + 2 and word[pos + 1] == 'i' and word[pos + 2] in ['m', 'n']:
                phonemes.append('W')
                phonemes.append('EH0')
            elif len(word) > pos + 2 and word[pos + 1] == 'u' and word[pos + 2] in ['i', accented_i]:
                # stress vowel
                phonemes.append('W')
            elif len(word) > pos + 1 and word[pos + 1] in ['i', accented_i]:
                phonemes.append('W')
            elif len(word) > pos + 1 and word[pos + 1] in ['u', accented_u]:
                phonemes.append('UW0')
            elif len(word) > pos + 1 and word[pos + 1] in ['e', accented_e]:
                phonemes.append('EH0')
            else:
                phonemes.append('AO0')
        elif letter in ['u', accented_u]:
            if previous == 'l' and word[pos + 1:pos + 4] == 'ndi':
                phonemes.append('EH0')  # lundi
            elif previous == 'o' and len(word) > pos + 1 and word[pos + 1] in ['i', accented_i]:
                pass
            elif previous in ['b', 'c', 'd', 'f', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'r', 's', 't', 'v', 'w',
                              'x', 'z'] and len(word) > pos + 1 and word[pos + 1] == 'i':
                phonemes.append('W')
            elif (len(word) == pos + 2 and word[pos + 1] in ['m', 'n']) or (
                    len(word) > pos + 2 and word[pos + 1] in ['m', 'n']
                    and word[pos + 2] in closing_consonants):
                phonemes.append('EH0')
            elif previous in ['a', accented_a]:
                phonemes.append('AO0')
            elif previous in ['g', 'q']:
                pass
            elif previous in ['o', accented_o]:
                phonemes.append('UW0')
            elif len(word) > pos + 1 and word[pos + 1] in ['a', accented_a]:
                phonemes.append('AE0')
            elif len(word) > pos + 1 and word[pos + 1] in ['e', accented_e]:
                phonemes.append('EH0')
            elif previous == 'g' and len(word) > pos + 1 and word[pos + 1] in ['e', accented_e]:
                phonemes.append('JH')
            else:
                phonemes.append('UW0')
        elif letter == 'y':
            # The original had a second ``elif letter == 'y'`` further down
            # (EH0 after 'a', else IH0); it could never run because this
            # branch always wins, so it has been omitted.
            if letter == word[0]:
                phonemes.append('Y')
            elif previous in ['a', 'e', 'i', 'o', 'u', accented_a, accented_e, accented_i, accented_o,
                              accented_u] and len(word) > pos + 1 and word[pos + 1] in [
                                  'a', 'e', 'i', 'o', 'u', accented_a, accented_e, accented_i,
                                  accented_o, accented_u]:
                phonemes.append('Y')
            elif len(word) > pos + 2 and word[pos + 1] in ['m', 'n'] and len(word) == pos + 2:
                phonemes.append('EH0')
            elif len(word) > pos + 2 and word[pos + 1] in ['m', 'n'] and word[pos + 2] in closing_consonants:
                phonemes.append('EH0')
            else:
                phonemes.append('IH0')
        elif letter == 'b':
            if len(word) > pos + 1 and word[pos + 1] in ['s', 't']:
                phonemes.append('P')
            else:
                phonemes.append('B')
        elif letter == 'c':
            if len(word) > pos + 2 and word[pos + 1] == 'q' and word[pos + 2] == 'u':
                pass
            elif word[pos - 2] == 'p' and previous in ['e', accented_e] and len(word) == pos + 2 \
                    and word[pos + 1] == 't':  # takes care of words like 'respect'
                pass
            elif previous == 's' and len(word) > pos + 1 and word[pos + 1] in [
                    'e', 'i', 'y', accented_e, accented_i]:
                pass
            elif len(word) > pos + 1 and word[pos + 1] == word[-1] and word[-1] in ['e', accented_e]:
                phonemes.append('Z')
            elif len(word) > pos + 1 and (
                    word[pos + 1] in ['a', 'o', 'u', 'l', accented_a, accented_o, accented_u]
                    or word[pos + 1] in consonants_with_mn):
                phonemes.append('K')
            elif len(word) > pos + 1 and word[pos + 1] in ['e', 'i', 'y', accented_e, accented_i]:
                phonemes.append('S')
            elif previous == 'n' and len(word) == pos + 1:
                pass
            else:
                pass
        elif letter == 'd':
            if len(word) > pos + 1 and word[pos + 1] in ['-', '_']:
                phonemes.append('T')
            elif len(word) > pos + 1 and word[pos + 1] in ['s', 't']:
                pass
            else:
                phonemes.append('D')
        elif letter == 'f':
            if len(word) > pos + 1 and word[pos + 1] in ['-', '_']:
                phonemes.append('V')
            else:
                phonemes.append('F')
        elif letter == 'g':
            if previous == 'n':
                phonemes.append('NG')
            elif len(word) > pos + 1 and word[pos + 1] in ['e', 'i', 'y', accented_e, accented_i]:
                phonemes.append('JH')
            elif len(word) > pos + 1 and word[pos + 1] in ['s', 't']:
                pass
            else:
                phonemes.append('G')
        elif letter == 'h':
            if previous == 'c' and len(word) > pos + 1 and word[pos + 1] == 'r':
                phonemes.append('K')
            elif previous == 'c' and len(word) > pos + 1 and word[pos + 1] != 'r':
                phonemes.append('SH')
            else:
                pass
        elif letter == 'l':
            if word[pos - 2] in ['m', 'v', 'h', 'k'] and previous == 'i' and word[pos - 3] not in ['a', '']:
                # mil*, vil*
                phonemes.append('L')
            elif word[pos - 3] in ['m', 'v'] and word[pos - 2] == 'i' and previous == 'l' \
                    and word[pos - 4] not in ['a', '']:  # mill*, vill*
                phonemes.append('L')
            elif word[pos - 3] == 'q' and word[pos - 2] == 'u' and previous == 'i':  # tranquil*
                phonemes.append('L')
            elif word[pos - 3] == 'u' and word[pos - 2] == 'i' and previous == 'l' \
                    and word[pos - 4] == 'q':  # tranquill*
                phonemes.append('L')
            elif previous == 'i':
                # The original or-ed two stricter variants of this same test;
                # they add nothing once ``previous == 'i'`` alone suffices.
                phonemes.append('Y')
            elif word[pos - 2] == 'i' and previous == letter:  # il, ill, ille
                phonemes.append('Y')
            else:
                phonemes.append('L')
        elif letter == 'm':
            if previous == 'a' and len(word) > pos + 1 and word[pos + 1] == 'n':
                pass
            elif letter == word[-1] and word[-2] == 'i' and word[-3] == 'a':
                phonemes.append('NG')
            elif previous in ['a', 'e', 'i', 'o', 'u'] and (len(word) == pos + 1 or (
                    len(word) > pos + 1 and word[pos + 1] in closing_consonants)):
                phonemes.append('NG')
            else:
                phonemes.append('M')
        elif letter == 'n':
            if previous == 'o' and word[pos + 1:pos + 6] == 'sieur':
                pass
            elif previous in ['a', 'e', 'i', 'o', 'u', accented_a, accented_e, accented_i, accented_o,
                              accented_u] and (
                    len(word) == pos + 1 or (
                        len(word) > pos + 1 and word[pos + 1] in [
                            'b', 'c', 'd', 'f', 'g', 'j', 'k', 'l', 'n', 'p', 'q', 'r', 's', 't',
                            'v', 'w', 'x', 'z'])):  # n was forcefully added
                phonemes.append('NG')
            else:
                phonemes.append('N')
        elif letter == 'p':
            if len(word) > pos + 1 and word[pos + 1] in ['-', '_']:
                phonemes.append('P')
            elif previous == 'm' and len(word) > pos + 1 and word[pos + 1] == 't':  # mpt
                pass
            elif len(word) > pos + 1 and word[pos + 1] == 'h':  # ph
                phonemes.append('F')
            else:
                phonemes.append('P')
        elif letter == 'r':
            if word[pos - 2] == 'e' and previous == 'u':
                phonemes.append('R')
            elif pos + 1 == len(word):
                pass
            else:
                phonemes.append('R')
        elif letter == 's':
            if pos + 1 == len(word) and not (
                    (word[pos - 3] == 'i' and word[pos - 2] == 'l' and previous == 'i')
                    or (word[pos - 3] in ['e', accented_e, 't'] and word[pos - 2] == 'l' and previous == 'a')
                    or (word[pos - 3] == 'f' and word[pos - 2] == 'i' and previous == 'l')
                    or word == 'lis'):
                pass
            elif len(word) > pos + 2 and word[pos + 1] == 'c' and word[pos + 2] == 'h':
                pass
            elif previous in ['d', 't']:
                pass
            elif previous == 'e' and pos + 2 == len(word) and len(word) == 3 and word[pos + 1] == 't':  # est
                pass
            elif previous in ['a', 'e', 'i', 'o', 'u', accented_a, accented_e, accented_i, accented_o,
                              accented_u] and len(word) > pos + 1 and word[pos + 1] in [
                                  'a', 'e', 'i', 'o', 'u', accented_a, accented_e, accented_i,
                                  accented_o, accented_u]:
                phonemes.append('Z')
            else:
                phonemes.append('S')
        elif letter == 't':
            if pos + 1 == len(word) and previous not in ['i', 'c', accented_i] and word != 'gadget' \
                    or word[pos - 2] in ['a', accented_a]:
                pass
            elif len(word) > pos + 1 and word[pos + 1] == 's':
                pass
            elif previous in ['d', 'g']:
                pass
            elif word[pos - 3] == 'p' and word[pos - 2] == 'e' and previous == 'c' and len(word) == pos + 1:
                pass
            elif len(word) > pos + 1 and word[pos + 1] in ['-', '_']:
                phonemes.append('T')
            elif word[pos + 1:pos + 4] == 'ion' or word[pos + 1:pos + 6] == 'ience':
                phonemes.append('S')  # takes care of words ending with 'ience'
            else:
                phonemes.append('T')
        elif letter == 'w':
            if len(word) > pos + 4 and word[1:] == 'agon':
                phonemes.append('V')  # wagon
            else:
                phonemes.append('W')
        elif letter == 'x':
            if previous == 'u' and pos == len(word):
                pass
            elif len(word) > pos + 1 and word[pos + 1] in ['-', '_']:
                phonemes.append('Z')
            elif (len(word) > pos + 1 and word[pos + 1] in [
                    'b', 'c', 'd', 'f', 'g', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't', 'v',
                    'w', 'x', 'y', 'z']) or (word[pos - 2] == 't' and previous != 'a'):
                phonemes.append('K')
                phonemes.append('S')
            elif len(word) > pos + 1 and word[pos + 1] in [
                    'a', 'e', 'h', 'i', 'o', 'u', accented_a, accented_e, accented_i, accented_o,
                    accented_u] and (word[pos - 2] != 't' and previous not in ['a', accented_a]):
                phonemes.append('Z')
            else:
                phonemes.append('K')
                phonemes.append('S')
        elif letter == 'z':
            if word[-1] == letter and word[:-1] == 'berlio':
                phonemes.append('Z')
            elif word[-1] == letter and len(word) > 1:
                pass
            else:
                phonemes.append('Z')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1:
            # Unknown letter: retry once on whatever ``hammer`` (defined
            # elsewhere in this file) returns and keep the first phoneme.
            if not recursive:
                phon = breakdownWord(hammer(letter[0]), True)
                if phon:
                    phonemes.append(phon[0])
        previous = letter

    # Collapse runs of identical phonemes (e.g. doubled consonants).
    temp_phonemes = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            temp_phonemes.append(phoneme)
        previous_phoneme = phoneme
    return temp_phonemes
def syllablesToPhonemes(syllables, recursive=False):
    """Convert a word, given as a sequence of syllables, into phonemes.

    Every syllable is scanned letter by letter.  Digraphs (ng, ch, th, aa,
    ee, ij, ...) are resolved by looking at the letter before or after the
    current one *inside the same syllable*; the "previous letter" is reset at
    every syllable boundary.  The spelling rules appear to target Dutch
    (NOTE(review): inferred from the rules and comments - confirm).

    Args:
        syllables: sequence of syllable strings making up one word.  A plain
            string also works and is then treated as one-letter syllables
            (this is what the recursive call passes).
        recursive: True when called on the result of ``hammer``; stops the
            fallback from recursing a second time.

    Returns:
        list of phoneme strings with consecutive duplicates collapsed.
    """
    vowels = frozenset('aeiou')
    simple_convert = {
        'b': 'B',
        'd': 'D',
        'f': 'F',
        'h': 'HH',
        'j': 'Y',  # SH in some words borrowed from French
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'n': 'N',
        'p': 'P',
        'r': 'R',
        's': 'S',
        't': 'T',
        'v': 'F',  # English F mixed with English V
        'w': 'V',  # closer to soft English V than the English W - pronounced back in mouth, not with pursed lips
        'z': 'Z',
    }
    phonemes = []
    syllable_count = len(syllables)

    for syllable_no, syllable in enumerate(syllables, 1):
        previous_letter = ' '  # look-behind never crosses a syllable boundary
        syllable_length = len(syllable)
        for letter_no, letter in enumerate(syllable, 1):
            at_syllable_end = letter_no == syllable_length
            # letter_no is 1-based, so it is also the 0-based index of the next letter
            following = '' if at_syllable_end else syllable[letter_no]
            at_word_end = at_syllable_end and syllable_no == syllable_count

            if letter == previous_letter and letter not in vowels:
                pass  # double consonants are pronounced once
            # ===================== consonants ==========================
            elif letter == 'b' and at_word_end:
                phonemes.append('P')  # final b is devoiced
            elif letter == 'd' and at_word_end:
                phonemes.append('T')  # final d is devoiced
            elif letter == 'n' and following == 'g':
                # ng - emitted when the g is reached.
                # Fixed: the original indexed syllable[1] here instead of the
                # letter that follows the n.
                pass
            elif letter == 'g' and previous_letter == 'n':
                phonemes.append('NG')
            elif letter == 'g':
                phonemes.append('HH')  # not accurate, but the nearest phoneme in CMU? (use K instead? put in a G anyway?)
            elif letter == 'c' and following == 'h':
                # ch - emitted when the h is reached.
                # Fixed: the original bound check skipped a syllable-final "ch".
                pass
            elif letter == 'h' and previous_letter == 'c':
                phonemes.append('HH')  # not accurate, but the nearest phoneme in CMU? (use K instead? put in a G anyway?)
            elif letter == 't' and following == 'h':
                pass  # th - emitted when the h is reached
            elif letter == 'h' and previous_letter == 't':
                phonemes.append('TH')
            elif letter == 'j' and previous_letter == 'i':
                pass  # ij - handled in vowels
            elif letter == 'w' and previous_letter == 'u':
                pass  # uw - handled in vowels
            elif letter == 'x':  # rare, mostly borrowed words
                phonemes.append('K')
                phonemes.append('S')
            elif letter == 'q':  # rare, mostly borrowed words
                phonemes.append('K')
                phonemes.append('W')
            elif letter == 'c':
                if following in ('e', 'i'):
                    phonemes.append('S')  # c before e and i is pronounced as s
                else:
                    # before a consonant, at the end of a word and before a, o, u
                    phonemes.append('K')
            elif letter in simple_convert:
                phonemes.append(simple_convert[letter])
            # ======================== vowels ===========================
            elif letter == 'a':  # short AH, long AA
                if following == 'a':  # double a
                    phonemes.append('AA0')
                elif previous_letter == 'a':
                    pass  # second half of a double a
                elif at_syllable_end:  # long a reduced to single letter
                    phonemes.append('AA0')
                elif following == 'u':  # au
                    phonemes.append('AW0')  # occasionally as UW0 in some words borrowed from French
                else:
                    phonemes.append('AH0')  # like English short u (cut, hut)
            elif letter == 'e':  # short EH, long EY
                if following == 'e':  # double e
                    phonemes.append('EY0')
                elif previous_letter in ('e', 'i', 'o'):
                    pass  # ee / ie / oe were emitted at the first vowel
                elif at_syllable_end:  # long e reduced to single letter
                    phonemes.append('EY0')
                elif following == 'u':  # eu
                    phonemes.append('ER0')  # less R than English equivalent, closer to French eu
                elif following == 'i':  # ei
                    phonemes.append('AY0')
                else:
                    phonemes.append('EH0')  # closer to a (bad B AH D) than English short EH0 (bed = B EH D)
            elif letter == 'i':  # short IH, long IY
                if following == 'i':  # double i
                    phonemes.append('IY0')
                elif previous_letter in ('u', 'i', 'e'):
                    pass  # ui / ii / ei were emitted at the first vowel
                # TODO: handle aai, aaij
                elif at_syllable_end:  # long i reduced to single letter
                    phonemes.append('IY0')
                elif following == 'j':  # ij
                    phonemes.append('AY0')
                elif following in ('u', 'e'):  # iu, ie
                    phonemes.append('IY0')
                # TODO: handle ieuw (IY UW ?)
                else:
                    phonemes.append('IH0')
            elif letter == 'o':  # short AO, long OW
                if following == 'o':  # double o
                    phonemes.append('OW0')
                elif previous_letter == 'o':
                    pass  # second half of a double o
                elif at_syllable_end:  # long o reduced to single letter
                    phonemes.append('OW0')
                elif following == 'e':  # oe
                    phonemes.append('UW0')
                elif following == 'u':  # ou
                    phonemes.append('AW0')
                else:
                    phonemes.append('AO0')
            elif letter == 'u':
                if following == 'u':  # double u
                    phonemes.append('UW0')
                elif previous_letter in ('u', 'a', 'e'):
                    pass  # uu / au / eu were emitted at the first vowel
                elif previous_letter == 'i':  # iu
                    phonemes.append('UW0')
                elif previous_letter == 'o':
                    pass  # ou was emitted at the o
                elif at_syllable_end:  # long u reduced to single letter
                    phonemes.append('UW0')
                elif following == 'w':  # uw
                    phonemes.append('UW0')  # uw = EW in English DEW IY UW ???
                elif following == 'i':  # ui
                    phonemes.append('UH0')  # not accurate but the nearest phoneme in CMU? (use UW instead?)
                else:
                    phonemes.append('ER0')  # not accurate but the nearest phoneme in CMU? (use AH instead?)
            # ---- trema (looks like German umlaut, but different meaning) ----
            elif letter == u'\N{LATIN SMALL LETTER A WITH DIAERESIS}':
                phonemes.append('AH0')  # like English short u (cut, hut)
            elif letter == u'\N{LATIN SMALL LETTER E WITH DIAERESIS}':
                phonemes.append('EH0')  # closer to a (bad B AH D) than English short EH0 (bed = B EH D)
            elif letter == u'\N{LATIN SMALL LETTER I WITH DIAERESIS}':
                phonemes.append('IH0')
            elif letter == u'\N{LATIN SMALL LETTER O WITH DIAERESIS}':
                phonemes.append('AO0')
            elif letter == u'\N{LATIN SMALL LETTER U WITH DIAERESIS}':
                phonemes.append('ER0')  # not accurate but the nearest phoneme in CMU? (use AH instead?)
            elif letter in (u'\N{LATIN SMALL LETTER Y WITH DIAERESIS}',
                            u'\N{LATIN SMALL LETTER Y WITH ACUTE}'):
                phonemes.append('AY0')
            elif len(hammer(letter)) == 1:
                # Unknown character: retry once on whatever ``hammer`` returns
                # (presumably a plain-letter equivalent - confirm) and keep
                # only the first phoneme.
                if not recursive:
                    phon = syllablesToPhonemes(hammer(letter), True)
                    if phon:
                        phonemes.append(phon[0])
            previous_letter = letter

    # Collapse runs of identical phonemes.
    temp_phonemes = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            temp_phonemes.append(phoneme)
        previous_phoneme = phoneme
    return temp_phonemes
def breakdownWord(input_word, recursive=False):
    """Break a Spanish word down into a list of phonemes.

    The word is lower-cased and scanned letter by letter; most rules look at
    the previous and/or the following letter.  The raw phoneme list is then
    passed through ``stressSpanishWord`` (defined elsewhere in this file) and
    finally runs of identical phonemes are collapsed.

    Args:
        input_word: the word to convert (text string).
        recursive: True when called on the result of ``hammer``; stops the
            fallback for unknown characters from recursing a second time.

    Returns:
        list of phoneme strings.
    """
    vowels = (u'a', u'e', u'i', u'o', u'u')
    word = input_word.lower()
    last_index = len(word) - 1
    breakdown_word = []
    previous = u''

    for word_index, letter in enumerate(word):
        has_next = word_index < last_index
        following = word[word_index + 1] if has_next else u''

        if letter == u'b':
            if word_index == 0 or previous in (u'm', u'n'):
                breakdown_word.append('B')
            else:
                breakdown_word.append('V')
        elif letter == u'c':
            if following == u'h':
                breakdown_word.append('CH')
            elif previous == u'c':
                breakdown_word.append('S')
            elif following == u's':
                pass
            elif following in (u'e', u'i'):
                # should this be SH before 'e', S before 'i' ??
                breakdown_word.append('S')  # South American, Castilian Spanish uses 'TH'
            else:
                breakdown_word.append('K')
        elif letter == u'd':
            if word_index == 0 or previous in (u'l', u'n'):
                breakdown_word.append('D')
            else:
                breakdown_word.append('DH')
        elif letter == u'e':
            if not has_next or following in vowels:
                breakdown_word.append('EY0')
            else:
                breakdown_word.append('EH0')
        elif letter == u'\N{LATIN SMALL LETTER E WITH ACUTE}':
            if not has_next or following in vowels:
                breakdown_word.append('EY1')
            else:
                breakdown_word.append('EH1')
        elif letter == u'g':
            if following == u'\N{LATIN SMALL LETTER U WITH DIAERESIS}':
                breakdown_word.append('V')
            elif following in (u'e', u'i'):
                breakdown_word.append('HH')
            else:
                breakdown_word.append('G')
        elif letter == u'h':
            pass  # always silent (ch is emitted at the c)
        elif letter == u'l':
            if following == u'l':
                pass  # ll - emitted at the second l
            elif previous == u'l':
                breakdown_word.append('Y')
            else:
                breakdown_word.append('L')
        elif letter == u'n':
            if following == u'v':
                breakdown_word.append('M')
            else:
                breakdown_word.append('N')
        elif letter == u'\N{LATIN SMALL LETTER N WITH TILDE}':
            breakdown_word.append('N')
            breakdown_word.append('Y')
        elif letter == u'o':
            if has_next and following not in vowels:  # last bit necessary ?
                breakdown_word.append('AO0')
            else:
                breakdown_word.append('OW0')
        elif letter == u'\N{LATIN SMALL LETTER O WITH ACUTE}':
            if has_next and following not in vowels:  # last bit necessary ?
                breakdown_word.append('AO1')
            else:
                breakdown_word.append('OW1')
        elif letter == u'p':
            if has_next:
                breakdown_word.append('P')
            # a word-final p is dropped
        elif letter == u'r':
            # rr is trilled a lot, single r only a little; both map to R,
            # emitted once at the first r.
            if previous != u'r':
                breakdown_word.append('R')
        elif letter == u's':
            if following in (u'd', u'g', u'l', u'm', u'n'):
                breakdown_word.append('Z')
            else:
                breakdown_word.append('S')
        elif letter == u'u':
            if previous == u'q':
                pass  # silent after q
            elif previous == u'g' and following in (u'e', u'i'):
                # Silent in gue / gui.  Fixed: the original tested for a
                # following 'u' instead of 'e', so "gue" kept its u.
                pass
            else:
                breakdown_word.append('UW0')
        elif letter == u'\N{LATIN SMALL LETTER U WITH ACUTE}':
            if previous == u'q':
                pass
            elif previous == u'g' and following in (u'u', u'i'):
                # NOTE(review): kept exactly as in the original; an accented
                # u is presumably never silent - confirm whether this branch
                # is wanted at all.
                pass
            else:
                breakdown_word.append('UW1')
        elif letter == u'v':
            if word_index == 0 or previous in (u'm', u'n'):
                breakdown_word.append('B')
            else:
                breakdown_word.append('V')
        elif letter == u'x':
            if previous in vowels and following in vowels:
                breakdown_word.append('K')
                breakdown_word.append('S')
            else:
                breakdown_word.append('S')
        elif letter == u'y':
            if len(word) == 1:
                breakdown_word.append('IY1')
            elif not has_next:
                breakdown_word.append('IY0')
            else:
                breakdown_word.append('Y')
        elif letter in unconditional_conversions:
            breakdown_word.append(unconditional_conversions[letter])
        elif len(hammer(letter)) == 1:
            # Unknown character: retry once on whatever ``hammer`` returns
            # (presumably a plain-letter equivalent - confirm) and keep only
            # the first phoneme.
            if not recursive:
                phon = breakdownWord(hammer(letter), True)
                if phon:
                    breakdown_word.append(phon[0])
        previous = letter

    breakdown_word = stressSpanishWord(breakdown_word)

    # Collapse runs of identical phonemes.
    temp_phonemes = []
    previous_phoneme = " "
    for phoneme in breakdown_word:
        if phoneme != previous_phoneme:
            temp_phonemes.append(phoneme)
        previous_phoneme = phoneme
    return temp_phonemes
def breakdownWord(word, recursive=False):
    """Break a Norwegian word down into a list of phonemes.

    The word is lower-cased and scanned letter by letter; most rules look at
    the previous and/or the following letter(s).  Runs of identical phonemes
    are collapsed before returning.

    Args:
        word: the word to convert (text string).
        recursive: True when called on the result of ``hammer``; stops the
            fallback for unknown characters from recursing a second time.

    Returns:
        list of phoneme strings.
    """
    word = word.lower()
    vowels = frozenset(
        u'aeiouy\N{LATIN SMALL LETTER A WITH RING ABOVE}\N{LATIN SMALL LETTER AE}\N{LATIN SMALL LETTER O WITH STROKE}'
    )
    o_stroke = u'\N{LATIN SMALL LETTER O WITH STROKE}'
    simple_convert = {
        'b': 'B',
        'c': 'S',
        'f': 'F',
        'm': 'M',
        'p': 'P',
        'r': 'R',
        't': 'T',
        'v': 'V',
        'w': 'V',
        'z': 'S',
    }
    short_vowels = {
        u'a': 'AA0',
        u'e': 'EH0',
        u'i': 'IH0',
        u'o': 'UH0',
        u'u': 'UH0',
        u'y': 'IH0',
        u'\N{LATIN SMALL LETTER AE}': 'AE0',
        u'\N{LATIN SMALL LETTER O WITH STROKE}': 'AH0',
        u'\N{LATIN SMALL LETTER A WITH RING ABOVE}': 'AA0',
    }
    long_vowels = {
        u'a': 'AA0',
        u'e': 'EY0',
        u'i': 'IY0',
        u'o': 'OW0',
        u'u': 'UW0',
        u'y': 'IY0',
        u'\N{LATIN SMALL LETTER AE}': 'AE0',
        u'\N{LATIN SMALL LETTER O WITH STROKE}': 'ER0',
        u'\N{LATIN SMALL LETTER A WITH RING ABOVE}': 'AO0',
    }
    # Vowel + following letter pairs that map to one fixed phoneme.
    vowel_pairs = {
        ('a', 'i'): 'AY0',
        ('a', 'u'): 'AW0',
        ('e', 'i'): 'AY0',
        ('e', 'r'): 'AE0',
        ('o', 'i'): 'OY0',
    }
    # NOTE(review): the original also had a second, unreachable 'o'+'i' branch
    # that produced UW0 IY0; it was presumably meant for another vowel pair -
    # confirm.  It never fired, so dropping it does not change the output.

    phonemes = []
    length = len(word)
    previous = ' '

    for pos, letter in enumerate(word):
        is_last = length == pos + 1
        following = '' if is_last else word[pos + 1]

        if letter in vowels:
            if (letter, following) in vowel_pairs:
                phonemes.append(vowel_pairs[(letter, following)])
            elif letter == o_stroke and following == 'y':
                phonemes.append('OW0')
                phonemes.append('IY0')
            elif length == pos + 2 and following == 'm':
                phonemes.append(short_vowels[letter])  # vowel + final m
            elif (length > pos + 2 and following == word[pos + 2]
                    and following not in vowels):
                phonemes.append(short_vowels[letter])  # before a double consonant
            else:
                # Everything else is long, including vowel + final "rd" (jord).
                phonemes.append(long_vowels[letter])
        elif letter == 'd':
            if is_last and previous == 'r':  # ends in d, e.g. jord
                pass
            elif is_last and previous in vowels:  # ends in long vowel then d, e.g. god
                pass
            elif previous in ('l', 'n'):  # holde, land
                pass
            else:
                phonemes.append('D')
        elif letter == 'g':
            if following == 'j':  # gjaer
                pass  # handled as a normal j
            elif is_last and previous == 'i':  # aerlig
                pass  # silent at end of word
            elif previous == 'n':
                pass  # handled under n
            elif following in ('i', 'y'):
                phonemes.append('Y')
            elif length > pos + 2 and following == 'e' and word[pos + 2] == 'i':
                phonemes.append('Y')
            else:
                phonemes.append('G')
        elif letter == 'h':
            # Fixed: the 'v' test used to be a separate ``if``, so "hj" fell
            # into its ``else`` and still produced HH.
            if following == 'j':  # hjem
                pass  # handled as a normal j
            elif following == 'v':  # hver
                pass  # handled as a normal v
            else:
                phonemes.append('HH')
        elif letter == 'j':
            if previous in ('k', 's'):
                pass  # handled under k / s
            else:
                phonemes.append('Y')
        elif letter == 'k':
            if previous == 's' and following in (u'j', u'i', u'y', o_stroke):
                phonemes.append('SH')  # skjaere, ski, skoyter
            elif previous == 's':
                phonemes.append('S')
                phonemes.append('K')
            elif following in (u'i', u'y'):  # kirke, kyss
                phonemes.append('HH')
            elif following == 'j':  # kjonn
                phonemes.append('HH')
            else:
                phonemes.append('K')  # kaffe
        elif letter == 'l':
            if following == 'j':  # ljug
                pass  # handled as a normal j
            else:
                phonemes.append('L')
        elif letter == 'n':
            if following == 'g':  # fang
                phonemes.append('NG')
            else:
                phonemes.append('N')  # ni
        elif letter == 'q':  # foreign language loan-words?
            phonemes.append('K')
            phonemes.append('UW0')
        elif letter == 's':
            if previous == 'r':
                phonemes.append('SH')  # Eastern Norway - norsk, person, for sent
            elif following == 'k':
                pass  # handled under k
            elif following == 'j':  # sjo
                phonemes.append('SH')
            elif following == 'l':
                phonemes.append('SH')  # informal usage
            else:
                phonemes.append('S')  # syv
        elif letter == 'x':
            phonemes.append('K')
            phonemes.append('S')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == " ":
            pass
        elif len(hammer(letter)) == 1:
            # Unknown character: retry once on whatever ``hammer`` returns
            # (presumably a plain-letter equivalent - confirm).  The result
            # is stored as one space-joined entry, as in the original.
            if not recursive:
                phon = " ".join(breakdownWord(hammer(letter), True))
                if phon:
                    phonemes.append(phon)
        previous = letter

    # Collapse runs of identical phonemes.
    temp_phonemes = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            temp_phonemes.append(phoneme)
        previous_phoneme = phoneme
    return temp_phonemes
def breakdownWord(word, recursive=False):
    """Break a word in Cyrillic or Latin transliteration down into phonemes.

    The rules (devoiced final consonants, -ogo / -ego endings, Cyrillic
    letters) presumably target Russian - confirm.  The word is lower-cased
    and scanned letter by letter; runs of identical phonemes are collapsed
    before returning.

    Args:
        word: the word to convert (text string).
        recursive: True when called on the result of ``hammer``; stops the
            fallback for unknown characters from recursing a second time.

    Returns:
        list of phoneme strings.  The hard and soft sign map to an empty
        string, which is stored like any other phoneme.
    """
    word = word.lower()
    phonemes = []
    simple_convert = {
        'a': 'AA0',
        'b': 'B',
        'd': 'D',
        'f': 'F',
        'h': 'HH',
        'i': 'IY0',
        'j': 'IY0',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'n': 'N',
        'o': 'AO0',
        'p': 'P',
        'r': 'R',
        't': 'T',
        'u': 'UW0',
        'v': 'V',
        'x': 'HH',  # use 'Y' ?? 'K'??
        u'\N{LATIN SMALL LETTER S WITH CARON}': 'SH',
        u'\N{LATIN SMALL LETTER Z WITH CARON}': 'ZH',
        u'\N{LATIN CAPITAL LETTER S WITH CARON}': 'SH',
        u'\N{LATIN CAPITAL LETTER Z WITH CARON}': 'ZH',
        # Cyrillic
        u'\N{CYRILLIC SMALL LETTER A}': 'AA0',
        u'\N{CYRILLIC SMALL LETTER BE}': 'B',
        u'\N{CYRILLIC SMALL LETTER VE}': 'V',
        u'\N{CYRILLIC SMALL LETTER CHE}': 'CH',
        u'\N{CYRILLIC SMALL LETTER DE}': 'D',
        u'\N{CYRILLIC SMALL LETTER E}': 'EH0',
        u'\N{CYRILLIC SMALL LETTER EF}': 'F',
        u'\N{CYRILLIC SMALL LETTER GHE}': 'G',
        u'\N{CYRILLIC SMALL LETTER I}': 'IY0',
        u'\N{CYRILLIC SMALL LETTER HA}': 'HH',  # 'Y'?? 'K'??
        u'\N{CYRILLIC SMALL LETTER SHORT I}': 'IY0',
        u'\N{CYRILLIC SMALL LETTER KA}': 'K',
        u'\N{CYRILLIC SMALL LETTER EL}': 'L',
        u'\N{CYRILLIC SMALL LETTER EM}': 'M',
        u'\N{CYRILLIC SMALL LETTER EN}': 'N',
        u'\N{CYRILLIC SMALL LETTER O}': 'AO0',
        u'\N{CYRILLIC SMALL LETTER PE}': 'P',
        u'\N{CYRILLIC SMALL LETTER ER}': 'R',
        u'\N{CYRILLIC SMALL LETTER ES}': 'S',
        u'\N{CYRILLIC SMALL LETTER SHA}': 'SH',
        u'\N{CYRILLIC SMALL LETTER TE}': 'T',
        u'\N{CYRILLIC SMALL LETTER U}': 'UW0',
        u'\N{CYRILLIC SMALL LETTER HARD SIGN}': '',
        u'\N{CYRILLIC SMALL LETTER SOFT SIGN}': '',
        u'\N{CYRILLIC SMALL LETTER YERU}': 'IH0',
        u'\N{CYRILLIC SMALL LETTER ZHE}': 'ZH',
        u'\N{CYRILLIC SMALL LETTER ZE}': 'Z',
        u'\N{CYRILLIC CAPITAL LETTER A}': 'AA0',
        u'\N{CYRILLIC CAPITAL LETTER BE}': 'B',
        u'\N{CYRILLIC CAPITAL LETTER VE}': 'V',
        u'\N{CYRILLIC CAPITAL LETTER CHE}': 'CH',
        u'\N{CYRILLIC CAPITAL LETTER DE}': 'D',
        u'\N{CYRILLIC CAPITAL LETTER E}': 'EH0',
        u'\N{CYRILLIC CAPITAL LETTER EF}': 'F',
        u'\N{CYRILLIC CAPITAL LETTER GHE}': 'G',
        u'\N{CYRILLIC CAPITAL LETTER I}': 'IY0',
        u'\N{CYRILLIC CAPITAL LETTER HA}': 'HH',  # 'Y'?? 'K'??
        u'\N{CYRILLIC CAPITAL LETTER SHORT I}': 'IY0',
        u'\N{CYRILLIC CAPITAL LETTER KA}': 'K',
        u'\N{CYRILLIC CAPITAL LETTER EL}': 'L',
        u'\N{CYRILLIC CAPITAL LETTER EM}': 'M',
        u'\N{CYRILLIC CAPITAL LETTER EN}': 'N',
        u'\N{CYRILLIC CAPITAL LETTER O}': 'AO0',
        u'\N{CYRILLIC CAPITAL LETTER PE}': 'P',
        u'\N{CYRILLIC CAPITAL LETTER ER}': 'R',
        u'\N{CYRILLIC CAPITAL LETTER ES}': 'S',
        u'\N{CYRILLIC CAPITAL LETTER SHA}': 'SH',
        u'\N{CYRILLIC CAPITAL LETTER TE}': 'T',
        u'\N{CYRILLIC CAPITAL LETTER U}': 'UW0',
        u'\N{CYRILLIC CAPITAL LETTER HARD SIGN}': '',
        u'\N{CYRILLIC CAPITAL LETTER SOFT SIGN}': '',
        u'\N{CYRILLIC CAPITAL LETTER YERU}': 'IH0',
        u'\N{CYRILLIC CAPITAL LETTER ZHE}': 'ZH',
        u'\N{CYRILLIC CAPITAL LETTER ZE}': 'Z',
    }
    length = len(word)
    previous = ' '

    for pos, letter in enumerate(word):
        is_last = length == pos + 1
        following = '' if is_last else word[pos + 1]

        if letter == 'c':
            if previous == 's' and following == 'h':
                phonemes.append('SH')  # as in
                phonemes.append('CH')  # freSH CHeese
            elif following == 'h':
                phonemes.append('CH')
            else:
                phonemes.append('T')
                phonemes.append('S')
        elif letter == 'b' and is_last:
            phonemes.append('P')  # word-final b
        elif letter == 'd' and is_last:
            phonemes.append('T')  # word-final d
        elif letter in ('e', u'\N{CYRILLIC SMALL LETTER IE}'):
            if pos == 0:
                phonemes.append('Y')  # word-initial e gets a leading Y
            # The original also looked ahead for 'h' / '^' but appended EH0
            # either way; duplicates are collapsed below, so one append
            # yields the same result.
            phonemes.append('EH0')
        elif letter == '^':
            pass
        elif letter == 'g':
            if is_last:
                phonemes.append('K')
            elif previous in ('e', 'o') and length == pos + 2 and following == 'o':
                phonemes.append('V')  # possessive endings -ogo and -ego
            else:
                phonemes.append('G')
        elif letter == 'h':
            pass  # only meaningful as part of ch / sh / zh
        elif letter == 's':
            if following == 'h':
                phonemes.append('SH')
            else:
                phonemes.append('S')
        elif letter == 'v' and is_last:
            phonemes.append('F')  # word-final v
        elif letter == 'y':
            if following == 'a':
                phonemes.append('Y')
            else:
                phonemes.append('IH0')
        elif letter == 'z':
            if following == 'h':
                # zh: SH when it ends the word, ZH otherwise
                phonemes.append('SH' if length == pos + 2 else 'ZH')
            else:
                phonemes.append('S' if is_last else 'Z')
        elif letter in (u'\N{CYRILLIC CAPITAL LETTER SHCHA}',
                        u'\N{CYRILLIC SMALL LETTER SHCHA}'):
            phonemes.append('SH')
            # (a following 'CH' was disabled in the original)
        elif letter in (u'\N{CYRILLIC CAPITAL LETTER TSE}',
                        u'\N{CYRILLIC SMALL LETTER TSE}'):
            phonemes.append('T')
            phonemes.append('S')
        elif letter in (u'\N{CYRILLIC CAPITAL LETTER YA}',
                        u'\N{CYRILLIC SMALL LETTER YA}'):
            if pos == 0:
                phonemes.append('IY0')
            phonemes.append('AA1')
        elif letter in (u'\N{CYRILLIC CAPITAL LETTER YU}',
                        u'\N{CYRILLIC SMALL LETTER YU}'):
            if pos == 0:
                phonemes.append('Y')
            phonemes.append('UW0')
        elif letter in (u'\N{LATIN SMALL LETTER E WITH DIAERESIS}',
                        u'\N{CYRILLIC SMALL LETTER IO}'):
            if pos == 0:
                phonemes.append('Y')
            phonemes.append('AO0')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1:
            # Unknown character: retry once on whatever ``hammer`` returns
            # (presumably a plain-letter equivalent - confirm).
            if not recursive:
                phon = breakdownWord(hammer(letter), True)
                if phon:
                    # Fixed: the original appended the returned *list* as a
                    # single element, leaving a nested list among the
                    # phoneme strings.
                    phonemes.extend(phon)
        previous = letter

    # Collapse runs of identical phonemes.
    temp_phonemes = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            temp_phonemes.append(phoneme)
        previous_phoneme = phoneme
    return temp_phonemes
def breakdownWord(input_word, recursive=False):
    """Break a Spanish word down into a list of phonemes.

    The word is lower-cased and scanned letter by letter; most rules look at
    the previous and/or the following letter.  The raw phoneme list is then
    passed through ``stressSpanishWord`` (defined elsewhere in this file) and
    finally runs of identical phonemes are collapsed.

    Args:
        input_word: the word to convert (text string).
        recursive: True when called on the result of ``hammer``; stops the
            fallback for unknown characters from recursing a second time.

    Returns:
        list of phoneme strings.
    """
    vowels = (u'a', u'e', u'i', u'o', u'u')
    word = input_word.lower()
    last_index = len(word) - 1
    breakdown_word = []
    previous = u''

    for word_index, letter in enumerate(word):
        has_next = word_index < last_index
        following = word[word_index + 1] if has_next else u''

        if letter == u'b':
            if word_index == 0 or previous in (u'm', u'n'):
                breakdown_word.append('B')
            else:
                breakdown_word.append('V')
        elif letter == u'c':
            if following == u'h':
                breakdown_word.append('CH')
            elif previous == u'c':
                breakdown_word.append('S')
            elif following == u's':
                pass
            elif following in (u'e', u'i'):
                # should this be SH before 'e', S before 'i' ??
                breakdown_word.append('S')  # South American, Castilian Spanish uses 'TH'
            else:
                breakdown_word.append('K')
        elif letter == u'd':
            if word_index == 0 or previous in (u'l', u'n'):
                breakdown_word.append('D')
            else:
                breakdown_word.append('DH')
        elif letter == u'e':
            if not has_next or following in vowels:
                breakdown_word.append('EY0')
            else:
                breakdown_word.append('EH0')
        elif letter == u'\N{LATIN SMALL LETTER E WITH ACUTE}':
            if not has_next or following in vowels:
                breakdown_word.append('EY1')
            else:
                breakdown_word.append('EH1')
        elif letter == u'g':
            if following == u'\N{LATIN SMALL LETTER U WITH DIAERESIS}':
                breakdown_word.append('V')
            elif following in (u'e', u'i'):
                breakdown_word.append('HH')
            else:
                breakdown_word.append('G')
        elif letter == u'h':
            pass  # always silent (ch is emitted at the c)
        elif letter == u'l':
            if following == u'l':
                pass  # ll - emitted at the second l
            elif previous == u'l':
                breakdown_word.append('Y')
            else:
                breakdown_word.append('L')
        elif letter == u'n':
            if following == u'v':
                breakdown_word.append('M')
            else:
                breakdown_word.append('N')
        elif letter == u'\N{LATIN SMALL LETTER N WITH TILDE}':
            breakdown_word.append('N')
            breakdown_word.append('Y')
        elif letter == u'o':
            if has_next and following not in vowels:  # last bit necessary ?
                breakdown_word.append('AO0')
            else:
                breakdown_word.append('OW0')
        elif letter == u'\N{LATIN SMALL LETTER O WITH ACUTE}':
            if has_next and following not in vowels:  # last bit necessary ?
                breakdown_word.append('AO1')
            else:
                breakdown_word.append('OW1')
        elif letter == u'p':
            if has_next:
                breakdown_word.append('P')
            # a word-final p is dropped
        elif letter == u'r':
            # rr is trilled a lot, single r only a little; both map to R,
            # emitted once at the first r.
            if previous != u'r':
                breakdown_word.append('R')
        elif letter == u's':
            if following in (u'd', u'g', u'l', u'm', u'n'):
                breakdown_word.append('Z')
            else:
                breakdown_word.append('S')
        elif letter == u'u':
            if previous == u'q':
                pass  # silent after q
            elif previous == u'g' and following in (u'e', u'i'):
                # Silent in gue / gui.  Fixed: the original tested for a
                # following 'u' instead of 'e', so "gue" kept its u.
                pass
            else:
                breakdown_word.append('UW0')
        elif letter == u'\N{LATIN SMALL LETTER U WITH ACUTE}':
            if previous == u'q':
                pass
            elif previous == u'g' and following in (u'u', u'i'):
                # NOTE(review): kept exactly as in the original; an accented
                # u is presumably never silent - confirm whether this branch
                # is wanted at all.
                pass
            else:
                breakdown_word.append('UW1')
        elif letter == u'v':
            if word_index == 0 or previous in (u'm', u'n'):
                breakdown_word.append('B')
            else:
                breakdown_word.append('V')
        elif letter == u'x':
            if previous in vowels and following in vowels:
                breakdown_word.append('K')
                breakdown_word.append('S')
            else:
                breakdown_word.append('S')
        elif letter == u'y':
            if len(word) == 1:
                breakdown_word.append('IY1')
            elif not has_next:
                breakdown_word.append('IY0')
            else:
                breakdown_word.append('Y')
        elif letter in unconditional_conversions:
            breakdown_word.append(unconditional_conversions[letter])
        elif len(hammer(letter)) == 1:
            # Unknown character: retry once on whatever ``hammer`` returns
            # (presumably a plain-letter equivalent - confirm) and keep only
            # the first phoneme.
            if not recursive:
                phon = breakdownWord(hammer(letter), True)
                if phon:
                    breakdown_word.append(phon[0])
        previous = letter

    breakdown_word = stressSpanishWord(breakdown_word)

    # Collapse runs of identical phonemes.
    temp_phonemes = []
    previous_phoneme = " "
    for phoneme in breakdown_word:
        if phoneme != previous_phoneme:
            temp_phonemes.append(phoneme)
        previous_phoneme = phoneme
    return temp_phonemes
def breakdownWord(word, recursive=False):
    """Break a word into a list of phoneme codes using per-letter rules.

    The digraph handling (sc, ch, gh, gl, gn, qu, zz) and the accented
    vowels look like Italian orthography.
    NOTE(review): confirm the target language against the module header.

    Args:
        word: text to convert; it is lower-cased before processing.
        recursive: True when called on the result of ``hammer``; an
            unsupported character is then skipped instead of recursing again.

    Returns:
        A list of phoneme strings (for example 'AA0', 'CH') in which
        consecutive duplicates have been collapsed.

    Relies on ``vowels`` and ``hammer`` being defined at module level.
    """
    word = word.lower()

    # dict.has_key() does not exist on Python 3; a set membership test gives
    # the same answer on both Python 2 and Python 3.
    vowel_set = frozenset(vowels)

    def isvowel(char):
        return char in vowel_set

    a_acute = u'\N{LATIN SMALL LETTER A WITH ACUTE}'
    a_grave = u'\N{LATIN SMALL LETTER A WITH GRAVE}'
    a_circ = u'\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}'
    e_acute = u'\N{LATIN SMALL LETTER E WITH ACUTE}'
    e_grave = u'\N{LATIN SMALL LETTER E WITH GRAVE}'
    e_circ = u'\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}'
    i_acute = u'\N{LATIN SMALL LETTER I WITH ACUTE}'
    i_grave = u'\N{LATIN SMALL LETTER I WITH GRAVE}'
    i_circ = u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}'
    o_acute = u'\N{LATIN SMALL LETTER O WITH ACUTE}'
    o_grave = u'\N{LATIN SMALL LETTER O WITH GRAVE}'
    o_circ = u'\N{LATIN SMALL LETTER O WITH CIRCUMFLEX}'
    u_acute = u'\N{LATIN SMALL LETTER U WITH ACUTE}'
    u_grave = u'\N{LATIN SMALL LETTER U WITH GRAVE}'
    u_circ = u'\N{LATIN SMALL LETTER U WITH CIRCUMFLEX}'

    a_forms = (u'a', a_acute, a_grave, a_circ)
    e_forms = (u'e', e_acute, e_grave, e_circ)
    i_forms = (u'i', i_acute, i_grave, i_circ)
    o_forms = (u'o', o_acute, o_grave, o_circ)
    u_forms = (u'u', u_acute, u_grave, u_circ)
    front_vowels = e_forms + i_forms
    back_vowels = a_forms + o_forms + u_forms
    voiced_consonants = ('b', 'd', 'g', 'l', 'm', 'n', 'r', 'v')

    simple_convert = {
        'b': 'B',
        'd': 'D',
        'f': 'F',
        'j': 'IY0',  # actual pronunciation varies with word origin
        'k': 'K',  # actual pronunciation varies with word origin
        'l': 'L',
        'm': 'M',
        'p': 'P',
        'q': 'K',
        'r': 'R',
        't': 'T',
        'v': 'V',
        'w': 'W',  # actual pronunciation varies with word origin
        'x': 'K S',  # actual pronunciation varies with word origin
        'y': 'IY0',  # actual pronunciation varies with word origin
    }

    phonemes = []
    previous = ' '
    for pos, letter in enumerate(word):
        # Letter after the current one; '' at the end of the word, which
        # matches none of the rules below.
        following = word[pos + 1] if pos + 1 < len(word) else ''

        if letter in a_forms:
            phonemes.append('AA0')
        elif letter == 'c':
            if previous == 's':
                # The preceding 's' emitted nothing; 'sc' is resolved here.
                if following in front_vowels:
                    phonemes.append('SH')
                else:
                    phonemes.append('S')
                    phonemes.append('K')
            elif following in front_vowels:
                phonemes.append('CH')
            else:
                phonemes.append('K')
        elif letter in (u'e', e_circ, e_grave):
            phonemes.append('EH0')  # long is "EY0"
        elif letter == e_acute:
            phonemes.append('EY0')
        elif letter == 'g':
            if following in e_forms:
                phonemes.append('JH')
            elif following in i_forms or following in ('l', 'n'):
                pass  # resolved when the following 'i', 'l' or 'n' is read
            elif following == 'u':
                phonemes.append('G')
                phonemes.append('W')
            else:
                phonemes.append('G')  # includes 'gh'
        elif letter == 'h':
            pass
        elif letter in i_forms:
            if previous == 'c' and isvowel(following):
                pass
            elif previous == 'g':
                if following in back_vowels:
                    phonemes.append('JH')
                else:
                    phonemes.append('G')
                    phonemes.append('IY0')
            else:
                phonemes.append('IY0')
        elif letter == 'l':
            if previous == 'g':
                if following in front_vowels:
                    phonemes.append('L')
                    phonemes.append('IY0')
                else:
                    # The skipped 'g' comes first, mirroring the 'gn' rule
                    # below (the original emitted 'L' before 'G').
                    phonemes.append('G')
                    phonemes.append('L')
            else:
                phonemes.append('L')
        elif letter == 'n':
            if previous == 'g':
                if isvowel(following):
                    phonemes.append('N')
                    phonemes.append('Y')
                else:
                    phonemes.append('G')
                    phonemes.append('N')
            else:
                phonemes.append('N')
        elif letter in o_forms:
            phonemes.append('OW0')  # when closed, when open as 'AO0' ?
        elif letter == 's':
            if following == 'c':
                pass  # handled under 'c'
            elif isvowel(previous) and isvowel(following):
                phonemes.append('Z')
            elif following in voiced_consonants:
                phonemes.append('Z')
            else:
                # Before a vowel, before a voiceless consonant, or word-final.
                # The rule applies at every position so that no 's' is
                # silently dropped.
                phonemes.append('S')
        elif letter in u_forms:
            if previous == 'q':
                phonemes.append('W')
            elif previous == 'g':
                pass  # handled under 'g'
            else:
                phonemes.append('UW0')
        elif letter == 'z':
            if pos == 0:
                phonemes.append('Z')
            elif previous == 'z':
                phonemes.append('T')
                phonemes.append('S')
            elif following == 'z':
                pass  # the second 'z' of the pair emits 'T', 'S'
            else:
                phonemes.append('Z')
        elif letter in simple_convert:
            # Split so a two-phoneme entry such as 'K S' becomes two list
            # items rather than one unknown phoneme.
            phonemes.extend(simple_convert[letter].split())
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1:
            # presumably hammer() maps an unsupported character to a plain
            # replacement -- confirm against its definition
            if not recursive:
                phon = breakdownWord(hammer(letter), True)
                if phon:
                    phonemes.append(phon[0])
        previous = letter

    # Collapse runs of identical neighbouring phonemes.
    collapsed = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            collapsed.append(phoneme)
        previous_phoneme = phoneme
    return collapsed
def breakdownWord(word, recursive=False):
    """Break a word into a list of phoneme codes using per-letter rules.

    The digraph handling (sc, ch, gh, gl, gn, qu, zz) and the accented
    vowels look like Italian orthography.
    NOTE(review): confirm the target language against the module header.

    Args:
        word: text to convert; it is lower-cased before processing.
        recursive: True when called on the result of ``hammer``; an
            unsupported character is then skipped instead of recursing again.

    Returns:
        A list of phoneme strings (for example 'AA0', 'CH') in which
        consecutive duplicates have been collapsed.

    Relies on ``vowels`` and ``hammer`` being defined at module level.
    """
    word = word.lower()

    # dict.has_key() does not exist on Python 3; a set membership test gives
    # the same answer on both Python 2 and Python 3.
    vowel_set = frozenset(vowels)

    def isvowel(char):
        return char in vowel_set

    a_acute = u'\N{LATIN SMALL LETTER A WITH ACUTE}'
    a_grave = u'\N{LATIN SMALL LETTER A WITH GRAVE}'
    a_circ = u'\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}'
    e_acute = u'\N{LATIN SMALL LETTER E WITH ACUTE}'
    e_grave = u'\N{LATIN SMALL LETTER E WITH GRAVE}'
    e_circ = u'\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}'
    i_acute = u'\N{LATIN SMALL LETTER I WITH ACUTE}'
    i_grave = u'\N{LATIN SMALL LETTER I WITH GRAVE}'
    i_circ = u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}'
    o_acute = u'\N{LATIN SMALL LETTER O WITH ACUTE}'
    o_grave = u'\N{LATIN SMALL LETTER O WITH GRAVE}'
    o_circ = u'\N{LATIN SMALL LETTER O WITH CIRCUMFLEX}'
    u_acute = u'\N{LATIN SMALL LETTER U WITH ACUTE}'
    u_grave = u'\N{LATIN SMALL LETTER U WITH GRAVE}'
    u_circ = u'\N{LATIN SMALL LETTER U WITH CIRCUMFLEX}'

    a_forms = (u'a', a_acute, a_grave, a_circ)
    e_forms = (u'e', e_acute, e_grave, e_circ)
    i_forms = (u'i', i_acute, i_grave, i_circ)
    o_forms = (u'o', o_acute, o_grave, o_circ)
    u_forms = (u'u', u_acute, u_grave, u_circ)
    front_vowels = e_forms + i_forms
    back_vowels = a_forms + o_forms + u_forms
    voiced_consonants = ('b', 'd', 'g', 'l', 'm', 'n', 'r', 'v')

    simple_convert = {
        'b': 'B',
        'd': 'D',
        'f': 'F',
        'j': 'IY0',  # actual pronunciation varies with word origin
        'k': 'K',  # actual pronunciation varies with word origin
        'l': 'L',
        'm': 'M',
        'p': 'P',
        'q': 'K',
        'r': 'R',
        't': 'T',
        'v': 'V',
        'w': 'W',  # actual pronunciation varies with word origin
        'x': 'K S',  # actual pronunciation varies with word origin
        'y': 'IY0',  # actual pronunciation varies with word origin
    }

    phonemes = []
    previous = ' '
    for pos, letter in enumerate(word):
        # Letter after the current one; '' at the end of the word, which
        # matches none of the rules below.
        following = word[pos + 1] if pos + 1 < len(word) else ''

        if letter in a_forms:
            phonemes.append('AA0')
        elif letter == 'c':
            if previous == 's':
                # The preceding 's' emitted nothing; 'sc' is resolved here.
                if following in front_vowels:
                    phonemes.append('SH')
                else:
                    phonemes.append('S')
                    phonemes.append('K')
            elif following in front_vowels:
                phonemes.append('CH')
            else:
                phonemes.append('K')
        elif letter in (u'e', e_circ, e_grave):
            phonemes.append('EH0')  # long is "EY0"
        elif letter == e_acute:
            phonemes.append('EY0')
        elif letter == 'g':
            if following in e_forms:
                phonemes.append('JH')
            elif following in i_forms or following in ('l', 'n'):
                pass  # resolved when the following 'i', 'l' or 'n' is read
            elif following == 'u':
                phonemes.append('G')
                phonemes.append('W')
            else:
                phonemes.append('G')  # includes 'gh'
        elif letter == 'h':
            pass
        elif letter in i_forms:
            if previous == 'c' and isvowel(following):
                pass
            elif previous == 'g':
                if following in back_vowels:
                    phonemes.append('JH')
                else:
                    phonemes.append('G')
                    phonemes.append('IY0')
            else:
                phonemes.append('IY0')
        elif letter == 'l':
            if previous == 'g':
                if following in front_vowels:
                    phonemes.append('L')
                    phonemes.append('IY0')
                else:
                    # The skipped 'g' comes first, mirroring the 'gn' rule
                    # below (the original emitted 'L' before 'G').
                    phonemes.append('G')
                    phonemes.append('L')
            else:
                phonemes.append('L')
        elif letter == 'n':
            if previous == 'g':
                if isvowel(following):
                    phonemes.append('N')
                    phonemes.append('Y')
                else:
                    phonemes.append('G')
                    phonemes.append('N')
            else:
                phonemes.append('N')
        elif letter in o_forms:
            phonemes.append('OW0')  # when closed, when open as 'AO0' ?
        elif letter == 's':
            if following == 'c':
                pass  # handled under 'c'
            elif isvowel(previous) and isvowel(following):
                phonemes.append('Z')
            elif following in voiced_consonants:
                phonemes.append('Z')
            else:
                # Before a vowel, before a voiceless consonant, or word-final.
                # The rule applies at every position so that no 's' is
                # silently dropped.
                phonemes.append('S')
        elif letter in u_forms:
            if previous == 'q':
                phonemes.append('W')
            elif previous == 'g':
                pass  # handled under 'g'
            else:
                phonemes.append('UW0')
        elif letter == 'z':
            if pos == 0:
                phonemes.append('Z')
            elif previous == 'z':
                phonemes.append('T')
                phonemes.append('S')
            elif following == 'z':
                pass  # the second 'z' of the pair emits 'T', 'S'
            else:
                phonemes.append('Z')
        elif letter in simple_convert:
            # Split so a two-phoneme entry such as 'K S' becomes two list
            # items rather than one unknown phoneme.
            phonemes.extend(simple_convert[letter].split())
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1:
            # presumably hammer() maps an unsupported character to a plain
            # replacement -- confirm against its definition
            if not recursive:
                phon = breakdownWord(hammer(letter), True)
                if phon:
                    phonemes.append(phon[0])
        previous = letter

    # Collapse runs of identical neighbouring phonemes.
    collapsed = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            collapsed.append(phoneme)
        previous_phoneme = phoneme
    return collapsed
def breakdownWord(word, recursive=False):
    """Break a word into a list of phoneme codes using per-letter rules.

    The rules (ei/eu/ie/au diphthongs, sch/ch, sharp s, final devoicing of
    b/d/g, umlauts) look like German orthography.
    NOTE(review): confirm the target language against the module header.

    Args:
        word: text to convert; it is lower-cased before processing.
        recursive: True when called on the result of ``hammer``; an
            unsupported character is then skipped instead of recursing again.

    Returns:
        A list of phoneme strings (for example 'AY0', 'SH') in which
        consecutive duplicates have been collapsed.

    Relies on ``hammer`` being defined at module level; it is only consulted
    for characters none of the rules below recognise.
    """
    word = word.lower()

    a_uml = u'\N{LATIN SMALL LETTER A WITH DIAERESIS}'
    o_uml = u'\N{LATIN SMALL LETTER O WITH DIAERESIS}'
    u_uml = u'\N{LATIN SMALL LETTER U WITH DIAERESIS}'
    sharp_s = u'\N{LATIN SMALL LETTER SHARP S}'

    # Built from unicode escapes: a plain 'aeiou...' literal with raw umlauts
    # is a byte string on Python 2 and never matches unicode letters.  A set
    # is used because dict.has_key() does not exist on Python 3.
    vowel_set = frozenset(u'aeiou' + a_uml + o_uml + u_uml)

    def isvowel(char):
        return char in vowel_set

    simple_convert = {
        'f': 'F',
        'j': 'Y',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'p': 'P',
        'q': 'K',
        'r': 'R',  # use AH0 or ER0 for final letter in word ??
        sharp_s: 'S',
        't': 'T',
        'v': 'F',  # non-native loan-words, 'V'
        'w': 'V',
        'y': 'IH0',  # actual pronunciation varies with word origin
    }

    phonemes = []
    length = len(word)
    previous = ' '
    for pos, letter in enumerate(word):
        is_last = (pos + 1 == length)
        # '' stands for "no following letter" and matches no rule.
        following = '' if is_last else word[pos + 1]
        # Vowel followed by a doubled consonant or by sharp s: short variant.
        before_double = (length > pos + 2
                         and word[pos + 1] == word[pos + 2]
                         and not isvowel(word[pos + 1]))
        short_context = before_double or following == sharp_s

        if letter == previous and not isvowel(letter):
            pass  # second letter of a doubled consonant
        elif letter == 'a':
            if following == 'i':  # ai
                phonemes.append('AY0')
            elif following == 'u':  # au
                phonemes.append('AW0')
            elif previous == 'a':
                pass
            elif short_context:
                phonemes.append('AH0')
            else:
                phonemes.append('AA0')
        elif letter == a_uml:
            if following == 'u':  # a-umlaut + u
                phonemes.append('OY0')
            elif short_context:
                phonemes.append('EH0')
            else:
                phonemes.append('EY0')
        elif letter == 'b':
            if is_last or following in ('s', 't'):
                phonemes.append('P')
            else:
                phonemes.append('B')
        elif letter == 'c':
            if previous == 's' and following == 'h':
                phonemes.append('SH')
            elif following == 'h':
                phonemes.append('HH')  # use 'K'??
            else:
                phonemes.append('K')
        elif letter == 'd':
            if is_last or following in ('s', 't'):
                phonemes.append('T')
            else:
                phonemes.append('D')
        elif letter == 'e':
            if previous == 'i':
                pass  # covered under 'i'
            elif pos + 2 == length and following in ('l', 'n', 'r'):
                phonemes.append('EH0')  # -en, -er, -el
            elif following == 'i':  # ei
                phonemes.append('AY0')
            elif following == 'u':  # eu
                phonemes.append('OY0')
            elif following == 'e':  # ee
                phonemes.append('EY0')
            elif previous == 'e':
                pass
            elif short_context:
                phonemes.append('EH0')
            elif is_last and not isvowel(previous):
                phonemes.append('EH0')
            else:
                phonemes.append('EY0')
        elif letter == 'g':
            if previous == 'n':
                phonemes.append('NG')
            elif is_last and previous == 'i':
                phonemes.append('HH')
            elif is_last or following in ('s', 't'):
                phonemes.append('K')
            else:
                phonemes.append('G')
        elif letter == 'h':
            if isvowel(previous) or previous == 'c':
                pass  # silent after a vowel; 'ch' is covered under 'c'
            else:
                phonemes.append('HH')
        elif letter == 'i':
            if previous in ('a', 'e'):
                pass  # covered under other vowel
            elif following == 'e':  # ie
                phonemes.append('IY0')
            elif short_context:
                phonemes.append('IH0')
            elif is_last and not isvowel(previous):
                phonemes.append('IY0')  # also use IH0 here instead?
            else:
                phonemes.append('IH0')
        elif letter == 'n':
            if following == 'g':
                pass  # covered under 'g'
            else:
                phonemes.append('N')
        elif letter == 'o':
            if previous == 'o':
                pass
            else:
                phonemes.append('AO0')  # sometimes o in on, not covered in CMU/USA
        elif letter == o_uml:
            phonemes.append('ER0')
        elif letter == 's':
            if pos == 0 and following in ('p', 't'):
                phonemes.append('SH')
            elif length > pos + 2 and following == 'c' and word[pos + 2] == 'h':
                pass  # covered under 'c'
            elif pos == 0:
                phonemes.append('Z')  # at beginning of word
            else:
                phonemes.append('S')  # default sound - or 'Z' ??
        elif letter == 'u':
            if previous in ('a', a_uml, 'e'):
                pass
            elif previous == 'q':
                phonemes.append('V')
            elif short_context:
                phonemes.append('UH0')
            else:
                phonemes.append('UW0')
        elif letter == u_uml:
            phonemes.append('UW0')
        elif letter == 'x':
            phonemes.append('K')
            phonemes.append('S')
        elif letter == 'z':
            phonemes.append('T')
            phonemes.append('S')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == ' ':
            pass
        elif len(hammer(letter)) == 1:
            # presumably hammer() maps an unsupported character to a plain
            # replacement -- confirm against its definition
            if not recursive:
                phon = breakdownWord(hammer(letter[0]), True)
                if phon:
                    phonemes.append(phon[0])
        previous = letter

    # Collapse runs of identical neighbouring phonemes.
    collapsed = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            collapsed.append(phoneme)
        previous_phoneme = phoneme
    return collapsed
def breakdownWord(word, recursive=False):
    """Translate a word, letter by letter, into a list of mouth-shape codes.

    Vowels and the lip consonants map to grouped codes ('AI', 'E', 'O', 'U',
    'MBP', 'FV', 'WQ', 'L'); the remaining consonants use the lookup table
    below.  Unlike a plain phoneme breakdown, repeated neighbouring codes are
    kept.

    Args:
        word: text to convert; it is lower-cased before processing.
        recursive: True when called on the result of ``hammer``; an
            unsupported character is then skipped instead of recursing again.

    Returns:
        A new list with one or two codes per recognised letter.

    Relies on ``accented_a`` .. ``accented_u`` and ``hammer`` being defined
    at module level.
    """
    word = word.lower()
    consonant_codes = {
        'b': 'B',
        'd': 'D',
        'f': 'F',
        'g': 'G',
        'j': 'JH',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'n': 'N',
        'p': 'P',
        'q': 'K',
        'r': 'R',
        's': 'S',
        't': 'T',
        'v': 'V',
        'w': 'W',
        'y': 'Y',
        'z': 'Z',
        u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'S'  # c with cedilla
    }
    codes = []
    before = ' '
    total = len(word)
    for index, char in enumerate(word):
        has_next = index + 1 < total
        if char in ['a', accented_a]:  # a
            codes.append('AI')
        elif char in ['e', accented_e]:  # e
            codes.append('E')
        elif char in ['i', accented_i]:  # i
            codes.append('AI')
        elif char in ['o', accented_o]:  # o
            codes.append('O')
        elif char in ['u', accented_u]:  # u
            codes.append('U')
        elif char in ('m', 'b', 'p'):  # closed lips
            codes.append('MBP')
        elif char in ('f', 'v'):
            codes.append('FV')
        elif char in ('w', 'q'):
            codes.append('WQ')
        elif char == 'l':
            codes.append('L')
        elif char == 'c':
            if has_next and word[index + 1] == 'h':  # ch
                codes.append('CH')
            elif has_next and word[index + 1] in ['e', 'i', 'y', accented_e, accented_i]:  # ce, ci
                codes.append('S')
            else:  # ca, co, cu, cr and everything else
                codes.append('K')
        elif char == 'h':
            if before not in ['c', 's']:
                codes.append('HH')  # h
        elif char == 's':
            if has_next and word[index + 1] == 'h':
                codes.append('SH')  # sh
            else:
                codes.append('S')  # s
        elif char == 'x':  # x
            if not has_next:
                codes.append('Z')
            else:
                codes.extend(['K', 'S'])
        elif char in consonant_codes:
            codes.append(consonant_codes[char])
        elif char == ' ':
            pass
        elif len(hammer(char)) == 1:
            if not recursive:
                fallback = breakdownWord(hammer(char[0]), True)
                if fallback:
                    codes.append(fallback[0])
        before = char
    # Neighbouring duplicates are intentionally left in place; return a copy.
    return list(codes)
def breakdownWord(word, recursive=False):
    """Break a word into a list of phoneme codes using per-letter rules.

    The vowel-pair handling (ai, ei, oi, ui, yi, au, ou, ie, uo) and the
    a/o-diaeresis letters look like Finnish orthography, with extra entries
    for letters from other languages.
    NOTE(review): confirm the target language against the module header.

    Args:
        word: text to convert; it is lower-cased before processing.
        recursive: True when called on the result of ``hammer``; an
            unsupported character is then skipped instead of recursing again.

    Returns:
        A list of phoneme strings (for example 'AY0', 'NG') in which
        consecutive duplicates have been collapsed.

    Relies on ``hammer`` being defined at module level; it is only consulted
    for characters none of the rules below recognise.
    """
    word = word.lower()
    simple_convert = {
        'd': 'D',
        'h': 'HH',
        'j': 'Y',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'p': 'P',
        'r': 'R',
        's': 'S',
        't': 'T',
        'v': 'V',
        # in foreign and borrowed words and names
        u'\N{LATIN SMALL LETTER S WITH CARON}': 'SH',
        u'\N{LATIN SMALL LETTER Z WITH CARON}': 'ZH',
        u'\N{LATIN SMALL LETTER A WITH ACUTE}': 'AA0',  # ???
        u'\N{LATIN SMALL LETTER A WITH GRAVE}': 'AA0',
        u'\N{LATIN SMALL LETTER AE}': 'AE0',  # Norwegian / Danish
        'b': 'B',
        'c': 'K',  # S ???
        u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'SH',  # French, etc
        u'\N{LATIN SMALL LETTER C WITH CARON}': 'S',  # ??? - Northern Sami
        u'\N{LATIN SMALL LETTER D WITH STROKE}': 'D',  # ??? - Northern Sami
        u'\N{LATIN SMALL LETTER ETH}': 'DH',  # Icelandic
        u'\N{LATIN SMALL LETTER E WITH ACUTE}': 'EY0',
        u'\N{LATIN SMALL LETTER E WITH DIAERESIS}': 'EH0',  # ??? - scientific names
        'f': 'F',
        u'\N{LATIN SMALL LETTER G WITH STROKE}': 'G',  # ??? - other Sami
        u'\N{LATIN SMALL LETTER G WITH BREVE}': 'G',  # ??? - other Sami
        u'\N{LATIN SMALL LETTER N WITH TILDE}': 'N Y',  # Spanish
        u'\N{LATIN SMALL LETTER ENG}': 'N',  # Northern Sami
        u'\N{LATIN SMALL LETTER O WITH STROKE}': 'ER0',  # ??? - Norwegian / Danish
        u'\N{LATIN SMALL LETTER O WITH TILDE}': 'ER0',  # ??? - Estonian
        'q': 'K',
        u'\N{LATIN SMALL LETTER SHARP S}': 'S',  # German
        u'\N{LATIN SMALL LETTER T WITH STROKE}': 'T',  # Northern Sami
        u'\N{LATIN SMALL LETTER THORN}': 'TH',  # Icelandic
        # Never reached: o-diaeresis has its own branch in the loop below.
        # NOTE(review): the original trailing comment mentioned u-diaeresis
        # (German / Estonian); a separate entry for that letter may be
        # missing -- confirm.
        u'\N{LATIN SMALL LETTER O WITH DIAERESIS}': 'ER0',  # ???
        'w': 'V',
        'z': 'Z'
    }
    # Diphthongs are emitted on their second letter, keyed by the first one.
    prev_match_i = {
        'a': 'AY0',  # ai
        'e': 'EY0',  # ei
        'o': 'OY0',  # oi
        'u': 'UW0',  # ui
        'y': 'IY0'  # yi
    }
    prev_match_u = {
        'a': 'AW0',  # au
        'o': 'OW0'  # AO??? # ou
    }

    phonemes = []
    previous = ' '
    for pos, letter in enumerate(word):
        # '' stands for "no following letter" and matches no rule.
        following = word[pos + 1] if pos + 1 < len(word) else ''

        if letter == 'a':
            if following in ('i', 'u'):
                pass  # handled under following letter
            else:
                phonemes.append('AA0')
        elif letter == 'e':
            if following == 'i':
                pass  # handled under following letter
            elif previous == 'i':  # ie
                phonemes.append('IY0')  # ???
            else:
                phonemes.append('EH0')
        elif letter == 'i':
            phonemes.append(prev_match_i.get(previous, 'IH0'))
        elif letter == 'o':
            if following in ('i', 'u'):
                pass  # handled under following letter
            elif previous == 'u':  # uo
                phonemes.append('OW0')  # ???
            else:
                # NOTE(review): a plain 'o' yields the diphthong code 'OY0';
                # kept as in the original -- confirm this is intended.
                phonemes.append('OY0')
        elif letter == 'u':
            if following == 'i':
                pass  # handled under following letter
            else:
                phonemes.append(prev_match_u.get(previous, 'UH0'))
        elif letter == 'y':
            if following == 'i':
                pass  # handled under following letter
            else:
                phonemes.append('UW0')  # ???
        elif letter == u'\N{LATIN SMALL LETTER A WITH DIAERESIS}':
            phonemes.append('AE0')
        elif letter == u'\N{LATIN SMALL LETTER O WITH DIAERESIS}':
            phonemes.append('ER0')  # ???
        elif letter == 'g':
            if previous == 'n':
                phonemes.append('NG')
            else:
                phonemes.append('G')
        elif letter == 'n':
            if following == 'g':
                pass  # handled under g
            else:
                phonemes.append('N')
        elif letter in simple_convert:
            # Split so a two-phoneme entry such as 'N Y' becomes two list
            # items rather than one unknown phoneme.
            phonemes.extend(simple_convert[letter].split())
        elif len(hammer(letter)) == 1:
            # presumably hammer() maps an unsupported character to a plain
            # replacement -- confirm against its definition
            if not recursive:
                phon = breakdownWord(hammer(letter), True)
                if phon:
                    phonemes.append(phon[0])
        previous = letter

    # Collapse runs of identical neighbouring phonemes.
    collapsed = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            collapsed.append(phoneme)
        previous_phoneme = phoneme
    return collapsed
def breakdownWord(word, recursive=False):
    """Break a word into a list of phoneme codes using per-letter rules.

    The vowel-pair handling (ai, ei, oi, ui, yi, au, ou, ie, uo) and the
    a/o-diaeresis letters look like Finnish orthography, with extra entries
    for letters from other languages.
    NOTE(review): confirm the target language against the module header.

    Args:
        word: text to convert; it is lower-cased before processing.
        recursive: True when called on the result of ``hammer``; an
            unsupported character is then skipped instead of recursing again.

    Returns:
        A list of phoneme strings (for example 'AY0', 'NG') in which
        consecutive duplicates have been collapsed.

    Relies on ``hammer`` being defined at module level; it is only consulted
    for characters none of the rules below recognise.
    """
    word = word.lower()
    simple_convert = {
        'd': 'D',
        'h': 'HH',
        'j': 'Y',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'p': 'P',
        'r': 'R',
        's': 'S',
        't': 'T',
        'v': 'V',
        # in foreign and borrowed words and names
        u'\N{LATIN SMALL LETTER S WITH CARON}': 'SH',
        u'\N{LATIN SMALL LETTER Z WITH CARON}': 'ZH',
        u'\N{LATIN SMALL LETTER A WITH ACUTE}': 'AA0',  # ???
        u'\N{LATIN SMALL LETTER A WITH GRAVE}': 'AA0',
        u'\N{LATIN SMALL LETTER AE}': 'AE0',  # Norwegian / Danish
        'b': 'B',
        'c': 'K',  # S ???
        u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'SH',  # French, etc
        u'\N{LATIN SMALL LETTER C WITH CARON}': 'S',  # ??? - Northern Sami
        u'\N{LATIN SMALL LETTER D WITH STROKE}': 'D',  # ??? - Northern Sami
        u'\N{LATIN SMALL LETTER ETH}': 'DH',  # Icelandic
        u'\N{LATIN SMALL LETTER E WITH ACUTE}': 'EY0',
        u'\N{LATIN SMALL LETTER E WITH DIAERESIS}': 'EH0',  # ??? - scientific names
        'f': 'F',
        u'\N{LATIN SMALL LETTER G WITH STROKE}': 'G',  # ??? - other Sami
        u'\N{LATIN SMALL LETTER G WITH BREVE}': 'G',  # ??? - other Sami
        u'\N{LATIN SMALL LETTER N WITH TILDE}': 'N Y',  # Spanish
        u'\N{LATIN SMALL LETTER ENG}': 'N',  # Northern Sami
        u'\N{LATIN SMALL LETTER O WITH STROKE}': 'ER0',  # ??? - Norwegian / Danish
        u'\N{LATIN SMALL LETTER O WITH TILDE}': 'ER0',  # ??? - Estonian
        'q': 'K',
        u'\N{LATIN SMALL LETTER SHARP S}': 'S',  # German
        u'\N{LATIN SMALL LETTER T WITH STROKE}': 'T',  # Northern Sami
        u'\N{LATIN SMALL LETTER THORN}': 'TH',  # Icelandic
        # Never reached: o-diaeresis has its own branch in the loop below.
        # NOTE(review): the original trailing comment mentioned u-diaeresis
        # (German / Estonian); a separate entry for that letter may be
        # missing -- confirm.
        u'\N{LATIN SMALL LETTER O WITH DIAERESIS}': 'ER0',  # ???
        'w': 'V',
        'z': 'Z'
    }
    # Diphthongs are emitted on their second letter, keyed by the first one.
    prev_match_i = {
        'a': 'AY0',  # ai
        'e': 'EY0',  # ei
        'o': 'OY0',  # oi
        'u': 'UW0',  # ui
        'y': 'IY0'  # yi
    }
    prev_match_u = {
        'a': 'AW0',  # au
        'o': 'OW0'  # AO??? # ou
    }

    phonemes = []
    previous = ' '
    for pos, letter in enumerate(word):
        # '' stands for "no following letter" and matches no rule.
        following = word[pos + 1] if pos + 1 < len(word) else ''

        if letter == 'a':
            if following in ('i', 'u'):
                pass  # handled under following letter
            else:
                phonemes.append('AA0')
        elif letter == 'e':
            if following == 'i':
                pass  # handled under following letter
            elif previous == 'i':  # ie
                phonemes.append('IY0')  # ???
            else:
                phonemes.append('EH0')
        elif letter == 'i':
            phonemes.append(prev_match_i.get(previous, 'IH0'))
        elif letter == 'o':
            if following in ('i', 'u'):
                pass  # handled under following letter
            elif previous == 'u':  # uo
                phonemes.append('OW0')  # ???
            else:
                # NOTE(review): a plain 'o' yields the diphthong code 'OY0';
                # kept as in the original -- confirm this is intended.
                phonemes.append('OY0')
        elif letter == 'u':
            if following == 'i':
                pass  # handled under following letter
            else:
                phonemes.append(prev_match_u.get(previous, 'UH0'))
        elif letter == 'y':
            if following == 'i':
                pass  # handled under following letter
            else:
                phonemes.append('UW0')  # ???
        elif letter == u'\N{LATIN SMALL LETTER A WITH DIAERESIS}':
            phonemes.append('AE0')
        elif letter == u'\N{LATIN SMALL LETTER O WITH DIAERESIS}':
            phonemes.append('ER0')  # ???
        elif letter == 'g':
            if previous == 'n':
                phonemes.append('NG')
            else:
                phonemes.append('G')
        elif letter == 'n':
            if following == 'g':
                pass  # handled under g
            else:
                phonemes.append('N')
        elif letter in simple_convert:
            # Split so a two-phoneme entry such as 'N Y' becomes two list
            # items rather than one unknown phoneme.
            phonemes.extend(simple_convert[letter].split())
        elif len(hammer(letter)) == 1:
            # presumably hammer() maps an unsupported character to a plain
            # replacement -- confirm against its definition
            if not recursive:
                phon = breakdownWord(hammer(letter), True)
                if phon:
                    phonemes.append(phon[0])
        previous = letter

    # Collapse runs of identical neighbouring phonemes.
    collapsed = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            collapsed.append(phoneme)
        previous_phoneme = phoneme
    return collapsed
def breakdownSwedishSyllable(word, recursive=False, phonetic=False):
    """Break a Swedish word or syllable into a list of phoneme codes.

    Letters are processed left to right; many digraphs and trigraphs
    (sch, stj, sk, sj, tj, ng, ...) are resolved on one of their letters
    while the other letters emit nothing.

    Args:
        word: text to convert; it is lower-cased before processing.
        recursive: True when called on the result of ``hammer``; an
            unsupported character is then skipped instead of recursing again.
        phonetic: when True, 'e' always yields 'EH0' and a-diaeresis always
            yields 'AE0', skipping their context rules.

    Returns:
        A list of phoneme strings in which consecutive duplicates have been
        collapsed.

    Relies on ``hammer`` and ``input_encoding`` being defined at module
    level.  The 'k' rules call the Python 2 builtin ``unicode``.
    """
    word = word.lower()
    a_uml = u'\N{LATIN SMALL LETTER A WITH DIAERESIS}'
    o_uml = u'\N{LATIN SMALL LETTER O WITH DIAERESIS}'
    a_ring = u'\N{LATIN SMALL LETTER A WITH RING ABOVE}'
    soft_vowels = ('e', 'i', 'y', a_uml, o_uml)
    hard_vowels = ('a', 'o', 'u', a_ring)

    simple_convert = {
        u'\N{LATIN SMALL LETTER E WITH ACUTE}': 'EY0',
        'a': 'AH0',  # not exact - AO0 ??
        'b': 'B',
        'f': 'F',
        'm': 'M',
        'o': 'UH0',  # compromise, actually UW0 or AA0 (not), sometimes AO0
        'q': 'K',
        'v': 'V',
        'w': 'V',
        'z': 'S',
        a_ring: 'AO0',  # not exact
        o_uml: 'ER0',
    }

    phonemes = []
    length = len(word)
    previous = ' '
    for pos, letter in enumerate(word):
        is_last = (pos + 1 == length)
        # '' stands for "no such letter" and matches no rule.
        following = '' if is_last else word[pos + 1]
        after_next = word[pos + 2] if pos + 2 < length else ''

        if letter == 'c':
            if following == 'c':
                pass  # cc, handle on next case
            elif previous == 'c' and following in soft_vowels:
                phonemes.append('K')
                phonemes.append('S')
            elif following in soft_vowels:
                phonemes.append('S')
            elif following == 'h':
                phonemes.append('SH')
            else:
                phonemes.append('K')
        elif letter == 'd':
            if pos == 0 and following == 'j':  # dj at beginning of word
                pass  # same as j alone
            else:
                phonemes.append('D')
        elif letter == 'e':
            if phonetic:
                phonemes.append('EH0')
            elif pos + 2 == length and following == 'r':  # ends in er
                phonemes.append('AE0')
            else:
                phonemes.append('EH0')  # sometimes 'IY0', sometimes 'EY0'
        elif letter == 'g':
            if previous in ('l', 'r'):
                phonemes.append('Y')
            elif following == 'i' and after_next == o_uml:
                phonemes.append('SH')
            elif following == 'n' and previous in ('a', 'o', 'u', 'e', 'i', 'y', a_uml, o_uml):
                phonemes.append('NG')
            elif previous == 'n':  # ng
                phonemes.append('NG')
            elif following == 'j':  # gj
                pass  # same as 'j' alone
            elif pos + 2 == length and following == 'e':
                # ends in 'ge' - French loan-word such as garage ?
                phonemes.append('SH')
            elif pos == 0 and following in soft_vowels:
                # ??? if e is unstressed (how to tell?), pronounce as 'G'
                phonemes.append('Y')
            elif pos == 0 and following in hard_vowels:
                phonemes.append('G')
            elif previous == 'g':
                pass
            else:
                phonemes.append('G')
        elif letter == 'h':
            if previous == 'c':
                pass  # handled under c
            elif following == 'j':
                pass  # same as 'j' alone
            elif pos == 1 and previous == 's':  # probably a foreign loan-word
                phonemes.append('SH')
            else:
                phonemes.append('HH')
        elif letter == 'i':
            if previous == 'g' and following == o_uml:
                pass
            elif previous == 's' and following == 'o':  # sio e.g mission
                phonemes.append('UH0')
            else:
                phonemes.append('IY0')  # sometimes 'IH0'
        elif letter == 'k':  # needs to be handled before j to handle skj sound
            if pos == 0 and word in [u'kefir', u'kex', u'kille', u'kis', u'kissa', u'kisse']:
                phonemes.append('K')
            elif pos == 0 and following in soft_vowels:
                phonemes.append('CH')
            elif (word == unicode('människa', input_encoding)
                  or word == unicode('människor', input_encoding)):
                phonemes.append('SH')
            elif is_last and previous == 's':  # ends in SK
                phonemes.append('S')
                phonemes.append('K')
            elif following == 'j':
                phonemes.append('CH')  # more Finnish-Swedish than Swedish ???
            elif is_last and previous == 'c':
                pass
            elif previous == 's' and following in hard_vowels:
                phonemes.append('S')
                phonemes.append('K')
            elif previous == 's' and pos == 1:  # sk at beginning of word
                phonemes.append('SH')
            else:
                phonemes.append('K')
        elif letter == 't':  # needs to be handled before j to handle stj sound
            if previous == 's' and following == 'j':
                phonemes.append('SH')
            elif previous == 't' and is_last:
                pass
            elif following == 'j':  # tj
                pass  # handled under j
            else:
                phonemes.append('T')
        elif letter == 'j':
            if previous == 's':
                phonemes.append('SH')
            elif previous == 't':
                # Guard the look-behind: at pos == 1 the index pos - 2 is -1,
                # which would read the LAST letter of the word and wrongly
                # treat e.g. "tjus" as an "stj" cluster.
                if pos >= 2 and word[pos - 2] == 's':  # stj, handled under 't'
                    pass
                else:
                    phonemes.append('CH')
            elif previous == 'k':
                pass  # handled under k
            else:
                phonemes.append('Y')
        elif letter == 'l':
            if following == 'j':
                pass  # same as 'j' alone
            else:
                phonemes.append('L')
        elif letter == 'n':
            if following == 'g':  # ng
                pass  # handled under 'g'
            elif following == 'k':  # nk
                phonemes.append('NG')
            else:
                phonemes.append('N')
        elif letter == 'p':
            if previous == 'p':
                pass
            else:
                phonemes.append('P')
        elif letter == 'r':
            if following == 's':
                pass  # handled under s
            else:
                phonemes.append('R')
        elif letter == 's':
            if following == 'c' and after_next == 'h':
                pass  # handled under 'c'
            elif following == 't' and after_next == 'j':
                pass  # handled under 't'
            elif following in ('k', 'j', 's'):
                pass  # handled under 'k' / 'j' / the second 's'
            elif following == 'i' and after_next == 'o':
                phonemes.append('SH')  # might need more breakdown
            elif pos == 0 and following == 'h':
                pass  # handled under 'h'
            elif previous == 'r':
                phonemes.append('SH')  # not entirely accurate, use HH ??
            else:
                phonemes.append('S')
        elif letter == 'u':
            if previous == 'q':
                phonemes.append('V')
            else:
                phonemes.append('UW0')  # inaccurate, no accurate CMU equivalent
        elif letter == 'x':
            phonemes.append('K')
            phonemes.append('S')
        elif letter == 'y':
            if word in [u'yoga', u'yoghurt']:
                phonemes.append('Y')
            elif word == u'fyrtio':
                phonemes.append('ER0')
            else:
                phonemes.append('UW0')  # not exact
        elif letter == a_uml:
            if phonetic:
                phonemes.append('AE0')
            elif following == 'r':
                phonemes.append('AE0')  # not exact, and skips exceptions
            else:
                phonemes.append('EH0')  # not exact, and skips exceptions
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter == " ":
            pass
        elif len(hammer(letter)) == 1:
            # presumably hammer() maps an unsupported character to a plain
            # replacement -- confirm against its definition
            if not recursive:
                phon = " ".join(breakdownSwedishSyllable(hammer(letter), True, phonetic))
                if phon:
                    phonemes.append(phon)
        previous = letter

    # Collapse runs of identical neighbouring phonemes.
    collapsed = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            collapsed.append(phoneme)
        previous_phoneme = phoneme
    return collapsed
def breakdownWord(word, recursive=False):
    """Break a Portuguese word into a list of CMU-style phoneme codes.

    The word is lower-cased and scanned letter by letter.  Each letter
    yields zero or more phoneme codes depending on the letter before it
    and the letter after it; runs of identical consecutive phonemes are
    collapsed into a single entry before returning.

    Known limitations (carried over from the original rules):
      * 'qu'/'gu' before e/i always silences the u, although some words
        pronounce it (cinquenta, frequente, tranquilo, aguentar).
      * 'x' is always 'SH', although some words use "K S" (taxi).

    Args:
        word: the word to convert (unicode text).
        recursive: True when the call is the fallback for a letter that
            was already reduced by ``hammer``; prevents a second fallback.

    Returns:
        A list of phoneme code strings with consecutive duplicates removed.
    """
    word = word.lower()
    # dict.has_key() no longer exists on Python 3; a set offers the same
    # membership test on both Python 2 and Python 3.
    vowels = set(setvowels)

    # e, i and their acute / circumflex forms.  They soften a preceding
    # c or g and silence the u of 'qu' / 'gu'.
    soft_vowels = (
        'e',
        'i',
        u'\N{LATIN SMALL LETTER E WITH ACUTE}',
        u'\N{LATIN SMALL LETTER I WITH ACUTE}',
        u'\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}',
        u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}',
    )
    # Vowels after which a word-final m is not pronounced
    # (i, o, u, their acute forms, i/o circumflex and o tilde).
    silent_final_m_after = (
        'i',
        'o',
        'u',
        u'\N{LATIN SMALL LETTER I WITH ACUTE}',
        u'\N{LATIN SMALL LETTER O WITH ACUTE}',
        u'\N{LATIN SMALL LETTER U WITH ACUTE}',
        u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}',
        u'\N{LATIN SMALL LETTER O WITH CIRCUMFLEX}',
        u'\N{LATIN SMALL LETTER O WITH TILDE}',
    )
    # Vowels whose phoneme does not depend on the neighbouring letters.
    vowel_phonemes = {
        'a': 'AA0',
        u'\N{LATIN SMALL LETTER A WITH ACUTE}': 'AA0',
        u'\N{LATIN SMALL LETTER A WITH GRAVE}': 'AA0',
        u'\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}': 'AA0',
        u'\N{LATIN SMALL LETTER A WITH TILDE}': 'AE0',
        'e': 'EY0',
        u'\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}': 'EY0',
        u'\N{LATIN SMALL LETTER E WITH ACUTE}': 'EH0',
        'i': 'IY0',
        u'\N{LATIN SMALL LETTER I WITH ACUTE}': 'IY0',
        u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}': 'IY0',
        'o': 'OW0',
        u'\N{LATIN SMALL LETTER O WITH CIRCUMFLEX}': 'OW0',
        u'\N{LATIN SMALL LETTER O WITH ACUTE}': 'OY0',
        u'\N{LATIN SMALL LETTER O WITH TILDE}': 'AW0',
    }
    # Consonants with a fixed phoneme.  'm' and 'n' are listed but are
    # always intercepted by their dedicated branches below.
    simple_convert = {
        'b': 'B',
        'd': 'D',
        'f': 'F',
        'j': 'ZH',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'n': 'N',
        'p': 'P',
        'q': 'K',
        'r': 'R',
        't': 'T',
        'v': 'V',
        'w': 'W',
        'y': 'IY0',
        'z': 'Z',
        u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'S',  # c with cedilla
    }

    last = len(word) - 1
    phonemes = []
    previous = ' '
    for pos, letter in enumerate(word):
        has_next = pos < last
        following = word[pos + 1] if has_next else None
        before_soft = has_next and following in soft_vowels
        before_consonant = has_next and following not in vowels

        if letter in vowel_phonemes:
            phonemes.append(vowel_phonemes[letter])
        elif letter in ('u', u'\N{LATIN SMALL LETTER U WITH ACUTE}'):
            # Digraphs 'qu' / 'gu' before e or i: the u is silent and only
            # repeats the consonant (merged by the final de-duplication).
            if before_soft and previous == 'q':
                phonemes.append('K')
            elif before_soft and previous == 'g':
                phonemes.append('G')
            else:
                phonemes.append('UW0')
        elif letter == 'c':
            if previous in ('s', 'x'):
                # 'sc' / 'xc' (ascender, excelente): the s/x emitted nothing,
                # so this branch produces the whole cluster exactly once.
                phonemes.append('S')
                if not before_soft:
                    phonemes.append('K')
            elif before_soft:  # ce, ci
                phonemes.append('S')
            else:
                phonemes.append('K')
        elif letter == 'g':
            phonemes.append('ZH' if before_soft else 'G')  # ge, gi are soft
        elif letter == 'h':
            # Silent, except that it closes the 'nh' digraph.
            if previous == 'n':
                phonemes.append('N')
        elif letter == 'm':
            # Silent when it only nasalises: at the very end of the word
            # after one of the listed vowels, or before a consonant.  The
            # end-of-word test uses the position, so a medial m in a word
            # that merely ends in m (e.g. "comum") is still pronounced.
            if (previous in silent_final_m_after and pos == last) or before_consonant:
                pass
            else:
                phonemes.append('M')
        elif letter == 'n':
            if has_next and following == 'h':
                pass  # 'nh': the N is emitted by the 'h' branch
            elif (previous in vowels and pos == last) or before_consonant:
                pass  # nasalising n: word-final after a vowel, or before a consonant
            else:
                phonemes.append('N')
        elif letter == 's':
            if has_next and following == 'c':
                pass  # 'sc' is produced by the 'c' branch
            elif previous in vowels and has_next and following in vowels:
                phonemes.append('Z')  # s between two vowels (casa)
            else:
                phonemes.append('S')
        elif letter == 'x':
            if has_next and following == 'c':
                pass  # 'xc' is produced by the 'c' branch
            else:
                phonemes.append('SH')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter != ' ':
            # Unknown letter: retry once with whatever hammer() reduces it to
            # (presumably the letter without its diacritic -- confirm in hammer).
            plain = hammer(letter)
            if len(plain) == 1 and not recursive:
                phon = breakdownWord(plain, True)
                if phon:
                    phonemes.append(phon[0])
        previous = letter

    # Collapse runs of identical consecutive phonemes.
    collapsed = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            collapsed.append(phoneme)
        previous_phoneme = phoneme
    return collapsed
def breakdownWord(input_word, recursive=False):
    """Break an Italian word into a list of phoneme codes.

    The word is lower-cased and scanned letter by letter.  The letters
    c, g, h, i, j and l depend on their neighbours; every other letter is
    looked up in ``unconditional_conversions``.  Letters found nowhere are
    retried once through ``hammer``.

    Args:
        input_word: the word to convert (unicode text).
        recursive: True when the call is the fallback for a letter that
            was already reduced by ``hammer``; prevents a second fallback.

    Returns:
        A list of phoneme code strings, one entry per pronounced letter
        or letter group (consecutive duplicates are kept).
    """
    # Lower-casing every word reduces the number of combinations to handle.
    word = input_word.lower()
    length = len(word)

    # i and its accented forms share the same context rule.
    i_phonemes = {
        u'i': 'EH0',
        u'\N{LATIN SMALL LETTER I WITH ACUTE}': 'EH1',
        u'\N{LATIN SMALL LETTER I WITH GRAVE}': 'EH1',
    }

    previous = u''
    breakdown_word = []
    for word_index, letter in enumerate(word):
        # Bounds-checked look-ahead: None stands for "past the end of the
        # word", so a word ending in c, g or gl no longer raises IndexError.
        next_letter = word[word_index + 1] if word_index + 1 < length else None
        after_next = word[word_index + 2] if word_index + 2 < length else None

        if letter == u'c':
            if next_letter in (u'i', u'e'):  # ci, ce
                breakdown_word.append('EH0')
            elif next_letter == u'c' and after_next == u'i':  # cci
                breakdown_word.append('EH0')
            else:
                breakdown_word.append('K')
        elif letter == u'g':
            if next_letter == u'i':  # gi
                breakdown_word.append('JH')
            elif next_letter == u'l' and after_next == u'i':  # gli
                breakdown_word.append('JH')
            else:
                breakdown_word.append('G')
        elif letter in i_phonemes:
            # After c or g the i was already consumed by that consonant.
            if previous != u'c' and previous != u'g':
                breakdown_word.append(i_phonemes[letter])
        elif letter == u'h':
            # ch: the h only hardens the c and has no sound of its own.
            if previous != u'c':
                breakdown_word.append('HH')
        elif letter == u'j':
            # JH everywhere except as the first letter of the word.
            if word_index > 0:
                breakdown_word.append('JH')
            else:
                breakdown_word.append('EH0')
        elif letter == u'l':
            # gli: the l was already consumed by the g.
            if not (previous == u'g' and next_letter == u'i'):
                breakdown_word.append('L')
        elif letter in unconditional_conversions:
            breakdown_word.append(unconditional_conversions[letter])
        elif letter == " ":
            pass
        elif len(hammer(letter)) == 1:
            # Unknown letter: retry once with whatever hammer() reduces it to
            # (presumably the letter without its diacritic -- confirm in hammer).
            if not recursive:
                phon = " ".join(breakdownWord(hammer(letter), True))
                if phon:
                    breakdown_word.append(phon.split()[0])
        previous = letter
    return breakdown_word
def breakdownWord(word, recursive=False):
    """Break a Portuguese word into a list of CMU-style phoneme codes.

    The word is lower-cased and scanned letter by letter.  Each letter
    yields zero or more phoneme codes depending on the letter before it
    and the letter after it; runs of identical consecutive phonemes are
    collapsed into a single entry before returning.

    Known limitations (carried over from the original rules):
      * 'qu'/'gu' before e/i always silences the u, although some words
        pronounce it (cinquenta, frequente, tranquilo, aguentar).
      * 'x' is always 'SH', although some words use "K S" (taxi).

    Args:
        word: the word to convert (unicode text).
        recursive: True when the call is the fallback for a letter that
            was already reduced by ``hammer``; prevents a second fallback.

    Returns:
        A list of phoneme code strings with consecutive duplicates removed.
    """
    word = word.lower()
    # dict.has_key() no longer exists on Python 3; a set offers the same
    # membership test on both Python 2 and Python 3.
    vowels = set(setvowels)

    # e, i and their acute / circumflex forms.  They soften a preceding
    # c or g and silence the u of 'qu' / 'gu'.
    soft_vowels = (
        'e',
        'i',
        u'\N{LATIN SMALL LETTER E WITH ACUTE}',
        u'\N{LATIN SMALL LETTER I WITH ACUTE}',
        u'\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}',
        u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}',
    )
    # Vowels after which a word-final m is not pronounced
    # (i, o, u, their acute forms, i/o circumflex and o tilde).
    silent_final_m_after = (
        'i',
        'o',
        'u',
        u'\N{LATIN SMALL LETTER I WITH ACUTE}',
        u'\N{LATIN SMALL LETTER O WITH ACUTE}',
        u'\N{LATIN SMALL LETTER U WITH ACUTE}',
        u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}',
        u'\N{LATIN SMALL LETTER O WITH CIRCUMFLEX}',
        u'\N{LATIN SMALL LETTER O WITH TILDE}',
    )
    # Vowels whose phoneme does not depend on the neighbouring letters.
    vowel_phonemes = {
        'a': 'AA0',
        u'\N{LATIN SMALL LETTER A WITH ACUTE}': 'AA0',
        u'\N{LATIN SMALL LETTER A WITH GRAVE}': 'AA0',
        u'\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}': 'AA0',
        u'\N{LATIN SMALL LETTER A WITH TILDE}': 'AE0',
        'e': 'EY0',
        u'\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}': 'EY0',
        u'\N{LATIN SMALL LETTER E WITH ACUTE}': 'EH0',
        'i': 'IY0',
        u'\N{LATIN SMALL LETTER I WITH ACUTE}': 'IY0',
        u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}': 'IY0',
        'o': 'OW0',
        u'\N{LATIN SMALL LETTER O WITH CIRCUMFLEX}': 'OW0',
        u'\N{LATIN SMALL LETTER O WITH ACUTE}': 'OY0',
        u'\N{LATIN SMALL LETTER O WITH TILDE}': 'AW0',
    }
    # Consonants with a fixed phoneme.  'm' and 'n' are listed but are
    # always intercepted by their dedicated branches below.
    simple_convert = {
        'b': 'B',
        'd': 'D',
        'f': 'F',
        'j': 'ZH',
        'k': 'K',
        'l': 'L',
        'm': 'M',
        'n': 'N',
        'p': 'P',
        'q': 'K',
        'r': 'R',
        't': 'T',
        'v': 'V',
        'w': 'W',
        'y': 'IY0',
        'z': 'Z',
        u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'S',  # c with cedilla
    }

    last = len(word) - 1
    phonemes = []
    previous = ' '
    for pos, letter in enumerate(word):
        has_next = pos < last
        following = word[pos + 1] if has_next else None
        before_soft = has_next and following in soft_vowels
        before_consonant = has_next and following not in vowels

        if letter in vowel_phonemes:
            phonemes.append(vowel_phonemes[letter])
        elif letter in ('u', u'\N{LATIN SMALL LETTER U WITH ACUTE}'):
            # Digraphs 'qu' / 'gu' before e or i: the u is silent and only
            # repeats the consonant (merged by the final de-duplication).
            if before_soft and previous == 'q':
                phonemes.append('K')
            elif before_soft and previous == 'g':
                phonemes.append('G')
            else:
                phonemes.append('UW0')
        elif letter == 'c':
            if previous in ('s', 'x'):
                # 'sc' / 'xc' (ascender, excelente): the s/x emitted nothing,
                # so this branch produces the whole cluster exactly once.
                phonemes.append('S')
                if not before_soft:
                    phonemes.append('K')
            elif before_soft:  # ce, ci
                phonemes.append('S')
            else:
                phonemes.append('K')
        elif letter == 'g':
            phonemes.append('ZH' if before_soft else 'G')  # ge, gi are soft
        elif letter == 'h':
            # Silent, except that it closes the 'nh' digraph.
            if previous == 'n':
                phonemes.append('N')
        elif letter == 'm':
            # Silent when it only nasalises: at the very end of the word
            # after one of the listed vowels, or before a consonant.  The
            # end-of-word test uses the position, so a medial m in a word
            # that merely ends in m (e.g. "comum") is still pronounced.
            if (previous in silent_final_m_after and pos == last) or before_consonant:
                pass
            else:
                phonemes.append('M')
        elif letter == 'n':
            if has_next and following == 'h':
                pass  # 'nh': the N is emitted by the 'h' branch
            elif (previous in vowels and pos == last) or before_consonant:
                pass  # nasalising n: word-final after a vowel, or before a consonant
            else:
                phonemes.append('N')
        elif letter == 's':
            if has_next and following == 'c':
                pass  # 'sc' is produced by the 'c' branch
            elif previous in vowels and has_next and following in vowels:
                phonemes.append('Z')  # s between two vowels (casa)
            else:
                phonemes.append('S')
        elif letter == 'x':
            if has_next and following == 'c':
                pass  # 'xc' is produced by the 'c' branch
            else:
                phonemes.append('SH')
        elif letter in simple_convert:
            phonemes.append(simple_convert[letter])
        elif letter != ' ':
            # Unknown letter: retry once with whatever hammer() reduces it to
            # (presumably the letter without its diacritic -- confirm in hammer).
            plain = hammer(letter)
            if len(plain) == 1 and not recursive:
                phon = breakdownWord(plain, True)
                if phon:
                    phonemes.append(phon[0])
        previous = letter

    # Collapse runs of identical consecutive phonemes.
    collapsed = []
    previous_phoneme = " "
    for phoneme in phonemes:
        if phoneme != previous_phoneme:
            collapsed.append(phoneme)
        previous_phoneme = phoneme
    return collapsed