forked from AmmarahCassim/FullStackSolutionsCapstoneProject
-
Notifications
You must be signed in to change notification settings - Fork 0
/
english_breakdown.py
190 lines (177 loc) · 7.67 KB
/
english_breakdown.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
#!/usr/local/bin/python
# -*- coding: cp1252 -*-
# this language module is written to be part of
# Papagayo-NG, a lip-sync tool for use with several different animation suites
# Original Copyright (C) 2005 Mike Clifton
#
# this module Copyright (C) 2016 Azia Giles Abuara
# Contact information at aziacomics-com.webs.com, aziagiles@gmail.com
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
'''
***TUTORIAL ON USING THIS MODULE
-Spell each word the way it is pronounced rather than the way it is normally written, using only the English
alphabet plus the digraphs 'sh' and 'ch'. In summary, use just these 28 sounds to spell your words, i.e.
a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,ch,sh
***E.g.:
'laugh' is spelled 'laf', 'the' as 'de', 'say' as 'seh', 'nation' as 'nehshon', 'chalk' as 'cholk', 'chlorine' as 'klorin',
'genius' as 'jinius', 'pharmacy' as 'famaci', 'cough' as 'cof', 'ghetto' as 'gheto', etc.
***Objective: The idea behind this module is to help you break down any other language or dialect in terms of English sounds.
'''
"""functions to take any Word in Any Language or Dialect and return a list of phonemes
"""
# from breakdowns.unicode_hammer import latin1_to_ascii as hammer
from unicode_hammer import latin1_to_ascii as hammer
import locale, sys, json
import string
# Encoding name of the default locale, used when decoding input words.
# locale.getdefaultlocale() returns a (language code, encoding) pair, and either
# entry may be None when the locale cannot be determined, so input_encoding
# may be None as well.
input_encoding = locale.getdefaultlocale()[1] # standard system encoding??
# Commented-out alternatives that hard-code the encoding instead:
# input_encoding = 'cp1252'
# input_encoding = 'utf-8'
# input_encoding = 'utf-16'
# input_encoding = 'latin-1'
# input_encoding = 'iso-8859-1'
# lists containing different accented vowels
accented_a = [u'\N{LATIN SMALL LETTER A WITH ACUTE}', u'\N{LATIN SMALL LETTER A WITH GRAVE}', u'\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}', u'\N{LATIN SMALL LETTER A WITH TILDE}', u'\N{LATIN SMALL LETTER A WITH DIAERESIS}', u'\N{LATIN SMALL LETTER A WITH RING ABOVE}', u'\N{LATIN SMALL LETTER AE}']
accented_e = [u'\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}', u'\N{LATIN SMALL LETTER E WITH DIAERESIS}', u'\N{LATIN SMALL LETTER E WITH GRAVE}', u'\N{LATIN SMALL LETTER E WITH ACUTE}', u'\N{LATIN SMALL LIGATURE OE}']
accented_i = [u'\N{LATIN SMALL LETTER I WITH ACUTE}', u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}', u'\N{LATIN SMALL LETTER I WITH GRAVE}', u'\N{LATIN SMALL LETTER I WITH DIAERESIS}']
accented_o = [u'\N{LATIN SMALL LETTER O WITH CIRCUMFLEX}', u'\N{LATIN SMALL LETTER O WITH DIAERESIS}', u'\N{LATIN SMALL LETTER O WITH STROKE}',u'\N{LATIN SMALL LETTER O WITH GRAVE}',u'\N{LATIN SMALL LETTER O WITH ACUTE}',u'\N{LATIN SMALL LETTER O WITH TILDE}']
accented_u = [u'\N{LATIN SMALL LETTER U WITH ACUTE}', u'\N{LATIN SMALL LETTER U WITH GRAVE}',u'\N{LATIN SMALL LETTER U WITH CIRCUMFLEX}',u'\N{LATIN SMALL LETTER U WITH DIAERESIS}']


def _build_letter_phonemes():
    """Return a dict mapping every context-free letter to its phoneme code."""
    table = {
        # vowels ('a' and 'i' both map to the 'AI' code)
        'a': 'AI',
        'e': 'E',
        'i': 'AI',
        'o': 'O',
        'u': 'U',
        # consonants that share a combined code
        'm': 'MBP',
        'b': 'MBP',
        'p': 'MBP',
        'f': 'FV',
        'v': 'FV',
        'w': 'WQ',
        'q': 'WQ',
        # consonants with a code of their own
        'l': 'L',
        'd': 'D',
        'g': 'G',
        'j': 'JH',
        'k': 'K',
        'n': 'N',
        'r': 'R',
        't': 'T',
        'y': 'Y',
        'z': 'Z',
        u'\N{LATIN SMALL LETTER C WITH CEDILLA}': 'S',
    }
    # An accented vowel gets the same code as its plain counterpart. The
    # lists are flattened into the table here; testing ``letter in
    # ['a', accented_a]`` would compare the letter against the whole list
    # object and therefore never match an accented character.
    vowel_groups = (
        ('a', accented_a),
        ('e', accented_e),
        ('i', accented_i),
        ('o', accented_o),
        ('u', accented_u),
    )
    for plain, variants in vowel_groups:
        for variant in variants:
            table[variant] = table[plain]
    return table


# Letters whose phoneme does not depend on the neighbouring letters.
_LETTER_PHONEMES = _build_letter_phonemes()
# Letters that turn a preceding 'c' into 'S' ("ce", "ci", "cy").
_SOFT_C_FOLLOWERS = frozenset(['e', 'i', 'y'] + accented_e + accented_i)


def breakdownWord(word, recursive=False):
    """Break a phonetically spelled word down into a list of phoneme codes.

    The word is lower-cased and processed letter by letter:

    * vowels (plain or accented) and most consonants map to a fixed code;
    * 'ch' gives 'CH', 'c' before e/i/y gives 'S', any other 'c' gives 'K';
    * 'sh' gives 'SH', any other 's' gives 'S';
    * 'h' directly after 'c' or 's' is silent, otherwise it gives 'HH';
    * 'x' gives 'Z' as the last letter of the word, otherwise 'K', 'S';
    * spaces are skipped;
    * any other character is passed through hammer() and, if that yields a
      single character, the first phoneme of that character is used.

    Parameters:
        word: the text to break down (a unicode string is expected for
            accented input).
        recursive: True only for the internal call made on a hammer()
            result; it prevents a second level of fallback.

    Returns:
        A new list of phoneme code strings, empty for an empty word.
    """
    word = word.lower()
    length = len(word)
    phonemes = []
    for pos, letter in enumerate(word):
        fixed = _LETTER_PHONEMES.get(letter)
        if fixed is not None:
            phonemes.append(fixed)
            continue

        # '' stands for "no following letter"; it matches none of the tests.
        following = word[pos + 1] if pos + 1 < length else ''
        if letter == 'c':
            if following == 'h':  # ch
                phonemes.append('CH')
            elif following in _SOFT_C_FOLLOWERS:  # ce, ci, cy
                phonemes.append('S')
            else:  # ca, co, cu, cr, final c, ...
                phonemes.append('K')
        elif letter == 'h':
            # The 'h' of "ch"/"sh" was already consumed by the 'c'/'s' branch.
            preceding = word[pos - 1] if pos else ' '
            if preceding not in ('c', 's'):
                phonemes.append('HH')
        elif letter == 's':
            phonemes.append('SH' if following == 'h' else 'S')
        elif letter == 'x':
            if pos + 1 == length:
                phonemes.append('Z')
            else:
                phonemes.append('K')
                phonemes.append('S')
        elif letter == ' ':
            pass
        else:
            # Unknown character: reduce it with hammer() and retry once.
            plain = hammer(letter)
            if len(plain) == 1 and not recursive:
                fallback = breakdownWord(plain, True)
                if fallback:
                    phonemes.append(fallback[0])
    return phonemes
def read_in():
    """Read standard input and return its first line parsed as JSON.

    All of standard input is consumed, but only the first line is parsed;
    later lines are ignored.

    Returns:
        The decoded JSON value, or an empty list when standard input is
        empty or its first line is blank (instead of failing with an
        IndexError or a JSON decoding error).

    Raises:
        ValueError: if the first line is non-blank but not valid JSON.
    """
    lines = sys.stdin.readlines()
    if not lines or not lines[0].strip():
        return []
    return json.loads(lines[0])
def main():
    """Print a dict mapping each word read from stdin to its phoneme list.

    The words come from read_in(). Each one is encoded to ASCII with the
    'ignore' error handler (characters outside ASCII are dropped), has its
    apostrophes removed, and is then passed to breakdownWord(). The cleaned
    word is the dict key, so words that become identical after cleaning
    share one entry.
    """
    wordMapping = {}
    for item in read_in():
        word = item.encode('ascii', 'ignore').replace(b"'", b"")
        # The bytes are pure ASCII here, so decode them as ASCII instead of
        # using the locale encoding, which may be None or not ASCII-compatible.
        wordMapping[word] = breakdownWord(word.decode('ascii'))
    # With a single argument this prints the same text as the Python 2
    # statement form "print wordMapping".
    print(wordMapping)
# Run the stdin-to-phoneme conversion only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
''' testwords = ['okay','lets','party']
#testwords = sys.argv[1]
for word in testwords:
#print(str(word, breakdownWord(unicode(word, input_encoding))))
print(word, breakdownWord(unicode(word, input_encoding))) '''