-
Notifications
You must be signed in to change notification settings - Fork 0
/
synth.py
695 lines (517 loc) · 27.9 KB
/
synth.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
__author__ = 'B083126'
# Developed with monophones
import sys
import os
import SimpleAudio as SA
import argparse
from nltk.corpus import cmudict
import numpy as np
import re
import datetime as dt
# Command-line interface. Options that merely restate argparse defaults
# (action="store", type=str, default=None, default=False for store_true,
# dest equal to the long flag name) are omitted.
parser = argparse.ArgumentParser(
    description='A basic text - to - speech app that synthesises an input phrase using monophone unit selection.'
)

# Core options
parser.add_argument('--monophones', default="monophones",
                    help="Folder containing monophone wavs")
parser.add_argument('--play', '-p', action="store_true",
                    help="Play the output audio")
parser.add_argument('--outfile', '-o',
                    help="Save the output audio to a file")
parser.add_argument('phrase', nargs='+',
                    help="The phrase to be synthesised")

# Extension options
parser.add_argument('--spell', '-s', action="store_true",
                    help="Spell the phrase instead of pronouncing it")
parser.add_argument('--volume', '-v', type=float,
                    help="A float between 0.0 and 1.0 representing the desired volume")

# NOTE: parsing happens at import time, so importing this module reads sys.argv.
args = parser.parse_args()
def type_check(var_types):
    """Validate a batch of values against their required types.

    Args:
        var_types (list of tuple<object, type, bool>): One tuple per value to check,
            holding the value itself, the type (or tuple of types) it must be an
            instance of, and whether None is also acceptable for it.
            An empty list is valid and checks nothing.
    Raises:
        TypeError: If a value is not an instance of its required type
            (and is not an allowed None).
    """
    for value, required, accept_none in var_types:
        # Identity test: '== None' can be overridden by the value's own __eq__
        if accept_none and value is None:
            continue
        if not isinstance(value, required):
            raise TypeError(str(value) + " is not " + str(required))
class Synth(object):
    """Speech synthesiser which builds, plays and saves audio for a phoneme sequence.

    Attributes:
        out (Audio): The output audio object (SimpleAudio ``Audio``) filled by the synthesiser.
        phones (dict of str : Audio): Audio for each phoneme, keyed by wav file name without
            extension, plus the silent 'comma - break' and 'sentence - break' entries.
        word_phones_dict (dict of str : list): The Carnegie Mellon (CMU) Pronouncing Dictionary
            as returned by ``cmudict.dict()``.
    """

    def __init__(self, wav_folder):
        """Load the phoneme wavs from wav_folder and register the pause entries.

        Args:
            wav_folder (str): Folder containing one wav file per phoneme.
        """
        self.out = SA.Audio(rate=16000)  # Blank output audio, rate 16000
        self.phones = self.get_wavs(wav_folder)
        # Silent pseudo-phonemes used for punctuation pauses (lengths in ms)
        self.add_phone_break('comma - break', 250)
        self.add_phone_break('sentence - break', 500)
        self.word_phones_dict = cmudict.dict()

    def get_wavs(self, wav_folder):
        """Load every .wav file found under a folder.

        Walks wav_folder (including sub-folders) and loads each file whose extension
        is '.wav' (case-insensitive) into a new Audio object.

        Args:
            wav_folder (str): The folder to read from.
        Returns:
            dict: Audio objects keyed by file name without extension, {name: Audio}.
                If two files share a name, the one visited last wins.
        """
        type_check([(wav_folder, str, False)])
        wavs = {}
        for root, _dirs, files in os.walk(wav_folder, topdown=False):
            for filename in files:
                name, ext = os.path.splitext(filename)
                if ext.lower() != '.wav':
                    continue
                audio = SA.Audio()
                # Join with root, not wav_folder: os.walk also yields files
                # from sub-folders, which live below wav_folder.
                audio.load(os.path.join(root, filename))
                wavs[name] = audio
        return wavs

    def add_phone_break(self, name, length, frequency=16000):
        """Add a silent Audio of the given length to the phones dictionary.

        The silence is an int16 numpy array of zeros holding
        length * (frequency // 1000) samples.

        Args:
            name (str): Key under which the pause is stored in self.phones.
            length (int): Length of the pause in ms.
            frequency (int -optional): Sample rate used to size the pause. Defaults to 16000.
        """
        type_check([(name, str, False), (length, int, False), (frequency, int, False)])
        # Floor division keeps the sample count an integer on Python 2 and 3
        samples_per_ms = frequency // 1000
        audio = SA.Audio()
        audio.data = np.zeros(length * samples_per_ms, np.int16)
        self.phones[name] = audio

    def concat_phone_seq(self, phone_seq):
        """Concatenate the audio data of a phoneme sequence into the output.

        Args:
            phone_seq (list of str): Phoneme names, each a key of self.phones.
        Raises:
            KeyError: If a name in phone_seq is not a key of self.phones.
        """
        type_check([(phone_seq, list, False)])
        datas = tuple(self.phones[p].data for p in phone_seq)
        self.out.data = np.concatenate(datas)

    def play_and_save(self, play=False, volume=None, saveout=None):
        """Play and/or save the audio output.

        Args:
            play (bool -optional): Whether to play the audio. Defaults to False.
            volume (float -optional): Play volume, only applied when play is True.
            saveout (str -optional): File name passed to the output's save().
                Defaults to None, in which case no file is written.
        """
        type_check([(play, bool, False), (saveout, str, True)])  # saveout may also be None
        if play:
            self.adjust_volume(volume)
            self.out.play()
        if saveout is not None:
            self.out.save(saveout)
            # NOTE(review): message assumes save() appends '.wav' - confirm against SimpleAudio
            print("Saved audio sequence as " + saveout + ".wav")

    def adjust_volume(self, volume):
        """Rescale the audio output to the given volume.

        Args:
            volume (float): New volume between 0.0 and 1.0 inclusive. None leaves the
                output untouched. Other values are accepted if they can be cast to float.
        Raises:
            ValueError: If volume cannot be cast to float or lies outside 0.0..1.0.
        """
        if volume is None:
            return
        try:
            level = float(volume)
        except (TypeError, ValueError):
            raise ValueError(str(volume) + " must be a float")
        # Reject values on either side of the range (NaN fails this test too)
        if not 0.0 <= level <= 1.0:
            raise ValueError(str(volume) + " must range between 0.0 and 1.0 inclusive")
        self.out.rescale(level)
''' Text processing and normalisation methods
These methods are outside the scope of the Synth class, as they process text before it is input into the synthesiser.
* The synthesiser only takes a phoneme sequence (as a list), so any text must be converted to this format first.
* There are also methods to normalise text (including transforming numbers and dates into spoken text) before
the phoneme conversion.
'''
def normalise_text(tokens, spell=False):
    """Normalise a phrase into a standard list of tokens.

    Strips unwanted punctuation, lowercases the text, converts dates ('dd/mm',
    'dd/mm/yy', 'dd/mm/yyyy') and numbers (decimals and integers) into words,
    then tokenises. Without spelling, tokens are words and separate punctuation
    marks; with spelling, every letter or digit becomes its own token and a
    comma token (a pause) is inserted wherever the phrase had whitespace.

    Args:
        tokens (list of str): Individual tokens which together form a phrase.
        spell (bool -optional): Tokenise by character instead of by word.
            Defaults to False.
    Return:
        list of str: Normalised tokens with no empty strings. They must still be
            converted into a phoneme sequence before the synthesiser can read them.
    """
    type_check([(tokens, list, False), (spell, bool, False)])
    # Regex operations work on one string, so join the tokens with spaces for now
    phrase = ' '.join(tokens)

    # Keep only alphanumerics, whitespace, slashes, commas, full stops, question
    # and exclamation marks, plus apostrophes that sit inside a word.
    # The lookarounds drop any apostrophe lacking a word character on either
    # side, without consuming the neighbouring space (which would merge words).
    phrase = re.sub(r"(?<!\w)'|'(?!\w)", "", phrase)
    phrase = re.sub(r"[^\w\s.,?!'/]", "", phrase).lower()

    # Dates: longest format first so 'dd/mm/yyyy' is not cut short at 'dd/mm/yy'
    date_pattern = r'\b\d{2}/\d{2}/\d{4}\b|\b\d{2}/\d{2}/\d{2}\b|\b\d{2}/\d{2}\b'
    phrase = re.sub(date_pattern, lambda m: normalise_date(m.group(), spell), phrase)

    # Decimal numbers, before any full stop gets separated as punctuation
    phrase = re.sub(r'\d+\.\d+', lambda m: normalise_number(m.group()), phrase)

    if spell:
        # Mark each run of whitespace with a temporary ';'
        # (safe: any ';' in the input was stripped above)
        phrase = re.sub(r'\s+', ';', phrase)
        # Surround every letter and digit with spaces so they split apart
        phrase = re.sub(r'(\w)', r' \1 ', phrase)
        # Turn the temporary markers into comma tokens, i.e. pauses between words
        phrase = phrase.replace(';', ' , ')
    else:
        # Separate punctuation from words (not apostrophes, they belong to a word)
        phrase = re.sub(r'([.,!?/])', r' \1 ', phrase)

    # Integers (decimals were handled above; these may be left over from dates)
    phrase = re.sub(r'\d+', lambda m: normalise_number(m.group()), phrase)

    # Splitting on any whitespace run also discards empty tokens
    return phrase.split()
def normalise_number(match):
    """Normalise an integer or decimal number into a spoken word sequence.

    Numbers of two or three digits are read as tens / hundreds ("forty two",
    "one hundred and five"). Anything else - single digits, numbers longer than
    three digits, numbers with a leading zero and the digits after a decimal
    point - is read digit by digit.

    Args:
        match (str/int): The digits to convert. An int is cast to str first.
            Note: the number is handled as a string so that each digit is easy
            to separate and any zeros are kept.
    Return:
        str: Spoken text normalisation of the number.
    """
    if isinstance(match, int):
        match = str(match)
    type_check([(match, str, False)])

    # Unit names, indexed by digit
    units = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']
    # Irregular tens and teens; any other digit is unit name + 'ty' / 'teen'
    xty = {0: '', 1: 'ten', 2: 'twenty', 3: 'thirty', 4: 'forty', 5: 'fifty', 8: 'eighty'}
    xteen = {1: 'eleven', 2: 'twelve', 3: 'thirteen', 5: 'fifteen', 8: 'eighteen'}

    def get_xty(t, pre=""):
        """Append the name of the tens digit t (e.g. 4 -> 'forty') to pre.

        Args:
            t (int): The tens digit
            pre (str): The text evaluated so far.
        """
        type_check([(t, int, False), (pre, str, False)])
        if t in xty:
            pre += xty[t]
        else:
            pre += units[t] + 'ty'
        return pre.rstrip()

    def get_xteen(u, pre=""):
        """Append the teen name for the units digit u (e.g. 4 -> 'fourteen') to pre.

        Args:
            u (int): The units digit
            pre (str): The text evaluated so far.
        """
        type_check([(u, int, False), (pre, str, False)])
        if u in xteen:
            pre += xteen[u]
        else:
            pre += units[u] + 'teen'
        return pre.rstrip()

    def num_to_word(nums, pre="", split_digits=False):
        """Recursively convert a string of digits to words, left to right.

        Args:
            nums (str): The digits still left to be evaluated.
            pre (str): The text evaluated so far.
            split_digits (bool -optional): Read every digit as a unit.
        Return:
            str: pre followed by the words for nums.
        """
        type_check([(nums, str, False), (pre, str, False), (split_digits, bool, False)])
        if not nums:
            # Nothing left to read
            return pre.rstrip()

        # A leading zero only counts on the first call, while pre is still empty
        leading_zero = pre == '' and nums[0] == '0'
        if leading_zero or len(nums) not in (2, 3):
            split_digits = True

        if split_digits:
            spoken = ' '.join(units[int(d)] for d in nums)
            return (pre + spoken).rstrip()

        if len(nums) == 2:
            t, u = int(nums[0]), int(nums[1])
            if u == 0:
                # Multiple of ten: nothing left after the tens name
                return get_xty(t, pre)
            if t == 1:
                # Teens are a single word
                return get_xteen(u, pre)
            # Tens name, then recurse on the remaining unit digit
            return num_to_word(nums[1:], get_xty(t, pre) + ' ')

        # Three digits: hundreds
        h, t, u = int(nums[0]), int(nums[1]), int(nums[2])
        pre += units[h] + ' hundred'
        if t == u == 0:
            # Exact hundred: no "one hundred and zero"
            return pre.rstrip()
        return num_to_word(nums[1:], pre + ' and ')

    if '.' in match:
        # Decimal: read the whole part as a number, the fraction digit by digit
        parts = match.split('.')
        integ = num_to_word(parts[0])
        fract = num_to_word(parts[1], split_digits=True)
        return integ + ' point ' + fract
    return num_to_word(match)
def normalise_date(match, spell=True):
    """Normalise a date into a spoken word sequence.

    The date is validated with datetime.strptime; a valid date becomes
    "the <ordinal day> of <month name>[ <year in words>]". A two digit year is
    first expanded to the closest four digit year. An invalid date is returned
    unchanged so the caller can read it as plain numbers and slashes.

    Args:
        match (str): The matched date, in the format 'dd/mm', 'dd/mm/yy' or 'dd/mm/yyyy'.
            Note: day and month must have two digits (leading 0 where needed).
        spell (bool): If True, only separate the characters with spaces, with no
            conversion to words.
    Return:
        str: Spoken text normalisation of the date.
    """
    type_check([(match, str, False), (spell, bool, False)])

    def get_ordinal(num):
        """Return the ordinal name for a number, e.g. 21 -> 'twenty first'.

        Args:
            num (int): The number; integers up to 99 are evaluated correctly.
        Return:
            str: The input as a written ordinal number.
        """
        type_check([(num, int, False)])
        # Irregular ordinals; everything else is number name + 'th'
        ordinals = {0: '', 1: 'first', 2: 'second', 3: 'third', 5: 'fifth', 8: 'eighth', 9: 'ninth',
                    12: 'twelfth', 20: 'twentieth', 30: 'thirtieth'}
        # num with its last digit replaced by 0
        floor = num // 10 * 10
        if floor > 10 and num != floor:
            # 21..29, 31..39, ...: tens name followed by the ordinal of the unit
            prefix = normalise_number(floor) + ' '
            unit = num - floor
        else:
            # Below 20, and exact multiples of ten, are suffixed as a whole
            prefix = ''
            unit = num
        if unit in ordinals:
            return prefix + ordinals[unit]
        return prefix + normalise_number(unit) + 'th'

    def get_closest_year(num):
        """Expand the last two digits of a year to the closest four digit year.

        Tries the current, next and previous century and keeps the candidate
        nearest to the current year (the earlier-listed candidate wins a tie).

        Args:
            num (int): The last two digits of the year.
        Return:
            str: The closest year.
        """
        type_check([(num, int, False)])
        current_year = dt.datetime.now().year
        current_century = current_year // 100 * 100
        base = current_century + num
        candidates = [base, base + 100, base - 100]
        return str(min(candidates, key=lambda y: abs(y - current_year)))

    def normalise_year(year):
        """Read a year aloud following English conventions.

        Produces forms such as "two thousand and one", "nineteen hundred",
        "nineteen oh two", "nineteen eighty four" and "twenty fifteen".
        A year that does not have four digits is read as a plain number.

        Args:
            year (str): The year as a string of digits.
        Return:
            str: Spoken text normalisation of the year.
        """
        type_check([(year, str, False)])
        if len(year) != 4:
            return normalise_number(year)
        year = int(year)
        # Split into the two leading digits and the two trailing digits
        century = year // 100
        units = year - century * 100
        words_units = normalise_number(units)
        # "X thousand [and Y]": X000..X009 for any thousand except 1000
        if century > 10 and century % 10 == 0 and units < 10:
            words = normalise_number(century // 10) + ' thousand'
            if units != 0:
                words += ' and ' + words_units
            return words
        words_century = normalise_number(century)
        if units == 0:
            return words_century + ' hundred'
        if units < 10:
            return words_century + ' oh ' + words_units
        return words_century + ' ' + words_units

    # When spelling, only separate the characters; the date is not turned into words
    if spell:
        return " ".join(match)

    months = ['january', 'february', 'march', 'april', 'may', 'june', 'july',
              'august', 'september', 'october', 'november', 'december']
    date_segments = match.split('/')
    if len(date_segments) < 2:
        # Not a date at all; hand it back untouched
        return match
    day, month = date_segments[0], date_segments[1]
    # The year is optional
    year = date_segments[2] if len(date_segments) > 2 else None
    if year is not None and len(year) == 2:
        year = get_closest_year(int(year))

    # strptime needs a year; without one use a leap year (2016) so the
    # 29th of February stays a valid possibility.
    test_year = year if year is not None else '2016'
    try:
        dt.datetime.strptime(day + '/' + month + '/' + test_year, '%d/%m/%Y')
    except ValueError:
        # Out-of-range value: leave the match to be read as numbers and slashes
        return match

    concat = 'the ' + get_ordinal(int(day)) + ' of ' + months[int(month) - 1]
    if year is not None:
        concat += ' ' + normalise_year(year)
    return concat
def get_phone_seq(tokens, word_phones_dict, override_phones=None):
    """Convert a list of normalised tokens into a phoneme sequence for the synthesiser.

    Each token is resolved in this order: the override dictionary, the comma
    pause, the sentence-ending pause ('.', '!', '?'), then the word-phoneme
    dictionary (taking the first pronunciation listed). Empty tokens are skipped.
    If a token cannot be resolved a message is printed and the program exits.

    Args:
        tokens (list of str): Tokens which have been previously normalised.
        word_phones_dict (dict of str : list): Words mapped to their list of phoneme
            sequences, e.g. the CMU Pronouncing Dictionary held by the Synth class.
        override_phones (dict of str : list -optional): Extra rules mapping a token
            directly to one phoneme sequence; checked before everything else.
            Defaults to no overrides.
    Return:
        list of str: Lowercased phoneme names with stress digits removed.
    Raises:
        SystemExit: If a token is found in none of the sources.
    """
    if override_phones is None:
        override_phones = {}
    type_check([(tokens, list, False), (word_phones_dict, dict, False),
                (override_phones, dict, False)])

    terminating_punc = ('.', '!', '?')
    phone_seq = []
    for token in tokens:
        if token == "":
            # Skip outright, so the previous token's phones are not appended again
            continue
        if token in override_phones:
            phones = override_phones[token]
        elif token == ',':
            # Pause entry stored in the Synth class phones dict
            phones = ['comma - break']
        elif token in terminating_punc:
            # Longer pause at the end of a sentence, likewise stored in Synth
            phones = ['sentence - break']
        elif token in word_phones_dict:
            # Just pick the first pronunciation
            phones = word_phones_dict[token][0]
        else:
            print("Sorry, '" + token + "' is not in the CMU dictionary.")
            sys.exit(1)  # Non-zero status: the phrase could not be synthesised
        phone_seq.extend(phones)

    # Clean out phoneme stress digits and lowercase
    return [re.sub(r"\d+", "", phone).lower() for phone in phone_seq]
if __name__ == "__main__":
    # Extra pronunciations which take priority over the CMU dictionary
    # held by the Synth object.
    special_dict = {
        # Replacement for the default 'Z' sound
        'z': [u'Z', u'EH1', u'D'],
        # Names of individual punctuation marks (for spelling)
        '[, ]': [u'K', u'AA1', u'M', u'AH0'],
        '[.]': [u'F', u'UH1', u'L', u'S', u'T', u'AA1', u'P'],
        '[?]': [u'K', u'W', u'EH1', u'S', u'CH', u'AH0', u'N', u'M', u'AA1', u'R', u'K'],
        '[!]': [u'IH0', u'K', u'S', u'K', u'L', u'AH', u'M', u'EY1', u'SH', u'AH0', u'N',
                u'M', u'AA1', u'R', u'K'],
        "[']": [u'AH0', u'P', u'AA1', u'S', u'T', u'R', u'AH0', u'F', u'IY0'],
        "[/]": [u'S', u'L', u'AE1', u'SH'],
        # A bare slash is spoken even without spelling, e.g. in invalid dates
        "/": [u'S', u'L', u'AE1', u'SH'],
    }

    # Synthesiser loaded with the phoneme wavs from the monophones folder
    synth = Synth(wav_folder=args.monophones)

    # Phrase -> normalised tokens -> phoneme sequence
    phone_seq = get_phone_seq(
        normalise_text(args.phrase, spell=args.spell),
        synth.word_phones_dict,
        special_dict,
    )

    # When not playing, show the sequence instead
    if not args.play:
        print("Phoneme sequence: \n \t" + str(phone_seq))

    # Build the audio, then play and/or save it as requested
    synth.concat_phone_seq(phone_seq)
    synth.play_and_save(args.play, args.volume, args.outfile)