def crossfade_diphones(self, diphone_seq, crossover):
    """Crossfade every item of a diphone sequence into ``self.out``.

    Each item is looked up in ``self.diphones``; when found, the wav file
    it maps to is loaded and handed to ``self.crossfade_arrays``. Items
    without a recording are treated as punctuation: ``! ? : .`` insert the
    long pause and ``,`` the short pause. Anything else is reported on
    stdout and skipped.

    Args:
        diphone_seq: iterable of diphone names and punctuation marks.
        crossover: number of points to crossfade over; passed unchanged to
            ``self.crossfade_arrays``.
    """
    # Number of samples in one millisecond at the output rate.
    milliseconds = int(self.out.rate * 0.001)

    # Pre-build the two pauses once: 400 ms and 200 ms worth of samples.
    longpause = simpleaudio.Audio()
    longpause.create_tone(0, 400 * milliseconds, 1)
    shortpause = simpleaudio.Audio()
    shortpause.create_tone(0, 200 * milliseconds, 1)

    for diphone in diphone_seq:
        # Only the dictionary lookup is guarded, so a KeyError raised while
        # loading or crossfading is not mistaken for a missing diphone.
        try:
            wav_path = self.diphones[diphone]
        except KeyError:
            if diphone in ('!', '?', ':', '.'):
                self.crossfade_arrays(longpause, crossover)
            elif diphone == ',':
                self.crossfade_arrays(shortpause, crossover)
            else:
                # Neither a known diphone nor supported punctuation.
                print(
                    'Sorry, I am unable to retrieve the audio for the diphone {}. '
                    'Please recheck that it is in the diphones folder supplied.'
                    .format(diphone))
            continue

        diphone_audio = simpleaudio.Audio()
        diphone_audio.load(wav_path)
        self.crossfade_arrays(diphone_audio, crossover)
def get_letters_voice(self, letter_phone_list):
    """Build and play the audio for a sequence of phone symbols.

    Every symbol is mapped through ``self.phones`` to a file, loaded, and
    appended to one running ``Audio``. The result is then slowed with
    ``change_speed(0.4)``, rescaled to ``self.vol`` and played.

    Args:
        letter_phone_list: iterable of phone symbols (strings).
    """
    spoken = simpleaudio.Audio()

    for symbol in letter_phone_list:
        # A symbol that starts with one or two capitals followed by "1"
        # loses its final character before the lookup.
        key = symbol[:-1] if re.match('[A-Z]{1,2}[1]', symbol) else symbol

        clip = simpleaudio.Audio()
        clip.load(self.phones[key])
        spoken.data = np.append(spoken.data, clip.data)

    spoken.change_speed(0.4)
    spoken.rescale(self.vol)
    spoken.play()
def get_words_voice(self, word_phone_list):
    """Append the audio for a tokenised utterance to ``self.res_voice``.

    Tokens are handled in order:

    * Extension B (punctuation): ``,`` appends 0.25 s of samples produced
      by ``create_tone(0, ..., 0)``; ``.``, ``!`` and ``?`` append 0.5 s.
    * ``{`` and ``}`` are markup only and produce no audio.
    * Any other token is treated as a sequence of phones. Each phone is
      reduced to its capital letters, looked up in ``self.phones``, loaded
      and appended.
    * Extension D (emphasis): for a word that directly follows ``{``, the
      samples of every phone except the last are multiplied by 5.

    Args:
        word_phone_list: list whose items are punctuation strings, brace
            markers, or sequences of phone strings.
    """
    capitals = list('QWERTYUIOPASDFGHJKLZXCVBNM')
    # Kept as lists: tokens may be unhashable (lists of phones).
    pause_marks = list(',.!?')
    brace_marks = list('{}')

    for position, token in enumerate(word_phone_list):
        if token in pause_marks:
            seconds = 0.25 if token == ',' else 0.5
            gap = simpleaudio.Audio(rate=16000)
            gap.create_tone(0, int(seconds * gap.rate), 0)
            self.res_voice.data = np.append(self.res_voice.data, gap.data)
            continue

        if token in brace_marks:
            continue

        follows_open_brace = (position > 0
                              and word_phone_list[position - 1] == '{')
        final_phone = len(token) - 1

        for phone_pos, phone in enumerate(token):
            # Strip everything that is not a capital letter.
            phone_key = ''.join(ch for ch in phone if ch in capitals)

            clip = simpleaudio.Audio()
            clip.load(self.phones[phone_key])

            # NOTE(review): the last phone of an emphasised word is never
            # amplified (so one-phone words get no emphasis) - confirm
            # this is intended.
            if follows_open_brace and phone_pos != final_phone:
                clip.data = clip.data * 5

            self.res_voice.data = np.append(self.res_voice.data, clip.data)
def get_audio(self, rate=RATE):
    """
    Return the synthesized output as an `Audio` object.

    The sample arrays of every diphone in `self.diphones` are looked up in
    `self.audio` (keyed by `self.get_filename(diphone)`), concatenated in
    order, and rescaled with `rescale(1.0)`.

    - `rate`: sampling rate given to the output `Audio` object
    """
    # One sample array per diphone, in sequence order
    chunks = [
        self.audio[self.get_filename(diphone)].data
        for diphone in self.diphones
    ]

    output = simpleaudio.Audio(rate=rate)
    output.data = numpy.concatenate(chunks)
    output.rescale(1.0)
    return output
def __init__(self, diphones, directory):
    """
    Set up the synthesizer and preload the audio it needs.

    - `diphones` (list): sequence of diphones to synthesize
    - `directory`: folder in which the diphone files are looked up

    After construction `self.audio` maps each distinct filename (as
    returned by `self.get_filename`) to a loaded `Audio` object. The
    process exits via `sys.exit` if a required file is missing.
    """
    self.diphones = diphones
    self.audio = {}

    for filename in map(self.get_filename, self.diphones):
        # Each file is loaded once, however often its diphone occurs
        if filename in self.audio:
            continue

        path = os.path.join(directory, filename)
        if not os.path.isfile(path):
            sys.exit(f"Couldn't locate '{filename}'")

        loaded = simpleaudio.Audio()
        loaded.load(path)
        self.audio[filename] = loaded
def concat_diphones(self, diph_emphasis=None, smoother=False):
    """
    Description: Join the audio of every diphone in self.diphone_seq into one Audio object
    Input      : diph_emphasis - collection of diphone indices to emphasise (Extension D);
                                 None or empty disables emphasis
                 smoother      - if True, neighbouring diphones are joined with a
                                 160-sample linear cross-fade (Extension E) instead of
                                 being butted together
    Output     : An Audio instance (rate 16000) holding the concatenated samples
    NOTE       : Assumes every diphone array holds at least 160 samples when
                 smoother is used.
    """
    fade_len = 160  # 10 ms at the 16 kHz rate used below
    silences = ("s_short", "s_long")

    # The default (None) means "no emphasis" instead of crashing on len(None)
    emphasis = () if diph_emphasis is None else diph_emphasis
    use_emphasis = len(emphasis) > 0

    # Audio instance to store the TTS audio output
    output = simpleaudio.Audio(rate=16000)

    last_position = len(self.diphone_seq) - 1
    # Emphasis indices are matched against the position among NON-silence
    # diphones, exactly as before.
    # NOTE(review): confirm this matches how the caller builds diph_emphasis.
    emphasis_index = 0

    for position, each_diphone in enumerate(self.diphone_seq):
        temp_diphone = simpleaudio.Audio(rate=16000)
        # Work on a copy: the fades below write in place, and the cached
        # array in self.diphones is shared by every repeat of this diphone.
        temp_diphone.data = np.array(self.diphones[each_diphone])

        is_silence = each_diphone in silences

        # Extension D Emphasis markup.
        # Silence is never rescaled (avoids a numpy warning) but, unlike
        # before, it is still concatenated into the output.
        if use_emphasis and not is_silence:
            if emphasis_index in emphasis:
                # (EXTRA) Loud fricatives are unpleasant, so they get a slightly lower volume
                if 's' in each_diphone or 'th' in each_diphone or 'f' in each_diphone:
                    adjust_value = 0.60
                else:
                    adjust_value = 0.65
                temp_diphone.rescale(adjust_value)
            elif emphasis_index - 1 in emphasis:
                # (EXTRA) Soften the step down right after an emphasised diphone
                temp_diphone.rescale(0.525)

        samples = temp_diphone.data

        if not smoother:
            # Normal concatenation without smoother
            output.data = np.concatenate((output.data, samples))
        else:
            # Extension E - Smoother Concatenation.
            # Ramp both edges linearly from 0 at the edge towards full level.
            fade_in = position > 0                # not for the first diphone
            fade_out = position < last_position   # not for the last diphone
            for step in range(fade_len):
                level = float(step)  # float keeps the product out of int16 range
                if fade_in:
                    samples[step] = samples[step] * level / fade_len
                if fade_out:
                    samples[-(step + 1)] = samples[-(step + 1)] * level / fade_len

            if position == 0:
                # First diphone: nothing to overlap with yet
                output.data = np.concatenate((output.data, samples))
            else:
                # Overlap-add the faded-in head onto the faded-out tail
                # already in the output, then append the remainder.
                output.data[-fade_len:] = output.data[-fade_len:] + samples[:fade_len]
                output.data = np.concatenate((output.data, samples[fade_len:]))

        if not is_silence:
            emphasis_index += 1

    return output
def get_wavs(self, wav_folder):
    """
    Description: Load the sample array of every distinct diphone in self.diphone_seq
    Input      : wav_folder - folder that is walked (recursively) for diphone files
    Output     : A dictionary {diphone name: sample array}
    NOTE       : (1) Each distinct diphone is loaded once, however often it repeats
                 (2) "s_short" / "s_long" are generated with create_noise
                     (3200 / 6400 samples) rather than loaded
                 (3) A diphone with no "<name>.wav" file is replaced by the file of
                     the diphone returned by self.sub_diphone
    """
    # Distinct diphones needed for this utterance
    wanted = set(self.diphone_seq)

    # Every file found under wav_folder: file name -> path.
    # If a name occurs more than once the last one visited wins.
    diphone_path = {
        name: folder + '/' + name
        for folder, _subdirs, names in os.walk(wav_folder, topdown=False)
        for name in names
    }

    # Extension B Punctuation: generated silences and their lengths in samples
    silence_samples = {"s_short": 3200, "s_long": 6400}

    diphones = {}
    for required_diphone in wanted:
        sound_obj = simpleaudio.Audio(rate=16000)

        if required_diphone in silence_samples:
            sound_obj.create_noise(silence_samples[required_diphone], 0)
            diphones[required_diphone] = sound_obj.data
            continue

        try:
            path = diphone_path[required_diphone + ".wav"]
            sound_obj.load(path)
            diphones[required_diphone] = sound_obj.data
        except KeyError:
            # Missing diphone: tell the user, then fall back to a substitute
            print("*** This is a missing diphone: ", required_diphone)
            sub_diphone = self.sub_diphone(required_diphone)
            print("*** Using subsitude diphone: ", sub_diphone)
            path = diphone_path[sub_diphone + ".wav"]
            sound_obj.load(path)
            diphones[required_diphone] = sound_obj.data

    return diphones
if play == True: object.play() # (PART V) Main module if __name__ == "__main__": # Step 1 - Create an Utterance instance to handle text normalization and annotatioin (incl. translation of number) of input text utt = Utterance(input_text=args.phrase[0]) # Step 2 - Get diphone sequence and feature information (emphrasis) after text normalization diph_emphasis = utt.diph_emphasis diphone_seq = utt.diphone_seq # Step 3 - Create a Synth instance to work on synthesing sound based on the given infomation diphone_synth = Synth(wav_folder=args.diphones, diphone_seq=diphone_seq, diph_emphasis=diph_emphasis) # Step 4 - Clone the data from the Synth instance 'diphone_synth' that contains concatenated audio data to an output Audio instance 'output' output = simpleaudio.Audio(rate=16000) output.data = diphone_synth.output.data # Step 5 - Further adjustment on overall volume to the final output (if the user use -v <0-100>) output = adjust_volume(volume=args.volume, object=output) # Step 6 - Save it to the target file (if the user use -o <args.outfile>) save(output_file=args.outfile, object=output) # Step 7 - Play the final sound output (if the user use -p) play_audio(play=args.play, object=output)
def get_wavs(self, wav_folder):
    ''' Build the full waveform for the phrase by concatenating diphone files,
    then play and/or save it according to the command-line arguments.

    wav_folder: folder that holds the diphone .wav files. Every file is
    loaded from this folder (previously some paths were hard-coded and one
    pointed at the literal folder name "args.diphones").

    Reads the module-level names args, norm_phrase, phrase_punc, diphone_seq
    and phone_seq. Returns nothing.'''
    sample_rate = 16000
    # 400 ms is the documented pause for sentence punctuation; it was
    # written as 4000 samples, which is only 250 ms at 16 kHz.
    long_silence = int(0.4 * sample_rate)
    # NOTE(review): the comma pause was 2000 samples (half the long pause);
    # the same ratio is kept, i.e. 200 ms - confirm.
    short_silence = int(0.2 * sample_rate)

    def load_samples(filename):
        # Load one file from wav_folder and return its int16 sample array
        clip = sa.Audio()
        clip.load(os.path.join(wav_folder, filename))
        return clip.data

    # list the entire collection of available diphone sounds from the diphone folder in self.diphones
    for root, dirs, files in os.walk(wav_folder, topdown=False):
        self.diphones.extend(files)

    # the diphone sounds for the phrase are collected here as numpy arrays
    diphone_sounds = []

    if args.spell:
        # spell mode: one group of diphones per letter of each word
        for word in norm_phrase.split():
            for letter in word:
                for diphone in diphone_seq[letter]:
                    diphone_sounds.append(load_samples(diphone))
    else:
        last_index = len(phrase_punc) - 1
        for i, token in enumerate(phrase_punc):
            # words: append their diphones in order
            if token in diphone_seq:
                for diphone in diphone_seq[token]:
                    diphone_sounds.append(load_samples(diphone))
                continue

            # punctuation: <phone>-pau, silence, pau-<phone>
            try:
                if i > 0:
                    # lead out of the previous word; skipped for leading
                    # punctuation (index -1 would wrap to the last token)
                    previous_phone = phone_seq[phrase_punc[i - 1]][-1]
                    diphone_sounds.append(
                        load_samples("{}-pau.wav".format(previous_phone)))
                if token == ',':
                    diphone_sounds.append(
                        np.zeros(short_silence, dtype=np.int16))
                if token in ('.', ':', '!', '?'):
                    diphone_sounds.append(
                        np.zeros(long_silence, dtype=np.int16))
                if i < last_index:
                    # lead into the next word, only if there is one
                    next_phone = phone_seq[phrase_punc[i + 1]][0]
                    diphone_sounds.append(
                        load_samples("pau-{}.wav".format(next_phone)))
            except KeyError:
                # the neighbouring token is not a word
                print("Ignoring consecutive punctuation:{}".format(token))

    # concatenate the diphone sounds; np.concatenate rejects an empty list
    if diphone_sounds:
        phrase_sound = np.concatenate(diphone_sounds)
    else:
        phrase_sound = np.zeros(0, dtype=np.int16)

    # create the instance of the phrase wave file
    x = sa.Audio(rate=sample_rate)
    x.data = phrase_sound

    if args.play:
        if args.volume:
            # scale the 0-100 volume entered by the user to the 0-1 range
            # passed to rescale
            volume = int(args.volume) * 0.01
            x.rescale(volume)
        x.play()
    if args.outfile:
        x.save(args.outfile)
def __init__(self, wav_folder):
    """Create empty state, then fill it via ``self.get_wavs(wav_folder)``.

    ``self.diphones`` starts as an empty dict and ``self.sound`` as a
    default ``simpleaudio.Audio`` before ``get_wavs`` is called.
    """
    self.diphones, self.sound = {}, simpleaudio.Audio()
    self.get_wavs(wav_folder)
dip_seq.append("PAU") # add a pause at end of the diphone sequence result = " ".join(dip_seq) return result if __name__ == "__main__": # Initialize utt class, get the diphone sequence and os path utt = Utterance(args.phrase[0]) diphone_seq = utt.get_phone_seq() diphone_synth = Synth(os.path.join(os.getcwd(), args.diphones)) diphone_seq = normalise_diphone_seq(diphone_seq) # out is the Audio object which will become your output # you need to modify out.data to produce the correct synthesis out = sa.Audio(rate=16000) print(diphone_seq) # insert silence for comma and .?! for token in diphone_seq: d = sa.Audio(rate=16000) if token in ',': # 200ms which is 0.2s for comma insert_silence(out, 0.20) elif token in '.?!': insert_silence(out, 0.40) else: # load the wav file d.load(path=diphone_synth.diphones[token]) # smooth the date using function smoother
def synthesize(self, diphonelist, crossfade=False):
    """
    Load the audio for every diphone key and join it into one waveform.

    For each key the digits 2 and 4 are removed before the lookup in
    self.diphones. If anything goes wrong while loading, a substitute key
    from self.emergency_diphone is loaded instead. A key ending in '2' or
    '4' sets self.silence_length to 0.2 or 0.4 and triggers
    self.add_silence().

    :param diphonelist: a list of diphone keys to be synthesized
    :param crossfade: NOTE(review): currently unused - the choice between
        self.crossfade() and self.naively_concatenate() is made from the
        module-level args.crossfade; confirm which is intended
    :return: self.new_object (the Audio object the diphones were loaded with)
    """
    self.diphonesound = simpleaudio.Audio(rate=16000)
    self.diphone_wavdata_list = []

    def load_and_store(lookup_key):
        # Build the file path, load it, and keep the sample array
        diphone_file = str(self.wav_folder + '/' + self.diphones[lookup_key])
        self.diphonesound.load(diphone_file)
        self.diphone_wavdata_list.append(self.diphonesound.data)

    for key in diphonelist:
        self.silence_length = 0

        try:
            # Silence markers are stripped from the key for the lookup
            load_and_store(re.sub('[24]', '', key))
        except Exception as e:
            strings = [
                'Diphone {} not present in dictionary.'.format(e),
                'Backing off...',
                'Searching for a diphone to fill in for {}'.format(e)
            ]
            printdots(strings)
            # Attempt an emergency key search
            load_and_store(self.emergency_diphone(key))

        # A trailing 2 / 4 asks for 200 ms / 400 ms of silence
        marker = key[-1]
        if marker == '2':
            self.silence_length = 0.2
        elif marker == '4':
            self.silence_length = 0.4

        if self.silence_length != 0:
            self.add_silence()

    # Reuse the loading object so its waveform settings carry over
    self.new_object = self.diphonesound

    # Join the audio data chunks into one waveform
    if args.crossfade:
        self.crossfade()
    else:
        self.naively_concatenate()

    return self.new_object
def main():
    """Synthesise args.phrase[0] character by character and output the result.

    Loads (or generates, for silences) the audio of every character,
    concatenates it either plainly or with a cross-fade depending on
    args.crossfade, then adjusts the volume, saves and plays it.
    Reads the module-level names args and path.
    """
    # Step 1 - Get the input utterance sequence
    inputseq = args.phrase[0]
    # Step 2 - Put the text in a Sequence instance
    inputseq = Sequence(inputseq)
    print("inputseq:", inputseq)
    print("inputseq.tokens:", inputseq.tokens)
    # hkcan_corpus = pc.hkcancor()
    # for each in inputseq.tokens:
    # wordinfo = hkcan_corpus.search(character=each)
    # pprint(len(wordinfo))
    # pprint(wordinfo[:3])
    # Attach an Audio object (eachphone) to every character
    for eachtoken in inputseq.tokens:
        for eachchar in eachtoken.chars:
            print("eachchar:", eachchar)
            eachchar.eachphone = simpleaudio.Audio()
            # Audio instance to handle audio information
            sound_obj = simpleaudio.Audio(rate=48000)
            if eachchar.phone[0] in ["sil_200", "sil_400"]:
                # 9600 / 19200 samples are 200 ms / 400 ms at 48000 Hz
                if eachchar.phone[0] == "sil_200":
                    sound_obj.create_noise(9600, 0)
                if eachchar.phone[0] == "sil_400":
                    sound_obj.create_noise(19200, 0)
                eachchar.eachphone.data = sound_obj.data
            else:
                phone = str(eachchar.phone[0])
                # A phone that does not end in a digit gets "5" appended
                # before the file name is built
                if not phone[-1].isdigit():
                    phone = phone + "5"
                eachchar.path = path + phone + ".wav"
                # print('eachchar.path:',eachchar.path)
                # print('path:',path)
                # print('phone:',phone)
                eachchar.eachphone.load(eachchar.path)
    output = simpleaudio.Audio()
    # Variable to track the index of the character being processed
    char_index = 0
    # Flat list of all characters; only its length is used below
    charlist = []
    for eachtoken in inputseq.tokens:
        for eachchar in eachtoken.chars:
            charlist.append(eachchar.char)
    for eachtoken in inputseq.tokens:
        for eachchar in eachtoken.chars:
            # 40 samples from create_noise, appended after each character in the plain branch
            empty_spacing = simpleaudio.Audio(rate=16000)
            empty_spacing.create_noise(40, 0)
            temp_diphone = simpleaudio.Audio(rate=16000)
            # Same array object as eachchar.eachphone.data, so the fades below modify it in place
            temp_diphone.data = eachchar.eachphone.data
            # Normal concatenation without smoother
            if args.crossfade == False:
                output.data = np.concatenate((output.data, temp_diphone.data))
                output.data = np.concatenate((output.data, empty_spacing.data))
            # If smoother is used, implement Extension E - Smoother Concatenation
            else:
                adjust_level = 0.0
                # This loop rescales the data points near both edges of the character's audio.
                # NOTE(review): the original comment calls 320 samples "10 ms", but 320 samples
                # is 20 ms at 16000 Hz and about 6.7 ms at 48000 Hz - confirm the intended length.
                for index in range(0, 321):
                    if char_index > 0:
                        # Except the first character:
                        # Scale the data points at the start of the current audio
                        # Order: start from the 1st point, 2nd, 3rd... (from the edge towards the middle)
                        temp_diphone.data[index] = temp_diphone.data[
                            index] * adjust_level / 320.0
                    if char_index < len(charlist) - 1:
                        # Except the last character:
                        # Scale the data points at the end of the current audio
                        # Order: start from the last point, 2nd last, 3rd last... (from the edge towards the middle)
                        temp_diphone.data[-(index + 1)] = temp_diphone.data[-(
                            index + 1)] * adjust_level / 320.0
                    # Turn louder when moving inward in the next round of the loop
                    adjust_level += 1
                # After rescaling, separate the audio into two portions: (1) the first 320 samples, and (2) everything after
                # NOTE(review): these are stored as attributes on the numpy module itself, not as local variables
                np.initial10msc = temp_diphone.data[:320]
                np.after10msc = temp_diphone.data[320:]
                # Combine the portions in output.data
                if char_index == 0:
                    # For the 1st character, concatenate the whole processed audio
                    output.data = np.concatenate(
                        (output.data, temp_diphone.data))
                else:
                    # For later characters, add (cross-fade) the first 320 samples of the current audio onto the last 320 samples already in output.data
                    output.data[-320:] = output.data[-320:] + np.initial10msc
                    # Concatenate the remaining part of the processed audio
                    output.data = np.concatenate((output.data, np.after10msc))
            # Increase the monitoring index
            char_index += 1
    # Step 5 - Further adjustment of the overall volume of the final output (if the user uses -v <0-100>)
    output = adjust_volume(volume=args.volume, object=output)  # TODO: pass the volume value directly
    # Step 6 - Save it to the target file (if the user uses -o <args.outfile>)
    save(output_file=args.outfile, object=output)  # TODO: pass the output_file value directly
    save_pickle(output_file=args.outfile, object=output)  # TODO: pass the output_file value directly
    # Step 7 - Play the final sound output (if the user uses -p)
    play_audio(play=args.play, object=output)
def __init__(self, wav_folder):
    """Initialise the Synth and let ``self.get_wavs`` process ``wav_folder``.

    ``self.diphones`` starts as an empty dictionary and ``self.out`` as a
    default ``simpleaudio.Audio`` before ``get_wavs`` is called.
    """
    self.diphones, self.out = {}, simpleaudio.Audio()
    self.get_wavs(wav_folder)
def __init__(self, wav_folder):
    """Set defaults, run ``self.get_wavs(wav_folder)``, then create the output.

    ``self.phones`` starts empty and ``self.vol`` at 0.2. The result
    ``Audio`` (``self.res_voice``) is created only after ``get_wavs``
    returns.
    """
    self.phones, self.vol = {}, 0.2
    self.get_wavs(wav_folder)
    self.res_voice = simpleaudio.Audio()
# NOTE(review): this script section starts outside the visible source;
# words_reverse and phones_reverse are presumably initialised just above.
signal_reverse = False
# Exactly one reverse mode is switched on, chosen by args.reverse
if args.reverse == 'words':
    words_reverse = True
elif args.reverse == 'phones':
    phones_reverse = True
elif args.reverse == 'signal':
    signal_reverse = True
# Spell mode uses a different diphone-sequence builder
if args.spell:
    phone_seq = utt.get_spell_diphone_seq(words_r=words_reverse,
                                          phones_r=phones_reverse)
else:
    phone_seq = utt.get_diphone_seq(words_r=words_reverse,
                                    phones_r=phones_reverse)
diphone_synth = Synth(wav_folder=args.diphones)
# Fall back to a default file name when no output file was given
output_filename = args.outfile if args.outfile else 'out_file.wav'
diphone_synth.get_diphone_seq_concatenation(phone_seq,
                                            output_filename,
                                            signal_r=signal_reverse,
                                            emphasis_i=utt.emphasis_markup(),
                                            crossfade=args.crossfade)
# The result is loaded back from output_filename for volume adjustment and playback
out = simpleaudio.Audio()
out.load(output_filename)
# NOTE(review): a volume of 0 is falsy, so it skips this branch and is not applied - confirm
if args.volume:
    if args.volume < 0 or args.volume > 100:
        raise ValueError("Expected volume value between 0 and 100.")
    out.rescale(args.volume / 100)
if args.play:
    out.play()