def synthesize(self, text):
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq = text_to_sequence(text, cleaner_names)
    feed_dict = {
        self.model.inputs: [np.asarray(seq, dtype=np.int32)],
        self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
    }
    wav = self.session.run(self.wav_output, feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav)
    wav = wav[:audio.find_endpoint(wav)]
    out = io.BytesIO()
    audio.save_wav(wav, out)
    return out.getvalue()
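A minimal usage sketch for the method above. The `Synthesizer` class, its `load()` method, and the checkpoint path are hypothetical stand-ins for whatever builds `self.session` and `self.wav_output`; that code is not shown here.

# Hypothetical driver code, not part of the snippet above.
synth = Synthesizer()
synth.load('logs-tacotron/model.ckpt')  # assumed to create self.session / self.wav_output
wav_bytes = synth.synthesize('Hello world.')
# synthesize() returns a complete WAV file as bytes, so it can be written as-is.
with open('output.wav', 'wb') as f:
    f.write(wav_bytes)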
def _get_next_example(self):
    '''Loads a single example (input, mel_target, linear_target, cost) from disk'''
    if self._offset >= len(self._metadata):
        self._offset = 0
        random.shuffle(self._metadata)
    meta = self._metadata[self._offset]
    self._offset += 1
    text = meta[3]
    if self._cmudict and random.random() < _p_cmudict:
        text = ' '.join([self._maybe_get_arpabet(word) for word in text.split(' ')])
    input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)
    linear_target = np.load(os.path.join(self._datadir, meta[0]))
    mel_target = np.load(os.path.join(self._datadir, meta[1]))
    return (input_data, mel_target, linear_target, len(linear_target))
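The loader above calls self._maybe_get_arpabet to randomly replace words by their ARPAbet pronunciations. A rough sketch of what such a helper usually looks like (roughly the keithito Tacotron version; the 0.5 probability and the lookup API are assumptions, not taken from this file):

def _maybe_get_arpabet(self, word):
    # Look the word up in CMUdict; fall back to the original spelling if it is missing.
    arpabet = self._cmudict.lookup(word)
    # Curly braces tell text_to_sequence to treat the token as ARPAbet.
    return '{%s}' % arpabet[0] if arpabet is not None and random.random() < 0.5 else word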
def get_text(self, text, lang_code): text_norm = torch.IntTensor( text_to_sequence(text, self.text_cleaners, lang_code, self.cmudict)) return text_norm
def get_text(text): return torch.IntTensor(text_to_sequence(text, hps.text_cleaners))
def transform_text(text, text_cleaners): return text_to_sequence(text, text_cleaners)
def generate(self): for message in self.messages: if message.voice in self.models_22khz: self.hparams.sampling_rate = self.default_sampling_rate waveglow_path = "" if message.voice == "vader:" or message.voice == "duke:": waveglow_path = self.models_path + \ self.waveglow_22khz["vader:"] elif message.voice == "keanu:" or message.voice == "hal:": waveglow_path = self.models_path + \ self.waveglow_22khz["david:"] elif message.voice == "johnny:": waveglow_path = self.models_path + \ self.waveglow_22khz["johnny:"] else: waveglow_path = self.models_path + \ self.waveglow_22khz["default"] waveglow = torch.load(waveglow_path)["model"] waveglow.cuda().eval().half() for k in waveglow.convinv: k.float() denoiser = Denoiser(waveglow) if len(message.text) > 127: self.hparams.max_decoder_steps = 100000 else: self.hparams.max_decoder_steps = 10000 trimmed_message_length = len("".join(c for c in message.text if c.isalnum())) if trimmed_message_length < 4: if message.voice == "vader:" or message.voice == "carlson:": self.hparams.max_decoder_steps = 1000 self.hparams.gate_threshold = 0.001 if any(char.isdigit() for char in message.text): self.hparams.max_decoder_steps = 10000 self.hparams.gate_threshold = 0.5 if trimmed_message_length >= 4 and trimmed_message_length < 7: self.hparams.gate_threshold = 0.01 if message.voice == "vader:" or message.voice == "carlson:": self.hparams.gate_threshold = 0.01 if any(char.isdigit() for char in message.text): self.hparams.gate_threshold = 0.5 else: self.hparams.gate_threshold = 0.01 if any(char.isdigit() for char in message.text): self.hparams.gate_threshold = 0.1 elif trimmed_message_length >= 7 and trimmed_message_length < 15: self.hparams.gate_threshold = 0.1 if message.voice == "vader:" or message.voice == "carlson:": self.hparams.gate_threshold = 0.01 if any(char.isdigit() for char in message.text): self.hparams.gate_threshold = 0.5 else: self.hparams.gate_threshold = 0.1 if any(char.isdigit() for char in message.text): self.hparams.gate_threshold = 0.2 else: self.hparams.gate_threshold = 0.5 message_extended = False if trimmed_message_length < 11: if message.voice == "vader:": message.text = "{} -. -------. -------.".format( message.text) else: message.text = "{} -------.
-------.".format( message.text) message_extended = True model = load_model(self.hparams) model.load_state_dict( torch.load(self.models_path + self.models_22khz[message.voice])["state_dict"]) _ = model.cuda().eval().half() sequence = np.array( text_to_sequence(message.text, ["english_cleaners"]))[None, :] sequence = torch.autograd.Variable( torch.from_numpy(sequence)).cuda().long() mel_outputs_postnet, requires_cutting = model.inference( sequence) with torch.no_grad(): audio = waveglow.infer(mel_outputs_postnet, sigma=1) # audio_denoised = denoiser(audio, strength=0.001)[:, 0] # if np.isnan(audio_denoised.cpu().numpy()[0][0]): # audio_data = audio.cpu().numpy()[0] # else: # audio_data = audio_denoised.cpu().numpy()[0] audio_data = audio.cpu().numpy()[0] scaled_audio = np.int16(audio_data / np.max(np.abs(audio_data)) * self.audio_length_parameter) if message_extended or requires_cutting: cut_index = 0 silence_length = 0 for i, val in enumerate(scaled_audio): if val == 0: silence_length += 1 if silence_length > 500: cut_index = i break scaled_audio = scaled_audio[:cut_index] if message.voice == "vader:": _, effect = read("extras/breathing.wav") scaled_audio = np.concatenate((effect, scaled_audio)) scaled_audio = np.concatenate((scaled_audio, self.silence)) self.joined_audio = np.concatenate( (self.joined_audio, scaled_audio)) if requires_cutting: torch.cuda.empty_cache() else: engine = pyttsx3.init() if self.current_os == "Windows": engine.setProperty( "voice", self.synth_voices_windows[message.voice]) else: engine.setProperty("voice", self.synth_voices_linux[message.voice]) engine.setProperty("rate", 120) engine.save_to_file(message.text, self.temp_file) engine.runAndWait() while not os.path.isfile(self.temp_file): time.sleep(1.5) if os.path.isfile(self.temp_file): del engine file = read( os.path.join(os.path.abspath("."), self.temp_file)) audio = np.array(file[1], dtype=np.int16) audio = np.concatenate((audio, self.silence)) self.joined_audio = np.concatenate( (self.joined_audio, audio)) os.remove(self.temp_file) scaled_audio = np.int16(self.joined_audio / np.max(np.abs(self.joined_audio)) * self.audio_length_parameter) if scaled_audio[0] == self.audio_length_parameter: scaled_audio = scaled_audio[1:] return scaled_audio, self.hparams.sampling_rate
def synthesize(self,
               texts=None,
               tokens=None,
               base_path=None,
               paths=None,
               speaker_ids=None,
               start_of_sentence=None,
               end_of_sentence=True,
               pre_word_num=0,
               post_word_num=0,
               pre_surplus_idx=0,
               post_surplus_idx=1,
               use_short_concat=False,
               manual_attention_mode=0,
               base_alignment_path=None,
               librosa_trim=False,
               attention_trim=True,
               isKorean=True):
    # When manual_attention_mode is on, two outputs are produced: one without
    # manual attention applied and one with it applied.
    # Possible inputs:
    # 1) text=text
    # 2) text=texts
    # 3) tokens=tokens, texts=texts  # use texts as guide

    if type(texts) == str:
        texts = [texts]

    if texts is not None and tokens is None:
        sequences = np.array([text_to_sequence(text) for text in texts])
        sequences = _prepare_inputs(sequences)
    elif tokens is not None:
        sequences = tokens

    #sequences = np.pad(sequences, [(0, 0), (0, 5)], 'constant', constant_values=(0))  # case by case ---> overfitting?

    if paths is None:
        paths = [None] * len(sequences)
    if texts is None:
        texts = [None] * len(sequences)

    time_str = get_time()

    def plot_and_save_parallel(wavs, alignments, use_manual_attention, mels):
        items = list(enumerate(zip(wavs, alignments, paths, texts, sequences, mels)))
        fn = partial(plot_graph_and_save_audio,
                     base_path=base_path,
                     start_of_sentence=start_of_sentence,
                     end_of_sentence=end_of_sentence,
                     pre_word_num=pre_word_num,
                     post_word_num=post_word_num,
                     pre_surplus_idx=pre_surplus_idx,
                     post_surplus_idx=post_surplus_idx,
                     use_short_concat=use_short_concat,
                     use_manual_attention=use_manual_attention,
                     librosa_trim=librosa_trim,
                     attention_trim=attention_trim,
                     time_str=time_str,
                     isKorean=isKorean)
        return parallel_run(fn, items, desc="plot_graph_and_save_audio", parallel=False)

    #input_lengths = np.argmax(np.array(sequences) == 1, 1) + 1
    input_lengths = [np.argmax(a == 1) + 1 for a in sequences]

    fetches = [
        #self.wav_output,
        self.model.linear_outputs,
        self.model.alignments,  # [batch_size, text length (encoder), target length (decoder)]
        self.model.mel_outputs,
    ]

    feed_dict = {
        self.model.inputs: sequences,
        self.model.input_lengths: input_lengths,
    }
    if base_alignment_path is None:
        feed_dict.update({
            self.model.manual_alignments: np.zeros([1, 1, 1]),
            self.model.is_manual_attention: False,
        })
    else:
        manual_alignments = []
        #alignment_path = os.path.join(base_alignment_path, os.path.basename(base_path))
        alignment_path = os.path.join(os.path.basename(base_path), base_alignment_path)
        for idx in range(len(sequences)):
            numpy_path = "{}{}.npy".format(alignment_path, idx)
            manual_alignments.append(np.load(numpy_path))
        alignments_T = np.transpose(manual_alignments, [0, 2, 1])
        feed_dict.update({
            self.model.manual_alignments: alignments_T,
            self.model.is_manual_attention: True
        })

    if speaker_ids is not None:
        if type(speaker_ids) == dict:
            speaker_embed_table = self.sess.run(self.model.speaker_embed_table)
            speaker_embed = [
                speaker_ids[speaker_id] * speaker_embed_table[speaker_id]
                for speaker_id in speaker_ids
            ]
            feed_dict.update({self.model.speaker_embed_table: np.tile()})
        else:
            feed_dict[self.model.speaker_id] = speaker_ids

    wavs, alignments, mels = self.sess.run(fetches, feed_dict=feed_dict)
    # use_manual_attention=True/False only toggles whether 'manual' appears in the output filename.
    results = plot_and_save_parallel(wavs, alignments, use_manual_attention=False, mels=mels)

    if manual_attention_mode > 0:
        # argmax one hot
        if manual_attention_mode == 1:
            # [batch_size, Encoder length, Decoder length] ==> [N, D, E], e.g. (1, 50, 200) --> (1, 200, 50)
            alignments_T = np.transpose(alignments, [0, 2, 1])
            new_alignments = np.zeros_like(alignments_T)  # attention in the model is laid out as (N, D, E)
            for idx in range(len(alignments)):  # loop over the batch
                # Where in the audio did each text token have the most influence, i.e. where is it pronounced?
                argmax = alignments[idx].argmax(1)
                # Set only the argmax positions to 1; everything else stays 0.
                new_alignments[idx][(argmax, range(len(argmax)))] = 1
        # sharpening
        elif manual_attention_mode == 2:
            new_alignments = np.transpose(alignments, [0, 2, 1])  # [N, E, D] ==> [N, D, E]
            for idx in range(len(alignments)):  # loop over the batch
                # The variance and mean are computed here but never used afterwards.
                var = np.var(new_alignments[idx], 1)  # attention variance per decoder time step
                mean_var = var[:input_lengths[idx]].mean()
                new_alignments[idx] = np.power(new_alignments[idx], 2)
        # pruning
        elif manual_attention_mode == 3:
            new_alignments = np.transpose(alignments, [0, 2, 1])  # [N, E, D]
            for idx in range(len(alignments)):
                argmax = alignments[idx].argmax(1)
                # Set the argmax positions to 1; everything else is kept as-is.
                new_alignments[idx][(argmax, range(len(argmax)))] = 1

        feed_dict.update({
            self.model.manual_alignments: new_alignments,
            self.model.is_manual_attention: True,
        })
        new_wavs, new_alignments = self.sess.run(fetches, feed_dict=feed_dict)
        results = plot_and_save_parallel(new_wavs, new_alignments, True)

    return results
# print(mel_spec.shape)
# plt.imsave("note/test.jpg", mel_spec.numpy(), cmap='hot')
checkpoint = './news_output_22k/checkpoint_50000'
model = load_model(hp)
model.load_state_dict(torch.load(checkpoint)['state_dict'])
_ = model.cuda().eval().half()
waveglow = torch.load(waveglow_path, map_location="cpu")['model']
waveglow = waveglow.remove_weightnorm(waveglow)
waveglow.cuda().eval()
denoiser = Denoiser(waveglow).cuda()
# Custom input text (roughly: "Good morning, teachers. These are the preliminary results I have
# obtained so far; the generated lips and voice stay roughly in sync. Thank you for your guidance.")
text = "各位老师,大家早上好,这是我目前取得的初步结果,生成的嘴唇和声音,可以保持一定的唇音同步,感谢各位老师的指导。"
text, _ = get_pyin(text)
sequence = np.array(text_to_sequence(text, ['basic_cleaners']))[None, :]
sequence = torch.autograd.Variable(torch.from_numpy(sequence).cuda().long())
mel_output, mel_output_posnet, _, alignment = model.inference(sequence)
mel_output = mel_output.float().data.cpu()[0]
mel_output_posnet = mel_output_posnet.float().data.cpu()[0]
mel = mel_output_posnet.unsqueeze(0)
#mel = mel_spec.unsqueeze(0)
denoiser_strength = 0.1
output_dir = "note/"
sampling_rate = 22050
sigma = 0.66
i = 1
with torch.no_grad():
    audio = waveglow.infer(mel.cuda(), sigma=sigma)
    audio = denoiser(audio, 0.1)
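A possible continuation of the snippet above that writes the generated waveform to disk. The file name is made up; output_dir, sampling_rate, and i are the variables already defined above, and the squeeze() is there only because the exact shape returned by the denoiser is not pinned down here.

import os
import numpy as np
from scipy.io.wavfile import write

audio_np = audio.squeeze().cpu().numpy().astype(np.float32)
# Scale the float waveform to 16-bit PCM before writing.
audio_int16 = (audio_np / max(1e-8, np.abs(audio_np).max()) * 32767).astype(np.int16)
write(os.path.join(output_dir, "synth_{}.wav".format(i)), sampling_rate, audio_int16)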
# where the clip file will be written: save_path = 'audio_test.wav' # where the pre-trained model is located: # Inputs for the synthesis: test_text = "the recommended book for natural language interaction is neural network methods from goldberg" #GST scores gst_head_scores = np.array([0.4, 0.2, 0.4]) gst_scores = torch.from_numpy(gst_head_scores).cuda().float() print('Input sequence and GST weights loaded...') # TEXT2MEL: torch.manual_seed(1234) from text import text_to_sequence #preprocessing: sequence = np.array(text_to_sequence(test_text, ['english_cleaners']))[None, :] sequence = torch.from_numpy(sequence).to(device='cuda', dtype=torch.int64) print("Input text sequence pre-processed successfully...") #text to mel inference: t1 = time.time() with torch.no_grad(): mel_outputs, mel_outputs_postnet, _, alignments = model.inference( sequence, gst_scores) # MEL2WAV : from audio_processing import griffin_lim from nn_layers import TacotronSTFT torch.manual_seed(1234) # Griffin Lim vocoder synthesis: # griffin_iters = 60
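A sketch of how the Griffin-Lim branch might continue, following the pattern used in the NVIDIA Tacotron 2 reference code. It assumes that TacotronSTFT from nn_layers exposes the same spectral_de_normalize, mel_basis, and stft_fn members as NVIDIA's layers.TacotronSTFT, and the STFT parameters and the 1000x mel-to-linear scaling are illustrative values, not read from this repo.

griffin_iters = 60
taco_stft = TacotronSTFT(filter_length=1024, hop_length=256, win_length=1024,
                         sampling_rate=22050)
# Undo the log compression, project the mel spectrogram back to a linear-frequency
# magnitude spectrogram, then run Griffin-Lim to estimate the phase.
mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet.float())
mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis) * 1000
spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
audio = griffin_lim(spec_from_mel[:, :, :-1], taco_stft.stft_fn, griffin_iters)
# audio is a [1, T] waveform tensor at sampling_rate, ready to be saved with save_path above.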
def collect_features(self, text): return np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)
def infer(text, model): sequence = text_to_sequence(text, hps.text_cleaners) sequence = mode(torch.IntTensor(sequence)[None, :]).long() mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence) return (mel_outputs, mel_outputs_postnet, alignments)
def get_text(self, text): if self.add_space: text = " " + text.strip() + " " text_norm = torch.IntTensor( text_to_sequence(text, self.text_cleaners, getattr(self, "cmudict", None))) return text_norm
# "models/mellotron_libritts.pt" mellotron = load_model(hparams).cuda().eval() mellotron.load_state_dict(torch.load(checkpoint_path)['state_dict']) waveglow_path = 'models/waveglow_256channels_v4.pt' waveglow = torch.load(waveglow_path)['model'].cuda().eval() denoiser = Denoiser(waveglow).cuda().eval() arpabet_dict = cmudict.CMUDict('data/cmu_dictionary') audio_paths = 'data/examples_filelist_korean.txt' dataloader = TextMelLoader(audio_paths, hparams) datacollate = TextMelCollate(1) file_idx = 0 audio_path, text, sid, lang_code = dataloader.audiopaths_and_text[file_idx] # get audio path, encoded text, pitch contour and mel for gst text_encoded = torch.LongTensor( text_to_sequence(text, hparams.text_cleaners, int(lang_code), arpabet_dict))[None, :].cuda() pitch_contour = dataloader[file_idx][3][None].cuda() mel = load_mel(audio_path) print(audio_path, text) # load source data to obtain rhythm using tacotron 2 as a forced aligner x, y = mellotron.parse_batch(datacollate([dataloader[file_idx]])) ipd.Audio(audio_path, rate=hparams.sampling_rate) speaker_ids = TextMelLoader( "filelists/libritts_train_clean_100_audiopath_text_sid_atleast5min_val_filelist.txt", hparams).speaker_ids speakers = pd.read_csv('filelists/libritts_speakerinfo.txt', engine='python', header=None, comment=';', sep=' *\| *',
gst_scores = torch.from_numpy(gst_head_scores[j]) gst_scores = torch.autograd.Variable(gst_scores).cuda().float() gst_name = gst_head_names[j] # is a string for i in range(3): test_short = test_text_short[i] test_medium = test_text_medium[i] test_large = test_text_large[i] tests_aux = (test_short, test_medium, test_large) for k in range(3): sequence = np.array( text_to_sequence(tests_aux[k], ['english_cleaners']))[None, :] sequence = torch.autograd.Variable( torch.from_numpy(sequence)).cuda().long() # text2mel: mel_outputs, mel_outputs_postnet, _, alignments = model.inference( sequence, gst_scores) # save the predicted outputs from tacotron2: mel_outputs_path = predicted_melspec_folder + "output.pt" mel_outputs_postnet_path = predicted_melspec_folder + "output_postnet.pt" alignments_path = predicted_melspec_folder + "alignment.pt" torch.save(mel_outputs, mel_outputs_path) torch.save(mel_outputs_postnet, mel_outputs_postnet_path) torch.save(alignments, alignments_path) print("text2mel prediction successfully performed...")
model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
_ = model.cuda().eval().half()

"""**`Step 7: Load WaveGlow for mel2audio synthesis and denoiser`**"""

waveglow_path = '/content/drive/MyDrive/SSMT/waveglow_256channels_universal_v5.pt'
waveglow = torch.load(waveglow_path)['model']
waveglow.cuda().eval().half()
for k in waveglow.convinv:
    k.float()
denoiser = Denoiser(waveglow)

"""**Step 8: Prepare text input**"""

text = "मैं बाज़ार जाता हूँ "
sequence = np.array(text_to_sequence(text, ['transliteration_cleaners']))[None, :]
sequence = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long()

"""**Step 9: Decode text input and plot results**"""

mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
plot_data((mel_outputs_postnet.float().data.cpu().numpy()[0],
           alignments.float().data.cpu().numpy()[0].T))

"""***A short summary about mel spectrograms:*** Sound is perceived as a variation of air pressure over time, but the raw pressure waveform alone is an inconvenient representation for training a deep learning model. In short, a mel spectrogram is a plot of three quantities: time on the x-axis, frequency on the y-axis, and color indicating the loudness of each time-frequency bin. The alignment plot shown above is a simple representation of how the decoder output tracks the original text input over time
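To make the summary above concrete, here is a small self-contained sketch that computes and plots a mel spectrogram for a synthetic tone with librosa; librosa and matplotlib are assumed to be available, and none of this is part of the notebook above.

import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt

sr = 22050
t = np.linspace(0, 2.0, int(sr * 2.0), endpoint=False)
y = 0.5 * np.sin(2 * np.pi * 440.0 * t)              # a 2-second 440 Hz tone

mel = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=1024, hop_length=256, n_mels=80)
mel_db = librosa.power_to_db(mel, ref=np.max)         # loudness in dB, shown as color

librosa.display.specshow(mel_db, sr=sr, hop_length=256, x_axis='time', y_axis='mel')
plt.colorbar(format='%+2.0f dB')
plt.title('Mel spectrogram of a 440 Hz tone')
plt.savefig('mel_demo.png')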
def get_text(self, text):
    sequence = text_to_sequence(text, self.text_cleaners)
    text_norm = torch.IntTensor(sequence)
    ctc_text_norm = torch.IntTensor(sequence_to_ctc_sequence(sequence))
    return text_norm, ctc_text_norm
def _process_utterance(out_dir, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair.

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
        - out_dir: the directory to write the preprocessed data into
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps,
          mel_frames, text, npz_filename)
    """
    try:
        # Load the audio as a numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    # Rescale wav
    if hparams.rescaling:  # hparams.rescale = True
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # M-AILABS extra silence specific
    if hparams.trim_silence:  # hparams.trim_silence = True
        wav = audio.trim_silence(wav, hparams)  # Trim leading and trailing silence

    # Mu-law quantize; the default input_type is 'raw'
    if hparams.input_type == 'mulaw-quantize':
        # [0, quantize_channels)
        out = audio.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif hparams.input_type == 'mulaw':
        # [-1, 1]
        out = audio.mulaw(wav, hparams.quantize_channels)
        constant_values = audio.mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32
    else:  # raw
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        # hparams.max_mel_frames = 1000, hparams.clip_mels_length = True
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # Sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:  # hparams.use_lws = False
        # Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.fft_size if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        # Zero pad audio signal
        out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    else:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        pad = audio.librosa_pad_lr(wav, hparams.fft_size, audio.get_hop_size(hparams))

        # Reflect pad audio signal (just like it's done in librosa to avoid frame inconsistency)
        out = np.pad(out, pad, mode='reflect')

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # Time resolution adjustment:
    # ensure the length of the raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    wav_id = os.path.splitext(os.path.basename(wav_path))[0]

    # Write the spectrograms to disk:
    audio_filename = '{}-audio.npy'.format(wav_id)
    mel_filename = '{}-mel.npy'.format(wav_id)
    linear_filename = '{}-linear.npy'.format(wav_id)
    npz_filename = '{}.npz'.format(wav_id)
    npz_flag = True
    if npz_flag:
        # Use the same keys as the Tacotron code, to stay compatible with it.
        data = {
            'audio': out.astype(out_dtype),
            'mel': mel_spectrogram.T,
            'linear': linear_spectrogram.T,
            'time_steps': time_steps,
            'mel_frames': mel_frames,
            'text': text,
            'tokens': text_to_sequence(text),  # a trailing "1", the eos symbol (~), is appended
            'loss_coeff': 1  # for Tacotron
        }
        np.savez(os.path.join(out_dir, npz_filename), **data, allow_pickle=False)
    else:
        np.save(os.path.join(out_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
        np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
        np.save(os.path.join(out_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text, npz_filename)
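A small sketch of how one of the .npz files written above could be inspected after preprocessing; the file name is hypothetical.

import numpy as np

example = np.load('data/LJ001-0001.npz')
print(example['mel'].shape)       # (mel_frames, num_mels): mel spectrogram, time-major
print(example['audio'].shape)     # (time_steps,): raw or mu-law audio aligned to the mel frames
print(example['text'], example['tokens'][-1])   # original text; the last token is the eos id 1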
def execute_this_fn(self, TOKEN, min_donation, channel, se_opts, use_cuda, model, waveglow, offset, prev_time, startup_time, progress_callback, elapsed_callback, text_ready, fn_callback): # TODO: refactor this messy block fn_callback.emit(('GUI: start of polling loop', None)) text_ready.emit("Sta2:Connecting to StreamElements") url = "https://api.streamelements.com/kappa/v2/tips/" + self.channel_id headers = { 'accept': 'application/json', "Authorization": "Bearer " + TOKEN } text_ready.emit('Log2:Initializing') text_ready.emit('Log2:Minimum amount for TTS: ' + str(min_donation)) while True: _mutex2.lock() if _running2 == False: _mutex2.unlock() break else: _mutex2.unlock() if not channel.get_busy(): #print('Polling', datetime.datetime.utcnow().isoformat()) text_ready.emit("Sta2:Waiting for incoming donations . . .") current_time = datetime.datetime.utcnow().isoformat() # TODO: possible bug: missed donations once time pasts midnight querystring = { "offset": offset, "limit": "1", "sort": "createdAt", "after": startup_time, "before": current_time } response = requests.request("GET", url, headers=headers, params=querystring) data = json.loads(response.text) for dono in data['docs']: text_ready.emit("Sta2:Processing donations") dono_time = dono['createdAt'] offset += 1 if dono_time > prev_time: # Str comparison amount = dono['donation']['amount'] # Int if float(amount) >= min_donation and dono[ 'approved'] == 'allowed': name = dono['donation']['user']['username'] msg = dono['donation']['message'] if msg.isspace(): break # Check for empty line ## TODO Allow multiple speaker in msg currency = dono['donation']['currency'] dono_id = dono['_id'] text_ready.emit( "Log2:\n###########################") text_ready.emit("Log2:" + name + ' donated ' + currency + str(amount)) text_ready.emit("Log2:" + msg) lines = preprocess_text(msg) if se_opts[ 'read dono amount'] == 1: # reads dono name and amount msg = '{} donated {} {}.'.format( name, str(amount), cleaners.expand_currency(currency)) lines.insert(0, msg) # Add to head to list output = [] for count, line in enumerate(lines): fn_callback.emit( ('GUI: progress bar 2 text', (count, len(lines)))) sequence = np.array( text_to_sequence( line, ['english_cleaners']))[None, :] # Inference device = torch.device( 'cuda' if use_cuda else 'cpu') sequence = torch.autograd.Variable( torch.from_numpy(sequence)).to( device).long() # Decode text input mel_outputs, mel_outputs_postnet, _, alignments = model.inference( sequence) with torch.no_grad(): audio = waveglow.infer( mel_outputs_postnet, sigma=0.666, progress_callback=progress_callback, elapsed_callback=None, get_interruptflag=self. get_interruptflag2) if type(audio) != torch.Tensor: # Catches when waveglow is interrupted and returns none break fn_callback.emit( ('GUI: progress bar 2 text', (count + 1, len(lines)))) wav = audio[0].data.cpu().numpy() output.append(wav) _mutex3.lock() if _running3 == True: _mutex3.unlock() outwav = np.concatenate(output) # Playback fn_callback.emit(('Wav: playback', outwav)) else: _mutex3.unlock() prev_time = dono_time # Increment time time.sleep(0.5) fn_callback.emit(('GUI: end of polling loop', None)) text_ready.emit('Log2:\nDisconnected') text_ready.emit('Sta2:Ready') fn_callback.emit(('Var: offset', offset)) fn_callback.emit(('Var: prev_time', prev_time)) return #'Return value of execute_this_fn'
def synthesize(self, texts=None, tokens=None, base_path=None, paths=None, speaker_ids=None, start_of_sentence=None, end_of_sentence=True, pre_word_num=0, post_word_num=0, pre_surplus_idx=0, post_surplus_idx=1, use_short_concat=False, base_alignment_path=None, librosa_trim=False, attention_trim=True, isKorean=True): # Possible inputs: # 1) text=text # 2) text=texts # 3) tokens=tokens, texts=texts # use texts as guide if type(texts) == str: texts = [texts] if texts is not None and tokens is None: sequences = np.array([text_to_sequence(text) for text in texts]) sequences = _prepare_inputs(sequences) elif tokens is not None: sequences = tokens #sequences = np.pad(sequences,[(0,0),(0,5)],'constant',constant_values=(0)) # case by case ---> overfitting? if paths is None: paths = [None] * len(sequences) if texts is None: texts = [None] * len(sequences) time_str = get_time() def plot_and_save_parallel(wavs, alignments, mels): items = list( enumerate(zip(wavs, alignments, paths, texts, sequences, mels))) fn = partial(plot_graph_and_save_audio, base_path=base_path, start_of_sentence=start_of_sentence, end_of_sentence=end_of_sentence, pre_word_num=pre_word_num, post_word_num=post_word_num, pre_surplus_idx=pre_surplus_idx, post_surplus_idx=post_surplus_idx, use_short_concat=use_short_concat, librosa_trim=librosa_trim, attention_trim=attention_trim, time_str=time_str, isKorean=isKorean) return parallel_run(fn, items, desc="plot_graph_and_save_audio", parallel=False) #input_lengths = np.argmax(np.array(sequences) == 1, 1)+1 input_lengths = [np.argmax(a == 1) + 1 for a in sequences] fetches = [ #self.wav_output, self.model.linear_outputs, self.model. alignments, # # batch_size, text length(encoder), target length(decoder) self.model.mel_outputs, ] feed_dict = { self.model.inputs: sequences, self.model.input_lengths: input_lengths, } if speaker_ids is not None: if type(speaker_ids) == dict: speaker_embed_table = sess.run(self.model.speaker_embed_table) speaker_embed = [ speaker_ids[speaker_id] * speaker_embed_table[speaker_id] for speaker_id in speaker_ids ] feed_dict.update({self.model.speaker_embed_table: np.tile()}) else: feed_dict[self.model.speaker_id] = speaker_ids wavs, alignments, mels = self.sess.run(fetches, feed_dict=feed_dict) results = plot_and_save_parallel(wavs, alignments, mels=mels) return results
denoiser = Denoiser(waveglow).cuda().eval() # ## Setup dataloaders arpabet_dict = cmudict.CMUDict('data/cmu_dictionary') audio_paths = 'data/examples_filelist.txt' dataloader = TextMelLoader(audio_paths, hparams) datacollate = TextMelCollate(1) # ## Load data file_idx = 0 audio_path, text, sid = dataloader.audiopaths_and_text[file_idx] # get audio path, encoded text, pitch contour and mel for gst text_encoded = torch.LongTensor( text_to_sequence(text, hparams.text_cleaners, arpabet_dict, 0.0))[None, :].cuda() pitch_contour = dataloader[file_idx][3][None].cuda() mel = load_mel(audio_path) # print(audio_path, text) # load source data to obtain rhythm using tacotron 2 as a forced aligner x, y = mellotron.parse_batch(datacollate([dataloader[file_idx]])) ipd.Audio(audio_path, rate=hparams.sampling_rate) ## Define Speakers Set speaker_ids = TextMelLoader( "filelists/libritts_train_clean_100_audiopath_text_sid_atleast5min_val_filelist.txt", hparams).speaker_ids speakers = pd.read_csv('filelists/aidatatang_speakerinfo.txt',
import torch
from torch import nn
import numpy as np

from text import text_to_sequence

a = '안녕하세요'
b = '요안'
print(text_to_sequence(a))
print(text_to_sequence(b))
if hparams.bert: bert, tokenizer = load_bert(args.bert_folder) # Extract phonemic features with open(args.text, 'r') as f: texts = [] for line in f.readlines(): name, sen = line.strip().split(' ') if sen[-1] not in ['。', '?', '!']: texts.append((name, sen + '。')) else: texts.append((name, sen)) for i, (name, text) in tqdm(enumerate(texts)): phone_seq = np.array(text_to_sequence(text, ['chinese_cleaners']))[None, :] phones = torch.autograd.Variable( torch.from_numpy(phone_seq)).cuda().long() if hparams.bert == False: sequence = phones # Extract BERT embeddings else: features = extract_embeddings(bert, tokenizer, text) sequence = (phones, features) mel_outputs, mel_outputs_postnet, _, alignments = model.inference( sequence) if args.alignment: plot_data((mel_outputs_postnet.float().data.cpu().numpy()[0], alignments.float().data.cpu().numpy()[0].T),
audiopath, test_text, speaker = dataloader.audiopaths_and_text[batch_idx[i]] #copyfile(audiopath, os.path.join(output_dir, 'ref_true.wav')) fname_wav = os.path.join(output_dir, 'ref_true_{}.wav'.format(i)) mel_outputs_postnet = batch['support']['mel_padded'][ref_idx:ref_idx+1] # remove pad #mel_len = int(batch['support']['f0_padded'][ref_idx].sum().item()) mel_len = (mel_outputs_postnet.mean(1) != 0).sum() mel_outputs_postnet = mel_outputs_postnet[:,:,:mel_len] audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[:,0] write(fname_wav, hparams.sampling_rate, audio[0].data.cpu().numpy()) save_figure(mel_outputs_postnet[0].data.cpu().numpy(), np.zeros((10,10)), fname_wav.replace('.wav', '.png'), description=test_text) text_encoded = torch.LongTensor( text_to_sequence(test_text, hparams.text_cleaners, arpabet_dict) )[None,:].cuda() text_lengths = torch.LongTensor( [len(text_encoded)]).cuda() input_dict = {'query': {'text_padded': text_encoded, 'input_lengths': text_lengths}, 'support': batch['support']} with torch.no_grad(): mel_outputs, mel_outputs_postnet, gate_outputs, alignments = model.inference(input_dict) audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[:,0] fname_wav = os.path.join(output_dir, 'ref_pred_{}.wav'.format(i)) write(fname_wav, hparams.sampling_rate, audio[0].data.cpu().numpy()) save_figure(mel_outputs_postnet[0].data.cpu().numpy(),
copyfile(audio_path, fname_wav) # save waveglow original mel mel = load_mel(audio_path) fname_wav = os.path.join(output_dir, 'ref_recon_{}.wav'.format(idx)) with torch.no_grad(): audio = denoiser(waveglow.infer(mel, sigma=0.8), 0.01)[:, 0] write(fname_wav, hparams.sampling_rate, audio[0].data.cpu().numpy()) fname_fig = os.path.join(output_dir, 'true_mel_{}.png'.format(idx)) save_figure(mel[0].data.cpu().numpy(), np.zeros((10, 10)), fname_fig, text) # save waveglow prediction mel fname_wav = os.path.join(output_dir, 'pred_{}.wav'.format(idx)) text_encoded = torch.LongTensor(\ text_to_sequence(text, hparams.text_cleaners, arpabet_dict))[None,:].cuda().long() with torch.no_grad(): _, mel_post, _, attn = model.inference((text_encoded, mel)) audio = denoiser(waveglow.infer(mel_post, sigma=0.8), 0.01)[:, 0] write(fname_wav, hparams.sampling_rate, audio[0].data.cpu().numpy()) fname_fig = os.path.join(output_dir, 'pred_mel_{}.png'.format(idx)) save_figure(mel_post[0].data.cpu().numpy(), attn[0].data.cpu().numpy(), fname_fig, text) print(idx, text) # non-parallel predictions for text in test_text_list: text_encoded = torch.LongTensor( text_to_sequence(text, hparams.text_cleaners,
def get_text(self, text):
    text_norm = torch.IntTensor(text_to_sequence(
        text, self.text_cleaners))  # self.cmudict, self.p_arpabet))
    return text_norm
def get_text(self, transcript): text = text_to_sequence(transcript, cleaner_names=hps.cleaner_names) text = torch.IntTensor(text) return text
melgan_path = 'models/multi_speaker.pt' load_vocoder_melgan(melgan_path) ## Setup dataloaders arpabet_dict = cmudict.CMUDict('data/cmu_dictionary') audio_paths = 'data/examples_filelist.txt' dataloader = TextMelLoader(audio_paths, hparams) datacollate = TextMelCollate(1) ## Load data file_idx = 0 audio_path, text, sid = dataloader.audiopaths_and_text[file_idx] # get audio path, encoded text, pitch contour and mel for gst text_encoded = torch.LongTensor(text_to_sequence(text, hparams.text_cleaners, arpabet_dict))[None, :].cuda() pitch_contour = dataloader[file_idx][3][None].cuda() mel = load_mel(audio_path) print(audio_path, text) ## Define Speakers Set speaker_ids = TextMelLoader("filelists/libritts_train_clean_100_audiopath_text_sid_atleast5min_val_filelist.txt", hparams).speaker_ids speakers = pd.read_csv('filelists/libritts_speakerinfo.txt', engine='python', header=None, comment=';', sep=' *\| *', names=['ID', 'SEX', 'SUBSET', 'MINUTES', 'NAME']) speakers['MELLOTRON_ID'] = speakers['ID'].apply(lambda x: speaker_ids[x] if x in speaker_ids else -1) female_speakers = cycle( speakers.query("SEX == 'F' and MINUTES > 20 and MELLOTRON_ID >= 0")['MELLOTRON_ID'].sample(frac=1).tolist()) male_speakers = cycle( speakers.query("SEX == 'M' and MINUTES > 20 and MELLOTRON_ID >= 0")['MELLOTRON_ID'].sample(frac=1).tolist())
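Downstream, these cycles are typically consumed one id at a time when choosing a reference speaker for synthesis. A brief sketch continuing the notebook above, roughly following the Mellotron inference notebook (the LongTensor wrapping is an assumption about how the speaker id is fed to the model):

# Draw the next female LibriTTS speaker id from the shuffled cycle and wrap it in a tensor.
speaker_id = next(female_speakers)
speaker_id = torch.LongTensor([speaker_id]).cuda()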
def get_text(self, text): #print(text) pyin, txt = get_pyin(text) #print(pyin) text_norm = torch.IntTensor(text_to_sequence(pyin, self.text_cleaners)) return text_norm
def synthesize(self,
               texts=None,
               tokens=None,
               base_path=None,
               paths=None,
               speaker_ids=None,
               start_of_sentence=None,
               end_of_sentence=True,
               pre_word_num=0,
               post_word_num=0,
               pre_surplus_idx=0,
               post_surplus_idx=1,
               use_short_concat=False,
               manual_attention_mode=0,
               base_alignment_path=None,
               librosa_trim=False,
               attention_trim=True,
               isKorean=True):
    # Possible inputs:
    # 1) text=text
    # 2) text=texts
    # 3) tokens=tokens, texts=texts  # use texts as guide

    if type(texts) == str:
        texts = [texts]

    if texts is not None and tokens is None:
        sequences = [text_to_sequence(text) for text in texts]
    elif tokens is not None:
        sequences = tokens

    if paths is None:
        paths = [None] * len(sequences)
    if texts is None:
        texts = [None] * len(sequences)

    time_str = get_time()

    def plot_and_save_parallel(wavs, alignments, use_manual_attention):
        items = list(enumerate(zip(wavs, alignments, paths, texts, sequences)))
        fn = partial(plot_graph_and_save_audio,
                     base_path=base_path,
                     start_of_sentence=start_of_sentence,
                     end_of_sentence=end_of_sentence,
                     pre_word_num=pre_word_num,
                     post_word_num=post_word_num,
                     pre_surplus_idx=pre_surplus_idx,
                     post_surplus_idx=post_surplus_idx,
                     use_short_concat=use_short_concat,
                     use_manual_attention=use_manual_attention,
                     librosa_trim=librosa_trim,
                     attention_trim=attention_trim,
                     time_str=time_str,
                     isKorean=isKorean)
        return parallel_run(fn, items, desc="plot_graph_and_save_audio", parallel=False)

    input_lengths = np.argmax(np.array(sequences) == 1, 1)

    fetches = [
        #self.wav_output,
        self.model.linear_outputs,
        self.model.alignments,
    ]

    feed_dict = {
        self.model.inputs: sequences,
        self.model.input_lengths: input_lengths,
    }
    if base_alignment_path is None:
        feed_dict.update({
            self.model.manual_alignments: np.zeros([1, 1, 1]),
            self.model.is_manual_attention: False,
        })
    else:
        manual_alignments = []
        alignment_path = os.path.join(base_alignment_path, os.path.basename(base_path))
        for idx in range(len(sequences)):
            numpy_path = "{}.{}.npy".format(alignment_path, idx)
            manual_alignments.append(np.load(numpy_path))
        alignments_T = np.transpose(manual_alignments, [0, 2, 1])
        feed_dict.update({
            self.model.manual_alignments: alignments_T,
            self.model.is_manual_attention: True,
        })

    if speaker_ids is not None:
        if type(speaker_ids) == dict:
            speaker_embed_table = self.sess.run(self.model.speaker_embed_table)
            speaker_embed = [speaker_ids[speaker_id] * speaker_embed_table[speaker_id]
                             for speaker_id in speaker_ids]
            feed_dict.update({self.model.speaker_embed_table: np.tile()})
        else:
            feed_dict[self.model.speaker_id] = speaker_ids

    wavs, alignments = self.sess.run(fetches, feed_dict=feed_dict)
    results = plot_and_save_parallel(wavs, alignments, True)

    if manual_attention_mode > 0:
        # argmax one hot
        if manual_attention_mode == 1:
            alignments_T = np.transpose(alignments, [0, 2, 1])  # [N, E, D]
            new_alignments = np.zeros_like(alignments_T)
            for idx in range(len(alignments)):
                argmax = alignments[idx].argmax(1)
                new_alignments[idx][(argmax, range(len(argmax)))] = 1
        # sharpening
        elif manual_attention_mode == 2:
            new_alignments = np.transpose(alignments, [0, 2, 1])  # [N, E, D]
            for idx in range(len(alignments)):
                var = np.var(new_alignments[idx], 1)
                mean_var = var[:input_lengths[idx]].mean()
                new_alignments[idx] = np.power(new_alignments[idx], 2)
        # pruning
        elif manual_attention_mode == 3:
            new_alignments = np.transpose(alignments, [0, 2, 1])  # [N, E, D]
            for idx in range(len(alignments)):
                argmax = alignments[idx].argmax(1)
                new_alignments[idx][(argmax, range(len(argmax)))] = 1

        feed_dict.update({
            self.model.manual_alignments: new_alignments,
            self.model.is_manual_attention: True,
        })
        new_wavs, new_alignments = self.sess.run(fetches, feed_dict=feed_dict)
        results = plot_and_save_parallel(new_wavs, new_alignments, True)

    return results
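To see what manual_attention_mode == 1 does, here is a self-contained toy example of turning a soft alignment into an argmax one-hot alignment. The shapes are tiny made-up values, not the real [N, E, D] tensors.

import numpy as np

# One utterance: 3 encoder steps (text tokens) x 4 decoder steps of attention weights.
alignment = np.array([[0.7, 0.2, 0.1, 0.0],
                      [0.2, 0.6, 0.3, 0.1],
                      [0.1, 0.2, 0.6, 0.9]])   # shape [E, D]

one_hot = np.zeros_like(alignment.T)           # shape [D, E], matching the model's (N, D, E) layout
argmax = alignment.argmax(1)                   # decoder step where each token peaks: [0, 1, 3]
one_hot[(argmax, range(len(argmax)))] = 1      # mark only the peak frame of each text token
print(one_hot)
# [[1. 0. 0.]
#  [0. 1. 0.]
#  [0. 0. 0.]
#  [0. 0. 1.]]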
def get_text(self, text): text_norm = torch.LongTensor(text_to_sequence(text, [self.text_cleaners])) return text_norm
def get_text(self, text):
    text_norm = torch.IntTensor(text_to_sequence(text, self.text_cleaners))
    return text_norm
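The various get_text helpers above all return variable-length IntTensors. A minimal sketch of batching such tensors with padding; the cleaner list and the sample sentences are made up.

import torch
from torch.nn.utils.rnn import pad_sequence
from text import text_to_sequence

texts = ["Hello there.", "A somewhat longer example sentence."]
seqs = [torch.IntTensor(text_to_sequence(t, ["english_cleaners"])) for t in texts]

# Pad to the longest sequence in the batch; keep the true lengths for the encoder.
lengths = torch.IntTensor([len(s) for s in seqs])
batch = pad_sequence(seqs, batch_first=True, padding_value=0)
print(batch.shape, lengths)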
def build_from_path(config): warning("Sampling rate: {}".format(hparams.sample_rate)) executor = ProcessPoolExecutor(max_workers=config.num_workers) futures = [] index = 1 base_dir = os.path.dirname(config.metadata_path) data_dir = os.path.join(base_dir, config.data_dirname) makedirs(data_dir) loss_coeff = defaultdict(one) if config.metadata_path.endswith("json"): with open(config.metadata_path) as f: content = f.read() info = json.loads(content) elif config.metadata_path.endswith("csv"): with open(config.metadata_path) as f: info = {} for line in f: path, text = line.strip().split('|') info[path] = text else: raise Exception(" [!] Unkown metadata format: {}".format(config.metadata_path)) new_info = {} for path in info.keys(): if not os.path.exists(path): new_path = os.path.join(base_dir, path) if not os.path.exists(new_path): print(" [!] Audio not found: {}".format([path, new_path])) continue else: new_path = path new_info[new_path] = info[path] info = new_info for path in info.keys(): if type(info[path]) == list: if hparams.ignore_recognition_level == 1 and len(info[path]) == 1 or \ hparams.ignore_recognition_level == 2: loss_coeff[path] = hparams.recognition_loss_coeff info[path] = info[path][0] ignore_description = { 0: "use all", 1: "ignore only unmatched_alignment", 2: "fully ignore recognitio", } print(" [!] Skip recognition level: {} ({})". \ format(hparams.ignore_recognition_level, ignore_description[hparams.ignore_recognition_level])) for audio_path, text in info.items(): if hparams.ignore_recognition_level > 0 and loss_coeff[audio_path] != 1: continue if base_dir not in audio_path: audio_path = os.path.join(base_dir, audio_path) try: tokens = text_to_sequence(text) except: continue fn = partial( _process_utterance, audio_path, data_dir, tokens, loss_coeff[audio_path]) futures.append(executor.submit(fn)) n_frames = [future.result() for future in tqdm(futures)] n_frames = [n_frame for n_frame in n_frames if n_frame is not None] hours = frames_to_hours(n_frames) print(' [*] Loaded metadata for {} examples ({:.2f} hours)'.format(len(n_frames), hours)) print(' [*] Max length: {}'.format(max(n_frames))) print(' [*] Min length: {}'.format(min(n_frames))) plot_n_frames(n_frames, os.path.join( base_dir, "n_frames_before_filter.png")) min_n_frame = hparams.reduction_factor * hparams.min_iters max_n_frame = hparams.reduction_factor * hparams.max_iters - hparams.reduction_factor n_frames = [n for n in n_frames if min_n_frame <= n <= max_n_frame] hours = frames_to_hours(n_frames) print(' [*] After filtered: {} examples ({:.2f} hours)'.format(len(n_frames), hours)) print(' [*] Max length: {}'.format(max(n_frames))) print(' [*] Min length: {}'.format(min(n_frames))) plot_n_frames(n_frames, os.path.join( base_dir, "n_frames_after_filter.png"))
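For reference, the frames_to_hours conversion used above boils down to simple arithmetic: each spectrogram frame advances the signal by one hop. A hedged stand-in, not the repo's actual helper; hop_size=256 and sample_rate=22050 are illustrative values rather than values read from hparams.

def frames_to_hours_sketch(n_frames, hop_size=256, sample_rate=22050):
    # Total duration in hours: (frames * hop_size) samples / sample_rate seconds / 3600.
    return sum(n_frames) * hop_size / sample_rate / 3600.0

print(frames_to_hours_sketch([800, 1200, 1000]))   # ~0.0097 hours of audio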