def load_phoneme_sequence(self, wav_file, text):
    """Return the phoneme ID sequence for `text`, cached on disk per wav file.

    The cache file is derived from the wav file's basename and stored under
    `self.phoneme_cache_path`. On a cache miss — or when the cached file
    cannot be read back — the sequence is recomputed from `text` and the
    cache entry is (re)written.

    Args:
        wav_file: path of the audio file the text belongs to; only its
            basename is used to key the cache.
        text: raw transcription to convert to phoneme IDs.

    Returns:
        np.ndarray of dtype int32 holding the phoneme ID sequence.
    """
    file_name = os.path.basename(wav_file).split('.')[0]
    tmp_path = os.path.join(self.phoneme_cache_path,
                            file_name + '_phoneme.npy')
    if os.path.isfile(tmp_path):
        # Narrow exceptions: np.load raises IOError/OSError on unreadable
        # files and ValueError on malformed .npy content. A bare `except:`
        # here would also swallow KeyboardInterrupt etc.
        try:
            return np.load(tmp_path)
        except (IOError, OSError, ValueError):
            print(" > ERROR: phoneme cannot be loaded for {}. Recomputing.".
                  format(wav_file))
    # Cache miss or corrupt cache: recompute once (the original duplicated
    # this computation in two branches) and refresh the cache entry.
    text = np.asarray(
        phoneme_to_sequence(text, [self.cleaners],
                            language=self.phoneme_language,
                            enable_eos_bos=self.enable_eos_bos),
        dtype=np.int32)
    np.save(tmp_path, text)
    return text
def load_tts(self, model_path, model_file, model_config, use_cuda):
    """Load the TTS model, its config and audio processor, and prepare it
    for inference.

    Sets up `self.input_adapter` to map a sentence string to an ID
    sequence (phoneme- or character-based depending on the config),
    restores the checkpoint weights, and caps the decoder at 3000 steps.

    Args:
        model_path: directory holding the model and config files.
        model_file: checkpoint file name inside `model_path`.
        model_config: config file name inside `model_path`.
        use_cuda: move the model to GPU when True.
    """
    config_path = os.path.join(model_path, model_config)
    self.model_file = os.path.join(model_path, model_file)
    print(" > Loading TTS model ...")
    print(" | > model config: ", config_path)
    print(" | > model file: ", model_file)
    self.tts_config = load_config(config_path)
    self.use_phonemes = self.tts_config.use_phonemes
    self.ap = AudioProcessor(**self.tts_config.audio)
    if self.use_phonemes:
        self.input_size = len(phonemes)
        self.input_adapter = lambda sen: phoneme_to_sequence(
            sen, [self.tts_config.text_cleaner],
            self.tts_config.phoneme_language,
            self.tts_config.enable_eos_bos_chars)
    else:
        self.input_size = len(symbols)
        self.input_adapter = lambda sen: text_to_sequence(
            sen, [self.tts_config.text_cleaner])
    self.tts_model = setup_model(self.input_size, self.tts_config)
    # Without CUDA, remap storages onto the CPU while loading.
    map_location = None if use_cuda else (lambda storage, loc: storage)
    checkpoint = torch.load(self.model_file, map_location=map_location)
    self.tts_model.load_state_dict(checkpoint['model'])
    if use_cuda:
        self.tts_model.cuda()
    self.tts_model.eval()
    self.tts_model.decoder.max_decoder_steps = 3000
def generate_phoneme_sequence(text, phoneme_file):
    """Convert `text` to a phoneme ID array, save it to `phoneme_file`,
    and return it.

    Uses the 'phoneme_cleaners' pipeline and US English ('en-us');
    EOS/BOS markers are not added.
    """
    seq = phoneme_to_sequence(text, ['phoneme_cleaners'],
                              language='en-us',
                              enable_eos_bos=False)
    seq = np.asarray(seq, dtype=np.int32)
    np.save(phoneme_file, seq)
    return seq
def load_model(self, model_path, model_config, wavernn_path, use_cuda):
    """Load a Tacotron model plus a WaveRNN vocoder for inference.

    Builds the sentence-to-IDs adapter from the config (phonemes or raw
    characters), restores the Tacotron checkpoint with attention
    windowing enabled and an 8000-step decoder cap, loads the WaveRNN
    weights, and precomputes a 65–7600 Hz band-pass FIR filter.

    Args:
        model_path: path of the Tacotron checkpoint file.
        model_config: path of the model config file.
        wavernn_path: path of the WaveRNN vocoder weights.
        use_cuda: move the Tacotron model to GPU when True.
    """
    self.model_file = model_path
    print(" > Loading model ...")
    print(" | > model config: ", model_config)
    print(" | > model file: ", self.model_file)
    config = load_config(model_config)
    self.config = config
    self.use_cuda = use_cuda
    self.use_phonemes = config.use_phonemes
    self.ap = AudioProcessor(**config.audio)
    if self.use_phonemes:
        self.input_size = len(phonemes)
        self.input_adapter = lambda sen: phoneme_to_sequence(
            sen, [self.config.text_cleaner], self.config.phoneme_language)
    else:
        self.input_size = len(symbols)
        self.input_adapter = lambda sen: text_to_sequence(
            sen, [self.config.text_cleaner])
    self.model = Tacotron(self.input_size,
                          config.embedding_size,
                          self.ap.num_freq,
                          self.ap.num_mels,
                          config.r,
                          attn_windowing=True)
    self.model.decoder.max_decoder_steps = 8000
    # Restore checkpoint weights; remap onto CPU when CUDA is unavailable.
    map_location = None if use_cuda else (lambda storage, loc: storage)
    checkpoint = torch.load(self.model_file, map_location=map_location)
    self.model.load_state_dict(checkpoint['model'])
    if use_cuda:
        self.model.cuda()
    self.model.eval()
    self.vocoder = WaveRNNVocoder.Vocoder()
    self.vocoder.loadWeights(wavernn_path)
    self.firwin = signal.firwin(1025, [65, 7600], pass_zero=False, fs=16000)
def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CONFIG, spectrogram=None, output_path=None):
    """Plot alignment, stop tokens and spectrogram(s) for one synthesis run.

    Produces a 3-row figure (alignment, stop-token curve, postnet
    spectrogram), or 4 rows when a pre-postnet `spectrogram` is also given.
    If `output_path` is set the figure is saved there and closed.

    NOTE(review): `alignment`/`spectrogram_postnet`/`spectrogram` are
    transposed before plotting, so they are presumably (decoder_steps, x)
    shaped arrays — confirm against the caller. `stop_tokens` is a torch
    tensor (it is `.detach().to('cpu').numpy()`-ed below).
    """
    # One extra subplot row when the pre-postnet spectrogram is provided.
    if spectrogram is not None:
        num_plot = 4
    else:
        num_plot = 3
    label_fontsize = 16
    fig = plt.figure(figsize=(8, 24))
    # Row 1: attention alignment matrix (encoder steps vs decoder steps).
    plt.subplot(num_plot, 1, 1)
    plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
    plt.xlabel("Decoder timestamp", fontsize=label_fontsize)
    plt.ylabel("Encoder timestamp", fontsize=label_fontsize)
    if CONFIG.use_phonemes:
        # Convert the text through the phoneme pipeline and back so the
        # y-axis tick labels match the actual encoder input symbols.
        seq = phoneme_to_sequence(text, [CONFIG.text_cleaner], CONFIG.phoneme_language, CONFIG.enable_eos_bos_chars)
        text = sequence_to_phoneme(seq)
        print(text)
    plt.yticks(range(len(text)), list(text))
    plt.colorbar()
    # Row 2: per-step stop-token probabilities.
    stop_tokens = stop_tokens.squeeze().detach().to('cpu').numpy()
    plt.subplot(num_plot, 1, 2)
    plt.plot(range(len(stop_tokens)), list(stop_tokens))
    # Row 3: postnet output spectrogram.
    plt.subplot(num_plot, 1, 3)
    librosa.display.specshow(spectrogram_postnet.T, sr=CONFIG.audio['sample_rate'], hop_length=hop_length, x_axis="time", y_axis="linear")
    plt.xlabel("Time", fontsize=label_fontsize)
    plt.ylabel("Hz", fontsize=label_fontsize)
    plt.tight_layout()
    plt.colorbar()
    # Row 4 (optional): decoder output spectrogram before the postnet.
    if spectrogram is not None:
        plt.subplot(num_plot, 1, 4)
        librosa.display.specshow(spectrogram.T, sr=CONFIG.audio['sample_rate'], hop_length=hop_length, x_axis="time", y_axis="linear")
        plt.xlabel("Time", fontsize=label_fontsize)
        plt.ylabel("Hz", fontsize=label_fontsize)
        plt.tight_layout()
        plt.colorbar()
    if output_path:
        print(output_path)
        fig.savefig(output_path)
        plt.close()
def load_model(self, model_path, model_name, model_config, use_cuda):
    """Load a Tacotron model from `model_path` and prepare it for inference.

    Builds the sentence-to-IDs adapter from the config (phonemes or raw
    characters), constructs the Tacotron network, restores the checkpoint
    weights, and switches the model to eval mode.

    Args:
        model_path: directory containing the checkpoint and config files.
        model_name: checkpoint file name inside `model_path`.
        model_config: config file name inside `model_path`.
        use_cuda: move the model to GPU when True.
    """
    model_config = os.path.join(model_path, model_config)
    self.model_file = os.path.join(model_path, model_name)
    print(" > Loading model ...")
    print(" | > model config: ", model_config)
    print(" | > model file: ", self.model_file)
    config = load_config(model_config)
    self.config = config
    self.use_cuda = use_cuda
    self.use_phonemes = config.use_phonemes
    self.ap = AudioProcessor(**config.audio)
    if self.use_phonemes:
        self.input_size = len(phonemes)
        self.input_adapter = lambda sen: phoneme_to_sequence(
            sen, [self.config.text_cleaner], self.config.phoneme_language)
    else:
        self.input_size = len(symbols)
        self.input_adapter = lambda sen: text_to_sequence(
            sen, [self.config.text_cleaner])
    self.model = Tacotron(self.input_size,
                          config.embedding_size,
                          self.ap.num_freq,
                          self.ap.num_mels,
                          config.r)
    # Restore checkpoint weights; remap onto CPU when CUDA is unavailable.
    map_location = None if use_cuda else (lambda storage, loc: storage)
    checkpoint = torch.load(self.model_file, map_location=map_location)
    self.model.load_state_dict(checkpoint['model'])
    if use_cuda:
        self.model.cuda()
    self.model.eval()
def tts(self, text):
    """Synthesize `text` sentence by sentence and return the audio.

    Splits on '.', converts each non-trivial sentence to phoneme IDs,
    runs the model, inverts the linear spectrogram to a waveform, and
    concatenates the sentence waveforms separated by short silences.

    Args:
        text: input text; sentences are delimited by '.'.

    Returns:
        io.BytesIO holding the rendered wav data.
    """
    text_cleaner = [self.config.text_cleaner]
    wavs = []
    for sen in text.split('.'):
        # Strip BEFORE the length check so whitespace-padded fragments
        # (e.g. "  a ") are filtered like their bare equivalents; the
        # original stripped twice and measured the unstripped string.
        sen = sen.strip()
        if len(sen) < 3:
            continue
        sen += '.'
        print(sen)
        seq = np.array(
            phoneme_to_sequence(sen, text_cleaner,
                                self.config.phoneme_language))
        chars_var = torch.from_numpy(seq).unsqueeze(0).long()
        if self.use_cuda:
            chars_var = chars_var.cuda()
        # Inference only: skip autograd graph construction to save memory.
        with torch.no_grad():
            mel_out, linear_out, alignments, stop_tokens = \
                self.model.forward(chars_var)
        linear_out = linear_out[0].data.cpu().numpy()
        wav = self.ap.inv_spectrogram(linear_out.T)
        wavs += list(wav)
        wavs += [0] * 10000  # short silence between sentences
    out = io.BytesIO()
    self.save_wav(wavs, out)
    return out
def load_phoneme_sequence(self, wav_file, text):
    """Return the phoneme ID sequence for `text`, cached on disk per wav file.

    The cache file is derived from the wav file's basename and stored under
    `self.phoneme_cache_path`. On a cache miss — or when the cached file
    cannot be read back — the sequence is recomputed and the cache entry is
    (re)written. The recompute-on-corrupt-cache behavior matches the other
    `load_phoneme_sequence` implementation in this file; the original of
    this variant crashed on an unreadable cache file.

    Args:
        wav_file: path of the audio file; only its basename keys the cache.
        text: raw transcription to convert to phoneme IDs.

    Returns:
        np.ndarray of dtype int32 holding the phoneme ID sequence.
    """
    file_name = os.path.basename(wav_file).split('.')[0]
    tmp_path = os.path.join(self.phoneme_cache_path,
                            file_name + '_phoneme.npy')
    if os.path.isfile(tmp_path):
        # np.load raises IOError/OSError on unreadable files and
        # ValueError on malformed .npy content; recompute in those cases.
        try:
            return np.load(tmp_path)
        except (IOError, OSError, ValueError):
            print(" > ERROR: phoneme cannot be loaded for {}. Recomputing.".
                  format(wav_file))
    text = np.asarray(
        phoneme_to_sequence(text, [self.cleaners],
                            language=self.phoneme_language),
        dtype=np.int32)
    np.save(tmp_path, text)
    return text
def tts(model, text, ap):
    """Synthesize `text` with `model` and return the waveform.

    Converts the text to a phoneme ID tensor (US English,
    'phoneme_cleaners', no EOS/BOS), runs model inference, and inverts
    the postnet spectrogram through the audio processor `ap`.
    """
    seq = phoneme_to_sequence(text, ['phoneme_cleaners'],
                              language='en-us',
                              enable_eos_bos=False)
    seq = np.asarray(seq, dtype=np.int32)
    inputs = torch.from_numpy(seq).unsqueeze(0).long().to(device)
    decoder_output, postnet_output, alignments, stop_tokens = \
        model.inference(inputs)
    spec = postnet_output[0].data.cpu().numpy()
    return ap.inv_spectrogram(spec.T)
def _generate_and_cache_phoneme_sequence(self, text, cache_path):
    """Compute the phoneme ID sequence for `text` and persist it at
    `cache_path`.

    EOS/BOS characters are never added here: the result is cached, so
    those markers are appended dynamically later according to the active
    config option.
    """
    seq = phoneme_to_sequence(text, [self.cleaners],
                              language=self.phoneme_language,
                              enable_eos_bos=False)
    seq = np.asarray(seq, dtype=np.int32)
    np.save(cache_path, seq)
    return seq
def load_model(self, model_path, model_name, model_config, use_cuda):
    """Load a Tacotron model and prepare it for inference.

    Builds the sentence-to-IDs adapter from the config (phonemes or raw
    characters), constructs the Tacotron network from config values,
    restores the checkpoint weights, and switches to eval mode.

    Args:
        model_path: directory containing the checkpoint and config files.
        model_name: checkpoint file name inside `model_path`.
        model_config: config file name inside `model_path`.
        use_cuda: move the model to GPU when True.
    """
    # Resolve the config and checkpoint paths inside the model directory.
    model_config = os.path.join(model_path, model_config)
    model_file = os.path.join(model_path, model_name)
    print(" > Loading model ...")
    print(" | > Model config path: ", model_config)
    print(" | > Model file path: ", model_file)
    config = load_config(model_config)
    self.use_cuda = use_cuda
    self.use_phonemes = config.use_phonemes
    self.ap = AudioProcessor(**config.audio)
    if self.use_phonemes:
        self.input_size = len(phonemes)
        self.input_adapter = lambda sen: phoneme_to_sequence(
            sen, [config.text_cleaner], config.phoneme_language)
    else:
        self.input_size = len(symbols)
        self.input_adapter = lambda sen: text_to_sequence(
            sen, [config.text_cleaner])
    self.model = Tacotron(num_chars=config['num_chars'],
                          embedding_dim=config['embedding_size'],
                          linear_dim=self.ap.num_freq,
                          mel_dim=self.ap.num_mels,
                          r=config['r'])
    # Restore checkpoint weights; remap onto CPU when CUDA is unavailable.
    map_location = None if use_cuda else (lambda storage, loc: storage)
    checkpoint = torch.load(model_file, map_location=map_location)
    self.model.load_state_dict(checkpoint['model'])
    if use_cuda:
        self.model.cuda()
    # Eval mode disables dropout/batch-norm training behavior.
    self.model.eval()