import io

import numpy as np
import torch
from torch.autograd import Variable

# hp (hyperparameters), text_to_sequence, inv_spectrogram, find_endpoint,
# save_wav, _prepare_data and the Tacotron/symbol definitions are
# project-local helpers; their exact module paths depend on the surrounding
# repository and are therefore not spelled out here.


def generate(model, text):
    # Text to index sequence
    cleaner_names = [x.strip() for x in hp.cleaners.split(',')]
    seq = np.expand_dims(np.asarray(text_to_sequence(text, cleaner_names), dtype=np.int32), axis=0)

    # Provide [GO] Frame
    mel_input = np.zeros([seq.shape[0], hp.num_mels, 1], dtype=np.float32)

    # Variables
    characters = Variable(torch.from_numpy(seq).type(torch.cuda.LongTensor), volatile=True).cuda()
    mel_input = Variable(torch.from_numpy(mel_input).type(torch.cuda.FloatTensor), volatile=True).cuda()

    # Spectrogram to wav
    _, linear_output = model.forward(characters, mel_input)
    wav = inv_spectrogram(linear_output[0].data.cpu().numpy())
    wav = wav[:find_endpoint(wav)]
    out = io.BytesIO()
    save_wav(wav, out)
    return out.getvalue()
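# NOTE: `Variable(..., volatile=True)` has been a no-op since PyTorch 0.4.
# Below is a minimal sketch of the same inference path using torch.no_grad();
# `model`, `hp`, `text_to_sequence`, `inv_spectrogram`, `find_endpoint` and
# `save_wav` are assumed to be the same project helpers used above, and
# generate_no_grad is a hypothetical name, not part of the original code.
def generate_no_grad(model, text):
    cleaner_names = [x.strip() for x in hp.cleaners.split(',')]
    seq = np.expand_dims(np.asarray(text_to_sequence(text, cleaner_names), dtype=np.int32), axis=0)
    mel_input = np.zeros([seq.shape[0], hp.num_mels, 1], dtype=np.float32)
    with torch.no_grad():  # replaces Variable(..., volatile=True)
        characters = torch.from_numpy(seq).long().cuda()
        mel_input = torch.from_numpy(mel_input).float().cuda()
        _, linear_output = model(characters, mel_input)
    wav = inv_spectrogram(linear_output[0].cpu().numpy())
    wav = wav[:find_endpoint(wav)]
    out = io.BytesIO()
    save_wav(wav, out)
    return out.getvalue()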
def generate(model, text, device, writer, curr, _tt):
    # Text to index sequence
    cleaner_names = [x.strip() for x in hp.cleaners.split(',')]
    seq = np.expand_dims(np.asarray(text_to_sequence(text), dtype=np.int32), axis=0)

    # Provide [GO] Frame
    mel_input = np.zeros([seq.shape[0], hp.num_mels, 1], dtype=np.float32)

    # Variables
    characters = torch.from_numpy(seq).type(torch.cuda.LongTensor).to(device)
    mel_input = torch.from_numpy(mel_input).type(torch.cuda.FloatTensor).to(device)
    mel_input = torch.transpose(mel_input, 1, 2)

    # Spectrogram to wav
    _, linear_output = model(characters, mel_input, False)
    linear_output = torch.transpose(linear_output, 1, 2)
    wav = inv_spectrogram(linear_output[0].data.cpu().numpy())
    wav = wav[:find_endpoint(wav)]

    # Normalize to [-1, 1] before logging to TensorBoard (the normalized
    # array, not the raw one, is the one converted to a tensor)
    wav_norm = wav * 1.0 / max(0.01, np.max(np.abs(wav)))
    wav_tensor = torch.Tensor(wav_norm).to(device).view(1, -1)
    writer.add_audio('audio_result_%02d' % _tt, wav_tensor, curr, hp.sample_rate)

    out = io.BytesIO()
    save_wav(wav, out)
    return out.getvalue()
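# The function above normalizes the waveform before logging because
# TensorBoard's add_audio expects samples roughly in [-1, 1] with shape (1, L).
# A self-contained sketch with a synthetic sine wave (no project code assumed;
# requires the tensorboard package for torch.utils.tensorboard):
def _add_audio_demo():
    from torch.utils.tensorboard import SummaryWriter

    sample_rate = 22050
    t = np.linspace(0.0, 1.0, sample_rate, endpoint=False)
    wav = 0.5 * np.sin(2 * np.pi * 440.0 * t)           # 1 s of a 440 Hz tone
    wav = wav / max(0.01, np.max(np.abs(wav)))          # same normalization as above
    writer = SummaryWriter(log_dir='./runs/audio_demo') # hypothetical log dir
    writer.add_audio('audio_demo',
                     torch.tensor(wav, dtype=torch.float32).view(1, -1),
                     global_step=0, sample_rate=sample_rate)
    writer.close()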
def generate_audio(self):
    print('Generating Audio Samples')

    # Text to index sequence
    characters = []
    for text in self.sentences:
        text = np.asarray(text_to_sequence(text, [hp.cleaners]), dtype=np.int32)
        characters.append(text)
    characters = _prepare_data(characters).astype(np.int32)
    characters = torch.from_numpy(characters).long().to(self.args.device)

    # Provide [GO] Frame
    mel_input = torch.zeros([characters.shape[0], hp.num_mels, 1], dtype=torch.float).to(self.args.device)
    print('char: ', characters.shape)
    print('mel input: ', mel_input.shape)
    self.model.eval()

    # Spectrogram to wav
    _, linear_output = self.model(characters, mel_input)
    for i in range(linear_output.shape[0]):
        wav = inv_spectrogram(linear_output[i].data.cpu().numpy())
        wav = wav[:find_endpoint(wav)].astype(np.float32)
        print('wav shape: ', wav.shape)
        print('wav max: ', wav.max())
        print('wav min: ', wav.min())
        print('wav dtype: ', wav.dtype)
        # Normalize to [-1, 1] (as in generate() above) and use a distinct tag
        # per sentence so the samples do not overwrite each other in TensorBoard
        wav = wav / max(0.01, np.max(np.abs(wav)))
        self.writer.add_audio('audio_%02d' % i, wav, self.epoch, sample_rate=16000)
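# `_prepare_data` is a project helper; the sketch below shows the kind of
# right-padding it presumably performs so that variable-length index sequences
# can be stacked into one batch. This is an assumption for illustration, not
# the actual helper, and `_pad_sequences` is a hypothetical name.
def _pad_sequences(sequences, pad_value=0):
    max_len = max(len(s) for s in sequences)
    return np.stack([
        np.pad(s, (0, max_len - len(s)), mode='constant', constant_values=pad_value)
        for s in sequences
    ])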
def main(args):
    device = torch.device('cuda:0')

    if 'english' in hp.cleaners:
        _symbols = en_symbols
    elif 'korean' in hp.cleaners:
        _symbols = symbols
    else:
        raise ValueError('Unsupported cleaners: %s' % hp.cleaners)

    model = Tacotron(len(_symbols)).to(device)
    checkpoint = torch.load(args.checkpoint_path)
    model.load_state_dict(checkpoint['model'])
    model = model.eval()

    sentences = [
        'Scientists at the CERN laboratory say they have discovered a new particle.',
        'President Trump met with other leaders at the Group of 20 conference.',
        'Generative adversarial network or variational auto-encoder.',
        'Does the quick brown fox jump over the lazy dog?'
    ]

    # Text to index sequence
    for i, ele in enumerate(sentences):
        cleaner_names = [x.strip() for x in hp.cleaners.split(',')]
        seq = np.expand_dims(np.asarray(text_to_sequence(ele), dtype=np.int32), axis=0)

        # Provide [GO] Frame
        mel_input = np.zeros([seq.shape[0], hp.num_mels, 1], dtype=np.float32)

        # Variables
        characters = torch.from_numpy(seq).type(torch.cuda.LongTensor).to(device)
        mel_input = torch.from_numpy(mel_input).type(torch.cuda.FloatTensor).to(device)
        mel_input = torch.transpose(mel_input, 1, 2)

        # Spectrogram to wav
        mel_output, linear_output = model(characters, mel_input, False)
        linear_output = torch.transpose(linear_output, 1, 2)
        wav = inv_spectrogram(linear_output[0].data.cpu().numpy())
        _wav = wav[:find_endpoint(wav)]

        out = io.BytesIO()
        save_wav(_wav, out)
        with open('./result_%02d.wav' % i, 'wb') as f:
            f.write(out.getvalue())
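# A minimal driver sketch for main(); only --checkpoint_path is grounded in
# the args.checkpoint_path access above, everything else here is an assumption
# about how the script might be invoked.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Synthesize the test sentences with a trained Tacotron checkpoint')
    parser.add_argument('--checkpoint_path', type=str, required=True,
                        help='path to a checkpoint file containing a "model" state dict')
    main(parser.parse_args())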