# Shared imports for the snippets below. text_to_sequence,
# phoneme_to_sequence, Tacotron, AudioProcessor, load_config, and the
# preloaded `wavernn` vocoder are assumed to come from the surrounding
# TTS codebase.
import io
import os
import time

import numpy as np
import torch


def synthesis(m, s, CONFIG, use_cuda, ap, language=None):
    """Synthesize audio for the text `s` with the Tacotron model `m`,
    vocoding the predicted mel spectrogram with a preloaded WaveRNN."""
    if language is None:
        language = CONFIG.phoneme_language
    text_cleaner = [CONFIG.text_cleaner]
    if CONFIG.use_phonemes:
        seq = np.asarray(phoneme_to_sequence(s, text_cleaner, language),
                         dtype=np.int32)
    else:
        seq = np.asarray(text_to_sequence(s, text_cleaner), dtype=np.int32)
    chars_var = torch.from_numpy(seq).unsqueeze(0)
    if use_cuda:
        chars_var = chars_var.cuda()
    mel_spec, linear_spec, alignments, stop_tokens = m.forward(
        chars_var.long())
    linear_spec = linear_spec[0].data.cpu().numpy()  # unused below
    mel_spec = mel_spec[0].data.cpu().numpy()
    alignment = alignments[0].data.cpu().numpy()     # unused below
    # WaveRNN expects a [1, num_mels, T] tensor; follow the caller's
    # use_cuda flag rather than torch.cuda.is_available().
    mel_tensor = torch.FloatTensor(mel_spec.T).unsqueeze(0)
    if use_cuda:
        mel_tensor = mel_tensor.cuda()
    wav = wavernn.generate(mel_tensor, batched=True, target=11000,
                           overlap=550)
    return wav
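# Usage sketch for the WaveRNN variant above. The paths are hypothetical;
# the Tacotron constructor arguments mirror the tts() snippet further
# down, and `wavernn` must already be loaded at module level. `ap` is
# unused in this variant, so None is passed.
CONFIG = load_config('config.json')
model = Tacotron(CONFIG.embedding_size, CONFIG.num_freq,
                 CONFIG.num_mels, CONFIG.r)
cp = torch.load('best_model.pth.tar', map_location='cpu')
model.load_state_dict(cp['model'])
model.eval()
wav = synthesis(model, "Hello world.", CONFIG, use_cuda=False, ap=None)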
def create_speech(m, s, CONFIG, use_cuda, ap):
    text_cleaner = [CONFIG.text_cleaner]
    seq = np.array(text_to_sequence(s, text_cleaner))
    # Variable(..., volatile=True) is deprecated; torch.no_grad() is the
    # modern equivalent for inference.
    with torch.no_grad():
        chars_var = torch.from_numpy(seq).unsqueeze(0)
        if use_cuda:
            chars_var = chars_var.cuda()
        mel_out, linear_out, alignments, stop_tokens = m.forward(chars_var)
    linear_out = linear_out[0].data.cpu().numpy()
    alignment = alignments[0].data.cpu().numpy()
    spec = ap._denormalize(linear_out)
    wav = ap.inv_spectrogram(linear_out.T)
    wav = wav[:ap.find_endpoint(wav)]
    out = io.BytesIO()
    ap.save_wav(wav, out)
    return wav, alignment, spec, stop_tokens
def __getitem__(self, idx):
    wav_name = os.path.join(self.root_dir, self.frames[idx][0]) + '.wav'
    text = self.frames[idx][1]
    # Convert the transcript into an integer symbol sequence.
    text = np.asarray(text_to_sequence(text, [self.cleaners]),
                      dtype=np.int32)
    wav = np.asarray(self.load_wav(wav_name)[0], dtype=np.float32)
    sample = {'text': text, 'wav': wav, 'item_idx': self.frames[idx][0]}
    return sample
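# Usage sketch: __getitem__ returns variable-length arrays, so batching
# with torch.utils.data.DataLoader needs a padding collate function. The
# `pad_collate` below is a hypothetical minimal example, and `dataset` is
# assumed to be an instance of the class defining __getitem__ above.
from torch.utils.data import DataLoader

def pad_collate(batch):
    # Zero-pad every text/wav array to the longest item in the batch and
    # return the original lengths alongside the padded tensors.
    text_lens = [len(d['text']) for d in batch]
    wav_lens = [len(d['wav']) for d in batch]
    texts = np.zeros((len(batch), max(text_lens)), dtype=np.int64)
    wavs = np.zeros((len(batch), max(wav_lens)), dtype=np.float32)
    for i, d in enumerate(batch):
        texts[i, :text_lens[i]] = d['text']
        wavs[i, :wav_lens[i]] = d['wav']
    return (torch.from_numpy(texts), torch.LongTensor(text_lens),
            torch.from_numpy(wavs), torch.LongTensor(wav_lens))

loader = DataLoader(dataset, batch_size=32, shuffle=True,
                    collate_fn=pad_collate, num_workers=2)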
def synthesis(model, ap, text, use_cuda, text_cleaner):
    text_cleaner = [text_cleaner]
    seq = np.array(text_to_sequence(text, text_cleaner))
    # Cast to long on both paths; the original only did so on CUDA.
    chars_var = torch.from_numpy(seq).unsqueeze(0).long()
    if use_cuda:
        chars_var = chars_var.cuda()
    _, linear_out, alignments, _ = model.forward(chars_var)
    linear_out = linear_out[0].data.cpu().numpy()
    wav = ap.inv_spectrogram(linear_out.T)
    return wav, linear_out, alignments
def create_speech(m, s, CONFIG, use_cuda, ap):
    text_cleaner = [CONFIG.text_cleaner]
    seq = np.array(text_to_sequence(s, text_cleaner))
    chars_var = torch.from_numpy(seq).unsqueeze(0)
    if use_cuda:
        chars_var = chars_var.cuda()
    mel_out, linear_out, alignments, stop_tokens = m.forward(chars_var)
    linear_out = linear_out[0].data.cpu().numpy()
    alignment = alignments[0].data.cpu().numpy()
    spec = ap._denormalize(linear_out)
    wav = ap.inv_spectrogram(linear_out.T)
    wav = wav[:ap.find_endpoint(wav)]
    out = io.BytesIO()
    ap.save_wav(wav, out)
    return wav, alignment, spec, stop_tokens
def tts(text, model_path='model/best_model.pth.tar',
        config_path='model/config.json', use_cuda=False):
    CONFIG = load_config(config_path)
    model = Tacotron(CONFIG.embedding_size, CONFIG.num_freq,
                     CONFIG.num_mels, CONFIG.r)
    # Load the checkpoint at `model_path`; the original referenced an
    # undefined `seq_to_seq_test_model_fname` in the CUDA branch.
    if use_cuda:
        cp = torch.load(model_path, map_location='cuda:0')
    else:
        cp = torch.load(model_path,
                        map_location=lambda storage, loc: storage)
    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()
    model.eval()
    model.decoder.max_decoder_steps = 250
    ap = AudioProcessor(CONFIG.sample_rate, CONFIG.num_mels,
                        CONFIG.min_level_db, CONFIG.frame_shift_ms,
                        CONFIG.frame_length_ms, CONFIG.ref_level_db,
                        CONFIG.num_freq, CONFIG.power, CONFIG.preemphasis,
                        griffin_lim_iters=50)
    t_1 = time.time()
    text_cleaner = [CONFIG.text_cleaner]
    seq = np.array(text_to_sequence(text, text_cleaner))
    chars_var = torch.from_numpy(seq).unsqueeze(0)
    if use_cuda:
        chars_var = chars_var.cuda()
    # forward() returns (mel, linear, alignments, stop_tokens); unpack it
    # instead of indexing the tuple, which would grab the mel output.
    _, linear_out, _, _ = model.forward(chars_var.long())
    linear_out = linear_out[0].data.cpu().numpy()
    waveform = ap.inv_spectrogram(linear_out.T)
    waveform = waveform[:ap.find_endpoint(waveform)]
    out_path = 'static/samples/'
    os.makedirs(out_path, exist_ok=True)
    file_name = text.replace(" ", "_").replace(".", "") + ".wav"
    out_path = os.path.join(out_path, file_name)
    ap.save_wav(waveform, out_path)
    print(" > Run-time: {}".format(time.time() - t_1))
    return file_name
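# Usage sketch with the default paths above; the waveform is written to
# static/samples/ and the file name is returned.
file_name = tts("Hello world.")
print("saved static/samples/" + file_name)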
def load_data(self, idx):
    text, wav_file, speaker_name = self.items[idx]
    wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)
    if self.use_phonemes:
        # Load or generate the phoneme sequence for this utterance.
        text = self._load_or_generate_phoneme_sequence(wav_file, text)
    else:
        text = np.asarray(text_to_sequence(text, [self.cleaners]),
                          dtype=np.int32)
    assert text.size > 0, self.items[idx][1]
    assert wav.size > 0, self.items[idx][1]
    sample = {
        'text': text,
        'wav': wav,
        'item_idx': self.items[idx][1],
        'speaker_name': speaker_name
    }
    return sample
def tts(self, text):
    text_cleaner = [self.config.text_cleaner]
    wavs = []
    for sen in text.split('.'):
        if len(sen) < 3:
            continue
        sen = sen.strip() + '.'
        print(sen)
        # Synthesize the current sentence (the original passed the full
        # `text` here instead of `sen`).
        seq = np.array(text_to_sequence(sen, text_cleaner))
        chars_var = torch.from_numpy(seq).unsqueeze(0)
        if self.use_cuda:
            chars_var = chars_var.cuda()
        mel_out, linear_out, alignments, stop_tokens = self.model.forward(
            chars_var)
        linear_out = linear_out[0].data.cpu().numpy()
        wav = self.ap.inv_spectrogram(linear_out.T)
        wavs.append(wav)
        wavs.append(np.zeros(10000))  # short silence between sentences
    # Save the concatenated sentences once, after the loop; the original
    # wrote only the last sentence's waveform.
    out = io.BytesIO()
    self.save_wav(np.concatenate(wavs), out)
    return out
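# Minimal sketch of the silence-gap concatenation used above, independent
# of the model; the gap length matches the hard-coded 10000 samples.
def join_with_silence(wavs, gap_samples=10000):
    """Interleave a zero gap between consecutive waveforms."""
    pieces = []
    for w in wavs:
        pieces.append(np.asarray(w, dtype=np.float32))
        pieces.append(np.zeros(gap_samples, dtype=np.float32))
    return np.concatenate(pieces[:-1])  # drop the trailing gap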
def synthesis(m, s, CONFIG, use_cuda, ap, language=None):
    """Synthesize audio for the text `s` with the Tacotron model `m`,
    inverting the predicted linear spectrogram with Griffin-Lim."""
    if language is None:
        language = CONFIG.phoneme_language
    text_cleaner = [CONFIG.text_cleaner]
    if CONFIG.use_phonemes:
        seq = np.asarray(phoneme_to_sequence(s, text_cleaner, language),
                         dtype=np.int32)
    else:
        seq = np.asarray(text_to_sequence(s, text_cleaner), dtype=np.int32)
    chars_var = torch.from_numpy(seq).unsqueeze(0)
    if use_cuda:
        chars_var = chars_var.cuda()
    mel_spec, linear_spec, alignments, stop_tokens = m.forward(
        chars_var.long())
    linear_spec = linear_spec[0].data.cpu().numpy()
    mel_spec = mel_spec[0].data.cpu().numpy()      # unused below
    alignment = alignments[0].data.cpu().numpy()   # unused below
    wav = ap.inv_spectrogram(linear_spec.T)
    wav = wav[:ap.find_endpoint(wav)]
    return wav