def __call__(self, text, denoise=True):
    """Run text-to-speech inference (inference only for now).

    Args:
        text: The text to convert to speech.
        denoise: Whether to reduce the WaveGlow bias to denoise the audio.
    """
    with torch.no_grad():
        sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
        sequence = torch.from_numpy(sequence).long()
        if self.device.type == "cuda":
            sequence = sequence.cuda()

        mel_outputs, mel_outputs_postnet, _, alignments = self.tacotron.inference(sequence)
        audio = self.waveglow.infer(mel_outputs_postnet, sigma=0.666)
        if denoise:
            audio = self.denoiser(audio, strength=0.01)[:, 0]

    return audio, mel_outputs, mel_outputs_postnet, alignments
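# Hedged usage sketch for the wrapper above. The class name `Synthesizer` and
# the 22050 Hz sampling rate are assumptions, not part of the original snippet.
from scipy.io.wavfile import write

tts = Synthesizer()  # hypothetical: must set up .tacotron, .waveglow, .denoiser, .device
audio, mel, mel_postnet, alignments = tts("Hello world.", denoise=True)
write("hello.wav", 22050, audio[0].data.cpu().numpy())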
def main(): """ Launches text to speech (inference). Inference is executed on a single GPU. """ parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Inference') parser = parse_args(parser) args, _ = parser.parse_known_args() LOGGER.set_model_name("Tacotron2_PyT") LOGGER.set_backends([ dllg.StdOutBackend(log_file=None, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1), dllg.JsonBackend(log_file=args.log_file, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1) ]) LOGGER.register_metric("tacotron2_frames_per_sec", metric_scope=dllg.TRAIN_ITER_SCOPE) LOGGER.register_metric("tacotron2_latency", metric_scope=dllg.TRAIN_ITER_SCOPE) LOGGER.register_metric("latency", metric_scope=dllg.TRAIN_ITER_SCOPE) model = load_and_setup_model(parser, args) log_hardware() log_args(args) if args.include_warmup: sequences = torch.randint(low=0, high=148, size=(1,50), dtype=torch.long).cuda() text_lengths = torch.IntTensor([sequence.size(1)]).cuda().long() for i in range(3): with torch.no_grad(): _, mels, _, _, mel_lengths = model.infer(sequences, text_lengths) os.makedirs(args.output, exist_ok=True) LOGGER.iteration_start() measurements = {} anchor_dirs = [os.path.join(args.dataset_path, anchor) for anchor in args.anchor_dirs] metadatas = [load_metadata(anchor) for anchor in anchor_dirs] with torch.no_grad(), MeasureTime(measurements, "tacotron2_time"): for speaker_id in range(len(anchor_dirs)): metadata = metadatas[speaker_id] for mel_path, text in tqdm(metadata): seq = text_to_sequence(text, speaker_id, ['basic_cleaners']) seqs = torch.from_numpy(np.stack(seq)).unsqueeze(0) seq_lens = torch.IntTensor([len(text)]) melspec = torch.from_numpy(np.load(mel_path)) target = melspec[:, ::args.reduction_factor] targets = torch.from_numpy(np.stack(target)).unsqueeze(0) target_lengths = torch.IntTensor([target.shape[1]]) inputs = (to_gpu(seqs).long(), to_gpu(seq_lens).int(), to_gpu(targets).float(), to_gpu(target_lengths).int()) _, mel_outs, _, _ = model(inputs) fname = os.path.basename(mel_path) np.save(os.path.join(args.output, fname), mel_outs[0, :, :melspec.shape[1]], allow_pickle=False) LOGGER.log(key="tacotron2_latency", value=measurements['tacotron2_time']) LOGGER.log(key="latency", value=(measurements['tacotron2_time'])) LOGGER.iteration_stop() LOGGER.finish()
def prepare_input_sequence(texts, speaker_id):
    sequences = [text_to_sequence(text, speaker_id, ['basic_cleaners'])[:]
                 for text in texts]
    texts, text_lengths, ids_sorted_decreasing = pad_sequences(sequences)
    if torch.cuda.is_available():
        texts = texts.cuda().long()
        text_lengths = text_lengths.cuda().int()
    else:
        texts = texts.long()
        text_lengths = text_lengths.int()
    return texts, text_lengths, ids_sorted_decreasing
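# None of these snippets defines `pad_sequences`. A minimal sketch of what the
# three-value variant used above plausibly does (sort by length, zero-pad into
# one batch tensor, return the sort order) -- an inferred assumption, not the
# original helper; the two-value variants below simply omit `ids_sorted_decreasing`.
import torch

def pad_sequences(sequences):
    lengths = torch.IntTensor([len(s) for s in sequences])
    # Sort longest-first so downstream RNN packing can use the lengths directly.
    text_lengths, ids_sorted_decreasing = torch.sort(lengths, descending=True)
    padded = torch.zeros(len(sequences), int(text_lengths[0]), dtype=torch.long)
    for i, idx in enumerate(ids_sorted_decreasing):
        seq = torch.as_tensor(sequences[idx], dtype=torch.long)
        padded[i, :len(seq)] = seq
    return padded, text_lengths, ids_sorted_decreasing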
def mapper(line):
    fp, text, _ = line.strip().split('|')
    seq = text_to_sequence(text, ['english_cleaners'])
    if os.path.isfile(fp):
        with audioread.audio_open(fp) as f:
            duration = f.duration
    else:
        duration = None
    return fp, len(seq), duration
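# Hedged usage sketch: run `mapper` over a pipe-delimited metadata file
# (path|text|extra per line, as the split above implies) to total up corpus
# duration. The file name `metadata.csv` is an assumption.
with open('metadata.csv', encoding='utf-8') as f:
    rows = [mapper(line) for line in f]
total_sec = sum(dur for _, _, dur in rows if dur is not None)
print(f"{len(rows)} clips, {total_sec / 3600:.2f} hours")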
def prepare_input_sequence(texts):
    d = []
    for i, text in enumerate(texts):
        d.append(torch.IntTensor(text_to_sequence(text, ['english_cleaners'])[:]))

    text_padded, input_lengths = pad_sequences(d)
    if torch.cuda.is_available():
        text_padded = text_padded.cuda().long()
        input_lengths = input_lengths.cuda().long()
    else:
        text_padded = text_padded.long()
        input_lengths = input_lengths.long()

    return text_padded, input_lengths
def prepare_input_sequence(texts, cpu_run=False):
    d = []
    for i, text in enumerate(texts):
        d.append(torch.IntTensor(text_to_sequence(text, ['english_cleaners'])[:]))

    text_padded, input_lengths = pad_sequences(d)
    if not cpu_run:
        text_padded = text_padded.cuda().long()
        input_lengths = input_lengths.cuda().long()
    else:
        text_padded = text_padded.long()
        input_lengths = input_lengths.long()

    return text_padded, input_lengths
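# Hedged usage sketch for the helper above: batch two phrases, pad them, and
# run Tacotron 2 inference. The `tacotron2` object and the exact shape of what
# its `infer` returns vary across the forks these snippets come from, so both
# are assumptions here.
texts = ["The quick brown fox.", "Jumps over the lazy dog."]
text_padded, input_lengths = prepare_input_sequence(texts, cpu_run=True)
with torch.no_grad():
    outputs = tacotron2.infer(text_padded, input_lengths)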
def get_mel_from_tacotron2(audiopath, text):
    audio, sampling_rate = load_wav_to_torch(audiopath)
    audio = audio.numpy()
    preprocessed_wav = encoder.preprocess_wav(audio, sampling_rate)
    embed = encoder.embed_utterance(preprocessed_wav)
    embed = torch.Tensor(embed).cuda()

    sequence = np.array(text_to_sequence(text))[None, :]
    sequence = torch.from_numpy(sequence).cuda().long()

    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence, embed)
    return mel_outputs, mel_outputs_postnet, alignments
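# Hedged usage sketch: the `encoder` above matches the speaker-encoder API of
# Real-Time-Voice-Cloning style projects (preprocess_wav / embed_utterance).
# The checkpoint path, the wav path, and that `model` is an already loaded
# multi-speaker Tacotron 2 are assumptions.
from pathlib import Path
from encoder import inference as encoder

encoder.load_model(Path("encoder/saved_models/pretrained.pt"))
mel_outputs, mel_outputs_postnet, alignments = get_mel_from_tacotron2(
    "samples/reference.wav", "Hello world.")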
def prepare_input_sequence(texts, cpu_run=False):
    d = []
    for i, text in enumerate(texts):
        # TODO: English or Korean cleaners
        d.append(torch.IntTensor(text_to_sequence(text, ['english_cleaners'])[:]))
        # d.append(torch.IntTensor(text_to_sequence(text, ['transliteration_cleaners'])[:]))

    text_padded, input_lengths = pad_sequences(d)
    if torch.cuda.is_available() and not cpu_run:
        text_padded = text_padded.cuda().long()
        input_lengths = input_lengths.cuda().long()
    else:
        text_padded = text_padded.long()
        input_lengths = input_lengths.long()

    return text_padded, input_lengths
def inference_mel(text, model):
    """Performs conversion from text to mel spectrogram."""
    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).cuda().long()

    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)

    # Keep only the post-net output, shaped (n_mel_channels, n_frames).
    mel = mel_outputs_postnet.reshape(80, mel_outputs_postnet.shape[2]).data

    # Serialize the mel tensor and record its filename in a sidecar text file.
    filename = "text_to_mel"
    torch.save(mel, filename)
    with open(filename + ".txt", 'w') as f:
        f.write(filename)
    return f.name
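# Hedged usage sketch: call the function above and reload the serialized mel.
# `model` is assumed to be a Tacotron 2 instance already on the GPU.
sidecar = inference_mel("Hello world.", model)
with open(sidecar) as f:
    mel = torch.load(f.read())  # the sidecar file contains "text_to_mel"
print(mel.shape)  # (80, n_frames)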
def get_text(self, text):
    text_norm = torch.IntTensor(text_to_sequence(text, self.text_cleaners))
    return text_norm
def main(): """ Launches text to speech (inference). Inference is executed on a single GPU. """ parser = argparse.ArgumentParser( description='PyTorch Tacotron 2 Inference') parser = parse_args(parser) args, _ = parser.parse_known_args() LOGGER.set_model_name("Tacotron2_PyT") LOGGER.set_backends([ dllg.StdOutBackend(log_file=None, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1), dllg.JsonBackend(log_file=args.log_file, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1) ]) LOGGER.register_metric("tacotron2_items_per_sec", metric_scope=dllg.TRAIN_ITER_SCOPE) LOGGER.register_metric("waveglow_items_per_sec", metric_scope=dllg.TRAIN_ITER_SCOPE) log_hardware() log_args(args) # tacotron2 model filepath was specified if args.tacotron2: # Setup Tacotron2 tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2, args.fp16_run) # file with mel spectrogram was specified elif args.mel_file: mel = torch.load(args.mel_file) mel = torch.autograd.Variable(mel.cuda()) mel = torch.unsqueeze(mel, 0) # Setup WaveGlow if args.old_waveglow: waveglow = torch.load(args.waveglow)['model'] waveglow = waveglow.remove_weightnorm(waveglow) waveglow = waveglow.cuda() waveglow.eval() else: waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow, args.fp16_run) texts = [] try: f = open(args.input, 'r') texts = f.readlines() except: print("Could not read file. Using default text.") texts = ["The forms of printed letters should be beautiful, and\ that their arrangement on the page should be reasonable and\ a help to the shapeliness of the letters themselves."] for i, text in enumerate(texts): LOGGER.iteration_start() sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :] sequence = torch.autograd.Variable( torch.from_numpy(sequence)).cuda().long() if args.tacotron2: tacotron2_t0 = time.time() with torch.no_grad(): _, mel, _, _ = tacotron2.inference(sequence) tacotron2_t1 = time.time() tacotron2_infer_perf = sequence.size(1)/(tacotron2_t1-tacotron2_t0) LOGGER.log(key="tacotron2_items_per_sec", value=tacotron2_infer_perf) waveglow_t0 = time.time() with torch.no_grad(): audio = waveglow.infer(mel, sigma=args.sigma_infer) audio = audio.float() waveglow_t1 = time.time() waveglow_infer_perf = audio[0].size(0)/(waveglow_t1-waveglow_t0) audio_path = args.output + "audio_"+str(i)+".wav" write(audio_path, args.sampling_rate, audio[0].data.cpu().numpy()) LOGGER.log(key="waveglow_items_per_sec", value=waveglow_infer_perf) LOGGER.iteration_stop() LOGGER.finish()
def get_sequence(self, text, speaker_id):
    return text_to_sequence(text, speaker_id, self.text_cleaners)
def get_text(self, text): "function which maps input text to integer tensor list" text_norm = torch.IntTensor(text_to_sequence(text, self.text_cleaners)) return text_norm
def get_text(self, text):
    return text_to_sequence(text, self.text_cleaners)
def main(): """ Launches text to speech (inference). Inference is executed on a single GPU. """ parser = argparse.ArgumentParser( description='PyTorch Tacotron 2 Inference') parser = parse_training_args(parser) args, _ = parser.parse_known_args() LOGGER.set_model_name("Tacotron2_PyT") LOGGER.set_backends([ dllg.StdOutBackend(log_file=None, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1), dllg.JsonBackend(log_file=args.log_file, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1) ]) LOGGER.register_metric("tacotron2_frames_per_sec", metric_scope=dllg.TRAIN_ITER_SCOPE) LOGGER.register_metric("tacotron2_latency", metric_scope=dllg.TRAIN_ITER_SCOPE) LOGGER.register_metric("latency", metric_scope=dllg.TRAIN_ITER_SCOPE) model, args = load_and_setup_model(parser, args) log_hardware() log_args(args) os.makedirs(args.output_dir, exist_ok=True) LOGGER.iteration_start() measurements = {} anchor_dirs = [ os.path.join(args.dataset_path, anchor) for anchor in args.training_anchor_dirs ] metadatas = [load_metadata(anchor) for anchor in anchor_dirs] stft = TacotronSTFT(args.filter_length, args.hop_length, args.win_length, args.n_mel_channels, args.sampling_rate, args.mel_fmin, args.mel_fmax) with torch.no_grad(), MeasureTime(measurements, "tacotron2_time"): for speaker_id in range(len(anchor_dirs)): metadata = metadatas[speaker_id] for npy_path, text in tqdm(metadata): seq = text_to_sequence(text, speaker_id, ['basic_cleaners']) seqs = torch.from_numpy(np.stack(seq)).unsqueeze(0) seq_lens = torch.IntTensor([len(text)]) wav = load_wav_to_torch(npy_path) mel = stft.mel_spectrogram(wav.unsqueeze(0)) mel = mel.squeeze() max_target_len = mel.size(1) - 1 max_target_len += args.n_frames_per_step - max_target_len % args.n_frames_per_step padded_mel = np.pad(mel, [(0, 0), (0, max_target_len - mel.size(1))], mode='constant', constant_values=args.mel_pad_val) target = padded_mel[:, ::args.n_frames_per_step] targets = torch.from_numpy(np.stack(target)).unsqueeze(0) target_lengths = torch.IntTensor([target.shape[1]]) outputs = model.infer( to_gpu(seqs).long(), to_gpu(seq_lens).int(), to_gpu(targets).half(), to_gpu(target_lengths).int()) _, mel_out, _, _ = [ output.cpu() for output in outputs if output is not None ] mel_out = mel_out.squeeze()[:, :mel.size(-1) - 1] assert (mel_out.shape[-1] == wav.shape[-1] // args.hop_length) fname = os.path.basename(npy_path) np.save(os.path.join(args.output_dir, fname), mel_out, allow_pickle=False) # GTA synthesis # magnitudes = stft.inv_mel_spectrogram(mel_out.squeeze()) # wav = griffin_lim(magnitudes, stft.stft_fn, 60) # save_wav(wav, os.path.join(args.output_dir, 'eval.wav')) LOGGER.log(key="tacotron2_latency", value=measurements['tacotron2_time']) LOGGER.log(key="latency", value=(measurements['tacotron2_time'])) LOGGER.iteration_stop() LOGGER.finish()
               origin='bottom', interpolation='none')
    fig.savefig('data' + str(i) + '.png')
    plt.close(fig)

sys.path.append('./waveglow')

tacotron_path = 'output/checkpoint_Tacotron2_30'
taco_checkpoint = torch.load(tacotron_path, map_location='cpu')
state_dict = torch.load(tacotron_path)['state_dict']
t2 = models.get_model('Tacotron2', taco_checkpoint['config'], to_cuda=True)

text = "아들 진수가 살아 돌아온다"  # Korean: "My son Jinsu is coming back alive"

# preprocessing
inputs = np.array(text_to_sequence(text, ['korean_cleaners']))[None, :]
print(inputs)
inputs = torch.from_numpy(inputs).to(device='cuda', dtype=torch.int64)
# inputs = torch.from_numpy(np.array([bombom, kitkat], dtype=np.int64)).to(device='cuda', dtype=torch.int64)
# input_lengths = torch.IntTensor([inputs.size(1), inputs.size(1)]).cuda().long()
input_lengths = torch.IntTensor([inputs.size(1)]).cuda().long()
speaker_id = torch.IntTensor([0]).cuda().long()
embedded_speaker = t2.speakers_embedding(speaker_id)
print("speaker", embedded_speaker)

t2.load_state_dict(state_dict)
_ = t2.cuda().eval().half()

waveglow = torch.load('output/waveglow_128000')['model']
for m in waveglow.modules():
def get_text(self, text):
    text_norm = torch.IntTensor(text_to_sequence(text))
    return text_norm
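# Hedged usage sketch for the `get_text` variants above, using the text module
# from the common Tacotron 2 codebases these snippets draw on; the import path
# `text` and the availability of `sequence_to_text` are assumptions.
from text import text_to_sequence, sequence_to_text

seq = text_to_sequence("Hello, world!", ['english_cleaners'])
print(seq)                    # list of symbol ids
print(sequence_to_text(seq))  # round-trips back to normalized text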