def synthesizer(model, text, device):
    seq = text_to_sequence(text, [hparams.cleaners])
    seq = np.stack([seq])
    if torch.cuda.is_available():
        seq = torch.from_numpy(seq).type(torch.cuda.LongTensor).to(device)
    else:
        seq = torch.from_numpy(seq).type(torch.LongTensor).to(device)

    # Provide [GO] frame
    mel_input = np.zeros([np.shape(seq)[0], hparams.num_mels, 1], dtype=np.float32)
    mel_input = torch.Tensor(mel_input).to(device)

    model.eval()
    with torch.no_grad():
        _, linear_output = model(seq, mel_input)

    wav = audio.inv_spectrogram(linear_output[0].cpu().numpy())
    wav = wav[:audio.find_endpoint(wav)]
    return wav
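# Usage sketch for synthesizer() above -- a minimal example, assuming the model
# has already been built and its weights loaded elsewhere in this codebase;
# synthesize_to_file and the output path are illustrative names, and
# audio.save_wav is the helper used by the other snippets in this collection.
import torch

def synthesize_to_file(model, text, out_path='sample.wav'):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    wav = synthesizer(model.to(device), text, device)
    audio.save_wav(wav, out_path)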
def tts(model, text, spk, qF0s):
    """Convert text to speech waveform given a Tacotron model,
    a speaker id, and a quantized F0 sequence."""
    if use_cuda:
        model = model.cuda()
    model.encoder.eval()
    model.postnet.eval()

    # `text` is expected to already be an integer id sequence,
    # e.g. text_to_sequence(text, [hparams.cleaners]).
    sequence = np.array(text)
    spk = np.array([spk])
    sequence = Variable(torch.from_numpy(sequence)).unsqueeze(0)
    spk = Variable(torch.from_numpy(spk))
    qF0s = np.array(qF0s)
    qF0s = Variable(torch.from_numpy(qF0s)).unsqueeze(0)
    if use_cuda:
        sequence = sequence.cuda()
        spk = spk.cuda()
        qF0s = qF0s.cuda()

    # Greedy decoding
    mel_outputs, linear_outputs, alignments = model(sequence, spk, qF0s.long())

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio.denormalize(linear_output)
    alignment = alignments[0].cpu().data.numpy()

    # Predicted audio signal
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram
def save_and_plot_fn(args, log_dir, step, loss, prefix):
    idx, (seq, spec, align) = args

    audio_path = os.path.join(
        log_dir, '{}-step-{:09d}-audio{:03d}.wav'.format(prefix, step, idx))
    align_path = os.path.join(
        log_dir, '{}-step-{:09d}-align{:03d}.png'.format(prefix, step, idx))

    waveform = inv_spectrogram(spec.T, hparams)
    save_wav(waveform, audio_path, hparams.sample_rate)

    info_text = 'step={:d}, loss={:.5f}'.format(step, loss)
    if 'korean_cleaners' in [x.strip() for x in hparams.cleaners.split(',')]:
        log('Training Korean: using jamo')
        plot.plot_alignment(
            align, align_path, info=info_text,
            text=sequence_to_text(seq, skip_eos_and_pad=True, combine_jamo=True),
            isKorean=True)
    else:
        log('Training non-Korean: not using jamo')
        plot.plot_alignment(
            align, align_path, info=info_text,
            text=sequence_to_text(seq, skip_eos_and_pad=True, combine_jamo=False),
            isKorean=False)
def tts(model, text, mel):
    """Convert text to speech waveform given a Tacotron model and a
    reference mel spectrogram for the global style tokens (GST)."""
    if use_cuda:
        model = model.cuda()
    model.encoder.eval()
    model.postnet.eval()

    sequence = np.array(text)
    sequence = Variable(torch.from_numpy(sequence)).unsqueeze(0)
    mel = Variable(torch.from_numpy(mel)).unsqueeze(0)
    if use_cuda:
        sequence = sequence.cuda()
        mel = mel.cuda()

    mel_outputs, linear_outputs, alignments = model.forward_generate_gst(sequence, mel)

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio.denormalize(linear_output)
    alignment = alignments[0].cpu().data.numpy()
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram
def tts(model, text):
    """Convert text to speech waveform given a Tacotron model."""
    if use_cuda:
        model = model.cuda()

    # TODO: Turning off dropout of the decoder's prenet causes serious
    # performance regression, not sure why.
    # model.decoder.eval()
    model.encoder.eval()
    model.postnet.eval()

    # `text` is expected to already be an integer id sequence,
    # e.g. text_to_sequence(text, [hparams.cleaners]).
    sequence = np.array(text)
    sequence = Variable(torch.from_numpy(sequence)).unsqueeze(0)
    if use_cuda:
        sequence = sequence.cuda()

    # Greedy decoding
    mel_outputs, linear_outputs, alignments = model(sequence)

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio.denormalize(linear_output)
    alignment = alignments[0].cpu().data.numpy()

    # Predicted audio signal
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram
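# Usage sketch for the plain tts() variant above -- a minimal example, not the
# repo's actual CLI. The sentence, output path, and helper name are
# illustrative; text_to_sequence, hparams, and audio are assumed to come from
# this codebase as in the other snippets.
def synthesize_sentence(model, sentence, out_path='tts_sample.wav'):
    seq = text_to_sequence(sentence, [hparams.cleaners])
    waveform, alignment, spectrogram = tts(model, seq)
    audio.save_wav(waveform, out_path)
    return alignment, spectrogram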
def save_states(global_step, mel_outputs, linear_outputs, attn, y,
                checkpoint_dir=None):
    idx = 1  # fixed sample index; could be np.random.randint(0, len(mel_outputs))

    # Alignment
    path = os.path.join(checkpoint_dir, "step{}_alignment.png".format(global_step))
    alignment = attn[idx].cpu().data.numpy()
    plot_alignment(alignment.T, path,
                   info="tacotron, step={}".format(global_step))

    # Predicted spectrogram
    path = os.path.join(checkpoint_dir,
                        "step{}_predicted_spectrogram.png".format(global_step))
    linear_output = linear_outputs[idx].cpu().data.numpy()
    plot_spectrogram(linear_output, path)

    # Predicted audio signal
    signal = audio.inv_spectrogram(linear_output.T)
    path = os.path.join(checkpoint_dir, "step{}_predicted.wav".format(global_step))
    audio.save_wav(signal, path)

    # Target spectrogram
    path = os.path.join(checkpoint_dir,
                        "step{}_target_spectrogram.png".format(global_step))
    linear_output = y[idx].cpu().data.numpy()
    plot_spectrogram(linear_output, path)
def tts(model, text, tones):
    """Convert text to speech waveform given a Tacotron model and a tone sequence."""
    if use_cuda:
        model = model.cuda()
    model.encoder.eval()
    model.postnet.eval()

    sequence = np.array(text)
    sequence = Variable(torch.from_numpy(sequence)).unsqueeze(0)
    tones = np.array(tones)
    tones = Variable(torch.from_numpy(tones)).unsqueeze(0)
    if use_cuda:
        sequence = sequence.cuda()
        tones = tones.cuda()

    mel_outputs, linear_outputs = model(sequence, tones)

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio.denormalize(linear_output)
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, spectrogram
def synthesize(model, mspec, spk):
    """Synthesize a waveform from a mel spectrogram and a speaker id
    given a Tacotron model."""
    if use_cuda:
        model = model.cuda()
    model.eval()

    sequence = np.array(mspec)
    sequence = Variable(torch.from_numpy(sequence)).unsqueeze(0)
    spk = np.array(spk)
    spk = Variable(torch.from_numpy(spk)).unsqueeze(0)
    if use_cuda:
        sequence = sequence.cuda()
        spk = spk.cuda()

    with torch.no_grad():
        model.forward_getlatents(sequence)
        mel_outputs, linear_outputs = model.forward_eval(sequence, spk)

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio.denormalize(linear_output)
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform
def tts(model, text):
    """Convert text to speech waveform given a Tacotron model."""
    if USE_CUDA:
        model = model.cuda()

    # NOTE: dropout in the decoder should be kept active for generalization!
    # model.decoder.eval()
    model.encoder.eval()
    model.postnet.eval()

    sequence = np.array(text_to_sequence(text))
    sequence = Variable(torch.from_numpy(sequence)).unsqueeze(0)
    if USE_CUDA:
        sequence = sequence.cuda()

    # Greedy decoding
    mel_outputs, linear_outputs, gate_outputs, alignments = model(sequence)

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio._denormalize(linear_output)
    alignment = alignments[0].cpu().data.numpy()

    # Predicted audio signal
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram
wavs = [
    os.path.join(data_folder, file[:-4]) for file in os.listdir(data_folder)
    if file.endswith(".wav")
]
outputs_py = [file + ".py.gen.wav" for file in wavs]
outputs_tf = [file + ".tf.gen.wav" for file in wavs]
wavs = [
    audio.load_wav(wav_path + ".wav", hparams.sample_rate) for wav_path in wavs
]

spectrogram = [audio.spectrogram(wav).astype(np.float32) for wav in wavs]
print("Linear spectrogram dims: ")
print(spectrogram[0].shape)

# --------------------------- librosa version: convert back ---------------------------
gens = [audio.inv_spectrogram(s) for s in spectrogram]
for gen, output in zip(gens, outputs_py):
    audio.save_wav(gen, output)

# --------------------------- TensorFlow version ---------------------------
samples = [inv_spectrogram(spec) for spec in spectrogram]
with tf.Session() as sess:
    samples = [sess.run(sample) for sample in samples]
for gen, output in zip(samples, outputs_tf):
    audio.save_wav(gen, output)

print("Done!")
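# Hedged sketch: the librosa-based audio.inv_spectrogram used above is
# typically Griffin-Lim phase reconstruction plus the repo's de-normalization.
# This is a minimal, self-contained version of just the Griffin-Lim core,
# assuming a linear magnitude spectrogram of shape (1 + n_fft // 2, frames);
# the n_fft, hop_length, and n_iter defaults are illustrative, not the
# repo's hparams.
import numpy as np
import librosa

def griffin_lim(magnitude, n_fft=2048, hop_length=256, n_iter=60):
    """Estimate a waveform from a magnitude spectrogram by iterating
    ISTFT -> STFT -> keep phase -> reapply magnitude."""
    # Start from random phase.
    angles = np.exp(2j * np.pi * np.random.rand(*magnitude.shape))
    complex_spec = magnitude * angles
    for _ in range(n_iter):
        wav = librosa.istft(complex_spec, hop_length=hop_length)
        rebuilt = librosa.stft(wav, n_fft=n_fft, hop_length=hop_length)
        angles = np.exp(1j * np.angle(rebuilt))
        complex_spec = magnitude * angles
    return librosa.istft(complex_spec, hop_length=hop_length)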
def plot(self, with_head=False):
    '''Plot visualizations of the unsupervised end-to-end Mockingjay model.'''
    self.verbose('Testing set total ' + str(len(self.dataloader)) + ' batches.')
    if not os.path.exists(self.dump_dir):
        os.makedirs(self.dump_dir)

    with torch.no_grad():
        idx = 0
        for x in tqdm(self.dataloader, desc="Plotting"):
            spec_stacked, pos_enc, attn_mask = self.process_MAM_data(spec=x)

            if with_head:
                outputs = self.model(spec_stacked, pos_enc, attention_mask=attn_mask)
                if self.output_attention:
                    _, pred_spec = outputs
                else:
                    pred_spec, _ = outputs

                # Generate the model-filled MAM spectrogram
                spec_masked = copy.deepcopy(spec_stacked)
                for i in range(len(spec_masked)):
                    sample_index = random.sample(
                        range(len(spec_masked[i])),
                        int(len(spec_masked[i]) *
                            self.config['mockingjay']['mask_proportion']))
                    spec_masked[i][sample_index] = 0
                outputs = self.model(spec_masked, pos_enc, attention_mask=attn_mask)
                if self.output_attention:
                    _, fill_spec = outputs
                else:
                    fill_spec, _ = outputs

                # Plot reconstructed / ground-truth / MAM-filled spectrograms
                for y_pred, y_true, y_fill in zip(pred_spec, spec_stacked, fill_spec):
                    y_pred = self.up_sample_frames(y_pred, return_first=True)
                    y_true = self.up_sample_frames(y_true, return_first=True)
                    y_fill = self.up_sample_frames(y_fill, return_first=True)

                    plot_spectrogram(y_pred.data.cpu().numpy(),
                                     path=os.path.join(self.dump_dir, str(idx) + '_pred.png'))
                    plot_spectrogram(y_true.data.cpu().numpy(),
                                     path=os.path.join(self.dump_dir, str(idx) + '_true.png'))
                    plot_spectrogram(y_fill.data.cpu().numpy(),
                                     path=os.path.join(self.dump_dir, str(idx) + '_fill.png'))

                    wave_pred = inv_spectrogram(y_pred.data.cpu().numpy().T)
                    wave_fill = inv_spectrogram(y_fill.data.cpu().numpy().T)
                    librosa.output.write_wav(
                        os.path.join(self.dump_dir, str(idx) + '_pred.wav'),
                        wave_pred, sample_rate)
                    librosa.output.write_wav(
                        os.path.join(self.dump_dir, str(idx) + '_fill.wav'),
                        wave_fill, sample_rate)

                    idx += 1
                    if idx >= 10:  # visualize the first 10 testing samples
                        self.verbose('Spectrogram head generated samples are saved to: {}'.format(self.dump_dir))
                        exit()

            elif self.output_attention:
                all_attentions, _ = self.mockingjay(
                    spec_stacked, pos_enc, attention_mask=attn_mask,
                    output_all_encoded_layers=True)
                # all_attentions: (batch_size, num_layer, num_head, Q_seq_len, K_seq_len)
                all_attentions = torch.stack(all_attentions).transpose(0, 1)

                for attentions in all_attentions:
                    torch.save(attentions.cpu(),
                               os.path.join(self.dump_dir, f'{idx}_attentions'))
                    idx += 1
                    if idx >= 10:  # visualize the first 10 testing samples
                        self.verbose(f'Attention samples are saved to {self.dump_dir}')
                        exit()

            else:
                encoded_layers = self.mockingjay(
                    spec_stacked, pos_enc, attention_mask=attn_mask,
                    output_all_encoded_layers=True)
                encoded_layers = torch.stack(encoded_layers)
                layer_num = encoded_layers.size(0)
                batch_size = encoded_layers.size(1)
                seq_len = encoded_layers.size(2)
                feature_dim = encoded_layers.size(3)

                dckpt = torch.load(self.paras.load_ws)
                weights = dckpt['Classifier']['weight']

                flatten = encoded_layers.reshape(layer_num, -1)
                # weighted_sum: (batch_size, seq_len, feature_dim)
                weighted_sum = torch.matmul(weights[:layer_num], flatten).reshape(
                    batch_size, seq_len, feature_dim)

                targets = [encoded_layers[0], encoded_layers[-1], weighted_sum]
                target_names = ['_hidden_first.png', '_hidden_last.png',
                                '_hidden_weighted_sum.png']
                for target, name in zip(targets, target_names):
                    for index, rep in enumerate(target):
                        if idx + index >= 10:
                            break
                        png_name = os.path.join(self.dump_dir, str(idx + index) + name)
                        self.verbose(f'Generating {png_name}')
                        plot_embedding(rep.data.cpu().numpy(), path=png_name)
                idx += batch_size
                if idx >= 10:  # visualize the first 10 testing samples
                    self.verbose('Mockingjay generated samples are saved to: {}'.format(self.dump_dir))
                    break
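# Hedged sketch of the masked-frame selection used in plot() above, factored
# out as a standalone helper. Assumes `spec` is a (batch, frames, dim) float
# tensor and that mask_proportion mirrors config['mockingjay']['mask_proportion'];
# the default value here is illustrative.
import random
import torch

def mask_frames(spec, mask_proportion=0.15):
    """Zero out a random proportion of frames per utterance (MAM-style input)."""
    masked = spec.clone()
    for i in range(len(masked)):
        n_mask = int(len(masked[i]) * mask_proportion)
        sample_index = random.sample(range(len(masked[i])), n_mask)
        masked[i][sample_index] = 0  # same in-place fill as plot() above
    return masked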
def train(log_dir, args):
    checkpoint_path = os.path.join(hdfs_ckpts, log_dir, 'model.ckpt')
    log(hp.to_string(), is_print=False)
    log('Loading training data from: %s' % args.tfr_dir)
    log('Checkpoint path: %s' % checkpoint_path)
    log('Using model: sygst tacotron2')

    tf_dset = TFDataSet(hp, args.tfr_dir)
    feats = tf_dset.get_train_next()

    # Set up model:
    global_step = tf.Variable(0, name='global_step', trainable=False)
    training = tf.placeholder_with_default(True, shape=(), name='training')
    with tf.name_scope('model'):
        model = Tacotron2SYGST(hp)
        model(feats['inputs'],
              mel_inputs=feats['mel_targets'],
              spec_inputs=feats['linear_targets'],
              spec_lengths=feats['spec_lengths'],
              ref_inputs=feats['mel_targets'],
              ref_lengths=feats['spec_lengths'],
              arousal_labels=feats['soft_arousal_labels'],
              valence_labels=feats['soft_valance_labels'],
              training=training)
        model.add_loss()
        model.add_optimizer(global_step)
        stats = model.add_stats()

    # Bookkeeping:
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=50, keep_checkpoint_every_n_hours=2)

    # Train!
    config = tf.ConfigProto(allow_soft_placement=True,
                            gpu_options=tf.GPUOptions(allow_growth=True))
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

            if args.restore_step:
                # Restore from a checkpoint if the user requested it.
                restore_path = '%s-%s' % (checkpoint_path, args.restore_step)
                saver.restore(sess, restore_path)
                log('Resuming from checkpoint: %s' % restore_path, slack=True)
            else:
                log('Starting a new training run ...', slack=True)

            fetches = [
                global_step, model.optimize, model.loss, model.mel_loss,
                model.spec_loss, model.stop_loss, model.arousal_loss,
                model.valence_loss
            ]
            for _ in range(_max_step):
                start_time = time.time()
                sess.run(debug.get_ops())
                step, _, loss, mel_loss, spec_loss, stop_loss, aro_loss, val_loss = sess.run(fetches)
                time_window.append(time.time() - start_time)
                loss_window.append(loss)

                message = 'Step %-7d [%.3f sec/step,ml=%.3f,spl=%.3f,sl=%.3f,al=%.3f,vl=%.3f]' % (
                    step, time_window.average, mel_loss, spec_loss, stop_loss,
                    aro_loss, val_loss)
                log(message, slack=(step % args.checkpoint_interval == 0))

                if loss > 100 or math.isnan(loss):
                    log('Loss exploded to %.5f at step %d!' % (loss, step), slack=True)
                    raise Exception('Loss exploded')

                if step % args.summary_interval == 0:
                    log('Writing summary at step: %d' % step)
                    try:
                        summary_writer.add_summary(sess.run(stats), step)
                    except Exception as e:
                        log(f'summary failed and ignored: {str(e)}')

                if step % args.checkpoint_interval == 0:
                    log('Saving checkpoint to: %s-%d' % (checkpoint_path, step))
                    saver.save(sess, checkpoint_path, global_step=step)
                    log('Saving audio and alignment...')
                    gt_mel, gt_spec, seq, mel, spec, align = sess.run([
                        model.mel_targets[0], model.spec_targets[0],
                        model.text_targets[0], model.mel_outputs[0],
                        model.spec_outputs[0], model.alignment_outputs[0]
                    ])
                    text = sequence_to_text(seq)
                    wav = audio.inv_spectrogram(hp, spec.T)
                    wav_path = os.path.join(log_dir, 'step-%d-audio.wav' % step)
                    mel_path = os.path.join(log_dir, 'step-%d-mel.png' % step)
                    spec_path = os.path.join(log_dir, 'step-%d-spec.png' % step)
                    align_path = os.path.join(log_dir, 'step-%d-align.png' % step)
                    info = '%s, %s, step=%d, loss=%.5f\n%s' % (
                        args.model, time_string(), step, loss, text)
                    plot.plot_alignment(align, align_path, info=info)
                    plot.plot_mel(mel, mel_path, info=info, gt_mel=gt_mel)
                    plot.plot_mel(spec, spec_path, info=info, gt_mel=gt_spec)
                    audio.save_wav(hp, wav, wav_path)
                    log('Input: %s' % text)

        except Exception as e:
            log('Exiting due to exception: %s' % e, slack=True)
            traceback.print_exc()