def audio(outputs, res_pth):
    src = outputs[0]
    res = outputs[1]
    # save audio
    save_wav(src, res_pth + '_src.wav')
    save_wav(res, res_pth + '_res.wav')
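# The save_wav helper these snippets rely on is never defined in this
# collection. A minimal sketch of what such a helper typically looks like,
# assuming a float waveform roughly in [-1, 1] and 16-bit PCM output (the
# rescaling guard and the default sample rate are assumptions, not taken
# from the snippets above):

import numpy as np
from scipy.io import wavfile


def save_wav_sketch(wav, path, sr=22050):
    # Normalize to the int16 range, guarding against an all-zero clip.
    wav = wav * (32767 / max(0.01, np.max(np.abs(wav))))
    wavfile.write(path, sr, wav.astype(np.int16))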
def extract_mel(wav_filename, out_wav_path, out_dir, key, hparams, args):
    if not os.path.exists(wav_filename):
        print("Wav file {} doesn't exist.".format(wav_filename))
        return None

    wav = audio.load_wav(wav_filename, sr=hparams.sample_rate)

    # Process wav samples
    wav = audio.trim_silence(wav, hparams)
    n_samples = len(wav)

    # Extract mel spectrogram
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    n_frames = mel_spectrogram.shape[1]
    if n_frames > hparams.max_acoustic_length:
        print("Ignore wav {} because the frame number {} is too long (max {} frames in hparams.yaml)."
              .format(wav_filename, n_frames, hparams.max_acoustic_length))
        return None

    # Align features
    desired_frames = int(min(n_samples / hparams.hop_size, n_frames))
    wav = wav[:desired_frames * hparams.hop_size]
    mel_spectrogram = mel_spectrogram[:, :desired_frames]
    n_samples = wav.shape[0]
    n_frames = mel_spectrogram.shape[1]
    assert n_samples / hparams.hop_size == n_frames

    # Save intermediate acoustic features
    mel_filename = os.path.join(out_dir, key + '.npy')
    np.save(mel_filename, mel_spectrogram.T, allow_pickle=False)
    audio.save_wav(wav, out_wav_path, hparams)

    return (wav_filename, mel_filename, n_samples, n_frames)
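# audio.trim_silence, used by extract_mel above, is also not defined in this
# collection. A minimal librosa-based sketch, assuming hparams carries a
# trim_top_db threshold (the attribute name and the 40 dB fallback are
# assumptions):

import librosa


def trim_silence_sketch(wav, hparams):
    # Strip leading and trailing silence below the configured dB threshold.
    trimmed, _ = librosa.effects.trim(wav, top_db=getattr(hparams, 'trim_top_db', 40))
    return trimmed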
def save_states(global_step, mel_outputs, linear_outputs, attn, y, checkpoint_dir=None):
    idx = 1  # idx = np.random.randint(0, len(mel_outputs))

    # Alignment
    path = os.path.join(checkpoint_dir, "step{}_alignment.png".format(global_step))
    alignment = attn[idx].cpu().data.numpy()
    # alignment = attn[idx].cpu().data.numpy()[:, :input_length]
    plot_alignment(alignment.T, path, info="tacotron, step={}".format(global_step))

    # Predicted spectrogram
    path = os.path.join(checkpoint_dir, "step{}_predicted_spectrogram.png".format(global_step))
    linear_output = linear_outputs[idx].cpu().data.numpy()
    plot_spectrogram(linear_output, path)

    # Predicted audio signal
    signal = audio.inv_spectrogram(linear_output.T)
    path = os.path.join(checkpoint_dir, "step{}_predicted.wav".format(global_step))
    audio.save_wav(signal, path)

    # Target spectrogram
    path = os.path.join(checkpoint_dir, "step{}_target_spectrogram.png".format(global_step))
    linear_output = y[idx].cpu().data.numpy()
    plot_spectrogram(linear_output, path)
def synthesize(self, inputs):
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
    seq_input = [text_to_sequence(j, cleaner_names) for j in inputs]
    seq_length = [len(j) for j in seq_input]
    max_len = max(seq_length)
    inputs = [_pad_input(j, max_len) for j in seq_input]
    seq = np.stack(inputs)  # stacking a generator is deprecated in NumPy
    # seq = text_to_sequence(text, cleaner_names)

    if not self.model_filename.endswith('.pb'):
        feed_dict = {
            self.model.inputs: np.asarray(seq, dtype=np.int32),
            self.model.input_lengths: np.asarray(seq_length, dtype=np.int32)
        }
    else:
        feed_dict = {
            self.inputs: np.asarray(seq, dtype=np.int32),
            self.input_lengths: np.asarray(seq_length, dtype=np.int32)
        }

    wav = self.session.run(self.wav_output, feed_dict=feed_dict)
    output = []
    print('wav.shape:', wav.shape)
    for wav_index in range(wav.shape[0]):
        wav_index_temp = audio.inv_preemphasis(wav[wav_index])
        wav_index_temp = wav_index_temp[:audio.find_endpoint(wav_index_temp)]
        # wav_index_temp = vad_check(wav_index_temp, hparams.sample_rate)
        out = io.BytesIO()
        audio.save_wav(wav_index_temp, out)
        output.append(out)
    return output
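# _pad_input, called in synthesize above, is assumed to right-pad a token
# sequence to the batch maximum. A minimal sketch, assuming 0 is the padding
# id (the pad_id default is an assumption):

import numpy as np


def _pad_input_sketch(seq, max_len, pad_id=0):
    # Right-pad with the padding id so every sequence in the batch has max_len tokens.
    return np.pad(np.asarray(seq), (0, max_len - len(seq)),
                  mode='constant', constant_values=pad_id)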
def save_and_plot_fn(args, log_dir, step, loss, prefix):
    idx, (seq, spec, align) = args

    audio_path = os.path.join(
        log_dir, '{}-step-{:09d}-audio{:03d}.wav'.format(prefix, step, idx))
    align_path = os.path.join(
        log_dir, '{}-step-{:09d}-align{:03d}.png'.format(prefix, step, idx))

    waveform = inv_spectrogram(spec.T, hparams)
    save_wav(waveform, audio_path, hparams.sample_rate)

    info_text = 'step={:d}, loss={:.5f}'.format(step, loss)
    if 'korean_cleaners' in [x.strip() for x in hparams.cleaners.split(',')]:
        log('Training Korean: using jamo')
        plot.plot_alignment(align, align_path,
                            info=info_text,
                            text=sequence_to_text(seq, skip_eos_and_pad=True, combine_jamo=True),
                            isKorean=True)
    else:
        log('Training non-Korean: not using jamo')
        plot.plot_alignment(align, align_path,
                            info=info_text,
                            text=sequence_to_text(seq, skip_eos_and_pad=True, combine_jamo=False),
                            isKorean=False)
def main():
    data_folder = "data"
    wavs = [
        os.path.join(data_folder, file[:-4])
        for file in os.listdir(data_folder) if file.endswith(".wav")
    ]
    outputs_lws = [file + ".lws.gen.wav" for file in wavs]
    wavs = [
        audio.load_wav(wav_path + ".wav", hparams.sample_rate)
        for wav_path in wavs
    ]

    # 512: window length; 128: window shift
    lws_processor = lws.lws(512, 128, mode="speech")

    for i, x in enumerate(wavs):
        X = lws_processor.stft(x)  # x is a single-channel waveform
        X0 = np.abs(X)  # magnitude spectrogram
        print('{:6}: {:5.2f} dB'.format('Abs(X)', lws_processor.get_consistency(X0)))
        # Reconstruction from magnitude (in general, one can also start from an
        # initial complex spectrogram).
        X1 = lws_processor.run_lws(X0)
        print(X1.shape)
        print('{:6}: {:5.2f} dB'.format('LWS', lws_processor.get_consistency(X1)))
        wav = lws_processor.istft(X1).astype(np.float32)
        audio.save_wav(wav, outputs_lws[i])
def audio(output, pth):
    mel_outputs, mel_outputs_postnet, _ = output
    wav = inv_melspectrogram(to_arr(mel_outputs[0]))
    wav_postnet = inv_melspectrogram(to_arr(mel_outputs_postnet[0]))
    save_wav(wav, pth + '.wav')
    save_wav(wav_postnet, pth + '_post.wav')
    print('wav saved to:', pth + '.wav')
    print('postnet wav saved to:', pth + '_post.wav')
def save_current_model(args, checkpoint_path, global_step, hparams, loss, model,
                       plot_dir, saver, sess, step, wav_dir):
    # Save model and current global step
    saver.save(sess, checkpoint_path, global_step=global_step)

    log('\nSaving alignment, mel-spectrograms and griffin-lim inverted waveform..')
    input_seq, mel_prediction, linear_prediction, attention_mask_sample, \
        targets_mel, target_length, linear_target = sess.run([
            model.inputs[0],
            model.post_net_predictions[0],
            model.mag_pred[0],
            model.alignments[0],
            model.targets_mel[0],
            model.targets_length[0],
            model.targets_mag[0],
        ])
    alignments, alignment_titles = get_alignments(attention_mask_sample)

    # save griffin-lim inverted wav for debug (linear -> wav)
    wav = audio.inv_linear_spectrogram(linear_prediction.T, hparams)
    audio.save_wav(wav,
                   os.path.join(wav_dir, '{}-linear.wav'.format(step)),
                   sr=hparams.sample_rate)

    # Save real and predicted linear-spectrogram plot to disk (control purposes)
    plot.plot_spectrogram(
        linear_prediction,
        os.path.join(plot_dir, '{}-linear-spectrogram.png'.format(step)),
        title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss),
        target_spectrogram=linear_target,
        max_len=target_length,
        auto_aspect=True)

    # save griffin-lim inverted wav for debug (mel -> wav)
    wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
    audio.save_wav(wav,
                   os.path.join(wav_dir, '{}-mel.wav'.format(step)),
                   sr=hparams.sample_rate)

    # save alignment plots to disk (control purposes)
    for i in range(len(alignments)):
        plot.plot_alignment(
            alignments[i],
            os.path.join(plot_dir, '{}_{}-align.png'.format(step, alignment_titles[i])),
            title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss),
            max_len=target_length // hparams.reduction_factor)

    # save real and predicted mel-spectrogram plot to disk (control purposes)
    plot.plot_spectrogram(
        mel_prediction,
        os.path.join(plot_dir, '{}-mel-spectrogram.png'.format(step)),
        title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss),
        target_spectrogram=targets_mel,
        max_len=target_length)

    log('Input at step {}: {}'.format(step, sequence_to_text(input_seq)))
def synthesis(args):
    model = create_model(args)
    if args.resume is not None:
        attempt_to_restore(model, args.resume, args.use_cuda)
    device = torch.device("cuda" if args.use_cuda else "cpu")

    output_dir = "samples"
    os.makedirs(output_dir, exist_ok=True)

    lists = []
    for filename in os.listdir(os.path.join(args.input, 'mel')):
        lists.append(filename)

    start = time.time()
    conditions = [
        np.load(os.path.join(args.input, 'mel', filename)) for filename in lists
    ]
    lengths = [condition.shape[0] for condition in conditions]
    max_len = max(lengths)
    conditions = [
        np.concatenate(
            (condition, np.zeros((max_len - condition.shape[0], condition.shape[1]))),
            axis=0) for condition in conditions
    ]
    conditions = np.stack(conditions)
    conditions = torch.FloatTensor(conditions)
    conditions = conditions.transpose(1, 2).to(device)

    batch_size = conditions.size()[0]
    z = torch.randn(batch_size, args.z_dim).to(device).normal_(0.0, 0.6)
    print(conditions.shape)
    audios = model(conditions, z)
    audios = audios.cpu().squeeze().detach().numpy()
    print(audios.shape)

    for (i, filename) in enumerate(lists):
        name = filename.split('.')[0]
        sample = np.load(os.path.join(args.input, 'audio', filename))
        sample = mu_law_decode(mu_law_encode(sample))
        save_wav(np.squeeze(sample), '{}/{}_target.wav'.format(output_dir, name))
        save_wav(
            np.asarray(audios[i])[:len(sample)],
            '{}/{}.wav'.format(output_dir, name))
    print("Time used: {:.3f}".format(time.time() - start))
def test_synth(model, step, dst_path):
    with open('test.txt') as f:
        for line in f:
            if len(line) > 2:
                line = line.split('\n')[0]
                fname = line.split()[0]
                content = line.split()[1:]
                content = ' '.join(k for k in content)
                content = re.sub(r'[^\w\s]', '', content)
                text = ' '.join(
                    str(charids[k.lower()] if k.lower() in charids.keys() else charids['UNK'])
                    for k in content)
                waveform, alignment, _ = tts(model, text.split())
                dst_wav_path = join(dst_path, "{}_step{}.wav".format(fname, step))
                audio.save_wav(waveform, dst_wav_path)
    model.train()
def save_i(i):
    try:
        name = names[i]
        mel = mel_aft[i][:generated_lengths[i]]
        np.save(os.path.join(output_dir, '%s.npy' % name), mel)
        wav = mel2wav(mel)
        save_wav(wav, os.path.join(output_dir, '%s.wav' % name))
        if save_trimmed_wave:
            wav_trim = trim_silence_intervals(wav)
            save_wav(wav_trim, os.path.join(output_dir, '%s_trim.wav' % name))
        plot_mel(os.path.join(output_dir, '%s_mel.png' % name), mel)
        if n_plot_alignment is None or i < n_plot_alignment:
            aligns = [a[i].transpose([0, 2, 1]) for a in alignments["encdec"]]
            plot_attn(aligns,
                      os.path.join(output_dir, '%s_align.png' % name),
                      enc_length=input_lengths[i],
                      dec_length=generated_lengths[i])
    except Exception:
        logging.error('Failed to produce eval output: ' + names[i])
        logging.error(traceback.format_exc())
def synthesis(args):
    model = create_model(args)
    if args.resume is not None:
        attempt_to_restore(model, args.resume, args.use_cuda)
    model.after_update()

    output_dir = "out"
    os.makedirs(output_dir, exist_ok=True)

    lists = []
    for filename in os.listdir(os.path.join(args.input, 'mel')):
        lists.append(filename)

    start = time.time()
    conditions = [
        np.load(os.path.join(args.input, 'mel', filename)) for filename in lists
    ]
    lengths = [condition.shape[0] for condition in conditions]
    max_len = max(lengths)
    conditions = [
        np.concatenate(
            (condition, np.zeros((max_len - condition.shape[0], condition.shape[1]))),
            axis=0) for condition in conditions
    ]
    conditions = np.stack(conditions)
    conditions = torch.FloatTensor(conditions)

    audios = model.generate(conditions)
    print(audios.shape)

    for (i, filename) in enumerate(lists):
        name = filename.split('.')[0]
        sample = np.load(os.path.join(args.input, 'audio', filename))
        save_wav(np.squeeze(sample), '{}/{}_target.wav'.format(output_dir, name))
        save_wav(
            np.asarray(audios[i])[:len(sample)],
            '{}/{}.wav'.format(output_dir, name))
    print("Time used: {:.3f}".format(time.time() - start))
def synthesis(args):
    model = create_model(args)
    model.eval()
    if args.resume is not None:
        attempt_to_restore(model, args.resume, args.use_cuda)
    device = torch.device("cuda" if args.use_cuda else "cpu")
    model.to(device)
    model.remove_weight_norm()

    output_dir = "samples"
    target_dir = os.path.join(output_dir, "target")
    predict_dir = os.path.join(output_dir, "predict")
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(target_dir, exist_ok=True)
    os.makedirs(predict_dir, exist_ok=True)

    avg_rtf = []
    for filename in os.listdir(os.path.join(args.input, 'mel')):
        start = time.time()
        conditions = np.load(os.path.join(args.input, 'mel', filename))
        conditions = torch.FloatTensor(conditions).unsqueeze(0)
        conditions = conditions.transpose(1, 2).to(device)
        audio = model(conditions)
        audio = audio.cpu().squeeze().detach().numpy()
        print(audio.shape)
        name = filename.split('.')[0]
        sample = np.load(os.path.join(args.input, 'audio', filename))
        save_wav(np.squeeze(sample), '{}/{}_target.wav'.format(target_dir, name))
        save_wav(np.asarray(audio), '{}/{}.wav'.format(predict_dir, name))
        time_used = time.time() - start
        rtf = time_used / (len(audio) / 16000)
        avg_rtf.append(rtf)
        print("Time used: {:.3f}, RTF: {:.4f}".format(time_used, rtf))

    print("Average RTF: {:.3f}".format(sum(avg_rtf) / len(avg_rtf)))
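# The RTF printed above is the real-time factor: wall-clock synthesis time
# divided by the duration of the generated audio. RTF < 1 means the vocoder
# runs faster than real time. A minimal sketch (the 16 kHz default mirrors
# the hard-coded rate in the function above and is otherwise an assumption):

def real_time_factor(seconds_used, n_samples, sample_rate=16000):
    # RTF = time spent generating / duration of the generated audio.
    return seconds_used / (n_samples / sample_rate)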
def synthesis(args):
    model = create_model(args)
    if args.resume is not None:
        attempt_to_restore(model, args.resume, args.use_cuda)
    device = torch.device("cuda" if args.use_cuda else "cpu")
    model.to(device)

    output_dir = "samples"
    os.makedirs(output_dir, exist_ok=True)

    avg_rtf = []
    for filename in os.listdir(os.path.join(args.input, 'mel')):
        start = time.time()
        conditions = np.load(os.path.join(args.input, 'mel', filename))
        conditions = torch.FloatTensor(conditions).unsqueeze(0)
        conditions = conditions.transpose(1, 2).to(device)
        batch_size = conditions.size()[0]
        z = torch.randn(batch_size, args.z_dim).to(device).normal_(0.0, 1.0)
        audios = model(conditions, z)
        audios = audios.cpu().squeeze().detach().numpy()
        print(audios.shape)
        name = filename.split('.')[0]
        sample = np.load(os.path.join(args.input, 'audio', filename))
        sample = mu_law_decode(mu_law_encode(sample))
        save_wav(np.squeeze(sample), '{}/{}_target.wav'.format(output_dir, name))
        save_wav(np.asarray(audios), '{}/{}.wav'.format(output_dir, name))
        time_used = time.time() - start
        rtf = time_used / (len(audios) / 24000)
        avg_rtf.append(rtf)
        print("Time used: {:.3f}, RTF: {:.4f}".format(time_used, rtf))

    print("Average RTF: {:.3f}".format(sum(avg_rtf) / len(avg_rtf)))
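# mu_law_encode / mu_law_decode appear in several snippets above (both as
# NumPy helpers and as TF graph ops). A minimal NumPy sketch of the underlying
# companding transform, assuming the standard mu = 255 formulation for signals
# in [-1, 1]; the TF variants used elsewhere in this collection additionally
# handle quantization, which this sketch omits:

import numpy as np


def mu_law_encode_sketch(x, mu=255):
    # Compand: sign(x) * ln(1 + mu*|x|) / ln(1 + mu), output still in [-1, 1].
    return np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)


def mu_law_decode_sketch(y, mu=255):
    # Inverse companding: sign(y) * ((1 + mu)^|y| - 1) / mu.
    return np.sign(y) * np.expm1(np.abs(y) * np.log1p(mu)) / mu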
def train(log_dir, args):
    checkpoint_path = os.path.join(hdfs_ckpts, log_dir, 'model.ckpt')
    log(hp.to_string(), is_print=False)
    log('Loading training data from: %s' % args.tfr_dir)
    log('Checkpoint path: %s' % checkpoint_path)
    log('Using model: sygst tacotron2')

    tf_dset = TFDataSet(hp, args.tfr_dir)
    feats = tf_dset.get_train_next()

    # Set up model:
    global_step = tf.Variable(0, name='global_step', trainable=False)
    training = tf.placeholder_with_default(True, shape=(), name='training')
    with tf.name_scope('model'):
        model = Tacotron2SYGST(hp)
        model(feats['inputs'],
              mel_inputs=feats['mel_targets'],
              spec_inputs=feats['linear_targets'],
              spec_lengths=feats['spec_lengths'],
              ref_inputs=feats['mel_targets'],
              ref_lengths=feats['spec_lengths'],
              arousal_labels=feats['soft_arousal_labels'],
              valence_labels=feats['soft_valance_labels'],
              training=training)
        """
        text_x, mel_x, spec_x, spec_len, aro, val = debug_data(2, 5, 10)
        model(text_x, mel_x, spec_x, spec_len, mel_x, spec_len, aro, val, training=training)
        """
        model.add_loss()
        model.add_optimizer(global_step)
        stats = model.add_stats()

    # Bookkeeping:
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=50, keep_checkpoint_every_n_hours=2)

    # Train!
    config = tf.ConfigProto(allow_soft_placement=True,
                            gpu_options=tf.GPUOptions(allow_growth=True))
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            sess.run(tf.global_variables_initializer())
            if args.restore_step:
                # Restore from a checkpoint if the user requested it.
                restore_path = '%s-%s' % (checkpoint_path, args.restore_step)
                saver.restore(sess, restore_path)
                log('Resuming from checkpoint: %s' % restore_path, slack=True)
            else:
                log('Starting a new training run ...', slack=True)

            """
            fetches = [global_step, model.optimize, model.loss, model.mel_loss,
                       model.spec_loss, model.stop_loss, model.arousal_loss,
                       model.valence_loss, model.mel_grad_norms_max,
                       model.spec_grad_norms_max, model.stop_grad_norms_max,
                       model.aro_grad_norms_max, model.val_grad_norms_max]
            """
            fetches = [
                global_step, model.optimize, model.loss, model.mel_loss,
                model.spec_loss, model.stop_loss, model.arousal_loss,
                model.valence_loss
            ]
            for _ in range(_max_step):
                start_time = time.time()
                sess.run(debug.get_ops())
                # step, _, loss, mel_loss, spec_loss, stop_loss, aro_loss, val_loss, mel_g, spec_g, stop_g, aro_g, val_g = sess.run(fetches)
                step, _, loss, mel_loss, spec_loss, stop_loss, aro_loss, val_loss = sess.run(fetches)
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                """
                message = 'Step %-7d [%.3f sec/step,ml=%.3f,spl=%.3f,sl=%.3f,al=%.3f,vl=%.3f,mg=%.4f,spg=%.4f,sg=%.4f,ag=%.4f,vg=%.4f]' % (
                    step, time_window.average, mel_loss, spec_loss, stop_loss,
                    aro_loss, val_loss, mel_g, spec_g, stop_g, aro_g, val_g)
                """
                message = 'Step %-7d [%.3f sec/step,ml=%.3f,spl=%.3f,sl=%.3f,al=%.3f,vl=%.3f]' % (
                    step, time_window.average, mel_loss, spec_loss, stop_loss,
                    aro_loss, val_loss)
                log(message, slack=(step % args.checkpoint_interval == 0))

                if loss > 100 or math.isnan(loss):
                    log('Loss exploded to %.5f at step %d!' % (loss, step), slack=True)
                    raise Exception('Loss Exploded')

                if step % args.summary_interval == 0:
                    log('Writing summary at step: %d' % step)
                    try:
                        summary_writer.add_summary(sess.run(stats), step)
                    except Exception as e:
                        log(f'summary failed and ignored: {str(e)}')

                if step % args.checkpoint_interval == 0:
                    log('Saving checkpoint to: %s-%d' % (checkpoint_path, step))
                    saver.save(sess, checkpoint_path, global_step=step)
                    log('Saving audio and alignment...')
                    gt_mel, gt_spec, seq, mel, spec, align = sess.run([
                        model.mel_targets[0], model.spec_targets[0],
                        model.text_targets[0], model.mel_outputs[0],
                        model.spec_outputs[0], model.alignment_outputs[0]
                    ])
                    text = sequence_to_text(seq)
                    wav = audio.inv_spectrogram(hp, spec.T)
                    wav_path = os.path.join(log_dir, 'step-%d-audio.wav' % step)
                    mel_path = os.path.join(log_dir, 'step-%d-mel.png' % step)
                    spec_path = os.path.join(log_dir, 'step-%d-spec.png' % step)
                    align_path = os.path.join(log_dir, 'step-%d-align.png' % step)
                    info = '%s, %s, step=%d, loss=%.5f\n%s' % (
                        args.model, time_string(), step, loss, text)
                    plot.plot_alignment(align, align_path, info=info)
                    plot.plot_mel(mel, mel_path, info=info, gt_mel=gt_mel)
                    plot.plot_mel(spec, spec_path, info=info, gt_mel=gt_spec)
                    audio.save_wav(hp, wav, wav_path)
                    log('Input: %s' % text)
        except Exception as e:
            log('Exiting due to exception: %s' % e, slack=True)
            traceback.print_exc()
# text = ' '.join(k for k in line.decode("utf-8").split()[1:])
# text = '< ' + text + ' >'
# text = [phids[l] for l in text.split()]
text, qF0s = get_textNqF0s(line, phids)

# Generate from the original speaker
spk = speakers_dict[fname[0]]
waveform, alignment, _ = tts(model, text, spk, qF0s)
fname_generated = '_'.join(k for k in fname[1:]) + '_generated'
dst_wav_path = join(dst_dir, "{}{}.wav".format(fname_generated, file_name_suffix))
dst_alignment_path = join(dst_dir, "{}_alignment.png".format(fname_generated))
plot_alignment(alignment.T, dst_alignment_path,
               info="tacotron, {}".format(checkpoint_path))
audio.save_wav(waveform, dst_wav_path)

# Generate from a randomly chosen different speaker
spk = np.random.randint(len(speakers))
# fname = fname.split('_')
# fname[0] = ids2speakers[spk]
fname_transferred = '_'.join(k for k in fname[1:]) + '_transferred'
print("Picked random speaker id", spk,
      "; the corresponding speaker in the dictionary is", ids2speakers[spk],
      "; storing output as", fname_transferred)
print(text, fname_transferred)
waveform, alignment, _ = tts(model, text, spk, qF0s)
dst_wav_path = join(dst_dir, "{}{}.wav".format(fname_transferred, file_name_suffix))
dst_alignment_path = join(dst_dir, "{}_alignment.png".format(fname_transferred))
plot_alignment(alignment.T, dst_alignment_path,
               info="tacotron, {}".format(checkpoint_path))
audio.save_wav(waveform, dst_wav_path)
def plot_graph_and_save_audio(args,
                              base_path=None,
                              start_of_sentence=None,
                              end_of_sentence=None,
                              pre_word_num=0,
                              post_word_num=0,
                              pre_surplus_idx=0,
                              post_surplus_idx=1,
                              use_short_concat=False,
                              use_manual_attention=False,
                              save_alignment=False,
                              librosa_trim=False,
                              attention_trim=False,
                              time_str=None,
                              isKorean=True):
    idx, (wav, alignment, path, text, sequence, mel) = args

    if base_path:
        plot_path = "{}/{}.png".format(base_path, get_time())
    elif path:
        plot_path = path.rsplit('.', 1)[0] + ".png"
    else:
        plot_path = None

    # plot_path = add_prefix(plot_path, time_str)
    if use_manual_attention:
        plot_path = add_postfix(plot_path, "manual")

    if plot_path:
        plot.plot_alignment(alignment, plot_path, text=text, isKorean=isKorean)

    if use_short_concat:
        wav = short_concat(wav, alignment, text, start_of_sentence,
                           end_of_sentence, pre_word_num, post_word_num,
                           pre_surplus_idx, post_surplus_idx)

    if attention_trim and end_of_sentence:
        end_idx_counter = 0
        attention_argmax = alignment.argmax(0)
        end_idx = min(len(sequence) - 1, max(attention_argmax))
        max_counter = min((attention_argmax == end_idx).sum(), 5)

        for jdx, attend_idx in enumerate(attention_argmax):
            if len(attention_argmax) > jdx + 1:
                if attend_idx == end_idx:
                    end_idx_counter += 1
                if attend_idx == end_idx and attention_argmax[jdx + 1] > end_idx:
                    break
                if end_idx_counter >= max_counter:
                    break
            else:
                break

        spec_end_idx = hparams.reduction_factor * jdx + 3
        wav = wav[:spec_end_idx]
        mel = mel[:spec_end_idx]

    audio_out = inv_linear_spectrogram(wav.T, hparams)

    if librosa_trim and end_of_sentence:
        yt, index = librosa.effects.trim(audio_out,
                                         frame_length=5120,
                                         hop_length=256,
                                         top_db=50)
        audio_out = audio_out[:index[-1]]
        mel = mel[:index[-1] // hparams.hop_size]

    if save_alignment:
        alignment_path = "{}/{}.npy".format(base_path, idx)
        np.save(alignment_path, alignment, allow_pickle=False)

    if path or base_path:
        if path:
            current_path = add_postfix(path, idx)
        elif base_path:
            current_path = plot_path.replace(".png", ".wav")
        save_wav(audio_out, current_path, hparams.sample_rate)

        # hccho
        mel_path = current_path.replace(".wav", ".npy")
        np.save(mel_path, mel)
        return True
    else:
        io_out = io.BytesIO()
        save_wav(audio_out, io_out, hparams.sample_rate)
        result = io_out.getvalue()
        return result
plt.plot(h)
plt.ylim(0, h.max())
plt.xlim(0, len(h))
plt.draw()
plt.savefig("figures/genes_" + species_names[i] + ".png")
plt.gcf().clear()

print("Storing final generation...")
filenames = []
samples = []
group_by_species = {}
for i in range(len(GA.originals)):
    group_by_species[i] = [
        org for org in GA.curr_generation if org.species == i
    ]
for i in range(len(GA.originals)):
    peers = group_by_species[i]
    for j in range(len(peers)):
        filenames.append("GA." + species_names[i][:-4] + "_" + str(j) + ".wav")
        samples.append(peers[j].waveform)

for gen, output in zip(samples, filenames):
    out = io.BytesIO()
    audio.save_wav(gen, out)
    with open("output/" + output, "wb") as f:
        f.write(out.getvalue())

print("Program complete.")
def main():
    # The CPU is faster here; setting a GPU device raises an error, and running
    # without tf.device is slower still.
    with tf.device('/cpu:0'):
        config = get_arguments()
        started_datestring = "{0:%Y-%m-%dT%H-%M-%S}".format(datetime.now())
        logdir = os.path.join(config.logdir, 'generate', started_datestring)
        print('logdir0-------------' + logdir)
        if not os.path.exists(logdir):
            os.makedirs(logdir)

        load_hparams(hparams, config.checkpoint_dir)

        sess = tf.Session()
        scalar_input = hparams.scalar_input
        # During training, global_condition_cardinality is inferred by the
        # AudioReader; here it must be supplied explicitly.
        net = WaveNetModel(
            batch_size=config.batch_size,
            dilations=hparams.dilations,
            filter_width=hparams.filter_width,
            residual_channels=hparams.residual_channels,
            dilation_channels=hparams.dilation_channels,
            quantization_channels=hparams.quantization_channels,
            out_channels=hparams.out_channels,
            skip_channels=hparams.skip_channels,
            use_biases=hparams.use_biases,
            scalar_input=hparams.scalar_input,
            global_condition_channels=hparams.gc_channels,
            global_condition_cardinality=config.gc_cardinality,
            local_condition_channels=hparams.num_mels,
            upsample_factor=hparams.upsample_factor,
            legacy=hparams.legacy,
            residual_legacy=hparams.residual_legacy,
            train_mode=False)

        if scalar_input:
            samples = tf.placeholder(tf.float32, shape=[net.batch_size, None])
        else:
            # samples: mu-law encoded values, before one-hot conversion.
            # Shape: (batch_size, length)
            samples = tf.placeholder(tf.int32, shape=[net.batch_size, None])

        # The local condition would normally be (N, T, num_mels), but it is fed
        # one step at a time, so it is (N, 1, num_mels); squeezed, (N, num_mels).
        upsampled_local_condition = tf.placeholder(
            tf.float32, shape=[net.batch_size, hparams.num_mels])

        # Fast WaveNet generation algorithm (arXiv:1611.09482, Algorithm 1).
        next_sample = net.predict_proba_incremental(
            samples, upsampled_local_condition, [config.gc_id] * net.batch_size)

        # Build the upsampled local-condition data to feed into the
        # upsampled_local_condition placeholder.
        print('logdir0-------------' + logdir)
        mel_input = np.load(config.mel)
        sample_size = mel_input.shape[0] * hparams.hop_size
        mel_input = np.tile(mel_input, (config.batch_size, 1, 1))
        with tf.variable_scope('wavenet', reuse=tf.AUTO_REUSE):
            upsampled_local_condition_data = net.create_upsample(
                mel_input, upsample_type=hparams.upsample_type)

        var_list = [var for var in tf.global_variables() if 'queue' not in var.name]
        saver = tf.train.Saver(var_list)
        print('Restoring model from {}'.format(config.checkpoint_dir))
        load(saver, sess, config.checkpoint_dir)

        # Without this, the values restored from the checkpoint remain in place.
        init_op = tf.group(tf.initialize_all_variables(), net.queue_initializer)
        sess.run(init_op)

        quantization_channels = hparams.quantization_channels
        if config.wav_seed:
            # If the wav seed is shorter than the receptive field, shouldn't it at
            # least be padded? It is simply returned as-is, so a seed that is too
            # short results in an error.
            seed = create_seed(config.wav_seed, hparams.sample_rate,
                               quantization_channels, net.receptive_field,
                               scalar_input)  # mu-law encoded

            if scalar_input:
                waveform = seed.tolist()
            else:
                waveform = sess.run(seed).tolist()  # e.g. [116, 114, 120, 121, 127, ...]

            print('Priming generation...')
            # The very last sample is fed in the first iteration of the main loop below.
            for i, x in enumerate(waveform[-net.receptive_field:-1]):
                if i % 100 == 0:
                    print('Priming sample {}/{}'.format(i, net.receptive_field), end='\r')
                sess.run(next_sample,
                         feed_dict={
                             samples: np.array([x] * net.batch_size).reshape(net.batch_size, 1),
                             upsampled_local_condition: np.zeros([net.batch_size, hparams.num_mels])
                         })
            print('Done.')
            waveform = np.array([waveform[-net.receptive_field:]] * net.batch_size)
        else:
            # Silence with a single random sample at the end.
            if scalar_input:
                waveform = [0.0] * (net.receptive_field - 1)
                waveform = np.array(waveform * net.batch_size).reshape(net.batch_size, -1)
                # Append one random number in [-1, 1) at the end.
                # waveform: shape (batch_size, net.receptive_field)
                waveform = np.concatenate(
                    [waveform,
                     2 * np.random.rand(net.batch_size).reshape(net.batch_size, -1) - 1],
                    axis=-1)
            else:
                # Build one sample fewer than the required receptive field, then
                # append one random sample below.
                waveform = [quantization_channels / 2] * (net.receptive_field - 1)
                waveform = np.array(waveform * net.batch_size).reshape(net.batch_size, -1)
                # Before one-hot conversion. (batch_size, 5117)
                waveform = np.concatenate(
                    [waveform,
                     np.random.randint(quantization_channels,
                                       size=net.batch_size).reshape(net.batch_size, -1)],
                    axis=-1)

        start_time = time.time()
        upsampled_local_condition_data = sess.run(upsampled_local_condition_data)
        last_sample_timestamp = datetime.now()
        for step in range(sample_size):  # loop sample_size times to reach the desired length
            window = waveform[:, -1:]  # feed only the last sample; window: shape (N, 1)

            # Run the WaveNet to predict the next sample.
            # Without fast generation, window would be the full history, e.g.
            # [128.0, 128.0, ..., 128.0, 178, 185]; with fast generation it is a
            # single value per batch entry. samples is mu-law encoded and is
            # converted to one-hot inside the graph --> (batch_size, 256).
            prediction = sess.run(
                next_sample,
                feed_dict={
                    samples: window,
                    upsampled_local_condition: upsampled_local_condition_data[:, step, :]
                })

            if scalar_input:
                # Sampled from a logistic distribution, so there is randomness.
                sample = prediction
            else:
                # Scale the prediction distribution using the temperature.
                # With config.temperature == 1 this just divides each element by the
                # sum; softmax has already been applied, so the values are unchanged.
                # With any other temperature, the log values are divided by the
                # temperature and rescaled so they sum to 1.
                np.seterr(divide='ignore')
                scaled_prediction = np.log(prediction) / config.temperature
                scaled_prediction = (
                    scaled_prediction -
                    np.logaddexp.reduce(scaled_prediction, axis=-1, keepdims=True))
                scaled_prediction = np.exp(scaled_prediction)
                np.seterr(divide='warn')

                # The prediction distribution at temperature=1.0 should be
                # unchanged after scaling.
                if config.temperature == 1.0:
                    np.testing.assert_allclose(
                        prediction, scaled_prediction, atol=1e-5,
                        err_msg='Prediction scaling at temperature=1.0 is not working as intended.')

                # Sampling instead of argmax, so the same input can yield
                # different outputs.
                sample = [[np.random.choice(np.arange(quantization_channels), p=p)]
                          for p in scaled_prediction]  # choose one sample per batch

            waveform = np.concatenate([waveform, sample], axis=-1)  # window.shape: (N, 1)

            # Show progress only once per second.
            current_sample_timestamp = datetime.now()
            time_since_print = current_sample_timestamp - last_sample_timestamp
            if time_since_print.total_seconds() > 1.:
                duration = time.time() - start_time
                print('Sample {:<3d}/{:<3d}, ({:.3f} sec/step)'.format(
                    step + 1, sample_size, duration), end='\r')
                last_sample_timestamp = current_sample_timestamp

        # Introduce a newline to clear the carriage return from the progress.
        print()

        # Save the result as a wav file.
        if hparams.input_type == 'raw':
            out = waveform[:, net.receptive_field:]
        elif hparams.input_type == 'mulaw':
            decode = mu_law_decode(samples, quantization_channels, quantization=False)
            out = sess.run(decode, feed_dict={samples: waveform[:, net.receptive_field:]})
        else:  # 'mulaw-quantize'
            decode = mu_law_decode(samples, quantization_channels, quantization=True)
            out = sess.run(decode, feed_dict={samples: waveform[:, net.receptive_field:]})

        # save wav
        for i in range(net.batch_size):
            config.wav_out_path = logdir + '/test-{}.wav'.format(i)
            mel_path = config.wav_out_path.replace(".wav", ".png")
            gen_mel_spectrogram = audio.melspectrogram(out[i], hparams).astype(np.float32).T
            # Note: save_wav modifies out[i] in place.
            audio.save_wav(out[i], config.wav_out_path, hparams.sample_rate)
            plot.plot_spectrogram(gen_mel_spectrogram, mel_path,
                                  title='generated mel spectrogram',
                                  target_spectrogram=mel_input[i])
        print('Finished generating.')
def plot_graph_and_save_audio(args,
                              base_path=None,
                              start_of_sentence=None,
                              end_of_sentence=None,
                              pre_word_num=0,
                              post_word_num=0,
                              pre_surplus_idx=0,
                              post_surplus_idx=1,
                              use_short_concat=False,
                              save_alignment=False,
                              librosa_trim=False,
                              attention_trim=False,
                              time_str=None,
                              isKorean=True):
    idx, (wav, alignment, path, text, sequence, mel) = args

    if base_path:
        plot_path = "{}/{}.png".format(base_path, get_time())
    elif path:
        plot_path = path.rsplit('.', 1)[0] + ".png"
    else:
        plot_path = None

    if plot_path:
        plot.plot_alignment(alignment, plot_path, text=text, isKorean=isKorean)

    if use_short_concat:
        wav = short_concat(wav, alignment, text, start_of_sentence,
                           end_of_sentence, pre_word_num, post_word_num,
                           pre_surplus_idx, post_surplus_idx)

    # If the attention has reached the end of the text, discard everything after it.
    if attention_trim and end_of_sentence:
        end_idx_counter = 0
        # alignment: (text length (encoder), target length (decoder))
        # ==> argmax over axis 0 gives one encoder index per decoder frame.
        attention_argmax = alignment.argmax(0)
        end_idx = min(len(sequence) - 1, max(attention_argmax))
        # max_counter = min((attention_argmax == end_idx).sum(), 5) + 1
        # 20200612: the logic above takes the min of the number of frames attending
        # to end_idx (the actual ending duration) and 5 to set max_counter. Korean
        # speakers tend to hold the final sound, so rather than cutting at 5 we
        # need to keep the full duration actually spoken; hence the replacement
        # below. (Why the original design capped at 5 is unknown.)
        max_counter = (attention_argmax == end_idx).sum()

        for jdx, attend_idx in enumerate(attention_argmax):
            if len(attention_argmax) > jdx + 1:
                if attend_idx == end_idx:
                    end_idx_counter += 1
                if attend_idx == end_idx and attention_argmax[jdx + 1] > end_idx:
                    break
                if end_idx_counter >= max_counter:
                    break
            else:
                break

        spec_end_idx = hparams.reduction_factor * jdx + 3
        wav = wav[:spec_end_idx]
        mel = mel[:spec_end_idx]

    audio_out = inv_linear_spectrogram(wav.T, hparams)

    if librosa_trim and end_of_sentence:
        yt, index = librosa.effects.trim(audio_out,
                                         frame_length=5120,
                                         hop_length=256,
                                         top_db=50)
        audio_out = audio_out[:index[-1]]
        mel = mel[:index[-1] // hparams.hop_size]

    if save_alignment:
        alignment_path = "{}/{}.npy".format(base_path, idx)
        np.save(alignment_path, alignment, allow_pickle=False)

    if path or base_path:
        if path:
            current_path = add_postfix(path, idx)
        elif base_path:
            current_path = plot_path.replace(".png", ".wav")
        save_wav(audio_out, current_path, hparams.sample_rate)

        # hccho
        mel_path = current_path.replace(".wav", ".npy")
        np.save(mel_path, mel)
        # return True
        return audio_out
    else:
        io_out = io.BytesIO()
        save_wav(audio_out, io_out, hparams.sample_rate)
        result = io_out.getvalue()
        return audio_out
def train(log_dir, args):
    save_dir = os.path.join(log_dir, 'pretrained/')
    checkpoint_path = os.path.join(save_dir, 'model.ckpt')
    input_path = os.path.join(args.base_dir, args.input)
    plot_dir = os.path.join(log_dir, 'plots')
    wav_dir = os.path.join(log_dir, 'wavs')
    mel_dir = os.path.join(log_dir, 'mel-spectrograms')
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(mel_dir, exist_ok=True)
    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(input_path))
    log('Using model: {}'.format(args.model))
    log(hparams_debug_string())

    # Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder') as scope:
        feeder = Feeder(coord, input_path, hparams)

    # Set up model:
    step_count = 0
    try:
        # simple text file to keep count of global step
        with open(os.path.join(log_dir, 'step_counter.txt'), 'r') as file:
            step_count = int(file.read())
    except:
        print('no step_counter file found, assuming there is no saved checkpoint')

    global_step = tf.Variable(step_count, name='global_step', trainable=False)
    with tf.variable_scope('model') as scope:
        model = create_model(args.model, hparams)
        model.initialize(feeder.inputs, feeder.input_lengths,
                         feeder.mel_targets, feeder.token_targets)
        model.add_loss()
        model.add_optimizer(global_step)
        stats = add_stats(model)

    # Bookkeeping
    step = 0
    save_step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=5)

    # Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    # Train
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

            # saved model restoring
            if args.restore:
                # Restore saved model if the user requested it, default = True.
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)
                except tf.errors.OutOfRangeError as e:
                    log('Cannot restore checkpoint: {}'.format(e))

            if checkpoint_state and checkpoint_state.model_checkpoint_path:
                log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path))
                saver.restore(sess, checkpoint_state.model_checkpoint_path)
            else:
                if not args.restore:
                    log('Starting new training!')
                else:
                    log('No model to load at {}'.format(save_dir))

            # initiating feeder
            feeder.start_in_session(sess)

            # Training loop
            while not coord.should_stop():
                start_time = time.time()
                step, loss, opt = sess.run([global_step, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
                    step, time_window.average, loss, loss_window.average)
                log(message, end='\r')

                if loss > 100 or np.isnan(loss):
                    log('Loss exploded to {:.5f} at step {}'.format(loss, step))
                    raise Exception('Loss exploded')

                if step % args.summary_interval == 0:
                    log('\nWriting summary at step: {}'.format(step))
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.checkpoint_interval == 0:
                    with open(os.path.join(log_dir, 'step_counter.txt'), 'w') as file:
                        file.write(str(step))
                    log('Saving checkpoint to: {}-{}'.format(checkpoint_path, step))
                    saver.save(sess, checkpoint_path, global_step=step)
                    save_step = step

                    log('Saving alignment, mel-spectrograms and griffin-lim inverted waveform..')
                    input_seq, prediction, alignment, target = sess.run([
                        model.inputs[0],
                        model.mel_outputs[0],
                        model.alignments[0],
                        model.mel_targets[0],
                    ])

                    # save predicted spectrogram to disk (for plot and manual evaluation purposes)
                    mel_filename = 'ljspeech-mel-prediction-step-{}.npy'.format(step)
                    np.save(os.path.join(mel_dir, mel_filename),
                            prediction.T,
                            allow_pickle=False)

                    # save griffin-lim inverted wav for debug.
                    wav = audio.inv_mel_spectrogram(prediction.T)
                    audio.save_wav(wav, os.path.join(wav_dir, 'step-{}-waveform.wav'.format(step)))

                    # save alignment plot to disk (control purposes)
                    plot.plot_alignment(
                        alignment,
                        os.path.join(plot_dir, 'step-{}-align.png'.format(step)),
                        info='{}, {}, step={}, loss={:.5f}'.format(
                            args.model, time_string(), step, loss))

                    # save real mel-spectrogram plot to disk (control purposes)
                    plot.plot_spectrogram(
                        target,
                        os.path.join(plot_dir, 'step-{}-real-mel-spectrogram.png'.format(step)),
                        info='{}, {}, step={}, Real'.format(args.model, time_string(), step))

                    # save predicted mel-spectrogram plot to disk (control purposes)
                    plot.plot_spectrogram(
                        prediction,
                        os.path.join(plot_dir, 'step-{}-pred-mel-spectrogram.png'.format(step)),
                        info='{}, {}, step={}, loss={:.5f}'.format(
                            args.model, time_string(), step, loss))

                    log('Input at step {}: {}'.format(step, sequence_to_text(input_seq)))
        except Exception as e:
            log('Exiting due to exception: {}'.format(e), slack=True)
            traceback.print_exc()
            coord.request_stop(e)
# -*- coding: utf-8 -*-
import os

import numpy as np

from utils import audio
from hparams import hparams as hps

linear_path = './data/linear-000001.npy'
linear_name = linear_path.split('/')[-1].split('.')[0]
linear_p = np.load(linear_path)

mel_path = r'./data/mel-000001.npy'
mel_name = mel_path.split('/')[-1].split('.')[0]
mel_p = np.load(mel_path)

# Save a wav recovered from the linear spectrogram
wav = audio.inv_linear_spectrogram(linear_p.T, hps)
audio.save_wav(wav, os.path.join("./data", "{}.wav".format(linear_name)), hps)

# Save a wav recovered from the mel spectrogram
wav = audio.inv_mel_spectrogram(mel_p.T, hps)
audio.save_wav(wav, os.path.join("./data", "{}.wav".format(mel_name)), hps)
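# The inv_linear_spectrogram / inv_mel_spectrogram calls above reconstruct
# phase with Griffin-Lim. A minimal librosa-based sketch for the linear case,
# assuming a plain magnitude spectrogram (the real helpers usually also undo
# dB scaling and pre-emphasis, which this omits; the hop size and iteration
# count are assumptions):

import librosa


def griffin_lim_sketch(magnitudes, hop_length=256, n_iter=60):
    # Iteratively estimate phase for a magnitude spectrogram; the FFT size is
    # inferred from the spectrogram's frequency dimension.
    return librosa.griffinlim(magnitudes, n_iter=n_iter, hop_length=hop_length)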
def eval_step(sess, logdir, step, waveform, upsampled_local_condition_data,
              speaker_id_data, mel_input_data, samples, speaker_id,
              upsampled_local_condition, next_sample, temperature=1.0):
    waveform = waveform[:, :1]

    sample_size = upsampled_local_condition_data.shape[1]
    last_sample_timestamp = datetime.now()
    start_time = time.time()
    for step2 in range(sample_size):  # loop sample_size times to reach the desired length
        window = waveform[:, -1:]  # feed only the last sample; window: shape (N, 1)

        prediction = sess.run(
            next_sample,
            feed_dict={
                samples: window,
                upsampled_local_condition: upsampled_local_condition_data[:, step2, :],
                speaker_id: speaker_id_data
            })

        if hparams.scalar_input:
            # Sampled from a logistic distribution, so there is randomness.
            sample = prediction
        else:
            # Scale the prediction distribution using the temperature.
            # With temperature == 1 this just divides each element by the sum;
            # softmax has already been applied, so the values are unchanged.
            # With any other temperature, the log values are divided by the
            # temperature and rescaled so they sum to 1.
            np.seterr(divide='ignore')
            scaled_prediction = np.log(prediction) / temperature
            scaled_prediction = (scaled_prediction -
                                 np.logaddexp.reduce(scaled_prediction, axis=-1, keepdims=True))
            scaled_prediction = np.exp(scaled_prediction)
            np.seterr(divide='warn')

            # The prediction distribution at temperature=1.0 should be unchanged
            # after scaling.
            if temperature == 1.0:
                np.testing.assert_allclose(
                    prediction, scaled_prediction, atol=1e-5,
                    err_msg='Prediction scaling at temperature=1.0 is not working as intended.')

            # Sampling instead of argmax, so the same input can yield different outputs.
            sample = [[np.random.choice(np.arange(hparams.quantization_channels), p=p)]
                      for p in scaled_prediction]  # choose one sample per batch

        waveform = np.concatenate([waveform, sample], axis=-1)  # window.shape: (N, 1)

        # Show progress only once per second.
        current_sample_timestamp = datetime.now()
        time_since_print = current_sample_timestamp - last_sample_timestamp
        if time_since_print.total_seconds() > 1.:
            duration = time.time() - start_time
            print('Sample {:<3d}/{:<3d}, ({:.3f} sec/step)'.format(step2 + 1, sample_size, duration),
                  end='\r')
            last_sample_timestamp = current_sample_timestamp

    print('\n')

    # Save the result as a wav file.
    if hparams.input_type == 'raw':
        out = waveform[:, 1:]
    elif hparams.input_type == 'mulaw':
        decode = mu_law_decode(samples, hparams.quantization_channels, quantization=False)
        out = sess.run(decode, feed_dict={samples: waveform[:, 1:]})
    else:  # 'mulaw-quantize'
        decode = mu_law_decode(samples, hparams.quantization_channels, quantization=True)
        out = sess.run(decode, feed_dict={samples: waveform[:, 1:]})

    # save wav
    for i in range(1):
        wav_out_path = logdir + '/test-{}-{}.wav'.format(step, i)
        mel_path = wav_out_path.replace(".wav", ".png")
        gen_mel_spectrogram = audio.melspectrogram(out[i], hparams).astype(np.float32).T
        # Note: save_wav modifies out[i] in place.
        audio.save_wav(out[i], wav_out_path, hparams.sample_rate)
        plot.plot_spectrogram(gen_mel_spectrogram, mel_path,
                              title='generated mel spectrogram{}'.format(step),
                              target_spectrogram=mel_input_data[i])
    model.eval()
    with torch.no_grad():
        _, linear_output = model(seq, mel_input)
        # print(np.shape(linear_output))
        # trans_linear = audio.trans(linear_output[0].cpu().numpy())
        wav = audio.inv_spectrogram(linear_output[0].cpu().numpy())
        # print(audio.find_endpoint(wav))
        # print(np.shape(wav))
        wav = wav[:audio.find_endpoint(wav)]
        # print(np.shape(wav))
    return wav


if __name__ == "__main__":
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Define model
    model = nn.DataParallel(Tacotron()).to(device)
    print("Model has been defined")

    # Load checkpoint
    checkpoint = torch.load(os.path.join(
        hparams.checkpoint_path, 'checkpoint_40.pth.tar'))
    model.load_state_dict(checkpoint['model'])

    text = "in being comparatively modern."
    wav = synthesizer(model, text, device)
    audio.save_wav(wav, "test.wav")
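# audio.find_endpoint, used above to cut trailing silence from the Griffin-Lim
# output, is assumed to scan for the first sustained stretch of silence. A
# minimal sketch, assuming a float waveform and frame-wise amplitude
# thresholding (the threshold, window and frame sizes are all assumptions):

import numpy as np


def find_endpoint_sketch(wav, threshold=0.04, min_silence_sec=0.8,
                         sample_rate=22050, frame_sec=0.05):
    frame = int(sample_rate * frame_sec)
    window = int(min_silence_sec / frame_sec)  # consecutive quiet frames required
    quiet = np.array([np.max(np.abs(wav[i:i + frame])) < threshold
                      for i in range(0, len(wav) - frame, frame)])
    for i in range(len(quiet) - window):
        if quiet[i:i + window].all():
            return i * frame  # first sample of the sustained silence
    return len(wav)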
def train(log_dir, metadata_path, data_path):
    tf.reset_default_graph()
    vocoder = Vocoder(hparams)
    vocoder.init_synthesizer(hparams.batch_size)

    coord = tf.train.Coordinator()
    reader = DataFeeder(metadata_filename=metadata_path,
                        coord=coord,
                        receptive_field=vocoder.net.receptive_field,
                        gc_enable=hparams.gc_enable,
                        sample_size=hparams.sample_size,
                        npy_dataroot=data_path,
                        num_mels=hparams.num_mels,
                        speaker_id=None)

    if hparams.gc_enable:
        audio_batch, lc_batch, gc_batch = reader.dequeue(hparams.batch_size)
    else:
        audio_batch, lc_batch = reader.dequeue(hparams.batch_size)
        gc_batch = None

    loss = vocoder.loss(audio_batch, lc_batch, gc_batch)

    sess = tf.Session()
    last_step, _ = vocoder.load(sess, log_dir)
    last_step = last_step or 0

    all_params = tf.trainable_variables()
    global_step = tf.get_variable(
        'global_step', [],
        initializer=tf.constant_initializer(last_step),
        trainable=False)
    decay_steps = hparams.NUM_STEPS_RATIO_PER_DECAY * hparams.max_num_step

    # Decay the learning rate exponentially based on the number of steps.
    lr = tf.train.exponential_decay(hparams.initial_learning_rate,
                                    global_step,
                                    decay_steps,
                                    hparams.LEARNING_RATE_DECAY_FACTOR,
                                    staircase=True)
    # lr = hparams.initial_learning_rate
    optimizer = optimizer_factory['adam'](learning_rate=lr, momentum=None)

    if hparams.clip_thresh > 0:
        grads_and_vars = optimizer.compute_gradients(loss, all_params)
        grads_and_vars = list(filter(lambda t: t[0] is not None, grads_and_vars))
        capped_gvs = [(tf.clip_by_norm(grad, hparams.clip_thresh), var)
                      for grad, var in grads_and_vars]
        optim = optimizer.apply_gradients(capped_gvs)
    else:
        optim = optimizer.minimize(loss, var_list=all_params, global_step=global_step)

    # Track the moving averages of all trainable variables.
    ema = tf.train.ExponentialMovingAverage(hparams.MOVING_AVERAGE_DECAY, global_step)
    maintain_averages_op = tf.group(ema.apply(all_params))
    train_op = tf.group(optim, maintain_averages_op)

    sess.run(tf.global_variables_initializer())

    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    reader.start_threads(sess)

    try:
        print_loss = 0.
        start_time = time()
        for step in range(last_step + 1, hparams.max_num_step):
            if gc_batch is None:
                fetches = [audio_batch, vocoder.upsampled_lc, loss, train_op]
                _x, _lc, _loss, _ = sess.run(fetches)
                _gc = None
            else:
                fetches = [audio_batch, vocoder.upsampled_lc, gc_batch, loss, train_op]
                _x, _lc, _gc, _loss, _ = sess.run(fetches)

            print_loss += _loss
            if step % PRINT_LOSS_EVERY == 0:
                duration = time() - start_time
                print('step {:d} - loss = {:.3f}, ({:.3f} sec/step)'.format(
                    step, print_loss / PRINT_LOSS_EVERY, duration / PRINT_LOSS_EVERY))
                start_time = time()
                print_loss = 0.

            if step % hparams.checkpoint_interval == 0:
                vocoder.save(sess, log_dir, step)

            if step % hparams.train_eval_interval == 0:
                samples = vocoder.synthesize(sess, _x.shape[1], _lc, _gc)
                targets = _x.reshape(hparams.batch_size, -1)
                for j in range(hparams.batch_size):
                    predicted_wav = samples[j, :]
                    target_wav = targets[j, :]
                    predicted_wav_path = os.path.join(
                        log_dir, 'predicted_{}_{}.wav'.format(step, j))
                    target_wav_path = os.path.join(
                        log_dir, 'target_{}_{}.wav'.format(step, j))
                    save_wav(predicted_wav, predicted_wav_path)
                    save_wav(target_wav, target_wav_path)
    except Exception as error:
        print(error)
    finally:
        coord.request_stop()
        coord.join(threads)
        sess.close()
def synthesize(self, texts, basenames, log_dir, mel_filenames):
    hparams = self._hparams

    # Repeat the last sample until the number of samples is divisible by the
    # synthesis batch size (last-run scenario).
    while len(texts) % hparams.synthesis_batch_size != 0:
        texts.append(texts[-1])
        basenames.append(basenames[-1])
        if mel_filenames is not None:
            mel_filenames.append(mel_filenames[-1])

    sequences = [np.asarray(text_to_sequence(text)) for text in texts]
    input_lengths = [len(seq) for seq in sequences]
    seqs, max_seq_len = self._prepare_inputs(sequences)
    feed_dict = {
        self.inputs: seqs,
        self.input_lengths: np.asarray(input_lengths, dtype=np.int32)
    }

    linears, mels, alignments, audio_length = self.session.run(
        [self.linear_outputs, self.mel_outputs, self.alignments[0], self.audio_length],
        feed_dict=feed_dict)

    # Natural batch synthesis:
    # get mel/linear lengths for the entire batch from the stop-token predictions.
    target_lengths = audio_length

    if basenames is None:
        # Generate a wav and play it back.
        wav = audio.inv_mel_spectrogram(mels[0].T, hparams)
        audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate)  # Find a better way
        if platform.system() == 'Linux':
            # Linux wav reader
            os.system('aplay temp.wav')
        elif platform.system() == 'Windows':
            # Windows wav reader
            os.system('start /min mplay32 /play /close temp.wav')
        else:
            raise RuntimeError(
                'Your OS type is not supported yet, please add it to '
                '"centaur/synthesizer.py, line-165" and feel free to make a Pull Request ;) Thanks!')
        return

    for i, mel in enumerate(mels):
        if log_dir is not None:
            # save wav (mel -> wav)
            wav = audio.inv_mel_spectrogram(mel.T, hparams)
            audio.save_wav(wav,
                           os.path.join(log_dir, 'wavs/wav-{}-mel.wav'.format(basenames[i])),
                           sr=hparams.sample_rate)

            alignments_samples, alignment_titles = self.get_alignments(alignments)
            for idx in range(len(alignments_samples)):
                # save alignments
                plot.plot_alignment(
                    alignments_samples[idx],
                    os.path.join(log_dir, 'plots/{}.png'.format(alignment_titles[idx])),
                    title='{}'.format(texts[i]),
                    split_title=True,
                    max_len=target_lengths[i])

            # save mel spectrogram plot
            plot.plot_spectrogram(
                mel,
                os.path.join(log_dir, 'plots/mel-{}.png'.format(basenames[i])),
                title='{}'.format(texts[i]),
                split_title=True)

            # save wav (linear -> wav)
            wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
            audio.save_wav(wav,
                           os.path.join(log_dir, 'wavs/wav-{}-linear.wav'.format(basenames[i])),
                           sr=hparams.sample_rate)

            # save linear spectrogram plot
            plot.plot_spectrogram(
                linears[i],
                os.path.join(log_dir, 'plots/linear-{}.png'.format(basenames[i])),
                title='{}'.format(texts[i]),
                split_title=True,
                auto_aspect=True)
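# _prepare_inputs, used by synthesize above, is assumed to zero-pad a batch of
# token sequences to a common length and stack them into a (batch, time)
# array. A minimal sketch:

import numpy as np


def _prepare_inputs_sketch(sequences):
    # Pad every sequence with zeros up to the batch maximum, then stack.
    max_len = max(len(seq) for seq in sequences)
    batch = np.stack([np.pad(seq, (0, max_len - len(seq)), mode='constant')
                      for seq in sequences])
    return batch, max_len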
def run_eval(args, eval_dir, eval_model, eval_plot_dir, eval_wav_dir, feeder,
             hparams, sess, step, summary_writer):
    # Run eval and save eval stats
    log('\nRunning evaluation at step {}'.format(step))
    sum_eval_loss = 0.0
    sum_mel_loss = 0.0
    sum_stop_token_loss = 0.0
    sum_linear_loss = 0.0
    count = 0.0
    mel_p = None
    mel_t = None
    t_len = None
    attention_mask_sample = None
    lin_p = None
    lin_t = None

    for _ in tqdm(range(feeder.test_steps)):
        (test_eloss, test_mel_loss, test_stop_token_loss, test_linear_loss,
         mel_p, mel_t, t_len, attention_mask_sample, lin_p, lin_t) = sess.run([
             eval_model.loss,
             eval_model.mel_loss,
             eval_model.stop_token_loss,
             eval_model.linear_loss,
             eval_model.post_net_predictions[0],
             eval_model.targets_mel[0],
             eval_model.targets_length[0],
             eval_model.alignments[0],
             eval_model.mag_pred[0],
             eval_model.targets_mag[0],
         ])
        sum_eval_loss += test_eloss
        sum_mel_loss += test_mel_loss
        sum_stop_token_loss += test_stop_token_loss
        sum_linear_loss += test_linear_loss
        count += 1.0

    wav = audio.inv_linear_spectrogram(lin_p.T, hparams)
    audio.save_wav(wav,
                   os.path.join(eval_wav_dir, '{}-eval-linear.wav'.format(step)),
                   sr=hparams.sample_rate)

    if count > 0.0:
        eval_loss = sum_eval_loss / count
        mel_loss = sum_mel_loss / count
        stop_token_loss = sum_stop_token_loss / count
        linear_loss = sum_linear_loss / count
    else:
        eval_loss = sum_eval_loss
        mel_loss = sum_mel_loss
        stop_token_loss = sum_stop_token_loss
        linear_loss = sum_linear_loss

    log('Saving eval log to {}..'.format(eval_dir))
    # Save some logs to monitor model improvement on the same unseen sequence.
    wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
    audio.save_wav(wav,
                   os.path.join(eval_wav_dir, '{}-eval-mel.wav'.format(step)),
                   sr=hparams.sample_rate)

    alignments, alignment_titles = get_alignments(attention_mask_sample)
    for i in range(len(alignments)):
        plot.plot_alignment(
            alignments[i],
            os.path.join(eval_plot_dir, '{}_{}-eval-align.png'.format(step, alignment_titles[i])),
            title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eval_loss),
            max_len=t_len // hparams.reduction_factor)

    plot.plot_spectrogram(
        mel_p,
        os.path.join(eval_plot_dir, '{}-eval-mel-spectrogram.png'.format(step)),
        title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eval_loss),
        target_spectrogram=mel_t,
        max_len=t_len)

    plot.plot_spectrogram(
        lin_p,
        os.path.join(eval_plot_dir, '{}-eval-linear-spectrogram.png'.format(step)),
        title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eval_loss),
        target_spectrogram=lin_t,
        max_len=t_len,
        auto_aspect=True)

    log('Eval loss for global step {}: {:.3f}'.format(step, eval_loss))
    log('Writing eval summary!')
    add_eval_stats(summary_writer, step, linear_loss, mel_loss, stop_token_loss, eval_loss)
    model.eval()
    with torch.no_grad():
        _, linear_output = model(seq, mel_input)
        # print(np.shape(linear_output))
        # trans_linear = audio.trans(linear_output[0].cpu().numpy())
        wav = audio.inv_spectrogram(linear_output[0].cpu().numpy())
        # print(audio.find_endpoint(wav))
        # print(np.shape(wav))
        wav = wav[:audio.find_endpoint(wav)]
        # print(np.shape(wav))
    return wav


if __name__ == "__main__":
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Define model
    model = Tacotron().to(device)
    print("Model has been defined")

    # Load checkpoint
    checkpoint = torch.load(
        os.path.join(hparams.checkpoint_path, 'checkpoint_20500.pth.tar'))
    model.load_state_dict(checkpoint['model'])
    print("Load done")

    text = "I am very happy to see you again."
    wav = synthesizer(model, text, device)
    audio.save_wav(wav, text + ".wav")
def audio(output, pth):
    mel_outputs, mel_outputs_postnet, _ = output
    # wav = inv_melspectrogram(to_arr(mel_outputs[0]))
    wav_postnet = inv_melspectrogram(to_arr(mel_outputs_postnet[0]))
    # save_wav(wav, pth + '.wav')
    save_wav(wav_postnet, pth + '.wav')
from utils.audio import melspectrogram, inv_mel_spectrogram, load_wav, save_wav

wav_path = "LJ001-0008.wav"
raw_wav = load_wav(wav_path)
mel_spec = melspectrogram(raw_wav)
inv_wav = inv_mel_spectrogram(mel_spec)
save_wav(inv_wav, "inv.wav")