def convert(predictor, df):
    a, b, c = next(df().get_data())
    pred_spec, r_spec = predictor(a, b, c)

    # Denormalization
    pred_spec = denormalize_db(pred_spec, hp.default.max_db, hp.default.min_db)
    r_spec = denormalize_db(r_spec, hp.default.max_db, hp.default.min_db)

    # dB to amplitude
    pred_spec = db2amp(pred_spec)
    r_spec = db2amp(r_spec)

    # Emphasize the magnitude
    pred_spec = np.power(pred_spec, hp.convert.emphasis_magnitude)
    r_spec = np.power(r_spec, hp.convert.emphasis_magnitude)

    # Spectrogram to waveform
    audio = np.array(list(map(
        lambda spec: spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                              hp.default.hop_length, hp.default.n_iter),
        pred_spec)))
    y_audio = np.array(list(map(
        lambda spec: spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                              hp.default.hop_length, hp.default.n_iter),
        r_spec)))

    # Apply inverse pre-emphasis
    audio = inv_preemphasis(audio, coeff=hp.default.preemphasis)
    y_audio = inv_preemphasis(y_audio, coeff=hp.default.preemphasis)

    # if hp.convert.one_full_wav:
    #     # Concatenate to a single wav
    #     y_audio = np.reshape(y_audio, (1, y_audio.size), order='C')
    #     audio = np.reshape(audio, (1, audio.size), order='C')

    return audio, y_audio
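# preemphasis / inv_preemphasis are project helpers not shown in these snippets.
# A minimal sketch of the standard first-order pre-emphasis filter pair they are
# assumed to implement (the actual implementations in the source repos may differ):
import scipy.signal


def preemphasis_sketch(x, coeff=0.97):
    """Apply y[n] = x[n] - coeff * x[n-1] (illustrative sketch)."""
    return scipy.signal.lfilter([1, -coeff], [1], x)


def inv_preemphasis_sketch(x, coeff=0.97):
    """Undo the pre-emphasis filter above (illustrative sketch)."""
    return scipy.signal.lfilter([1], [1, -coeff], x)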
def convert(predictor, df):
    pred_spec, y_spec, ppgs = predictor(next(df().get_data()))

    # Denormalization
    pred_spec = denormalize_db(pred_spec, hp.Default.max_db, hp.Default.min_db)
    y_spec = denormalize_db(y_spec, hp.Default.max_db, hp.Default.min_db)

    # dB to amplitude
    pred_spec = db2amp(pred_spec)
    y_spec = db2amp(y_spec)

    # Emphasize the magnitude
    pred_spec = np.power(pred_spec, hp.Convert.emphasis_magnitude)
    y_spec = np.power(y_spec, hp.Convert.emphasis_magnitude)

    # Spectrogram to waveform
    audio = np.array([spec2wav(spec.T, hp.Default.n_fft, hp.Default.win_length,
                               hp.Default.hop_length, hp.Default.n_iter)
                      for spec in pred_spec])
    y_audio = np.array([spec2wav(spec.T, hp.Default.n_fft, hp.Default.win_length,
                                 hp.Default.hop_length, hp.Default.n_iter)
                        for spec in y_spec])

    # Apply inverse pre-emphasis
    audio = inv_preemphasis(audio, coeff=hp.Default.preemphasis)
    y_audio = inv_preemphasis(y_audio, coeff=hp.Default.preemphasis)

    # if hp.Convert.one_full_wav:
    #     # Concatenate to a single wav
    #     y_audio = np.reshape(y_audio, (1, y_audio.size), order='C')
    #     audio = np.reshape(audio, (1, audio.size), order='C')

    return audio, y_audio, ppgs
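# denormalize_db and db2amp are also project helpers that are referenced but not
# defined in these snippets. A minimal sketch of what they are assumed to do
# (undoing 0-1 dB normalization, then converting decibels back to linear amplitude);
# the exact implementations may differ between the source repositories.
import numpy as np


def denormalize_db_sketch(norm_db, max_db, min_db):
    """Map values normalized to [0, 1] back to the [min_db, max_db] dB range (assumed behavior)."""
    return np.clip(norm_db, 0, 1) * (max_db - min_db) + min_db


def db2amp_sketch(db):
    """Convert decibels to linear amplitude; equivalent to librosa.db_to_amplitude (assumed behavior)."""
    return np.power(10.0, db * 0.05)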
def synth(self, text, save=None):
    inp = clean(text)
    print(inp)
    x = [self.c2i[c] for c in inp + 'E']
    x += [0] * (hp.maxlen - len(x))
    x = np.array(x)
    x = x.reshape(1, -1)

    # Autoregressive mel prediction
    with self.melsession.as_default():
        preds = np.zeros((1, 1, hp.n_mels), np.float32)
        cnt = hp.Tyr
        for j in range(hp.Tyr):
            sys.stdout.write('\rProcessing %d' % j)
            sys.stdout.flush()
            _preds, a = self.melsession.run(
                [self.melmodel.mel_output, self.melmodel.A],
                {self.melmodel.text: x, self.melmodel.mel: preds})
            preds = np.concatenate((np.zeros((1, 1, hp.n_mels)), _preds), axis=1)
            cnt -= 1
            # Stop shortly after the attention reaches the end of the input
            if np.argmax(a[0, :, -1]) >= len(inp) - 3:
                cnt = min(cnt, 10)
            if cnt <= 0:
                break

    # Mel to waveform
    with self.magsession.as_default():
        wav = self.magsession.run(self.magmodel.wav_output,
                                  {self.magmodel.mel: preds})
    wav = audio.inv_preemphasis(wav)

    if save is not None:
        audio.save_wav(wav[0], save)
    else:
        out = io.BytesIO()
        audio.save_wav(wav[0], out)
        return out.getvalue()
def do_convert(predictor, input_name, logdir2):
    convert_s = datetime.datetime.now()

    # Load input audio
    input_audio, _ = librosa.load(input_name, sr=hp.default.sr, dtype=np.float64)

    # Extract F0 from the input audio first
    input_f0, t_table = pw.dio(input_audio, hp.default.sr)
    input_f0 = pw.stonemask(input_audio, input_f0, t_table, hp.default.sr)

    # Get MFCC, spectral envelope, and aperiodicity
    mfcc = _get_mfcc(input_audio, hp.default.n_fft, hp.default.win_length, hp.default.hop_length)
    mfcc = np.expand_dims(mfcc, axis=0)
    input_ap = pw.d4c(input_audio, input_f0, t_table, hp.default.sr, fft_size=hp.default.n_fft)
    input_sp_en = _get_spectral_envelope(preemphasis(input_audio, coeff=hp.default.preemphasis), hp.default.n_fft)
    plt.imsave('./converted/debug/input_sp_en_original.png', input_sp_en, cmap='binary')
    input_sp_en = np.expand_dims(input_sp_en, axis=0)

    # Convert the spectral envelope
    output_sp_en, ppgs = convert_spectral_envelope(predictor, mfcc, input_sp_en)
    output_sp_en = np.squeeze(output_sp_en.astype(np.float64), axis=0)

    preproc_s = datetime.datetime.now()

    # Denormalization
    output_sp_en = denormalize_db(output_sp_en, hp.default.max_db, hp.default.min_db)

    # dB to amplitude
    output_sp_en = librosa.db_to_amplitude(output_sp_en)

    # Emphasize the magnitude
    output_sp_en = np.power(output_sp_en, hp.convert.emphasis_magnitude)

    preproc_e = datetime.datetime.now()
    preproc_t = preproc_e - preproc_s
    print("Pre-processing time: {}s".format(preproc_t.seconds))

    # F0 transformation for the WORLD vocoder
    output_f0 = f0_adapt(input_f0, logdir2)

    # Synthesize audio and de-emphasize
    output_audio = pw.synthesize(output_f0, output_sp_en, input_ap, hp.default.sr)
    output_audio = inv_preemphasis(output_audio, coeff=hp.default.preemphasis)

    # Save output_audio as a 32-bit float wav file
    output_audio = output_audio.astype(np.float32)
    librosa.output.write_wav(path="./converted/" + input_name, y=output_audio, sr=hp.default.sr)

    # Save the PPG data as a grayscale image and a raw binary file
    ppgs = np.squeeze(ppgs, axis=0)
    plt.imsave('./converted/debug/' + input_name + '.png', ppgs, cmap='binary')
    np.save('./converted/debug/' + input_name + '.npy', ppgs)

    convert_e = datetime.datetime.now()
    convert_time = convert_e - convert_s
    print("Total converting time: {}s".format(convert_time.seconds))
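# f0_adapt is not defined in this snippet. A common choice for F0 conversion with the
# WORLD vocoder is a log-Gaussian normalized transformation; the sketch below is
# hypothetical and assumes source/target log-F0 means and standard deviations
# (presumably the statistics stored under logdir2) are available as arguments.
import numpy as np


def f0_adapt_sketch(f0, src_mean, src_std, tgt_mean, tgt_std):
    """Shift/scale voiced log-F0 from the source to the target speaker distribution (illustrative only)."""
    f0_converted = np.zeros_like(f0)
    voiced = f0 > 0  # WORLD marks unvoiced frames with 0
    f0_converted[voiced] = np.exp(
        (np.log(f0[voiced]) - src_mean) / src_std * tgt_std + tgt_mean)
    return f0_converted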
def convert(predictor, df):
    t = next(df().get_data())
    print(t[0].shape)
    pred_spec, y_spec, ppgs = predictor(next(df().get_data()))

    # Denormalization
    pred_spec = denormalize_db(pred_spec, hp.default.max_db, hp.default.min_db)
    y_spec = denormalize_db(y_spec, hp.default.max_db, hp.default.min_db)

    # dB to amplitude
    pred_spec = db2amp(pred_spec)
    y_spec = db2amp(y_spec)

    # Emphasize the magnitude
    pred_spec = np.power(pred_spec, hp.convert.emphasis_magnitude)
    y_spec = np.power(y_spec, hp.convert.emphasis_magnitude)

    # Spectrogram to waveform
    audio = np.array(list(map(
        lambda spec: spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                              hp.default.hop_length, hp.default.n_iter),
        pred_spec)))
    librosa.output.write_wav(
        '/home/user/vilin/deep-voice-conversion/output/file_trim_8.wav',
        audio[0], hp.default.sr)
    y_audio = np.array(list(map(
        lambda spec: spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                              hp.default.hop_length, hp.default.n_iter),
        y_spec)))

    # Apply inverse pre-emphasis
    audio = inv_preemphasis(audio, coeff=hp.default.preemphasis)
    y_audio = inv_preemphasis(y_audio, coeff=hp.default.preemphasis)

    # if hp.convert.one_full_wav:
    #     # Concatenate to a single wav
    #     y_audio = np.reshape(y_audio, (1, y_audio.size), order='C')
    #     audio = np.reshape(audio, (1, audio.size), order='C')

    return audio, y_audio, ppgs
def convert(predictor, data):
    x_mfccs, y_spec, y_mel = data
    x_mfccs = np.array(x_mfccs).reshape((-1,) + x_mfccs.shape)
    y_spec = np.array(y_spec).reshape((-1,) + y_spec.shape)
    y_mel = np.array(y_mel).reshape((-1,) + y_mel.shape)
    pred_spec, y_spec, ppgs = predictor(x_mfccs, y_spec, y_mel)

    # Denormalization
    pred_spec = denormalize_db(pred_spec, hp.Default.max_db, hp.Default.min_db)
    y_spec = denormalize_db(y_spec, hp.Default.max_db, hp.Default.min_db)

    # dB to amplitude
    pred_spec = db2amp(pred_spec)
    y_spec = db2amp(y_spec)

    # Emphasize the magnitude
    pred_spec = np.power(pred_spec, hp.Convert.emphasis_magnitude)
    y_spec = np.power(y_spec, hp.Convert.emphasis_magnitude)

    # Spectrogram to waveform
    audio = np.array([
        spec2wav(spec.T, hp.Default.n_fft, hp.Default.win_length,
                 hp.Default.hop_length, hp.Default.n_iter)
        for spec in pred_spec
    ])
    y_audio = np.array([
        spec2wav(spec.T, hp.Default.n_fft, hp.Default.win_length,
                 hp.Default.hop_length, hp.Default.n_iter)
        for spec in y_spec
    ])

    # Apply inverse pre-emphasis
    audio = inv_preemphasis(audio, coeff=hp.Default.preemphasis)
    y_audio = inv_preemphasis(y_audio, coeff=hp.Default.preemphasis)

    if hp.Convert.one_full_wav:
        # Concatenate to a single wav
        y_audio = np.reshape(y_audio, (1, y_audio.size), order='C')
        audio = np.reshape(audio, (1, audio.size), order='C')

    return audio, y_audio, ppgs
def convert(predictor, df):
    # TODO: need to fix reading in with duration
    pred_spec, y_spec, ppgs = predictor(next(df().get_data()))

    # Denormalization
    pred_spec = denormalize_db(pred_spec, hp.default.max_db, hp.default.min_db)
    y_spec = denormalize_db(y_spec, hp.default.max_db, hp.default.min_db)

    # dB to amplitude
    pred_spec = db2amp(pred_spec)
    y_spec = db2amp(y_spec)

    # Emphasize the magnitude
    pred_spec = np.power(pred_spec, hp.convert.emphasis_magnitude)
    y_spec = np.power(y_spec, hp.convert.emphasis_magnitude)

    # Spectrogram to waveform
    audio = np.array(list(map(
        lambda spec: spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                              hp.default.hop_length, hp.default.n_iter),
        pred_spec)))
    y_audio = np.array(list(map(
        lambda spec: spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                              hp.default.hop_length, hp.default.n_iter),
        y_spec)))

    # Apply inverse pre-emphasis
    audio = inv_preemphasis(audio, coeff=hp.default.preemphasis)
    y_audio = inv_preemphasis(y_audio, coeff=hp.default.preemphasis)

    if hp.convert.one_full_wav:
        # Concatenate to a single wav
        y_audio = np.reshape(y_audio, (1, y_audio.size), order='C')
        audio = np.reshape(audio, (1, audio.size), order='C')

    return audio, y_audio, ppgs
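# spec2wav is another project helper used throughout these convert() variants. It
# reconstructs a waveform from a linear-frequency magnitude spectrogram with
# Griffin-Lim; a minimal sketch using librosa's built-in implementation, assuming the
# input has shape (1 + n_fft//2, time), could look like this (illustrative only):
import librosa


def spec2wav_sketch(mag, n_fft, win_length, hop_length, num_iters):
    """Griffin-Lim phase reconstruction from a magnitude spectrogram (sketch)."""
    # n_fft is kept only to mirror the original call signature;
    # librosa infers it from the spectrogram's frequency dimension.
    return librosa.griffinlim(mag, n_iter=num_iters,
                              hop_length=hop_length, win_length=win_length)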
def sumimage(mel, mel_name):
    mel = mel  # + 0.001 * np.random.standard_normal([hp.batch_size, hp.duration * hp.n_mels, hp.n_mels])
    mel_image = mel.transpose(0, 2, 1)
    heatmap = np.expand_dims(mel_image, 3)
    tf.summary.image(mel_name, heatmap, max_outputs=mel_image.shape[0])

    # Pseudo-inverse of the mel filterbank, used to map mel spectrograms back to linear spectrograms
    mel_basis = librosa.filters.mel(hp.sr, hp.n_fft, hp.n_mels)
    mel_basis = np.mat(mel_basis)
    mel_basis_I = mel_basis.I

    mel_spec = []
    for i in range(len(mel)):
        print(mel_name)
        print(np.max(mel[i]))
        print(np.min(mel[i]))
        print(np.mean(mel[i]))
        # mel[i] = mel[i] * (0.6 / np.max(mel[i]))
        mel_db_item = np.transpose(mel[i])
        mel_db_item = denormalize_0_1(mel_db_item, hp.max_db, hp.min_db)
        # mel_db_item = np.maximum(mel_db_item, 0)
        # mel_db_item = normalize_0_1(mel_db_item, hp.default.max_db, hp.default.min_db)
        print(np.max(mel_db_item))
        print(np.mean(mel_db_item))
        mel_item = db2amp(mel_db_item)
        print(np.max(mel_item))
        mag_item = np.dot(mel_basis_I, mel_item)
        print(np.max(mag_item))
        mag_item = np.maximum(mag_item, 0)
        spec_item = np.transpose(mag_item)
        # mag_db_item = amp2db(mag_item)
        # mag_db_item = normalize_0_1(mag_db_item, hp.default.max_db, hp.default.min_db)
        # mag_db_item = np.transpose(mag_db_item)
        # spec_item = np.transpose(mag_item)
        # mel_complex = mel_D_abs + np.complex(0, 0)
        # spec_item = librosa.istft(stft_matrix=mel_complex, hop_length=hp.default.hop_length, win_length=hp.default.win_length)
        mel_spec.append(spec_item.getA())

    mel_spec = np.power(mel_spec, hp.emphasis_magnitude)
    mel_audio = np.array(list(map(
        lambda spec: spec2wav(spec.T, hp.n_fft, hp.win_length,
                              hp.hop_length, hp.n_iter),
        mel_spec)))
    mel_audio = inv_preemphasis(mel_audio, coeff=hp.preemphasis)
    tf.summary.audio(mel_name, mel_audio, hp.sr, max_outputs=hp.batch_size)
def sumspecimage(spec, spec_name):
    spec = denormalize_db(spec, hp.max_db, hp.min_db)
    spec = db2amp(spec)
    spec_image = spec.transpose(0, 2, 1)
    heatmap = np.expand_dims(spec_image, 3)
    tf.summary.image(spec_name, heatmap, max_outputs=spec_image.shape[0])

    out_spec = np.power(np.maximum(spec, 0), 1)  # hp.emphasis_magnitude
    out_audio = np.array(list(map(
        lambda spec: spec2wav(spec.T, hp.n_fft, hp.win_length,
                              hp.hop_length, hp.n_iter),
        out_spec)))
    out_audio = inv_preemphasis(out_audio, coeff=hp.preemphasis)
    tf.summary.audio(spec_name, out_audio, hp.sr, max_outputs=hp.batch_size)
def convert(predictor, mfcc, spec, mel_spec):
    print("convert")
    pred_s = datetime.datetime.now()
    pred_spec, _, ppgs = predictor(mfcc, spec, mel_spec)
    pred_e = datetime.datetime.now()
    pred_t = pred_e - pred_s
    print("Predicting time: {}s".format(pred_t.seconds))

    preproc_s = datetime.datetime.now()

    # Denormalization
    print("denormalize_db")
    pred_spec = denormalize_db(pred_spec, hp.default.max_db, hp.default.min_db)

    # dB to amplitude
    print("db2amp")
    pred_spec = db2amp(pred_spec)

    # Emphasize the magnitude
    print("emphasize")
    pred_spec = np.power(pred_spec, hp.convert.emphasis_magnitude)

    preproc_e = datetime.datetime.now()
    preproc_t = preproc_e - preproc_s
    print("Pre-processing time: {}s".format(preproc_t.seconds))

    audio = []

    # Spectrogram to waveform
    recon_s = datetime.datetime.now()
    print("spec2wav")
    audio.append(
        spec2wav_lws(pred_spec[0], hp.default.n_fft, hp.default.win_length,
                     hp.default.hop_length, hp.default.lws_mode))
    recon_e = datetime.datetime.now()
    recon_t = recon_e - recon_s
    print("Converting spectrogram-to-wave time: {}s".format(recon_t.seconds))

    audio = np.array(audio)
    # print('audio.shape : ', audio.shape)

    # Apply inverse pre-emphasis
    audio = inv_preemphasis(audio, coeff=hp.default.preemphasis)
    return audio[0], ppgs
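# spec2wav_lws is not shown here either. It apparently replaces Griffin-Lim with the
# LWS (Local Weighted Sums) phase-reconstruction package; the sketch below is an
# assumption about how such a helper might be written on top of the lws library's
# run_lws/istft interface, with a (time, freq) magnitude spectrogram as input.
import lws
import numpy as np


def spec2wav_lws_sketch(mag, n_fft, win_length, hop_length, mode):
    """Recover phase with LWS and invert to a waveform (illustrative sketch)."""
    processor = lws.lws(win_length, hop_length, fftsize=n_fft, mode=mode)
    complex_spec = processor.run_lws(mag.astype(np.float64))
    return processor.istft(complex_spec)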
def convert(predictor, tensor):
    # tensor = next(df().get_data())
    # print(tensor.shape)
    pred_spec, y_spec, ppgs = predictor(tensor)
    # pred_spec, y_spec, ppgs = predictor(tf.expand_dims(df, 0))

    # Denormalization
    pred_spec = denormalize_db(pred_spec, hp.default.max_db, hp.default.min_db)
    # y_spec = denormalize_db(y_spec, hp.default.max_db, hp.default.min_db)

    # dB to amplitude
    pred_spec = db2amp(pred_spec)
    # y_spec = db2amp(y_spec)

    # Emphasize the magnitude
    pred_spec = np.power(pred_spec, hp.convert.emphasis_magnitude)
    # y_spec = np.power(y_spec, hp.convert.emphasis_magnitude)

    # Spectrogram to waveform
    audio = np.array(list(map(
        lambda spec: spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                              hp.default.hop_length, hp.default.n_iter),
        pred_spec)))
    # y_audio = np.array(list(map(lambda spec: spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
    #                                                   hp.default.hop_length, hp.default.n_iter), y_spec)))

    # Apply inverse pre-emphasis
    audio = inv_preemphasis(audio, coeff=hp.default.preemphasis)
    # y_audio = inv_preemphasis(y_audio, coeff=hp.default.preemphasis)

    # pickle.dump(y_audio, open("y-audio.p", "wb"))
    # pickle.dump(audio, open("o-audio.p", "wb"))

    # if hp.convert.one_full_wav:
    #     # Concatenate to a single wav
    #     y_audio = np.reshape(y_audio, (1, y_audio.size), order='C')
    #     audio = np.reshape(audio, (1, audio.size), order='C')

    # return audio, y_audio, ppgs
    return audio, ppgs
def train(log_dir, args, hparams):
    save_dir = os.path.join(log_dir, 'taco_pretrained')
    plot_dir = os.path.join(log_dir, 'plots')
    wav_dir = os.path.join(log_dir, 'wavs')
    mel_dir = os.path.join(log_dir, 'mel-spectrograms')
    tensorboard_dir = os.path.join(log_dir, 'tacotron_events')
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(mel_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)

    checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt')
    input_path = args.input_dir

    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(input_path))
    log(hparams_debug_string())

    # Start by setting a seed for repeatability
    tf.set_random_seed(hparams.tacotron_random_seed)

    # Set up the data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder') as scope:
        feeder = Feeder(coord, input_path, hparams)

    # Set up the model
    global_step = tf.Variable(0, name='global_step', trainable=False)
    with tf.variable_scope('Tacotron_model', reuse=tf.AUTO_REUSE) as scope:
        model = Tacotron(hparams)
        model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets,
                         feeder.token_targets, targets_lengths=feeder.targets_lengths,
                         global_step=global_step, is_training=True,
                         split_infos=feeder.split_infos)
        model.add_loss()
        model.add_optimizer(global_step)
        stats = _add_train_stats(model, hparams)

    # Griffin-Lim inversion graph (mel -> wav) for debug outputs
    GLGPU_mel_inputs = tf.placeholder(tf.float32, (None, hparams.num_mels), name='GLGPU_mel_inputs')
    GLGPU_mel_outputs = audio.inv_mel_spectrogram_tensorflow(GLGPU_mel_inputs, hparams)

    # Bookkeeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=20)

    log('Tacotron training set to a maximum of {} steps'.format(args.tacotron_train_steps))

    # Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    # Train
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

            # Restore a saved model if the user requested it (default = True)
            if args.restore:
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)
                    if checkpoint_state and checkpoint_state.model_checkpoint_path:
                        log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path), slack=True)
                        saver.restore(sess, checkpoint_state.model_checkpoint_path)
                    else:
                        log('No model to load at {}'.format(save_dir), slack=True)
                        saver.save(sess, checkpoint_path, global_step=global_step)
                except tf.errors.OutOfRangeError as e:
                    log('Cannot restore checkpoint: {}'.format(e), slack=True)
            else:
                log('Starting new training!', slack=True)
                saver.save(sess, checkpoint_path, global_step=global_step)

            # Initialize the feeder
            feeder.start_threads(sess)

            # Training loop
            while not coord.should_stop() and step < args.tacotron_train_steps:
                start_time = time.time()
                step, loss, opt = sess.run([global_step, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
                    step, time_window.average, loss, loss_window.average)
                log(message, end='\r', slack=(step % args.checkpoint_interval == 0))

                if np.isnan(loss) or loss > 100.:
                    log('Loss exploded to {:.5f} at step {}'.format(loss, step))
                    raise Exception('Loss exploded')

                if step % args.summary_interval == 0:
                    log('\nWriting summary at step: {}'.format(step))
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or step == 300:
                    # Save model and current global step
                    saver.save(sess, checkpoint_path, global_step=global_step)

                    log('\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..')
                    input_seq, mel_prediction = sess.run([
                        model.tower_inputs[0][0],
                        model.tower_mel_outputs[0][0],
                    ])

                    # Save predicted mel spectrogram to disk (debug)
                    mel_filename = 'mel-prediction-step-{}.npy'.format(step)
                    np.save(os.path.join(mel_dir, mel_filename), mel_prediction.T, allow_pickle=False)

                    # Save griffin-lim inverted wav for debug (mel -> wav)
                    wav = sess.run(GLGPU_mel_outputs, feed_dict={GLGPU_mel_inputs: mel_prediction})
                    wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
                    audio.save_wav(wav, os.path.join(wav_dir, 'step-{}-wave-from-mel.wav'.format(step)),
                                   sr=hparams.sample_rate)

                    log('Input at step {}: {}'.format(step, sequence_to_text(input_seq)))

            log('Tacotron training complete after {} global steps!'.format(args.tacotron_train_steps), slack=True)
            return save_dir

        except Exception as e:
            log('Exiting due to exception: {}'.format(e), slack=True)
            traceback.print_exc()
            coord.request_stop(e)
def synthesize(self, texts, basenames, mel_dir, wav_dir, plot_dir, mel_filenames):
    hparams = self._hparams
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]

    # [-max, max] or [0, max]
    T2_output_range = (-hparams.max_abs_value, hparams.max_abs_value) if hparams.symmetric_mels \
        else (0, hparams.max_abs_value)

    # Repeat the last sample until the number of samples is divisible by the batch size (last run scenario)
    while len(texts) % hparams.tacotron_synthesis_batch_size != 0:
        texts.append(texts[-1])
        basenames.append(basenames[-1])
        if mel_filenames is not None:
            mel_filenames.append(mel_filenames[-1])

    assert 0 == len(texts) % self._hparams.tacotron_num_gpus
    seqs = [np.asarray(text_to_sequence(text, cleaner_names)) for text in texts]
    input_lengths = [len(seq) for seq in seqs]
    size_per_device = len(seqs) // self._hparams.tacotron_num_gpus

    # Pad inputs according to each GPU's max length
    input_seqs = None
    split_infos = []
    for i in range(self._hparams.tacotron_num_gpus):
        device_input = seqs[size_per_device * i:size_per_device * (i + 1)]
        device_input, max_seq_len = self._prepare_inputs(device_input)
        input_seqs = np.concatenate((input_seqs, device_input), axis=1) if input_seqs is not None else device_input
        split_infos.append([max_seq_len, 0, 0, 0])

    feed_dict = {
        self.inputs: input_seqs,
        self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
    }
    feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32)

    mels, alignments, stop_tokens = self.session.run(
        [self.mel_outputs, self.alignments, self.stop_token_prediction],
        feed_dict=feed_dict)

    # Linearize outputs (n_gpus -> 1D)
    mels = [mel for gpu_mels in mels for mel in gpu_mels]
    alignments = [align for gpu_aligns in alignments for align in gpu_aligns]
    stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token]

    # Natural batch synthesis:
    # get mel lengths for the entire batch from the stop_token predictions
    target_lengths = self._get_output_lengths(stop_tokens)

    # Take off the batch-wise padding
    mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)]
    assert len(mels) == len(texts)

    mels = np.clip(mels, T2_output_range[0], T2_output_range[1])

    saved_mels_paths = []
    for i, mel in enumerate(mels):
        # Write the spectrogram to disk
        # Note: output mel-spectrogram files and target ones have the same names, just different folders
        mel_filename = os.path.join(mel_dir, 'mel-{}.npy'.format(basenames[i]))
        np.save(mel_filename, mel, allow_pickle=False)
        saved_mels_paths.append(mel_filename)

        # Save wav (mel -> wav)
        wav = self.session.run(self.GLGPU_mel_outputs, feed_dict={self.GLGPU_mel_inputs: mel})
        wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
        audio.save_wav(wav, os.path.join(wav_dir, 'wav-{}-mel.wav'.format(basenames[i])),
                       sr=hparams.sample_rate)

        # Save alignments
        plot.plot_alignment(alignments[i],
                            os.path.join(plot_dir, 'alignment-{}.png'.format(basenames[i])),
                            title='{}'.format(texts[i]), split_title=True,
                            max_len=target_lengths[i])

        # Save mel spectrogram plot
        plot.plot_spectrogram(mel,
                              os.path.join(plot_dir, 'mel-{}.png'.format(basenames[i])),
                              title='{}'.format(texts[i]), split_title=True)

    return saved_mels_paths