Example #1
def convert(predictor, df):
    a, b, c = next(df().get_data())
    pred_spec, r_spec = predictor(a, b, c)

    # Denormalization
    pred_spec = denormalize_db(pred_spec, hp.default.max_db, hp.default.min_db)
    r_spec = denormalize_db(r_spec, hp.default.max_db, hp.default.min_db)

    # Db to amp
    pred_spec = db2amp(pred_spec)
    r_spec = db2amp(r_spec)

    # Emphasize the magnitude
    pred_spec = np.power(pred_spec, hp.convert.emphasis_magnitude)
    r_spec = np.power(r_spec, hp.convert.emphasis_magnitude)

    # Spectrogram to waveform
    audio = np.array(
        list(map(lambda spec: spec2wav(spec.T, hp.default.n_fft, hp.default.win_length, hp.default.hop_length,
                                       hp.default.n_iter), pred_spec)))
    y_audio = np.array(
        list(map(lambda spec: spec2wav(spec.T, hp.default.n_fft, hp.default.win_length, hp.default.hop_length,
                                       hp.default.n_iter), r_spec)))

    # Apply inverse pre-emphasis
    audio = inv_preemphasis(audio, coeff=hp.default.preemphasis)
    y_audio = inv_preemphasis(y_audio, coeff=hp.default.preemphasis)

    # if hp.convert.one_full_wav:
    #     # Concatenate to a wav
    #     y_audio = np.reshape(y_audio, (1, y_audio.size), order='C')
    #     audio = np.reshape(audio, (1, audio.size), order='C')

    return audio, y_audio
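All of these snippets lean on a few audio helpers (denormalize_db, db2amp, inv_preemphasis) that are defined elsewhere in the source repository. For orientation, here is a minimal sketch of what such helpers conventionally compute; these are the standard dB and pre-emphasis formulas, not the repository's actual implementations:

import numpy as np
from scipy.signal import lfilter

def denormalize_db(norm, max_db, min_db):
    # Map a [0, 1]-normalized spectrogram back to decibels.
    return np.clip(norm, 0, 1) * (max_db - min_db) + min_db

def db2amp(db):
    # Decibels to linear amplitude: amp = 10 ** (db / 20).
    return np.power(10.0, db * 0.05)

def inv_preemphasis(x, coeff=0.97):
    # Invert the pre-emphasis filter y[t] = x[t] - coeff * x[t - 1].
    return lfilter([1], [1, -coeff], x)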
Example #2
def convert(predictor, df):
    pred_spec, y_spec, ppgs = predictor(next(df().get_data()))

    # Denormalization
    pred_spec = denormalize_db(pred_spec, hp.Default.max_db, hp.Default.min_db)
    y_spec = denormalize_db(y_spec, hp.Default.max_db, hp.Default.min_db)

    # Db to amp
    pred_spec = db2amp(pred_spec)
    y_spec = db2amp(y_spec)

    # Emphasize the magnitude
    pred_spec = np.power(pred_spec, hp.Convert.emphasis_magnitude)
    y_spec = np.power(y_spec, hp.Convert.emphasis_magnitude)

    # Spectrogram to waveform
    audio = np.array([spec2wav(spec.T, hp.Default.n_fft, hp.Default.win_length, hp.Default.hop_length,
                               hp.Default.n_iter) for spec in pred_spec])
    y_audio = np.array([spec2wav(spec.T, hp.Default.n_fft, hp.Default.win_length, hp.Default.hop_length,
                                 hp.Default.n_iter) for spec in y_spec])

    # Apply inverse pre-emphasis
    audio = inv_preemphasis(audio, coeff=hp.Default.preemphasis)
    y_audio = inv_preemphasis(y_audio, coeff=hp.Default.preemphasis)

    # if hp.Convert.one_full_wav:
    #     # Concatenate to a wav
    #     y_audio = np.reshape(y_audio, (1, y_audio.size), order='C')
    #     audio = np.reshape(audio, (1, audio.size), order='C')

    return audio, y_audio, ppgs
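The spec2wav helper used above reconstructs a time-domain waveform from a linear-magnitude spectrogram, typically via Griffin-Lim phase estimation. A minimal stand-in built on librosa (an assumption about the helper's behavior, not the original code):

import librosa

def spec2wav(mag, n_fft, win_length, hop_length, num_iters):
    # mag: (1 + n_fft // 2, time) linear-magnitude spectrogram.
    # Griffin-Lim iteratively estimates the phase that was discarded.
    return librosa.griffinlim(mag, n_iter=num_iters, hop_length=hop_length,
                              win_length=win_length)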
Example #3
def synth(self, text, save=None):
    inp = clean(text)
    print(inp)
    x = [self.c2i[c] for c in inp + 'E']
    x += [0] * (hp.maxlen - len(x))
    x = np.array(x)
    x = x.reshape(1, -1)
    with self.melsession.as_default():
        preds = np.zeros((1, 1, hp.n_mels), np.float32)
        cnt = hp.Tyr
        for j in range(hp.Tyr):
            sys.stdout.write('\rProcessing %d' % j)
            sys.stdout.flush()
            _preds, a = self.melsession.run(
                [self.melmodel.mel_output, self.melmodel.A], {
                    self.melmodel.text: x,
                    self.melmodel.mel: preds
                })
            preds = np.concatenate((np.zeros((1, 1, hp.n_mels)), _preds),
                                   axis=1)
            cnt -= 1
            if np.argmax(a[0, :, -1]) >= len(inp) - 3:
                cnt = min(cnt, 10)
            if cnt <= 0:
                break
    with self.magsession.as_default():
        wav = self.magsession.run(self.magmodel.wav_output,
                                  {self.magmodel.mel: preds})
        wav = audio.inv_preemphasis(wav)
        if save is not None:
            audio.save_wav(wav[0], save)
        else:
            out = io.BytesIO()
            audio.save_wav(wav[0], out)
            return out.getvalue()
Example #4
def do_convert(predictor, input_name, logdir2):
    convert_s = datetime.datetime.now()

    # Load input audio
    input_audio, _ = librosa.load(input_name, sr=hp.default.sr, dtype=np.float64)

    # Extract F0 from input audio first
    input_f0, t_table = pw.dio(input_audio, hp.default.sr)
    input_f0 = pw.stonemask(input_audio, input_f0, t_table, hp.default.sr)

    # Get MFCC, Spectral Envelope, and Aperiodicity
    mfcc = _get_mfcc(input_audio, hp.default.n_fft, hp.default.win_length, hp.default.hop_length)
    mfcc = np.expand_dims(mfcc, axis=0)

    input_ap = pw.d4c(input_audio, input_f0, t_table, hp.default.sr, fft_size=hp.default.n_fft)

    input_sp_en = _get_spectral_envelope(preemphasis(input_audio, coeff=hp.default.preemphasis), hp.default.n_fft)
    plt.imsave('./converted/debug/input_sp_en_original.png', input_sp_en, cmap='binary')
    input_sp_en = np.expand_dims(input_sp_en, axis=0)

    # Convert Spectral Envelope
    output_sp_en, ppgs = convert_spectral_envelope(predictor, mfcc, input_sp_en)
    output_sp_en = np.squeeze(output_sp_en.astype(np.float64), axis=0)

    preproc_s = datetime.datetime.now()
    # Denormalization
    output_sp_en = denormalize_db(output_sp_en, hp.default.max_db, hp.default.min_db)

    # Db to amp
    output_sp_en = librosa.db_to_amplitude(output_sp_en)

    # Emphasize the magnitude
    output_sp_en = np.power(output_sp_en, hp.convert.emphasis_magnitude)

    preproc_e = datetime.datetime.now()
    preproc_t = preproc_e - preproc_s
    print("Pre-Processing time:{}s".format(preproc_t.seconds))

    # F0 transformation with WORLD Vocoder
    output_f0 = f0_adapt(input_f0, logdir2)

    # Synthesize audio and de-emphasize
    output_audio = pw.synthesize(output_f0, output_sp_en, input_ap, hp.default.sr)
    output_audio = inv_preemphasis(output_audio, coeff=hp.default.preemphasis)

    # Saving output_audio to 32-bit Float wav file
    output_audio = output_audio.astype(np.float32)
    librosa.output.write_wav(path="./converted/"+input_name,y=output_audio,sr=hp.default.sr)

    # Saving PPGS data to Grayscale Image and raw binary file
    ppgs = np.squeeze(ppgs, axis=0)
    plt.imsave('./converted/debug/'+input_name+'.png', ppgs, cmap='binary')
    np.save('./converted/debug/'+input_name+'.npy', ppgs)

    convert_e = datetime.datetime.now()
    convert_time = convert_e - convert_s
    print("Total Converting Time:{}s".format(convert_time.seconds))
Example #5
def convert(predictor, df):

    t = next(df().get_data())
    print(t[0].shape)
    pred_spec, y_spec, ppgs = predictor(t)  # reuse the batch fetched above instead of pulling a second one

    # Denormalization
    pred_spec = denormalize_db(pred_spec, hp.default.max_db, hp.default.min_db)
    y_spec = denormalize_db(y_spec, hp.default.max_db, hp.default.min_db)

    # Db to amp
    pred_spec = db2amp(pred_spec)
    y_spec = db2amp(y_spec)

    # Emphasize the magnitude
    pred_spec = np.power(pred_spec, hp.convert.emphasis_magnitude)
    y_spec = np.power(y_spec, hp.convert.emphasis_magnitude)

    # Spectrogram to waveform
    audio = np.array(
        list(map(lambda spec: spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                                       hp.default.hop_length, hp.default.n_iter), pred_spec)))
    librosa.output.write_wav(
        '/home/user/vilin/deep-voice-conversion/output/file_trim_8.wav',
        audio[0], hp.default.sr)

    y_audio = np.array(
        list(map(lambda spec: spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                                       hp.default.hop_length, hp.default.n_iter), y_spec)))

    # Apply inverse pre-emphasis
    audio = inv_preemphasis(audio, coeff=hp.default.preemphasis)
    y_audio = inv_preemphasis(y_audio, coeff=hp.default.preemphasis)

    # if hp.convert.one_full_wav:
    #     # Concatenate to a wav
    #     y_audio = np.reshape(y_audio, (1, y_audio.size), order='C')
    #     audio = np.reshape(audio, (1, audio.size), order='C')

    return audio, y_audio, ppgs
Example #6
def convert(predictor, data):
    x_mfccs, y_spec, y_mel = data
    x_mfccs = np.array(x_mfccs).reshape((-1, ) + x_mfccs.shape)
    y_spec = np.array(y_spec).reshape((-1, ) + y_spec.shape)
    y_mel = np.array(y_mel).reshape((-1, ) + y_mel.shape)
    pred_spec, y_spec, ppgs = predictor(x_mfccs, y_spec, y_mel)

    # Denormalization
    pred_spec = denormalize_db(pred_spec, hp.Default.max_db, hp.Default.min_db)
    y_spec = denormalize_db(y_spec, hp.Default.max_db, hp.Default.min_db)

    # Db to amp
    pred_spec = db2amp(pred_spec)
    y_spec = db2amp(y_spec)

    # Emphasize the magnitude
    pred_spec = np.power(pred_spec, hp.Convert.emphasis_magnitude)
    y_spec = np.power(y_spec, hp.Convert.emphasis_magnitude)

    # Spectrogram to waveform
    audio = np.array([
        spec2wav(spec.T, hp.Default.n_fft, hp.Default.win_length,
                 hp.Default.hop_length, hp.Default.n_iter)
        for spec in pred_spec
    ])
    y_audio = np.array([
        spec2wav(spec.T, hp.Default.n_fft, hp.Default.win_length,
                 hp.Default.hop_length, hp.Default.n_iter) for spec in y_spec
    ])

    # Apply inverse pre-emphasis
    audio = inv_preemphasis(audio, coeff=hp.Default.preemphasis)
    y_audio = inv_preemphasis(y_audio, coeff=hp.Default.preemphasis)

    if hp.Convert.one_full_wav:
        # Concatenate to a wav
        y_audio = np.reshape(y_audio, (1, y_audio.size), order='C')
        audio = np.reshape(audio, (1, audio.size), order='C')

    return audio, y_audio, ppgs
Example #7
def convert(predictor, df):
    # TODO need to fix reading in with duration
    pred_spec, y_spec, ppgs = predictor(next(df().get_data()))

    # Denormalization
    pred_spec = denormalize_db(pred_spec, hp.default.max_db, hp.default.min_db)
    y_spec = denormalize_db(y_spec, hp.default.max_db, hp.default.min_db)

    # Db to amp
    pred_spec = db2amp(pred_spec)
    y_spec = db2amp(y_spec)

    # Emphasize the magnitude
    pred_spec = np.power(pred_spec, hp.convert.emphasis_magnitude)
    y_spec = np.power(y_spec, hp.convert.emphasis_magnitude)

    # Spectrogram to waveform
    audio = np.array(
        list(map(lambda spec: spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                                       hp.default.hop_length, hp.default.n_iter), pred_spec)))
    y_audio = np.array(
        list(map(lambda spec: spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                                       hp.default.hop_length, hp.default.n_iter), y_spec)))

    # Apply inverse pre-emphasis
    audio = inv_preemphasis(audio, coeff=hp.default.preemphasis)
    y_audio = inv_preemphasis(y_audio, coeff=hp.default.preemphasis)

    if hp.convert.one_full_wav:
        # Concatenate to a wav
        y_audio = np.reshape(y_audio, (1, y_audio.size), order='C')
        audio = np.reshape(audio, (1, audio.size), order='C')

    return audio, y_audio, ppgs
Example #8
def sumimage(mel, mel_name):
    mel = mel  #+ 0.001 * np.random.standard_normal([hp.batch_size, hp.duration * hp.n_mels, hp.n_mels])
    mel_image = mel.transpose(0, 2, 1)
    heatmap = np.expand_dims(mel_image, 3)
    tf.summary.image(mel_name, heatmap, max_outputs=mel_image.shape[0])

    mel_basis = librosa.filters.mel(hp.sr, hp.n_fft, hp.n_mels)
    mel_basis = np.mat(mel_basis)
    mel_basis_I = mel_basis.I
    mel_spec = []

    for i in range(len(mel)):
        print(mel_name)
        print(np.max(mel[i]))
        print(np.min(mel[i]))
        print(np.mean(mel[i]))
        #mel[i] = mel[i] * (0.6 / np.max(mel[i]))
        mel_db_item = np.transpose(mel[i])
        mel_db_item = denormalize_0_1(mel_db_item, hp.max_db, hp.min_db)
        #mel_db_item = np.maximum(mel_db_item, 0)
        # = normalize_0_1(mel_db_item, hp.default.max_db, hp.default.min_db)

        print(np.max(mel_db_item))
        print(np.mean(mel_db_item))

        mel_item = db2amp(mel_db_item)
        print(np.max(mel_item))

        mag_item = np.dot(mel_basis_I, mel_item)
        print(np.max(mag_item))
        mag_item = np.maximum(mag_item, 0)
        spec_item = np.transpose(mag_item)

        #mag_db_item = amp2db(mag_item)
        #mag_db_item = normalize_0_1(mag_db_item, hp.default.max_db, hp.default.min_db)
        #mag_db_item = np.transpose(mag_db_item)
        #specitem = np.transpose(magitem)
        #mel_complex = mel_D_abs + np.complex(0, 0)
        #specitem = librosa.istft(stft_matrix=mel_complex, hop_length=hp.default.hop_length, win_length=hp.default.win_length)
        mel_spec.append(spec_item.getA())

    mel_spec = np.power(mel_spec, hp.emphasis_magnitude)
    mel_audio = np.array(
        list(map(lambda spec: spec2wav(spec.T, hp.n_fft, hp.win_length,
                                       hp.hop_length, hp.n_iter), mel_spec)))

    mel_audio = inv_preemphasis(mel_audio, coeff=hp.preemphasis)
    tf.summary.audio(mel_name, mel_audio, hp.sr, max_outputs=hp.batch_size)
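sumimage inverts the mel filterbank with an explicit matrix inverse (np.mat(mel_basis).I), which can yield negative magnitudes that then have to be clamped. Assuming librosa >= 0.7 is available, the built-in non-negative least-squares inversion is a gentler alternative:

import numpy as np
import librosa

sr, n_fft, n_mels = 16000, 512, 80           # assumed settings
mel = np.abs(np.random.randn(n_mels, 100))   # stand-in mel magnitudes

# Invert the mel filterbank by non-negative least squares;
# power=1.0 because the input is a linear-magnitude spectrogram.
mag = librosa.feature.inverse.mel_to_stft(mel, sr=sr, n_fft=n_fft, power=1.0)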
Example #9
def sumspecimage(spec, spec_name):
    spec = denormalize_db(spec, hp.max_db, hp.min_db)
    spec = db2amp(spec)

    spec_image = spec.transpose(0, 2, 1)
    heatmap = np.expand_dims(spec_image, 3)
    tf.summary.image(spec_name, heatmap, max_outputs=spec_image.shape[0])

    out_spec = np.power(np.maximum(spec, 0), 1)  #hp.emphasis_magnitude)
    out_audio = np.array(
        list(map(lambda spec: spec2wav(spec.T, hp.n_fft, hp.win_length,
                                       hp.hop_length, hp.n_iter), out_spec)))

    out_audio = inv_preemphasis(out_audio, coeff=hp.preemphasis)
    tf.summary.audio(spec_name, out_audio, hp.sr, max_outputs=hp.batch_size)
Example #10
def convert(predictor, mfcc, spec, mel_spec):
    print("convert")
    pred_s = datetime.datetime.now()
    pred_spec, _, ppgs = predictor(mfcc, spec, mel_spec)
    pred_e = datetime.datetime.now()
    pred_t = pred_e - pred_s
    print("Predicting time:{}s".format(pred_t.seconds))

    preproc_s = datetime.datetime.now()
    # Denormalization
    print("denormalize_db")
    pred_spec = denormalize_db(pred_spec, hp.default.max_db, hp.default.min_db)

    # Db to amp
    print("db2amp")
    pred_spec = db2amp(pred_spec)

    # Emphasize the magnitude
    print("emphasize")
    pred_spec = np.power(pred_spec, hp.convert.emphasis_magnitude)

    preproc_e = datetime.datetime.now()
    preproc_t = preproc_e - preproc_s
    print("Pre-Processing time:{}s".format(preproc_t.seconds))

    audio = []
    # Spectrogram to waveform
    recon_s = datetime.datetime.now()

    print("spec2wav")
    audio.append(
        spec2wav_lws(pred_spec[0], hp.default.n_fft, hp.default.win_length,
                     hp.default.hop_length, hp.default.lws_mode))
    recon_e = datetime.datetime.now()
    recon_t = recon_e - recon_s
    print("Converting Spectrogram-to-Wave time:{}s".format(recon_t.seconds))

    audio = np.array(audio)
    # print('audio.shape : ', audio.shape)

    # Apply inverse pre-emphasis
    audio = inv_preemphasis(audio, coeff=hp.default.preemphasis)
    return audio[0], ppgs
Example #11
def convert(predictor, tensor):
    # tensor = next(df().get_data())
    # print(tensor.shape)
    pred_spec, y_spec, ppgs = predictor(tensor)
    # pred_spec, y_spec, ppgs = predictor(tf.expand_dims(df, 0))

    # Denormalization
    pred_spec = denormalize_db(pred_spec, hp.default.max_db, hp.default.min_db)
    # y_spec = denormalize_db(y_spec, hp.default.max_db, hp.default.min_db)

    # Db to amp
    pred_spec = db2amp(pred_spec)
    # y_spec = db2amp(y_spec)

    # Emphasize the magnitude
    pred_spec = np.power(pred_spec, hp.convert.emphasis_magnitude)
    # y_spec = np.power(y_spec, hp.convert.emphasis_magnitude)

    # Spectrogram to waveform
    audio = np.array(
        list(map(lambda spec: spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                                       hp.default.hop_length, hp.default.n_iter), pred_spec)))
    # y_audio = np.array(map(lambda spec: spec2wav(spec.T, hp.default.n_fft, hp.default.win_length, hp.default.hop_length,
    #                                              hp.default.n_iter), y_spec))

    # Apply inverse pre-emphasis
    audio = inv_preemphasis(audio, coeff=hp.default.preemphasis)
    # y_audio = inv_preemphasis(y_audio, coeff=hp.default.preemphasis)
    # pickle.dump( y_audio, open( "y-audio.p", "wb" ) )
    # pickle.dump( audio, open( "o-audio.p", "wb" ) )

    # if hp.convert.one_full_wav:
    #     # Concatenate to a wav
    #     y_audio = np.reshape(y_audio, (1, y_audio.size), order='C')
    #     audio = np.reshape(audio, (1, audio.size), order='C')

    # return audio, y_audio, ppgs
    return audio, ppgs
Example #12
def train(log_dir, args, hparams):
	save_dir = os.path.join(log_dir, 'taco_pretrained')
	plot_dir = os.path.join(log_dir, 'plots')
	wav_dir = os.path.join(log_dir, 'wavs')
	mel_dir = os.path.join(log_dir, 'mel-spectrograms')
	tensorboard_dir = os.path.join(log_dir, 'tacotron_events')

	os.makedirs(save_dir, exist_ok=True)
	os.makedirs(plot_dir, exist_ok=True)
	os.makedirs(wav_dir, exist_ok=True)
	os.makedirs(mel_dir, exist_ok=True)
	os.makedirs(tensorboard_dir, exist_ok=True)


	checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt')
	input_path = args.input_dir


	log('Checkpoint path: {}'.format(checkpoint_path))
	log('Loading training data from: {}'.format(input_path))
	log(hparams_debug_string())

	#Start by setting a seed for repeatability
	tf.set_random_seed(hparams.tacotron_random_seed)

	#Set up data feeder
	coord = tf.train.Coordinator()
	with tf.variable_scope('datafeeder') as scope:
		feeder = Feeder(coord, input_path, hparams)

	#Set up model:
	global_step = tf.Variable(0, name='global_step', trainable=False)
	with tf.variable_scope('Tacotron_model', reuse=tf.AUTO_REUSE) as scope:
		model = Tacotron(hparams)
		model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets, feeder.token_targets,
			targets_lengths=feeder.targets_lengths, global_step=global_step,
			is_training=True, split_infos=feeder.split_infos)
		model.add_loss()
		model.add_optimizer(global_step)
		stats = _add_train_stats(model, hparams)


	GLGPU_mel_inputs = tf.placeholder(tf.float32, (None, hparams.num_mels), name='GLGPU_mel_inputs')
	GLGPU_mel_outputs = audio.inv_mel_spectrogram_tensorflow(GLGPU_mel_inputs, hparams)

	#Book keeping
	step = 0
	time_window = ValueWindow(100)
	loss_window = ValueWindow(100)
	saver = tf.train.Saver(max_to_keep=20)

	log('Tacotron training set to a maximum of {} steps'.format(args.tacotron_train_steps))

	#Memory allocation on the GPU as needed
	config = tf.ConfigProto()
	config.gpu_options.allow_growth = True
	config.allow_soft_placement = True

	#Train
	with tf.Session(config=config) as sess:
		try:
			summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)

			sess.run(tf.global_variables_initializer())

			#saved model restoring
			if args.restore:
				# Restore saved model if the user requested it, default = True
				try:
					checkpoint_state = tf.train.get_checkpoint_state(save_dir)

					if (checkpoint_state and checkpoint_state.model_checkpoint_path):
						log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path), slack=True)
						saver.restore(sess, checkpoint_state.model_checkpoint_path)

					else:
						log('No model to load at {}'.format(save_dir), slack=True)
						saver.save(sess, checkpoint_path, global_step=global_step)

				except tf.errors.OutOfRangeError as e:
					log('Cannot restore checkpoint: {}'.format(e), slack=True)
			else:
				log('Starting new training!', slack=True)
				saver.save(sess, checkpoint_path, global_step=global_step)

			#initializing feeder
			feeder.start_threads(sess)

			#Training loop
			while not coord.should_stop() and step < args.tacotron_train_steps:
				start_time = time.time()
				step, loss, opt = sess.run([global_step, model.loss, model.optimize])
				time_window.append(time.time() - start_time)
				loss_window.append(loss)
				message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
					step, time_window.average, loss, loss_window.average)
				log(message, end='\r', slack=(step % args.checkpoint_interval == 0))

				if np.isnan(loss) or loss > 100.:
					log('Loss exploded to {:.5f} at step {}'.format(loss, step))
					raise Exception('Loss exploded')

				if step % args.summary_interval == 0:
					log('\nWriting summary at step: {}'.format(step))
					summary_writer.add_summary(sess.run(stats), step)

				if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or step == 300:
					#Save model and current global step
					saver.save(sess, checkpoint_path, global_step=global_step)

					log('\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..')

					input_seq, mel_prediction = sess.run([
						model.tower_inputs[0][0],
						model.tower_mel_outputs[0][0],
						])

					#save predicted mel spectrogram to disk (debug)
					mel_filename = 'mel-prediction-step-{}.npy'.format(step)
					np.save(os.path.join(mel_dir, mel_filename), mel_prediction.T, allow_pickle=False)

					#save griffin lim inverted wav for debug (mel -> wav)

					wav = sess.run(GLGPU_mel_outputs, feed_dict={GLGPU_mel_inputs: mel_prediction})
					wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
					audio.save_wav(wav, os.path.join(wav_dir, 'step-{}-wave-from-mel.wav'.format(step)), sr=hparams.sample_rate)

					log('Input at step {}: {}'.format(step, sequence_to_text(input_seq)))

			log('Tacotron training complete after {} global steps!'.format(args.tacotron_train_steps), slack=True)
			return save_dir

		except Exception as e:
			log('Exiting due to exception: {}'.format(e), slack=True)
			traceback.print_exc()
			coord.request_stop(e)
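The training loop tracks smoothed step time and loss through ValueWindow. A minimal sketch of such a running-average window (the real class ships with the repository; this is an assumption about its behavior):

class ValueWindow:
    def __init__(self, window_size=100):
        self._window_size = window_size
        self._values = []

    def append(self, x):
        # Keep only the most recent window_size values.
        self._values = self._values[-(self._window_size - 1):] + [x]

    @property
    def average(self):
        return sum(self._values) / max(len(self._values), 1)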
Example #13
    def synthesize(self, texts, basenames, mel_dir, wav_dir, plot_dir,
                   mel_filenames):
        hparams = self._hparams
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        #[-max, max] or [0,max]
        T2_output_range = (
            -hparams.max_abs_value,
            hparams.max_abs_value) if hparams.symmetric_mels else (
                0, hparams.max_abs_value)

        #Repeat last sample until number of samples is divisible by the number of GPUs (last run scenario)
        while len(texts) % hparams.tacotron_synthesis_batch_size != 0:
            texts.append(texts[-1])
            basenames.append(basenames[-1])
            if mel_filenames is not None:
                mel_filenames.append(mel_filenames[-1])

        assert 0 == len(texts) % self._hparams.tacotron_num_gpus
        seqs = [
            np.asarray(text_to_sequence(text, cleaner_names)) for text in texts
        ]
        input_lengths = [len(seq) for seq in seqs]
        size_per_device = len(seqs) // self._hparams.tacotron_num_gpus

        #Pad inputs according to each GPU max length
        input_seqs = None
        split_infos = []
        for i in range(self._hparams.tacotron_num_gpus):
            device_input = seqs[size_per_device * i:size_per_device * (i + 1)]
            device_input, max_seq_len = self._prepare_inputs(device_input)
            input_seqs = np.concatenate(
                (input_seqs, device_input),
                axis=1) if input_seqs is not None else device_input
            split_infos.append([max_seq_len, 0, 0, 0])
        feed_dict = {
            self.inputs: input_seqs,
            self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
        }
        feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32)
        mels, alignments, stop_tokens = self.session.run(
            [self.mel_outputs, self.alignments, self.stop_token_prediction],
            feed_dict=feed_dict)

        #Linearize outputs (n_gpus -> 1D)
        mels = [mel for gpu_mels in mels for mel in gpu_mels]
        alignments = [
            align for gpu_aligns in alignments for align in gpu_aligns
        ]
        stop_tokens = [
            token for gpu_token in stop_tokens for token in gpu_token
        ]

        #Natural batch synthesis
        #Get Mel lengths for the entire batch from stop_tokens predictions
        target_lengths = self._get_output_lengths(stop_tokens)

        #Take off the batch wise padding
        mels = [
            mel[:target_length, :]
            for mel, target_length in zip(mels, target_lengths)
        ]
        assert len(mels) == len(texts)

        mels = np.clip(mels, T2_output_range[0], T2_output_range[1])

        saved_mels_paths = []
        for i, mel in enumerate(mels):

            # Write the spectrogram to disk
            # Note: outputs mel-spectrogram files and target ones have same names, just different folders
            mel_filename = os.path.join(mel_dir,
                                        'mel-{}.npy'.format(basenames[i]))
            np.save(mel_filename, mel, allow_pickle=False)
            saved_mels_paths.append(mel_filename)

            #save wav (mel -> wav)

            wav = self.session.run(self.GLGPU_mel_outputs,
                                   feed_dict={self.GLGPU_mel_inputs: mel})
            wav = audio.inv_preemphasis(wav, hparams.preemphasis,
                                        hparams.preemphasize)

            audio.save_wav(wav,
                           os.path.join(wav_dir,
                                        'wav-{}-mel.wav'.format(basenames[i])),
                           sr=hparams.sample_rate)

            #save alignments
            plot.plot_alignment(alignments[i],
                                os.path.join(
                                    plot_dir,
                                    'alignment-{}.png'.format(basenames[i])),
                                title='{}'.format(texts[i]),
                                split_title=True,
                                max_len=target_lengths[i])

            #save mel spectrogram plot
            plot.plot_spectrogram(mel,
                                  os.path.join(
                                      plot_dir,
                                      'mel-{}.png'.format(basenames[i])),
                                  title='{}'.format(texts[i]),
                                  split_title=True)

        return saved_mels_paths
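synthesize pads each GPU's slice of the batch to a common length via self._prepare_inputs before concatenating. A plausible sketch of that padding step, inferred from how its result is used (hypothetical, not the repository's actual helper):

import numpy as np

def _prepare_inputs(inputs):
    # Right-pad every sequence with zeros to the batch's max length.
    max_len = max(len(x) for x in inputs)
    padded = np.stack([np.pad(x, (0, max_len - len(x)), mode='constant')
                       for x in inputs])
    return padded, max_len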