Beispiel #1
0
def save_states(global_step,
                mel_outputs,
                linear_outputs,
                attn,
                y,
                checkpoint_dir=None):
    """Write debugging artefacts for one sample of the current batch.

    Saves, under ``checkpoint_dir``, an alignment plot, the predicted
    spectrogram plot, a waveform reconstructed from the predicted spectrogram
    and the target spectrogram plot. Every file name starts with
    ``step{global_step}_``.

    Args:
        global_step: Step number used in file names and in the plot caption.
        mel_outputs: Not used by this function; kept for interface
            compatibility.
        linear_outputs: Batch of predicted spectrograms; each item must
            support ``.cpu().data.numpy()``.
        attn: Batch of attention alignments (same accessors as above).
        y: Batch of target spectrograms (same accessors as above).
        checkpoint_dir: Output directory. Despite the ``None`` default a real
            path is required, because it is handed to ``os.path.join``.
    """
    # Keep using the second sample of the batch as before, but fall back to
    # the first one when the batch holds a single item (index 1 used to raise
    # IndexError in that case).
    idx = min(1, len(attn) - 1)

    def _artifact_path(suffix):
        return os.path.join(checkpoint_dir,
                            "step{}_{}".format(global_step, suffix))

    def _sample_as_numpy(batch):
        return batch[idx].cpu().data.numpy()

    # Alignment (plotted transposed, as in the original code).
    path = _artifact_path("alignment.png")
    alignment = _sample_as_numpy(attn)
    plot_alignment(alignment.T,
                   path,
                   info="tacotron, step={}".format(global_step))

    # Predicted spectrogram
    path = _artifact_path("predicted_spectrogram.png")
    predicted = _sample_as_numpy(linear_outputs)
    plot_spectrogram(predicted, path)

    # Predicted audio signal, reconstructed from the predicted spectrogram.
    signal = audio.inv_spectrogram(predicted.T)
    path = _artifact_path("predicted.wav")
    audio.save_wav(signal, path)

    # Target spectrogram
    path = _artifact_path("target_spectrogram.png")
    target = _sample_as_numpy(y)
    plot_spectrogram(target, path)
Beispiel #2
0
def save_and_plot_fn(args, log_dir, step, loss, prefix):
    """Save the audio and the alignment plot for one sample.

    Args:
        args: ``(idx, (seq, spec, align))`` - the sample index, followed by
            the input sequence, the spectrogram and the alignment.
        log_dir: Directory that receives the ``.wav`` and ``.png`` files.
        step: Training step, used in file names and in the plot caption.
        loss: Loss value shown in the plot caption.
        prefix: Leading part of both file names.
    """
    idx, (seq, spec, align) = args

    stem = '{}-step-{:09d}'.format(prefix, step)
    audio_path = os.path.join(log_dir,
                              stem + '-audio{:03d}.wav'.format(idx))
    align_path = os.path.join(log_dir,
                              stem + '-align{:03d}.png'.format(idx))

    save_audio(inv_spectrogram(spec.T), audio_path)

    info_text = 'step={:d}, loss={:.5f}'.format(step, loss)

    # Jamo handling is switched on only when the Korean cleaner is configured.
    cleaner_names = [name.strip() for name in hparams.cleaners.split(',')]
    is_korean = 'korean_cleaners' in cleaner_names
    if is_korean:
        log('Training korean : Use jamo')
    else:
        log('Training non-korean : X use jamo')

    label = sequence_to_text(seq,
                             skip_eos_and_pad=True,
                             combine_jamo=is_korean)
    plot.plot_alignment(align,
                        align_path,
                        info=info_text,
                        text=label,
                        isKorean=is_korean)
Beispiel #3
0
def save_current_model(args, checkpoint_path, global_step, hparams, loss,
                       model, plot_dir, saver, sess, step, wav_dir):
    """Checkpoint the model and dump audio/plots for the first batch item.

    After saving the checkpoint, one ``sess.run`` fetches the first element
    of the model's inputs, predictions, alignments and targets. From these
    the function writes two inverted waveforms (linear and mel) to
    ``wav_dir`` and the spectrogram/alignment plots to ``plot_dir``, then
    logs the decoded input sequence.
    """

    def _caption():
        # Evaluated at each call site so every plot gets its own timestamp.
        return '{}, {}, step={}, loss={:.5f}'.format(args.model,
                                                     time_string(), step, loss)

    # Save model and current global step
    saver.save(sess, checkpoint_path, global_step=global_step)
    log('\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..'
        )

    fetches = [
        model.inputs[0],
        model.post_net_predictions[0],
        model.mag_pred[0],
        model.alignments[0],
        model.targets_mel[0],
        model.targets_length[0],
        model.targets_mag[0],
    ]
    (input_seq, mel_prediction, linear_prediction, attention_mask_sample,
     targets_mel, target_length, linear_target) = sess.run(fetches)
    alignments, alignment_titles = get_alignments(attention_mask_sample)

    # Inverted wav for debugging (linear -> wav)
    linear_wav = audio.inv_linear_spectrogram(linear_prediction.T, hparams)
    linear_wav_path = os.path.join(wav_dir, '{}-linear.wav'.format(step))
    audio.save_wav(linear_wav, linear_wav_path, sr=hparams.sample_rate)

    # Real and predicted linear spectrogram plot (control purposes)
    plot.plot_spectrogram(
        linear_prediction,
        os.path.join(plot_dir, '{}-linear-spectrogram.png'.format(step)),
        title=_caption(),
        target_spectrogram=linear_target,
        max_len=target_length,
        auto_aspect=True)

    # Inverted wav for debugging (mel -> wav)
    mel_wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
    mel_wav_path = os.path.join(wav_dir, '{}-mel.wav'.format(step))
    audio.save_wav(mel_wav, mel_wav_path, sr=hparams.sample_rate)

    # One alignment plot per returned alignment (control purposes)
    for i, alignment in enumerate(alignments):
        align_name = '{}_{}-align.png'.format(step, alignment_titles[i])
        plot.plot_alignment(
            alignment,
            os.path.join(plot_dir, align_name),
            title=_caption(),
            max_len=target_length // hparams.reduction_factor)

    # Real and predicted mel spectrogram plot (control purposes)
    plot.plot_spectrogram(
        mel_prediction,
        os.path.join(plot_dir, '{}-mel-spectrogram.png'.format(step)),
        title=_caption(),
        target_spectrogram=targets_mel,
        max_len=target_length)

    log('Input at step {}: {}'.format(step, sequence_to_text(input_seq)))
Beispiel #4
0
    def plot_result(self, mel_pred, mel_target, alig):
        """Plot the predicted mel spectrogram and the alignment.

        Both images go to ``<outdir>/plots`` (created if missing), where
        ``outdir`` comes from ``self.config``; file names carry the current
        ``self.steps`` value.

        Args:
            mel_pred: Predicted spectrogram to plot.
            mel_target: Target spectrogram, passed as ``target_spectrogram``.
            alig: Alignment handed to ``plot_alignment``.
        """
        plot_dir = os.path.join(self.config['outdir'], 'plots')
        os.makedirs(plot_dir, exist_ok=True)

        mel_path = os.path.join(plot_dir,
                                'mel-before-{}.png'.format(self.steps))
        plot_spectrogram(mel_pred, mel_path, target_spectrogram=mel_target)

        alig_path = os.path.join(plot_dir, 'alig-{}.png'.format(self.steps))
        plot_alignment(alig, alig_path)
Beispiel #5
0
def save_and_plot_fn(args, log_dir, step, loss, prefix):
    """Save the audio and the alignment plot for one sample.

    Args:
        args: ``(idx, (seq, spec, align))`` - the sample index, followed by
            the input sequence, the spectrogram and the alignment.
        log_dir: Directory that receives the ``.wav`` and ``.png`` files.
        step: Training step, used in file names and in the plot caption.
        loss: Loss value shown in the plot caption.
        prefix: Leading part of both file names.
    """
    idx, (seq, spec, align) = args

    # NOTE(review): the alignment image shares the "audio" stem with the wav
    # file (only the extension differs) - confirm whether "align" was meant.
    stem = '{}-step-{:09d}-audio{:03d}'.format(prefix, step, idx)
    audio_path = os.path.join(log_dir, stem + '.wav')
    align_path = os.path.join(log_dir, stem + '.png')

    save_audio(inv_spectrogram(spec.T), audio_path)

    info_text = 'step={:d}, loss={:.5f}'.format(step, loss)
    label = sequence_to_text(seq, skip_eos_and_pad=True, combine_jamo=True)
    plot.plot_alignment(align, align_path, info=info_text, text=label)
Beispiel #6
0
def train(log_dir, args):
	"""Run the training loop, checkpointing and plotting at fixed intervals.

	Args:
		log_dir: Directory for summaries, the step counter file, saved mel
			predictions, ``plots/`` and ``pretrained/`` (checkpoints).
		args: Options object; this function reads ``base_dir``, ``input``,
			``model``, ``restore``, ``summary_interval`` and
			``checkpoint_interval``.

	Any exception raised inside the session (including the deliberate
	"Loss exploded" one) is logged and forwarded to the coordinator instead
	of propagating to the caller.
	"""
	save_dir = os.path.join(log_dir, 'pretrained/')
	checkpoint_path = os.path.join(save_dir, 'model.ckpt')
	input_path = os.path.join(args.base_dir, args.input)
	plot_dir = os.path.join(log_dir, 'plots')
	os.makedirs(plot_dir, exist_ok=True)
	log('Checkpoint path: {}'.format(checkpoint_path))
	log('Loading training data from: {}'.format(input_path))
	log('Using model: {}'.format(args.model))
	log(hparams_debug_string())

	#Set up data feeder
	coord = tf.train.Coordinator()
	with tf.variable_scope('datafeeder') as scope:
		feeder = Feeder(coord, input_path, hparams)

	#Set up model:
	step_count = 0
	try:
		#simple text file to keep count of global step
		with open(os.path.join(log_dir, 'step_counter.txt'), 'r') as file:
			step_count = int(file.read())
	except (OSError, ValueError):
		# Missing/unreadable file or non-numeric content: start from step 0.
		# (Was a bare ``except:``, which also swallowed KeyboardInterrupt.)
		print('no step_counter file found, assuming there is no saved checkpoint')

	global_step = tf.Variable(step_count, name='global_step', trainable=False)
	with tf.variable_scope('model') as scope:
		model = create_model(args.model, hparams)
		model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets, feeder.token_targets)
		model.add_loss()
		model.add_optimizer(global_step)
		stats = add_stats(model)

	#Book keeping
	step = 0
	time_window = ValueWindow(100)
	loss_window = ValueWindow(100)
	saver = tf.train.Saver(max_to_keep=5)

	#Memory allocation on the GPU as needed
	config = tf.ConfigProto()
	config.gpu_options.allow_growth = True

	#Train
	with tf.Session(config=config) as sess:
		try:
			summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
			sess.run(tf.global_variables_initializer())

			#saved model restoring
			# Must be bound even when no restore is requested (or the lookup
			# fails); previously the check below raised NameError on a fresh
			# run and training stopped before the first step.
			checkpoint_state = None
			if args.restore:
				#Restore saved model if the user requested it, Default = True.
				try:
					checkpoint_state = tf.train.get_checkpoint_state(save_dir)
				except tf.errors.OutOfRangeError as e:
					log('Cannot restore checkpoint: {}'.format(e))

			if checkpoint_state and checkpoint_state.model_checkpoint_path:
				log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path))
				saver.restore(sess, checkpoint_state.model_checkpoint_path)
			elif not args.restore:
				log('Starting new training!')
			else:
				log('No model to load at {}'.format(save_dir))

			#initiating feeder
			feeder.start_in_session(sess)

			#Training loop
			while not coord.should_stop():
				start_time = time.time()
				step, loss, opt = sess.run([global_step, model.loss, model.optimize])
				time_window.append(time.time() - start_time)
				loss_window.append(loss)
				message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
					step, time_window.average, loss, loss_window.average)
				log(message, end='\r')

				# Abort on divergence; the outer handler logs it and stops the coordinator.
				if loss > 100 or np.isnan(loss):
					log('Loss exploded to {:.5f} at step {}'.format(loss, step))
					raise Exception('Loss exploded')

				if step % args.summary_interval == 0:
					log('\nWriting summary at step: {}'.format(step))
					summary_writer.add_summary(sess.run(stats), step)

				if step % args.checkpoint_interval == 0:
					# Persist the step so the next run can seed global_step from it.
					with open(os.path.join(log_dir,'step_counter.txt'), 'w') as file:
						file.write(str(step))
					log('Saving checkpoint to: {}-{}'.format(checkpoint_path, step))
					saver.save(sess, checkpoint_path, global_step=step)
					# Unlike the original tacotron, we won't save audio
					# because we yet have to use wavenet as vocoder
					log('Saving alignement and Mel-Spectrograms..')
					input_seq, prediction, alignment, target = sess.run([
						model.inputs[0],
						model.mel_outputs[0],
						model.alignments[0],
						model.mel_targets[0],
					])
					#save predicted spectrogram to disk (for plot and manual evaluation purposes)
					mel_filename = 'ljspeech-mel-prediction-step-{}.npy'.format(step)
					np.save(os.path.join(log_dir, mel_filename), prediction, allow_pickle=False)

					#save alignment plot to disk (control purposes)
					plot.plot_alignment(alignment, os.path.join(plot_dir, 'step-{}-align.png'.format(step)),
						info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss))
					#save real mel-spectrogram plot to disk (control purposes)
					plot.plot_spectrogram(target, os.path.join(plot_dir, 'step-{}-real-mel-spectrogram.png'.format(step)),
						info='{}, {}, step={}, Real'.format(args.model, time_string(), step))
					#save predicted mel-spectrogram plot to disk (control purposes)
					# Caption now uses the same '{:.5f}' loss format as the alignment plot.
					plot.plot_spectrogram(prediction, os.path.join(plot_dir, 'step-{}-pred-mel-spectrogram.png'.format(step)),
						info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss))
					log('Input at step {}: {}'.format(step, sequence_to_text(input_seq)))

		except Exception as e:
			log('Exiting due to exception: {}'.format(e), slack=True)
			traceback.print_exc()
			coord.request_stop(e)
            cmd = 'cp ' + 'vox/wav/' + '_'.join(k for k in fname) + '.wav ' + dst_dir + '/' + fname_original + '_original.wav'
            os.system(cmd)

            #text = ' '.join(k for k in line.decode("utf-8").split()[1:])
            #text = '< ' + text + ' >'
            #text = [phids[l] for l in text.split()]
            text, qF0s = get_textNqF0s(line, phids)

            # Generating from original speaker
            spk = speakers_dict[fname[0]]
            waveform, alignment, _ = tts(model, text, spk, qF0s)
            fname_generated = '_'.join(k for k in fname[1:])
            fname_generated = fname_generated + '_generated'
            dst_wav_path = join(dst_dir, "{}{}.wav".format(fname_generated, file_name_suffix))
            dst_alignment_path = join(dst_dir, "{}_alignment.png".format(fname_generated))
            plot_alignment(alignment.T, dst_alignment_path,
                           info="tacotron, {}".format(checkpoint_path))
            audio.save_wav(waveform, dst_wav_path)

            # Generating from a different speaker
            spk = np.random.randint(len(speakers))
            #fname = fname.split('_')
            #fname[0] = ids2speakers[spk]
            fname_transferred = '_'.join(k for k in fname[1:])
            fname_transferred = fname_transferred + '_transferred'
            print("I picked a random number as ", spk, " the corresponding speaker from the dictionary is ", ids2speakers[spk], " the filename I am storing is ", fname_transferred)
            print(text, fname_transferred)
            waveform, alignment, _ = tts(model, text, spk, qF0s)
            dst_wav_path = join(dst_dir, "{}{}.wav".format(fname_transferred, file_name_suffix))
            dst_alignment_path = join(dst_dir, "{}_alignment.png".format(fname_transferred))
            plot_alignment(alignment.T, dst_alignment_path,
                           info="tacotron, {}".format(checkpoint_path))
def plot_graph_and_save_audio(args,
                              base_path=None,
                              start_of_sentence=None,
                              end_of_sentence=None,
                              pre_word_num=0,
                              post_word_num=0,
                              pre_surplus_idx=0,
                              post_surplus_idx=1,
                              use_short_concat=False,
                              use_manual_attention=False,
                              save_alignment=False,
                              librosa_trim=False,
                              attention_trim=False,
                              time_str=None,
                              isKorean=True):
    """Plot the alignment of one synthesis result and save or return audio.

    Args:
        args: ``(idx, (wav, alignment, path, text, sequence))``. ``wav`` is
            transposed and passed to ``inv_spectrogram``, so it is presumably
            a spectrogram rather than samples - TODO confirm with the caller.
        base_path: Output directory; when set, file names come from
            ``get_time()``.
        start_of_sentence, end_of_sentence, pre_word_num, post_word_num,
        pre_surplus_idx, post_surplus_idx: Forwarded to ``short_concat``;
            ``end_of_sentence`` additionally gates both trimming steps.
        use_short_concat: Run ``short_concat`` on ``wav`` before inversion.
        use_manual_attention: Add a ``"manual"`` postfix to the plot path.
        save_alignment: Also store the alignment as ``<base_path>/<idx>.npy``.
        librosa_trim: Trim the audio with ``librosa.effects.trim``.
        attention_trim: Cut the spectrogram once attention reaches the end.
        time_str: Unused.
        isKorean: Forwarded to ``plot.plot_alignment``.

    Returns:
        ``True`` after writing a file when ``path`` or ``base_path`` is set,
        otherwise the encoded audio as ``bytes``.
    """
    idx, (wav, alignment, path, text, sequence) = args

    if base_path:
        plot_path = "{}/{}.png".format(base_path, get_time())
    elif path:
        plot_path = path.rsplit('.', 1)[0] + ".png"
    else:
        plot_path = None

    if use_manual_attention:
        plot_path = add_postfix(plot_path, "manual")

    if plot_path:
        plot.plot_alignment(alignment, plot_path, text=text, isKorean=isKorean)

    if use_short_concat:
        wav = short_concat(wav, alignment, text, start_of_sentence,
                           end_of_sentence, pre_word_num, post_word_num,
                           pre_surplus_idx, post_surplus_idx)

    if attention_trim and end_of_sentence:
        attention_argmax = alignment.argmax(0)
        end_idx = min(len(sequence) - 1, max(attention_argmax))
        # Allow at most five steps that attend to the final position.
        max_counter = min((attention_argmax == end_idx).sum(), 5)
        final_pos = len(attention_argmax) - 1
        end_idx_counter = 0

        # The loop exists to leave ``jdx`` at the step where the cut happens.
        for jdx, attend_idx in enumerate(attention_argmax):
            if jdx >= final_pos:
                break
            if attend_idx == end_idx:
                end_idx_counter += 1
                if attention_argmax[jdx + 1] > end_idx:
                    break
            if end_idx_counter >= max_counter:
                break

        spec_end_idx = hparams.reduction_factor * jdx + 3
        wav = wav[:spec_end_idx]

    audio_out = inv_spectrogram(wav.T)

    if librosa_trim and end_of_sentence:
        _, index = librosa.effects.trim(audio_out,
                                        frame_length=5120,
                                        hop_length=256,
                                        top_db=50)
        # Only the tail is removed; the leading part is kept.
        audio_out = audio_out[:index[-1]]

    if save_alignment:
        alignment_path = "{}/{}.npy".format(base_path, idx)
        np.save(alignment_path, alignment, allow_pickle=False)

    if not (path or base_path):
        # No destination on disk: hand the encoded audio back to the caller.
        buffer = io.BytesIO()
        save_audio(audio_out, buffer)
        return buffer.getvalue()

    if path:
        current_path = add_postfix(path, idx)
    else:
        current_path = plot_path.replace(".png", ".wav")

    save_audio(audio_out, current_path)
    return True
def plot_graph_and_save_audio(args,
                              base_path=None,
                              start_of_sentence=None, end_of_sentence=None,
                              pre_word_num=0, post_word_num=0,
                              pre_surplus_idx=0, post_surplus_idx=1,
                              save_alignment=False,
                              librosa_trim=False, attention_trim=False,
                              time_str=None, isKorean=True, config=None):
    """Plot the alignment of one synthesis result and save audio plus mel.

    Args:
        args: ``(idx, (wav, alignment, path, text, sequence, mel))``. ``wav``
            is transposed and passed to ``inv_linear_spectrogram``, so it is
            presumably a linear spectrogram - TODO confirm with the caller.
        base_path: Output directory; when set, file names are derived from
            ``config.file`` and ``idx``.
        end_of_sentence: Gates both trimming steps.
        save_alignment: Also store the alignment as ``<base_path>/<idx>.npy``.
        librosa_trim: Trim the audio with ``librosa.effects.trim``.
        attention_trim: Cut the spectrograms once attention reaches the end.
        isKorean: Forwarded to ``plot.plot_alignment``.
        config: Object whose ``file`` attribute names the output.
            NOTE(review): dereferenced whenever ``base_path`` is set, so the
            ``None`` default only works without ``base_path``.
        start_of_sentence, pre_word_num, post_word_num, pre_surplus_idx,
        post_surplus_idx, time_str: Unused in this variant.

    Returns:
        The path of the written wav file when ``path`` or ``base_path`` is
        set, otherwise the ``io.BytesIO`` object the audio was written to.
    """
    idx, (wav, alignment, path, text, sequence, mel) = args

    if base_path:
        file_stem = config.file.split('.')[0]
        plot_path = "{}/{}_{}.png".format(base_path, file_stem, idx)
    elif path:
        plot_path = path.rsplit('.', 1)[0] + ".png"
    else:
        plot_path = None

    if plot_path:
        plot.plot_alignment(alignment, plot_path, text=text, isKorean=isKorean)

    if attention_trim and end_of_sentence:
        # Once attention has reached the end of the text, drop what follows.
        # alignment: (text length (encoder), target length (decoder)), so the
        # argmax over axis 0 has one entry per decoder step.
        attention_argmax = alignment.argmax(0)
        end_idx = min(len(sequence) - 1, max(attention_argmax))
        max_counter = min((attention_argmax == end_idx).sum(), 5)
        final_pos = len(attention_argmax) - 1
        end_idx_counter = 0

        # The loop exists to leave ``jdx`` at the step where the cut happens.
        for jdx, attend_idx in enumerate(attention_argmax):
            if jdx >= final_pos:
                break
            if attend_idx == end_idx:
                end_idx_counter += 1
                if attention_argmax[jdx + 1] > end_idx:
                    break
            if end_idx_counter >= max_counter:
                break

        spec_end_idx = hparams.reduction_factor * jdx + 3
        wav = wav[:spec_end_idx]
        mel = mel[:spec_end_idx]

    audio_out = inv_linear_spectrogram(wav.T, hparams)

    if librosa_trim and end_of_sentence:
        _, index = librosa.effects.trim(audio_out, frame_length=5120,
                                        hop_length=256, top_db=50)
        trimmed_end = index[-1]
        audio_out = audio_out[:trimmed_end]
        # Keep the mel in step with the shortened audio.
        mel = mel[:trimmed_end // hparams.hop_size]

    if save_alignment:
        alignment_path = "{}/{}.npy".format(base_path, idx)
        np.save(alignment_path, alignment, allow_pickle=False)

    if not (path or base_path):
        io_out = io.BytesIO()
        save_wav(audio_out, io_out, hparams.sample_rate)
        # NOTE(review): the bytes are read but the buffer object itself is
        # returned - confirm which one callers expect.
        result = io_out.getvalue()
        return io_out

    if path:
        current_path = add_postfix(path, idx)
    else:
        current_path = plot_path.replace(".png", ".wav")

    save_wav(audio_out, current_path, hparams.sample_rate)

    # Store the (possibly trimmed) mel next to the wav file.
    mel_path = current_path.replace(".wav", ".npy")
    np.save(mel_path, mel)
    return current_path
Beispiel #10
0
    def _train_epoch(self, dataloader=None):
        """Train for one pass over ``dataloader`` and return mean losses.

        Each batch is normalised, optionally augmented, fed through the
        model and optimised with gradient-norm clipping at 1. After every
        step the running means are shown on the progress bar, and the step
        losses plus spectrogram/attention figures of the last batch item are
        sent to ``self.loggers``.

        Args:
            dataloader: Iterable of dict batches with the keys ``mels``,
                ``mlens``, ``texts`` and ``tlens``; must support ``len()``
                and expose ``batch_size``.

        Returns:
            ``(loss, l1_loss, ssim_loss, att_loss)`` averaged over the
            number of batches.
        """
        self.model.train()

        num_batches = len(dataloader)
        total_loss = 0.0
        total_l1 = 0.0
        total_ssim = 0.0
        total_att = 0.0

        augment = self.hparams.duration
        pbar = tqdm(dataloader,
                    unit="audios",
                    unit_scale=dataloader.batch_size,
                    disable=self.hparams.trainer.disable_progress_bar)
        for it, batch in enumerate(pbar, start=1):
            self.optimizer.zero_grad()

            mels = batch['mels']
            mlens = batch['mlens'].squeeze(1)
            texts = batch['texts'].long()
            tlens = batch['tlens'].squeeze(1)
            mels = mels.to(self.device)
            mlens = mlens.to(self.device)
            texts = texts.to(self.device)
            tlens = tlens.to(self.device)

            mels = self.normalizer(mels)
            s = mels

            # Spectrogram augmentation (model input only; targets stay clean)
            if augment.enable_augment:
                s = add_random_noise(mels, augment.noise)
                s = degrade_some(self.model, s, texts, tlens,
                                 augment.feed_ratio,
                                 repeat=augment.feed_repeat)
                s = frame_dropout(s, augment.replace_ratio)

            melspecs, attns = self.model((texts, tlens, s, True))
            loss, l1_loss, ssim_loss, att_loss = self.compute_metrics(
                (melspecs, mels, attns, mlens, tlens))

            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1)
            self.optimizer.step()
            self.step += 1

            # Convert to Python floats before accumulating.
            loss = loss.item()
            l1_loss = l1_loss.item()
            ssim_loss = ssim_loss.item()
            att_loss = att_loss.item()
            total_loss += loss
            total_l1 += l1_loss
            total_ssim += ssim_loss
            total_att += att_loss

            # update the progress bar
            pbar.set_postfix({
                'l1': "%.05f" % (total_l1 / it),
                'ssim': "%.05f" % (total_ssim / it),
                'att': "%.05f" % (total_att / it)
            })

            mels = mels.cpu().detach()
            melspecs = melspecs.cpu().detach()
            attns = attns.cpu().detach()

            # Figures are drawn for the last item of the batch, cropped to
            # its own lengths.
            index = -1
            mlen = mlens[index].item()
            tlen = tlens[index].item()
            mels_fig = plot_spectrogram(
                melspecs[index, :mlen, :],
                target_spectrogram=mels[index, :mlen, :])
            attn_fig = plot_alignment(attns[index, :mlen, :tlen])

            step_scalars = {
                'step_l1_loss': l1_loss,
                'step_ssim_loss': ssim_loss,
                'step_att_loss': att_loss
            }
            step_figures = {'melspecs': mels_fig, 'attention': attn_fig}
            self.loggers.log_step('train', self.step, step_scalars,
                                  step_figures)

        return (total_loss / num_batches,
                total_l1 / num_batches,
                total_ssim / num_batches,
                total_att / num_batches)
Beispiel #11
0
def run_eval(args, eval_dir, eval_model, eval_plot_dir, eval_wav_dir, feeder,
             hparams, sess, step, summary_writer):
    """Evaluate on ``feeder.test_steps`` batches and write eval artefacts.

    Losses are averaged over the evaluated batches. The first sample of the
    last batch is used to write two inverted waveforms to ``eval_wav_dir``
    and the alignment/spectrogram plots to ``eval_plot_dir``. The averaged
    losses are then passed to ``add_eval_stats``.

    When no batch is evaluated (``feeder.test_steps == 0``) there is nothing
    to render, so audio and plots are skipped; the zero-valued losses are
    still logged and written, as the original ``count > 0`` branch intended.
    Previously this case crashed on ``lin_p.T`` with ``lin_p`` being ``None``.
    """
    # Run eval and save eval stats
    log('\nRunning evaluation at step {}'.format(step))
    sum_eval_loss = 0.0
    sum_mel_loss = 0.0
    sum_stop_token_loss = 0.0
    sum_linear_loss = 0.0
    count = 0.0
    mel_p = mel_t = t_len = attention_mask_sample = lin_p = lin_t = None

    fetches = [
        eval_model.loss,
        eval_model.mel_loss,
        eval_model.stop_token_loss,
        eval_model.linear_loss,
        eval_model.post_net_predictions[0],
        eval_model.targets_mel[0],
        eval_model.targets_length[0],
        eval_model.alignments[0],
        eval_model.mag_pred[0],
        eval_model.targets_mag[0],
    ]
    for _ in tqdm(range(feeder.test_steps)):
        (test_eloss, test_mel_loss, test_stop_token_loss, test_linear_loss,
         mel_p, mel_t, t_len, attention_mask_sample, lin_p,
         lin_t) = sess.run(fetches)
        sum_eval_loss += test_eloss
        sum_mel_loss += test_mel_loss
        sum_stop_token_loss += test_stop_token_loss
        sum_linear_loss += test_linear_loss
        count += 1.0

    has_samples = count > 0.0
    if has_samples:
        wav = audio.inv_linear_spectrogram(lin_p.T, hparams)
        audio.save_wav(wav,
                       os.path.join(eval_wav_dir,
                                    '{}-eval-linear.wav'.format(step)),
                       sr=hparams.sample_rate)
        eval_loss = sum_eval_loss / count
        mel_loss = sum_mel_loss / count
        stop_token_loss = sum_stop_token_loss / count
        linear_loss = sum_linear_loss / count
    else:
        eval_loss = sum_eval_loss
        mel_loss = sum_mel_loss
        stop_token_loss = sum_stop_token_loss
        linear_loss = sum_linear_loss

    log('Saving eval log to {}..'.format(eval_dir))
    if has_samples:
        # Save some log to monitor model improvement on same unseen sequence
        wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
        audio.save_wav(wav,
                       os.path.join(eval_wav_dir,
                                    '{}-eval-mel.wav'.format(step)),
                       sr=hparams.sample_rate)
        alignments, alignment_titles = get_alignments(attention_mask_sample)
        for alignment, alignment_title in zip(alignments, alignment_titles):
            plot.plot_alignment(
                alignment,
                os.path.join(
                    eval_plot_dir,
                    '{}_{}-eval-align.png'.format(step, alignment_title)),
                title='{}, {}, step={}, loss={:.5f}'.format(
                    args.model, time_string(), step, eval_loss),
                max_len=t_len // hparams.reduction_factor)
        plot.plot_spectrogram(
            mel_p,
            os.path.join(eval_plot_dir,
                         '{}-eval-mel-spectrogram.png'.format(step)),
            title='{}, {}, step={}, loss={:.5f}'.format(
                args.model, time_string(), step, eval_loss),
            target_spectrogram=mel_t,
            max_len=t_len)
        plot.plot_spectrogram(
            lin_p,
            os.path.join(eval_plot_dir,
                         '{}-eval-linear-spectrogram.png'.format(step)),
            title='{}, {}, step={}, loss={:.5f}'.format(
                args.model, time_string(), step, eval_loss),
            target_spectrogram=lin_t,
            max_len=t_len,
            auto_aspect=True)
    else:
        log('No evaluation batches were run; skipping eval audio and plots.')

    log('Eval loss for global step {}: {:.3f}'.format(step, eval_loss))
    log('Writing eval summary!')
    add_eval_stats(summary_writer, step, linear_loss, mel_loss,
                   stop_token_loss, eval_loss)
def plot_graph_and_save_audio(args,
                              base_path=None,
                              start_of_sentence=None,
                              end_of_sentence=None,
                              pre_word_num=0,
                              post_word_num=0,
                              pre_surplus_idx=0,
                              post_surplus_idx=1,
                              use_short_concat=False,
                              save_alignment=False,
                              librosa_trim=False,
                              attention_trim=False,
                              time_str=None,
                              isKorean=True):
    """Plot the alignment of one synthesis result, save audio and mel.

    Args:
        args: ``(idx, (wav, alignment, path, text, sequence, mel))``. ``wav``
            is transposed and passed to ``inv_linear_spectrogram``, so it is
            presumably a linear spectrogram - TODO confirm with the caller.
        base_path: Output directory; when set, file names come from
            ``get_time()``.
        start_of_sentence, end_of_sentence, pre_word_num, post_word_num,
        pre_surplus_idx, post_surplus_idx: Forwarded to ``short_concat``;
            ``end_of_sentence`` additionally gates both trimming steps.
        use_short_concat: Run ``short_concat`` on ``wav`` before inversion.
        save_alignment: Also store the alignment as ``<base_path>/<idx>.npy``.
        librosa_trim: Trim the audio with ``librosa.effects.trim``.
        attention_trim: Cut the spectrograms once attention reaches the end.
        time_str: Unused.
        isKorean: Forwarded to ``plot.plot_alignment``.

    Returns:
        The synthesised audio array, whether or not a file was written.
    """
    idx, (wav, alignment, path, text, sequence, mel) = args

    if base_path:
        plot_path = "{}/{}.png".format(base_path, get_time())
    elif path:
        plot_path = path.rsplit('.', 1)[0] + ".png"
    else:
        plot_path = None

    if plot_path:
        plot.plot_alignment(alignment, plot_path, text=text, isKorean=isKorean)

    if use_short_concat:
        wav = short_concat(wav, alignment, text, start_of_sentence,
                           end_of_sentence, pre_word_num, post_word_num,
                           pre_surplus_idx, post_surplus_idx)

    if attention_trim and end_of_sentence:
        # Once attention has reached the end of the text, drop what follows.
        # alignment: (text length (encoder), target length (decoder)), so the
        # argmax over axis 0 has one entry per decoder step.
        attention_argmax = alignment.argmax(0)
        end_idx = min(len(sequence) - 1, max(attention_argmax))
        # 2020-06-12: the earlier rule capped this count at 5
        # (min(count, 5) + 1). Korean tends to hold the final sound for a
        # long time, so every step that attends to the last position is kept
        # instead. Why the original author capped it at 5 is unknown.
        max_counter = (attention_argmax == end_idx).sum()
        final_pos = len(attention_argmax) - 1
        end_idx_counter = 0

        # The loop exists to leave ``jdx`` at the step where the cut happens.
        for jdx, attend_idx in enumerate(attention_argmax):
            if jdx >= final_pos:
                break
            if attend_idx == end_idx:
                end_idx_counter += 1
                if attention_argmax[jdx + 1] > end_idx:
                    break
            if end_idx_counter >= max_counter:
                break

        spec_end_idx = hparams.reduction_factor * jdx + 3
        wav = wav[:spec_end_idx]
        mel = mel[:spec_end_idx]

    audio_out = inv_linear_spectrogram(wav.T, hparams)

    if librosa_trim and end_of_sentence:
        _, index = librosa.effects.trim(audio_out,
                                        frame_length=5120,
                                        hop_length=256,
                                        top_db=50)
        trimmed_end = index[-1]
        audio_out = audio_out[:trimmed_end]
        # Keep the mel in step with the shortened audio.
        mel = mel[:trimmed_end // hparams.hop_size]

    if save_alignment:
        alignment_path = "{}/{}.npy".format(base_path, idx)
        np.save(alignment_path, alignment, allow_pickle=False)

    if not (path or base_path):
        # No destination on disk: the audio is still encoded into an
        # in-memory buffer, but only the raw array is returned.
        io_out = io.BytesIO()
        save_wav(audio_out, io_out, hparams.sample_rate)
        result = io_out.getvalue()
        return audio_out

    if path:
        current_path = add_postfix(path, idx)
    else:
        current_path = plot_path.replace(".png", ".wav")

    save_wav(audio_out, current_path, hparams.sample_rate)

    # Store the (possibly trimmed) mel next to the wav file.
    mel_path = current_path.replace(".wav", ".npy")
    np.save(mel_path, mel)

    return audio_out
def train(log_dir, args):
    """Build the Tacotron2-SYGST training graph and run the training loop.

    Args:
        log_dir: Directory that receives TensorBoard summaries and the
            periodic audio/plot samples. It is also joined under
            ``hdfs_ckpts`` to form the checkpoint path.
        args: Parsed options. This function reads ``tfr_dir``,
            ``restore_step``, ``checkpoint_interval``, ``summary_interval``
            and ``model``.

    Any exception raised inside the session (including the deliberate
    'Loss Exploded' abort) is logged and its traceback printed; it is not
    re-raised to the caller.
    """
    checkpoint_path = os.path.join(hdfs_ckpts, log_dir, 'model.ckpt')
    log(hp.to_string(), is_print=False)
    log('Loading training data from: %s' % args.tfr_dir)
    log('Checkpoint path: %s' % checkpoint_path)
    log('Using model: sygst tacotron2')

    tf_dset = TFDataSet(hp, args.tfr_dir)
    feats = tf_dset.get_train_next()

    # Set up model:
    global_step = tf.Variable(0, name='global_step', trainable=False)
    training = tf.placeholder_with_default(True, shape=(), name='training')
    with tf.name_scope('model'):
        model = Tacotron2SYGST(hp)
        # The mel targets double as the reference input of the model.
        model(feats['inputs'],
              mel_inputs=feats['mel_targets'],
              spec_inputs=feats['linear_targets'],
              spec_lengths=feats['spec_lengths'],
              ref_inputs=feats['mel_targets'],
              ref_lengths=feats['spec_lengths'],
              arousal_labels=feats['soft_arousal_labels'],
              valence_labels=feats['soft_valance_labels'],
              training=training)
        model.add_loss()
        model.add_optimizer(global_step)
        stats = model.add_stats()

    # Bookkeeping:
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=50, keep_checkpoint_every_n_hours=2)

    # Train!
    config = tf.ConfigProto(allow_soft_placement=True,
                            gpu_options=tf.GPUOptions(allow_growth=True))
    with tf.Session(config=config) as sess:
        # Created inside the try block so a failure to open the writer is
        # logged like any other error; closed in `finally` so buffered
        # summary events are flushed on every exit path.
        summary_writer = None
        try:
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            sess.run(tf.global_variables_initializer())
            if args.restore_step:
                # Restore from a checkpoint if the user requested it.
                restore_path = '%s-%s' % (checkpoint_path, args.restore_step)
                saver.restore(sess, restore_path)
                log('Resuming from checkpoint: %s' % restore_path, slack=True)
            else:
                log('Starting a new training run ...', slack=True)

            fetches = [
                global_step, model.optimize, model.loss, model.mel_loss,
                model.spec_loss, model.stop_loss, model.arousal_loss,
                model.valence_loss
            ]
            for _ in range(_max_step):
                start_time = time.time()
                sess.run(debug.get_ops())
                (step, _, loss, mel_loss, spec_loss, stop_loss, aro_loss,
                 val_loss) = sess.run(fetches)
                time_window.append(time.time() - start_time)
                loss_window.append(loss)

                message = 'Step %-7d [%.3f sec/step,ml=%.3f,spl=%.3f,sl=%.3f,al=%.3f,vl=%.3f]' % (
                    step, time_window.average, mel_loss, spec_loss, stop_loss,
                    aro_loss, val_loss)
                log(message, slack=(step % args.checkpoint_interval == 0))

                # Abort on divergence; the outer handler logs the failure.
                if loss > 100 or math.isnan(loss):
                    log('Loss exploded to %.5f at step %d!' % (loss, step),
                        slack=True)
                    raise Exception('Loss Exploded')

                if step % args.summary_interval == 0:
                    log('Writing summary at step: %d' % step)
                    # Summaries are best-effort: a failure here must not
                    # stop training.
                    try:
                        summary_writer.add_summary(sess.run(stats), step)
                    except Exception as e:
                        log(f'summary failed and ignored: {str(e)}')

                if step % args.checkpoint_interval == 0:
                    log('Saving checkpoint to: %s-%d' %
                        (checkpoint_path, step))
                    saver.save(sess, checkpoint_path, global_step=step)
                    log('Saving audio and alignment...')
                    # Fetch the first example of a batch to render samples.
                    gt_mel, gt_spec, seq, mel, spec, align = sess.run([
                        model.mel_targets[0], model.spec_targets[0],
                        model.text_targets[0], model.mel_outputs[0],
                        model.spec_outputs[0], model.alignment_outputs[0]
                    ])
                    text = sequence_to_text(seq)
                    wav = audio.inv_spectrogram(hp, spec.T)
                    wav_path = os.path.join(log_dir,
                                            'step-%d-audio.wav' % step)
                    mel_path = os.path.join(log_dir, 'step-%d-mel.png' % step)
                    spec_path = os.path.join(log_dir,
                                             'step-%d-spec.png' % step)
                    align_path = os.path.join(log_dir,
                                              'step-%d-align.png' % step)
                    info = '%s, %s, step=%d, loss=%.5f\n %s' % (
                        args.model, time_string(), step, loss, text)
                    plot.plot_alignment(align, align_path, info=info)
                    plot.plot_mel(mel, mel_path, info=info, gt_mel=gt_mel)
                    plot.plot_mel(spec, spec_path, info=info, gt_mel=gt_spec)
                    audio.save_wav(hp, wav, wav_path)
                    log('Input: %s' % text)

        except Exception as e:
            log('Exiting due to exception: %s' % e, slack=True)
            traceback.print_exc()
        finally:
            if summary_writer is not None:
                summary_writer.close()
Beispiel #14
0
    def synthesize(self, texts, basenames, log_dir, mel_filenames):
        """Synthesize a batch of texts and optionally write wavs and plots.

        ``texts`` (and ``basenames`` / ``mel_filenames`` when they are not
        ``None``) are padded in place by repeating their last entry until the
        number of samples is a multiple of ``hparams.synthesis_batch_size``.

        Args:
            texts: Mutable list of input strings.
            basenames: Mutable list of output name stems, one per text, or
                ``None`` to write the first sample to ``temp.wav`` and play it.
            log_dir: Base directory; files are written under its ``wavs/`` and
                ``plots/`` sub-paths. ``None`` skips all file output.
            mel_filenames: Optional list that is padded alongside ``texts``;
                it is not otherwise read here.

        Returns:
            None.

        Raises:
            RuntimeError: In playback mode on an OS other than Linux/Windows.
        """
        hparams = self._hparams

        # Repeat last sample until number of samples is dividable by the number of GPUs (last run scenario)
        while len(texts) % hparams.synthesis_batch_size != 0:
            texts.append(texts[-1])
            # basenames is None in playback mode; appending to it would raise
            # AttributeError before the playback branch below is reached.
            if basenames is not None:
                basenames.append(basenames[-1])
            if mel_filenames is not None:
                mel_filenames.append(mel_filenames[-1])

        sequences = [np.asarray(text_to_sequence(text)) for text in texts]
        input_lengths = [len(seq) for seq in sequences]
        seqs, _ = self._prepare_inputs(sequences)

        feed_dict = {
            self.inputs: seqs,
            self.input_lengths: np.asarray(input_lengths, dtype=np.int32)
        }

        linears, mels, alignments, audio_length = self.session.run(
            [self.linear_outputs, self.mel_outputs, self.alignments[0], self.audio_length],
            feed_dict=feed_dict)
        # Natural batch synthesis
        # Mel/Linear lengths for the entire batch come from the model's audio_length output
        target_lengths = audio_length

        if basenames is None:
            # Generate wav and read it
            wav = audio.inv_mel_spectrogram(mels[0].T, hparams)
            audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate)  # Find a better way

            system_name = platform.system()
            if system_name == 'Linux':
                # Linux wav reader
                os.system('aplay temp.wav')
            elif system_name == 'Windows':
                # windows wav reader
                os.system('start /min mplay32 /play /close temp.wav')
            else:
                raise RuntimeError(
                    'Your OS type is not supported yet, please add it to "centaur/synthesizer.py, line-165" and feel free to make a Pull Request ;) Thanks!')

            return

        if log_dir is None:
            # Nothing is written without a log directory.
            return

        for i, mel in enumerate(mels):
            # save wav (mel -> wav)
            wav = audio.inv_mel_spectrogram(mel.T, hparams)
            audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-mel.wav'.format(basenames[i])),
                           sr=hparams.sample_rate)

            # save alignments
            # NOTE(review): the plot file names depend only on the alignment
            # titles, so each sample overwrites the previous sample's plots.
            alignments_samples, alignment_titles = self.get_alignments(alignments)
            for idx, alignment_sample in enumerate(alignments_samples):
                plot.plot_alignment(alignment_sample,
                                    os.path.join(log_dir, 'plots/{}.png'.format(alignment_titles[idx])),
                                    title='{}'.format(texts[i]), split_title=True, max_len=target_lengths[i])

            # save mel spectrogram plot
            plot.plot_spectrogram(mel,
                                  os.path.join(log_dir, 'plots/mel-{}.png'.format(basenames[i])),
                                  title='{}'.format(texts[i]), split_title=True)

            # save wav (linear -> wav)
            wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
            audio.save_wav(wav,
                           os.path.join(log_dir, 'wavs/wav-{}-linear.wav'.format(basenames[i])),
                           sr=hparams.sample_rate)

            # save linear spectrogram plot
            plot.plot_spectrogram(linears[i],
                                  os.path.join(log_dir, 'plots/linear-{}.png'.format(basenames[i])),
                                  title='{}'.format(texts[i]), split_title=True, auto_aspect=True)
Beispiel #15
0
    def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames):
        """Synthesize mel (and optionally linear) spectrograms for a batch.

        ``texts`` (and ``basenames`` / ``mel_filenames`` when they are not
        ``None``) are padded in place by repeating their last entry until the
        number of samples is a multiple of ``hparams.synthesis_batch_size``.

        Args:
            texts: Mutable list of input strings.
            basenames: Mutable list of output name stems, one per text, or
                ``None`` to write the first sample to ``temp.wav`` and play it.
            out_dir: Directory that receives one ``mel-<basename>.npy`` per text.
            log_dir: Base directory; wavs and plots are written under its
                ``wavs/`` and ``plots/`` sub-paths. ``None`` skips them.
            mel_filenames: Paths of ground-truth mel ``.npy`` files; loaded
                only when ``self.gta`` is set.

        Returns:
            List of the saved mel ``.npy`` paths, or ``None`` in playback mode.

        Raises:
            RuntimeError: In playback mode on an OS other than Linux/Windows.
        """
        hparams = self._hparams
        # [-max, max] or [0,max]
        if hparams.symmetric_mels:
            clip_min, clip_max = -hparams.max_abs_value, hparams.max_abs_value
        else:
            clip_min, clip_max = 0, hparams.max_abs_value

        # Repeat last sample until number of samples is dividable by the number of GPUs (last run scenario)
        while len(texts) % hparams.synthesis_batch_size != 0:
            texts.append(texts[-1])
            # basenames is None in playback mode; appending to it would raise
            # AttributeError before the playback branch below is reached.
            if basenames is not None:
                basenames.append(basenames[-1])
            if mel_filenames is not None:
                mel_filenames.append(mel_filenames[-1])

        seqs = [np.asarray(text_to_sequence(text)) for text in texts]
        input_lengths = [len(seq) for seq in seqs]
        input_seqs, _ = self._prepare_inputs(seqs)

        feed_dict = {
            self.inputs: input_seqs,
            self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
        }

        if self.gta:
            np_targets = [np.load(mel_filename) for mel_filename in mel_filenames]
            target_lengths = [len(np_target) for np_target in np_targets]
            target_seqs, _ = self._prepare_targets(np_targets, self._hparams.outputs_per_step)
            feed_dict[self.targets] = target_seqs
            assert len(np_targets) == len(texts)

        linears = None
        if self.gta or not hparams.predict_linear:
            mels, alignments, stop_tokens = self.session.run(
                [self.mel_outputs, self.alignments, self.stop_token_prediction], feed_dict=feed_dict)

            if not self.gta:
                # Natural batch synthesis
                # Get Mel lengths for the entire batch from stop_tokens predictions.
                # In GTA mode the lengths of the loaded ground-truth targets
                # (computed above) are kept instead of being overwritten.
                target_lengths = self._get_output_lengths(stop_tokens)

            # Take off the batch wise padding
            mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)]
            assert len(mels) == len(texts)

        else:
            linears, mels, alignments, stop_tokens = self.session.run(
                [self.linear_outputs, self.mel_outputs, self.alignments, self.stop_token_prediction],
                feed_dict=feed_dict)

            # Natural batch synthesis
            # Get Mel/Linear lengths for the entire batch from stop_tokens predictions
            target_lengths = self._get_output_lengths(stop_tokens)

            # Take off the batch wise padding
            mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)]
            # Clip per utterance: after trimming, lengths differ, so the list
            # cannot be safely coerced into one rectangular array.
            linears = [np.clip(linear[:target_length, :], clip_min, clip_max)
                       for linear, target_length in zip(linears, target_lengths)]
            assert len(mels) == len(linears) == len(texts)

        # Same per-utterance clipping for the mels.
        mels = [np.clip(mel, clip_min, clip_max) for mel in mels]

        if basenames is None:
            # Generate wav and read it
            wav = audio.inv_mel_spectrogram(mels[0].T, hparams)
            audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate)  # Find a better way

            system_name = platform.system()
            if system_name == 'Linux':
                # Linux wav reader
                os.system('aplay temp.wav')
            elif system_name == 'Windows':
                # windows wav reader
                os.system('start /min mplay32 /play /close temp.wav')
            else:
                raise RuntimeError(
                    'Your OS type is not supported yet, please add it to "synthesizer.py, line-165" and feel free to make a Pull Request ;) Thanks!')

            return

        saved_mels_paths = []
        for i, mel in enumerate(mels):
            # Write the spectrogram to disk
            # Note: outputs mel-spectrogram files and target ones have same names, just different folders
            mel_filename = os.path.join(out_dir, 'mel-{}.npy'.format(basenames[i]))
            np.save(mel_filename, mel, allow_pickle=False)
            saved_mels_paths.append(mel_filename)

            if log_dir is None:
                continue

            # save wav (mel -> wav)
            wav = audio.inv_mel_spectrogram(mel.T, hparams)
            audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-mel.wav'.format(basenames[i])),
                           sr=hparams.sample_rate)

            # save alignments
            plot.plot_alignment(alignments[i], os.path.join(log_dir, 'plots/alignment-{}.png'.format(basenames[i])),
                                title='{}'.format(texts[i]), split_title=True, max_len=target_lengths[i])

            # save mel spectrogram plot
            plot.plot_spectrogram(mel, os.path.join(log_dir, 'plots/mel-{}.png'.format(basenames[i])),
                                  title='{}'.format(texts[i]), split_title=True)

            # `linears` holds arrays, whose truth value is ambiguous, so test
            # explicitly against None.
            if linears is not None:
                # save wav (linear -> wav)
                wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
                audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-linear.wav'.format(basenames[i])),
                               sr=hparams.sample_rate)

                # save linear spectrogram plot
                plot.plot_spectrogram(linears[i], os.path.join(log_dir, 'plots/linear-{}.png'.format(basenames[i])),
                                      title='{}'.format(texts[i]), split_title=True, auto_aspect=True)

        return saved_mels_paths
        for idx, line in enumerate(lines):
            fname = line.decode("utf-8").split()[0].zfill(8)
            cmd = 'cp vox/wav/' + fname + '.wav ' + dst_dir + '/' + fname + '_original.wav'
            print(cmd)
            os.system(cmd)
            text = ' '.join(k for k in line.decode("utf-8").split()[1:])
            text = '< ' + text + ' >'
            print(text, fname)
            text = [phids[l] for l in text.split()]
            waveform, alignment, mel = tts(acousticmodel, text)
            waveform_vocoder = vocoder(vocoder_model, mel)
            print(waveform_vocoder.shape)
            dst_wav_path = join(dst_dir,
                                "{}{}.wav".format(fname, file_name_suffix))
            dst_alignment_path = join(dst_dir,
                                      "{}_alignment.png".format(fname))
            plot_alignment(
                alignment.T,
                dst_alignment_path,
                info="tacotron, {}".format(checkpoint_path_acousticmodel))
            audio.save_wav(waveform, dst_wav_path)

            dest_fname = fname + '_generated_vocoder'
            dst_wav_path = join(
                dst_dir, "{}{}.wav".format(dest_fname, file_name_suffix))
            write(dst_wav_path, 16000, waveform_vocoder)

    print(
        "Finished! Check out {} for generated audio samples.".format(dst_dir))
    sys.exit(0)