Code example #1
def save_states(global_step,
                mel_outputs,
                linear_outputs,
                attn,
                y,
                checkpoint_dir=None):

    idx = 1  # idx = np.random.randint(0, len(mel_outputs))

    # Alignment
    path = os.path.join(checkpoint_dir,
                        "step{}_alignment.png".format(global_step))
    alignment = attn[idx].cpu().data.numpy()
    # alignment = attn[idx].cpu().data.numpy()[:, :input_length]
    plot_alignment(alignment.T,
                   path,
                   info="tacotron, step={}".format(global_step))

    # Predicted spectrogram
    path = os.path.join(checkpoint_dir,
                        "step{}_predicted_spectrogram.png".format(global_step))
    linear_output = linear_outputs[idx].cpu().data.numpy()
    plot_spectrogram(linear_output, path)

    # Predicted audio signal
    signal = audio.inv_spectrogram(linear_output.T)
    path = os.path.join(checkpoint_dir,
                        "step{}_predicted.wav".format(global_step))
    audio.save_wav(signal, path)

    # Target spectrogram
    path = os.path.join(checkpoint_dir,
                        "step{}_target_spectrogram.png".format(global_step))
    linear_output = y[idx].cpu().data.numpy()
    plot_spectrogram(linear_output, path)
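
The plot_spectrogram and plot_alignment helpers imported from utils.plot (or tacotron.utils.plot) are not shown in these examples. The sketch below is only a rough, hypothetical matplotlib-based approximation of such helpers; the real modules typically accept extra arguments (max_len, auto_aspect, split_title, ...) and lay figures out differently.

import matplotlib
matplotlib.use('Agg')  # headless backend, so figures can be saved during training
import matplotlib.pyplot as plt
import numpy as np


def plot_spectrogram(spectrogram, path, target_spectrogram=None, title=None):
    # Draw the predicted spectrogram (and optionally the target) and save it to disk.
    n_rows = 2 if target_spectrogram is not None else 1
    fig, axes = plt.subplots(n_rows, 1, figsize=(12, 4 * n_rows), squeeze=False)
    axes[0][0].imshow(np.rot90(spectrogram), aspect='auto', interpolation='none')
    axes[0][0].set_title(title or 'predicted spectrogram')
    if target_spectrogram is not None:
        axes[1][0].imshow(np.rot90(target_spectrogram), aspect='auto', interpolation='none')
        axes[1][0].set_title('target spectrogram')
    fig.savefig(path, format='png')
    plt.close(fig)


def plot_alignment(alignment, path, info=None):
    # alignment: 2-D attention matrix (decoder steps x encoder steps, or transposed).
    fig, ax = plt.subplots(figsize=(6, 4))
    im = ax.imshow(alignment, aspect='auto', origin='lower', interpolation='none')
    fig.colorbar(im, ax=ax)
    ax.set_xlabel('Encoder timestep' + ('\n' + info if info else ''))
    ax.set_ylabel('Decoder timestep')
    fig.savefig(path, format='png')
    plt.close(fig)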
Code example #2
    def synthesize(self, texts, speaker):
        from utils.plot import plot_alignment, plot_spectrogram

        inputs, input_lengths = self.process_sentence(texts)
        # print(inputs, input_lengths)
        speaker = np.array(speaker)

        # Tacotron 2 inference.
        mel_outputs, post_mel_outputs, stop_outputs, alignment_historys = self.model.inference(
            inputs,
            input_lengths,
            speaker_ids=speaker,
            use_window_mask=False,
            win_front=20,
            win_back=20,
            maximum_iterations=1000,
        )
        plot_spectrogram(post_mel_outputs[0].numpy(), './mel.png', 'inference')
        if self.vocoder_type == 'GL':
            target_lengths = self._get_output_lengths(stop_outputs)

            # Take off the batch-wise padding
            mels = [mel[:target_length, :].numpy() for mel, target_length in zip(post_mel_outputs, target_lengths)]
            wavs = [self.vocoder(mel.T, self.hp) for mel in mels]
        else:
            if self.vocoder_type == 'Multi':
                # Trim so the frame count is a multiple of the vocoder window.
                if post_mel_outputs.shape[1] % self.vocoder_window != 0:
                    post_mel_outputs = post_mel_outputs[:, :-int(post_mel_outputs.shape[1] % self.vocoder_window)]
                _, wavs = self.vocoder(post_mel_outputs)
            else:
                _, wavs = self.vocoder(post_mel_outputs)
        return wavs
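
self._get_output_lengths is not shown here. A common way to recover per-utterance mel lengths from stop-token predictions is to take the first frame whose stop probability crosses a threshold; the function below is a minimal sketch under that assumption (name and threshold are hypothetical, not necessarily what this project does).

import numpy as np


def get_output_lengths(stop_outputs, threshold=0.5):
    # stop_outputs: (batch, max_frames) stop-token probabilities in [0, 1].
    # Returns one length per batch item: the first frame whose probability
    # exceeds the threshold, or max_frames if it never does.
    stop_outputs = np.asarray(stop_outputs)
    lengths = []
    for probs in stop_outputs:
        over = np.where(probs > threshold)[0]
        lengths.append(int(over[0]) + 1 if len(over) else len(probs))
    return lengths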
Code example #3
    def plot_result(self, pred, target):

        os.makedirs(os.path.join(self.config['outdir'], 'plots'), exist_ok=True)

        plot_spectrogram(pred,
                         os.path.join(self.config['outdir'], 'plots','mel-before-{}.png'.format(self.steps)),
                         target_spectrogram=target)
Code example #4
def save_current_model(args, checkpoint_path, global_step, hparams, loss,
                       model, plot_dir, saver, sess, step, wav_dir):
    # Save model and current global step
    saver.save(sess, checkpoint_path, global_step=global_step)
    log('\nSaving alignment, Mel-Spectrograms and Griffin-Lim inverted waveform..')
    input_seq, mel_prediction, linear_prediction, attention_mask_sample, targets_mel, target_length, linear_target = sess.run(
        [
            model.inputs[0],
            model.post_net_predictions[0],
            model.mag_pred[0],
            model.alignments[0],
            model.targets_mel[0],
            model.targets_length[0],
            model.targets_mag[0],
        ])
    alignments, alignment_titles = get_alignments(attention_mask_sample)
    # save griffin lim inverted wav for debug (linear -> wav)
    wav = audio.inv_linear_spectrogram(linear_prediction.T, hparams)
    audio.save_wav(wav,
                   os.path.join(wav_dir, '{}-linear.wav'.format(step)),
                   sr=hparams.sample_rate)
    # Save real and predicted linear-spectrogram plot to disk (control purposes)
    plot.plot_spectrogram(
        linear_prediction,
        os.path.join(plot_dir, '{}-linear-spectrogram.png'.format(step)),
        title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(),
                                                    step, loss),
        target_spectrogram=linear_target,
        max_len=target_length,
        auto_aspect=True)
    # save griffin lim inverted wav for debug (mel -> wav)
    wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
    audio.save_wav(wav,
                   os.path.join(wav_dir, '{}-mel.wav'.format(step)),
                   sr=hparams.sample_rate)
    # save alignment plot to disk (control purposes)
    for i in range(len(alignments)):
        plot.plot_alignment(
            alignments[i],
            os.path.join(plot_dir,
                         '{}_{}-align.png'.format(step, alignment_titles[i])),
            title='{}, {}, step={}, loss={:.5f}'.format(
                args.model, time_string(), step, loss),
            max_len=target_length // hparams.reduction_factor)
    # save real and predicted mel-spectrogram plot to disk (control purposes)
    plot.plot_spectrogram(mel_prediction,
                          os.path.join(plot_dir,
                                       '{}-mel-spectrogram.png'.format(step)),
                          title='{}, {}, step={}, loss={:.5f}'.format(
                              args.model, time_string(), step, loss),
                          target_spectrogram=targets_mel,
                          max_len=target_length)
    log('Input at step {}: {}'.format(step, sequence_to_text(input_seq)))
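
audio.inv_linear_spectrogram and audio.inv_mel_spectrogram are project-specific: they undo the dB scaling and normalization defined in hparams and then run Griffin-Lim to recover a waveform. The snippet below approximates only the Griffin-Lim step with librosa, assuming a linear-scale magnitude spectrogram; it is a sketch, not this project's actual audio module.

import librosa


def griffin_lim_invert(magnitude, hop_length=256, win_length=1024, n_iter=60):
    # magnitude: linear-scale magnitude spectrogram of shape (1 + n_fft // 2, frames).
    # Griffin-Lim iteratively estimates the phase that was discarded when only the
    # magnitude was kept, and returns a time-domain waveform.
    return librosa.griffinlim(magnitude, n_iter=n_iter,
                              hop_length=hop_length, win_length=win_length)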
Code example #5
    def plot_result(self, mel_pred, mel_target, alig):
        os.makedirs(os.path.join(self.config['outdir'], 'plots'),
                    exist_ok=True)

        plot_spectrogram(mel_pred,
                         os.path.join(self.config['outdir'], 'plots',
                                      'mel-before-{}.png'.format(self.steps)),
                         target_spectrogram=mel_target)

        plot_alignment(
            alig,
            os.path.join(self.config['outdir'], 'plots',
                         'alig-{}.png'.format(self.steps)))
Code example #6
def train(log_dir, args):
	save_dir = os.path.join(log_dir, 'pretrained/')
	checkpoint_path = os.path.join(save_dir, 'model.ckpt')
	input_path = os.path.join(args.base_dir, args.input)
	plot_dir = os.path.join(log_dir, 'plots')
	os.makedirs(plot_dir, exist_ok=True)
	log('Checkpoint path: {}'.format(checkpoint_path))
	log('Loading training data from: {}'.format(input_path))
	log('Using model: {}'.format(args.model))
	log(hparams_debug_string())

	#Set up data feeder
	coord = tf.train.Coordinator()
	with tf.variable_scope('datafeeder') as scope:
		feeder = Feeder(coord, input_path, hparams)

	#Set up model:
	step_count = 0
	try:
		#simple text file to keep count of global step
		with open(os.path.join(log_dir, 'step_counter.txt'), 'r') as file:
			step_count = int(file.read())
	except:
		print('no step_counter file found, assuming there is no saved checkpoint')

	global_step = tf.Variable(step_count, name='global_step', trainable=False)
	with tf.variable_scope('model') as scope:
		model = create_model(args.model, hparams)
		model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets, feeder.token_targets)
		model.add_loss()
		model.add_optimizer(global_step)
		stats = add_stats(model)

	#Book keeping
	step = 0
	time_window = ValueWindow(100)
	loss_window = ValueWindow(100)
	saver = tf.train.Saver(max_to_keep=5)

	#Memory allocation on the GPU as needed
	config = tf.ConfigProto()
	config.gpu_options.allow_growth = True

	#Train
	with tf.Session(config=config) as sess:
		try:
			summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
			sess.run(tf.global_variables_initializer())

			#saved model restoring
			checkpoint_state = None
			if args.restore:
				#Restore the saved model if the user requested it (default: True).
				try:
					checkpoint_state = tf.train.get_checkpoint_state(save_dir)
				except tf.errors.OutOfRangeError as e:
					log('Cannot restore checkpoint: {}'.format(e))

			if (checkpoint_state and checkpoint_state.model_checkpoint_path):
				log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path))
				saver.restore(sess, checkpoint_state.model_checkpoint_path)

			else:
				if not args.restore:
					log('Starting new training!')
				else:
					log('No model to load at {}'.format(save_dir))

			#initiating feeder
			feeder.start_in_session(sess)

			#Training loop
			while not coord.should_stop():
				start_time = time.time()
				step, loss, opt = sess.run([global_step, model.loss, model.optimize])
				time_window.append(time.time() - start_time)
				loss_window.append(loss)
				message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
					step, time_window.average, loss, loss_window.average)
				log(message, end='\r')

				if loss > 100 or np.isnan(loss):
					log('Loss exploded to {:.5f} at step {}'.format(loss, step))
					raise Exception('Loss exploded')

				if step % args.summary_interval == 0:
					log('\nWriting summary at step: {}'.format(step))
					summary_writer.add_summary(sess.run(stats), step)
				
				if step % args.checkpoint_interval == 0:
					with open(os.path.join(log_dir,'step_counter.txt'), 'w') as file:
						file.write(str(step))
					log('Saving checkpoint to: {}-{}'.format(checkpoint_path, step))
					saver.save(sess, checkpoint_path, global_step=step)
					# Unlike the original Tacotron, we don't save audio here
					# because WaveNet has yet to be wired in as the vocoder.
					log('Saving alignment and Mel-Spectrograms..')
					input_seq, prediction, alignment, target = sess.run([model.inputs[0],
							 model.mel_outputs[0],
							 model.alignments[0],
							 model.mel_targets[0],
							 ])
					#save predicted spectrogram to disk (for plot and manual evaluation purposes)
					mel_filename = 'ljspeech-mel-prediction-step-{}.npy'.format(step)
					np.save(os.path.join(log_dir, mel_filename), prediction, allow_pickle=False)

					#save alignment plot to disk (control purposes)
					plot.plot_alignment(alignment, os.path.join(plot_dir, 'step-{}-align.png'.format(step)),
						info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss))
					#save real mel-spectrogram plot to disk (control purposes)
					plot.plot_spectrogram(target, os.path.join(plot_dir, 'step-{}-real-mel-spectrogram.png'.format(step)),
						info='{}, {}, step={}, Real'.format(args.model, time_string(), step))
					#save predicted mel-spectrogram plot to disk (control purposes)
					plot.plot_spectrogram(prediction, os.path.join(plot_dir, 'step-{}-pred-mel-spectrogram.png'.format(step)),
						info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss))
					log('Input at step {}: {}'.format(step, sequence_to_text(input_seq)))

		except Exception as e:
			log('Exiting due to exception: {}'.format(e), slack=True)
			traceback.print_exc()
			coord.request_stop(e)
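
ValueWindow above is a small moving-average helper used for the sec/step and loss readouts; a minimal sketch of what such a class does (the real implementation may differ in details):

class ValueWindow:
    def __init__(self, window_size=100):
        self._window_size = window_size
        self._values = []

    def append(self, x):
        # Keep only the most recent window_size values.
        self._values = self._values[-(self._window_size - 1):] + [x]

    @property
    def average(self):
        return sum(self._values) / max(len(self._values), 1)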
Code example #7
def main():

    with tf.device('/cpu:0'):  # CPU is faster here; setting this to GPU raises an error, and omitting tf.device is slower still.
        config = get_arguments()
        started_datestring = "{0:%Y-%m-%dT%H-%M-%S}".format(datetime.now())
        logdir = os.path.join(config.logdir, 'generate', started_datestring)
        print('logdir0-------------' + logdir)

        if not os.path.exists(logdir):
            os.makedirs(logdir)

        load_hparams(hparams, config.checkpoint_dir)

        sess = tf.Session()
        scalar_input = hparams.scalar_input
        net = WaveNetModel(
            batch_size=config.batch_size,
            dilations=hparams.dilations,
            filter_width=hparams.filter_width,
            residual_channels=hparams.residual_channels,
            dilation_channels=hparams.dilation_channels,
            quantization_channels=hparams.quantization_channels,
            out_channels=hparams.out_channels,
            skip_channels=hparams.skip_channels,
            use_biases=hparams.use_biases,
            scalar_input=hparams.scalar_input,
            global_condition_channels=hparams.gc_channels,
            global_condition_cardinality=config.gc_cardinality,
            local_condition_channels=hparams.num_mels,
            upsample_factor=hparams.upsample_factor,
            legacy=hparams.legacy,
            residual_legacy=hparams.residual_legacy,
            train_mode=False
        )  # During training, global_condition_cardinality is inferred by the AudioReader; here it must be passed in explicitly.

        if scalar_input:
            samples = tf.placeholder(tf.float32, shape=[net.batch_size, None])
        else:
            samples = tf.placeholder(
                tf.int32, shape=[net.batch_size, None]
            )  # samples: mu-law encoded values, before one-hot conversion. Shape: (batch_size, length)

        # The local condition should be (N, T, num_mels), but it is fed one frame at a time as (N, 1, num_mels); squeezing gives (N, num_mels).
        upsampled_local_condition = tf.placeholder(
            tf.float32, shape=[net.batch_size, hparams.num_mels])

        next_sample = net.predict_proba_incremental(
            samples, upsampled_local_condition, [config.gc_id] * net.batch_size
        )  # Uses the Fast WaveNet generation algorithm (arXiv:1611.09482).

        # Build the upsampled local condition data that will be fed into the upsampled_local_condition placeholder.
        print('logdir0-------------' + logdir)
        mel_input = np.load(config.mel)
        sample_size = mel_input.shape[0] * hparams.hop_size
        mel_input = np.tile(mel_input, (config.batch_size, 1, 1))
        with tf.variable_scope('wavenet', reuse=tf.AUTO_REUSE):
            upsampled_local_condition_data = net.create_upsample(
                mel_input, upsample_type=hparams.upsample_type)

        var_list = [
            var for var in tf.global_variables() if 'queue' not in var.name
        ]
        saver = tf.train.Saver(var_list)
        print('Restoring model from {}'.format(config.checkpoint_dir))

        load(saver, sess, config.checkpoint_dir)
        init_op = tf.group(tf.initialize_all_variables(),
                           net.queue_initializer)

        sess.run(init_op)  # Without this, the variables keep the values restored from the checkpoint.

        quantization_channels = hparams.quantization_channels
        if config.wav_seed:
            # If wav_seed is shorter than the receptive field it arguably should be padded; instead it is returned as-is, so a seed that is too short causes an error.
            seed = create_seed(config.wav_seed, hparams.sample_rate,
                               quantization_channels, net.receptive_field,
                               scalar_input)  # mu-law encoded
            if scalar_input:
                waveform = seed.tolist()
            else:
                waveform = sess.run(
                    seed).tolist()  # [116, 114, 120, 121, 127, ...]

            print('Priming generation...')
            # The very last seed sample is fed in the first iteration of the generation loop below.
            for i, x in enumerate(waveform[-net.receptive_field:-1]):
                if i % 100 == 0:
                    print('Priming sample {}/{}'.format(
                        i, net.receptive_field),
                          end='\r')
                sess.run(next_sample,
                         feed_dict={
                             samples:
                             np.array([x] * net.batch_size).reshape(
                                 net.batch_size, 1),
                             upsampled_local_condition:
                             np.zeros([net.batch_size, hparams.num_mels])
                         })
            print('Done.')
            waveform = np.array([waveform[-net.receptive_field:]] *
                                net.batch_size)
        else:
            # Silence with a single random sample at the end.
            if scalar_input:
                waveform = [0.0] * (net.receptive_field - 1)
                waveform = np.array(waveform * net.batch_size).reshape(
                    net.batch_size, -1)
                waveform = np.concatenate(
                    [
                        waveform, 2 * np.random.rand(net.batch_size).reshape(
                            net.batch_size, -1) - 1
                    ],
                    axis=-1)  # Append one random number in [-1, 1] at the end.
                # waveform: shape (batch_size, net.receptive_field)
            else:
                # Build receptive_field - 1 samples first; one random sample is appended below.
                waveform = [quantization_channels / 2] * (net.receptive_field - 1)
                waveform = np.array(waveform * net.batch_size).reshape(
                    net.batch_size, -1)
                waveform = np.concatenate(
                    [
                        waveform,
                        np.random.randint(quantization_channels,
                                          size=net.batch_size).reshape(
                                              net.batch_size, -1)
                    ],
                    axis=-1)  # before one-hot conversion; shape (batch_size, 5117)

        start_time = time.time()
        upsampled_local_condition_data = sess.run(
            upsampled_local_condition_data)
        last_sample_timestamp = datetime.now()
        for step in range(sample_size):  # Loop sample_size times to generate the desired output length.

            window = waveform[:, -1:]  # Feed only the most recent sample; window: shape (N, 1)

            # Run the WaveNet to predict the next sample.
            # In the non-incremental case, window would hold the full context, e.g. [128.0, 128.0, ..., 128.0, 178, 185];
            # in the fast (incremental) case, window is a single sample per batch element.
            prediction = sess.run(
                next_sample,
                feed_dict={
                    samples:
                    window,
                    upsampled_local_condition:
                    upsampled_local_condition_data[:, step, :]
                }
            )  # samples is mu-law encoded; it is converted to one-hot internally. --> (batch_size, 256)

            if scalar_input:
                sample = prediction  # Sampled from a logistic distribution, so there is inherent randomness.
            else:
                # Scale the prediction distribution using the temperature.
                # If config.temperature == 1, this just divides each element by the sum; since softmax
                # has already been applied the sum is 1, so the values are unchanged.
                # If config.temperature != 1, the log of each element is divided by the temperature and
                # the result is rescaled so it sums to 1.
                np.seterr(divide='ignore')
                scaled_prediction = np.log(
                    prediction
                ) / config.temperature  # unchanged when config.temperature == 1
                scaled_prediction = (
                    scaled_prediction - np.logaddexp.reduce(
                        scaled_prediction, axis=-1, keepdims=True)
                )  # np.log(np.sum(np.exp(scaled_prediction)))
                scaled_prediction = np.exp(scaled_prediction)
                np.seterr(divide='warn')

                # Prediction distribution at temperature=1.0 should be unchanged after
                # scaling.
                if config.temperature == 1.0:
                    np.testing.assert_allclose(
                        prediction,
                        scaled_prediction,
                        atol=1e-5,
                        err_msg=
                        'Prediction scaling at temperature=1.0 is not working as intended.'
                    )

                # Since the sample is not chosen by argmax, the output can differ even for identical inputs.
                sample = [[
                    np.random.choice(np.arange(quantization_channels), p=p)
                ] for p in scaled_prediction]  # choose one sample per batch

            waveform = np.concatenate([waveform, sample],
                                      axis=-1)  #window.shape: (N,1)

            # Show progress only once per second.
            current_sample_timestamp = datetime.now()
            time_since_print = current_sample_timestamp - last_sample_timestamp
            if time_since_print.total_seconds() > 1.:
                duration = time.time() - start_time
                print('Sample {:<3d}/{:<3d}, ({:.3f} sec/step)'.format(
                    step + 1, sample_size, duration),
                      end='\r')
                last_sample_timestamp = current_sample_timestamp

        # Introduce a newline to clear the carriage return from the progress.
        print()

        # Save the result as a wav file.
        if hparams.input_type == 'raw':
            out = waveform[:, net.receptive_field:]
        elif hparams.input_type == 'mulaw':
            decode = mu_law_decode(samples,
                                   quantization_channels,
                                   quantization=False)
            out = sess.run(
                decode, feed_dict={samples: waveform[:, net.receptive_field:]})
        else:  # 'mulaw-quantize'
            decode = mu_law_decode(samples,
                                   quantization_channels,
                                   quantization=True)
            out = sess.run(
                decode, feed_dict={samples: waveform[:, net.receptive_field:]})

        # save wav

        for i in range(net.batch_size):
            config.wav_out_path = logdir + '/test-{}.wav'.format(i)
            mel_path = config.wav_out_path.replace(".wav", ".png")

            gen_mel_spectrogram = audio.melspectrogram(out[i], hparams).astype(
                np.float32).T
            audio.save_wav(out[i], config.wav_out_path,
                           hparams.sample_rate)  # save_wav modifies out[i] in place.

            plot.plot_spectrogram(gen_mel_spectrogram,
                                  mel_path,
                                  title='generated mel spectrogram',
                                  target_spectrogram=mel_input[i])
        print('Finished generating.')
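
The temperature-scaling block above re-normalizes an already-softmaxed distribution in log space. The same idea in isolation, as a self-contained NumPy sketch:

import numpy as np


def apply_temperature(probs, temperature):
    # Rescale a probability distribution; temperature == 1.0 leaves it unchanged,
    # temperature < 1.0 sharpens it, temperature > 1.0 flattens it.
    log_probs = np.log(probs) / temperature
    # Subtract log-sum-exp so the result sums to 1 after exponentiation.
    log_probs -= np.logaddexp.reduce(log_probs, axis=-1, keepdims=True)
    return np.exp(log_probs)


probs = np.array([0.1, 0.2, 0.7])
assert np.allclose(apply_temperature(probs, 1.0), probs, atol=1e-6)
print(apply_temperature(probs, 0.5))  # sharper distribution, still sums to 1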
Code example #8
    def _train_epoch(self, dataloader):
        self.model.train()

        running_loss = 0.0
        running_l1_loss = 0.0
        running_ssim_loss = 0.0
        running_drn_loss = 0.0

        pbar = tqdm(dataloader, unit="audios", unit_scale=dataloader.batch_size, \
                    disable=self.hparams.trainer.disable_progress_bar)
        for it, batch in enumerate(pbar, start=1):
            self.optimizer.zero_grad()

            mels, mlens, texts, tlens, durations = \
                batch['mels'], batch['mlens'].squeeze(1), batch['texts'].long(), batch['tlens'].squeeze(1), batch['drns'].long()
            mels, mlens, texts, tlens, durations = \
                mels.to(self.device), mlens.to(self.device), texts.to(self.device), tlens.to(self.device), durations.to(self.device)

            mels = self.normalizer(mels)

            melspecs, prd_durans = self.model((texts, tlens, durations, 1.0))
            outputs_and_targets = (melspecs, mels, mlens, tlens, durations,
                                   prd_durans)
            loss, l1_loss, ssim_loss, drn_loss = self.compute_metrics(
                outputs_and_targets)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1)
            self.optimizer.step()
            self.step += 1

            loss, l1_loss, ssim_loss, drn_loss = \
                loss.item(), l1_loss.item(), ssim_loss.item(), drn_loss.item()
            running_loss += loss
            running_l1_loss += l1_loss
            running_ssim_loss += ssim_loss
            running_drn_loss += drn_loss

            # update the progress bar
            pbar.set_postfix({
                'l1': "%.05f" % (running_l1_loss / it),
                'ssim': "%.05f" % (running_ssim_loss / it),
                'drn': "%.05f" % (running_drn_loss / it)
            })

            mels, melspecs = mels.cpu().detach(), melspecs.cpu().detach()
            index = -1
            mlen, tlen = mlens[index].item(), tlens[index].item()
            mels_fig = plot_spectrogram(
                melspecs[index, :mlen, :],
                target_spectrogram=mels[index, :mlen, :])
            self.loggers.log_step(
                'train', self.step, {
                    'step_l1_loss': l1_loss,
                    'step_ssim_loss': ssim_loss,
                    'step_drn_loss': drn_loss
                }, {'melspecs': mels_fig})

        epoch_loss = running_loss / it
        epoch_l1_loss = running_l1_loss / it
        epoch_ssim_loss = running_ssim_loss / it
        epoch_drn_loss = running_drn_loss / it

        return epoch_loss, epoch_l1_loss, epoch_ssim_loss, epoch_drn_loss
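
self.normalizer is not shown in this example; it is typically a per-channel standardization of the mel frames using statistics precomputed over the training set. A minimal sketch under that assumption (class name and shapes are hypothetical):

import torch


class MelNormalizer:
    def __init__(self, mean, std):
        # mean, std: tensors of shape (n_mels,), precomputed over the training set.
        self.mean = mean
        self.std = std

    def __call__(self, mels):
        # mels: (batch, frames, n_mels); broadcasting applies the stats per channel.
        return (mels - self.mean) / self.std


n_mels = 80
normalizer = MelNormalizer(torch.zeros(n_mels), torch.ones(n_mels))
print(normalizer(torch.randn(2, 120, n_mels)).shape)  # torch.Size([2, 120, 80])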
Code example #9
    def _train_epoch(self, dataloader=None):
        self.model.train()

        ll = len(dataloader)
        running_loss = 0.0
        running_l1_loss = 0.0
        running_ssim_loss = 0.0
        running_att_loss = 0.0

        pbar = tqdm(dataloader, unit="audios", unit_scale=dataloader.batch_size, \
                    disable=self.hparams.trainer.disable_progress_bar)
        for it, batch in enumerate(pbar, start=1):
            self.optimizer.zero_grad()

            mels, mlens, texts, tlens = \
                batch['mels'], batch['mlens'].squeeze(1), batch['texts'].long(), batch['tlens'].squeeze(1)
            mels, mlens, texts, tlens = \
                mels.to(self.device), mlens.to(self.device), texts.to(self.device), tlens.to(self.device)

            s = mels = self.normalizer(mels)

            # Spectrogram augmentation
            if self.hparams.duration.enable_augment:
                s = add_random_noise(mels, self.hparams.duration.noise)
                s = degrade_some(self.model, s, texts, tlens, \
                                self.hparams.duration.feed_ratio, repeat=self.hparams.duration.feed_repeat)
                s = frame_dropout(s, self.hparams.duration.replace_ratio)

            melspecs, attns = self.model((texts, tlens, s, True))
            outputs_and_targets = (melspecs, mels, attns, mlens, tlens)
            loss, l1_loss, ssim_loss, att_loss = self.compute_metrics(
                outputs_and_targets)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1)
            self.optimizer.step()
            self.step += 1

            loss, l1_loss, ssim_loss, att_loss = \
                loss.item(), l1_loss.item(), ssim_loss.item(), att_loss.item()
            running_loss += loss
            running_l1_loss += l1_loss
            running_ssim_loss += ssim_loss
            running_att_loss += att_loss

            # update the progress bar
            pbar.set_postfix({
                'l1': "%.05f" % (running_l1_loss / it),
                'ssim': "%.05f" % (running_ssim_loss / it),
                'att': "%.05f" % (running_att_loss / it)
            })

            mels, melspecs, attns = \
                mels.cpu().detach(), melspecs.cpu().detach(), attns.cpu().detach()
            index = -1
            mlen, tlen = mlens[index].item(), tlens[index].item()
            mels_fig = plot_spectrogram(
                melspecs[index, :mlen, :],
                target_spectrogram=mels[index, :mlen, :])
            attn_fig = plot_alignment(attns[index, :mlen, :tlen])
            self.loggers.log_step(
                'train', self.step, {
                    'step_l1_loss': l1_loss,
                    'step_ssim_loss': ssim_loss,
                    'step_att_loss': att_loss
                }, {
                    'melspecs': mels_fig,
                    'attention': attn_fig
                })

        epoch_loss = running_loss / ll
        epoch_l1_loss = running_l1_loss / ll
        epoch_ssim_loss = running_ssim_loss / ll
        epoch_att_loss = running_att_loss / ll

        return epoch_loss, epoch_l1_loss, epoch_ssim_loss, epoch_att_loss
Code example #10
def eval_step(sess, logdir, step, waveform, upsampled_local_condition_data,
              speaker_id_data, mel_input_data, samples, speaker_id,
              upsampled_local_condition, next_sample, temperature=1.0):
    waveform = waveform[:, :1]
    
    sample_size = upsampled_local_condition_data.shape[1]
    last_sample_timestamp = datetime.now()
    start_time = time.time()
    for step2 in range(sample_size):  # Loop sample_size times to generate the desired output length.
        window = waveform[:, -1:]  # Feed only the most recent sample; window: shape (N, 1)
        

        prediction = sess.run(next_sample,
                              feed_dict={samples: window,
                                         upsampled_local_condition: upsampled_local_condition_data[:, step2, :],
                                         speaker_id: speaker_id_data})


        if hparams.scalar_input:
            sample = prediction  # Sampled from a logistic distribution, so there is inherent randomness.
        else:
            # Scale the prediction distribution using the temperature.
            # If temperature == 1, this just divides each element by the sum; since softmax has already
            # been applied the sum is 1, so the values are unchanged.
            # If temperature != 1, the log of each element is divided by the temperature and the result
            # is rescaled so it sums to 1.
            np.seterr(divide='ignore')
            scaled_prediction = np.log(prediction) / temperature  # unchanged when temperature == 1
            scaled_prediction = (scaled_prediction - np.logaddexp.reduce(scaled_prediction, axis=-1, keepdims=True))  # np.log(np.sum(np.exp(scaled_prediction)))
            scaled_prediction = np.exp(scaled_prediction)
            np.seterr(divide='warn')
    
            # Prediction distribution at temperature=1.0 should be unchanged after
            # scaling.
            if temperature == 1.0:
                np.testing.assert_allclose( prediction, scaled_prediction, atol=1e-5, err_msg='Prediction scaling at temperature=1.0 is not working as intended.')
            
            # Since the sample is not chosen by argmax, the output can differ even for identical inputs.
            sample = [[np.random.choice(np.arange(hparams.quantization_channels), p=p)]
                      for p in scaled_prediction]  # choose one sample per batch
        
        waveform = np.concatenate([waveform, sample], axis=-1)  # window.shape: (N, 1)

        # Show progress only once per second.
        current_sample_timestamp = datetime.now()
        time_since_print = current_sample_timestamp - last_sample_timestamp
        if time_since_print.total_seconds() > 1.:
            duration = time.time() - start_time
            print('Sample {:<3d}/{:<3d}, ({:.3f} sec/step)'.format(step2 + 1, sample_size, duration), end='\r')
            last_sample_timestamp = current_sample_timestamp
    
    print('\n')
    # Save the result as a wav file.    
    if hparams.input_type == 'raw':
        out = waveform[:, 1:]
    elif hparams.input_type == 'mulaw':
        decode = mu_law_decode(samples, hparams.quantization_channels, quantization=False)
        out = sess.run(decode, feed_dict={samples: waveform[:, 1:]})
    else:  # 'mulaw-quantize'
        decode = mu_law_decode(samples, hparams.quantization_channels, quantization=True)
        out = sess.run(decode, feed_dict={samples: waveform[:, 1:]})
        
        
    # save wav
    
    for i in range(1):
        wav_out_path = logdir + '/test-{}-{}.wav'.format(step, i)
        mel_path = wav_out_path.replace(".wav", ".png")

        gen_mel_spectrogram = audio.melspectrogram(out[i], hparams).astype(np.float32).T
        audio.save_wav(out[i], wav_out_path, hparams.sample_rate)  # save_wav modifies out[i] in place.

        plot.plot_spectrogram(gen_mel_spectrogram, mel_path,
                              title='generated mel spectrogram{}'.format(step),
                              target_spectrogram=mel_input_data[i])
Code example #11
def run_eval(args, eval_dir, eval_model, eval_plot_dir, eval_wav_dir, feeder,
             hparams, sess, step, summary_writer):
    # Run eval and save eval stats
    log('\nRunning evaluation at step {}'.format(step))
    sum_eval_loss = 0.0
    sum_mel_loss = 0.0
    sum_stop_token_loss = 0.0
    sum_linear_loss = 0.0
    count = 0.0
    mel_p = None
    mel_t = None
    t_len = None
    attention_mask_sample = None
    lin_p = None
    lin_t = None
    for _ in tqdm(range(feeder.test_steps)):
        test_eloss, test_mel_loss, test_stop_token_loss, test_linear_loss, mel_p, mel_t, t_len, attention_mask_sample, lin_p, lin_t = sess.run(
            [
                eval_model.loss,
                eval_model.mel_loss,
                eval_model.stop_token_loss,
                eval_model.linear_loss,
                eval_model.post_net_predictions[0],
                eval_model.targets_mel[0],
                eval_model.targets_length[0],
                eval_model.alignments[0],
                eval_model.mag_pred[0],
                eval_model.targets_mag[0],
            ])
        sum_eval_loss += test_eloss
        sum_mel_loss += test_mel_loss
        sum_stop_token_loss += test_stop_token_loss
        sum_linear_loss += test_linear_loss
        count += 1.0
    wav = audio.inv_linear_spectrogram(lin_p.T, hparams)
    audio.save_wav(wav,
                   os.path.join(eval_wav_dir,
                                '{}-eval-linear.wav'.format(step)),
                   sr=hparams.sample_rate)
    if count > 0.0:
        eval_loss = sum_eval_loss / count
        mel_loss = sum_mel_loss / count
        stop_token_loss = sum_stop_token_loss / count
        linear_loss = sum_linear_loss / count
    else:
        eval_loss = sum_eval_loss
        mel_loss = sum_mel_loss
        stop_token_loss = sum_stop_token_loss
        linear_loss = sum_linear_loss
    log('Saving eval log to {}..'.format(eval_dir))
    # Save some log to monitor model improvement on same unseen sequence
    wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
    audio.save_wav(wav,
                   os.path.join(eval_wav_dir, '{}-eval-mel.wav'.format(step)),
                   sr=hparams.sample_rate)
    alignments, alignment_titles = get_alignments(attention_mask_sample)
    for i in range(len(alignments)):
        plot.plot_alignment(alignments[i],
                            os.path.join(
                                eval_plot_dir, '{}_{}-eval-align.png'.format(
                                    step, alignment_titles[i])),
                            title='{}, {}, step={}, loss={:.5f}'.format(
                                args.model, time_string(), step, eval_loss),
                            max_len=t_len // hparams.reduction_factor)
    plot.plot_spectrogram(
        mel_p,
        os.path.join(eval_plot_dir,
                     '{}-eval-mel-spectrogram.png'.format(step)),
        title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(),
                                                    step, eval_loss),
        target_spectrogram=mel_t,
        max_len=t_len)
    plot.plot_spectrogram(
        lin_p,
        os.path.join(eval_plot_dir,
                     '{}-eval-linear-spectrogram.png'.format(step)),
        title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(),
                                                    step, eval_loss),
        target_spectrogram=lin_t,
        max_len=t_len,
        auto_aspect=True)
    log('Eval loss for global step {}: {:.3f}'.format(step, eval_loss))
    log('Writing eval summary!')
    add_eval_stats(summary_writer, step, linear_loss, mel_loss,
                   stop_token_loss, eval_loss)
Code example #12
import librosa
import torch

from datasets.audio.stft import TacotronSTFT
from utils.plot import plot_spectrogram

fullpath = '../audios/LJ001-0007.wav'

filter_length = 1024
hop_length = 256
win_length = 1024
n_mel_channels = 80
sampling_rate = 22050
mel_fmin = 0.0 # 80.0
mel_fmax = 8000.0 # 7600.0

stft = TacotronSTFT(filter_length=filter_length,
                    hop_length=hop_length,
                    win_length=win_length,
                    n_mel_channels=n_mel_channels,
                    sampling_rate=sampling_rate,
                    mel_fmin=mel_fmin,
                    mel_fmax=mel_fmax)

wav, sr = librosa.load(fullpath, sr=None)

assert sr == sampling_rate

wav = torch.from_numpy(wav).unsqueeze(0)
mel = stft.mel_spectrogram(wav).squeeze(0).t()

print(mel.size())
plot_spectrogram(pred_spectrogram=mel, save_img=True, path='test.png')
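
For comparison, roughly equivalent mel features can be computed with librosa alone. This is only a sketch: TacotronSTFT applies its own mel basis, compression and normalization, so the values will not match exactly.

import librosa
import numpy as np

wav, sr = librosa.load('../audios/LJ001-0007.wav', sr=None)
mel = librosa.feature.melspectrogram(y=wav, sr=sr, n_fft=1024, hop_length=256,
                                     win_length=1024, n_mels=80, fmin=0.0, fmax=8000.0)
log_mel = np.log(np.clip(mel, a_min=1e-5, a_max=None)).T  # (frames, n_mels)
print(log_mel.shape)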
Code example #13
    def synthesize(self, texts, basenames, log_dir, mel_filenames):
        hparams = self._hparams

        # Repeat last sample until number of samples is dividable by the number of GPUs (last run scenario)
        while len(texts) % hparams.synthesis_batch_size != 0:
            texts.append(texts[-1])
            basenames.append(basenames[-1])
            if mel_filenames is not None:
                mel_filenames.append(mel_filenames[-1])
        sequences = [np.asarray(text_to_sequence(text)) for text in texts]
        input_lengths = [len(seq) for seq in sequences]
        seqs, max_seq_len = self._prepare_inputs(sequences)

        feed_dict = {
            self.inputs: seqs,
            self.input_lengths: np.asarray(input_lengths, dtype=np.int32)
        }

        linears, mels, alignments, audio_length = self.session.run(
            [self.linear_outputs, self.mel_outputs, self.alignments[0], self.audio_length],
            feed_dict=feed_dict)
        # Natural batch synthesis
        # Get Mel/Linear lengths for the entire batch from stop_tokens predictions
        target_lengths = audio_length

        if basenames is None:
            # Generate wav and read it
            wav = audio.inv_mel_spectrogram(mels[0].T, hparams)
            audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate)  # Find a better way

            if platform.system() == 'Linux':
                # Linux wav reader
                os.system('aplay temp.wav')

            elif platform.system() == 'Windows':
                # windows wav reader
                os.system('start /min mplay32 /play /close temp.wav')

            else:
                raise RuntimeError(
                    'Your OS type is not supported yet, please add it to "centaur/synthesizer.py, line-165" and feel free to make a Pull Request ;) Thanks!')

            return

        for i, mel in enumerate(mels):

            if log_dir is not None:
                # save wav (mel -> wav)
                wav = audio.inv_mel_spectrogram(mel.T, hparams)
                audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-mel.wav'.format(basenames[i])),
                               sr=hparams.sample_rate)
                alignments_samples, alignment_titles = self.get_alignments(alignments)
                for idx in range(len(alignments_samples)):
                    # save alignments
                    plot.plot_alignment(alignments_samples[idx],
                                        os.path.join(log_dir, 'plots/{}.png'.format(alignment_titles[idx])),
                                        title='{}'.format(texts[i]), split_title=True, max_len=target_lengths[i])

                # save mel spectrogram plot
                plot.plot_spectrogram(mel,
                                      os.path.join(log_dir, 'plots/mel-{}.png'.format(basenames[i])),
                                      title='{}'.format(texts[i]), split_title=True)

                # save wav (linear -> wav)

                wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
                audio.save_wav(wav,
                               os.path.join(log_dir, 'wavs/wav-{}-linear.wav'.format(basenames[i])),
                               sr=hparams.sample_rate)

                # save linear spectrogram plot
                plot.plot_spectrogram(linears[i],
                                      os.path.join(log_dir, 'plots/linear-{}.png'.format(basenames[i])),
                                      title='{}'.format(texts[i]), split_title=True, auto_aspect=True)
Code example #14
File: synthesizer.py  Project: yqlihust/Tacotron-3
    def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames):
        hparams = self._hparams
        # [-max, max] or [0,max]
        t2_output_range = (-hparams.max_abs_value, hparams.max_abs_value) if hparams.symmetric_mels else (
            0, hparams.max_abs_value)

        # Repeat last sample until number of samples is dividable by the number of GPUs (last run scenario)
        while len(texts) % hparams.synthesis_batch_size != 0:
            texts.append(texts[-1])
            basenames.append(basenames[-1])
            if mel_filenames is not None:
                mel_filenames.append(mel_filenames[-1])

        seqs = [np.asarray(text_to_sequence(text)) for text in texts]
        input_lengths = [len(seq) for seq in seqs]
        input_seqs, max_seq_len = self._prepare_inputs(seqs)

        feed_dict = {
            self.inputs: input_seqs,
            self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
        }

        if self.gta:
            np_targets = [np.load(mel_filename) for mel_filename in mel_filenames]
            target_lengths = [len(np_target) for np_target in np_targets]
            target_seqs, max_target_len = self._prepare_targets(np_targets, self._hparams.outputs_per_step)
            feed_dict[self.targets] = target_seqs
            assert len(np_targets) == len(texts)
        linears = None
        if self.gta or not hparams.predict_linear:
            mels, alignments, stop_tokens = self.session.run(
                [self.mel_outputs, self.alignments, self.stop_token_prediction], feed_dict=feed_dict)

            # Natural batch synthesis
            # Get Mel lengths for the entire batch from stop_tokens predictions
            target_lengths = self._get_output_lengths(stop_tokens)

            # Take off the batch wise padding
            mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)]
            assert len(mels) == len(texts)

        else:
            linears, mels, alignments, stop_tokens = self.session.run(
                [self.linear_outputs, self.mel_outputs, self.alignments, self.stop_token_prediction],
                feed_dict=feed_dict)

            # Natural batch synthesis
            # Get Mel/Linear lengths for the entire batch from stop_tokens predictions
            target_lengths = self._get_output_lengths(stop_tokens)

            # Take off the batch wise padding
            mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)]
            linears = [linear[:target_length, :] for linear, target_length in zip(linears, target_lengths)]
            linears = np.clip(linears, t2_output_range[0], t2_output_range[1])
            assert len(mels) == len(linears) == len(texts)

        mels = np.clip(mels, t2_output_range[0], t2_output_range[1])

        if basenames is None:
            # Generate wav and read it
            wav = audio.inv_mel_spectrogram(mels[0].T, hparams)
            audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate)  # Find a better way

            if platform.system() == 'Linux':
                # Linux wav reader
                os.system('aplay temp.wav')

            elif platform.system() == 'Windows':
                # windows wav reader
                os.system('start /min mplay32 /play /close temp.wav')

            else:
                raise RuntimeError(
                    'Your OS type is not supported yet, please add it to "synthesizer.py, line-165" and feel free to make a Pull Request ;) Thanks!')

            return

        saved_mels_paths = []
        for i, mel in enumerate(mels):
            # Write the spectrogram to disk
            # Note: outputs mel-spectrogram files and target ones have same names, just different folders
            mel_filename = os.path.join(out_dir, 'mel-{}.npy'.format(basenames[i]))
            np.save(mel_filename, mel, allow_pickle=False)
            saved_mels_paths.append(mel_filename)

            if log_dir is not None:
                # save wav (mel -> wav)
                wav = audio.inv_mel_spectrogram(mel.T, hparams)
                audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-mel.wav'.format(basenames[i])),
                               sr=hparams.sample_rate)

                # save alignments
                plot.plot_alignment(alignments[i], os.path.join(log_dir, 'plots/alignment-{}.png'.format(basenames[i])),
                                    title='{}'.format(texts[i]), split_title=True, max_len=target_lengths[i])

                # save mel spectrogram plot
                plot.plot_spectrogram(mel, os.path.join(log_dir, 'plots/mel-{}.png'.format(basenames[i])),
                                      title='{}'.format(texts[i]), split_title=True)

                if linears is not None:
                    # save wav (linear -> wav)
                    wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
                    audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-linear.wav'.format(basenames[i])),
                                   sr=hparams.sample_rate)

                    # save linear spectrogram plot
                    plot.plot_spectrogram(linears[i], os.path.join(log_dir, 'plots/linear-{}.png'.format(basenames[i])),
                                          title='{}'.format(texts[i]), split_title=True, auto_aspect=True)

        return saved_mels_paths
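
self._prepare_inputs and self._prepare_targets are not shown; they pad each sequence in the batch to a common length. The sketch below illustrates the idea for integer text sequences (the real helpers may also round lengths up to a multiple of outputs_per_step and use a different pad value).

import numpy as np


def prepare_inputs(sequences, pad_value=0):
    # Pad each 1-D integer sequence with pad_value up to the longest length in the batch.
    max_len = max(len(seq) for seq in sequences)
    batch = np.stack([
        np.pad(seq, (0, max_len - len(seq)), mode='constant', constant_values=pad_value)
        for seq in sequences
    ])
    return batch, max_len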