コード例 #1
0
	def synthesize(self, text, index, out_dir, log_dir, mel_filename):
		"""Predict a mel spectrogram for one utterance and write it to disk.

		The text is converted with `text_to_sequence` using the cleaners listed
		(comma separated) in `hparams.cleaners`. When `self.gta` is set, the array
		stored at `mel_filename` is loaded and fed as the model's mel target. The
		prediction is reshaped to (-1, 80) and saved under `out_dir`. If `log_dir`
		is not None, a wav, an alignment plot and a spectrogram plot are written
		below it as well.

		Returns:
			Path of the saved .npy file.
		"""
		cleaners = [name.strip() for name in hparams.cleaners.split(',')]
		token_ids = text_to_sequence(text, cleaners)
		feeds = {
			self.model.inputs: [np.asarray(token_ids, dtype=np.int32)],
			self.model.input_lengths: np.asarray([len(token_ids)], dtype=np.int32),
		}
		if self.gta:
			# Feed the stored mel as the target, as a batch of one.
			feeds[self.model.mel_targets] = np.load(mel_filename).reshape(1, -1, 80)

		fetches = [self.mel_outputs, self.alignment]
		predicted, attention = self.session.run(fetches, feed_dict=feeds)
		predicted = predicted.reshape(-1, 80)

		# Predicted files and target files share names; only the folder differs.
		saved_path = os.path.join(out_dir, 'ljspeech-mel-{:05d}.npy'.format(index))
		np.save(saved_path, predicted, allow_pickle=False)

		if log_dir is None:
			return saved_path

		# Audio reconstructed from the predicted mel.
		waveform = audio.inv_mel_spectrogram(predicted.T)
		wav_path = os.path.join(log_dir, 'wavs/ljspeech-wav-{:05d}.wav'.format(index))
		audio.save_wav(waveform, wav_path)

		# Alignment plot, titled with the input text.
		alignment_png = os.path.join(log_dir, 'plots/ljspeech-alignment-{:05d}.png'.format(index))
		plot.plot_alignment(attention, alignment_png, info='{}'.format(text), split_title=True)

		# Spectrogram plot, titled with the input text.
		mel_png = os.path.join(log_dir, 'plots/ljspeech-mel-{:05d}.png'.format(index))
		plot.plot_spectrogram(predicted, mel_png, info='{}'.format(text), split_title=True)

		return saved_path
コード例 #2
0
ファイル: train.py プロジェクト: Qwaz/tacotron-ksss
def save_and_plot_fn(args, log_dir, step, loss, prefix):
    """Write the audio and the alignment plot for one training sample.

    Args:
        args: A pair ``(idx, (seq, spec, align))``.
        log_dir: Directory receiving the wav and png files.
        step: Training step, used in the file names and the plot info text.
        loss: Loss value, used in the plot info text.
        prefix: Leading part of both file names.
    """
    idx, (seq, spec, align) = args

    def artifact_path(pattern):
        # Both artifacts share the prefix/step/index naming scheme.
        return os.path.join(log_dir, pattern.format(prefix, step, idx))

    audio_path = artifact_path('{}-step-{:09d}-audio{:03d}.wav')
    align_path = artifact_path('{}-step-{:09d}-align{:03d}.png')

    save_audio(inv_spectrogram(spec.T), audio_path)

    info_text = 'step={:d}, loss={:.5f}'.format(step, loss)
    cleaner_names = [name.strip() for name in hparams.cleaners.split(',')]
    is_korean = 'korean_cleaners' in cleaner_names

    if is_korean:
        log('Training korean : Use jamo')
    else:
        log('Training non-korean : X use jamo')

    # The same flag drives both jamo combination and the plot's Korean mode.
    label = sequence_to_text(seq,
                             skip_eos_and_pad=True,
                             combine_jamo=is_korean)
    plot.plot_alignment(align,
                        align_path,
                        info=info_text,
                        text=label,
                        isKorean=is_korean)
コード例 #3
0
    def synthesize(self, text, mel, out_dir, idx):
        """Predict a mel spectrogram for `text`, using the mel file as target.

        The array stored at `mel` is loaded, clipped to the model output range
        and padded with `padding_targets` before being fed as the target. The
        first predicted mel is clipped, cut back to the unpadded target length,
        rescaled and clipped to [0, 1], then saved and plotted in `out_dir`
        together with the first alignment.

        Returns:
            Tuple ``(pred_mel_path, alignment_path)``.
        """
        hparams = self._hparams
        reduction = hparams.outputs_per_step

        # Output range depends on whether mels are symmetric around zero.
        if hparams.symmetric_mels:
            low, high = -hparams.max_abs_value, hparams.max_abs_value
        else:
            low, high = 0, hparams.max_abs_value

        reference = np.clip(np.load(mel), low, high)
        reference_frames = reference.shape[0]

        padded = padding_targets(reference, reduction, low)
        padded_frames = padded.shape[0]

        pyin, text = get_pyin(text)
        print(text)

        inputs = [np.asarray(text_to_sequence(pyin.split(' ')))]
        print(inputs)
        lengths = [len(inputs[0])]

        feeds = {
            self.inputs: np.asarray(inputs, dtype=np.int32),
            self.input_lengths: np.asarray(lengths, dtype=np.int32),
            self.targets: np.asarray([padded], dtype=np.float32),
            self.target_lengths: np.asarray([padded_frames], dtype=np.int32),
        }
        mels, alignments = self.session.run(
            [self.mel_outputs, self.alignments], feed_dict=feeds)

        predicted = mels[0]
        print('pred_mel.shape', predicted.shape)
        # Clip, drop the frames added by padding, then rescale into 0~1.0.
        predicted = np.clip(predicted, low, high)[:reference_frames, :]
        predicted = np.clip((predicted + high) / (2 * high), 0.0, 1.0)
        print(reference_frames, padded_frames)

        pred_mel_path = os.path.join(out_dir, 'mel-{}-pred.npy'.format(idx))
        np.save(pred_mel_path, predicted, allow_pickle=False)
        spectrogram_png = pred_mel_path.replace('.npy', '.png')
        plot.plot_spectrogram(predicted, spectrogram_png, title='')

        alignment_path = os.path.join(out_dir, 'align-{}.png'.format(idx))
        plot.plot_alignment(alignments[0], alignment_path, title='')

        return pred_mel_path, alignment_path
コード例 #4
0
    def synthesize(self, texts, basenames, out_dir, log_dir):
        """Predict features for a batch of texts and write them to `out_dir`.

        Each text is converted with `text_to_sequence`, the batch is prepared
        with `self._prepare_inputs` and run through the session. Every output
        is cut to the length derived from the stop tokens and saved as
        ``feature-NNN.npy`` (numbered from 1). When `log_dir` is not None, an
        alignment plot and a wav are written per item as well.

        `basenames` is accepted but not used by this method.
        """
        hparams = self._hparams
        cleaners = [name.strip() for name in hparams.cleaners.split(',')]

        encoded = []
        for text in texts:
            encoded.append(np.asarray(text_to_sequence(text, cleaners)))
        lengths = [len(item) for item in encoded]

        feeds = {
            self.model.inputs: self._prepare_inputs(encoded),
            self.model.input_lengths: np.asarray(lengths, dtype=np.int32),
        }
        fetches = [
            self.final_outputs, self.alignments, self.stop_token_outputs
        ]
        features, alignments, stop_tokens = self.session.run(
            fetches, feed_dict=feeds)

        # Per-item output lengths for the batch come from the stop tokens.
        output_lengths = self._get_output_lengths(stop_tokens)
        trimmed = []
        for feature, output_length in zip(features, output_lengths):
            trimmed.append(feature[:output_length, :])
        features = trimmed
        assert len(features) == len(texts)

        for position, feature in enumerate(features):
            number = position + 1

            # Output files share names with the target ones; folders differ.
            feature_path = os.path.join(out_dir,
                                        'feature-{:03d}.npy'.format(number))
            np.save(feature_path, feature, allow_pickle=False)

            if log_dir is None:
                continue

            # Alignment plot, titled with the input text.
            alignment_png = os.path.join(
                log_dir, 'plots/alignment-{:03d}.png'.format(number))
            plot.plot_alignment(alignments[position],
                                alignment_png,
                                info='{}'.format(texts[position]),
                                split_title=True)

            # Audio produced from the predicted feature.
            waveform = audio.synthesize(feature, hparams)
            wav_path = os.path.join(log_dir,
                                    'wavs/wav-{:03d}.wav'.format(number))
            audio.save_wav(waveform, wav_path, hparams)
コード例 #5
0
    def synthesize(self, texts, speaker_id, basenames, out_dir, log_dir,
                   mel_filenames):
        """Synthesize a batch of texts and save the predicted mel spectrograms.

        Args:
            texts: Input sentences, one per batch item.
            speaker_id: Accepted for interface compatibility. It is not used;
                every returned speaker id is the placeholder ``'<no_g>'``.
            basenames: Per-item names used to build all output file names.
            out_dir: Directory receiving one ``<basename>.npy`` per item (the
                mel is saved transposed).
            log_dir: If not None, wavs and plots are written below it.
            mel_filenames: Paths of target mels; only read when `self.gta`
                is set.

        Returns:
            Tuple ``(saved_mels_paths, speaker_ids)``.
        """
        hparams = self._hparams
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        seqs = [
            np.asarray(text_to_sequence(text, cleaner_names)) for text in texts
        ]
        input_lengths = [len(seq) for seq in seqs]
        seqs = self._prepare_inputs(seqs)
        feed_dict = {
            self.model.inputs: seqs,
            self.model.input_lengths: np.asarray(input_lengths,
                                                 dtype=np.int32),
        }

        if self.gta:
            np_targets = [
                np.load(mel_filename) for mel_filename in mel_filenames
            ]
            target_lengths = [len(np_target) for np_target in np_targets]
            padded_targets = self._prepare_targets(
                np_targets, hparams.outputs_per_step)
            feed_dict[self.model.mel_targets] = padded_targets.reshape(
                len(np_targets), -1, hparams.num_mels)

        if self.gta or not hparams.predict_linear:
            mels, alignments = self.session.run(
                [self.mel_outputs, self.alignments], feed_dict=feed_dict)
            if self.gta:
                # Take off the reduction factor padding frames for time
                # consistency with wavenet.
                mels = [
                    mel[:target_length, :]
                    for mel, target_length in zip(mels, target_lengths)
                ]
                assert len(mels) == len(np_targets)

        else:
            linears, mels, alignments, stop_tokens = self.session.run(
                [
                    self.linear_outputs, self.mel_outputs, self.alignments,
                    self.stop_token_prediction
                ],
                feed_dict=feed_dict)

            # Get mel/linear lengths for the whole batch from the stop token
            # predictions.
            target_lengths = self._get_output_lengths(stop_tokens)

            # Take off the batch-wise padding.
            mels = [
                mel[:target_length, :]
                for mel, target_length in zip(mels, target_lengths)
            ]
            linears = [
                linear[:target_length, :]
                for linear, target_length in zip(linears, target_lengths)
            ]
            assert len(mels) == len(linears) == len(texts)

        saved_mels_paths = []
        speaker_ids = []
        for i, mel in enumerate(mels):
            # Speaker id for global conditioning: always the placeholder here.
            speaker_ids.append('<no_g>')

            # Write the spectrogram to disk.
            # Note: output mel-spectrogram files and target ones have the same
            # names, just different folders.
            mel_filename = os.path.join(out_dir, '{}.npy'.format(basenames[i]))
            np.save(mel_filename, mel.T, allow_pickle=False)
            saved_mels_paths.append(mel_filename)

            if log_dir is None:
                continue

            # Save wav (mel -> wav).
            wav = audio.inv_mel_spectrogram(mel.T, hparams)
            audio.save_wav(
                wav,
                os.path.join(log_dir,
                             'wavs/wav-{}-mel.wav'.format(basenames[i])),
                hparams)

            # Save alignments.
            plot.plot_alignment(alignments[i],
                                os.path.join(
                                    log_dir,
                                    'plots/alignment-{}.png'.format(
                                        basenames[i])),
                                info='{}'.format(texts[i]),
                                split_title=True)

            # Save mel spectrogram plot.
            plot.plot_spectrogram(
                mel,
                os.path.join(log_dir,
                             'plots/mel-{}.png'.format(basenames[i])),
                info='{}'.format(texts[i]),
                split_title=True)

            # `linears` only exists when the linear outputs were fetched,
            # i.e. linear prediction is on and GTA is off.
            if hparams.predict_linear and not self.gta:
                # Save wav (linear -> wav).
                linear_wav = self.session.run(
                    self.linear_wav_outputs,
                    feed_dict={self.linear_spectrograms: linears[i]})
                wav = audio.inv_preemphasis(linear_wav,
                                            hparams.preemphasis)
                # Named after the basename like every other artifact of this
                # item (previously the loop index was used here).
                audio.save_wav(
                    wav,
                    os.path.join(log_dir,
                                 'wavs/wav-{}-linear.wav'.format(
                                     basenames[i])),
                    hparams)

                # Save linear spectrogram plot.
                plot.plot_spectrogram(linears[i],
                                      os.path.join(
                                          log_dir,
                                          'plots/linear-{}.png'.format(
                                              basenames[i])),
                                      info='{}'.format(texts[i]),
                                      split_title=True,
                                      auto_aspect=True)

        return saved_mels_paths, speaker_ids
コード例 #6
0
    def synthesize(self, text, index, out_dir, log_dir, mel_filename):
        """Synthesize one utterance and save (or play back) the result.

        Args:
            text: Sentence to synthesize.
            index: Identifier used in output file names. When None, nothing
                is saved to `out_dir`; the audio is written to ``temp.wav``
                and played through PyAudio instead.
            out_dir: Directory receiving ``mel-<index>.npy``.
            log_dir: If not None, wavs and plots are written below it.
            mel_filename: Path of the target mel; only read when `self.gta`
                is set.

        Returns:
            Tuple ``(mel_filename, speaker_id)``, or None in playback mode.

        Raises:
            RuntimeError: If ``hparams.gin_channels > 0``, because no rule
                for deriving the speaker id is implemented.
        """
        hparams = self._hparams
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        seq = text_to_sequence(text, cleaner_names)
        feed_dict = {
            self.model.inputs: [np.asarray(seq, dtype=np.int32)],
            self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
        }

        if self.gta:
            # Use the configured mel dimension, as the output reshape below
            # does, instead of a hard-coded 80.
            feed_dict[self.model.mel_targets] = np.load(mel_filename).reshape(
                1, -1, hparams.num_mels)

        # Linear outputs are fetched only outside GTA mode; remember that so
        # later code never touches an undefined `linear`.
        has_linear = hparams.predict_linear and not self.gta

        if has_linear:
            linear, mels, alignment = self.session.run(
                [self.linear_outputs, self.mel_outputs, self.alignment],
                feed_dict=feed_dict)
            linear = linear.reshape(-1, hparams.num_freq)
        else:
            mels, alignment = self.session.run(
                [self.mel_outputs, self.alignment], feed_dict=feed_dict)

        mels = mels.reshape(
            -1, hparams.num_mels)  #Thanks to @imdatsolak for pointing this out

        if index is None:
            # Generate the wav, then read it back for playback.
            wav = audio.inv_mel_spectrogram(mels.T, hparams)
            audio.save_wav(wav, 'temp.wav',
                           sr=hparams.sample_rate)  #Find a better way

            chunk = 512
            # The context manager and the finally clauses release the file,
            # the stream and PyAudio even if playback fails midway.
            with wave.open('temp.wav', 'rb') as f:
                p = pyaudio.PyAudio()
                try:
                    stream = p.open(
                        format=p.get_format_from_width(f.getsampwidth()),
                        channels=f.getnchannels(),
                        rate=f.getframerate(),
                        output=True)
                    try:
                        data = f.readframes(chunk)
                        while data:
                            stream.write(data)
                            data = f.readframes(chunk)
                    finally:
                        stream.stop_stream()
                        stream.close()
                finally:
                    p.terminate()
            return

        # Speaker id for global conditioning (only used with GTA generally).
        if hparams.gin_channels > 0:
            # A rule for the speaker id must be written here first, e.g.
            # derived from the file basename carried in `index`.
            raise RuntimeError(
                'Please set the speaker_id rule in line 89 of tacotron/synthesizer.py to allow for global condition usage later.'
            )
        speaker_id = '<no_g>'

        # Write the spectrogram to disk.
        # Note: output mel-spectrogram files and target ones have the same
        # names, just different folders.
        mel_filename = os.path.join(out_dir, 'mel-{}.npy'.format(index))
        np.save(mel_filename, mels, allow_pickle=False)

        if log_dir is not None:
            # Save wav (mel -> wav).
            wav = audio.inv_mel_spectrogram(mels.T, hparams)
            audio.save_wav(wav,
                           os.path.join(log_dir,
                                        'wavs/wav-{}-mel.wav'.format(index)),
                           sr=hparams.sample_rate)

            if has_linear:
                # Save wav (linear -> wav).
                wav = audio.inv_linear_spectrogram(linear.T, hparams)
                audio.save_wav(wav,
                               os.path.join(
                                   log_dir,
                                   'wavs/wav-{}-linear.wav'.format(index)),
                               sr=hparams.sample_rate)

            # Save alignments.
            plot.plot_alignment(alignment,
                                os.path.join(
                                    log_dir,
                                    'plots/alignment-{}.png'.format(index)),
                                info='{}'.format(text),
                                split_title=True)

            # Save mel spectrogram plot.
            plot.plot_spectrogram(mels,
                                  os.path.join(
                                      log_dir,
                                      'plots/mel-{}.png'.format(index)),
                                  info='{}'.format(text),
                                  split_title=True)

        return mel_filename, speaker_id
コード例 #7
0
    def synthesize(self,
                   texts,
                   basenames,
                   out_dir,
                   log_dir,
                   mel_filenames,
                   mel_reference_filenames=None):
        """Synthesize a batch of texts, split across the configured GPUs.

        Note that `texts`, `basenames`, `mel_filenames` and
        `mel_reference_filenames` are extended IN PLACE (last item repeated)
        until the batch size is a multiple of
        ``hparams.tacotron_synthesis_batch_size``.

        Args:
            texts: Input sentences (or phoneme transcriptions when
                ``hparams.tacotron_phoneme_transcription`` is set).
            basenames: Per-item names used in output file names. When None,
                nothing is saved to `out_dir`; the first item's audio is
                written to ``temp.wav`` and played through PyAudio instead.
            out_dir: Directory receiving one ``mel-<basename>.npy`` per item.
            log_dir: If not None, wavs and plots are written below it.
            mel_filenames: Paths of target mels; only read when `self.gta`
                is set.
            mel_reference_filenames: Optional paths of reference mels fed to
                ``self.mel_references``.

        Returns:
            Tuple ``(saved_mels_paths, speaker_ids)``, or None in playback
            mode.

        Raises:
            RuntimeError: If ``hparams.gin_channels > 0``, because no rule
                for deriving the speaker id is implemented.
        """
        hparams = self._hparams
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]

        # Repeat the last sample until the number of samples is divisible by
        # the synthesis batch size (last run scenario).
        while len(texts) % hparams.tacotron_synthesis_batch_size != 0:
            texts.append(texts[-1])
            # basenames is None in playback mode, so guard it like the
            # optional file lists below.
            if basenames is not None:
                basenames.append(basenames[-1])
            if mel_filenames is not None:
                mel_filenames.append(mel_filenames[-1])
            if mel_reference_filenames is not None:
                mel_reference_filenames.append(mel_reference_filenames[-1])

        assert 0 == len(texts) % hparams.tacotron_num_gpus
        if not hparams.tacotron_phoneme_transcription:
            seqs = [
                np.asarray(text_to_sequence(text, cleaner_names))
                for text in texts
            ]
        else:
            seqs = [
                np.asarray(ipa_to_articulatory_sequence(text), dtype=np.int32)
                for text in texts
            ]
        input_lengths = [len(seq) for seq in seqs]

        size_per_device = len(seqs) // hparams.tacotron_num_gpus

        # Pad inputs according to each GPU's max length.
        input_seqs = None
        split_infos = []
        for i in range(hparams.tacotron_num_gpus):
            device_input = seqs[size_per_device * i:size_per_device * (i + 1)]
            device_input, max_seq_len = self._prepare_inputs(device_input)
            input_seqs = np.concatenate(
                (input_seqs, device_input),
                axis=1) if input_seqs is not None else device_input
            split_infos.append([max_seq_len, 0, 0, 0])

        feed_dict = {
            self.inputs: input_seqs,
            self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
        }

        if mel_reference_filenames is not None:
            np_references = [
                np.load(mel_reference_filename)
                for mel_reference_filename in mel_reference_filenames
            ]

            # Pad references according to each GPU's max length.
            reference_seqs = None
            for i in range(hparams.tacotron_num_gpus):
                device_reference = np_references[size_per_device *
                                                 i:size_per_device * (i + 1)]
                device_reference, _ = self._prepare_targets(
                    device_reference, hparams.outputs_per_step)
                reference_seqs = np.concatenate(
                    (reference_seqs, device_reference),
                    axis=1) if reference_seqs is not None else device_reference

            feed_dict[self.mel_references] = reference_seqs

        if self.gta:
            np_targets = [
                np.load(mel_filename) for mel_filename in mel_filenames
            ]
            target_lengths = [len(np_target) for np_target in np_targets]

            # Pad targets according to each GPU's max length.
            target_seqs = None
            for i in range(hparams.tacotron_num_gpus):
                device_target = np_targets[size_per_device *
                                           i:size_per_device * (i + 1)]
                device_target, max_target_len = self._prepare_targets(
                    device_target, hparams.outputs_per_step)
                target_seqs = np.concatenate(
                    (target_seqs, device_target),
                    axis=1) if target_seqs is not None else device_target
                # Not really used, but set in case of future development.
                split_infos[i][1] = max_target_len

            feed_dict[self.targets] = target_seqs
            assert len(np_targets) == len(texts)

        feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32)

        # Linear outputs are fetched only outside GTA mode; remember that so
        # later code never touches an undefined `linears`.
        has_linear = hparams.predict_linear and not self.gta

        if not has_linear:
            mels, alignments, stop_tokens = self.session.run(
                [
                    self.mel_outputs, self.alignments,
                    self.stop_token_prediction
                ],
                feed_dict=feed_dict)
            # Linearize outputs (flatten the per-GPU nesting).
            mels = [mel for gpu_mels in mels for mel in gpu_mels]
            alignments = [
                align for gpu_aligns in alignments for align in gpu_aligns
            ]
            stop_tokens = [
                token for gpu_token in stop_tokens for token in gpu_token
            ]

            if not self.gta:
                # Natural batch synthesis: get mel lengths for the whole
                # batch from the stop token predictions.
                target_lengths = self._get_output_lengths(stop_tokens)

            # Take off the batch-wise padding.
            mels = [
                mel[:target_length, :]
                for mel, target_length in zip(mels, target_lengths)
            ]
            assert len(mels) == len(texts)

        else:
            linears, mels, alignments, stop_tokens = self.session.run(
                [
                    self.linear_outputs, self.mel_outputs, self.alignments,
                    self.stop_token_prediction
                ],
                feed_dict=feed_dict)
            # Linearize outputs (flatten the per-GPU nesting).
            linears = [
                linear for gpu_linear in linears for linear in gpu_linear
            ]
            mels = [mel for gpu_mels in mels for mel in gpu_mels]
            alignments = [
                align for gpu_aligns in alignments for align in gpu_aligns
            ]
            stop_tokens = [
                token for gpu_token in stop_tokens for token in gpu_token
            ]

            # Natural batch synthesis. Stop-token based trimming
            # (self._get_output_lengths(stop_tokens)) is disabled here; 9999
            # is a cap that in practice keeps every frame. One entry per item
            # is required: a single-element list made the zips below truncate
            # the batch to one item and trip the assert.
            target_lengths = [9999] * len(mels)

            # Take off the batch-wise padding.
            mels = [
                mel[:target_length, :]
                for mel, target_length in zip(mels, target_lengths)
            ]
            linears = [
                linear[:target_length, :]
                for linear, target_length in zip(linears, target_lengths)
            ]
            assert len(mels) == len(linears) == len(texts)

        if basenames is None:
            # Generate the wav, then read it back for playback. `mels` is a
            # list of per-item arrays at this point, so play the first item.
            wav = audio.inv_mel_spectrogram(mels[0].T, hparams)
            audio.save_wav(wav, 'temp.wav',
                           sr=hparams.sample_rate)  #Find a better way

            chunk = 512
            # The context manager and the finally clauses release the file,
            # the stream and PyAudio even if playback fails midway.
            with wave.open('temp.wav', 'rb') as f:
                p = pyaudio.PyAudio()
                try:
                    stream = p.open(
                        format=p.get_format_from_width(f.getsampwidth()),
                        channels=f.getnchannels(),
                        rate=f.getframerate(),
                        output=True)
                    try:
                        data = f.readframes(chunk)
                        while data:
                            stream.write(data)
                            data = f.readframes(chunk)
                    finally:
                        stream.stop_stream()
                        stream.close()
                finally:
                    p.terminate()
            return

        saved_mels_paths = []
        speaker_ids = []
        for i, mel in enumerate(mels):
            # Speaker id for global conditioning (only used with GTA
            # generally).
            if hparams.gin_channels > 0:
                # A rule for the speaker id must be written here first, e.g.
                # derived from `basenames`, and appended to speaker_ids.
                raise RuntimeError(
                    'Please set the speaker_id rule in line 99 of tacotron/synthesizer.py to allow for global condition usage later.'
                )
            speaker_id = '<no_g>'
            speaker_ids.append(speaker_id)

            # Write the spectrogram to disk.
            # Note: output mel-spectrogram files and target ones have the same
            # names, just different folders.
            mel_filename = os.path.join(out_dir,
                                        'mel-{}.npy'.format(basenames[i]))
            np.save(mel_filename, mel, allow_pickle=False)
            saved_mels_paths.append(mel_filename)

            if log_dir is not None:
                # Save wav (mel -> wav).
                wav = audio.inv_mel_spectrogram(mel.T, hparams)
                audio.save_wav(wav,
                               os.path.join(
                                   log_dir,
                                   'wavs/wav-{}-mel.wav'.format(basenames[i])),
                               sr=hparams.sample_rate)

                # Save alignments.
                plot.plot_alignment(alignments[i],
                                    os.path.join(
                                        log_dir,
                                        'plots/alignment-{}.png'.format(
                                            basenames[i])),
                                    title='{}'.format(texts[i]),
                                    split_title=True,
                                    max_len=target_lengths[i])

                # Save mel spectrogram plot.
                plot.plot_spectrogram(
                    mel,
                    os.path.join(log_dir,
                                 'plots/mel-{}.png'.format(basenames[i])),
                    title='{}'.format(texts[i]),
                    split_title=True)

                if has_linear:
                    # Save wav (linear -> wav).
                    wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
                    audio.save_wav(wav,
                                   os.path.join(
                                       log_dir,
                                       'wavs/wav-{}-linear.wav'.format(
                                           basenames[i])),
                                   sr=hparams.sample_rate)

                    # Save linear spectrogram plot.
                    plot.plot_spectrogram(linears[i],
                                          os.path.join(
                                              log_dir,
                                              'plots/linear-{}.png'.format(
                                                  basenames[i])),
                                          title='{}'.format(texts[i]),
                                          split_title=True,
                                          auto_aspect=True)

        return saved_mels_paths, speaker_ids
コード例 #8
0
    def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames):
        """Synthesize a batch of texts and save the predicted mel spectrograms.

        Args:
            texts: Input sentences, one per batch item.
            basenames: Per-item names used in output file names. When None,
                nothing is saved to `out_dir`; the first item's audio is
                written to ``temp.wav`` and played through PyAudio instead.
            out_dir: Directory receiving one ``mel-<basename>.npy`` per item.
            log_dir: If not None, wavs and plots are written below it.
            mel_filenames: Paths of target mels; only read when `self.gta`
                is set.

        Returns:
            Tuple ``(saved_mels_paths, speaker_ids)``, or None in playback
            mode.

        Raises:
            RuntimeError: If ``hparams.gin_channels > 0``, because no rule
                for deriving the speaker id is implemented.
        """
        hparams = self._hparams
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        seqs = [np.asarray(text_to_sequence(text, cleaner_names)) for text in texts]
        input_lengths = [len(seq) for seq in seqs]
        seqs = self._prepare_inputs(seqs)
        feed_dict = {
            self.model.inputs: seqs,
            self.model.input_lengths: np.asarray(input_lengths, dtype=np.int32),
        }

        if self.gta:
            np_targets = [np.load(mel_filename) for mel_filename in mel_filenames]
            target_lengths = [len(np_target) for np_target in np_targets]
            padded_targets = self._prepare_targets(np_targets, hparams.outputs_per_step)
            # Use the configured mel dimension instead of a hard-coded 80.
            feed_dict[self.model.mel_targets] = padded_targets.reshape(len(np_targets), -1, hparams.num_mels)

        # Linear outputs are fetched only outside GTA mode; remember that so
        # later code never touches an undefined `linears`.
        has_linear = hparams.predict_linear and not self.gta

        if has_linear:
            linears, mels, alignments = self.session.run([self.linear_outputs, self.mel_outputs, self.alignments], feed_dict=feed_dict)
        else:
            mels, alignments = self.session.run([self.mel_outputs, self.alignments], feed_dict=feed_dict)
            if self.gta:
                # Take off the reduction factor padding frames for time consistency with wavenet.
                mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)]
                assert len(mels) == len(np_targets)

        if basenames is None:
            # Generate the wav, then read it back for playback. `mels` holds
            # the whole batch (a list in GTA mode), so play the first item.
            wav = audio.inv_mel_spectrogram(mels[0].T, hparams)
            audio.save_wav(wav, 'temp.wav', hparams.sample_rate)  # Find a better way

            chunk = 512
            # The context manager and the finally clauses release the file,
            # the stream and PyAudio even if playback fails midway.
            with wave.open('temp.wav', 'rb') as f:
                p = pyaudio.PyAudio()
                try:
                    stream = p.open(format=p.get_format_from_width(f.getsampwidth()),
                                    channels=f.getnchannels(),
                                    rate=f.getframerate(),
                                    output=True)
                    try:
                        data = f.readframes(chunk)
                        while data:
                            stream.write(data)
                            data = f.readframes(chunk)
                    finally:
                        stream.stop_stream()
                        stream.close()
                finally:
                    p.terminate()
            return

        saved_mels_paths = []
        speaker_ids = []
        for i, mel in enumerate(mels):
            # Speaker id for global conditioning (only used with GTA generally).
            if hparams.gin_channels > 0:
                # A rule for the speaker id must be written here first, e.g.
                # derived from `basenames`, and appended to speaker_ids.
                raise RuntimeError('Please set the speaker_id rule in line 99 of tacotron/synthesizer.py to allow for global condition usage later.')
            speaker_id = '<no_g>'
            speaker_ids.append(speaker_id)

            # Write the spectrogram to disk.
            # Note: output mel-spectrogram files and target ones have the same names, just different folders.
            mel_filename = os.path.join(out_dir, 'mel-{}.npy'.format(basenames[i]))
            np.save(mel_filename, mel, allow_pickle=False)
            saved_mels_paths.append(mel_filename)

            if log_dir is not None:
                # Save wav (mel -> wav).
                wav = audio.inv_mel_spectrogram(mel.T, hparams)
                audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-mel.wav'.format(basenames[i])), hparams.sample_rate)

                # Save alignments.
                plot.plot_alignment(alignments[i], os.path.join(log_dir, 'plots/alignment-{}.png'.format(basenames[i])),
                                    title='{}'.format(texts[i]), split_title=True)

                # Save mel spectrogram plot.
                plot.plot_spectrogram(mel, os.path.join(log_dir, 'plots/mel-{}.png'.format(basenames[i])),
                                      title='{}'.format(texts[i]), split_title=True)

                if has_linear:
                    # Save wav (linear -> wav).
                    wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
                    audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-linear.wav'.format(basenames[i])), hparams.sample_rate)

                    # Save linear spectrogram plot.
                    plot.plot_spectrogram(linears[i], os.path.join(log_dir, 'plots/linear-{}.png'.format(basenames[i])),
                                          title='{}'.format(texts[i]), split_title=True, auto_aspect=True)

        return saved_mels_paths, speaker_ids
コード例 #9
0
ファイル: synthesizer.py プロジェクト: duvtedudug/Tacotron-2
	def synthesize(self, text, index, out_dir, log_dir, mel_filename):
		"""Synthesize a single utterance, then either play it or write it to disk.

		Args:
			text: Sentence to synthesize; converted to a symbol sequence using
				the cleaners listed in hparams.cleaners.
			index: Integer used in the output file names. When None, the audio
				is written to 'temp.wav', played through PyAudio and nothing
				else is saved.
			out_dir: Directory receiving the predicted mel spectrogram (.npy).
			log_dir: Directory receiving wavs ('wavs/...') and plots
				('plots/...'); None disables them. The sub-folders are not
				created here.
			mel_filename: Path of the ground-truth mel (.npy); only read when
				self.gta is set.

		Returns:
			Path of the saved mel spectrogram, or None in playback mode.
		"""
		hparams = self._hparams
		cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
		seq = text_to_sequence(text, cleaner_names)
		feed_dict = {
			self.model.inputs: [np.asarray(seq, dtype=np.int32)],
			self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
		}

		if self.gta:
			# Use the configured mel dimension (instead of a hard-coded 80) so the
			# target agrees with the reshape applied to the prediction below.
			feed_dict[self.model.mel_targets] = np.load(mel_filename).reshape(1, -1, hparams.num_mels)

		# Linear outputs are only fetched for natural synthesis with predict_linear;
		# stays None otherwise so the logging code below never touches an unbound name.
		linear = None
		if self.gta or not hparams.predict_linear:
			mels, alignment = self.session.run([self.mel_outputs, self.alignment], feed_dict=feed_dict)

		else:
			linear, mels, alignment = self.session.run([self.linear_outputs, self.mel_outputs, self.alignment], feed_dict=feed_dict)
			linear = linear.reshape(-1, hparams.num_freq)

		mels = mels.reshape(-1, hparams.num_mels) #Thanks to @imdatsolak for pointing this out

		if index is None:
			#Generate wav and read it
			wav = audio.inv_mel_spectrogram(mels.T, hparams)
			audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate) #Find a better way

			chunk = 512
			# The with/finally blocks release the wav file and the audio device
			# even when playback fails half-way.
			with wave.open('temp.wav', 'rb') as f:
				p = pyaudio.PyAudio()
				try:
					stream = p.open(format=p.get_format_from_width(f.getsampwidth()),
						channels=f.getnchannels(),
						rate=f.getframerate(),
						output=True)
					try:
						data = f.readframes(chunk)
						while data:
							stream.write(data)
							data = f.readframes(chunk)
						stream.stop_stream()
					finally:
						stream.close()
				finally:
					p.terminate()
			return

		# Write the spectrogram to disk
		# Note: outputs mel-spectrogram files and target ones have same names, just different folders
		mel_filename = os.path.join(out_dir, 'speech-mel-{:05d}.npy'.format(index))
		np.save(mel_filename, mels, allow_pickle=False)

		if log_dir is not None:
			#save wav (mel -> wav)
			wav = audio.inv_mel_spectrogram(mels.T, hparams)
			audio.save_wav(wav, os.path.join(log_dir, 'wavs/speech-wav-{:05d}-mel.wav'.format(index)), sr=hparams.sample_rate)

			if linear is not None:
				#save wav (linear -> wav); skipped in GTA mode where no linear output is fetched
				wav = audio.inv_linear_spectrogram(linear.T, hparams)
				audio.save_wav(wav, os.path.join(log_dir, 'wavs/speech-wav-{:05d}-linear.wav'.format(index)), sr=hparams.sample_rate)

			#save alignments
			plot.plot_alignment(alignment, os.path.join(log_dir, 'plots/speech-alignment-{:05d}.png'.format(index)),
				info='{}'.format(text), split_title=True)

			#save mel spectrogram plot
			plot.plot_spectrogram(mels, os.path.join(log_dir, 'plots/speech-mel-{:05d}.png'.format(index)),
				info='{}'.format(text), split_title=True)

		return mel_filename
コード例 #10
0
    def synthesize(self,
                   texts,
                   basenames,
                   out_dir,
                   log_dir,
                   mel_filenames,
                   basenames_refs=None,
                   mel_ref_filenames_emt=None,
                   mel_ref_filenames_spk=None,
                   emb_only=False,
                   emt_labels_synth=None,
                   spk_labels_synth=None):
        """Synthesize a batch of texts conditioned on emotion/speaker references.

        Args:
            texts: Input sentences, one per utterance.
            basenames: Names used to build the output file names. When None
                (after filenames_to_inputs), the first utterance is written to
                'temp.wav' and played instead of being saved.
            out_dir: Directory whose 'mels' sub-folder receives the predicted
                mel spectrograms (.npy).
            log_dir: Directory receiving wavs and plots. When None, nothing is
                written for the batch (mels included).
            mel_filenames: Forwarded to filenames_to_inputs.
            basenames_refs: Reference names, combined with basenames in every
                output file name.
            mel_ref_filenames_emt: Forwarded to filenames_to_inputs.
            mel_ref_filenames_spk: Forwarded to filenames_to_inputs.
            emb_only: When True, only the reference-encoder tensors are
                evaluated and returned.
            emt_labels_synth: Forwarded to filenames_to_inputs.
            spk_labels_synth: Forwarded to filenames_to_inputs.

        Returns:
            With emb_only, the list of five values returned by session.run.
            In playback mode (basenames is None), None. Otherwise a tuple
            (saved_mels_paths, speaker_ids) with one entry per utterance
            (saved_mels_paths stays empty when log_dir is None).

        Raises:
            RuntimeError: On an unsupported OS in playback mode, or when
                hparams.gin_channels > 0 (speaker id rule not implemented).
        """
        hparams = self._hparams
        # [-max, max] or [0,max]
        T2_output_range = (
            -hparams.max_abs_value,
            hparams.max_abs_value) if hparams.symmetric_mels else (
                0, hparams.max_abs_value)

        (basenames, basenames_refs, input_seqs, input_lengths, split_infos,
         mel_ref_seqs_emt, mel_ref_seqs_spk, emt_labels_synth,
         spk_labels_synth) = filenames_to_inputs(
             hparams, texts, basenames, mel_filenames, basenames_refs,
             mel_ref_filenames_emt, mel_ref_filenames_spk, emt_labels_synth,
             spk_labels_synth)
        feed_dict = {
            self.inputs: input_seqs,
            self.input_lengths: input_lengths,
            self.mel_refs_emt: mel_ref_seqs_emt,
            self.mel_refs_spk: mel_ref_seqs_spk,
            self.spk_labels: spk_labels_synth,
            self.emt_labels: emt_labels_synth,
            self.split_infos: split_infos
        }

        # NOTE(review): no GTA targets are fed here and target_lengths is only
        # derived from the stop tokens when self.gta is False, so GTA mode
        # fails further down - confirm GTA is intentionally unsupported.

        if emb_only:
            fetches = [
                self.model.tower_refnet_out_emt[0],
                self.model.tower_refnet_out_spk[0],
                self.model.tower_refnet_outputs_mel_out_emt[0],
                self.model.tower_refnet_outputs_mel_out_spk[0],
            ]
            if self.args.emt_attn:
                fetches.append(self.model.tower_context_emt[0])
            else:
                # Placeholder so that five values are returned in both cases
                fetches.append(tf.constant(1.))
            return self.session.run(fetches, feed_dict=feed_dict)

        # Only set by some branches below; None lets the logging loop skip
        # the corresponding outputs instead of hitting an unbound name.
        linears = None
        alignments_emt = None

        if self.gta or not hparams.predict_linear:
            if self.args.attn == 'style_tokens':
                mels, alignments, stop_tokens = self.session.run(
                    [
                        self.mel_outputs, self.alignments,
                        self.stop_token_prediction
                    ],
                    feed_dict=feed_dict)
            else:
                # The two reference tensors are fetched but not used further
                (mels, alignments, stop_tokens, _refnet_emt, _ref_emt,
                 alignments_emt) = self.session.run(
                     [
                         self.mel_outputs, self.alignments,
                         self.stop_token_prediction,
                         self.model.tower_refnet_out_emt[0],
                         self.model.tower_ref_mel_emt[0],
                         self.model.tower_alignments_emt
                     ],
                     feed_dict=feed_dict)

            #Linearize outputs (n_gpus -> 1D)
            mels = [mel for gpu_mels in mels for mel in gpu_mels]
            alignments = [
                align for gpu_aligns in alignments for align in gpu_aligns
            ]
            stop_tokens = [
                token for gpu_token in stop_tokens for token in gpu_token
            ]
            if self.args.emt_attn and not (self.args.attn == 'style_tokens'):
                alignments_emt = [
                    align_emt for gpu_aligns_emt in alignments_emt
                    for align_emt in gpu_aligns_emt
                ]

            if not self.gta:
                #Natural batch synthesis
                #Get Mel lengths for the entire batch from stop_tokens predictions
                target_lengths = get_output_lengths(stop_tokens)

            #Take off the batch wise padding
            mels = [
                mel[:target_length, :]
                for mel, target_length in zip(mels, target_lengths)
            ]
            assert len(mels) == len(texts)

        else:
            linears, mels, alignments, stop_tokens = self.session.run(
                [
                    self.linear_outputs, self.mel_outputs, self.alignments,
                    self.stop_token_prediction
                ],
                feed_dict=feed_dict)

            #Linearize outputs (1D arrays)
            linears = [
                linear for gpu_linear in linears for linear in gpu_linear
            ]
            mels = [mel for gpu_mels in mels for mel in gpu_mels]
            alignments = [
                align for gpu_aligns in alignments for align in gpu_aligns
            ]
            stop_tokens = [
                token for gpu_token in stop_tokens for token in gpu_token
            ]

            #Natural batch synthesis
            #Get Mel/Linear lengths for the entire batch from stop_tokens predictions
            target_lengths = get_output_lengths(stop_tokens)

            #Take off the batch wise padding
            mels = [
                mel[:target_length, :]
                for mel, target_length in zip(mels, target_lengths)
            ]
            # Clip each array on its own: after trimming, the utterances have
            # different lengths and cannot be stacked into one ndarray.
            linears = [
                np.clip(linear[:target_length, :], T2_output_range[0],
                        T2_output_range[1])
                for linear, target_length in zip(linears, target_lengths)
            ]
            assert len(mels) == len(linears) == len(texts)

        mels = [
            np.clip(m, T2_output_range[0], T2_output_range[1]) for m in mels
        ]

        if basenames is None:
            #Generate wav and read it
            if hparams.GL_on_GPU:
                wav = self.session.run(
                    self.GLGPU_mel_outputs,
                    feed_dict={self.GLGPU_mel_inputs: mels[0]})
                wav = audio.inv_preemphasis(wav, hparams.preemphasis,
                                            hparams.preemphasize)
            else:
                wav = audio.inv_mel_spectrogram(mels[0].T, hparams)
            audio.save_wav(wav, 'temp.wav',
                           sr=hparams.sample_rate)  #Find a better way

            if platform.system() == 'Linux':
                #Linux wav reader
                os.system('aplay temp.wav')

            elif platform.system() == 'Windows':
                #windows wav reader
                os.system('start /min mplay32 /play /close temp.wav')

            else:
                raise RuntimeError(
                    'Your OS type is not supported yet, please add it to "tacotron/synthesizer.py, line-165" and feel free to make a Pull Request ;) Thanks!'
                )

            return

        if log_dir is not None:
            for sub_dir in ('wavs', 'plots', 'mels'):
                os.makedirs(os.path.join(log_dir, sub_dir), exist_ok=True)
            # Mels are written below out_dir, so that folder has to exist too
            os.makedirs(os.path.join(out_dir, 'mels'), exist_ok=True)

        saved_mels_paths = []
        speaker_ids = []
        for i, mel in enumerate(mels):
            #Get speaker id for global conditioning (only used with GTA generally)
            if hparams.gin_channels > 0:
                #No rule to derive a speaker id (e.g. from the basename) is implemented
                raise RuntimeError(
                    'Please set the speaker_id rule in line 99 of tacotron/synthesizer.py to allow for global condition usage later.'
                )
            speaker_ids.append('<no_g>')

            if log_dir is not None:
                name = '{}_{}'.format(basenames[i], basenames_refs[i])

                mel_filename = os.path.join(out_dir, 'mels',
                                            'mel-{}.npy'.format(name))
                np.save(mel_filename, mel, allow_pickle=False)
                # Record the path so the returned list reflects what was saved
                saved_mels_paths.append(mel_filename)

                #save wav (mel -> wav)
                if hparams.GL_on_GPU:
                    wav = self.session.run(
                        self.GLGPU_mel_outputs,
                        feed_dict={self.GLGPU_mel_inputs: mel})
                    wav = audio.inv_preemphasis(wav, hparams.preemphasis,
                                                hparams.preemphasize)
                else:
                    wav = audio.inv_mel_spectrogram(mel.T, hparams)

                #add silence to make ending of file more noticeable
                wav = np.append(
                    np.append(np.zeros(int(.5 * hparams.sample_rate)), wav),
                    np.zeros(int(.5 * hparams.sample_rate)))
                audio.save_wav(wav,
                               os.path.join(log_dir,
                                            'wavs/wav-{}.wav'.format(name)),
                               sr=hparams.sample_rate)

                #save alignments
                plot.plot_alignment(alignments[i],
                                    os.path.join(
                                        log_dir,
                                        'plots/alignment-{}.png'.format(name)),
                                    title='{}'.format(texts[i]),
                                    split_title=True,
                                    max_len=target_lengths[i])

                if (self.args.emt_attn and self.args.attn == 'simple'
                        and alignments_emt is not None):
                    plot.plot_alignment(
                        alignments_emt[i],
                        os.path.join(
                            log_dir,
                            'plots/alignment_emt-{}.png'.format(name)),
                        title='{}'.format(texts[i]),
                        split_title=True,
                        max_len=target_lengths[i])

                #save mel spectrogram plot
                plot.plot_spectrogram(mel,
                                      os.path.join(
                                          log_dir,
                                          'plots/mel-{}.png'.format(name)),
                                      title='{}'.format(texts[i]),
                                      split_title=True)
                print("Finished saving {}".format(name))

                if hparams.predict_linear and linears is not None:
                    #save wav (linear -> wav)
                    if hparams.GL_on_GPU:
                        wav = self.session.run(
                            self.GLGPU_lin_outputs,
                            feed_dict={self.GLGPU_lin_inputs: linears[i]})
                        wav = audio.inv_preemphasis(wav, hparams.preemphasis,
                                                    hparams.preemphasize)
                    else:
                        wav = audio.inv_linear_spectrogram(
                            linears[i].T, hparams)
                    audio.save_wav(wav,
                                   os.path.join(
                                       log_dir,
                                       'wavs/wav-{}-linear_{}.wav'.format(
                                           basenames[i], basenames_refs[i])),
                                   sr=hparams.sample_rate)

                    #save linear spectrogram plot
                    plot.plot_spectrogram(linears[i],
                                          os.path.join(
                                              log_dir,
                                              'plots/linear-{}.png'.format(
                                                  name)),
                                          title='{}'.format(texts[i]),
                                          split_title=True,
                                          auto_aspect=True)

        return saved_mels_paths, speaker_ids
コード例 #11
0
    def synthesize(self, text, out_dir, idx, step):
        """Synthesize one space-separated symbol string and dump the results.

        Writes a wav, the predicted mel (.npy, rescaled and clipped to [0, 1]),
        a mel plot and an alignment plot into out_dir; file names carry step
        and idx.

        Returns:
            Tuple (path of the mel plot, path of the alignment plot).
        """
        hparams = self._hparams

        # Output interval: [-max, max] for symmetric mels, [0, max] otherwise
        if hparams.symmetric_mels:
            range_low, range_high = -hparams.max_abs_value, hparams.max_abs_value
        else:
            range_low, range_high = 0, hparams.max_abs_value

        #pyin, text = get_pyin(text)
        print(text.split(' '))

        inputs = [np.asarray(text_to_sequence(text.split(' ')))]
        print(inputs)
        input_lengths = [len(inputs[0])]

        fetches = [
            self.mel_outputs, self.alignments, self.stop_token_prediction
        ]
        mels, alignments, stop_tokens = self.session.run(
            fetches,
            feed_dict={
                self.inputs: np.asarray(inputs, dtype=np.int32),
                self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
            })

        mel, alignment = mels[0], alignments[0]
        print('pred_mel.shape', mel.shape)

        # Output length = position of the first rounded stop token equal to 1,
        # or the full length when none is found
        rounded_stops = np.round(stop_tokens[0]).tolist()
        try:
            target_length = rounded_stops.index(1)
        except ValueError:
            target_length = len(rounded_stops)

        mel = np.clip(mel[:target_length, :], range_low, range_high)

        # Waveform reconstructed from the predicted mel
        wav_path = os.path.join(
            out_dir, 'step-{}-{}-wav-from-mel.wav'.format(step, idx))
        audio.save_wav(audio.inv_mel_spectrogram(mel.T, hparams),
                       wav_path,
                       sr=hparams.sample_rate)

        # Predicted mel, shifted by the upper bound and scaled before saving
        mel_npy_path = os.path.join(
            out_dir, 'step-{}-{}-mel-pred.npy'.format(step, idx))
        scaled_mel = (mel + range_high) / (2 * range_high)
        np.save(mel_npy_path, np.clip(scaled_mel, 0, 1), allow_pickle=False)

        # Mel plot; this is the first path handed back to the caller
        pred_mel_path = os.path.join(
            out_dir, 'step-{}-{}-mel-pred.png'.format(step, idx))
        plot.plot_spectrogram(mel,
                              pred_mel_path,
                              title=datetime.now().strftime('%Y-%m-%d %H:%M'))

        # Alignment plot
        alignment_path = os.path.join(out_dir,
                                      'step-{}-{}-align.png'.format(step, idx))
        plot.plot_alignment(alignment,
                            alignment_path,
                            title=datetime.now().strftime('%Y-%m-%d %H:%M'),
                            split_title=True,
                            max_len=target_length)

        return pred_mel_path, alignment_path
コード例 #12
0
ファイル: synthesizer.py プロジェクト: wyb330/Tacotron-2
    def synthesize(self, text, index, out_dir, log_dir, mel_filename, speaker_id):
        """Synthesize a single utterance for one speaker, then play or save it.

        Args:
            text: Sentence to synthesize. Korean text (per is_korean_text) is
                number-normalized and split to jamo before conversion to a
                symbol sequence.
            index: Integer used in the output file names. When None, the audio
                is written to 'temp.wav', played through PyAudio and nothing
                else is saved.
            out_dir: Directory receiving the predicted mel spectrogram (.npy).
            log_dir: Directory receiving wavs ('wavs/...') and plots
                ('plots/...'); None disables them. The sub-folders are not
                created here.
            mel_filename: Path of the ground-truth mel (.npy); only read when
                self.gta is set.
            speaker_id: Speaker identifier fed to the model as an int32.

        Returns:
            Path of the saved mel spectrogram, or None in playback mode.
        """
        hparams = self._hparams
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        if is_korean_text(text):
            text = normalize_number(text)
            text = split_to_jamo(text, cleaner_names)
        seq = text_to_sequence(text, cleaner_names)
        feed_dict = {
            self.model.inputs: [np.asarray(seq, dtype=np.int32)],
            self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
            self.model.speaker_ids: np.asarray([speaker_id], dtype=np.int32)
        }

        if self.gta:
            # Use the configured mel dimension (instead of a hard-coded 80) so the
            # target agrees with the reshape applied to the prediction below.
            feed_dict[self.model.mel_targets] = np.load(mel_filename).reshape(1, -1, hparams.num_mels)

        # Linear outputs are only fetched for natural synthesis with predict_linear;
        # stays None otherwise so the logging code below never touches an unbound name.
        linear = None
        if self.gta or not hparams.predict_linear:
            mels, alignment = self.session.run([self.mel_outputs, self.alignment], feed_dict=feed_dict)

        else:
            linear, mels, alignment = self.session.run([self.linear_outputs, self.mel_outputs, self.alignment],
                                                       feed_dict=feed_dict)
            linear = linear.reshape(-1, hparams.num_freq)

        mels = mels.reshape(-1, hparams.num_mels)  # Thanks to @imdatsolak for pointing this out

        if index is None:
            # Generate wav and read it
            wav = audio.inv_mel_spectrogram(mels.T, hparams)
            audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate)  # Find a better way

            chunk = 512
            # The with/finally blocks release the wav file and the audio device
            # even when playback fails half-way.
            with wave.open('temp.wav', 'rb') as f:
                p = pyaudio.PyAudio()
                try:
                    stream = p.open(format=p.get_format_from_width(f.getsampwidth()),
                                    channels=f.getnchannels(),
                                    rate=f.getframerate(),
                                    output=True)
                    try:
                        data = f.readframes(chunk)
                        while data:
                            stream.write(data)
                            data = f.readframes(chunk)
                        stream.stop_stream()
                    finally:
                        stream.close()
                finally:
                    p.terminate()
            return

        # Write the spectrogram to disk
        # Note: outputs mel-spectrogram files and target ones have same names, just different folders
        mel_filename = os.path.join(out_dir, 'speech-mel-{:05d}.npy'.format(index))
        np.save(mel_filename, mels, allow_pickle=False)

        if log_dir is not None:
            # save wav (mel -> wav)
            wav = audio.inv_mel_spectrogram(mels.T, hparams)
            audio.save_wav(wav, os.path.join(log_dir, 'wavs/speech-wav-{:05d}-mel.wav'.format(index)),
                           sr=hparams.sample_rate)

            if linear is not None:
                # save wav (linear -> wav); skipped in GTA mode where no linear output is fetched
                wav = audio.inv_linear_spectrogram(linear.T, hparams)
                audio.save_wav(wav, os.path.join(log_dir, 'wavs/speech-wav-{:05d}-linear.wav'.format(index)),
                               sr=hparams.sample_rate)

            # Convert the text back with j2h for the plot titles
            if is_korean_char(text):
                text = j2h(text)
            # save alignments
            plot.plot_alignment(alignment, os.path.join(log_dir, 'plots/speech-alignment-{:05d}.png'.format(index)),
                                info='{}'.format(text), split_title=True)

            # save mel spectrogram plot
            plot.plot_spectrogram(mels, os.path.join(log_dir, 'plots/speech-mel-{:05d}.png'.format(index)),
                                  info='{}'.format(text), split_title=True)

        return mel_filename
コード例 #13
0
    def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames):
        """Synthesize a batch of texts across the configured GPUs.

        Args:
            texts: List of input sentences. Padded IN PLACE (last entry
                repeated) until its length is a multiple of
                hparams.tacotron_synthesis_batch_size.
            basenames: List of names used in output file names, padded in
                place like texts. When None, the first utterance is written
                to 'temp.wav' and played instead of being saved.
            out_dir: Directory receiving the predicted mel spectrograms (.npy).
            log_dir: Directory receiving wavs ('wavs/...') and plots
                ('plots/...'); None disables them.
            mel_filenames: Ground-truth mel paths, loaded when self.gta is
                set; padded in place like texts when not None.

        Returns:
            None in playback mode, otherwise a tuple
            (saved_mels_paths, speaker_ids) with one entry per (padded)
            utterance.

        Raises:
            RuntimeError: On an unsupported OS in playback mode, or when
                hparams.gin_channels > 0 (speaker id rule not implemented).
        """
        hparams = self._hparams
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        #[-max, max] or [0,max]
        T2_output_range = (
            -hparams.max_abs_value,
            hparams.max_abs_value) if hparams.symmetric_mels else (
                0, hparams.max_abs_value)

        #Repeat last sample until number of samples is dividable by the number of GPUs (last run scenario)
        while len(texts) % hparams.tacotron_synthesis_batch_size != 0:
            texts.append(texts[-1])
            # basenames is None in playback mode, so it must be guarded like mel_filenames
            if basenames is not None:
                basenames.append(basenames[-1])
            if mel_filenames is not None:
                mel_filenames.append(mel_filenames[-1])

        assert 0 == len(texts) % self._hparams.tacotron_num_gpus
        seqs = [
            np.asarray(text_to_sequence(text, cleaner_names)) for text in texts
        ]
        input_lengths = [len(seq) for seq in seqs]

        size_per_device = len(seqs) // self._hparams.tacotron_num_gpus

        #Pad inputs according to each GPU max length
        input_seqs = None
        split_infos = []
        for i in range(self._hparams.tacotron_num_gpus):
            device_input = seqs[size_per_device * i:size_per_device * (i + 1)]
            device_input, max_seq_len = self._prepare_inputs(device_input)
            input_seqs = np.concatenate(
                (input_seqs, device_input),
                axis=1) if input_seqs is not None else device_input
            split_infos.append([max_seq_len, 0, 0, 0])

        feed_dict = {
            self.inputs: input_seqs,
            self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
        }

        if self.gta:
            np_targets = [
                np.load(mel_filename) for mel_filename in mel_filenames
            ]
            target_lengths = [len(np_target) for np_target in np_targets]

            #pad targets according to each GPU max length
            target_seqs = None
            for i in range(self._hparams.tacotron_num_gpus):
                device_target = np_targets[size_per_device *
                                           i:size_per_device * (i + 1)]
                device_target, max_target_len = self._prepare_targets(
                    device_target, self._hparams.outputs_per_step)
                target_seqs = np.concatenate(
                    (target_seqs, device_target),
                    axis=1) if target_seqs is not None else device_target
                #Not really used but setting it in case for future development maybe?
                split_infos[i][1] = max_target_len

            feed_dict[self.targets] = target_seqs
            assert len(np_targets) == len(texts)

        feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32)

        # Only fetched for natural synthesis with predict_linear; None lets
        # the logging loop skip linear outputs instead of hitting an unbound name.
        linears = None

        if self.gta or not hparams.predict_linear:
            mels, alignments, stop_tokens = self.session.run(
                [
                    self.mel_outputs, self.alignments,
                    self.stop_token_prediction
                ],
                feed_dict=feed_dict)

            #Linearize outputs (n_gpus -> 1D)
            mels = [mel for gpu_mels in mels for mel in gpu_mels]
            alignments = [
                align for gpu_aligns in alignments for align in gpu_aligns
            ]
            stop_tokens = [
                token for gpu_token in stop_tokens for token in gpu_token
            ]

            if not self.gta:
                #Natural batch synthesis
                #Get Mel lengths for the entire batch from stop_tokens predictions
                target_lengths = self._get_output_lengths(stop_tokens)

            #Take off the batch wise padding
            mels = [
                mel[:target_length, :]
                for mel, target_length in zip(mels, target_lengths)
            ]
            assert len(mels) == len(texts)

        else:
            linears, mels, alignments, stop_tokens = self.session.run(
                [
                    self.linear_outputs, self.mel_outputs, self.alignments,
                    self.stop_token_prediction
                ],
                feed_dict=feed_dict)

            #Linearize outputs (1D arrays)
            linears = [
                linear for gpu_linear in linears for linear in gpu_linear
            ]
            mels = [mel for gpu_mels in mels for mel in gpu_mels]
            alignments = [
                align for gpu_aligns in alignments for align in gpu_aligns
            ]
            stop_tokens = [
                token for gpu_token in stop_tokens for token in gpu_token
            ]

            #Natural batch synthesis
            #Get Mel/Linear lengths for the entire batch from stop_tokens predictions
            target_lengths = self._get_output_lengths(stop_tokens)

            #Take off the batch wise padding
            mels = [
                mel[:target_length, :]
                for mel, target_length in zip(mels, target_lengths)
            ]
            # Clip each array on its own: after trimming, the utterances have
            # different lengths and cannot be stacked into one ndarray.
            linears = [
                np.clip(linear[:target_length, :], T2_output_range[0],
                        T2_output_range[1])
                for linear, target_length in zip(linears, target_lengths)
            ]
            assert len(mels) == len(linears) == len(texts)

        # Same per-array clipping for the (ragged) list of mels
        mels = [
            np.clip(mel, T2_output_range[0], T2_output_range[1])
            for mel in mels
        ]

        if basenames is None:
            #Generate wav and read it
            if hparams.GL_on_GPU:
                wav = self.session.run(
                    self.GLGPU_mel_outputs,
                    feed_dict={self.GLGPU_mel_inputs: mels[0]})
                wav = audio.inv_preemphasis(wav, hparams.preemphasis,
                                            hparams.preemphasize)
            else:
                wav = audio.inv_mel_spectrogram(mels[0].T, hparams)
            audio.save_wav(wav, 'temp.wav',
                           sr=hparams.sample_rate)  #Find a better way

            if platform.system() == 'Linux':
                #Linux wav reader
                os.system('aplay temp.wav')

            elif platform.system() == 'Windows':
                #windows wav reader
                os.system('start /min mplay32 /play /close temp.wav')

            else:
                raise RuntimeError(
                    'Your OS type is not supported yet, please add it to "tacotron/synthesizer.py, line-165" and feel free to make a Pull Request ;) Thanks!'
                )

            return

        saved_mels_paths = []
        speaker_ids = []
        for i, mel in enumerate(mels):
            #Get speaker id for global conditioning (only used with GTA generally)
            if hparams.gin_channels > 0:
                #No rule to derive a speaker id (e.g. from the basename) is implemented
                raise RuntimeError(
                    'Please set the speaker_id rule in line 99 of tacotron/synthesizer.py to allow for global condition usage later.'
                )
            speaker_ids.append('<no_g>')

            # Write the spectrogram to disk
            # Note: outputs mel-spectrogram files and target ones have same names, just different folders
            mel_filename = os.path.join(out_dir,
                                        'mel-{}.npy'.format(basenames[i]))
            np.save(mel_filename, mel, allow_pickle=False)
            saved_mels_paths.append(mel_filename)

            if log_dir is not None:
                #save wav (mel -> wav)
                if hparams.GL_on_GPU:
                    wav = self.session.run(
                        self.GLGPU_mel_outputs,
                        feed_dict={self.GLGPU_mel_inputs: mel})
                    wav = audio.inv_preemphasis(wav, hparams.preemphasis,
                                                hparams.preemphasize)
                else:
                    wav = audio.inv_mel_spectrogram(mel.T, hparams)
                audio.save_wav(wav,
                               os.path.join(
                                   log_dir,
                                   'wavs/wav-{}-mel.wav'.format(basenames[i])),
                               sr=hparams.sample_rate)

                #save alignments
                plot.plot_alignment(alignments[i],
                                    os.path.join(
                                        log_dir,
                                        'plots/alignment-{}.png'.format(
                                            basenames[i])),
                                    title='{}'.format(texts[i]),
                                    split_title=True,
                                    max_len=target_lengths[i])

                #save mel spectrogram plot
                plot.plot_spectrogram(
                    mel,
                    os.path.join(log_dir,
                                 'plots/mel-{}.png'.format(basenames[i])),
                    title='{}'.format(texts[i]),
                    split_title=True)

                if hparams.predict_linear and linears is not None:
                    #save wav (linear -> wav)
                    if hparams.GL_on_GPU:
                        wav = self.session.run(
                            self.GLGPU_lin_outputs,
                            feed_dict={self.GLGPU_lin_inputs: linears[i]})
                        wav = audio.inv_preemphasis(wav, hparams.preemphasis,
                                                    hparams.preemphasize)
                    else:
                        wav = audio.inv_linear_spectrogram(
                            linears[i].T, hparams)
                    audio.save_wav(wav,
                                   os.path.join(
                                       log_dir,
                                       'wavs/wav-{}-linear.wav'.format(
                                           basenames[i])),
                                   sr=hparams.sample_rate)

                    #save linear spectrogram plot
                    plot.plot_spectrogram(linears[i],
                                          os.path.join(
                                              log_dir,
                                              'plots/linear-{}.png'.format(
                                                  basenames[i])),
                                          title='{}'.format(texts[i]),
                                          split_title=True,
                                          auto_aspect=True)

        return saved_mels_paths, speaker_ids
コード例 #14
0
def train(log_dir, args, hparams):
    """Train the Tacotron model with periodic summaries, evaluation and checkpoints.

    Args:
        log_dir: Root output directory. The sub-directories ``taco_pretrained``,
            ``plots``, ``wavs``, ``features``, ``eval-dir`` (with its own
            ``plots``/``wavs``) and ``tacotron_events`` are created below it.
        args: Run options. This function reads ``base_dir``, ``tacotron_input``,
            ``model``, ``restore``, ``tacotron_train_steps``,
            ``checkpoint_interval``, ``summary_interval`` and ``eval_interval``.
        hparams: Hyper-parameters. This function reads
            ``tacotron_random_seed`` and ``outputs_per_step`` and forwards the
            object to the feeder, the model builders and the audio helpers.

    Returns:
        The checkpoint directory (``<log_dir>/taco_pretrained``) once the
        training loop ends. If an exception escapes the loop it is logged, the
        coordinator is asked to stop and ``None`` is returned implicitly.
    """
    save_dir = os.path.join(log_dir, 'taco_pretrained')
    plot_dir = os.path.join(log_dir, 'plots')
    wav_dir = os.path.join(log_dir, 'wavs')
    feat_dir = os.path.join(log_dir, 'features')
    eval_dir = os.path.join(log_dir, 'eval-dir')
    eval_plot_dir = os.path.join(eval_dir, 'plots')
    eval_wav_dir = os.path.join(eval_dir, 'wavs')
    tensorboard_dir = os.path.join(log_dir, 'tacotron_events')
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(feat_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(eval_plot_dir, exist_ok=True)
    os.makedirs(eval_wav_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)

    checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt')
    input_path = os.path.join(args.base_dir, args.tacotron_input)

    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(input_path))
    log('Using model: {}'.format(args.model))
    log(hparams_debug_string())

    #Start by setting a seed for repeatability
    tf.set_random_seed(hparams.tacotron_random_seed)

    #Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder'):
        feeder = Feeder(coord, input_path, hparams)

    #Set up model:
    global_step = tf.Variable(0, name='global_step', trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, global_step)
    eval_model = model_test_mode(args, feeder, hparams, global_step)

    #Book keeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=1)

    log('Tacotron training set to a maximum of {} steps'.format(
        args.tacotron_train_steps))

    #Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    #Train
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

            #saved model restoring
            if args.restore:
                # Restore saved model if the user requested it, default = True
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)
                    if (checkpoint_state
                            and checkpoint_state.model_checkpoint_path):
                        log('Loading checkpoint {}'.format(
                            checkpoint_state.model_checkpoint_path),
                            slack=True)
                        saver.restore(sess,
                                      checkpoint_state.model_checkpoint_path)
                    else:
                        log('No model to load at {}'.format(save_dir),
                            slack=True)

                except tf.errors.OutOfRangeError as e:
                    log('Cannot restore checkpoint: {}'.format(e), slack=True)
            else:
                log('Starting new training!', slack=True)

            #initializing feeder
            feeder.start_threads(sess)

            #Training loop
            while not coord.should_stop() and step < args.tacotron_train_steps:
                start_time = time.time()
                # The optimizer op is run for its side effect only.
                step, loss, _ = sess.run(
                    [global_step, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
                    step, time_window.average, loss, loss_window.average)
                log(message,
                    end='\r',
                    slack=(step % args.checkpoint_interval == 0))

                # Abort early on divergence; the outer handler logs the
                # failure and stops the feeder threads.
                if loss > 100 or np.isnan(loss):
                    log('Loss exploded to {:.5f} at step {}'.format(
                        loss, step))
                    raise Exception('Loss exploded')

                if step % args.summary_interval == 0:
                    log('\nWriting summary at step {}'.format(step))
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.eval_interval == 0:
                    #Run eval and save eval stats
                    log('\nRunning evaluation at step {}'.format(step))

                    eval_losses = []
                    before_losses = []
                    after_losses = []
                    stop_token_losses = []
                    attention_losses = []

                    for i in tqdm(range(feeder.test_steps)):
                        eloss, before_loss, after_loss, stop_token_loss, attention_loss, feature_prediction, target_len, align = sess.run(
                            [
                                eval_model.loss, eval_model.before_loss,
                                eval_model.after_loss,
                                eval_model.stop_token_loss,
                                eval_model.attention_loss,
                                eval_model.final_outputs[0],
                                eval_model.targets_lengths[0],
                                eval_model.alignments[0]
                            ])
                        eval_losses.append(eloss)
                        before_losses.append(before_loss)
                        after_losses.append(after_loss)
                        stop_token_losses.append(stop_token_loss)
                        attention_losses.append(attention_loss)

                    # Average every loss over the evaluation batches.
                    eval_loss = sum(eval_losses) / len(eval_losses)
                    before_loss = sum(before_losses) / len(before_losses)
                    after_loss = sum(after_losses) / len(after_losses)
                    stop_token_loss = sum(stop_token_losses) / len(
                        stop_token_losses)
                    attention_loss = sum(attention_losses) / len(
                        attention_losses)

                    log('Saving eval log to {}..'.format(eval_dir))
                    #Save some log to monitor model improvement on same unseen sequence
                    # Only the outputs of the last evaluation batch are kept.
                    wav = audio.synthesize(feature_prediction, hparams)
                    audio.save_wav(
                        wav,
                        os.path.join(eval_wav_dir,
                                     'step-{}-eval-waveform.wav'.format(step)),
                        hparams)

                    plot.plot_alignment(
                        align,
                        os.path.join(eval_plot_dir,
                                     'step-{}-eval-align.png'.format(step)),
                        info='{}, {}, step={}, loss={:.5f}'.format(
                            args.model, time_string(), step, eval_loss),
                        max_len=target_len // hparams.outputs_per_step)

                    log('Eval loss for global step {}: {:.3f}'.format(
                        step, eval_loss))
                    log('Writing eval summary!')
                    add_eval_stats(summary_writer, step, before_loss,
                                   after_loss, stop_token_loss, attention_loss,
                                   eval_loss)

                if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps:
                    #Save model and current global step
                    saver.save(sess, checkpoint_path, global_step=global_step)
                    # Export the frozen graph (variables converted to
                    # constants) rather than the raw session graph, so the
                    # result of the conversion is what ends up in graph.pb.
                    frozen_graph_def = tf.graph_util.convert_variables_to_constants(
                        sess, sess.graph_def, ['model/inference/add'])
                    tf.train.write_graph(frozen_graph_def,
                                         save_dir,
                                         'graph.pb',
                                         as_text=False)

                    log('\nSaving alignment and World vocoder synthesized waveform..'
                        )
                    input_seq, feature_prediction, alignment, target_length = sess.run(
                        [
                            model.inputs[0], model.final_outputs[0],
                            model.alignments[0], model.targets_lengths[0]
                        ])

                    #save World vocoder waveform for debug
                    wav = audio.synthesize(feature_prediction, hparams)
                    audio.save_wav(
                        wav, os.path.join(wav_dir, 'step-{}.wav'.format(step)),
                        hparams)

                    #save alignment plot to disk (control purposes)
                    plot.plot_alignment(
                        alignment,
                        os.path.join(plot_dir,
                                     'step-{}-align.png'.format(step)),
                        info='{}, {}, step={}, loss={:.5f}'.format(
                            args.model, time_string(), step, loss),
                        max_len=target_length // hparams.outputs_per_step)
                    log('Input at step {}: {}'.format(
                        step, sequence_to_text(input_seq)))

            log('Tacotron training complete after {} global steps!'.format(
                args.tacotron_train_steps),
                slack=True)
            return save_dir

        except Exception as e:
            # Top-level boundary: report the failure and stop the feeder
            # threads instead of propagating the error to the caller.
            log('Exiting due to exception: {}'.format(e), slack=True)
            traceback.print_exc()
            coord.request_stop(e)
コード例 #15
0
    def synthesize(self, text, step, out_dir, log_dir, mel_filenames, cwd):
        """Synthesize one utterance and write evaluation artifacts for ``step``.

        A waveform (from the linear spectrogram when ``predict_linear`` is set,
        otherwise from the mel spectrogram), an alignment plot and a mel plot
        are written to ``<cwd>/logs-Tacotron-2/wav_plot``. That directory is
        not created here.

        Args:
            text: Input text; only its first 30 characters are used for plot
                titles and the final console output.
            step: Step identifier embedded in every output file name.
            out_dir: Unused by this method.
            log_dir: Unused by this method.
            mel_filenames: Unused by this method.
            cwd: Base directory under which the artifacts are written.

        Returns:
            None.
        """
        hp = self._hparams

        # Valid output range: [-max, max] for symmetric mels, else [0, max].
        if hp.symmetric_mels:
            clip_lo, clip_hi = -hp.max_abs_value, hp.max_abs_value
        else:
            clip_lo, clip_hi = 0, hp.max_abs_value

        # Single-item batch built from the encoded text.
        encoded = [np.asarray(text_to_sequence(text))]
        feed_dict = {
            self.inputs: np.asarray(encoded, dtype=np.int32),
            self.input_lengths: np.asarray([len(encoded[0])], dtype=np.int32),
        }

        linear = None
        if hp.predict_linear:
            linears, mels, alignments, stop_tokens = self.session.run(
                [
                    self.linear_outputs, self.mel_outputs, self.alignments,
                    self.stop_token_prediction
                ],
                feed_dict=feed_dict)
            linear = linears[0]
        else:
            mels, alignments, stop_tokens = self.session.run(
                [
                    self.mel_outputs, self.alignments,
                    self.stop_token_prediction
                ],
                feed_dict=feed_dict)

        artifact_dir = os.path.join(cwd, 'logs-Tacotron-2/wav_plot')

        # Output length = position of the first rounded stop token equal to 1,
        # or the full length when no such token exists.
        alignment = alignments[0]
        rounded_stops = np.round(stop_tokens[0]).tolist()
        try:
            target_length = rounded_stops.index(1)
        except ValueError:
            target_length = len(rounded_stops)

        mel = np.clip(mels[0][:target_length, :], clip_lo, clip_hi)

        # Exactly one waveform is produced, from whichever spectrogram the
        # model is configured to predict.
        if hp.predict_linear:
            linear = np.clip(linear[:target_length, :], clip_lo, clip_hi)
            wav = audio.inv_linear_spectrogram(linear.T, hp)
            wav_name = 'eval-step_{}-from_linear.wav'.format(step)
        else:
            wav = audio.inv_mel_spectrogram(mel.T, hp)
            wav_name = 'eval-step_{}-from_mel.wav'.format(step)
        audio.save_wav(wav,
                       os.path.join(artifact_dir, wav_name),
                       sr=hp.sample_rate)

        # Titles and console output only show the first 30 characters.
        short_text = text[:30]
        title = '{}'.format(short_text)

        alignment_path = os.path.join(
            artifact_dir, 'eval-step_{}-alignment.png'.format(step))
        plot.plot_alignment(alignment,
                            alignment_path,
                            title=title,
                            split_title=True,
                            max_len=target_length)

        #save mel spectrogram plot
        mel_plot_path = os.path.join(artifact_dir,
                                     'eval-step_{}-mel.png'.format(step))
        plot.plot_spectrogram(mel,
                              mel_plot_path,
                              title=title,
                              split_title=True)

        print('step: {}'.format(step))
        print('pyin: {}'.format(short_text))
コード例 #16
0
    def synthesize(self, texts, basenames, mel_dir, wav_dir, plot_dir,
                   mel_filenames):
        """Synthesize a batch of texts and save mels, waveforms and plots.

        ``texts``, ``basenames`` and (when given) ``mel_filenames`` are padded
        IN PLACE by repeating their last element until the batch size is a
        multiple of ``tacotron_synthesis_batch_size``; the returned list
        therefore also contains entries for those repeated samples.

        Args:
            texts: List of input strings.
            basenames: List of base names used to build output file names.
            mel_dir: Directory receiving ``mel-<basename>.npy`` files.
            wav_dir: Directory receiving ``wav-<basename>-mel.wav`` files.
            plot_dir: Directory receiving alignment and mel plots.
            mel_filenames: Optional list; it is only padded alongside ``texts``
                and is not otherwise used here.

        Returns:
            List of paths of the saved mel-spectrogram ``.npy`` files.
        """
        hparams = self._hparams
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        #[-max, max] or [0,max]
        T2_output_range = (
            -hparams.max_abs_value,
            hparams.max_abs_value) if hparams.symmetric_mels else (
                0, hparams.max_abs_value)

        #Repeat last sample until number of samples is dividable by the number of GPUs (last run scenario)
        while len(texts) % hparams.tacotron_synthesis_batch_size != 0:
            texts.append(texts[-1])
            basenames.append(basenames[-1])
            if mel_filenames is not None:
                mel_filenames.append(mel_filenames[-1])

        assert 0 == len(texts) % self._hparams.tacotron_num_gpus
        seqs = [
            np.asarray(text_to_sequence(text, cleaner_names)) for text in texts
        ]
        input_lengths = [len(seq) for seq in seqs]
        size_per_device = len(seqs) // self._hparams.tacotron_num_gpus

        #Pad inputs according to each GPU max length
        input_seqs = None
        split_infos = []
        for i in range(self._hparams.tacotron_num_gpus):
            device_input = seqs[size_per_device * i:size_per_device * (i + 1)]
            device_input, max_seq_len = self._prepare_inputs(device_input)
            # Per-device inputs are laid side by side along axis 1.
            input_seqs = np.concatenate(
                (input_seqs, device_input),
                axis=1) if input_seqs is not None else device_input
            split_infos.append([max_seq_len, 0, 0, 0])
        feed_dict = {
            self.inputs: input_seqs,
            self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
        }
        feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32)
        mels, alignments, stop_tokens = self.session.run(
            [self.mel_outputs, self.alignments, self.stop_token_prediction],
            feed_dict=feed_dict)

        #Linearize outputs (n_gpus -> 1D)
        mels = [mel for gpu_mels in mels for mel in gpu_mels]
        alignments = [
            align for gpu_aligns in alignments for align in gpu_aligns
        ]
        stop_tokens = [
            token for gpu_token in stop_tokens for token in gpu_token
        ]

        #Natural batch synthesis
        #Get Mel lengths for the entire batch from stop_tokens predictions
        target_lengths = self._get_output_lengths(stop_tokens)

        #Take off the batch wise padding and clip to the valid output range.
        # Clipping is done per utterance: after trimming, the mels generally
        # have different lengths, so the batch cannot be clipped as a single
        # rectangular array.
        mels = [
            np.clip(mel[:target_length, :], T2_output_range[0],
                    T2_output_range[1])
            for mel, target_length in zip(mels, target_lengths)
        ]
        assert len(mels) == len(texts)

        saved_mels_paths = []
        for i, mel in enumerate(mels):

            # Write the spectrogram to disk
            # Note: outputs mel-spectrogram files and target ones have same names, just different folders
            mel_filename = os.path.join(mel_dir,
                                        'mel-{}.npy'.format(basenames[i]))
            np.save(mel_filename, mel, allow_pickle=False)
            saved_mels_paths.append(mel_filename)

            #save wav (mel -> wav)
            # The waveform is produced by the in-graph GLGPU op and then
            # passed through inverse pre-emphasis.
            wav = self.session.run(self.GLGPU_mel_outputs,
                                   feed_dict={self.GLGPU_mel_inputs: mel})
            wav = audio.inv_preemphasis(wav, hparams.preemphasis,
                                        hparams.preemphasize)

            audio.save_wav(wav,
                           os.path.join(wav_dir,
                                        'wav-{}-mel.wav'.format(basenames[i])),
                           sr=hparams.sample_rate)

            #save alignments
            plot.plot_alignment(alignments[i],
                                os.path.join(
                                    plot_dir,
                                    'alignment-{}.png'.format(basenames[i])),
                                title='{}'.format(texts[i]),
                                split_title=True,
                                max_len=target_lengths[i])

            #save mel spectrogram plot
            plot.plot_spectrogram(mel,
                                  os.path.join(
                                      plot_dir,
                                      'mel-{}.png'.format(basenames[i])),
                                  title='{}'.format(texts[i]),
                                  split_title=True)

        return saved_mels_paths
コード例 #17
0
    def synthesize(self, texts, speakers, basenames, out_dir, log_dir,
                   mel_filenames, reference_mels, Lf0s):
        """Run batched synthesis and save (or play back) the results.

        All list arguments that are given are padded IN PLACE by repeating
        their last element until the batch size is a multiple of
        ``tacotron_synthesis_batch_size``; the returned list therefore also
        contains entries for those repeated samples.

        Args:
            texts: List of input sequences fed to the model as-is (the
                ``text_to_sequence`` conversion is disabled here) — presumably
                already-encoded symbol ids; TODO confirm against the caller.
            speakers: List of integer speaker ids, one per text.
            basenames: List of base names for output files, or ``None`` to
                play the first synthesized utterance through the audio device
                instead of saving files.
            out_dir: Directory receiving ``mel-<basename>.npy`` files.
            log_dir: Optional directory whose ``wavs``/``plots`` sub-folders
                receive debug waveforms and plots; ``None`` disables them.
            mel_filenames: Paths of target mel ``.npy`` files; loaded only
                when ``self.gta`` is set.
            reference_mels: Optional list of reference mels.
            Lf0s: Optional list of Lf0 sequences.

        Returns:
            List of paths of the saved mel ``.npy`` files, or ``None`` in the
            playback mode (``basenames is None``).
        """
        hparams = self._hparams
        num_gpus = hparams.tacotron_num_gpus
        # Stop-token based length prediction is disabled in this variant; this
        # slice bound leaves any output shorter than 9999 frames untrimmed.
        no_trim_length = 9999

        #Repeat last sample until number of samples is dividable by the number of GPUs (last run scenario)
        while len(texts) % hparams.tacotron_synthesis_batch_size != 0:
            texts.append(texts[-1])
            speakers.append(speakers[-1])
            if basenames is not None:
                basenames.append(basenames[-1])
            if mel_filenames is not None:
                mel_filenames.append(mel_filenames[-1])
            if reference_mels is not None:
                reference_mels.append(reference_mels[-1])
            # Lf0s must stay aligned with texts (see the length assert below).
            if Lf0s is not None:
                Lf0s.append(Lf0s[-1])

        assert 0 == len(texts) % num_gpus
        seqs = texts
        input_lengths = [len(seq) for seq in seqs]

        size_per_device = len(seqs) // num_gpus

        #Pad inputs according to each GPU max length
        input_seqs = None
        split_infos = []
        for i in range(num_gpus):
            device_input = seqs[size_per_device * i:size_per_device * (i + 1)]
            device_input, max_seq_len = self._prepare_inputs(device_input)
            input_seqs = np.concatenate(
                (input_seqs, device_input),
                axis=1) if input_seqs is not None else device_input
            split_infos.append([max_seq_len, 0, 0, 0, 0, 0])

        feed_dict = {
            self.inputs: input_seqs,
            self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
            self.speakers: np.asarray(speakers, dtype=np.int32)
        }

        if self.gta:
            np_targets = [
                np.load(mel_filename) for mel_filename in mel_filenames
            ]

            #pad targets according to each GPU max length
            target_seqs = None
            for i in range(num_gpus):
                device_target = np_targets[size_per_device *
                                           i:size_per_device * (i + 1)]
                device_target, max_target_len = self._prepare_targets(
                    device_target, hparams.outputs_per_step)
                target_seqs = np.concatenate(
                    (target_seqs, device_target),
                    axis=1) if target_seqs is not None else device_target
                #Not really used but setting it in case for future development maybe?
                split_infos[i][1] = max_target_len

            feed_dict[self.targets] = target_seqs
            assert len(np_targets) == len(texts)

        if reference_mels is not None:
            np_refs = [
                np.asarray(reference_mel) for reference_mel in reference_mels
            ]
            reference_lengths = [len(np_ref) for np_ref in np_refs]

            ref_seqs = None
            for i in range(num_gpus):
                device_ref = np_refs[size_per_device * i:size_per_device *
                                     (i + 1)]
                device_ref, max_ref_len = self._prepare_targets(
                    device_ref, hparams.outputs_per_step)
                ref_seqs = np.concatenate(
                    (ref_seqs, device_ref),
                    axis=1) if ref_seqs is not None else device_ref
                split_infos[i][-1] = max_ref_len
            feed_dict[self.reference_mels] = ref_seqs
            feed_dict[self.reference_lengths] = reference_lengths
            assert len(np_refs) == len(texts)

        # Lf0 input (added 2020-07-24).
        if Lf0s is not None:
            np_Lf0s = [np.asarray(Lf0) for Lf0 in Lf0s]

            Lf0_seqs = None
            for i in range(num_gpus):
                device_Lf0 = np_Lf0s[size_per_device * i:size_per_device *
                                     (i + 1)]
                # Pad to THIS device's input length (stored in
                # split_infos[i][0]) so framing differences cannot produce
                # mismatching lengths.
                device_Lf0, max_Lf0_len = self._prepare_F0_inputs(
                    device_Lf0, split_infos[i][0])
                Lf0_seqs = np.concatenate(
                    (Lf0_seqs, device_Lf0),
                    axis=1) if Lf0_seqs is not None else device_Lf0
                # NOTE(review): this writes the same last slot as the
                # reference-mel length above — confirm the split_infos layout.
                split_infos[i][-1] = max_Lf0_len
            feed_dict[self.Lf0s] = Lf0_seqs
            assert len(np_Lf0s) == len(texts)

            # Debug marker, printed when the last Lf0 dimension is not 2.
            # Kept inside the guard: Lf0_seqs only exists when Lf0s is given.
            if Lf0_seqs.shape[-1] != 2:
                print(2333)

        feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32)

        # stop_tokens are still fetched (same session call as before) but are
        # not used because length prediction is disabled.
        if self.gta or not hparams.predict_linear:
            linears = None
            mels, alignments, _stop_tokens = self.session.run(
                [
                    self.mel_outputs, self.alignments,
                    self.stop_token_prediction
                ],
                feed_dict=feed_dict)
        else:
            linears, mels, alignments, _stop_tokens = self.session.run(
                [
                    self.linear_outputs, self.mel_outputs, self.alignments,
                    self.stop_token_prediction
                ],
                feed_dict=feed_dict)
            linears = [
                linear for gpu_linear in linears for linear in gpu_linear
            ]

        #Linearize outputs (1D arrays)
        mels = [mel for gpu_mels in mels for mel in gpu_mels]
        alignments = [
            align for gpu_aligns in alignments for align in gpu_aligns
        ]

        #Take off the batch wise padding
        # One length per utterance, so zip() below keeps the whole batch and
        # target_lengths[i] is valid for every i.
        target_lengths = [no_trim_length] * len(mels)
        mels = [
            mel[:target_length, :]
            for mel, target_length in zip(mels, target_lengths)
        ]
        assert len(mels) == len(texts)
        if linears is not None:
            linears = [
                linear[:target_length, :]
                for linear, target_length in zip(linears, target_lengths)
            ]
            assert len(linears) == len(texts)

        if basenames is None:
            #Generate wav and read it
            # Playback mode only handles the first utterance of the batch.
            wav = audio.inv_mel_spectrogram(mels[0].T, hparams)
            audio.save_wav(wav, 'temp.wav',
                           sr=hparams.sample_rate)  #Find a better way

            chunk = 512
            with wave.open('temp.wav', 'rb') as f:
                p = pyaudio.PyAudio()
                try:
                    stream = p.open(
                        format=p.get_format_from_width(f.getsampwidth()),
                        channels=f.getnchannels(),
                        rate=f.getframerate(),
                        output=True)
                    try:
                        data = f.readframes(chunk)
                        while data:
                            stream.write(data)
                            data = f.readframes(chunk)
                        stream.stop_stream()
                    finally:
                        stream.close()
                finally:
                    # Release the audio backend even if playback failed.
                    p.terminate()
            return

        saved_mels_paths = []
        for i, mel in enumerate(mels):
            # Write the spectrogram to disk
            # Note: outputs mel-spectrogram files and target ones have same names, just different folders
            mel_filename = os.path.join(out_dir,
                                        'mel-{}.npy'.format(basenames[i]))
            np.save(mel_filename, mel, allow_pickle=False)
            saved_mels_paths.append(mel_filename)

            if log_dir is not None:
                #save wav (mel -> wav)
                wav = audio.inv_mel_spectrogram(mel.T, hparams)
                audio.save_wav(wav,
                               os.path.join(
                                   log_dir,
                                   'wavs/wav-{}-mel.wav'.format(basenames[i])),
                               sr=hparams.sample_rate)

                #save alignments
                plot.plot_alignment(
                    alignments[i],
                    os.path.join(log_dir, 'plots/alignment-{}.png'.format(
                        basenames[i])),
                    title='speaker_id = {:d}'.format(speakers[i]),
                    split_title=True,
                    max_len=target_lengths[i])

                #save mel spectrogram plot
                plot.plot_spectrogram(
                    mel,
                    os.path.join(log_dir,
                                 'plots/mel-{}.png'.format(basenames[i])),
                    title='speaker_id = {:d}'.format(speakers[i]),
                    split_title=True)

                # Linear outputs only exist when they were fetched above
                # (predict_linear set and not in GTA mode).
                if linears is not None:
                    #save wav (linear -> wav)
                    wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
                    audio.save_wav(wav,
                                   os.path.join(
                                       log_dir,
                                       'wavs/wav-{}-linear.wav'.format(
                                           basenames[i])),
                                   sr=hparams.sample_rate)

                    #save linear spectrogram plot
                    plot.plot_spectrogram(
                        linears[i],
                        os.path.join(
                            log_dir,
                            'plots/linear-{}.png'.format(basenames[i])),
                        title='speaker_id = {:d}'.format(speakers[i]),
                        split_title=True,
                        auto_aspect=True)

        return saved_mels_paths
コード例 #18
0
def train(log_dir, args, hparams):
    save_dir = os.path.join(log_dir, 'taco_pretrained')
    wav_plot = os.path.join(log_dir, 'wav_plot')

    tensorboard_dir = os.path.join(log_dir, 'tacotron_events')
    meta_folder = os.path.join(log_dir, 'metas')
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)
    os.makedirs(meta_folder, exist_ok=True)
    os.makedirs(wav_plot, exist_ok=True)

    checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt')
    input_path = os.path.join(args.data_dir, args.tacotron_input)

    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(input_path))
    log('Using model: {}'.format(args.model))
    log(hparams_debug_string())

    #Start by setting a seed for repeatability
    tf.set_random_seed(hparams.tacotron_random_seed)

    #Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder') as scope:
        feeder = Feeder(coord, input_path, hparams)

    #Set up model:
    global_step = tf.Variable(0, name='global_step', trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, global_step)

    #Embeddings metadata
    char_embedding_meta = os.path.join(meta_folder, 'CharacterEmbeddings.tsv')
    if not os.path.isfile(char_embedding_meta):
        with open(char_embedding_meta, 'w', encoding='utf-8') as f:
            for symbol in symbols:
                if symbol == ' ':
                    symbol = '\\s'  #For visual purposes, swap space with \s

                f.write('{}\n'.format(symbol))

    char_embedding_meta = char_embedding_meta.replace(log_dir, '..')

    #Book keeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=20)

    log('Tacotron training set to a maximum of {} steps'.format(
        args.tacotron_train_steps))

    #Memory allocation on the GPU as needed
    '''
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True
    '''

    #Train
    with tf.Session() as sess:  #config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)

            sess.run(tf.global_variables_initializer())

            #saved model restoring
            if args.restore:
                # Restore saved model if the user requested it, default = True
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

                    if (checkpoint_state
                            and checkpoint_state.model_checkpoint_path):
                        saver.restore(sess,
                                      checkpoint_state.model_checkpoint_path)
                        #initial_global_step = global_step.assign(0)
                        #sess.run(initial_global_step)

                    else:
                        log('No model to load at {}'.format(save_dir),
                            slack=True)
                        saver.save(sess,
                                   checkpoint_path,
                                   global_step=global_step)

                except tf.errors.OutOfRangeError as e:
                    log('Cannot restore checkpoint: {}'.format(e), slack=True)
            else:
                log('Starting new training!', slack=True)
                saver.save(sess, checkpoint_path, global_step=global_step)

            #initializing feeder
            feeder.start_threads(sess)

            #Training loop
            while not coord.should_stop() and step < args.tacotron_train_steps:
                start_time = time.time()
                step, loss, opt, before_loss, after_loss, token_loss, reg_loss = sess.run(
                    [
                        global_step, model.loss, model.optimize,
                        model.before_loss, model.after_loss,
                        model.stop_token_loss, model.regularization_loss
                    ])

                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = 'Step{:6d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}, mel_before={:.5f}, mel_after={:.5f}, token_loss={:.5f}, reg_loss={:.5f}]'.format(
                    step, time_window.average, loss, loss_window.average,
                    before_loss, after_loss, token_loss, reg_loss)

                log(message,
                    end='\r',
                    slack=(step % args.checkpoint_interval == 0))

                if np.isnan(loss) or loss > 100.:
                    log('Loss exploded to {:.5f} at step {}'.format(
                        loss, step))
                    raise Exception('Loss exploded')

                if step % args.summary_interval == 0:
                    log('\nWriting summary at step {}'.format(step))
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or step == 300:
                    #Save model and current global step
                    saver.save(sess, checkpoint_path, global_step=global_step)

                    log('\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..'
                        )

                    input_seq, mel_prediction, alignment, target, target_length = sess.run(
                        [
                            model.inputs[0],
                            model.mel_outputs[0],
                            model.alignments[0],
                            model.mel_targets[0],
                            model.targets_lengths[0],
                        ])

                    wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)

                    audio.save_wav(
                        wav,
                        os.path.join(wav_plot,
                                     'step-{}-wave-from-mel.wav'.format(step)),
                        sr=hparams.sample_rate)

                    #save alignment plot to disk (control purposes)
                    plot.plot_alignment(
                        alignment,
                        os.path.join(wav_plot,
                                     'step-{}-align.png'.format(step)),
                        title='{}, {}, step={}, loss={:.5f}'.format(
                            args.model, time_string(), step, loss),
                        max_len=target_length // hparams.outputs_per_step)
                    #save real and predicted mel-spectrogram plot to disk (control purposes)
                    plot.plot_spectrogram(
                        mel_prediction,
                        os.path.join(
                            wav_plot,
                            'step-{}-mel-spectrogram.png'.format(step)),
                        title='{}, {}, step={}, loss={:.5f}'.format(
                            args.model, time_string(), step, loss),
                        target_spectrogram=target,
                        max_len=target_length)

                    print(', '.join(map(str, input_seq.tolist())))

                    log('Input at step {}: {}'.format(
                        step, sequence_to_text(input_seq)))

                if step % args.embedding_interval == 0 or step == args.tacotron_train_steps or step == 1:
                    #Get current checkpoint state
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

                    #Update Projector
                    log('\nSaving Model Character Embeddings visualization..')
                    add_embedding_stats(summary_writer,
                                        [model.embedding_table.name],
                                        [char_embedding_meta],
                                        checkpoint_state.model_checkpoint_path)
                    log('Tacotron Character embeddings have been updated on tensorboard!'
                        )

            log('Tacotron training complete after {} global steps!'.format(
                args.tacotron_train_steps),
                slack=True)
            return save_dir

        except Exception as e:
            log('Exiting due to exception: {}'.format(e), slack=True)
            traceback.print_exc()
            coord.request_stop(e)
コード例 #19
0
def train(log_dir, args, hparams):
    """Run the Tacotron training loop with periodic eval and checkpointing.

    Output folders are created under ``log_dir`` (checkpoints, plots, wavs,
    mel/linear spectrogram dumps, eval artefacts, tensorboard events and
    embedding metadata). The training graph and an evaluation graph are built
    on top of a shared ``Feeder``; the loop then runs optimisation steps until
    ``args.tacotron_train_steps`` is reached or the coordinator asks to stop.

    Args:
        log_dir: Root directory under which every output folder is created.
        args: Run options. The attributes read here are ``base_dir``,
            ``tacotron_input``, ``model``, ``restore``,
            ``tacotron_train_steps``, ``checkpoint_interval``,
            ``summary_interval``, ``eval_interval`` and
            ``embedding_interval``.
        hparams: Hyper-parameters. The attributes read here are
            ``predict_linear``, ``tacotron_multi_speaker``,
            ``tacotron_random_seed``, ``outputs_per_step`` and
            ``sample_rate``; the object is also forwarded to the feeder,
            the model builders and the audio helpers.

    Returns:
        The checkpoint directory (``<log_dir>/taco_pretrained``) when the
        loop finishes normally. If an exception is raised inside the
        session it is logged, handed to the coordinator via
        ``request_stop`` and the function falls through, returning ``None``.
    """
    save_dir = os.path.join(log_dir, 'taco_pretrained')
    plot_dir = os.path.join(log_dir, 'plots')
    wav_dir = os.path.join(log_dir, 'wavs')
    mel_dir = os.path.join(log_dir, 'mel-spectrograms')
    eval_dir = os.path.join(log_dir, 'eval-dir')
    eval_plot_dir = os.path.join(eval_dir, 'plots')
    eval_wav_dir = os.path.join(eval_dir, 'wavs')
    tensorboard_dir = os.path.join(log_dir, 'tacotron_events')
    meta_folder = os.path.join(log_dir, 'metas')
    for directory in (save_dir, plot_dir, wav_dir, mel_dir, eval_dir,
                      eval_plot_dir, eval_wav_dir, tensorboard_dir,
                      meta_folder):
        os.makedirs(directory, exist_ok=True)

    checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt')
    input_path = os.path.join(args.base_dir, args.tacotron_input)

    # linear_dir only exists (and is only used) when linear prediction is on.
    if hparams.predict_linear:
        linear_dir = os.path.join(log_dir, 'linear-spectrograms')
        os.makedirs(linear_dir, exist_ok=True)

    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(input_path))
    log('Using model: {}'.format(args.model))
    log(hparams_debug_string())

    # Start by setting a seed for repeatability
    tf.set_random_seed(hparams.tacotron_random_seed)

    # Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder'):
        feeder = Feeder(coord, input_path, hparams)

    # Set up model:
    global_step = tf.Variable(0, name='global_step', trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, global_step)
    eval_model = model_test_mode(args, feeder, hparams, global_step)

    # TODO Visualize embeddings
    # Embeddings inputs metadata: one symbol per line, written only once.
    char_embedding_meta = os.path.join(meta_folder, 'CharacterEmbeddings.tsv')
    if not os.path.isfile(char_embedding_meta):
        with open(char_embedding_meta, 'w', encoding='utf-8') as f:
            for symbol in symbols:
                if symbol == ' ':
                    symbol = '\\s'  # For visual purposes, swap space with \s

                f.write('{}\n'.format(symbol))

    # Path rewritten relative to log_dir; currently only consumed by the
    # (disabled) projector update further down.
    char_embedding_meta = char_embedding_meta.replace(log_dir, '..')

    # Book keeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=5)

    log('Tacotron training set to a maximum of {} steps'.format(
        args.tacotron_train_steps))

    # Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    # Train
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)

            sess.run(tf.global_variables_initializer())

            # saved model restoring
            if args.restore:
                # Restore saved model if the user requested it, default = True
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

                    if (checkpoint_state
                            and checkpoint_state.model_checkpoint_path):
                        log('Loading checkpoint {}'.format(
                            checkpoint_state.model_checkpoint_path),
                            slack=True)
                        saver.restore(sess,
                                      checkpoint_state.model_checkpoint_path)

                    else:
                        log('No model to load at {}'.format(save_dir),
                            slack=True)
                        saver.save(sess,
                                   checkpoint_path,
                                   global_step=global_step)

                except tf.errors.OutOfRangeError as e:
                    log('Cannot restore checkpoint: {}'.format(e), slack=True)
            else:
                log('Starting new training!', slack=True)
                saver.save(sess, checkpoint_path, global_step=global_step)

            # initializing feeder
            feeder.start_threads(sess)

            # Training loop
            while not coord.should_stop() and step < args.tacotron_train_steps:
                start_time = time.time()
                # The optimize op is run for its side effect only.
                step, loss, _ = sess.run(
                    [global_step, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
                    step, time_window.average, loss, loss_window.average)
                log(message,
                    end='\r',
                    slack=(step % args.checkpoint_interval == 0))

                if loss > 100 or np.isnan(loss):
                    log('Loss exploded to {:.5f} at step {}'.format(
                        loss, step))
                    raise Exception('Loss exploded')

                if step % args.summary_interval == 0:
                    log('\nWriting summary at step {}'.format(step))
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.eval_interval == 0:
                    # Run eval and save eval stats
                    log('\nRunning evaluation at step {}'.format(step))

                    eval_losses = []
                    before_losses = []
                    after_losses = []
                    stop_token_losses = []
                    linear_losses = []
                    linear_loss = None
                    speaker_losses = []
                    speaker_loss = None

                    # Fetches that are always evaluated (positions 0-7).
                    eval_run = [
                        eval_model.tower_loss[0],
                        eval_model.tower_before_loss[0],
                        eval_model.tower_after_loss[0],
                        eval_model.tower_stop_token_loss[0],
                        eval_model.tower_mel_outputs[0][0],
                        eval_model.tower_mel_targets[0][0],
                        eval_model.tower_targets_lengths[0][0],
                        eval_model.tower_alignments[0][0]
                    ]

                    # Optional fetches: remember where each one lands in the
                    # fetch list instead of hard-coding its index, because the
                    # speaker loss position depends on whether the linear
                    # fetches were appended before it.
                    linear_index = None
                    if hparams.predict_linear:
                        linear_index = len(eval_run)
                        eval_run.append(eval_model.tower_linear_loss[0])
                        eval_run.append(eval_model.tower_linear_outputs[0][0])
                        eval_run.append(eval_model.tower_linear_targets[0][0])
                    speaker_index = None
                    if hparams.tacotron_multi_speaker:
                        speaker_index = len(eval_run)
                        eval_run.append(eval_model.tower_speaker_loss[0])

                    for _ in tqdm(range(feeder.test_steps)):
                        blob = sess.run(eval_run)

                        # mel_p / mel_t / t_len / align (and lin_p / lin_t)
                        # intentionally keep the values of the last batch;
                        # they are what gets rendered below.
                        (eloss, before_loss, after_loss, stop_token_loss,
                         mel_p, mel_t, t_len, align) = blob[:8]

                        eval_losses.append(eloss)
                        before_losses.append(before_loss)
                        after_losses.append(after_loss)
                        stop_token_losses.append(stop_token_loss)
                        if linear_index is not None:
                            linear_loss = blob[linear_index]
                            lin_p = blob[linear_index + 1]
                            lin_t = blob[linear_index + 2]
                            linear_losses.append(linear_loss)
                        if speaker_index is not None:
                            speaker_losses.append(blob[speaker_index])

                    if hparams.predict_linear:
                        linear_loss = sum(linear_losses) / len(linear_losses)

                        wav = audio.inv_linear_spectrogram(lin_p.T, hparams)
                        audio.save_wav(
                            wav,
                            os.path.join(
                                eval_wav_dir,
                                'step-{}-eval-wave-from-linear.wav'.format(
                                    step)),
                            sr=hparams.sample_rate)
                    if hparams.tacotron_multi_speaker:
                        speaker_loss = sum(speaker_losses) / len(
                            speaker_losses)

                    # Average the per-batch losses over the whole eval pass.
                    eval_loss = sum(eval_losses) / len(eval_losses)
                    before_loss = sum(before_losses) / len(before_losses)
                    after_loss = sum(after_losses) / len(after_losses)
                    stop_token_loss = sum(stop_token_losses) / len(
                        stop_token_losses)

                    log('Saving eval log to {}..'.format(eval_dir))
                    # Save some log to monitor model improvement on same unseen sequence
                    wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
                    audio.save_wav(
                        wav,
                        os.path.join(
                            eval_wav_dir,
                            'step-{}-eval-wave-from-mel.wav'.format(step)),
                        sr=hparams.sample_rate)

                    plot.plot_alignment(
                        align,
                        os.path.join(eval_plot_dir,
                                     'step-{}-eval-align.png'.format(step)),
                        title='{}, {}, step={}, loss={:.5f}'.format(
                            args.model, time_string(), step, eval_loss),
                        max_len=t_len // hparams.outputs_per_step)
                    plot.plot_spectrogram(
                        mel_p,
                        os.path.join(
                            eval_plot_dir,
                            'step-{}-eval-mel-spectrogram.png'.format(step)),
                        title='{}, {}, step={}, loss={:.5f}'.format(
                            args.model, time_string(), step, eval_loss),
                        target_spectrogram=mel_t,
                        max_len=t_len)

                    if hparams.predict_linear:
                        plot.plot_spectrogram(
                            lin_p,
                            os.path.join(
                                eval_plot_dir,
                                'step-{}-eval-linear-spectrogram.png'.format(
                                    step)),
                            title='{}, {}, step={}, loss={:.5f}'.format(
                                args.model, time_string(), step, eval_loss),
                            target_spectrogram=lin_t,
                            max_len=t_len,
                            auto_aspect=True)

                    log('Eval loss for global step {}: {:.3f}'.format(
                        step, eval_loss))
                    log('Writing eval summary!')
                    add_eval_stats(summary_writer, step, linear_loss,
                                   before_loss, after_loss, stop_token_loss,
                                   eval_loss, speaker_loss)

                if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or step == 300:
                    # Save model and current global step
                    saver.save(sess, checkpoint_path, global_step=global_step)

                    log('\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..'
                        )
                    if hparams.predict_linear:
                        input_seq, mel_prediction, linear_prediction, alignment, target, target_length, linear_target = sess.run(
                            [
                                model.tower_inputs[0][0],
                                model.tower_mel_outputs[0][0],
                                model.tower_linear_outputs[0][0],
                                model.tower_alignments[0][0],
                                model.tower_mel_targets[0][0],
                                model.tower_targets_lengths[0][0],
                                model.tower_linear_targets[0][0],
                            ])

                        # save predicted linear spectrogram to disk (debug)
                        linear_filename = 'linear-prediction-step-{}.npy'.format(
                            step)
                        np.save(os.path.join(linear_dir, linear_filename),
                                linear_prediction.T,
                                allow_pickle=False)

                        # save griffin lim inverted wav for debug (linear -> wav)
                        wav = audio.inv_linear_spectrogram(
                            linear_prediction.T, hparams)
                        audio.save_wav(
                            wav,
                            os.path.join(
                                wav_dir,
                                'step-{}-wave-from-linear.wav'.format(step)),
                            sr=hparams.sample_rate)

                        # Save real and predicted linear-spectrogram plot to disk (control purposes)
                        plot.plot_spectrogram(
                            linear_prediction,
                            os.path.join(
                                plot_dir,
                                'step-{}-linear-spectrogram.png'.format(step)),
                            title='{}, {}, step={}, loss={:.5f}'.format(
                                args.model, time_string(), step, loss),
                            target_spectrogram=linear_target,
                            max_len=target_length,
                            auto_aspect=True)

                    else:
                        input_seq, mel_prediction, alignment, target, target_length = sess.run(
                            [
                                model.tower_inputs[0][0],
                                model.tower_mel_outputs[0][0],
                                model.tower_alignments[0][0],
                                model.tower_mel_targets[0][0],
                                model.tower_targets_lengths[0][0],
                            ])

                    # save predicted mel spectrogram to disk (debug)
                    mel_filename = 'mel-prediction-step-{}.npy'.format(step)
                    np.save(os.path.join(mel_dir, mel_filename),
                            mel_prediction.T,
                            allow_pickle=False)

                    # save griffin lim inverted wav for debug (mel -> wav)
                    wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
                    audio.save_wav(
                        wav,
                        os.path.join(wav_dir,
                                     'step-{}-wave-from-mel.wav'.format(step)),
                        sr=hparams.sample_rate)

                    # save alignment plot to disk (control purposes)
                    plot.plot_alignment(
                        alignment,
                        os.path.join(plot_dir,
                                     'step-{}-align.png'.format(step)),
                        title='{}, {}, step={}, loss={:.5f}'.format(
                            args.model, time_string(), step, loss),
                        max_len=target_length // hparams.outputs_per_step)
                    # save real and predicted mel-spectrogram plot to disk (control purposes)
                    plot.plot_spectrogram(
                        mel_prediction,
                        os.path.join(
                            plot_dir,
                            'step-{}-mel-spectrogram.png'.format(step)),
                        title='{}, {}, step={}, loss={:.5f}'.format(
                            args.model, time_string(), step, loss),
                        target_spectrogram=target,
                        max_len=target_length)
                # TODO Find a way to revert encoded IPA to original IPA or
                # original text, then log the decoded input_seq here.

                if step % args.embedding_interval == 0 or step == args.tacotron_train_steps or step == 1:
                    # Get current checkpoint state. It is not consumed while
                    # the projector update below stays disabled.
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)
                    # TODO Visualize embeddings: re-enable the projector
                    # update, e.g.
                    # add_embedding_stats(summary_writer, [model.embedding_table.name], [char_embedding_meta], checkpoint_state.model_checkpoint_path)
                    log('\nSaving Model Character Embeddings visualization..')
                    log('Tacotron Character embeddings have been updated on tensorboard!'
                        )

            log('Tacotron training complete after {} global steps!'.format(
                args.tacotron_train_steps),
                slack=True)
            return save_dir

        except Exception as e:
            # Report the failure and ask the coordinator to stop; the
            # exception is not re-raised, so the caller receives None.
            log('Exiting due to exception: {}'.format(e), slack=True)
            traceback.print_exc()
            coord.request_stop(e)
コード例 #20
0
ファイル: train.py プロジェクト: qingyundou/tacotron_qdou
def train(log_dir, args, input):
    commit = get_git_commit() if args.git else 'None'
    checkpoint_path = os.path.join(log_dir, 'model.ckpt')
    input_path = os.path.join(args.base_dir, input)
    log('Checkpoint path: %s' % checkpoint_path)
    log('Loading training data from: %s' % input_path)
    log('Using model: %s' % args.variant)
    log(hparams_debug_string())

    # Set up DataFeeder:
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder') as scope:
        if args.eal_dir:
            from tacotron.datafeeder import DataFeeder_EAL
            feeder = DataFeeder_EAL(coord, input_path, hparams, args.eal_dir)
        else:
            from tacotron.datafeeder import DataFeeder
            feeder = DataFeeder(coord, input_path, hparams)

    # Set up model:
    global_step = tf.Variable(0, name='global_step', trainable=False)
    with tf.variable_scope('model') as scope:
        model = create_model(args.variant, hparams)
        if args.eal_dir:
            model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets,
                             feeder.linear_targets, feeder.pml_targets, is_training=True, 
                             eal=True, locked_alignments=feeder.locked_alignments, 
                             flag_trainAlign=args.eal_trainAlign, flag_trainJoint=args.eal_trainJoint, alignScale=args.eal_alignScale)
        else:
            model.initialize(feeder.inputs, feeder.input_lengths, feeder.mel_targets,
                             feeder.linear_targets, feeder.pml_targets, is_training=True, 
                             gta=True)
        model.add_loss()
        model.add_optimizer(global_step)
        stats = add_stats(model, eal_dir=args.eal_dir)

    # Bookkeeping:
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=5, keep_checkpoint_every_n_hours=2)

    # Set up fixed alignment synthesizer
    alignment_synth = AlignmentSynthesizer()

    # Set up text for synthesis
    fixed_sentence = 'Scientists at the CERN laboratory say they have discovered a new particle.'

    # Set up denormalisation parameters for synthesis
    mean_path = os.path.abspath(os.path.join(args.base_dir, input, '..', 'pml_data/mean.dat'))
    std_path = os.path.abspath(os.path.join(args.base_dir, input, '..', 'pml_data/std.dat'))
    log('Loading normalisation mean from: {}'.format(mean_path))
    log('Loading normalisation standard deviation from: {}'.format(std_path))
    mean_norm = None
    std_norm = None

    if os.path.isfile(mean_path) and os.path.isfile(std_path):
        mean_norm = np.fromfile(mean_path, 'float32')
        std_norm = np.fromfile(std_path, 'float32')

    # Train!
#     import pdb
#     flag_pdb = False
#     pdb.set_trace()
#     args.checkpoint_interval = 2
#     args.num_steps = 5
    
    with tf.Session() as sess:
        try:
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            sess.run(tf.global_variables_initializer())
            
#             pdb.set_trace()
            
            if args.restore_step:
                # Restore from a checkpoint if the user requested it.
                restore_path = '%s-%d' % (checkpoint_path, args.restore_step)
                saver.restore(sess, restore_path)
                log('Resuming from checkpoint: %s at commit: %s' % (restore_path, commit), slack=True)
            elif args.eal_dir and args.eal_ckpt:
                if args.eal_trainAlign or args.eal_trainJoint:
                    list_var = tf.trainable_variables() + [v for v in tf.global_variables() if 'moving' in v.name]
                    saver_eal = tf.train.Saver(list_var)
                    saver_eal.restore(sess, args.eal_ckpt)
                    log('Loaded weights and batchNorm cache of checkpoint: %s at commit: %s' % (args.eal_ckpt, commit), slack=True)
                elif args.eal_ft:
                    saver.restore(sess, args.eal_ckpt)
                    log('Refining the model from checkpoint: %s at commit: %s' % (args.eal_ckpt, commit), slack=True)
                else:
                    list_var = [var for var in tf.global_variables() if 'optimizer' not in var.name]
                    saver_eal = tf.train.Saver(list_var)
                    saver_eal.restore(sess, args.eal_ckpt)
                    log('Initializing the weights from checkpoint: %s at commit: %s' % (args.eal_ckpt, commit), slack=True)
#                 args.num_steps *= 2
#                 sess.run(global_step.assign(0))
            else:
                log('Starting new training run at commit: %s' % commit, slack=True)

            feeder.start_in_session(sess)
            step = 0  # initialise step variable so can use in while condition
            
            while not coord.should_stop() and step <= args.num_steps:
                
#                 pdb.set_trace()
                                
                start_time = time.time()
                if args.eal_trainAlign:
                    step, loss, loss_align, opt = sess.run([global_step, model.loss, model.loss_align, model.optimize])
#                     try:
#                         step, loss, loss_align, opt, tmp_a, tmp_ar = sess.run([global_step, model.loss, model.loss_align, model.optimize, 
#                                                                                model.alignments, model.alignments_ref])
#                     except:
#                         print("Oops!",sys.exc_info()[0],"occured.")
#                         flag_pdb = True
#                     if flag_pdb or np.isnan(loss_align):
#                         pdb.set_trace()
#                         flag_pdb = False
                    time_window.append(time.time() - start_time)
                    loss_window.append(loss_align)
                    message = 'Step %-7d [%.03f sec/step, loss=%.05f, loss_align=%.05f, avg_loss_align=%.05f]' % (
                        step, time_window.average, loss, loss_align, loss_window.average)
                elif args.eal_trainJoint:
                    step, loss, loss_align, loss_joint, opt = sess.run([global_step, model.loss, model.loss_align, 
                                                                        model.loss_joint, model.optimize])
                    time_window.append(time.time() - start_time)
                    loss_window.append(loss_joint)
                    message = 'Step %-7d [%.03f sec/step, loss=%.05f, loss_align=%.05f, avg_loss_joint=%.05f]' % (
                        step, time_window.average, loss, loss_align, loss_window.average)
                else:
                    step, loss, opt = sess.run([global_step, model.loss, model.optimize])
                    time_window.append(time.time() - start_time)
                    loss_window.append(loss)
                    message = 'Step %-7d [%.03f sec/step, loss=%.05f, avg_loss=%.05f]' % (
                        step, time_window.average, loss, loss_window.average)
                log(message, slack=(step % args.checkpoint_interval == 0))
                
                if loss > 100 or math.isnan(loss):
                    log('Loss exploded to %.05f at step %d!' % (loss, step), slack=True)
                    raise Exception('Loss Exploded')

                if step % args.summary_interval == 0:
                    log('Writing summary at step: %d' % step)
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.checkpoint_interval == 0:
                    log('Saving checkpoint to: %s-%d' % (checkpoint_path, step))
                    saver.save(sess, checkpoint_path, global_step=step)
                    log('Saving audio and alignment...')
                    summary_elements = []

                    # if the model has linear spectrogram features, use them to synthesize audio
                    if hasattr(model, 'linear_targets'):
                        input_seq, alignment, target_spectrogram, spectrogram = sess.run([
                            model.inputs[0], model.alignments[0], model.linear_targets[0], model.linear_outputs[0]])

                        output_waveform = audio.inv_spectrogram(spectrogram.T)
                        target_waveform = audio.inv_spectrogram(target_spectrogram.T)
                        audio.save_wav(output_waveform, os.path.join(log_dir, 'step-%d-audio.wav' % step))
                        audio.save_wav(target_waveform, os.path.join(log_dir, 'step-%d-target-audio.wav' % step))
                    # otherwise, synthesize audio from PML vocoder features
                    elif hasattr(model, 'pml_targets'):
                        input_seq, alignment, target_pml_features, pml_features = sess.run([
                            model.inputs[0], model.alignments[0], model.pml_targets[0], model.pml_outputs[0]])

                        cfg = Configuration(hparams.sample_rate, hparams.pml_dimension)
                        synth = PMLSynthesizer(cfg)
                        output_waveform = synth.pml_to_wav(pml_features, mean_norm=mean_norm, std_norm=std_norm,
                                                           spec_type=hparams.spec_type)
                        target_waveform = synth.pml_to_wav(target_pml_features, mean_norm=mean_norm, std_norm=std_norm,
                                                           spec_type=hparams.spec_type)

                        sp.wavwrite(os.path.join(log_dir, 'step-%d-target-audio.wav' % step), target_waveform,
                                    hparams.sample_rate, norm_max_ifneeded=True)
                        sp.wavwrite(os.path.join(log_dir, 'step-%d-audio.wav' % step), output_waveform,
                                    hparams.sample_rate, norm_max_ifneeded=True)

                    # we need to adjust the output and target waveforms so the values lie in the interval [-1.0, 1.0]
                    output_waveform /= 1.05 * np.max(np.abs(output_waveform))
                    target_waveform /= 1.05 * np.max(np.abs(target_waveform))

                    summary_elements.append(
                        tf.summary.audio('ideal-%d' % step, np.expand_dims(target_waveform, 0), hparams.sample_rate),
                    )

                    summary_elements.append(
                        tf.summary.audio('sample-%d' % step, np.expand_dims(output_waveform, 0), hparams.sample_rate),
                    )

                    # get the alignment for the top sentence in the batch
                    random_attention_plot = plot.plot_alignment(alignment, os.path.join(log_dir,
                                                                                        'step-%d-random-align.png' % step),
                                                                info='%s, %s, %s, step=%d, loss=%.5f' % (
                                                                args.variant, commit, time_string(), step, loss))

                    summary_elements.append(
                        tf.summary.image('attention-%d' % step, random_attention_plot),
                    )

                    # also process the alignment for a fixed sentence for comparison
                    alignment_synth.load('%s-%d' % (checkpoint_path, step), hparams, model_name=args.variant)
                    fixed_alignment = alignment_synth.synthesize(fixed_sentence)
                    fixed_attention_plot = plot.plot_alignment(fixed_alignment,
                                                               os.path.join(log_dir, 'step-%d-fixed-align.png' % step),
                                                               info='%s, %s, %s, step=%d, loss=%.5f' % (
                                                               args.variant, commit, time_string(), step, loss))

                    summary_elements.append(
                        tf.summary.image('fixed-attention-%d' % step, fixed_attention_plot),
                    )

                    # save the audio and alignment to tensorboard (audio sample rate is hyperparameter)
                    merged = sess.run(tf.summary.merge(summary_elements))

                    summary_writer.add_summary(merged, step)

                    log('Input: %s' % sequence_to_text(input_seq))

        except Exception as e:
            log('Exiting due to exception: %s' % e, slack=True)
            traceback.print_exc()
            coord.request_stop(e)
コード例 #21
0
ファイル: train.py プロジェクト: tmbtw/Tacotron-2
def train(log_dir, args):
    """Train the model named by ``args.model`` and periodically dump debug artifacts.

    Builds the data feeder and the model graph, optionally restores the latest
    checkpoint found under ``<log_dir>/pretrained/``, then runs the optimizer
    in a loop until the coordinator requests a stop.

    Every ``args.summary_interval`` steps the summary op is evaluated and
    written to ``log_dir``. Every ``args.checkpoint_interval`` steps the
    function writes the step number to ``step_counter.txt``, saves a
    checkpoint, and stores the predicted mel (and, when
    ``hparams.predict_linear`` is set, linear) spectrogram of the first batch
    item together with inverted waveforms and alignment/spectrogram plots.

    Args:
        log_dir: Directory receiving checkpoints, plots, wavs, spectrogram
            dumps, summaries and ``step_counter.txt``.
        args: Object exposing ``base_dir``, ``input``, ``model``, ``restore``,
            ``summary_interval`` and ``checkpoint_interval``.

    Note:
        Exceptions raised inside the session (including the deliberate
        'Loss exploded' one when the loss exceeds 100 or is NaN) are not
        propagated: they are logged, their traceback is printed and they are
        handed to the coordinator through ``coord.request_stop``.
    """
    save_dir = os.path.join(log_dir, 'pretrained/')
    checkpoint_path = os.path.join(save_dir, 'model.ckpt')
    input_path = os.path.join(args.base_dir, args.input)
    plot_dir = os.path.join(log_dir, 'plots')
    wav_dir = os.path.join(log_dir, 'wavs')
    mel_dir = os.path.join(log_dir, 'mel-spectrograms')
    #Make sure the checkpoint directory exists before the saver writes into it
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(mel_dir, exist_ok=True)

    if hparams.predict_linear:
        linear_dir = os.path.join(log_dir, 'linear-spectrograms')
        os.makedirs(linear_dir, exist_ok=True)

    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(input_path))
    log('Using model: {}'.format(args.model))
    log(hparams_debug_string())

    #Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder'):
        feeder = Feeder(coord, input_path, hparams)

    #Set up model:
    step_count = 0
    step_counter_path = os.path.join(log_dir, 'step_counter.txt')
    try:
        #simple text file to keep count of global step
        with open(step_counter_path, 'r') as counter_file:
            step_count = int(counter_file.read())
    except (OSError, ValueError):
        #Missing/unreadable file or unparsable content: start counting from 0.
        #Anything else (e.g. KeyboardInterrupt) is no longer swallowed.
        print(
            'no step_counter file found, assuming there is no saved checkpoint'
        )

    global_step = tf.Variable(step_count, name='global_step', trainable=False)
    with tf.variable_scope('model'):
        model = create_model(args.model, hparams)
        if hparams.predict_linear:
            model.initialize(feeder.inputs, feeder.input_lengths,
                             feeder.mel_targets, feeder.token_targets,
                             feeder.linear_targets)
        else:
            model.initialize(feeder.inputs, feeder.input_lengths,
                             feeder.mel_targets, feeder.token_targets)
        model.add_loss()
        model.add_optimizer(global_step)
        stats = add_stats(model)

    #Book keeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=5)

    #Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    #Train
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

            #saved model restoring
            #Bind the name up front: it is read below even when restoring is
            #disabled or when the checkpoint lookup fails.
            checkpoint_state = None
            if args.restore:
                #Restore saved model if the user requested it, Default = True.
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)
                except tf.errors.OutOfRangeError as e:
                    log('Cannot restore checkpoint: {}'.format(e))

            if checkpoint_state and checkpoint_state.model_checkpoint_path:
                log('Loading checkpoint {}'.format(
                    checkpoint_state.model_checkpoint_path))
                saver.restore(sess, checkpoint_state.model_checkpoint_path)
            elif not args.restore:
                log('Starting new training!')
            else:
                log('No model to load at {}'.format(save_dir))

            #initiating feeder
            feeder.start_in_session(sess)

            #Training loop
            while not coord.should_stop():
                start_time = time.time()
                #model.optimize is fetched only to apply the update; its
                #returned value is not used
                step, loss, _ = sess.run(
                    [global_step, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
                    step, time_window.average, loss, loss_window.average)
                log(message, end='\r')

                #Abort on divergence; handled by the except clause below
                if loss > 100 or np.isnan(loss):
                    log('Loss exploded to {:.5f} at step {}'.format(
                        loss, step))
                    raise Exception('Loss exploded')

                if step % args.summary_interval == 0:
                    log('\nWriting summary at step: {}'.format(step))
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.checkpoint_interval == 0:
                    #Persist the step so the next run can seed global_step
                    with open(step_counter_path, 'w') as counter_file:
                        counter_file.write(str(step))
                    log('Saving checkpoint to: {}-{}'.format(
                        checkpoint_path, step))
                    saver.save(sess, checkpoint_path, global_step=step)

                    log('Saving alignment, Mel-Spectrograms and griffin-lim inverted waveform..'
                        )
                    #Only the first item of the batch is fetched for debugging
                    if hparams.predict_linear:
                        input_seq, mel_prediction, linear_prediction, alignment, target = sess.run(
                            [
                                model.inputs[0],
                                model.mel_outputs[0],
                                model.linear_outputs[0],
                                model.alignments[0],
                                model.mel_targets[0],
                            ])

                        #save predicted linear spectrogram to disk (debug)
                        linear_filename = 'linear-prediction-step-{}.npy'.format(
                            step)
                        np.save(os.path.join(linear_dir, linear_filename),
                                linear_prediction.T,
                                allow_pickle=False)

                        #save griffin lim inverted wav for debug (linear -> wav)
                        wav = audio.inv_linear_spectrogram(linear_prediction.T)
                        audio.save_wav(
                            wav,
                            os.path.join(
                                wav_dir,
                                'step-{}-waveform-linear.wav'.format(step)))

                    else:
                        input_seq, mel_prediction, alignment, target = sess.run(
                            [
                                model.inputs[0],
                                model.mel_outputs[0],
                                model.alignments[0],
                                model.mel_targets[0],
                            ])

                    #save predicted mel spectrogram to disk (debug)
                    mel_filename = 'mel-prediction-step-{}.npy'.format(step)
                    np.save(os.path.join(mel_dir, mel_filename),
                            mel_prediction.T,
                            allow_pickle=False)

                    #save griffin lim inverted wav for debug (mel -> wav)
                    wav = audio.inv_mel_spectrogram(mel_prediction.T)
                    audio.save_wav(
                        wav,
                        os.path.join(wav_dir,
                                     'step-{}-waveform-mel.wav'.format(step)))

                    #save alignment plot to disk (control purposes)
                    plot.plot_alignment(
                        alignment,
                        os.path.join(plot_dir,
                                     'step-{}-align.png'.format(step)),
                        info='{}, {}, step={}, loss={:.5f}'.format(
                            args.model, time_string(), step, loss))
                    #save real mel-spectrogram plot to disk (control purposes)
                    plot.plot_spectrogram(
                        target,
                        os.path.join(
                            plot_dir,
                            'step-{}-real-mel-spectrogram.png'.format(step)),
                        info='{}, {}, step={}, Real'.format(
                            args.model, time_string(), step))
                    #save predicted mel-spectrogram plot to disk (control purposes)
                    plot.plot_spectrogram(
                        mel_prediction,
                        os.path.join(
                            plot_dir,
                            'step-{}-pred-mel-spectrogram.png'.format(step)),
                        info='{}, {}, step={}, loss={:.5}'.format(
                            args.model, time_string(), step, loss))
                    log('Input at step {}: {}'.format(
                        step, sequence_to_text(input_seq)))

        except Exception as e:
            #Top-level boundary: report the failure and stop the feeder threads
            log('Exiting due to exception: {}'.format(e), slack=True)
            traceback.print_exc()
            coord.request_stop(e)
コード例 #22
0
							before_losses.append(before_loss)
							after_losses.append(after_loss)
							stop_token_losses.append(stop_token_loss)

					eval_loss = sum(eval_losses) / len(eval_losses)
					before_loss = sum(before_losses) / len(before_losses)
					after_loss = sum(after_losses) / len(after_losses)
					stop_token_loss = sum(stop_token_losses) / len(stop_token_losses)

					log('Saving eval log to {}..'.format(eval_dir))
					#Save some log to monitor model improvement on same unseen sequence
					wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
					audio.save_wav(wav, os.path.join(eval_wav_dir, 'step-{}-eval-wave-from-mel.wav'.format(step)), sr=hparams.sample_rate)

					plot.plot_alignment(align, os.path.join(eval_plot_dir, 'step-{}-eval-align.png'.format(step)),
						title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eval_loss),
						max_len=t_len // hparams.outputs_per_step)
					plot.plot_spectrogram(mel_p, os.path.join(eval_plot_dir, 'step-{}-eval-mel-spectrogram.png'.format(step)),
						title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eval_loss), target_spectrogram=mel_t,
						max_len=t_len)

					if hparams.predict_linear:
						plot.plot_spectrogram(lin_p, os.path.join(eval_plot_dir, 'step-{}-eval-linear-spectrogram.png'.format(step)),
							title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eval_loss), target_spectrogram=lin_t,
							max_len=t_len, auto_aspect=True)

					log('Eval loss for global step {}: {:.3f}'.format(step, eval_loss))
					log('Writing eval summary!')
					add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss, eval_loss)

コード例 #23
0
ファイル: train.py プロジェクト: edgarplor/Tacotron-2
def train(log_dir, args, hparams):
    save_dir = os.path.join(log_dir, 'taco_pretrained')
    plot_dir = os.path.join(log_dir, 'plots')
    wav_dir = os.path.join(log_dir, 'wavs')
    mel_dir = os.path.join(log_dir, 'mel-spectrograms')
    eval_dir = os.path.join(log_dir, 'eval-dir')
    eval_plot_dir = os.path.join(eval_dir, 'plots')
    eval_wav_dir = os.path.join(eval_dir, 'wavs')
    tensorboard_dir = os.path.join(log_dir, 'tacotron_events')
    meta_folder = os.path.join(log_dir, 'metas')
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(mel_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(eval_plot_dir, exist_ok=True)
    os.makedirs(eval_wav_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)
    os.makedirs(meta_folder, exist_ok=True)

    checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt')
    input_path = os.path.join(args.base_dir, args.tacotron_input)

    if hparams.predict_linear:
        linear_dir = os.path.join(log_dir, 'linear-spectrograms')
        os.makedirs(linear_dir, exist_ok=True)

    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(input_path))
    log('Using model: {}'.format(args.model))
    log(hparams_debug_string())

    #Start by setting a seed for repeatability
    tf.set_random_seed(hparams.tacotron_random_seed)

    #Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder') as scope:
        feeder = Feeder(coord, input_path, hparams)

    #Set up model:
    global_step = tf.Variable(0, name='global_step', trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, global_step)
    eval_model = model_test_mode(args, feeder, hparams, global_step)

    #Embeddings metadata
    char_embedding_meta = os.path.join(meta_folder, 'CharacterEmbeddings.tsv')
    if not os.path.isfile(char_embedding_meta):
        with open(char_embedding_meta, 'w', encoding='utf-8') as f:
            for symbol in symbols:
                if symbol == ' ':
                    symbol = '\\s'  #For visual purposes, swap space with \s

                f.write('{}\n'.format(symbol))

    char_embedding_meta = char_embedding_meta.replace(log_dir, '..')

    #Potential Griffin-Lim GPU setup
    if hparams.GL_on_GPU:
        GLGPU_mel_inputs = tf.placeholder(tf.float32, (None, hparams.num_mels),
                                          name='GLGPU_mel_inputs')
        GLGPU_lin_inputs = tf.placeholder(tf.float32, (None, hparams.num_freq),
                                          name='GLGPU_lin_inputs')

        GLGPU_mel_outputs = audio.inv_mel_spectrogram_tensorflow(
            GLGPU_mel_inputs, hparams)
        GLGPU_lin_outputs = audio.inv_linear_spectrogram_tensorflow(
            GLGPU_lin_inputs, hparams)

    #Book keeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=20)

    log('Tacotron training set to a maximum of {} steps'.format(
        args.tacotron_train_steps))

    #Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    #Train
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)

            sess.run(tf.global_variables_initializer())

            #saved model restoring
            if args.restore:
                # Restore saved model if the user requested it, default = True
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

                    if (checkpoint_state
                            and checkpoint_state.model_checkpoint_path):
                        log('Loading checkpoint {}'.format(
                            checkpoint_state.model_checkpoint_path),
                            slack=True)
                        ckpt = tf.train.load_checkpoint(
                            checkpoint_state.model_checkpoint_path)
                        variables = list(
                            ckpt.get_variable_to_shape_map().keys())
                        #print('=====================PRINTING VARS===============================')
                        #print(variables)
                        #drop_source_layers = ['Tacotron_model/inference/inputs_embedding','Tacotron_model/Tacotron_model/inference/inputs_embedding/Adam_1','Tacotron_model/Tacotron_model/inference/inputs_embedding/Adam']
                        #for v in tf.global_variables():
                        #	if not any(layer in v.op.name for layer in drop_source_layers):
                        #		print('Loading', v.op.name)
                        #		v.load(ckpt.get_tensor(v.op.name), session=sess)

                        # Initialize all variables needed for DS, but not loaded from ckpt
                        #init_op = tf.variables_initializer([v for v in tf.global_variables() if any(layer in v.op.name for layer in drop_source_layers)])
                        #sess.run(init_op)
                        saver.restore(sess,
                                      checkpoint_state.model_checkpoint_path)

                    else:
                        log('No model to load at {}'.format(save_dir),
                            slack=True)
                        saver.save(sess,
                                   checkpoint_path,
                                   global_step=global_step)

                except tf.errors.OutOfRangeError as e:
                    log('Cannot restore checkpoint: {}'.format(e), slack=True)
            else:
                log('Starting new training!', slack=True)
                saver.save(sess, checkpoint_path, global_step=global_step)

            #initializing feeder
            feeder.start_threads(sess)

            #Training loop
            while not coord.should_stop() and step < args.tacotron_train_steps:
                start_time = time.time()
                step, loss, opt = sess.run(
                    [global_step, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
                    step, time_window.average, loss, loss_window.average)
                log(message,
                    end='\r',
                    slack=(step % args.checkpoint_interval == 0))

                if np.isnan(loss):
                    log('Loss exploded to {:.5f} at step {}'.format(
                        loss, step))
                    raise Exception('Loss exploded')

                if step % args.summary_interval == 0:
                    log('\nWriting summary at step {}'.format(step))
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.eval_interval == 0:
                    #Run eval and save eval stats
                    log('\nRunning evaluation at step {}'.format(step))

                    eval_losses = []
                    before_losses = []
                    after_losses = []
                    stop_token_losses = []
                    linear_losses = []
                    linear_loss = None

                    if hparams.predict_linear:
                        for i in tqdm(range(feeder.test_steps)):
                            eloss, before_loss, after_loss, stop_token_loss, linear_loss, mel_p, mel_t, t_len, align, lin_p, lin_t = sess.run(
                                [
                                    eval_model.tower_loss[0],
                                    eval_model.tower_before_loss[0],
                                    eval_model.tower_after_loss[0],
                                    eval_model.tower_stop_token_loss[0],
                                    eval_model.tower_linear_loss[0],
                                    eval_model.tower_mel_outputs[0][0],
                                    eval_model.tower_mel_targets[0][0],
                                    eval_model.tower_targets_lengths[0][0],
                                    eval_model.tower_alignments[0][0],
                                    eval_model.tower_linear_outputs[0][0],
                                    eval_model.tower_linear_targets[0][0],
                                ])
                            eval_losses.append(eloss)
                            before_losses.append(before_loss)
                            after_losses.append(after_loss)
                            stop_token_losses.append(stop_token_loss)
                            linear_losses.append(linear_loss)
                        linear_loss = sum(linear_losses) / len(linear_losses)

                        if hparams.GL_on_GPU:
                            wav = sess.run(GLGPU_lin_outputs,
                                           feed_dict={GLGPU_lin_inputs: lin_p})
                            wav = audio.inv_preemphasis(
                                wav, hparams.preemphasis, hparams.preemphasize)
                        else:
                            wav = audio.inv_linear_spectrogram(
                                lin_p.T, hparams)
                        audio.save_wav(
                            wav,
                            os.path.join(
                                eval_wav_dir,
                                'step-{}-eval-wave-from-linear.wav'.format(
                                    step)),
                            sr=hparams.sample_rate)

                    else:
                        for i in tqdm(range(feeder.test_steps)):
                            eloss, before_loss, after_loss, stop_token_loss, mel_p, mel_t, t_len, align = sess.run(
                                [
                                    eval_model.tower_loss[0],
                                    eval_model.tower_before_loss[0],
                                    eval_model.tower_after_loss[0],
                                    eval_model.tower_stop_token_loss[0],
                                    eval_model.tower_mel_outputs[0][0],
                                    eval_model.tower_mel_targets[0][0],
                                    eval_model.tower_targets_lengths[0][0],
                                    eval_model.tower_alignments[0][0]
                                ])
                            eval_losses.append(eloss)
                            before_losses.append(before_loss)
                            after_losses.append(after_loss)
                            stop_token_losses.append(stop_token_loss)

                    eval_loss = sum(eval_losses) / len(eval_losses)
                    before_loss = sum(before_losses) / len(before_losses)
                    after_loss = sum(after_losses) / len(after_losses)
                    stop_token_loss = sum(stop_token_losses) / len(
                        stop_token_losses)

                    log('Saving eval log to {}..'.format(eval_dir))
                    #Save some log to monitor model improvement on same unseen sequence
                    if hparams.GL_on_GPU:
                        wav = sess.run(GLGPU_mel_outputs,
                                       feed_dict={GLGPU_mel_inputs: mel_p})
                        wav = audio.inv_preemphasis(wav, hparams.preemphasis,
                                                    hparams.preemphasize)
                    else:
                        wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
                    audio.save_wav(
                        wav,
                        os.path.join(
                            eval_wav_dir,
                            'step-{}-eval-wave-from-mel.wav'.format(step)),
                        sr=hparams.sample_rate)

                    plot.plot_alignment(
                        align,
                        os.path.join(eval_plot_dir,
                                     'step-{}-eval-align.png'.format(step)),
                        title='{}, {}, step={}, loss={:.5f}'.format(
                            args.model, time_string(), step, eval_loss),
                        max_len=t_len // hparams.outputs_per_step)
                    plot.plot_spectrogram(
                        mel_p,
                        os.path.join(
                            eval_plot_dir,
                            'step-{}-eval-mel-spectrogram.png'.format(step)),
                        title='{}, {}, step={}, loss={:.5f}'.format(
                            args.model, time_string(), step, eval_loss),
                        target_spectrogram=mel_t,
                        max_len=t_len)

                    if hparams.predict_linear:
                        plot.plot_spectrogram(
                            lin_p,
                            os.path.join(
                                eval_plot_dir,
                                'step-{}-eval-linear-spectrogram.png'.format(
                                    step)),
                            title='{}, {}, step={}, loss={:.5f}'.format(
                                args.model, time_string(), step, eval_loss),
                            target_spectrogram=lin_t,
                            max_len=t_len,
                            auto_aspect=True)

                    log('Eval loss for global step {}: {:.3f}'.format(
                        step, eval_loss))
                    log('Writing eval summary!')
                    add_eval_stats(summary_writer, step, linear_loss,
                                   before_loss, after_loss, stop_token_loss,
                                   eval_loss)

                if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or step == 300:
                    #Save model and current global step
                    saver.save(sess, checkpoint_path, global_step=global_step)

                    log('\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..'
                        )
                    if hparams.predict_linear:
                        input_seq, mel_prediction, linear_prediction, alignment, target, target_length, linear_target = sess.run(
                            [
                                model.tower_inputs[0][0],
                                model.tower_mel_outputs[0][0],
                                model.tower_linear_outputs[0][0],
                                model.tower_alignments[0][0],
                                model.tower_mel_targets[0][0],
                                model.tower_targets_lengths[0][0],
                                model.tower_linear_targets[0][0],
                            ])

                        #save predicted linear spectrogram to disk (debug)
                        linear_filename = 'linear-prediction-step-{}.npy'.format(
                            step)
                        np.save(os.path.join(linear_dir, linear_filename),
                                linear_prediction.T,
                                allow_pickle=False)

                        #save griffin lim inverted wav for debug (linear -> wav)
                        if hparams.GL_on_GPU:
                            wav = sess.run(GLGPU_lin_outputs,
                                           feed_dict={
                                               GLGPU_lin_inputs:
                                               linear_prediction
                                           })
                            wav = audio.inv_preemphasis(
                                wav, hparams.preemphasis, hparams.preemphasize)
                        else:
                            wav = audio.inv_linear_spectrogram(
                                linear_prediction.T, hparams)
                        audio.save_wav(
                            wav,
                            os.path.join(
                                wav_dir,
                                'step-{}-wave-from-linear.wav'.format(step)),
                            sr=hparams.sample_rate)

                        #Save real and predicted linear-spectrogram plot to disk (control purposes)
                        plot.plot_spectrogram(
                            linear_prediction,
                            os.path.join(
                                plot_dir,
                                'step-{}-linear-spectrogram.png'.format(step)),
                            title='{}, {}, step={}, loss={:.5f}'.format(
                                args.model, time_string(), step, loss),
                            target_spectrogram=linear_target,
                            max_len=target_length,
                            auto_aspect=True)

                    else:
                        input_seq, mel_prediction, alignment, target, target_length = sess.run(
                            [
                                model.tower_inputs[0][0],
                                model.tower_mel_outputs[0][0],
                                model.tower_alignments[0][0],
                                model.tower_mel_targets[0][0],
                                model.tower_targets_lengths[0][0],
                            ])

                    #save predicted mel spectrogram to disk (debug)
                    mel_filename = 'mel-prediction-step-{}.npy'.format(step)
                    np.save(os.path.join(mel_dir, mel_filename),
                            mel_prediction.T,
                            allow_pickle=False)

                    #save griffin lim inverted wav for debug (mel -> wav)
                    if hparams.GL_on_GPU:
                        wav = sess.run(
                            GLGPU_mel_outputs,
                            feed_dict={GLGPU_mel_inputs: mel_prediction})
                        wav = audio.inv_preemphasis(wav, hparams.preemphasis,
                                                    hparams.preemphasize)
                    else:
                        wav = audio.inv_mel_spectrogram(
                            mel_prediction.T, hparams)
                    audio.save_wav(
                        wav,
                        os.path.join(wav_dir,
                                     'step-{}-wave-from-mel.wav'.format(step)),
                        sr=hparams.sample_rate)

                    #save alignment plot to disk (control purposes)
                    plot.plot_alignment(
                        alignment,
                        os.path.join(plot_dir,
                                     'step-{}-align.png'.format(step)),
                        title='{}, {}, step={}, loss={:.5f}'.format(
                            args.model, time_string(), step, loss),
                        max_len=target_length // hparams.outputs_per_step)
                    #save real and predicted mel-spectrogram plot to disk (control purposes)
                    plot.plot_spectrogram(
                        mel_prediction,
                        os.path.join(
                            plot_dir,
                            'step-{}-mel-spectrogram.png'.format(step)),
                        title='{}, {}, step={}, loss={:.5f}'.format(
                            args.model, time_string(), step, loss),
                        target_spectrogram=target,
                        max_len=target_length)
                    log('Input at step {}: {}'.format(
                        step, sequence_to_text(input_seq)))

                if step % args.embedding_interval == 0 or step == args.tacotron_train_steps or step == 1:
                    #Get current checkpoint state
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

                    #Update Projector
                    log('\nSaving Model Character Embeddings visualization..')
                    add_embedding_stats(summary_writer,
                                        [model.embedding_table.name],
                                        [char_embedding_meta],
                                        checkpoint_state.model_checkpoint_path)
                    log('Tacotron Character embeddings have been updated on tensorboard!'
                        )

            log('Tacotron training complete after {} global steps!'.format(
                args.tacotron_train_steps),
                slack=True)
            return save_dir

        except Exception as e:
            log('Exiting due to exception: {}'.format(e), slack=True)
            traceback.print_exc()
            coord.request_stop(e)
コード例 #24
0
ファイル: train.py プロジェクト: duvtedudug/Tacotron-2
def train(log_dir, args, hparams):
	"""Run the Tacotron training loop with periodic summaries, evaluation and checkpoints.

	Output directories (checkpoints, plots, wavs, mel-spectrograms, eval plots/wavs and,
	when hparams.predict_linear is set, linear-spectrograms) are created under log_dir.

	Args:
		log_dir: Root directory under which every output directory is created.
		args: Run options. This function reads base_dir, tacotron_input, model, restore,
			tacotron_train_steps, summary_interval, eval_interval and checkpoint_interval.
		hparams: Hyper-parameters. This function reads predict_linear, tacotron_random_seed,
			sample_rate and outputs_per_step, and forwards the object to the feeder, the
			model builders and the audio inversion helpers.

	Returns:
		The checkpoint directory (save_dir) once the training loop finishes. When an
		exception is raised inside the session it is logged, the coordinator is asked to
		stop, and the function falls through, returning None.
	"""
	save_dir = os.path.join(log_dir, 'taco_pretrained/')
	checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt')
	input_path = os.path.join(args.base_dir, args.tacotron_input)
	plot_dir = os.path.join(log_dir, 'plots')
	wav_dir = os.path.join(log_dir, 'wavs')
	mel_dir = os.path.join(log_dir, 'mel-spectrograms')
	eval_dir = os.path.join(log_dir, 'eval-dir')
	eval_plot_dir = os.path.join(eval_dir, 'plots')
	eval_wav_dir = os.path.join(eval_dir, 'wavs')
	os.makedirs(eval_dir, exist_ok=True)
	os.makedirs(plot_dir, exist_ok=True)
	os.makedirs(wav_dir, exist_ok=True)
	os.makedirs(mel_dir, exist_ok=True)
	os.makedirs(eval_plot_dir, exist_ok=True)
	os.makedirs(eval_wav_dir, exist_ok=True)

	if hparams.predict_linear:
		linear_dir = os.path.join(log_dir, 'linear-spectrograms')
		os.makedirs(linear_dir, exist_ok=True)

	log('Checkpoint path: {}'.format(checkpoint_path))
	log('Loading training data from: {}'.format(input_path))
	log('Using model: {}'.format(args.model))
	log(hparams_debug_string())

	#Start by setting a seed for repeatability
	tf.set_random_seed(hparams.tacotron_random_seed)

	#Set up data feeder
	coord = tf.train.Coordinator()
	with tf.variable_scope('datafeeder'):
		feeder = Feeder(coord, input_path, hparams)

	#Set up model:
	global_step = tf.Variable(0, name='global_step', trainable=False)
	model, stats = model_train_mode(args, feeder, hparams, global_step)
	eval_model = model_test_mode(args, feeder, hparams, global_step)

	#Book keeping
	step = 0
	time_window = ValueWindow(100)
	loss_window = ValueWindow(100)
	saver = tf.train.Saver(max_to_keep=5)

	log('Tacotron training set to a maximum of {} steps'.format(args.tacotron_train_steps))

	#Memory allocation on the GPU as needed
	config = tf.ConfigProto()
	config.gpu_options.allow_growth = True

	#Train
	with tf.Session(config=config) as sess:
		try:
			summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
			sess.run(tf.global_variables_initializer())

			#saved model restoring
			#Default to None so the check below is well defined when restoring is
			#disabled or when the checkpoint lookup fails (previously an UnboundLocalError).
			checkpoint_state = None
			if args.restore:
				#Restore saved model if the user requested it, Default = True.
				try:
					checkpoint_state = tf.train.get_checkpoint_state(save_dir)
				except tf.errors.OutOfRangeError as e:
					log('Cannot restore checkpoint: {}'.format(e))

			if (checkpoint_state and checkpoint_state.model_checkpoint_path):
				log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path))
				saver.restore(sess, checkpoint_state.model_checkpoint_path)

			else:
				if not args.restore:
					log('Starting new training!')
				else:
					log('No model to load at {}'.format(save_dir))

			#initializing feeder
			feeder.start_threads(sess)

			#Training loop
			while not coord.should_stop() and step < args.tacotron_train_steps:
				start_time = time.time()
				#model.optimize is run for its side effect only, its result is discarded
				step, loss, _ = sess.run([global_step, model.loss, model.optimize])
				time_window.append(time.time() - start_time)
				loss_window.append(loss)
				message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
					step, time_window.average, loss, loss_window.average)
				log(message, end='\r')

				if np.isnan(loss):
					log('Loss exploded to {:.5f} at step {}'.format(loss, step))
					raise Exception('Loss exploded')

				if step % args.summary_interval == 0:
					log('\nWriting summary at step {}'.format(step))
					summary_writer.add_summary(sess.run(stats), step)

				if step % args.eval_interval == 0:
					#Run eval and save eval stats
					log('\nRunning evaluation at step {}'.format(step))

					eval_losses = []
					before_losses = []
					after_losses = []
					stop_token_losses = []
					linear_losses = []
					linear_loss = None

					if hparams.predict_linear:
						for _ in tqdm(range(feeder.test_steps)):
							eloss, before_loss, after_loss, stop_token_loss, linear_loss, mel_p, mel_t, t_len, align, lin_p = sess.run(
								[eval_model.loss, eval_model.before_loss, eval_model.after_loss,
								eval_model.stop_token_loss, eval_model.linear_loss, eval_model.mel_outputs[0],
								eval_model.mel_targets[0], eval_model.targets_lengths[0],
								eval_model.alignments[0], eval_model.linear_outputs[0]])
							eval_losses.append(eloss)
							before_losses.append(before_loss)
							after_losses.append(after_loss)
							stop_token_losses.append(stop_token_loss)
							linear_losses.append(linear_loss)
						linear_loss = sum(linear_losses) / len(linear_losses)

						#The saved eval audio/plots come from the last evaluated batch
						wav = audio.inv_linear_spectrogram(lin_p.T, hparams)
						audio.save_wav(wav, os.path.join(eval_wav_dir, 'step-{}-eval-waveform-linear.wav'.format(step)), sr=hparams.sample_rate)
					else:
						for _ in tqdm(range(feeder.test_steps)):
							eloss, before_loss, after_loss, stop_token_loss, mel_p, mel_t, t_len, align = sess.run(
								[eval_model.loss, eval_model.before_loss, eval_model.after_loss,
								eval_model.stop_token_loss, eval_model.mel_outputs[0], eval_model.mel_targets[0],
								eval_model.targets_lengths[0], eval_model.alignments[0]])
							eval_losses.append(eloss)
							before_losses.append(before_loss)
							after_losses.append(after_loss)
							stop_token_losses.append(stop_token_loss)

					#Average every loss over the evaluation batches
					eval_loss = sum(eval_losses) / len(eval_losses)
					before_loss = sum(before_losses) / len(before_losses)
					after_loss = sum(after_losses) / len(after_losses)
					stop_token_loss = sum(stop_token_losses) / len(stop_token_losses)

					log('Saving eval log to {}..'.format(eval_dir))
					#Save some log to monitor model improvement on same unseen sequence
					wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
					audio.save_wav(wav, os.path.join(eval_wav_dir, 'step-{}-eval-waveform-mel.wav'.format(step)), sr=hparams.sample_rate)

					plot.plot_alignment(align, os.path.join(eval_plot_dir, 'step-{}-eval-align.png'.format(step)),
						info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eloss),
						max_len=t_len // hparams.outputs_per_step)
					plot.plot_spectrogram(mel_p, os.path.join(eval_plot_dir, 'step-{}-eval-mel-spectrogram.png'.format(step)),
						info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eloss), target_spectrogram=mel_t,
						max_len=t_len)

					log('Eval loss for global step {}: {:.3f}'.format(step, eval_loss))
					log('Writing eval summary!')
					add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss, eval_loss)

				if step % args.checkpoint_interval == 0:
					#Save model and current global step
					saver.save(sess, checkpoint_path, global_step=global_step)

					log('\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..')
					if hparams.predict_linear:
						input_seq, mel_prediction, linear_prediction, alignment, target, target_length = sess.run([
							model.inputs[0],
							model.mel_outputs[0],
							model.linear_outputs[0],
							model.alignments[0],
							model.mel_targets[0],
							model.targets_lengths[0],
							])

						#save predicted linear spectrogram to disk (debug)
						linear_filename = 'linear-prediction-step-{}.npy'.format(step)
						np.save(os.path.join(linear_dir, linear_filename), linear_prediction.T, allow_pickle=False)

						#save griffin lim inverted wav for debug (linear -> wav)
						wav = audio.inv_linear_spectrogram(linear_prediction.T, hparams)
						audio.save_wav(wav, os.path.join(wav_dir, 'step-{}-wave-from-linear.wav'.format(step)), sr=hparams.sample_rate)

					else:
						input_seq, mel_prediction, alignment, target, target_length = sess.run([model.inputs[0],
							model.mel_outputs[0],
							model.alignments[0],
							model.mel_targets[0],
							model.targets_lengths[0],
							])

					#save predicted mel spectrogram to disk (debug)
					mel_filename = 'mel-prediction-step-{}.npy'.format(step)
					np.save(os.path.join(mel_dir, mel_filename), mel_prediction.T, allow_pickle=False)

					#save griffin lim inverted wav for debug (mel -> wav)
					wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
					audio.save_wav(wav, os.path.join(wav_dir, 'step-{}-wave-from-mel.wav'.format(step)), sr=hparams.sample_rate)

					#save alignment plot to disk (control purposes)
					plot.plot_alignment(alignment, os.path.join(plot_dir, 'step-{}-align.png'.format(step)),
						info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss),
						max_len=target_length // hparams.outputs_per_step)
					#save real and predicted mel-spectrogram plot to disk (control purposes)
					plot.plot_spectrogram(mel_prediction, os.path.join(plot_dir, 'step-{}-mel-spectrogram.png'.format(step)),
						info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss), target_spectrogram=target,
						max_len=target_length)
					log('Input at step {}: {}'.format(step, sequence_to_text(input_seq)))

			log('Tacotron training complete after {} global steps!'.format(args.tacotron_train_steps))
			return save_dir

		except Exception as e:
			#Top-level boundary: report the failure and stop the feeder threads via the coordinator
			log('Exiting due to exception: {}'.format(e))
			traceback.print_exc()
			coord.request_stop(e)
コード例 #25
0
def train(log_dir, args, hparams):
	"""Run the Tacotron training loop with periodic summaries, evaluation and checkpoints.

	Output directories (checkpoints, plots, wavs, mel-spectrograms, eval plots/wavs and,
	when hparams.predict_linear is set, linear-spectrograms) are created under log_dir.
	Training is aborted when the step loss exceeds 1000 or becomes NaN.

	Args:
		log_dir: Root directory under which every output directory is created.
		args: Run options. This function reads base_dir, tacotron_input, model, restore,
			tacotron_train_steps, summary_interval, eval_interval and checkpoint_interval.
		hparams: Hyper-parameters. This function reads predict_linear, tacotron_random_seed,
			sample_rate and outputs_per_step, and forwards the object to the feeder, the
			model builders and the audio inversion helpers.

	Returns:
		The checkpoint directory (save_dir) once the training loop finishes. When an
		exception is raised inside the session it is logged, the coordinator is asked to
		stop, and the function falls through, returning None.
	"""
	save_dir = os.path.join(log_dir, 'taco_pretrained/')
	checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt')
	input_path = os.path.join(args.base_dir, args.tacotron_input)
	plot_dir = os.path.join(log_dir, 'plots')
	wav_dir = os.path.join(log_dir, 'wavs')
	mel_dir = os.path.join(log_dir, 'mel-spectrograms')
	eval_dir = os.path.join(log_dir, 'eval-dir')
	eval_plot_dir = os.path.join(eval_dir, 'plots')
	eval_wav_dir = os.path.join(eval_dir, 'wavs')
	os.makedirs(eval_dir, exist_ok=True)
	os.makedirs(plot_dir, exist_ok=True)
	os.makedirs(wav_dir, exist_ok=True)
	os.makedirs(mel_dir, exist_ok=True)
	os.makedirs(eval_plot_dir, exist_ok=True)
	os.makedirs(eval_wav_dir, exist_ok=True)

	if hparams.predict_linear:
		linear_dir = os.path.join(log_dir, 'linear-spectrograms')
		os.makedirs(linear_dir, exist_ok=True)

	log('Checkpoint path: {}'.format(checkpoint_path))
	log('Loading training data from: {}'.format(input_path))
	log('Using model: {}'.format(args.model))
	log(hparams_debug_string())

	#Start by setting a seed for repeatability
	tf.set_random_seed(hparams.tacotron_random_seed)

	#Set up data feeder
	coord = tf.train.Coordinator()
	with tf.variable_scope('datafeeder'):
		feeder = Feeder(coord, input_path, hparams)

	#Set up model:
	global_step = tf.Variable(0, name='global_step', trainable=False)
	model, stats = model_train_mode(args, feeder, hparams, global_step)
	eval_model = model_test_mode(args, feeder, hparams, global_step)

	#Book keeping
	step = 0
	time_window = ValueWindow(100)
	loss_window = ValueWindow(100)
	saver = tf.train.Saver(max_to_keep=5)

	log('Tacotron training set to a maximum of {} steps'.format(args.tacotron_train_steps))

	#Memory allocation on the GPU as needed
	config = tf.ConfigProto()
	config.gpu_options.allow_growth = True

	#Train
	with tf.Session(config=config) as sess:
		try:
			summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
			sess.run(tf.global_variables_initializer())

			#saved model restoring
			#Default to None so the check below is well defined when restoring is
			#disabled or when the checkpoint lookup fails (previously an UnboundLocalError).
			checkpoint_state = None
			if args.restore:
				#Restore saved model if the user requested it, Default = True.
				try:
					checkpoint_state = tf.train.get_checkpoint_state(save_dir)
				except tf.errors.OutOfRangeError as e:
					log('Cannot restore checkpoint: {}'.format(e))

			if (checkpoint_state and checkpoint_state.model_checkpoint_path):
				log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path))
				saver.restore(sess, checkpoint_state.model_checkpoint_path)

			else:
				if not args.restore:
					log('Starting new training!')
				else:
					log('No model to load at {}'.format(save_dir))

			#initializing feeder
			feeder.start_threads(sess)

			#Training loop
			while not coord.should_stop() and step < args.tacotron_train_steps:
				start_time = time.time()
				#model.optimize is run for its side effect only, its result is discarded
				step, loss, _ = sess.run([global_step, model.loss, model.optimize])
				time_window.append(time.time() - start_time)
				loss_window.append(loss)
				message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
					step, time_window.average, loss, loss_window.average)
				log(message, end='\r')

				#Abort on a diverged (very large or NaN) loss
				if loss > 1000 or np.isnan(loss):
					log('Loss exploded to {:.5f} at step {}'.format(loss, step))
					raise Exception('Loss exploded')

				if step % args.summary_interval == 0:
					log('\nWriting summary at step {}'.format(step))
					summary_writer.add_summary(sess.run(stats), step)

				if step % args.eval_interval == 0:
					#Run eval and save eval stats
					log('\nRunning evaluation at step {}'.format(step))

					eval_losses = []
					before_losses = []
					after_losses = []
					stop_token_losses = []
					linear_losses = []
					linear_loss = None

					if hparams.predict_linear:
						for _ in tqdm(range(feeder.test_steps)):
							eloss, before_loss, after_loss, stop_token_loss, linear_loss, mel_p, mel_t, t_len, align, lin_p = sess.run(
								[eval_model.loss, eval_model.before_loss, eval_model.after_loss,
								eval_model.stop_token_loss, eval_model.linear_loss, eval_model.mel_outputs[0],
								eval_model.mel_targets[0], eval_model.targets_lengths[0],
								eval_model.alignments[0], eval_model.linear_outputs[0]])
							eval_losses.append(eloss)
							before_losses.append(before_loss)
							after_losses.append(after_loss)
							stop_token_losses.append(stop_token_loss)
							linear_losses.append(linear_loss)
						linear_loss = sum(linear_losses) / len(linear_losses)

						#The saved eval audio/plots come from the last evaluated batch
						wav = audio.inv_linear_spectrogram(lin_p.T, hparams)
						audio.save_wav(wav, os.path.join(eval_wav_dir, 'step-{}-eval-waveform-linear.wav'.format(step)), sr=hparams.sample_rate)
					else:
						for _ in tqdm(range(feeder.test_steps)):
							eloss, before_loss, after_loss, stop_token_loss, mel_p, mel_t, t_len, align = sess.run(
								[eval_model.loss, eval_model.before_loss, eval_model.after_loss,
								eval_model.stop_token_loss, eval_model.mel_outputs[0], eval_model.mel_targets[0],
								eval_model.targets_lengths[0], eval_model.alignments[0]])
							eval_losses.append(eloss)
							before_losses.append(before_loss)
							after_losses.append(after_loss)
							stop_token_losses.append(stop_token_loss)

					#Average every loss over the evaluation batches
					eval_loss = sum(eval_losses) / len(eval_losses)
					before_loss = sum(before_losses) / len(before_losses)
					after_loss = sum(after_losses) / len(after_losses)
					stop_token_loss = sum(stop_token_losses) / len(stop_token_losses)

					log('Saving eval log to {}..'.format(eval_dir))
					#Save some log to monitor model improvement on same unseen sequence
					wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
					audio.save_wav(wav, os.path.join(eval_wav_dir, 'step-{}-eval-waveform-mel.wav'.format(step)), sr=hparams.sample_rate)

					plot.plot_alignment(align, os.path.join(eval_plot_dir, 'step-{}-eval-align.png'.format(step)),
						info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eloss),
						max_len=t_len // hparams.outputs_per_step)
					plot.plot_spectrogram(mel_p, os.path.join(eval_plot_dir, 'step-{}-eval-mel-spectrogram.png'.format(step)),
						info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eloss), target_spectrogram=mel_t,
						max_len=t_len)

					log('Eval loss for global step {}: {:.3f}'.format(step, eval_loss))
					log('Writing eval summary!')
					add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss, eval_loss)

				if step % args.checkpoint_interval == 0:
					#Save model and current global step
					saver.save(sess, checkpoint_path, global_step=global_step)

					log('\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..')
					if hparams.predict_linear:
						input_seq, mel_prediction, linear_prediction, alignment, target, target_length = sess.run([
							model.inputs[0],
							model.mel_outputs[0],
							model.linear_outputs[0],
							model.alignments[0],
							model.mel_targets[0],
							model.targets_lengths[0],
							])

						#save predicted linear spectrogram to disk (debug)
						linear_filename = 'linear-prediction-step-{}.npy'.format(step)
						np.save(os.path.join(linear_dir, linear_filename), linear_prediction.T, allow_pickle=False)

						#save griffin lim inverted wav for debug (linear -> wav)
						wav = audio.inv_linear_spectrogram(linear_prediction.T, hparams)
						audio.save_wav(wav, os.path.join(wav_dir, 'step-{}-wave-from-linear.wav'.format(step)), sr=hparams.sample_rate)

					else:
						input_seq, mel_prediction, alignment, target, target_length = sess.run([model.inputs[0],
							model.mel_outputs[0],
							model.alignments[0],
							model.mel_targets[0],
							model.targets_lengths[0],
							])

					#save predicted mel spectrogram to disk (debug)
					mel_filename = 'mel-prediction-step-{}.npy'.format(step)
					np.save(os.path.join(mel_dir, mel_filename), mel_prediction.T, allow_pickle=False)

					#save griffin lim inverted wav for debug (mel -> wav)
					wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
					audio.save_wav(wav, os.path.join(wav_dir, 'step-{}-wave-from-mel.wav'.format(step)), sr=hparams.sample_rate)

					#save alignment plot to disk (control purposes)
					plot.plot_alignment(alignment, os.path.join(plot_dir, 'step-{}-align.png'.format(step)),
						info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss),
						max_len=target_length // hparams.outputs_per_step)
					#save real and predicted mel-spectrogram plot to disk (control purposes)
					plot.plot_spectrogram(mel_prediction, os.path.join(plot_dir, 'step-{}-mel-spectrogram.png'.format(step)),
						info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss), target_spectrogram=target,
						max_len=target_length)
					log('Input at step {}: {}'.format(step, sequence_to_text(input_seq)))

			log('Tacotron training complete after {} global steps!'.format(args.tacotron_train_steps))
			return save_dir

		except Exception as e:
			#Top-level boundary: report the failure and stop the feeder threads via the coordinator
			log('Exiting due to exception: {}'.format(e))
			traceback.print_exc()
			coord.request_stop(e)
コード例 #26
0
def train(log_dir, args, hparams):
    save_dir = os.path.join(log_dir, 'taco_pretrained')
    plot_dir = os.path.join(log_dir, 'plots')
    wav_dir = os.path.join(log_dir, 'wavs')
    mel_dir = os.path.join(log_dir, 'mel-spectrograms')
    eval_dir = os.path.join(log_dir, 'eval-dir')
    eval_plot_dir = os.path.join(eval_dir, 'plots')
    eval_wav_dir = os.path.join(eval_dir, 'wavs')
    tensorboard_dir = os.path.join(log_dir, 'tacotron_events')
    meta_folder = os.path.join(log_dir, 'metas')
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(mel_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(eval_plot_dir, exist_ok=True)
    os.makedirs(eval_wav_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)
    os.makedirs(meta_folder, exist_ok=True)

    checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt')
    input_path = os.path.join(args.base_dir, args.tacotron_input)

    if hparams.predict_linear:
        linear_dir = os.path.join(log_dir, 'linear-spectrograms')
        os.makedirs(linear_dir, exist_ok=True)

    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(input_path))
    log('Using model: {}'.format(args.model))
    log(hparams_debug_string())

    #Start by setting a seed for repeatability
    tf.set_random_seed(hparams.tacotron_random_seed)

    #Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder') as scope:
        feeder = Feeder(coord, input_path, hparams, args)

    #Set up model:
    global_step = tf.Variable(0, name='global_step', trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, global_step)
    eval_model = model_test_mode(args, hparams, model)
    # if args.TEST:
    # 	for v in tf.global_variables():
    # 		print(v)

    #Embeddings metadata
    char_embedding_meta = os.path.join(meta_folder, 'CharacterEmbeddings.tsv')
    if not os.path.isfile(char_embedding_meta):
        with open(char_embedding_meta, 'w', encoding='utf-8') as f:
            for symbol in symbols:
                if symbol == ' ':
                    symbol = '\\s'  #For visual purposes, swap space with \s

                f.write('{}\n'.format(symbol))

    char_embedding_meta = char_embedding_meta.replace(log_dir, '..')

    #Potential Griffin-Lim GPU setup
    if hparams.GL_on_GPU:
        GLGPU_mel_inputs = tf.placeholder(tf.float32, (None, hparams.num_mels),
                                          name='GLGPU_mel_inputs')
        GLGPU_lin_inputs = tf.placeholder(tf.float32, (None, hparams.num_freq),
                                          name='GLGPU_lin_inputs')

        GLGPU_mel_outputs = audio.inv_mel_spectrogram_tensorflow(
            GLGPU_mel_inputs, hparams)
        GLGPU_lin_outputs = audio.inv_linear_spectrogram_tensorflow(
            GLGPU_lin_inputs, hparams)

    #Book keeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    loss_bef_window = ValueWindow(100)
    loss_aft_window = ValueWindow(100)
    loss_stop_window = ValueWindow(100)
    loss_reg_window = ValueWindow(100)
    loss_emt_window = ValueWindow(100)
    loss_spk_window = ValueWindow(100)
    loss_orthog_window = ValueWindow(100)
    loss_up_emt_window = ValueWindow(100)
    loss_up_spk_window = ValueWindow(100)
    loss_mo_up_emt_window = ValueWindow(100)
    loss_mo_up_spk_window = ValueWindow(100)
    if args.nat_gan:
        d_loss_t_window = ValueWindow(100)
        d_loss_p_window = ValueWindow(100)
        d_loss_up_window = ValueWindow(100)
        g_loss_p_window = ValueWindow(100)
        g_loss_up_window = ValueWindow(100)

    saver = tf.train.Saver(max_to_keep=args.max_to_keep)

    if args.opt_ref_no_mo and not (args.restart_optimizer_r):
        print(
            "WILL ATTEMPT TO RESTORE OPTIMIZER R - SET ARGS.RESTART_OPTIMIZER_R IF RETRAINING A MODEL THAT DIDN'T HAVE THE OPTIMIZER R"
        )

    assert (not (args.restart_nat_gan_d and args.restore_nat_gan_d_sep))

    var_list = tf.global_variables()
    var_list = [v for v in var_list if not ('pretrained' in v.name)]
    var_list = [
        v for v in var_list
        if not ('nat_gan' in v.name or 'optimizer_n' in v.name)
    ] if (args.restart_nat_gan_d or args.restore_nat_gan_d_sep) else var_list
    var_list = [
        v for v in var_list
        if not ('optimizer_r' in v.name or 'optimizer_3' in v.name)
    ] if args.restart_optimizer_r else var_list
    saver_restore = tf.train.Saver(var_list=var_list)

    if args.unpaired and args.pretrained_emb_disc:
        saver_restore_emt_disc = tf.train.Saver(var_list=[
            v for v in tf.global_variables()
            if ('pretrained_ref_enc_emt' in v.name)
        ])
        saver_restore_spk_disc = tf.train.Saver(var_list=[
            v for v in tf.global_variables()
            if ('pretrained_ref_enc_spk' in v.name)
        ])
    elif args.unpaired and args.pretrained_emb_disc_all:
        saver_restore_emt_disc = tf.train.Saver(var_list=[
            v for v in tf.global_variables() if ('refnet_emt' in v.name)
        ])
        saver_restore_spk_disc = tf.train.Saver(var_list=[
            v for v in tf.global_variables() if ('refnet_spk' in v.name)
        ])

    if args.nat_gan:
        saver_nat_gan = tf.train.Saver(var_list=[
            v for v in tf.global_variables()
            if ('nat_gan' in v.name or 'optimizer_n' in v.name)
        ])
        save_dir_nat_gan = r'nat_gan/pretrained_model'

    log('Tacotron training set to a maximum of {} steps'.format(
        args.tacotron_train_steps))
    if hparams.tacotron_fine_tuning:
        print('FINE TUNING SET TO TRUE - MAKE SURE THIS IS WHAT YOU WANT!')

    #Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    eval_feed_dict, emt_labels, spk_labels, \
    basenames, basenames_refs = get_eval_feed_dict(hparams, args.synth_metadata_filename,
                  eval_model, args.input_dir, args.flip_spk_emt)

    #Train
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)
            # for x in tf.global_variables():
            # 	print(x)

            sess.run(tf.global_variables_initializer())
            #saved model restoring
            if args.restore:
                # Restore saved model if the user requested it, default = True
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

                    if (checkpoint_state
                            and checkpoint_state.model_checkpoint_path):
                        log('Loading checkpoint {}'.format(
                            checkpoint_state.model_checkpoint_path),
                            slack=True)
                        saver_restore.restore(
                            sess, checkpoint_state.model_checkpoint_path)

                    else:
                        raise ValueError(
                            'No model to load at {}'.format(save_dir))

                except tf.errors.OutOfRangeError as e:
                    log('Cannot restore checkpoint: {}'.format(e), slack=True)
            else:
                log('Starting new training!', slack=True)
                saver.save(sess, checkpoint_path, global_step=global_step)

            if args.unpaired and (args.pretrained_emb_disc
                                  or args.pretrained_emb_disc_all):
                save_dir_emt = r'spk_disc/pretrained_model_emt_disc'
                checkpoint_state_emt = tf.train.get_checkpoint_state(
                    save_dir_emt)
                saver_restore_emt_disc.restore(
                    sess, checkpoint_state_emt.model_checkpoint_path)
                log('Loaded Emotion Discriminator from checkpoint {}'.format(
                    checkpoint_state_emt.model_checkpoint_path),
                    slack=True)

                save_dir_spk = r'spk_disc/pretrained_model_spk_disc'
                checkpoint_state_spk = tf.train.get_checkpoint_state(
                    save_dir_spk)
                saver_restore_spk_disc.restore(
                    sess, checkpoint_state_spk.model_checkpoint_path)
                log('Loaded Speaker Discriminator from checkpoint {}'.format(
                    checkpoint_state_spk.model_checkpoint_path),
                    slack=True)

            if args.nat_gan and args.restore_nat_gan_d_sep:
                checkpoint_state_nat_gan = tf.train.get_checkpoint_state(
                    save_dir_nat_gan)
                saver_nat_gan.restore(
                    sess, checkpoint_state_nat_gan.model_checkpoint_path)
                log('Loaded Nat Gan Discriminator from checkpoint {}'.format(
                    checkpoint_state_nat_gan.model_checkpoint_path),
                    slack=True)

            #initializing feeder
            feeder.start_threads(sess)

            #Training loop
            while not coord.should_stop() and step < args.tacotron_train_steps:
                start_time = time.time()
                # vars = [global_step, model.loss, model.optimize,model.before_loss, model.after_loss,model.stop_token_loss,
                # 				model.regularization_loss,model.style_emb_loss_emt, model.style_emb_loss_spk, model.style_emb_orthog_loss]
                # out = [step, loss, opt, bef, aft, stop, reg, loss_emt, loss_spk, loss_orthog]
                # message = 'Step {:7d} {:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}, bef={:.5f}, aft={:.5f}, stop={:.5f},' \
                # 					'reg={:.5f}, emt={:.5f}, spk={:.5f}, orthog={:.5f}'.format(step, time_window.average, loss, loss_window.average,
                # 																																		 loss_bef_window.average, loss_aft_window.average,
                # 																																		 loss_stop_window.average, loss_reg_window.average,
                # 																																		 loss_emt_window.average, loss_spk_window.average,
                # 																																		 loss_orthog_window.average)
                # if args.unpaired:
                # 	vars += [model.style_emb_loss_up_emt, model.style_emb_loss_up_spk,model.style_emb_loss_mel_out_up_emt, model.style_emb_loss_mel_out_up_spk]
                # 	out += [loss_up_emt, loss_up_spk, loss_mo_up_emt, loss_mo_up_spk]
                # 	message += ' up_emt={:.5f}, up_spk={:.5f}, mo_up_emt={:.5f}, mo_up_spk={:.5f}]'.format(loss_up_emt_window.average,
                # 																																												loss_up_spk_window.average,
                # 																																												loss_mo_up_emt_window.average,
                # 																																												loss_mo_up_spk_window.average)
                # if False:
                # 	vars += [model.tower_style_emb_logit_emt[0], model.tower_emt_labels[0],model.tower_style_emb_logit_up_emt[0],
                # 					model.tower_emt_up_labels[0],model.tower_spk_labels[0]]
                # 	out += [emt_logit, emt_labels, emt_up_logit, emt_up_labels, spk_labels]
                #
                # out = sess.run([vars])

                if args.nat_gan and (args.restart_nat_gan_d
                                     or not (args.restore)) and step == 0:
                    log("Will start with Training Nat GAN Discriminator",
                        end='\r')
                    disc_epochs = 300 if args.unpaired else 200
                    disc_epochs = 0 if args.TEST else disc_epochs
                    for i in range(disc_epochs + 1):
                        d_loss_t, d_loss_p, d_loss_up,\
                        d_loss_t_emt, d_loss_p_emt, d_loss_up_emt, \
                        d_loss_t_spk, d_loss_p_spk, d_loss_up_spk, \
                        opt_n = sess.run([model.d_loss_targ, model.d_loss_p, model.d_loss_up,
                                                 model.d_loss_targ_emt, model.d_loss_p_emt, model.d_loss_up_emt,
                                                 model.d_loss_targ_spk, model.d_loss_p_spk, model.d_loss_up_spk,
                                                 model.optimize_n])
                        message = 'step: {}, d_loss_t={:.5f}, d_loss_p ={:.5f}, d_loss_up ={:.5f},' \
                             ' d_loss_t_emt={:.5f}, d_loss_p_emt ={:.5f}, d_loss_up_emt ={:.5f},' \
                             ' d_loss_t_spk={:.5f}, d_loss_p_spk ={:.5f}, d_loss_up_spk ={:.5f}'.format(i, d_loss_t, d_loss_p, d_loss_up,
                                                                 d_loss_t_emt, d_loss_p_emt, d_loss_up_emt,
                                                                 d_loss_t_spk, d_loss_p_spk, d_loss_up_spk)
                        log(message, end='\r')
                    os.makedirs(r'nat_gan', exist_ok=True)
                    os.makedirs(r'nat_gan/pretrained_model', exist_ok=True)
                    checkpoint_path_nat_gan = os.path.join(
                        save_dir_nat_gan, 'nat_gan_model.ckpt')
                    saver_nat_gan.save(sess,
                                       checkpoint_path_nat_gan,
                                       global_step=i)

                if args.nat_gan:
                    d_loss_t, d_loss_p, d_loss_up, opt_n = sess.run([
                        model.d_loss_targ, model.d_loss_p, model.d_loss_up,
                        model.optimize_n
                    ])

                if args.unpaired:
                    step, tfr, loss, opt, bef, aft, stop, reg, loss_emt, loss_spk, loss_orthog, \
                    loss_up_emt, loss_up_spk, loss_mo_up_emt, loss_mo_up_spk, g_loss_p, g_loss_up, mels, opt_r\
                    = sess.run([global_step, model.ratio, model.loss, model.optimize,model.before_loss, model.after_loss,model.stop_token_loss,
                        model.regularization_loss, model.style_emb_loss_emt, model.style_emb_loss_spk, model.style_emb_orthog_loss,
                        model.style_emb_loss_up_emt, model.style_emb_loss_up_spk,model.style_emb_loss_mel_out_up_emt,
                          model.style_emb_loss_mel_out_up_spk,model.g_loss_p, model.g_loss_up, model.tower_mel_outputs[0], model.optimize_r])

                else:
                    step, tfr, loss, opt, bef, aft, stop, reg, loss_emt, loss_spk, loss_orthog, \
                    loss_up_emt, loss_up_spk, loss_mo_up_emt, loss_mo_up_spk, g_loss_p, g_loss_up, mels,dec_out,opt_r = sess.run([global_step, model.helper._ratio, model.loss,
                        model.optimize, model.before_loss, model.after_loss, model.stop_token_loss,
                        model.regularization_loss, model.style_emb_loss_emt, model.style_emb_loss_spk, model.style_emb_orthog_loss,
                        model.style_emb_loss_up_emt, model.style_emb_loss_up_spk,model.style_emb_loss_mel_out_up_emt,
                        model.style_emb_loss_mel_out_up_spk, model.g_loss_p, model.g_loss_up, model.tower_mel_outputs[0],model.tower_decoder_output[0],model.optimize_r])

                    # step, loss, opt, bef, aft, stop, reg, loss_emt, loss_spk, loss_orthog, \
                    # loss_up_emt, loss_up_spk, loss_mo_up_emt, loss_mo_up_spk, g_loss_p, g_loss_up, mels,ref_emt,ref_spk,ref_up_emt,ref_up_spk,emb,enc_out,enc_out_up,\
                    # stop_pred, targ, inp, inp_len,targ_len,stop_targ,mels_up,dec_out,dec_out_up,opt_r\
                    # = sess.run([global_step, model.loss, model.optimize,model.before_loss, model.after_loss,model.stop_token_loss,
                    # 				model.regularization_loss, model.style_emb_loss_emt, model.style_emb_loss_spk, model.style_emb_orthog_loss,
                    # 				model.style_emb_loss_up_emt, model.style_emb_loss_up_spk,model.style_emb_loss_mel_out_up_emt,
                    # 						model.style_emb_loss_mel_out_up_spk,model.g_loss_p, model.g_loss_up, model.tower_mel_outputs[0],
                    # 						model.tower_refnet_out_emt[0],model.tower_refnet_out_spk[0],model.tower_refnet_out_up_emt[0],model.tower_refnet_out_up_spk[0],
                    # 						model.tower_embedded_inputs[0], model.tower_encoder_outputs[0],model.tower_encoder_outputs_up[0],model.tower_stop_token_prediction[0],
                    # 						model.tower_mel_targets[0],model.tower_inputs[0],model.tower_input_lengths[0],model.tower_targets_lengths[0],
                    # 						model.tower_stop_token_targets[0],model.tower_mel_outputs_up[0],model.tower_decoder_output[0],model.tower_decoder_output_up[0],model.optimize_r])
                    #
                    # if args.save_output_vars:
                    # 	import pandas as pd
                    # 	pd.DataFrame(emb[:, 0, :]).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\emb.csv')
                    # 	pd.DataFrame(enc_out[:, 0, :]).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\enc_out.csv')
                    # 	pd.DataFrame(enc_out_up[:, 0, :]).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\enc_out_up.csv')
                    # 	pd.DataFrame(stop_pred[:, :]).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\stop.csv')
                    # 	pd.DataFrame(targ[:, 0, :]).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\targ.csv')
                    # 	pd.DataFrame(inp[:, :]).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\inp.csv')
                    # 	pd.DataFrame(inp_len[:]).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\inp_len.csv')
                    # 	pd.DataFrame(targ_len[:]).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\targ_len.csv')
                    # 	pd.DataFrame(stop_targ[:, :]).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\stop_targ.csv')
                    # 	pd.DataFrame(mels_up[:, 0, 0:5]).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\mels_up.csv')
                    # 	pd.DataFrame(dec_out_up[:, 0, 0:5]).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\dec_out_up.csv')

                    if args.save_output_vars:
                        import pandas as pd
                        pd.DataFrame(mels[:, 0, 0:5]).to_csv(
                            r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\mels.csv'
                        )
                        pd.DataFrame(dec_out[:, 0, 0:5]).to_csv(
                            r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\dec_out.csv'
                        )

                # import pandas as pd
                # print(emt_logit.shape, emt_labels.shape)
                # if len(emt_logit.shape)>2:
                # 	emt_logit = emt_logit.squeeze(1)
                # 	emt_up_logit = emt_up_logit.squeeze(1)
                # emt_labels = emt_labels.reshape(-1,1)
                # emt_up_labels = emt_up_labels.reshape(-1, 1)
                # spk_labels = spk_labels.reshape(-1, 1)
                # df = np.concatenate((emt_logit,emt_labels,spk_labels,emt_up_logit,emt_up_labels),axis=1)
                # print(emt_labels)
                # print(emt_logit)
                # print(emt_up_labels)
                # print(emt_up_logit)
                #
                # pd.DataFrame(df).to_csv(r'C:\Users\t-mawhit\Documents\code\Tacotron-2\eval\mels_save\emt_logit_.001_up_10k.csv')
                # raise

                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                loss_bef_window.append(bef)
                loss_aft_window.append(aft)
                loss_stop_window.append(stop)
                loss_reg_window.append(reg)
                loss_emt_window.append(loss_emt)
                loss_spk_window.append(loss_spk)
                loss_orthog_window.append(loss_orthog)
                loss_up_emt_window.append(loss_up_emt)
                loss_up_spk_window.append(loss_up_spk)
                loss_mo_up_emt_window.append(loss_mo_up_emt)
                loss_mo_up_spk_window.append(loss_mo_up_spk)

                if args.nat_gan:
                    d_loss_t_window.append(d_loss_t)
                    d_loss_p_window.append(d_loss_p)
                    d_loss_up_window.append(d_loss_up)
                    g_loss_p_window.append(g_loss_p)
                    g_loss_up_window.append(g_loss_up)

                message = 'Step {:7d} {:.3f} sec/step, tfr={:.3f}, loss={:.5f}, avg_loss={:.5f}, bef={:.5f}, aft={:.5f}, stop={:.5f}, reg={:.5f}'.format(
                    step, time_window.average, tfr, loss, loss_window.average,
                    loss_bef_window.average, loss_aft_window.average,
                    loss_stop_window.average, loss_reg_window.average)
                if args.emt_attn:
                    message += ' emt={:.5f}, spk={:.5f}, spk_l2={:.5f}'.format(
                        loss_emt_window.average, loss_spk_window.average,
                        loss_orthog_window.average)
                else:
                    message += ' emt={:.5f}, spk={:.5f}, orthog={:.5f},'.format(
                        loss_emt_window.average, loss_spk_window.average,
                        loss_orthog_window.average)
                if args.unpaired:
                    message += ' up_emt={:.5f}, up_spk={:.5f}, mo_up_emt={:.5f}, mo_up_spk={:.5f}'.format(
                        loss_up_emt_window.average, loss_up_spk_window.average,
                        loss_mo_up_emt_window.average,
                        loss_mo_up_spk_window.average)
                if args.nat_gan:
                    message += ' d_loss_t={:.5f}, d_loss_p ={:.5f}, d_loss_up ={:.5f}, g_loss_p ={:.5f}, g_loss_up ={:.5f}'.format(
                        d_loss_t_window.average, d_loss_p_window.average,
                        d_loss_up_window.average, g_loss_p_window.average,
                        g_loss_up_window.average)

                log(message,
                    end='\r',
                    slack=(step % args.checkpoint_interval == 0))

                if np.isnan(loss) or loss > 100.:
                    log('Loss exploded to {:.5f} at step {}'.format(
                        loss, step))
                    raise Exception('Loss exploded')

                if step % args.summary_interval == 0:
                    log('\nWriting summary at step {}'.format(step))
                    summary_writer.add_summary(sess.run(stats), step)

                # if step % args.eval_interval == 0:
                # 	#Run eval and save eval stats
                # 	log('\nRunning evaluation and saving model at step {}'.format(step))
                # 	saver.save(sess, checkpoint_path, global_step=global_step)
                #
                # 	eval_losses = []
                # 	before_losses = []
                # 	after_losses = []
                # 	stop_token_losses = []
                # 	linear_losses = []
                # 	linear_loss = None
                #
                # 	if hparams.predict_linear:
                # 		for i in tqdm(range(feeder.test_steps)):
                # 			eloss, before_loss, after_loss, stop_token_loss, linear_loss, mel_p, mel_t, t_len, align, lin_p, lin_t = sess.run([
                # 				eval_model.tower_loss[0], eval_model.tower_before_loss[0], eval_model.tower_after_loss[0],
                # 				eval_model.tower_stop_token_loss[0], eval_model.tower_linear_loss[0], eval_model.tower_mel_outputs[0][0],
                # 				eval_model.tower_mel_targets[0][0], eval_model.tower_targets_lengths[0][0],
                # 				eval_model.tower_alignments[0][0], eval_model.tower_linear_outputs[0][0],
                # 				eval_model.tower_linear_targets[0][0],
                # 				])
                # 			eval_losses.append(eloss)
                # 			before_losses.append(before_loss)
                # 			after_losses.append(after_loss)
                # 			stop_token_losses.append(stop_token_loss)
                # 			linear_losses.append(linear_loss)
                # 		linear_loss = sum(linear_losses) / len(linear_losses)
                #
                # 		if hparams.GL_on_GPU:
                # 			wav = sess.run(GLGPU_lin_outputs, feed_dict={GLGPU_lin_inputs: lin_p})
                # 			wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
                # 		else:
                # 			wav = audio.inv_linear_spectrogram(lin_p.T, hparams)
                # 		audio.save_wav(wav, os.path.join(eval_wav_dir, 'step-{}-eval-wave-from-linear.wav'.format(step)), sr=hparams.sample_rate)
                #
                # 	else:
                # 		for i in tqdm(range(feeder.test_steps)):
                # 			eloss, before_loss, after_loss, stop_token_loss, input_seq, mel_p, mel_t, t_len, align = sess.run([
                # 				eval_model.tower_loss[0], eval_model.tower_before_loss[0], eval_model.tower_after_loss[0],
                # 				eval_model.tower_stop_token_loss[0],eval_model.tower_inputs[0][0], eval_model.tower_mel_outputs[0][0],
                # 				eval_model.tower_mel_targets[0][0],
                # 				eval_model.tower_targets_lengths[0][0], eval_model.tower_alignments[0][0]
                # 				])
                # 			eval_losses.append(eloss)
                # 			before_losses.append(before_loss)
                # 			after_losses.append(after_loss)
                # 			stop_token_losses.append(stop_token_loss)
                #
                # 	eval_loss = sum(eval_losses) / len(eval_losses)
                # 	before_loss = sum(before_losses) / len(before_losses)
                # 	after_loss = sum(after_losses) / len(after_losses)
                # 	stop_token_loss = sum(stop_token_losses) / len(stop_token_losses)
                #
                # 	# log('Saving eval log to {}..'.format(eval_dir))
                # 	#Save some log to monitor model improvement on same unseen sequence
                # 	if hparams.GL_on_GPU:
                # 		wav = sess.run(GLGPU_mel_outputs, feed_dict={GLGPU_mel_inputs: mel_p})
                # 		wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
                # 	else:
                # 		wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
                # 	audio.save_wav(wav, os.path.join(eval_wav_dir, 'step-{}-eval-wave-from-mel.wav'.format(step)), sr=hparams.sample_rate)
                #
                # 	input_seq = sequence_to_text(input_seq)
                # 	plot.plot_alignment(align, os.path.join(eval_plot_dir, 'step-{}-eval-align.png'.format(step)),
                # 		title='{}, {}, step={}, loss={:.5f}\n{}'.format(args.model, time_string(), step, eval_loss, input_seq),
                # 		max_len=t_len // hparams.outputs_per_step)
                # 	plot.plot_spectrogram(mel_p, os.path.join(eval_plot_dir, 'step-{}-eval-mel-spectrogram.png'.format(step)),
                # 		title='{}, {}, step={}, loss={:.5f}\n{}'.format(args.model, time_string(), step, eval_loss,input_seq), target_spectrogram=mel_t,
                # 		max_len=t_len)
                #
                # 	if hparams.predict_linear:
                # 		plot.plot_spectrogram(lin_p, os.path.join(eval_plot_dir, 'step-{}-eval-linear-spectrogram.png'.format(step)),
                # 			title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eval_loss), target_spectrogram=lin_t,
                # 			max_len=t_len, auto_aspect=True)
                #
                # 	log('Step {:7d} [eval loss: {:.3f}, before loss: {:.3f}, after loss: {:.3f}, stop loss: {:.3f}]'.format(step, eval_loss, before_loss, after_loss, stop_token_loss))
                # 	# log('Writing eval summary!')
                # 	add_eval_stats(summary_writer, step, linear_loss, before_loss, after_loss, stop_token_loss, eval_loss)

                if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or step == 300:
                    #Save model and current global step
                    saver.save(sess, checkpoint_path, global_step=global_step)

                    log('\nSaved model at step {}'.format(step))

                if step % args.eval_interval == 0:

                    if hparams.predict_linear:
                        raise ValueError('predict linear not implemented')
                        # input_seq, mel_prediction, linear_prediction, alignment, target, target_length, linear_target = sess.run([
                        # 	model.tower_inputs[0][0],
                        # 	model.tower_mel_outputs[0][0],
                        # 	model.tower_linear_outputs[0][0],
                        # 	model.tower_alignments[0][0],
                        # 	model.tower_mel_targets[0][0],
                        # 	model.tower_targets_lengths[0][0],
                        # 	model.tower_linear_targets[0][0],
                        # 	])
                        #
                        # #save predicted linear spectrogram to disk (debug)
                        # linear_filename = 'linear-prediction-step-{}.npy'.format(step)
                        # np.save(os.path.join(linear_dir, linear_filename), linear_prediction.T, allow_pickle=False)
                        #
                        # #save griffin lim inverted wav for debug (linear -> wav)
                        # if hparams.GL_on_GPU:
                        # 	wav = sess.run(GLGPU_lin_outputs, feed_dict={GLGPU_lin_inputs: linear_prediction})
                        # 	wav = audio.inv_preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
                        # else:
                        # 	wav = audio.inv_linear_spectrogram(linear_prediction.T, hparams)
                        # audio.save_wav(wav, os.path.join(wav_dir, 'step-{}-wave-from-linear.wav'.format(step)), sr=hparams.sample_rate)
                        #
                        # #Save real and predicted linear-spectrogram plot to disk (control purposes)
                        # plot.plot_spectrogram(linear_prediction, os.path.join(plot_dir, 'step-{}-linear-spectrogram.png'.format(step)),
                        # 	title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss), target_spectrogram=linear_target,
                        # 	max_len=target_length, auto_aspect=True)

                    else:
                        input_seqs, mels, alignments,\
                        stop_tokens = sess.run([eval_model.tower_inputs,
                              eval_model.tower_mel_outputs,
                              eval_model.tower_alignments,
                              eval_model.tower_stop_token_prediction],
                                 feed_dict=eval_feed_dict)

                        # num_evals = len(input_seqs) if False else 1
                        # for i in range(num_evals):
                        # 	input_seq = input_seqs[i]
                        # 	mel_prediction = mel_predictions[i]
                        # 	alignment = alignments[i]
                        # 	target = targets[i]
                        # 	target_length = target_lengths[i]
                        # 	emt = emts[i]
                        # 	spk = spks[i]
                        # 	if args.emt_attn and args.attn=='simple':
                        # 		alignment_emt = alignments_emt[0][i]

                        # Linearize outputs (n_gpus -> 1D)
                        inp = [
                            inp for gpu_inp in input_seqs for inp in gpu_inp
                        ]
                        mels = [mel for gpu_mels in mels for mel in gpu_mels]
                        # targets = [target for gpu_targets in targets for target in gpu_targets]
                        alignments = [
                            align for gpu_aligns in alignments
                            for align in gpu_aligns
                        ]
                        stop_tokens = [
                            token for gpu_token in stop_tokens
                            for token in gpu_token
                        ]

                        try:
                            target_lengths = get_output_lengths(stop_tokens)

                            # Take off the batch wise padding
                            mels = [
                                mel[:target_length, :]
                                for mel, target_length in zip(
                                    mels, target_lengths)
                            ]

                            T2_output_range = (
                                -hparams.max_abs_value, hparams.max_abs_value
                            ) if hparams.symmetric_mels else (
                                0, hparams.max_abs_value)
                            mels = [
                                np.clip(m, T2_output_range[0],
                                        T2_output_range[1]) for m in mels
                            ]

                            folder_bucket = 'step_{}'.format(step // 500)
                            folder_wavs_save = os.path.join(
                                wav_dir, folder_bucket)
                            folder_plot_save = os.path.join(
                                plot_dir, folder_bucket)
                            os.makedirs(folder_wavs_save, exist_ok=True)
                            os.makedirs(folder_plot_save, exist_ok=True)

                            for i, (mel, align, basename,
                                    basename_ref) in enumerate(
                                        zip(mels, alignments, basenames,
                                            basenames_refs)):

                                #save griffin lim inverted wav for debug (mel -> wav)
                                if hparams.GL_on_GPU:
                                    wav = sess.run(
                                        GLGPU_mel_outputs,
                                        feed_dict={GLGPU_mel_inputs: mel})
                                    wav = audio.inv_preemphasis(
                                        wav, hparams.preemphasis,
                                        hparams.preemphasize)
                                else:
                                    wav = audio.inv_mel_spectrogram(
                                        mel.T, hparams)
                                audio.save_wav(
                                    wav,
                                    os.path.join(
                                        folder_wavs_save,
                                        'step_{}_wav_{}_{}_{}.wav'.format(
                                            step, i, basename, basename_ref)),
                                    sr=hparams.sample_rate)

                                input_seq = sequence_to_text(inp[i])
                                #save alignment plot to disk (control purposes)
                                try:
                                    plot.plot_alignment(
                                        align,
                                        os.path.join(
                                            folder_plot_save,
                                            'step_{}_wav_{}_{}_{}_align.png'.
                                            format(step, i, basename,
                                                   basename_ref)),
                                        title='{}, {}, step={}\n{}'.format(
                                            args.model, time_string(), step,
                                            input_seq),
                                        max_len=target_lengths[i] //
                                        hparams.outputs_per_step)
                                except:
                                    print("failed to plot alignment")
                                try:
                                    #save real and predicted mel-spectrogram plot to disk (control purposes)
                                    plot.plot_spectrogram(
                                        mel,
                                        os.path.join(
                                            folder_plot_save,
                                            'step-{}-{}-mel-spectrogram.png'.
                                            format(step, i)),
                                        title='{}, {}, step={}\n{}'.format(
                                            args.model, time_string(), step,
                                            input_seq))
                                    # target_spectrogram=targets[i],
                                    # max_len=target_lengths[i])
                                except:
                                    print("failed to plot spectrogram")

                            log('Saved synthesized samples for step {}'.format(
                                step),
                                end='\r')
                        except:
                            print("Couldn't synthesize samples")
                        # log('Input at step {}: {}'.format(step, input_seq), end='\r')

                # if step % args.embedding_interval == 0 or step == args.tacotron_train_steps or step == 1:
                # 	#Get current checkpoint state
                # 	checkpoint_state = tf.train.get_checkpoint_state(save_dir)
                #
                # 	#Update Projector
                # 	log('\nSaving Model Character Embeddings visualization..')
                # 	add_embedding_stats(summary_writer, [model.embedding_table.name], [char_embedding_meta], checkpoint_state.model_checkpoint_path)
                # 	log('Tacotron Character embeddings have been updated on tensorboard!')

            log('Tacotron training complete after {} global steps!'.format(
                args.tacotron_train_steps),
                slack=True)
            return save_dir

        except Exception as e:
            log('Exiting due to exception: {}'.format(e), slack=True)
            traceback.print_exc()
            coord.request_stop(e)