Example #1
def eval_model(attention, mel_prediction, target_spectrogram, input_seq, step,
               plot_dir, mel_output_dir, wav_dir, sample_num, loss, hparams):
    # Save some results for evaluation
    attention_path = str(
        plot_dir.joinpath("attention_step_{}_sample_{}".format(
            step, sample_num)))
    save_attention(attention, attention_path)

    # save predicted mel spectrogram to disk (debug)
    mel_output_fpath = mel_output_dir.joinpath(
        "mel-prediction-step-{}_sample_{}.npy".format(step, sample_num))
    np.save(str(mel_output_fpath), mel_prediction, allow_pickle=False)

    # save griffin lim inverted wav for debug (mel -> wav)
    wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
    wav_fpath = wav_dir.joinpath("step-{}-wave-from-mel_sample_{}.wav".format(
        step, sample_num))
    audio.save_wav(wav, str(wav_fpath), sr=hparams.sample_rate)

    # save real and predicted mel-spectrogram plot to disk (control purposes)
    spec_fpath = plot_dir.joinpath(
        "step-{}-mel-spectrogram_sample_{}.png".format(step, sample_num))
    title_str = "{}, {}, step={}, loss={:.5f}".format("Tacotron",
                                                      time_string(), step,
                                                      loss)
    plot_spectrogram(mel_prediction,
                     str(spec_fpath),
                     title=title_str,
                     target_spectrogram=target_spectrogram,
                     max_len=target_spectrogram.size // hparams.num_mels)
    print("Input at step {}: {}".format(step, sequence_to_text(input_seq)))
Example #2
def run_mel_strip():
    from pathlib import Path

    import numpy as np
    from tools.spec_processor import find_silences
    from synthesizer.audio import inv_mel_spectrogram, save_wav
    from synthesizer.hparams import hparams
    from matplotlib import pyplot as plt
    inpath = Path(
        r'E:\lab\zhrtvc\zhrtvc\toolbox\saved_files\mels\wavs-P00173I-001_20170001P00173I0068.wav_1567509749_我家朵朵是世界上最漂亮的朵朵。。知道自己是什么样的人。要做什么。无需活在别人非议或期待里。你勤奋.npy'
    )
    data = np.load(inpath)
    data = data.T
    print(data.shape)
    end_idx = find_silences(data, min_silence_sec=0.5, hop_silence_sec=0.2)
    print(end_idx, len(data))
    out_dir = Path(r'data/syns')
    out_dir.mkdir(parents=True, exist_ok=True)  # ensure the output directory exists
    for i, pair in enumerate(zip(end_idx[:-1], end_idx[1:]), 1):
        a, b = pair
        wav = inv_mel_spectrogram(data[a[-1]:b[0]].T, hparams)
        save_wav(wav, out_dir.joinpath(f'sil-{i:02d}.wav'),
                 hparams.sample_rate)
    plt.imshow(data.T)
    plt.colorbar()
    plt.show()
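The slicing in the loop above is easier to follow with concrete numbers. A sketch, assuming (as the a[-1] and b[0] indexing suggests) that find_silences returns a list of (start_frame, end_frame) silence spans; the values here are invented for illustration:

# Hypothetical find_silences output: frame spans detected as silent.
silences = [(0, 40), (310, 355), (700, 720)]

# Each speech segment runs from the end of one silence to the start of the next.
segments = [(a[-1], b[0]) for a, b in zip(silences[:-1], silences[1:])]
print(segments)  # [(40, 310), (355, 700)]

Each data[a:b] slice is then transposed back to (num_mels, frames) and inverted to a wav on its own.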
Example #3
def train(log_dir, args, hparams):
    save_dir = os.path.join(log_dir, "taco_pretrained")
    plot_dir = os.path.join(log_dir, "plots")
    wav_dir = os.path.join(log_dir, "wavs")
    mel_dir = os.path.join(log_dir, "mel-spectrograms")
    eval_dir = os.path.join(log_dir, "eval-dir")
    eval_plot_dir = os.path.join(eval_dir, "plots")
    eval_wav_dir = os.path.join(eval_dir, "wavs")
    tensorboard_dir = os.path.join(log_dir, "tacotron_events")
    meta_folder = os.path.join(log_dir, "metas")
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(mel_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(eval_plot_dir, exist_ok=True)
    os.makedirs(eval_wav_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)
    os.makedirs(meta_folder, exist_ok=True)

    checkpoint_fpath = os.path.join(save_dir, "tacotron_model.ckpt")
    if hparams.if_use_speaker_classifier:
        metadat_fpath = os.path.join(args.synthesizer_root,
                                     "train_augment_speaker.txt")
    else:
        metadat_fpath = os.path.join(args.synthesizer_root, "train.txt")

    log("Checkpoint path: {}".format(checkpoint_fpath))
    log("Loading training data from: {}".format(metadat_fpath))
    log("Using model: Tacotron")
    log(hparams_debug_string())

    # Start by setting a seed for repeatability
    tf.set_random_seed(hparams.tacotron_random_seed)

    # Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope("datafeeder") as scope:
        feeder = Feeder(coord, metadat_fpath, hparams)

    # Set up model:
    global_step = tf.Variable(0, name="global_step", trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, global_step)
    eval_model = model_test_mode(args, feeder, hparams, global_step)

    # Embeddings metadata
    char_embedding_meta = os.path.join(meta_folder, "CharacterEmbeddings.tsv")
    if not os.path.isfile(char_embedding_meta):
        with open(char_embedding_meta, "w", encoding="utf-8") as f:
            for symbol in symbols:
                if symbol == " ":
                    symbol = "\\s"  # For visual purposes, swap space with \s

                f.write("{}\n".format(symbol))

    char_embedding_meta = char_embedding_meta.replace(log_dir, "..")

    # Book keeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=5)

    log("Tacotron training set to a maximum of {} steps".format(
        args.tacotron_train_steps))

    # Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    # Train
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)

            sess.run(tf.global_variables_initializer())

            # saved model restoring
            if args.restore:
                # Restore saved model if the user requested it, default = True
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

                    if checkpoint_state and checkpoint_state.model_checkpoint_path:
                        log("Loading checkpoint {}".format(
                            checkpoint_state.model_checkpoint_path),
                            slack=True)
                        saver.restore(sess,
                                      checkpoint_state.model_checkpoint_path)

                    else:
                        log("No model to load at {}".format(save_dir),
                            slack=True)
                        saver.save(sess,
                                   checkpoint_fpath,
                                   global_step=global_step)

                except tf.errors.OutOfRangeError as e:
                    log("Cannot restore checkpoint: {}".format(e), slack=True)
            else:
                log("Starting new training!", slack=True)
                saver.save(sess, checkpoint_fpath, global_step=global_step)

            # initializing feeder
            feeder.start_threads(sess)

            # Training loop
            while not coord.should_stop() and step < args.tacotron_train_steps:
                start_time = time.time()
                step, loss, adversial_loss, opt = sess.run([
                    global_step, model.loss, model.adversial_loss,
                    model.optimize
                ])
                loss -= adversial_loss  # model.loss includes the adversarial term; report it separately below
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = "Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}, adv_loss={:.5f}]".format(
                    step, time_window.average, loss, loss_window.average,
                    adversial_loss)
                log(message,
                    end="\r",
                    slack=(step % args.checkpoint_interval == 0))
                print(message)

                if loss > 100 or np.isnan(loss):
                    log("Loss exploded to {:.5f} at step {}".format(
                        loss, step))
                    raise Exception("Loss exploded")

                if step % args.summary_interval == 0:
                    log("\nWriting summary at step {}".format(step))
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.eval_interval == 0:
                    # Run eval and save eval stats
                    log("\nRunning evaluation at step {}".format(step))

                    eval_losses = []
                    before_losses = []
                    after_losses = []
                    stop_token_losses = []
                    linear_losses = []
                    linear_loss = None
                    adversial_losses = []

                    if hparams.predict_linear:
                        for i in tqdm(range(feeder.test_steps)):
                            (eloss, before_loss, after_loss, stop_token_loss,
                             linear_loss, mel_p, mel_t, t_len, align, lin_p,
                             lin_t) = sess.run([
                                 eval_model.tower_loss[0],
                                 eval_model.tower_before_loss[0],
                                 eval_model.tower_after_loss[0],
                                 eval_model.tower_stop_token_loss[0],
                                 eval_model.tower_linear_loss[0],
                                 eval_model.tower_mel_outputs[0][0],
                                 eval_model.tower_mel_targets[0][0],
                                 eval_model.tower_targets_lengths[0][0],
                                 eval_model.tower_alignments[0][0],
                                 eval_model.tower_linear_outputs[0][0],
                                 eval_model.tower_linear_targets[0][0],
                             ])
                            eval_losses.append(eloss)
                            before_losses.append(before_loss)
                            after_losses.append(after_loss)
                            stop_token_losses.append(stop_token_loss)
                            linear_losses.append(linear_loss)
                        linear_loss = sum(linear_losses) / len(linear_losses)

                        wav = audio.inv_linear_spectrogram(lin_p.T, hparams)
                        audio.save_wav(
                            wav,
                            os.path.join(
                                eval_wav_dir,
                                "step-{}-eval-wave-from-linear.wav".format(
                                    step)),
                            sr=hparams.sample_rate)

                    else:
                        for i in tqdm(range(feeder.test_steps)):
                            (eloss, before_loss, after_loss, stop_token_loss,
                             adversial_loss, mel_p, mel_t, t_len,
                             align) = sess.run([
                                 eval_model.tower_loss[0],
                                 eval_model.tower_before_loss[0],
                                 eval_model.tower_after_loss[0],
                                 eval_model.tower_stop_token_loss[0],
                                 eval_model.tower_adversial_loss[0],
                                 eval_model.tower_mel_outputs[0][0],
                                 eval_model.tower_mel_targets[0][0],
                                 eval_model.tower_targets_lengths[0][0],
                                 eval_model.tower_alignments[0][0],
                             ])
                            eval_losses.append(eloss)
                            before_losses.append(before_loss)
                            after_losses.append(after_loss)
                            stop_token_losses.append(stop_token_loss)
                            adversial_losses.append(adversial_loss)

                    eval_loss = sum(eval_losses) / len(eval_losses)
                    before_loss = sum(before_losses) / len(before_losses)
                    after_loss = sum(after_losses) / len(after_losses)
                    stop_token_loss = sum(stop_token_losses) / len(
                        stop_token_losses)
                    adversial_loss = sum(adversial_losses) / len(
                        adversial_losses)

                    log("Saving eval log to {}..".format(eval_dir))
                    # Save some log to monitor model improvement on same unseen sequence
                    wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
                    audio.save_wav(
                        wav,
                        os.path.join(
                            eval_wav_dir,
                            "step-{}-eval-wave-from-mel.wav".format(step)),
                        sr=hparams.sample_rate)

                    plot.plot_alignment(
                        align,
                        os.path.join(eval_plot_dir,
                                     "step-{}-eval-align.png".format(step)),
                        title="{}, {}, step={}, loss={:.5f}".format(
                            "Tacotron", time_string(), step, eval_loss),
                        max_len=t_len // hparams.outputs_per_step)
                    plot.plot_spectrogram(
                        mel_p,
                        os.path.join(
                            eval_plot_dir,
                            "step-{}-eval-mel-spectrogram.png".format(step)),
                        title="{}, {}, step={}, loss={:.5f}".format(
                            "Tacotron", time_string(), step, eval_loss),
                        target_spectrogram=mel_t,
                        max_len=t_len)

                    if hparams.predict_linear:
                        plot.plot_spectrogram(
                            lin_p,
                            os.path.join(
                                eval_plot_dir,
                                "step-{}-eval-linear-spectrogram.png".format(
                                    step)),
                            title="{}, {}, step={}, loss={:.5f}".format(
                                "Tacotron", time_string(), step, eval_loss),
                            target_spectrogram=lin_t,
                            max_len=t_len,
                            auto_aspect=True)

                    log("Eval loss for global step {}: {:.3f}".format(
                        step, eval_loss))
                    log("Writing eval summary!")
                    add_eval_stats(summary_writer, step, linear_loss,
                                   before_loss, after_loss, stop_token_loss,
                                   adversial_loss, eval_loss)

                if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or \
                        step == 300:
                    # Save model and current global step
                    saver.save(sess, checkpoint_fpath, global_step=global_step)

                    log("\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform.."
                        )
                    input_seq, mel_prediction, alignment, target, target_length = sess.run(
                        [
                            model.tower_inputs[0][0],
                            model.tower_mel_outputs[0][0],
                            model.tower_alignments[0][0],
                            model.tower_mel_targets[0][0],
                            model.tower_targets_lengths[0][0],
                        ])

                    # save predicted mel spectrogram to disk (debug)
                    mel_filename = "mel-prediction-step-{}.npy".format(step)
                    np.save(os.path.join(mel_dir, mel_filename),
                            mel_prediction.T,
                            allow_pickle=False)

                    # save griffin lim inverted wav for debug (mel -> wav)
                    wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
                    audio.save_wav(
                        wav,
                        os.path.join(wav_dir,
                                     "step-{}-wave-from-mel.wav".format(step)),
                        sr=hparams.sample_rate)

                    # save alignment plot to disk (control purposes)
                    plot.plot_alignment(
                        alignment,
                        os.path.join(plot_dir,
                                     "step-{}-align.png".format(step)),
                        title="{}, {}, step={}, loss={:.5f}".format(
                            "Tacotron", time_string(), step, loss),
                        max_len=target_length // hparams.outputs_per_step)
                    # save real and predicted mel-spectrogram plot to disk (control purposes)
                    plot.plot_spectrogram(
                        mel_prediction,
                        os.path.join(
                            plot_dir,
                            "step-{}-mel-spectrogram.png".format(step)),
                        title="{}, {}, step={}, loss={:.5f}".format(
                            "Tacotron", time_string(), step, loss),
                        target_spectrogram=target,
                        max_len=target_length)
                    #log("Input at step {}: {}".format(step, sequence_to_text(input_seq)))

                if step % args.embedding_interval == 0 or step == args.tacotron_train_steps or step == 1:
                    # Get current checkpoint state
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

                    # Update Projector
                    #log("\nSaving Model Character Embeddings visualization..")
                    #add_embedding_stats(summary_writer, [model.embedding_table.name],
                    #                    [char_embedding_meta],
                    #                    checkpoint_state.model_checkpoint_path)
                    #log("Tacotron Character embeddings have been updated on tensorboard!")

            log("Tacotron training complete after {} global steps!".format(
                args.tacotron_train_steps),
                slack=True)
            return save_dir

        except Exception as e:
            log("Exiting due to exception: {}".format(e), slack=True)
            traceback.print_exc()
            coord.request_stop(e)
Example #4
def griffin_lim(mel):
    """
    Inverts a mel spectrogram using Griffin-Lim. The mel spectrogram is expected to have been built
    with the same parameters present in hparams.py.
    """
    return audio.inv_mel_spectrogram(mel, hparams)
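A possible call site for this wrapper, assuming a hypothetical .npy file already in the (num_mels, frames) orientation that inv_mel_spectrogram expects:

import numpy as np

mel = np.load("some-mel.npy")  # hypothetical file, shape (num_mels, frames)
wav = griffin_lim(mel)         # Griffin-Lim phase reconstruction via audio.inv_mel_spectrogram
audio.save_wav(wav, "reconstructed.wav", sr=hparams.sample_rate)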
Example #5
    def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames,
                   embed_filenames):
        hparams = self._hparams
        cleaner_names = [x.strip() for x in hparams.cleaners.split(",")]

        assert 0 == len(texts) % self._hparams.tacotron_num_gpus
        seqs = [
            np.asarray(text_to_sequence(text, cleaner_names)) for text in texts
        ]
        input_lengths = [len(seq) for seq in seqs]

        size_per_device = len(seqs) // self._hparams.tacotron_num_gpus

        #Pad inputs according to each GPU max length
        input_seqs = None
        split_infos = []
        for i in range(self._hparams.tacotron_num_gpus):
            device_input = seqs[size_per_device * i:size_per_device * (i + 1)]
            device_input, max_seq_len = self._prepare_inputs(device_input)
            input_seqs = np.concatenate(
                (input_seqs, device_input),
                axis=1) if input_seqs is not None else device_input
            split_infos.append([max_seq_len, 0, 0, 0])

        feed_dict = {
            self.inputs: input_seqs,
            self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
        }

        if self.gta:
            np_targets = [
                np.load(mel_filename) for mel_filename in mel_filenames
            ]
            target_lengths = [len(np_target) for np_target in np_targets]

            #pad targets according to each GPU max length
            target_seqs = None
            for i in range(self._hparams.tacotron_num_gpus):
                device_target = np_targets[size_per_device *
                                           i:size_per_device * (i + 1)]
                device_target, max_target_len = self._prepare_targets(
                    device_target, self._hparams.outputs_per_step)
                target_seqs = np.concatenate(
                    (target_seqs, device_target),
                    axis=1) if target_seqs is not None else device_target
                split_infos[i][1] = max_target_len  # Not currently used; kept for possible future development

            feed_dict[self.targets] = target_seqs
            assert len(np_targets) == len(texts)

        feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32)
        feed_dict[self.speaker_embeddings] = [
            np.load(f) for f in embed_filenames
        ]

        if self.gta or not hparams.predict_linear:
            mels, alignments, stop_tokens = self.session.run(
                [
                    self.mel_outputs, self.alignments,
                    self.stop_token_prediction
                ],
                feed_dict=feed_dict)
            #Linearize outputs (1D arrays)
            mels = [mel for gpu_mels in mels for mel in gpu_mels]
            alignments = [
                align for gpu_aligns in alignments for align in gpu_aligns
            ]
            stop_tokens = [
                token for gpu_token in stop_tokens for token in gpu_token
            ]

            if not self.gta:
                #Natural batch synthesis
                #Get Mel lengths for the entire batch from stop_tokens predictions
                target_lengths = self._get_output_lengths(stop_tokens)

            #Take off the batch wise padding
            mels = [
                mel[:target_length, :]
                for mel, target_length in zip(mels, target_lengths)
            ]
            assert len(mels) == len(texts)

        else:
            linears, mels, alignments, stop_tokens = self.session.run(
                [
                    self.linear_outputs, self.mel_outputs, self.alignments,
                    self.stop_token_prediction
                ],
                feed_dict=feed_dict)
            #Linearize outputs (1D arrays)
            linears = [
                linear for gpu_linear in linears for linear in gpu_linear
            ]
            mels = [mel for gpu_mels in mels for mel in gpu_mels]
            alignments = [
                align for gpu_aligns in alignments for align in gpu_aligns
            ]
            stop_tokens = [
                token for gpu_token in stop_tokens for token in gpu_token
            ]

            #Natural batch synthesis
            #Get Mel/Linear lengths for the entire batch from stop_tokens predictions
            # target_lengths = self._get_output_lengths(stop_tokens)
            target_lengths = [9999]  # hard-coded; the zip() below then keeps only the first output

            #Take off the batch wise padding
            mels = [
                mel[:target_length, :]
                for mel, target_length in zip(mels, target_lengths)
            ]
            linears = [
                linear[:target_length, :]
                for linear, target_length in zip(linears, target_lengths)
            ]
            assert len(mels) == len(linears) == len(texts)

        if basenames is None:
            raise NotImplementedError()

        saved_mels_paths = []
        for i, mel in enumerate(mels):
            # Write the spectrogram to disk
            # Note: outputs mel-spectrogram files and target ones have same names, just different folders
            mel_filename = os.path.join(out_dir,
                                        "mel-{}.npy".format(basenames[i]))
            np.save(mel_filename, mel, allow_pickle=False)
            saved_mels_paths.append(mel_filename)

            if log_dir is not None:
                #save wav (mel -> wav)
                wav = audio.inv_mel_spectrogram(mel.T, hparams)
                audio.save_wav(wav,
                               os.path.join(
                                   log_dir,
                                   "wavs/wav-{}-mel.wav".format(basenames[i])),
                               sr=hparams.sample_rate)

                #save alignments
                plot.plot_alignment(alignments[i],
                                    os.path.join(
                                        log_dir,
                                        "plots/alignment-{}.png".format(
                                            basenames[i])),
                                    title="{}".format(texts[i]),
                                    split_title=True,
                                    max_len=target_lengths[i])

                #save mel spectrogram plot
                plot.plot_spectrogram(
                    mel,
                    os.path.join(log_dir,
                                 "plots/mel-{}.png".format(basenames[i])),
                    title="{}".format(texts[i]),
                    split_title=True)

                if hparams.predict_linear:
                    #save wav (linear -> wav)
                    wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
                    audio.save_wav(wav,
                                   os.path.join(
                                       log_dir,
                                       "wavs/wav-{}-linear.wav".format(
                                           basenames[i])),
                                   sr=hparams.sample_rate)

                    #save linear spectrogram plot
                    plot.plot_spectrogram(linears[i],
                                          os.path.join(
                                              log_dir,
                                              "plots/linear-{}.png".format(
                                                  basenames[i])),
                                          title="{}".format(texts[i]),
                                          split_title=True,
                                          auto_aspect=True)

        return saved_mels_paths
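The per-GPU padding near the top of this example is subtle: each device's batch is padded to its own max length, the padded blocks are concatenated along axis 1, and split_infos records each device's max length so the graph can split the packed tensor again. A numpy sketch with two hypothetical GPUs:

import numpy as np

# Two hypothetical devices, each batch padded to its own max sequence length.
dev0 = np.zeros((2, 5), dtype=np.int32)  # max_seq_len = 5
dev1 = np.zeros((2, 8), dtype=np.int32)  # max_seq_len = 8

packed = np.concatenate((dev0, dev1), axis=1)  # shape (2, 13), re-split inside the graph
split_infos = np.asarray([[5, 0, 0, 0], [8, 0, 0, 0]], dtype=np.int32)
print(packed.shape, split_infos[:, 0])  # (2, 13) [5 8]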
Example #6
    # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
    # can concatenate the mel spectrograms to a single one.
    mel = np.concatenate(mels, axis=1)
    # The vocoder can take a callback function to display the generation. More on that later. For
    # now we'll simply hide it like this:
    no_action = lambda *args: None
    print("\tTesting the vocoder...")
    # For the sake of making this test short, we'll pass a short target length. The target length
    # is the length of the wav segments that are processed in parallel. E.g. for audio sampled
    # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
    # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and
    # that has a detrimental effect on the quality of the audio. The default parameters are
    # recommended in general.
    # vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)
    generated_wav = audio.inv_mel_spectrogram(mel, hparams.hparams)
    print("All test passed! You can now synthesize speech.\n\n")

    ## Interactive speech generation
    print(
        "This is a GUI-less example of interface to SV2TTS. The purpose of this script is to "
        "show how you can interface this project easily with your own. See the source code for "
        "an explanation of what is happening.\n")

    print("Interactive generation loop")
    num_generated = 0
    while True:
        try:
            # Get the reference audio filepath
            message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, " \
                      "wav, m4a, flac, ...):\n"
Example #7
def train(log_dir, args, hparams):
    save_dir = os.path.join(log_dir, "taco_pretrained")
    plot_dir = os.path.join(log_dir, "plots")
    wav_dir = os.path.join(log_dir, "wavs")
    mel_dir = os.path.join(log_dir, "mel-spectrograms")
    eval_dir = os.path.join(log_dir, "eval-dir")
    eval_plot_dir = os.path.join(eval_dir, "plots")
    eval_wav_dir = os.path.join(eval_dir, "wavs")
    tensorboard_dir = os.path.join(log_dir, "tacotron_events")
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(mel_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(eval_plot_dir, exist_ok=True)
    os.makedirs(eval_wav_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)

    checkpoint_fpath = os.path.join(save_dir, "tacotron_model.ckpt")

    log("Checkpoint path: {}".format(checkpoint_fpath))
    log("Using model: Tacotron")
    log(hparams_debug_string())

    # Start by setting a seed for repeatability
    tf.set_random_seed(hparams.tacotron_random_seed)

    # Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope("datafeeder") as scope:
        feeder = Feeder(coord, hparams)

    # Set up model:
    global_step = tf.Variable(0, name="global_step", trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, global_step)
    #eval_model = model_test_mode(args, feeder, hparams, global_step)

    # Book keeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=2)

    log("Tacotron training set to a maximum of {} steps".format(
        args.tacotron_train_steps))

    # Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    # Train
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)

            sess.run(tf.global_variables_initializer())

            # saved model restoring
            if args.restore:
                # Restore saved model if the user requested it, default = True
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

                    if checkpoint_state and checkpoint_state.model_checkpoint_path:
                        log("Loading checkpoint {}".format(
                            checkpoint_state.model_checkpoint_path),
                            slack=True)
                        saver.restore(sess,
                                      checkpoint_state.model_checkpoint_path)

                    else:
                        log("No model to load at {}".format(save_dir),
                            slack=True)
                        saver.save(sess,
                                   checkpoint_fpath,
                                   global_step=global_step)

                except tf.errors.OutOfRangeError as e:
                    log("Cannot restore checkpoint: {}".format(e), slack=True)
            else:
                log("Starting new training!", slack=True)
                saver.save(sess, checkpoint_fpath, global_step=global_step)

            # initializing feeder
            feeder.start_threads(sess)
            print("Feeder is intialized and model is ready to train.......")

            # Training loop
            while not coord.should_stop() and step < args.tacotron_train_steps:
                start_time = time.time()
                step, loss, opt = sess.run(
                    [global_step, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = "Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]".format(
                    step, time_window.average, loss, loss_window.average)
                log(message,
                    end="\r",
                    slack=(step % args.checkpoint_interval == 0))
                print(message)

                if loss > 100 or np.isnan(loss):
                    log("Loss exploded to {:.5f} at step {}".format(
                        loss, step))
                    raise Exception("Loss exploded")

                if step % args.summary_interval == 0:
                    log("\nWriting summary at step {}".format(step))
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.eval_interval == 0:
                    pass


                if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or \
                        step == 300:
                    # Save model and current global step
                    saver.save(sess, checkpoint_fpath, global_step=global_step)

                    log("\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform.."
                        )
                    input_seq, mel_prediction, alignment, target, target_length = sess.run(
                        [
                            model.tower_inputs[0][0],
                            model.tower_mel_outputs[0][0],
                            model.tower_alignments[0][0],
                            model.tower_mel_targets[0][0],
                            model.tower_targets_lengths[0][0],
                        ])

                    # save predicted mel spectrogram to disk (debug)
                    mel_filename = "mel-prediction-step-{}.npy".format(step)
                    np.save(os.path.join(mel_dir, mel_filename),
                            mel_prediction.T,
                            allow_pickle=False)

                    # save griffin lim inverted wav for debug (mel -> wav)
                    wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
                    audio.save_wav(
                        wav,
                        os.path.join(wav_dir,
                                     "step-{}-wave-from-mel.wav".format(step)),
                        sr=hparams.sample_rate)

                    # save alignment plot to disk (control purposes)
                    plot.plot_alignment(
                        alignment,
                        os.path.join(plot_dir,
                                     "step-{}-align.png".format(step)),
                        title="{}, {}, step={}, loss={:.5f}".format(
                            "Tacotron", time_string(), step, loss),
                        max_len=target_length // hparams.outputs_per_step)
                    # save real and predicted mel-spectrogram plot to disk (control purposes)
                    plot.plot_spectrogram(
                        mel_prediction,
                        os.path.join(
                            plot_dir,
                            "step-{}-mel-spectrogram.png".format(step)),
                        title="{}, {}, step={}, loss={:.5f}".format(
                            "Tacotron", time_string(), step, loss),
                        target_spectrogram=target,
                        max_len=target_length)

                if step % args.embedding_interval == 0 or step == args.tacotron_train_steps or step == 1:
                    # Get current checkpoint state
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

            log("Tacotron training complete after {} global steps!".format(
                args.tacotron_train_steps),
                slack=True)
            return save_dir

        except Exception as e:
            log("Exiting due to exception: {}".format(e), slack=True)
            traceback.print_exc()
            coord.request_stop(e)
Example #8
def griffin_lim(mel):
    return audio.inv_mel_spectrogram(mel, hparams)
Example #9
def train(run_id: str, syn_dir: Path, voc_dir: Path, models_dir: Path,
          ground_truth: bool, save_every: int, backup_every: int,
          force_restart: bool):
    # Check to make sure the hop length is correctly factorised
    # assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length

    # Instantiate the model
    print("Initializing the model...")
    # model = WaveRNN(
    #     rnn_dims=hp.voc_rnn_dims,
    #     fc_dims=hp.voc_fc_dims,
    #     bits=hp.bits,
    #     pad=hp.voc_pad,
    #     upsample_factors=hp.voc_upsample_factors,
    #     feat_dims=hp.num_mels,
    #     compute_dims=hp.voc_compute_dims,
    #     res_out_dims=hp.voc_res_out_dims,
    #     res_blocks=hp.voc_res_blocks,
    #     hop_length=hp.hop_length,
    #     sample_rate=hp.sample_rate,
    #     mode=hp.voc_mode
    # ).cuda()
    model = model_VC(32, 256, 512, 32).cuda()
    # Initialize the optimizer
    optimizer = optim.Adam(model.parameters())
    for p in optimizer.param_groups:
        p["lr"] = hp.voc_lr

    loss_recon = nn.MSELoss()
    loss_content = nn.L1Loss()
    # Load the weights
    model_dir = models_dir.joinpath(run_id)
    model_dir.mkdir(exist_ok=True)
    weights_fpath = model_dir.joinpath(run_id + ".pt")
    if force_restart or not weights_fpath.exists():
        print("\nStarting the training of AutoVC from scratch\n")
        model.save(weights_fpath, optimizer)
    else:
        print("\nLoading weights at %s" % weights_fpath)
        model.load(weights_fpath, optimizer)
        print("AutoVC weights loaded from step %d" % model.step)

    # Initialize the dataset
    metadata_fpath = syn_dir.joinpath("train.txt") if ground_truth else \
        voc_dir.joinpath("synthesized.txt")
    mel_dir = syn_dir.joinpath("mels") if ground_truth else voc_dir.joinpath(
        "mels_gta")
    wav_dir = syn_dir.joinpath("audio")
    #2019.11.26
    embed_dir = syn_dir.joinpath("embeds")

    dataset = VocoderDataset(metadata_fpath, mel_dir, wav_dir, embed_dir)
    test_loader = DataLoader(dataset,
                             batch_size=1,
                             shuffle=True,
                             pin_memory=True)

    # Begin the training
    simple_table([('Batch size', hp.voc_batch_size), ('LR', hp.voc_lr),
                  ('Sequence Len', hp.voc_seq_len)])

    for epoch in range(1, 350):

        model.train()
        data_loader = DataLoader(dataset,
                                 collate_fn=collate_vocoder,
                                 batch_size=hp.voc_batch_size,
                                 num_workers=2,
                                 shuffle=True,
                                 pin_memory=True)
        start = time.time()
        running_loss = 0.

        for i, (m, e, _) in enumerate(data_loader, 1):
            #print("e:",e.shape)
            #print("m:",m.shape)
            model.train()
            m, e = m.cuda(), e.cuda()
            # Forward pass
            C, X_C, X_before, X_after, _ = model(m, e, e)

            #c_org shape: torch.Size([100, 256, 1])
            #x shape: torch.Size([100, 80, 544])
            #c_org_expand shape torch.Size([100, 256, 544])
            #encoder_outputs shape: torch.Size([100, 544, 320])
            #C shape: torch.Size([100, 544, 64])
            #X shape: torch.Size([100, 1, 544, 80])
            X_after = X_after.squeeze(1).permute(0, 2, 1)
            X_before = X_before.squeeze(1).permute(0, 2, 1)

            #print("C shape:",C.shape)
            #if X_C:
            #    print("X_C shape:",X_C.shape)
            #print("X shape:",X.shape)
            # Backward pass
            loss_rec_before = loss_recon(X_before, m)
            loss_rec_after = loss_recon(X_after, m)
            loss_c = loss_content(C, X_C)
            loss = loss_rec_before + loss_rec_after + loss_c
            #print("recon loss:",loss1)
            #print("content loss:",loss2)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            #print("loss:",loss.item())
            running_loss += loss.item()
            #print("running loss:",running_loss)
            speed = i / (time.time() - start)
            avg_loss = running_loss / i
            #print("avg_loss:",avg_loss)
            step = model.get_step()

            if hp.decay_learning_rate:
                # update every param group, not just `p` left over from the setup loop
                for p in optimizer.param_groups:
                    p["lr"] = _learning_rate_decay(p["lr"], step)
            k = step // 1000
            if step % 100 == 0 and step != 0:
                model.eval()
                plt.figure(1)
                C, X_C, X_before, X_after, _ = model(m, e, e)
                X_after = X_after.squeeze(1).permute(0, 2, 1)
                mel_out = X_after.detach().cpu().numpy()

                from synthesizer import audio
                from synthesizer.hparams import hparams
                wav = audio.inv_mel_spectrogram(mel_out[0, :, :], hparams)
                librosa.output.write_wav("out.wav", np.float32(wav),
                                         hparams.sample_rate)

                mel_out = mel_out[0, :, :].transpose(1, 0)
                plt.imshow(mel_out.T, interpolation='nearest', aspect='auto')
                plt.title("Generate Spectrogram")
                save_path = model_dir
                p_path = save_path.joinpath("generate.png")
                plt.savefig(p_path)

                plt.figure(2)
                m_out = m.detach().cpu().numpy()
                m_out = m_out[0, :, :].transpose(1, 0)
                plt.imshow(m_out.T, interpolation='nearest', aspect='auto')
                plt.title("Orignal Spectrogram")
                o_path = save_path.joinpath("orignal.png")
                plt.savefig(o_path)

            if backup_every != 0 and step % backup_every == 0:
                model.checkpoint(model_dir, optimizer)

            if save_every != 0 and step % save_every == 0:
                model.save(weights_fpath, optimizer)
                torch.save(model, "model_ttsdb_48_48.pkl")

            msg = f"| Epoch: {epoch} ({i}/{len(data_loader)}) | " \
                f"Loss: {avg_loss:.4f} | {speed:.1f} " \
                f"steps/s | Step: {k}k | "
            stream(msg)

        # gen_testset(model, test_loader, hp.voc_gen_at_checkpoint, hp.voc_gen_batched, hp.voc_target, model_dir)
        print("")
Example #10
                mel = torch.from_numpy(mel[None, ...])

                embedding_tr = embedding_tr[np.newaxis, :, np.newaxis]
                embedding_tr = torch.tensor(embedding_tr)
                mel, e1, embedding_tr = mel.cuda(), e1.cuda(), embedding_tr.cuda()
                # print("mel shape:", mel.shape)
                # print("e1 shape:", e1.shape)
                # print("e2 shape:", e2.shape)

                C, X_C, X_before, X_after, _ = model(mel, e1, embedding_tr)
                mel_out = X_after.detach().cpu().numpy()
                # print("mel_out shape:", mel_out.shape)
                if use_wavrnn:
                    wav = vocoder_wavrnn.infer_waveform(mel_out[0, 0, :, :].T)
                else:
                    wav = audio.inv_mel_spectrogram(mel_out[0, 0, :, :].T, hparams)
                wav = librosa.resample(wav, 16000, 24000)
                out_dir = "/data/VCTK/out_v5/vcc2020-teams:00004/"
                if not os.path.exists(out_dir):
                    os.mkdir(out_dir)
                fname = t + "_" + s + "_" + name[:-4] + ".wav"
                out_dir_fpath = out_dir + "/" + fname
                librosa.output.write_wav(out_dir_fpath, wav.astype(np.float32), 24000)
                print("write: {}".format(fname))