Example #1
    def vocode(self):
        speaker_name, spec, breaks, _ = self.current_generated
        assert spec is not None

        # Synthesize the waveform
        if not vocoder.is_loaded():
            self.init_vocoder()

        def vocoder_progress(i, seq_len, b_size, gen_rate):
            real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
            line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
                   % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
            self.ui.log(line, "overwrite")
            self.ui.set_loading(i, seq_len)

        if self.ui.current_vocoder_fpath is not None:
            self.ui.log("")
            wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)
        else:
            self.ui.log("Waveform generation with Griffin-Lim... ")
            wav = Synthesizer.griffin_lim(spec)
        self.ui.set_loading(0)
        self.ui.log(" Done!", "append")

        # Add breaks
        b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
        b_starts = np.concatenate(([0], b_ends[:-1]))
        wavs = [wav[start:end] for start, end in zip(b_starts, b_ends)]
        breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
        wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

        # Play it
        wav = wav / np.abs(wav).max() * 0.97
        self.ui.play(wav, Synthesizer.sample_rate)

        fref = '-'.join([self.ui.current_dataset_name, self.ui.current_speaker_name, self.ui.current_utterance_name])
        ftime = '{}'.format(int(time.time()))
        ftext = self.ui.text_prompt.toPlainText()
        fms = int(len(wav) * 1000 / Synthesizer.sample_rate)
        fname = filename_formatter('{}_{}_{}ms_{}.wav'.format(fref, ftime, fms, ftext))
        audio.save_wav(wav, _out_wav_dir.joinpath(fname), Synthesizer.sample_rate)  # save

        # Compute the embedding
        # TODO: this is problematic with different sampling rates, gotta fix it
        if not encoder.is_loaded():
            self.init_encoder()
        encoder_wav = encoder.preprocess_wav(wav)
        embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

        # Add the utterance
        name = speaker_name + "_gen_%05d" % int(time.time())
        utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, True)

        np.save(_out_embed_dir.joinpath(name + '.npy'), embed, allow_pickle=False)  # save

        self.utterances.add(utterance)

        # Plot it
        self.ui.draw_embed(embed, name, "generated")
        self.ui.draw_umap_projections(self.utterances)
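
The break handling in vocode above splits the vocoded waveform back into its sentence chunks and interleaves them with short silences. A minimal, self-contained sketch of that interleaving (with an assumed 16 kHz sample rate and a hypothetical hop size; the real values come from Synthesizer) is:

import numpy as np

sample_rate = 16000      # assumed sample rate (the real value comes from Synthesizer.sample_rate)
hop_size = 200           # hypothetical hop size in samples per spectrogram frame
breaks = [120, 95, 140]  # number of frames in each synthesized chunk
wav = np.random.randn(sum(breaks) * hop_size)  # stand-in for the vocoded waveform

# Chunk boundaries in samples, derived from the per-chunk frame counts
b_ends = np.cumsum(np.array(breaks) * hop_size)
b_starts = np.concatenate(([0], b_ends[:-1]))
chunks = [wav[start:end] for start, end in zip(b_starts, b_ends)]

# Insert 150 ms of silence after every chunk, then flatten the chunk/silence pairs
silence = np.zeros(int(0.15 * sample_rate))
wav_with_breaks = np.concatenate([x for chunk in chunks for x in (chunk, silence)])
print(len(wav), len(wav_with_breaks))
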
def eval_model(attention, mel_prediction, target_spectrogram, input_seq, step,
               plot_dir, mel_output_dir, wav_dir, sample_num, loss, hparams):
    # Save some results for evaluation
    attention_path = str(
        plot_dir.joinpath("attention_step_{}_sample_{}".format(
            step, sample_num)))
    save_attention(attention, attention_path)

    # save predicted mel spectrogram to disk (debug)
    mel_output_fpath = mel_output_dir.joinpath(
        "mel-prediction-step-{}_sample_{}.npy".format(step, sample_num))
    np.save(str(mel_output_fpath), mel_prediction, allow_pickle=False)

    # save griffin lim inverted wav for debug (mel -> wav)
    wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
    wav_fpath = wav_dir.joinpath("step-{}-wave-from-mel_sample_{}.wav".format(
        step, sample_num))
    audio.save_wav(wav, str(wav_fpath), sr=hparams.sample_rate)

    # save real and predicted mel-spectrogram plot to disk (control purposes)
    spec_fpath = plot_dir.joinpath(
        "step-{}-mel-spectrogram_sample_{}.png".format(step, sample_num))
    title_str = "{}, {}, step={}, loss={:.5f}".format("Tacotron",
                                                      time_string(), step,
                                                      loss)
    plot_spectrogram(mel_prediction,
                     str(spec_fpath),
                     title=title_str,
                     target_spectrogram=target_spectrogram,
                     max_len=target_spectrogram.size // hparams.num_mels)
    print("Input at step {}: {}".format(step, sequence_to_text(input_seq)))
def run_eval_part1(args):
  syn_ckpt = args.syn_checkpoint
  speaker_name = args.speaker_name
  eval_results_dir = os.path.join(args.eval_results_dir,
                                  speaker_name)
  if not os.path.exists(eval_results_dir):
    os.makedirs(eval_results_dir)

  speaker_id = int(speaker_name)

  sentences = [
    "Either measure the temperature with a bath thermometer or test the water with your wrist",
    "A test is a deliberate action or experiment to find out how well something works",
    "This was demonstrated in a laboratory experiment with rats",
    "This evidence supports the view that there is too much violence on television",
  ]

  # sentences = [sen.upper() for sen in sentences]

  print('eval part1> model: %s.' % syn_ckpt)
  syner = syn_infer.Synthesizer(syn_ckpt)

  ckpt_step = re.compile(r'.*?\.ckpt\-([0-9]+)').match(syn_ckpt)
  ckpt_step = "step-"+str(ckpt_step.group(1)) if ckpt_step else syn_ckpt

  speaker_embed = np.eye(251, dtype=np.float32)[speaker_id]
  for i, text in enumerate(sentences):
    path = os.path.join(eval_results_dir,
                        "%s-%s-eval-%03d_%s.wav" % (speaker_name, ckpt_step, i, "lpcnet"))
    print('[{:<10}]: {}'.format('processing', path))
    mel_spec = syner.synthesize_spectrograms([text], [speaker_embed])[0]  # batch synthesize
    print('[{:<10}]:'.format('text:'), text)
    print(np.shape(mel_spec))
    # mel_spec holds the WORLD vocoder output features

    #############
    # f0, sp, ap = np.split(mel_spec, [1, 514])
    # # f0 *= 100.0
    # # sp /= 1000.0
    # f0 = np.ascontiguousarray(f0.T, dtype=np.float64)
    # sp = np.ascontiguousarray(sp.T, dtype=np.float64)
    # ap = np.ascontiguousarray(ap.T, dtype=np.float64)
    # f0 = np.squeeze(f0, -1)
    # print(np.shape(f0), np.shape(sp), np.shape(ap))
    # wav = pw.synthesize(f0, sp, ap, hparams.sample_rate)
    ##########

    lf0 = mel_spec[:, :, 0]
    mgc = mel_spec[:, :, 1:1 + hparams.n_mgc]
    bap = mel_spec[:, :, 1 + hparams.n_mgc:]
    wav = audio.synthesize(lf0, mgc, bap)
    audio.save_wav(wav, path, hparams.sample_rate)
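
The tail of this function assumes the synthesizer emits WORLD-style features stacked along the last axis: one log-F0 column, then n_mgc mel-generalized cepstral coefficients, then band aperiodicities. A small sketch of that unpacking with hypothetical dimensions (the real ones come from hparams) is:

import numpy as np

n_mgc = 60                                          # hypothetical MGC order
n_bap = 5                                           # hypothetical BAP dimension
feat = np.random.randn(1, 300, 1 + n_mgc + n_bap)   # (batch, frames, features)

lf0 = feat[:, :, 0]              # log-F0 track
mgc = feat[:, :, 1:1 + n_mgc]    # mel-generalized cepstral coefficients
bap = feat[:, :, 1 + n_mgc:]     # band aperiodicity
print(lf0.shape, mgc.shape, bap.shape)              # (1, 300), (1, 300, 60), (1, 300, 5)
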
Example #4
    def record(self):
        wav = self.ui.record_one(encoder.sampling_rate, 5)
        if wav is None:
            return

        self.ui.play(wav, encoder.sampling_rate)

        speaker_name = "user01"
        name = speaker_name + "_rec_%d" % int(time.time())
        audio.save_wav(wav, _out_record_dir.joinpath(name + '.wav'), encoder.sampling_rate)  # save

        self.add_real_utterance(wav, name, speaker_name)
Example #5
def run_mel_strip():
    import numpy as np
    from tools.spec_processor import find_endpoint, find_silences
    from synthesizer.audio import inv_mel_spectrogram, save_wav
    from synthesizer.hparams import hparams
    from matplotlib import pyplot as plt
    inpath = Path(
        r'E:\lab\zhrtvc\zhrtvc\toolbox\saved_files\mels\wavs-P00173I-001_20170001P00173I0068.wav_1567509749_我家朵朵是世界上最漂亮的朵朵。。知道自己是什么样的人。要做什么。无需活在别人非议或期待里。你勤奋.npy'
    )
    data = np.load(inpath)
    data = data.T
    print(data.shape)
    end_idx = find_silences(data, min_silence_sec=0.5, hop_silence_sec=0.2)
    print(end_idx, len(data))
    out_dir = Path(r'data/syns')
    for i, pair in enumerate(zip(end_idx[:-1], end_idx[1:]), 1):
        a, b = pair
        wav = inv_mel_spectrogram(data[a[-1]:b[0]].T, hparams)
        save_wav(wav, out_dir.joinpath(f'sil-{i:02d}.wav'),
                 hparams.sample_rate)
    plt.imshow(data.T)
    plt.colorbar()
    plt.show()
def train(log_dir, args, hparams):
    save_dir = os.path.join(log_dir, "taco_pretrained")
    plot_dir = os.path.join(log_dir, "plots")
    wav_dir = os.path.join(log_dir, "wavs")
    mel_dir = os.path.join(log_dir, "mel-spectrograms")
    eval_dir = os.path.join(log_dir, "eval-dir")
    eval_plot_dir = os.path.join(eval_dir, "plots")
    eval_wav_dir = os.path.join(eval_dir, "wavs")
    tensorboard_dir = os.path.join(log_dir, "tacotron_events")
    meta_folder = os.path.join(log_dir, "metas")
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(mel_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(eval_plot_dir, exist_ok=True)
    os.makedirs(eval_wav_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)
    os.makedirs(meta_folder, exist_ok=True)

    checkpoint_fpath = os.path.join(save_dir, "tacotron_model.ckpt")
    if hparams.if_use_speaker_classifier:
        metadat_fpath = os.path.join(args.synthesizer_root,
                                     "train_augment_speaker.txt")
    else:
        metadat_fpath = os.path.join(args.synthesizer_root, "train.txt")

    log("Checkpoint path: {}".format(checkpoint_fpath))
    log("Loading training data from: {}".format(metadat_fpath))
    log("Using model: Tacotron")
    log(hparams_debug_string())

    # Start by setting a seed for repeatability
    tf.set_random_seed(hparams.tacotron_random_seed)

    # Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope("datafeeder") as scope:
        feeder = Feeder(coord, metadat_fpath, hparams)

    # Set up model:
    global_step = tf.Variable(0, name="global_step", trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, global_step)
    eval_model = model_test_mode(args, feeder, hparams, global_step)

    # Embeddings metadata
    char_embedding_meta = os.path.join(meta_folder, "CharacterEmbeddings.tsv")
    if not os.path.isfile(char_embedding_meta):
        with open(char_embedding_meta, "w", encoding="utf-8") as f:
            for symbol in symbols:
                if symbol == " ":
                    symbol = "\\s"  # For visual purposes, swap space with \s

                f.write("{}\n".format(symbol))

    char_embedding_meta = char_embedding_meta.replace(log_dir, "..")

    # Book keeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=5)

    log("Tacotron training set to a maximum of {} steps".format(
        args.tacotron_train_steps))

    # Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    # Train
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)

            sess.run(tf.global_variables_initializer())

            # saved model restoring
            if args.restore:
                # Restore saved model if the user requested it, default = True
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

                    if checkpoint_state and checkpoint_state.model_checkpoint_path:
                        log("Loading checkpoint {}".format(
                            checkpoint_state.model_checkpoint_path),
                            slack=True)
                        saver.restore(sess,
                                      checkpoint_state.model_checkpoint_path)

                    else:
                        log("No model to load at {}".format(save_dir),
                            slack=True)
                        saver.save(sess,
                                   checkpoint_fpath,
                                   global_step=global_step)

                except tf.errors.OutOfRangeError as e:
                    log("Cannot restore checkpoint: {}".format(e), slack=True)
            else:
                log("Starting new training!", slack=True)
                saver.save(sess, checkpoint_fpath, global_step=global_step)

            # initializing feeder
            feeder.start_threads(sess)

            # Training loop
            while not coord.should_stop() and step < args.tacotron_train_steps:
                start_time = time.time()
                step, loss, adversial_loss, opt = sess.run([
                    global_step, model.loss, model.adversial_loss,
                    model.optimize
                ])
                loss -= adversial_loss
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = "Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}, adv_loss={:.5f}]".format(
                    step, time_window.average, loss, loss_window.average,
                    adversial_loss)
                log(message,
                    end="\r",
                    slack=(step % args.checkpoint_interval == 0))
                print(message)

                if loss > 100 or np.isnan(loss):
                    log("Loss exploded to {:.5f} at step {}".format(
                        loss, step))
                    raise Exception("Loss exploded")

                if step % args.summary_interval == 0:
                    log("\nWriting summary at step {}".format(step))
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.eval_interval == 0:
                    # Run eval and save eval stats
                    log("\nRunning evaluation at step {}".format(step))

                    eval_losses = []
                    before_losses = []
                    after_losses = []
                    stop_token_losses = []
                    linear_losses = []
                    linear_loss = None
                    adversial_losses = []

                    if hparams.predict_linear:
                        for i in tqdm(range(feeder.test_steps)):
                            eloss, before_loss, after_loss, stop_token_loss, linear_loss, mel_p, \
                                mel_t, t_len, align, lin_p, lin_t = sess.run(
                                [
                                    eval_model.tower_loss[0], eval_model.tower_before_loss[0],
                                    eval_model.tower_after_loss[0],
                                    eval_model.tower_stop_token_loss[0],
                                    eval_model.tower_linear_loss[0],
                                    eval_model.tower_mel_outputs[0][0],
                                    eval_model.tower_mel_targets[0][0],
                                    eval_model.tower_targets_lengths[0][0],
                                    eval_model.tower_alignments[0][0],
                                    eval_model.tower_linear_outputs[0][0],
                                    eval_model.tower_linear_targets[0][0],
                                ])
                            eval_losses.append(eloss)
                            before_losses.append(before_loss)
                            after_losses.append(after_loss)
                            stop_token_losses.append(stop_token_loss)
                            linear_losses.append(linear_loss)
                        linear_loss = sum(linear_losses) / len(linear_losses)

                        wav = audio.inv_linear_spectrogram(lin_p.T, hparams)
                        audio.save_wav(
                            wav,
                            os.path.join(
                                eval_wav_dir,
                                "step-{}-eval-wave-from-linear.wav".format(
                                    step)),
                            sr=hparams.sample_rate)

                    else:
                        for i in tqdm(range(feeder.test_steps)):
                            eloss, before_loss, after_loss, stop_token_loss, adversial_loss, mel_p, mel_t, t_len, \
                                align = sess.run(
                                [
                                    eval_model.tower_loss[0], eval_model.tower_before_loss[0],
                                    eval_model.tower_after_loss[0],
                                    eval_model.tower_stop_token_loss[0],
                                    eval_model.tower_adversial_loss[0],
                                    eval_model.tower_mel_outputs[0][0],
                                    eval_model.tower_mel_targets[0][0],
                                    eval_model.tower_targets_lengths[0][0],
                                    eval_model.tower_alignments[0][0]
                                ])
                            eval_losses.append(eloss)
                            before_losses.append(before_loss)
                            after_losses.append(after_loss)
                            stop_token_losses.append(stop_token_loss)
                            adversial_losses.append(adversial_loss)

                    eval_loss = sum(eval_losses) / len(eval_losses)
                    before_loss = sum(before_losses) / len(before_losses)
                    after_loss = sum(after_losses) / len(after_losses)
                    stop_token_loss = sum(stop_token_losses) / len(
                        stop_token_losses)
                    adversial_loss = sum(adversial_losses) / len(
                        adversial_losses)

                    log("Saving eval log to {}..".format(eval_dir))
                    # Save some log to monitor model improvement on same unseen sequence
                    wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
                    audio.save_wav(
                        wav,
                        os.path.join(
                            eval_wav_dir,
                            "step-{}-eval-wave-from-mel.wav".format(step)),
                        sr=hparams.sample_rate)

                    plot.plot_alignment(
                        align,
                        os.path.join(eval_plot_dir,
                                     "step-{}-eval-align.png".format(step)),
                        title="{}, {}, step={}, loss={:.5f}".format(
                            "Tacotron", time_string(), step, eval_loss),
                        max_len=t_len // hparams.outputs_per_step)
                    plot.plot_spectrogram(
                        mel_p,
                        os.path.join(
                            eval_plot_dir,
                            "step-{}-eval-mel-spectrogram.png".format(step)),
                        title="{}, {}, step={}, loss={:.5f}".format(
                            "Tacotron", time_string(), step, eval_loss),
                        target_spectrogram=mel_t,
                        max_len=t_len)

                    if hparams.predict_linear:
                        plot.plot_spectrogram(
                            lin_p,
                            os.path.join(
                                eval_plot_dir,
                                "step-{}-eval-linear-spectrogram.png".format(
                                    step)),
                            title="{}, {}, step={}, loss={:.5f}".format(
                                "Tacotron", time_string(), step, eval_loss),
                            target_spectrogram=lin_t,
                            max_len=t_len,
                            auto_aspect=True)

                    log("Eval loss for global step {}: {:.3f}".format(
                        step, eval_loss))
                    log("Writing eval summary!")
                    add_eval_stats(summary_writer, step, linear_loss,
                                   before_loss, after_loss, stop_token_loss,
                                   adversial_loss, eval_loss)

                if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or \
                        step == 300:
                    # Save model and current global step
                    saver.save(sess, checkpoint_fpath, global_step=global_step)

                    log("\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform.."
                        )
                    input_seq, mel_prediction, alignment, target, target_length = sess.run(
                        [
                            model.tower_inputs[0][0],
                            model.tower_mel_outputs[0][0],
                            model.tower_alignments[0][0],
                            model.tower_mel_targets[0][0],
                            model.tower_targets_lengths[0][0],
                        ])

                    # save predicted mel spectrogram to disk (debug)
                    mel_filename = "mel-prediction-step-{}.npy".format(step)
                    np.save(os.path.join(mel_dir, mel_filename),
                            mel_prediction.T,
                            allow_pickle=False)

                    # save griffin lim inverted wav for debug (mel -> wav)
                    wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
                    audio.save_wav(
                        wav,
                        os.path.join(wav_dir,
                                     "step-{}-wave-from-mel.wav".format(step)),
                        sr=hparams.sample_rate)

                    # save alignment plot to disk (control purposes)
                    plot.plot_alignment(
                        alignment,
                        os.path.join(plot_dir,
                                     "step-{}-align.png".format(step)),
                        title="{}, {}, step={}, loss={:.5f}".format(
                            "Tacotron", time_string(), step, loss),
                        max_len=target_length // hparams.outputs_per_step)
                    # save real and predicted mel-spectrogram plot to disk (control purposes)
                    plot.plot_spectrogram(
                        mel_prediction,
                        os.path.join(
                            plot_dir,
                            "step-{}-mel-spectrogram.png".format(step)),
                        title="{}, {}, step={}, loss={:.5f}".format(
                            "Tacotron", time_string(), step, loss),
                        target_spectrogram=target,
                        max_len=target_length)
                    #log("Input at step {}: {}".format(step, sequence_to_text(input_seq)))

                if step % args.embedding_interval == 0 or step == args.tacotron_train_steps or step == 1:
                    # Get current checkpoint state
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

                    # Update Projector
                    #log("\nSaving Model Character Embeddings visualization..")
                    #add_embedding_stats(summary_writer, [model.embedding_table.name],
                    #                    [char_embedding_meta],
                    #                    checkpoint_state.model_checkpoint_path)
                    #log("Tacotron Character embeddings have been updated on tensorboard!")

            log("Tacotron training complete after {} global steps!".format(
                args.tacotron_train_steps),
                slack=True)
            return save_dir

        except Exception as e:
            log("Exiting due to exception: {}".format(e), slack=True)
            traceback.print_exc()
            coord.request_stop(e)
Example #7
    def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames,
                   embed_filenames):
        hparams = self._hparams
        cleaner_names = [x.strip() for x in hparams.cleaners.split(",")]

        assert 0 == len(texts) % self._hparams.tacotron_num_gpus
        seqs = [
            np.asarray(text_to_sequence(text, cleaner_names)) for text in texts
        ]
        input_lengths = [len(seq) for seq in seqs]

        size_per_device = len(seqs) // self._hparams.tacotron_num_gpus

        #Pad inputs according to each GPU max length
        input_seqs = None
        split_infos = []
        for i in range(self._hparams.tacotron_num_gpus):
            device_input = seqs[size_per_device * i:size_per_device * (i + 1)]
            device_input, max_seq_len = self._prepare_inputs(device_input)
            input_seqs = np.concatenate(
                (input_seqs, device_input),
                axis=1) if input_seqs is not None else device_input
            split_infos.append([max_seq_len, 0, 0, 0])

        feed_dict = {
            self.inputs: input_seqs,
            self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
        }

        if self.gta:
            np_targets = [
                np.load(mel_filename) for mel_filename in mel_filenames
            ]
            target_lengths = [len(np_target) for np_target in np_targets]

            #pad targets according to each GPU max length
            target_seqs = None
            for i in range(self._hparams.tacotron_num_gpus):
                device_target = np_targets[size_per_device *
                                           i:size_per_device * (i + 1)]
                device_target, max_target_len = self._prepare_targets(
                    device_target, self._hparams.outputs_per_step)
                target_seqs = np.concatenate(
                    (target_seqs, device_target),
                    axis=1) if target_seqs is not None else device_target
                split_infos[i][1] = max_target_len  # Not really used, but set in case it is needed for future development

            feed_dict[self.targets] = target_seqs
            assert len(np_targets) == len(texts)

        feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32)
        feed_dict[self.speaker_embeddings] = [
            np.load(f) for f in embed_filenames
        ]

        if self.gta or not hparams.predict_linear:
            mels, alignments, stop_tokens = self.session.run(
                [
                    self.mel_outputs, self.alignments,
                    self.stop_token_prediction
                ],
                feed_dict=feed_dict)
            #Linearize outputs (1D arrays)
            mels = [mel for gpu_mels in mels for mel in gpu_mels]
            alignments = [
                align for gpu_aligns in alignments for align in gpu_aligns
            ]
            stop_tokens = [
                token for gpu_token in stop_tokens for token in gpu_token
            ]

            if not self.gta:
                #Natural batch synthesis
                #Get Mel lengths for the entire batch from stop_tokens predictions
                target_lengths = self._get_output_lengths(stop_tokens)

            #Take off the batch wise padding
            mels = [
                mel[:target_length, :]
                for mel, target_length in zip(mels, target_lengths)
            ]
            assert len(mels) == len(texts)

        else:
            linears, mels, alignments, stop_tokens = self.session.run(
                [
                    self.linear_outputs, self.mel_outputs, self.alignments,
                    self.stop_token_prediction
                ],
                feed_dict=feed_dict)
            #Linearize outputs (1D arrays)
            linears = [
                linear for gpu_linear in linears for linear in gpu_linear
            ]
            mels = [mel for gpu_mels in mels for mel in gpu_mels]
            alignments = [
                align for gpu_aligns in alignments for align in gpu_aligns
            ]
            stop_tokens = [
                token for gpu_token in stop_tokens for token in gpu_token
            ]

            #Natural batch synthesis
            #Get Mel/Linear lengths for the entire batch from stop_tokens predictions
            # target_lengths = self._get_output_lengths(stop_tokens)
            target_lengths = [9999]

            #Take off the batch wise padding
            mels = [
                mel[:target_length, :]
                for mel, target_length in zip(mels, target_lengths)
            ]
            linears = [
                linear[:target_length, :]
                for linear, target_length in zip(linears, target_lengths)
            ]
            assert len(mels) == len(linears) == len(texts)

        if basenames is None:
            raise NotImplementedError()

        saved_mels_paths = []
        for i, mel in enumerate(mels):
            # Write the spectrogram to disk
            # Note: outputs mel-spectrogram files and target ones have same names, just different folders
            mel_filename = os.path.join(out_dir,
                                        "mel-{}.npy".format(basenames[i]))
            np.save(mel_filename, mel, allow_pickle=False)
            saved_mels_paths.append(mel_filename)

            if log_dir is not None:
                #save wav (mel -> wav)
                wav = audio.inv_mel_spectrogram(mel.T, hparams)
                audio.save_wav(wav,
                               os.path.join(
                                   log_dir,
                                   "wavs/wav-{}-mel.wav".format(basenames[i])),
                               sr=hparams.sample_rate)

                #save alignments
                plot.plot_alignment(alignments[i],
                                    os.path.join(
                                        log_dir,
                                        "plots/alignment-{}.png".format(
                                            basenames[i])),
                                    title="{}".format(texts[i]),
                                    split_title=True,
                                    max_len=target_lengths[i])

                #save mel spectrogram plot
                plot.plot_spectrogram(
                    mel,
                    os.path.join(log_dir,
                                 "plots/mel-{}.png".format(basenames[i])),
                    title="{}".format(texts[i]),
                    split_title=True)

                if hparams.predict_linear:
                    #save wav (linear -> wav)
                    wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
                    audio.save_wav(wav,
                                   os.path.join(
                                       log_dir,
                                       "wavs/wav-{}-linear.wav".format(
                                           basenames[i])),
                                   sr=hparams.sample_rate)

                    #save linear spectrogram plot
                    plot.plot_spectrogram(linears[i],
                                          os.path.join(
                                              log_dir,
                                              "plots/linear-{}.png".format(
                                                  basenames[i])),
                                          title="{}".format(texts[i]),
                                          split_title=True,
                                          auto_aspect=True)

        return saved_mels_paths
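
The padding logic in synthesize pads each GPU's slice of the batch to that slice's own maximum length, concatenates the slices along axis 1, and records the per-device max lengths in split_infos so the graph can split them apart again. A standalone sketch of that step, with a hypothetical pad helper and two "GPUs", could be:

import numpy as np

def pad_batch(seqs, pad_value=0):
    # Pad a list of 1-D token sequences to the longest one in the list
    max_len = max(len(s) for s in seqs)
    padded = np.full((len(seqs), max_len), pad_value, dtype=np.int32)
    for row, s in enumerate(seqs):
        padded[row, :len(s)] = s
    return padded, max_len

num_gpus = 2
seqs = [np.arange(n) for n in (5, 7, 3, 9)]  # toy token sequences
size_per_device = len(seqs) // num_gpus

input_seqs, split_infos = None, []
for i in range(num_gpus):
    device_input, max_seq_len = pad_batch(seqs[size_per_device * i:size_per_device * (i + 1)])
    input_seqs = device_input if input_seqs is None else np.concatenate((input_seqs, device_input), axis=1)
    split_infos.append([max_seq_len, 0, 0, 0])

print(input_seqs.shape, split_infos)  # (2, 16) and [[7, 0, 0, 0], [9, 0, 0, 0]]
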
def run_eval_part1(args):
    speaker_enc_ckpt = args.speaker_encoder_checkpoint
    syn_ckpt = args.syn_checkpoint
    speaker_name = args.speaker_name
    eval_results_dir = os.path.join(args.eval_results_dir, speaker_name)
    if not os.path.exists(eval_results_dir):
        os.makedirs(eval_results_dir)
    speaker_audio_dirs = {
        "speaker_name": ["speaker_audio_1.wav", "speaker_audio_2.wav"],
        "biaobei_speaker": [
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000001.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000002.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000003.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000004.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000005.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000006.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000007.wav",
        ],
        "SLR68_DEV_3756_22": [
            "/home/zhangwenbo5/lihongfeng/corpus/SLR68/dev/37_5622/37_5622_20170913203118.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/SLR68/dev/37_5622/37_5622_20170913203322.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/SLR68/dev/37_5622/37_5622_20170913203824.wav"
        ],
        "SLR38_P00001A": [
            "/home/zhangwenbo5/lihongfeng/corpus/SLR38/ST-CMDS-20170001_1-OS/20170001P00001A0001.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/SLR38/ST-CMDS-20170001_1-OS/20170001P00001A0002.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/SLR38/ST-CMDS-20170001_1-OS/20170001P00001A0003.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/SLR38/ST-CMDS-20170001_1-OS/20170001P00001A0004.wav",
        ],
        "aishell_C0002": [
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0002/IC0002W0001.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0002/IC0002W0002.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0002/IC0002W0003.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0002/IC0002W0004.wav",
        ],
        "aishell_C0896": [
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0896/IC0896W0001.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0896/IC0896W0002.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0896/IC0896W0003.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0896/IC0896W0004.wav",
        ],
    }[speaker_name]
    sentences = [
        # '美国主持人听到“中国”就插话',
        # '勉励乡亲们为过上更加幸福美好的生活继续团结奋斗。',
        # '中国基建领域又来了一款“神器”, 哪里不平平哪里',
        # '违反中央八项规定精神和廉洁纪律,违规出入私人会所和打高尔夫球',
        # '陪审团未能就其盗窃和藏匿文物罪名作出裁决',
        # '于美国首都华盛顿国家记者俱乐部召开的新闻发布会上说',
        # '杭州市卫健委某直属单位一名拟提副处级干部刘某公示期间,纪检监察组照例对其个人重大事项进行抽查',
        # '我国森林面积、森林蓄积分别增长一倍左右,人工林面积居全球第一',
        # '打打打打打打打打打打打',
        # '卡尔普陪外孙玩滑梯。',
        # '假语村言,别再拥抱我。',
        # '宝马配挂跛骡鞍,貂蝉怨枕董翁榻。',
        # '中国地震台网速报,'
        # '中国地震台网正式测定,',
        # '06月04日17时46分在台湾台东县海域(北纬22.82度,东经121.75度)发生5.8级地震',
        # '中国地震台网速报,中国地震台网正式测定:06月04日17时46分在台湾台东县海域(北纬22.82度,东经121.75度)发生5.8级地震',
        # '震源深度9千米,震中位于海中,距台湾岛最近约47公里。',
        # '刚刚,台湾发生5.8级地震,与此同时,泉州厦门漳州震感明显,',
        # '此次台湾地震发生后,许多网友为同胞祈福,愿平安,',
        '新世界百货望京店',
        '全聚德烤鸭店王府井店',
        '麻烦帮我把空调温度调整到二十四',
        '请帮我显示中央一套',  # aishell IC0896W0001.wav
        '确定下载三帝狂野飙车',  # aishell IC0896W0002.wav
        '请帮我开启深圳卫视国际频道',  # aishell IC0896W0003.wav
        '您吃饭了吗,我今天吃的太撑了',
        '您吃饭了吗?',
        '你多大了,你到底多大了,我猜你三十了,他多大了,他到底多大了,他猜你三十了',
        '二毛你今天沒课嘛还和李霞聊天',
    ]

    text2pinyin = partial(get_pinyin, std=True, pb=True)
    sentences = [' '.join(text2pinyin(sent)) for sent in sentences]

    print('eval part1> model: %s.' % syn_ckpt)
    syner = syn_infer.Synthesizer(syn_ckpt)
    encoder_infer.load_model(speaker_enc_ckpt)

    ckpt_step = re.compile(r'.*?\.ckpt\-([0-9]+)').match(syn_ckpt)
    ckpt_step = "step-" + str(ckpt_step.group(1)) if ckpt_step else syn_ckpt

    speaker_audio_wav_list = [
        encoder_audio.preprocess_wav(wav_dir) for wav_dir in speaker_audio_dirs
    ]
    speaker_audio_wav = np.concatenate(speaker_audio_wav_list)
    print(os.path.join(eval_results_dir, '000_refer_speaker_audio.wav'))
    audio.save_wav(
        speaker_audio_wav,
        os.path.join(eval_results_dir, '000_refer_speaker_audio.wav'),
        hparams.sample_rate)
    speaker_embed = encoder_infer.embed_utterance(speaker_audio_wav)
    for i, text in enumerate(sentences):
        path = os.path.join(eval_results_dir,
                            "%s-eval-%03d.wav" % (ckpt_step, i))
        print('[{:<10}]: {}'.format('processing', path))
        mel_spec = syner.synthesize_spectrograms(
            [text], [speaker_embed])[0]  # batch synthesize
        print('[{:<10}]:'.format('text:'), text)
        # print(np.shape(mel_spec))
        wav = syner.griffin_lim(mel_spec)
        audio.save_wav(wav, path, hparams.sample_rate)
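
This variant converts the Chinese prompts to space-separated pinyin before synthesis through a project-specific get_pinyin helper. A rough stand-in, assuming the common pypinyin package instead of that helper, might look like:

from pypinyin import lazy_pinyin, Style

def text_to_pinyin(sentence):
    # Tone-numbered pinyin syllables joined by spaces, e.g. "xin1 shi4 jie4 ..."
    return ' '.join(lazy_pinyin(sentence, style=Style.TONE3))

print(text_to_pinyin('新世界百货望京店'))
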
Example #9
def train(log_dir, args, hparams):
    save_dir = os.path.join(log_dir, "taco_pretrained")
    plot_dir = os.path.join(log_dir, "plots")
    wav_dir = os.path.join(log_dir, "wavs")
    mel_dir = os.path.join(log_dir, "mel-spectrograms")
    eval_dir = os.path.join(log_dir, "eval-dir")
    eval_plot_dir = os.path.join(eval_dir, "plots")
    eval_wav_dir = os.path.join(eval_dir, "wavs")
    tensorboard_dir = os.path.join(log_dir, "tacotron_events")
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(mel_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(eval_plot_dir, exist_ok=True)
    os.makedirs(eval_wav_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)

    checkpoint_fpath = os.path.join(save_dir, "tacotron_model.ckpt")

    log("Checkpoint path: {}".format(checkpoint_fpath))
    log("Using model: Tacotron")
    log(hparams_debug_string())

    # Start by setting a seed for repeatability
    tf.set_random_seed(hparams.tacotron_random_seed)

    # Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope("datafeeder") as scope:
        feeder = Feeder(coord, hparams)

    # Set up model:
    global_step = tf.Variable(0, name="global_step", trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, global_step)
    #eval_model = model_test_mode(args, feeder, hparams, global_step)

    # Book keeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=2)

    log("Tacotron training set to a maximum of {} steps".format(
        args.tacotron_train_steps))

    # Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    # Train
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)

            sess.run(tf.global_variables_initializer())

            # saved model restoring
            if args.restore:
                # Restore saved model if the user requested it, default = True
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

                    if checkpoint_state and checkpoint_state.model_checkpoint_path:
                        log("Loading checkpoint {}".format(
                            checkpoint_state.model_checkpoint_path),
                            slack=True)
                        saver.restore(sess,
                                      checkpoint_state.model_checkpoint_path)

                    else:
                        log("No model to load at {}".format(save_dir),
                            slack=True)
                        saver.save(sess,
                                   checkpoint_fpath,
                                   global_step=global_step)

                except tf.errors.OutOfRangeError as e:
                    log("Cannot restore checkpoint: {}".format(e), slack=True)
            else:
                log("Starting new training!", slack=True)
                saver.save(sess, checkpoint_fpath, global_step=global_step)

            # initializing feeder
            feeder.start_threads(sess)
            print("Feeder is intialized and model is ready to train.......")

            # Training loop
            while not coord.should_stop() and step < args.tacotron_train_steps:
                start_time = time.time()
                step, loss, opt = sess.run(
                    [global_step, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = "Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]".format(
                    step, time_window.average, loss, loss_window.average)
                log(message,
                    end="\r",
                    slack=(step % args.checkpoint_interval == 0))
                print(message)

                if loss > 100 or np.isnan(loss):
                    log("Loss exploded to {:.5f} at step {}".format(
                        loss, step))
                    raise Exception("Loss exploded")

                if step % args.summary_interval == 0:
                    log("\nWriting summary at step {}".format(step))
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.eval_interval == 0:
                    pass


                if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or \
                        step == 300:
                    # Save model and current global step
                    saver.save(sess, checkpoint_fpath, global_step=global_step)

                    log("\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform.."
                        )
                    input_seq, mel_prediction, alignment, target, target_length = sess.run(
                        [
                            model.tower_inputs[0][0],
                            model.tower_mel_outputs[0][0],
                            model.tower_alignments[0][0],
                            model.tower_mel_targets[0][0],
                            model.tower_targets_lengths[0][0],
                        ])

                    # save predicted mel spectrogram to disk (debug)
                    mel_filename = "mel-prediction-step-{}.npy".format(step)
                    np.save(os.path.join(mel_dir, mel_filename),
                            mel_prediction.T,
                            allow_pickle=False)

                    # save griffin lim inverted wav for debug (mel -> wav)
                    wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
                    audio.save_wav(
                        wav,
                        os.path.join(wav_dir,
                                     "step-{}-wave-from-mel.wav".format(step)),
                        sr=hparams.sample_rate)

                    # save alignment plot to disk (control purposes)
                    plot.plot_alignment(
                        alignment,
                        os.path.join(plot_dir,
                                     "step-{}-align.png".format(step)),
                        title="{}, {}, step={}, loss={:.5f}".format(
                            "Tacotron", time_string(), step, loss),
                        max_len=target_length // hparams.outputs_per_step)
                    # save real and predicted mel-spectrogram plot to disk (control purposes)
                    plot.plot_spectrogram(
                        mel_prediction,
                        os.path.join(
                            plot_dir,
                            "step-{}-mel-spectrogram.png".format(step)),
                        title="{}, {}, step={}, loss={:.5f}".format(
                            "Tacotron", time_string(), step, loss),
                        target_spectrogram=target,
                        max_len=target_length)

                if step % args.embedding_interval == 0 or step == args.tacotron_train_steps or step == 1:
                    # Get current checkpoint state
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

            log("Tacotron training complete after {} global steps!".format(
                args.tacotron_train_steps),
                slack=True)
            return save_dir

        except Exception as e:
            log("Exiting due to exception: {}".format(e), slack=True)
            traceback.print_exc()
            coord.request_stop(e)
Example #10
def run_eval_part1(args):
    speaker_enc_ckpt = args.speaker_encoder_checkpoint
    syn_ckpt = args.syn_checkpoint
    speaker_name = args.speaker_name
    eval_results_dir = os.path.join(args.eval_results_dir, speaker_name)
    if not os.path.exists(eval_results_dir):
        os.makedirs(eval_results_dir)
    speaker_audio_dirs = {
        "speaker_name": ["speaker_audio_1.wav", "speaker_audio_2.wav"],
        "vctk_p225": [
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p225/p225_001.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p225/p225_002.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p225/p225_003.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p225/p225_004.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p225/p225_005.wav",
        ],
        "vctk_p226": [
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p226/p226_001.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p226/p226_002.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p226/p226_003.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p226/p226_004.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p226/p226_005.wav",
        ],
        "vctk_p227": [
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p227/p227_001.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p227/p227_002.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p227/p227_003.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p227/p227_004.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p227/p227_005.wav",
        ],
        "vctk_p228": [
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p228/p228_001.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p228/p228_002.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p228/p228_003.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p228/p228_004.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p228/p228_005.wav",
        ],
        "biaobei_speaker": [
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000001.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000002.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000003.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000004.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000005.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000006.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000007.wav",
        ],
        "aishell_C0002": [
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0002/IC0002W0001.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0002/IC0002W0002.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0002/IC0002W0003.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0002/IC0002W0004.wav",
        ],
        "aishell_C0896": [
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0896/IC0896W0001.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0896/IC0896W0002.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0896/IC0896W0003.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0896/IC0896W0004.wav",
        ],
    }[speaker_name]
    sentences = [
        "THAT MATTER OF TROY AND ACHILLES WRATH ONE TWO THREE RATS",
        "ENDED THE QUEST OF THE HOLY GRAAL JERUSALEM A HANDFUL OF ASHES BLOWN BY THE WIND EXTINCT",
        "She can scoop these things into three red bags",
        "and we will go meet her Wednesday at the train station",
        "This was demonstrated in a laboratory experiment with rats."
    ]

    sentences = [sen.upper() for sen in sentences]

    sentences.append(
        "This was demonstrated in a laboratory experiment with rats")

    print('eval part1> model: %s.' % syn_ckpt)
    syner = syn_infer.Synthesizer(syn_ckpt)
    encoder_infer.load_model(speaker_enc_ckpt)

    ckpt_step = re.compile(r'.*?\.ckpt\-([0-9]+)').match(syn_ckpt)
    ckpt_step = "step-" + str(ckpt_step.group(1)) if ckpt_step else syn_ckpt

    speaker_audio_wav_list = [
        encoder_audio.preprocess_wav(wav_dir) for wav_dir in speaker_audio_dirs
    ]
    speaker_audio_wav = np.concatenate(speaker_audio_wav_list)
    print(
        os.path.join(eval_results_dir,
                     '%s-000_refer_speaker_audio.wav' % speaker_name))
    audio.save_wav(
        speaker_audio_wav,
        os.path.join(eval_results_dir,
                     '%s-000_refer_speaker_audio.wav' % speaker_name),
        hparams.sample_rate)
    speaker_embed = encoder_infer.embed_utterance(speaker_audio_wav)
    for i, text in enumerate(sentences):
        path = os.path.join(
            eval_results_dir,
            "%s-%s-eval-%03d.wav" % (speaker_name, ckpt_step, i))
        print('[{:<10}]: {}'.format('processing', path))
        mel_spec = syner.synthesize_spectrograms(
            [text], [speaker_embed])[0]  # batch synthesize
        print('[{:<10}]:'.format('text:'), text)
        # print(np.shape(mel_spec))
        wav = syner.griffin_lim(mel_spec)
        audio.save_wav(wav, path, hparams.sample_rate)