Code example #1
def audio(outputs, res_pth):
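    # outputs holds (source waveform, result waveform); both are saved using res_pth as the filename prefix.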
    src = outputs[0]
    res = outputs[1]

    # save audio
    save_wav(src, res_pth + '_src.wav')
    save_wav(res, res_pth + '_res.wav')
Code example #2
def extract_mel(wav_filename, out_wav_path, out_dir, key, hparams, args):
    if not os.path.exists(wav_filename):
        print("Wav file {} doesn't exist.".format(wav_filename))
        return None

    wav = audio.load_wav(wav_filename, sr=hparams.sample_rate)
    # Process wav samples
    wav = audio.trim_silence(wav, hparams)
    n_samples = len(wav)

    # Extract mel spectrogram
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    n_frames = mel_spectrogram.shape[1]
    if n_frames > hparams.max_acoustic_length:
        print(
            "Ignore wav {} because the frame number {} is too long (Max {} frames in hparams.yaml)."
            .format(wav_filename, n_frames, hparams.max_acoustic_length))
        return None

    # Align features
    desired_frames = int(min(n_samples / hparams.hop_size, n_frames))
    wav = wav[:desired_frames * hparams.hop_size]
    mel_spectrogram = mel_spectrogram[:, :desired_frames]
    n_samples = wav.shape[0]
    n_frames = mel_spectrogram.shape[1]
    assert (n_samples / hparams.hop_size == n_frames)

    # Save intermediate acoustic features
    mel_filename = os.path.join(out_dir, key + '.npy')
    np.save(mel_filename, mel_spectrogram.T, allow_pickle=False)
    audio.save_wav(wav, out_wav_path, hparams)

    return (wav_filename, mel_filename, n_samples, n_frames)
Code example #3
def save_states(global_step,
                mel_outputs,
                linear_outputs,
                attn,
                y,
                checkpoint_dir=None):
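    # Dump the alignment, the predicted and target spectrograms, and a waveform reconstructed from the predicted spectrogram for one sample at this step.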

    idx = 1  # idx = np.random.randint(0, len(mel_outputs))

    # Alignment
    path = os.path.join(checkpoint_dir,
                        "step{}_alignment.png".format(global_step))
    alignment = attn[idx].cpu().data.numpy()  # alignment = attn[idx].cpu().data.numpy()[:, :input_length]
    plot_alignment(alignment.T,
                   path,
                   info="tacotron, step={}".format(global_step))

    # Predicted spectrogram
    path = os.path.join(checkpoint_dir,
                        "step{}_predicted_spectrogram.png".format(global_step))
    linear_output = linear_outputs[idx].cpu().data.numpy()
    plot_spectrogram(linear_output, path)

    # Predicted audio signal
    signal = audio.inv_spectrogram(linear_output.T)
    path = os.path.join(checkpoint_dir,
                        "step{}_predicted.wav".format(global_step))
    audio.save_wav(signal, path)

    # Target spectrogram
    path = os.path.join(checkpoint_dir,
                        "step{}_target_spectrogram.png".format(global_step))
    linear_output = y[idx].cpu().data.numpy()
    plot_spectrogram(linear_output, path)
Code example #4
    def synthesize(self, inputs):
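        # Convert each input text to a padded ID sequence, run the synthesis graph, and return one in-memory WAV (io.BytesIO) per input.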
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        seq_input = [text_to_sequence(j, cleaner_names) for j in inputs]
        seq_length = [len(j) for j in seq_input]
        max_len = max(seq_length)
        inputs = [_pad_input(j, max_len) for j in seq_input]
        seq = np.stack(inputs)  # np.stack expects a sequence, not a generator

        # seq = text_to_sequence(text, cleaner_names)
        if not self.model_filename.endswith('.pb'):
            feed_dict = {
                self.model.inputs: np.asarray(seq, dtype=np.int32),
                self.model.input_lengths: np.asarray(seq_length, dtype=np.int32)
            }
        else:
            feed_dict = {
                self.inputs: np.asarray(seq, dtype=np.int32),
                self.input_lengths: np.asarray(seq_length, dtype=np.int32)
            }

        wav = self.session.run(self.wav_output, feed_dict=feed_dict)

        output = []
        print('wav.shape:', wav.shape)
        for wav_index in range(wav.shape[0]):
            wav_index_temp = audio.inv_preemphasis(wav[wav_index])

            wav_index_temp = wav_index_temp[:audio.find_endpoint(wav_index_temp)]
            # wav_index_temp = vad_check(wav_index_temp, hparams.sample_rate)

            out = io.BytesIO()
            audio.save_wav(wav_index_temp, out)
            output.append(out)
        return output
Code example #5
def save_and_plot_fn(args, log_dir, step, loss, prefix):
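    # Reconstruct the waveform from the predicted spectrogram, save it, and plot the alignment (decoding jamo when the Korean cleaner is used).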
    idx, (seq, spec, align) = args

    audio_path = os.path.join(
        log_dir, '{}-step-{:09d}-audio{:03d}.wav'.format(prefix, step, idx))
    align_path = os.path.join(
        log_dir, '{}-step-{:09d}-align{:03d}.png'.format(prefix, step, idx))

    waveform = inv_spectrogram(spec.T, hparams)
    save_wav(waveform, audio_path, hparams.sample_rate)

    info_text = 'step={:d}, loss={:.5f}'.format(step, loss)
    if 'korean_cleaners' in [x.strip() for x in hparams.cleaners.split(',')]:
        log('Training Korean: using jamo')
        plot.plot_alignment(align,
                            align_path,
                            info=info_text,
                            text=sequence_to_text(seq,
                                                  skip_eos_and_pad=True,
                                                  combine_jamo=True),
                            isKorean=True)
    else:
        log('Training non-Korean: not using jamo')
        plot.plot_alignment(align,
                            align_path,
                            info=info_text,
                            text=sequence_to_text(seq,
                                                  skip_eos_and_pad=True,
                                                  combine_jamo=False),
                            isKorean=False)
Code example #6
def main():
    data_folder = "data"
    wavs = [
        os.path.join(data_folder, file[:-4]) for file in os.listdir(data_folder)
        if file.endswith(".wav")
    ]
    outputs_lws = [file + ".lws.gen.wav" for file in wavs]
    wavs = [
        audio.load_wav(wav_path + ".wav", hparams.sample_rate)
        for wav_path in wavs
    ]

    lws_processor = lws.lws(
        512, 128, mode="speech")  # 512: window length; 128: window shift
    i = 0
    for x in wavs:
        X = lws_processor.stft(x)  # where x is a single-channel waveform
        X0 = np.abs(X)  # Magnitude spectrogram
        print('{:6}: {:5.2f} dB'.format('Abs(X)',
                                        lws_processor.get_consistency(X0)))
        X1 = lws_processor.run_lws(
            X0
        )  # reconstruction from magnitude (in general, one can reconstruct from an initial complex spectrogram)
        print(X1.shape)
        print('{:6}: {:5.2f} dB'.format('LWS',
                                        lws_processor.get_consistency(X1)))
        print(X1.shape)
        wav = lws_processor.istft(X1).astype(np.float32)

        audio.save_wav(wav, outputs_lws[i])
        i += 1
Code example #7
def audio(output, pth):
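    # Invert the mel outputs before and after the postnet to waveforms and save both next to pth.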
    mel_outputs, mel_outputs_postnet, _ = output
    wav = inv_melspectrogram(to_arr(mel_outputs[0]))
    wav_postnet = inv_melspectrogram(to_arr(mel_outputs_postnet[0]))
    save_wav(wav, pth + '.wav')
    save_wav(wav_postnet, pth + '_post.wav')
    print('wav saved to:', pth + '.wav')
    print('postnet wav saved to:', pth + '_post.wav')
Code example #8
def save_current_model(args, checkpoint_path, global_step, hparams, loss,
                       model, plot_dir, saver, sess, step, wav_dir):
    # Save model and current global step
    saver.save(sess, checkpoint_path, global_step=global_step)
    log('\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..'
        )
    input_seq, mel_prediction, linear_prediction, attention_mask_sample, targets_mel, target_length, linear_target = sess.run(
        [
            model.inputs[0],
            model.post_net_predictions[0],
            model.mag_pred[0],
            model.alignments[0],
            model.targets_mel[0],
            model.targets_length[0],
            model.targets_mag[0],
        ])
    alignments, alignment_titles = get_alignments(attention_mask_sample)
    # save griffin lim inverted wav for debug (linear -> wav)
    wav = audio.inv_linear_spectrogram(linear_prediction.T, hparams)
    audio.save_wav(wav,
                   os.path.join(wav_dir, '{}-linear.wav'.format(step)),
                   sr=hparams.sample_rate)
    # Save real and predicted linear-spectrogram plot to disk (control purposes)
    plot.plot_spectrogram(
        linear_prediction,
        os.path.join(plot_dir, '{}-linear-spectrogram.png'.format(step)),
        title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(),
                                                    step, loss),
        target_spectrogram=linear_target,
        max_len=target_length,
        auto_aspect=True)
    # save griffin lim inverted wav for debug (mel -> wav)
    wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
    audio.save_wav(wav,
                   os.path.join(wav_dir, '{}-mel.wav'.format(step)),
                   sr=hparams.sample_rate)
    # save alignment plot to disk (control purposes)
    for i in range(len(alignments)):
        plot.plot_alignment(
            alignments[i],
            os.path.join(plot_dir,
                         '{}_{}-align.png'.format(step, alignment_titles[i])),
            title='{}, {}, step={}, loss={:.5f}'.format(
                args.model, time_string(), step, loss),
            max_len=target_length // hparams.reduction_factor)
    # save real and predicted mel-spectrogram plot to disk (control purposes)
    plot.plot_spectrogram(mel_prediction,
                          os.path.join(plot_dir,
                                       '{}-mel-spectrogram.png'.format(step)),
                          title='{}, {}, step={}, loss={:.5f}'.format(
                              args.model, time_string(), step, loss),
                          target_spectrogram=targets_mel,
                          max_len=target_length)
    log('Input at step {}: {}'.format(step, sequence_to_text(input_seq)))
Code example #9
File: generate.py Project: taktak1/GAN-TTS
def synthesis(args):
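    # Load all mel conditions, zero-pad them to the longest, run the generator in one batch with sampled noise z, then save target/generated wav pairs.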

    model = create_model(args)
    if args.resume is not None:
        attempt_to_restore(model, args.resume, args.use_cuda)

    device = torch.device("cuda" if args.use_cuda else "cpu")

    output_dir = "samples"
    os.makedirs(output_dir, exist_ok=True)

    lists = []
    for filename in os.listdir(os.path.join(args.input, 'mel')):
        lists.append(filename)
    start = time.time()
    conditions = [
        np.load(os.path.join(args.input, 'mel', filename))
        for filename in lists
    ]
    lengths = [condition.shape[0] for condition in conditions]
    max_len = max(lengths)
    conditions = [
        np.concatenate(
            (condition,
             np.zeros((max_len - condition.shape[0], condition.shape[1]))),
            axis=0) for condition in conditions
    ]

    conditions = np.stack(conditions)
    conditions = torch.FloatTensor(conditions)
    conditions = conditions.transpose(1, 2).to(device)
    batch_size = conditions.size()[0]
    z = torch.randn(batch_size, args.z_dim).to(device).normal_(0.0, 0.6)
    print(conditions.shape)
    audios = model(conditions, z)
    audios = audios.cpu().squeeze().detach().numpy()
    print(audios.shape)
    for (i, filename) in enumerate(lists):
        name = filename.split('.')[0]
        sample = np.load(os.path.join(args.input, 'audio', filename))
        sample = mu_law_decode(mu_law_encode(sample))
        save_wav(np.squeeze(sample),
                 '{}/{}_target.wav'.format(output_dir, name))
        save_wav(
            np.asarray(audios[i])[:len(sample)],
            '{}/{}.wav'.format(output_dir, name))
    print("Time used: {:.3f}".format(time.time() - start))
Code example #10
def test_synth(model, step, dst_path):
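    # Synthesize every non-empty line of test.txt (characters mapped to IDs, unknown ones to 'UNK') and write one wav per utterance.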

    f = open('test.txt')
    for line in f:
        if len(line) > 2:
            line = line.split('\n')[0]
            fname = line.split()[0]
            content = line.split()[1:]
            content = ' '.join(k for k in content)
            content = re.sub(r'[^\w\s]', '', content)
            text = ' '.join(
                str(charids[k.lower()] if k.lower() in
                    charids.keys() else charids['UNK']) for k in content)
            waveform, alignment, _ = tts(model, text.split())
            dst_wav_path = join(dst_path, "{}_step{}.wav".format(fname, step))
            audio.save_wav(waveform, dst_wav_path)

    model.train()
Code example #11
File: synthesize.py Project: ishine/byte2speech
    def save_i(i):
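        # Save the generated mel (as .npy), its waveform (optionally silence-trimmed), a mel plot, and the encoder-decoder attention plots for sample i.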
        try:
            name = names[i]
            mel = mel_aft[i][:generated_lengths[i]]
            np.save(os.path.join(output_dir, '%s.npy' % name), mel)
            wav = mel2wav(mel)
            save_wav(wav, os.path.join(output_dir, '%s.wav' % name))
            if save_trimmed_wave:
                wav_trim = trim_silence_intervals(wav)
                save_wav(wav_trim, os.path.join(output_dir, '%s_trim.wav' % name))
            plot_mel(os.path.join(output_dir, '%s_mel.png' % name), mel)

            if n_plot_alignment is None or i < n_plot_alignment:
                aligns = [a[i].transpose([0, 2, 1]) for a in alignments["encdec"]]
                plot_attn(aligns, os.path.join(output_dir, '%s_align.png' % (name)),
                          enc_length=input_lengths[i], dec_length=generated_lengths[i])

        except Exception:
            logging.error('Fail to produce eval output: ' + names[i])
            logging.error(traceback.format_exc())
Code example #12
def synthesis(args):

    model = create_model(args)
    if args.resume is not None:
        attempt_to_restore(model, args.resume, args.use_cuda)
    model.after_update()

    output_dir = "out"
    os.makedirs(output_dir, exist_ok=True)

    lists = []
    for filename in os.listdir(os.path.join(args.input, 'mel')):
        lists.append(filename)
    start = time.time()
    conditions = [
        np.load(os.path.join(args.input, 'mel', filename))
        for filename in lists
    ]
    lengths = [condition.shape[0] for condition in conditions]
    max_len = max(lengths)
    conditions = [
        np.concatenate(
            (condition,
             np.zeros((max_len - condition.shape[0], condition.shape[1]))),
            axis=0) for condition in conditions
    ]

    conditions = np.stack(conditions)
    conditions = torch.FloatTensor(conditions)
    audios = model.generate(conditions)
    print(audios.shape)
    for (i, filename) in enumerate(lists):
        name = filename.split('.')[0]
        sample = np.load(os.path.join(args.input, 'audio', filename))
        save_wav(np.squeeze(sample),
                 '{}/{}_target.wav'.format(output_dir, name))
        save_wav(
            np.asarray(audios[i])[:len(sample)],
            '{}/{}.wav'.format(output_dir, name))
    print("Time used: {:.3f}".format(time.time() - start))
Code example #13
def synthesis(args):

    model = create_model(args)
    model.eval()

    if args.resume is not None:
        attempt_to_restore(model, args.resume, args.use_cuda)

    device = torch.device("cuda" if args.use_cuda else "cpu")
    model.to(device)
    model.remove_weight_norm()

    output_dir = "samples"
    target_dir = os.path.join(output_dir, "target")
    predict_dir = os.path.join(output_dir, "predict")
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(target_dir, exist_ok=True)
    os.makedirs(predict_dir, exist_ok=True)

    avg_rtf = []
    for filename in os.listdir(os.path.join(args.input, 'mel')):
        start = time.time()
        conditions = np.load(os.path.join(args.input, 'mel', filename))
        conditions = torch.FloatTensor(conditions).unsqueeze(0)
        conditions = conditions.transpose(1, 2).to(device)
        audio = model(conditions)
        audio = audio.cpu().squeeze().detach().numpy()
        print(audio.shape)
        name = filename.split('.')[0]
        sample = np.load(os.path.join(args.input, 'audio', filename))
        save_wav(np.squeeze(sample),
                 '{}/{}_target.wav'.format(target_dir, name))
        save_wav(np.asarray(audio), '{}/{}.wav'.format(predict_dir, name))
        time_used = time.time() - start
        rtf = time_used / (len(audio) / 16000)
        avg_rtf.append(rtf)
        print("Time used: {:.3f}, RTF: {:.4f}".format(time_used, rtf))
    print("Average RTF: {:.3f}".format(sum(avg_rtf) / len(avg_rtf)))
Code example #14
File: generate.py Project: willawebb/GANTTSResults
def synthesis(args):

    model = create_model(args)
    if args.resume is not None:
        attempt_to_restore(model, args.resume, args.use_cuda)

    device = torch.device("cuda" if args.use_cuda else "cpu")
    model.to(device)

    output_dir = "samples"
    os.makedirs(output_dir, exist_ok=True)

    avg_rtf = []
    for filename in os.listdir(os.path.join(args.input, 'mel')):
        start = time.time()
        conditions = np.load(os.path.join(args.input, 'mel', filename))
        conditions = torch.FloatTensor(conditions).unsqueeze(0)
        conditions = conditions.transpose(1, 2).to(device)

        batch_size = conditions.size()[0]
        z = torch.randn(batch_size, args.z_dim).to(device).normal_(0.0, 1.0)
        audios = model(conditions, z)
        audios = audios.cpu().squeeze().detach().numpy()
        print(audios.shape)
        name = filename.split('.')[0]
        sample = np.load(os.path.join(args.input, 'audio', filename))
        sample = mu_law_decode(mu_law_encode(sample))
        save_wav(np.squeeze(sample),
                 '{}/{}_target.wav'.format(output_dir, name))
        save_wav(np.asarray(audios), '{}/{}.wav'.format(output_dir, name))
        time_used = time.time() - start
        rtf = time_used / (len(audios) / 24000)
        avg_rtf.append(rtf)
        print("Time used: {:.3f}, RTF: {:.4f}".format(time_used, rtf))

    print("Average RTF: {:.3f}".format(sum(avg_rtf) / len(avg_rtf)))
Code example #15
def train(log_dir, args):
    checkpoint_path = os.path.join(hdfs_ckpts, log_dir, 'model.ckpt')
    log(hp.to_string(), is_print=False)
    log('Loading training data from: %s' % args.tfr_dir)
    log('Checkpoint path: %s' % checkpoint_path)
    log('Using model: sygst tacotron2')

    tf_dset = TFDataSet(hp, args.tfr_dir)
    feats = tf_dset.get_train_next()
    # Set up model:
    global_step = tf.Variable(0, name='global_step', trainable=False)
    training = tf.placeholder_with_default(True, shape=(), name='training')
    with tf.name_scope('model'):
        model = Tacotron2SYGST(hp)
        model(feats['inputs'],
              mel_inputs=feats['mel_targets'],
              spec_inputs=feats['linear_targets'],
              spec_lengths=feats['spec_lengths'],
              ref_inputs=feats['mel_targets'],
              ref_lengths=feats['spec_lengths'],
              arousal_labels=feats['soft_arousal_labels'],
              valence_labels=feats['soft_valance_labels'],
              training=training)
        """
        text_x, mel_x, spec_x, spec_len, aro, val = debug_data(2, 5, 10)
        model(text_x, mel_x, spec_x, spec_len, mel_x, spec_len, aro, val, training=training)
        """
        model.add_loss()
        model.add_optimizer(global_step)
        stats = model.add_stats()

    # Bookkeeping:
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=50, keep_checkpoint_every_n_hours=2)

    # Train!
    config = tf.ConfigProto(allow_soft_placement=True,
                            gpu_options=tf.GPUOptions(allow_growth=True))
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            sess.run(tf.global_variables_initializer())
            if args.restore_step:
                # Restore from a checkpoint if the user requested it.
                restore_path = '%s-%s' % (checkpoint_path, args.restore_step)
                saver.restore(sess, restore_path)
                log('Resuming from checkpoint: %s' % restore_path, slack=True)
            else:
                log('Starting a new training run ...', slack=True)
            """
            fetches = [global_step, model.optimize, model.loss, model.mel_loss, model.spec_loss,
                       model.stop_loss, model.arousal_loss, model.valence_loss, model.mel_grad_norms_max,
                       model.spec_grad_norms_max, model.stop_grad_norms_max, model.aro_grad_norms_max, model.val_grad_norms_max]
            """
            fetches = [
                global_step, model.optimize, model.loss, model.mel_loss,
                model.spec_loss, model.stop_loss, model.arousal_loss,
                model.valence_loss
            ]
            for _ in range(_max_step):
                start_time = time.time()
                sess.run(debug.get_ops())
                # step, _, loss, mel_loss, spec_loss, stop_loss, aro_loss, val_loss, mel_g, spec_g, stop_g, aro_g, val_g = sess.run(fetches)
                step, _, loss, mel_loss, spec_loss, stop_loss, aro_loss, val_loss = sess.run(
                    fetches)
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                """
                message = 'Step %-7d [%.3f sec/step,ml=%.3f,spl=%.3f,sl=%.3f,al=%.3f,vl=%.3f,mg=%.4f,spg=%.4f,sg=%.4f,ag=%.4f,vg=%.4f]' % (
                    step, time_window.average, mel_loss, spec_loss, stop_loss, aro_loss, val_loss, mel_g, spec_g, stop_g, aro_g, val_g)
                """
                message = 'Step %-7d [%.3f sec/step,ml=%.3f,spl=%.3f,sl=%.3f,al=%.3f,vl=%.3f]' % (
                    step, time_window.average, mel_loss, spec_loss, stop_loss,
                    aro_loss, val_loss)
                log(message, slack=(step % args.checkpoint_interval == 0))

                if loss > 100 or math.isnan(loss):
                    log('Loss exploded to %.5f at step %d!' % (loss, step),
                        slack=True)
                    raise Exception('Loss Exploded')

                if step % args.summary_interval == 0:
                    log('Writing summary at step: %d' % step)
                    try:
                        summary_writer.add_summary(sess.run(stats), step)
                    except Exception as e:
                        log(f'summary failed and ignored: {str(e)}')

                if step % args.checkpoint_interval == 0:
                    log('Saving checkpoint to: %s-%d' %
                        (checkpoint_path, step))
                    saver.save(sess, checkpoint_path, global_step=step)
                    log('Saving audio and alignment...')
                    gt_mel, gt_spec, seq, mel, spec, align = sess.run([
                        model.mel_targets[0], model.spec_targets[0],
                        model.text_targets[0], model.mel_outputs[0],
                        model.spec_outputs[0], model.alignment_outputs[0]
                    ])
                    text = sequence_to_text(seq)
                    wav = audio.inv_spectrogram(hp, spec.T)
                    wav_path = os.path.join(log_dir,
                                            'step-%d-audio.wav' % step)
                    mel_path = os.path.join(log_dir, 'step-%d-mel.png' % step)
                    spec_path = os.path.join(log_dir,
                                             'step-%d-spec.png' % step)
                    align_path = os.path.join(log_dir,
                                              'step-%d-align.png' % step)
                    info = '%s, %s, step=%d, loss=%.5f\n %s' % (
                        args.model, time_string(), step, loss, text)
                    plot.plot_alignment(align, align_path, info=info)
                    plot.plot_mel(mel, mel_path, info=info, gt_mel=gt_mel)
                    plot.plot_mel(spec, spec_path, info=info, gt_mel=gt_spec)
                    audio.save_wav(hp, wav, wav_path)
                    log('Input: %s' % text)

        except Exception as e:
            log('Exiting due to exception: %s' % e, slack=True)
            traceback.print_exc()
Code example #16
            #text = ' '.join(k for k in line.decode("utf-8").split()[1:])
            #text = '< ' + text + ' >'
            #text = [phids[l] for l in text.split()]
            text, qF0s = get_textNqF0s(line, phids)

            # Generating from original speaker
            spk = speakers_dict[fname[0]]
            waveform, alignment, _ = tts(model, text, spk, qF0s)
            fname_generated = '_'.join(k for k in fname[1:])
            fname_generated = fname_generated + '_generated'
            dst_wav_path = join(dst_dir, "{}{}.wav".format(fname_generated, file_name_suffix))
            dst_alignment_path = join(dst_dir, "{}_alignment.png".format(fname_generated))
            plot_alignment(alignment.T, dst_alignment_path,
                           info="tacotron, {}".format(checkpoint_path))
            audio.save_wav(waveform, dst_wav_path)

            # Generating from a different speaker
            spk = np.random.randint(len(speakers))
            #fname = fname.split('_')
            #fname[0] = ids2speakers[spk]
            fname_transferred = '_'.join(k for k in fname[1:])
            fname_transferred = fname_transferred + '_transferred'
            print("I picked a random number as ", spk, " the corresponding speaker from the dictionary is ", ids2speakers[spk], " the filename I am storing is ", fname_transferred)
            print(text, fname_transferred)
            waveform, alignment, _ = tts(model, text, spk, qF0s)
            dst_wav_path = join(dst_dir, "{}{}.wav".format(fname_transferred, file_name_suffix))
            dst_alignment_path = join(dst_dir, "{}_alignment.png".format(fname_transferred))
            plot_alignment(alignment.T, dst_alignment_path,
                           info="tacotron, {}".format(checkpoint_path))
            audio.save_wav(waveform, dst_wav_path)
Code example #17
def plot_graph_and_save_audio(args,
                              base_path=None,
                              start_of_sentence=None,
                              end_of_sentence=None,
                              pre_word_num=0,
                              post_word_num=0,
                              pre_surplus_idx=0,
                              post_surplus_idx=1,
                              use_short_concat=False,
                              use_manual_attention=False,
                              save_alignment=False,
                              librosa_trim=False,
                              attention_trim=False,
                              time_str=None,
                              isKorean=True):
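    # Plot the alignment, optionally trim the waveform using the attention and/or librosa, invert the linear spectrogram, and either save the result to disk (returning True) or return it as WAV bytes.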

    idx, (wav, alignment, path, text, sequence, mel) = args

    if base_path:
        plot_path = "{}/{}.png".format(base_path, get_time())
    elif path:
        plot_path = path.rsplit('.', 1)[0] + ".png"
    else:
        plot_path = None

    #plot_path = add_prefix(plot_path, time_str)
    if use_manual_attention:
        plot_path = add_postfix(plot_path, "manual")

    if plot_path:
        plot.plot_alignment(alignment, plot_path, text=text, isKorean=isKorean)

    if use_short_concat:
        wav = short_concat(wav, alignment, text, start_of_sentence,
                           end_of_sentence, pre_word_num, post_word_num,
                           pre_surplus_idx, post_surplus_idx)

    if attention_trim and end_of_sentence:
        end_idx_counter = 0
        attention_argmax = alignment.argmax(0)
        end_idx = min(len(sequence) - 1, max(attention_argmax))
        max_counter = min((attention_argmax == end_idx).sum(), 5)

        for jdx, attend_idx in enumerate(attention_argmax):
            if len(attention_argmax) > jdx + 1:
                if attend_idx == end_idx:
                    end_idx_counter += 1

                if attend_idx == end_idx and attention_argmax[jdx +
                                                              1] > end_idx:
                    break

                if end_idx_counter >= max_counter:
                    break
            else:
                break

        spec_end_idx = hparams.reduction_factor * jdx + 3
        wav = wav[:spec_end_idx]
        mel = mel[:spec_end_idx]

    audio_out = inv_linear_spectrogram(wav.T, hparams)

    if librosa_trim and end_of_sentence:
        yt, index = librosa.effects.trim(audio_out,
                                         frame_length=5120,
                                         hop_length=256,
                                         top_db=50)
        audio_out = audio_out[:index[-1]]
        mel = mel[:index[-1] // hparams.hop_size]

    if save_alignment:
        alignment_path = "{}/{}.npy".format(base_path, idx)
        np.save(alignment_path, alignment, allow_pickle=False)

    if path or base_path:
        if path:
            current_path = add_postfix(path, idx)
        elif base_path:
            current_path = plot_path.replace(".png", ".wav")

        save_wav(audio_out, current_path, hparams.sample_rate)

        #hccho
        mel_path = current_path.replace(".wav", ".npy")
        np.save(mel_path, mel)

        return True
    else:
        io_out = io.BytesIO()
        save_wav(audio_out, io_out, hparams.sample_rate)
        result = io_out.getvalue()
        return result
Code example #18
            plt.plot(h)
            plt.ylim(0, h.max())
            plt.xlim(0, len(h))
        plt.draw()
        plt.savefig("figures/genes_" + species_names[i] + ".png")
        plt.gcf().clear()

    print("Storing final generation...")
    filenames = []
    samples = []
    group_by_species = {}
    for i in range(len(GA.originals)):
        group_by_species[i] = [
            org for org in GA.curr_generation if org.species == i
        ]
    for i in range(len(GA.originals)):
        peers = group_by_species[i]
        for j in range(len(peers)):
            filenames.append("GA." + species_names[i][:-4] + "_" + str(j) +
                             ".wav")
            samples.append(peers[j].waveform)

    for gen, output in zip(samples, filenames):
        out = io.BytesIO()
        audio.save_wav(gen, out)

        with open("output/" + output, "wb") as f:
            f.write(out.getvalue())

    print("Program complete.")
Code example #19
def main():

    # CPU is faster here; setting this to GPU raises an error, and omitting tf.device makes it slower.
    with tf.device('/cpu:0'):
        config = get_arguments()
        started_datestring = "{0:%Y-%m-%dT%H-%M-%S}".format(datetime.now())
        logdir = os.path.join(config.logdir, 'generate', started_datestring)
        print('logdir0-------------' + logdir)

        if not os.path.exists(logdir):
            os.makedirs(logdir)

        load_hparams(hparams, config.checkpoint_dir)

        sess = tf.Session()
        scalar_input = hparams.scalar_input
        net = WaveNetModel(
            batch_size=config.batch_size,
            dilations=hparams.dilations,
            filter_width=hparams.filter_width,
            residual_channels=hparams.residual_channels,
            dilation_channels=hparams.dilation_channels,
            quantization_channels=hparams.quantization_channels,
            out_channels=hparams.out_channels,
            skip_channels=hparams.skip_channels,
            use_biases=hparams.use_biases,
            scalar_input=hparams.scalar_input,
            global_condition_channels=hparams.gc_channels,
            global_condition_cardinality=config.gc_cardinality,
            local_condition_channels=hparams.num_mels,
            upsample_factor=hparams.upsample_factor,
            legacy=hparams.legacy,
            residual_legacy=hparams.residual_legacy,
            train_mode=False
        )  # During training, global_condition_cardinality is inferred by the AudioReader, but here it must be passed in explicitly.

        if scalar_input:
            samples = tf.placeholder(tf.float32, shape=[net.batch_size, None])
        else:
            samples = tf.placeholder(
                tf.int32, shape=[net.batch_size, None]
            )  # samples: mu-law encoded values, before one-hot conversion. Shape: (batch_size, length)

        # The local condition should be (N, T, num_mels), but it is fed one step at a time, i.e. (N, 1, num_mels); squeezing gives (N, num_mels).
        upsampled_local_condition = tf.placeholder(
            tf.float32, shape=[net.batch_size, hparams.num_mels])

        next_sample = net.predict_proba_incremental(
            samples, upsampled_local_condition, [config.gc_id] * net.batch_size
        )  # Applies the Fast WaveNet Generation Algorithm (arXiv:1611.09482).

        # Build the upsampled local-condition data that will be fed into the upsampled_local_condition placeholder.
        print('logdir0-------------' + logdir)
        mel_input = np.load(config.mel)
        sample_size = mel_input.shape[0] * hparams.hop_size
        mel_input = np.tile(mel_input, (config.batch_size, 1, 1))
        with tf.variable_scope('wavenet', reuse=tf.AUTO_REUSE):
            upsampled_local_condition_data = net.create_upsample(
                mel_input, upsample_type=hparams.upsample_type)

        var_list = [
            var for var in tf.global_variables() if 'queue' not in var.name
        ]
        saver = tf.train.Saver(var_list)
        print('Restoring model from {}'.format(config.checkpoint_dir))

        load(saver, sess, config.checkpoint_dir)
        init_op = tf.group(tf.initialize_all_variables(),
                           net.queue_initializer)

        sess.run(init_op)  # Without this, the variables keep the values restored from the checkpoint.

        quantization_channels = hparams.quantization_channels
        if config.wav_seed:
            # If wav_seed is shorter than the receptive field, shouldn't it at least be padded? It is simply returned as-is, so a seed that is too short causes an error.
            seed = create_seed(config.wav_seed, hparams.sample_rate,
                               quantization_channels, net.receptive_field,
                               scalar_input)  # --> mu-law encoded
            if scalar_input:
                waveform = seed.tolist()
            else:
                waveform = sess.run(
                    seed).tolist()  # [116, 114, 120, 121, 127, ...]

            print('Priming generation...')
            # The very last sample is fed in the first iteration of the generation loop below.
            for i, x in enumerate(waveform[-net.receptive_field:-1]):
                if i % 100 == 0:
                    print('Priming sample {}/{}'.format(
                        i, net.receptive_field),
                          end='\r')
                sess.run(next_sample,
                         feed_dict={
                             samples:
                             np.array([x] * net.batch_size).reshape(
                                 net.batch_size, 1),
                             upsampled_local_condition:
                             np.zeros([net.batch_size, hparams.num_mels])
                         })
            print('Done.')
            waveform = np.array([waveform[-net.receptive_field:]] *
                                net.batch_size)
        else:
            # Silence with a single random sample at the end.
            if scalar_input:
                waveform = [0.0] * (net.receptive_field - 1)
                waveform = np.array(waveform * net.batch_size).reshape(
                    net.batch_size, -1)
                waveform = np.concatenate(
                    [
                        waveform, 2 * np.random.rand(net.batch_size).reshape(
                            net.batch_size, -1) - 1
                    ],
                    axis=-1)  # Append a random number in (-1, 1) at the end.
                # waveform: shape (batch_size, net.receptive_field)
            else:
                waveform = [quantization_channels / 2] * (
                    net.receptive_field - 1
                )  # Make it one sample shorter than the required receptive field, then append one random sample below.
                waveform = np.array(waveform * net.batch_size).reshape(
                    net.batch_size, -1)
                waveform = np.concatenate(
                    [
                        waveform,
                        np.random.randint(quantization_channels,
                                          size=net.batch_size).reshape(
                                              net.batch_size, -1)
                    ],
                    axis=-1)  # Before one-hot conversion. (batch_size, 5117)

        start_time = time.time()
        upsampled_local_condition_data = sess.run(
            upsampled_local_condition_data)
        last_sample_timestamp = datetime.now()
        for step in range(sample_size):  # Loop sample_size times to generate the desired length.

            window = waveform[:, -1:]  # Feed only the very last sample into samples. window: shape (N, 1)

            # Run the WaveNet to predict the next sample.

            # Non-fast mode: window looks like [128.0, 128.0, ..., 128.0, 178, 185]
            # Fast mode: window is a single number.
            prediction = sess.run(
                next_sample,
                feed_dict={
                    samples:
                    window,
                    upsampled_local_condition:
                    upsampled_local_condition_data[:, step, :]
                }
            )  # samples are mu-law encoded; they are converted to one-hot during the computation. --> (batch_size, 256)

            if scalar_input:
                sample = prediction  # Sampled from a logistic distribution, so there is randomness.
            else:
                # Scale prediction distribution using temperature.
                # If config.temperature == 1, the following just divides each element by the sum; softmax has already been applied, so the sum is 1 and the values do not change.
                # If config.temperature != 1, the log of each element is divided by the temperature and the result is rescaled to sum to 1.
                np.seterr(divide='ignore')
                scaled_prediction = np.log(
                    prediction
                ) / config.temperature  # No change when config.temperature == 1.
                scaled_prediction = (
                    scaled_prediction - np.logaddexp.reduce(
                        scaled_prediction, axis=-1, keepdims=True)
                )  # np.log(np.sum(np.exp(scaled_prediction)))
                scaled_prediction = np.exp(scaled_prediction)
                np.seterr(divide='warn')

                # Prediction distribution at temperature=1.0 should be unchanged after
                # scaling.
                if config.temperature == 1.0:
                    np.testing.assert_allclose(
                        prediction,
                        scaled_prediction,
                        atol=1e-5,
                        err_msg=
                        'Prediction scaling at temperature=1.0 is not working as intended.'
                    )

                # Because we sample instead of taking the argmax, the same input can produce different outputs.
                sample = [[
                    np.random.choice(np.arange(quantization_channels), p=p)
                ] for p in scaled_prediction]  # choose one sample per batch

            waveform = np.concatenate([waveform, sample],
                                      axis=-1)  #window.shape: (N,1)

            # Show progress only once per second.
            current_sample_timestamp = datetime.now()
            time_since_print = current_sample_timestamp - last_sample_timestamp
            if time_since_print.total_seconds() > 1.:
                duration = time.time() - start_time
                print('Sample {:<3d}/{:<3d}, ({:.3f} sec/step)'.format(
                    step + 1, sample_size, duration),
                      end='\r')
                last_sample_timestamp = current_sample_timestamp

        # Introduce a newline to clear the carriage return from the progress.
        print()

        # Save the result as a wav file.
        if hparams.input_type == 'raw':
            out = waveform[:, net.receptive_field:]
        elif hparams.input_type == 'mulaw':
            decode = mu_law_decode(samples,
                                   quantization_channels,
                                   quantization=False)
            out = sess.run(
                decode, feed_dict={samples: waveform[:, net.receptive_field:]})
        else:  # 'mulaw-quantize'
            decode = mu_law_decode(samples,
                                   quantization_channels,
                                   quantization=True)
            out = sess.run(
                decode, feed_dict={samples: waveform[:, net.receptive_field:]})

        # save wav

        for i in range(net.batch_size):
            config.wav_out_path = logdir + '/test-{}.wav'.format(i)
            mel_path = config.wav_out_path.replace(".wav", ".png")

            gen_mel_spectrogram = audio.melspectrogram(out[i], hparams).astype(
                np.float32).T
            audio.save_wav(out[i], config.wav_out_path,
                           hparams.sample_rate)  # save_wav modifies out[i] in place.

            plot.plot_spectrogram(gen_mel_spectrogram,
                                  mel_path,
                                  title='generated mel spectrogram',
                                  target_spectrogram=mel_input[i])
        print('Finished generating.')
Code example #20
def plot_graph_and_save_audio(args,
                              base_path=None,
                              start_of_sentence=None,
                              end_of_sentence=None,
                              pre_word_num=0,
                              post_word_num=0,
                              pre_surplus_idx=0,
                              post_surplus_idx=1,
                              use_short_concat=False,
                              save_alignment=False,
                              librosa_trim=False,
                              attention_trim=False,
                              time_str=None,
                              isKorean=True):

    idx, (wav, alignment, path, text, sequence, mel) = args

    if base_path:
        plot_path = "{}/{}.png".format(base_path, get_time())
    elif path:
        plot_path = path.rsplit('.', 1)[0] + ".png"
    else:
        plot_path = None

    if plot_path:
        plot.plot_alignment(alignment, plot_path, text=text, isKorean=isKorean)

    if use_short_concat:
        wav = short_concat(wav, alignment, text, start_of_sentence,
                           end_of_sentence, pre_word_num, post_word_num,
                           pre_surplus_idx, post_surplus_idx)

    if attention_trim and end_of_sentence:
        # If the attention has reached the end of the text, discard everything after that point.
        end_idx_counter = 0
        attention_argmax = alignment.argmax(0)  # alignment: (text length (encoder), target length (decoder)) ==> (target length (decoder),)
        end_idx = min(len(sequence) - 1, max(attention_argmax))
        # max_counter = min((attention_argmax == end_idx).sum(), 5) + 1
        # 2020-06-12: The logic above sets max_counter to the minimum of 5 and the number of entries in attention_argmax equal to end_idx (the actual end).
        #             Korean tends to hold the final sound for a long time, so instead of cutting at 5 we use the full length that was actually spoken, hence the logic below.
        #             (Why the original author cut at 5 is unknown.)
        max_counter = (attention_argmax == end_idx).sum()

        for jdx, attend_idx in enumerate(attention_argmax):
            if len(attention_argmax) > jdx + 1:
                if attend_idx == end_idx:
                    end_idx_counter += 1

                if attend_idx == end_idx and attention_argmax[jdx +
                                                              1] > end_idx:
                    break

                if end_idx_counter >= max_counter:
                    break
            else:
                break

        spec_end_idx = hparams.reduction_factor * jdx + 3
        wav = wav[:spec_end_idx]
        mel = mel[:spec_end_idx]

    audio_out = inv_linear_spectrogram(wav.T, hparams)

    if librosa_trim and end_of_sentence:
        yt, index = librosa.effects.trim(audio_out,
                                         frame_length=5120,
                                         hop_length=256,
                                         top_db=50)
        audio_out = audio_out[:index[-1]]
        mel = mel[:index[-1] // hparams.hop_size]

    if save_alignment:
        alignment_path = "{}/{}.npy".format(base_path, idx)
        np.save(alignment_path, alignment, allow_pickle=False)

    if path or base_path:
        if path:
            current_path = add_postfix(path, idx)
        elif base_path:
            current_path = plot_path.replace(".png", ".wav")

        save_wav(audio_out, current_path, hparams.sample_rate)

        #hccho
        mel_path = current_path.replace(".wav", ".npy")
        np.save(mel_path, mel)

        #return True
        return audio_out
    else:
        io_out = io.BytesIO()
        save_wav(audio_out, io_out, hparams.sample_rate)
        result = io_out.getvalue()

        return audio_out
Code example #21
File: train.py Project: samprate1st/Tacotron-3
def train(log_dir, args):
    save_dir = os.path.join(log_dir, 'pretrained/')
    checkpoint_path = os.path.join(save_dir, 'model.ckpt')
    input_path = os.path.join(args.base_dir, args.input)
    plot_dir = os.path.join(log_dir, 'plots')
    wav_dir = os.path.join(log_dir, 'wavs')
    mel_dir = os.path.join(log_dir, 'mel-spectrograms')
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(mel_dir, exist_ok=True)
    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Loading training data from: {}'.format(input_path))
    log('Using model: {}'.format(args.model))
    log(hparams_debug_string())

    #Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope('datafeeder') as scope:
        feeder = Feeder(coord, input_path, hparams)

    #Set up model:
    step_count = 0
    try:
        #simple text file to keep count of global step
        with open(os.path.join(log_dir, 'step_counter.txt'), 'r') as file:
            step_count = int(file.read())
    except:
        print(
            'no step_counter file found, assuming there is no saved checkpoint'
        )

    global_step = tf.Variable(step_count, name='global_step', trainable=False)
    with tf.variable_scope('model') as scope:
        model = create_model(args.model, hparams)
        model.initialize(feeder.inputs, feeder.input_lengths,
                         feeder.mel_targets, feeder.token_targets)
        model.add_loss()
        model.add_optimizer(global_step)
        stats = add_stats(model)

    #Book keeping
    step = 0
    save_step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=5)

    #Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    #Train
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

            #saved model restoring
            checkpoint_state = None  # avoid a NameError below when not restoring
            if args.restore:
                #Restore saved model if the user requested it, Default = True.
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)
                except tf.errors.OutOfRangeError as e:
                    log('Cannot restore checkpoint: {}'.format(e))

            if (checkpoint_state and checkpoint_state.model_checkpoint_path):
                log('Loading checkpoint {}'.format(
                    checkpoint_state.model_checkpoint_path))
                saver.restore(sess, checkpoint_state.model_checkpoint_path)

            else:
                if not args.restore:
                    log('Starting new training!')
                else:
                    log('No model to load at {}'.format(save_dir))

            #initiating feeder
            feeder.start_in_session(sess)

            #Training loop
            while not coord.should_stop():
                start_time = time.time()
                step, loss, opt = sess.run(
                    [global_step, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
                    step, time_window.average, loss, loss_window.average)
                log(message, end='\r')

                if loss > 100 or np.isnan(loss):
                    log('Loss exploded to {:.5f} at step {}'.format(
                        loss, step))
                    raise Exception('Loss exploded')

                if step % args.summary_interval == 0:
                    log('\nWriting summary at step: {}'.format(step))
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.checkpoint_interval == 0:
                    with open(os.path.join(log_dir, 'step_counter.txt'),
                              'w') as file:
                        file.write(str(step))
                    log('Saving checkpoint to: {}-{}'.format(
                        checkpoint_path, step))
                    saver.save(sess, checkpoint_path, global_step=step)
                    save_step = step

                    log('Saving alignment, Mel-Spectrograms and griffin-lim inverted waveform..'
                        )
                    input_seq, prediction, alignment, target = sess.run([
                        model.inputs[0],
                        model.mel_outputs[0],
                        model.alignments[0],
                        model.mel_targets[0],
                    ])
                    #save predicted spectrogram to disk (for plot and manual evaluation purposes)
                    mel_filename = 'ljspeech-mel-prediction-step-{}.npy'.format(
                        step)
                    np.save(os.path.join(mel_dir, mel_filename),
                            prediction.T,
                            allow_pickle=False)

                    #save griffin lim inverted wav for debug.
                    wav = audio.inv_mel_spectrogram(prediction.T)
                    audio.save_wav(
                        wav,
                        os.path.join(wav_dir,
                                     'step-{}-waveform.wav'.format(step)))

                    #save alignment plot to disk (control purposes)
                    plot.plot_alignment(
                        alignment,
                        os.path.join(plot_dir,
                                     'step-{}-align.png'.format(step)),
                        info='{}, {}, step={}, loss={:.5f}'.format(
                            args.model, time_string(), step, loss))
                    #save real mel-spectrogram plot to disk (control purposes)
                    plot.plot_spectrogram(
                        target,
                        os.path.join(
                            plot_dir,
                            'step-{}-real-mel-spectrogram.png'.format(step)),
                        info='{}, {}, step={}, Real'.format(
                            args.model, time_string(), step, loss))
                    #save predicted mel-spectrogram plot to disk (control purposes)
                    plot.plot_spectrogram(
                        prediction,
                        os.path.join(
                            plot_dir,
                            'step-{}-pred-mel-spectrogram.png'.format(step)),
                        info='{}, {}, step={}, loss={:.5}'.format(
                            args.model, time_string(), step, loss))
                    log('Input at step {}: {}'.format(
                        step, sequence_to_text(input_seq)))

        except Exception as e:
            log('Exiting due to exception: {}'.format(e), slack=True)
            traceback.print_exc()
            coord.request_stop(e)
Code example #22
# -*- coding: utf-8 -*-
import os
import numpy as np
from utils import audio
from hparams import hparams as hps

linear_path = './data/linear-000001.npy'
linear_name = linear_path.split('/')[-1].split('.')[0]
linear_p = np.load(linear_path)

mel_path = r'./data/mel-000001.npy'
mel_name = mel_path.split('/')[-1].split('.')[0]
mel_p = np.load(mel_path)

# Invert and save the linear spectrogram
wav = audio.inv_linear_spectrogram(linear_p.T, hps)
audio.save_wav(wav, os.path.join("./data", "{}.wav".format(linear_name)), hps)

# Invert and save the mel spectrogram
wav = audio.inv_mel_spectrogram(mel_p.T, hps)
audio.save_wav(wav, os.path.join("./data", "{}.wav".format(mel_name)), hps)
Code example #23
def eval_step(sess, logdir, step, waveform, upsampled_local_condition_data,
              speaker_id_data, mel_input_data, samples, speaker_id,
              upsampled_local_condition, next_sample, temperature=1.0):
    waveform = waveform[:, :1]
    
    sample_size = upsampled_local_condition_data.shape[1]
    last_sample_timestamp = datetime.now()
    start_time = time.time()
    for step2 in range(sample_size):  # Loop sample_size times to generate the desired length.
        window = waveform[:, -1:]  # Feed only the very last sample into samples. window: shape (N, 1)
        

        prediction = sess.run(next_sample,
                              feed_dict={
                                  samples: window,
                                  upsampled_local_condition: upsampled_local_condition_data[:, step2, :],
                                  speaker_id: speaker_id_data
                              })


        if hparams.scalar_input:
            sample = prediction  # Sampled from a logistic distribution, so there is randomness.
        else:
            # Scale prediction distribution using temperature.
            # If temperature == 1, the following just divides each element by the sum; softmax has already been applied, so the sum is 1 and the values do not change.
            # If temperature != 1, the log of each element is divided by the temperature and the result is rescaled to sum to 1.
            np.seterr(divide='ignore')
            scaled_prediction = np.log(prediction) / temperature  # No change when temperature == 1.
            scaled_prediction = (scaled_prediction - np.logaddexp.reduce(scaled_prediction,axis=-1,keepdims=True))  # np.log(np.sum(np.exp(scaled_prediction)))
            scaled_prediction = np.exp(scaled_prediction)
            np.seterr(divide='warn')
    
            # Prediction distribution at temperature=1.0 should be unchanged after
            # scaling.
            if temperature == 1.0:
                np.testing.assert_allclose( prediction, scaled_prediction, atol=1e-5, err_msg='Prediction scaling at temperature=1.0 is not working as intended.')
            
            # Because we sample instead of taking the argmax, the same input can produce different outputs.
            sample = [[np.random.choice(np.arange(hparams.quantization_channels), p=p)] for p in scaled_prediction]  # choose one sample per batch
        
        waveform = np.concatenate([waveform, sample], axis=-1)  # window.shape: (N, 1)

        # Show progress only once per second.
        current_sample_timestamp = datetime.now()
        time_since_print = current_sample_timestamp - last_sample_timestamp
        if time_since_print.total_seconds() > 1.:
            duration = time.time() - start_time
            print('Sample {:<3d}/{:<3d}, ({:.3f} sec/step)'.format(step2 + 1, sample_size, duration), end='\r')
            last_sample_timestamp = current_sample_timestamp
    
    print('\n')
    # Save the result as a wav file.    
    if hparams.input_type == 'raw':
        out = waveform[:,1:]
    elif hparams.input_type == 'mulaw':
        decode = mu_law_decode(samples, hparams.quantization_channels,quantization=False)
        out = sess.run(decode, feed_dict={samples: waveform[:,1:]})
    else:  # 'mulaw-quantize'
        decode = mu_law_decode(samples, hparams.quantization_channels,quantization=True)
        out = sess.run(decode, feed_dict={samples: waveform[:,1:]})          
        
        
    # save wav
    
    for i in range(1):
        wav_out_path= logdir + '/test-{}-{}.wav'.format(step,i)
        mel_path =  wav_out_path.replace(".wav", ".png")
        
        gen_mel_spectrogram = audio.melspectrogram(out[i], hparams).astype(np.float32).T
        audio.save_wav(out[i], wav_out_path, hparams.sample_rate)  # save_wav modifies out[i] in place.
        
        plot.plot_spectrogram(gen_mel_spectrogram, mel_path, title='generated mel spectrogram{}'.format(step),target_spectrogram=mel_input_data[i])  
Code example #24
    model.eval()
    with torch.no_grad():
        _, linear_output = model(seq, mel_input)
        # print(np.shape(linear_output))

    # trans_linear = audio.trans(linear_output[0].cpu().numpy())
    wav = audio.inv_spectrogram(linear_output[0].cpu().numpy())
    # print(audio.find_endpoint(wav))
    # print(np.shape(wav))
    wav = wav[:audio.find_endpoint(wav)]
    # print(np.shape(wav))
    return wav


if __name__ == "__main__":
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Define model
    model = nn.DataParallel(Tacotron()).to(device)
    print("Model Have Been Defined")

    # Load checkpoint
    checkpoint = torch.load(os.path.join(
        hparams.checkpoint_path, 'checkpoint_40.pth.tar'))
    model.load_state_dict(checkpoint['model'])

    text = "in being comparatively modern."
    wav = synthesizer(model, text, device)
    audio.save_wav(wav, "test.wav")
コード例 #25
0
File: train.py Project: dyelax/wavenet
def train(log_dir, metadata_path, data_path):
    tf.reset_default_graph()
    vocoder = Vocoder(hparams)
    vocoder.init_synthesizer(hparams.batch_size)

    coord = tf.train.Coordinator()
    reader = DataFeeder(metadata_filename=metadata_path,
                        coord=coord,
                        receptive_field=vocoder.net.receptive_field,
                        gc_enable=hparams.gc_enable,
                        sample_size=hparams.sample_size,
                        npy_dataroot=data_path,
                        num_mels=hparams.num_mels,
                        speaker_id=None)

    if hparams.gc_enable:
        audio_batch, lc_batch, gc_batch = reader.dequeue(hparams.batch_size)
    else:
        audio_batch, lc_batch = reader.dequeue(hparams.batch_size)
        gc_batch = None

    loss = vocoder.loss(audio_batch, lc_batch, gc_batch)

    sess = tf.Session()
    last_step, _ = vocoder.load(sess, log_dir)
    last_step = last_step or 0

    all_params = tf.trainable_variables()
    global_step = tf.get_variable(
        'global_step', [],
        initializer=tf.constant_initializer(last_step),
        trainable=False)

    decay_steps = hparams.NUM_STEPS_RATIO_PER_DECAY * hparams.max_num_step
    # Decay the learning rate exponentially based on the number of steps.
    lr = tf.train.exponential_decay(hparams.initial_learning_rate,
                                    global_step,
                                    decay_steps,
                                    hparams.LEARNING_RATE_DECAY_FACTOR,
                                    staircase=True)

    # lr = hparams.initial_learning_rate
    optimizer = optimizer_factory['adam'](learning_rate=lr, momentum=None)

    if hparams.clip_thresh > 0:
        grads_and_vars = optimizer.compute_gradients(loss, all_params)
        grads_and_vars = list(
            filter(lambda t: t[0] is not None, grads_and_vars))
        capped_gvs = [(tf.clip_by_norm(grad, hparams.clip_thresh), var)
                      for grad, var in grads_and_vars]
        optim = optimizer.apply_gradients(capped_gvs)
    else:
        optim = optimizer.minimize(loss,
                                   var_list=all_params,
                                   global_step=global_step)

    # Track the moving averages of all trainable variables.
    ema = tf.train.ExponentialMovingAverage(hparams.MOVING_AVERAGE_DECAY,
                                            global_step)
    maintain_averages_op = tf.group(ema.apply(all_params))
    train_op = tf.group(optim, maintain_averages_op)

    sess.run(tf.global_variables_initializer())
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    reader.start_threads(sess)

    try:
        print_loss = 0.
        start_time = time()
        for step in range(last_step + 1, hparams.max_num_step):

            if gc_batch is None:
                fetches = [audio_batch, vocoder.upsampled_lc, loss, train_op]
                _x, _lc, _loss, _ = sess.run(fetches)
                _gc = None
            else:
                fetches = [
                    audio_batch, vocoder.upsampled_lc, gc_batch, loss, train_op
                ]
                _x, _lc, _gc, _loss, _ = sess.run(fetches)

            print_loss += _loss

            if step % PRINT_LOSS_EVERY == 0:
                duration = time() - start_time
                print('step {:d} - loss = {:.3f}, ({:.3f} sec/step)'.format(
                    step, print_loss / PRINT_LOSS_EVERY,
                    duration / PRINT_LOSS_EVERY))
                start_time = time()
                print_loss = 0.

            if step % hparams.checkpoint_interval == 0:
                vocoder.save(sess, log_dir, step)

            if step % hparams.train_eval_interval == 0:
                samples = vocoder.synthesize(sess, _x.shape[1], _lc, _gc)
                targets = _x.reshape(hparams.batch_size, -1)

                for j in range(hparams.batch_size):
                    predicted_wav = samples[j, :]
                    target_wav = targets[j, :]
                    predicted_wav_path = os.path.join(
                        log_dir, 'predicted_{}_{}.wav'.format(step, j))
                    target_wav_path = os.path.join(
                        log_dir, 'target_{}_{}.wav'.format(step, j))
                    save_wav(predicted_wav, predicted_wav_path)
                    save_wav(target_wav, target_wav_path)

    except Exception as error:
        print(error)
    finally:
        coord.request_stop()
        coord.join(threads)
        sess.close()
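
With staircase=True, tf.train.exponential_decay reduces the learning rate in discrete jumps: lr = initial_lr * decay_factor ** floor(step / decay_steps). A plain-Python sketch of that schedule; the numeric defaults are illustrative, not this project's hparams:

def staircase_lr(step, initial_lr=1e-3, decay_factor=0.5, decay_steps=100_000):
    """Learning rate after `step` steps, decayed in discrete staircase jumps."""
    return initial_lr * decay_factor ** (step // decay_steps)

for s in (0, 99_999, 100_000, 250_000):
    print(s, staircase_lr(s))   # 1e-3, 1e-3, 5e-4, 2.5e-4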
コード例 #26
0
    def synthesize(self, texts, basenames, log_dir, mel_filenames):
        hparams = self._hparams

        # Repeat the last sample until the number of samples is divisible by the synthesis batch size (last-batch case); a standalone padding helper is sketched after this example.
        while len(texts) % hparams.synthesis_batch_size != 0:
            texts.append(texts[-1])
            basenames.append(basenames[-1])
            if mel_filenames is not None:
                mel_filenames.append(mel_filenames[-1])
        sequences = [np.asarray(text_to_sequence(text)) for text in texts]
        input_lengths = [len(seq) for seq in sequences]
        seqs, max_seq_len = self._prepare_inputs(sequences)

        feed_dict = {
            self.inputs: seqs,
            self.input_lengths: np.asarray(input_lengths, dtype=np.int32)
        }

        linears, mels, alignments, audio_length = self.session.run(
            [self.linear_outputs, self.mel_outputs, self.alignments[0], self.audio_length],
            feed_dict=feed_dict)
        # Natural batch synthesis
        # Get Mel/Linear lengths for the entire batch from stop_tokens predictions
        target_lengths = audio_length

        if basenames is None:
            # Generate wav and read it
            wav = audio.inv_mel_spectrogram(mels[0].T, hparams)
            audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate)  # Find a better way

            if platform.system() == 'Linux':
                # Linux wav reader
                os.system('aplay temp.wav')

            elif platform.system() == 'Windows':
                # windows wav reader
                os.system('start /min mplay32 /play /close temp.wav')

            else:
                raise RuntimeError(
                    'Your OS type is not supported yet, please add it to "centaur/synthesizer.py, line-165" and feel free to make a Pull Request ;) Thanks!')

            return

        for i, mel in enumerate(mels):

            if log_dir is not None:
                # save wav (mel -> wav)
                wav = audio.inv_mel_spectrogram(mel.T, hparams)
                audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-mel.wav'.format(basenames[i])),
                               sr=hparams.sample_rate)
                alignments_samples, alignment_titles = self.get_alignments(alignments)
                for idx in range(len(alignments_samples)):
                    # save alignments
                    plot.plot_alignment(alignments_samples[idx],
                                        os.path.join(log_dir, 'plots/{}.png'.format(
                                            alignment_titles[
                                                idx])),
                                        title='{}'.format(texts[i]), split_title=True, max_len=target_lengths[i])

                # save mel spectrogram plot
                plot.plot_spectrogram(mel,
                                      os.path.join(log_dir, 'plots/mel-{}.png'.format(basenames[i])),
                                      title='{}'.format(texts[i]), split_title=True)

                # save wav (linear -> wav)

                wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
                audio.save_wav(wav,
                               os.path.join(log_dir, 'wavs/wav-{}-linear.wav'.format(basenames[i])),
                               sr=hparams.sample_rate)

                # save linear spectrogram plot
                plot.plot_spectrogram(linears[i],
                                      os.path.join(log_dir, 'plots/linear-{}.png'.format(basenames[i])),
                                      title='{}'.format(texts[i]), split_title=True, auto_aspect=True)
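
The padding loop at the top of synthesize simply repeats the last text/basename/mel path until the batch length is a multiple of the synthesis batch size. The same idea as a small helper; the name and example values are illustrative only:

def pad_to_multiple(items, multiple):
    """Repeat the last element until len(items) is a multiple of `multiple`."""
    remainder = len(items) % multiple
    if remainder:
        items = items + [items[-1]] * (multiple - remainder)
    return items

print(pad_to_multiple(['a', 'b', 'c'], 4))   # ['a', 'b', 'c', 'c']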
コード例 #27
0
def run_eval(args, eval_dir, eval_model, eval_plot_dir, eval_wav_dir, feeder,
             hparams, sess, step, summary_writer):
    # Run eval and save eval stats
    log('\nRunning evaluation at step {}'.format(step))
    sum_eval_loss = 0.0
    sum_mel_loss = 0.0
    sum_stop_token_loss = 0.0
    sum_linear_loss = 0.0
    count = 0.0
    mel_p = None
    mel_t = None
    t_len = None
    attention_mask_sample = None
    lin_p = None
    lin_t = None
    for _ in tqdm(range(feeder.test_steps)):
        test_eloss, test_mel_loss, test_stop_token_loss, test_linear_loss, mel_p, mel_t, t_len, attention_mask_sample, lin_p, lin_t = sess.run(
            [
                eval_model.loss,
                eval_model.mel_loss,
                eval_model.stop_token_loss,
                eval_model.linear_loss,
                eval_model.post_net_predictions[0],
                eval_model.targets_mel[0],
                eval_model.targets_length[0],
                eval_model.alignments[0],
                eval_model.mag_pred[0],
                eval_model.targets_mag[0],
            ])
        sum_eval_loss += test_eloss
        sum_mel_loss += test_mel_loss
        sum_stop_token_loss += test_stop_token_loss
        sum_linear_loss += test_linear_loss
        count += 1.0
    wav = audio.inv_linear_spectrogram(lin_p.T, hparams)
    audio.save_wav(wav,
                   os.path.join(eval_wav_dir,
                                '{}-eval-linear.wav'.format(step)),
                   sr=hparams.sample_rate)
    if count > 0.0:
        eval_loss = sum_eval_loss / count
        mel_loss = sum_mel_loss / count
        stop_token_loss = sum_stop_token_loss / count
        linear_loss = sum_linear_loss / count
    else:
        eval_loss = sum_eval_loss
        mel_loss = sum_mel_loss
        stop_token_loss = sum_stop_token_loss
        linear_loss = sum_linear_loss
    log('Saving eval log to {}..'.format(eval_dir))
    # Save some log to monitor model improvement on same unseen sequence
    wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
    audio.save_wav(wav,
                   os.path.join(eval_wav_dir, '{}-eval-mel.wav'.format(step)),
                   sr=hparams.sample_rate)
    alignments, alignment_titles = get_alignments(attention_mask_sample)
    for i in range(len(alignments)):
        plot.plot_alignment(alignments[i],
                            os.path.join(
                                eval_plot_dir, '{}_{}-eval-align.png'.format(
                                    step, alignment_titles[i])),
                            title='{}, {}, step={}, loss={:.5f}'.format(
                                args.model, time_string(), step, eval_loss),
                            max_len=t_len // hparams.reduction_factor)
    plot.plot_spectrogram(
        mel_p,
        os.path.join(eval_plot_dir,
                     '{}-eval-mel-spectrogram.png'.format(step)),
        title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(),
                                                    step, eval_loss),
        target_spectrogram=mel_t,
        max_len=t_len)
    plot.plot_spectrogram(
        lin_p,
        os.path.join(eval_plot_dir,
                     '{}-eval-linear-spectrogram.png'.format(step)),
        title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(),
                                                    step, eval_loss),
        target_spectrogram=lin_t,
        max_len=t_len,
        auto_aspect=True)
    log('Eval loss for global step {}: {:.3f}'.format(step, eval_loss))
    log('Writing eval summary!')
    add_eval_stats(summary_writer, step, linear_loss, mel_loss,
                   stop_token_loss, eval_loss)
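
The audio.save_wav calls in this evaluation routine write a float waveform to disk at hparams.sample_rate. A typical implementation looks roughly like the sketch below (an assumption based on common Tacotron repositories, using scipy rather than this project's actual helper):

import numpy as np
from scipy.io import wavfile

def save_wav(wav, path, sample_rate=22050):
    """Peak-normalise a float waveform and write it as 16-bit PCM."""
    wav = wav / max(1e-8, np.max(np.abs(wav)))
    wavfile.write(path, sample_rate, (wav * 32767).astype(np.int16))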
コード例 #28
0
    # (This snippet is the tail of synthesizer(model, text, device); seq and mel_input are built earlier in the function.)
    model.eval()
    with torch.no_grad():
        _, linear_output = model(seq, mel_input)
        # print(np.shape(linear_output))

    # trans_linear = audio.trans(linear_output[0].cpu().numpy())
    wav = audio.inv_spectrogram(linear_output[0].cpu().numpy())
    # print(audio.find_endpoint(wav))
    # print(np.shape(wav))
    wav = wav[:audio.find_endpoint(wav)]
    # print(np.shape(wav))
    return wav


if __name__ == "__main__":
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Define model
    model = Tacotron().to(device)
    print("Model Have Been Defined")

    # Load checkpoint
    checkpoint = torch.load(
        os.path.join(hparams.checkpoint_path, 'checkpoint_20500.pth.tar'))
    model.load_state_dict(checkpoint['model'])
    print("Load Done")

    text = "I am very happy to see you again."
    wav = synthesizer(model, text, device)
    audio.save_wav(wav, text + ".wav")
コード例 #29
0
def audio(output, pth):
    mel_outputs, mel_outputs_postnet, _ = output
    #wav = inv_melspectrogram(to_arr(mel_outputs[0]))
    wav_postnet = inv_melspectrogram(to_arr(mel_outputs_postnet[0]))
    #save_wav(wav, pth+'.wav')
    save_wav(wav_postnet, pth + '.wav')
コード例 #30
0
from utils.audio import melspectrogram, inv_mel_spectrogram, load_wav, save_wav


wav_path = "LJ001-0008.wav"
raw_wav = load_wav(wav_path)
mel_spec = melspectrogram(raw_wav)
inv_wav = inv_mel_spectrogram(mel_spec)
save_wav(inv_wav, "inv.wav")
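
The same mel round-trip check can be written with plain librosa, which is handy when debugging hparams mismatches between feature extraction and the vocoder; the STFT parameters and mel count below are assumptions, not the repository's settings:

import librosa
import soundfile as sf

sr = 22050
wav, _ = librosa.load("LJ001-0008.wav", sr=sr)
mel = librosa.feature.melspectrogram(y=wav, sr=sr, n_fft=1024,
                                     hop_length=256, n_mels=80)
inv = librosa.feature.inverse.mel_to_audio(mel, sr=sr, n_fft=1024, hop_length=256)
sf.write("inv_librosa.wav", inv, sr)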