Example #1

import os
import matplotlib.pyplot as plt

def save_images(final_mel, target_mel, filename):
    spectrogram1 = plot_spectrogram_to_numpy(final_mel)
    plt.imsave(os.path.join('validation_tests', filename + '_generated.png'),
               spectrogram1.transpose((1, 2, 0)))
    spectrogram2 = plot_spectrogram_to_numpy(target_mel)
    plt.imsave(os.path.join('validation_tests', filename + '_target.png'),
               spectrogram2.transpose((1, 2, 0)))
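
Every example on this page leans on a plot_spectrogram_to_numpy helper that rasterizes a 2-D spectrogram through matplotlib and returns it as a numpy image. The original implementations differ between projects (some return HWC, some CHW); the following is only a sketch, written to return a channels-first array so that the (1, 2, 0) transpose in Example #1 produces an HWC image for plt.imsave:

import matplotlib
matplotlib.use('Agg')  # render off-screen, no display needed
import matplotlib.pyplot as plt
import numpy as np

def plot_spectrogram_to_numpy(spectrogram):
    fig, ax = plt.subplots(figsize=(12, 3))
    im = ax.imshow(spectrogram, aspect='auto', origin='lower',
                   interpolation='none')
    plt.colorbar(im, ax=ax)
    ax.set_xlabel('Frames')
    ax.set_ylabel('Channels')
    fig.canvas.draw()
    # Grab the RGB buffer as an HWC uint8 array (tostring_rgb exists on
    # Agg canvases in matplotlib < 3.10).
    data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
    plt.close(fig)
    return data.transpose((2, 0, 1))  # CHW, matching the transpose above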
Example #2
    def log_validation(self, reduced_loss, model, y, y_pred, iteration):
        self.add_scalar("validation.loss", reduced_loss, iteration)
        _, mel_outputs, gate_outputs, alignments = y_pred
        mel_targets, gate_targets = y

        # plot distribution of parameters
        for tag, value in model.named_parameters():
            tag = tag.replace(".", "/")
            self.add_histogram(tag, value.data.cpu().numpy(), iteration)

        # plot alignment, mel_target and predicted, gate target and predicted
        idx = random.randint(0, alignments.size()[0] - 1)
        self.add_image("alignment",
                       plot_alignment_to_numpy(
                           alignments[idx].data.cpu().numpy().T),
                       iteration,
                       dataformats="HWC")
        self.add_image("mel_target",
                       plot_spectrogram_to_numpy(
                           mel_targets[idx].data.cpu().numpy()),
                       iteration,
                       dataformats="HWC")
        self.add_image("mel_predicted",
                       plot_spectrogram_to_numpy(
                           mel_outputs[idx].data.cpu().numpy()),
                       iteration,
                       dataformats="HWC")
        self.add_image("gate",
                       plot_gate_outputs_to_numpy(
                           gate_targets[idx].data.cpu().numpy(),
                           torch.sigmoid(  # sigmoid needs a tensor; convert to numpy afterwards
                               gate_outputs[idx].data).cpu().numpy()),
                       iteration,
                       dataformats="HWC")
Example #3
def helper(data, name, hp, store_path):

    os.makedirs(store_path, exist_ok=True)  # exist_ok already covers the exists() check
    spectrogram = plot_spectrogram_to_numpy(data[0].cpu().detach().numpy())
    plt.imsave(os.path.join(store_path, name + '.png'),
               spectrogram.transpose((1, 2, 0)))
    with torch.enable_grad():
        waveform, wavespec = Reconstruct(hp).inverse(data[0], iters=2000)
    wavespec = plot_spectrogram_to_numpy(wavespec.cpu().detach().numpy())
    plt.imsave(os.path.join(store_path, 'Final ' + name + '.png'),
               wavespec.transpose((1, 2, 0)))

    waveform = waveform.unsqueeze(-1)
    waveform = waveform.cpu().detach().numpy()
    waveform *= 32767 / np.abs(waveform).max()  # 32767 = int16 max; 32768 overflows at the peak
    waveform = waveform.astype(np.int16)
    audio = audiosegment.from_numpy_array(waveform, framerate=hp.audio.sr)
    audio.export(os.path.join(store_path, name + '.wav'), format='wav')
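
The tail of helper rescales the float waveform into 16-bit PCM before handing it to audiosegment. A standalone sketch of that conversion (the float_to_pcm16 name is hypothetical), assuming a float waveform roughly in [-1, 1]:

import numpy as np

def float_to_pcm16(waveform):
    peak = np.abs(waveform).max() or 1.0  # guard against an all-zero signal
    scaled = waveform * (32767.0 / peak)  # 32767 is the int16 positive limit
    return scaled.astype(np.int16)

print(float_to_pcm16(np.array([0.0, 0.5, -1.0])))  # -> [     0  16383 -32767]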
Example #4
def store(generated, path, hp, idx, class_label):
    os.makedirs(path, exist_ok=True)
    torch.save(generated,
               os.path.join(path, '{}_{}.pt'.format(class_label, idx)))
    spectrogram = plot_spectrogram_to_numpy(
        generated[0].cpu().detach().numpy())
    plt.imsave(os.path.join(path, '{}_{}.png'.format(class_label, idx)),
               spectrogram.transpose((1, 2, 0)))
    with torch.enable_grad():
        waveform, wavespec = Reconstruct(hp).inverse(generated[0])
    wavespec = plot_spectrogram_to_numpy(wavespec.cpu().detach().numpy())
    plt.imsave(os.path.join(path, 'Final {}_{}.png'.format(class_label, idx)),
               wavespec.transpose((1, 2, 0)))

    waveform = waveform.unsqueeze(-1)
    waveform = waveform.cpu().detach().numpy()
    waveform *= 32767 / np.abs(waveform).max()  # 32767 = int16 max; 32768 overflows at the peak
    waveform = waveform.astype(np.int16)
    audio = audiosegment.from_numpy_array(waveform, framerate=hp.audio.sr)
    audio.export(os.path.join(path, '{}_{}.wav'.format(class_label, idx)),
                 format='wav')
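
A hypothetical call site for store(), assuming generated holds sampled mel spectrogram tensors and hp is the loaded HParam configuration (the class labels and the sampling call are placeholders modeled on Example #5):

for idx, class_label in enumerate(['speaker_a', 'speaker_b']):
    with torch.no_grad():
        generated = model.sample(class_label)  # assumed sampling API, cf. Example #5
    store(generated, 'samples', hp, idx, class_label)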
Example #5
    hp = HParam(args.config)
    infer_hp = HParam(args.infer_config)

    assert args.timestep % t_div[hp.model.tier] == 0, \
        "timestep should be divisible by %d, got %d" % (t_div[hp.model.tier], args.timestep)

    model = MelNet(hp, args, infer_hp).cuda()
    model.load_tiers()
    model.eval()

    with torch.no_grad():
        generated = model.sample(args.input)

    os.makedirs('temp', exist_ok=True)
    torch.save(generated, os.path.join('temp', args.name + '.pt'))
    spectrogram = plot_spectrogram_to_numpy(generated[0].cpu().detach().numpy())
    plt.imsave(os.path.join('temp', args.name + '.png'), spectrogram.transpose((1, 2, 0)))

    waveform, wavespec = Reconstruct(hp).inverse(generated[0])
    wavespec = plot_spectrogram_to_numpy(wavespec.cpu().detach().numpy())
    plt.imsave(os.path.join('temp', 'Final ' + args.name + '.png'), wavespec.transpose((1, 2, 0)))

    waveform = waveform.unsqueeze(-1)
    waveform = waveform.cpu().detach().numpy()
    waveform *= 32767 / np.abs(waveform).max()  # 32767 = int16 max; 32768 overflows at the peak
    waveform = waveform.astype(np.int16)
    audio = audiosegment.from_numpy_array(
        waveform,
        framerate=hp.audio.sr
    )
    audio.export(os.path.join('temp', args.name + '.wav'), format='wav')
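
Example #5 reads like the body of a MelNet inference script; the attributes it dereferences on args suggest an argument parser roughly like the following (flag spellings and defaults are guesses):

import argparse

parser = argparse.ArgumentParser(description='MelNet inference (sketch)')
parser.add_argument('-c', '--config', required=True, help='training config yaml')
parser.add_argument('-i', '--infer_config', required=True, help='inference config yaml')
parser.add_argument('-t', '--timestep', type=int, default=64, help='frames to sample')
parser.add_argument('-n', '--name', required=True, help='base name for files under temp/')
parser.add_argument('--input', default=None, help='conditioning input, if any')
args = parser.parse_args()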
Example #6
def train(log_dir, args, hparams):
    voicefilter_audio = Audio(hparams)

    save_dir = os.path.join(log_dir, 'extract_pretrained')
    plot_dir = os.path.join(log_dir, 'plots')
    wav_dir = os.path.join(log_dir, 'wavs')
    spec_dir = os.path.join(log_dir, 'spec-spectrograms')
    eval_dir = os.path.join(log_dir, 'eval-dir')
    #eval_plot_dir = os.path.join(eval_dir, 'plots')
    eval_wav_dir = os.path.join(eval_dir, 'wavs')
    tensorboard_dir = os.path.join(log_dir, 'extractron_events')
    meta_folder = os.path.join(log_dir, 'metas')

    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(spec_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    #os.makedirs(eval_plot_dir, exist_ok=True)
    os.makedirs(eval_wav_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)
    os.makedirs(meta_folder, exist_ok=True)

    checkpoint_path = os.path.join(save_dir, 'extractron_model.ckpt')
    checkpoint_path2 = os.path.join(save_dir, 'super_extractron_model.ckpt')
    #input_paths = [os.path.join(args.base_dir, args.extractron_input)]
    #if args.extractron_inputs:
    #    input_paths = [os.path.join(args.base_dir, arg_input_path)
    #                   for arg_input_path in args.extractron_inputs]
    #if args.extractron_input_glob:
    #    input_paths = glob.glob(args.extractron_input_glob)

    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Using model: {}'.format(args.model))
    log(hparams_debug_string())

    # Start by setting a seed for repeatability
    tf.set_random_seed(hparams.extractron_random_seed)

    # Set up data feeder
    with tf.variable_scope('datafeeder'):
        feeder = Feeder(hparams)
        feeder.setup_dataset(args.dataset, args.eval_dataset)

        class DotDict(dict):
            """
            a dictionary that supports dot notation
            as well as dictionary access notation
            usage: d = DotDict() or d = DotDict({'val1':'first'})
            set attributes: d.val2 = 'second' or d['val2'] = 'second'
            get attributes: d.val2 or d['val2']
            """
            __getattr__ = dict.__getitem__
            __setattr__ = dict.__setitem__
            __delattr__ = dict.__delitem__

            def __init__(self, dct):
                for key, value in dct.items():
                    if hasattr(value, 'keys'):
                        value = DotDict(value)
                    self[key] = value

        dictkeys = [
            'target_linear', 'mixed_linear', 'target_mel', 'mixed_mel',
            'spkid_embeddings'
        ]
        eval_dictkeys = [
            'eval_target_linear', 'eval_mixed_linear', 'eval_target_phase',
            'eval_mixed_phase', 'eval_target_mel', 'eval_mixed_mel',
            'eval_spkid_embeddings'
        ]
        feeder_dict = DotDict(dict(zip(dictkeys, feeder.next)))
        feeder_dict.update(DotDict(dict(zip(eval_dictkeys, feeder.eval_next))))

    # Set up model:
    global_step = tf.Variable(0, name='global_step', trainable=False)
    model, stats = model_train_mode(args, feeder_dict, hparams, global_step)
    eval_model = model_test_mode(args, feeder_dict, hparams, global_step)

    # Book keeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=5)
    saver2 = tf.train.Saver(max_to_keep=15)

    log('Extractron training set to a maximum of {} steps'.format(
        args.extractron_train_steps))

    # Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    #config.log_device_placement = True
    config.allow_soft_placement = True

    # Train
    with tf.Session(config=config) as sess:
        try:
            #summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)
            xsummary_writer = SummaryWriter(tensorboard_dir)

            sess.run(tf.global_variables_initializer())

            # saved model restoring
            if args.restore:
                # Restore saved model if the user requested it, default = True
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

                    if (checkpoint_state
                            and checkpoint_state.model_checkpoint_path):
                        log('Loading checkpoint {}'.format(
                            checkpoint_state.model_checkpoint_path),
                            slack=True)
                        saver.restore(sess,
                                      checkpoint_state.model_checkpoint_path)

                    else:
                        log('No model to load at {}'.format(save_dir),
                            slack=True)
                        saver.save(sess,
                                   checkpoint_path,
                                   global_step=global_step)

                except tf.errors.OutOfRangeError as e:
                    log('Cannot restore checkpoint: {}'.format(e), slack=True)
            else:
                log('Starting new training!', slack=True)
                saver.save(sess, checkpoint_path, global_step=global_step)

            if hparams.tfprof or hparams.timeline:
                run_options = tf.RunOptions(
                    trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()
                if hparams.timeline:
                    from tensorflow.python.client import timeline
                if hparams.tfprof:
                    from tensorflow.python.profiler import model_analyzer, option_builder
                    my_profiler = model_analyzer.Profiler(graph=sess.graph)
                    profile_op_builder = option_builder.ProfileOptionBuilder()
                    profile_op_builder.select(['micros', 'occurrence'])
                    profile_op_builder.order_by('micros')
                    #profile_op_builder.select(['device', 'bytes', 'peak_bytes'])
                    #profile_op_builder.order_by('bytes')
                    profile_op_builder.with_max_depth(
                        20)  # can be any large number
                    profile_op_builder.with_file_output('profile.log')
                    profile_op = profile_op_builder.build()

            # Training loop
            while step < args.extractron_train_steps:
                start_time = time.time()
                # from tensorflow.python import debug as tf_debug
                # sess=tf_debug.LocalCLIDebugWrapperSession(sess)
                if hparams.tfprof or hparams.timeline:
                    step, loss, opt = sess.run(
                        [global_step, model.loss, model.optimize],
                        options=run_options,
                        run_metadata=run_metadata)
                    if hparams.timeline:
                        fetched_timeline = timeline.Timeline(
                            run_metadata.step_stats)
                        chrome_trace = fetched_timeline.generate_chrome_trace_format(
                            show_dataflow=True, show_memory=True)
                        with open('timeline_01.json', 'w') as f:
                            f.write(chrome_trace)
                    if hparams.tfprof:
                        my_profiler.add_step(step=int(step),
                                             run_meta=run_metadata)
                        my_profiler.profile_name_scope(profile_op)
                else:
                    step, loss, opt = sess.run(
                        [global_step, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = \
                    'Step {:7d} [{:.3f} sec/step, avg {:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
                        step, time.time() - start_time, time_window.average, loss, loss_window.average)

                log(message,
                    end='\r',
                    slack=(step % args.checkpoint_interval == 0))

                # Assume the loss has exploded if it exceeds 100 or becomes NaN
                # (per the original note, some setups, e.g. WaveGlow, may warrant 1000)
                if loss > 100 or np.isnan(loss):
                    log('Loss exploded to {:.5f} at step {}'.format(
                        loss, step))
                    raise Exception('Loss exploded')

                if step % args.summary_interval == 0:
                    log('\nWriting summary at step {}'.format(step))
                    add_train_summary(xsummary_writer, step, loss)
                    #summary_writer.add_summary(sess.run(stats), step)
                    #summary_writer.flush()

                if step % args.gc_interval == 0:
                    log('\nGarbage collect: {}\n'.format(gc.collect()))

                if step % args.eval_interval == 0:
                    # Run eval and save eval stats
                    log('\nRunning evaluation at step {}'.format(step))

                    #1. avg loss, before, after, predicted mag, mixed phase, mixed_mag, target phase, target_mag
                    #2. 3 wavs
                    #3. 3 mag specs
                    #4. sdr

                    eval_losses = []
                    before_losses = []
                    after_losses = []
                    linear_losses = []

                    for i in tqdm(range(args.test_steps)):
                        try:
                            eloss, before_loss, after_loss, linear_loss, \
                            mixed_phase, mixed_mel, mixed_linear, \
                            target_phase, target_mel, target_linear, \
                            predicted_linear = sess.run([
                                eval_model.tower_loss[0], eval_model.tower_before_loss[0], eval_model.tower_after_loss[0], eval_model.tower_linear_loss[0],
                                eval_model.tower_mixed_phase[0][0], eval_model.tower_mixed_mel[0][0],
                                eval_model.tower_mixed_linear[0][0],
                                eval_model.tower_target_phase[0][0], eval_model.tower_target_mel[0][0],
                                eval_model.tower_target_linear[0][0],
                                eval_model.tower_linear_outputs[0][0]
                            ])
                            eval_losses.append(eloss)
                            before_losses.append(before_loss)
                            after_losses.append(after_loss)
                            linear_losses.append(linear_loss)
                            #if i==0:
                            #    tmp_phase=mixed_phase
                            #    tmp_spec=mixed_spec
                        except tf.errors.OutOfRangeError:
                            log('\ntest dataset out of range')

                    eval_loss = sum(eval_losses) / len(eval_losses)
                    before_loss = sum(before_losses) / len(before_losses)
                    after_loss = sum(after_losses) / len(after_losses)
                    linear_loss = sum(linear_losses) / len(linear_losses)

                    #mixed_wav = voicefilter_audio.spec2wav(tmp_spec, tmp_phase)
                    mixed_wav = voicefilter_audio.spec2wav(
                        mixed_linear, mixed_phase)
                    target_wav = voicefilter_audio.spec2wav(
                        target_linear, target_phase)
                    predicted_wav = voicefilter_audio.spec2wav(
                        predicted_linear, mixed_phase)
                    # NOTE: librosa.output.write_wav requires librosa < 0.8
                    # (the librosa.output module was removed in 0.8.0)
                    librosa.output.write_wav(
                        os.path.join(eval_wav_dir,
                                     'step-{}-eval-mixed.wav'.format(step)),
                        mixed_wav, hparams.sample_rate)
                    librosa.output.write_wav(
                        os.path.join(eval_wav_dir,
                                     'step-{}-eval-target.wav'.format(step)),
                        target_wav, hparams.sample_rate)
                    librosa.output.write_wav(
                        os.path.join(
                            eval_wav_dir,
                            'step-{}-eval-predicted.wav'.format(step)),
                        predicted_wav, hparams.sample_rate)
                    #audio.save_wav(mixed_wav, os.path.join(
                    #    eval_wav_dir, 'step-{}-eval-mixed.wav'.format(step)), sr=hparams.sample_rate)
                    #audio.save_wav(target_wav, os.path.join(
                    #    eval_wav_dir, 'step-{}-eval-target.wav'.format(step)), sr=hparams.sample_rate)
                    #audio.save_wav(predicted_wav, os.path.join(
                    #    eval_wav_dir, 'step-{}-eval-predicted.wav'.format(step)), sr=hparams.sample_rate)

                    mixed_linear_img = plot_spectrogram_to_numpy(
                        mixed_linear.T)
                    target_linear_img = plot_spectrogram_to_numpy(
                        target_linear.T)
                    predicted_linear_img = plot_spectrogram_to_numpy(
                        predicted_linear.T)

                    #plot.plot_spectrogram(predicted_spec,
                    #        os.path.join(eval_plot_dir, 'step-{}-eval-spectrogram.png'.format(step)),
                    #        title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eval_loss),
                    #        target_spectrogram=target_spec)

                    log('Eval loss for global step {}: {:.3f}'.format(
                        step, eval_loss))
                    log('Writing eval summary!')

                    add_eval_summary(xsummary_writer, step, before_loss,
                                     after_loss, linear_loss, eval_loss,
                                     hparams.sample_rate, mixed_wav,
                                     target_wav, predicted_wav,
                                     mixed_linear_img, target_linear_img,
                                     predicted_linear_img)

                if step % args.super_checkpoint_interval == 0 or step == args.extractron_train_steps:
                    # Save model and current global step
                    saver2.save(sess,
                                checkpoint_path2,
                                global_step=global_step)

                if step % args.checkpoint_interval == 0 or step == args.extractron_train_steps:
                    # Save model and current global step
                    saver.save(sess, checkpoint_path, global_step=global_step)

                    #log('\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..')

                    #input_seq, mel_prediction, alignment, target, target_length = sess.run([
                    #    model.tower_inputs[0][0],
                    #    model.tower_mel_outputs[0][0],
                    #    model.tower_alignments[0][0],
                    #    model.tower_mel_targets[0][0],
                    #    model.tower_targets_lengths[0][0],
                    #])

                    ## save predicted mel spectrogram to disk (debug)
                    #mel_filename = 'mel-prediction-step-{}.npy'.format(step)
                    #np.save(os.path.join(mel_dir, mel_filename),
                    #        mel_prediction.T, allow_pickle=False)

                    ## save griffin lim inverted wav for debug (mel -> wav)
                    #wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
                    #audio.save_wav(wav, os.path.join(
                    #    wav_dir, 'step-{}-wave-from-mel.wav'.format(step)), sr=hparams.sample_rate)

                    ## save alignment plot to disk (control purposes)
                    #plot.plot_alignment(alignment, os.path.join(plot_dir, 'step-{}-align.png'.format(step)),
                    #                    title='{}, {}, step={}, loss={:.5f}'.format(
                    #                        args.model, time_string(), step, loss),
                    #                    max_len=target_length // hparams.outputs_per_step)
                    ## save real and predicted mel-spectrogram plot to disk (control purposes)
                    #plot.plot_spectrogram(mel_prediction, os.path.join(plot_dir, 'step-{}-mel-spectrogram.png'.format(step)),
                    #                      title='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss), target_spectrogram=target,
                    #                      max_len=target_length)
                    #log('Input at step {}: {}'.format(
                    #    step, sequence_to_text(input_seq)))

            log('Extractron training complete after {} global steps!'.format(
                args.extractron_train_steps),
                slack=True)
            return save_dir

        except Exception as e:
            log('Exiting due to exception: {}'.format(e), slack=True)
            traceback.print_exc()
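
ValueWindow, used above to smooth step time and loss, is a small running-average helper that recurs across the Tacotron-family codebases; a minimal sketch:

class ValueWindow:
    def __init__(self, window_size=100):
        self._window_size = window_size
        self._values = []

    def append(self, x):
        # keep only the most recent window_size values
        self._values = self._values[-(self._window_size - 1):] + [x]

    @property
    def average(self):
        return sum(self._values) / max(1, len(self._values))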
Example #7

import os
import matplotlib.pyplot as plt

def save_image(mel, name):
    # print(type(mel))
    # newmel = mel.detach().numpy()[0]
    spectrogram1 = plot_spectrogram_to_numpy(mel)
    plt.imsave(os.path.join('validation_tests', name + '.png'),
               spectrogram1.transpose((1, 2, 0)))
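
save_image writes into a hard-coded validation_tests directory, so the directory has to exist before the call; a minimal hypothetical usage:

import numpy as np

os.makedirs('validation_tests', exist_ok=True)
mel = np.random.rand(80, 400)  # placeholder (n_mels, frames) spectrogram
save_image(mel, 'epoch_5_sample_0')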
Example #8
import librosa
import numpy as np
import soundfile as sf
import torch

# NOTE: the snippet starts mid-call; the checkpoint path below is a
# hypothetical placeholder for the argument truncated in the original.
generated = torch.load('<generated_mel.pt>',
                       map_location=torch.device('cpu'))
# generated = torch.load('../../hw_blizzard_compressed.pt')

generated_np = generated[0].cpu().detach().numpy()


# turn the inferred mel spectrogram back into audio
def denormalize(x):
    return (np.clip(x, 0.0, 1.0) - 1.0) * 80.0


x = librosa.db_to_power(denormalize(generated_np) + 20.0)
y = librosa.feature.inverse.mel_to_audio(M=x,
                                         sr=7000,
                                         n_fft=1080,
                                         hop_length=150,
                                         win_length=1080)
sf.write(
    '../Melnet files/librosa_testing/hello_world_blizzard_reconstructed2.wav',
    y, 7000, 'PCM_24')

y, sr = librosa.load('../Melnet files/librosa_testing/hello_world.mp3')
ms = librosa.feature.melspectrogram(y=y, sr=sr)
spectrogram = plot_spectrogram_to_numpy(ms)  # render the mel spectrogram for inspection

# turn my voice's mel spectrogram back into audio via Griffin-Lim
S = librosa.feature.inverse.mel_to_stft(ms)
y = librosa.griffinlim(S)
sf.write('../Melnet files/librosa_testing/hello_world_reconstructed.wav', y,
         sr, 'PCM_24')
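
As a sanity check on the round trip above (assuming spectrograms were normalized into [0, 1] from a -80..0 dB range): a normalized value of 1.0 denormalizes to 0 dB, and the +20 dB offset makes librosa.db_to_power return 10**(20/10) = 100.

print(librosa.db_to_power(denormalize(1.0) + 20.0))  # -> 100.0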