def save_images(final_mel, target_mel, filename):
    spectrogram1 = plot_spectrogram_to_numpy(final_mel)
    plt.imsave(os.path.join('validation_tests', filename + '_generated.png'),
               spectrogram1.transpose((1, 2, 0)))
    spectrogram2 = plot_spectrogram_to_numpy(target_mel)
    plt.imsave(os.path.join('validation_tests', filename + '_target.png'),
               spectrogram2.transpose((1, 2, 0)))
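# Hedged usage sketch (not from the source): save_images expects 2-D mel
# arrays (n_mels x frames); plot_spectrogram_to_numpy is assumed to return
# a CHW image, hence the (1, 2, 0) transpose to HWC for plt.imsave.
demo_mel = np.random.rand(80, 200)  # hypothetical 80-band, 200-frame mel
os.makedirs('validation_tests', exist_ok=True)
save_images(demo_mel, demo_mel, filename='demo')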
def log_validation(self, reduced_loss, model, y, y_pred, iteration):
    self.add_scalar("validation.loss", reduced_loss, iteration)
    _, mel_outputs, gate_outputs, alignments = y_pred
    mel_targets, gate_targets = y

    # plot distribution of parameters
    for tag, value in model.named_parameters():
        tag = tag.replace(".", "/")
        self.add_histogram(tag, value.data.cpu().numpy(), iteration)

    # plot alignment, mel target and predicted, gate target and predicted
    idx = random.randint(0, alignments.size(0) - 1)
    self.add_image(
        "alignment",
        plot_alignment_to_numpy(alignments[idx].data.cpu().numpy().T),
        iteration, dataformats="HWC")
    self.add_image(
        "mel_target",
        plot_spectrogram_to_numpy(mel_targets[idx].data.cpu().numpy()),
        iteration, dataformats="HWC")
    self.add_image(
        "mel_predicted",
        plot_spectrogram_to_numpy(mel_outputs[idx].data.cpu().numpy()),
        iteration, dataformats="HWC")
    # Apply the sigmoid to the tensor before converting to NumPy;
    # torch.sigmoid cannot operate on a NumPy array.
    self.add_image(
        "gate",
        plot_gate_outputs_to_numpy(
            gate_targets[idx].data.cpu().numpy(),
            torch.sigmoid(gate_outputs[idx]).data.cpu().numpy()),
        iteration, dataformats="HWC")
def helper(data, name, hp, store_path):
    os.makedirs(store_path, exist_ok=True)
    spectrogram = plot_spectrogram_to_numpy(data[0].cpu().detach().numpy())
    plt.imsave(os.path.join(store_path, name + '.png'),
               spectrogram.transpose((1, 2, 0)))

    # Gradient-based spectrogram inversion needs grad enabled
    with torch.enable_grad():
        waveform, wavespec = Reconstruct(hp).inverse(data[0], iters=2000)
    wavespec = plot_spectrogram_to_numpy(wavespec.cpu().detach().numpy())
    plt.imsave(os.path.join(store_path, 'Final ' + name + '.png'),
               wavespec.transpose((1, 2, 0)))

    waveform = waveform.unsqueeze(-1)
    waveform = waveform.cpu().detach().numpy()
    waveform *= 32768 / waveform.max()
    waveform = waveform.astype(np.int16)
    audio = audiosegment.from_numpy_array(waveform, framerate=hp.audio.sr)
    audio.export(os.path.join(store_path, name + '.wav'), format='wav')
def store(generated, path, hp, idx, class_label):
    os.makedirs(path, exist_ok=True)
    torch.save(generated,
               os.path.join(path, '{}_{}.pt'.format(class_label, idx)))

    spectrogram = plot_spectrogram_to_numpy(
        generated[0].cpu().detach().numpy())
    plt.imsave(os.path.join(path, '{}_{}.png'.format(class_label, idx)),
               spectrogram.transpose((1, 2, 0)))

    # Gradient-based spectrogram inversion needs grad enabled
    with torch.enable_grad():
        waveform, wavespec = Reconstruct(hp).inverse(generated[0])
    wavespec = plot_spectrogram_to_numpy(wavespec.cpu().detach().numpy())
    plt.imsave(os.path.join(path, 'Final {}_{}.png'.format(class_label, idx)),
               wavespec.transpose((1, 2, 0)))

    waveform = waveform.unsqueeze(-1)
    waveform = waveform.cpu().detach().numpy()
    waveform *= 32768 / waveform.max()
    waveform = waveform.astype(np.int16)
    audio = audiosegment.from_numpy_array(waveform, framerate=hp.audio.sr)
    audio.export(os.path.join(path, '{}_{}.wav'.format(class_label, idx)),
                 format='wav')
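# Hedged call-site sketch (names below are assumptions, not from the source):
# after sampling a batched mel tensor from the model, persist the raw tensor,
# both spectrogram plots, and the reconstructed audio for one class/index.
# generated = model.sample(condition)
# store(generated, path='samples', hp=hp, idx=0, class_label='speech')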
hp = HParam(args.config)
infer_hp = HParam(args.infer_config)

assert args.timestep % t_div[hp.model.tier] == 0, \
    "timestep should be divisible by %d, got %d" \
    % (t_div[hp.model.tier], args.timestep)

model = MelNet(hp, args, infer_hp).cuda()
model.load_tiers()
model.eval()

with torch.no_grad():
    generated = model.sample(args.input)

os.makedirs('temp', exist_ok=True)
torch.save(generated, os.path.join('temp', args.name + '.pt'))

spectrogram = plot_spectrogram_to_numpy(generated[0].cpu().detach().numpy())
plt.imsave(os.path.join('temp', args.name + '.png'),
           spectrogram.transpose((1, 2, 0)))

waveform, wavespec = Reconstruct(hp).inverse(generated[0])
wavespec = plot_spectrogram_to_numpy(wavespec.cpu().detach().numpy())
plt.imsave(os.path.join('temp', 'Final ' + args.name + '.png'),
           wavespec.transpose((1, 2, 0)))

waveform = waveform.unsqueeze(-1)
waveform = waveform.cpu().detach().numpy()
waveform *= 32768 / waveform.max()
waveform = waveform.astype(np.int16)
audio = audiosegment.from_numpy_array(waveform, framerate=hp.audio.sr)
audio.export(os.path.join('temp', args.name + '.wav'), format='wav')
def train(log_dir, args, hparams):
    voicefilter_audio = Audio(hparams)

    save_dir = os.path.join(log_dir, 'extract_pretrained')
    plot_dir = os.path.join(log_dir, 'plots')
    wav_dir = os.path.join(log_dir, 'wavs')
    spec_dir = os.path.join(log_dir, 'spec-spectrograms')
    eval_dir = os.path.join(log_dir, 'eval-dir')
    #eval_plot_dir = os.path.join(eval_dir, 'plots')
    eval_wav_dir = os.path.join(eval_dir, 'wavs')
    tensorboard_dir = os.path.join(log_dir, 'extractron_events')
    meta_folder = os.path.join(log_dir, 'metas')

    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(spec_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    #os.makedirs(eval_plot_dir, exist_ok=True)
    os.makedirs(eval_wav_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)
    os.makedirs(meta_folder, exist_ok=True)

    checkpoint_path = os.path.join(save_dir, 'extractron_model.ckpt')
    checkpoint_path2 = os.path.join(save_dir, 'super_extractron_model.ckpt')

    #input_paths = [os.path.join(args.base_dir, args.extractron_input)]
    #if args.extractron_inputs:
    #    input_paths = [os.path.join(args.base_dir, arg_input_path)
    #                   for arg_input_path in args.extractron_inputs]
    #if args.extractron_input_glob:
    #    input_paths = glob.glob(args.extractron_input_glob)

    log('Checkpoint path: {}'.format(checkpoint_path))
    log('Using model: {}'.format(args.model))
    log(hparams_debug_string())

    # Start by setting a seed for repeatability
    tf.set_random_seed(hparams.extractron_random_seed)

    # Set up data feeder
    with tf.variable_scope('datafeeder'):
        feeder = Feeder(hparams)
        feeder.setup_dataset(args.dataset, args.eval_dataset)

        class DotDict(dict):
            """
            A dictionary that supports dot notation as well as
            dictionary access notation.
            Usage:          d = DotDict() or d = DotDict({'val1': 'first'})
            Set attributes: d.val2 = 'second' or d['val2'] = 'second'
            Get attributes: d.val2 or d['val2']
            """
            __getattr__ = dict.__getitem__
            __setattr__ = dict.__setitem__
            __delattr__ = dict.__delitem__

            def __init__(self, dct):
                for key, value in dct.items():
                    if hasattr(value, 'keys'):
                        value = DotDict(value)
                    self[key] = value

        dictkeys = [
            'target_linear', 'mixed_linear',
            'target_mel', 'mixed_mel', 'spkid_embeddings'
        ]
        eval_dictkeys = [
            'eval_target_linear', 'eval_mixed_linear',
            'eval_target_phase', 'eval_mixed_phase',
            'eval_target_mel', 'eval_mixed_mel',
            'eval_spkid_embeddings'
        ]
        feeder_dict = DotDict(dict(zip(dictkeys, feeder.next)))
        feeder_dict.update(DotDict(dict(zip(eval_dictkeys, feeder.eval_next))))

    # Set up model
    global_step = tf.Variable(0, name='global_step', trainable=False)
    model, stats = model_train_mode(args, feeder_dict, hparams, global_step)
    eval_model = model_test_mode(args, feeder_dict, hparams, global_step)

    # Bookkeeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=5)
    saver2 = tf.train.Saver(max_to_keep=15)

    log('Extractron training set to a maximum of {} steps'.format(
        args.extractron_train_steps))

    # Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    #config.log_device_placement = True
    config.allow_soft_placement = True

    # Train
    with tf.Session(config=config) as sess:
        try:
            #summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)
            xsummary_writer = SummaryWriter(tensorboard_dir)

            sess.run(tf.global_variables_initializer())

            # Restore the saved model if the user requested it (default: True)
            if args.restore:
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)
                    if checkpoint_state and checkpoint_state.model_checkpoint_path:
                        log('Loading checkpoint {}'.format(
                            checkpoint_state.model_checkpoint_path), slack=True)
                        saver.restore(sess, checkpoint_state.model_checkpoint_path)
                    else:
                        log('No model to load at {}'.format(save_dir), slack=True)
                        saver.save(sess, checkpoint_path, global_step=global_step)
                except tf.errors.OutOfRangeError as e:
                    log('Cannot restore checkpoint: {}'.format(e), slack=True)
            else:
                log('Starting new training!', slack=True)
                saver.save(sess, checkpoint_path, global_step=global_step)

            if hparams.tfprof or hparams.timeline:
                run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()
                if hparams.timeline:
                    from tensorflow.python.client import timeline
                if hparams.tfprof:
                    from tensorflow.python.profiler import model_analyzer, option_builder
                    my_profiler = model_analyzer.Profiler(graph=sess.graph)
                    profile_op_builder = option_builder.ProfileOptionBuilder()
                    profile_op_builder.select(['micros', 'occurrence'])
                    profile_op_builder.order_by('micros')
                    #profile_op_builder.select(['device', 'bytes', 'peak_bytes'])
                    #profile_op_builder.order_by('bytes')
                    profile_op_builder.with_max_depth(20)  # can be any large number
                    profile_op_builder.with_file_output('profile.log')
                    profile_op = profile_op_builder.build()

            # Training loop
            while step < args.extractron_train_steps:
                start_time = time.time()
                # from tensorflow.python import debug as tf_debug
                # sess = tf_debug.LocalCLIDebugWrapperSession(sess)
                if hparams.tfprof or hparams.timeline:
                    step, loss, opt = sess.run(
                        [global_step, model.loss, model.optimize],
                        options=run_options, run_metadata=run_metadata)
                    if hparams.timeline:
                        fetched_timeline = timeline.Timeline(run_metadata.step_stats)
                        chrome_trace = fetched_timeline.generate_chrome_trace_format(
                            show_dataflow=True, show_memory=True)
                        with open('timeline_01.json', 'w') as f:
                            f.write(chrome_trace)
                    if hparams.tfprof:
                        my_profiler.add_step(step=int(step), run_meta=run_metadata)
                        my_profiler.profile_name_scope(profile_op)
                else:
                    step, loss, opt = sess.run(
                        [global_step, model.loss, model.optimize])

                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = \
                    'Step {:7d} [{:.3f} sec/step, avg {:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
                        step, time.time() - start_time, time_window.average,
                        loss, loss_window.average)
                log(message, end='\r',
                    slack=(step % args.checkpoint_interval == 0))

                # Originally assumed a loss above 100 meant it exploded;
                # changed to 1000 due to waveglow settings
                if loss > 100 or np.isnan(loss):
                    log('Loss exploded to {:.5f} at step {}'.format(loss, step))
                    raise Exception('Loss exploded')

                if step % args.summary_interval == 0:
                    log('\nWriting summary at step {}'.format(step))
                    add_train_summary(xsummary_writer, step, loss)
                    #summary_writer.add_summary(sess.run(stats), step)
                    #summary_writer.flush()

                if step % args.gc_interval == 0:
                    log('\nGarbage collect: {}\n'.format(gc.collect()))

                if step % args.eval_interval == 0:
                    # Run eval and save eval stats:
                    # 1. avg loss, before, after, predicted mag, mixed phase,
                    #    mixed_mag, target phase, target_mag
                    # 2. 3 wavs
                    # 3. 3 mag specs
                    # 4. sdr
                    log('\nRunning evaluation at step {}'.format(step))
                    eval_losses = []
                    before_losses = []
                    after_losses = []
                    linear_losses = []

                    for i in tqdm(range(args.test_steps)):
                        try:
                            eloss, before_loss, after_loss, linear_loss, \
                                mixed_phase, mixed_mel, mixed_linear, \
                                target_phase, target_mel, target_linear, \
                                predicted_linear = sess.run([
                                    eval_model.tower_loss[0],
                                    eval_model.tower_before_loss[0],
                                    eval_model.tower_after_loss[0],
                                    eval_model.tower_linear_loss[0],
                                    eval_model.tower_mixed_phase[0][0],
                                    eval_model.tower_mixed_mel[0][0],
                                    eval_model.tower_mixed_linear[0][0],
                                    eval_model.tower_target_phase[0][0],
                                    eval_model.tower_target_mel[0][0],
                                    eval_model.tower_target_linear[0][0],
                                    eval_model.tower_linear_outputs[0][0]
                                ])
                            eval_losses.append(eloss)
                            before_losses.append(before_loss)
                            after_losses.append(after_loss)
                            linear_losses.append(linear_loss)
                            #if i == 0:
                            #    tmp_phase = mixed_phase
                            #    tmp_spec = mixed_spec
                        except tf.errors.OutOfRangeError:
                            log('\n test dataset out of range')

                    eval_loss = sum(eval_losses) / len(eval_losses)
                    before_loss = sum(before_losses) / len(before_losses)
                    after_loss = sum(after_losses) / len(after_losses)
                    linear_loss = sum(linear_losses) / len(linear_losses)

                    #mixed_wav = voicefilter_audio.spec2wav(tmp_spec, tmp_phase)
                    mixed_wav = voicefilter_audio.spec2wav(mixed_linear, mixed_phase)
                    target_wav = voicefilter_audio.spec2wav(target_linear, target_phase)
                    predicted_wav = voicefilter_audio.spec2wav(predicted_linear, mixed_phase)

                    librosa.output.write_wav(
                        os.path.join(eval_wav_dir,
                                     'step-{}-eval-mixed.wav'.format(step)),
                        mixed_wav, hparams.sample_rate)
                    librosa.output.write_wav(
                        os.path.join(eval_wav_dir,
                                     'step-{}-eval-target.wav'.format(step)),
                        target_wav, hparams.sample_rate)
                    librosa.output.write_wav(
                        os.path.join(eval_wav_dir,
                                     'step-{}-eval-predicted.wav'.format(step)),
                        predicted_wav, hparams.sample_rate)

                    #audio.save_wav(mixed_wav, os.path.join(
                    #    eval_wav_dir, 'step-{}-eval-mixed.wav'.format(step)), sr=hparams.sample_rate)
                    #audio.save_wav(target_wav, os.path.join(
                    #    eval_wav_dir, 'step-{}-eval-target.wav'.format(step)), sr=hparams.sample_rate)
                    #audio.save_wav(predicted_wav, os.path.join(
                    #    eval_wav_dir, 'step-{}-eval-predicted.wav'.format(step)), sr=hparams.sample_rate)

                    mixed_linear_img = plot_spectrogram_to_numpy(mixed_linear.T)
                    target_linear_img = plot_spectrogram_to_numpy(target_linear.T)
                    predicted_linear_img = plot_spectrogram_to_numpy(predicted_linear.T)

                    #plot.plot_spectrogram(predicted_spec,
                    #    os.path.join(eval_plot_dir, 'step-{}-eval-spectrogram.png'.format(step)),
                    #    title='{}, {}, step={}, loss={:.5f}'.format(
                    #        args.model, time_string(), step, eval_loss),
                    #    target_spectrogram=target_spec)

                    log('Eval loss for global step {}: {:.3f}'.format(step, eval_loss))
                    log('Writing eval summary!')
                    add_eval_summary(
                        xsummary_writer, step,
                        before_loss, after_loss, linear_loss, eval_loss,
                        hparams.sample_rate,
                        mixed_wav, target_wav, predicted_wav,
                        mixed_linear_img, target_linear_img, predicted_linear_img)

                if step % args.super_checkpoint_interval == 0 \
                        or step == args.extractron_train_steps:
                    # Save model and current global step
                    saver2.save(sess, checkpoint_path2, global_step=global_step)

                if step % args.checkpoint_interval == 0 \
                        or step == args.extractron_train_steps:
                    # Save model and current global step
                    saver.save(sess, checkpoint_path, global_step=global_step)

                    #log('\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..')
                    #input_seq, mel_prediction, alignment, target, target_length = sess.run([
                    #    model.tower_inputs[0][0],
                    #    model.tower_mel_outputs[0][0],
                    #    model.tower_alignments[0][0],
                    #    model.tower_mel_targets[0][0],
                    #    model.tower_targets_lengths[0][0],
                    #])

                    ## save predicted mel spectrogram to disk (debug)
                    #mel_filename = 'mel-prediction-step-{}.npy'.format(step)
                    #np.save(os.path.join(mel_dir, mel_filename),
                    #        mel_prediction.T, allow_pickle=False)

                    ## save griffin lim inverted wav for debug (mel -> wav)
                    #wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
                    #audio.save_wav(wav, os.path.join(
                    #    wav_dir, 'step-{}-wave-from-mel.wav'.format(step)), sr=hparams.sample_rate)

                    ## save alignment plot to disk (control purposes)
                    #plot.plot_alignment(alignment, os.path.join(plot_dir, 'step-{}-align.png'.format(step)),
                    #                    title='{}, {}, step={}, loss={:.5f}'.format(
                    #                        args.model, time_string(), step, loss),
                    #                    max_len=target_length // hparams.outputs_per_step)

                    ## save real and predicted mel-spectrogram plot to disk (control purposes)
                    #plot.plot_spectrogram(mel_prediction, os.path.join(plot_dir, 'step-{}-mel-spectrogram.png'.format(step)),
                    #                      title='{}, {}, step={}, loss={:.5f}'.format(
                    #                          args.model, time_string(), step, loss),
                    #                      target_spectrogram=target,
                    #                      max_len=target_length)

                    #log('Input at step {}: {}'.format(
                    #    step, sequence_to_text(input_seq)))

            log('Extractron training complete after {} global steps!'.format(
                args.extractron_train_steps), slack=True)
            return save_dir

        except Exception as e:
            log('Exiting due to exception: {}'.format(e), slack=True)
            traceback.print_exc()
def save_image(mel, name):
    # print(type(mel))
    # newmel = mel.detach().numpy()[0]
    spectrogram1 = plot_spectrogram_to_numpy(mel)
    plt.imsave(os.path.join('validation_tests', name + '.png'),
               spectrogram1.transpose((1, 2, 0)))
generated = torch.load(...,  # checkpoint path elided in the original excerpt
                       map_location=torch.device('cpu'))
# generated = torch.load('../../hw_blizzard_compressed.pt')
generated_np = generated[0].cpu().detach().numpy()


# Turn the inferred mel spectrogram back into audio
def denormalize(x):
    return (np.clip(x, 0.0, 1.0) - 1.0) * 80.0


x = librosa.db_to_power(denormalize(generated_np) + 20.0)
y = librosa.feature.inverse.mel_to_audio(M=x, sr=7000, n_fft=1080,
                                         hop_length=150, win_length=1080)
sf.write('../Melnet files/librosa_testing/hello_world_blizzard_reconstructed2.wav',
         y, 7000, 'PCM_24')

y, sr = librosa.load('../Melnet files/librosa_testing/hello_world.mp3')
ms = librosa.feature.melspectrogram(y=y, sr=sr)
spectrogram = plot_spectrogram_to_numpy(ms)  # original called this with no argument

# Turn my voice's mel spectrogram back into audio
S = librosa.feature.inverse.mel_to_stft(ms)
y = librosa.griffinlim(S)
sf.write('../Melnet files/librosa_testing/hello_world_reconstructed.wav',
         y, sr, 'PCM_24')
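# For reference: denormalize() above is the inverse of a normalization of
# this shape (inferred from the arithmetic, not taken from the source),
# mapping a dB-scale mel spectrogram from [-80, 0] dB into [0, 1].
def normalize(db_spec):
    return np.clip(db_spec / 80.0 + 1.0, 0.0, 1.0)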