def do_convert(args, logdir1, logdir2):
    # Load graph
    model = Net2()

    data = get_mfccs_and_spectrogram(args.file)

    ckpt1 = '{}/{}'.format(logdir1, args.net1) if args.net1 \
        else tf.train.latest_checkpoint(logdir1)
    ckpt2 = '{}/{}'.format(logdir2, args.net2) if args.net2 \
        else tf.train.latest_checkpoint(logdir2)

    session_inits = []
    if ckpt2:
        session_inits.append(SaverRestore(ckpt2))
    if ckpt1:
        session_inits.append(SaverRestore(ckpt1, ignore=['global_step']))
    pred_conf = PredictConfig(model=model,
                              input_names=get_eval_input_names(),
                              output_names=get_eval_output_names(),
                              session_init=ChainInit(session_inits))
    predictor = OfflinePredictor(pred_conf)

    audio, y_audio, ppgs = convert(predictor, data)

    target_file = args.file.split('/')[-1]
    portion = os.path.splitext(target_file)
    # converted_file = target_file.split('.')[0] + '_converted.wav'
    converted_file = portion[0] + '.wav'
    write_wav(audio[0], hp.Default.sr, args.savepath + converted_file)

    # Write the result
    tf.summary.audio('A', y_audio, hp.Default.sr, max_outputs=hp.Convert.batch_size)
    tf.summary.audio('B', audio, hp.Default.sr, max_outputs=hp.Convert.batch_size)

    # Visualize PPGs
    heatmap = np.expand_dims(ppgs, 3)  # channel=1
    tf.summary.image('PPG', heatmap, max_outputs=ppgs.shape[0])

    writer = tf.summary.FileWriter(args.savepath)
    with tf.Session() as sess:
        summ = sess.run(tf.summary.merge_all())
    writer.add_summary(summ)
    writer.close()
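# A minimal, hypothetical driver for do_convert above. The argument names
# (--file, --net1, --net2, --savepath) mirror the fields the function reads;
# the logdir paths are placeholders, not the project's actual layout.
import argparse

def _parse_args():
    parser = argparse.ArgumentParser(description='Convert a wav using Net1 + Net2 checkpoints.')
    parser.add_argument('--file', required=True, help='Path of the input wav to convert.')
    parser.add_argument('--net1', default=None, help='Net1 checkpoint name; latest checkpoint if omitted.')
    parser.add_argument('--net2', default=None, help='Net2 checkpoint name; latest checkpoint if omitted.')
    parser.add_argument('--savepath', default='outputs/',
                        help='Directory (with trailing slash, since it is string-concatenated) for the converted wav.')
    return parser.parse_args()

if __name__ == '__main__':
    do_convert(_parse_args(), logdir1='logdir/train1', logdir2='logdir/train2')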
def predict(self, path_to_wav):
    x_mfcc, y_spec, mel = get_mfccs_and_spectrogram(path_to_wav)
    pred_spec, y_spec, ppgs = self.predictor(x_mfcc, y_spec, mel)

    # Denormalize from the normalized range back to dB, then to linear magnitude.
    pred_spec = denormalize_db(pred_spec, hp.default.max_db, hp.default.min_db)
    y_spec = denormalize_db(y_spec, hp.default.max_db, hp.default.min_db)
    pred_spec = db2amp(pred_spec)
    y_spec = db2amp(y_spec)

    # Emphasize the magnitude before reconstruction.
    pred_spec = np.power(pred_spec, hp.convert.emphasis_magnitude)
    y_spec = np.power(y_spec, hp.convert.emphasis_magnitude)

    # Spectrogram to waveform. Note: in Python 3, map() returns an iterator,
    # so np.array(map(...)) yields a useless 0-d object array; the results
    # must be materialized, e.g. with a list comprehension.
    audio = np.array([
        spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                 hp.default.hop_length, hp.default.n_iter)
        for spec in pred_spec
    ])
    y_audio = np.array([
        spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                 hp.default.hop_length, hp.default.n_iter)
        for spec in y_spec
    ])
    return audio, y_audio, ppgs
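# spec2wav is not defined in this snippet. Below is a minimal, compatible
# sketch using librosa's Griffin-Lim phase reconstruction -- an assumption
# about the repo's implementation, which may instead hand-roll the iterative
# ISTFT. n_fft is accepted for signature compatibility but is implied by the
# spectrogram height (1 + n_fft // 2 frequency bins).
import librosa

def spec2wav(mag, n_fft, win_length, hop_length, num_iters):
    # mag: linear magnitude spectrogram of shape [1 + n_fft // 2, time].
    return librosa.griffinlim(mag, n_iter=num_iters,
                              hop_length=hop_length, win_length=win_length)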
def generate_npz(wav_files):
    for wav_file in wav_files:
        # Swap only the extension; a bare str.replace('wav', 'npz') would also
        # rewrite any 'wav' occurring elsewhere in the path (e.g. a 'wavs/' dir).
        f_name = os.path.splitext(wav_file)[0] + '.npz'
        mfccs, mag_db, mel_db = get_mfccs_and_spectrogram(wav_file)
        np.savez(f_name, mfccs=mfccs, mag_db=mag_db, mel_db=mel_db)
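# A small usage sketch for generate_npz: glob a hypothetical dataset directory,
# build the .npz feature cache, and read one entry back to sanity-check it.
import glob
import os
import numpy as np

wav_files = sorted(glob.glob('datasets/speaker1/*.wav'))  # hypothetical layout
generate_npz(wav_files)

with np.load(os.path.splitext(wav_files[0])[0] + '.npz') as feats:
    print(feats['mfccs'].shape, feats['mag_db'].shape, feats['mel_db'].shape)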
def get_network_output(
        wav,
        ckpt_dir=CKPT_DIR,
        out_path_fmt="extern/deep_voice_conversion/outputs/test2_{:04d}_{:04d}.png"):
    """Computes PPGs for loaded wav audio.

    Splits the input wav into two-second clips, runs each through the phoneme
    classifier, and writes each resulting PPG heatmap to `out_path_fmt`.

    :param wav: Loaded wav audio.
    :param ckpt_dir: Directory containing pretrained 'Net1' weights.
    :param out_path_fmt: Output path format template for each network output.
    """
    assert os.path.isdir(ckpt_dir)

    # Make the output directory if it does not exist yet.
    out_dir = os.path.dirname(out_path_fmt)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    # Initialize the offline predictor.
    predictor = init_predictor(ckpt_dir)

    # Split the wav into 2-second clips.
    length = hp.default.sr * hp.default.duration
    splits = list(range(length, wav.shape[0], length))
    wavs = np.array_split(wav, splits, axis=0)
    print("Original wav length is", len(wav), "with sample rate", hp.default.sr)
    print("Length of wavs is", [len(x) for x in wavs])

    # mfcc_batch: [b=num_splits, time/length, feats]
    mfcc_batch = np.array(
        [data_load.get_mfccs_and_spectrogram(wav=wav_)[0] for wav_ in wavs])
    print("Length of mfccs is", [len(x) for x in mfcc_batch])

    # ppgs: (N, T, V)
    preds = predictor(mfcc_batch)
    assert len(preds) == 1
    ppgs = preds[0]
    print("Length of ppgs is", [len(x) for x in ppgs])

    # Output each PPG. scipy.misc.toimage is deprecated (generalized image
    # conversion is hard to support, and we know our exact range here), so each
    # [0, 1] heatmap is scaled to [0, 255] manually and saved via PIL instead.
    from PIL import Image
    heatmaps = []
    for i, heatmap in enumerate(ppgs):
        assert 0 <= np.min(heatmap) <= np.max(heatmap) <= 1.0
        out_path = out_path_fmt.format(i, len(splits) - 1)
        print("Writing heatmap '{}' of shape '{}' to '{}'".format(
            i, heatmap.shape, out_path))

        # Convert the [0, 1] heatmap to [0, 255] and save it as a grayscale image.
        heatmap = 255 * heatmap
        Image.fromarray(heatmap.astype(np.uint8)).save(out_path)

        # Accumulate the scaled heatmap.
        heatmaps.append(heatmap)
    return heatmaps
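# A usage sketch for get_network_output, assuming librosa for loading audio at
# the repo's default sample rate; the wav path is hypothetical, and CKPT_DIR
# must point at trained Net1 weights.
import librosa

wav, _ = librosa.load('samples/arctic_a0001.wav', sr=hp.default.sr)
heatmaps = get_network_output(wav)
print("Got {} heatmaps; first has shape {}".format(len(heatmaps), heatmaps[0].shape))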
def do_convert(args, logdir1, logdir2):
    print("do_convert")

    # Load graph
    ckpt1 = tf.train.latest_checkpoint(logdir1)
    ckpt2 = '{}/{}'.format(logdir2, args.ckpt) if args.ckpt \
        else tf.train.latest_checkpoint(logdir2)
    model = Net2ForConvert()

    session_inits = []
    if ckpt2:
        session_inits.append(SaverRestore(ckpt2))
    if ckpt1:
        session_inits.append(SaverRestore(ckpt1, ignore=['global_step']))

    # Pin the process to the requested GPU, or force CPU-only execution.
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

    print("PredictConfig")
    pred_conf = PredictConfig(model=model,
                              input_names=get_eval_input_names(),
                              output_names=get_eval_output_names(),
                              session_init=ChainInit(session_inits))

    print("OfflinePredictor")
    set_env_s = datetime.datetime.now()
    predictor = OfflinePredictor(pred_conf)
    set_env_e = datetime.datetime.now()
    set_env_t = set_env_e - set_env_s
    print("Setting Environment time: {}s".format(set_env_t.seconds))

    while True:
        input_name = input("Enter the path of the audio file to convert (or 'quit'): ")
        if input_name == 'quit':
            break
        elif len(glob.glob(input_name)) == 0:
            print("That audio file doesn't exist! Try something else.")
            continue

        convert_s = datetime.datetime.now()

        mfcc, spec, mel_spec = get_mfccs_and_spectrogram(input_name, trim=False,
                                                         isConverting=True)
        mfcc = np.expand_dims(mfcc, axis=0)
        spec = np.expand_dims(spec, axis=0)
        mel_spec = np.expand_dims(mel_spec, axis=0)
        output_audio, ppgs = convert(predictor, mfcc, spec, mel_spec)

        input_audio, samplerate = load(input_name, sr=hp.default.sr, dtype=np.float64)

        """
        # F0 adaptation with the WORLD vocoder
        f0_conv_s = datetime.datetime.now()
        output_audio = f0_adapt(input_audio, output_audio, logdir2, samplerate)
        f0_conv_e = datetime.datetime.now()
        f0_conv_time = f0_conv_e - f0_conv_s
        print("F0 Adapting Time: {}s".format(f0_conv_time.seconds))
        """

        # Save the voice-converted audio as a 32-bit float wav file.
        output_audio = output_audio.astype(np.float32)
        write_wav(path="./converted/" + input_name, y=output_audio, sr=hp.default.sr)

        # Save the PPG data as a grayscale image and as a raw binary file.
        ppgs = np.squeeze(ppgs, axis=0)
        plt.imsave('./converted/debug/' + input_name + '.png', ppgs, cmap='binary')
        np.save('./converted/debug/' + input_name + '.npy', ppgs)

        convert_e = datetime.datetime.now()
        convert_time = convert_e - convert_s
        print("Total Converting Time: {}s".format(convert_time.seconds))
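# f0_adapt is invoked in the disabled block above but its implementation is not
# shown here. Below is a minimal sketch of one common approach -- rescaling the
# F0 contour to target-speaker log-F0 statistics with the WORLD vocoder
# (pyworld). This is an assumption, not the repo's actual method; the real
# f0_adapt also receives logdir2, presumably to load the target speaker's F0
# statistics, which are hypothetical parameters here.
import numpy as np
import pyworld as pw

def f0_adapt_sketch(converted_audio, samplerate, target_logf0_mean, target_logf0_std):
    x = converted_audio.astype(np.float64)  # pyworld requires float64 input

    # WORLD analysis: F0 contour, spectral envelope, aperiodicity.
    f0, t = pw.harvest(x, samplerate)
    sp = pw.cheaptrick(x, f0, t, samplerate)
    ap = pw.d4c(x, f0, t, samplerate)

    # Linear transform in the log-F0 domain over voiced frames only
    # (unvoiced frames are marked by f0 == 0 and stay zero).
    voiced = f0 > 0
    logf0 = np.log(f0[voiced])
    new_f0 = np.zeros_like(f0)
    new_f0[voiced] = np.exp((logf0 - logf0.mean()) / logf0.std()
                            * target_logf0_std + target_logf0_mean)

    # Resynthesize with the adapted F0 contour.
    return pw.synthesize(new_f0, sp, ap, samplerate)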