def test_folder_wav(self, folder_name):
    """Synthesizes every .wav file in a folder and saves the output audio and features."""
    sess = tf.Session()
    self.load_model(sess, log_dir=config.log_dir)
    file_list = [x for x in os.listdir(folder_name) if x.endswith('.wav') and not x.startswith('.')]
    count = 0
    unprocessable = []
    for file_name in file_list:
        try:
            mel = self.read_wav_file(os.path.join(folder_name, file_name))
            out_mel, out_f0, out_vuv = self.process_file(mel, sess)
            out_featss = np.concatenate((out_mel, out_f0, out_vuv), axis=-1)
            audio_out = utils.feats_to_audio(out_featss)
            sf.write(os.path.join(config.output_dir, '{}_output.wav'.format(file_name.split('/')[-1][:-4])), audio_out, config.fs)
            np.save(os.path.join(config.output_dir_np, file_name[:-4]), out_featss)
        except Exception:
            unprocessable.append(file_name)
        count += 1
        utils.progress(count, len(file_list), "Files processed")
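# The loop above records failed files in `unprocessable` but never reports them.
# A minimal sketch of a summary step that could follow the batch loop; the
# helper name is hypothetical, not part of the original module.
def report_unprocessable(unprocessable, total):
    """Prints a short summary of files that failed during batch synthesis."""
    if unprocessable:
        print("Could not process {} of {} files:".format(len(unprocessable), total))
        for name in unprocessable:
            print("  " + name)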
def test_file_hdf5(self, file_name, file_name_singer):
    """Synthesizes the output for an HDF5 file, conditioned on the singer taken from a second HDF5 file."""
    sess = tf.Session()
    self.load_model(sess, log_dir=config.log_dir)
    voc_stft, feats = self.read_hdf5_file(file_name)
    voc_stft_singer, feats_singer = self.read_hdf5_file(file_name_singer)
    out_feats = self.process_file(voc_stft, voc_stft_singer, sess)
    self.plot_features(feats, out_feats)
    out_featss = np.concatenate((out_feats[:feats.shape[0]], feats[:out_feats.shape[0], -2:]), axis=-1)
    utils.feats_to_audio(out_featss, file_name[:-4] + 'gan_op.wav')
def test_file_hdf5(self, file_name, speaker_index, speaker_index_2):
    """Converts the voice in an HDF5 file from one speaker to another, with an optional gender change."""
    sess = tf.Session()
    self.load_model(sess, log_dir=config.log_dir)
    mel = self.read_hdf5_file(file_name)
    out_mel = self.process_file(mel[:, :-2], speaker_index, speaker_index_2, sess)
    self.plot_features(mel, out_mel)
    synth = utils.query_yes_no("Synthesize output? ")
    if synth:
        gen_change = utils.query_yes_no("Change in gender? ")
        if gen_change:
            female_male = utils.query_yes_no("Female to male? ")
            if female_male:
                # Shift the f0 track down one octave (12 semitones) for female-to-male conversion.
                out_featss = np.concatenate((out_mel[:mel.shape[0]], mel[:out_mel.shape[0], -2:-1] - 12, mel[:out_mel.shape[0], -1:]), axis=-1)
            else:
                # Shift the f0 track up one octave for male-to-female conversion.
                out_featss = np.concatenate((out_mel[:mel.shape[0]], mel[:out_mel.shape[0], -2:-1] + 12, mel[:out_mel.shape[0], -1:]), axis=-1)
        else:
            out_featss = np.concatenate((out_mel[:mel.shape[0]], mel[:out_mel.shape[0], -2:]), axis=-1)
        audio_out = utils.feats_to_audio(out_featss)
        sf.write('./{}_{}.wav'.format(file_name[:-5], config.singers[speaker_index_2]), audio_out, config.fs)
    synth_ori = utils.query_yes_no("Synthesize ground truth with vocoder? ")
    if synth_ori:
        audio = utils.feats_to_audio(mel)
        sf.write('./{}_{}_ori.wav'.format(file_name[:-5], config.singers[speaker_index]), audio, config.fs)
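# The gender-change branches above shift the f0 column by 12 semitones, which
# assumes f0 is stored on a semitone (MIDI-like) scale. The same idea as a
# standalone sketch; the helper name is hypothetical.
import numpy as np

def shift_f0_octaves(feats, n_octaves):
    """Returns a copy of the feature matrix with the f0 column (index -2) shifted by whole octaves."""
    shifted = np.copy(feats)
    shifted[:, -2] += 12 * n_octaves
    return shifted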
def test_model_yam(self):
    """Extracts vocals from a wav file by mixing a vocal and a backing track and passing the mixture through the model."""
    sess = tf.Session()
    self.load_model(sess, log_dir=config.log_dir)
    voc_stft = utils.file_to_stft('./Bria_000_VoU67.wav')
    back_stft = utils.file_to_stft('./Bria_000_Back.wav')
    mix_stft = np.clip((voc_stft[:len(back_stft)] + back_stft[:len(voc_stft)]) / 2, 0.0, 1.0)
    feats = utils.input_to_feats('./Bria_000_VoU67.wav')
    out_feats = self.process_file(mix_stft, sess)
    self.plot_features(feats, out_feats)
    out_featss = np.concatenate((out_feats[:feats.shape[0], :-2], feats[:out_feats.shape[0], -2:]), axis=-1)
    utils.feats_to_audio(out_featss[:3000], 'Bree_output')
def test_file_hdf5(self, file_name, singer_index):
    """Synthesizes vocoder features from an HDF5 file, conditioned on f0, phonemes, and the given singer."""
    sess = tf.Session()
    self.load_model(sess, log_dir=config.log_dir)
    feats, f0_nor, pho_target = self.read_hdf5_file(file_name)
    out_feats = self.process_file(f0_nor, pho_target, singer_index, sess)
    self.plot_features(feats, out_feats)
    synth = utils.query_yes_no("Synthesize output? ")
    if synth:
        out_featss = np.concatenate((out_feats[:feats.shape[0]], feats[:out_feats.shape[0], -2:]), axis=-1)
        utils.feats_to_audio(out_featss, file_name[:-4] + 'output')
    synth_ori = utils.query_yes_no("Synthesize ground truth with vocoder? ")
    if synth_ori:
        utils.feats_to_audio(feats, file_name[:-4] + 'ground_truth')
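# A recurring pattern in the test functions above: the network predicts the
# spectral features while the last two columns (f0 and voiced/unvoiced) are
# carried over from the ground truth before vocoding. A sketch of that
# combination step; the helper name is hypothetical.
import numpy as np

def combine_with_gt_f0(pred_feats, gt_feats):
    """Trims both feature matrices to the common length and appends the ground-truth f0/vuv columns."""
    n = min(len(pred_feats), len(gt_feats))
    return np.concatenate((pred_feats[:n], gt_feats[:n, -2:]), axis=-1)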
def synth_file(file_name="nus_MCUR_sing_10.hdf5", singer_index=0, file_path=config.wav_dir, show_plots=True, save_file="GBO"): stat_file = h5py.File(config.stat_dir + 'stats.hdf5', mode='r') max_feat = np.array(stat_file["feats_maximus"]) min_feat = np.array(stat_file["feats_minimus"]) with tf.Graph().as_default(): input_placeholder = tf.placeholder(tf.float32, shape=(config.batch_size, config.max_phr_len, 66), name='input_placeholder') tf.summary.histogram('inputs', input_placeholder) output_placeholder = tf.placeholder(tf.float32, shape=(config.batch_size, config.max_phr_len, 64), name='output_placeholder') f0_input_placeholder = tf.placeholder(tf.float32, shape=(config.batch_size, config.max_phr_len, 1), name='f0_input_placeholder') rand_input_placeholder = tf.placeholder(tf.float32, shape=(config.batch_size, config.max_phr_len, 4), name='rand_input_placeholder') prob = tf.placeholder_with_default(1.0, shape=()) phoneme_labels = tf.placeholder(tf.int32, shape=(config.batch_size, config.max_phr_len), name='phoneme_placeholder') phone_onehot_labels = tf.one_hot(indices=tf.cast( phoneme_labels, tf.int32), depth=42) phoneme_labels_2 = tf.placeholder(tf.float32, shape=(config.batch_size, config.max_phr_len, 42), name='phoneme_placeholder_1') # phone_onehot_labels = tf.one_hot(indices=tf.cast(phoneme_labels, tf.int32), depth=42) singer_labels = tf.placeholder(tf.float32, shape=(config.batch_size), name='singer_placeholder') singer_onehot_labels = tf.one_hot(indices=tf.cast( singer_labels, tf.int32), depth=12) with tf.variable_scope('phone_Model') as scope: # regularizer = tf.contrib.layers.l2_regularizer(scale=0.1) pho_logits = modules.phone_network(input_placeholder) pho_classes = tf.argmax(pho_logits, axis=-1) pho_probs = tf.nn.softmax(pho_logits) with tf.variable_scope('Final_Model') as scope: voc_output = modules.final_net(singer_onehot_labels, f0_input_placeholder, phoneme_labels_2) voc_output_decoded = tf.nn.sigmoid(voc_output) scope.reuse_variables() voc_output_3 = modules.final_net(singer_onehot_labels, f0_input_placeholder, pho_probs) voc_output_3_decoded = tf.nn.sigmoid(voc_output_3) # scope.reuse_variables() # voc_output_gen = modules.final_net(singer_onehot_labels, f0_input_placeholder, pho_probs) # voc_output_decoded_gen = tf.nn.sigmoid(voc_output_gen) # with tf.variable_scope('singer_Model') as scope: # singer_embedding, singer_logits = modules.singer_network(input_placeholder, prob) # singer_classes = tf.argmax(singer_logits, axis=-1) # singer_probs = tf.nn.softmax(singer_logits) with tf.variable_scope('Generator') as scope: voc_output_2 = modules.GAN_generator(singer_onehot_labels, phoneme_labels_2, f0_input_placeholder, rand_input_placeholder) with tf.variable_scope('Discriminator') as scope: D_fake = modules.GAN_discriminator(voc_output_2, singer_onehot_labels, phone_onehot_labels, f0_input_placeholder) saver = tf.train.Saver(max_to_keep=config.max_models_to_keep) init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) sess = tf.Session() sess.run(init_op) ckpt = tf.train.get_checkpoint_state(config.log_dir) if ckpt and ckpt.model_checkpoint_path: print("Using the model in %s" % ckpt.model_checkpoint_path) saver.restore(sess, ckpt.model_checkpoint_path) # saver.restore(sess, './log/model.ckpt-3999') # import pdb;pdb.set_trace() voc_file = h5py.File(config.voice_dir + file_name, "r") # speaker_file = h5py.File(config.voice_dir+speaker_file, "r") feats = np.array(voc_file['feats']) # feats = 
utils.input_to_feats('./54228_chorus.wav_ori_vocals.wav', mode = 1) f0 = feats[:, -2] # import pdb;pdb.set_trace() med = np.median(f0[f0 > 0]) f0[f0 == 0] = med f0 = f0 - 12 feats[:, -2] = feats[:, -2] - 12 f0_nor = (f0 - min_feat[-2]) / (max_feat[-2] - min_feat[-2]) feats = (feats - min_feat) / (max_feat - min_feat) pho_target = np.array(voc_file["phonemes"]) in_batches_f0, nchunks_in = utils.generate_overlapadd( f0_nor.reshape(-1, 1)) in_batches_pho, nchunks_in_pho = utils.generate_overlapadd( pho_target.reshape(-1, 1)) in_batches_feat, kaka = utils.generate_overlapadd(feats) # import pdb;pdb.set_trace() out_batches_feats = [] out_batches_feats_1 = [] out_batches_feats_gan = [] for in_batch_f0, in_batch_pho_target, in_batch_feat in zip( in_batches_f0, in_batches_pho, in_batches_feat): in_batch_f0 = in_batch_f0.reshape( [config.batch_size, config.max_phr_len, 1]) in_batch_pho_target = in_batch_pho_target.reshape( [config.batch_size, config.max_phr_len]) # in_batch_pho_target = sess.run(pho_probs, feed_dict = {input_placeholder: in_batch_feat}) output_feats, output_feats_1, output_feats_gan = sess.run( [voc_output_decoded, voc_output_3_decoded, voc_output_2], feed_dict={ input_placeholder: in_batch_feat, f0_input_placeholder: in_batch_f0, phoneme_labels_2: in_batch_pho_target, singer_labels: np.ones(30) * singer_index, rand_input_placeholder: np.random.normal(-1.0, 1.0, size=[30, config.max_phr_len, 4]) }) out_batches_feats.append(output_feats) out_batches_feats_1.append(output_feats_1) out_batches_feats_gan.append(output_feats_gan / 2 + 0.5) # out_batches_voc_stft_phase.append(output_voc_stft_phase) # import pdb;pdb.set_trace() out_batches_feats = np.array(out_batches_feats) # import pdb;pdb.set_trace() out_batches_feats = utils.overlapadd(out_batches_feats, nchunks_in) out_batches_feats_1 = np.array(out_batches_feats_1) # import pdb;pdb.set_trace() out_batches_feats_1 = utils.overlapadd(out_batches_feats_1, nchunks_in) out_batches_feats_gan = np.array(out_batches_feats_gan) # import pdb;pdb.set_trace() out_batches_feats_gan = utils.overlapadd(out_batches_feats_gan, nchunks_in) feats = feats * (max_feat - min_feat) + min_feat out_batches_feats = out_batches_feats * (max_feat[:-2] - min_feat[:-2]) + min_feat[:-2] out_batches_feats_1 = out_batches_feats_1 * ( max_feat[:-2] - min_feat[:-2]) + min_feat[:-2] out_batches_feats_gan = out_batches_feats_gan * ( max_feat[:-2] - min_feat[:-2]) + min_feat[:-2] out_batches_feats = out_batches_feats[:len(feats)] out_batches_feats_1 = out_batches_feats_1[:len(feats)] out_batches_feats_gan = out_batches_feats_gan[:len(feats)] first_op = np.concatenate([out_batches_feats, feats[:, -2:]], axis=-1) pho_op = np.concatenate([out_batches_feats_1, feats[:, -2:]], axis=-1) gan_op = np.concatenate([out_batches_feats_gan, feats[:, -2:]], axis=-1) # import pdb;pdb.set_trace() gan_op = np.ascontiguousarray(gan_op) pho_op = np.ascontiguousarray(pho_op) first_op = np.ascontiguousarray(first_op) if show_plots: plt.figure(1) ax1 = plt.subplot(311) plt.imshow(feats[:, :60].T, aspect='auto', origin='lower') ax1.set_title("Ground Truth Vocoder Features", fontsize=10) ax2 = plt.subplot(312, sharex=ax1, sharey=ax1) plt.imshow(out_batches_feats[:, :60].T, aspect='auto', origin='lower') ax2.set_title("Cross Entropy Output Vocoder Features", fontsize=10) ax3 = plt.subplot(313, sharex=ax1, sharey=ax1) ax3.set_title("GAN Vocoder Output Features", fontsize=10) # plt.imshow(out_batches_feats_1[:,:60].T,aspect='auto',origin='lower') # # plt.subplot(414, sharex = ax1, sharey = ax1) 
plt.imshow(out_batches_feats_gan[:, :60].T, aspect='auto', origin='lower') plt.figure(2) plt.subplot(211) plt.imshow(feats[:, 60:-2].T, aspect='auto', origin='lower') plt.subplot(212) plt.imshow(out_batches_feats[:, -4:].T, aspect='auto', origin='lower') plt.show() save_file = input( "Which files to synthesise G for GAN, B for Binary Entropy, " "O for original, or any combination. Default is None").upper( ) or "N" else: save_file = input( "Which files to synthesise G for GAN, B for Binary Entropy, " "O for original, or any combination. Default is all (GBO)" ).upper() or "GBO" if "G" in save_file: utils.feats_to_audio(gan_op[:, :], file_name[:-4] + 'gan_op.wav') print("GAN file saved to {}".format( os.path.join(config.val_dir, file_name[:-4] + 'gan_op.wav'))) if "O" in save_file: utils.feats_to_audio(feats[:, :], file_name[:-4] + 'ori_op.wav') print("Originl file, resynthesized via WORLD vocoder saved to {}". format( os.path.join(config.val_dir, file_name[:-4] + 'ori_op.wav'))) # if "B" in save_file: # # utils.feats_to_audio(pho_op[:5000,:],file_name[:-4]+'phoop.wav') # utils.feats_to_audio(first_op[:, :], file_name[:-4] + 'bce_op.wav') print("Binar cross entropy file saved to {}".format( os.path.join(config.val_dir, file_name[:-4] + 'bce_op.wav')))
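# The synthesis above runs a fixed-size graph over arbitrary-length tracks via
# utils.generate_overlapadd / utils.overlapadd. A self-contained sketch of that
# idea under simplifying assumptions (uniform averaging of overlapping windows;
# the real utils functions may window and batch differently).
import numpy as np

def split_overlap(track, window, hop):
    """Splits a (time, features) array into overlapping windows, zero-padding the end."""
    n = int(np.ceil(max(len(track) - window, 0) / hop)) + 1
    padded = np.pad(track, [(0, n * hop + window - len(track)), (0, 0)], 'constant')
    return np.array([padded[i * hop:i * hop + window] for i in range(n)]), len(track)

def join_overlap(chunks, length, hop):
    """Rebuilds the full track by averaging the overlapping windows."""
    window = chunks.shape[1]
    out = np.zeros((len(chunks) * hop + window, chunks.shape[-1]))
    weight = np.zeros((len(out), 1))
    for i, chunk in enumerate(chunks):
        out[i * hop:i * hop + window] += chunk
        weight[i * hop:i * hop + window] += 1
    return (out / np.maximum(weight, 1))[:length]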
def synth_file(file_name, file_path=config.wav_dir, show_plots=True, save_file=True):

    # Resolve the dataset prefix to the right directory and write out the original audio.
    if file_name.startswith('ikala'):
        file_name = file_name[6:]
        file_path = config.wav_dir
        utils.write_ori_ikala(os.path.join(file_path, file_name), file_name)
        mode = 0
    elif file_name.startswith('mir'):
        file_name = file_name[4:]
        file_path = config.wav_dir_mir
        utils.write_ori_ikala(os.path.join(file_path, file_name), file_name)
        mode = 0
    elif file_name.startswith('med'):
        file_name = file_name[4:]
        file_path = config.wav_dir_med
        utils.write_ori_med(os.path.join(file_path, file_name), file_name)
        mode = 2
    else:
        mode = 1
        file_path = './'

    stat_file = h5py.File(config.stat_dir + 'stats.hdf5', mode='r')
    max_feat = np.array(stat_file["feats_maximus"])
    min_feat = np.array(stat_file["feats_minimus"])
    max_voc = np.array(stat_file["voc_stft_maximus"])
    min_voc = np.array(stat_file["voc_stft_minimus"])
    max_back = np.array(stat_file["back_stft_maximus"])
    min_back = np.array(stat_file["back_stft_minimus"])
    max_mix = np.array(max_voc) + np.array(max_back)

    with tf.Graph().as_default():

        input_placeholder = tf.placeholder(tf.float32, shape=(config.batch_size, config.max_phr_len, config.input_features), name='input_placeholder')

        with tf.variable_scope('First_Model') as scope:
            harm, ap, f0, vuv = modules.nr_wavenet(input_placeholder)

        if config.use_gan:
            with tf.variable_scope('Generator') as scope:
                gen_op = modules.GAN_generator(harm)

        saver = tf.train.Saver(max_to_keep=config.max_models_to_keep)
        init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())

        sess = tf.Session()
        sess.run(init_op)

        ckpt = tf.train.get_checkpoint_state(config.log_dir_m1)
        if ckpt and ckpt.model_checkpoint_path:
            print("Using the model in %s" % ckpt.model_checkpoint_path)
            saver.restore(sess, ckpt.model_checkpoint_path)

        mix_stft = utils.file_to_stft(os.path.join(file_path, file_name), mode=mode)
        targs = utils.input_to_feats(os.path.join(file_path, file_name), mode=mode)

        # Split the mixture STFT into overlapping batches and normalize.
        in_batches, nchunks_in = utils.generate_overlapadd(mix_stft)
        in_batches = in_batches / max_mix

        val_outer = []
        gan_op = []

        for in_batch in in_batches:
            val_harm, val_ap, val_f0, val_vuv = sess.run([harm, ap, f0, vuv], feed_dict={input_placeholder: in_batch})
            if config.use_gan:
                val_op = sess.run(gen_op, feed_dict={input_placeholder: in_batch})
                gan_op.append(val_op)
            val_outs = np.concatenate((val_harm, val_ap, val_f0, val_vuv), axis=-1)
            val_outer.append(val_outs)

        val_outer = np.array(val_outer)
        val_outer = utils.overlapadd(val_outer, nchunks_in)
        val_outer[:, -1] = np.round(val_outer[:, -1])
        val_outer = val_outer[:targs.shape[0], :]
        val_outer = np.clip(val_outer, 0.0, 1.0)

        if config.use_gan:
            gan_op = np.array(gan_op)
            gan_op = utils.overlapadd(gan_op, nchunks_in)

        targs = (targs - min_feat) / (max_feat - min_feat)

        if show_plots:
            ins = val_outer[:, :60]
            outs = targs[:, :60]

            plt.figure(1)
            ax1 = plt.subplot(211)
            plt.imshow(ins.T, origin='lower', aspect='auto')
            ax1.set_title("Predicted Harm", fontsize=10)
            ax2 = plt.subplot(212)
            plt.imshow(outs.T, origin='lower', aspect='auto')
            ax2.set_title("Ground Truth Harm", fontsize=10)

            if config.use_gan:
                plt.figure(5)
                ax1 = plt.subplot(411)
                plt.imshow(ins.T, origin='lower', aspect='auto')
                ax1.set_title("Predicted Harm", fontsize=10)
                ax2 = plt.subplot(414)
                plt.imshow(outs.T, origin='lower', aspect='auto')
                ax2.set_title("Ground Truth Harm", fontsize=10)
                ax3 = plt.subplot(412)
                plt.imshow(gan_op.T, origin='lower', aspect='auto')
                ax3.set_title("GAN output", fontsize=10)
                ax4 = plt.subplot(413)
                plt.imshow((gan_op[:ins.shape[0], :] + ins).T, origin='lower', aspect='auto')
                ax4.set_title("GAN output plus prediction", fontsize=10)

            plt.figure(2)
            ax1 = plt.subplot(211)
            plt.imshow(val_outer[:, 60:-2].T, origin='lower', aspect='auto')
            ax1.set_title("Predicted Aperiodic Part", fontsize=10)
            ax2 = plt.subplot(212)
            plt.imshow(targs[:, 60:-2].T, origin='lower', aspect='auto')
            ax2.set_title("Ground Truth Aperiodic Part", fontsize=10)

            plt.figure(3)
            # Denormalize the f0 tracks, mask unvoiced frames, and plot NaNs as gaps.
            f0_output = val_outer[:, -2] * (max_feat[-2] - min_feat[-2]) + min_feat[-2]
            f0_output = f0_output * (1 - targs[:, -1])
            f0_output[f0_output == 0] = np.nan
            plt.plot(f0_output, label="Predicted Value")
            f0_gt = targs[:, -2] * (max_feat[-2] - min_feat[-2]) + min_feat[-2]
            f0_gt = f0_gt * (1 - targs[:, -1])
            f0_gt[f0_gt == 0] = np.nan
            plt.plot(f0_gt, label="Ground Truth")
            f0_difference = np.nan_to_num(abs(f0_gt - f0_output))
            f0_greater = np.where(f0_difference > config.f0_threshold)
            diff_per = f0_greater[0].shape[0] / len(f0_output)
            plt.suptitle("Percentage correct = " + '{:.3%}'.format(1 - diff_per))
            plt.legend()

            plt.figure(4)
            ax1 = plt.subplot(211)
            plt.plot(val_outer[:, -1])
            ax1.set_title("Predicted Voiced/Unvoiced", fontsize=10)
            ax2 = plt.subplot(212)
            plt.plot(targs[:, -1])
            ax2.set_title("Ground Truth Voiced/Unvoiced", fontsize=10)
            plt.show()

        if save_file:
            val_outer = np.ascontiguousarray(val_outer * (max_feat - min_feat) + min_feat)
            targs = np.ascontiguousarray(targs * (max_feat - min_feat) + min_feat)
            try:
                utils.feats_to_audio(val_outer, file_name[:-4] + '_synth_pred_f0')
                print("File saved to %s" % (config.val_dir + file_name[:-4] + '_synth_pred_f0.wav'))
            except Exception:
                print("Couldn't synthesize with predicted f0")
            try:
                val_outer[:, -2:] = targs[:, -2:]
                utils.feats_to_audio(val_outer, file_name[:-4] + '_synth_ori_f0')
                print("File saved to %s" % (config.val_dir + file_name[:-4] + '_synth_ori_f0.wav'))
            except Exception:
                print("Couldn't synthesize with original f0")
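# All models here min-max normalize features to [0, 1] against the dataset
# statistics in stats.hdf5 and denormalize before vocoding. The round trip as a
# minimal sketch; the helper names are hypothetical (utils has its own versions).
def normalize(feats, min_feat, max_feat):
    """Maps features to [0, 1] using per-dimension dataset minima and maxima."""
    return (feats - min_feat) / (max_feat - min_feat)

def denormalize(feats, min_feat, max_feat):
    """Inverse of normalize: maps [0, 1] features back to the vocoder range."""
    return feats * (max_feat - min_feat) + min_feat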
def synth_file(file_name="015.hdf5", singer_index=0, file_path=config.wav_dir, show_plots=True): stat_file = h5py.File('./stats.hdf5', mode='r') max_feat = np.array(stat_file["feats_maximus"]) min_feat = np.array(stat_file["feats_minimus"]) with tf.Graph().as_default(): output_placeholder = tf.placeholder(tf.float32, shape=(config.batch_size, config.max_phr_len, 64), name='output_placeholder') f0_output_placeholder = tf.placeholder(tf.float32, shape=(config.batch_size, config.max_phr_len, 1), name='f0_output_placeholder') f0_input_placeholder = tf.placeholder(tf.float32, shape=(config.batch_size, config.max_phr_len), name='f0_input_placeholder') f0_onehot_labels = tf.one_hot(indices=tf.cast(f0_input_placeholder, tf.int32), depth=len(config.notes)) f0_context_placeholder = tf.placeholder(tf.float32, shape=(config.batch_size, config.max_phr_len, 1), name='f0_context_placeholder') phone_context_placeholder = tf.placeholder( tf.float32, shape=(config.batch_size, config.max_phr_len, 1), name='phone_context_placeholder') rand_input_placeholder = tf.placeholder(tf.float32, shape=(config.batch_size, config.max_phr_len, 64), name='rand_input_placeholder') prob = tf.placeholder_with_default(1.0, shape=()) phoneme_labels = tf.placeholder(tf.int32, shape=(config.batch_size, config.max_phr_len), name='phoneme_placeholder') phone_onehot_labels = tf.one_hot(indices=tf.cast( phoneme_labels, tf.int32), depth=len(config.phonemas)) with tf.variable_scope('Generator_feats') as scope: inputs = tf.concat([ phone_onehot_labels, f0_onehot_labels, phone_context_placeholder, f0_context_placeholder ], axis=-1) voc_output = modules.GAN_generator(inputs) with tf.variable_scope('Generator_f0') as scope: inputs = tf.concat([ phone_onehot_labels, f0_onehot_labels, phone_context_placeholder, f0_context_placeholder, output_placeholder ], axis=-1) # inputs = tf.concat([phone_onehot_labels, f0_onehot_labels, phone_context_placeholder, f0_context_placeholder, (voc_output/2)+0.5], axis = -1) f0_output = modules.GAN_generator_f0(inputs) scope.reuse_variables() inputs = tf.concat([ phone_onehot_labels, f0_onehot_labels, phone_context_placeholder, f0_context_placeholder, (voc_output / 2) + 0.5 ], axis=-1) f0_output_2 = modules.GAN_generator_f0(inputs) saver = tf.train.Saver(max_to_keep=config.max_models_to_keep) init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) sess = tf.Session() sess.run(init_op) ckpt = tf.train.get_checkpoint_state(config.log_dir) if ckpt and ckpt.model_checkpoint_path: print("Using the model in %s" % ckpt.model_checkpoint_path) saver.restore(sess, ckpt.model_checkpoint_path) # saver.restore(sess, './log/model.ckpt-3999') # import pdb;pdb.set_trace() feat_file = h5py.File(config.feats_dir + file_name, "r") # speaker_file = h5py.File(config.voice_dir+speaker_file, "r") # feats = utils.input_to_feats('./54228_chorus.wav_ori_vocals.wav', mode = 1) feats = feat_file["world_feats"][()] feats = (feats - min_feat) / (max_feat - min_feat) phones = feat_file["phonemes"][()] notes = feat_file["notes"][()] phones = np.concatenate([phones, notes], axis=-1) # in_batches_f0, nchunks_in = utils.generate_overlapadd(f0_nor.reshape(-1,1)) in_batches_pho, nchunks_in = utils.generate_overlapadd(phones) in_batches_feat, kaka = utils.generate_overlapadd(feats) noters = np.expand_dims( np.array([config.notes[int(x)] for x in notes[:, 0]]), 1) out_batches_feats = [] out_batches_f0 = [] for conds, feat in zip(in_batches_pho, in_batches_feat): # import pdb;pdb.set_trace() f0 = conds[:, :, 2] phones 
= conds[:, :, 0] f0_context = conds[:, :, -1:] phones_context = conds[:, :, 1:2] feed_dict = { f0_input_placeholder: f0, phoneme_labels: phones, phone_context_placeholder: phones_context, f0_context_placeholder: f0_context, output_placeholder: feat[:, :, :-2] } output_feats_gan, output_f0 = sess.run([voc_output, f0_output_2], feed_dict=feed_dict) out_batches_feats.append(output_feats_gan / 2 + 0.5) out_batches_f0.append(output_f0 / 2 + 0.5) # out_batches_voc_stft_phase.append(output_voc_stft_phase) out_batches_feats = np.array(out_batches_feats) out_batches_feats = utils.overlapadd(out_batches_feats, nchunks_in) out_batches_f0 = np.array(out_batches_f0) out_batches_f0 = utils.overlapadd(out_batches_f0, nchunks_in) feats = feats * (max_feat - min_feat) + min_feat out_batches_feats = out_batches_feats * (max_feat[:-2] - min_feat[:-2]) + min_feat[:-2] out_batches_feats = out_batches_feats[:len(feats)] out_batches_f0 = out_batches_f0 * (max_feat[-2] - min_feat[-2]) + min_feat[-2] out_batches_f0 = out_batches_f0[:len(feats)] diff_1 = (out_batches_f0 - noters) * (1 - feats[:, -1:]) diff_2 = (feats[:, -2:-1] - noters) * (1 - feats[:, -1:]) print("Mean predicted note deviation {}".format(diff_1.mean())) print("Mean original note deviation {}".format(diff_2.mean())) print("STD predicted note deviation {}".format(diff_1.std())) print("STD original note deviation {}".format(diff_2.std())) plt.figure(1) plt.suptitle("F0 contour") plt.plot(out_batches_f0, label='Predicted F0') plt.plot(feats[:, -2], label="Ground Truth F0") plt.plot(noters, label="Input Midi Note") # plt.plot(phones[:,]) plt.legend() # plt.figure(2) # ax1 = plt.subplot(211) # plt.imshow(feats[:,:60].T,aspect='auto',origin='lower') # ax1.set_title("Ground Truth Vocoder Features", fontsize=10) # ax2 = plt.subplot(212, sharex = ax1, sharey = ax1) # plt.imshow(out_batches_feats[:,:60].T,aspect='auto',origin='lower') # ax2.set_title("GAN Output Vocoder Features", fontsize=10) plt.show() import pdb pdb.set_trace() # out_batches_feats_gan= out_batches_feats_gan[:len(feats)] first_op = np.concatenate( [out_batches_feats, out_batches_f0, feats[:, -1:]], axis=-1) second_op = np.concatenate( [feats[:, 60:64], out_batches_f0, feats[:, -1:]], axis=-1) # pho_op = np.concatenate([out_batches_feats_1,feats[:,-2:]], axis = -1) # gan_op = np.concatenate([out_batches_feats_gan,feats[:,-2:]], axis = -1) # import pdb;pdb.set_trace() # gan_op = np.ascontiguousarray(gan_op) # pho_op = np.ascontiguousarray(pho_op) first_op = np.ascontiguousarray(first_op) second_op = np.ascontiguousarray(second_op) utils.feats_to_audio(first_op, file_name[:-4] + '_gan_op') print("Full output saved to {}".format( os.path.join(config.val_dir, file_name[:-4] + '_gan_op.wav'))) utils.feats_to_audio(first_op, file_name[:-4] + '_F0_op') print("Only F0 saved to {}".format( os.path.join(config.val_dir, file_name[:-4] + '_F0_op.wav')))
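# The deviation statistics printed above measure how far an f0 contour strays
# from the input MIDI note. A standalone sketch of the metric; unlike the code
# above, which zero-masks unvoiced frames before averaging, this version drops
# them entirely. The helper name is hypothetical; vuv is 1 for unvoiced frames.
import numpy as np

def note_deviation(f0, notes, vuv):
    """Returns mean and std of (f0 - note) over voiced frames only."""
    voiced = (vuv.flatten() == 0)
    dev = (f0 - notes).flatten()[voiced]
    return dev.mean(), dev.std()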
def synth_file(file_name, file_path=config.wav_dir, show_plots=True, save_file=True):

    debug = False

    # Resolve the dataset prefix to the right directory and write out the original audio.
    if file_name.startswith('ikala'):
        file_name = file_name[6:]
        file_path = config.wav_dir
        utils.write_ori_ikala(os.path.join(file_path, file_name), file_name)
    elif file_name.startswith('mir'):
        file_name = file_name[4:]
        file_path = config.wav_dir_mir
        utils.write_ori_ikala(os.path.join(file_path, file_name), file_name)

    stat_file = h5py.File(config.stat_dir + 'stats.hdf5', mode='r')
    max_feat = np.array(stat_file["feats_maximus"])
    min_feat = np.array(stat_file["feats_minimus"])
    max_voc = np.array(stat_file["voc_stft_maximus"])
    min_voc = np.array(stat_file["voc_stft_minimus"])
    max_back = np.array(stat_file["back_stft_maximus"])
    min_back = np.array(stat_file["back_stft_minimus"])
    max_mix = np.array(max_voc) + np.array(max_back)

    with tf.Graph().as_default():

        input_placeholder = tf.placeholder(tf.float32, shape=(1, config.max_phr_len, config.input_features), name='input_placeholder')
        tf.summary.histogram('conditioning', input_placeholder)
        input_placeholder_2 = tf.placeholder(tf.float32, shape=(1, config.max_phr_len, config.output_features), name='input_placeholder_2')
        tf.summary.histogram('inputs', input_placeholder_2)

        output, vuv = modules.wavenet(input_placeholder_2, input_placeholder)

        saver = tf.train.Saver()
        init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())

        sess = tf.Session()
        sess.run(init_op)

        ckpt = tf.train.get_checkpoint_state(config.log_dir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Using the model in %s" % ckpt.model_checkpoint_path)
            saver.restore(sess, ckpt.model_checkpoint_path)

        mix_stft = utils.file_to_stft(os.path.join(file_path, file_name))
        mix_stft = mix_stft / max_mix
        lent = len(mix_stft)
        mix_stft_in = np.pad(mix_stft, [(0, config.max_phr_len), (0, 0)], 'constant')

        targs = utils.input_to_feats(os.path.join(file_path, file_name))

        # Generate the output autoregressively, one frame at a time.
        outputs = np.zeros((1, config.max_phr_len, config.output_features))

        for index in range(lent):
            inputs = mix_stft_in[index:index + config.max_phr_len, :]
            inpy = inputs.reshape(1, config.max_phr_len, -1)
            if outputs.shape[1] > config.max_phr_len:
                # Feed back only the most recent window of generated frames.
                outpy = outputs[:, -config.max_phr_len:, :]
            else:
                outpy = outputs
            op, vu = sess.run([output, vuv], feed_dict={input_placeholder: inpy, input_placeholder_2: outpy})
            op = np.concatenate((op, vu), axis=-1)
            if debug:
                plt.figure(1)
                plt.subplot(311)
                plt.imshow(np.log(inputs.T), aspect='auto', origin='lower')
                plt.subplot(312)
                plt.imshow(targs[index:index + config.max_phr_len, :].T, aspect='auto', origin='lower')
                plt.subplot(313)
                plt.imshow(outpy.reshape(config.max_phr_len, -1).T, aspect='auto', origin='lower')
                plt.show()
            if index > config.max_phr_len:
                outputs = np.append(outputs, op[:, -1:, :], axis=1)
            else:
                outputs[:, :index, :] = op[:, :index, :]

        val_outer = outputs[0]
        val_outer[:, -1] = np.round(val_outer[:, -1])
        val_outer = val_outer[:targs.shape[0], :]
        val_outer = np.clip(val_outer, 0.0, 1.0)

        targs = (targs - min_feat) / (max_feat - min_feat)

        if show_plots:
            ins = val_outer[:, :60]
            outs = targs[:, :60]

            plt.figure(1)
            ax1 = plt.subplot(211)
            plt.imshow(ins.T, origin='lower', aspect='auto')
            ax1.set_title("Predicted Harm", fontsize=10)
            ax2 = plt.subplot(212)
            plt.imshow(outs.T, origin='lower', aspect='auto')
            ax2.set_title("Ground Truth Harm", fontsize=10)

            plt.figure(2)
            ax1 = plt.subplot(211)
            plt.imshow(val_outer[:, 60:-2].T, origin='lower', aspect='auto')
            ax1.set_title("Predicted Aperiodic Part", fontsize=10)
            ax2 = plt.subplot(212)
            plt.imshow(targs[:, 60:-2].T, origin='lower', aspect='auto')
            ax2.set_title("Ground Truth Aperiodic Part", fontsize=10)

            plt.figure(3)
            # Mask unvoiced frames so they plot as gaps.
            uu = val_outer[:, -2] * (1 - targs[:, -1])
            uu[uu == 0] = np.nan
            plt.plot(uu, label="Predicted Value")
            uu = targs[:, -2] * (1 - targs[:, -1])
            uu[uu == 0] = np.nan
            plt.plot(uu, label="Ground Truth")
            plt.legend()

            plt.figure(4)
            ax1 = plt.subplot(211)
            plt.plot(val_outer[:, -1])
            ax1.set_title("Predicted Voiced/Unvoiced", fontsize=10)
            ax2 = plt.subplot(212)
            plt.plot(targs[:, -1])
            ax2.set_title("Ground Truth Voiced/Unvoiced", fontsize=10)
            plt.show()

        if save_file:
            val_outer = np.ascontiguousarray(val_outer * (max_feat - min_feat) + min_feat)
            targs = np.ascontiguousarray(targs * (max_feat - min_feat) + min_feat)
            try:
                utils.feats_to_audio(val_outer, file_name[:-4] + '_synth_pred_f0')
                print("File saved to %s" % (config.val_dir + file_name[:-4] + '_synth_pred_f0.wav'))
            except Exception:
                print("Couldn't synthesize with predicted f0")
            try:
                val_outer[:, -2:] = targs[:, -2:]
                utils.feats_to_audio(val_outer, file_name[:-4] + '_synth_ori_f0')
                print("File saved to %s" % (config.val_dir + file_name[:-4] + '_synth_ori_f0.wav'))
            except Exception:
                print("Couldn't synthesize with original f0")
def test_file_wav(self, file_name, acap_file=None):
    """Synthesizes the output for a wav file, optionally comparing against an a cappella ground-truth file."""
    sess = tf.Session()
    self.load_model(sess, log_dir=config.log_dir)
    mel = self.read_wav_file(file_name)
    if acap_file:
        feats = self.read_acap_file(acap_file)
    else:
        feats = None
    out_mel, out_atb, out_vuv = self.process_file(mel, sess)

    if config.f0_mode == "discrete":
        # Decode the activation map to cents via Viterbi, then convert to MIDI note numbers.
        est_freq = utils.to_viterbi_cents(out_atb)
        est_freq = est_freq / 100
        est_freq = est_freq + 12 * np.log2(10) - 12 * np.log2(440)
        est_freq = est_freq + 69

    plt.figure(1)
    if acap_file:
        ax1 = plt.subplot(311)
        plt.imshow(np.log(mel.T), aspect='auto', origin='lower')
        ax1.set_title("Input STFT", fontsize=10)
        ax2 = plt.subplot(312, sharex=ax1)
        plt.imshow(feats[:, :64].T, aspect='auto', origin='lower')
        ax2.set_title("Ground Truth Vocoder Features", fontsize=10)
        ax3 = plt.subplot(313, sharex=ax1, sharey=ax2)
        plt.imshow(out_mel[:feats.shape[0]].T, aspect='auto', origin='lower')
        ax3.set_title("Output Vocoder Features", fontsize=10)

        plt.figure(4)
        ax1 = plt.subplot(211)
        plt.plot(feats[:, -1])
        ax1.set_title("Ground Truth VUV", fontsize=10)
        ax2 = plt.subplot(212, sharex=ax1, sharey=ax1)
        plt.plot(out_vuv)
        ax2.set_title("Output VUV", fontsize=10)

        plt.figure(3)
        if config.f0_mode == "cont":
            f0_output = out_atb[:feats.shape[0], -1]
        else:
            f0_output = est_freq[:feats.shape[0]]
        f0_output = f0_output * (1 - out_vuv[:feats.shape[0], 0])
        f0_output[f0_output == 0] = np.nan
        plt.plot(f0_output, label="Predicted Value")
        f0_gt = feats[:, -2]
        f0_gt = f0_gt * (1 - feats[:, -1])
        f0_gt[f0_gt == 0] = np.nan
        plt.plot(f0_gt, label="Ground Truth")
        f0_difference = np.nan_to_num(abs(f0_gt - f0_output))
        f0_greater = np.where(f0_difference > config.f0_threshold)
        diff_per = f0_greater[0].shape[0] / len(f0_output)
        plt.suptitle("Percentage correct = " + '{:.3%}'.format(1 - diff_per))
        plt.legend()
        plt.show()
    else:
        ax1 = plt.subplot(211)
        plt.imshow(np.log(mel.T), aspect='auto', origin='lower')
        ax1.set_title("Input STFT", fontsize=10)
        ax2 = plt.subplot(212)
        plt.imshow(out_mel.T, aspect='auto', origin='lower')
        ax2.set_title("Output Vocoder Features", fontsize=10)
        plt.show()

    if config.f0_mode == "cont":
        audio_out = utils.feats_to_audio(np.concatenate((out_mel, out_atb, out_vuv), axis=-1))
        sf.write('./{}_ss_pred.wav'.format(file_name.split('/')[-1][:-4]), audio_out, config.fs)
    elif config.f0_mode == "discrete":
        audio_out = utils.feats_to_audio(np.concatenate((out_mel[:feats.shape[0]], np.expand_dims(est_freq, -1)[:feats.shape[0]], feats[:, -1:]), axis=-1))
        sf.write('./{}_ss_pred_dis.wav'.format(file_name.split('/')[-1][:-4]), audio_out, config.fs)

    if acap_file:
        audio = utils.feats_to_audio(feats)
        sf.write('./{}_ori.wav'.format(file_name.split('/')[-1][:-4]), audio, config.fs)

    np.save(file_name.split('/')[-1][:-4], out_mel)
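# The discrete-f0 branch above converts Viterbi-decoded cents into MIDI note
# numbers with cents/100 + 12*log2(10) - 12*log2(440) + 69, which assumes the
# cents are measured relative to 10 Hz (the CREPE-style convention). The same
# conversion written out in two steps; the helper name is hypothetical.
import numpy as np

def cents_to_midi(cents):
    """Converts cents above 10 Hz to MIDI note numbers (A4 = 440 Hz = note 69)."""
    freq_hz = 10 * 2 ** (cents / 1200)       # cents above 10 Hz -> Hz
    return 69 + 12 * np.log2(freq_hz / 440)  # Hz -> MIDI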