def pre_process_batch(self, batch):
    # Pad every waveform in the batch with zeros up to the longest one.
    len_batch = np.asarray([len(item) for item in batch], dtype=np.int32)
    max_len = np.max(len_batch)
    wav_batch = np.asarray([
        np.pad(item, (0, max_len - item_len), mode='constant', constant_values=0.)
        for item, item_len in zip(batch, len_batch)
    ])
    # Add a trailing channel axis, then mu-law compand and quantize.
    wav_batch = np.expand_dims(wav_batch, axis=-1)
    wav_batch = audio.quantize(audio.miu_law(wav_batch))
    return wav_batch, len_batch
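# Hedged sketch: one plausible implementation of the `audio.miu_law` /
# `audio.quantize` pair used above. The mu = 255 companding constant and the
# 256 output levels are assumptions; the project's actual `audio` module is
# not shown here.
import numpy as np

def miu_law(x, mu=255):
    # Classic mu-law companding: compress [-1, 1] amplitudes non-linearly.
    return np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)

def quantize(y, mu=255):
    # Map companded [-1, 1] values onto integer bins 0..mu.
    return np.floor((y + 1.) / 2. * mu + 0.5).astype(np.int32)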
def __getitem__(self, index):
    entry = self.metadata[index]
    m = np.load(entry[2].strip())    # conditioning features
    wav = np.load(entry[1].strip())  # waveform
    if hp.input_type == 'raw' or hp.input_type == 'mixture':
        wav = wav.astype(np.float32)
    elif hp.input_type == 'mulaw':
        wav = mulaw_quantize(wav, hp.mulaw_quantize_channels).astype(np.int64)
    elif hp.input_type == 'bits':
        wav = quantize(wav).astype(np.int64)
    else:
        raise ValueError("hp.input_type {} not recognized".format(hp.input_type))
    return m, wav
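# Hedged usage sketch: since __getitem__ returns variable-length (features,
# wav) pairs, pairing this Dataset with a PyTorch DataLoader needs a custom
# collate_fn. The DataLoader wiring and the collate below are illustrative
# assumptions, not the repo's own training loop.
from torch.utils.data import DataLoader

def collate(batch):
    # Keep the variable-length arrays as Python lists; real training code
    # would typically crop aligned (features, wav) windows here instead.
    feats = [m for m, _ in batch]
    wavs = [w for _, w in batch]
    return feats, wavs

# loader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate)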
import os

import librosa
import matplotlib.pyplot as plt
import numpy as np
import pyworld
import tensorflow as tf


def convert_voice(model, wav_s, wav_t, emb_s, emb_t):
    """Convert the words of the source speaker into the target speaker's voice.

    Arguments:
        model - ACVAE model
        wav_s - source voice waveform
        wav_t - target voice waveform
        emb_s - source speaker embedding (from the DeepSpeakerModel)
        emb_t - target speaker embedding (from the DeepSpeakerModel)
    Returns:
        wav with the words from the source voice spoken in the target voice
    """
    pic_dir = "../figure/"
    if not os.path.exists(pic_dir):
        os.mkdir(pic_dir)

    # WORLD-style features for both speakers: normalized features, their
    # statistics, spectral envelope, F0 contour and aperiodicity.
    feat_t, mean_t, std_t, sp_t, f0_t, ap_t = get_features(wav_t, training=False)
    feat_s, mean_s, std_s, sp_s, f0_s, ap_s = get_features(wav_s, training=False)

    # Log-F0 statistics; np.ma.log masks the unvoiced (f0 == 0) frames.
    logf0s_mean_s = np.mean(np.ma.log(f0_s))
    logf0s_std_s = np.std(np.ma.log(f0_s))
    logf0s_mean_t = np.mean(np.ma.log(f0_t))
    logf0s_std_t = np.std(np.ma.log(f0_t))
    f0_converted = pitch_conversion(f0=f0_s,
                                    mean_log_src=logf0s_mean_s,
                                    std_log_src=logf0s_std_s,
                                    mean_log_target=logf0s_mean_t,
                                    std_log_target=logf0s_std_t)

    # Encode the source features conditioned on the source embedding.
    mu_enc, logvar_enc = model.encoder([
        np.expand_dims(feat_s.astype(np.float32), [0, -1]),
        emb_s.astype(np.float32)
    ])
    z_enc = model.reparameterize(mu_enc, logvar_enc)

    # norm_f0_c = quantize([1] * len(f0_s))
    norm_f0 = quantize(f0_s)

    # Decode twice: with the target embedding (conversion) and with the
    # source embedding (reconstruction, kept as a sanity check).
    nmfe_converted = model.decoder([
        z_enc,
        tf.reshape(emb_t, (1, -1)),
        np.expand_dims(norm_f0.astype(np.float32), [0, 1])
    ])
    nmfe_recon = model.decoder([
        z_enc,
        tf.reshape(emb_s, (1, -1)),
        np.expand_dims(norm_f0.astype(np.float32), [0, 1])
    ])
    nmfe_converted = np.squeeze(nmfe_converted.numpy())
    # print("Val L1 loss: {}".format(np.abs(np.squeeze(nmfe_recon.numpy()) - feat_s).mean()))
    nmfe_recon = np.squeeze(nmfe_recon.numpy())

    # Undo the normalization and the log taken during feature extraction.
    mfe_converted = np.exp(nmfe_converted.T * std_t + mean_t)
    mfe_recon = np.exp(nmfe_recon.T * std_s + mean_s)

    # Diagnostic plots of the decoder outputs and the source features.
    plt.imshow(nmfe_converted[:100])
    plt.colorbar()
    plt.savefig(pic_dir + "convert.png")
    plt.close("all")
    plt.imshow(nmfe_recon[:100])
    plt.colorbar()
    plt.savefig(pic_dir + "recon.png")
    plt.close("all")
    plt.imshow(feat_s[:100])
    plt.colorbar()
    plt.savefig(pic_dir + "orig.png")
    plt.close("all")

    # Recover spectral envelopes from the mel filterbank energies.
    sp_converted_s = mfe2sp(mfe_recon)
    sp_converted_t = mfe2sp(mfe_converted)
    plt.imshow(sp_converted_s)
    plt.colorbar()
    plt.savefig(pic_dir + "sp_recovered.png")
    plt.close("all")
    plt.imshow(sp_converted_t)
    plt.colorbar()
    plt.savefig(pic_dir + "sp_converted.png")
    plt.close("all")

    # Apply the conversion as a per-bin gain on the original source envelope.
    factor = np.divide(sp_converted_t, sp_converted_s)
    sp_gained = np.multiply(sp_s[:len(factor)], factor[:len(sp_s)])
    # Remove overly large peaks:
    # sp_gained = np.minimum(sp_gained, sp_s.max(axis=1, keepdims=True)[:len(sp_gained)])

    plt.plot(mfe_recon[50], color="green")
    plt.plot(mfe_converted[50], color="red")
    plt.plot(feat_s[50], color="black")
    plt.savefig(pic_dir + "mfe.png")
    plt.close("all")

    plt.figure(figsize=(5, 10))
    plt.plot(sp_converted_t[50], color="red")
    plt.plot(sp_converted_s[50], color="green")
    plt.plot(sp_s[50], color="black")
    plt.plot(sp_gained[50], color="blue")
    plt.savefig(pic_dir + "sp.png")
    plt.close("all")

    plt.plot(sp_converted_t[10], color="red")
    plt.plot(sp_converted_s[10], color="green")
    plt.plot(sp_s[10], color="black")
    plt.plot(sp_gained[10], color="blue")
    plt.savefig(pic_dir + "sp2.png")
    plt.close("all")

    plt.imshow(sp_t)
    plt.savefig(pic_dir + "target_sp.png")
    plt.imshow(sp_s)
    plt.savefig(pic_dir + "sp_orig.png")
    plt.close("all")

    plt.imshow(sp_gained)
    plt.savefig(pic_dir + "sp_gained.png")

    # De-emphasis: undo the pre-emphasis applied during feature extraction.
    sp_gained = sp_gained / preemph_transform.reshape(1, -1)

    # Synthesize with WORLD from the converted F0, the gained envelope and
    # the source aperiodicity.
    wav_transformed = pyworld.synthesize(
        f0_converted[:len(sp_gained)],
        # Alternative: flat F0 at the target speaker's mean pitch:
        # np.array([np.exp(logf0s_mean_t)] * len(sp_gained)),
        sp_gained,
        ap_s[:len(sp_gained)],
        SAMPLE_RATE,
        FRAME_PERIOD)

    # Normalize amplitude.
    wav_result = librosa.util.normalize(wav_transformed)
    wav_result = wav_result.astype(np.float32)
    return wav_result
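# Hedged sketch: `pitch_conversion` as called above matches the common
# log-Gaussian normalized F0 transformation (map the source log-F0 statistics
# onto the target's). The unvoiced-frame handling below is an assumption; the
# repo's own implementation may differ.
import numpy as np

def pitch_conversion(f0, mean_log_src, std_log_src, mean_log_target, std_log_target):
    f0 = np.asarray(f0, dtype=np.float64)
    out = np.zeros_like(f0)
    voiced = f0 > 0
    # Standardize the source log-F0, then rescale to the target statistics;
    # unvoiced frames stay at zero.
    out[voiced] = np.exp((np.log(f0[voiced]) - mean_log_src) / std_log_src
                         * std_log_target + mean_log_target)
    return out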