import numpy as np
import pyworld as pw

import config
import utils
from utils import nan_helper  # used unqualified by the stft_to_feats variants below
# sp_to_mfsc, sp_to_mgc and ap_to_wbap are project-local spectral-envelope
# conversions; import them from wherever they are defined in this repo.


def get_world_feats(vocals):
    # WORLD analysis returns (f0, spectral envelope, aperiodicity);
    # config.hoptime is in seconds, frame_period is expected in ms.
    feats = pw.wav2world(vocals, config.fs, frame_period=config.hoptime * 1000)
    ap = feats[2].reshape([feats[1].shape[0], feats[1].shape[1]]).astype(np.float32)
    ap = 10. * np.log10(ap**2)
    harm = 10 * np.log10(feats[1].reshape([feats[2].shape[0], feats[2].shape[1]]))

    f0 = feats[0]
    # Hertz to fractional MIDI note; unvoiced frames (f0 == 0) come out as -inf.
    y = 69 + 12 * np.log2(f0 / 440)
    naners = np.isinf(y)
    # Mark unvoiced frames as NaN so the interpolation below fills them; the
    # original code interpolated only NaNs and left the -inf frames untouched.
    y[naners] = np.nan
    nans, x = utils.nan_helper(y)
    y[nans] = np.interp(x(nans), x(~nans), y[~nans])
    y = np.array(y).reshape([len(y), 1])
    # Keep a flag marking the originally unvoiced frames next to the pitch.
    guy = np.array(naners).reshape([len(y), 1])
    y = np.concatenate((y, guy), axis=-1)

    if config.comp_mode == 'mfsc':
        harmy = sp_to_mfsc(harm, 60, 0.45)
        apy = sp_to_mfsc(ap, 4, 0.45)
    elif config.comp_mode == 'mgc':
        harmy = sp_to_mgc(harm, 60, 0.45)
        apy = sp_to_mgc(ap, 4, 0.45)
    else:
        raise ValueError("Unknown comp_mode: {}".format(config.comp_mode))

    out_feats = np.concatenate((harmy, apy, y.reshape((-1, 2))), axis=1)
    return out_feats
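
# Usage sketch for get_world_feats (assumptions: soundfile is installed and
# 'vocals.wav' is a hypothetical mono recording sampled at config.fs; pyworld
# expects a contiguous float64 buffer):
#
#   import soundfile as sf
#   audio, _ = sf.read('vocals.wav')
#   feats = get_world_feats(np.ascontiguousarray(audio, dtype=np.float64))
#   # feats: (n_frames, 66) in 'mfsc' mode -- 60 harmonic + 4 aperiodicity
#   # coefficients, the interpolated MIDI pitch, and an unvoiced-frame flag.
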
def stft_to_feats(vocals, fs, mode=config.comp_mode):
    # Mix down to mono and make the buffer C-contiguous, as pyworld requires.
    if len(vocals.shape) > 1:
        vocals = vocals[:, 0]
    vocals = np.ascontiguousarray(vocals)

    # 5.80498866 ms is a 256-sample hop at 44.1 kHz.
    feats = pw.wav2world(vocals, fs, frame_period=5.80498866)
    ap = feats[2].reshape([feats[1].shape[0], feats[1].shape[1]]).astype(np.float32)
    ap = 10. * np.log10(ap**2)
    harm = 10 * np.log10(feats[1].reshape([feats[2].shape[0], feats[2].shape[1]]))

    f0 = feats[0]
    # Hertz to fractional MIDI; unvoiced frames (f0 == 0) come out as -inf.
    y = 69 + 12 * np.log2(f0 / 440)
    naners = np.isinf(y)
    # As above, convert -inf to NaN so unvoiced frames are interpolated too.
    y[naners] = np.nan
    nans, x = nan_helper(y)
    y[nans] = np.interp(x(nans), x(~nans), y[~nans])
    y = np.array(y).reshape([len(y), 1])
    guy = np.array(naners).reshape([len(y), 1])
    y = np.concatenate((y, guy), axis=-1)

    if mode == 'mfsc':
        harmy = sp_to_mfsc(harm, 60, 0.45)
        apy = sp_to_mfsc(ap, 4, 0.45)
    elif mode == 'mgc':
        harmy = sp_to_mgc(harm, 60, 0.45)
        apy = sp_to_mgc(ap, 4, 0.45)
    else:
        raise ValueError("Unknown compression mode: {}".format(mode))

    out_feats = np.concatenate((harmy, apy, y.reshape((-1, 2))), axis=1)
    return out_feats
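
# For reference, nan_helper is assumed to follow the standard numpy
# NaN-interpolation idiom sketched below; this repo's utils.nan_helper may
# differ in detail, so treat this as documentation rather than a drop-in:
#
#   def nan_helper(y):
#       """Return a boolean NaN mask and a mask-to-indices converter."""
#       return np.isnan(y), lambda z: z.nonzero()[0]
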
# NOTE: this redefines stft_to_feats above; if both live in the same module,
# only this version is visible to callers.
def stft_to_feats(vocals, fs=config.fs):
    if len(vocals.shape) > 1:
        vocals = vocals[:, 0]
    vocals = np.ascontiguousarray(vocals)

    # Honour the fs argument (the original hard-coded config.fs here).
    feats = pw.wav2world(vocals, fs, frame_period=config.hoptime * 1000)
    ap = feats[2].reshape([feats[1].shape[0], feats[1].shape[1]]).astype(np.float32)
    ap = 10. * np.log10(ap**2)
    harm = 10 * np.log10(feats[1].reshape([feats[2].shape[0], feats[2].shape[1]]))
    # Fixed -20 dB offset; undone when converting back to a spectral envelope.
    harm = harm - 20

    f0 = feats[0]
    is_voiced = f0 > 0.0
    if np.any(is_voiced):
        # Fill unvoiced frames of the aperiodicity, band by band, with values
        # interpolated from the neighbouring voiced frames.
        for k in range(ap.shape[1]):
            ap[~is_voiced, k] = np.interp(np.where(~is_voiced)[0],
                                          np.where(is_voiced)[0],
                                          ap[is_voiced, k])

    # Hertz to fractional MIDI; unvoiced frames (f0 == 0) come out as -inf.
    y = 69 + 12 * np.log2(f0 / 440)
    naners = np.isinf(y)
    # As above, convert -inf to NaN so unvoiced frames are interpolated too.
    y[naners] = np.nan
    nans, x = nan_helper(y)
    y[nans] = np.interp(x(nans), x(~nans), y[~nans])
    y = np.array(y).reshape([len(y), 1])
    guy = np.array(naners).reshape([len(y), 1])
    y = np.concatenate((y, guy), axis=-1)

    harm = np.nan_to_num(harm)
    ap = np.nan_to_num(ap)
    harmy = sp_to_mfsc(harm + 1e-12, 60, 0.45)
    apy = ap_to_wbap(ap + 1e-12, 4, fs)
    out_feats = np.concatenate((harmy, apy, y.reshape((-1, 2))), axis=1)
    # Round-trip check (kept as documentation): mgc_to_sp / wbap_to_ap invert
    # the envelopes, 10**((harm + 20) / 10) and np.clip(10**(ap / 20), 0., 1.)
    # undo the dB scaling, and pw.synthesize(f0, harm, ap, fs, ...) then
    # reconstructs the audio; see feats_to_audio.
    return out_feats, f0
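

if __name__ == '__main__':
    # Minimal smoke test, a sketch only: 'vocals.wav' is a hypothetical file
    # and soundfile is assumed to be installed; any WAV at config.fs works.
    import soundfile as sf

    audio, rate = sf.read('vocals.wav')
    feats, f0 = stft_to_feats(audio, fs=rate)
    # 60 MFSC + 4 wide-band aperiodicity + MIDI pitch + unvoiced flag per frame.
    print(feats.shape, f0.shape)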