def read_wav_file(self, file_name):
    """
    Load an audio file and compute its features and magnitude STFT.

    The file is loaded with librosa at the sampling rate `config.fs` and
    converted to float64. A two-dimensional signal is averaged over its
    first two columns; a one-dimensional signal is used as is.

    Parameters
    ----------
    file_name : path of the audio file, passed to `librosa.core.load`.

    Returns
    -------
    feats : output of `sig_process.get_world_feats` for the signal.
    voc_stft : magnitude STFT of the signal, clipped to [0.0, 1.0].
    """
    audio, _ = librosa.core.load(file_name, sr=config.fs)
    audio = np.float64(audio)

    # NOTE(review): librosa.load downmixes to mono by default, so the 2-D
    # branch is normally not taken; it is kept as a safeguard.
    if len(audio.shape) == 2:
        vocals = np.array((audio[:, 1] + audio[:, 0]) / 2)
    else:
        vocals = np.array(audio)

    # Analyse the downmixed signal. Previously `vocals` was computed and
    # then ignored, with the raw `audio` passed to both calls below.
    voc_stft = abs(
        np.array(
            utils.stft(
                vocals,
                hopsize=config.hopsize,
                nfft=config.framesize,
                fs=config.fs,
            )
        )
    )
    feats = sig_process.get_world_feats(vocals)

    voc_stft = np.clip(voc_stft, 0.0, 1.0)
    return feats, voc_stft
def get_feats(audio):
    """
    Compute the acoustic input features for `audio`.

    At present the only feature is the STFT returned by `utils.stft`, called
    with the window, hop size, FFT size and sampling rate from `config`.
    The largest magnitude in the result is asserted not to exceed 1.0.

    TODO: extend with further features (e.g. MFCCs); the number of
    coefficients is still to be decided.
    """
    spectrum = utils.stft(
        audio,
        window=config.window,
        hopsize=config.hopsize,
        nfft=config.nfft,
        fs=config.fs,
    )

    # Sanity check on the scaling of the transform output.
    peak_magnitude = abs(spectrum).max()
    assert peak_magnitude <= 1.0

    return spectrum
def extract_feature_wav(self, audio):
    """
    Run `self.extract_feature` on an audio signal.

    The magnitude STFT of `audio` is computed with the hop size, frame size
    and sampling rate from `config`, then handed to `self.extract_feature`
    together with `self.sess`. The result is cut so that it has no more
    entries along its first axis than the STFT has.
    """
    spectrum = utils.stft(
        audio,
        hopsize=config.hopsize,
        nfft=config.framesize,
        fs=config.fs,
    )
    magnitude = abs(np.array(spectrum))

    predictions = self.extract_feature(magnitude, self.sess)

    # Drop anything beyond the length of the input STFT
    # (presumably padding added by the extractor - confirm).
    return predictions[:magnitude.shape[0]]
def process_seg_yam(audio, audio_back):
    """
    Process a segment of audio together with its backing track.

    Computes the features from `sig_process.get_world_feats`, a note
    trajectory from the pYIN vamp note extractor, and the magnitude STFTs of
    `audio` and `audio_back`. The four sequences are then aligned with
    `utils.match_time`.

    Parameters
    ----------
    audio : signal used for the features, the notes and the first STFT.
    audio_back : signal used only for the second STFT.

    Returns
    -------
    A 4-tuple `(out_feats, out_notes, out_stft, back_stft)`. When no notes
    are found, or the segment has at most `config.max_phr_len` feature frames
    (before or after alignment), all four entries are None.
    """
    # Every rejection path returns four values, matching the success path.
    # The first rejection used to return only three, which broke callers
    # that unpack four.
    rejected = (None, None, None, None)

    out_feats = sig_process.get_world_feats(audio)
    traj = vamp_notes.extract_notes_pYIN_vamp(audio)

    if traj.shape[0] < 1 or len(out_feats) <= config.max_phr_len:
        return rejected

    # Sample the notes on a regular grid up to the second field of the last
    # trajectory row (presumably the end time of the last note - confirm).
    timestamps = np.arange(0, float(traj[-1][1]), config.hoptime)
    out_notes = vamp_notes.note2traj(traj, timestamps)

    # Convert the first note column and replace -inf results with 0.
    out_notes_1 = sig_process.f0_to_hertz(out_notes[:, 0])
    out_notes_1[out_notes_1 == -np.inf] = 0
    out_notes[:, 0] = out_notes_1

    out_stft = abs(
        np.array(
            utils.stft(
                audio,
                hopsize=config.hopsize,
                nfft=config.framesize,
                fs=config.fs,
            )
        )
    )
    back_stft = abs(
        np.array(
            utils.stft(
                audio_back,
                hopsize=config.hopsize,
                nfft=config.framesize,
                fs=config.fs,
            )
        )
    )

    out_feats, out_notes, out_stft, back_stft = utils.match_time(
        [out_feats, out_notes, out_stft, back_stft]
    )

    # Alignment may shorten the segment, so the length is checked again.
    if len(out_feats) <= config.max_phr_len:
        return rejected

    # Sanity checks: the second-to-last feature column must be strictly
    # positive, and features and notes must have the same length.
    assert all(out_feats[:, -2] > 0)
    assert len(out_feats) == len(out_notes)

    return out_feats, out_notes, out_stft, back_stft
def read_wav_file(self, file_name):
    """
    Load an audio file and return its clipped magnitude STFT and features.

    The file is loaded with librosa at the sampling rate `config.fs` and
    converted to float64. A two-dimensional signal is averaged over its
    first two columns; a one-dimensional signal is used unchanged.

    Returns
    -------
    voc_stft : magnitude of `utils.stft` for the signal, clipped to
        [0.0, 1.0].
    feats : output of `utils.stft_to_feats` for the signal and the
        sampling rate reported by the loader.
    """
    samples, fs = librosa.core.load(file_name, sr=config.fs)
    samples = np.float64(samples)

    two_dimensional = len(samples.shape) == 2
    if two_dimensional:
        downmix = (samples[:, 1] + samples[:, 0]) / 2
    else:
        downmix = samples
    vocals = np.array(downmix)

    magnitude = abs(utils.stft(vocals))
    feats = utils.stft_to_feats(vocals, fs)

    return np.clip(magnitude, 0.0, 1.0), feats