def abs_preemph_fft(signal, samplerate=16000, winlen=0.08, winstep=0.04,
                    nfft=2048, preemph=0.97,
                    winfunc=lambda x: numpy.ones((x, ))):
    """Compute a magnitude spectrum and run it through pre-emphasis again.

    :param signal: the audio signal to analyse.
    :param samplerate: sample rate used to convert winlen/winstep to samples.
    :param winlen: window length in seconds (multiplied by samplerate).
    :param winstep: window step in seconds (multiplied by samplerate).
    :param nfft: FFT size handed to ``sigproc.magspec``.
    :param preemph: coefficient for the first pre-emphasis pass on the signal.
    :param winfunc: analysis window callable; defaults to an all-ones window.
    :returns: the result of ``sigproc.preemphasis(magspec, 1)``.
    """
    frame_len = winlen * samplerate
    frame_step = winstep * samplerate

    emphasised = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(emphasised, frame_len, frame_step, winfunc)
    magnitude = sigproc.magspec(frames, nfft)

    # Second pre-emphasis pass, this time over the magnitude array and with a
    # coefficient of exactly 1.
    return sigproc.preemphasis(magnitude, 1)
def specdecomp(signal, samplerate=16000, winlen=0.025, winstep=0.01, nfft=512,
               lowfreq=0, highfreq=None, preemph=0.97,
               winfunc=lambda x: np.ones((x, )), decomp='complex'):
    """Frame a signal and return one decomposition of its short-time spectrum.

    The signal is pre-emphasised and framed; what is returned then depends on
    ``decomp``.

    :param signal: the audio signal to analyse.
    :param samplerate: sample rate used to convert winlen/winstep to samples.
    :param winlen: window length in seconds.
    :param winstep: window step in seconds.
    :param nfft: FFT length passed to ``np.fft.rfft`` / ``sigproc.powspec``.
    :param lowfreq: unused; accepted only to keep the signature stable.
    :param highfreq: unused; accepted only to keep the signature stable.
    :param preemph: pre-emphasis coefficient.
    :param winfunc: analysis window callable; defaults to an all-ones window.
    :param decomp: selects the output:

        * ``'time'`` / ``'frames'``: the framed signal itself;
        * ``'magnitude'`` / ``'mag'`` / ``'abs'``: ``np.abs`` of the rfft;
        * ``'phase'`` / ``'angle'``: ``np.angle`` of the rfft;
        * ``'power'`` / ``'powspec'``: ``sigproc.powspec(frames, nfft)``;
        * anything else (including the default ``'complex'``): the complex
          rfft output.
    :returns: the array selected by ``decomp``.
    """
    signal = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(signal, winlen * samplerate,
                              winstep * samplerate, winfunc)

    if decomp in ('time', 'frames'):
        return frames
    if decomp in ('power', 'powspec'):
        # powspec works from the frames directly, so no rfft is needed here.
        return sigproc.powspec(frames, nfft)

    complex_spec = np.fft.rfft(frames, nfft)
    if decomp in ('magnitude', 'mag', 'abs'):
        return np.abs(complex_spec)
    if decomp in ('phase', 'angle'):
        return np.angle(complex_spec)
    return complex_spec
def fbank(signal, samplerate=16000, winlen=0.025, winstep=0.01,
          nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97,
          winfunc=lambda x: numpy.ones((x,))):
    """Compute Mel-filterbank energy features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be
        an N*1 array.
    :param samplerate: the sample rate of the signal.
    :param winlen: analysis window length in seconds (default 0.025).
    :param winstep: step between successive windows in seconds (default 0.01).
    :param nfilt: number of filters in the filterbank (default 26).
    :param nfft: the FFT size (default 512).
    :param lowfreq: lowest band edge of the mel filters in Hz (default 0).
    :param highfreq: highest band edge of the mel filters in Hz; a falsy value
        selects samplerate/2.
    :param preemph: pre-emphasis coefficient; 0 is no filter (default 0.97).
    :param winfunc: analysis window applied to each frame; by default no
        window is applied.
    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by
        nfilt) containing features, one feature vector per row. The second is
        the total energy in each frame.
    """
    if not highfreq:
        highfreq = samplerate / 2
    tiny = numpy.finfo(float).eps

    emphasised = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(emphasised, winlen * samplerate,
                              winstep * samplerate, winfunc)
    power = sigproc.powspec(frames, nfft)

    # Total energy per frame; exact zeros are replaced so a later log is safe.
    frame_energy = numpy.sum(power, 1)
    frame_energy = numpy.where(frame_energy == 0, tiny, frame_energy)

    # Filterbank energies, with the same zero guard.
    filters = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    band_energy = numpy.dot(power, filters.T)
    band_energy = numpy.where(band_energy == 0, tiny, band_energy)

    return band_energy, frame_energy
def ssc(signal, samplerate=16000, winlen=0.025, winstep=0.01, nfilt=55,
        nfft=2048, lowfreq=0, highfreq=None, preemph=0.97,
        winfunc=lambda x: numpy.ones((x, ))):
    """Compute Spectral Subband Centroid features from an audio signal.

    The result is the frequency-weighted filterbank energy divided by the
    plain filterbank energy, computed per frame and per filter.

    :param signal: the audio signal to analyse.
    :param samplerate: sample rate used to convert winlen/winstep to samples.
    :param winlen: window length in seconds.
    :param winstep: window step in seconds.
    :param nfilt: number of filters requested from ``get_filterbanks``.
    :param nfft: the FFT size.
    :param lowfreq: lowest band edge passed to ``get_filterbanks``.
    :param highfreq: highest band edge; a falsy value selects samplerate/2.
    :param preemph: pre-emphasis coefficient.
    :param winfunc: analysis window callable; defaults to an all-ones window.
    :returns: the array of subband centroids.
    """
    if not highfreq:
        highfreq = samplerate / 2

    emphasised = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(emphasised, winlen * samplerate,
                              winstep * samplerate, winfunc)
    power = sigproc.powspec(frames, nfft)
    # Replace exact zeros so the final division cannot hit an all-zero frame.
    power = numpy.where(power == 0, numpy.finfo(float).eps, power)

    filters = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    band_energy = numpy.dot(power, filters.T)

    # One row of linearly spaced weights (1 .. samplerate/2) per frame.
    n_frames = numpy.size(power, 0)
    n_bins = numpy.size(power, 1)
    freq_axis = numpy.linspace(1, samplerate / 2, n_bins)
    freq_grid = numpy.tile(freq_axis, (n_frames, 1))

    return numpy.dot(power * freq_grid, filters.T) / band_energy
def fbank(signal, samplerate=16000, winlen=0.025, winstep=0.01, nfilt=55,
          nfft=2048, lowfreq=0, highfreq=None, preemph=0.97,
          winfunc=lambda x: numpy.ones((x, ))):
    """Compute Mel-filterbank energy features from an audio signal.

    :param signal: the audio signal to analyse.
    :param samplerate: sample rate used to convert winlen/winstep to samples.
    :param winlen: window length in seconds.
    :param winstep: window step in seconds.
    :param nfilt: number of filters requested from ``get_filterbanks``.
    :param nfft: the FFT size.
    :param lowfreq: lowest band edge passed to ``get_filterbanks``.
    :param highfreq: highest band edge; a falsy value selects samplerate/2.
    :param preemph: pre-emphasis coefficient.
    :param winfunc: analysis window callable; defaults to an all-ones window.
    :returns: ``(feat, energy)``: the filterbank energies and the summed
        power of each frame, both with exact zeros replaced by machine
        epsilon.
    """
    upper_edge = highfreq or samplerate / 2
    floor_value = numpy.finfo(float).eps

    frames = sigproc.framesig(sigproc.preemphasis(signal, preemph),
                              winlen * samplerate, winstep * samplerate,
                              winfunc)
    power = sigproc.powspec(frames, nfft)

    # Per-frame total energy; zero would break a later log.
    totals = numpy.sum(power, 1)
    totals = numpy.where(totals == 0, floor_value, totals)

    bank = get_filterbanks(nfilt, nfft, samplerate, lowfreq, upper_edge)
    # Filterbank energies; zero would break a later log.
    banded = numpy.dot(power, bank.T)
    banded = numpy.where(banded == 0, floor_value, banded)

    return banded, totals
def ssc(signal, samplerate=16000, winlen=0.025, winstep=0.01,
        nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97,
        winfunc=lambda x: numpy.ones((x,))):
    """Compute Spectral Subband Centroid features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be
        an N*1 array.
    :param samplerate: the sample rate of the signal.
    :param winlen: analysis window length in seconds (default 0.025).
    :param winstep: step between successive windows in seconds (default 0.01).
    :param nfilt: number of filters in the filterbank (default 26).
    :param nfft: the FFT size (default 512).
    :param lowfreq: lowest band edge of the mel filters in Hz (default 0).
    :param highfreq: highest band edge of the mel filters in Hz; a falsy value
        selects samplerate/2.
    :param preemph: pre-emphasis coefficient; 0 is no filter (default 0.97).
    :param winfunc: analysis window applied to each frame; by default no
        window is applied.
    :returns: a numpy array of size (NUMFRAMES by nfilt) containing features,
        one feature vector per row.
    """
    upper_edge = highfreq or samplerate / 2

    emphasised = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(emphasised, winlen * samplerate,
                              winstep * samplerate, winfunc)

    # Power spectrum with exact zeros lifted to machine epsilon, so that an
    # all-zero frame cannot produce a 0/0 below.
    raw_power = sigproc.powspec(frames, nfft)
    power = numpy.where(raw_power == 0, numpy.finfo(float).eps, raw_power)

    bank = get_filterbanks(nfilt, nfft, samplerate, lowfreq, upper_edge)
    denominator = numpy.dot(power, bank.T)  # filterbank energies

    rows = numpy.size(power, 0)
    cols = numpy.size(power, 1)
    weights = numpy.tile(numpy.linspace(1, samplerate / 2, cols), (rows, 1))
    numerator = numpy.dot(power * weights, bank.T)

    return numerator / denominator
def cspec(signal, samplerate=16000, winlen=0.025, winstep=0.01, nfft=512,
          preemph=0.97, winfunc=lambda x: np.ones((x, ))):
    """Compute STFT coefficients from an audio signal.

    :param signal: the audio signal from which to compute features. Should be
        an N*1 array.
    :param samplerate: the sample rate of the signal.
    :param winlen: analysis window length in seconds (default 0.025).
    :param winstep: step between successive windows in seconds (default 0.01).
    :param nfft: the FFT size (default 512).
    :param preemph: pre-emphasis coefficient; 0 is no filter (default 0.97).
    :param winfunc: analysis window applied to each frame; by default no
        window is applied. Numpy window functions can be used, e.g.
        ``winfunc=np.hamming``.
    :returns: the complex output of ``np.fft.rfft(frames, nfft)``, one row per
        frame.
    """
    signal = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(signal, winlen * samplerate,
                              winstep * samplerate, winfunc)

    frame_len = np.shape(frames)[1]
    if frame_len > nfft:
        # logging.warn is a deprecated alias (removed in Python 3.13);
        # logging.warning is the supported spelling.
        logging.warning(
            'frame length (%d) is greater than FFT size (%d), frame will be '
            'truncated. Increase NFFT to avoid.', frame_len, nfft)
    return np.fft.rfft(frames, nfft)
def ssc(signal, samplerate=16000, winlen=0.025, winstep=0.01,
        nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97,
        winfunc=lambda x: numpy.ones((x,))):
    """Compute Spectral Subband Centroid features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be
        an N*1 array.
    :param samplerate: the sample rate of the signal.
    :param winlen: analysis window length in seconds (default 0.025).
    :param winstep: step between successive windows in seconds (default 0.01).
    :param nfilt: number of filters in the filterbank (default 26).
    :param nfft: the FFT size (default 512).
    :param lowfreq: lowest band edge of the mel filters in Hz (default 0).
    :param highfreq: highest band edge of the mel filters in Hz; a falsy value
        selects samplerate/2.
    :param preemph: pre-emphasis coefficient; 0 is no filter (default 0.97).
    :param winfunc: analysis window applied to each frame; by default no
        window is applied. Numpy window functions can be used, e.g.
        ``winfunc=numpy.hamming``.
    :returns: a numpy array of size (NUMFRAMES by nfilt) containing features,
        one feature vector per row.
    """
    if not highfreq:
        highfreq = samplerate / 2

    frames = sigproc.framesig(sigproc.preemphasis(signal, preemph),
                              winlen * samplerate, winstep * samplerate,
                              winfunc)
    spectrum = sigproc.powspec(frames, nfft)
    # Guard against all-zero input, which would otherwise divide 0 by 0.
    spectrum = numpy.where(spectrum == 0, numpy.finfo(float).eps, spectrum)

    filterbank = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    energies = numpy.dot(spectrum, filterbank.T)

    frame_count = numpy.size(spectrum, 0)
    bin_count = numpy.size(spectrum, 1)
    ramp = numpy.linspace(1, samplerate / 2, bin_count)
    ramp_per_frame = numpy.tile(ramp, (frame_count, 1))

    return numpy.dot(spectrum * ramp_per_frame, filterbank.T) / energies
def fbank(signal, samplerate=16000, winlen=0.025, winstep=0.01,
          nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97,
          winfunc=lambda x: numpy.ones((x,))):
    """Compute Mel-filterbank energy features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be
        an N*1 array.
    :param samplerate: the sample rate of the signal.
    :param winlen: analysis window length in seconds (default 0.025).
    :param winstep: step between successive windows in seconds (default 0.01).
    :param nfilt: number of filters in the filterbank (default 26).
    :param nfft: the FFT size (default 512).
    :param lowfreq: lowest band edge of the mel filters in Hz (default 0).
    :param highfreq: highest band edge of the mel filters in Hz; a falsy value
        selects samplerate/2.
    :param preemph: pre-emphasis coefficient; 0 is no filter (default 0.97).
    :param winfunc: analysis window applied to each frame; by default no
        window is applied. Numpy window functions can be used, e.g.
        ``winfunc=numpy.hamming``.
    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by
        nfilt) containing features, one feature vector per row. The second is
        the total energy in each frame.
    """
    if not highfreq:
        highfreq = samplerate / 2

    def _lift_zeros(values):
        # Exact zeros become machine epsilon so a later log stays finite.
        return numpy.where(values == 0, numpy.finfo(float).eps, values)

    emphasised = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(emphasised, winlen * samplerate,
                              winstep * samplerate, winfunc)
    power = sigproc.powspec(frames, nfft)

    energy = _lift_zeros(numpy.sum(power, 1))  # total energy per frame

    filterbank = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    feat = _lift_zeros(numpy.dot(power, filterbank.T))  # filterbank energies

    return feat, energy
def return_mfcc(signal):
    """Pre-emphasise ``signal`` (coefficient 0.95) and return its MFCCs.

    The MFCC call uses the module-level ``RATE`` and ``WIN_LEN`` settings, an
    FFT size of ``CHUNK_SAMPLES * 2`` and a Hamming window.

    :param signal: array-like audio samples; converted with ``np.asarray``.
    :returns: whatever ``mfcc`` returns for the pre-emphasised signal.
    """
    samples = np.asarray(signal)
    emphasised = sigproc.preemphasis(samples, coeff=0.95)
    return mfcc(emphasised,
                RATE,
                winlen=WIN_LEN,
                nfft=CHUNK_SAMPLES * 2,
                winfunc=np.hamming)
def load_frames(in_file, srate=16000):
    '''Load an audio file and cut it into pre-emphasised frames.

    The file is read with ``librosa.load`` at the requested sample rate,
    pre-emphasised with a coefficient of 0.95 and framed with a 25 ms window
    and a 10 ms step.

    NOTE(review): the previous docstring said the input could also be an npy
    file, but this function only ever calls ``librosa.load`` -- confirm
    whether npy support is expected from callers.

    @in_file: path of the input audio file
    @srate: target sample rate handed to librosa
    @return: raw frames in the size of [num_frames, frame_len]
    '''
    # ``sr`` must be passed by keyword: recent librosa releases make every
    # argument after the path keyword-only, so a positional rate raises
    # TypeError there. The keyword form works on old and new versions alike.
    signal, srate = librosa.load(in_file, sr=srate)
    pre_emphed = sigproc.preemphasis(signal, coeff=0.95)
    frames = sigproc.framesig(pre_emphed, 0.025 * srate, 0.01 * srate)
    return frames
def _powspec_cord(self, data, fs):
    """Return the power spectrum of ``data`` using the stored coding params.

    Reads ``preemph``, ``winlen``, ``winstep`` and ``winfunc`` from
    ``self._cording_params``. The FFT size is the window length in samples
    (``int(winlen * fs)``).

    :param data: the signal to analyse.
    :param fs: sample rate used to convert winlen/winstep to samples.
    :returns: ``powspec(frames, fft_size)`` for the pre-emphasised frames.
    """
    cfg = self._cording_params
    coeff, win_seconds, step_seconds, window = (
        cfg["preemph"], cfg["winlen"], cfg["winstep"], cfg["winfunc"])

    n_fft = int(win_seconds * fs)
    emphasised = preemphasis(data, coeff)
    framed = framesig(emphasised, win_seconds * fs, step_seconds * fs, window)
    return powspec(framed, n_fft)
def run():
    """Cluster the MFCC frames of a fixed recording with a GMM and plot them.

    Reads ``../recordings/english.wav``, computes MFCCs with a 20 ms window
    and 10 ms step, picks the number of mixture components with
    ``get_min_n_components``, fits a full-covariance GMM, then shows a scatter
    plot of the first two MFCC dimensions coloured by cluster label, followed
    by a plot of the raw signal.
    """
    rate, signal = wav.read("../recordings/english.wav")

    # NOTE(review): the original called ``sigproc.preemphasis(signal)`` here
    # and threw the result away. Every other caller uses the return value, so
    # the call presumably does not modify ``signal`` in place and had no
    # effect; it is dropped to keep behaviour unchanged. If pre-emphasis
    # beyond what ``mfcc`` applies itself is wanted, assign the result
    # instead -- confirm the intent.

    frame_size = 0.02  # seconds
    frame_step = 0.01  # seconds
    mfcc_feat = mfcc(signal, rate, winlen=frame_size, winstep=frame_step)

    # The former ``n_components = 2`` was overwritten immediately and is gone.
    n_components = get_min_n_components(mfcc_feat)
    print("No. of identified components: {0}".format(n_components))

    gmm = GMM(n_components=n_components, covariance_type='full')
    labels = gmm.fit(mfcc_feat).predict(mfcc_feat)

    plt.scatter(mfcc_feat[:, 0], mfcc_feat[:, 1], c=labels)
    plt.show()
    plt.plot(signal)
    plt.show()
def Make_Spect(wav_path, windowsize, stride, window=np.hamming,
               bandpass=False, lowfreq=0, highfreq=0, preemph=0.97,
               duration=False, nfft=None, normalize=True):
    """Read a wav file as float32 and return its log power spectrogram.

    :param wav_path: path of the audio file, read with ``sf.read``.
    :param windowsize: analysis window length in seconds.
    :param stride: step between windows in seconds.
    :param window: window function, defaults to ``np.hamming``.
    :param bandpass: when true and ``highfreq > lowfreq``, the samples are
        first passed through ``butter_bandpass_filter``.
    :param lowfreq: lower cutoff for the optional band-pass filter.
    :param highfreq: upper cutoff for the optional band-pass filter.
    :param preemph: pre-emphasis coefficient.
    :param duration: when true, also return the length of the file in seconds.
    :param nfft: FFT size; ``None`` selects ``int(windowsize * samplerate)``.
    :param normalize: when true, the result goes through ``normalize_frames``.
    :return: the float32 log power spectrogram; the original documentation
        gives its shape as (len(wav/stride), windowsize * samplerate / 2 + 1).
        If ``duration`` is true, a ``(feature, seconds)`` tuple is returned.
    """
    samples, samplerate = sf.read(wav_path, dtype='float32')

    if bandpass and highfreq > lowfreq:
        samples = butter_bandpass_filter(data=samples,
                                         cutoff=[lowfreq, highfreq],
                                         fs=samplerate)

    signal = sigproc.preemphasis(samples, preemph)
    frames = sigproc.framesig(signal, windowsize * samplerate,
                              stride * samplerate, winfunc=window)

    if nfft is None:
        nfft = int(windowsize * samplerate)

    pspec = sigproc.powspec(frames, nfft)
    # Exact zeros would turn into -inf under the log below.
    pspec = np.where(pspec == 0, np.finfo(float).eps, pspec)
    feature = np.log(pspec).astype(np.float32)

    if normalize:
        feature = normalize_frames(feature)

    if duration:
        return feature, len(samples) / samplerate
    return feature
def get_complex_spec(wav_, winstep, winlen, with_time_scaled=False):
    """Return the complex spectrum of a wav file, optionally time-scaled too.

    The file is read with ``wav.read``, pre-emphasised with ``PREEMPH``,
    framed with ``HAMMING_WINFUNC`` and transformed with an ``NFFT``-point
    real FFT.

    :param wav_: the wav file to read.
    :param winstep: window step in seconds.
    :param winlen: window length in seconds.
    :param with_time_scaled: when true, also transform the frames multiplied
        element-wise by their sample index within the frame.
    :returns: ``(complex_spec, time_scaled_complex_spec)``; the second item
        is ``None`` unless ``with_time_scaled`` is set.
    """
    rate, samples = wav.read(wav_)
    emphasised = preemphasis(samples, PREEMPH)
    frames = framesig(emphasised, winlen * rate, winstep * rate,
                      HAMMING_WINFUNC)
    spectrum = np.fft.rfft(frames, NFFT)

    if not with_time_scaled:
        return spectrum, None

    sample_index = np.arange(frames.shape[-1])
    scaled_spectrum = np.fft.rfft(sample_index * frames, NFFT)
    return spectrum, scaled_spectrum
def fft_sam(signal, samplerate=16000, winlen=0.08, winstep=0.04, nfft=2048,
            preemph=0.97, winfunc=lambda x: numpy.ones((x, ))):
    """Return the bin-to-bin difference of each frame's magnitude spectrum.

    For every frame the float32 FFT magnitude is taken, the first
    ``floor(nfft / 2) + 1`` bins are kept, and every bin except the first is
    replaced by its difference from the previous bin.

    :param signal: the audio signal to analyse.
    :param samplerate: sample rate used to convert winlen/winstep to samples.
    :param winlen: window length in seconds.
    :param winstep: window step in seconds.
    :param nfft: the FFT size.
    :param preemph: pre-emphasis coefficient.
    :param winfunc: analysis window callable; defaults to an all-ones window.
    :returns: the differenced magnitude array, one row per frame.
    """
    emphasised = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(emphasised, winlen * samplerate,
                              winstep * samplerate, winfunc)

    kept_bins = int(numpy.floor(nfft / 2)) + 1
    magnitude = numpy.float32(numpy.absolute(numpy.fft.fft(frames, nfft)))
    feat = magnitude[:, 0:kept_bins]

    # The right-hand side is evaluated into a fresh array before assignment,
    # so each bin is differenced against its neighbour's original value.
    feat[:, 1:] = feat[:, 1:] - feat[:, :-1]
    return feat
def job(input_name, output_name):
    """Convert one audio file into log filterbank features saved as .npy.

    Uses the module-level ``samplerate``, ``winlen``, ``winstep``, ``nfft``
    and ``banks`` settings together with a Hanning window.

    :param input_name: path of the audio file to load (mono).
    :param output_name: destination passed to ``np.save``.
    :returns: ``True`` when features were written; ``False`` when the audio is
        empty, framing yields nothing, or the features contain NaN.
    """
    audio, _ = librosa.load(input_name, mono=True, sr=samplerate)
    if len(audio) == 0:
        return False

    emphasised = sigproc.preemphasis(audio, 0.97)
    frames = sigproc.framesig(emphasised, winlen, winstep, np.hanning)
    if len(frames) == 0:
        return False

    band_energy = np.dot(sigproc.powspec(frames, nfft), banks)
    # Zero energy would become -inf under the log.
    band_energy = np.where(band_energy == 0, np.finfo(float).eps, band_energy)
    log_energy = np.log(band_energy).astype(dtype=np.float32)

    # A NaN anywhere propagates into the sum, so one check covers the array.
    if np.isnan(np.sum(log_energy)):
        return False

    np.save(output_name, log_energy)
    return True
def make_spectrogram():
    """Consume one chunk from the queue and grow the running spectrogram.

    While ``RUNNING_SPECTOGRAM`` holds fewer than ``FEED_LENGTH`` rows, the
    chunk is pre-emphasised, converted to MFCCs, stacked on top of the running
    spectrogram and sent over ``connection``. Once the running spectrogram is
    long enough it is published as ``FINISHED_SPECTOGRAM``, the running buffer
    is reset to an empty (0, 13) array and the ready/trigger flags are
    updated.

    A chunk is always taken from the queue, even when it ends up unused
    because the spectrogram was already complete.
    """
    global RUNNING_SPECTOGRAM, FINISHED_SPECTOGRAM

    data = q.get()

    if len(RUNNING_SPECTOGRAM) >= FEED_LENGTH:
        # Enough rows collected: hand over the example and start afresh.
        FINISHED_SPECTOGRAM = RUNNING_SPECTOGRAM
        RUNNING_SPECTOGRAM = np.empty([0, 13], dtype='int16')
        globals.EXAMPLE_READY = True
        globals.MIC_TRIGGER = False
        return

    # Pre-emphasise the signal to weight up high frequencies.
    emphasised = sigproc.preemphasis(data, coeff=0.95)
    # One analysis window spans exactly one chunk of samples.
    chunk_seconds = 1 / (RATE / CHUNK_SAMPLES)
    mfcc_feat = mfcc(emphasised, RATE, winlen=chunk_seconds,
                     nfft=CHUNK_SAMPLES * 2, winfunc=np.hamming)

    RUNNING_SPECTOGRAM = np.vstack([mfcc_feat, RUNNING_SPECTOGRAM])
    connection.send_spectogram(mfcc_feat, len(RUNNING_SPECTOGRAM))
    print(len(RUNNING_SPECTOGRAM))
def get_complex_spec(wav_, winstep, winlen, with_time_scaled=False):
    """Return the complex spectrum of an audio file, optionally time-scaled.

    The file is loaded with ``librosa.load`` at the module-level rate ``sr``,
    pre-emphasised with ``PREEMPH``, framed with ``HAMMING_WINFUNC`` and
    transformed with an ``NFFT``-point real FFT.

    :param wav_: the audio file to read.
    :param winstep: window step in seconds.
    :param winlen: window length in seconds.
    :param with_time_scaled: when true, also transform the frames multiplied
        element-wise by their sample index within the frame, and print the
        shapes of both spectra.
    :returns: ``(complex_spec, time_scaled_complex_spec)``; the second item
        is ``None`` unless ``with_time_scaled`` is set.
    """
    sig, rate = librosa.load(wav_, sr=sr)
    sig = preemphasis(sig, PREEMPH)
    frames = framesig(sig, winlen * rate, winstep * rate, HAMMING_WINFUNC)
    complex_spec = np.fft.rfft(frames, NFFT)

    time_scaled_complex_spec = None
    if with_time_scaled:
        time_scaled_frames = np.arange(frames.shape[-1]) * frames
        time_scaled_complex_spec = np.fft.rfft(time_scaled_frames, NFFT)
        # The shape report reads ``.shape`` on the time-scaled spectrum, which
        # only exists in this branch; outside it the value is None and the
        # attribute access would raise AttributeError.
        print(complex_spec.shape, time_scaled_complex_spec.shape)

    return complex_spec, time_scaled_complex_spec
def psf_compute_logpowspec(signal, samplerate=16000, winlen=0.025,
                           winstep=0.01, nfilt=26, nfft=512, lowfreq=0,
                           highfreq=None, preemph=0.97,
                           winfunc=lambda x: np.ones((x, ))):
    """Return the transposed log power spectrum of a signal.

    Steps: pre-emphasis, framing, then ``sigproc.logpowspec`` on the frames.

    Note: ``nfft`` should be at least the frame length in samples
    (``winlen * samplerate``), otherwise information is lost.

    :param signal: the audio signal to analyse.
    :param samplerate: sample rate used to convert winlen/winstep to samples.
    :param winlen: window length in seconds.
    :param winstep: window step in seconds.
    :param nfilt: unused by this function.
    :param nfft: the FFT size.
    :param lowfreq: unused by this function.
    :param highfreq: unused by this function.
    :param preemph: pre-emphasis coefficient.
    :param winfunc: analysis window callable; defaults to an all-ones window.
    :returns: ``sigproc.logpowspec(frames, nfft)`` transposed.
    """
    # 1. Pre-emphasis.
    emphasised = sigproc.preemphasis(signal, preemph)

    # 2. Split into frames; window length and step are converted from seconds
    #    to samples here.
    frame_len = winlen * samplerate
    frame_step = winstep * samplerate
    frames = sigproc.framesig(emphasised, frame_len, frame_step, winfunc)

    # 3. Log power spectrum of every frame, transposed before returning.
    return sigproc.logpowspec(frames, nfft).T
def _vad_wav(self, wav_data):
    """Keep only the voiced windows of ``wav_data`` and pre-emphasise them.

    The input is band-pass filtered, split into windows by
    ``self.vad._load_wav`` and each window is classified by
    ``self.vad.speech_status``. Windows with a non-zero status are
    concatenated into the output. Depending on ``self.show_wav`` and
    ``self.save_vad_file`` the intermediate signals are plotted and/or saved.

    :param wav_data: the raw samples to process.
    :returns: the voiced samples after pre-emphasis (coefficient 0.9), as
        int16. The array starts with a single leading 0 sample that seeds the
        concatenation.
    """
    # Band-pass filter (8th-order Butterworth) between HZ_L and HZ_H.
    # NOTE(review): cutoffs are normalised against frequence = 1600; confirm
    # this is the intended rate for the incoming audio.
    frequence = 1600
    HZ_L = 20
    HZ_H = 600
    l_ = 2 * HZ_L / frequence
    h_ = 2 * HZ_H / frequence
    b, a = signal.butter(8, [l_, h_], 'bandpass')
    wav_data_f = signal.filtfilt(b, a, wav_data).astype(np.int16)

    wav_window = self.vad._load_wav(wav_data_f)
    max_en = wav_data.max()

    # Collect the pieces and join them once at the end; concatenating inside
    # the loop would copy the whole accumulated array on every window.
    vad_chunks = [np.asarray([0], dtype=np.int16)]
    status_chunks = [np.asarray([0], dtype=np.int16)]
    for wav_one in wav_window:
        # 0 = silence, 1 = possible start, 2 = speech, 3 = end
        status = self.vad.speech_status(wav_one, max_en)
        # Status trace scaled by 3000 so it is visible next to the waveform.
        status_chunks.append(np.ones_like(wav_one) * status * 3000)
        if status != 0:
            vad_chunks.append(wav_one)
    vad_wav = np.concatenate(vad_chunks)
    status_wav = np.concatenate(status_chunks)

    if self.show_wav:
        x1 = np.linspace(0, len(wav_data) - 1, len(wav_data))
        x2 = np.linspace(0, len(vad_wav) - 1, len(vad_wav))
        plt.subplot(211)
        plt.plot(x1, wav_data)
        # Drop the leading seed sample so the trace lines up with wav_data.
        plt.plot(x1, status_wav[1:])
        plt.subplot(212)
        plt.plot(x2, vad_wav)
        plt.show()

    if self.save_vad_file:
        self._save_wav_file(wav_data)
        self._save_wav_file(wav_data_f)
        self._save_wav_file(vad_wav)

    # Pre-emphasis.
    vad_wav = sigproc.preemphasis(vad_wav, coeff=0.9).astype(np.int16)
    return vad_wav
def compute_mfcc(signal, sr=16000, winlen=0.032, winstep=0.01, numcep=13,
                 nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97,
                 ceplifter=22, appendEnergy=True, window=np.hamming):
    """Compute MFCC features from an audio signal.

    Pipeline: pre-emphasis, framing, power spectrum, filterbank energies,
    log10, orthonormal type-2 DCT truncated to ``numcep`` coefficients,
    liftering.

    :param signal: the audio signal to analyse.
    :param sr: sample rate used to convert winlen/winstep to samples.
    :param winlen: window length in seconds.
    :param winstep: window step in seconds.
    :param numcep: number of cepstral coefficients to keep.
    :param nfilt: number of filters requested from ``base.get_filterbanks``.
    :param nfft: the FFT size.
    :param lowfreq: lowest band edge passed to ``base.get_filterbanks``.
    :param highfreq: highest band edge passed to ``base.get_filterbanks``.
    :param preemph: pre-emphasis coefficient.
    :param ceplifter: lifter setting passed to ``base.lifter``.
    :param appendEnergy: when true, the first coefficient of every frame is
        replaced by the log10 of that frame's total energy.
    :param window: analysis window function, defaults to ``np.hamming``.
    :returns: the cepstral feature array, one row per frame.
    """
    tiny = np.finfo(float).eps

    emphasised = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(emphasised, winlen * sr, winstep * sr,
                              winfunc=window)

    magnitude = np.absolute(np.fft.rfft(frames, nfft))
    power = 1.0 / nfft * np.square(magnitude)

    # Total energy per frame; zero would break the log below.
    frame_energy = np.sum(power, 1)
    frame_energy = np.where(frame_energy == 0, tiny, frame_energy)

    # Filterbank energies; zero would break the log below.
    filters = base.get_filterbanks(nfilt, nfft, sr, lowfreq, highfreq)
    band_energy = np.dot(power, filters.T)
    band_energy = np.where(band_energy == 0, tiny, band_energy)

    log_bands = np.log10(band_energy)
    cepstra = dct(log_bands, type=2, axis=1, norm='ortho')[:, :numcep]
    cepstra = base.lifter(cepstra, ceplifter)

    if appendEnergy:
        # Replace the first cepstral coefficient with the log frame energy.
        cepstra[:, 0] = np.log10(frame_energy)
    return cepstra
def generate(audiopath, binsize=320, numcontext=0, istxt=False):
    """Build context-windowed log-spectrogram input vectors for one recording.

    The samples are loaded (from a comma-separated int16 text file when
    ``istxt`` is true, otherwise from a wav file), normalised, pre-emphasised
    and turned into a decibel-scaled spectrogram ``ims``. Every time slice is
    then concatenated with ``numcontext`` slices before and after it; slots
    that fall outside the recording are filled with ``empty_spectrogram``.

    :param audiopath: path of the wav or text file to read.
    :param binsize: passed to ``stft`` as its second argument.
    :param numcontext: number of past and of future slices attached to each
        time slice.
    :param istxt: read ``audiopath`` with ``np.loadtxt`` and assume a sample
        rate of 16000 instead of reading a wav file.
    :returns: float32 array with ``ims.shape[0]`` rows and
        ``numcep + 2 * numcep * numcontext`` columns.
    """
    if istxt:
        samples = np.loadtxt(audiopath, dtype=np.int16, delimiter=',')
        print(samples.shape)
        samplerate = 16000
    else:
        samplerate, samples = wav.read(audiopath)
    # Two-dimensional input: keep only the first column.
    # NOTE(review): the original indentation was lost; confirm whether this
    # check was meant to apply to the wav branch only.
    if len(samples.shape) == 2:
        samples = samples[:, 0]

    # Normalise the level before any further processing.
    samples = data_rep.normalize_to_db(samples, 0)
    samples = sigproc.preemphasis(samples, 0.97)
    s = stft(samples, binsize)
    # ``freq`` is returned by logscale_spec but not used below.
    sshow, freq = logscale_spec(s, factor=20.0, sr=samplerate)
    ims = 20. * np.log10(np.abs(sshow) / 10e-6)  # amplitude to decibel
    _, numcep = ims.shape

    # For each time slice of the training set, we need to copy the context:
    # this turns the numcep-dimensional vector into a
    # numcep + 2*numcep*numcontext dimensional one, because of:
    #  - numcep dimensions for the current feature set
    #  - numcontext*numcep dimensions for each of the past and future (x2)
    #    feature sets
    # => so numcep + 2*numcontext*numcep
    train_inputs = np.array([], np.float32)
    train_inputs.resize((ims.shape[0], numcep + 2 * numcep * numcontext))

    # Placeholder slice used to pad context that falls outside the recording.
    empty_spectrogram = np.array([])
    empty_spectrogram.resize((numcep))

    # Prepare train_inputs with past and future contexts.
    time_slices = range(train_inputs.shape[0])
    context_past_min = time_slices[0] + numcontext
    context_future_max = time_slices[-1] - numcontext
    for time_slice in time_slices:
        # Reminder: array[start:stop:step]
        # slices from index |start| up to |stop| (not included), every |step|.

        # Add empty context data of the correct size to the start and end.

        # Pick up to numcontext time slices in the past, and complete with
        # empty ones.
        need_empty_past = max(0, (context_past_min - time_slice))
        empty_source_past = list(empty_spectrogram for empty_slots in range(need_empty_past))
        data_source_past = ims[max(0, time_slice - numcontext):time_slice]

        # Pick up to numcontext time slices in the future, and complete with
        # empty ones.
        need_empty_future = max(0, (time_slice - context_future_max))
        empty_source_future = list(empty_spectrogram for
                                   empty_slots in range(need_empty_future))
        data_source_future = ims[time_slice + 1:time_slice + numcontext + 1]

        # Padding goes before the real past slices ...
        if need_empty_past:
            past = np.concatenate((empty_source_past, data_source_past))
        else:
            past = data_source_past

        # ... and after the real future slices.
        if need_empty_future:
            future = np.concatenate((data_source_future, empty_source_future))
        else:
            future = data_source_future

        # Flatten both contexts and lay them out as [past | now | future].
        past = np.reshape(past, numcontext * numcep)
        now = ims[time_slice]
        future = np.reshape(future, numcontext * numcep)
        train_inputs[time_slice] = np.concatenate((past, now, future))

    return train_inputs
:return: return spectrogram with shape of (len(wav/stride), windowsize * samplerate /2 +1). """ # samplerate, samples = wavfile.read(wav_path) <<<<<<< HEAD samples, samplerate = sf.read(wav_path, dtype='float32') ======= samples, samplerate = sf.read(wav_path, dtype='int16') if not len(samples) > 0: raise ValueError('wav file is empty?') >>>>>>> Server/Server if bandpass and highfreq > lowfreq: samples = butter_bandpass_filter(data=samples, cutoff=[lowfreq, highfreq], fs=samplerate) signal = sigproc.preemphasis(samples, preemph) frames = sigproc.framesig(signal, windowsize * samplerate, stride * samplerate, winfunc=window) if nfft == None: nfft = int(windowsize * samplerate) pspec = sigproc.powspec(frames, nfft) pspec = np.where(pspec == 0, np.finfo(float).eps, pspec) if log_scale == True: feature = np.log(pspec).astype(np.float32) else: feature = pspec.astype(np.float32) # feature = feature.transpose() if normalize: feature = normalize_frames(feature)
def audio_features(params, img_audio, audio_path, append_name, node_list):
    """Compute audio features for every caption and store them in the h5 file.

    For each node in ``node_list`` a group named ``params['feat']`` is
    created, and every caption audio file belonging to the node's image gets
    its own extendable array of features inside that group. Nodes for which
    no caption produced features are removed from the file and reported.

    :param params: settings dictionary. Keys read here: ``output_file``,
        ``feat`` ('raw', 'freq_spectrum', 'fbanks' or 'mfcc'), ``t_window``,
        ``t_shift``, ``alpha``, ``windowing``, ``nfilters``, ``ncep``,
        ``use_energy``, ``normalise``, ``use_deltas`` and ``delta_n``.
    :param img_audio: mapping from image base name to a sequence whose item
        at index 1 is the list of caption file names.
    :param audio_path: directory containing the caption audio files.
    :param append_name: prefix that is split off node names and prepended to
        the names of the created arrays.
    :param node_list: the nodes to process.
    """
    output_file = params['output_file']
    # create pytable atom for the features
    f_atom = tables.Float32Atom()
    # keep track of the nodes for which no features could be made, places
    # database contains some empty audio files
    invalid = []
    for count, node in enumerate(node_list, 1):
        print(f'processing file: {count}')
        # create a group for the desired feature type
        audio_node = output_file.create_group(node, params['feat'])
        # get the base name of the node this feature will be appended to
        base_name = node._v_name.split(append_name)[1]
        # get the caption file names corresponding to the image of this node
        caption_files = img_audio[base_name][1]

        for cap in caption_files:
            # remove extension from the caption filename
            base_capt = cap.split('.')[0]
            # remove folder path from file names (Places/coco database)
            if '/' in base_capt:
                base_capt = base_capt.split('/')[-1]
            if '-' in base_capt:
                base_capt = base_capt.replace('-', '_')

            # read audio samples
            cap_path = os.path.join(audio_path, cap)
            try:
                input_data, fs = librosa.load(cap_path, sr=None)
            except Exception:
                # try to repair broken files, some files had a wrong header.
                # In Places some could not be fixed however.
                try:
                    fix_wav(cap_path)
                    # Reload after the repair; otherwise the samples of the
                    # previous caption (or nothing at all) would be used.
                    input_data, fs = librosa.load(cap_path, sr=None)
                except Exception:
                    # the loop will break; if no valid audio features could
                    # be made for this image, the entire node is deleted.
                    break
            # in the places database some of the audio files are empty
            if len(input_data) == 0:
                break

            # set the fft size to the power of two equal to or greater than
            # the window size (never smaller than 2).
            window_size = int(fs * params['t_window'])
            fft_size = 2
            while fft_size < window_size:
                fft_size *= 2

            ###################################################################
            # create audio features
            if params['feat'] == 'raw':
                # calculate the needed frame shift, pre-emphasise and frame
                # the signal
                frame_shift = int(fs * params['t_shift'])
                emphasised = sigproc.preemphasis(input_data,
                                                 coeff=params['alpha'])
                # Frame the pre-emphasised signal; framing ``input_data``
                # would silently discard the pre-emphasis step.
                features = sigproc.framesig(emphasised,
                                            frame_len=window_size,
                                            frame_step=frame_shift,
                                            winfunc=params['windowing'])
            elif params['feat'] == 'freq_spectrum':
                # calculate the needed frame shift, pre-emphasise and frame
                # the signal
                frame_shift = int(fs * params['t_shift'])
                emphasised = sigproc.preemphasis(input_data,
                                                 coeff=params['alpha'])
                frames = sigproc.framesig(emphasised,
                                          frame_len=window_size,
                                          frame_step=frame_shift,
                                          winfunc=params['windowing'])
                # create the power spectrum
                features = sigproc.powspec(frames, fft_size)
            elif params['feat'] == 'fbanks':
                # create mel filterbank features
                [features, energy] = base.fbank(input_data,
                                                samplerate=fs,
                                                winlen=params['t_window'],
                                                winstep=params['t_shift'],
                                                nfilt=params['nfilters'],
                                                nfft=fft_size,
                                                lowfreq=0,
                                                highfreq=None,
                                                preemph=params['alpha'],
                                                winfunc=params['windowing'])
            elif params['feat'] == 'mfcc':
                # create mfcc features
                features = base.mfcc(input_data,
                                     samplerate=fs,
                                     winlen=params['t_window'],
                                     winstep=params['t_shift'],
                                     numcep=params['ncep'],
                                     nfilt=params['nfilters'],
                                     nfft=fft_size,
                                     lowfreq=0,
                                     highfreq=None,
                                     preemph=params['alpha'],
                                     ceplifter=0,
                                     appendEnergy=params['use_energy'],
                                     winfunc=params['windowing'])

            # apply cepstral mean variance normalisation
            if params['normalise']:
                features = (features - features.mean(0)) / features.std(0)
            # optionally add the deltas and double deltas
            if params['use_deltas']:
                single_delta = base.delta(features, params['delta_n'])
                double_delta = base.delta(single_delta, params['delta_n'])
                features = np.concatenate(
                    [features, single_delta, double_delta], 1)
            ###################################################################
            # create new leaf node in the feature node for the current audio
            # file
            feature_shape = np.shape(features)[1]
            f_table = output_file.create_earray(audio_node,
                                                append_name + base_capt,
                                                f_atom, (0, feature_shape),
                                                expectedrows=5000)
            # append new data to the tables
            f_table.append(features)

        if audio_node._f_list_nodes() == []:
            # keep track of all the invalid nodes for which no features could
            # be made
            invalid.append(node._v_name)
            # remove the top node including all other features if no captions
            # features could be created
            output_file.remove_node(node, recursive=True)

    print(invalid)
    print(f'There were {len(invalid)} files that could not be processed')