def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
        nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
        winfunc=lambda x:numpy.ones((x,))):
    """Compute Spectral Subband Centroid features from an audio signal.

    :param signal: the audio signal from which to compute features.
    :param samplerate: sample rate of ``signal``.
    :param winlen: analysis window length in seconds (default 0.025).
    :param winstep: step between successive windows in seconds (default 0.01).
    :param nfilt: number of filters requested from ``get_filterbanks`` (default 26).
    :param nfft: FFT size handed to ``sigproc.powspec`` (default 512).
    :param lowfreq: lowest band edge of the filters, in Hz (default 0).
    :param highfreq: highest band edge of the filters, in Hz. A falsy value
        (``None`` or 0) is replaced by ``samplerate/2``.
    :param preemph: coefficient passed to ``sigproc.preemphasis`` (default 0.97).
    :param winfunc: callable mapping a frame length to a window array. The
        default produces all ones.
    :returns: the filterbank-weighted frequency sum divided by the filterbank
        energy, one row per frame and one column per filter.
    """
    upper_edge = highfreq or samplerate/2

    emphasized = sigproc.preemphasis(signal, preemph)
    framed = sigproc.framesig(emphasized, winlen*samplerate,
                              winstep*samplerate, winfunc)

    # Exact zeros are replaced by machine epsilon so the final division
    # never sees a zero denominator.
    power = sigproc.powspec(framed, nfft)
    power = numpy.where(power == 0, numpy.finfo(float).eps, power)

    filters = get_filterbanks(nfilt, nfft, samplerate, lowfreq, upper_edge)
    band_energy = numpy.dot(power, filters.T)

    # One weight per spectrum column, running linearly from 1 to samplerate/2;
    # the leading axis broadcasts over frames.
    bin_weights = numpy.linspace(1, samplerate/2, numpy.size(power, 1))[numpy.newaxis, :]
    return numpy.dot(power*bin_weights, filters.T) / band_energy
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
          winfunc=lambda x:numpy.ones((x,))):
    """Compute Mel-filterbank energy features from an audio signal.

    :param signal: the audio signal from which to compute features.
    :param samplerate: sample rate of ``signal``.
    :param winlen: analysis window length in seconds (default 0.025).
    :param winstep: step between successive windows in seconds (default 0.01).
    :param nfilt: number of filters requested from ``get_filterbanks`` (default 26).
    :param nfft: FFT size handed to ``sigproc.powspec`` (default 512).
    :param lowfreq: lowest band edge of the filters, in Hz (default 0).
    :param highfreq: highest band edge of the filters, in Hz. A falsy value
        (``None`` or 0) is replaced by ``samplerate/2``.
    :param preemph: coefficient passed to ``sigproc.preemphasis`` (default 0.97).
    :param winfunc: callable mapping a frame length to a window array, e.g.
        ``numpy.hamming``. The default produces all ones.
    :returns: ``(feat, energy)``. ``feat`` holds one row per frame and one
        column per filter; ``energy`` is the per-frame sum of the power
        spectrum. Exact zeros in either are replaced by machine epsilon.
    """
    tiny = numpy.finfo(float).eps
    upper_edge = highfreq or samplerate/2

    framed = sigproc.framesig(sigproc.preemphasis(signal, preemph),
                              winlen*samplerate, winstep*samplerate, winfunc)
    power = sigproc.powspec(framed, nfft)

    # Per-frame total energy; zero would break a later log().
    frame_energy = numpy.sum(power, 1)
    frame_energy = numpy.where(frame_energy == 0, tiny, frame_energy)

    # Filterbank energies, guarded against zero for the same reason.
    filters = get_filterbanks(nfilt, nfft, samplerate, lowfreq, upper_edge)
    band_energy = numpy.dot(power, filters.T)
    band_energy = numpy.where(band_energy == 0, tiny, band_energy)

    return band_energy, frame_energy
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
          winfunc=lambda x:numpy.ones((x,))):
    """Compute Mel-filterbank energy features from an audio signal.

    :param signal: the audio signal from which to compute features.
    :param samplerate: sample rate of ``signal``.
    :param winlen: analysis window length in seconds (default 0.025).
    :param winstep: step between successive windows in seconds (default 0.01).
    :param nfilt: number of filters in the filterbank (default 26).
    :param nfft: the FFT size (default 512).
    :param lowfreq: lowest band edge of the filters, in Hz (default 0).
    :param highfreq: highest band edge of the filters, in Hz. When falsy
        (``None`` or 0) ``samplerate/2`` is used.
    :param preemph: pre-emphasis coefficient (default 0.97).
    :param winfunc: window factory applied to each frame. The default is an
        all-ones window.
    :returns: 2 values: the filterbank energies (one row per frame, one column
        per filter) and the summed power spectrum of each frame. Zeros in both
        are swapped for machine epsilon.
    """
    def _floor_at_eps(values):
        # log() of these values is taken downstream, so exact zeros are lifted.
        return numpy.where(values == 0, numpy.finfo(float).eps, values)

    if not highfreq:
        highfreq = samplerate/2

    preemphasized = sigproc.preemphasis(signal, preemph)
    frame_matrix = sigproc.framesig(preemphasized, winlen*samplerate,
                                    winstep*samplerate, winfunc)
    spectrum = sigproc.powspec(frame_matrix, nfft)

    energy = _floor_at_eps(numpy.sum(spectrum, 1))
    bank = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    feat = _floor_at_eps(numpy.dot(spectrum, bank.T))
    return feat, energy
def specdecomp(signal, samplerate=16000, winlen=0.025, winstep=0.01,
               nfft=512, lowfreq=0, highfreq=None, preemph=0.97,
               winfunc=lambda x: np.ones((x, )), decomp='complex'):
    """Frame a signal and return the requested per-frame decomposition.

    The signal is passed through ``sigproc.preemphasis`` and split into frames
    with ``sigproc.framesig``; ``decomp`` then selects what is returned.

    :param signal: the audio signal to decompose.
    :param samplerate: sample rate of ``signal``.
    :param winlen: analysis window length in seconds (default 0.025).
    :param winstep: step between successive windows in seconds (default 0.01).
    :param nfft: FFT length used for every spectral output (default 512).
    :param lowfreq: accepted for signature compatibility; not used.
    :param highfreq: accepted for signature compatibility; not used.
    :param preemph: coefficient passed to ``sigproc.preemphasis`` (default 0.97).
    :param winfunc: callable mapping a frame length to a window array. The
        default produces all ones.
    :param decomp: which representation to return:

        * ``'time'`` / ``'frames'``: the windowed frames themselves;
        * ``'magnitude'`` / ``'mag'`` / ``'abs'``: ``abs`` of the rfft;
        * ``'phase'`` / ``'angle'``: ``numpy.angle`` of the rfft;
        * ``'power'`` / ``'powspec'``: ``sigproc.powspec(frames, nfft)``;
        * anything else (including the default ``'complex'``): the complex
          rfft of each frame.
    :returns: an array with one row per frame.
    """
    signal = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(signal, winlen * samplerate,
                              winstep * samplerate, winfunc)

    if decomp in ('time', 'frames'):
        return frames
    if decomp in ('power', 'powspec'):
        # Delegated to sigproc, so no need to compute our own rfft first.
        return sigproc.powspec(frames, nfft)

    complex_spec = np.fft.rfft(frames, nfft)
    if decomp in ('magnitude', 'mag', 'abs'):
        return np.abs(complex_spec)
    if decomp in ('phase', 'angle'):
        return np.angle(complex_spec)
    return complex_spec
def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
        nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
        winfunc=lambda x:numpy.ones((x,))):
    """Compute Spectral Subband Centroid features from an audio signal.

    :param signal: the audio signal from which to compute features.
    :param samplerate: sample rate of ``signal``.
    :param winlen: analysis window length in seconds (default 0.025).
    :param winstep: step between successive windows in seconds (default 0.01).
    :param nfilt: number of filters in the filterbank (default 26).
    :param nfft: the FFT size (default 512).
    :param lowfreq: lowest band edge of the filters, in Hz (default 0).
    :param highfreq: highest band edge of the filters, in Hz. When falsy
        (``None`` or 0) ``samplerate/2`` is used.
    :param preemph: pre-emphasis coefficient (default 0.97).
    :param winfunc: window factory applied to each frame, e.g.
        ``numpy.hamming``. The default is an all-ones window.
    :returns: an array with one row per frame and one column per filter.
    """
    if not highfreq:
        highfreq = samplerate/2

    frames = sigproc.framesig(sigproc.preemphasis(signal, preemph),
                              winlen*samplerate, winstep*samplerate, winfunc)

    spectrum = sigproc.powspec(frames, nfft)
    # All-zero input would otherwise give a 0/0 in the ratio below.
    spectrum = numpy.where(spectrum == 0, numpy.finfo(float).eps, spectrum)

    bank = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)

    n_frames = numpy.size(spectrum, 0)
    n_bins = numpy.size(spectrum, 1)
    # Every row holds the same linear ramp from 1 to samplerate/2.
    ramp = numpy.tile(numpy.linspace(1, samplerate/2, n_bins), (n_frames, 1))

    weighted = numpy.dot(spectrum*ramp, bank.T)
    energies = numpy.dot(spectrum, bank.T)
    return weighted / energies
def ssc(signal, samplerate=16000, winlen=0.025, winstep=0.01,
        nfilt=55, nfft=2048, lowfreq=0, highfreq=None, preemph=0.97,
        winfunc=lambda x: numpy.ones((x, ))):
    """Compute Spectral Subband Centroid features from an audio signal.

    Same pipeline as a filterbank-energy computation, but each power-spectrum
    column is additionally weighted by a linear ramp (1 .. samplerate/2) and
    the weighted filterbank output is divided by the unweighted one.

    :param signal: the audio signal from which to compute features.
    :param samplerate: sample rate of ``signal``.
    :param winlen: analysis window length in seconds.
    :param winstep: step between successive windows in seconds.
    :param nfilt: number of filters requested from ``get_filterbanks``.
    :param nfft: FFT size handed to ``sigproc.powspec``.
    :param lowfreq: lower band edge passed to ``get_filterbanks``.
    :param highfreq: upper band edge; a falsy value becomes ``samplerate / 2``.
    :param preemph: coefficient passed to ``sigproc.preemphasis``.
    :param winfunc: callable mapping a frame length to a window array.
    :returns: array with one row per frame and one column per filter.
    """
    upper_edge = highfreq or samplerate / 2

    emphasized = sigproc.preemphasis(signal, preemph)
    framed = sigproc.framesig(emphasized, winlen * samplerate,
                              winstep * samplerate, winfunc)

    power = sigproc.powspec(framed, nfft)
    # Replace exact zeros so the division below is always defined.
    power = numpy.where(power == 0, numpy.finfo(float).eps, power)

    filters = get_filterbanks(nfilt, nfft, samplerate, lowfreq, upper_edge)
    denominator = numpy.dot(power, filters.T)

    ramp = numpy.linspace(1, samplerate / 2, numpy.size(power, 1))
    numerator = numpy.dot(power * ramp[numpy.newaxis, :], filters.T)
    return numerator / denominator
def fbank(signal, samplerate=16000, winlen=0.025, winstep=0.01,
          nfilt=55, nfft=2048, lowfreq=0, highfreq=None, preemph=0.97,
          winfunc=lambda x: numpy.ones((x, ))):
    """Compute Mel-filterbank energy features from an audio signal.

    :param signal: the audio signal from which to compute features.
    :param samplerate: sample rate of ``signal``.
    :param winlen: analysis window length in seconds.
    :param winstep: step between successive windows in seconds.
    :param nfilt: number of filters requested from ``get_filterbanks``.
    :param nfft: FFT size handed to ``sigproc.powspec``.
    :param lowfreq: lower band edge passed to ``get_filterbanks``.
    :param highfreq: upper band edge; a falsy value becomes ``samplerate / 2``.
    :param preemph: coefficient passed to ``sigproc.preemphasis``.
    :param winfunc: callable mapping a frame length to a window array.
    :returns: ``(feat, energy)``: filterbank energies (frames x filters) and
        the per-frame sum of the power spectrum, both with exact zeros
        replaced by machine epsilon.
    """
    tiny = numpy.finfo(float).eps
    upper_edge = highfreq or samplerate / 2

    emphasized = sigproc.preemphasis(signal, preemph)
    framed = sigproc.framesig(emphasized, winlen * samplerate,
                              winstep * samplerate, winfunc)
    power = sigproc.powspec(framed, nfft)

    # Zero energies are lifted to epsilon because callers take log() of them.
    frame_energy = numpy.sum(power, 1)
    frame_energy = numpy.where(frame_energy == 0, tiny, frame_energy)

    filters = get_filterbanks(nfilt, nfft, samplerate, lowfreq, upper_edge)
    band_energy = numpy.dot(power, filters.T)
    band_energy = numpy.where(band_energy == 0, tiny, band_energy)

    return band_energy, frame_energy
def _powspec_cord(self, data, fs):
    """Return the power spectrum of ``data`` framed per ``self._cording_params``.

    Reads ``preemph``, ``winlen``, ``winstep`` and ``winfunc`` from
    ``self._cording_params``. The FFT size is the window length in samples,
    ``int(winlen * fs)``.

    :param data: the signal to analyse.
    :param fs: sample rate of ``data``.
    :returns: the result of ``powspec`` on the pre-emphasised, framed signal.
    """
    settings = self._cording_params
    # Look up every setting before doing any work, so a missing key fails fast.
    coeff, win_seconds, step_seconds, window = (
        settings[key] for key in ("preemph", "winlen", "winstep", "winfunc")
    )

    n_fft = int(win_seconds * fs)
    emphasized = preemphasis(data, coeff)
    framed = framesig(emphasized, win_seconds * fs, step_seconds * fs, window)
    return powspec(framed, n_fft)
def Make_Spect(wav_path, windowsize, stride, window=np.hamming,
               bandpass=False, lowfreq=0, highfreq=0,
               preemph=0.97, duration=False, nfft=None, normalize=True):
    """Compute a log power spectrogram from a wav file.

    The file is read as float32 samples, optionally band-pass filtered,
    pre-emphasised, framed, and converted to a log power spectrum.

    :param wav_path: path of the audio file, read with ``sf.read``.
    :param windowsize: analysis window length in seconds.
    :param stride: step between successive windows in seconds.
    :param window: callable mapping a frame length to a window array;
        defaults to ``np.hamming``.
    :param bandpass: when true (and ``highfreq > lowfreq``) the samples are
        first passed through ``butter_bandpass_filter``.
    :param lowfreq: lower cutoff for the optional band-pass filter.
    :param highfreq: upper cutoff for the optional band-pass filter.
    :param preemph: coefficient passed to ``sigproc.preemphasis``.
    :param duration: when true, also return the clip length in seconds.
    :param nfft: FFT size; defaults to the window length in samples,
        ``int(windowsize * samplerate)``.
    :param normalize: when true, the result is passed through
        ``normalize_frames``.
    :return: the float32 log power spectrogram, one row per frame; or
        ``(spectrogram, seconds)`` when ``duration`` is true.
    """
    samples, samplerate = sf.read(wav_path, dtype='float32')

    if bandpass and highfreq > lowfreq:
        samples = butter_bandpass_filter(data=samples,
                                         cutoff=[lowfreq, highfreq],
                                         fs=samplerate)

    signal = sigproc.preemphasis(samples, preemph)
    frames = sigproc.framesig(signal, windowsize * samplerate,
                              stride * samplerate, winfunc=window)

    if nfft is None:
        nfft = int(windowsize * samplerate)

    pspec = sigproc.powspec(frames, nfft)
    # log(0) is -inf, so exact zeros are lifted to machine epsilon first.
    pspec = np.where(pspec == 0, np.finfo(float).eps, pspec)
    feature = np.log(pspec).astype(np.float32)

    if normalize:
        feature = normalize_frames(feature)

    if duration:
        return feature, len(samples) / samplerate
    return feature
def power_(signal, samplerate=16000, winlen=0.08, winstep=0.04,
           nfft=2048, preemph=0.97, winfunc=lambda x: numpy.ones((x, ))):
    """Return the per-frame power spectrum of ``signal``.

    :param signal: the audio signal to analyse.
    :param samplerate: sample rate of ``signal``.
    :param winlen: analysis window length in seconds (default 0.08).
    :param winstep: step between successive windows in seconds (default 0.04).
    :param nfft: FFT size handed to ``sigproc.powspec`` (default 2048).
    :param preemph: coefficient passed to ``sigproc.preemphasis``.
    :param winfunc: callable mapping a frame length to a window array. The
        default produces all ones.
    :returns: the output of ``sigproc.powspec`` for the framed signal.
    """
    frame_len = winlen * samplerate
    frame_step = winstep * samplerate
    emphasized = sigproc.preemphasis(signal, preemph)
    framed = sigproc.framesig(emphasized, frame_len, frame_step, winfunc)
    return sigproc.powspec(framed, nfft)
def mfcc(frames,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
         nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,ceplifter=22,appendEnergy=True):
    """Compute MFCC features from frames that have already been cut.

    Unlike a signal-level MFCC routine, this variant starts from ``frames``,
    so ``winlen``, ``winstep`` and ``preemph`` are accepted but not used.

    :param frames: array of frames, one per row, fed to ``sigproc.powspec``.
    :param samplerate: sample rate, forwarded to ``get_filterbanks``.
    :param winlen: unused.
    :param winstep: unused.
    :param numcep: number of leading DCT coefficients to keep (default 13).
    :param nfilt: number of filters in the filterbank (default 26).
    :param nfft: the FFT size (default 512).
    :param lowfreq: lower band edge, forwarded to ``get_filterbanks``.
    :param highfreq: upper band edge, forwarded unchanged to ``get_filterbanks``.
    :param preemph: unused.
    :param ceplifter: lifter parameter passed to ``python_speech_features.lifter``.
    :param appendEnergy: when true, column 0 is overwritten with the log of
        the per-frame energy.
    :returns: array with one row per frame and up to ``numcep`` columns.
    """
    tiny = numpy.finfo(float).eps
    power = sigproc.powspec(frames, nfft)

    # Filterbank energies; zeros are lifted to epsilon ahead of the log.
    filters = python_speech_features.get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    band_energy = numpy.dot(power, filters.T)
    log_bands = numpy.log(numpy.where(band_energy == 0, tiny, band_energy))

    cepstra = dct(log_bands, type=2, axis=1, norm='ortho')[:, :numcep]
    cepstra = python_speech_features.lifter(cepstra, ceplifter)

    if appendEnergy:
        # Replace the first cepstral coefficient with the log frame energy.
        frame_energy = numpy.sum(power, 1)
        frame_energy = numpy.where(frame_energy == 0, tiny, frame_energy)
        cepstra[:, 0] = numpy.log(frame_energy)
    return cepstra
def job(input_name, output_name):
    """Convert one audio file to log filterbank features and save them.

    Relies on the module-level ``samplerate``, ``winlen``, ``winstep``,
    ``nfft`` and ``banks``.

    :param input_name: path of the audio file, loaded mono via ``librosa.load``.
    :param output_name: destination handed to ``np.save``.
    :returns: ``True`` when features were written; ``False`` when the audio is
        empty, yields no frames, or the features contain NaN.
    """
    audio, _ = librosa.load(input_name, mono=True, sr=samplerate)
    if not len(audio):
        return False

    emphasized = sigproc.preemphasis(audio, 0.97)
    frames = sigproc.framesig(emphasized, winlen, winstep, np.hanning)
    if not len(frames):
        return False

    energies = np.dot(sigproc.powspec(frames, nfft), banks)
    # Guard the log against exact zeros.
    energies = np.where(energies == 0, np.finfo(float).eps, energies)
    features = np.log(energies).astype(dtype=np.float32)

    # A NaN anywhere propagates into the sum, so one check covers the array.
    if np.isnan(np.sum(features)):
        return False

    np.save(output_name, features)
    return True
def dominant_frequency_features(signal, rate, cutoff, nfft=4096, logging=False):
    """Find the strongest frequency of ``signal`` at or below ``cutoff``.

    The whole signal is treated as a single frame, its power spectrum is
    computed with ``sigproc.powspec``, and the frequency of the largest bin
    inside the cutoff range is reported.

    :param signal: 1-D numpy array of samples (``signal.size`` is used as the
        frame length).
    :param rate: sample rate of ``signal`` in Hz.
    :param cutoff: highest frequency, in Hz, considered for the peak search.
    :param nfft: FFT size handed to ``sigproc.powspec`` (default 4096).
    :param logging: when true, print intermediate shapes and the peak found.
    :returns: ``(maxfreq, maxval)``. ``maxfreq`` is not implemented yet and is
        always 0; ``maxval`` is the frequency, in Hz, of the strongest bin.
    """
    # One frequency per spectrum bin: the power spectrum has nfft/2 + 1
    # columns covering 0 .. rate/2. Integer division keeps `num` an int,
    # which np.linspace requires.
    f = rate / 2 * np.linspace(0, 1, nfft // 2 + 1, dtype=np.float32)
    cutoffidx = np.searchsorted(f, cutoff)
    f = f[0:cutoffidx + 1]

    # Frame the *signal* (not the frequency axis) as one frame.
    # NOTE(review): if signal.size exceeds nfft, powspec presumably analyses
    # only the first nfft samples - confirm against sigproc.
    frames = sigproc.framesig(signal, signal.size, signal.size)
    powersp = sigproc.powspec(frames, nfft)
    if logging:
        print("frames.shape %s, powersp.shape %s" % (str(frames.shape), str(powersp.shape)))

    # Search only the bins kept in `f`, so the index is always valid for it.
    maxidx = powersp[0][:f.size].argmax()
    maxval = f[maxidx]
    if logging:
        print("maxidx %d maxval %f" % (maxidx, maxval))

    maxfreq = 0  # TODO: extract further features from the power spectrum
    return maxfreq, maxval
======= samples, samplerate = sf.read(wav_path, dtype='int16') if not len(samples) > 0: raise ValueError('wav file is empty?') >>>>>>> Server/Server if bandpass and highfreq > lowfreq: samples = butter_bandpass_filter(data=samples, cutoff=[lowfreq, highfreq], fs=samplerate) signal = sigproc.preemphasis(samples, preemph) frames = sigproc.framesig(signal, windowsize * samplerate, stride * samplerate, winfunc=window) if nfft == None: nfft = int(windowsize * samplerate) pspec = sigproc.powspec(frames, nfft) pspec = np.where(pspec == 0, np.finfo(float).eps, pspec) if log_scale == True: feature = np.log(pspec).astype(np.float32) else: feature = pspec.astype(np.float32) # feature = feature.transpose() if normalize: feature = normalize_frames(feature) if duration: return feature, len(samples) / samplerate return feature
def _load_caption_audio(path):
    """Load one audio file, attempting a header repair if the first read fails.

    Returns ``(samples, fs)``, or ``None`` when the file is empty or still
    unreadable after ``fix_wav``.
    """
    try:
        samples, fs = librosa.load(path, sr=None)
    except Exception:
        # Some files had a wrong header; try to repair the file and read it
        # again. Some files cannot be fixed.
        try:
            fix_wav(path)
            samples, fs = librosa.load(path, sr=None)
        except Exception:
            return None
    # Some of the audio files are empty.
    if len(samples) == 0:
        return None
    return samples, fs


def _fft_size_for(window_size):
    """Return the smallest power of two (at least 2) that is >= window_size."""
    fft_size = 2
    while fft_size < window_size:
        fft_size *= 2
    return fft_size


def audio_features(params, img_audio, audio_path, append_name, node_list):
    """Create audio features for every caption of every node and store them.

    For each node a group named ``params['feat']`` is created in
    ``params['output_file']``; each caption's features are appended to an
    EArray inside that group. A node for which no caption features could be
    stored is removed from the file and its name is reported at the end.

    :param params: dict of settings. Keys read here: ``output_file``, ``feat``
        (one of ``'raw'``, ``'freq_spectrum'``, ``'fbanks'``, ``'mfcc'``),
        ``t_window``, ``t_shift``, ``alpha``, ``windowing``, ``nfilters``,
        ``ncep``, ``use_energy``, ``normalise``, ``use_deltas``, ``delta_n``.
    :param img_audio: mapping from a node's base name to a sequence whose
        element 1 is the list of caption file names.
    :param audio_path: directory containing the caption audio files.
    :param append_name: prefix split off node names and prepended to the
        names of the created feature arrays.
    :param node_list: the nodes to process.
    :raises ValueError: if ``params['feat']`` is not a supported feature type.
    """
    output_file = params['output_file']
    # create pytable atom for the features
    f_atom = tables.Float32Atom()
    # keep track of the nodes for which no features could be made
    invalid = []

    for count, node in enumerate(node_list, start=1):
        print(f'processing file: {count}')
        # create a group for the desired feature type
        audio_node = output_file.create_group(node, params['feat'])
        # get the base name of the node this feature will be appended to
        base_name = node._v_name.split(append_name)[1]
        # get the caption file names corresponding to the image of this node
        caption_files = img_audio[base_name][1]

        for cap in caption_files:
            # strip the extension and any folder path from the file name, and
            # replace '-' by '_' in what becomes the array name
            base_capt = cap.split('.')[0]
            if '/' in base_capt:
                base_capt = base_capt.split('/')[-1]
            if '-' in base_capt:
                base_capt = base_capt.replace('-', '_')

            loaded = _load_caption_audio(os.path.join(audio_path, cap))
            if loaded is None:
                # Stop processing this node's captions. If nothing was stored
                # for the node, the whole node is deleted below.
                break
            input_data, fs = loaded

            # set the fft size to the power of two equal to or greater than
            # the window size.
            window_size = int(fs * params['t_window'])
            fft_size = _fft_size_for(window_size)
            frame_shift = int(fs * params['t_shift'])

            # create audio features
            feat_type = params['feat']
            if feat_type == 'raw':
                # pre-emphasise, then frame the emphasised signal
                emphasized = sigproc.preemphasis(input_data,
                                                 coeff=params['alpha'])
                features = sigproc.framesig(emphasized,
                                            frame_len=window_size,
                                            frame_step=frame_shift,
                                            winfunc=params['windowing'])
            elif feat_type == 'freq_spectrum':
                emphasized = sigproc.preemphasis(input_data,
                                                 coeff=params['alpha'])
                frames = sigproc.framesig(emphasized,
                                          frame_len=window_size,
                                          frame_step=frame_shift,
                                          winfunc=params['windowing'])
                # create the power spectrum
                features = sigproc.powspec(frames, fft_size)
            elif feat_type == 'fbanks':
                # create mel filterbank features (frame energy is not used)
                features, _ = base.fbank(input_data, samplerate=fs,
                                         winlen=params['t_window'],
                                         winstep=params['t_shift'],
                                         nfilt=params['nfilters'],
                                         nfft=fft_size, lowfreq=0,
                                         highfreq=None,
                                         preemph=params['alpha'],
                                         winfunc=params['windowing'])
            elif feat_type == 'mfcc':
                # create mfcc features
                features = base.mfcc(input_data, samplerate=fs,
                                     winlen=params['t_window'],
                                     winstep=params['t_shift'],
                                     numcep=params['ncep'],
                                     nfilt=params['nfilters'],
                                     nfft=fft_size, lowfreq=0,
                                     highfreq=None,
                                     preemph=params['alpha'],
                                     ceplifter=0,
                                     appendEnergy=params['use_energy'],
                                     winfunc=params['windowing'])
            else:
                # Without this, an unknown type would silently reuse the
                # previous caption's features.
                raise ValueError(f'unsupported feature type: {feat_type!r}')

            # apply cepstral mean variance normalisation
            if params['normalise']:
                features = (features - features.mean(0)) / features.std(0)
            # optionally add the deltas and double deltas
            if params['use_deltas']:
                single_delta = base.delta(features, params['delta_n'])
                double_delta = base.delta(single_delta, params['delta_n'])
                features = np.concatenate(
                    [features, single_delta, double_delta], 1)

            # create new leaf node in the feature node for the current audio
            # file
            feature_shape = np.shape(features)[1]
            f_table = output_file.create_earray(audio_node,
                                                append_name + base_capt,
                                                f_atom, (0, feature_shape),
                                                expectedrows=5000)
            # append new data to the tables
            f_table.append(features)

        if not audio_node._f_list_nodes():
            # keep track of all the invalid nodes for which no features could
            # be made
            invalid.append(node._v_name)
            # remove the top node including all other features if no caption
            # features could be created
            output_file.remove_node(node, recursive=True)

    print(invalid)
    print(f'There were {len(invalid)} files that could not be processed')
def get_freq_features(frames, NFFT, samplingRate):
    '''
    Extract frequency-domain features from framed signal data.

    The power spectrum of every frame is computed with ``sigproc.powspec``
    and each feature below is derived from that spectrum row.

    :param frames: windowed frames, one per row
    :param NFFT: (integer) the FFT size passed to ``sigproc.powspec``
    :param samplingRate: (integer) the sampling rate of the signal; currently
        unused, so the returned frequencies are NOT converted to Hz
    :return: six numpy arrays with one entry per frame, in this order:
        window_mean_pwr: mean of the power spectrum
        window_n_peaks: number of peaks found by ``signal.find_peaks_cwt``
            with widths 1..9
        window_tot_pw: sum of the absolute power spectrum
        window_pow_var: variance of the power spectrum
        window_max_fr: largest positive ``np.fft.fftfreq`` value for the
            spectrum length (identical for every frame)
        window_dominating_frequencies: the positive ``np.fft.fftfreq`` value
            at which the FFT of the power spectrum has its largest magnitude
    '''
    window_mean_pwr = []
    window_n_peaks = []
    window_tot_pw = []
    window_pow_var = []
    window_max_fr = []
    window_dominating_frequencies = []

    frames_pw_spec = sigproc.powspec(frames, NFFT)

    # Every spectrum row has the same length, so the frequency axis and the
    # positive-frequency selection are computed once, outside the loop.
    peak_widths = np.arange(1, 10)
    frequencies = np.fft.fftfreq(np.shape(frames_pw_spec)[1])
    positive = np.where(frequencies > 0)
    positive_frequencies = frequencies[positive]

    for frame in frames_pw_spec:
        # number of peaks
        peakind = signal.find_peaks_cwt(frame, peak_widths)
        window_n_peaks.append(len(peakind))

        # mean, total power and power variance
        window_mean_pwr.append(np.mean(frame))
        window_tot_pw.append(np.sum(np.absolute(frame)))
        window_pow_var.append(np.var(frame))

        # Find the peak in the coefficients (magnitude spectrum of the
        # power-spectrum row, positive frequencies only).
        magnitudes = abs(np.fft.fft(frame)[positive])
        peak_index = np.argmax(magnitudes)
        window_dominating_frequencies.append(positive_frequencies[peak_index])
        window_max_fr.append(positive_frequencies.max())

    return np.array(window_mean_pwr), \
        np.array(window_n_peaks), \
        np.array(window_tot_pw), \
        np.array(window_pow_var), \
        np.array(window_max_fr), \
        np.array(window_dominating_frequencies)