def test_frame_sig(self):
    """Compare ``sigproc.framesig`` with and without ``stride_trick``.

    A random signal is framed twice with identical frame length and step.
    The two results must have the same shape and contents, and the
    stride-trick call must take less wall-clock time than the plain one.
    """
    num_samples = 10000124
    framing = dict(frame_len=37, frame_step=13)
    x = np.random.rand(num_samples)

    started = time.time()
    y_old = sigproc.framesig(x, stride_trick=False, **framing)
    switched = time.time()
    y_new = sigproc.framesig(x, stride_trick=True, **framing)
    t_new = time.time() - switched
    t_old = switched - started

    self.assertTupleEqual(y_old.shape, y_new.shape)
    np.testing.assert_array_equal(y_old, y_new)
    # Timing-based assertion: the stride-trick path is expected to be faster.
    self.assertLess(t_new, t_old)
    print('new run time %3.2f < %3.2f sec' % (t_new, t_old))
def specdecomp(signal, samplerate=16000, winlen=0.025, winstep=0.01,
               nfft=512, lowfreq=0, highfreq=None, preemph=0.97,
               winfunc=lambda x: np.ones((x, )), decomp='complex'):
    """Frame a signal and return the requested per-frame decomposition.

    The signal is pre-emphasised and split into frames; depending on
    ``decomp`` either the frames themselves or a spectrum derived from them
    is returned.

    :param signal: the audio signal to decompose.
    :param samplerate: the samplerate of the signal; used to convert winlen
        and winstep from seconds to samples.
    :param winlen: the length of the analysis window in seconds.
    :param winstep: the step between successive windows in seconds.
    :param nfft: the FFT length passed to ``np.fft.rfft`` / ``sigproc.powspec``.
    :param lowfreq: not used by this function.
    :param highfreq: not used by this function.
    :param preemph: coefficient passed to ``sigproc.preemphasis``.
    :param winfunc: the analysis window passed to ``sigproc.framesig``.
    :param decomp: selects the output:
        ``'time'``/``'frames'`` -> the framed signal;
        ``'magnitude'``/``'mag'``/``'abs'`` -> ``np.abs`` of the real FFT;
        ``'phase'``/``'angle'`` -> ``np.angle`` of the real FFT;
        ``'power'``/``'powspec'`` -> ``sigproc.powspec(frames, nfft)``;
        anything else (including the default ``'complex'``) -> the complex
        real FFT of the frames.
    :returns: the array selected by ``decomp``, one row per frame.
    """
    signal = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(signal, winlen * samplerate,
                              winstep * samplerate, winfunc)

    if decomp in ('time', 'frames'):
        return frames
    if decomp in ('power', 'powspec'):
        # powspec works from the frames directly, so no FFT is needed here.
        return sigproc.powspec(frames, nfft)

    complex_spec = np.fft.rfft(frames, nfft)
    if decomp in ('magnitude', 'mag', 'abs'):
        return np.abs(complex_spec)
    if decomp in ('phase', 'angle'):
        return np.angle(complex_spec)
    return complex_spec
def specspreadcent_xtr_func(sig, fs, args, winlen, winstep):
    """Spectral spread and centroid of windows - based on
    pyAudioAnalysis.audioFeatureExtraction library
    [github.com/tyiannak/pyAudioAnalysis]

    :param sig: the signal to analyse.
    :param fs: sample rate of ``sig``; also used to normalise the results
        by ``fs / 2``.
    :param args: not used by this function.
    :param winlen: window length in seconds (multiplied by ``fs`` for framing).
    :param winstep: window step in seconds (multiplied by ``fs`` for framing).
    :returns: array with one ``[centroid, spread]`` row per frame.
    """
    chnkd_sig = framesig(sig, winlen * fs, winstep * fs)
    nyquist = fs / 2.0

    res = []
    for chnk in chnkd_sig:
        spec = get_win_fft(chnk, winlen, fs)
        # Frequency assigned to each spectrum bin.
        ind = (np.arange(1, len(spec) + 1)) * (fs / (2.0 * len(spec)))

        # Peak-normalised spectrum used as the weighting.
        Xt = spec / spec.max()
        NUM = np.sum(ind * Xt)
        DEN = np.sum(Xt) + eps

        # Centroid: weighted mean frequency.
        C = NUM / DEN
        # Spread: weighted standard deviation around the centroid.
        S = np.sqrt(np.sum(((ind - C)**2) * Xt) / DEN)

        # Normalize both by fs / 2.
        res.append([C / nyquist, S / nyquist])
    return np.array(res)
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
          winfunc=lambda x:numpy.ones((x,))):
    """Compute Mel-filterbank energy features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :param winfunc: the analysis window to apply to each frame. By default no window is applied.
    :returns: 2 values. The first is the filterbank energies, one row per frame and one
        column per filter. The second is the per-frame sum of the power spectrum.
        Zeros in either are replaced by the float machine epsilon.
    """
    if not highfreq:
        highfreq = samplerate/2
    tiny = numpy.finfo(float).eps

    emphasised = sigproc.preemphasis(signal, preemph)
    windows = sigproc.framesig(emphasised, winlen*samplerate, winstep*samplerate, winfunc)
    power = sigproc.powspec(windows, nfft)

    # Total energy per frame; zeros would break a later log.
    frame_energy = numpy.sum(power, 1)
    frame_energy = numpy.where(frame_energy == 0, tiny, frame_energy)

    # Filterbank energies; zeros would break a later log.
    filters = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    band_energy = numpy.dot(power, filters.T)
    band_energy = numpy.where(band_energy == 0, tiny, band_energy)

    return band_energy, frame_energy
def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
        nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
        winfunc=lambda x:numpy.ones((x,))):
    """Compute Spectral Subband Centroid features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :param winfunc: the analysis window to apply to each frame. By default no window is applied.
    :returns: A numpy array with one row per frame and one column per filter.
    """
    if not highfreq:
        highfreq = samplerate/2

    emphasised = sigproc.preemphasis(signal, preemph)
    windows = sigproc.framesig(emphasised, winlen*samplerate, winstep*samplerate, winfunc)
    power = sigproc.powspec(windows, nfft)
    # An all-zero spectrum would make the division below ill-defined.
    power = numpy.where(power == 0, numpy.finfo(float).eps, power)

    filters = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    band_energy = numpy.dot(power, filters.T)

    # One frequency weight per spectrum bin, repeated for every frame.
    num_frames = numpy.size(power, 0)
    num_bins = numpy.size(power, 1)
    bin_weights = numpy.linspace(1, samplerate/2, num_bins)
    weights = numpy.tile(bin_weights, (num_frames, 1))

    weighted_energy = numpy.dot(power*weights, filters.T)
    return weighted_energy / band_energy
def compute_rmfcc(self, pad, save=False):
    """Compute per-file MFCC rows and LPC analysis filters for ``self.wavpaths``.

    For every ``(path, key)`` entry in ``self.wavpaths`` the audio is loaded,
    framed (500 samples, step 250), and for each frame an order-12
    ``lpc.autocor`` filter plus the first MFCC row of the frame are collected.
    Results are stored in ``self.rmfcc[key]`` (padded via
    ``self.pad_sequence_into_array``) and ``self.af[key]``.

    :param pad: ``maxlen`` passed to ``self.pad_sequence_into_array``.
    :param save: currently not consulted; ``analysis_filt.pkl`` is always
        written. NOTE(review): ``compute_lpcc`` honours its ``save`` flag --
        confirm whether this method should as well.
    """
    self.af = {}
    self.rmfcc = {}
    for fno, f in enumerate(self.wavpaths):
        # Progress indicator every 10 files.
        if fno % 10 == 0:
            print(fno)
        sig, sr = librosa.load(f[0])
        frames = sigproc.framesig(sig, frame_len=500, frame_step=250)
        af = []
        rmfcc = []
        for frame in frames:
            analysis_filt = lpc.autocor(frame, 12)
            af.append(analysis_filt)
            # NOTE(review): the residual is computed but never used; the MFCC
            # below is taken from the raw frame. Confirm whether the residual
            # was meant to be the MFCC input.
            residual = np.array(list(analysis_filt(frame)))
            temp = list(python_speech_features.mfcc(frame, sr, winlen=0.022))
            rmfcc.append(temp[0])
        rmfcc, _ = self.pad_sequence_into_array(np.array(rmfcc).transpose(), maxlen=pad)
        print(rmfcc.shape)
        self.rmfcc[f[1]] = rmfcc
        self.af[f[1]] = af
    with open('analysis_filt.pkl', 'wb') as out_file:
        pickle.dump(self.af, out_file)
def extract_features_file(filename):
    """Extract feature vectors for file

    Frames the file into non-overlapping windows of ``winlen_samp`` samples
    and extracts a featurevector for every frame.

    :param filename: filename of the IRMAS dataset
    :return: 2D-numpy array with the shape (numframes, numfeatures)
        NOTE: if numframes == 1 it returns a single featurevector, and
        ``None`` if there are no frames.
    """
    # read wav file
    _, data = wav.read(filename)
    # Multi-channel audio: keep channel index 1 (as before). Single-channel
    # data is already 1-D and is used as is instead of failing on the index.
    if data.ndim > 1:
        data = data[:, 1]

    frames = framesig(data, winlen_samp, winlen_samp)
    per_frame = [extract_features_window(frame) for frame in frames]

    if not per_frame:
        return None
    if len(per_frame) == 1:
        return per_frame[0]
    # Stack once instead of re-allocating the result for every frame.
    return np.vstack(per_frame)
def ssc(signal, samplerate=16000, winlen=0.025, winstep=0.01,
        nfilt=55, nfft=2048, lowfreq=0, highfreq=None, preemph=0.97,
        winfunc=lambda x: numpy.ones((x, ))):
    """Compute Spectral Subband Centroid features from an audio signal.

    The signal is pre-emphasised, framed and converted to a power spectrum;
    the result is the frequency-weighted filterbank energy divided by the
    plain filterbank energy, one row per frame and one column per filter.
    """
    if not highfreq:
        highfreq = samplerate / 2

    framed = sigproc.framesig(sigproc.preemphasis(signal, preemph),
                              winlen * samplerate, winstep * samplerate,
                              winfunc)
    spectrum = sigproc.powspec(framed, nfft)
    # Replace exact zeros so the final division is well-defined.
    spectrum = numpy.where(spectrum == 0, numpy.finfo(float).eps, spectrum)

    bank = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    energies = numpy.dot(spectrum, bank.T)

    # Frequency weight for every bin, tiled over all frames.
    row = numpy.linspace(1, samplerate / 2, numpy.size(spectrum, 1))
    grid = numpy.tile(row, (numpy.size(spectrum, 0), 1))
    return numpy.dot(spectrum * grid, bank.T) / energies
def fbank(signal, samplerate=16000, winlen=0.025, winstep=0.01,
          nfilt=55, nfft=2048, lowfreq=0, highfreq=None, preemph=0.97,
          winfunc=lambda x: numpy.ones((x, ))):
    """Compute Mel-filterbank energy features from an audio signal.

    Returns ``(feat, energy)``: the filterbank energies per frame and the
    per-frame sum of the power spectrum. Exact zeros in either output are
    replaced by the float machine epsilon so a later log is safe.
    """
    if not highfreq:
        highfreq = samplerate / 2
    floor = numpy.finfo(float).eps

    framed = sigproc.framesig(sigproc.preemphasis(signal, preemph),
                              winlen * samplerate, winstep * samplerate,
                              winfunc)
    spectrum = sigproc.powspec(framed, nfft)

    # Total energy in each frame.
    total = numpy.sum(spectrum, 1)
    total = numpy.where(total == 0, floor, total)

    # Energy in each filterbank channel.
    bank = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    channels = numpy.dot(spectrum, bank.T)
    channels = numpy.where(channels == 0, floor, channels)

    return channels, total
def cspec(signal, samplerate=16000, winlen=0.025, winstep=0.01,
          nfft=512, preemph=0.97, winfunc=lambda x: np.ones((x, ))):
    """Compute STFT coefficients from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfft: the FFT size. Default is 512.
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=np.hamming
    :returns: the complex result of ``np.fft.rfft(frames, nfft)``, one row per frame.
    """
    signal = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(signal, winlen * samplerate,
                              winstep * samplerate, winfunc)
    frame_len = np.shape(frames)[1]
    if frame_len > nfft:
        # rfft silently crops frames longer than nfft, so tell the caller.
        logging.warning(
            'frame length (%d) is greater than FFT size (%d), frame will be truncated. '
            + 'Increase NFFT to avoid.', frame_len, nfft)
    return np.fft.rfft(frames, nfft)
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
          winfunc=lambda x:numpy.ones((x,))):
    """Compute Mel-filterbank energy features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
    :returns: 2 values. The first is the filterbank energies, one row per frame and one
        column per filter. The second is the per-frame sum of the power spectrum.
    """
    def _no_zeros(values):
        # Exact zeros cause problems once a log is taken downstream.
        return numpy.where(values == 0, numpy.finfo(float).eps, values)

    if not highfreq:
        highfreq = samplerate/2

    signal = sigproc.preemphasis(signal, preemph)
    frame_len = winlen*samplerate
    frame_step = winstep*samplerate
    pspec = sigproc.powspec(
        sigproc.framesig(signal, frame_len, frame_step, winfunc), nfft)

    energy = _no_zeros(numpy.sum(pspec, 1))
    fb = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    feat = _no_zeros(numpy.dot(pspec, fb.T))
    return feat, energy
def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
        nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
        winfunc=lambda x:numpy.ones((x,))):
    """Compute Spectral Subband Centroid features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
    :returns: A numpy array with one row per frame and one column per filter.
    """
    if not highfreq:
        highfreq = samplerate/2

    signal = sigproc.preemphasis(signal, preemph)
    frame_len = winlen*samplerate
    frame_step = winstep*samplerate
    pspec = sigproc.powspec(
        sigproc.framesig(signal, frame_len, frame_step, winfunc), nfft)
    # Guard against an all-zero spectrum before dividing.
    pspec = numpy.where(pspec == 0, numpy.finfo(float).eps, pspec)

    fb = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    denominator = numpy.dot(pspec, fb.T)

    # Per-bin frequency weights, repeated for each frame.
    n_frames, n_bins = numpy.size(pspec, 0), numpy.size(pspec, 1)
    R = numpy.tile(numpy.linspace(1, samplerate/2, n_bins), (n_frames, 1))
    numerator = numpy.dot(pspec*R, fb.T)
    return numerator / denominator
def stft(sig, rate):
    """Return the log10 magnitude spectrum of each frame of ``sig``.

    Frames are ``FRAME_LENGTH * rate`` samples long, stepped by
    ``FRAME_SHIFT * rate`` and windowed with ``squared_hann``; the FFT size
    equals the (integer) frame length.
    """
    frame_len = FRAME_LENGTH*rate
    frames = sigproc.framesig(sig, frame_len, FRAME_SHIFT*rate,
                              winfunc=squared_hann)
    magnitude = np.absolute(np.fft.rfft(frames, int(frame_len)))
    # The 1e-7 offset avoids log(0); log10 makes dB conversion easier.
    return np.log10(magnitude+1e-7)
def prepare_tfrecord(example_paths, destination_path, max_len, nfft=256, noverlap=128):
    '''
    Converts a set of inputs into spectrograms and saves them to disk in a
    TensorFlow native format.

    :param example_paths: paths of the audio files to convert. The label is
        parsed from the file name as the integer in its second
        '-'-separated field.
    :param destination_path: path of the TFRecords file to write.
    :param max_len: number of samples every example is padded or cut to.
    :param nfft: frame length and FFT size used for the spectrum.
    :param noverlap: passed to framesig as the frame step.
    :return: A tuple (min_val, max_val, mean) for the set.
    :raises ZeroDivisionError: if no example could be loaded.
    '''
    features_min, features_max = None, None
    features_count, features_sum = 0, 0

    # Open a TFRecords file for writing.
    writer = tf.python_io.TFRecordWriter(destination_path)
    try:
        for path in example_paths:
            # Load an audio file for preprocessing.
            try:
                samples, _ = librosa.load(path)
            except NoBackendError:
                print('Warning: Could not load {}.'.format(path))
                continue

            # Pad or shorten the number audio samples to max length.
            num_samples = samples.shape[0]
            if num_samples < max_len:
                samples = np.pad(samples, (0, max_len - num_samples), 'constant')
            elif num_samples > max_len:
                samples = samples[:max_len]

            # Generate a log power spectrum of the audio samples.
            frames = framesig(samples, nfft, noverlap, winfunc=np.hanning)
            spectrum = np.abs(logpowspec(frames, nfft, norm=0))
            spectrum = np.transpose(np.flip(spectrum, 1)).astype(np.float32)

            label = int(os.path.split(path)[-1].split('-')[1])

            # Keep track of the dataset statistics.
            new_min = np.min(spectrum)
            new_max = np.max(spectrum)
            if features_min is None or new_min < features_min:
                features_min = new_min
            if features_max is None or new_max > features_max:
                features_max = new_max
            features_count += np.prod(spectrum.shape)
            features_sum += np.sum(spectrum)

            # Write the final spectrum and label to disk.
            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'spectrum': bytes_feature(spectrum.flatten().tobytes()),
                    'label': int64_feature(label)
                }))
            writer.write(example.SerializeToString())
    finally:
        # Always release the file handle, even if an example fails.
        writer.close()

    # Return the dataset statistics.
    return features_min, features_max, float(features_sum) / features_count
def load_frames(in_file, srate=16000):
    '''Load an audio file and return its pre-emphasised frames.

    @in_file: input audio file, loaded with ``librosa.load``
    @srate: sample rate requested from ``librosa.load``
    @return: frames of 0.025 s length with a 0.01 s step, computed at the
        sample rate reported by ``librosa.load``
    '''
    # ``sr`` is passed by keyword: recent librosa releases no longer accept
    # it positionally.
    signal, srate = librosa.load(in_file, sr=srate)
    pre_emphed = sigproc.preemphasis(signal, coeff=0.95)
    frames = sigproc.framesig(pre_emphed, 0.025 * srate, 0.01 * srate)
    return frames
def calc_sp(mix, clean, data_type, Win_Length, Offset_Length):
    """
    Frame the mixture and the clean signal identically in the time domain.

    :param mix: 1-D vector
    :param clean: 1-D vector
    :param data_type: not used by this function
    :param Win_Length: the length of the window function
    :param Offset_Length: the offset length between adjacent frames
    :return: tuple ``(mix_frames, clean_frames)``
    """
    framing = dict(frame_len=Win_Length, frame_step=Offset_Length)
    mix_frames = framesig(mix, **framing)
    clean_frames = framesig(clean, **framing)
    return mix_frames, clean_frames
def zcr_xtr_func(sig, fs, args, winlen, winstep):
    """Sign-change rate of signal per frame.

    For every frame, counts the adjacent sample pairs whose product is
    negative (the count is not divided by the frame length).

    :param sig: the signal to analyse.
    :param fs: sample rate of ``sig``.
    :param args: not used by this function.
    :param winlen: window length in seconds (multiplied by ``fs`` for framing).
    :param winstep: window step in seconds (multiplied by ``fs`` for framing).
    :returns: array with one count per frame.
    """
    chnkd_sig = framesig(sig, winlen * fs, winstep * fs)
    return np.array(
        [np.sum(chnk[:-1] * chnk[1:] < 0) for chnk in chnkd_sig])
def _powspec_cord(self, data, fs):
    """Return the power spectrum of ``data`` using ``self._cording_params``.

    Reads ``preemph``, ``winlen``, ``winstep`` and ``winfunc`` from
    ``self._cording_params``; the FFT size is the window length in samples
    (``int(winlen * fs)``).
    """
    params = self._cording_params
    preemph, winlen, winstep, winfunc = [
        params[key] for key in ("preemph", "winlen", "winstep", "winfunc")]
    fft_size = int(winlen * fs)

    emphasised = preemphasis(data, preemph)
    windows = framesig(emphasised, winlen * fs, winstep * fs, winfunc)
    return powspec(windows, fft_size)
def extract(sig):
    """Build the per-frame feature matrix for ``sig``.

    Columns, in order: MFCCs (``numcep=13``), their deltas, their
    delta-deltas, the zero-crossing rate and the energy of each frame.
    The whole matrix is divided by 100000 before it is returned.

    The MFCC rows and the ``sigproc.framesig`` rows are concatenated side by
    side, so both framings must produce the same number of frames.

    :param sig: the audio signal.
    :returns: 2-D array with one row per frame.
    """
    # framing
    sig_frames = sigproc.framesig(sig=sig, frame_len=FRAME_LENGTH,
                                  frame_step=FRAME_STEP)
    frame_len = sig_frames.shape[1]

    # MFCCs with first and second order deltas.
    mfcc_feat = mfcc(signal=sig, samplerate=SAMPLE_RATE, winlen=WINDOW_LENGTH,
                     winstep=WINDOW_STEP, numcep=13, preemph=PRE_EMPH,
                     winfunc=WINDOW_FUNCTION)
    mfcc_feat_delta = delta(mfcc_feat, 20)
    mfcc_feat_delta_delta = delta(mfcc_feat_delta, 20)

    # Zero-crossing rate: zero counts as positive, so np.sign cannot be used.
    # Each sign flip contributes |(+1) - (-1)| / 2 = 1.
    signs = np.where(sig_frames >= 0, 1, -1)
    zcrs = np.abs(np.diff(signs, axis=1)).sum(axis=1) / 2.0 / frame_len

    # Energy: lag-0 autocorrelation, i.e. mean of the squared samples.
    energys = (1.0 / frame_len) * np.sum(sig_frames * sig_frames, axis=1)

    frames_feats = np.concatenate(
        (mfcc_feat, mfcc_feat_delta, mfcc_feat_delta_delta,
         zcrs.reshape(-1, 1), energys.reshape(-1, 1)),
        axis=1)
    return frames_feats/100000
def parse_audio(self, audio_path):
    """Load an audio file and turn it into a feature tensor.

    The audio is loaded (randomly augmented when ``self.augment`` is set),
    optionally mixed with noise with probability ``self.noise_prob``, and
    converted according to ``self.feature_type``:

    * ``'rawspeech'``  - the signal itself as a single row;
    * ``'rawframes'``  - the framed signal, transposed;
    * ``'spectrogram'`` - ``sigproc.magspec`` of the frames, transposed;
    * ``'mfcc'``       - MFCCs stacked with their deltas and delta-deltas;
    * ``'logmel'``     - log Mel-filterbank features stacked the same way.

    When ``self.normalize`` is set, the mean over dimension 0 is subtracted
    and the result divided by the standard deviation over dimension 0.

    :param audio_path: path of the audio file to load.
    :returns: a ``torch.FloatTensor``; ``None`` for an unrecognised
        ``self.feature_type`` when normalisation is off.
    """
    if self.augment:
        y = load_randomly_augmented_audio(audio_path, self.sample_rate)
    else:
        y = load_audio(audio_path)

    if self.noiseInjector:
        add_noise = np.random.binomial(1, self.noise_prob)
        if add_noise:
            y = self.noiseInjector.inject_noise(y)

    def to_tensor(feat):
        # Features are computed frame-major; the tensors are feature-major.
        return torch.FloatTensor(feat.transpose())

    def with_deltas(base):
        # Stack a feature matrix with its first and second order deltas.
        first = mfccdelta(base, 2)
        second = mfccdelta(first, 2)
        return torch.cat((to_tensor(base), to_tensor(first), to_tensor(second)), 0)

    # Compute features
    features = None
    if self.feature_type=='rawspeech':
        # Raw speech signal (dimension = 1 X # of samples)
        features = torch.FloatTensor(y.reshape((1,len(y))))
    elif self.feature_type in ('rawframes', 'spectrogram'):
        # Only these two feature types need the framed signal.
        frame_len_ = self.sample_rate*self.window_size
        frame_step_ = self.sample_rate*self.window_stride
        frames = sigproc.framesig(y,frame_len=frame_len_,frame_step=frame_step_)
        if self.feature_type=='spectrogram':
            frames = sigproc.magspec(frames,NFFT=int(frame_len_))
        # Dimension = frame length (or spectrum bins) X # of frames
        features = to_tensor(frames)
    elif self.feature_type=='mfcc':
        # MFCCs
        features = with_deltas(
            mfcc(y,self.sample_rate,winlen=self.window_size,
                 winstep=self.window_stride,numcep=13,nfilt=26))
    elif self.feature_type=='logmel':
        # Log Mel-FB features
        features = with_deltas(
            logfbank(y,self.sample_rate,winlen=self.window_size,
                     winstep=self.window_stride,nfilt=26))

    if self.normalize:
        # keepdim=True keeps a leading dimension of 1, so the statistics
        # broadcast over dimension 0 without being tiled by hand.
        mean = torch.mean(features,0,keepdim=True)
        std = torch.std(features,0,keepdim=True)
        features = (features-mean)/std
    return features
def energy_xtr_func(sig, fs, args, winlen, winstep):
    """Sum of squares of signal values, normalised by window length.

    :param sig: the signal to analyse.
    :param fs: sample rate of ``sig``.
    :param args: not used by this function.
    :param winlen: window length in seconds (multiplied by ``fs`` for framing).
    :param winstep: window step in seconds (multiplied by ``fs`` for framing).
    :returns: array with one normalised energy value per frame.
    """
    chnkd_sig = framesig(sig, winlen * fs, winstep * fs)
    return np.array(
        [1. / len(chnk) * np.sum(chnk**2) for chnk in chnkd_sig])
def power_(signal, samplerate=16000, winlen=0.08, winstep=0.04,
           nfft=2048, preemph=0.97, winfunc=lambda x: numpy.ones((x, ))):
    """Return the power spectrum of the pre-emphasised, framed signal.

    :param signal: the audio signal.
    :param samplerate: sample rate used to convert winlen/winstep to samples.
    :param winlen: analysis window length in seconds.
    :param winstep: step between successive windows in seconds.
    :param nfft: the FFT size passed to ``sigproc.powspec``.
    :param preemph: coefficient passed to ``sigproc.preemphasis``.
    :param winfunc: analysis window passed to ``sigproc.framesig``.
    :returns: the result of ``sigproc.powspec`` for the frames.
    """
    emphasised = sigproc.preemphasis(signal, preemph)
    frame_len = winlen * samplerate
    frame_step = winstep * samplerate
    windows = sigproc.framesig(emphasised, frame_len, frame_step, winfunc)
    return sigproc.powspec(windows, nfft)
def get_features(file_name):
    """Return a flat list of average band energies for a wav file.

    The signal is framed with a frame length of ``len(sig) / 20.5`` samples
    and a half-frame step. For each frame a spectrogram is computed and its
    rows are sorted into six frequency bands (<=400, <=800, <=1600, <=2800,
    <=4400 and above 4400 Hz); six band averages are emitted per frame.

    NOTE(review): the band pools are never reset between frames, so each
    frame's averages also include the values of all earlier frames. Confirm
    whether that accumulation is intended.

    :param file_name: path of the wav file.
    :returns: list of floats, six consecutive values per frame.
    :raises ZeroDivisionError: if a band pool is still empty when averaged.
    """
    (rate, sig) = wavfile.read(file_name)
    # The sample rate used below is the one reported by the wave header.
    with contextlib.closing(wave.open(file_name, 'r')) as f:
        total_frames = f.getnframes()
        rate = f.getframerate()
        duration = total_frames / float(rate)

    # Frame the signal with 50% overlap.
    number_of_frames = 40
    frame_len = len(sig) / (number_of_frames * (.5) + .5)
    frames = framesig(sig, frame_len, frame_len * .5)

    # Inclusive upper edge (Hz) of every band except the last, open one.
    upper_edges = (400, 800, 1600, 2800, 4400)
    band_pools = [[] for _ in range(len(upper_edges) + 1)]

    values = []
    for frame in frames:
        spectrum, freqs, t, img = specgram(frame, Fs=rate)
        for i, freq in enumerate(freqs):
            # Number of edges strictly below freq == index of its band.
            band = sum(1 for edge in upper_edges if freq > edge)
            band_pools[band].extend(spectrum[i])
        values.extend([sum(pool) / len(pool) for pool in band_pools])
    return values
def Make_Spect(wav_path, windowsize, stride, window=np.hamming,
               bandpass=False, lowfreq=0, highfreq=0, preemph=0.97,
               duration=False, nfft=None, normalize=True):
    """
    Compute a log power spectrogram of a wav file.

    The file is read as float32 samples, optionally band-pass filtered,
    pre-emphasised, framed and converted to a log power spectrum.

    :param wav_path: path of the audio file.
    :param windowsize: analysis window length in seconds.
    :param stride: step between successive windows in seconds.
    :param window: window function, default to np.hamming
    :param bandpass: apply ``butter_bandpass_filter`` first; only takes
        effect when ``highfreq > lowfreq``.
    :param lowfreq: lower cutoff of the band-pass filter.
    :param highfreq: upper cutoff of the band-pass filter.
    :param preemph: coefficient passed to ``sigproc.preemphasis``.
    :param duration: also return the length of the audio in seconds.
    :param nfft: FFT size; defaults to the window length in samples.
    :param normalize: pass the result through ``normalize_frames``.
    :return: the float32 log power spectrum, one row per frame; when
        ``duration`` is set, a tuple ``(feature, seconds)``.
    """
    samples, samplerate = sf.read(wav_path, dtype='float32')

    if bandpass and highfreq > lowfreq:
        samples = butter_bandpass_filter(data=samples,
                                         cutoff=[lowfreq, highfreq],
                                         fs=samplerate)

    signal = sigproc.preemphasis(samples, preemph)
    frames = sigproc.framesig(signal, windowsize * samplerate,
                              stride * samplerate, winfunc=window)

    if nfft is None:
        nfft = int(windowsize * samplerate)

    pspec = sigproc.powspec(frames, nfft)
    # Replace exact zeros so the log below stays finite.
    pspec = np.where(pspec == 0, np.finfo(float).eps, pspec)

    feature = np.log(pspec).astype(np.float32)
    if normalize:
        feature = normalize_frames(feature)

    if duration:
        return feature, len(samples) / samplerate
    return feature
def datafeed():
    """Stream log-power-spectrum chunks of ``args.track`` into ``data_w``.

    The track is resampled to ``resampled.wav`` unless it is already a wav
    file at 16 kHz, peak-normalised, and then walked in 30 chunks per second.
    Each chunk is framed (256 samples, step 80, Hamming window), converted
    with ``logpowspec``, rescaled linearly from ``[audio_min, audio_max]`` to
    ``rng`` and put on ``data_w`` as ``(0, array)``. The stream is wrapped in
    ``'start'`` / ``'end'`` markers and paced with short sleeps.

    Only the temporary ``resampled.wav`` is deleted afterwards; the caller's
    own track is left in place.
    """
    data_w.put('start')
    loc, sec, i = 0, 0, 0

    made_temp_copy = False
    if '.wav' not in args.track:
        rsmpfile = 'resampled.wav'
        resampling(args.track, rsmpfile)
        made_temp_copy = True
    else:
        rsmpfile = args.track
    data_wav, fs = soundfile.read(rsmpfile)
    if fs != 16000:
        rsmpfile = 'resampled.wav'
        resampling(args.track, rsmpfile)
        made_temp_copy = True
        data_wav, fs = soundfile.read(rsmpfile)

    # Peak-normalise in place.
    data_wav /= np.amax(np.abs(data_wav))

    # 31 edges splitting one second of samples into 30 chunks. The builtin
    # int replaces the np.int alias, which no longer exists in NumPy.
    idxs = np.linspace(0, fs, 31, endpoint=True, dtype=int)
    rest = [0.0325, 0.0335, 0.0325]

    # Linear map taking audio_min -> rng[0] and audio_max -> rng[1].
    slope = (rng[1] - rng[0]) / (audio_max - audio_min)
    intersec = rng[1] - slope * audio_max

    data_w.put('start')
    if vlclib:
        vlcplayer.play()
    if enable_record:
        ws.call(requests.StartRecording())

    NFFT = int(2**(np.ceil(np.log2(256))))
    while loc < data_wav.shape[0]:
        prv = idxs[i] + fs * sec
        loc = idxs[i + 1] + fs * sec
        frames = framesig(data_wav[prv:loc], 256, 80, winfunc=np.hamming)
        stft_data = logpowspec(frames, NFFT)
        stft_data = (stft_data * slope) + intersec
        stft_data = np.swapaxes(stft_data, 0, 1).astype(np.float32)
        if stft_data.shape[1] != 5:
            # A chunk without exactly 5 frames ends the stream: send one
            # block filled with the lower range bound and stop.
            stft_data = np.ones((129, 5), dtype=np.float32) * rng[0]
            data_w.put((0, stft_data.copy()))
            break
        data_w.put((0, stft_data.copy()))
        sleep(rest[i % 3])
        if i >= 29:
            i = 0
            sec += 1
        else:
            i += 1

    # Never delete the user's input file, only the copy created here.
    if made_temp_copy:
        os.remove(rsmpfile)
    data_w.put('end')
    return
def fft_sam(signal, samplerate=16000, winlen=0.08, winstep=0.04,
            nfft=2048, preemph=0.97, winfunc=lambda x: numpy.ones((x, ))):
    """Return the bin-to-bin difference of each frame's magnitude spectrum.

    The signal is pre-emphasised and framed, the float32 magnitude of the
    FFT is taken and cut to the first ``nfft // 2 + 1`` bins. Column 0 keeps
    its magnitude; every other column holds the difference to the bin on
    its left.

    :returns: float32 array with one row per frame.
    """
    emphasised = sigproc.preemphasis(signal, preemph)
    windows = sigproc.framesig(emphasised, winlen * samplerate,
                               winstep * samplerate, winfunc)

    kept_bins = int(numpy.floor(nfft / 2)) + 1
    magnitude = numpy.float32(numpy.absolute(numpy.fft.fft(windows, nfft)))
    feat = magnitude[:, 0:kept_bins]

    # The right-hand side is evaluated fully before the assignment, so this
    # is a first difference of the original values, not a running one.
    feat[:, 1:] = feat[:, 1:] - feat[:, :-1]
    return feat
def get_complex_spec(wav_, winstep, winlen, with_time_scaled=False):
    """Return the complex spectrum of a wav file's frames.

    :param wav_: path of the wav file, read with ``wav.read``.
    :param winstep: step between successive windows in seconds.
    :param winlen: analysis window length in seconds.
    :param with_time_scaled: also compute the spectrum of the frames
        multiplied by their sample index.
    :returns: tuple ``(complex_spec, time_scaled_complex_spec)``; the second
        item is ``None`` unless ``with_time_scaled`` is set.
    """
    rate, sig = wav.read(wav_)
    emphasised = preemphasis(sig, PREEMPH)
    frames = framesig(emphasised, winlen * rate, winstep * rate,
                      HAMMING_WINFUNC)
    complex_spec = np.fft.rfft(frames, NFFT)

    if not with_time_scaled:
        return complex_spec, None

    # Weight every sample by its position inside the frame.
    sample_index = np.arange(frames.shape[-1])
    return complex_spec, np.fft.rfft(sample_index * frames, NFFT)
def job(input_name, output_name):
    """Compute log filterbank features for one file and save them.

    The audio is loaded as mono at ``samplerate``, pre-emphasised (0.97),
    framed with a Hanning window, converted to a power spectrum, projected
    onto ``banks`` and log-compressed as float32.

    :param input_name: path of the audio file to load.
    :param output_name: target passed to ``np.save``.
    :returns: ``True`` if features were saved; ``False`` when the audio or
        the framing is empty, or when the features contain NaN.
    """
    audio, _ = librosa.load(input_name, mono=True, sr=samplerate)
    if len(audio) == 0:
        return False

    emphasised = sigproc.preemphasis(audio, 0.97)
    frames = sigproc.framesig(emphasised, winlen, winstep, np.hanning)
    if len(frames) == 0:
        return False

    energies = np.dot(sigproc.powspec(frames, nfft), banks)
    # Replace exact zeros so the log stays finite.
    energies = np.where(energies == 0, np.finfo(float).eps, energies)
    features = np.log(energies).astype(dtype=np.float32)

    # A NaN anywhere makes the total NaN.
    if np.isnan(np.sum(features)):
        return False

    np.save(output_name, features)
    return True
def energyentropy_xtr_func(sig, fs, args, winlen, winstep):
    """Entropy of energy - based on
    pyAudioAnalysis.audioFeatureExtraction library
    [github.com/tyiannak/pyAudioAnalysis]

    :param sig: the signal to analyse.
    :param fs: sample rate of ``sig``.
    :param args: mapping providing ``"n_subwins"``, the number of
        sub-windows every frame is divided into.
    :param winlen: window length in seconds (multiplied by ``fs`` for framing).
    :param winstep: window step in seconds (multiplied by ``fs`` for framing).
    :returns: array with one entropy value per frame.
    """
    chnkd_sig = framesig(sig, winlen * fs, winstep * fs)
    n_subwins = args["n_subwins"]

    entropies = []
    for chnk in chnkd_sig:
        # Total energy uses the whole frame, before any trimming.
        tot_enrgy = np.sum(chnk**2)

        subwin_len = int(np.floor(len(chnk) / n_subwins))
        # Drop trailing samples that do not fill a complete sub-window.
        if len(chnk) != subwin_len * n_subwins:
            chnk = chnk[0:subwin_len * n_subwins]

        # Column-major reshape: each column is one contiguous sub-window.
        subwins = chnk.reshape(subwin_len, n_subwins, order='F').copy()

        # Energy of each sub-window relative to the total energy.
        subwin_enrgy = np.sum(subwins**2, axis=0) / float(tot_enrgy + eps)
        entropies.append(-np.sum(subwin_enrgy * np.log2(subwin_enrgy + eps)))
    return np.array(entropies)
def compute_lpcc(self, pad, save=False):
    """Compute order-12 LPC numerators per frame for ``self.wavpaths``.

    For every ``(path, key)`` entry in ``self.wavpaths`` the audio is loaded
    and framed (2200 samples, step 1100); the sample rate and frames are kept
    in ``self.sr[key]`` and ``self.frames[key]``. The numerator of
    ``lpc.kautocor(frame, 12)`` is collected per frame and the transposed
    matrix is passed to ``self.pad_sequence_into_array``, whose return value
    is stored unchanged in ``self.lpcc[key]``.

    NOTE(review): ``compute_rmfcc`` unpacks the return value of
    ``pad_sequence_into_array`` as a pair, while this method stores it whole
    -- confirm which is intended.

    :param pad: ``maxlen`` passed to ``self.pad_sequence_into_array``.
    :param save: when true, pickle ``self.lpcc`` to ``feat/lpcc.pkl``.
    """
    self.lpcc = {}
    self.sr = {}
    self.frames = {}
    for f in self.wavpaths:
        key = f[1]
        sig, self.sr[key] = librosa.load(f[0])
        self.frames[key] = sigproc.framesig(sig, frame_len=2200, frame_step=1100)
        arr = np.array([np.array(lpc.kautocor(frame, 12).numerator)
                        for frame in self.frames[key]])
        print(arr.shape)
        self.lpcc[key] = self.pad_sequence_into_array(arr.transpose(), maxlen=pad)
    if save:
        with open('feat/lpcc.pkl', 'wb') as out_file:
            pickle.dump(self.lpcc, out_file)
def specflux_xtr_func(sig, fs, args, winlen, winstep):
    """Spectral flux as sum of square differences - based on
    pyAudioAnalysis.audioFeatureExtraction library
    [github.com/tyiannak/pyAudioAnalysis]

    Each frame's spectrum is divided by its own sum and compared with the
    previous frame's; the first frame has no predecessor and gets a flux
    of 0.

    :param sig: the signal to analyse.
    :param fs: sample rate of ``sig``.
    :param args: not used by this function.
    :param winlen: window length in seconds (multiplied by ``fs`` for framing).
    :param winstep: window step in seconds (multiplied by ``fs`` for framing).
    :returns: array with one flux value per frame.
    """
    chnkd_sig = framesig(sig, winlen * fs, winstep * fs)

    fluxs = []
    # None marks "no previous frame yet"; comparing an array against an
    # empty list does not give a usable truth value.
    prev_spec, prev_spec_sum = None, None
    for chnk in chnkd_sig:
        spec = get_win_fft(chnk, winlen, fs)
        specsum = np.sum(spec + eps)
        if prev_spec is None:
            fluxs.append(0.)
        else:
            fluxs.append(
                np.sum((spec / specsum - prev_spec / prev_spec_sum)**2))
        prev_spec, prev_spec_sum = spec, specsum
    return np.array(fluxs)
def get_complex_spec(wav_, winstep, winlen, with_time_scaled=False):
    """Return the complex spectrum of an audio file's frames.

    :param wav_: path of the audio file, loaded with ``librosa.load`` at the
        module-level sample rate ``sr``.
    :param winstep: step between successive windows in seconds.
    :param winlen: analysis window length in seconds.
    :param with_time_scaled: also compute the spectrum of the frames
        multiplied by their sample index, and print both shapes.
    :returns: tuple ``(complex_spec, time_scaled_complex_spec)``; the second
        item is ``None`` unless ``with_time_scaled`` is set.
    """
    sig, rate = librosa.load(wav_, sr=sr)
    sig = preemphasis(sig, PREEMPH)
    frames = framesig(sig, winlen * rate, winstep * rate, HAMMING_WINFUNC)
    complex_spec = np.fft.rfft(frames, NFFT)

    time_scaled_complex_spec = None
    if with_time_scaled:
        # Weight every sample by its position inside the frame.
        time_scaled_frames = np.arange(frames.shape[-1]) * frames
        time_scaled_complex_spec = np.fft.rfft(time_scaled_frames, NFFT)
        # The shapes are only printed here, where both spectra exist;
        # otherwise the second one is None and has no shape.
        print(complex_spec.shape, time_scaled_complex_spec.shape)
    return complex_spec, time_scaled_complex_spec