def classification(musicFile, segLength, sr, breakInterval, resultFile, modelFile):
    # Divide the audio file into segments of breakInterval seconds and load them into data
    data = []
    audioDuration = get_duration(filename=musicFile) // 1.0
    numSegments = int(audioDuration // breakInterval)
    print(audioDuration)

    # Load data one break interval at a time
    for i in range(numSegments):
        st = time.time()
        offset = i * breakInterval
        y, srp = load(musicFile, sr=sr, duration=breakInterval, offset=offset,
                      res_type='kaiser_fast')
        for j in range(breakInterval):
            start = j * sr  # one second of samples per step
            yp = y[start:(start + sr)]
            D = np.mean(mfcc(yp, sr=sr, n_mfcc=40), axis=1)
            data.append(D)
        del y
        print(time.time() - st)

    # Load the remaining data
    offset = numSegments * breakInterval
    duration = audioDuration - offset
    y, srp = load(musicFile, sr=sr, duration=duration, offset=offset,
                  res_type='kaiser_fast')
    for i in range(int(duration)):
        start = i * sr
        yp = y[start:(start + sr)]
        D = np.mean(mfcc(yp, sr=sr, n_mfcc=40), axis=1)
        data.append(D)
    del y
    data = np.array(data)

    # Load the model and classify the file
    model = load_model(modelFile)
    result = np.argmax(model.predict(data), axis=1)

    # Save the result to resultFile
    f = open(resultFile, 'w')
    for i in range(result.shape[0]):
        st = str(result[i]) + '\n'
        f.write(st)
    f.close()
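# A minimal, self-contained sketch of the per-second feature extraction that
# classification() above relies on: 40 MFCCs averaged over each 1-second window.
# 'example.wav' is a hypothetical file name, not one from the original project.
import numpy as np
import librosa

y, sr = librosa.load('example.wav', sr=22050, res_type='kaiser_fast')
features = []
for second in range(len(y) // sr):
    window = y[second * sr:(second + 1) * sr]  # one second of audio
    features.append(np.mean(librosa.feature.mfcc(y=window, sr=sr, n_mfcc=40), axis=1))
features = np.array(features)                  # shape: (n_seconds, 40)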
def compute_librosa_features(self, audio_data, feat_name): """ Compute feature using librosa methods :param audio_data: signal :param feat_name: feature to compute :return: np array """ # if rmse_feat.shape == (1, 427): # rmse_feat = np.concatenate((rmse_feat, np.zeros((1, 4))), axis=1) if feat_name == 'zero_crossing_rate': return zero_crossing_rate(y=audio_data, hop_length=self.FRAME) elif feat_name == 'rmse': return rmse(y=audio_data, hop_length=self.FRAME) elif feat_name == 'mfcc': return mfcc(y=audio_data, sr=self.RATE, n_mfcc=13) elif feat_name == 'spectral_centroid': return spectral_centroid(y=audio_data, sr=self.RATE, hop_length=self.FRAME) elif feat_name == 'spectral_rolloff': return spectral_rolloff(y=audio_data, sr=self.RATE, hop_length=self.FRAME, roll_percent=0.90) elif feat_name == 'spectral_bandwidth': return spectral_bandwidth(y=audio_data, sr=self.RATE, hop_length=self.FRAME)
def extract(self, audio_file):
    y, sr = librosa.load(audio_file, sr=self.sr)
    D = mfcc(y, sr=self.sr, n_mfcc=self.mfcc_no + 2,
             n_fft=self.window_size, hop_length=self.hop_size)
    D = D[2:, :]
    feats = []
    timestamps = []
    current_time = 0
    for i in range(0, D.shape[1], self.step):
        d = D[:, i:i + self.w]
        d = d.transpose()
        if d.shape[0] == self.w:
            feats.append(d)
            timestamps.append(current_time)
        current_time = current_time + self.step * self.hop_size / float(self.sr)
    feats = np.array(feats)
    return feats, timestamps
def compute_librosa_features(self, audio_data, feat_name): """ Compute feature using librosa methods :param audio_data: signal :param feat_name: feature to compute :return: np array """ # # http://stackoverflow.com/questions/41896123/librosa-feature-tonnetz-ends-up-in-typeerror # chroma_cens_feat = chroma_cens(y=audio_data, sr=self.RATE, hop_length=self.FRAME) logging.info('=> Computing {}'.format(feat_name)) if feat_name == 'zero_crossing_rate': return zero_crossing_rate(y=audio_data, hop_length=self.FRAME) elif feat_name == 'rmse': return rms(y=audio_data, hop_length=self.FRAME) elif feat_name == 'mfcc': return mfcc(y=audio_data, sr=self.RATE, n_mfcc=13) elif feat_name == 'spectral_centroid': return spectral_centroid(y=audio_data, sr=self.RATE, hop_length=self.FRAME) elif feat_name == 'spectral_rolloff': return spectral_rolloff(y=audio_data, sr=self.RATE, hop_length=self.FRAME, roll_percent=0.90) elif feat_name == 'spectral_bandwidth': return spectral_bandwidth(y=audio_data, sr=self.RATE, hop_length=self.FRAME)
def extract_audio_features(root_dir, row):
    raw_data_dir = join(root_dir, RAW_DATA_DIR)
    row_dict = row.to_dict()
    waveform, _ = load(raw_data_dir + row_dict['filename'], sr=FEATURE_ARGS['sr'])
    row_dict['melspec'] = _clean_features(
        melspectrogram(waveform, n_mels=EXTRACTOR_ARGS['n_mels'], **FEATURE_ARGS))
    row_dict['mfcc'] = _clean_features(
        mfcc(waveform, n_mfcc=EXTRACTOR_ARGS['n_mfcc'], **FEATURE_ARGS))
    return row_dict
def get_features(filename, *, winlen, winstep, n_mcep, mcep_alpha, minf0, maxf0, type):
    wav, sr = load(filename, sr=None)
    # get f0
    x = wav.astype(float)
    _f0, t = world.harvest(x, sr, f0_floor=minf0, f0_ceil=maxf0,
                           frame_period=winstep * 1000)
    f0 = world.stonemask(x, _f0, t, sr)
    window_size = int(sr * winlen)
    hop_size = int(sr * winstep)
    # get mel
    if type == 'mcc':
        spec = world.cheaptrick(x, f0, t, sr, f0_floor=minf0)
        h = sptk.sp2mc(spec, n_mcep - 1, mcep_alpha).T
    else:
        h = mfcc(x, sr, n_mfcc=n_mcep, n_fft=window_size, hop_length=hop_size)
    h = np.vstack((h, f0))
    maxlen = len(x) // hop_size + 2
    h = repeat_last_padding(h, maxlen)
    id = os.path.basename(filename).replace(".wav", "")
    return (id, x, h)
def apply(self, data):
    all_ceps = []
    for ch in data:
        ceps, mspec, spec = mfcc(ch)
        all_ceps.append(ceps.ravel())
    return np.array(all_ceps)
def findTimbral(wave):
    # 19 dimensions
    timbral_feature = {}

    centroid = feature.spectral_centroid(wave)
    timbral_feature['mu_centroid'] = np.mean(centroid)
    timbral_feature['var_centroid'] = np.var(centroid, ddof=1)

    rolloff = feature.spectral_rolloff(wave)
    timbral_feature['mu_rolloff'] = np.mean(rolloff)
    timbral_feature['var_rolloff'] = np.var(rolloff, ddof=1)

    flux = onset_strength(wave, lag=1)  # spectral flux
    timbral_feature['mu_flux'] = np.mean(flux)
    timbral_feature['var_flux'] = np.var(flux, ddof=1)

    zero_crossing = feature.zero_crossing_rate(wave)
    timbral_feature['mu_zcr'] = np.mean(zero_crossing)
    timbral_feature['var_zcr'] = np.var(zero_crossing)

    five_mfcc = feature.mfcc(wave, n_mfcc=10)  # n_mfcc = 10 dim
    i = 1
    for coef in five_mfcc:
        timbral_feature['mu_mfcc' + str(i)] = np.mean(coef)
        timbral_feature['var_mfcc' + str(i)] = np.var(coef, ddof=1)
        i = i + 1

    percent = feature_low_energy(wave)  # 1 dim
    timbral_feature['low_energy'] = percent
    return timbral_feature
def process_signal(self, signal):
    ft = np.abs(stft(signal, n_fft=self.window_size,
                     hop_length=self.window_stride, window='hann'))
    mel = melspectrogram(sr=self.sample_rate, S=ft)
    # note: librosa's mfcc() documents S as a log-power mel spectrogram;
    # `mel` is passed here in linear scale, as in the original code
    mfccs = mfcc(sr=self.sample_rate, n_mfcc=self.num_mfccs, S=mel)
    deltas = delta(mfccs)
    delta_deltas = delta(mfccs, order=2)
    return mfccs, deltas, delta_deltas
def ConvertAudioToInputArray(audio, sr, num_mfccs, numcontext):
    # Compute the MFCC features of the audio, shaped [n_frames, num_mfccs]
    mfcc_a = mfcc(y=audio, sr=sr, n_mfcc=num_mfccs).T

    # BiRNN stride = 2
    mfcc_a = mfcc_a[::2]  # one stride per timestep in the input
    num_strides = len(mfcc_a)

    # add empty initial and final contexts
    empty_context = np.zeros((numcontext, num_mfccs), dtype=mfcc_a.dtype)
    mfcc_a = np.concatenate((empty_context, mfcc_a, empty_context))

    # create a view into the array with overlapping strides of size
    # numcontext (past) + 1 (present) + numcontext (future)
    window_size = 2 * numcontext + 1
    train_inputs = np.lib.stride_tricks.as_strided(
        mfcc_a,
        (num_strides, window_size, num_mfccs),
        (mfcc_a.strides[0], mfcc_a.strides[0], mfcc_a.strides[1]),
        writeable=False)

    # Flatten the second and third dimensions
    train_inputs = np.reshape(train_inputs, [num_strides, -1])

    # Whiten inputs
    # copy the strided array so we can write to it safely
    train_inputs = np.copy(train_inputs)
    train_inputs = (train_inputs - np.mean(train_inputs)) / np.std(train_inputs)

    # Return the training data
    return train_inputs
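# A hedged, self-contained sketch of the context-window idea used above: each
# timestep is flanked by `numcontext` past and future frames. This builds the
# same windows with plain indexing on a toy array, which is easier to verify
# than the as_strided view; the toy sizes below are illustrative only.
import numpy as np

num_mfccs, numcontext = 3, 2
frames = np.arange(10 * num_mfccs, dtype=float).reshape(10, num_mfccs)

pad = np.zeros((numcontext, num_mfccs))
padded = np.concatenate((pad, frames, pad))  # empty initial and final contexts
window_size = 2 * numcontext + 1

windows = np.stack([padded[t:t + window_size] for t in range(len(frames))])
print(windows.shape)  # (10, 5, 3): timesteps x window x coefficients
flat = windows.reshape(len(frames), -1)
print(flat.shape)     # (10, 15): flattened network input per timestep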
def predict_2d(data, fs, interval):
    X = np.zeros((1, 20, int(16 * interval), 1))  # length of mfcc from training
    X[0, :, :, 0] = mfcc(data, fs)
    return classifier.predict(X).argmax(1)
def load_training_file(training_data_file_name: str, ret_length=False):
    """
    loads training data from the given file name

    Parameters
    ----------
    training_data_file_name: str
        the name of the data file

    Returns
    -------
    (classname, mfcc): tuple
        the class and data will be returned
    """
    file_data, samplerate = sf.read(training_data_file_name)
    length = len(file_data)
    mfcc = feature.mfcc(file_data, sr=samplerate, n_mfcc=25, hop_length=512, n_fft=2048)
    mfcc = reshape(mfcc, 20)
    if ret_length:
        return (training_data_file_name, mfcc), length
    else:
        return (training_data_file_name, mfcc)
def vowel_predicting(self, model_path):
    model = models.load_model(model_path)
    data = self.data
    data = mfcc(data, self.fs)
    data = np.expand_dims(data, 0)
    data = np.expand_dims(data, 3)
    predictions_single = model.predict(data)
    predictions = predictions_single[0]
    print(predictions)
    t = 0.5
    if predictions[0] > t:
        letter = 'A'
    elif predictions[1] > t:
        letter = 'E'
    elif predictions[2] > t:
        letter = 'I'
    elif predictions[3] > t:
        letter = 'O'
    elif predictions[4] > t:
        letter = 'U'
    elif predictions[5] > t:
        letter = 'Y'
    else:
        letter = '----'
    return letter
def get_perform_mfcc(self, outside_series=None, outside_sr=None):
    y = self.select_series(outside_series)
    sr = self.select_sr(outside_sr)
    mfccs = mfcc(y, sr=sr)
    return scale(mfccs, axis=1)
def get_seq_size(self, frames, sr):
    """
    Get audio sequence size of audio time series when converted to
    mfcc-features or mel spectrogram

    :param frames: audio time series
    :param sr: sampling rate of frames
    :return: sequence size of mfcc-converted audio
    """
    if self.type == 'mfcc':
        mfcc_frames = mfcc(frames, sr, n_fft=self.frame_length,
                           hop_length=self.hop_length,
                           n_mfcc=self.mfcc_features, n_mels=self.n_mels)
        return mfcc_frames.shape[1]
    elif self.type == 'spectrogram':
        spectrogram = melspectrogram(frames, sr, n_fft=self.frame_length,
                                     hop_length=self.hop_length, n_mels=self.n_mels)
        return spectrogram.shape[1]
    else:
        raise ValueError('Not a valid feature type: ', self.type)
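# A hedged sketch of what the sequence size above works out to: with librosa's
# default centred framing, mfcc()/melspectrogram() return 1 + len(y) // hop_length
# frames. The signal below is synthetic noise used purely as a stand-in.
import numpy as np
import librosa

sr, frame_length, hop_length = 22050, 2048, 512
y = np.random.randn(3 * sr).astype(np.float32)  # 3 seconds of noise

m = librosa.feature.mfcc(y=y, sr=sr, n_fft=frame_length, hop_length=hop_length, n_mfcc=13)
print(m.shape[1], 1 + len(y) // hop_length)     # both frame counts should agree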
def extract_features(audio, rate):
    audio = reduce_noise_power(audio, rate)
    audio, indexes = trim(audio)
    mfcc_feature = mfcc(y=audio, sr=rate, n_mfcc=13, n_fft=int(0.025 * rate),
                        n_mels=40, fmin=20, hop_length=int(0.03 * rate))
    mfcc_feature = preprocessing.scale(mfcc_feature, axis=1)
    mfcc_feature = stats.zscore(mfcc_feature)
    pitches, magnitudes = pitch(y=audio, sr=rate, fmin=50, fmax=400,
                                n_fft=int(0.025 * rate), hop_length=int(0.03 * rate))
    # delta_f = delta(mfcc_feature)
    # d_delta_f = delta(mfcc_feature, order=2)
    combined = np.hstack((np.transpose(mfcc_feature), np.transpose(pitches)))
    return combined
def split_audio(wav_path):
    print('splitting audios...')
    dst = os.path.join(wav_path.split('/')[0], 'info')
    with open(os.devnull, 'w') as ffmpeg_log:
        command = ('ffmpeg -i ' + wav_path +
                   ' -f segment -segment_time 1 -c copy ' + os.path.join(dst, '%02d.wav'))
        subprocess.call(command, shell=True, stdout=ffmpeg_log, stderr=ffmpeg_log)
    os.remove(wav_path)

    output = np.zeros((20, 0))
    for segment in os.listdir(dst):
        segment = os.path.join(dst, segment)
        sample_rate, audio_info = wavfile.read(segment)
        audio_length = audio_info.shape[0]
        if audio_length <= 16000:
            audio_info = np.pad(audio_info, (0, 16000 - audio_length),
                                'constant', constant_values=0)
        else:
            audio_info = audio_info[0:16000]
        audio_info = audio_info.astype(np.float32)
        mfcc_feats = mfcc(audio_info, sr=sample_rate)
        # print(mfcc_feats.shape)
        output = np.concatenate((output, mfcc_feats), axis=1)
    # print(output.shape)

    for file in os.listdir(dst):
        if file.endswith('.wav'):
            os.remove(os.path.join(dst, file))
    return output.T
def process_audio(audio_data, sr):
    """
    Computes the Mel-Frequency Cepstral Coefficients and their first and second
    order derivatives. Concatenates them all into a single numpy array and then
    swaps the axes from [n_mfcc, n_samples] to [n_samples, n_mfcc].

    :param audio_data: floating point time series of an audio file
    :param sr: the sample rate at which train_data was loaded
    :return: a feature array of dimension [n_samples, n_mfcc] containing the
             computed MFCCs and their time derivatives
    """
    mel_freq_coeff = mfcc(y=audio_data, sr=sr, n_mfcc=13,
                          hop_length=int(.10 * sr), n_fft=int(.20 * sr))
    mel_freq_coeff = mel_freq_coeff[1:, :]

    mel_freq_coeff_delta = delta(mel_freq_coeff, width=7)
    mel_freq_coeff_delta_delta = delta(mel_freq_coeff, width=7, order=2)

    features = concatenate(
        (mel_freq_coeff, mel_freq_coeff_delta, mel_freq_coeff_delta_delta),
        axis=0)
    features = swapaxes(features, 0, 1)
    return features
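# A hedged, self-contained sketch of the MFCC + delta + delta-delta stacking
# performed by process_audio() above, using librosa directly. 'speech.wav' is
# an illustrative file name; the 13 coefficients, 200 ms window, 100 ms hop,
# and dropped first coefficient mirror the parameters above.
import numpy as np
import librosa

y, sr = librosa.load('speech.wav', sr=None)
m = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13,
                         hop_length=int(.10 * sr), n_fft=int(.20 * sr))[1:, :]  # drop c0
d1 = librosa.feature.delta(m, width=7)            # first-order time derivative
d2 = librosa.feature.delta(m, width=7, order=2)   # second-order time derivative
features = np.concatenate((m, d1, d2), axis=0).T  # shape: (n_frames, 36)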
def test_mfcc(self):
    correct = rosaft.mfcc(y=self.sig, sr=self.fs, n_fft=nfft, hop_length=stepsize)
    actual = mfcc(self.args)
    self.assertTrue(np.abs(correct - actual).max() < tol)
def mel_to_mfcc(x):
    mfcc_wav = mfcc(S=power_to_db(x), n_mfcc=13, sr=sr, n_fft=n_fft, hop_length=hop_length)
    mfcc_wav = mfcc_wav.reshape(mfcc_wav.shape[0], mfcc_wav.shape[1], 1)
    return mfcc_wav
def create_ceps(fn):
    sample_rate, X = scipy.io.wavfile.read(fn)
    Y = X * 1.0
    # ceps, mspec, spec = mfcc(Y)
    ceps = mfcc(Y)
    write_ceps(ceps, fn)
def get_feature(fs, signal): """ :param fs :param signal :return feature:mfcc """ feature = mfcc(signal, fs, S=None, n_mfcc=20).T return feature
def compute_MFCC(data, sample_rate, num_coefs=20):
    """
    small set of features (usually about 10–20) which concisely describe the
    overall shape of a spectral envelope

    :param data: np array audiodata
    :param sample_rate:
    :param num_coefs: number of features to generate, default 20
    :return: np array (2D) that contains the key features of an audio file
    """
    return feature.mfcc(data, sr=sample_rate, n_mfcc=num_coefs)
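# Hedged usage sketch for compute_MFCC() above. 'clip.wav' is an illustrative
# path; it assumes the surrounding module imports librosa's feature submodule
# under the name `feature`, as the function expects.
import librosa
from librosa import feature  # name expected by compute_MFCC() at module level

data, sample_rate = librosa.load('clip.wav', sr=None)
coeffs = compute_MFCC(data, sample_rate, num_coefs=20)
print(coeffs.shape)  # (20, n_frames): one row per coefficient, one column per frame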
def get_feature(fs, signal): """ 简易的提取特征函数 :param fs: 采样率 :param signal: 信号 :return feature:mfcc特征 """ feature = mfcc(signal, fs, S=None, n_mfcc=20).T return feature
def mfccCoefficients(sample):
    '''
    Determines the average value of each mfcc coefficient for each window.
    '''
    mels = np.mean(
        mfcc(y=np.array([float(e) for e in sample]), sr=len(sample), n_mfcc=128).T,
        axis=0)
    return mels
def compute_mfccs(self, x):
    new_sample = x.astype(float)
    mfccs = mfcc(new_sample, 16000, n_mfcc=13, n_fft=640, hop_length=320)
    grad_mfccs = np.gradient(mfccs, axis=1)
    mfccs = np.concatenate((mfccs, grad_mfccs))
    mfccs = np.concatenate((mfccs, np.gradient(grad_mfccs, axis=1)))
    mfccs = torch.from_numpy(mfccs)
    mfccs = mfccs.type(torch.FloatTensor)
    return mfccs
def extract_features(example_file):
    soundfile, samplerate = sf.read(example_file)
    return mfcc(y=soundfile, sr=samplerate, S=None, n_mfcc=13,
                dct_type=2, n_fft=1024, hop_length=64).T
def frequency_feature(self):
    data = np.array(self.sum_all())
    data = np.transpose(data, (1, 0))
    data_mfccs = []
    for i in data:
        sig = i / max(abs(i))
        data_mfccs.append(mfcc(sig, sr=10, n_mfcc=2, hop_length=10))
    data_mfccs = np.array(data_mfccs)
    # data_mfccs = np.transpose(data_mfccs, (0, 2, 1))
    return data_mfccs
def extract_features(keystroke, sr=44100, n_mfcc=16, n_fft=441, hop_len=110):
    """Return an MFCC-based feature vector for a given keystroke."""
    spec = mfcc(
        y=keystroke.astype(float),
        sr=sr,
        n_mfcc=n_mfcc,
        n_fft=n_fft,         # n_fft=441 for a 10 ms window at 44.1 kHz
        hop_length=hop_len,  # hop_length=110 for ~2.5 ms
    )
    return spec.flatten()
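# Hedged usage sketch for extract_features() above, using 100 ms of noise as a
# stand-in keystroke (no real keystroke data is assumed). With the defaults
# sr=44100, n_fft=441 (~10 ms) and hop_len=110 (~2.5 ms), the flattened vector
# has length n_mfcc * n_frames.
import numpy as np

keystroke = np.random.randn(4410)  # ~100 ms of synthetic audio at 44.1 kHz
vec = extract_features(keystroke)
print(vec.shape)                   # (n_mfcc * n_frames,), e.g. 16 * 41 = 656 here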
def extract_mfcc(full_audio_path):
    wave, sample_rate = load(full_audio_path)
    n_fft = int(sample_rate * 0.03)
    hop_length = n_fft // 2
    mfcc_features = mfcc(wave, sr=sample_rate, n_mfcc=50,
                         hop_length=hop_length, n_fft=n_fft).T
    return mfcc_features
def loadFile(self, fname):
    '''
    fname: filename of the sound file we want to load
    '''
    if self.verbose:
        print('Loading %s' % fname)
    if self.cached:
        if not os.path.exists(fname + '-mfcc.npy'):
            y, sr = librosa.load(fname)
            data = mfcc(y=y, sr=sr).T
            np.save(fname + '-mfcc.npy', data)
        else:
            data = np.load(fname + '-mfcc.npy')
    else:
        y, sr = librosa.load(fname)
        # TODO: Add ability to filter by seconds/duration
        # seconds = y.size / sr
        data = mfcc(y=y, sr=sr).T
    return data
def extract_features(data, n_fft=2048):
    res = []
    for row in data:
        centroid = mfcc(row, n_fft=n_fft, sr=22050)
        res.append([
            np.min(centroid),
            np.max(centroid),
            np.median(centroid),
        ])
    return np.array(res)
def compute_spectral_signature(song_id, cached=True, use_covar=True):
    if cached and is_signature_cached(song_id):
        return fetch_signature(song_id)

    audioclip_path = join(AUDIOCLIPS_FOLDER, "{0}.mp3".format(song_id))
    waveform, sample_rate, frame_length, frames = None, None, None, None
    try:
        waveform, sample_rate = load(audioclip_path, sr=SAMPLE_RATE)
        frame_length = core.time_to_samples(np.arange(0, 2, FRAME_TIMESTEP), sr=sample_rate)[1]
        frames = librosa_util.frame(y=waveform, frame_length=frame_length, hop_length=frame_length)
    except Exception as e:
        logging.warn("Couldn't preprocess audioclip '{0}': {1}".format(audioclip_path, str(e)))
        return None

    # The 'frames' array has shape (<frame_length>, <number_of_frames>),
    # hence we transpose it. This holds true for every call to the librosa
    # library that returns an array.
    frames = frames.T

    spectrograms = []
    for frame in frames[FRAME_START: FRAME_START + FRAME_TOTAL]:
        spectrogram = feature.mfcc(y=frame, sr=frame_length).T
        to_add = [entry[MFCSS_OFFSET: MFCSS_OFFSET + N_MFCCS] for entry in spectrogram]
        spectrograms += to_add
    spectrograms = np.array(spectrograms)

    clusters = KMeans(n_clusters=CLUSTERS_PER_SIGNATURE)
    model = clusters.fit(spectrograms)

    # A song's "signature" is an array [(u_i, s_i, w_i), ...], where
    # 0 <= i < CLUSTERS_PER_SIGNATURE. The triple (u_i, s_i, w_i) contains:
    #   u_i : Mean for Cluster i
    #   s_i : Covariance for Cluster i
    #   w_i : Weight for Cluster i
    signature = []
    for label in xrange(CLUSTERS_PER_SIGNATURE):
        indexes = [index for index, element in enumerate(model.labels_) if element == label]
        cluster_points = [spectrograms[i] for i in indexes]
        mean = model.cluster_centers_[label]
        covariance = np.cov(cluster_points) if use_covar else []
        weight = len(cluster_points)
        cluster_params = (mean, covariance, weight)
        signature.append(cluster_params)

    persist_signature(song_id, signature)
    return signature
def mfcc(path):
    # Make and display a mel-scaled power (energy-squared) spectrogram.
    # We use a small hop length of 64 here so that the frames line up with
    # the beat tracker example below.
    y, sr = load_files(path)
    print 'calculating mfcc ' + path
    S = feature.melspectrogram(y, sr=sr, n_fft=2048, hop_length=64, n_mels=128)

    # Convert to log scale (dB). We'll use the peak power as reference.
    log_S = logamplitude(S, ref_power=np.max)
    mfcc_v = feature.mfcc(S=log_S, n_mfcc=14)
    return np.sum(mfcc_v, axis=1) / mfcc_v.shape[1]
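# Note: logamplitude() was removed from librosa (0.6+); power_to_db() is its
# replacement. A hedged sketch of the same mel -> log -> MFCC pipeline using
# the current API; 'track.wav' is an illustrative path, not one from the
# original project.
import numpy as np
import librosa

y, sr = librosa.load('track.wav')
S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=64, n_mels=128)
log_S = librosa.power_to_db(S, ref=np.max)      # replaces logamplitude(S, ref_power=np.max)
mfcc_v = librosa.feature.mfcc(S=log_S, n_mfcc=14)
avg = np.sum(mfcc_v, axis=1) / mfcc_v.shape[1]  # mean of each coefficient over time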
def runMFCC(signal, sample_rate=22050):
    return mfcc(y=np.asarray(signal), sr=sample_rate, n_mfcc=10)
soundPath = '/Users/jiusi/Desktop/audioSamples/Rec_003.wav'
forrest = '/Users/jiusi/dares_g1.1/dares_g1/left/forrest_1.wav'
livingRoom = '/Users/jiusi/dares_g1.1/dares_g1/left/living_room_1.wav'
study = '/Users/jiusi/dares_g1.1/dares_g1/left/study_1.wav'
street = '/Users/jiusi/Desktop/busy_street_1.wav'
sub = '/Users/jiusi/Desktop/sub_0.m4a'
quite_smarti = '/Users/jiusi/Desktop/quiet_smarti.wav'
quite_iphone = '/Users/jiusi/Desktop/quiet_iphone.m4a'

rate, sig = ud.getDataFromPath(quite_smarti)

mfccValue = mfcc(y=sig, sr=rate, n_mfcc=13)
delta_mfcc = librosa.feature.delta(mfccValue)
delta2_mfcc = librosa.feature.delta(mfccValue, order=2)

# plt.plot()
#
# fig = plt.figure()
# signalSub = fig.addSubPlot()
# signalSub.plot(range(0, len(obs)), obs)
# mfccSub = fig.addSubPlot()
# mfccSub.plot(range(0, len(mfccValue)), mfccValue)
#
# # How do they look? We'll show each in its own subplot
# plt.figure(figsize=(12, 6))