def melspectrogram(self):
    # dim, mels, fmax_ and poww are module-level configuration values.
    print(self.directory)
    # Initial n_fft / hop_length guesses come from a linear fit against the
    # signal length; the loop below nudges them until exactly `dim` frames result.
    fft = int((2.99936669e-02 * self.data.shape[0]) + 1.42217233e+02)
    hop = int((0.01516035 * self.data.shape[0]) - 2.14982993)
    S = melspectrogram(self.data, sr=self.rate, n_mels=mels, hop_length=hop,
                       center=False, n_fft=fft, norm=1, fmax=fmax_, power=poww)
    while S.shape[1] != dim:
        if S.shape[1] > dim:
            fft += 2  # longer window -> fewer frames
        else:
            hop -= 1  # smaller hop -> more frames
        S = melspectrogram(self.data, sr=self.rate, n_mels=mels, hop_length=hop,
                           n_fft=fft, center=False, norm=1, fmax=fmax_, power=poww)
    # Rescale to [0, 255].
    S = 255 * (S - S.min()) / (S.max() - S.min())
    return S
def transform_audio(self, y):
    '''Compute the PCEN of the (log-) Mel Spectrogram

    Parameters
    ----------
    y : np.ndarray
        The audio buffer

    Returns
    -------
    data : dict
        data['mag'] : np.ndarray, shape = (n_frames, n_bins)
            The PCEN magnitude
    '''
    # Extract the proper output shape from an un-padded pass.
    S_test = melspectrogram(y=y, sr=self.sr, hop_length=self.hop_length,
                            n_fft=self.n_fft, n_mels=self.n_mels)
    P_test = pcen(S_test, sr=self.sr, hop_length=self.hop_length, time_constant=1)
    n_frames = P_test.shape[1]

    # Double the audio with a reverse pad to prevent the zero
    # initial-energy assumption.
    y = np.concatenate((y[::-1], y))
    S = melspectrogram(y=y, sr=self.sr, hop_length=self.hop_length,
                       n_fft=self.n_fft, n_mels=self.n_mels)
    if self.log:
        S = amplitude_to_db(S, ref=np.max)

    t_base = self.hop_length / self.sr  # tau, or hop length in time
    t_constants = t_base * np.array([2**i for i in range(self.n_t_constants)])

    pcen_layers = []
    for T in t_constants:
        P = pcen(S, sr=self.sr, hop_length=self.hop_length, time_constant=T)
        # source of off-by-one error:
        P = P[:, P.shape[1] - n_frames + 1:]  # remove padded section
        P = to_dtype(P, self.dtype)
        pcen_layers.append(P)
    pcen_layers = to_dtype(np.asarray(pcen_layers), self.dtype)

    # copied from the mel spectrogram pump feature extractor
    return {'mag': self._index(pcen_layers)}
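# A minimal standalone sketch (not part of the class above) of the reverse-pad
# trick used in transform_audio: PCEN's smoothing filter assumes zero initial
# energy, so prepending the time-reversed signal gives it a realistic warm-up.
# The noise signal and parameter values here are illustrative assumptions.
import numpy as np
import librosa

sr, hop_length = 22050, 512
y = np.random.randn(sr).astype(np.float32)  # one second of noise

S_plain = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=hop_length)
n_frames = S_plain.shape[1]

y_padded = np.concatenate((y[::-1], y))           # reflect the whole signal
S = librosa.feature.melspectrogram(y=y_padded, sr=sr, hop_length=hop_length)
P = librosa.pcen(S, sr=sr, hop_length=hop_length)
P = P[:, P.shape[1] - n_frames + 1:]              # drop the padded frames, as above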
class ExtractMonoAudioFiles(FeaturesExtractor):
    # Static vars, to be set manually.
    # The native sample rate of the examples here, though it could be lowered.
    sr = 44100
    nblabels = 88
    batchsize = 1000
    featurefunc = lambda y, sr: lrft.melspectrogram(y, sr).T
    inpath = '../simple-wdir'
    # For the feeder.
    featuremutation = lambda y, sr: lrft.melspectrogram(y, sr).T

    @staticmethod
    def labelmutation(pitch, nbsamples):
        labelvect = np.zeros(shape=(nbsamples, ExtractMonoAudioFiles.nblabels))
        labelvect[:, int(pitch)] = np.ones(nbsamples)
        return labelvect

    def __init__(self, inpath=None):
        self.inpath = inpath if inpath is not None else ExtractMonoAudioFiles.inpath
        super().__init__(self.inpath,
                         ExtractMonoAudioFiles.computefeatureddata,
                         ExtractMonoAudioFiles.extractsamplesmetadata)

    @staticmethod
    def extractsamplesmetadata(path, filelist):
        samplesmetadata = []
        for f in filelist:
            with open(join(path, f), 'r') as fs:
                # Get the onset and offset plus the MIDI pitch.
                onset, offset, midipitch = tuple(
                    map(float, fs.readlines()[1][:-1].split('\t')))
            # First element is a tuple of wav path, onset and duration;
            # second is the MIDI pitch rescaled into the [0, 87] range.
            samplesmetadata.append(
                ((f[:-3] + 'wav', onset, offset - onset), int(midipitch - 21)))
        return samplesmetadata

    @staticmethod
    def computefeatureddata(path, samplemetadata):
        meta, pitch = samplemetadata
        audiodat = lrco.load(join(path, meta[0]), sr=ExtractMonoAudioFiles.sr,
                             offset=meta[1], duration=meta[2])
        audiodat = ExtractMonoAudioFiles.featurefunc(*audiodat)
        pitchvect = np.array([pitch] * audiodat.shape[0])
        return (audiodat, pitchvect)
def logmel(self):
    from librosa.feature import melspectrogram
    from librosa.core import load

    logmel_params = self.config['logmel_params']
    sr = logmel_params['sr']
    n_fft = logmel_params['n_fft']
    hop_length = logmel_params['hop_length']
    n_mels = logmel_params['n_mels']

    feature_path = os.path.join(
        self.dataset['feature_path'],
        'logmel_{}_{}_{}_{}'.format(sr, n_fft, hop_length, n_mels))
    if not os.path.exists(feature_path):
        os.mkdir(feature_path)

    x_train, y_train, f_train = [], [], []
    for i, row in self.dataset.train_data.iterrows():
        print('[Train] {}) Getting logmels from {}...'.format(
            i, row['cur_name']), end='')
        wav_name = os.path.join(self.dataset['data_path'], row['cur_name'])
        wav_data, sr = load(wav_name, sr=sr)
        x_train.append(melspectrogram(wav_data, sr=sr, n_fft=n_fft,
                                      hop_length=hop_length, n_mels=n_mels))
        y_train.append(self._build_multilabel(row))
        f_train.append(row['cur_name'])
        print('done.')

    x_test, y_test, f_test = [], [], []
    for i, row in self.dataset.test_data.iterrows():
        print('[Test] {}) Getting logmels from {}...'.format(
            i, row['cur_name']), end='')
        wav_name = os.path.join(self.dataset['data_path'], row['cur_name'])
        wav_data, sr = load(wav_name, sr=sr)
        x_test.append(melspectrogram(wav_data, sr=sr, n_fft=n_fft,
                                     hop_length=hop_length, n_mels=n_mels))
        y_test.append(self._build_multilabel(row))
        f_test.append(row['cur_name'])
        print('done.')

    self._save_pickles(feature_path, x_train, y_train, f_train,
                       x_test, y_test, f_test)
def get_mult_feature(wave_name, window):
    # scipy's wav.read returns integer PCM; convert to float for librosa.
    (rate, sig) = wav.read(wave_name)
    sig = sig.astype(numpy.float32)

    # Magnitude STFT, min-max normalised.
    power = numpy.abs(librosa.stft(sig, n_fft=window, hop_length=window // 2))
    _min = numpy.min(power)
    _max = numpy.max(power)
    power = (power - _min) / (_max - _min)
    print(power.shape)

    # Mel spectrogram with the same frame layout, min-max normalised.
    mels = feature.melspectrogram(sig, rate, n_fft=window,
                                  hop_length=window // 2, n_mels=257)
    _min = numpy.min(mels)
    _max = numpy.max(mels)
    mels = (mels - _min) / (_max - _min)
    print(mels.shape)

    # Pad or crop both features to exactly 250 frames.
    if power.shape[1] < 250:
        zero = numpy.zeros((power.shape[0], 250 - power.shape[1]))
        power = numpy.hstack((power, zero))
        mels = numpy.hstack((mels, zero))
    else:
        power = power[:, 0:250]
        mels = mels[:, 0:250]

    power = power.T
    mels = mels.T
    feat = numpy.asarray([power, mels])
    return feat
def wav_to_mel_util(x):
    # sr, n_fft, hop_length and window are module-level settings.
    mel_spec = melspectrogram(x, sr=sr, n_fft=n_fft,
                              hop_length=hop_length, window=window)
    return mel_spec
def process(src_fname_path, dest_fname_path):
    if not src_fname_path.endswith('.mp3'):
        return
    try:
        # load and process audio
        audio, sr = librosa.load(src_fname_path)
        # audio = lowpass(audio, cutoff=3000, sample_freq=sr)
        # spec = librosa.stft(np.asfortranarray(audio))
        spec = melspectrogram(audio, sr)
        spec_db = librosa.amplitude_to_db(np.abs(spec))

        # generate plot
        scale = 1
        fig = plt.figure(figsize=(1.28 * scale, 0.64 * scale))  # 128x64
        plt.box(False)
        plt.subplots_adjust(left=0, right=1, bottom=0, wspace=0, hspace=0)
        librosa.display.specshow(spec_db, sr=sr, cmap='gray_r',
                                 x_axis='time', y_axis='log')
        fig.savefig(dest_fname_path, bbox_inches=None, pad_inches=0)
        plt.close()
        print('{0} -> {1}'.format(src_fname_path, dest_fname_path))
    except Exception as e:
        print('processing {0}: {1}'.format(src_fname_path, e))
def model_predict():
    """Prediction of the classes 'open' and 'close'."""
    # Create a numpy array of audio data
    signal = np.frombuffer(stream.read(config.new_len, exception_on_overflow=False),
                           dtype=np.float32)
    # signal_merged = np.append(config.signal_old, signal)
    # config.signal_old = signal_merged[signal_len:]
    signal_merged = signal

    # Using random signal from folder 'clean' (turned off)
    # signal_merged = evaluation_using_random()

    mel = melspectrogram(y=signal_merged, sr=config.sample_rate,
                         n_mels=config.n_mels, n_fft=config.n_fft,
                         hop_length=config.hop_length, window=config.window)
    X = librosa.amplitude_to_db(mel)
    X = X.reshape(1, X.shape[0], X.shape[1], 1)

    # Prediction
    prediction = model.predict([X])

    # start_time = time.time()
    # Show the prediction, signal and mel
    signal_visualisation(prediction, mel, signal_merged)
    # print(str((time.time() - start_time)*1000)+'ms')

    return prediction[0][0], prediction[0][1]
def process_individual_file(input_file_uri, index):
    """
    Processes one input file and saves it in the format used for training.

    Inputs:
        input_file_uri : URI of the data file to be used during training.
    Returns:
        dict with output_file_uri, num_of_mel_frames and
        length_of_time_scaled_audio.
    """
    data = librosa_import(input_file_uri)[0]
    mfcc = melspectrogram(data, n_fft=params.nFft, hop_length=params.hop_size,
                          n_mels=params.num_mels).T
    mfcc_shape = mfcc.shape
    num_elem_in_mfcc = mfcc_shape[0] * mfcc_shape[1]

    # Pad the audio so its length is an exact multiple of the feature size.
    pad_val = params.scale_factor * num_elem_in_mfcc - data.shape[0]
    data = np.pad(data, [0, pad_val], mode="constant")
    assert data.shape[0] % num_elem_in_mfcc == 0

    output_file_uri = os.path.join(training_data_folder, "sample_{}".format(index))
    postprocess_data(data, mfcc, output_file_uri)
    return {"output_file_uri": output_file_uri,
            "num_of_mel_frames": mfcc_shape[0],
            "length_of_time_scaled_audio": len(data)}
def compute_spectrogram(self):
    # Compute the spectrogram and convert to dB
    if not hasattr(self, "spectrogram"):
        self.spectrogram = melspectrogram(self.waveform, self.sampling_rate)
        self.spectrogram_db = power_to_db(self.spectrogram, ref=np.max)
        print("... Computed spectrogram")
def process_signal(self, signal):
    # Magnitude STFT. Note that melspectrogram interprets S with its default
    # power=2.0, and librosa.feature.mfcc documents S as a log-power mel
    # spectrogram; passing the linear mel here is a quirk kept from the
    # original code.
    ft = np.abs(stft(signal, n_fft=self.window_size,
                     hop_length=self.window_stride, window='hann'))
    mel = melspectrogram(sr=self.sample_rate, S=ft)
    mfccs = mfcc(sr=self.sample_rate, n_mfcc=self.num_mfccs, S=mel)
    deltas = delta(mfccs)
    delta_deltas = delta(mfccs, order=2)
    return mfccs, deltas, delta_deltas
def __data_generation(self, list_IDs_temp):
    # Generates data containing batch_size samples
    # X : (n_samples, *dim, n_channels)
    # Initialization
    X = np.empty((self.batch_size, *self.dim, self.n_channels))
    Y = np.empty((self.batch_size, self.n_classes), dtype=bool)  # np.bool is deprecated

    # Generate data
    for i, row in list_IDs_temp.iterrows():
        if row.path not in self.audio.keys():
            # Lazily load and cache the mel spectrogram for this track.
            aud, fs = load(row.path)
            coefs = melspectrogram(aud, sr=fs, n_fft=2**12, hop_length=2**11,
                                   n_mels=64, fmax=10000)
            self.audio[row.path] = coefs
            # We've loaded one more track, add it to the counter.
            self.pbar.update(1)

        # Take a random window from the cached spectrogram.
        start_ind = np.random.randint(low=0,
                                      high=self.audio[row.path].shape[1] - self.window)
        clip = self.audio[row.path][:, start_ind:start_ind + self.window]
        X[i, :, :, 0] = clip
        Y[i, :] = row.iloc[2:-1].values.astype(np.int64)
    return X, Y
def wave2spec(wave, fs, frame_period, window, nperseg=None, nmels=80,
              preemphasis_coef=None, f_min=0, f_max=None, dtype='float32',
              return_t=False):
    stft_kwargs = make_stft_args(frame_period, fs, nperseg=nperseg, window=window)
    htk, norm = True, "slaney"

    if preemphasis_coef is not None:
        spec_wave = preemphasis(wave, preemphasis_coef)
    else:
        spec_wave = wave

    _, t, Zxx = signal.stft(spec_wave, **stft_kwargs)
    pspec = np.abs(Zxx)
    mspec = melspectrogram(sr=fs, S=pspec, n_mels=nmels, fmin=f_min, fmax=f_max,
                           power=1.0, htk=htk, norm=norm)
    pspec = pspec.T.astype(dtype)
    mspec = mspec.T.astype(dtype)

    # Trim everything so the waveform length is an exact multiple of the
    # per-frame sample count.
    upsample = fs // (1000 // frame_period)
    length = (len(wave) // upsample) * upsample
    wave = wave[:length]
    mspec = mspec[:length // upsample]
    spec = pspec[:length // upsample]

    if return_t:
        return wave, spec, mspec, upsample, t
    return wave, spec, mspec, upsample
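# Hedged usage sketch for wave2spec, assuming make_stft_args (defined elsewhere
# in this codebase) builds scipy.signal.stft kwargs whose hop matches
# frame_period. The signal and parameter values are illustrative.
import numpy as np

fs = 16000
wave = np.random.randn(fs).astype('float32')  # one second of noise
wave, spec, mspec, upsample = wave2spec(wave, fs, frame_period=5, window='hann')
# wave, spec and mspec are now frame-aligned: len(wave) == len(mspec) * upsample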
def spectogram(audio):
    spec = melspectrogram(audio,
                          sr=audio_params['sr'],
                          center=True,
                          n_mels=audio_params['n_mel'],
                          n_fft=audio_params['n_fft'],
                          win_length=audio_params['win_length'],
                          hop_length=audio_params['hop_length'],
                          fmin=audio_params['f_min'],
                          fmax=audio_params['f_max'])
    # np.log of an all-zero frame yields -inf; a small epsilon would guard this.
    spec = np.log(spec)
    return spec
def extract_features(signal, normalize=False, wavelet=0):
    # Handle signals shorter or longer than ~3 s.
    L = sr * 3  # total length for samples, ~3 s
    signal_length = signal.shape[0]
    if signal_length < L:
        # Pad by repeating the signal.
        signal = np.pad(signal, (0, L - signal_length), mode='wrap')
    elif signal_length > L:
        signal = signal[:L]

    # Calculate the mel spectrogram.
    melspec = melspectrogram(signal, sr=sr, center=False,  # fmax = sr/2
                             hop_length=window_size, win_length=window_size,
                             n_mels=128)  # shape: [bands, frames]

    # Transform to log scale and transpose.
    melspec = power_to_db(melspec, ref=np.amax(melspec)).T  # shape: [frames, bands]
    if normalize:
        melspec = (melspec - np.mean(melspec)) / np.std(melspec)

    # 2D discrete wavelet transform.
    if wavelet != 0:
        LL, (LH, HL, HH) = dwt2(melspec, wavelet)
        # dwt2 halves each axis, so the stack is [frames/2, bands/2, 4].
        melspec = np.stack([LL, LH, HL, HH], axis=-1)
    else:
        melspec = melspec[..., np.newaxis]

    # Reshape to a batch of one.
    features = melspec[np.newaxis, ...]  # shape: [1, frames, bands, channels]
    return features
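# Hedged usage sketch for extract_features: it pads or crops to ~3 s at the
# module-level sr, so any 1-D float signal works. 'db4' is an arbitrary pywt
# wavelet name chosen for illustration.
import numpy as np

dummy = np.random.randn(sr * 2).astype(np.float32)  # 2 s, will be wrap-padded
feats = extract_features(dummy, normalize=True, wavelet='db4')
# feats.shape -> (1, ~frames/2, ~bands/2, 4) since dwt2 halves each axis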
def create_mels_deltas(waveform, sample_rate):
    one_mel = melspectrogram(waveform.squeeze(0).numpy(), sr=sample_rate,
                             n_fft=2048, hop_length=1024, n_mels=128,
                             fmin=0.0, fmax=sample_rate / 2, htk=True, norm=None)
    one_mel = np.log(one_mel + 1e-8)
    one_mel = (one_mel - np.min(one_mel)) / (np.max(one_mel) - np.min(one_mel))

    one_mel_delta = delta(one_mel)
    one_mel_delta = (one_mel_delta - np.min(one_mel_delta)) / (
        np.max(one_mel_delta) - np.min(one_mel_delta))

    one_mel_delta_delta = delta(one_mel, order=2)
    one_mel_delta_delta = (one_mel_delta_delta - np.min(one_mel_delta_delta)) / (
        np.max(one_mel_delta_delta) - np.min(one_mel_delta_delta))

    # Stack mel, delta and delta-delta as three channels.
    mel_3d = torch.cat([
        torch.tensor(one_mel).unsqueeze(0),
        torch.tensor(one_mel_delta).unsqueeze(0),
        torch.tensor(one_mel_delta_delta).unsqueeze(0)
    ], dim=0)
    return mel_3d
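# Hedged usage sketch for create_mels_deltas: it expects a (1, n_samples) torch
# waveform such as torchaudio.load returns; a synthetic tensor stands in here.
import torch

sample_rate = 22050
waveform = torch.randn(1, sample_rate * 2)  # 2 s of noise
mel_3d = create_mels_deltas(waveform, sample_rate)
# mel_3d.shape -> torch.Size([3, 128, n_frames])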
def prepare_data_streaminput(data, config):
    X = np.empty(shape=(1, config.dim[0], config.dim[1], 1))
    input_length = config.audio_length

    # Random offset / padding to a fixed length.
    if len(data) > input_length:
        max_offset = len(data) - input_length
        offset = np.random.randint(max_offset)
        data = data[offset:(input_length + offset)]
    else:
        if input_length > len(data):
            max_offset = input_length - len(data)
            offset = np.random.randint(max_offset)
        else:
            offset = 0
        data = np.pad(data, (offset, input_length - len(data) - offset), "constant")

    # data = librosa.feature.mfcc(data, sr=config.sampling_rate, n_mfcc=config.n_mfcc)
    # Note: config.n_mfcc is reused here as the number of mel bands.
    S = feature.melspectrogram(data, sr=config.sampling_rate, n_fft=2048,
                               hop_length=512, n_mels=config.n_mfcc)
    S_DB = power_to_db(S, ref=np.max)
    data = np.expand_dims(S_DB, axis=-1)
    X[0, ] = data
    return X
def extract_audio_features(root_dir, row):
    raw_data_dir = join(root_dir, RAW_DATA_DIR)
    row_dict = row.to_dict()
    waveform, _ = load(join(raw_data_dir, row_dict['filename']),
                       sr=FEATURE_ARGS['sr'])
    row_dict['melspec'] = _clean_features(
        melspectrogram(waveform, n_mels=EXTRACTOR_ARGS['n_mels'], **FEATURE_ARGS))
    row_dict['mfcc'] = _clean_features(
        mfcc(waveform, n_mfcc=EXTRACTOR_ARGS['n_mfcc'], **FEATURE_ARGS))
    return row_dict
def CalcFrameSpectrogram(self, plot=False, spec_mode='hz'):
    fft_size = 2048
    hop_length = 512  # fft_size//8
    self.CurrentSpectrogram = []  # initialize for every new frame!
    if self.CurrentFrame.ndim == 1:
        # Reshape so that it has dimension (dim, 1).
        self.CurrentFrame = np.reshape(self.CurrentFrame,
                                       (len(self.CurrentFrame), 1))
    mels_keep = 119  # 128 is equal to keeping all the coefficients

    for ch in range(self.CurrentFrame.shape[1]):  # for every channel
        Spectrogram = melspectrogram(
            self.CurrentFrame[:, ch], sr=self.Fs, n_fft=fft_size,
            hop_length=hop_length, power=2).astype(np.float32)[0:mels_keep, :]
        self.CurrentSpectrogram.append(Spectrogram)

        if plot:
            plt.figure(figsize=(10, 4))
            librosa.display.specshow(
                librosa.power_to_db(Spectrogram, ref=np.max), sr=self.Fs,
                hop_length=hop_length, x_axis='time', y_axis=spec_mode,
                fmax=self.Fs / 2, cmap='RdBu')
            plt.colorbar(format='%+2.0f dB')
            plt.title('Spectrogram')
            plt.tight_layout()
            plt.show()
def result():
    if request.method == 'POST':
        testing = []
        f = request.files['file']
        f.save(f.filename)
        file = f.filename
        model = load_model('Birdmodel.h5')

        # Trim leading silence using the first detected onset.
        Y, SR = load(file, res_type='kaiser_fast')
        onsets = frames_to_time(onset_detect(Y, SR), SR)
        leading_silence = onsets[0]
        y, sr = load(file, offset=leading_silence,
                     res_type='kaiser_fast', duration=10)

        spec = melspectrogram(y=y, sr=sr)
        testing.append(spec)
        testing = np.asarray(testing)
        result = model.predict(testing)

        # Map the winning class index to a species name.
        species = ['Vanellus indicus', 'Acridotheres tristis', 'Columba livia',
                   'Amaurornis phoenicurus', 'Centropus sinensis']
        spe = species[int(np.argmax(result[0]))]

        os.remove(file)
        clear_session()
        return render_template("result.html", name=f.filename, species=spe)
def fbank(path, fft_span, hop_span, n_mels, fmin, fmax, affichage=False):
    """
    :param path: location of the file
    :param fft_span: size of the Fourier-transform window, in seconds
    :param hop_span: step between two samples, in seconds
    :param n_mels: number of mel frequency bands
    :param fmin: minimum frequency of the decomposition
    :param fmax: maximum frequency of the decomposition
    :param affichage: True to display the spectrogram
    :return: the fbank vectors representing the signal:
        X, a matrix of the fbank decomposition over time (one row = one
        decomposition per hop_span period, of size n_mels)
    """
    wav, s_rate = librosa.load(path)
    X = feature.melspectrogram(util.normalize(wav), s_rate, S=None,
                               n_fft=int(np.floor(fft_span * s_rate)),
                               hop_length=int(np.floor(hop_span * s_rate)),
                               n_mels=n_mels, fmin=fmin, fmax=fmax)
    X = np.log(X)
    if affichage:
        afficherSpec(X, s_rate, hop_span)
    return np.transpose(X)
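# Hedged usage sketch for fbank with typical 25 ms windows and 10 ms hops;
# 'example.wav' is a placeholder path.
X = fbank('example.wav', fft_span=0.025, hop_span=0.010,
          n_mels=40, fmin=0, fmax=8000)
# X.shape -> (n_frames, 40), one 40-dim log-mel row per 10 ms hop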
def make_blocking_data(self):
    xData, yData = list(), list()
    path = self.featurePath + self.name + '/'
    for j, filename in enumerate(os.listdir(path)):
        print(f"{self.name} {filename} ({j + 1})")
        WavPath = path + filename
        y, sr = load(WavPath, mono=True)
        S = melspectrogram(y, sr).T
        # Trim to a multiple of 128 frames, guarding the case where the
        # length already is one (slicing with [:-0] would empty the array).
        remainder = S.shape[0] % 128
        if remainder:
            S = S[:-remainder]
        num_chunk = S.shape[0] // 128
        data_chunks = np.split(S, num_chunk)
        xChunks, yChunks = list(), list()
        for unit in data_chunks:
            xChunks.append(unit)
            yChunks.append(self.labelDict[self.name])
        xData.append(xChunks)
        yData.append(yChunks)
    xData = [unit for record in xData for unit in record]
    yData = [unit for record in yData for unit in record]
    self.features = torch.tensor(data=xData, device=device)
    self.labels = torch.tensor(data=yData, device=device)
    print(self.features.shape)
    print(self.labels.shape)
    self.x_cat_data.append(self.features)
    self.y_cat_data.append(self.labels)
def calculate_spectrograms(audio_dir, out_dir, file_type='.mp3'):
    files = glob.glob(os.path.join(audio_dir, '*' + file_type))
    num_files = len(files)
    print(f'{num_files} audio files found')
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    for i, file_name in enumerate(sorted(files)):
        track_name = os.path.basename(file_name)
        track_id = os.path.splitext(track_name)[0]
        try:
            song_name = track_to_song[track_id]
        except KeyError:
            continue
        if song_name in wmf_item2i.keys():
            audio_file = os.path.join(audio_dir, track_name)
            out_file = os.path.join(out_dir, track_id) + '.npy'
            if not os.path.exists(out_file):
                y, sr = load(audio_file)
                mel_spectrogram = melspectrogram(y=y, sr=sr, n_fft=1024,
                                                 hop_length=512, n_mels=128)
                # Save the spectrogram (the .npy extension above implies np.save).
                np.save(out_file, mel_spectrogram)


wmf_item2i = pickle.load(open('../../index_dicts.pkl', 'rb'))['item2i']
track_to_song = pickle.load(open('../../track_to_song.pkl', 'rb'))

calculate_spectrograms(audio_dir='../../data/MillionSongSubset/audio',
                       out_dir='../../data/MillionSongSubset/spectrograms')
def transform_audio(self, y):
    '''Compute the Mel spectrogram

    Parameters
    ----------
    y : np.ndarray
        The audio buffer

    Returns
    -------
    data : dict
        data['mag'] : np.ndarray, shape=(n_frames, n_mels)
            The Mel spectrogram
    '''
    n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

    mel = np.sqrt(melspectrogram(y=y, sr=self.sr, n_fft=self.n_fft,
                                 hop_length=self.hop_length,
                                 n_mels=self.n_mels,
                                 fmax=self.fmax)).astype(np.float32)
    mel = fix_length(mel, n_frames)

    if self.log:
        mel = amplitude_to_db(mel, ref=np.max)

    return {'mag': mel.T[self.idx]}
def get_seq_size(self, frames, sr):
    """
    Get the sequence size of an audio time series when converted to
    MFCC features or a mel spectrogram.

    :param frames: audio time series
    :param sr: sampling rate of frames
    :return: sequence size of the converted audio
    """
    if self.type == 'mfcc':
        mfcc_frames = mfcc(frames, sr, n_fft=self.frame_length,
                           hop_length=self.hop_length,
                           n_mfcc=self.mfcc_features, n_mels=self.n_mels)
        return mfcc_frames.shape[1]
    elif self.type == 'spectrogram':
        spectrogram = melspectrogram(frames, sr, n_fft=self.frame_length,
                                     hop_length=self.hop_length,
                                     n_mels=self.n_mels)
        return spectrogram.shape[1]
    else:
        raise ValueError('Not a valid feature type: ', self.type)
def transform_audio(self, y):
    '''Compute the PCEN of the (log-) Mel Spectrogram

    Parameters
    ----------
    y : np.ndarray
        The audio buffer

    Returns
    -------
    data : dict
        data['mag'] : np.ndarray, shape = (n_frames, n_bins)
            The PCEN magnitude
    '''
    n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

    S = melspectrogram(y=y, sr=self.sr, hop_length=self.hop_length,
                       n_mels=self.n_mels, n_fft=self.n_fft)
    if self.log:
        S = amplitude_to_db(S, ref=np.max)

    P = pcen(S, sr=self.sr, hop_length=self.hop_length)
    P = to_dtype(P, self.dtype)

    # copied from the mel spectrogram pump feature extractor
    return {'mag': P[self.idx]}
def from_audio(
    cls,
    audio,
    n_fft=1024,
    n_mels=128,
    window="flattop",
    win_length=256,
    hop_length=32,
    htk=True,
    fmin=None,
    fmax=None,
):
    """Create a MelSpectrogram object from an Audio object

    The kwargs are cherry-picked from:
    - https://librosa.org/doc/latest/generated/librosa.feature.melspectrogram.html#librosa.feature.melspectrogram
    - https://librosa.org/doc/latest/generated/librosa.filters.mel.html?librosa.filters.mel

    Args:
        n_fft: Length of the FFT window [default: 1024]
        n_mels: Number of mel bands to generate [default: 128]
        window: The windowing function to use [default: "flattop"]
        win_length: Each frame of audio is windowed by `window`. The window
            will be of length `win_length` and then padded with zeros to
            match `n_fft` [default: 256]
        hop_length: Number of samples between successive frames [default: 32]
        htk: use HTK formula instead of Slaney [default: True]
        fmin: lowest frequency (in Hz) [default: None]
        fmax: highest frequency (in Hz). If None, use `fmax = sr / 2.0`
            [default: None]

    Returns:
        opensoundscape.melspectrogram.MelSpectrogram object
    """
    if not isinstance(audio, Audio):
        raise TypeError("Class method expects Audio class as input")

    process_fmin = fmin if fmin else 0
    process_fmax = fmax if fmax else audio.sample_rate / 2

    S = melspectrogram(
        y=audio.samples,
        sr=audio.sample_rate,
        n_fft=n_fft,
        win_length=win_length,
        hop_length=hop_length,
        window=window,
        n_mels=n_mels,
        htk=htk,
        fmin=process_fmin,
        fmax=process_fmax,
    )

    # Make spectrogram "right-side up"
    S = S[::-1]

    return cls(S, audio.sample_rate, hop_length, process_fmin, process_fmax)
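# Hedged usage sketch, assuming opensoundscape's Audio.from_file constructor
# (the docstring above points at opensoundscape) and a placeholder wav path.
from opensoundscape.audio import Audio

audio = Audio.from_file("example.wav")
mel = MelSpectrogram.from_audio(audio, n_mels=64, fmax=8000)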
def make_stacked_mels(mono_signal, n_fft, samprate, hop_length, fmin, fmax, n_mels):
    # Create one mel spectrogram per FFT window size in `n_fft` (a list),
    # holding all other variables the same.
    mel = [melspectrogram(mono_signal, sr=samprate, hop_length=hop_length,
                          n_fft=j, n_mels=n_mels, fmin=fmin, fmax=fmax)
           for j in n_fft]
    # Convert the spectrograms to log (dB) values.
    mel_db = [librosa.power_to_db(m, ref=np.max) for m in mel]
    # Stack them into a single (n_mels, n_frames, len(n_fft)) array.
    mel_db = np.stack(mel_db, axis=-1)
    return mel_db
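# Hedged usage sketch: three FFT sizes give a CNN a multi-resolution,
# RGB-style input; the clip is librosa's bundled trumpet example
# (downloaded on first use).
import librosa

y, sr = librosa.load(librosa.example('trumpet'))
stacked = make_stacked_mels(y, n_fft=[512, 1024, 2048], samprate=sr,
                            hop_length=512, fmin=50, fmax=8000, n_mels=128)
# stacked.shape -> (128, n_frames, 3); frame counts match across FFT sizes
# because the hop_length is shared and librosa pads (center=True) by default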
def melspectro(audio, sr=8000, win_len=256, n_mels=29):
    hop_len = win_len // 2
    return lf.melspectrogram(audio, sr=sr, n_fft=win_len, hop_length=hop_len,
                             win_length=win_len, n_mels=n_mels)
def mel_from_files(file_loc, MEL_PARAMS):
    phonemes = dict()
    mels = dict()
    for i, loc in file_loc.items():
        # Note: wavfile.read returns integer PCM for most files, while
        # librosa's mel routines expect floating-point audio.
        fs, phonemes[i] = wavfile.read(loc)
        a_mel = melspectrogram(phonemes[i], sr=fs, **MEL_PARAMS)
        mels[i] = a_mel
    return mels
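# A hedged pre-conversion sketch for the dtype caveat above, assuming the
# files hold integer PCM; 'example.wav' is a placeholder path.
import numpy as np
from scipy.io import wavfile

fs, pcm = wavfile.read("example.wav")
samples = pcm.astype(np.float32) / np.iinfo(pcm.dtype).max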
def mfcc(path):
    # Make and display a mel-scaled power (energy-squared) spectrogram.
    # A small hop length of 64 is used here so that the frames line up with
    # the beat tracker example below.
    y, sr = load_files(path)
    print('calculating mfcc ' + path)
    S = feature.melspectrogram(y, sr=sr, n_fft=2048, hop_length=64, n_mels=128)
    # Convert to log scale (dB), using the peak power as reference.
    # logamplitude was renamed power_to_db in newer librosa releases.
    log_S = logamplitude(S, ref_power=np.max)
    mfcc_v = feature.mfcc(S=log_S, n_mfcc=14)
    return np.sum(mfcc_v, axis=1) / mfcc_v.shape[1]
def extract_feature_spectrogram(data):
    filename, lbl = data
    try:
        signal, sr = librosa.load(filename)
        if len(signal) < 5 * sr:
            print("Too short")
            return filename, None, None
        else:
            print("OK", len(signal) / float(sr))
        if sr != 22050:
            print("Non-standard sr", sr)
            return filename, None, None
        # Keep exactly the first five seconds.
        signal = signal[:5 * sr]
        # fft_points, fft_overlap and mfcc_coefficients are module-level settings.
        spectrogram = melspectrogram(y=signal, sr=sr, n_fft=fft_points,
                                     hop_length=fft_overlap, fmax=5000,
                                     n_mels=mfcc_coefficients)
        return filename, lbl, spectrogram
    except Exception as e:
        print(e)
        return filename, None, None
def mfcc(data, sr=22050, n_mfcc=20, **kwargs):
    """Mel-frequency cepstral coefficients (original function from librosa,
    modified to accept additional parameters).

    :parameters:
      - data : np.ndarray or None
          audio time series
      - sr : int > 0
          sampling rate of y
      - n_mfcc : int
          number of MFCCs to return

    .. note:: additional keyword arguments are passed to the melspectrogram
              function.

    :returns:
      - M : np.ndarray, shape=(n_mfcc, S.shape[1])
          MFCC sequence
    """
    # MFCCs are the DCT of the log-power mel spectrogram. logamplitude and
    # the dct filter bank come from older librosa releases.
    S = logamplitude(melspectrogram(y=data, sr=sr, **kwargs))
    return np.dot(dct(n_mfcc, S.shape[0]), S)
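# A sketch of the same equivalence in modern librosa, where the log-mel /
# DCT pipeline is built in; the two calls below produce identical output.
import numpy as np
import librosa

y, sr = librosa.load(librosa.example('trumpet'))
S = librosa.power_to_db(librosa.feature.melspectrogram(y=y, sr=sr))
m1 = librosa.feature.mfcc(S=S, n_mfcc=20)
m2 = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
assert np.allclose(m1, m2)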
def get_spectrogram(path):
    """Build a spectrogram from a wav file."""
    y, sr = load(path)
    S = melspectrogram(y, sr=sr, n_mels=100)
    # logamplitude was renamed power_to_db in newer librosa releases.
    log_S = logamplitude(S, ref_power=np.max)
    return log_S