def worker(audio_dir: list, save_dir, process_i=0):
    def read_wav(audio_path):
        '''
        :param audio_path: path to a .wav file
        :return: audio_data, sampleRate
        '''
        audio_data, sampleRate = sf.read(audio_path)
        return audio_data, sampleRate

    for var in tqdm(audio_dir, desc='process {0}'.format(process_i)):
        audio_data, sr = read_wav(var)
        processed_sig = []
        for i in range(audio_data.shape[1]):
            # Standardize each channel before filtering
            mean = np.mean(audio_data[:, i])
            std = np.std(audio_data[:, i])
            x = (audio_data[:, i] - mean) / std
            # window 0.0638 s, hop 0.0318 s, 100 channels, f_min 100 Hz
            temp = gt.gtgram(x, sr, 0.0638, 0.0318, 100, 100)
            temp = librosa.amplitude_to_db(temp)
            processed_sig.append(temp)
        processed_sig = np.asarray(processed_sig)
        feature_dict = {'gfcc': processed_sig, 'shape': processed_sig.shape}
        save_name = var.split('\\')[-1].split('.wav')[0] + '.gzip'
        with gzip.open(os.path.join(save_dir, save_name), 'wb') as f:
            pickle.dump(feature_dict, f)
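# --- Usage sketch (not part of the original snippet) ---
# A minimal, hypothetical call to worker(): it assumes a directory of
# multi-channel .wav files readable by soundfile, and writes one gzipped
# pickle of gammatonegram features per file. All paths are illustrative.
import glob
import os

wav_files = glob.glob(os.path.join('data', 'wavs', '*.wav'))  # hypothetical input dir
os.makedirs('features', exist_ok=True)                        # hypothetical output dir
worker(wav_files, 'features', process_i=0)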
def load_sound_stims(files, root="", windowtime=0.016, ovl=0.0016,
                     f_min=500, f_max=8000, gammatone=False,
                     dsample=10, sres=15, compress=0):
    stims = []
    durations = []
    for f in files:
        Fs, wave = read(root + f)
        duration = int(1000 * len(wave) / Fs)
        durations.append(duration)
        if gammatone:
            Pxx = gg.gtgram(wave, Fs, windowtime, ovl, sres, f_min)
            Pxx = np.log10(Pxx)
        else:
            w = np.hanning(int(windowtime * Fs))
            Pxx = libtfr.stft(wave, w, int(w.size * .1))
            freqs, ind = libtfr.fgrid(Fs, w.size, [f_min, f_max])
            Pxx = Pxx[ind, :]
            Pxx = np.log10(Pxx + compress)
            Pxx = resample(Pxx, sres)
        # resample expects an integer number of output samples
        Pxx = resample(Pxx, duration // dsample, axis=1)
        stims.append(Pxx)
    return stims, durations
def _function(self, recording):
    gram = gtgram.gtgram(recording,
                         fs=self.fs,
                         window_time=(self.window_time / self.fs),
                         hop_time=(self.hop_time / self.fs),
                         channels=self.num_banks,
                         f_min=20).T
    return gram
def make_spectrogram(self, audio_samples):
    gtg = gtgram.gtgram(audio_samples, self.sample_rate, self.window_time,
                        self.hop_time, self.num_filters, self.cutoff_low)
    return gtg
def make_spect(filepath, method='fourier', height=60, interval=1, verbose=False, max_len=1080):
    """
    Turns a file containing sound data into a matrix for processing.

    Two methods are supported: Fourier spectrum analysis, which returns a
    spectrogram, and gammatone, which returns a gammatone-filterbank
    spectrogram (gammatonegram). Gammatonegrams take much longer to compute,
    but are ostensibly better for feature analysis, and are smaller.
    Spectrograms are big but don't take very much time to create.

    :param str filepath: path to file
    :param str method: 'fourier' or 'gamma'
    :param num max_len: the maximum length in seconds of a song to convert.
        Important for memory management. Default is 1080 (18 minutes).
    :param int height: for gammatonegrams, how many frequency bins should be used. Default 60.
    :param int interval: for gammatonegrams, the width in seconds of the time bins. Default 1.
    :param bool verbose: toggles behavior showing a plot of the returned 'gram.
    :return: np.array a matrix representing (in decibels) the completed analysis.
    """
    try:
        data, sr = sf.read(filepath)
    except RuntimeError:
        return None
    if len(data) // sr > max_len:
        return None
    if verbose:
        plt.figure()
    if method == 'fourier':
        f, t, sxx = signal.spectrogram(data[:, 0], sr)
        del data
        if verbose:
            plt.pcolormesh(t, f, 10 * np.log10(sxx))
            plt.ylabel('Frequency [Hz]')
            plt.xlabel('Time [sec]')
            plt.show()
    elif method == 'gamma':
        sxx = gt.gtgram(data[:, 0], sr, interval, interval, height, 20)
        del data
        if verbose:
            plt.pcolormesh(10 * np.log10(sxx))
            plt.show()
    else:
        raise ValueError(f'{method} is not a valid method.')
    with np.testing.suppress_warnings() as sup:
        # log10 emits a RuntimeWarning for zero-valued bins; suppress it.
        sup.filter(RuntimeWarning)
        return 10 * np.log10(sxx)
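# --- Usage sketch (not part of the original snippet) ---
# Illustrative only: 'song.flac' is a made-up path. The 'fourier' branch
# returns a dB-scaled STFT spectrogram; the 'gamma' branch returns a
# dB-scaled gammatonegram with `height` bands and `interval`-second frames.
spec = make_spect('song.flac', method='fourier')
gamma = make_spect('song.flac', method='gamma', height=60, interval=1)
if gamma is not None:
    print(gamma.shape)  # (height, n_time_bins)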
def load_stimulus(path, window, step, f_min=0.5, f_max=8.0, f_count=30,
                  compress=1, gammatone=False, **kwargs):
    """Load sound stimulus and calculate spectrotemporal representation.

    Parameters:

    path: location of wave file
    window: duration of window (in ms)
    step: window step (in ms)
    f_min: minimum frequency (in kHz)
    f_max: maximum frequency (in kHz)
    f_count: number of frequency bins
    compress: offset added before log compression (None to disable)
    gammatone: if True, use gammatone filterbank

    Returns spectrogram, duration (ms)
    """
    import ewave
    fp = ewave.open(path, "r")
    Fs = fp.sampling_rate / 1000.
    osc = ewave.rescale(fp.read(), 'h')
    if gammatone:
        import gammatone.gtgram as gg
        Pxx = gg.gtgram(osc, Fs * 1000, window / 1000, step / 1000, f_count,
                        f_min * 1000)
    else:
        import libtfr
        # nfft based on desired number of channels btw f_min and f_max
        nfft = int(f_count / (f_max - f_min) * Fs)
        npoints = int(Fs * window)
        if nfft < npoints:
            raise ValueError(
                "window size {} ms ({} points) too large for desired freq "
                "resolution (nfft={}); decrease window to {:.1f} ms or "
                "increase f_count".format(window, npoints, nfft, nfft / Fs))
        nstep = int(Fs * step)
        taper = np.hanning(npoints)
        mfft = libtfr.mfft_precalc(nfft, taper)
        Pxx = mfft.mtspec(osc, nstep)
        freqs, ind = libtfr.fgrid(Fs, nfft, [f_min, f_max])
        Pxx = Pxx[ind, :]
    if compress is not None:
        Pxx = np.log10(Pxx + compress) - np.log10(compress)
    return Pxx, Pxx.shape[1] * step
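# --- Usage sketch (not part of the original snippet) ---
# Hypothetical call: 'stim1.wav' is a made-up path. With gammatone=True this
# returns an (f_count x n_frames) gammatonegram plus the stimulus duration in ms.
spec, dur_ms = load_stimulus("stim1.wav", window=20, step=10,
                             f_min=1.0, f_max=8.0, f_count=30,
                             compress=1, gammatone=True)
print(spec.shape, dur_ms)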
def save_gammatone(list_audio, output_dir, fs):
    """
    Save a list of audio chunks as gammatonegrams.

    :param list_audio: list of audio chunks
    :param output_dir: path of the output dir
    :param fs: sampling rate
    :return:
    """
    for i, f in enumerate(list_audio):
        gram = gtgram(f, fs, window_time=0.04, hop_time=0.02,
                      channels=128, f_min=120)
        gram = np.expand_dims(gram, axis=0)
        filename = os.path.join(output_dir, f"chunk_{i}.npy")
        np.save(filename, gram)
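# --- Usage sketch (not part of the original snippet) ---
# Hypothetical driver: reads one file with soundfile (an assumption; any
# reader returning float samples works), splits it into 1-second chunks, and
# writes one gammatonegram .npy per chunk via save_gammatone(). Paths are
# illustrative only.
import os
import soundfile as sf

audio, fs = sf.read('recording.wav')          # made-up input path
chunk_len = fs                                # 1-second chunks
chunks = [audio[i:i + chunk_len]
          for i in range(0, len(audio) - chunk_len + 1, chunk_len)]
os.makedirs('gammagrams', exist_ok=True)      # made-up output dir
save_gammatone(chunks, 'gammagrams', fs)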
f_size = fd * fs

(rate, sig) = wav.read(new_file_name_path)
x_brahms, sr_brahms = librosa.load(file, duration=30, offset=30)
mfcc_feat = mfcc(sig, samplerate=rate)  # shape (2992, 13)
ipdb.set_trace()
# Flatten the MFCC matrix into a single column vector
mfcc_one_line = mfcc_feat.reshape(-1, 1)
fbank_feat = fbank(sig, samplerate=rate)
logfbank_feat = logfbank(sig, samplerate=rate)
d_mfcc_feat = delta(mfcc_feat, 2)

# gammatone.gtgram.gtgram(wave, fs, window_time, hop_time, channels, f_min)
gtgram_function = gtgram.gtgram(sig, rate, .250, .125, 1, 20)

print("mfcc_feat.shape:", mfcc_feat.shape)
print("mfcc_one_line.shape", mfcc_one_line.shape)
print("logfbank_feat.shape", logfbank_feat.shape)
print("d_mfcc_feat.shape", d_mfcc_feat.shape)
print("gtgram_function.shape", gtgram_function.shape)
print("gtgram_function.shape.T", gtgram_function.T.shape)

# ssc = ssc(sig, samplerate=rate)
# print(logfbank_feat[1:3, :])
# gammatone.filters.centre_freqs(fs, num_freqs, cutoff)
# centre_freqs = filters.centre_freqs(rate, sig.shape[0], 100)
def start_process():
    files = [os.path.join(AUDIOS_PATH + "complete/", fn)
             for fn in os.listdir(AUDIOS_PATH + "complete/") if fn.endswith('.mp3')]
    for file in files:
        filename = file.split("/")[-1].split(".")[:-1][0]

        # Set up the plot
        fig = matplotlib.pyplot.figure()
        axes = fig.add_axes([0.1, 0.1, 0.8, 0.8])

        new_file_name_path = AUDIOS_PATH + "cuts/30s_cuts/" + filename + ".wav"
        dataset.cut_30s_from_file(filename, file, AUDIOS_PATH + "cuts/")
        #track_30s = AudioSegment.from_wav(new_file_name_path)
        #play(track_30s)
        #aa, bb, cc, dd, plt = get_spectrogram(new_file_name_path)
        #matplotlib.pyplot.show()
        #ipdb.set_trace()

        (rate, sig) = wav.read(new_file_name_path)
        #fbank_feat = fbank(sig, samplerate=rate)

        # Average the stereo signal
        duration = False
        if duration:
            nframes = duration * rate
            sig = sig[0:nframes, :]
        #signal = sig.mean()

        # Default gammatone-based spectrogram parameters
        twin = 0.250
        thop = twin / 2
        channels = 8
        fmin = 20

        formatter = plot.ERBFormatter(fmin, rate / 2, unit='Hz', places=0)
        axes.yaxis.set_major_formatter(formatter)

        # Figure out time axis scaling
        duration = len(sig) / rate

        # Calculate 1:1 aspect ratio
        aspect_ratio = duration / scipy.constants.golden

        gtg = gtgram.gtgram(sig, rate, twin, thop, channels, fmin)
        Z = np.flipud(20 * np.log10(gtg))
        z_reshaped = Z.reshape(Z.size, 1)
        img = axes.imshow(Z, extent=[0, duration, 1, 0], aspect=aspect_ratio)
        matplotlib.pyplot.show()
        ipdb.set_trace()

        fig = matplotlib.pyplot.figure()
        axes = fig.add_axes([0.1, 0.1, 0.8, 0.8])

        # Default gammatone-based spectrogram parameters
        twin = 0.250
        thop = twin / 2
        channels = 16
        fmin = 20

        formatter = plot.ERBFormatter(fmin, rate * 3 / 4, unit='Hz', places=0)
        axes.yaxis.set_major_formatter(formatter)

        # Figure out time axis scaling
        duration = len(sig) / rate

        # Calculate 1:1 aspect ratio
        aspect_ratio = duration / scipy.constants.golden

        gtg = gtgram.gtgram(sig, rate, twin, thop, channels, fmin)
        Z = np.flipud(20 * np.log10(gtg))
        img = axes.imshow(Z, extent=[0, duration, 1, 0], aspect=aspect_ratio)
        matplotlib.pyplot.show()
        ipdb.set_trace()

        mfcc_feat = mfcc(sig, samplerate=rate, winlen=twin, winstep=thop)
def get_gtg(rate, signal, twin, thop, channels, fmin):
    gtg = gtgram.gtgram(signal, rate, twin, thop, channels, fmin)
    Z = np.flipud(20 * np.log10(gtg))
    return Z
def get_gfb(filelist, config):
    # Read the filelist
    fp = open(filelist, 'r')
    flist = fp.read().splitlines()
    flist = list(filter(None, flist))  # drop empty lines (list() needed on Python 3)

    # Create output directory if non-existent
    opdir = os.path.dirname(flist[0].split(',')[1])
    if not os.path.exists(opdir):
        os.makedirs(opdir)

    # Read the relevant configs from the configfile
    framelen = float(config['framelen'])
    frameshift = float(config['frameshift'])

    wintype = config['wintype']
    if wintype == 'rectangular':
        winfun = np.ones
    else:
        winfun = getattr(np, wintype)

    # Number of channels for gammatone filterbank
    if 'nbanks' in config:
        nbanks = int(config['nbanks'])
    else:
        raise ConfigError('nbanks parameter not set in config file')

    # Min frequency of gammatone filterbank
    if 'min_freq' in config:
        min_freq = float(config['min_freq'])
    else:
        min_freq = 0

    mvn = config['mvn']
    mvn = mvn.upper() == 'TRUE'

    if 'std_frac' in config:
        std_frac = float(config['std_frac'])
    else:
        std_frac = 1.0

    del1_flag = config['delta1']
    del2_flag = config['delta2']
    del1_flag = del1_flag.upper() == 'TRUE'
    del2_flag = del2_flag.upper() == 'TRUE'

    # First pass: accumulate features over the filelist to estimate
    # mean/variance normalization parameters
    if mvn:
        feats_list = []
        for iter1, fline in enumerate(flist):
            infnm = fline.split(',')[0]
            opfnm = fline.split(',')[1]

            sig, fs = librosa.load(infnm, sr=None)
            sig = sig / max(abs(sig))
            dither = 1e-6 * np.random.rand(sig.shape[0])
            sig = sig + dither

            win_length = int(fs * framelen * 0.001)
            hop_length = int(fs * frameshift * 0.001)

            feats = gtgram.gtgram(sig, fs, framelen * 0.001,
                                  frameshift * 0.001, nbanks, min_freq)

            # Amplitude range compression
            if config['compression'] == 'log':
                # librosa.logamplitude was removed in librosa 0.6;
                # power_to_db is the current equivalent
                feats = librosa.power_to_db(feats)
            elif config['compression'][0:4] == 'root':
                rootval = float(config['compression'].split('_')[1])
                feats = np.sign(feats) * (np.abs(feats)**(1 / rootval))
                if np.sum(np.isnan(feats)):
                    print('NaN Error in root compression for file: %s' % infnm)
                    exit()

            if del1_flag:
                feats_del1 = librosa.feature.delta(feats, order=1, axis=1)
            if del2_flag:
                feats_del2 = librosa.feature.delta(feats, order=2, axis=1)
            if del1_flag:
                feats = np.concatenate((feats, feats_del1), axis=0)
            if del2_flag:
                feats = np.concatenate((feats, feats_del2), axis=0)

            feats_list.append(feats)

        all_feats = np.concatenate(feats_list, axis=1)
        f_mean = np.mean(all_feats, axis=1)[:, None]
        f_std = np.std(all_feats, axis=1)[:, None]
        opdir = os.path.dirname(opfnm)
        mvn_params = np.concatenate((f_mean, f_std), axis=1)
        postfix = os.path.basename(filelist).split('.')[0]
        np.save(opdir + '/mvn_params_' + postfix + '.npy', mvn_params)

    # Second pass: extract features for each file and write them out
    for iter1, fline in enumerate(flist):
        infnm = fline.split(',')[0]
        opfnm = fline.split(',')[1]

        sig, fs = librosa.load(infnm, sr=None)
        sig = sig / max(abs(sig))
        dither = 1e-6 * np.random.rand(sig.shape[0])
        sig = sig + dither

        win_length = int(fs * framelen * 0.001)
        hop_length = int(fs * frameshift * 0.001)

        feats = gtgram.gtgram(sig, fs, framelen * 0.001,
                              frameshift * 0.001, nbanks, min_freq)

        if config['compression'] == 'log':
            feats = librosa.power_to_db(feats)
        elif config['compression'][0:4] == 'root':
            rootval = float(config['compression'].split('_')[1])
            feats = np.sign(feats) * (np.abs(feats)**(1 / rootval))
            if np.sum(np.isnan(feats)):
                print('NaN Error in root compression for file: %s' % infnm)
                exit()

        if del1_flag:
            feats_del1 = librosa.feature.delta(feats, order=1, axis=1)
        if del2_flag:
            feats_del2 = librosa.feature.delta(feats, order=2, axis=1)
        if del1_flag:
            feats = np.concatenate((feats, feats_del1), axis=0)
        if del2_flag:
            feats = np.concatenate((feats, feats_del2), axis=0)

        if mvn:
            feats = mvnormalize(feats, mvn_params, std_frac)

        writehtk(feats.T, frameshift, opfnm)

    fp.close()
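# --- Usage sketch (not part of the original snippet) ---
# Hypothetical config and filelist for get_gfb(). The keys mirror the ones
# read inside the function; the filelist is a text file of
# "input.wav,output.htk" lines. All paths and values here are illustrative.
config = {
    'framelen': '25',        # ms
    'frameshift': '10',      # ms
    'wintype': 'hamming',
    'nbanks': '40',
    'min_freq': '64',
    'mvn': 'TRUE',
    'std_frac': '1.0',
    'delta1': 'TRUE',
    'delta2': 'TRUE',
    'compression': 'root_3',
}
get_gfb('train_files.list', config)   # made-up filelist path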
def gammatone_bank(wav: NDVar, f_min: float, f_max: float, n: int,
                   integration_window: float = 0.010, tstep: float = None,
                   location: str = 'right', pad: bool = True,
                   name: str = None) -> NDVar:
    """Gammatone filterbank response

    Parameters
    ----------
    wav : NDVar
        Sound input.
    f_min : scalar
        Lower frequency cutoff.
    f_max : scalar
        Upper frequency cutoff.
    n : int
        Number of filter channels.
    integration_window : scalar
        Integration time window in seconds (default 10 ms).
    tstep : scalar
        Time step size in the output (default is same as ``wav``).
    location : str
        Location of the output relative to the input time axis:

        - ``right``: gammatone sample at end of integration window (default)
        - ``left``: gammatone sample at beginning of integration window
        - ``center``: gammatone sample at center of integration window

        Since the gammatone filter response depends on ``integration_window``,
        the filter response will be delayed relative to the analytic envelope.
        To ignore this delay, use ``location='left'``.
    pad : bool
        Pad output to match time axis of input.
    name : str
        NDVar name (default is ``wav.name``).

    Notes
    -----
    Requires the ``fmax`` branch of the gammatone library to be installed:

        $ pip install https://github.com/christianbrodbeck/gammatone/archive/fmax.zip
    """
    from gammatone.filters import centre_freqs
    from gammatone.gtgram import gtgram

    tmin = wav.time.tmin
    wav_ = wav
    if location == 'left':
        if pad:
            wav_ = _pad_func(wav, wav.time.tmin - integration_window)
    elif location == 'right':
        # tmin += window_time
        if pad:
            wav_ = _pad_func(wav, tstop=wav.time.tstop + integration_window)
    elif location == 'center':
        dt = integration_window / 2
        # tmin += dt
        if pad:
            wav_ = _pad_func(wav, wav.time.tmin - dt, wav.time.tstop + dt)
    else:
        raise ValueError(f"location={location!r}")
    sfreq = 1 / wav.time.tstep
    if tstep is None:
        tstep = wav.time.tstep
    x = gtgram(wav_.get_data('time'), sfreq, integration_window, tstep, n, f_min, f_max)
    freqs = centre_freqs(sfreq, n, f_min, f_max)
    # freqs = np.round(freqs, out=freqs).astype(int)
    freq_dim = Scalar('frequency', freqs[::-1], 'Hz')
    time_dim = UTS(tmin, tstep, x.shape[1])
    return NDVar(x, (freq_dim, time_dim), name or wav.name)
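# --- Usage sketch (not part of the original snippet) ---
# Hedged example: assumes the surrounding eelbrain-style module (NDVar, UTS,
# Scalar, _pad_func), an existing NDVar `wav` with a time axis, and the fmax
# branch of the gammatone package mentioned in the Notes above. Parameter
# values are illustrative only.
gt = gammatone_bank(wav, f_min=100, f_max=8000, n=32,
                    integration_window=0.010, tstep=0.010, location='left')
print(gt.shape)  # (32, n_time_bins)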
twin = 0.250
thop = twin / 2
channels = 16
fmin = 20

formatter = plot.ERBFormatter(fmin, rate * 3 / 4, unit='Hz', places=0)
axes.yaxis.set_major_formatter(formatter)

# Figure out time axis scaling
duration = len(sig) / rate

# Calculate 1:1 aspect ratio
aspect_ratio = duration / scipy.constants.golden

gtg = gtgram.gtgram(sig, rate, twin, thop, channels, fmin)
Z = np.flipud(20 * np.log10(gtg))
ipdb.set_trace()
img = axes.imshow(Z, extent=[0, duration, 1, 0], aspect=aspect_ratio)
matplotlib.pyplot.show()
ipdb.set_trace()
def perform_gammatone_spectrogram(audio_samples, sample_rate=44100, window_time=0.05,
                                  hop_time=0.025, channels=256, cutoff_low=20):
    return gtgram.gtgram(audio_samples, sample_rate, window_time, hop_time,
                         channels, cutoff_low)
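# --- Usage sketch (not part of the original snippet) ---
# Illustrative only: a 2-second synthetic tone generated with numpy stands in
# for real audio. The result has shape (channels, n_frames).
import numpy as np

sr = 44100
t = np.arange(2 * sr) / sr
tone = np.sin(2 * np.pi * 440 * t)          # synthetic 440 Hz tone
gram = perform_gammatone_spectrogram(tone, sample_rate=sr)
print(gram.shape)                           # (256, n_frames)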