Example #1
def worker(audio_dir: list, save_dir, process_i=0):
    def read_wav(audio_path):
        '''
        Read a wav file with soundfile.

        :param audio_path: path to the .wav file
        :return: (audio_data, sampleRate)
        '''
        audio_data, sampleRate = sf.read(audio_path)
        # print('audio :{0}'.format(audio_path))
        # print('sample rate :{0}'.format(sampleRate))
        # print('shape: {0}'.format(audio_data.shape))
        return audio_data, sampleRate

    for var in tqdm(audio_dir, desc='process {0}'.format(process_i)):
        audio_data, sr = read_wav(var)

        processed_sig = []
        # t=[]
        for i in range(audio_data.shape[1]):
            mean = np.mean(audio_data[:, i])
            std = np.std(audio_data[:, i])
            x = (audio_data[:, i] - mean) / std
            # t.append(x)
            # window size 0.0638s hop 0.0318 100 channels f_min 100Hz
            temp = gt.gtgram(x, sr, 0.0638, 0.0318, 100, 100)
            temp = librosa.amplitude_to_db(temp)
            processed_sig.append(temp)
        # t=np.asarray(t)
        processed_sig = np.asarray(processed_sig)

        feature_dict = {'gfcc': processed_sig, 'shape': processed_sig.shape}
        save_name = var.split('\\')[-1].split('.wav')[0] + '.gzip'
        with gzip.open(os.path.join(save_dir, save_name), 'wb') as f:
            pickle.dump(feature_dict, f)
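The process_i argument suggests worker is meant to run in parallel. A minimal driver sketch under that assumption (the glob pattern, output directory and process count are placeholders; worker and its imports from the example above are assumed to be in scope):

import glob
import multiprocessing as mp

if __name__ == '__main__':
    wav_files = glob.glob(r'data\wavs\*.wav')   # placeholder input pattern
    save_dir = r'data\gfcc'                     # placeholder output directory
    n_proc = 4
    # Round-robin split of the file list, one chunk per process
    chunks = [wav_files[i::n_proc] for i in range(n_proc)]
    jobs = [mp.Process(target=worker, args=(chunk, save_dir, i))
            for i, chunk in enumerate(chunks)]
    for job in jobs:
        job.start()
    for job in jobs:
        job.join()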
Example #2
def load_sound_stims(files,
                     root="",
                     windowtime=0.016,
                     ovl=0.0016,
                     f_min=500,
                     f_max=8000,
                     gammatone=False,
                     dsample=10,
                     sres=15,
                     compress=0):
    stims = []
    durations = []

    for f in files:
        Fs, wave = read(root + f)
        duration = int(1000 * len(wave) / Fs)
        durations.append(duration)
        if gammatone:
            Pxx = gg.gtgram(wave, Fs, windowtime, ovl, sres, f_min)
            Pxx = np.log10(Pxx)

        else:
            w = np.hanning(int(windowtime * Fs))
            Pxx = libtfr.stft(wave, w, int(w.size * .1))
            freqs, ind = libtfr.fgrid(Fs, w.size, [f_min, f_max])
            Pxx = Pxx[ind, :]
            Pxx = np.log10(Pxx + compress)
            Pxx = resample(Pxx, sres)
        Pxx = resample(Pxx, int(duration / dsample), axis=1)  # resample target must be an int
        stims.append(Pxx)
    return stims, durations
Example #3
    def _function(self, recording):
        gram = gtgram.gtgram(recording,
                             fs=self.fs,
                             window_time=(self.window_time / self.fs),
                             hop_time=(self.hop_time / self.fs),
                             channels=self.num_banks,
                             f_min=20).T
        return gram
Example #4
    def make_spectrogram(self, audio_samples):
        gtg = gtgram.gtgram(audio_samples,
                            self.sample_rate,
                            self.window_time,
                            self.hop_time,
                            self.num_filters,
                            self.cutoff_low)
        return gtg
Example #5
def make_spect(filepath, method='fourier', height=60, interval=1, verbose=False, max_len=1080):
    """
    Turns a file containing sound data into a matrix for processing. Two methods are supported:
    Fourier spectrum analysis, which returns a spectrogram, and gammatone analysis, which returns
    a gammatone spectrogram (gammatonegram). Gammatonegrams take much longer to compute but are
    ostensibly better for feature analysis, and are smaller; Fourier spectrograms are larger but
    quick to create.

    :param str filepath: path to the audio file

    :param str method: 'fourier' or 'gamma'

    :param num max_len: the maximum length in seconds of a song to convert; important for memory
    management. Default is 1080 seconds (18 minutes).

    :param int height: for gammatone analysis, how many frequency channels to use. Default 60.

    :param int interval: for gammatone analysis, the width in seconds of the time bins. Default 1.

    :param bool verbose: if True, show a plot of the returned 'gram.

    :return: np.array
    a matrix representing (in decibels) the completed analysis, or None if the file could not be
    read or is longer than max_len.
    """
    try:
        data, sr = sf.read(filepath)
    except RuntimeError:
        return None

    if len(data) // sr > max_len:
        return None

    if verbose:
        plt.figure()

    if method == 'fourier':
        f, t, sxx = signal.spectrogram(data[:, 0], sr)
        del data

        if verbose:
            plt.pcolormesh(t, f, 10 * np.log10(sxx))
            plt.ylabel('Frequency [Hz]')
            plt.xlabel('Time [sec]')
            plt.show()
    elif method == 'gamma':
        sxx = gt.gtgram(data[:, 0], sr, interval, interval, height, 20)
        del data
        if verbose:
            plt.pcolormesh(10 * np.log10(sxx))
            plt.show()
    else:
        raise ValueError(f'{method} is not a valid method.')
    with np.testing.suppress_warnings() as sup:
        sup.filter(RuntimeWarning)
        # log10 warns when it hits zeros (they become -inf) and I find that obnoxious.
        return 10 * np.log10(sxx)
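A hypothetical call of make_spect, assuming the imports used above (soundfile as sf, scipy.signal as signal, gammatone.gtgram as gt, matplotlib.pyplot as plt, numpy as np) are in scope and 'track.wav' is a placeholder stereo file:

song_path = 'track.wav'                                   # placeholder path
spect = make_spect(song_path, method='fourier')           # dB spectrogram, or None on failure
gamma = make_spect(song_path, method='gamma', height=60, interval=1)
if spect is not None:
    print('fourier:', spect.shape)                        # (freq_bins, time_bins)
if gamma is not None:
    print('gamma:', gamma.shape)                          # (height, time_bins)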
Example #6
def load_stimulus(path,
                  window,
                  step,
                  f_min=0.5,
                  f_max=8.0,
                  f_count=30,
                  compress=1,
                  gammatone=False,
                  **kwargs):
    """Load sound stimulus and calculate spectrotemporal representation.

    Parameters:

    path: location of wave file
    window: duration of window (in ms)
    step: window step (in ms)
    f_min: minimum frequency (in kHz)
    f_max: maximum frequency (in kHz)
    f_count: number of frequency bins
    gammatone: if True, use gammatone filterbank

    Returns spectrogram, duration (ms)
    """
    import ewave
    fp = ewave.open(path, "r")
    Fs = fp.sampling_rate / 1000.
    osc = ewave.rescale(fp.read(), 'h')
    if gammatone:
        import gammatone.gtgram as gg
        Pxx = gg.gtgram(osc, Fs * 1000, window / 1000, step / 1000, f_count,
                        f_min * 1000)
    else:
        import libtfr
        # nfft based on desired number of channels btw f_min and f_max
        nfft = int(f_count / (f_max - f_min) * Fs)
        npoints = int(Fs * window)
        if nfft < npoints:
            raise ValueError(
                "window size {} ms ({} points) too large for desired freq resolution {}. "
                "Decrease to {} ms or increase f_count.".format(
                    window, npoints, f_count, nfft / Fs))

        nstep = int(Fs * step)
        taper = np.hanning(npoints)
        mfft = libtfr.mfft_precalc(nfft, taper)
        Pxx = mfft.mtspec(osc, nstep)
        freqs, ind = libtfr.fgrid(Fs, nfft, [f_min, f_max])
        Pxx = Pxx[ind, :]
    if compress is not None:
        Pxx = np.log10(Pxx + compress) - np.log10(compress)
    return Pxx, Pxx.shape[1] * step
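Note the unit conventions: window and step are in milliseconds while f_min and f_max are in kHz. A hypothetical call (the path is a placeholder; ewave plus either gammatone or libtfr must be installed):

spec, duration_ms = load_stimulus("stim.wav", window=20, step=10,
                                  f_min=0.5, f_max=8.0, f_count=30,
                                  gammatone=True)
print(spec.shape)          # (f_count, n_frames) with the gammatone filterbank
print(duration_ms)         # n_frames * step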
Example #7
def save_gammatone(list_audio, output_dir, fs):
    """
    Save a list of chunks audio as gammagram
    :param list_audio: list of audio chunks
    :param output_dir: path of the output_dir
    :param fs: sampling rate
    :return:
    """

    for i, f in enumerate(list_audio):
        waveform = gtgram(f,
                          fs,
                          window_time=0.04,
                          hop_time=0.02,
                          channels=128,
                          f_min=120)
        wave_tmp = np.expand_dims(waveform, axis=0)
        filename = os.path.join(output_dir, f"chunk_{i}.npy")
        np.save(filename, wave_tmp)
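A sketch of how save_gammatone might be driven; the 1 s non-overlapping chunking, the file names and the mono input are assumptions, not part of the original example:

import os
import numpy as np
import soundfile as sf

audio, fs = sf.read('recording.wav')     # placeholder mono input file
chunk_len = fs                           # 1-second chunks (assumption)
chunks = [audio[i:i + chunk_len]
          for i in range(0, len(audio) - chunk_len + 1, chunk_len)]
out_dir = 'gammagrams'                   # placeholder output directory
os.makedirs(out_dir, exist_ok=True)
save_gammatone(chunks, out_dir, fs)      # writes chunk_0.npy, chunk_1.npy, ...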
Example #8
    f_size = fd * fs

    (rate, sig) = wav.read(new_file_name_path)
    x_brahms, sr_brahms = librosa.load(file, duration=30, offset=30)

    mfcc_feat = mfcc(sig, samplerate=rate)  #(2992, 13)

    ipdb.set_trace()

    #mfcc_one_line = mfcc_feat.reshape(38896, 1)
    fbank_feat = fbank(sig, samplerate=rate)
    logfbank_feat = logfbank(sig, samplerate=rate)
    d_mfcc_feat = delta(mfcc_feat, 2)
    #gammatone.gtgram.gtgram(wave, fs, window_time, hop_time, channels, f_min)

    gtgram_function = gtgram.gtgram(sig, rate, .250, .125, 1, 20)

    print("mfcc_feat.shape:", mfcc_feat.shape)
    #print("mfcc_one_line.shape", mfcc_one_line.shape)
    print("logfbank_feat.shape", logfbank_feat.shape)
    print("d_mfcc_feat.shape", d_mfcc_feat.shape)
    print("gtgram_function.shape", gtgram_function.shape)
    print("gtgram_function.shape.T", gtgram_function.T.shape)
    #ssc = ssc(sig,samplerate=rate)

    #print(logfbank_feat[1:3,:])

    #gammatone.filters.centre_freqs(fs, num_freqs, cutoff)
    #centre_freqs = filters.#centre_freqs(rate, sig.shape[0], 100)
    """
	Renders the given ``duration`` of audio from the audio file at ``path``
Example #9
def start_process():
	files = [os.path.join(AUDIOS_PATH+"complete/",fn) for fn in os.listdir(AUDIOS_PATH+"complete/") if fn.endswith('.mp3')]
	for file in files:
		filename = file.split("/")[-1].split(".")[:-1][0]

		# Set up the plot
		fig = matplotlib.pyplot.figure()
		axes = fig.add_axes([0.1, 0.1, 0.8, 0.8])

		new_file_name_path = AUDIOS_PATH+"cuts/30s_cuts/"+filename+".wav"
		dataset.cut_30s_from_file(filename, file, AUDIOS_PATH+"cuts/")
		#track_30s = AudioSegment.from_wav(new_file_name_path)
		#play(track_30s)
		#aa,bb,cc,dd, plt = get_spectrogram(new_file_name_path)
		#matplotlib.pyplot.show()
		#ipdb.set_trace()

		(rate,sig) = wav.read(new_file_name_path)
		
		
		#fbank_feat = fbank(sig,samplerate=rate)

			# Average the stereo signal
		duration = False
		if duration:
			nframes = duration * rate
			sig = sig[0:nframes, :]

		#signal = sig.mean()
	 
		# Default gammatone-based spectrogram parameters	
		twin = 0.250
		thop = twin/2
		channels = 8
		fmin = 20


		formatter = plot.ERBFormatter(fmin, rate/2, unit='Hz', places=0)
		axes.yaxis.set_major_formatter(formatter)

		# Figure out time axis scaling
		duration = len(sig) / rate

		# Calculate 1:1 aspect ratio
		aspect_ratio = duration/scipy.constants.golden

		gtg = gtgram.gtgram(sig, rate, twin, thop, channels, fmin)

		Z = np.flipud(20 * np.log10(gtg))
		z_reshaped = Z.reshape(Z.size, 1)
		img = axes.imshow(Z, extent=[0, duration, 1, 0], aspect=aspect_ratio)

		matplotlib.pyplot.show()

		ipdb.set_trace()

		fig = matplotlib.pyplot.figure()
		axes = fig.add_axes([0.1, 0.1, 0.8, 0.8])
		# Default gammatone-based spectrogram parameters	
		twin = 0.250
		thop = twin/2
		channels = 16
		fmin = 20


		formatter = plot.ERBFormatter(fmin, rate*3/4, unit='Hz', places=0)
		axes.yaxis.set_major_formatter(formatter)

		# Figure out time axis scaling
		duration = len(sig) / rate

		# Calculate 1:1 aspect ratio
		aspect_ratio = duration/scipy.constants.golden

		gtg = gtgram.gtgram(sig, rate, twin, thop, channels, fmin)

		Z = np.flipud(20 * np.log10(gtg))


		img = axes.imshow(Z, extent=[0, duration, 1, 0], aspect=aspect_ratio)

		matplotlib.pyplot.show()

		ipdb.set_trace()

		mfcc_feat = mfcc(sig,samplerate=rate, winlen=twin,winstep=thop)
Example #10
def get_gtg(rate, signal, twin, thop, channels, fmin):
    gtg = gtgram.gtgram(signal, rate, twin, thop, channels, fmin)
    Z = np.flipud(20 * np.log10(gtg))
    return Z
Example #11
def get_gfb(filelist, config):
    # Read the filelist
    fp = open(filelist, 'r')
    flist = fp.read().splitlines()
    # list() so the filelist can be indexed and iterated more than once
    flist = list(filter(None, flist))
    # Create output directory if non-existant
    opdir = os.path.dirname(flist[0].split(',')[1])
    if not os.path.exists(opdir):
        os.makedirs(opdir)
    # Read the relevant configs from the configfile
    framelen = float(config['framelen'])
    frameshift = float(config['frameshift'])
    wintype = config['wintype']
    if wintype == 'rectangular':
        winfun = np.ones
    else:
        winfun = getattr(np, wintype)
    # Number of channels for gammatone filterbank
    if 'nbanks' in config:
        nbanks = int(config['nbanks'])
    else:
        raise ConfigError('nbanks parameter not set in config file')
    # Min frequency of Gammatone filterbank
    if 'min_freq' in config:
        min_freq = float(config['min_freq'])
    else:
        min_freq = 0
    mvn = config['mvn']
    mvn = mvn.upper() == 'TRUE'
    if 'std_frac' in config:
        std_frac = float(config['std_frac'])
    else:
        std_frac = 1.0
    del1_flag = config['delta1']
    del2_flag = config['delta2']
    del1_flag = del1_flag.upper() == 'TRUE'
    del2_flag = del2_flag.upper() == 'TRUE'
    # Iterate over the filelist to extract features
    if mvn:
        feats_list = []
        for iter1, fline in enumerate(flist):
            infnm = fline.split(',')[0]
            opfnm = fline.split(',')[1]
            sig, fs = librosa.load(infnm, sr=None)
            sig = sig / max(abs(sig))
            dither = 1e-6 * np.random.rand(sig.shape[0])
            sig = sig + dither
            win_length = int(fs * framelen * 0.001)
            hop_length = int(fs * frameshift * 0.001)
            feats = gtgram.gtgram(sig, fs, framelen * 0.001,
                                  frameshift * 0.001, nbanks, min_freq)
            # Code for amplitude range compression
            if config['compression'] == 'log':
                feats = librosa.logamplitude(feats)
            elif config['compression'][0:4] == 'root':
                rootval = float(config['compression'].split('_')[1])
                feats = np.sign(feats) * (np.abs(feats)**(1 / rootval))
                if np.sum(np.isnan(feats)):
                    print('NaN Error in root compression for file: %s' % infnm)
                    exit()
            if del1_flag:
                feats_del1 = librosa.feature.delta(feats, order=1, axis=1)
            if del2_flag:
                feats_del2 = librosa.feature.delta(feats, order=2, axis=1)
            if del1_flag:
                feats = np.concatenate((feats, feats_del1), axis=0)
            if del2_flag:
                feats = np.concatenate((feats, feats_del2), axis=0)

            feats_list.append(feats)
        all_feats = np.concatenate(feats_list, axis=1)
        f_mean = np.mean(all_feats, axis=1)[:, None]
        f_std = np.std(all_feats, axis=1)[:, None]
        opdir = os.path.dirname(opfnm)
        mvn_params = np.concatenate((f_mean, f_std), axis=1)
        postfix = os.path.basename(filelist).split('.')[0]
        np.save(opdir + '/mvn_params_' + postfix + '.npy', mvn_params)

    for iter1, fline in enumerate(flist):
        infnm = fline.split(',')[0]
        opfnm = fline.split(',')[1]
        sig, fs = librosa.load(infnm, sr=None)
        sig = sig / max(abs(sig))
        dither = 1e-6 * np.random.rand(sig.shape[0])
        sig = sig + dither
        win_length = int(fs * framelen * 0.001)
        hop_length = int(fs * frameshift * 0.001)
        feats = gtgram.gtgram(sig, fs, framelen * 0.001, frameshift * 0.001,
                              nbanks, min_freq)
        if config['compression'] == 'log':
            feats = librosa.logamplitude(feats)
        elif config['compression'][0:4] == 'root':
            rootval = float(config['compression'].split('_')[1])
            feats = np.sign(feats) * (np.abs(feats)**(1 / rootval))
            if np.sum(np.isnan(feats)):
                print('NaN Error in root compression for file: %s' % infnm)
                exit()
        if del1_flag:
            feats_del1 = librosa.feature.delta(feats, order=1, axis=1)
        if del2_flag:
            feats_del2 = librosa.feature.delta(feats, order=2, axis=1)
        if del1_flag:
            feats = np.concatenate((feats, feats_del1), axis=0)
        if del2_flag:
            feats = np.concatenate((feats, feats_del2), axis=0)
        if mvn:
            feats = mvnormalize(feats, mvn_params, std_frac)
        writehtk(feats.T, frameshift, opfnm)
    fp.close()
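get_gfb expects a filelist of comma-separated "input,output" pairs and a config mapping whose keys it reads above. A hypothetical config covering every key the function touches (values are illustrative, and the mvnormalize and writehtk helpers it calls are assumed to be defined elsewhere in the module):

config = {
    'framelen': '25',        # frame length in ms
    'frameshift': '10',      # frame shift in ms
    'wintype': 'hanning',    # any numpy window function name, or 'rectangular'
    'nbanks': '40',          # number of gammatone channels (required)
    'min_freq': '64',        # filterbank lower cutoff in Hz
    'mvn': 'TRUE',           # mean/variance normalisation over the whole filelist
    'std_frac': '1.0',
    'delta1': 'TRUE',
    'delta2': 'FALSE',
    'compression': 'log',    # or e.g. 'root_3'
}
get_gfb('filelist.csv', config)   # placeholder filelist path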
Example #12
def gammatone_bank(wav: NDVar,
                   f_min: float,
                   f_max: float,
                   n: int,
                   integration_window: float = 0.010,
                   tstep: float = None,
                   location: str = 'right',
                   pad: bool = True,
                   name: str = None) -> NDVar:
    """Gammatone filterbank response

    Parameters
    ----------
    wav : NDVar
        Sound input.
    f_min : scalar
        Lower frequency cutoff.
    f_max : scalar
        Upper frequency cutoff.
    n : int
        Number of filter channels.
    integration_window : scalar
        Integration time window in seconds (default 10 ms).
    tstep : scalar
        Time step size in the output (default is same as ``wav``).
    location : str
        Location of the output relative to the input time axis:

        - ``right``: gammatone sample at end of integration window (default)
        - ``left``: gammatone sample at beginning of integration window
        - ``center``: gammatone sample at center of integration window

        Since the gammatone filter response depends on ``integration_window``, the
        filter response will be delayed relative to the analytic envelope. To
        ignore this delay, use ``location='left'``.
    pad : bool
        Pad output to match time axis of input.
    name : str
        NDVar name (default is ``wav.name``).

    Notes
    -----
    Requires the ``fmax`` branch of the gammatone library to be installed:

        $ pip install https://github.com/christianbrodbeck/gammatone/archive/fmax.zip
    """
    from gammatone.filters import centre_freqs
    from gammatone.gtgram import gtgram

    tmin = wav.time.tmin
    wav_ = wav
    if location == 'left':
        if pad:
            wav_ = _pad_func(wav, wav.time.tmin - integration_window)
    elif location == 'right':
        # tmin += window_time
        if pad:
            wav_ = _pad_func(wav, tstop=wav.time.tstop + integration_window)
    elif location == 'center':
        dt = integration_window / 2
        # tmin += dt
        if pad:
            wav_ = _pad_func(wav, wav.time.tmin - dt, wav.time.tstop + dt)
    else:
        raise ValueError(f"mode={location!r}")
    sfreq = 1 / wav.time.tstep
    if tstep is None:
        tstep = wav.time.tstep
    x = gtgram(wav_.get_data('time'), sfreq, integration_window, tstep, n,
               f_min, f_max)
    freqs = centre_freqs(sfreq, n, f_min, f_max)
    # freqs = np.round(freqs, out=freqs).astype(int)
    freq_dim = Scalar('frequency', freqs[::-1], 'Hz')
    time_dim = UTS(tmin, tstep, x.shape[1])
    return NDVar(x, (freq_dim, time_dim), name or wav.name)
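A hypothetical call of gammatone_bank, assuming eelbrain is installed (it provides load.wav, NDVar, UTS and Scalar used above), the fmax branch of gammatone is installed as noted in the docstring, and the private _pad_func helper referenced above is defined in this module; 'stimulus.wav' is a placeholder:

from eelbrain import load

wav = load.wav('stimulus.wav')                  # NDVar with a 'time' dimension
gt = gammatone_bank(wav, f_min=80, f_max=8000, n=32,
                    integration_window=0.010, tstep=0.010,
                    location='left')
print(gt.dimnames, gt.shape)                    # ('frequency', 'time'), (32, n_steps)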
Example #13
	twin = 0.250
	thop = twin/2
	channels = 16
	fmin = 20

	formatter = plot.ERBFormatter(fmin, rate*3/4, unit='Hz', places=0)
	axes.yaxis.set_major_formatter(formatter)

	# Figure out time axis scaling
	duration = len(sig) / rate

	# Calculate 1:1 aspect ratio
	aspect_ratio = duration/scipy.constants.golden

	gtg = gtgram.gtgram(sig, rate, twin, thop, channels, fmin)
	Z = np.flipud(20 * np.log10(gtg))

	ipdb.set_trace()

	img = axes.imshow(Z, extent=[0, duration, 1, 0], aspect=aspect_ratio)

	matplotlib.pyplot.show()

	ipdb.set_trace()
Example #14
def perform_gammatone_spectrogram(audio_samples, sample_rate=44100, window_time=0.05,
                                  hop_time=0.025, channels=256, cutoff_low=20):
    return gtgram.gtgram(audio_samples, sample_rate, window_time, hop_time, channels, cutoff_low)
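A quick sketch of the output dimensions using a synthetic signal (numpy assumed to be imported as np):

import numpy as np

sr = 44100
samples = np.random.randn(sr * 2)         # 2 s of noise as a stand-in signal
gtg = perform_gammatone_spectrogram(samples, sample_rate=sr,
                                    window_time=0.05, hop_time=0.025,
                                    channels=256, cutoff_low=20)
# Rows are gammatone channels, columns are 25 ms hops:
print(gtg.shape)                          # (256, n_frames), n_frames ~ (2 - 0.05) / 0.025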