Example #1
File: main.py Project: beckgom/msaf
def features(filename):
    # print '\t[1/5] loading audio'
    y, sr = librosa.load(filename, sr=SR)

    # print '\t[2/5] Separating harmonic and percussive signals'
    y_perc, y_harm = hp_sep(y)

    # print '\t[3/5] detecting beats'
    bpm, beats = get_beats(y=y_perc, sr=sr, hop_length=HOP_LENGTH)

    # print '\t[4/5] generating CQT'
    M1 = np.abs(
        librosa.cqt(y=y_harm, sr=sr, hop_length=HOP_LENGTH, bins_per_octave=12, fmin=librosa.midi_to_hz(24), n_bins=72)
    )

    M1 = librosa.logamplitude(M1 ** 2.0, ref_power=np.max)

    # print '\t[5/5] generating MFCC'
    S = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=HOP_LENGTH, n_mels=N_MELS)
    M2 = librosa.feature.mfcc(S=librosa.logamplitude(S), n_mfcc=N_MFCC)

    n = min(M1.shape[1], M2.shape[1])

    beats = beats[beats < n]

    beats = np.unique(np.concatenate([[0], beats]))

    times = librosa.frames_to_time(beats, sr=sr, hop_length=HOP_LENGTH)

    times = np.concatenate([times, [float(len(y)) / sr]])
    M1 = librosa.feature.sync(M1, beats, aggregate=np.median)
    M2 = librosa.feature.sync(M2, beats, aggregate=np.mean)
    return (M1, M2), times
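Note: librosa.logamplitude, used throughout these examples, was deprecated in librosa 0.5 and removed in 0.6 (its replacements are librosa.power_to_db and librosa.amplitude_to_db), and librosa.feature.sync moved to librosa.util.sync. A minimal sketch of the log-CQT and beat-sync steps above under the current API, simplified to beat-track the full mix rather than the percussive part; the sr and hop_length defaults stand in for the module constants SR and HOP_LENGTH:

import numpy as np
import librosa

def log_cqt_beat_sync(filename, sr=22050, hop_length=512):
    y, _ = librosa.load(filename, sr=sr)
    C = np.abs(librosa.cqt(y=y, sr=sr, hop_length=hop_length,
                           fmin=librosa.midi_to_hz(24), n_bins=72))
    # power_to_db(..., ref=np.max) replaces logamplitude(..., ref_power=np.max)
    C_db = librosa.power_to_db(C ** 2, ref=np.max)
    _, beats = librosa.beat.beat_track(y=y, sr=sr, hop_length=hop_length)
    # feature.sync became util.sync
    return librosa.util.sync(C_db, beats, aggregate=np.median)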
Example #2
    def __test(y, top_db, ref, trim_duration):
        yt, idx = librosa.effects.trim(y, top_db=top_db,
                                       ref=ref)

        # Test for index position
        fidx = [slice(None)] * y.ndim
        fidx[-1] = slice(*idx.tolist())
        assert np.allclose(yt, y[fidx])

        # Verify logamp
        rms = librosa.feature.rmse(y=librosa.to_mono(yt), center=False)
        logamp = librosa.logamplitude(rms**2, ref=ref, top_db=None)
        assert np.all(logamp > - top_db)

        # Verify logamp
        rms_all = librosa.feature.rmse(y=librosa.to_mono(y)).squeeze()
        logamp_all = librosa.logamplitude(rms_all**2, ref=ref,
                                          top_db=None)

        start = int(librosa.samples_to_frames(idx[0]))
        stop = int(librosa.samples_to_frames(idx[1]))
        assert np.all(logamp_all[:start] <= - top_db)
        assert np.all(logamp_all[stop:] <= - top_db)

        # Verify duration
        duration = librosa.get_duration(yt)
        assert np.allclose(duration, trim_duration, atol=1e-1), duration
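This test leans on two more renamed APIs: librosa.feature.rmse later became librosa.feature.rms, and logamplitude became power_to_db. A sketch of the RMS threshold check under the current names:

import numpy as np
import librosa

def trimmed_frames_above_threshold(y, top_db=60, ref=np.max):
    yt, idx = librosa.effects.trim(y, top_db=top_db, ref=ref)
    # feature.rmse -> feature.rms; logamplitude -> power_to_db
    rms = librosa.feature.rms(y=librosa.to_mono(yt), center=False)
    db = librosa.power_to_db(rms ** 2, ref=ref, top_db=None)
    return np.all(db > -top_db)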
Example #3
def compute_features(audio, y_harmonic):
    """Computes the HPCP and MFCC features.

    Parameters
    ----------
    audio: np.array(N)
        Audio samples of the given input.
    y_harmonic: np.array(N)
        Harmonic part of the audio signal, in samples.

    Returns
    -------
    mfcc: np.array(N, msaf.Anal.mfcc_coeff)
        Mel-frequency Cepstral Coefficients.
    hpcp: np.array(N, 12)
        Pitch Class Profiles.
    tonnetz: np.array(N, 6)
        Tonal Centroid features.
    cqt: np.array(N, msaf.Anal.cqt_bins)
        Constant-Q log-scale features.
    tempogram: np.array(N, 192)
        Tempogram features.
    """
    logging.info("Computing Spectrogram...")
    S = librosa.feature.melspectrogram(audio,
                                       sr=msaf.Anal.sample_rate,
                                       n_fft=msaf.Anal.frame_size,
                                       hop_length=msaf.Anal.hop_size,
                                       n_mels=msaf.Anal.n_mels)

    logging.info("Computing Constant-Q...")
    cqt = librosa.logamplitude(np.abs(
        librosa.cqt(audio,
                    sr=msaf.Anal.sample_rate,
                    hop_length=msaf.Anal.hop_size,
                    n_bins=msaf.Anal.cqt_bins,
                    real=False)) ** 2,
        ref_power=np.max).T

    logging.info("Computing MFCCs...")
    log_S = librosa.logamplitude(S, ref_power=np.max)
    mfcc = librosa.feature.mfcc(S=log_S, n_mfcc=msaf.Anal.mfcc_coeff).T

    logging.info("Computing HPCPs...")
    hpcp = librosa.feature.chroma_cqt(y=y_harmonic,
                                      sr=msaf.Anal.sample_rate,
                                      hop_length=msaf.Anal.hop_size,
                                      n_octaves=msaf.Anal.n_octaves,
                                      fmin=msaf.Anal.f_min).T

    logging.info("Computing Tonnetz...")
    tonnetz = utils.chroma_to_tonnetz(hpcp)
    logging.info("Computing Tempogram...")
    tempogram = librosa.feature.tempogram(audio,
                                      sr=msaf.Anal.sample_rate,
                                      hop_length=msaf.Anal.hop_size,
                                      win_length=192).T
    return mfcc, hpcp, tonnetz, cqt, tempogram
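A side note on the Tonnetz step: librosa ships its own tonal-centroid transform, so the msaf utils.chroma_to_tonnetz helper could plausibly be swapped for librosa.feature.tonnetz, which accepts a chroma matrix directly. A sketch, with the transposes matching the row-per-frame layout of hpcp above:

import librosa

# hpcp has shape (n_frames, 12) here; librosa expects (12, n_frames)
tonnetz = librosa.feature.tonnetz(chroma=hpcp.T).T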
Example #4
def do_cqt(src, track_id):
	SRC_cqt_L = librosa.logamplitude(librosa.cqt(src[0,:], sr=CQT_CONST["sr"], 
									 hop_length=CQT_CONST["hop_len"], 
		                             bins_per_octave=CQT_CONST["bins_per_octave"], 
		                             n_bins=CQT_CONST["n_bins"])**2, ref_power=1.0)
	SRC_cqt_R = librosa.logamplitude(librosa.cqt(src[1,:], sr=CQT_CONST["sr"], 
									 hop_length=CQT_CONST["hop_len"], 
		                             bins_per_octave=CQT_CONST["bins_per_octave"], 
		                             n_bins=CQT_CONST["n_bins"])**2, ref_power=1.0)
	np.save(PATH_CQT + str(track_id) + '.npy', np.dstack((SRC_cqt_L, SRC_cqt_R)))
	print "Done: %s" % str(track_id)
Example #5
def process_one_file(audio_file, midi_file, output_midi_file, pair_file,
                     diagnostics_file):
    """
    Wrapper routine for loading in audio/MIDI data, aligning, and writing
    out the result.

    Parameters
    ----------
    audio_file, midi_file, output_midi_file, pair_file, diagnostics_file : str
        Paths to the audio file to align, MIDI file to align, and paths where
        to write the aligned MIDI, the synthesized pair file, and the DTW
        diagnostics file.
    """
    # Load in the audio data
    audio_data, _ = librosa.load(audio_file, sr=create_data.FS)
    # Compute the log-magnitude CQT of the data
    audio_cqt, audio_times = create_data.extract_cqt(audio_data)
    audio_cqt = librosa.logamplitude(audio_cqt, ref_power=audio_cqt.max()).T
    # Load and synthesize MIDI data
    midi_object = pretty_midi.PrettyMIDI(midi_file)
    midi_audio = midi_object.fluidsynth(fs=create_data.FS)
    # Compute log-magnitude CQT
    midi_cqt, midi_times = create_data.extract_cqt(midi_audio)
    midi_cqt = librosa.logamplitude(midi_cqt, ref_power=midi_cqt.max()).T
    # Compute cosine distance matrix
    distance_matrix = scipy.spatial.distance.cdist(
        midi_cqt, audio_cqt, 'cosine')
    # Get lowest cost path
    p, q, score = djitw.dtw(
        distance_matrix, GULLY, np.median(distance_matrix), inplace=False)
    # Normalize by path length
    score = score/len(p)
    # Normalize by distance matrix submatrix within path
    score = score/distance_matrix[p.min():p.max(), q.min():q.max()].mean()
    # Adjust the MIDI file
    midi_object.adjust_times(midi_times[p], audio_times[q])
    # Write the result
    midi_object.write(output_midi_file)
    # Synthesize aligned MIDI
    midi_audio_aligned = midi_object.fluidsynth(fs=create_data.FS)
    # Adjust to the same size as audio
    if midi_audio_aligned.shape[0] > audio_data.shape[0]:
        midi_audio_aligned = midi_audio_aligned[:audio_data.shape[0]]
    else:
        trim_amount = audio_data.shape[0] - midi_audio_aligned.shape[0]
        midi_audio_aligned = np.append(midi_audio_aligned,
                                       np.zeros(trim_amount))
    # Stack one in each channel
    librosa.output.write_wav(
        pair_file, np.array([midi_audio_aligned, audio_data]), create_data.FS)
    # Write out diagnostics
    with open(diagnostics_file, 'wb') as f:
        json.dump({'p': list(p), 'q': list(q), 'score': score}, f)
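Two portability notes on this example: librosa.output.write_wav was removed in librosa 0.8 (the soundfile package is the usual replacement, with samples in rows and channels in columns), and opening the JSON file in 'wb' mode only works under Python 2. A sketch of the two writing steps:

import json
import numpy as np
import soundfile as sf

# Aligned synthesis in one channel, original audio in the other
stereo = np.stack([midi_audio_aligned, audio_data], axis=-1)
sf.write(pair_file, stereo, create_data.FS)

# Text mode for json under Python 3; tolist()/float() avoid
# non-serializable numpy scalars
with open(diagnostics_file, 'w') as f:
    json.dump({'p': p.tolist(), 'q': q.tolist(), 'score': float(score)}, f)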
Example #6
def do_HPS_on_CQT(CQT, track_id):
	'''HPS on CQT
		input CQT: log-amplitude.
	'''
	
	CQT = 10**(0.05*CQT) # log_am --> linear (with ref_power=1.0)
	ret_H = np.zeros(CQT.shape)
	ret_P = np.zeros(CQT.shape)
	for depth_cqt in xrange(CQT.shape[2]):
		ret_H[:,:,depth_cqt], ret_P[:,:,depth_cqt] = librosa.decompose.hpss(CQT[:,:,depth_cqt])
	np.save(PATH_CQT_H+str(track_id)+'.npy', librosa.logamplitude(ret_H))
	np.save(PATH_CQT_P+str(track_id)+'.npy', librosa.logamplitude(ret_P))
	print "Done: %d, HPS for CQT " % track_id
Example #7
def process_audio(infile):

    y, sr = librosa.load(infile, sr=SR)

    # 1. Compute magnitude spectrogram
    D = np.abs(librosa.stft(y, n_fft=N_FFT, hop_length=HOP))

    # 2. Compute HPSS
    Harm, Perc = hpss(y)

    # 3. Compute RPCA
    Lowrank, Sparse, _ = rpca.robust_pca(D, max_iter=RPCA_MAX_ITER)

    Lowrank = np.maximum(0.0, Lowrank)
    Sparse  = np.maximum(0.0, Sparse)

    D = np.abs(D)**2
    Harm = np.abs(Harm)**2
    Perc = np.abs(Perc)**2
    Lowrank = np.abs(Lowrank)**2
    Sparse = np.abs(Sparse)**2

    S       = librosa.feature.melspectrogram(S=librosa.logamplitude(D, ref_power=D.max()), 
                                             sr=sr,
                                             n_mels=N_MELS,
                                             fmax=FMAX)

    Harm       = librosa.feature.melspectrogram(S=librosa.logamplitude(Harm, ref_power=Harm.max()), 
                                             sr=sr,
                                             n_mels=N_MELS,
                                             fmax=FMAX)

    Perc       = librosa.feature.melspectrogram(S=librosa.logamplitude(Perc, ref_power=Perc.max()), 
                                             sr=sr,
                                             n_mels=N_MELS,
                                             fmax=FMAX)

    Lowrank       = librosa.feature.melspectrogram(S=librosa.logamplitude(Lowrank, ref_power=Lowrank.max()), 
                                             sr=sr,
                                             n_mels=N_MELS,
                                             fmax=FMAX)

    Sparse       = librosa.feature.melspectrogram(S=librosa.logamplitude(Sparse, ref_power=Sparse.max()), 
                                             sr=sr,
                                             n_mels=N_MELS,
                                             fmax=FMAX)

    return S, Harm, Perc, Lowrank, Sparse
Example #8
    def compute_features(self):
        """Actual implementation of the features.

        Returns
        -------
        cqt: np.array(N, F)
            The features, each row representing a feature vector for a given
            time frame/beat.
        """
        linear_cqt = (
            np.abs(
                librosa.cqt(
                    self._audio,
                    sr=self.sr,
                    hop_length=self.hop_length,
                    n_bins=self.n_bins,
                    norm=self.norm,
                    filter_scale=self.filter_scale,
                    real=False,
                )
            )
            ** 2
        )
        cqt = librosa.logamplitude(linear_cqt, ref_power=self.ref_power).T
        return cqt
Example #9
def compute_melgram(audio_path):
	''' Compute a mel-spectrogram and return it with shape (1, 1, 96, 1366), where
	96 == #mel-bins and 1366 == #time frames

	parameters
	----------
	audio_path: path for the audio file. Any format supported by audioread will work.
	More info: http://librosa.github.io/librosa/generated/librosa.core.load.html#librosa.core.load

	'''

	# mel-spectrogram parameters
	SR = 12000
	N_FFT = 512
	N_MELS = 96
	HOP_LEN = 256
	DURA = 29.12 # to make it exactly 1366 frames

	src, sr = librosa.load(audio_path, sr=SR) #whole signal
	n_sample = src.shape[0]
	n_sample_fit = int(DURA*SR)

	if n_sample < n_sample_fit: # if too short
		src = np.hstack((src, np.zeros((int(DURA*SR) - n_sample,))))
	elif n_sample > n_sample_fit: # if too long
		src = src[(n_sample - n_sample_fit) // 2:(n_sample + n_sample_fit) // 2]  # floor division keeps indices integral

	ret = librosa.logamplitude(librosa.feature.melspectrogram(y=src, 
																sr=SR,
																hop_length=HOP_LEN,
																n_fft=N_FFT,
																n_mels=N_MELS)**2,
								ref_power=1.0)
	ret = ret[np.newaxis, np.newaxis, :]
	return ret
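A usage sketch for compute_melgram (the path is hypothetical): the result is shaped as a one-example, one-channel batch for a channels-first CNN:

melgram = compute_melgram('track.mp3')
print(melgram.shape)  # (1, 1, 96, 1366): (batch, channel, mel bins, frames)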
Example #10
def transform_audio(audio,
                    n_fft=2048,
                    n_mels=40,
                    sr=22050,
                    hop_length=512,
                    fmin=None,
                    fmax=None):
    # Midi values of 24 (C2) and 120 (C10) are chosen, since humans typically
    # can't hear much beyond this range.
    if not fmin:
        fmin = librosa.midi_to_hz(24)
    if not fmax:
        fmax = librosa.midi_to_hz(120)
    # First stage is a mel-frequency spectrogram of bounded range.
    mel = librosa.feature.melspectrogram(audio,
                                         sr=sr,
                                         n_fft=n_fft,
                                         hop_length=hop_length,
                                         n_mels=n_mels,
                                         fmax=fmax,
                                         fmin=fmin)
    # Second stage is log-amplitude; power is relative to peak in the signal.
    log_amplitude = librosa.logamplitude(mel, ref_power=np.max)
    # Third stage transposes the data so that frames become samples.
    # Its shape is:
    # (length of audio / frame duration, number of mel bands)
    transpose = np.transpose(log_amplitude)
    return (transpose,
            {'n_fft': n_fft, 'n_mels': n_mels, 'sr': sr,
            'hop_length': hop_length, 'fmin': fmin, 'fmax': fmax})
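Usage sketch for transform_audio (the clip is hypothetical): frames become rows, so the output feeds frame-wise models directly, and the returned dict records the front-end settings:

y, sr = librosa.load('clip.wav', sr=22050)
feats, params = transform_audio(y, sr=sr)
print(feats.shape)     # (n_frames, n_mels)
print(params['fmin'])  # ~32.7 Hz (MIDI note 24)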
Example #11
def get_beat(y, PARAMETERS):
    '''Estimate beat times and tempo'''
    # Compute a log-power mel spectrogram on the percussive component
    S_p = librosa.feature.melspectrogram(y=y, 
                                         sr=PARAMETERS['load']['sr'], 
                                         n_fft=PARAMETERS['stft']['n_fft'], 
                                         hop_length=PARAMETERS['beat']['hop_length'],
                                         n_mels=PARAMETERS['mel']['n_mels'],
                                         fmax=PARAMETERS['mel']['fmax'])
    
    S_p = librosa.logamplitude(S_p, ref_power=S_p.max())
    
    # Compute the median onset aggregation
    odf = librosa.onset.onset_strength(S=S_p, aggregate=np.median)
    
    # Get beats
    tempo, beats = librosa.beat.beat_track(onset_envelope=odf, 
                                           sr=PARAMETERS['load']['sr'], 
                                           hop_length=PARAMETERS['beat']['hop_length'])
      
    beat_times = librosa.frames_to_time(beats, 
                                        sr=PARAMETERS['load']['sr'], 
                                        hop_length=PARAMETERS['beat']['hop_length'])
    
    return tempo, beat_times, odf
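In later librosa releases the frames-to-time conversion can be folded into the tracker itself via units='time'; a sketch against the same odf and PARAMETERS layout:

tempo, beat_times = librosa.beat.beat_track(
    onset_envelope=odf,
    sr=PARAMETERS['load']['sr'],
    hop_length=PARAMETERS['beat']['hop_length'],
    units='time')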
Example #12
def amplitude_for_file(audio_path):
    y, sr = librosa.load(audio_path)
    # from http://bmcfee.github.io/librosa/librosa.html#librosa.core.logamplitude
    # Get a power spectrogram from a waveform y
    S = np.abs(librosa.stft(y)) ** 2
    log_S = librosa.logamplitude(S)
    return log_S
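For reference, the post-0.6 equivalent folds the squaring into amplitude_to_db (amplitude_to_db(A) == power_to_db(A**2)); a sketch:

import numpy as np
import librosa

def amplitude_for_file_v2(audio_path):
    y, sr = librosa.load(audio_path)
    return librosa.amplitude_to_db(np.abs(librosa.stft(y)))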
Example #13
def decompose(y, n_components=8):
    # How about something more advanced?  Let's decompose a spectrogram with NMF, and then resynthesize an individual component
    D = librosa.stft(y)

    # Separate the magnitude and phase
    S, phase = librosa.magphase(D)

    # Decompose by nmf
    components, activations = librosa.decompose.decompose(S, n_components, sort=True)

    plt.figure(figsize=(12,4))

    plt.subplot(1,2,1)
    librosa.display.specshow(librosa.logamplitude(components**2.0, ref_power=np.max), y_axis='log')
    plt.xlabel('Component')
    plt.ylabel('Frequency')
    plt.title('Components')

    plt.subplot(1,2,2)
    librosa.display.specshow(activations)
    plt.xlabel('Time')
    plt.ylabel('Component')
    plt.title('Activations')

    plt.tight_layout()
    plt.savefig('components_activations.png')

    print('components', components.shape)
    print('activations', activations.shape)
    return components, activations, phase
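The opening comment promises resynthesis of an individual component, but the function stops at plotting. A sketch of that missing step, reusing the returned factors and the saved phase:

import numpy as np
import librosa

def resynthesize_component(components, activations, phase, k=0):
    # Outer product rebuilds the rank-1 magnitude of component k,
    # recombined with the original phase and inverted to a waveform
    S_k = np.outer(components[:, k], activations[k])
    return librosa.istft(S_k * phase)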
Example #14
def analyzeAudios():
    # librosa API reference: http://bmcfee.github.io/librosa/
    audioNumber=4
    filename=sorted(glob.glob(outputDir+'/*.'+audioTargetFormat))[audioNumber]
    print('"'+filename+'"')
    sys.exit(0)  # debugging early exit; nothing below this line currently runs

    y,sr=librosa.load(filename)
    onsets=librosa.onset.onset_detect(y,sr)

    fileoutName=filename.replace('.'+audioTargetFormat,'.png')
    fileoutName='test.png'
    #%matplotlib inline
    seaborn.set(style='ticks')
    S = librosa.feature.melspectrogram(y,sr=sr,n_mels=128)
    log_S = librosa.logamplitude(S, ref_power=np.max)

    fig = plt.figure(figsize=(12,4))
    ax = fig.add_subplot(211)
    ax.contourf(log_S)
    plt.title('mel power spectrogram')

    #ax.annotate('$->$',xy=(2.,-1),xycoords='data',
    #xytext=(-150, -140), textcoords='offset points',
    #bbox=dict(boxstyle="round", fc="0.8"),
    #arrowprops=dict(arrowstyle="->",patchB=el, connectionstyle="angle,angleA=90,angleB=0,rad=10"),)

    ax = fig.add_subplot(212)
    ax.plot(onsets)
    #plt.colorbar(format='%+02.0f dB')
    plt.tight_layout()
    #plt.show()
    plt.savefig(fileoutName,format='png',dpi=900)
    print(fileoutName)
Example #15
    def post_process_features(gram, beats):
        '''
        Apply processing to a feature matrix given the supplied param values

        Parameters
        ----------
        gram : np.ndarray
            Feature matrix, shape (n_features, n_samples)
        beats : np.ndarray
            Indices of beat locations in gram

        Returns
        -------
        gram : np.ndarray
            Feature matrix, shape (n_samples, n_features), post-processed
            according to the values in `params`
        '''
        # Convert to chroma
        if params['feature'] == 'chroma':
            gram = librosa.feature.chroma_cqt(
                C=gram, fmin=librosa.midi_to_hz(create_data.NOTE_START))
        # Beat-synchronize the feature matrix
        if params['beat_sync']:
            gram = librosa.feature.sync(gram, beats, pad=False)
        # Compute log magnitude
        gram = librosa.logamplitude(gram, ref_power=gram.max())
        # Normalize the feature vectors
        gram = librosa.util.normalize(gram, norm=params['norm'])
        # Standardize the feature vectors
        if params['standardize']:
            gram = scipy.stats.mstats.zscore(gram, axis=1)
        # Transpose it to (n_samples, n_features) and return it
        return gram.T
Example #16
def feature_extraction(y=None, fs=None, statistics=True, include_mfcc0=True, include_delta=True, include_acceleration=True, mfcc_params=None, delta_params=None, acceleration_params=None):
    # Extract features, Mel Frequency Cepstral Coefficients
    eps = numpy.spacing(1)

    # Windowing function
    if mfcc_params['window'] == 'hamming_asymmetric':
        window = scipy.signal.hamming(mfcc_params['n_fft'], sym=False)
    elif mfcc_params['window'] == 'hamming_symmetric':
        window = scipy.signal.hamming(mfcc_params['n_fft'], sym=True)
    elif mfcc_params['window'] == 'hann_asymmetric':
        window = scipy.signal.hann(mfcc_params['n_fft'], sym=False)
    elif mfcc_params['window'] == 'hann_symmetric':
        window = scipy.signal.hann(mfcc_params['n_fft'], sym=True)
    else:
        window = None

    # Calculate Static Coefficients
    magnitude_spectrogram = numpy.abs(librosa.stft(y + eps, n_fft=mfcc_params['n_fft'], win_length=mfcc_params['win_length'], hop_length=mfcc_params['hop_length'], window=window))**2
    mel_basis = librosa.filters.mel(sr=fs, n_fft=mfcc_params['n_fft'], n_mels=mfcc_params['n_mels'], fmin=mfcc_params['fmin'], fmax=mfcc_params['fmax'], htk=mfcc_params['htk'])
    mel_spectrum = numpy.dot(mel_basis, magnitude_spectrogram)
    mfcc = librosa.feature.mfcc(S=librosa.logamplitude(mel_spectrum))

    # Collect the feature matrix
    feature_matrix = mfcc
    if include_delta:
        # Delta coefficients
        mfcc_delta = librosa.feature.delta(mfcc, **delta_params)

        # Add Delta Coefficients to feature matrix
        feature_matrix = numpy.vstack((feature_matrix, mfcc_delta))

    if include_acceleration:
        # Acceleration coefficients (aka delta)
        mfcc_delta2 = librosa.feature.delta(mfcc, order=2, **acceleration_params)

        # Add Acceleration Coefficients to feature matrix
        feature_matrix = numpy.vstack((feature_matrix, mfcc_delta2))


    if not include_mfcc0:
        # Omit mfcc0
        feature_matrix = feature_matrix[1:, :]

    feature_matrix = feature_matrix.T

    # Collect into data structure
    if statistics:
        return {
            'feat': feature_matrix,
            'stat': {
                'mean': numpy.mean(feature_matrix, axis=0),
                'std': numpy.std(feature_matrix, axis=0),
                'N': feature_matrix.shape[0],
                'S1': numpy.sum(feature_matrix, axis=0),
                'S2': numpy.sum(feature_matrix ** 2, axis=0),
            }
        }
    else:
        return {
            'feat': feature_matrix}
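A note on the windowing block: the top-level scipy.signal.hamming/hann aliases were deprecated long ago and have been removed in recent SciPy releases; the stable spellings live in scipy.signal.windows:

from scipy.signal import windows

window = windows.hamming(mfcc_params['n_fft'], sym=False)  # likewise windows.hann(...)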
Example #17
def onsets(D):
    S = librosa.logamplitude(D)
    o = np.diff(S, axis=1)
    o = np.maximum(0, o)
    o = np.median(o, axis=0)
    o = o / o.max()
    return o
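This hand-rolled envelope is essentially a median-aggregated spectral flux; librosa's built-in onset strength computes roughly the same quantity (a sketch, assuming D is the linear power spectrogram passed in above; normalization details differ):

import numpy as np
import librosa

o = librosa.onset.onset_strength(S=librosa.power_to_db(D), aggregate=np.median)
o = o / o.max()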
Example #18
def delta_features(lowlevel):
    '''Log-mel power delta features'''

    M0 = librosa.logamplitude(lowlevel['mel_spectrogram'])
    M1 = librosa.feature.delta(M0)
    M2 = librosa.feature.delta(M1)

    return np.vstack([M0, M1, M2])
Example #19
def analyze_frames(y, sr, debug=False):
    A = {}
    
    hop_length = 128

    # First, get the track duration
    A['duration'] = float(len(y)) / sr

    # Then, get the beats
    if debug: print "> beat tracking"
    tempo, beats = librosa.beat.beat_track(y, sr, hop_length=hop_length)

    # Push the last frame as a phantom beat
    A['tempo'] = tempo
    A['beats'] = librosa.frames_to_time(beats, sr, hop_length=hop_length).tolist()

    if debug: print "beats count: ", len(A['beats'])

    if debug: print "> spectrogram"
    S = librosa.feature.melspectrogram(y, sr,   n_fft=2048, 
                                                hop_length=hop_length, 
                                                n_mels=80, 
                                                fmax=8000)
    S = S / S.max()

    # A['spectrogram'] = librosa.logamplitude(librosa.feature.sync(S, beats)**2).T.tolist()

    # Let's make some beat-synchronous mfccs
    if debug: print "> mfcc"
    S = librosa.feature.mfcc(librosa.logamplitude(S), n_mfcc=40)
    A['timbres'] = librosa.feature.sync(S, beats).T.tolist()

    if debug: print "timbres count: ", len(A['timbres'])

    # And some chroma
    if debug: print "> chroma"
    S = N.abs(librosa.stft(y, hop_length=hop_length))

    # Grab the harmonic component
    H = librosa.decompose.hpss(S)[0]
    # H = librosa.hpss.hpss_median(S, win_P=31, win_H=31, p=1.0)[0]
    A['chroma'] = librosa.feature.sync(librosa.feature.chromagram(S=H, sr=sr),
                                        beats,
                                        aggregate=N.median).T.tolist()

    # Relative loudness
    S = S / S.max()
    S = S**2

    if debug: print "> dists"
    dists = structure(N.vstack([N.array(A['timbres']).T, N.array(A['chroma']).T]))
    A['dense_dist'] = dists

    edge_lens = [A["beats"][i] - A["beats"][i - 1]
                 for i in xrange(1, len(A["beats"]))]
    A["avg_beat_duration"] = N.mean(edge_lens)

    return A
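A naming note: librosa.feature.chromagram is an old spelling; later releases call it chroma_stft, so the harmonic-chroma step would read (a sketch):

H = librosa.decompose.hpss(S)[0]  # harmonic component of the magnitude STFT
chroma = librosa.feature.chroma_stft(S=H, sr=sr)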
Example #20
    def process(self, filename):
        y, sr = librosa.load(filename, 16000)

        # Let's make and display a mel-scaled power (energy-squared) spectrogram
        # We use a small hop length of 64 here so that the frames line up with the beat tracker example below.
        S = librosa.feature.melspectrogram(y, sr=sr, n_fft=2048, hop_length=64, n_mels=128)

        # Convert to log scale (dB). We'll use the peak power as reference.
        log_S = librosa.logamplitude(S, ref_power=np.max)

        # Make a new figure
        plt.figure(figsize=(12,4))

        # Display the spectrogram on a mel scale
        # sample rate and hop length parameters are used to render the time axis
        librosa.display.specshow(log_S, sr=sr, hop_length=64, x_axis='time', y_axis='mel')

        # Put a descriptive title on the plot
        plt.title('mel power spectrogram')

        # draw a color bar
        plt.colorbar(format='%+02.0f dB')

        # Make the figure layout compact
        # plt.tight_layout()


        # Next, we'll extract the top 20 Mel-frequency cepstral coefficients (MFCCs)
        mfcc = librosa.feature.mfcc(S=log_S, n_mfcc=20)

        # Let's pad on the first and second deltas while we're at it
        delta_mfcc  = librosa.feature.delta(mfcc)
        delta2_mfcc = librosa.feature.delta(mfcc, order=2)

        # How do they look?  We'll show each in its own subplot
        plt.figure(figsize=(12, 6))

        plt.subplot(3,1,1)
        librosa.display.specshow(mfcc)
        plt.ylabel('MFCC')
        plt.colorbar()

        plt.subplot(3,1,2)
        librosa.display.specshow(delta_mfcc)
        plt.ylabel('MFCC-$\Delta$')
        plt.colorbar()

        plt.subplot(3,1,3)
        librosa.display.specshow(delta2_mfcc, sr=sr, hop_length=64, x_axis='time')
        plt.ylabel('MFCC-$\Delta^2$')
        plt.colorbar()

        #plt.tight_layout()

        # For future use, we'll stack these together into one matrix
        M = np.vstack([mfcc, delta_mfcc, delta2_mfcc])
        plt.show()
        return mfcc
Example #21
def show_feature_superimposed(sound_files, genre, feature, binsize=1024, plot_on="waveform"):

    wavedata     = sound_files[genre]["wavedata"]
    samplerate   = sound_files[genre]["samplerate"]
    timestamps   = sound_files[genre]["%s_timestamp" % (feature)]
    feature_data = sound_files[genre][feature]

    #TODO debug scale and remove if possible

    if feature == "sc":
        scale = 250.0
    elif feature == "zcr":
        scale = 1000.0
    elif feature == "rms":
        scale = 1000.0
    elif feature == "sr":
        scale = 250.0
    elif feature == "sf":
        scale = 250.0


    # plot feature-data
    scaled_fd_y = timestamps * scale


    win = np.hanning(binsize)

    if len(wavedata.shape) > 1:
        wavedata = wavedata[:,0]

    D = lr.core.stft(wavedata, n_fft=binsize, window=win)

    fig, ax = plt.subplots(2, 1, sharex=False, figsize=(PLOT_WIDTH, 7), sharey=True)

    # show spectrogram
    plt.subplot(2, 1, 1)
    lr.display.specshow(lr.logamplitude(np.abs(D)**2, ref_power=np.max), sr=samplerate*2, y_axis='log', x_axis='time')

    if plot_on == "spectrogram":
        scaled_fd_x = feature_data
        _ = plt.plot(scaled_fd_y, scaled_fd_x, color='r', linewidth=1);
        #ax = plt.gca().set_yscale("log")

    # show waveform
    plt.subplot(2, 1, 2);
    lr.display.waveplot(normalize_wav(wavedata), sr=samplerate, alpha=0.75);

    if plot_on == "waveform":
        scaled_fd_x = (feature_data / np.max(feature_data));
        _ = plt.plot(scaled_fd_y, scaled_fd_x, color='r', linewidth=1);

        ax = plt.gca()
        ax.axhline(y=0,c="green",linewidth=3,zorder=0)

    plt.tight_layout();

    plt.show();
    plt.clf();
Example #22
def compute_features(audio_file, intervals, level):
    """Computes the subseg-sync cqt features from the given audio file, if
    they are not previously computed. Saves the results in the feat_dir folder.

    Parameters
    ----------
    audio_file : str
        Path to the audio file.
    intervals : np.array
        Intervals containing the estimated boundaries.
    level : str
        Level in the hierarchy.

    Returns
    -------
    cqgram : np.array
        Subseg-sync constant-Q power spectrogram.
    intframes : np.array
        The frame indices.
    """
    # Check if features have already been computed
    if level == "small_scale":
        features_file = os.path.join(features_dir, os.path.basename(audio_file).split('.')[0] +
                                    "_small_scale.mp3.pk")
    else:
        features_file = os.path.join(features_dir, os.path.basename(audio_file) +
                                    ".pk")
    if os.path.isfile(features_file):
        return read_features(features_file)

    y, sr = librosa.load(audio_file, sr=11025)

    # Default hopsize is 512
    hopsize = 512
    cqgram = librosa.logamplitude(librosa.cqt(y, sr=sr, hop_length=hopsize)**2, ref_power=np.max)

    # Track beats
    y_harmonic, y_percussive = librosa.effects.hpss(y)
    tempo, beats = librosa.beat.beat_track(y=y_percussive, sr=sr,
                                           hop_length=hopsize)

    # Synchronize
    cqgram = librosa.feature.sync(cqgram, beats, aggregate=np.median)

    intframes = None
    if intervals is not None:
        # convert intervals to frames
        intframes = librosa.time_to_frames(intervals, sr=sr, hop_length=hopsize)

        # Match intervals to subseg points
        intframes = librosa.util.match_events(intframes, beats)

    # Save the features
    save_features(cqgram, intframes, beats, features_file)

    return cqgram, intframes
Example #23
def single_file_featurization(wavfile):
    '''
    INPUT:
    wavfile: filename of the audio sample to featurize

    OUTPUT:
    feature vector for the audio sample

    Extracts the audio sample into a feature vector
    of mfcc coefficients and derived statistics
    '''

    # print statements to update the progress of the processing
    try:
        # load the raw audio .wav file as a matrix using librosa
        wav_mat, sr = lr.load(wavfile, sr=sample_rate)

        # create the spectrogram using the predefined variables for mfcc extraction
        S = lr.feature.melspectrogram(wav_mat, sr=sr, n_mels=n_filters, fmax=sr/2, n_fft=window, hop_length=hop)

        # using the pre-defined spectrogram, extract the mfcc coefficients
        mfcc = lr.feature.mfcc(S=lr.logamplitude(S), n_mfcc=25)

        # calculate the first and second derivatives of the mfcc coefficients to detect changes and patterns
        mfcc_delta = lr.feature.delta(mfcc)
        mfcc_delta = mfcc_delta.T
        mfcc_delta2 = lr.feature.delta(mfcc, order=2)
        mfcc_delta2 = mfcc_delta2.T
        mfcc = mfcc.T

        # combine the mfcc coefficients and their derivatives in a column stack for analysis
        total_mfcc = np.column_stack((mfcc, mfcc_delta, mfcc_delta2))

        # use the average of each column to condense into a feature vector
        # this makes each sample uniform regardless of the length of the original audio sample
        # the following features are extracted
        # - avg of mfcc, first derivative, second derivative
        # - var of mfcc, first derivative, second derivative
        # - max of mfcc
        # - min of mfcc
        # - median of mfcc
        # - skew of mfcc
        # - kurtosis of mfcc
        avg_mfcc = np.mean(total_mfcc, axis=0)
        var_mfcc = np.var(total_mfcc, axis=0)
        max_mfcc = np.max(mfcc, axis=0)
        min_mfcc = np.min(mfcc, axis=0)
        med_mfcc = np.median(mfcc, axis=0)
        skew_mfcc = skew(mfcc, axis=0)
        kurt_mfcc = kurtosis(mfcc, axis=0)  # per the docstring above; assumes scipy.stats.kurtosis is imported alongside skew

        # combine into one vector and append to the total feature matrix
        return np.concatenate((avg_mfcc, var_mfcc, max_mfcc, min_mfcc, med_mfcc, skew_mfcc, kurt_mfcc))
    except:
        print "Uhmmm something bad happened"
        return np.zeros(7)
Example #24
def plot_spect(spec):
    plt.figure(figsize=(12, 8))
    nb=len(spec)
    i=0
    for s in spec:
        i+=1
        plt.subplot(nb, 1, i)
        D = librosa.logamplitude(np.abs(s)**2, ref_power=np.max)
        librosa.display.specshow(D,y_axis='log', x_axis='time')
    plt.show()
Example #25
def wiener_enhance(target, accomp, thresh=-6, transit=3, n_fft=2048):
    '''
    Given a noisy signal and a signal which approximates the noise, try to remove the noise.
    
    Input:
        target - Noisy signal
        accomp - Approximate noise
        thresh - Sigmoid threshold, default -6
        transit - Sigmoid transition, default 3
        n_fft - FFT length, default 2048 (hop is always n_fft/4)
    Output:
        filtered - Target, Wiener filtered to try to remove noise
    '''
    target_spec = librosa.stft(target, n_fft=n_fft, hop_length=n_fft // 4)
    accomp_spec = librosa.stft(accomp, n_fft=n_fft, hop_length=n_fft // 4)
    spec_ratio = librosa.logamplitude(target_spec) - librosa.logamplitude(accomp_spec)
    spec_ratio = (spec_ratio - thresh)/transit
    mask = 0.5 + 0.5*(spec_ratio/np.sqrt(1 + spec_ratio**2))
    return librosa.istft(target_spec*mask, hop_length=n_fft // 4)
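The mask here is a smooth sigmoid of the dB ratio between target and accompaniment: well above thresh it approaches 1 (keep the bin), well below it approaches 0 (suppress it). A quick numeric check of its shape:

import numpy as np

x = np.linspace(-5, 5, 5)                     # scaled dB ratios
print(0.5 + 0.5 * (x / np.sqrt(1 + x ** 2)))  # monotone, bounded in (0, 1)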
Example #26
def get_beat_mfccs(filename):
    y, sr = librosa.load(filename)
    
    S = librosa.feature.melspectrogram(y, sr, n_fft=2048, hop_length=64, n_mels=128, fmax=8000)
    
    tempo, beats = librosa.beat.beat_track(y=y, sr=sr, hop_length=64)
    
    M = librosa.feature.mfcc(librosa.logamplitude(S), n_mfcc=32)
    M = librosa.feature.sync(M, beats)
    return M
Example #27
def export_image(args):
	feat, tpl_info, h_key, layer = args
	harmonic_ins = tpl_info.harmonic_ins
	harmonic_chords = tpl_info.harmonic_chords

	sub_folder = '%d_%d/' % (layer ,feat)

	path_deconv_results = 'results/'
	path_img_results = 'images/'
	if not os.path.exists(path_img_results):
		os.makedirs(path_img_results)
	path_img_out = '%s%s/' % (path_img_results, sub_folder)
	path_img_out2= '%slayer-%d/' % (path_img_results, layer)

	if not os.path.exists(path_img_out):
		os.makedirs(path_img_out)
	if not os.path.exists(path_img_out2):
		os.makedirs(path_img_out2)

	img_name = '%d_%d_%s.png' % (layer, feat, h_key)
	if os.path.exists(path_img_out2 + img_name):
		return

	wav_name_suffix = '_deconved_from_depth_%d_feature_%d' % (layer, feat)

	fig, axes = plt.subplots(nrows=len(harmonic_ins), 
							ncols=len(harmonic_chords),
							sharex='col', 
							sharey='row')
	for inst_idx, h_inst in enumerate(harmonic_ins):
		for chord_idx, h_chord in enumerate(harmonic_chords):
			ax = axes[inst_idx][chord_idx]
			segment_name = '%s_%s_%s' % (h_key, h_inst, h_chord)
			path_wav = '%s%s/' % (path_deconv_results, segment_name)
			filename_wav = segment_name + wav_name_suffix
			src_here, sr = librosa.load(path_wav+filename_wav+'.wav', 
										sr=SAMPLE_RATE, 
										mono=True)
			SRC = librosa.stft(src_here, 
								n_fft=N_FFT, 
								hop_length=N_FFT/2)
			ax.imshow(librosa.logamplitude(np.flipud(np.abs(SRC))), aspect=200)
			ax.set_xticks([], [])
			ax.set_yticks([], [])
			ax.axis('auto')
			if chord_idx == 0:
				ax.set_ylabel(harmonic_ins[inst_idx][:6])
			if inst_idx == len(harmonic_ins)-1:
				ax.set_xlabel(harmonic_chords[chord_idx])

	fig.savefig(os.path.join(path_img_out, img_name), dpi=200, bbox_inches='tight')
	fig.savefig(os.path.join(path_img_out2, img_name), dpi=200, bbox_inches='tight')
	plt.close(fig)
	print '%s: done' % img_name
	return
Example #28
    def chroma(y):
        # Build the wrapper
        CQT      = np.abs(librosa.cqt(y,    sr=SR, 
                                            resolution=NOTE_RES,
                                            hop_length=HOP_LENGTH,
                                            fmin=NOTE_MIN,
                                            n_bins=NOTE_NUM))

        C_to_Chr = librosa.filters.cq_to_chroma(CQT.shape[0], n_chroma=N_CHROMA) 

        return librosa.logamplitude(librosa.util.normalize(C_to_Chr.dot(CQT)))
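Note: the cqt "resolution" keyword used here was later renamed "filter_scale"; under a current librosa the call would read (a sketch):

CQT = np.abs(librosa.cqt(y, sr=SR, filter_scale=NOTE_RES,
                         hop_length=HOP_LENGTH, fmin=NOTE_MIN, n_bins=NOTE_NUM))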
Example #29
def analyzeAudios2():
    filenames=sorted(glob.glob(outputDir+'/*.'+audioTargetFormat))
    for filename in filenames:
        for isuffix in ['harmonic','percussive','mfcc']:
            if re.search('\.'+isuffix+'\.'+audioTargetFormat+'$',filename):
                continue

        print(filename)
        y, sr = librosa.load(filename)

        #lenY=len(y)
        #idx1=min(int(20*sr),lenY)
        #idx2=min(int(24*sr),lenY)
        #y = y[idx1:idx2]

        #y_harmonic, y_percussive = librosa.effects.hpss(y)
        S = librosa.feature.melspectrogram(y, sr=sr, n_mels=128)
        log_S = librosa.logamplitude(S, ref_power=np.max)

        seaborn.set(style='ticks')

        fileoutName=filename.replace('.'+audioTargetFormat,'.melpower.png')
        plt.figure(figsize=(12,4))
        librosa.display.specshow(log_S, sr=sr, x_axis='time', y_axis='mel')
        plt.title('mel power spectrogram')
        plt.colorbar(format='%+02.0f dB')
        plt.tight_layout()
        plt.savefig(fileoutName,format='png',dpi=300)


        # Next, we'll extract the top 13 Mel-frequency cepstral coefficients (MFCCs)
        mfcc        = librosa.feature.mfcc(S=log_S, n_mfcc=13)
        delta_mfcc  = librosa.feature.delta(mfcc)
        delta2_mfcc = librosa.feature.delta(mfcc, order=2)

        fileoutName=filename.replace('.'+audioTargetFormat,'.melcoeff.png')
        plt.figure(figsize=(12, 6))
        plt.subplot(3,1,1)
        librosa.display.specshow(mfcc)
        plt.ylabel('MFCC')
        plt.colorbar()
        plt.subplot(3,1,2)
        librosa.display.specshow(delta_mfcc)
        plt.ylabel('MFCC-$\Delta$')
        plt.colorbar()
        plt.subplot(3,1,3)
        librosa.display.specshow(delta2_mfcc, sr=sr, x_axis='time')
        plt.ylabel('MFCC-$\Delta^2$')
        plt.colorbar()
        plt.tight_layout()
        plt.savefig(fileoutName,format='png',dpi=300)

        # For future use, we'll stack these together into one matrix
        M = np.vstack([mfcc, delta_mfcc, delta2_mfcc])
Example #30
    def get_mfcc(y):
        # Generate a mel-spectrogram
        S = librosa.feature.melspectrogram(y, sr,   n_fft=N_FFT, 
                                                    hop_length=HOP_LENGTH, 
                                                    n_mels=N_MELS, 
                                                    fmax=FMAX).astype(np.float32)
    
        # Put on a log scale
        S = librosa.logamplitude(S, ref_power=S.max())

        return librosa.feature.mfcc(S=S, n_mfcc=N_MFCC)
Example #31
def example_librosa():
    import matplotlib.pyplot as plt
    import specplotting
    import librosa

    audio_path = "./scratch/lab1-resources/gas_station.wav"
    sample_rate, s_in = read_wav_audio(audio_path)
    lr_y, lr_sr = librosa.load(audio_path, 16000)
    print("Librosa sr: ", lr_sr)
    samples = len(s_in)
    print("The file is %d samples long" % samples)
    print('The sample rate is %d Hz' % sample_rate)

    ms_per_sec = 1000.0
    milliseconds = samples / sample_rate * ms_per_sec
    print('The file is %d milliseconds long' % milliseconds)

    inp = np.reshape(s_in, [1, s_in.shape[0], 1])
    inplens = np.array([s_in.shape[0]])

    g = tf.Graph()
    with g.as_default():
        raw_waveforms = tf.placeholder(tf.float64, [None, None, 1],
                                       name="raw_waveforms")
        raw_waveform_lengths = tf.placeholder(tf.int32, [None],
                                              name="raw_waveform_lengths")
        N_fft = 512
        audio = AudioPreprocessing(raw_waveforms,
                                   raw_waveform_lengths,
                                   16000,
                                   25.0,
                                   10.0,
                                   N_fft=N_fft,
                                   channels=1)

        print(audio.frame_length_py)
        print(audio.N_fft_py)
        print(audio.frame_shift_py)

        S = librosa.core.stft(lr_y,
                              n_fft=N_fft,
                              hop_length=audio.frame_shift_py,
                              win_length=audio.frame_length_py,
                              window="hamming",
                              center=True,
                              pad_mode="constant")
        print(S.shape)
        # S = librosa.feature.melspectrogram(S=S, sr=lr_sr, n_mels=23)
        ref_fbank = librosa.logamplitude(S, amin=10**(-50)).T

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            feed_dict = {
                g.get_tensor_by_name("raw_waveforms:0"): inp,
                g.get_tensor_by_name("raw_waveform_lengths:0"): inplens,
            }
            out = sess.run(
                {
                    "filterbank": audio.s_pe.
                    log_magnitude_spectrogram,  # audio.s_pe.log_mel_fbank_features,
                },
                feed_dict=feed_dict)
            print(out["filterbank"][0, :, :, 0].shape)
            specplotting.plot_spec(out["filterbank"][0, :, :, 0],
                                   sample_rate=sample_rate,
                                   title="Mel Filterbank Energies (dB)")
            plt.ylabel("Feature")
            plt.show()
            specplotting.plot_spec(
                ref_fbank,
                sample_rate=sample_rate,
                title="Librosa Ref Mel Filterbank Energies (dB)")
            plt.ylabel("Feature")
            plt.show()
Example #32
def build_datasets(train_percentage=0.8, preproc=False):
    if preproc:
        path = "Preproc3/"
    else:
        path = "../Music/"

    # TODO : replace by csv.get_tags("annotations_subset.csv")
    # class_names = get_class_names(path=path)
    # print("class_names = ", class_names)
    class_names = csv.get_tags()
    print("class_names = ", class_names)

    # TODO : rewrite get_total_files
    # total_files, total_train, total_test = get_total_files(path=path, train_percentage=train_percentage)
    # print("total files = ", total_files)
    #
    # nb_classes = len(class_names)

    # pre-allocate memory for speed (old method used np.concatenate, slow)
    mel_dims = get_sample_dimensions(
        path=path)  # Find out the 'shape' of each data file

    filelist = csv.get_total_files()  #TODO : return file list
    filelist_train = filelist[1:1000]
    filelist_test = filelist[1000:1100]
    filelist_train_test = filelist[1:1100]
    total_train = len(filelist_train)
    total_test = len(filelist_test)
    nb_classes = len(csv.get_tags())

    X_train = np.zeros((total_train, mel_dims[1], mel_dims[2], mel_dims[3]))
    Y_train = np.zeros((total_train, nb_classes))
    X_test = np.zeros((total_test, mel_dims[1], mel_dims[2], mel_dims[3]))
    Y_test = np.zeros((total_test, nb_classes))
    paths_train = []
    paths_test = []

    train_count = 0
    test_count = 0

    for idx, file in enumerate(filelist_train_test):

        this_Y = np.array(csv.get_tag_np_vector(
            idx))  #TODO: return np.array (dim = tag number)
        audio_path = path + csv.get_file_path(
            idx)  #TODO: return np.array (dim = tag number)

        n_files = len(filelist_train_test)
        n_load = n_files
        n_train = int(train_percentage * n_load)
        printevery = 100

        if (0 == idx % printevery):
            print('\r Loading file: {:14s} ({:2d} of {:2d} classes)'.format(
                file, idx + 1, nb_classes),
                  ", file ",
                  idx + 1,
                  " of ",
                  n_load,
                  ": ",
                  audio_path,
                  sep="")
        # start = timer()
        if (preproc):
            melgram = np.load(audio_path + ".npy")
            sr = 44100
        else:
            aud, sr = librosa.load(audio_path, mono=mono, sr=None)
            melgram = librosa.logamplitude(
                librosa.feature.melspectrogram(aud, sr=sr, n_mels=96),
                ref_power=1.0)[np.newaxis, np.newaxis, :, :]
        melgram = melgram[:, :, :, 0:mel_dims[
            3]]  # just in case files are different sizes: clip to first file size

        # end = timer()
        # print("time = ",end - start)
        if (idx < total_train):
            # concatenate is SLOW for big datasets; use pre-allocated instead
            # X_train = np.concatenate((X_train, melgram), axis=0)
            # Y_train = np.concatenate((Y_train, this_Y), axis=0)
            X_train[train_count, :, :] = melgram
            Y_train[train_count, :] = this_Y
            paths_train.append(
                audio_path)  # list-appending is still fast. (??)
            train_count += 1
        else:
            X_test[test_count, :, :] = melgram
            Y_test[test_count, :] = this_Y
            # X_test = np.concatenate((X_test, melgram), axis=0)
            # Y_test = np.concatenate((Y_test, this_Y), axis=0)
            paths_test.append(audio_path)
            test_count += 1

    print("Shuffling order of data...")
    X_train, Y_train, paths_train = shuffle_XY_paths(X_train, Y_train,
                                                     paths_train)
    X_test, Y_test, paths_test = shuffle_XY_paths(X_test, Y_test, paths_test)

    return X_train, Y_train, paths_train, X_test, Y_test, paths_test, class_names, sr
Example #33
    def mfcc(self, audio_raw, plot=False):
        """Static MFCC

        Parameters
        ----------
        audio_raw : numpy.ndarray
            Audio data

        Returns
        -------
        list of numpy.ndarrays
            List of feature matrices, feature matrix per audio channel

        """

        window = self._window_function(
            N=self.parameters['general'].get('win_length_samples'),
            window_type=self.parameters['mfcc'].get('window'))

        mel_basis = librosa.filters.mel(
            sr=self.parameters['general'].get('fs'),
            n_fft=self.parameters['mfcc'].get('n_fft'),
            n_mels=self.parameters['mfcc'].get('n_mels'),
            fmin=self.parameters['mfcc'].get('fmin'),
            fmax=self.parameters['mfcc'].get('fmax'),
            htk=self.parameters['mfcc'].get('htk'))

        if self.parameters['mfcc'].get('normalize_mel_bands'):
            mel_basis /= numpy.max(mel_basis, axis=-1)[:, None]

        # feature_matrix = []
        # for channel in range(0, audio_raw.shape[0]):
        channel = 0
        # Calculate Static Coefficients
        spectrogram_ = self._spectrogram(
            y=audio_raw[channel, :],
            n_fft=self.parameters['mfcc'].get('n_fft'),
            win_length_samples=self.parameters['general'].get(
                'win_length_samples'),
            hop_length_samples=self.parameters['general'].get(
                'hop_length_samples'),
            spectrogram_type=self.parameters['mfcc'].get('spectrogram_type')
            if 'spectrogram_type' in self.parameters['mfcc'] else 'power',
            center=True,
            window=window)

        mel_spectrum = numpy.dot(mel_basis, spectrogram_)  # shape=(d, t)

        mfcc = librosa.feature.mfcc(
            S=librosa.logamplitude(mel_spectrum),
            n_mfcc=self.parameters['mfcc'].get('n_mfcc'))
        mfcc = mfcc.T
        # feature_matrix.append(mfcc.T)

        if plot:
            import matplotlib.pyplot as plt
            plt.subplot(1, 2, 1)
            plt.imshow(
                numpy.reshape(librosa.logamplitude(mel_spectrum),
                              (self.parameters['mfcc'].get('n_mels'), -1)))
            plt.subplot(1, 2, 2)
            plt.imshow(
                numpy.reshape(mfcc,
                              (self.parameters['mfcc'].get('n_mfcc'), -1)))
            plt.show()

        return mfcc
Example #34
    abs2_stft = (stft.real * stft.real) + (stft.imag * stft.imag)

    # Gather frequency bins according to the Mel scale.
    melspec = librosa.feature.melspectrogram(
        y=None,
        S=abs2_stft,
        sr=logmelspec_settings["sr"],
        n_fft=logmelspec_settings["n_fft"],
        n_mels=logmelspec_settings["n_mels"],
        htk=True,
        fmin=logmelspec_settings["fmin"],
        fmax=logmelspec_settings["fmax"])

    # Apply pointwise base-10 logarithm.
    # The multiplication by 0.5 is to compensate for magnitude squaring.
    logmelspec = 0.5 * librosa.logamplitude(melspec, ref=1.0)

    # Convert to single floating-point precision.
    logmelspec = logmelspec.astype('float32')

    # Write to HDF5 dataset.
    # hop_start is an integer because chunk_start is both a multiple
    # of sample_rate and lms_hop_length = chunk_duration.
    hop_start = int((chunk_start * lms_sr) / (sample_rate * lms_hop_length))
    n_hops_in_chunk = logmelspec.shape[1]
    hop_stop = min(hop_start + n_hops_in_chunk, n_hops)
    lms_dataset[:, hop_start:hop_stop] = logmelspec

# Close file.
out_file.close()
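Halving the power-scale dB compensates for the squaring in abs2_stft (10*log10 of power equals 20*log10 of magnitude). Under the post-0.6 API the same line reads:

# ref defaults to 1.0, matching logamplitude(melspec, ref=1.0)
logmelspec = 0.5 * librosa.power_to_db(melspec)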
Example #35
    def __init__(
        self,
        path,
        suffix='',  # required data file parameters
        subjects='all',  # optional selector (list) or 'all'
        start_sample=0,
        stop_sample=None,  # optional for selection of sub-sequences
        frame_size=-1,
        hop_size=-1,  # values > 0 will lead to windowing
        label_mode='tempo',
        name='',  # optional name
        n_fft=0,
        n_freq_bins=None,
        save_matrix_path=None,
        channels=None,
        resample=None,
        stimulus_id_filter=None,
        keep_metadata=False,
        spectrum_log_amplitude=False,
        spectrum_normalization_mode=None,
    ):
        '''
        Constructor
        '''

        self.name = name

        self.spectrum_normalization_mode = spectrum_normalization_mode
        self.spectrum_log_amplitude = spectrum_log_amplitude

        self.datafiles = []
        subject_paths = glob.glob(os.path.join(path, 'Sub*'))
        for path in subject_paths:
            dataset_filename = os.path.join(path, 'dataset' + suffix + '.pklz')
            if os.path.isfile(dataset_filename):
                log.debug('adding {}'.format(dataset_filename))
                self.datafiles.append(dataset_filename)
            else:
                log.warn('file does not exist {}'.format(dataset_filename))
        self.datafiles.sort()

        if subjects == 'all':
            subjects = np.arange(0, len(self.datafiles))
        assert subjects is not None and len(subjects) > 0

        self.label_mode = label_mode
        self.label_converter = LabelConverter()

        if stimulus_id_filter is None:
            stimulus_id_filter = []
        self.stimulus_id_filter = stimulus_id_filter

        self.subject_partitions = []
        # used to keep track of original subjects
        self.sequence_partitions = []
        # used to keep track of original sequences
        self.trial_partitions = []
        # keeps track of original trials

        # metadata: [subject, trial_no, stimulus, channel, start, ]
        self.metadata = []

        sequences = []
        labels = []
        n_sequences = 0
        last_raw_label = -1
        for i in xrange(len(self.datafiles)):
            if i in subjects:
                with log_timing(
                        log, 'loading data from {}'.format(self.datafiles[i])):
                    self.subject_partitions.append(n_sequences)
                    # save start of next subject

                    subject_sequences, subject_labels, channel_meta = load(
                        self.datafiles[i])

                    subject_trial_no = -1

                    for j in xrange(len(subject_sequences)):
                        l = subject_labels[j]
                        # get raw label

                        if l in stimulus_id_filter:
                            #                             log.debug('skipping stimulus {}'.format(l));
                            continue

                        c = channel_meta[j][0]

                        if channels is not None and not c in channels:  # apply optional channel filter
                            log.debug('skipping channel {}'.format(c))
                            continue

                        self.sequence_partitions.append(n_sequences)
                        # save start of next sequence

                        if l != last_raw_label:  # if raw label changed...
                            self.trial_partitions.append(n_sequences)
                            # ...save start of next trial
                            subject_trial_no += 1
                            # increment subject_trial_no counter
                            last_raw_label = l

                        l = self.label_converter.get_label(
                            l[0], self.label_mode)
                        # convert to label_mode view

                        s = subject_sequences[j]
                        s = s[start_sample:stop_sample]
                        # get sub-sequence in original space

                        # down-sample if requested
                        if resample is not None and resample[0] != resample[1]:
                            s = librosa.resample(s, resample[0], resample[1])

                        if n_fft is not None and n_fft > 0:  # Optionally:
                            #     transform to spectrogram
                            hop_length = n_fft / 4
                            '''
                            from http://theremin.ucsd.edu/~bmcfee/librosadoc/librosa.html
                            >>> # Get a power spectrogram from a waveform y
                            >>> S       = np.abs(librosa.stft(y)) ** 2
                            >>> log_S   = librosa.logamplitude(S)
                            '''
                            s = np.abs(
                                librosa.core.stft(s,
                                                  n_fft=n_fft,
                                                  hop_length=hop_length))**2

                            if n_freq_bins is not None:  # Optionally:
                                s = s[0:n_freq_bins, :]
                                #    cut off high bands

                            if self.spectrum_log_amplitude:
                                s = librosa.logamplitude(s)
                            '''
                            NOTE on normalization:
                            It depends on the structure of a neural network and (even more) 
                            on the properties of data. There is no best normalization algorithm 
                            because if there were one, it would be used everywhere by default...
                        
                            In theory, there is no requirement for the data to be normalized at all. 
                            This is a purely practical thing because in practice convergence could 
                            take forever if your input is spread out too much. The simplest would be 
                            to just normalize it by scaling your data to (-1,1) (or (0,1) depending 
                            on activation function), and in most cases it does work. If your 
                            algorithm converges well, then this is your answer. If not, there are 
                            too many possible problems and methods to outline here without knowing 
                            the actual data.
                            '''

                            ## normalize to mean 0, std 1
                            if self.spectrum_normalization_mode == 'mean0_std1':
                                #                                 s = preprocessing.scale(s, axis=0);
                                mean = np.mean(s)
                                std = np.std(s)
                                s = (s - mean) / std

                            ## normalize by linear transform to [0,1]
                            elif self.spectrum_normalization_mode == 'linear_0_1':
                                s = s / np.max(s)

                            ## normalize by linear transform to [-1,1]
                            elif self.spectrum_normalization_mode == 'linear_-1_1':
                                s = -1 + 2 * (s - np.min(s)) / (np.max(s) -
                                                                np.min(s))

                            elif self.spectrum_normalization_mode is not None:
                                raise ValueError(
                                    'unsupported spectrum normalization mode {}'
                                    .format(self.spectrum_normalization_mode))

                            #print s.mean(axis=0)
                            #print s.std(axis=0)

                            # transpose to fit pylearn2 layout
                            s = np.transpose(s)
                        else:
                            # normalize to max amplitude 1
                            s = librosa.util.normalize(s)

                        s = np.asfarray(s, dtype='float32')

                        if frame_size > 0 and hop_size > 0:
                            s, l = self._split_sequence(
                                s, l, frame_size, hop_size)

#                         print s.shape
                        n_sequences += len(s)

                        sequences.append(s)
                        labels.extend(l)

                        if keep_metadata:
                            self.metadata.append({
                                'subject': i,
                                'trial_no': subject_trial_no,
                                'stimulus': last_raw_label[0],
                                'channel': c,
                                'start': self.sequence_partitions[-1],
                                'stop': n_sequences
                            })

        # turn into numpy arrays
        sequences = np.vstack(sequences)
        #         print sequences.shape;

        labels = np.hstack(labels)

        one_hot_y = one_hot(labels)

        self.labels = labels  # save for later

        if n_fft > 0:
            sequences = np.array([sequences])

            # re-arrange dimensions
            sequences = sequences.swapaxes(0, 1).swapaxes(1, 2).swapaxes(2, 3)

            log.debug('final dataset shape: {} (b,0,1,c)'.format(
                sequences.shape))

            super(EEGDataset, self).__init__(topo_view=sequences,
                                             y=one_hot_y,
                                             axes=['b', 0, 1, 'c'])
        else:
            super(EEGDataset, self).__init__(X=sequences,
                                             y=one_hot_y,
                                             axes=['b', 0, 1, 'c'])

        log.debug(
            'generated dataset "{}" with shape X={} y={} labels={} '.format(
                self.name, self.X.shape, self.y.shape, self.labels.shape))

        if save_matrix_path is not None:
            matrix = DenseDesignMatrix(X=sequences, y=one_hot_y)
            with log_timing(
                    log,
                    'saving DenseDesignMatrix to {}'.format(save_matrix_path)):
                serial.save(save_matrix_path, matrix)
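A minimal, self-contained sketch of the three spectrum normalization modes used in the snippet above, applied to a dummy power spectrogram. The helper name and test signal are illustrative and not part of the original project:

import numpy as np
import librosa

# dummy signal -> power spectrogram, as in the snippet above
y = np.random.randn(16000).astype(np.float32)
s = np.abs(librosa.stft(y, n_fft=512, hop_length=128)) ** 2

def normalize_spectrum(s, mode):
    # mirrors the snippet's 'spectrum_normalization_mode' branches
    if mode == 'mean0_std1':
        return (s - np.mean(s)) / np.std(s)
    elif mode == 'linear_0_1':
        return s / np.max(s)
    elif mode == 'linear_-1_1':
        return -1 + 2 * (s - np.min(s)) / (np.max(s) - np.min(s))
    elif mode is None:
        return s
    raise ValueError('unsupported spectrum normalization mode {}'.format(mode))

for mode in ('mean0_std1', 'linear_0_1', 'linear_-1_1'):
    out = normalize_spectrum(s, mode)
    print(mode, out.min(), out.max(), out.mean())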
Esempio n. 36
0
def prepare_set(dataset_name, set_name, normalize=True, with_factors=True, scaler=None):
    if not os.path.exists(common.PATCHES_DIR):
        os.makedirs(common.PATCHES_DIR)
    f = h5py.File(common.PATCHES_DIR+'/patches_%s_%s_%sx%s_tmp.hdf5' % (set_name,dataset_name,N_SAMPLES,SECONDS),'w')
    spec_folder=common.SPECTRO_PATH+SPECTRO_FOLDER+"/"
    items = open(common.DATASETS_DIR+'/items_index_%s_%s.tsv' % (set_name, dataset_name)).read().splitlines()
    n_items = len(items) * N_SAMPLES
    print n_items
    x_dset = f.create_dataset("features", (n_items,1,N_FRAMES,N_BINS), dtype='f')
    i_dset = f.create_dataset("index", (n_items,), maxshape=(n_items,), dtype='S18')
    if with_factors:
        factors = np.load(common.DATASETS_DIR+'/y_%s_%s_%s.npy' % (set_name, Y_PATH,dataset_name))
        y_dset = f.create_dataset("targets", (n_items,factors.shape[1]), dtype='f')
    k=0
    itemset = []
    itemset_index = []
    for t,track_id in enumerate(items):
        if MSD:
            msd_folder = track_id[2]+"/"+track_id[3]+"/"+track_id[4]+"/"
        else:
            msd_folder = ""
        file = spec_folder+msd_folder+track_id+".pk"
        try:
            spec = pickle.load(open(file))
            spec = librosa.logamplitude(np.abs(spec) ** 2,ref_power=np.max).T
            for i in range(0,N_SAMPLES):
                try:
                    sample = sample_patch(spec,N_FRAMES)
                    x_dset[k,:,:,:] = sample.reshape(-1,sample.shape[0],sample.shape[1])
                    if with_factors:
                        y_dset[k,:] = factors[t]
                    i_dset[k] = track_id
                    itemset.append(track_id)
                    itemset_index.append(t)
                    k+=1
                except Exception as e:
                    print 'Error',e
                    print file
        except Exception as e:
            print 'Error1',e
        if t%1000==0:
            print t

    print x_dset.shape

    # Clean empty spectrograms
    print "Cleaning empty spectrograms"
    f2 = h5py.File(common.PATCHES_DIR+'/patches_%s_%s_%sx%s.hdf5' % (set_name,dataset_name,N_SAMPLES,SECONDS),'w')
    index = f['index'][:]
    index_clean = np.where(index != "")[0]
    n_items = len(index_clean)
    x_dset2 = f2.create_dataset("features", (n_items,1,N_FRAMES,N_BINS), dtype='f')
    i_dset2 = f2.create_dataset("index", (n_items,), maxshape=(n_items,), dtype='S18')
    for i in range(0,len(index_clean)):
        x_dset2[i] = x_dset[index_clean[i]]
        i_dset2[i] = i_dset[index_clean[i]]

    f.close()
    os.remove(common.PATCHES_DIR+'/patches_%s_%s_%sx%s_tmp.hdf5' % (set_name,dataset_name,N_SAMPLES,SECONDS))

    # Normalize
    if normalize:
        print "Normalizing"
        block_step = 10000
        for i in range(0,len(itemset),block_step):
            x_block = x_dset2[i:min(len(itemset),i+block_step)]
            x_norm, scaler = scale(x_block,scaler)
            x_dset2[i:min(len(itemset),i+block_step)] = x_norm
        scaler_file=common.PATCHES_DIR+'/scaler_%s_%sx%s.pk' % (DATASET_NAME,N_SAMPLES,SECONDS)
        pickle.dump(scaler,open(scaler_file,'wb'))
    return scaler
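`sample_patch` is not shown in this example. A plausible implementation, assuming it draws a random block of `N_FRAMES` consecutive frames from the (time x bins) spectrogram (the uniform random offset is an assumption):

import numpy as np

def sample_patch(spec, n_frames):
    # return a random (n_frames x n_bins) patch from a (time x bins) spectrogram
    if spec.shape[0] < n_frames:
        raise ValueError('spectrogram too short to sample a patch')
    start = np.random.randint(0, spec.shape[0] - n_frames + 1)
    return spec[start:start + n_frames]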
Esempio n. 37
0
def mfcc(data, sr=22050, n_mfcc=20, **kwargs):
    S = logamplitude(melspectrogram(y=data, sr=sr, **kwargs))
    return np.dot(dct(n_mfcc, S.shape[0]), S)
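This mirrors librosa's own MFCC pipeline: a DCT applied to the log-mel spectrogram. Note that `librosa.logamplitude` was deprecated in librosa 0.5 and removed in 0.6 in favor of `librosa.power_to_db` / `librosa.amplitude_to_db`; on a current librosa, an equivalent of the helper above would be roughly (a sketch, not the original project's code):

import librosa

def mfcc_modern(data, sr=22050, n_mfcc=20, **kwargs):
    # log-mel spectrogram followed by a DCT, which is what the snippet's
    # np.dot(dct(...), S) implements by hand
    S = librosa.power_to_db(librosa.feature.melspectrogram(y=data, sr=sr, **kwargs))
    return librosa.feature.mfcc(S=S, sr=sr, n_mfcc=n_mfcc)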
Esempio n. 38
0
    feats, beat_times = extractFeature(file_path,
                                       file_ext,
                                       feature,
                                       scale=1,
                                       round_to=0,
                                       normalize=0,
                                       beat_sync=beat_sync,
                                       transpose=False,
                                       save=True)
else:
    feats = dd.io.load(save_path)
    beat_times = feats['beat_times']
    feats = feats[feature]

# Convert to dB
feats_log = librosa.logamplitude(feats, ref_power=feats.max())
# L2 normalize the columns, force features to lie on a sphere!
feats_log_normed = librosa.util.normalize(feats_log, norm=2., axis=0)

savemat(save_path[:-3] + '.mat', dict(feats_log=feats_log))
fig, axes = plt.subplots(3, 1, figsize=(18, 6))
axes[0].set_title(feature)
axes[1].set_title('dB Feature')
axes[2].set_title('Normed(dB Feature)')
axes[0].imshow(feats,
               aspect='auto',
               origin='lower',
               interpolation='nearest',
               cmap=plt.cm.plasma)
axes[1].imshow(feats_log,
               aspect='auto',
               origin='lower',
               interpolation='nearest',
               cmap=plt.cm.plasma)
Esempio n. 39
0
def getSampleSSMs():
    Kappa = 0.1
    hopSize = 512
    TempoBias1 = 180
    TempoBias2 = 180
    DPixels = 400
    BeatsPerBlock = 8
    p = np.arange(DPixels)
    [I, J] = np.meshgrid(p, p)

    FeatureParams = {
        'MFCCBeatsPerBlock': BeatsPerBlock,
        'MFCCSamplesPerBlock': 200,
        'DPixels': DPixels,
        'ChromaBeatsPerBlock': 20,
        'ChromasPerBlock': 40
    }

    CSMTypes = {
        'MFCCs': 'Euclidean',
        'SSMs': 'Euclidean',
        'CurvsSS': 'Euclidean',
        'TorsSS': 'Euclidean',
        'D2s': 'EMD1D',
        'Chromas': 'CosineOTI'
    }

    fin = open('covers32k/list1.list', 'r')
    files1 = [f.strip() for f in fin.readlines()]
    fin.close()
    fin = open('covers32k/list2.list', 'r')
    files2 = [f.strip() for f in fin.readlines()]
    fin.close()

    cmap = 'Spectral'

    #67 is a good male/female example
    for index in [11]:
        fileprefix = "Covers80%i" % index
        filename1 = "covers32k/" + files1[index] + ".mp3"
        filename2 = "covers32k/" + files2[index] + ".mp3"
        artist1 = getCovers80ArtistName(files1[index])
        artist2 = getCovers80ArtistName(files2[index])
        songName = getCovers80SongName(files1[index])

        print("Getting features for %s..." % filename1)
        (XAudio1, Fs1) = getAudio(filename1)
        (tempo, beats1) = getBeats(XAudio1, Fs1, TempoBias1, hopSize)
        (Features1, O1) = getBlockWindowFeatures(
            (XAudio1, Fs1, tempo, beats1, hopSize, FeatureParams))
        bRatio1 = float(Fs1) / hopSize

        print("Getting features for %s..." % filename2)
        (XAudio2, Fs2) = getAudio(filename2)
        (tempo, beats2) = getBeats(XAudio2, Fs2, TempoBias2, hopSize)
        (Features2, O2) = getBlockWindowFeatures(
            (XAudio2, Fs2, tempo, beats2, hopSize, FeatureParams))
        bRatio2 = float(Fs2) / hopSize

        #Make SSM CSM
        plt.figure()
        CSM = getCSM(Features1['SSMs'], Features2['SSMs'])
        idx = plotCSM(CSM, artist1, artist2, songName)
        plt.savefig("DissertationFigures/CSM%i_SSM.svg" % index,
                    bbox_inches='tight')

        D1 = np.zeros((DPixels, DPixels))
        D1[I < J] = Features1['SSMs'][idx[0]]
        D1 = D1 + D1.T
        t1l = beats1[idx[0]] / bRatio1
        t1r = beats1[idx[0] + BeatsPerBlock] / bRatio1
        s1 = beats1[idx[0]] * hopSize
        s2 = beats1[idx[0] + BeatsPerBlock] * hopSize
        x1 = XAudio1[s1:s2]
        scipy.io.wavfile.write("DissertationFigures/%i_1.wav" % index, Fs1, x1)

        D2 = np.zeros((DPixels, DPixels))
        D2[I < J] = Features2['SSMs'][idx[1]]
        D2 = D2 + D2.T
        t2l = beats2[idx[1]] / bRatio2
        t2r = beats2[idx[1] + BeatsPerBlock] / bRatio2
        s1 = beats2[idx[1]] * hopSize
        s2 = beats2[idx[1] + BeatsPerBlock] * hopSize
        x2 = XAudio2[s1:s2]
        scipy.io.wavfile.write("DissertationFigures/%i_2.wav" % index, Fs2, x2)

        #Plot spectrograms
        plt.clf()
        plt.figure(figsize=(12, 5))
        plt.subplot(211)
        S1 = librosa.logamplitude(np.abs(librosa.stft(x1)))
        #librosa.display.specshow(S1, x_axis='time', y_axis='log')
        plt.subplot(212)
        S2 = librosa.logamplitude(np.abs(librosa.stft(x2)))
        #librosa.display.specshow(S2, x_axis='time', y_axis='log')
        plt.savefig("DissertationFigures/Spectrograms%i.svg" % index,
                    bbox_inches='tight')

        #Plot SSMs
        plt.clf()
        plt.subplot(121)
        plt.title(artist1)
        plt.imshow(D1,
                   interpolation='nearest',
                   cmap=cmap,
                   extent=(t1l, t1r, t1r, t1l))
        plt.xlabel("Time (sec)")
        plt.ylabel("Time (sec)")
        plt.subplot(122)
        plt.title(artist2)
        plt.imshow(D2,
                   interpolation='nearest',
                   cmap=cmap,
                   extent=(t2l, t2r, t2r, t2l))
        plt.xlabel("Time (sec)")
        plt.ylabel("Time (sec)")
        plt.savefig("DissertationFigures/SSMs%i.svg" % index,
                    bbox_inches='tight')
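The `D[I < J] = ...` trick above unpacks a condensed triangular SSM back into a square matrix. A minimal sketch of the forward direction, building a Euclidean self-similarity matrix from a feature sequence and condensing it (the names here are illustrative, not the project's own API):

import numpy as np
from scipy.spatial.distance import cdist

def get_ssm(X):
    # Euclidean self-similarity matrix for features X of shape (n_frames, n_dims)
    return cdist(X, X, metric='euclidean')

X = np.random.randn(400, 20)
D = get_ssm(X)
p = np.arange(D.shape[0])
I, J = np.meshgrid(p, p)
condensed = D[I < J]          # one triangle, as stored in Features['SSMs']
D2 = np.zeros_like(D)
D2[I < J] = condensed
D2 = D2 + D2.T                # reconstruct the symmetric matrix
assert np.allclose(D, D2)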
Esempio n. 40
0
        audioclip *= normalization_factor  #how Karol does it
        s = 0
        while True:
            window_wav = audioclip[(s * STEP_SIZE):(
                s * STEP_SIZE + TIME_WINDOW_SIZE)]  #how Karol does it
            s += 1
            if len(window_wav) < TIME_WINDOW_SIZE: break
            window_spcgm = librosa.feature.melspectrogram(
                window_wav,
                hop_length=512,
                n_fft=fft_window_len,
                sr=sampl_freq_Hz,
                n_mels=img_height)  #how Karol does it
            # melspectrogram returns a width of 42 for some reason, so trim to img_width (41)
            window_spcgm = window_spcgm[:, :img_width]
            window_spcgm = librosa.logamplitude(window_spcgm)  # how Karol does it
            if np.mean(window_spcgm) <= silence_threshold:  # that's what Karol said
                too_quiet_ctr += 1
            else:
                observations_spcgm = np.vstack(
                    (observations_spcgm, [window_spcgm]))
                observations_wav = np.vstack((observations_wav, window_wav))
                labels = np.hstack((labels, label_for_file))  #*np.ones(1, int)
                classPriorsAfterWindowing[label_for_file] += 1

    tooShortList = classPriorsRaw - classPriorsBeforeWindowing
    tooShortList_mat[fold_num - 1] = tooShortList
    classPriorsRaw /= N
    classPriorsRaw_mat[fold_num - 1] = classPriorsRaw
    classPriorsBeforeWindowing /= N - np.sum(tooShortList)
Esempio n. 41
0
def harmonic_index(
        sourcefile,
        offset=0.0,
        duration=120.0,
        key=None,
        output_dir=None,
        n_fft=4096,
        hop_length=1024,
        pitch_median=5,  # how many frames for running medians?
        high_pass_f=40.0,
        low_pass_f=4000.0,
        debug=False,
        cached=True,
        n_peaks=16,
        **kwargs):
    """
    Index spectral peaks
    """
    if debug:
        from librosa.display import specshow
        import matplotlib.pyplot as plt
    # args that will make a difference to content,
    # apart from the sourcefile itself
    argset = dict(
        analysis="harmonic_index",
        # sourcefile=sourcefile,
        offset=offset,
        duration=duration,
        n_fft=n_fft,
        hop_length=hop_length,
        high_pass_f=high_pass_f,
        low_pass_f=low_pass_f,
        pitch_median=pitch_median,
        n_peaks=n_peaks,
    )
    sourcefile = Path(sourcefile).resolve()
    if output_dir is None:
        output_dir = sourcefile.parent
    output_dir = Path(output_dir)

    if key is None:
        key = str(sourcefile.stem) + "___" + sfio.safeish_hash(argset)

    metadatafile = (output_dir / key).with_suffix(".json")
    if cached and metadatafile.exists():
        return json.load(metadatafile.open("r"))

    metadata = dict(key=key, metadatafile=str(metadatafile), **argset)
    y, sr = sfio.load(str(sourcefile),
                      sr=None,
                      mono=True,
                      offset=offset,
                      duration=duration)

    if high_pass_f is not None:
        y = basicfilter.high_passed(y, sr, high_pass_f)

    dur = librosa.get_duration(y=y, sr=sr)

    metadata["dur"] = dur
    metadata["sr"] = sr
    # convert to spectral frames
    D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length)
    y_rms = librosa.feature.rmse(S=D)
    # Separate into harmonic and percussive. I think this preserves phase?
    H, P = librosa.decompose.hpss(D)
    # Resynthesize the harmonic component as waveforms
    y_harmonic = librosa.istft(H)
    harmonicfile = str(output_dir / key) + ".harmonic.wav"
    sfio.save(harmonicfile, y_harmonic, sr=sr, norm=True)
    metadata["harmonicfile"] = harmonicfile

    # Now, power spectrogram
    H_mag, H_phase = librosa.magphase(H)

    H_peak_f, H_peak_mag = librosa.piptrack(S=H_mag,
                                            sr=sr,
                                            fmin=high_pass_f,
                                            fmax=low_pass_f)

    # First we smooth to use inter-bin information
    H_peak_f = median_filter(H_peak_f, size=(1, pitch_median))
    H_peak_mag = median_filter(H_peak_mag, size=(1, pitch_median))

    H_peak_power = np.real(H_peak_mag**2)
    H_rms = librosa.feature.rmse(S=H_peak_mag)

    if debug:
        plt.figure()
        specshow(librosa.logamplitude(H_peak_f, ref_power=np.max),
                 y_axis='log',
                 sr=sr)
        plt.title('Peak Freqs')
        plt.figure()
        specshow(librosa.logamplitude(H_peak_power, ref_power=np.max),
                 y_axis='log',
                 sr=sr)
        plt.title('Peak amps')
        plt.figure()

    # Now we pack down to the biggest few peaks:
    H_peak_f, H_peak_power = compress_peaks(H_peak_f, H_peak_power, n_peaks)

    if debug:
        plt.figure()
        specshow(librosa.logamplitude(H_peak_f, ref_power=np.max),
                 y_axis='log',
                 sr=sr)
        plt.title('Peak Freqs packed')
        plt.figure()
        specshow(librosa.logamplitude(H_peak_power, ref_power=np.max),
                 y_axis='log',
                 sr=sr)
        plt.title('Peak amps packed')
        # plt.figure()
        # plt.scatter(
        #     librosa.logamplitude(H_peak_power, ref_power=np.max),
        #     y_axis='log',
        #     sr=sr)
        # plt.title('Compressed')

    return dict(
        metadata=metadata,
        peak_f=H_peak_f,
        peak_power=H_peak_power,
        rms=y_rms,
        harm_rms=H_rms,
    )
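The core of the pipeline above (HPSS on the complex STFT, then sinusoidal peak tracking on the harmonic magnitudes) can be exercised in isolation. A condensed sketch on a synthetic tone, with made-up analysis parameters:

import numpy as np
import librosa

sr = 22050
t = np.arange(2 * sr) / float(sr)
y = np.sin(2 * np.pi * 440.0 * t).astype(np.float32)   # synthetic 440 Hz tone

D = librosa.stft(y, n_fft=2048, hop_length=512)
H, P = librosa.decompose.hpss(D)                # masks applied to the complex STFT
H_mag = np.abs(H)
peak_f, peak_mag = librosa.piptrack(S=H_mag, sr=sr, fmin=40.0, fmax=4000.0)
print(peak_f.shape)  # (1 + n_fft // 2, n_frames); nonzero only at detected peaks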
Esempio n. 42
0
def doSpect(trackL=None, saveDir=None):

    #Use global S as counter for saved spects
    global S

    #Do nothing if test complete
    if S >= 5 and TEST:
        return False  #Do nothing

    #Do we have a track path and genre?
    if trackL is None:
        print 'Missing Track information: [trackPath, genre]'
        return False

    fpath = str(trackL[0])  #File path
    genre = str(trackL[1])  #Track Genre

    #Split up the path string and get
    #the file name and extension
    tmp = fpath.split('/')
    tmp2 = str(tmp[-1]).split('.')
    fullFileName = tmp[-1]  # filename.mp3
    fileName = str(int(tmp2[0]))  # filename (minus leading zeros)
    fileExt = tmp2[1]  # .mp3/.png

    #Verify the file exists and is accessible
    if not os.path.isfile(fpath):
        #File doesn't exist or isn't accessible.
        print 'File: ' + fullFileName + ' does not exist or is not accessible\n'
    else:
        #Create Spectrogram (Modified from Joseph Kotva's Code)

        #Setup the save path
        if saveDir is None:
            savePath = 'sorted/spect/' + genre + '/' + fileName + '.png'
        else:
            savePath = saveDir + '/' + fileName + '.png'

        #Does the spectrogram already exist? Save time, skip it then
        if not os.path.exists(savePath):
            #Try to load the audio file using librosa
            print 'Attempting to load: ' + fpath
            try:
                data, sr = librosa.load(fpath, mono=True)  #mono(1channel)
            except IOError:
                print 'Unable to load: ' + fpath + '\nSkipping...'
                #no s increment here because we didn't make the spectrogram!
                return False  #Failure
                #continue #restart loop at next index, skip this file

            #Was the audio file somehow loaded yet has no data points?
            if data.size == 0:
                print 'Unable to load: ' + fpath + '\nFile was opened but there was no data! Corrupted?\nSkipping...'
                return False  #Failure
                #continue #restart loop at next index, skip this file

            #Some calculations on the audio sample points
            stft = np.abs(librosa.stft(data, n_fft=2048, hop_length=512))
            mel = librosa.feature.melspectrogram(sr=sr, S=stft**2)
            log_mel = librosa.logamplitude(mel)

            #print 'Generating Spectrogram for: '+ fpath
            #Create the spectrogram image
            librosa.display.specshow(log_mel, sr=sr, hop_length=512)
            plt.axis("normal")  # axis limits auto-scaled so the image sits well in the plot box
            plt.margins(0, 0)  #remove margins
            plt.gca().xaxis.set_major_locator(plt.NullLocator())  # remove x axis ticks
            plt.gca().yaxis.set_major_locator(plt.NullLocator())  # remove y axis ticks

            #Save the plotted figure (image) using "SortedVersion" dir structure
            #the image can/will be copied later into a "DataVersion" dir set.
            plt.savefig(savePath,
                        dpi=100,
                        frameon=False,
                        bbox_inches="tight",
                        pad_inches=0.0)
            plt.clf()  #Clear the current figure (possibly helps with speed)

            S += 1  #Increment counter
            print 'Finished spectrogram(' + str(S) + '): ' + savePath
            if S == 5 and TEST:
                print 'Stopping spectrograms here, spect test done!'
        else:
            #The spectrogram already exists, skip it
            print savePath + ' already exists, skipping...'
            if not TEST:
                S += 1  #Keep counting though!

    return True
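Rendering through a full matplotlib figure, as above, is slow and the axis/padding handling is fragile. An alternative sketch that writes the log-mel array straight to a PNG; the scaling and colormap are illustrative choices, not the original author's:

import numpy as np
import librosa
import matplotlib.pyplot as plt

def save_spect_direct(data, sr, save_path):
    stft = np.abs(librosa.stft(data, n_fft=2048, hop_length=512))
    log_mel = librosa.logamplitude(librosa.feature.melspectrogram(sr=sr, S=stft ** 2))
    # scale to [0, 1] and flip vertically so low frequencies sit at the bottom
    img = (log_mel - log_mel.min()) / (log_mel.max() - log_mel.min())
    plt.imsave(save_path, img[::-1], cmap='viridis')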
Esempio n. 43
0
        count += 1

        if count == 0:
            continue

        print count
        if not os.path.exists('spectrograms/' + row[7]):
            os.makedirs('spectrograms/' + row[7])

        y, sr = librosa.load("audio/fold" + str(row[5]) + "/" + str(row[0]))

        # Let's make and display a mel-scaled power (energy-squared) spectrogram
        S = librosa.feature.melspectrogram(y, sr=sr, n_mels=128)

        # Convert to log scale (dB). We'll use the peak power as reference.
        log_S = librosa.logamplitude(S, ref_power=np.max)

        # Make a new figure
        fig = plt.figure(figsize=(12, 4))
        ax = plt.Axes(fig, [0., 0., 1., 1.])
        ax.set_axis_off()
        fig.add_axes(ax)

        # Display the spectrogram on a mel scale
        # sample rate and hop length parameters are used to render the time axis
        librosa.display.specshow(log_S, sr=sr, x_axis='time', y_axis='mel')

        # Make the figure layout compact

        #plt.show()
        plt.savefig('spectrograms/' + row[7] + '/' + row[0] + '.png')
Esempio n. 44
0
def dataset(modalities=0,
            forcetempTime=4,
            contactmicTime=0.2,
            leaveObjectOut=False,
            verbose=False):
    materials = ['plastic', 'glass', 'fabric', 'metal', 'wood', 'ceramic']
    X = []
    y = []
    objects = dict()
    for m, material in enumerate(materials):
        if verbose:
            print 'Processing', material
            sys.stdout.flush()
        with open(
                'data_processed/processed_0.1sbefore_%s_times_%.2f_%.2f.pkl' %
            (material, forcetempTime, contactmicTime), 'rb') as f:
            allData = pickle.load(f)
            for j, (objName, objData) in enumerate(allData.iteritems()):
                if leaveObjectOut:
                    objects[objName] = {'x': [], 'y': []}
                    X = objects[objName]['x']
                    y = objects[objName]['y']
                for i in xrange(len(objData['temperature'])):
                    y.append(m)

                    if modalities > 2:
                        # Mel-scaled power (energy-squared) spectrogram
                        sr = 48000
                        S = librosa.feature.melspectrogram(np.array(
                            objData['contact'][i]),
                                                           sr=sr,
                                                           n_mels=128)
                        # Convert to log scale (dB)
                        log_S = librosa.logamplitude(S, ref_power=np.max)

                    if modalities == 0:
                        X.append(objData['force0'][i] + objData['force1'][i])
                    elif modalities == 1:
                        X.append(objData['temperature'][i])
                    elif modalities == 2:
                        X.append(objData['temperature'][i] +
                                 objData['force0'][i] + objData['force1'][i])
                    elif modalities == 3:
                        X.append(log_S.flatten())
                    elif modalities == 4:
                        X.append(objData['temperature'][i] +
                                 log_S.flatten().tolist())
                    elif modalities == 5:
                        X.append(objData['temperature'][i] +
                                 objData['force0'][i] + objData['force1'][i] +
                                 log_S.flatten().tolist())
                    elif modalities == 6:
                        X.append(objData['force0'][i] + objData['force1'][i] +
                                 log_S.flatten().tolist())

    if leaveObjectOut:
        return objects
    else:
        X = np.array(X)
        y = np.array(y)
        if verbose:
            print 'X:', np.shape(X), 'y:', np.shape(y)
        return X, y
Esempio n. 45
0
A = np.sin(2*np.pi*np.arange(20992)*400/20000) #about 400 Hz
B = np.sin(2*np.pi*np.arange(20992)*200/20000) #about 200 Hz
C = A + B
D = np.hstack((np.sin(2*np.pi*np.arange(10992)*200/20000), np.sin(2*np.pi*np.arange(10000)*400/20000)))
D[:3500] = 0
D[14500:18000] = 0
E = np.sin(2*np.pi*np.arange(20992)*20/20000*150/20992*np.arange(1, 20993))

#wav_signals = np.vstack((A, B, C, D, E))
wav_signals = scipy.io.loadmat("data/fold10_RANDOM_OBS")['picked_obs']

all_spcgm = np.zeros((0, 60, 41), np.float64)
for j in range(wav_signals.shape[0]):
    spcgm = librosa.feature.melspectrogram(wav_signals[j], hop_length=512, n_fft=1024, sr=22050, n_mels=60) #how Karol does it
    spcgm = spcgm[:, :41]  # melspectrogram returns a width of 42 for some reason, so trim to 41
    spcgm = librosa.logamplitude(spcgm)  # how Karol does it
    all_spcgm = np.vstack((all_spcgm, [spcgm]))

wav_signals = np.expand_dims(wav_signals, -1)
synth = {x_pl_1: wav_signals}

sess=tf.Session(config=tf.ConfigProto(gpu_options=gpu_opts))
sess.run(tf.global_variables_initializer())
if STORE_TEST_ERROR: res = sess.run(fetches=fetches_test, feed_dict=eat_this)
activations_of_wav_signals = sess.run(a2, synth)
sess.close()

if STORE_TEST_ERROR:
    y_test_pred = res[0]
    test_loss = res[1]
    test_accuracy = res[2]
Esempio n. 46
0
    def __call__(self, S):
        return librosa.logamplitude(S, **self.__dict__)
Esempio n. 47
0
def feature_extraction(y, fs=44100, statistics=True, include_mfcc0=True, include_delta=True,
                       include_acceleration=True, mfcc_params=None, delta_params=None, acceleration_params=None):
    """Feature extraction, MFCC based features

    Outputs features in dict, format:

        {
            'feat': feature_matrix [shape=(frame count, feature vector size)],
            'stat': {
                'mean': numpy.mean(feature_matrix, axis=0),
                'std': numpy.std(feature_matrix, axis=0),
                'N': feature_matrix.shape[0],
                'S1': numpy.sum(feature_matrix, axis=0),
                'S2': numpy.sum(feature_matrix ** 2, axis=0),
            }
        }

    Parameters
    ----------
    y: numpy.array [shape=(signal_length, )]
        Audio

    fs: int > 0 [scalar]
        Sample rate
        (Default value=44100)

    statistics: bool
        Calculate feature statistics for extracted matrix
        (Default value=True)

    include_mfcc0: bool
        Include 0th MFCC coefficient into static coefficients.
        (Default value=True)

    include_delta: bool
        Include delta MFCC coefficients.
        (Default value=True)

    include_acceleration: bool
        Include acceleration MFCC coefficients.
        (Default value=True)

    mfcc_params: dict or None
        Parameters for extraction of static MFCC coefficients.

    delta_params: dict or None
        Parameters for extraction of delta MFCC coefficients.

    acceleration_params: dict or None
        Parameters for extraction of acceleration MFCC coefficients.

    Returns
    -------
    result: dict
        Feature dict

    """

    eps = numpy.spacing(1)

    # Windowing function
    if mfcc_params['window'] == 'hamming_asymmetric':
        window = scipy.signal.hamming(mfcc_params['n_fft'], sym=False)
    elif mfcc_params['window'] == 'hamming_symmetric':
        window = scipy.signal.hamming(mfcc_params['n_fft'], sym=True)
    elif mfcc_params['window'] == 'hann_asymmetric':
        window = scipy.signal.hann(mfcc_params['n_fft'], sym=False)
    elif mfcc_params['window'] == 'hann_symmetric':
        window = scipy.signal.hann(mfcc_params['n_fft'], sym=True)
    else:
        window = None

    # Calculate Static Coefficients
    magnitude_spectrogram = numpy.abs(librosa.stft(y + eps,
                                                   n_fft=mfcc_params['n_fft'],
                                                   win_length=mfcc_params['win_length'],
                                                   hop_length=mfcc_params['hop_length'],
                                                   center=True,
                                                   window=window)) ** 2
    mel_basis = librosa.filters.mel(sr=fs,
                                    n_fft=mfcc_params['n_fft'],
                                    n_mels=mfcc_params['n_mels'],
                                    fmin=mfcc_params['fmin'],
                                    fmax=mfcc_params['fmax'],
                                    htk=mfcc_params['htk'])
    mel_spectrum = numpy.dot(mel_basis, magnitude_spectrogram)
    mfcc = librosa.feature.mfcc(S=librosa.logamplitude(mel_spectrum),
                                n_mfcc=mfcc_params['n_mfcc'])

    # Collect the feature matrix
    feature_matrix = mfcc
    if include_delta:
        # Delta coefficients
        mfcc_delta = librosa.feature.delta(mfcc, **delta_params)

        # Add Delta Coefficients to feature matrix
        feature_matrix = numpy.vstack((feature_matrix, mfcc_delta))

    if include_acceleration:
        # Acceleration coefficients (aka delta-delta)
        mfcc_delta2 = librosa.feature.delta(mfcc, order=2, **acceleration_params)

        # Add Acceleration Coefficients to feature matrix
        feature_matrix = numpy.vstack((feature_matrix, mfcc_delta2))

    if not include_mfcc0:
        # Omit mfcc0
        feature_matrix = feature_matrix[1:, :]

    feature_matrix = feature_matrix.T

    # Collect into data structure
    if statistics:
        return {
            'feat': feature_matrix,
            'stat': {
                'mean': numpy.mean(feature_matrix, axis=0),
                'std': numpy.std(feature_matrix, axis=0),
                'N': feature_matrix.shape[0],
                'S1': numpy.sum(feature_matrix, axis=0),
                'S2': numpy.sum(feature_matrix ** 2, axis=0),
            }
        }
    else:
        return {
            'feat': feature_matrix}
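A usage sketch for the function above, assuming `feature_extraction` and its imports are in scope. The parameter values below are illustrative defaults, not the configuration the original project ships with:

import numpy

y = numpy.random.randn(44100)            # stand-in for one second of loaded audio
mfcc_params = dict(window='hamming_asymmetric', n_fft=2048, win_length=2048,
                   hop_length=1024, n_mels=40, fmin=0, fmax=22050,
                   htk=False, n_mfcc=20)
result = feature_extraction(y, fs=44100,
                            mfcc_params=mfcc_params,
                            delta_params=dict(width=9),
                            acceleration_params=dict(width=9))
print(result['feat'].shape)              # (frame count, 60): static + delta + delta-delta
print(result['stat']['mean'].shape)      # (60,)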
Esempio n. 48
0
def extract_features(parent_dir, sub_dirs, file_ext="*.wav",bands=60, frames=101, output=""):
    window_size = 512 * (frames-1)
    log_specgrams = []
    labels = []

    # 90%
    """
    (0, 60, 101, 2)
    (0, 60, 101, 2)
    (0, 60, 101, 2)
    (0, 60, 101, 2)
    (0, 60, 101, 2)
    (0, 60, 101, 2)
    (0, 60, 101, 2)
    (0, 60, 101, 2)
    (0, 60, 101, 2)
    (0, 60, 101, 2)
    (0, 60, 101, 2)
    """

    # 50%
    """
    
    (0, 60, 41, 2)
    (13173, 60, 41, 2)
    (13021, 60, 41, 2)
    (14168, 60, 41, 2)
    (14606, 60, 41, 2)
    (13727, 60, 41, 2)
    (12279, 60, 41, 2)
    (12769, 60, 41, 2)
    (11955, 60, 41, 2)
    (12371, 60, 41, 2)
    (12610, 60, 41, 2)
    """

    for l, sub_dir in enumerate(sub_dirs):
        for fn in glob.glob(os.path.join(parent_dir, sub_dir, file_ext)):
            sound_clip, s = librosa.load(fn)
            label = fn.split('\\')[3].split('-')[1]
            # UrbanSound8K/audio/fold1/7061-6-0-0.wav

            for (start, end) in windows(sound_clip, window_size):
                if (len(sound_clip[start:end]) == window_size):
                    signal = sound_clip[start:end]
                    melspec = librosa.feature.melspectrogram(signal, n_mels=bands)
                    logspec = librosa.logamplitude(melspec)
                    logspec = logspec.T.flatten()[:, np.newaxis].T

                    # to add a dimension to an existing array, use [:, np.newaxis]
                    # logspec = (60, 41)
                    # logspec.T.flatten() = (41, 60) -> (2460,) -> (2460, 1) -> (1, 2460)

                    log_specgrams.append(logspec)
                    labels.append(label)

    log_specgrams = np.asarray(log_specgrams).reshape(len(log_specgrams), bands, frames, 1)
    features = np.concatenate((log_specgrams, np.zeros(np.shape(log_specgrams))), axis=3)

    # features
    # (5446,60,41,2)

    for i in range(len(features)):
        features[i, :, :, 1] = librosa.feature.delta(features[i, :, :, 0])

    print(features.shape)

    np.savez("Extraction/audio" + output ,features=features,labels=labels)
    return np.array(features), np.array(labels)
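The `windows` generator is not shown in this example. In the UrbanSound8K recipes this snippet appears to derive from, it is usually a half-overlapping slider; a plausible implementation (the 50% overlap matches the shape dumps in the commented block above):

def windows(data, window_size):
    # yield half-overlapping (start, end) index pairs over the clip
    start = 0
    while start < len(data):
        yield int(start), int(start + window_size)
        start += window_size / 2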
Esempio n. 49
0
def build_datasets(train_percentage=0.8, preproc=False):
    '''
    We build the training & testing datasets here, and we do it separately.
    Why not just make one big dataset, shuffle, and then split into train & test?
    Because we want the statistics of training & testing to be as similar as possible.
    '''
    if (preproc):
        path = ROOT + "Preproc/"
    else:
        path = ROOT + "Samples/"

    class_names = get_class_names(path=path)
    print("class_names = ", class_names)

    total_files, total_train, total_test = get_total_files(
        path=path, train_percentage=train_percentage)
    print("total files = ", total_files)

    nb_classes = len(class_names)
    mel_dims = get_sample_dimensions(path=path)
    # pre-allocate memory for speed (old method used np.concatenate, slow)
    X_train = np.zeros((total_train, mel_dims[1], mel_dims[2], mel_dims[3]))
    Y_train = np.zeros((total_train, nb_classes))
    X_test = np.zeros((total_test, mel_dims[1], mel_dims[2], mel_dims[3]))
    Y_test = np.zeros((total_test, nb_classes))
    paths_train = []
    paths_test = []

    train_count = 0
    test_count = 0
    for idx, classname in enumerate(class_names):
        this_Y = np.array(encode_class(classname, class_names))
        this_Y = this_Y[np.newaxis, :]
        class_files = os.listdir(path + classname)
        n_files = len(class_files)
        n_load = n_files
        n_train = int(train_percentage * n_load)
        printevery = 100
        print("")
        for idx2, infilename in enumerate(class_files[0:n_load]):
            audio_path = path + classname + '/' + infilename
            if (0 == idx2 % printevery):
                print(
                    '\r Loading class: {:14s} ({:2d} of {:2d} classes)'.format(
                        classname, idx + 1, nb_classes),
                    ", file ",
                    idx2 + 1,
                    " of ",
                    n_load,
                    ": ",
                    audio_path,
                    sep="")
            #start = timer()
            if (preproc):
                melgram = np.load(audio_path)
                sr = 44100
            else:
                aud, sr = librosa.load(audio_path, mono=mono, sr=None)
                melgram = librosa.logamplitude(
                    librosa.feature.melspectrogram(aud, sr=sr, n_mels=96),
                    ref_power=1.0)[np.newaxis, np.newaxis, :, :]
            #end = timer()
            #print("time = ",end - start)
            # just in case files are different sizes: clip to first file size
            melgram = melgram[:, :, :, 0:mel_dims[3]]

            if (idx2 < n_train):
                # concatenate is SLOW for big datasets; use pre-allocated instead
                #X_train = np.concatenate((X_train, melgram), axis=0)
                #Y_train = np.concatenate((Y_train, this_Y), axis=0)
                X_train[train_count, :, :] = melgram
                Y_train[train_count, :] = this_Y
                paths_train.append(
                    audio_path)  # list-appending is still fast. (??)
                train_count += 1
            else:
                X_test[test_count, :, :] = melgram
                Y_test[test_count, :] = this_Y
                #X_test = np.concatenate((X_test, melgram), axis=0)
                #Y_test = np.concatenate((Y_test, this_Y), axis=0)
                paths_test.append(audio_path)
                test_count += 1
        print("")

    print("Shuffling order of data...")
    X_train, Y_train, paths_train = shuffle_XY_paths(X_train, Y_train,
                                                     paths_train)
    X_test, Y_test, paths_test = shuffle_XY_paths(X_test, Y_test, paths_test)

    return X_train, Y_train, paths_train, X_test, Y_test, paths_test, class_names, sr
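`shuffle_XY_paths` is referenced but not shown; a plausible implementation that applies one shared permutation to all three containers:

import numpy as np

def shuffle_XY_paths(X, Y, paths):
    # shuffle X, Y and the path list with a single shared permutation
    assert X.shape[0] == Y.shape[0] == len(paths)
    idx = np.random.permutation(X.shape[0])
    return X[idx], Y[idx], [paths[i] for i in idx]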
Esempio n. 50
0
def create_spectrogram_plots(
        label_folder='electronic_music/Trance_label/Train/',
        sr=44100,
        n_mels=128,
        n_fft=2048,
        hop_length=512,
        song_duration=180.0,
        n_classes=4):
    """
    Create a spectrogram from a randomly selected song for each artist and plot it.

    :param label_folder:
    :param sr:
    :param n_mels:
    :param n_fft:
    :param hop_length:
    :param song_duration:
    :param n_classes:
    :return:
    """

    # get list of all artists
    labels = os.listdir(label_folder)

    fig, ax = plt.subplots(nrows=2,
                           ncols=int(n_classes / 2),
                           figsize=(14, 12),
                           sharex=True,
                           sharey=True)

    row = 0
    col = 0

    # iterate through labels and random songs and plot a spectrogram on a grid
    for label in labels:
        # Randomly select album and song
        label_path = os.path.join(label_folder, label)
        label_songs = os.listdir(label_path)
        song = random.choice(label_songs)
        song_path = os.path.join(label_path, song)

        # Create mel spectrogram
        audio = MP3(song_path)
        audio_length = int(audio.info.length)
        audio_middle = (audio_length - int(song_duration)) / 2

        y, sr = librosa.load(song_path, sr=sr, offset=audio_middle, duration=5)
        S = librosa.feature.melspectrogram(y,
                                           sr=sr,
                                           n_mels=n_mels,
                                           n_fft=n_fft,
                                           hop_length=hop_length)
        log_S = librosa.logamplitude(S, ref_power=1.0)

        # Plot on grid
        plt.axes(ax[row, col])
        librosa.display.specshow(log_S, sr=sr)
        plt.title(label)
        col += 1
        if col == int(n_classes / 2):
            row += 1
            col = 0

    fig.tight_layout()
Esempio n. 51
0
    # compute mean
    mean = np.mean(region, axis=1)

    # subtract mean
    out[:, frame] = X[:, frame] - mean

    # store noise
    noise[:, frame] = mean

# zero negative values
out[out < 0] = 0.0

# plot spectrum
plt.figure()
librosa.display.specshow(librosa.logamplitude(X), sr=sr, y_axis='linear')
plt.title('before')
plt.show()

# plot noise reduced spectrogram
plt.figure()
librosa.display.specshow(librosa.logamplitude(out), sr=sr, y_axis='linear')
plt.title('signal')
plt.show()

# plot mean / noise
plt.figure()
librosa.display.specshow(librosa.logamplitude(noise), sr=sr, y_axis='linear')
plt.title('noise')
plt.show()
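The fragment above begins mid-loop; a self-contained sketch of the same running-mean spectral subtraction idea (the window length and the use of a trailing region are assumptions, since that part of the loop is not shown):

import numpy as np
import librosa

y, sr = np.random.randn(22050 * 3).astype(np.float32), 22050  # stand-in for loaded audio
X = np.abs(librosa.stft(y))
out = np.zeros_like(X)
noise = np.zeros_like(X)
width = 50  # trailing frames used to estimate the noise floor (assumed)

for frame in range(X.shape[1]):
    region = X[:, max(0, frame - width):frame + 1]
    mean = np.mean(region, axis=1)       # per-bin running mean = noise estimate
    out[:, frame] = X[:, frame] - mean   # subtract the estimate
    noise[:, frame] = mean               # keep it for inspection

out[out < 0] = 0.0                       # zero the negative magnitudes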
Esempio n. 52
0
    def _compute_mfcc(self, audio) -> None:
        self.melspec = melspectrogram(audio.raw,
                                      sr=Clip.RATE,
                                      hop_length=Clip.FRAME)
        self.logamp = logamplitude(self.melspec)
        self.mfcc = mfcc(S=self.logamp, n_mfcc=13).transpose()
Esempio n. 53
0
def get_gfb(filelist, config):
    # Read the filelist
    fp = open(filelist, 'r')
    flist = fp.read().splitlines()
    flist = filter(None, flist)
    # Create output directory if non-existant
    opdir = os.path.dirname(flist[0].split(',')[1])
    if not os.path.exists(opdir):
        os.makedirs(opdir)
    # Read the relevant configs from the configfile
    framelen = float(config['framelen'])
    frameshift = float(config['frameshift'])
    wintype = config['wintype']
    if wintype == 'rectangular':
        winfun = np.ones
    else:
        winfun = getattr(np, wintype)
    # Number of channels for gammatone filterbank
    if 'nbanks' in config:
        nbanks = int(config['nbanks'])
    else:
        raise ConfigError('nbanks parameter not set in config file')
    # Min frequency of Gammatone filterbank
    if 'min_freq' in config:
        min_freq = float(config['min_freq'])
    else:
        min_freq = 0
    mvn = config['mvn']
    mvn = mvn.upper() == 'TRUE'
    if 'std_frac' in config:
        std_frac = float(config['std_frac'])
    else:
        std_frac = 1.0
    del1_flag = config['delta1']
    del2_flag = config['delta2']
    del1_flag = del1_flag.upper() == 'TRUE'
    del2_flag = del2_flag.upper() == 'TRUE'
    # Iterate over the filelist to extract features
    if mvn:
        feats_list = []
        for iter1, fline in enumerate(flist):
            infnm = fline.split(',')[0]
            opfnm = fline.split(',')[1]
            sig, fs = librosa.load(infnm, sr=None)
            sig = sig / max(abs(sig))
            dither = 1e-6 * np.random.rand(sig.shape[0])
            sig = sig + dither
            win_length = int(fs * framelen * 0.001)
            hop_length = int(fs * frameshift * 0.001)
            feats = gtgram.gtgram(sig, fs, framelen * 0.001,
                                  frameshift * 0.001, nbanks, min_freq)
            # Code for amplitude range compression
            if config['compression'] == 'log':
                feats = librosa.logamplitude(feats)
            elif config['compression'][0:4] == 'root':
                rootval = float(config['compression'].split('_')[1])
                feats = np.sign(feats) * (np.abs(feats)**(1 / rootval))
                if np.sum(np.isnan(feats)):
                    print('NaN Error in root compression for file: %s' % infnm)
                    exit()
            if del1_flag:
                feats_del1 = librosa.feature.delta(feats, order=1, axis=1)
            if del2_flag:
                feats_del2 = librosa.feature.delta(feats, order=2, axis=1)
            if del1_flag:
                feats = np.concatenate((feats, feats_del1), axis=0)
            if del2_flag:
                feats = np.concatenate((feats, feats_del2), axis=0)

            feats_list.append(feats)
        all_feats = np.concatenate(feats_list, axis=1)
        f_mean = np.mean(all_feats, axis=1)[:, None]
        f_std = np.std(all_feats, axis=1)[:, None]
        opdir = os.path.dirname(opfnm)
        mvn_params = np.concatenate((f_mean, f_std), axis=1)
        postfix = os.path.basename(filelist).split('.')[0]
        np.save(opdir + '/mvn_params_' + postfix + '.npy', mvn_params)

    for iter1, fline in enumerate(flist):
        infnm = fline.split(',')[0]
        opfnm = fline.split(',')[1]
        sig, fs = librosa.load(infnm, sr=None)
        sig = sig / max(abs(sig))
        dither = 1e-6 * np.random.rand(sig.shape[0])
        sig = sig + dither
        win_length = int(fs * framelen * 0.001)
        hop_length = int(fs * frameshift * 0.001)
        feats = gtgram.gtgram(sig, fs, framelen * 0.001, frameshift * 0.001,
                              nbanks, min_freq)
        if config['compression'] == 'log':
            feats = librosa.logamplitude(feats)
        elif config['compression'][0:4] == 'root':
            rootval = float(config['compression'].split('_')[1])
            feats = np.sign(feats) * (np.abs(feats)**(1 / rootval))
            if np.sum(np.isnan(feats)):
                print('NaN Error in root compression for file: %s' % infnm)
                exit()
        if del1_flag:
            feats_del1 = librosa.feature.delta(feats, order=1, axis=1)
        if del2_flag:
            feats_del2 = librosa.feature.delta(feats, order=2, axis=1)
        if del1_flag:
            feats = np.concatenate((feats, feats_del1), axis=0)
        if del2_flag:
            feats = np.concatenate((feats, feats_del2), axis=0)
        if mvn:
            feats = mvnormalize(feats, mvn_params, std_frac)
        writehtk(feats.T, frameshift, opfnm)
    fp.close()
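`mvnormalize` and `writehtk` come from the project's own utilities. A plausible mean-variance normalization consistent with how `mvn_params` is built above (mean in column 0, std in column 1):

import numpy as np

def mvnormalize(feats, mvn_params, std_frac=1.0):
    # normalize (n_bands, n_frames) features with precomputed per-band mean/std
    f_mean = mvn_params[:, 0:1]
    f_std = mvn_params[:, 1:2]
    return (feats - f_mean) / (std_frac * f_std)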
Esempio n. 54
0
def create_melspectrogram_dataset(
        label_folder='electronic_music/Trance_label/Train/',
        save_folder='song_mel_label_data',
        sr=44100,
        n_mels=128,
        n_fft=2048,
        hop_length=512,
        song_duration=180.0,
        create_data=False):
    """
    This function creates the dataset given a folder with the correct structure (label_folder/label/*.mp3)
    and saves it to a specified folder.

    :param label_folder:
    :param save_folder:
    :param sr:
    :param n_mels:
    :param n_fft:
    :param hop_length:
    :param song_duration:
    :param create_data:
    :return:
    """
    if create_data:
        # get list of all labels
        os.makedirs(save_folder, exist_ok=True)
        labels = [
            path for path in os.listdir(label_folder)
            if os.path.isdir(label_folder + path)
        ]

        # iterate through all labels and songs and compute the mel spectrogram
        for label in labels:
            print('{} \n'.format(label))
            label_path = os.path.join(label_folder, label)
            label_songs = os.listdir(label_path)

            for song in label_songs:
                print(song)
                song_path = os.path.join(label_path, song)

                # Create mel spectrogram for song_duration in the middle of the song and convert it to the log scale
                audio = MP3(song_path)
                audio_length = int(audio.info.length)
                audio_middle = (audio_length - int(song_duration)) / 2
                y, sr = librosa.load(song_path,
                                     sr=sr,
                                     offset=audio_middle,
                                     duration=song_duration)
                S = librosa.feature.melspectrogram(y,
                                                   sr=sr,
                                                   n_mels=n_mels,
                                                   n_fft=n_fft,
                                                   hop_length=hop_length)
                log_S = librosa.logamplitude(S, ref_power=1.0)
                data = (label, log_S, song)

                # Save each song
                save_name = label + '_%%-%%_' + song
                with open(os.path.join(save_folder, save_name), 'wb') as fp:
                    dill.dump(data, fp)
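A matching loader sketch for the `(label, log_S, song)` tuples the function above saves; the helper name and the assumption that every file in `save_folder` is a dill pickle are illustrative:

import os
import dill

def load_melspectrogram_dataset(save_folder='song_mel_label_data'):
    labels, specs, songs = [], [], []
    for fname in os.listdir(save_folder):
        with open(os.path.join(save_folder, fname), 'rb') as fp:
            label, log_S, song = dill.load(fp)
        labels.append(label)
        specs.append(log_S)
        songs.append(song)
    return labels, specs, songs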
Esempio n. 55
0
#############  STFT  #########################

## audio parameters
sample_rate = 16000
n_fft = 512  # 32 ms frame (like in paper)
hop_size = 128  # 75% overlap

## training parameters
gamma = 2

## fft preprocessing
wn_stft = librosa.core.stft(wn, n_fft, hop_size)
y_stft = librosa.core.stft(y, n_fft, hop_size)
x_stft = librosa.core.stft(x, n_fft, hop_size)

log_spec_wn = librosa.logamplitude(np.abs(wn_stft))**gamma
log_spec_y = librosa.logamplitude(np.abs(y_stft))**gamma
log_spec_x = librosa.logamplitude(np.abs(x_stft))**gamma

## plot spectrograms
# plt.figure(figsize=(12, 8))
# plt.subplot(1,2,1)
# librosa.display.specshow(log_spec_wn,fs,hop_size,x_axis="time", y_axis="log")
# plt.subplot(1,2,2)
# librosa.display.specshow(log_spec_y,fs,hop_size,x_axis="time", y_axis="log")
# plt.show()

## LOAD DICTIONARY

# Load previously computed dictionary:
D = pickle.load(open("Dictionary_4atoms_10it.npy", "rb"))
Esempio n. 56
0
def specgram(audio,
             n_fft=512,
             hop_length=None,
             mask=True,
             log_mag=True,
             re_im=False,
             dphase=True,
             mag_only=False):
    """Spectrogram using librosa.
  Args:
    audio: 1-D array of float32 sound samples.
    n_fft: Size of the FFT.
    hop_length: Stride of FFT. Defaults to n_fft/2.
    mask: Mask the phase derivative by the magnitude.
    log_mag: Use the logamplitude.
    re_im: Output Real and Imag. instead of logMag and dPhase.
    dphase: Use derivative of phase instead of phase.
    mag_only: Don't return phase.
  Returns:
    specgram: [n_fft/2 + 1, audio.size / hop_length, 2]. The first channel is
      the logamplitude and the second channel is the derivative of phase.
  """
    if not hop_length:
        hop_length = int(n_fft / 2.)

    fft_config = dict(n_fft=n_fft,
                      win_length=n_fft,
                      hop_length=hop_length,
                      center=True)

    spec = librosa.stft(audio, **fft_config)

    if re_im:
        re = spec.real[:, :, np.newaxis]
        im = spec.imag[:, :, np.newaxis]
        spec_real = np.concatenate((re, im), axis=2)

    else:
        mag, phase = librosa.core.magphase(spec)
        phase_angle = np.angle(phase)

        # Magnitudes, scaled 0-1
        if log_mag:
            mag = (librosa.logamplitude(
                mag**2, amin=1e-13, top_db=120., ref_power=np.max) / 120.) + 1
        else:
            mag /= mag.max()

        if dphase:
            #  Derivative of phase
            phase_unwrapped = np.unwrap(phase_angle)
            p = phase_unwrapped[:, 1:] - phase_unwrapped[:, :-1]
            p = np.concatenate([phase_unwrapped[:, 0:1], p], axis=1) / np.pi
        else:
            # Normal phase
            p = phase_angle / np.pi
        # Mask the phase
        if log_mag and mask:
            p = mag * p
        # Return Mag and Phase
        p = p.astype(np.float32)[:, :, np.newaxis]
        mag = mag.astype(np.float32)[:, :, np.newaxis]
        if mag_only:
            spec_real = mag  # channel axis was already added above
        else:
            spec_real = np.concatenate((mag, p), axis=2)
    return spec_real
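A quick shape check for the function above, assuming `specgram` is in scope (synthetic input; values are illustrative):

import numpy as np

audio = np.random.randn(16000).astype(np.float32)
spec = specgram(audio, n_fft=512)
print(spec.shape)   # (257, 16000 // 256 + 1, 2): log-magnitude + phase derivative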
Esempio n. 57
0
    def __init__(self, 
                 path,
                 name = '',         # optional name
                 
                 # selectors
                 subjects='all',        # optional selector (list) or 'all'
                 trial_types='all',     # optional selector (list) or 'all'
                 trial_numbers='all',   # optional selector (list) or 'all'
                 conditions='all',      # optional selector (list) or 'all'     
                 
                 partitioner = None,            
                 
                 channel_filter = NoChannelFilter(),   # optional channel filter, default: keep all
                 channel_names = None,  # optional channel names (for metadata)
                 
                 label_map = None,      # optional conversion of labels

                 remove_dc_offset = False,  # optional subtraction of channel mean, usually done already earlier
                 resample = None,       # optional down-sampling

                 # optional sub-sequences selection
                 start_sample = 0,
                 stop_sample  = None,   # optional for selection of sub-sequences

                 # optional signal filter to by applied before spitting the signal
                 signal_filter = None,

                 # windowing parameters
                 frame_size = -1,
                 hop_size   = -1,       # values > 0 will lead to windowing
                 hop_fraction = None,   # alternative to specifying absolute hop_size
                 
                 # optional spectrum parameters, n_fft = 0 keeps raw data
                 n_fft = 0,
                 n_freq_bins = None,
                 spectrum_log_amplitude = False,
                 spectrum_normalization_mode = None,
                 include_phase = False,

                 flatten_channels=False,
                 layout='tf',       # (0,1)-axes layout tf=time x features or ft=features x time

                 save_matrix_path = None,
                 keep_metadata = False,
                 ):
        '''
        Constructor
        '''

        # save params
        self.params = locals().copy()
        del self.params['self']
        # print( self.params)
        
        # TODO: get the whole filtering into an extra class
        
        datafiles_metadata, metadb = load_datafiles_metadata(path)
        
#         print( datafiles_metadata)
        
        def apply_filters(filters, node):            
            if isinstance(node, dict):            
                filtered = []
                keepkeys = filters[0]
                for key, value in node.items():
                    if keepkeys == 'all' or key in keepkeys:
                        filtered.extend(apply_filters(filters[1:], value))
                return filtered
            else:
                return node # [node]
            
        
        # keep only files that match the metadata filters
        self.datafiles = apply_filters([subjects,trial_types,trial_numbers,conditions], datafiles_metadata)
        
        # copy metadata for retained files
        self.metadb = {}
        for datafile in self.datafiles:
            self.metadb[datafile] = metadb[datafile]
        
#         print( self.datafiles)
#         print( self.metadb)
        
        self.name = name

        if partitioner is not None:
            self.datafiles = partitioner.get_partition(self.name, self.metadb)
        
        self.include_phase = include_phase
        self.spectrum_normalization_mode = spectrum_normalization_mode
        self.spectrum_log_amplitude = spectrum_log_amplitude

        self.sequence_partitions = [] # used to keep track of original sequences
        
        # metadata: [subject, trial_no, stimulus, channel, start, ]
        self.metadata = []
        
        sequences = []
        labels = []
        n_sequences = 0

        if frame_size > 0 and hop_size == -1 and hop_fraction is not None:
            hop_size = np.ceil(frame_size / hop_fraction)

        for i in xrange(len(self.datafiles)):        
            with log_timing(log, 'loading data from {}'.format(self.datafiles[i])): 

                # save start of next sequence
                self.sequence_partitions.append(n_sequences)

                data, metadata = load(os.path.join(path, self.datafiles[i]))

                label = metadata['label']
                if label_map is not None:
                    label = label_map[label]

                multi_channel_frames = []

                # process 1 channel at a time
                for channel in xrange(data.shape[1]):
                    # filter channels
                    if not channel_filter.keep_channel(channel):
                        continue

                    samples = data[:, channel]

                    # subtract channel mean
                    if remove_dc_offset:
                        samples -= samples.mean()

                    # down-sample if requested
                    if resample is not None and resample[0] != resample[1]:
                        samples = librosa.resample(samples, resample[0], resample[1])

                    # apply optional signal filter after down-sampling -> requires lower order
                    if signal_filter is not None:
                        samples = signal_filter.process(samples)

                    # get sub-sequence in resampled space
                    # log.info('using samples {}..{} of {}'.format(start_sample,stop_sample, samples.shape))
                    samples = samples[start_sample:stop_sample]

                    if n_fft is not None and n_fft > 0:
                        ### frequency spectrum branch ###

                        # transform to spectogram
                        hop_length = n_fft / 4
            
                        '''
                        from http://theremin.ucsd.edu/~bmcfee/librosadoc/librosa.html
                        >>> # Get a power spectrogram from a waveform y
                        >>> S       = np.abs(librosa.stft(y)) ** 2
                        >>> log_S   = librosa.logamplitude(S)
                        '''                                     
                             
                        S = librosa.core.stft(samples, n_fft=n_fft, hop_length=hop_length)
                        # mag = np.abs(S)        # magnitude spectrum
                        mag = np.abs(S)**2       # power spectrum
                        
                        # include phase information if requested
                        if self.include_phase:
                            # phase = np.unwrap(np.angle(S))
                            phase = np.angle(S)

                        # Optionally: cut off high bands
                        if n_freq_bins is not None:
                            mag = mag[0:n_freq_bins, :]
                            if self.include_phase:
                                phase = phase[0:n_freq_bins, :]
                                                  
                        if self.spectrum_log_amplitude:      
                            mag = librosa.logamplitude(mag)
                            
                        s = mag # for normalization
                                                    
                        '''
                        NOTE on normalization:
                        The right choice depends on the structure of the neural network
                        and (even more) on the properties of the data. There is no single
                        best normalization algorithm; if there were one, it would be used
                        everywhere by default.

                        In theory the data does not need to be normalized at all, but in
                        practice convergence can take forever if the input is spread out
                        too much. The simplest approach is to scale the data to (-1, 1)
                        (or (0, 1), depending on the activation function), and in most
                        cases that works. If the algorithm converges well, that is your
                        answer; if not, there are too many possible problems and remedies
                        to outline here without knowing the actual data.
                        '''
    
                        ## normalize to mean 0, std 1
                        if self.spectrum_normalization_mode == 'mean0_std1':
                            # s = preprocessing.scale(s, axis=0);
                            mean = np.mean(s)
                            std = np.std(s)
                            s = (s - mean) / std
                        
                        ## normalize by linear transform to [0,1]
                        ## (assumes s is non-negative, e.g. a power spectrum;
                        ##  otherwise this only bounds the maximum at 1)
                        elif self.spectrum_normalization_mode == 'linear_0_1':
                            s = s / np.max(s)
                        
                        ## normalize by linear transform to [-1,1]
                        elif self.spectrum_normalization_mode == 'linear_-1_1':
                            s = -1 + 2 * (s - np.min(s)) / (np.max(s) - np.min(s))
                            
                        elif self.spectrum_normalization_mode is not None:
                            raise ValueError(
                                'unsupported spectrum normalization mode {}'.format(
                                    self.spectrum_normalization_mode)
                             )
                        
                        #print( s.mean(axis=0))
                        #print( s.std(axis=0))
    
                        # include phase information if requested
                        if self.include_phase:
                            # normalize phase to [-1, 1]
                            phase = phase / np.pi
                            s = np.vstack([s, phase])
                        
                        # transpose to fit pylearn2 layout
                        s = np.transpose(s)
                        # print( s.shape)

                        ### end of frequency spectrum branch ###
                    else:
                        ### raw waveform branch ###

                        # normalize to max amplitude 1
                        s = librosa.util.normalize(samples)

                        # add 2nd data dimension
                        s = s.reshape(s.shape[0], 1)
                        # print( s.shape)

                        ### end of raw waveform branch ###

                    s = np.asfarray(s, dtype='float32')

                    if frame_size > 0 and hop_size > 0:
                        s = s.copy()  # FIXME: the copy is required in MultiChannelEEGSequencesDataset; without it, frame() below fails
                        frames = frame(s, frame_length=frame_size, hop_length=hop_size)
                    else:
                        frames = s
                    del s
                    # print( frames.shape)

                    if flatten_channels:
                        # add artificial channel dimension
                        frames = frames.reshape((frames.shape[0], frames.shape[1], frames.shape[2], 1))
                        # print( frames.shape)

                        sequences.append(frames)

                        # increment counter by new number of frames
                        n_sequences += frames.shape[0]

                        if keep_metadata:
                            # determine channel name
                            channel_name = None
                            if channel_names is not None:
                                channel_name = channel_names[channel]
                            elif 'channels' in metadata:
                                channel_name = metadata['channels'][channel]

                            self.metadata.append({
                                        'subject'     : metadata['subject'],
                                        'trial_type'  : metadata['trial_type'],
                                        'trial_no'    : metadata['trial_no'],
                                        'condition'   : metadata['condition'],
                                        'channel'     : channel,
                                        'channel_name': channel_name,
                                        'start'       : self.sequence_partitions[-1],  # first frame of this sequence
                                        'stop'        : n_sequences                    # one past the last frame
                                    })

                        for _ in xrange(frames.shape[0]):
                            labels.append(label)
                    else:
                        multi_channel_frames.append(frames)

                    ### end of channel iteration ###


                if not flatten_channels:
                    # turn list into array
                    multi_channel_frames = np.asfarray(multi_channel_frames, dtype='float32')
                    # [channels x frames x time x freq] -> cb01
                    # [channels x frames x time x 1] -> cb0.

                    # move channel dimension to end
                    multi_channel_frames = np.rollaxis(multi_channel_frames, 0, 4)
                    # print( multi_channel_frames.shape)
                    # log.debug(multi_channel_frames.shape)

                    sequences.append(multi_channel_frames)

                    # increment counter by new number of frames
                    n_sequences += multi_channel_frames.shape[0]

                    if keep_metadata:
                        self.metadata.append({
                                    'subject'   : metadata['subject'],
                                    'trial_type': metadata['trial_type'],
                                    'trial_no'  : metadata['trial_no'],
                                    'condition' : metadata['condition'],
                                    'channel'   : 'all',
                                    'start'     : self.sequence_partitions[-1],  # first frame of this sequence
                                    'stop'      : n_sequences                    # one past the last frame
                                })

                    for _ in xrange(multi_channel_frames.shape[0]):
                        labels.append(label)

                ### end of datafile iteration ###
      
        # turn into numpy arrays
        sequences = np.vstack(sequences)
        # print(sequences.shape)
        
        labels = np.hstack(labels)
        
        # one_hot_y = one_hot(labels)
        one_hot_formatter = OneHotFormatter(labels.max() + 1) # FIXME!
        one_hot_y = one_hot_formatter.format(labels)
                
        self.labels = labels

        if layout == 'ft': # swap axes to (batch, feature, time, channels)
            sequences = sequences.swapaxes(1, 2)
            
        log.debug('final dataset shape: {} (b,0,1,c)'.format(sequences.shape))
        super(MultiChannelEEGDataset, self).__init__(topo_view=sequences, y=one_hot_y, axes=['b', 0, 1, 'c'])
        
        log.info('generated dataset "{}" with shape X={}={} y={} labels={} '.
                 format(self.name, self.X.shape, sequences.shape, self.y.shape, self.labels.shape))

        if save_matrix_path is not None:
            matrix = DenseDesignMatrix(topo_view=sequences, y=one_hot_y, axes=['b', 0, 1, 'c'])
            with log_timing(log, 'saving DenseDesignMatrix to {}'.format(save_matrix_path)):
                serial.save(save_matrix_path, matrix)
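
# The spectrum branch above condenses to a short pipeline. Below is a minimal
# sketch (not part of the original dataset class), assuming a 1-D float array
# `samples` and the old pre-0.5 librosa API with logamplitude that these
# examples use throughout.
import numpy as np
import librosa

def channel_spectrogram(samples, n_fft=1024, n_freq_bins=None):
    """Power spectrogram -> log amplitude -> mean0_std1 normalization -> transpose."""
    hop_length = n_fft // 4
    S = librosa.core.stft(samples, n_fft=n_fft, hop_length=hop_length)
    mag = np.abs(S) ** 2                      # power spectrum
    if n_freq_bins is not None:
        mag = mag[:n_freq_bins, :]            # cut off high bands
    mag = librosa.logamplitude(mag)           # log scale (dB), old librosa API
    s = (mag - np.mean(mag)) / np.std(mag)    # 'mean0_std1' normalization mode
    return np.transpose(s)                    # time-major, as expected by pylearn2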
Esempio n. 58
0
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# make pictures name
save_path = 'part1_spectrogram.jpg'

# load the pre-computed mel features (fp is the CSV path, defined elsewhere)
S = pd.read_csv(fp)
print(S.shape)

filter_col = [col for col in S if col.startswith('mel')]
S_dat = S[filter_col]

print(S.onset)

# Convert to log scale (dB). We'll use the peak power as reference.
log_S = librosa.logamplitude(np.array(S_dat).T, ref_power=np.max)

# Make a new figure
plt.figure(figsize=(12,4))

# Display the spectrogram on a mel scale
# sample rate and hop length parameters are used to render the time axis
librosa.display.specshow(log_S, x_axis='time', y_axis='mel')

# Put a descriptive title on the plot
plt.title('mel power spectrogram')

# draw a color bar
plt.colorbar(format='%+02.0f dB')

# Make the figure layout compact
plt.tight_layout()

# Save the figure (save_path was defined above)
plt.savefig(save_path)
Esempio n. 59
0
def analyze_frames(y, sr, debug=False):
    A = {}

    hop_length = 128

    # First, get the track duration
    A['duration'] = float(len(y)) / sr

    # Then, get the beats
    if debug: print "> beat tracking"
    tempo, beats = librosa.beat.beat_track(y, sr, hop_length=hop_length)

    # Push the last frame as a phantom beat
    A['tempo'] = tempo
    A['beats'] = librosa.frames_to_time(beats, sr,
                                        hop_length=hop_length).tolist()

    if debug: print "beats count: ", len(A['beats'])

    if debug: print "> spectrogram"
    S = librosa.feature.melspectrogram(y,
                                       sr,
                                       n_fft=2048,
                                       hop_length=hop_length,
                                       n_mels=80,
                                       fmax=8000)
    S = S / S.max()

    # A['spectrogram'] = librosa.logamplitude(librosa.feature.sync(S, beats)**2).T.tolist()

    # Let's make some beat-synchronous mfccs
    if debug: print "> mfcc"
    S = librosa.feature.mfcc(S=librosa.logamplitude(S), n_mfcc=40)
    A['timbres'] = librosa.feature.sync(S, beats).T.tolist()

    if debug: print "timbres count: ", len(A['timbres'])

    # And some chroma
    if debug: print "> chroma"
    S = np.abs(librosa.stft(y, hop_length=hop_length))

    # Grab the harmonic component
    H = librosa.decompose.hpss(S)[0]

    # H = librosa.hpss.hpss_median(S, win_P=31, win_H=31, p=1.0)[0]
    A['chroma'] = librosa.feature.sync(librosa.feature.chromagram(S=H, sr=sr),
                                       beats,
                                       aggregate=np.median).T.tolist()

    # Relative loudness
    S = S / S.max()
    S = S**2

    if debug: print "> dists"
    dists = structure(
        np.vstack([np.array(A['timbres']).T,
                   np.array(A['chroma']).T]))
    A['dense_dist'] = dists

    edge_lens = [
        A["beats"][i] - A["beats"][i - 1] for i in xrange(1, len(A["beats"]))
    ]
    A["avg_beat_duration"] = np.mean(edge_lens)
    A["med_beat_duration"] = np.median(edge_lens)

    return A
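
# A minimal usage sketch for analyze_frames, in the same Python 2 style as the
# function above. Hedged: 'file.mp3' is a placeholder path, and structure() is
# assumed to be defined in the same module as analyze_frames.
import librosa

y, sr = librosa.load('file.mp3')       # placeholder path, not from the original
A = analyze_frames(y, sr, debug=True)
print A['tempo'], len(A['beats']), A['avg_beat_duration']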
Esempio n. 60
0
def CQT(y,
        sr=44100,
        cqt_hop=1024,
        seconds=2.0,
        n_bins=30,
        bins_per_octave=4,
        fmin=27.5,
        use_han=False):
    """
    Get the constant-q transform of the audio file. Takes ((seconds*sr)//cqt_hop) * cqt_hop
    sample long chunks of the audiofile before doing the cqt computation. Hop length between
    these chunks is frame_length - cqt_hop, where frame_length is the size of the chunks of the
    audiofile. These chunks are necessary because librosa's cqt function can only handle short
    duration audio files in a reasonable amount of time.

    Parameters
    ----------
    cqt_hop : integer
        The hop length between adjacent frames when extracting the cqt feature.
    seconds : float
        The time window to initially chunk the audio file into before
        feeding it into the librosa cqt function.
    n_bins : integer
        The number of cqt frequency bands to extract.
    bins_per_octave : integer
        The number of cqt frequency bands that comprise an octave.
        The number of octaves is n_bins/float(bins_per_octave).
    fmin : integer
        The lowest frequency in the range of frequencies covered by the
        constant q transform.
    use_han : boolean
        If True, window each frame with a hanning window before extracting the CQT.

    Returns
    -------
    CQTlog : np.ndarray [shape=(n_bins, n)]
        The time series of the constant-q transform of the audio file.

    Notes
    -----
    As of 06/22/2016, librosa's util.frame() function already applies a hanning window.

    Examples
    --------
    >>> # Load a file
    >>> y, sr = librosa.load('file.mp3')
    >>> # Calculate the constant q transform of a time-series
    >>> CQTlog = extractor.CQT(y, sr=sr, ...)

    """
    frame_length = int(seconds * sr)
    frame_length = (frame_length // cqt_hop) * cqt_hop
    frame_hop = frame_length - cqt_hop

    padded_y = np.append(y, np.zeros(frame_length))

    y_frames = librosa.util.frame(padded_y,
                                  frame_length=frame_length,
                                  hop_length=frame_hop)

    if use_han:
        han_win = signal.hanning(frame_length)

    CQT_frames = []
    for frame in range(y_frames.shape[1]):
        if not use_han:
            sig = y_frames[:, frame]
        else:
            sig = y_frames[:, frame] * han_win
        CQTf = np.abs(
            librosa.cqt(sig,
                        sr=sr,
                        n_bins=n_bins,
                        hop_length=cqt_hop,
                        bins_per_octave=bins_per_octave,
                        fmin=fmin))
        CQT_frames.append(CQTf[:, 1:-1])

    CQT = np.hstack(CQT_frames)
    CQTlog = librosa.logamplitude(CQT**2, ref_power=np.max)
    return CQTlog
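
# For intuition, a quick check of the chunking arithmetic above; a sketch only,
# plugging in the default arguments of CQT() (sr=44100, seconds=2.0, cqt_hop=1024).
sr, seconds, cqt_hop = 44100, 2.0, 1024

frame_length = int(seconds * sr)                    # 88200 samples
frame_length = (frame_length // cqt_hop) * cqt_hop  # 88064 = 86 * 1024
frame_hop = frame_length - cqt_hop                  # 87040

# Consecutive chunks overlap by exactly cqt_hop samples, so dropping the first
# and last CQT frame of each chunk (CQTf[:, 1:-1]) lets the per-chunk frames
# tile into one continuous sequence.
print(frame_length, frame_hop, frame_length - frame_hop)  # 88064 87040 1024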