Ejemplo n.º 1
0
 def melspectrogram(self):
     """Return a mel spectrogram of ``self.data`` with exactly ``dim`` frames.

     Prints ``self.directory``, then computes a mel spectrogram and keeps
     adjusting the FFT size (when there are too many frames) or the hop
     length (when there are too few) until the frame count equals the
     module-level ``dim``. The result is min-max scaled to the range 0..255.

     Relies on the module-level names ``dim``, ``mels``, ``fmax_`` and
     ``poww``.

     NOTE(review): the loop has no iteration cap and the final scaling
     divides by ``S.max() - S.min()``, which is zero for a constant
     spectrogram.
     """
     print(self.directory)
     n_samples = self.data.shape[0]
     # Starting values are linear functions of the clip length; the constants
     # are kept verbatim (they look like fitted coefficients - TODO confirm).
     fft = int((2.99936669e-02 * n_samples) + 1.42217233e+02)
     hop = int((0.01516035 * n_samples) - 2.14982993)

     def compute(n_fft, hop_length):
         # Audio and rate go by keyword so the call also works on librosa
         # releases where these parameters are keyword-only.
         return melspectrogram(y=self.data,
                               sr=self.rate,
                               n_mels=mels,
                               hop_length=hop_length,
                               center=False,
                               n_fft=n_fft,
                               norm=1,
                               fmax=fmax_,
                               power=poww)

     S = compute(fft, hop)
     while S.shape[1] != dim:
         if S.shape[1] > dim:
             fft += 2
         else:
             hop -= 1
         S = compute(fft, hop)
     S = 255 * (S - S.min()) / (S.max() - S.min())
     return S
Ejemplo n.º 2
0
    def transform_audio(self, y):
        '''Compute PCEN of the (log-) Mel spectrogram at several time constants.

        Parameters
        ----------
        y : np.ndarray
            The audio buffer

        Returns
        -------
        data : dict
            data['mag'] : the stacked PCEN layers (one per time constant),
                cast to ``self.dtype`` and passed through ``self._index``
        '''
        mel_kwargs = dict(sr=self.sr,
                          hop_length=self.hop_length,
                          n_fft=self.n_fft,
                          n_mels=self.n_mels)

        # Dry run on the unpadded signal, only to learn its frame count.
        probe = pcen(melspectrogram(y=y, **mel_kwargs),
                     sr=self.sr,
                     hop_length=self.hop_length,
                     time_constant=1)
        n_frames = probe.shape[1]

        # Prepend a time-reversed copy of the audio so PCEN does not start
        # from a zero initial-energy assumption.
        padded = np.concatenate((y[::-1], y))
        S = melspectrogram(y=padded, **mel_kwargs)
        if self.log:
            S = amplitude_to_db(S, ref=np.max)

        # Time constants are the hop duration in seconds times 1, 2, 4, ...
        hop_seconds = (self.hop_length) / (self.sr)
        multipliers = np.array([2**i for i in range(self.n_t_constants)])
        t_constants = hop_seconds * multipliers

        pcen_layers = []
        for T in t_constants:
            layer = pcen(S,
                         sr=self.sr,
                         hop_length=self.hop_length,
                         time_constant=T)
            # Drop the padded section.
            # NOTE(review): this keeps n_frames - 1 columns; the original
            # author marked this line as the source of an off-by-one error.
            first_kept = layer.shape[1] - n_frames + 1
            layer = layer[:, first_kept:]
            pcen_layers.append(to_dtype(layer, self.dtype))

        stacked = to_dtype(np.asarray(pcen_layers), self.dtype)
        return {'mag': self._index(stacked)}
Ejemplo n.º 3
0
class ExtractMonoAudioFiles(FeaturesExtractor):
    """Feature extractor for single-note audio clips labelled with a MIDI pitch.

    Each annotation file supplies onset, offset and MIDI pitch on its second
    line (tab separated). The matching ``.wav`` segment is loaded and turned
    into a transposed mel spectrogram, paired with one pitch label per frame.
    """

    # Static settings, meant to be set manually.
    # Sample rate requested from the loader (the natural rate of the
    # examples; could be changed).
    sr = 44100
    nblabels = 88
    batchsize = 1000
    # Audio and rate go by keyword so the call also works on librosa
    # releases where these parameters are keyword-only.
    featurefunc = lambda y, sr: lrft.melspectrogram(y=y, sr=sr).T
    inpath = '../simple-wdir'

    # For the feeder.
    featuremutation = lambda y, sr: lrft.melspectrogram(y=y, sr=sr).T

    @staticmethod
    def labelmutation(pitch, nbsamples):
        """Return an (nbsamples, nblabels) array with column ``pitch`` set to 1."""
        labelvect = np.zeros(shape=(nbsamples, ExtractMonoAudioFiles.nblabels))
        labelvect[:, int(pitch)] = 1
        return labelvect

    def __init__(self, inpath=None):
        """Initialise the extractor; ``inpath`` defaults to the class-level path."""
        if inpath is None:
            self.inpath = ExtractMonoAudioFiles.inpath
        else:
            self.inpath = inpath
        super().__init__(self.inpath,
                         ExtractMonoAudioFiles.computefeatureddata,
                         ExtractMonoAudioFiles.extractsamplesmetadata)

    @staticmethod
    def extractsamplesmetadata(path, filelist):
        """Read onset, offset and pitch for every annotation file in ``filelist``.

        Returns a list of ``((wav_name, onset, duration), pitch_index)``
        where ``pitch_index`` is the MIDI pitch minus 21, i.e. rescaled into
        the [0, 87] range.
        """
        samplesmetadata = []

        for f in filelist:
            with open(join(path, f), 'r') as fs:
                # Only the line terminator is stripped, so the last field
                # stays intact even when the file lacks a trailing newline.
                fields = fs.readlines()[1].rstrip('\r\n').split('\t')
            onset, offset, midipitch = tuple(map(float, fields))
            # The wav file shares the annotation's name; the three-character
            # extension is swapped for 'wav'.
            wav_name = f[:-3] + 'wav'
            samplesmetadata.append(
                ((wav_name, onset, offset - onset), int(midipitch - 21)))
        return samplesmetadata

    @staticmethod
    def computefeatureddata(path, samplemetadata):
        """Load one annotated segment and return ``(features, pitch_per_frame)``."""
        meta, pitch = samplemetadata

        audiodat = lrco.load(join(path, meta[0]),
                             sr=ExtractMonoAudioFiles.sr,
                             offset=meta[1],
                             duration=meta[2])
        # lrco.load returns (samples, rate), which featurefunc takes in order.
        audiodat = ExtractMonoAudioFiles.featurefunc(*audiodat)

        # One copy of the pitch label per feature row.
        pitchvect = np.array([pitch] * audiodat.shape[0])

        return (audiodat, pitchvect)
Ejemplo n.º 4
0
    def logmel(self):
        """Compute mel spectrograms for the train and test sets and pickle them.

        Parameters come from ``self.config['logmel_params']`` (``sr``,
        ``n_fft``, ``hop_length``, ``n_mels``). Output goes to a
        ``logmel_<sr>_<n_fft>_<hop_length>_<n_mels>`` directory under
        ``self.dataset['feature_path']``, created if missing.

        NOTE(review): no logarithm is applied here despite the name; confirm
        whether a later stage does it.
        """
        from librosa.feature import melspectrogram
        from librosa.core import load

        logmel_params = self.config['logmel_params']
        sr = logmel_params['sr']
        n_fft = logmel_params['n_fft']
        hop_length = logmel_params['hop_length']
        n_mels = logmel_params['n_mels']

        feature_path = os.path.join(
            self.dataset['feature_path'],
            'logmel_{}_{}_{}_{}'.format(sr, n_fft, hop_length, n_mels))
        # makedirs also creates missing parents and tolerates an existing
        # directory without a separate existence check.
        os.makedirs(feature_path, exist_ok=True)

        def extract(frame, start_msg, end_msg):
            # Features, labels and file names for every row of `frame`.
            feats, labels, names = [], [], []
            for i, row in frame.iterrows():
                print(start_msg.format(i, row['cur_name']), end='')
                wav_name = os.path.join(self.dataset['data_path'],
                                        row['cur_name'])
                # Keep the configured rate untouched; the rate returned by
                # the loader is used for this file only.
                wav_data, file_sr = load(wav_name, sr=sr)
                feats.append(
                    melspectrogram(y=wav_data,
                                   sr=file_sr,
                                   n_fft=n_fft,
                                   hop_length=hop_length,
                                   n_mels=n_mels))
                labels.append(self._build_multilabel(row))
                names.append(row['cur_name'])
                print(end_msg)
            return feats, labels, names

        x_train, y_train, f_train = extract(
            self.dataset.train_data,
            '[Train] {}) Getting logmels from {}...', 'done.')
        x_test, y_test, f_test = extract(
            self.dataset.test_data,
            '[Test] {}) Getting mels from {}...', 'done')

        self._save_pickles(feature_path, x_train, y_train, f_train, x_test,
                           y_test, f_test)
def _fit_to_250_frames(matrix):
    """Zero-pad or crop *matrix* along axis 1 so it has exactly 250 columns."""
    n_frames = matrix.shape[1]
    if n_frames < 250:
        padding = numpy.zeros((matrix.shape[0], 250 - n_frames))
        return numpy.hstack((matrix, padding))
    return matrix[:, 0:250]


def get_mult_feature(wave_name, window):
    """Return stacked, min-max scaled STFT-magnitude and mel features of a wav file.

    Both representations use ``n_fft=window`` and a hop of half the window,
    are scaled to [0, 1], fitted to 250 frames and transposed to
    (frames, bins) before being stacked with ``numpy.asarray``.

    NOTE(review): the mel spectrogram always has 257 bands, so the two
    arrays only share a shape when the STFT also yields 257 bins
    (presumably ``window == 512`` - confirm against callers).
    """
    (rate, sig) = wav.read(wave_name)
    # Integer division keeps the hop an int on Python 3 as well as Python 2.
    hop = window // 2

    power = numpy.abs(librosa.stft(sig, n_fft=window, hop_length=hop))
    _min = numpy.min(power)
    _max = numpy.max(power)
    power = (power - _min) / (_max - _min)

    print(power.shape)

    # Audio and rate go by keyword so the call also works on librosa
    # releases where these parameters are keyword-only.
    mels = feature.melspectrogram(y=sig,
                                  sr=rate,
                                  n_fft=window,
                                  hop_length=hop,
                                  n_mels=257)
    _min = numpy.min(mels)
    _max = numpy.max(mels)
    mels = (mels - _min) / (_max - _min)

    print(mels.shape)

    # Each array is padded with zeros matching its own number of rows.
    power = _fit_to_250_frames(power).T
    mels = _fit_to_250_frames(mels).T
    feat = numpy.asarray([power, mels])
    return feat
Ejemplo n.º 6
0
def wav_to_mel_util(x):
    """Return the mel spectrogram of audio ``x``.

    Uses the module-level ``sr``, ``n_fft``, ``hop_length`` and ``window``.
    """
    # The audio goes by keyword so the call also works on librosa releases
    # where this parameter is keyword-only.
    return melspectrogram(y=x,
                          sr=sr,
                          n_fft=n_fft,
                          hop_length=hop_length,
                          window=window)
def process(src_fname_path, dest_fname_path):
    """Render the mel spectrogram of an mp3 file to an image file.

    Files not ending in ``.mp3`` are ignored. Any error during loading or
    plotting is printed and swallowed, so one bad file does not stop a
    batch run.

    NOTE(review): ``amplitude_to_db`` is applied to the output of
    ``melspectrogram`` and the plot uses a log frequency axis; both are kept
    as they were so the generated images do not change.
    """
    if not src_fname_path.endswith('.mp3'):
        return

    try:
        # Load and process audio. Audio and rate go by keyword so the call
        # also works on librosa releases where they are keyword-only.
        audio, sr = librosa.load(src_fname_path)
        spec = melspectrogram(y=audio, sr=sr)
        spec_db = librosa.amplitude_to_db(np.abs(spec))

        # Generate the plot.
        scale = 1
        fig = plt.figure(figsize=(1.28 * scale, 0.64 * scale))  # 128x64
        try:
            plt.box(False)
            plt.subplots_adjust(left=0, right=1, bottom=0, wspace=0, hspace=0)
            librosa.display.specshow(spec_db,
                                     sr=sr,
                                     cmap='gray_r',
                                     x_axis='time',
                                     y_axis='log')
            fig.savefig(dest_fname_path, bbox_inches=None, pad_inches=0)
        finally:
            # Release the figure even when plotting or saving fails.
            plt.close(fig)
        print('{0} -> {1}'.format(src_fname_path, dest_fname_path))
    except Exception as e:
        print('processing {0}: {1}'.format(src_fname_path, e))
Ejemplo n.º 8
0
def model_predict():
    """
    Read one chunk from the audio stream and predict the classes
    'open' and 'close'.

    Returns the first two entries of the first prediction row.
    """
    # Interpret the raw stream bytes as float32 samples.
    raw_chunk = stream.read(config.new_len, exception_on_overflow=False)
    signal_merged = np.frombuffer(raw_chunk, dtype=np.float32)

    mel = melspectrogram(y=signal_merged,
                         sr=config.sample_rate,
                         n_mels=config.n_mels,
                         n_fft=config.n_fft,
                         hop_length=config.hop_length,
                         window=config.window)

    # Add leading and trailing singleton axes around the dB-scaled
    # spectrogram before handing it to the model.
    mel_db = librosa.amplitude_to_db(mel)
    X = mel_db.reshape((1,) + (mel_db.shape[0], mel_db.shape[1]) + (1,))

    prediction = model.predict([X])

    # Show the prediction, the spectrogram and the signal.
    signal_visualisation(prediction, mel, signal_merged)

    first_row = prediction[0]
    return first_row[0], first_row[1]
Ejemplo n.º 9
0
def process_individual_file(input_file_uri, index):
    """
        Process one input file and save it in the format used for training.

        Inputs:
            input_file_uri : location of the audio file to process.
            index : number used to name the output ("sample_<index>").

        Returns:
            {output_file_uri, num_of_mel_frames, length_of_time_scaled_audio}

        NOTE(review): the return value is a set literal, so it is unordered
        and collapses equal members. It is kept for compatibility; a tuple
        would be safer for callers.
    """

    # Load the requested file only. The previous code loaded every entry of
    # the global `files_uri` and always used the first, ignoring the argument.
    data = librosa_import(input_file_uri)[0]

    # The audio goes by keyword so the call also works on librosa releases
    # where this parameter is keyword-only.
    mfcc = melspectrogram(y=data,
                          n_fft=params.nFft,
                          hop_length=params.hop_size,
                          n_mels=params.num_mels).T

    mfcc_shape = mfcc.shape
    num_elem_in_mfcc = mfcc_shape[0] * mfcc_shape[1]
    # Zero-pad the audio at the end up to scale_factor samples per
    # spectrogram element.
    pad_val = params.scale_factor * num_elem_in_mfcc - data.shape[0]
    data = np.pad(data, [0, pad_val], mode="constant")
    assert data.shape[0] % num_elem_in_mfcc == 0

    output_file_uri = os.path.join(training_data_folder,
                                   "sample_{}".format(index))
    postprocess_data(data, mfcc, output_file_uri)

    return {output_file_uri, mfcc_shape[0], len(data)}
Ejemplo n.º 10
0
 def compute_spectrogram(self):
     """Compute and cache the mel spectrogram and its dB version.

     Does nothing when ``self.spectrogram`` already exists.
     """
     if hasattr(self, "spectrogram"):
         return
     # Audio and rate go by keyword so the call also works on librosa
     # releases where these parameters are keyword-only.
     self.spectrogram = melspectrogram(y=self.waveform,
                                       sr=self.sampling_rate)
     self.spectrogram_db = power_to_db(self.spectrogram, ref=np.max)
     print("... Computed spectrogram")
 def process_signal(self, signal):
     """Return MFCCs of ``signal`` with their first- and second-order deltas.

     NOTE(review): the STFT magnitude (not its square) feeds
     ``melspectrogram`` and the mel output goes to ``mfcc`` without a log
     step - confirm this is intended.
     """
     magnitude = np.abs(
         stft(signal,
              n_fft=self.window_size,
              hop_length=self.window_stride,
              window='hann'))
     mel = melspectrogram(sr=self.sample_rate, S=magnitude)
     coefficients = mfcc(sr=self.sample_rate, n_mfcc=self.num_mfccs, S=mel)
     first_order = delta(coefficients)
     second_order = delta(coefficients, order=2)
     return coefficients, first_order, second_order
Ejemplo n.º 12
0
    def __data_generation(self, list_IDs_temp):
        """Build one batch ``(X, Y)`` from the rows of ``list_IDs_temp``.

        X has shape (batch_size, *dim, n_channels) and Y has shape
        (batch_size, n_classes). Mel spectrograms are computed once per
        path and cached in ``self.audio``; each sample is a random window
        of ``self.window`` frames from the cached spectrogram.

        NOTE(review): the row label from ``iterrows`` is used as the
        position inside the batch, so the frame is assumed to be indexed
        0..batch_size-1 - confirm against the caller.
        """
        X = np.empty((self.batch_size, *self.dim, self.n_channels))
        # Builtin bool: the `np.bool` alias was removed from NumPy.
        Y = np.empty((self.batch_size, self.n_classes), dtype=bool)

        for i, row in list_IDs_temp.iterrows():
            if row.path not in self.audio.keys():
                aud, fs = load(row.path)
                # The audio goes by keyword so the call also works on
                # librosa releases where this parameter is keyword-only.
                coefs = melspectrogram(y=aud,
                                       sr=fs,
                                       n_fft=2**12,
                                       hop_length=2**11,
                                       n_mels=64,
                                       fmax=10000)
                self.audio[row.path] = coefs
                # One more track loaded; advance the progress counter.
                self.pbar.update(1)

            cached = self.audio[row.path]
            start_ind = np.random.randint(low=0,
                                          high=cached.shape[1] - self.window)
            clip = cached[:, start_ind:start_ind + self.window]
            X[i, :, :, 0] = clip
            Y[i, :] = row.iloc[2:-1].values.astype(np.int64)

        return X, Y
Ejemplo n.º 13
0
def wave2spec(
        wave, fs, frame_period, window,
        nperseg=None, nmels=80, preemphasis_coef=None,
        f_min=0, f_max=None, dtype='float32',
        return_t=False):
    """Return the trimmed wave with its magnitude and mel spectrograms.

    The wave is optionally pre-emphasised, transformed with
    ``signal.stft`` (arguments from ``make_stft_args``) and its magnitude
    is mapped to ``nmels`` mel bands. Both spectrograms are transposed, cast
    to ``dtype`` and cut to the number of whole frames in ``wave``; ``wave``
    is cut to that many frames times ``upsample`` samples.

    Returns ``(wave, spec, mspec, upsample)``, with the STFT time vector
    appended when ``return_t`` is true.
    """
    stft_kwargs = make_stft_args(
        frame_period, fs, nperseg=nperseg, window=window)

    if preemphasis_coef is None:
        spec_wave = wave
    else:
        spec_wave = preemphasis(wave, preemphasis_coef)

    _, t, Zxx = signal.stft(spec_wave, **stft_kwargs)
    magnitude = np.abs(Zxx)
    mel = melspectrogram(
        sr=fs, S=magnitude,
        n_mels=nmels, fmin=f_min, fmax=f_max,
        power=1.0, htk=True, norm="slaney")

    # Samples per frame, then the number of whole frames in the wave.
    upsample = fs // (1000 // frame_period)
    length = (len(wave) // upsample) * upsample
    n_frames = length // upsample

    wave = wave[:length]
    spec = magnitude.T.astype(dtype)[:n_frames]
    mspec = mel.T.astype(dtype)[:n_frames]

    outputs = (wave, spec, mspec, upsample)
    if return_t:
        outputs = outputs + (t,)
    return outputs
def spectogram(audio):
    """Return the natural-log mel spectrogram of ``audio``.

    All analysis settings come from the module-level ``audio_params`` dict.
    """
    # The audio goes by keyword so the call also works on librosa releases
    # where this parameter is keyword-only.
    spec = melspectrogram(y=audio, sr=audio_params['sr'], center=True,
          n_mels=audio_params['n_mel'], n_fft=audio_params['n_fft'],
          win_length=audio_params['win_length'], hop_length=audio_params['hop_length'],
          fmin=audio_params['f_min'],
          fmax=audio_params['f_max'])
    spec = np.log(spec)
    # The computed spectrogram used to be dropped (the function returned None).
    return spec
Ejemplo n.º 15
0
def extract_features(signal, normalize=False, wavelet=0):
    """Turn a signal into a 4-D log-mel feature array.

    The signal is wrapped or cut to exactly three seconds (``3 * sr``
    samples), converted to a dB mel spectrogram with 128 bands, optionally
    standardised, and optionally decomposed with a 2-D discrete wavelet
    transform.

    Uses the module-level ``sr`` and ``window_size``.

    Returns an array of shape [1, frames, bands, channels]; channels is 4
    (LL, LH, HL, HH) when ``wavelet`` is given, otherwise 1.
    """
    # Force a length of ~3 seconds.
    L = sr*3
    signal_length = signal.shape[0]
    if signal_length < L:
        # Pad by repeating the signal.
        signal = np.pad(signal, (0, L-signal_length), mode='wrap')
    elif signal_length > L:
        signal = signal[:L]

    # The audio goes by keyword so the call also works on librosa releases
    # where this parameter is keyword-only.
    melspec = melspectrogram(y=signal, sr=sr, center=False,
                             hop_length=window_size, win_length=window_size,
                             n_mels=128)  # shape: [bands, frames]

    # Transform to log scale and transpose.
    melspec = power_to_db(melspec, ref=np.amax(melspec)).T  # shape: [frames, bands]

    if normalize:
        melspec = (melspec - np.mean(melspec))/np.std(melspec)

    # 2-D discrete wavelet transform.
    if wavelet != 0:
        LL, (LH, HL, HH) = dwt2(melspec, wavelet)
        melspec = np.stack([LL, LH, HL, HH], axis=-1)  # shape: [frames, bands, 4]
    else:
        melspec = melspec[..., np.newaxis]

    # Add the leading batch axis.
    features = melspec[np.newaxis, ...]
    return features
Ejemplo n.º 16
0
def _min_max_scale(values):
    """Rescale *values* linearly so its minimum maps to 0 and its maximum to 1."""
    lowest = np.min(values)
    return (values - lowest) / (np.max(values) - lowest)


def create_mels_deltas(waveform, sample_rate):
    """Return a 3-channel tensor: log-mel spectrogram, delta and delta-delta.

    ``waveform`` is a tensor whose leading dimension is squeezed away
    before conversion to NumPy. Each channel is min-max scaled on its own.
    Both deltas are taken from the scaled log-mel spectrogram.
    """
    # The audio goes by keyword so the call also works on librosa releases
    # where this parameter is keyword-only.
    one_mel = melspectrogram(y=waveform.squeeze(0).numpy(),
                             sr=sample_rate,
                             n_fft=2048,
                             hop_length=1024,
                             n_mels=128,
                             fmin=0.0,
                             fmax=sample_rate / 2,
                             htk=True,
                             norm=None)
    # The small offset avoids log(0).
    one_mel = _min_max_scale(np.log(one_mel + 1e-8))
    one_mel_delta = _min_max_scale(delta(one_mel))
    one_mel_delta_delta = _min_max_scale(delta(one_mel, order=2))

    channels = (one_mel, one_mel_delta, one_mel_delta_delta)
    mel_3d = torch.cat(
        [torch.tensor(channel).unsqueeze(0) for channel in channels], dim=0)
    return mel_3d
def prepare_data_streaminput(data, config):
    """Return a dB mel spectrogram of ``data`` shaped for a single-item batch.

    ``data`` is first brought to ``config.audio_length`` samples: longer
    input is cropped at a random offset, shorter input is zero-padded with
    a random split between the two ends. The result has shape
    (1, config.dim[0], config.dim[1], 1).
    """
    X = np.empty(shape=(1, config.dim[0], config.dim[1], 1))
    input_length = config.audio_length

    # Random offset / padding.
    if len(data) > input_length:
        offset = np.random.randint(len(data) - input_length)
        data = data[offset:(input_length + offset)]
    else:
        missing = input_length - len(data)
        # randint(0) is invalid, so an exact-length input uses offset 0.
        offset = np.random.randint(missing) if missing > 0 else 0
        data = np.pad(data, (offset, missing - offset), "constant")

    # The audio goes by keyword so the call also works on librosa releases
    # where this parameter is keyword-only.
    S = feature.melspectrogram(y=data,
                               sr=config.sampling_rate,
                               n_fft=2048,
                               hop_length=512,
                               n_mels=config.n_mfcc)
    S_DB = power_to_db(S, ref=np.max)

    X[0, ] = np.expand_dims(S_DB, axis=-1)

    return X
Ejemplo n.º 18
0
def extract_audio_features(root_dir, row):
    """Return ``row`` as a dict extended with 'melspec' and 'mfcc' features.

    The audio file is ``row['filename']`` under the raw-data directory,
    loaded at ``FEATURE_ARGS['sr']``.

    NOTE(review): the path is built by plain string concatenation, so
    either RAW_DATA_DIR or the filename must carry the separator - confirm.
    """
    raw_data_dir = join(root_dir, RAW_DATA_DIR)
    row_dict = row.to_dict()
    waveform, _ = load(raw_data_dir + row_dict['filename'], sr=FEATURE_ARGS['sr'])
    # The audio goes by keyword so the calls also work on librosa releases
    # where this parameter is keyword-only.
    row_dict['melspec'] = _clean_features(
        melspectrogram(y=waveform, n_mels=EXTRACTOR_ARGS['n_mels'], **FEATURE_ARGS))
    row_dict['mfcc'] = _clean_features(
        mfcc(y=waveform, n_mfcc=EXTRACTOR_ARGS['n_mfcc'], **FEATURE_ARGS))
    return row_dict
Ejemplo n.º 19
0
    def CalcFrameSpectrogram(self, plot=False, spec_mode='hz'):
        """Compute a mel power spectrogram for every channel of the current frame.

        ``self.CurrentSpectrogram`` is reset and then filled with one
        float32 array per channel, keeping only the first 119 mel bands.
        A 1-D ``self.CurrentFrame`` is reshaped in place to a single column.

        Parameters
        ----------
        plot : bool
            When true, show the spectrogram of the last processed channel.
        spec_mode : str
            Passed to ``specshow`` as the ``y_axis`` argument.
        """
        fft_size = 2048
        hop_length = 512
        mels_keep = 119  # 128 would keep all the coefficients
        self.CurrentSpectrogram = []  # start fresh for every new frame
        if self.CurrentFrame.ndim == 1:
            # Reshape so that it has dimension (dim, 1).
            self.CurrentFrame = np.reshape(self.CurrentFrame,
                                           (len(self.CurrentFrame), 1))

        for ch in range(self.CurrentFrame.shape[1]):
            # The audio goes by keyword so the call also works on librosa
            # releases where this parameter is keyword-only.
            full_spec = melspectrogram(y=self.CurrentFrame[:, ch],
                                       sr=self.Fs,
                                       n_fft=fft_size,
                                       hop_length=hop_length,
                                       power=2)
            Spectrogram = full_spec.astype(np.float32)[0:mels_keep, :]
            self.CurrentSpectrogram.append(Spectrogram)

        if plot:
            plt.figure(figsize=(10, 4))
            librosa.display.specshow(librosa.power_to_db(Spectrogram,
                                                         ref=np.max),
                                     sr=self.Fs,
                                     hop_length=hop_length,
                                     x_axis='time',
                                     y_axis=spec_mode,
                                     fmax=self.Fs / 2,
                                     cmap='RdBu')
            plt.colorbar(format='%+2.0f dB')
            plt.title('Spectrogram')
            plt.tight_layout()
            plt.show()
Ejemplo n.º 20
0
def result():
    """Classify an uploaded bird recording and render the result page.

    On POST the uploaded file is saved, trimmed to start at the first
    detected onset (at most 10 seconds), converted to a mel spectrogram and
    passed to the model stored in 'Birdmodel.h5'. The species with the
    highest score is shown. The saved upload is removed and the Keras
    session cleared even when processing fails.

    Other request methods fall through and return None, as before.
    """
    if request.method == 'POST':
        # Order must match the model's output classes.
        species = ('Vanellus indicus',
                   'Acridotheres tristis',
                   'Columba Livia',
                   'Amaurornis phoenicurus',
                   'Centropus sinensis')
        f = request.files['file']
        # SECURITY: the client controls f.filename. Only its last path
        # component is used, so the upload cannot be written outside the
        # working directory. A dedicated sanitiser would be stricter.
        saved_path = os.path.basename(f.filename)
        f.save(saved_path)
        try:
            model = load_model('Birdmodel.h5')
            Y, SR = load(saved_path, res_type='kaiser_fast')
            # Keyword arguments so the calls also work on librosa releases
            # where these parameters are keyword-only.
            onsets = frames_to_time(onset_detect(y=Y, sr=SR), sr=SR)
            # Start at the beginning when no onset is detected.
            leading_silence = onsets[0] if len(onsets) else 0.0
            y, sr = load(saved_path,
                         offset=leading_silence,
                         res_type='kaiser_fast',
                         duration=10)
            spec = melspectrogram(y=y, sr=sr)
            testing = np.asarray([spec])
            scores = Model.predict(model, testing)
            # argmax picks a single class even when two scores tie.
            spe = species[int(np.argmax(scores[0]))]
        finally:
            if os.path.exists(saved_path):
                os.remove(saved_path)
            clear_session()
        return render_template("result.html", name=f.filename, species=spe)
Ejemplo n.º 21
0
def fbank(path, fft_span, hop_span, n_mels, fmin, fmax,affichage=False):
    """
    Return the log mel filter-bank (fbank) vectors of an audio file.

    :param path: location of the file
    :param fft_span: size of the Fourier-transform window, in seconds
    :param hop_span: step between two frames, in seconds
    :param n_mels: number of mel frequency bands
    :param fmin: lowest frequency of the decomposition
    :param fmax: highest frequency of the decomposition
    :param affichage: True to display the spectrogram
    :return: matrix X of the fbank decomposition over time (one row = one
             decomposition of size n_mels for one hop_span period)
    """
    wav, s_rate = librosa.load(path)

    # Window and hop are given in seconds; convert them to sample counts.
    n_fft = int(np.floor(fft_span * s_rate))
    hop_length = int(np.floor(hop_span * s_rate))

    # Audio and rate go by keyword so the call also works on librosa
    # releases where these parameters are keyword-only.
    X = feature.melspectrogram(y=util.normalize(wav), sr=s_rate, S=None,
                               n_fft=n_fft, hop_length=hop_length,
                               n_mels=n_mels, fmin=fmin, fmax=fmax)
    X = np.log(X)
    if affichage:
      afficherSpec(X,s_rate,hop_span)
    return np.transpose(X)
Ejemplo n.º 22
0
    def make_blocking_data(self):
        """Cut every recording of this class into 128-frame mel chunks.

        Each file in ``featurePath + name + '/'`` is loaded, converted to a
        transposed mel spectrogram and split into consecutive blocks of 128
        frames; leftover frames at the end are dropped. The chunks and
        their labels are stored as tensors in ``self.features`` /
        ``self.labels`` and appended to ``self.x_cat_data`` /
        ``self.y_cat_data``.

        Files with fewer than 128 frames contribute no chunk.
        """
        chunk_len = 128
        xData, yData = list(), list()
        path = self.featurePath + self.name + '/'
        for j, filename in enumerate(os.listdir(path)):
            print(f"{self.name} {filename} ({j + 1})")
            WavPath = path + filename
            y, sr = load(WavPath, mono=True)
            # Audio and rate go by keyword so the call also works on
            # librosa releases where these parameters are keyword-only.
            S = melspectrogram(y=y, sr=sr).T
            num_chunk = S.shape[0] // chunk_len
            if num_chunk == 0:
                # Too short for a single chunk.
                continue
            # Slice to an explicit length: `S[:-remainder]` emptied the
            # array whenever the remainder was 0.
            S = S[:num_chunk * chunk_len]
            xData.extend(np.split(S, num_chunk))
            yData.extend([self.labelDict[self.name]] * num_chunk)

        self.features = torch.tensor(data=xData, device=device)
        self.labels = torch.tensor(data=yData, device=device)
        print(self.features.shape)
        print(self.labels.shape)
        self.x_cat_data.append(self.features)
        self.y_cat_data.append(self.labels)
        return
def calculate_spectrograms(audio_dir, out_dir, file_type='.mp3'):
    """Compute mel spectrograms for the audio files found in ``audio_dir``.

    Only tracks that map to a song via ``track_to_song`` and whose song is a
    key of ``wmf_item2i`` are processed, and only when the matching ``.npy``
    file does not exist yet in ``out_dir`` (created if missing).

    NOTE(review): this snippet looks truncated or mangled. The computed
    spectrogram is never written to ``out_file``, ``i`` and ``start_time``
    are unused, and the last three statements are indented into the function
    body - see the note above them.
    """
    files = glob.glob(os.path.join(audio_dir, '*' + file_type))
    num_files = len(files)

    print(f'{num_files} audio files found')

    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    for i, file_name in enumerate(sorted(files)):
        start_time = time.time()
        track_name = os.path.basename(file_name)
        track_id = os.path.splitext(track_name)[0]
        try:
            song_name = track_to_song[track_id]
        except KeyError:
            # Track without a known song: skip it.
            continue
        if song_name in wmf_item2i.keys():
            audio_file = os.path.join(audio_dir,
                                      track_name)
            out_file = os.path.join(out_dir, track_id) + '.npy'
            if not os.path.exists(out_file):
                y, sr = load(audio_file)
                mel_spectrogram = melspectrogram(y=y, sr=sr, n_fft=1024, hop_length=512, n_mels=128)

    # NOTE(review): these statements presumably belonged to a module-level
    # entry point (e.g. under an `if __name__ == '__main__':` guard). At this
    # indentation they are part of the function: the two names become locals
    # that are read in the loop before being assigned, and the last line
    # calls the function recursively. Confirm against the original file.
    wmf_item2i = pickle.load(open('../../index_dicts.pkl', 'rb'))['item2i']
    track_to_song = pickle.load(open('../../track_to_song.pkl', 'rb'))
    calculate_spectrograms(audio_dir='../../data/MillionSongSubset/audio', out_dir='../../data/MillionSongSubset/spectrograms')
Ejemplo n.º 24
0
    def transform_audio(self, y):
        '''Compute the Mel spectrogram

        Parameters
        ----------
        y : np.ndarray
            The audio buffer

        Returns
        -------
        data : dict
            data['mag'] : np.ndarray, shape=(n_frames, n_mels)
                The Mel spectrogram
        '''
        n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

        # Square root of the mel output, stored as float32.
        mel = np.sqrt(melspectrogram(y=y, sr=self.sr,
                                     n_fft=self.n_fft,
                                     hop_length=self.hop_length,
                                     n_mels=self.n_mels,
                                     fmax=self.fmax)).astype(np.float32)

        # `size` goes by keyword so the call also works on librosa releases
        # where it is keyword-only.
        mel = fix_length(mel, size=n_frames)

        if self.log:
            mel = amplitude_to_db(mel, ref=np.max)

        return {'mag': mel.T[self.idx]}
Ejemplo n.º 25
0
    def get_seq_size(self, frames, sr):
        """
        Get the sequence size of an audio time series once converted to
        MFCC features or a mel spectrogram.

        :param frames: audio time series
        :param sr: sampling rate of frames
        :return: number of frames (second dimension) of the converted audio
        :raises ValueError: if ``self.type`` is neither 'mfcc' nor 'spectrogram'
        """
        # Audio and rate go by keyword so the calls also work on librosa
        # releases where these parameters are keyword-only.
        if self.type == 'mfcc':
            mfcc_frames = mfcc(y=frames,
                               sr=sr,
                               n_fft=self.frame_length,
                               hop_length=self.hop_length,
                               n_mfcc=self.mfcc_features,
                               n_mels=self.n_mels)
            return mfcc_frames.shape[1]

        if self.type == 'spectrogram':
            spectrogram = melspectrogram(y=frames,
                                         sr=sr,
                                         n_fft=self.frame_length,
                                         hop_length=self.hop_length,
                                         n_mels=self.n_mels)
            return spectrogram.shape[1]

        raise ValueError('Not a valid feature type: ', self.type)
Ejemplo n.º 26
0
    def transform_audio(self, y):
        '''Compute the PCEN of the (log-) Mel Spectrogram

        Parameters
        ----------
        y : np.ndarray
            The audio buffer

        Returns
        -------
        data : dict
            data['mag'] : np.ndarray
                The PCEN magnitude, cast to ``self.dtype`` and indexed with
                ``self.idx``

        NOTE(review): the result is indexed without a transpose and is not
        length-fixed to the expected frame count - confirm the output layout.
        '''
        S = melspectrogram(y=y,
                           sr=self.sr,
                           hop_length=self.hop_length,
                           n_mels=self.n_mels,
                           n_fft=self.n_fft)

        if self.log:
            S = amplitude_to_db(S, ref=np.max)

        # Use the extractor's own sampling rate; the bare name `sr` is not
        # defined in this method.
        P = pcen(S, sr=self.sr, hop_length=self.hop_length)

        P = to_dtype(P, self.dtype)

        return {'mag': P[self.idx]}
Ejemplo n.º 27
0
    def from_audio(
        cls,
        audio,
        n_fft=1024,
        n_mels=128,
        window="flattop",
        win_length=256,
        hop_length=32,
        htk=True,
        fmin=None,
        fmax=None,
    ):
        """ Build a MelSpectrogram from an Audio object

        The keyword arguments are a selection of those documented at:

        - https://librosa.org/doc/latest/generated/librosa.feature.melspectrogram.html#librosa.feature.melspectrogram
        - https://librosa.org/doc/latest/generated/librosa.filters.mel.html?librosa.filters.mel

        Args:
            audio: the Audio object to analyse
            n_fft: Length of the FFT window [default: 1024]
            n_mels: Number of mel bands to generate [default: 128]
            window: The windowing function to use [default: "flattop"]
            win_length: Each frame of audio is windowed by `window`. The window
                will be of length `win_length` and then padded with zeros to match
                `n_fft` [default: 256]
            hop_length: Number of samples between successive frames [default: 32]
            htk: use HTK formula instead of Slaney [default: True]
            fmin: lowest frequency (in Hz); a falsy value means 0 [default: None]
            fmax: highest frequency (in Hz); a falsy value means
                `sample_rate / 2` [default: None]

        Returns:
            opensoundscape.melspectrogram.MelSpectrogram object

        Raises:
            TypeError: if `audio` is not an Audio instance
        """

        if not isinstance(audio, Audio):
            raise TypeError("Class method expects Audio class as input")

        # Resolve the frequency limits actually used for the analysis.
        process_fmin = fmin or 0
        process_fmax = fmax or audio.sample_rate / 2

        mel_kwargs = dict(
            n_fft=n_fft,
            win_length=win_length,
            hop_length=hop_length,
            window=window,
            n_mels=n_mels,
            htk=htk,
            fmin=process_fmin,
            fmax=process_fmax,
        )
        mel = melspectrogram(y=audio.samples, sr=audio.sample_rate,
                             **mel_kwargs)

        # Reverse the band axis so the spectrogram is "right-side up".
        S = mel[::-1]

        return cls(S, audio.sample_rate, hop_length, process_fmin,
                   process_fmax)
def make_stacked_mels(mono_signal,n_fft,samprate,hop_length,fmin,fmax,n_mels):
    """Stack dB mel spectrograms computed with several FFT window sizes.

    ``n_fft`` is an iterable of window sizes; every other setting is shared.
    The result has one trailing channel per window size.
    """
    # One mel spectrogram per FFT window size. The audio goes by keyword so
    # the call also works on librosa releases where it is keyword-only.
    mel = [melspectrogram(y=mono_signal, sr=samprate, hop_length=hop_length,
                          n_fft=j, n_mels=n_mels, fmin=fmin, fmax=fmax)
           for j in n_fft]
    # Convert each spectrogram to decibels relative to its own peak.
    mel_db = [librosa.power_to_db(spec, ref=np.max) for spec in mel]
    # Stack the spectrograms into a single array along a new last axis.
    return np.stack(mel_db, axis=-1)
Ejemplo n.º 29
0
def melspectro(audio, sr=8000, win_len=256, n_mels=29):
    """Return the mel spectrogram of ``audio`` with a hop of half the window."""
    hop_len = win_len // 2
    # The audio goes by keyword so the call also works on librosa releases
    # where this parameter is keyword-only.
    return lf.melspectrogram(y=audio,
                             sr=sr,
                             n_fft=win_len,
                             hop_length=hop_len,
                             win_length=win_len,
                             n_mels=n_mels)
Ejemplo n.º 30
0
def mel_from_files(file_loc, MEL_PARAMS):
    """Return a dict mapping each key of ``file_loc`` to its mel spectrogram.

    ``file_loc`` maps keys to wav file locations; ``MEL_PARAMS`` holds extra
    keyword arguments for ``melspectrogram``.

    NOTE(review): samples are passed on exactly as ``wavfile.read`` returns
    them - confirm the files hold floating-point data if the spectrogram
    function requires it.
    """
    mels = dict()
    for i, loc in file_loc.items():
        fs, samples = wavfile.read(loc)
        # The audio goes by keyword so the call also works on librosa
        # releases where this parameter is keyword-only.
        mels[i] = melspectrogram(y=samples, sr=fs, **MEL_PARAMS)
    return mels
def mfcc(path):
    """Return the time-averaged MFCC vector (14 coefficients) of an audio file.

    A mel-scaled power spectrogram is computed with a hop length of 64,
    converted to a log scale with the peak power as reference, and turned
    into MFCCs that are averaged over all frames.
    """
    y, sr = load_files(path)

    # Function-call form prints the same text on Python 2 and Python 3.
    print('claculating mfcc ' + path)
    # The audio goes by keyword so the call also works on librosa releases
    # where this parameter is keyword-only.
    S = feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=64, n_mels=128)

    # Convert to log scale (dB), using the peak power as reference.
    log_S = logamplitude(S, ref_power=np.max)
    mfcc_v = feature.mfcc(S=log_S, n_mfcc=14)

    # Mean over the frame axis.
    return np.sum(mfcc_v, axis=1)/mfcc_v.shape[1]
def extract_feature_spectrogram( data ):
    """Return ``(filename, label, mel_spectrogram)`` for a 5-second excerpt.

    ``data`` is a ``(filename, label)`` pair. Recordings shorter than five
    seconds, recordings whose sampling rate is not 22050, and recordings
    that fail to load or process yield ``(filename, None, None)``.

    Uses the module-level ``fft_points``, ``fft_overlap`` and
    ``mfcc_coefficients``.
    """
    filename, lbl = data
    try:
        signal,sr = librosa.load(filename)
        # Single-argument print calls produce the same output on Python 2
        # and Python 3.
        if len(signal) < 5*sr:
            print("Too short")
            return filename, None, None
        print("OK %s" % (len(signal) / float(sr)))
        if sr != 22050:
            print("Non standart sr %s" % sr)
            return filename, None, None
        signal = signal[:5*sr]
        spectrogram = melspectrogram(y=signal, sr=sr, n_fft=fft_points,hop_length = fft_overlap,
                                    fmax=5000, n_mels = mfcc_coefficients)
        return filename, lbl, spectrogram
    except Exception as e:
        # Best effort: report the problem and mark the file as unusable.
        print(e)
        return filename, None, None
Ejemplo n.º 33
0
def mfcc(data, sr=22050, n_mfcc=20, **kwargs):
    """Mel-frequency cepstral coefficients (adapted from the librosa
    function so that extra parameters can be supplied).

    :parameters:
      - data  : np.ndarray or None
          audio time series
      - sr    : int > 0
          sampling rate of data
      - n_mfcc: int
          number of MFCCs to return

    .. note::
        any further keyword arguments are forwarded to melspectrogram.

    :returns:
      - M     : np.ndarray, shape=(n_mfcc, S.shape[1])
          MFCC sequence

    """
    mel = melspectrogram(y=data, sr=sr, **kwargs)
    log_mel = logamplitude(mel)
    # Project the log-mel bands onto the first n_mfcc DCT basis vectors.
    basis = dct(n_mfcc, log_mel.shape[0])
    return np.dot(basis, log_mel)
Ejemplo n.º 34
0
def get_spectrogram(path):
    """Build a log-scaled mel spectrogram (100 bands) from a wav file."""
    y, sr = load(path)
    # The audio goes by keyword so the call also works on librosa releases
    # where this parameter is keyword-only.
    S = melspectrogram(y=y, sr=sr, n_mels=100)
    log_S = logamplitude(S, ref_power=np.max)
    return log_S