Python mfccの例、sidekit.frontend.features.mfcc Pythonの例

コード例 #1

0

ファイルを表示

ファイル: features.py プロジェクト: loretoparisi/hf-experiments

def _wav2feats(wavname):
    """
    Extract mel-spectrum and log-energy features from a 16 kHz mono wav file.

    The file is loaded with ``read_wav``; its extension, channel layout and
    sample rate are checked with assertions. The signal is rescaled in place
    by ``2**(15 - sampwidth)`` and passed to ``mfcc`` as float32. When fewer
    than 68 frames come back, a warning is emitted and the mel spectrum is
    padded to 68 rows with its own minimum value (``loge`` is not padded).

    :param wavname: path of a ``.wav`` / ``.wave`` file
    :return: ``(mspec, loge, difflen)``; ``difflen`` is the number of padding
        rows appended to ``mspec`` (0 when no padding was required)
    :raises AssertionError: on a wrong extension, channel layout or
        sample rate
    """
    suffix = os.path.splitext(wavname)[-1].lower()
    assert suffix in ('.wav', '.wave')

    sig, read_framerate, sampwidth = read_wav(wavname)
    dims = sig.shape
    # only a single channel is accepted: (n,) or (n, 1)
    assert len(dims) == 1 or (len(dims) == 2 and dims[1] == 1)
    # only 16000 Hz input is accepted
    assert read_framerate == 16000
    # LP: to be checked when sampwidth == 4 (the sampwidth == 2 assertion
    # is intentionally disabled here)
    sig *= 2**(15 - sampwidth)

    with warnings.catch_warnings():
        # silence sidekit's log-of-zero warnings caused by empty signal parts
        warnings.filterwarnings('ignore',
                                message='divide by zero encountered in log',
                                category=RuntimeWarning,
                                module='sidekit')
        _cep, loge, _spec, mspec = mfcc(sig.astype(np.float32),
                                        get_mspec=True)

    # Short segments: pad the mel spectrum up to 68 frames
    difflen = max(0, 68 - len(loge))
    if difflen:
        warnings.warn(
            "media %s duration is short. Robust results require length of at least 720 milliseconds"
            % wavname)
        padding = np.ones((difflen, 24)) * np.min(mspec)
        mspec = np.concatenate((mspec, padding))

    return mspec, loge, difflen

コード例 #2

0

ファイルを表示

ファイル: buffer_utils.py プロジェクト: lixwy/bmmaudio

def feat_from_raw(raw):  # see features.py
    """
    Extract mel-spectrum and log-energy features from a raw PCM buffer.

    ``raw`` is decoded with the ``struct`` format ``h`` (signed 16-bit,
    native byte order) as a single channel, converted to float32, rescaled
    by ``2**(15 - sampwidth)`` and passed to ``mfcc``. When fewer than 68
    frames come back, a warning is emitted and the mel spectrum is padded to
    68 rows with its own minimum value (``loge`` is not padded).

    :param raw: bytes-like buffer of 16-bit samples; a trailing odd byte is
        ignored
    :return: ``(mspec, loge, difflen)``; ``difflen`` is the number of padding
        rows appended to ``mspec`` (0 when no padding was required)
    :raises AssertionError: if the decoded signal is not one-dimensional
    """
    sampwidth = 2
    nchannels = 1
    # Integer division: the sample count feeds a struct format string.
    nframes = len(raw) // sampwidth
    # Parenthesised so the count is multiplied, not the formatted string.
    out = struct.unpack_from("%dh" % (nframes * nchannels), raw)
    sig = np.reshape(np.array(out), (-1, nchannels)).squeeze()
    sig = sig.astype(np.float32)
    shp = sig.shape
    # the buffer should contain a single channel
    assert len(shp) == 1 or (len(shp) == 2 and shp[1] == 1)
    sig *= (2**(15 - sampwidth))

    with warnings.catch_warnings():
        # ignore warnings resulting from empty signals parts
        warnings.filterwarnings('ignore',
                                message='divide by zero encountered in log',
                                category=RuntimeWarning,
                                module='sidekit')
        _, loge, _, mspec = mfcc(sig, get_mspec=True)

    # Management of short duration segments
    difflen = 0
    if len(loge) < 68:
        difflen = 68 - len(loge)
        # warnings.warn is the real API (warnings.warning does not exist),
        # and there is no file name to report for an in-memory buffer.
        warnings.warn(
            "raw buffer duration is short. Robust results require length of at least 720 milliseconds"
        )
        mspec = np.concatenate((mspec, np.ones((difflen, 24)) * np.min(mspec)))

    return mspec, loge, difflen

コード例 #3

0

ファイルを表示

def _wav2feats(wavname):
    """
    Load a 16 kHz mono wav file and return its mel spectrum and log-energy.

    The file is read with ``read_wav``; extension, channel layout and sample
    rate are checked with assertions. The signal is rescaled in place by
    ``2**(15 - sampwidth)`` and passed to ``mfcc`` as float32.

    :param wavname: path of a ``.wav`` / ``.wave`` file
    :return: ``(mspec, loge)`` as produced by ``mfcc(..., get_mspec=True)``
    :raises AssertionError: on a wrong extension, channel layout or
        sample rate
    """
    suffix = os.path.splitext(wavname)[-1].lower()
    assert suffix in ('.wav', '.wave')

    sig, read_framerate, sampwidth = read_wav(wavname)
    dims = sig.shape
    # only a single channel is accepted: (n,) or (n, 1)
    assert len(dims) == 1 or (len(dims) == 2 and dims[1] == 1)
    # only 16000 Hz input is accepted
    assert read_framerate == 16000

    sig *= 2**(15 - sampwidth)
    _cep, loge, _spec, mspec = mfcc(sig.astype(np.float32), get_mspec=True)
    return mspec, loge

コード例 #4

0

ファイルを表示

ファイル: dataset.py プロジェクト: theo2021/Automatic-language-identification

    def __data_generation_dnn(self, list_dirs_temp):
        """
        Build one batch of stacked-frame inputs and one-hot targets.

        For every path a feature matrix is obtained, either loaded with
        ``np.load`` when ``self.precomputed`` is set, or computed from the
        audio as MFCC (``self.feat == 'mfcc'``) or PLP features with first
        and second deltas appended. The matrix is reflect-padded by 10 frames
        on each side and 25 context windows of 21 frames are flattened into
        rows of ``X``.

        :param list_dirs_temp: iterable of file paths; the class label is the
            third path component from the end (``path.split('/')[-3]``)
        :return: ``(X, target)`` with ``X`` of shape
            ``(self.batch_size, 39 * 21)`` and ``target`` of shape
            ``(self.batch_size, self.n_classes)``
        """
        # Initialization. Rows are only written for the items provided, 25
        # per item, so batch_size is presumably 25 * len(list_dirs_temp).
        X = np.empty((self.batch_size, 39 * 21))
        # Use n_classes rather than a hard-coded 8 so the buffer always
        # matches the width of the one-hot rows written below.
        target = np.empty((self.batch_size, self.n_classes))
        for i, item_path in enumerate(list_dirs_temp):
            if self.precomputed:
                feat = np.load(item_path)[:, :, 0].T
            else:
                waveform, sr = sf.read(item_path)
                # The small offset keeps the signal away from exact zeros.
                if self.feat == 'mfcc':
                    feat_item = mfcc(waveform + 1e-9,
                                     maxfreq=sr / 2.0,
                                     nwin=.128,
                                     shift=.032)[0]
                else:
                    feat_item = plp(waveform + 1e-9,
                                    fs=sr,
                                    rasta=False,
                                    nwin=.128,
                                    shift=.032)[0]
                feat_delta1 = compute_delta(feat_item)
                feat_delta2 = compute_delta(feat_delta1)
                feat = np.concatenate(
                    (feat_item, feat_delta1, feat_delta2), axis=1)

            mirror_feat = np.pad(feat, ((10, ), (0, )), 'reflect')
            # NOTE(review): the window centres depend on the batch index i
            # (item i uses frames i*25 .. i*25+24) - confirm this is intended.
            frames = [
                np.reshape(mirror_feat[j - 10:j + 11, :], -1)
                for j in range(10 + i * 25, 10 + (i + 1) * 25)
            ]
            X[25 * i:25 * (i + 1), ] = np.array(frames)

            # Store class
            label = item_path.split('/')[-3]

            target_i = keras.utils.to_categorical(self.target_to_class[label],
                                                  num_classes=self.n_classes)
            target[25 * i:25 * (i + 1), :] = np.repeat(
                target_i.reshape((1, -1)), repeats=25, axis=0)
        return X, target

コード例 #5

0

ファイルを表示

ファイル: dataset.py プロジェクト: theo2021/Automatic-language-identification

 def __data_generation_lstm(self, item_path):
     """
     Build one sequence sample (features plus per-frame one-hot targets).

     The audio at ``item_path`` is read with ``sf.read`` and turned into
     MFCC (``self.feat == 'mfcc'``) or PLP features, with first and second
     deltas appended along axis 1.

     :param item_path: path of the audio file; the class label is the third
         path component from the end (``path.split('/')[-3]``)
     :return: ``(features, target)`` where ``features`` has a leading axis
         of length 1 and ``target`` has shape
         ``(n_frames, self.n_classes)`` with a 1 in the class column
     """
     waveform, sr = sf.read(item_path)
     shifted = waveform + 1e-9
     if self.feat == 'mfcc':
         static = mfcc(shifted, maxfreq=sr / 2.0)[0]
     else:
         static = plp(shifted, fs=sr, rasta=False)[0]
     delta = compute_delta(static)
     delta_delta = compute_delta(delta)
     feat = np.concatenate((static, delta, delta_delta), axis=1)

     # Every frame of the utterance carries the same class
     class_index = self.target_to_class[item_path.split('/')[-3]]
     target = np.zeros((feat.shape[0], self.n_classes))
     target[:, class_index] = 1
     return feat.reshape((1, ) + feat.shape), target

コード例 #6

0

ファイルを表示

ファイル: segmenter.py プロジェクト: rbhttchr/inaSpeechSegmenter

def _wav2feats(wavname):
    """
    Load a 16 kHz, 16-bit mono wav file and return its mel spectrum and
    log-energy.

    The file is read with ``read_wav``; extension, channel layout, sample
    rate and sample width are checked with assertions. The signal is
    rescaled in place by ``2**(15 - sampwidth)`` and passed to ``mfcc`` as
    float32, with sidekit's log-of-zero warnings suppressed.

    :param wavname: path of a ``.wav`` / ``.wave`` file
    :return: ``(mspec, loge)`` as produced by ``mfcc(..., get_mspec=True)``
    :raises AssertionError: on a wrong extension, channel layout, sample
        rate or sample width
    """
    suffix = os.path.splitext(wavname)[-1].lower()
    assert suffix in ('.wav', '.wave')

    sig, read_framerate, sampwidth = read_wav(wavname)
    dims = sig.shape
    # only a single channel is accepted: (n,) or (n, 1)
    assert len(dims) == 1 or (len(dims) == 2 and dims[1] == 1)
    # only 16000 Hz input is accepted
    assert read_framerate == 16000
    assert sampwidth == 2
    sig *= 2**(15 - sampwidth)

    with warnings.catch_warnings():
        # silence sidekit's log-of-zero warnings caused by empty signal parts
        warnings.filterwarnings('ignore',
                                message='divide by zero encountered in log',
                                category=RuntimeWarning,
                                module='sidekit')
        _cep, loge, _spec, mspec = mfcc(sig.astype(np.float32),
                                        get_mspec=True)

    return mspec, loge

コード例 #7

0

ファイルを表示

    def extract(self, *file):
        """
        Extract acoustic features for one show and write them to an HDF5 file.

        Only ``file[0]`` is used. It must unpack into
        ``(show, channel, input_audio_filename, output_feature_filename,
        extra)``. When the two file names are not ``None`` they replace
        ``self.audio_filename_structure`` / ``self.feature_filename_structure``
        on the instance. ``extra``, when truthy, is passed as a second
        argument when formatting the feature file name.

        The signal is processed in batches with ``mfcc`` or ``plp`` depending
        on ``self.feature_type``, labelled with ``self._vad``, filtered by
        ``self.save_param``, passed through ``self.postProc`` and written with
        ``write_hdf5``. The HDF5 file is always closed before returning.

        :param file: variadic; ``file[0]`` is the 5-item description above
        :return: None
        """
        show, channel, input_audio_filename, output_feature_filename, extra = file[
            0]
        backing_store = True
        if input_audio_filename is not None:
            self.audio_filename_structure = input_audio_filename
        audio_filename = self.audio_filename_structure.format(show)

        # If the output file name does not include the ID of the show,
        # (i.e., if the feature_filename_structure does not include {})
        # the feature_filename_structure is updated to use the output_feature_filename
        if output_feature_filename is not None:
            self.feature_filename_structure = output_feature_filename

        if extra:
            feature_filename = self.feature_filename_structure.format(
                show, extra)
        else:
            feature_filename = self.feature_filename_structure.format(show)

        # Open audio file, get the signal and possibly the sampling frequency
        signal, sample_rate = read_audio(audio_filename,
                                         self.sampling_frequency)
        if signal.ndim == 1:
            signal = signal[:, np.newaxis]

        # Process the target channel to return Filter-Banks, Cepstral coefficients and BNF if required
        length, chan = signal.shape

        # If the size of the signal is not enough for one frame, return zero features
        PARAM_TYPE = np.float32
        if length < self.window_sample:
            cep = np.empty((0, self.ceps_number), dtype=PARAM_TYPE)
            energy = np.empty((0, 1), dtype=PARAM_TYPE)
            fb = np.empty((0, self.filter_bank_size), dtype=PARAM_TYPE)
            label = np.empty((0, 1), dtype='int8')

        else:
            # Random noise is added to the input signal to avoid zero frames.
            # The seed is fixed, so the added noise is reproducible.
            np.random.seed(0)
            signal[:, channel] += 0.0001 * np.random.randn(signal.shape[0])

            dec = self.shift_sample * 250 * 25000 + self.window_sample
            dec2 = self.window_sample - self.shift_sample
            start = 0
            end = min(dec, length)

            # Process the signal by batch to avoid problems for very long signals
            # NOTE(review): each batch overwrites cep/energy/fb/label, so only
            # the last batch is kept - confirm this is intended.
            while start < (length - dec2):

                if self.feature_type == 'mfcc':
                    # Extract cepstral coefficients, energy and filter banks
                    cep, energy, _, fb = mfcc(signal[start:end, channel],
                                              fs=self.sampling_frequency,
                                              lowfreq=self.lower_frequency,
                                              maxfreq=self.higher_frequency,
                                              nlinfilt=self.filter_bank_size if
                                              self.filter_bank == "lin" else 0,
                                              nlogfilt=self.filter_bank_size if
                                              self.filter_bank == "log" else 0,
                                              nwin=self.window_size,
                                              shift=self.shift,
                                              nceps=self.ceps_number,
                                              get_spec=False,
                                              get_mspec=True,
                                              prefac=self.pre_emphasis)
                elif self.feature_type == 'plp':
                    cep, energy, _, fb = plp(signal[start:end, channel],
                                             nwin=self.window_size,
                                             fs=self.sampling_frequency,
                                             plp_order=self.ceps_number,
                                             shift=self.shift,
                                             get_spec=False,
                                             get_mspec=True,
                                             prefac=self.pre_emphasis,
                                             rasta=self.rasta_plp)

                # Perform feature selection
                label, threshold = self._vad(cep, energy, fb, signal[start:end,
                                                                     channel])
                # Pad the labels with False so they cover every energy frame
                if len(label) < len(energy):
                    label = np.hstack(
                        (label, np.zeros(len(energy) - len(label),
                                         dtype='bool')))

                start = end - dec2
                end = min(end + dec, length)

        # Create the HDF5 file
        # Create the directory if it doesn't exist. The emptiness test uses
        # truthiness: comparing a str to '' with "is" checks identity.
        dir_name = os.path.dirname(feature_filename)  # get the path
        if dir_name and not os.path.exists(dir_name):
            os.makedirs(dir_name)

        h5f = h5py.File(feature_filename,
                        'a',
                        backing_store=backing_store,
                        driver='core')
        try:
            # Drop every stream that was not requested in save_param
            if "cep" not in self.save_param:
                cep = None
            if "energy" not in self.save_param:
                energy = None
            if "fb" not in self.save_param:
                fb = None
            if "vad" not in self.save_param:
                label = None

            cep, fb, label = self.postProc(cep, energy, fb, label)

            write_hdf5(show, h5f, cep, None, None, None, None, None, fb, None,
                       None, None, None, None, label)
        finally:
            # Close the handle even if post-processing or writing fails
            h5f.close()

コード例 #8

0

ファイルを表示

    def getFeatures(self, signal, fs, winlength, *args):
        """
        Compute either cepstral coefficients or a power spectrum of a signal.

        With no extra argument the power spectrum and log-energy are computed
        with ``ff.power_spectrum``; with one extra argument (the number of
        cepstral coefficients) the cepstra are computed with ``ff.mfcc``.

        :param signal: input signal; only its real part is used
        :param fs: sampling frequency; values <= 0 are replaced by 44100
        :param winlength: analysis window length (scalar, or a list whose
            first element is used); the frame shift is half the window
        :param args: optionally one value, the number of cepstral
            coefficients (scalar, or a list whose first element is used)
        :return: ``[ceps]`` when a coefficient count is given, otherwise
            ``[spectrum, log_energy]``
        :raises ValueError: if more than one extra argument is passed
        """
        if len(args) == 0:
            calcmfccs = False
            ncep = 1
        elif len(args) == 1:
            calcmfccs = True
            ncep = args[0]
        else:
            # Fail right away: the variables used below would otherwise be
            # unbound and raise a confusing UnboundLocalError.
            raise ValueError('Incorrect number of inputs.')

        signal = np.real(signal)

        if fs <= 0:
            fs = 44100

        if isinstance(ncep, list):
            ncep = round(np.real(ncep[0]))
        else:
            ncep = round(np.real(ncep))

        if ncep < 1:
            ncep = 1

        if isinstance(winlength, list):
            winlength = np.real(winlength[0])
        else:
            winlength = np.real(winlength)

        # Make sure the window holds at least ncep samples
        if winlength * fs < ncep:
            winlength = ncep / fs

        winshift = 0.5
        minfreq = 20
        maxfreq = 4000
        nbands = 30
        lifterexp = 0
        preemph = 0

        if calcmfccs:
            # NOTE(review): lifterexp is passed as nlinfilt (0 linear
            # filters) - confirm this is the intended argument.
            ceps, log_energy, spec, mspec = ff.mfcc(signal,
                                                    lowfreq=minfreq,
                                                    maxfreq=maxfreq,
                                                    nlinfilt=lifterexp,
                                                    nlogfilt=nbands,
                                                    nwin=winlength,
                                                    fs=fs,
                                                    nceps=int(ncep),
                                                    shift=winlength * winshift,
                                                    prefac=preemph)
            return [ceps]

        spectram, log_energy = ff.power_spectrum(signal,
                                                 fs=fs,
                                                 win_time=winlength,
                                                 shift=winlength * winshift,
                                                 prefac=1)
        return [spectram, log_energy]

コード例 #9

0

ファイルを表示

ファイル: features_extractor.py プロジェクト: zhilangtaosha/sidekit

    def extract(self, show, channel,
                input_audio_filename=None,
                output_feature_filename=None,
                backing_store=False):
        """
        Compute the acoustic parameters (filter banks, cepstral coefficients
        and log-energy) for a single channel from a given audio file and
        store them in an HDF5 file.

        The feature type (``mfcc`` or ``plp``) and the RASTA option are read
        from ``self.feature_type`` and ``self.rasta_plp``. Bottleneck
        features are not computed by this method and are written as ``None``.

        :param show: ID of the show
        :param channel: channel number (0 if mono file)
        :param input_audio_filename: name of the input audio file to consider if the name of the audio file is independent from the ID of the show
        :param output_feature_filename: name of the output feature file to consider if the name of the feature file is independent from the ID of the show
        :param backing_store: boolean, passed to ``h5py.File`` with the
            ``core`` driver; if False, nothing is written to disk, if True,
            the file is written to disk when closed

        :return: the open HDF5 file handler; the caller must close it
        """
        # Bottleneck features are never produced here. Bind the names up
        # front so the write below cannot hit an unbound variable.
        bnf = None
        bnf_mean = None
        bnf_std = None

        # Create the filename to load

        # If the input audio file name does not include the ID of the show
        # (i.e., if the audio_filename_structure does not include {})
        # the audio_filename_structure is updated to use the input_audio_filename
        if input_audio_filename is not None:
            self.audio_filename_structure = input_audio_filename
        audio_filename = self.audio_filename_structure.format(show)

        # If the output file name does not include the ID of the show,
        # (i.e., if the feature_filename_structure does not include {})
        # the feature_filename_structure is updated to use the output_feature_filename
        if output_feature_filename is not None:
            self.feature_filename_structure = output_feature_filename
        feature_filename = self.feature_filename_structure.format(show)

        # Open audio file, get the signal and possibly the sampling frequency
        signal, sample_rate = read_audio(audio_filename, self.sampling_frequency)
        if signal.ndim == 1:
            signal = signal[:, numpy.newaxis]

        # Process the target channel to return Filter-Banks, Cepstral coefficients and BNF if required
        length, chan = signal.shape

        # If the size of the signal is not enough for one frame, return zero features
        if length < self.window_sample:
            cep = numpy.empty((0, self.ceps_number), dtype=PARAM_TYPE)
            energy = numpy.empty((0, 1), dtype=PARAM_TYPE)
            fb = numpy.empty((0, self.filter_bank_size), dtype=PARAM_TYPE)
            label = numpy.empty((0, 1), dtype='int8')

        else:
            # Random noise is added to the input signal to avoid zero frames.
            # The seed is fixed, so the added noise is reproducible.
            numpy.random.seed(0)
            signal[:, channel] += 0.0001 * numpy.random.randn(signal.shape[0])

            dec = self.shift_sample * 250 * 25000 + self.window_sample
            dec2 = self.window_sample - self.shift_sample
            start = 0
            end = min(dec, length)

            # Process the signal by batch to avoid problems for very long signals
            # NOTE(review): each batch overwrites cep/energy/fb/label, so only
            # the last batch is kept - confirm this is intended.
            while start < (length - dec2):
                logging.info('process part : %f %f %f',
                             start / self.sampling_frequency,
                             end / self.sampling_frequency,
                             length / self.sampling_frequency)

                if self.feature_type == 'mfcc':
                    # Extract cepstral coefficients, energy and filter banks
                    cep, energy, _, fb = mfcc(signal[start:end, channel],
                                              fs=self.sampling_frequency,
                                              lowfreq=self.lower_frequency,
                                              maxfreq=self.higher_frequency,
                                              nlinfilt=self.filter_bank_size if self.filter_bank == "lin" else 0,
                                              nlogfilt=self.filter_bank_size if self.filter_bank == "log" else 0,
                                              nwin=self.window_size,
                                              shift=self.shift,
                                              nceps=self.ceps_number,
                                              get_spec=False,
                                              get_mspec=True,
                                              prefac=self.pre_emphasis)
                elif self.feature_type == 'plp':
                    cep, energy, _, fb = plp(signal[start:end, channel],
                                             nwin=self.window_size,
                                             fs=self.sampling_frequency,
                                             plp_order=self.ceps_number,
                                             shift=self.shift,
                                             get_spec=False,
                                             get_mspec=True,
                                             prefac=self.pre_emphasis,
                                             rasta=self.rasta_plp)

                # Perform feature selection
                label, threshold = self._vad(cep, energy, fb, signal[start:end, channel])
                # Pad the labels with False so they cover every energy frame
                if len(label) < len(energy):
                    label = numpy.hstack((label, numpy.zeros(len(energy)-len(label), dtype='bool')))

                start = end - dec2
                end = min(end + dec, length)
                if cep.shape[0] > 0:
                    logging.info('!! size of signal cep: %f len %d type size %d', cep[-1].nbytes/1024/1024,
                                 len(cep[-1]),
                                 cep[-1].nbytes/len(cep[-1]))

        # Compute the mean and std of fb and cepstral coefficient computed for all selected frames
        energy_mean = energy[label].mean(axis=0)
        energy_std = energy[label].std(axis=0)
        fb_mean = fb[label, :].mean(axis=0)
        fb_std = fb[label, :].std(axis=0)
        cep_mean = cep[label, :].mean(axis=0)
        cep_std = cep[label, :].std(axis=0)

        # Create the HDF5 file
        # Create the directory if it doesn't exist. The emptiness test uses
        # truthiness: comparing a str to '' with "is" checks identity.
        dir_name = os.path.dirname(feature_filename)  # get the path
        if dir_name and not os.path.exists(dir_name):
            os.makedirs(dir_name)

        h5f = h5py.File(feature_filename, 'a', backing_store=backing_store, driver='core')
        # Drop every stream that was not requested in save_param
        if "cep" not in self.save_param:
            cep = None
            cep_mean = None
            cep_std = None
        if "energy" not in self.save_param:
            energy = None
            energy_mean = None
            energy_std = None
        if "fb" not in self.save_param:
            fb = None
            fb_mean = None
            fb_std = None
        if "vad" not in self.save_param:
            label = None
        logging.info(label)

        write_hdf5(show, h5f,
                   cep, cep_mean, cep_std,
                   energy, energy_mean, energy_std,
                   fb, fb_mean, fb_std,
                   bnf, bnf_mean, bnf_std,
                   label)

        return h5f

コード例 #10

0

ファイルを表示

ファイル: features_extractor.py プロジェクト: stevecassidy/sidekit

    def extract_from_signal(self,
                            signal,
                            sample_rate,
                            noise_file_name=None,
                            snr=10,
                            reverb_file_name=None,
                            reverb_level=-26.):
        """
        Compute acoustic parameters (filter banks, cepstral coefficients and
        log-energy) and frame labels from an in-memory signal.

        Only the first channel (column 0) is processed. Noise and
        reverberation are optionally applied to it, then a small fixed-seed
        random noise is added. The signal array is modified in place.

        :param signal: 1-D or 2-D array of samples; a 1-D array is viewed as
            a single column
        :param sample_rate: sample rate handed to ``_add_noise`` /
            ``_add_reverb``
        :param noise_file_name: if not None, passed to ``_add_noise``
        :param snr: passed to ``_add_noise`` together with the noise file
        :param reverb_file_name: if not None, passed to ``_add_reverb``
        :param reverb_level: passed to ``_add_reverb`` together with the
            reverberation file

        :return: tuple ``(label, energy, cep, fb)``; all four are empty
            arrays when the signal is shorter than ``self.window_sample``
        """
        if signal.ndim == 1:
            signal = signal[:, numpy.newaxis]

        # Add noise and reverberation to the signal when requested
        if noise_file_name is not None:
            noisy = _add_noise(signal[:, 0], noise_file_name, snr,
                               sample_rate)
            signal[:, 0] = noisy

        if reverb_file_name is not None:
            reverberated = _add_reverb(signal[:, 0], reverb_file_name,
                                       sample_rate, reverb_level)
            signal[:, 0] = reverberated

        # Only the number of samples is needed; the unpack also enforces a
        # two-dimensional signal
        length, _n_channels = signal.shape

        # Too short for a single frame: hand back empty features
        if length < self.window_sample:
            cep = numpy.empty((0, self.ceps_number), dtype=PARAM_TYPE)
            energy = numpy.empty((0, 1), dtype=PARAM_TYPE)
            fb = numpy.empty((0, self.filter_bank_size), dtype=PARAM_TYPE)
            label = numpy.empty((0, 1), dtype='int8')
            return label, energy, cep, fb

        # Fixed-seed random noise is added to the signal to avoid zero frames
        numpy.random.seed(0)
        signal[:, 0] += 0.0001 * numpy.random.randn(signal.shape[0])

        batch_len = self.shift_sample * 250 * 25000 + self.window_sample
        overlap = self.window_sample - self.shift_sample
        start = 0
        end = min(batch_len, length)

        # The signal is handled batch by batch to cope with very long inputs
        while start < (length - overlap):
            logging.info('process part : %f %f %f',
                         start / self.sampling_frequency,
                         end / self.sampling_frequency,
                         length / self.sampling_frequency)

            if self.feature_type == 'mfcc':
                # Cepstral coefficients, energy and filter banks
                n_lin = self.filter_bank_size if self.filter_bank == "lin" else 0
                n_log = self.filter_bank_size if self.filter_bank == "log" else 0
                cep, energy, _, fb = mfcc(signal[start:end, 0],
                                          fs=self.sampling_frequency,
                                          lowfreq=self.lower_frequency,
                                          maxfreq=self.higher_frequency,
                                          nlinfilt=n_lin,
                                          nlogfilt=n_log,
                                          nwin=self.window_size,
                                          shift=self.shift,
                                          nceps=self.ceps_number,
                                          get_spec=False,
                                          get_mspec=True,
                                          prefac=self.pre_emphasis)
            elif self.feature_type == 'plp':
                cep, energy, _, fb = plp(signal[start:end, 0],
                                         nwin=self.window_size,
                                         fs=self.sampling_frequency,
                                         plp_order=self.ceps_number,
                                         shift=self.shift,
                                         get_spec=False,
                                         get_mspec=True,
                                         prefac=self.pre_emphasis,
                                         rasta=self.rasta_plp)

            # Frame selection
            label, _threshold = self._vad(cep, energy, fb,
                                          signal[start:end, 0])

            # Pad the labels with False so they cover every energy frame
            missing = len(energy) - len(label)
            if missing > 0:
                label = numpy.hstack((label,
                                      numpy.zeros(missing, dtype='bool')))

            start = end - overlap
            end = min(end + batch_len, length)
            if cep.shape[0] > 0:
                last_frame = cep[-1]
                logging.info(
                    '!! size of signal cep: %f len %d type size %d',
                    last_frame.nbytes / 1024 / 1024, len(last_frame),
                    last_frame.nbytes / len(last_frame))

        return label, energy, cep, fb

コード例 #11

0

ファイルを表示

ファイル: dataset.py プロジェクト: theo2021/Automatic-language-identification

    def __data_generation_cnn(self, list_dirs_temp):
        """
        Build one batch of image-like feature tensors and one-hot targets.

        Depending on ``self.feat`` each item is turned into a mel
        spectrogram in dB (``'melspec'``), MFCC or PLP features with first
        and second deltas (``'mfcc'`` / ``'plp'``, optionally loaded with
        ``np.load`` when ``self.precomputed`` is set), or both a mel
        spectrogram and MFCC features (``'combined'``). Freshly computed
        features are centred on their mean and divided by their variance.

        :param list_dirs_temp: iterable of file paths; the class label is the
            third path component from the end (``path.split('/')[-3]``)
        :return: ``(X, target)``, or ``((X1, X2), target)`` when
            ``self.feat == 'combined'``; ``target`` is the one-hot encoding
            of the labels over ``self.n_classes`` classes
        """
        # Initialization. Rows are only written for the items provided.
        y = np.empty((self.batch_size), dtype=int)
        if self.feat == 'combined':
            X1 = np.empty((self.batch_size, *self.dim1, self.n_channels))
            X2 = np.empty((self.batch_size, *self.dim2, self.n_channels))
        else:
            X = np.empty((self.batch_size, *self.dim, self.n_channels))

        # Generate data
        for i, item_path in enumerate(list_dirs_temp):

            if self.feat == 'melspec':
                waveform, sr = sf.read(item_path)
                # The signal is passed by keyword: newer librosa releases
                # no longer accept it positionally.
                # NOTE(review): sr is fixed to 16000 rather than taken from
                # the file - confirm all inputs are 16 kHz.
                mel = librosa.feature.melspectrogram(y=waveform, sr=16000)
                ps_db = librosa.power_to_db(mel, ref=np.max).reshape(
                    (*self.dim, 1))
                # NOTE(review): scaling uses the variance, not the standard
                # deviation; kept as is.
                X[i, ] = (ps_db - np.mean(ps_db)) / np.var(ps_db)
            elif self.feat == 'mfcc':
                if self.precomputed:
                    X[i, ] = np.load(item_path)
                else:
                    waveform, sr = sf.read(item_path)
                    feat_item = mfcc(waveform + 1e-9,
                                     maxfreq=sr / 2.0,
                                     nwin=.128,
                                     shift=.032)[0]
                    feat_delta1 = compute_delta(feat_item)
                    feat_delta2 = compute_delta(feat_delta1)
                    feat = np.concatenate(
                        (feat_item, feat_delta1, feat_delta2),
                        axis=1).T.reshape((*self.dim, 1))
                    X[i, ] = (feat - np.mean(feat)) / np.var(feat)
            elif self.feat == 'plp':
                if self.precomputed:
                    X[i, ] = np.load(item_path)
                else:
                    waveform, sr = sf.read(item_path)
                    feat_item = plp(waveform + 1e-9,
                                    fs=sr,
                                    rasta=False,
                                    nwin=.128,
                                    shift=.032)[0]
                    feat_delta1 = compute_delta(feat_item)
                    feat_delta2 = compute_delta(feat_delta1)
                    feat = np.concatenate(
                        (feat_item, feat_delta1, feat_delta2),
                        axis=1).T.reshape((*self.dim, 1))
                    X[i, ] = (feat - np.mean(feat)) / np.var(feat)
            elif self.feat == 'combined':
                waveform, sr = sf.read(item_path)
                mel = librosa.feature.melspectrogram(y=waveform, sr=16000)
                ps_db = librosa.power_to_db(mel, ref=np.max).reshape(
                    (*self.dim2, 1))
                X2[i, ] = (ps_db - np.mean(ps_db)) / np.var(ps_db)
                feat_item = mfcc(waveform + 1e-9,
                                 maxfreq=sr / 2.0,
                                 nwin=.128,
                                 shift=.032)[0]
                feat_delta1 = compute_delta(feat_item)
                feat_delta2 = compute_delta(feat_delta1)
                feat = np.concatenate((feat_item, feat_delta1, feat_delta2),
                                      axis=1).T.reshape((*self.dim1, 1))
                X1[i, ] = (feat - np.mean(feat)) / np.var(feat)

            # Store class
            label = item_path.split('/')[-3]
            y[i] = self.target_to_class[label]

        target = keras.utils.to_categorical(y, num_classes=self.n_classes)
        if self.feat == 'combined':
            return (X1, X2), target
        return X, target