Example #1
def get_plp_dd(wav_fn, norm):
    """Return the MFCCs with deltas and delta-deltas for a audio file."""
    audio = Audio.load(wav_fn)
    processor = PlpProcessor(sample_rate=audio.sample_rate,
                             window_type="hamming",
                             frame_length=0.025,
                             frame_shift=0.01,
                             low_freq=0,
                             vtln_low=60,
                             vtln_high=7200,
                             high_freq=audio.sample_rate / 2)
    plp_static = processor.process(audio, vtln_warp=1.0)
    d_processor = DeltaPostProcessor(order=2)
    plp_deltas = d_processor.process(plp_static)
    features = np.float64(plp_deltas.data)
    if norm == "cmvn":
        features = (features - np.mean(features, axis=0)) / np.std(features, axis=0)

    return features
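A minimal usage sketch for the function above; the wav path is illustrative, and with shennong's default 13 PLP coefficients the delta/delta-delta stack yields 39 dimensions per frame:

feats = get_plp_dd("utt001.wav", norm="cmvn")   # hypothetical wav path
print(feats.shape)                              # e.g. (n_frames, 39)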
Example #2
def get_mfcc_vtln(wav_fn, f, norm, lang):
    """Return the MFCCs with deltas and delta-deltas for a audio file."""
    ref = os.path.basename(f).replace(".wav", "")
    if not os.path.isfile("warps_{}.pkl".format(lang)):
        if os.path.isfile('warps_{}.txt'.format(lang)):
            factors = {}
            with open('warps_{}.txt'.format(lang), mode='r',
                      encoding='utf-8') as opfile:
                wop = opfile.read().split('\n')
                for line in wop:
                    if len(line) > 1:
                        l_sp = line.split()
                        factors[l_sp[0]] = float(l_sp[1])
                print(factors)
            with open('warps_{}.pkl'.format(lang), mode='wb') as opfile:
                pickle.dump(factors, opfile)
        else:
            raise SystemExit('no warp factors found for {}'.format(lang))
    with open("warps_{}.pkl".format(lang), mode="rb") as op:
        factors = pickle.load(op)
    warp = float(factors[ref])
    audio = Audio.load(wav_fn)
    processor = MfccProcessor(sample_rate=audio.sample_rate,
                              window_type="hamming",
                              frame_length=0.025,
                              frame_shift=0.01,
                              cepstral_lifter=26.0,
                              low_freq=0,
                              vtln_low=60,
                              vtln_high=7200,
                              high_freq=audio.sample_rate / 2)
    d_processor = DeltaPostProcessor(order=2)
    mfcc_static = processor.process(audio, vtln_warp=warp)
    mfcc_deltas = d_processor.process(mfcc_static)
    features = np.float64(mfcc_deltas.data)
    if norm == "cmvn":
        features = (features - np.mean(features, axis=0)) / np.std(features,
                                                                   axis=0)

    return features
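For reference, a sketch of the warp table the function assumes: warps_<lang>.txt holds one whitespace-separated "utterance-id warp-factor" pair per line, keyed by the wav basename without its extension. The file name, ids and values below are illustrative:

# warps_english.txt
#   utt001 0.95
#   utt002 1.05
feats = get_mfcc_vtln("data/utt001.wav", "data/utt001.wav", norm="cmvn", lang="english")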
Example #3
    def get_features(self, y, sample_rate):
        """Feature extraction

        Parameters
        ----------
        y : (n_samples, 1) numpy array
            Waveform
        sample_rate : int
            Sample rate

        Returns
        -------
        data : (n_frames, n_dimensions) numpy array
            Features
        """
        # scale the audio signal to [-1, 1] before creating the
        # shennong Audio object: pyannote normalizes the signal when
        # it applies data augmentation, but not when it loads the
        # data without augmentation, so normalize here in both cases
        y = y / np.max((-np.min(y), np.max(y)))

        # create audio object for shennong
        audio = Audio(data=y, sample_rate=sample_rate)

        # MFCC parameters
        processor = MfccProcessor(sample_rate=sample_rate)
        processor.dither = self.dither
        processor.preemph_coeff = self.preemph_coeff
        processor.remove_dc_offset = self.remove_dc_offset
        processor.window_type = self.window_type
        processor.blackman_coeff = self.blackman_coeff
        processor.vtln_low = self.vtln_low
        processor.vtln_high = self.vtln_high
        processor.energy_floor = self.energy_floor
        processor.raw_energy = self.raw_energy
        processor.cepstral_lifter = self.cepstral_lifter
        processor.htk_compat = self.htk_compat

        processor.low_freq = self.mfccLowFreq
        processor.high_freq = self.mfccHighFreq  # defined elsewhere as (Nyquist - 100)
        processor.use_energy = self.e
        processor.num_ceps = self.coefs
        processor.snip_edges = False  # end with correct number of frames

        # MFCC extraction
        mfcc = processor.process(audio)
        # compute deltas
        if self.D:
            # define first or second order derivative
            if not self.DD:
                derivative_proc = DeltaPostProcessor(order=1)
            else:
                derivative_proc = DeltaPostProcessor(order=2)

            # append deltas (and delta-deltas) to the MFCCs
            mfcc = derivative_proc.process(mfcc)

        # Compute CMVN
        if self.with_cmvn:
            # define cmvn
            postproc = CmvnPostProcessor(self.get_dimension(), stats=None)

            # accumulate per-utterance statistics before applying them
            postproc.accumulate(mfcc)

            # process cmvn
            mfcc = postproc.process(mfcc)

        # Compute Pitch
        if self.with_pitch:
            # extract pitch
            pitch = self.get_pitch(audio, self.pitchFmin, self.pitchFmax)

            mfcc = self.concatenate_with_pitch(mfcc.data, pitch.data)

        else:
            mfcc = mfcc.data

        return mfcc
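Because the method above relies on class attributes (self.dither, self.coefs, self.with_cmvn, ...) defined elsewhere, here is a hedged, self-contained sketch of the core shennong pipeline it wraps; the processor import paths are assumptions (they moved between shennong releases) and the waveform is synthetic:

import numpy as np
from shennong.audio import Audio
from shennong.processor.mfcc import MfccProcessor
from shennong.postprocessor.delta import DeltaPostProcessor

sample_rate = 16000
y = np.random.randn(sample_rate)         # one second of synthetic mono audio
y = y / np.max(np.abs(y))                # scale to [-1, 1], as in get_features
audio = Audio(data=y, sample_rate=sample_rate)

processor = MfccProcessor(sample_rate=sample_rate)
processor.snip_edges = False             # keep the expected number of frames
mfcc = DeltaPostProcessor(order=2).process(processor.process(audio))
print(mfcc.data.shape)                   # (n_frames, 39) with the default 13 ceps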