Ejemplo n.º 1
0
def get_plp_dd(wav_fn, norm):
    """Return PLP features with deltas and delta-deltas for an audio file.

    Parameters
    ----------
    wav_fn : str
        Path of the wav file to process.
    norm : str
        When "cmvn", apply per-dimension mean/variance normalization
        over time.
    """
    signal = Audio.load(wav_fn)
    plp_proc = PlpProcessor(
        sample_rate=signal.sample_rate,
        window_type="hamming",
        frame_length=0.025,
        frame_shift=0.01,
        low_freq=0,
        vtln_low=60,
        vtln_high=7200,
        high_freq=signal.sample_rate / 2,
    )
    # no VTLN warping here: warp factor fixed at 1.0
    static_feats = plp_proc.process(signal, vtln_warp=1.0)
    with_deltas = DeltaPostProcessor(order=2).process(static_feats)
    # NOTE(review): reaches into the private _to_dict() API of shennong
    # features — confirm a public accessor (e.g. .data) is not preferable
    feats = np.float64(with_deltas._to_dict()["data"])
    if norm == "cmvn":
        feats = (feats - np.mean(feats, axis=0)) / np.std(feats, axis=0)
    return feats
Ejemplo n.º 2
0
def _load_warp_factors(lang):
    """Load per-utterance VTLN warp factors for *lang*, caching as a pickle.

    Reads ``warps_{lang}.pkl`` when it exists; otherwise parses
    ``warps_{lang}.txt`` (one "<utterance-id> <factor>" pair per line),
    writes the pickle cache, then loads and returns the dict.

    Raises
    ------
    SystemExit
        When neither the pickle nor the text file exists.
    """
    pkl_fn = "warps_{}.pkl".format(lang)
    if not os.path.isfile(pkl_fn):
        txt_fn = 'warps_{}.txt'.format(lang)
        if not os.path.isfile(txt_fn):
            # was: print(...) + bare exit(), which terminated with status 0
            # on a fatal error; SystemExit prints the message to stderr and
            # exits non-zero so calling scripts see the failure
            raise SystemExit('no warp factors found')
        factors = {}
        with open(txt_fn, mode='r', encoding='utf-8') as opfile:
            for line in opfile.read().split('\n'):
                # skip blank/degenerate lines
                if len(line) > 1:
                    l_sp = line.split()
                    factors[l_sp[0]] = float(l_sp[1])
        # note: debug print of the accumulated dict on every line removed
        with open(pkl_fn, mode='wb') as opfile:
            pickle.dump(factors, opfile)
    with open(pkl_fn, mode="rb") as op:
        return pickle.load(op)


def get_mfcc_vtln(wav_fn, f, norm, lang):
    """Return VTLN-warped MFCCs with deltas and delta-deltas for an audio file.

    Parameters
    ----------
    wav_fn : str
        Path of the wav file to process.
    f : str
        File name whose basename (without ".wav") keys the warp-factor table.
    norm : str
        When "cmvn", apply per-dimension mean/variance normalization.
    lang : str
        Language id selecting the ``warps_{lang}`` factor file.
    """
    ref = os.path.basename(f).replace(".wav", "")
    factors = _load_warp_factors(lang)
    warp = float(factors[ref])
    audio = Audio.load(wav_fn)
    processor = MfccProcessor(sample_rate=audio.sample_rate,
                              window_type="hamming",
                              frame_length=0.025,
                              frame_shift=0.01,
                              cepstral_lifter=26.0,
                              low_freq=0,
                              vtln_low=60,
                              vtln_high=7200,
                              high_freq=audio.sample_rate / 2)
    d_processor = DeltaPostProcessor(order=2)
    mfcc_static = processor.process(audio, vtln_warp=warp)
    mfcc_deltas = d_processor.process(mfcc_static)
    # NOTE(review): relies on the private _to_dict() API of shennong features
    features = np.float64(mfcc_deltas._to_dict()["data"])
    if norm == "cmvn":
        features = (features - np.mean(features, axis=0)) / np.std(features,
                                                                   axis=0)

    return features
Ejemplo n.º 3
0
def test_params():
    """get_params/set_params round-trip and window validation."""
    proc = DeltaPostProcessor()
    proc.order = 0

    # window must lie strictly inside its valid range
    with pytest.raises(ValueError):
        proc.window = 0
    with pytest.raises(ValueError):
        proc.window = 2000
    proc.window = 1

    assert proc.get_params() == {'order': 0, 'window': 1}

    expected = {'order': 0, 'window': 1}
    proc = DeltaPostProcessor()
    # default derivative order is 2
    assert proc.get_params()['order'] == 2
    proc.set_params(**expected)
    assert proc.get_params() == expected
Ejemplo n.º 4
0
def test_ndims():
    """The delta processor cannot report ndims before seeing any input."""
    processor = DeltaPostProcessor()
    with pytest.raises(ValueError) as exc_info:
        processor.ndims
    assert 'output dimension for delta processor depends on input' in str(exc_info)
Ejemplo n.º 5
0
def test_output(mfcc, order, window):
    """Deltas keep frame count and times, stacking (order+1) feature groups."""
    processor = DeltaPostProcessor(order=order, window=window)
    deltas = processor.process(mfcc)

    assert deltas.shape[0] == mfcc.shape[0]
    assert deltas.shape[1] == mfcc.shape[1] * (order + 1)
    assert np.array_equal(deltas.times, mfcc.times)
    # the leading columns are the untouched static features
    assert deltas.data[:, :mfcc.shape[1]] == pytest.approx(mfcc.data)
Ejemplo n.º 6
0
    def get_features(self, y, sample_rate):
        """Feature extraction

        Parameters
        ----------
        y : (n_samples, 1) numpy array
            Waveform
        sample_rate : int
            Sample rate

        Returns
        -------
        data : (n_frames, n_dimensions) numpy array
            Features
        """
        # Rescale the waveform to [-1, 1] before building the shennong
        # Audio object: pyannote normalizes the signal when using "data
        # augmentation" but not when loading plain data, so normalize here
        # for consistency between the two paths.
        peak = np.max((-np.min(y), np.max(y)))
        y = y / peak

        audio = Audio(data=y, sample_rate=sample_rate)

        # Configure the MFCC extractor from the instance settings; these
        # attributes share their names with the processor parameters.
        processor = MfccProcessor(sample_rate=sample_rate)
        for attr in ('dither', 'preemph_coeff', 'remove_dc_offset',
                     'window_type', 'blackman_coeff', 'vtln_low',
                     'vtln_high', 'energy_floor', 'raw_energy',
                     'cepstral_lifter', 'htk_compat'):
            setattr(processor, attr, getattr(self, attr))

        # Remaining parameters have instance-specific names.
        processor.low_freq = self.mfccLowFreq
        processor.high_freq = self.mfccHighFreq  # defines it as (nyquist - 100)
        processor.use_energy = self.e
        processor.num_ceps = self.coefs
        processor.snip_edges = False  # end with correct number of frames

        # MFCC extraction
        mfcc = processor.process(audio)

        # Append first (D) or first+second (DD) order derivatives.
        if self.D:
            derivative_order = 2 if self.DD else 1
            mfcc = DeltaPostProcessor(order=derivative_order).process(mfcc)

        # Cepstral mean/variance normalization: accumulate the statistics,
        # then apply them.
        if self.with_cmvn:
            postproc = CmvnPostProcessor(self.get_dimension(), stats=None)
            postproc.accumulate(mfcc)
            mfcc = postproc.process(mfcc)

        # Optionally append pitch features; otherwise keep the raw data.
        if self.with_pitch:
            pitch = self.get_pitch(audio, self.pitchFmin, self.pitchFmax)
            mfcc = self.concatenate_with_pitch(mfcc.data, pitch.data)
        else:
            mfcc = mfcc.data

        return mfcc
import argparse
import glob
import os
import pickle
import numpy as np
import pandas as pd
from pathlib import Path

from shennong.audio import Audio
from shennong.features.processor.mfcc import MfccProcessor
from shennong.features.postprocessor.delta import DeltaPostProcessor
from shennong.features.processor.bottleneck import BottleneckProcessor

# Shared processor instances: MFCCs at a fixed 8 kHz sample rate,
# second-order deltas, and bottleneck features using the multilingual
# Babel weights.
mfcc_processor  = MfccProcessor(sample_rate=8000)
delta_processor = DeltaPostProcessor(order=2)
bnf_processor   = BottleneckProcessor(weights='BabelMulti')

# Command-line interface: choose the feature type and dataset, plus the
# directories for raw inputs and extracted features.
parser = argparse.ArgumentParser(
    description='example: python wav_to_shennong-feats.py mfcc wrm-pd',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

# positional arguments
parser.add_argument('features', help='features to extract using the Shennong library (mfcc or bnf), use _all_ for both')
parser.add_argument('dataset', help = 'name of dataset, use _all_ to iterate over all')

# output/input locations
parser.add_argument('--feats_dir',  default='data/interim/features', help = "directory for features")
parser.add_argument('--datasets_dir', default='data/raw/datasets', help = "directory for raw datasets and labels files")

# query-by-example wav directories
parser.add_argument('--queries_dir',  default='queries', help = "directory with .wav files for queries")
parser.add_argument('--references_dir',  default='references', help = "directory with .wav files for references")