def getModulation(x, fmin, fmax, bands, sr):
    """Apply an ERB-spaced gammatone filterbank to every column of ``x``.

    The centre frequencies are ``bands`` values produced by ``erbspace``
    between ``fmin`` and ``fmax`` (both given in Hz).  Each column
    ``x[:, i]`` is wrapped in a ``Sound`` sampled at ``sr`` Hz, filtered,
    and the per-column outputs are stacked along a new leading axis.
    """
    centre_freqs = erbspace(fmin * Hz, fmax * Hz, bands)
    per_column = [
        Gammatone(Sound(x[:, col], samplerate=sr * Hz), centre_freqs).process()
        for col in range(x.shape[1])
    ]
    return np.asarray(per_column)
Example #2
0
def define_f_bands(stt=180, stp=7000, n_bands=32, kind='log'):
    '''
    Build the frequency bands used for auditory spectrogram extraction.

    Brian used 180 - 7000 Hz, so for now those are the defaults.

    INPUTS
    --------
        stt : int
            The starting frequency
        stp : int
            The end frequency
        n_bands : int
            The number of bands to calculate
        kind : string, ['log', 'lin', 'erb']
            What kind of spacing will we use for the frequency bands.

    OUTPUTS
    --------
        aud_fs : array
            The band frequencies. For 'log' and 'lin' the values are
            truncated to integers; for 'erb' the result of
            hears.erbspace is returned unchanged.

    Raises NameError when `kind` is not one of the values above.
    '''
    if kind == 'erb':
        # ERB spacing is delegated entirely to brian.hears
        return hears.erbspace(stt*Hz, stp*Hz, n_bands)

    if kind == 'log':
        spaced = np.logspace(np.log10(stt), np.log10(stp), n_bands)
    elif kind == 'lin':
        spaced = np.linspace(stt, stp, n_bands)
    else:
        raise NameError("I don't know what kind of spacing that is")
    return spaced.astype(int)
Example #3
0
def define_f_bands(stt=180, stp=7000, n_bands=32, kind='log'):
    '''
    Compute the frequency bands for auditory spectrogram extraction.

    The default range (180 - 7000 Hz) is the one Brian used.

    INPUTS
    --------
        stt : int
            The starting frequency
        stp : int
            The end frequency
        n_bands : int
            The number of bands to calculate
        kind : string, ['log', 'lin', 'erb']
            What kind of spacing will we use for the frequency bands.
            'log' and 'lin' results are truncated to integers, 'erb'
            returns whatever hears.erbspace produces.

    Any other `kind` raises NameError.
    '''
    if kind == 'log':
        lo_exp, hi_exp = np.log10(stt), np.log10(stp)
        return np.logspace(lo_exp, hi_exp, n_bands).astype(int)
    if kind == 'lin':
        return np.linspace(stt, stp, n_bands).astype(int)
    if kind == 'erb':
        return hears.erbspace(stt * Hz, stp * Hz, n_bands)
    raise NameError("I don't know what kind of spacing that is")
Example #4
0
def create_center_frequencies(stt=180, stp=7000, n_bands=32, kind='log'):
    '''
    Define center frequencies for spectrograms.

    Generally this is for auditory spectrogram extraction. Most auditory
    analysis uses 180 - 7000 Hz, so for now those
    are the defaults.

    Parameters
    ----------
    stt : float | int
        The starting frequency
    stp : float | int
        The end frequency
    n_bands : int
        The number of bands to calculate
    kind : 'log' | 'erb'
        Whether to use log or erb spacing

    Returns
    -------
    freqs : array, shape (n_bands,)
        An array of center frequencies. With ``kind='log'`` the values
        are truncated to integers.

    Raises
    ------
    ValueError
        If `kind` is neither 'log' nor 'erb'.
    '''
    if kind == 'log':
        freqs = np.logspace(np.log10(stt), np.log10(stp), n_bands).astype(int)
    elif kind == 'erb':
        freqs = hears.erbspace(stt * Hz, stp * Hz, n_bands)
    else:
        # Previously this branch only printed the message and then fell
        # through to ``return freqs`` with ``freqs`` unbound, which crashed
        # with an unrelated UnboundLocalError. Fail explicitly instead.
        raise ValueError(
            "I don't know what kind of spacing that is: %r" % (kind,))
    return freqs
Example #5
0
def create_center_frequencies(stt=180, stp=7000, n_bands=32, kind='log'):
    '''
    Define center frequencies for spectrograms.

    Generally this is for auditory spectrogram extraction. Most auditory
    analysis uses 180 - 7000 Hz, so for now those
    are the defaults.

    Parameters
    ----------
    stt : float | int
        The starting frequency
    stp : float | int
        The end frequency
    n_bands : int
        The number of bands to calculate
    kind : 'log' | 'erb'
        Whether to use log or erb spacing

    Returns
    -------
    freqs : array, shape (n_bands,)
        An array of center frequencies (integer-truncated for 'log').

    Raises
    ------
    ValueError
        If `kind` is not a supported spacing.
    '''
    if kind == 'log':
        log_start, log_stop = np.log10(stt), np.log10(stp)
        return np.logspace(log_start, log_stop, n_bands).astype(int)
    if kind == 'erb':
        return hears.erbspace(stt * Hz, stp * Hz, n_bands)
    # The original printed this message and then failed on an unbound
    # ``freqs`` variable; raise a meaningful error for the caller instead.
    raise ValueError(
        "I don't know what kind of spacing that is: %r" % (kind,))
Example #6
0
def extract_features(fname, bdir, sox, htk_mfc, mfc_extension, stereo_wav,
        gammatones, spectrograms, filterbanks):
    """Extract the requested features for a single wav file.

    Files whose name does not end in '.wav' are ignored.  All outputs are
    written next to the input, using the input name without its extension
    as a prefix.

    fname : file name (relative to `bdir`)
    bdir : directory containing the file
    sox : if true, re-encode the file with sox and keep the original as
        '<name>.rawaudio'
    htk_mfc : if true, run HCopy (configured by 'wav_config') and write
        '<name><mfc_extension>'
    mfc_extension : extension used for the HCopy output
    stereo_wav : if true and the sound is 2-D, average the first two
        channels
    gammatones : if true, save the gammatone filterbank output to
        '<name>_gamma.npy'
    spectrograms : if true, save the transposed power spectrogram to
        '<name>_specgram.npy'
    filterbanks : if true, save log Mel filterbanks to '<name>_fbanks.npy'
    """
    if fname[-4:] != '.wav':
        return
    stem = bdir+'/'+fname[:-4]
    wavfname = bdir+'/'+fname
    if sox:
        # w/o headers, sox uses the extension, hence a temp name in .wav
        tempfname = stem+'_temp.wav'
        shutil.move(wavfname, tempfname)
        call(['sox', tempfname, wavfname])
        shutil.move(tempfname, stem+'.rawaudio')
    if htk_mfc:
        call(['HCopy', '-C', 'wav_config', wavfname, stem+mfc_extension])
    sound, srate = readwav(wavfname)
    if stereo_wav and len(sound.shape) == 2: # in mono sound is a list
        # Average both channels.  mean() accumulates integer samples in
        # floating point, so loud int16 channels cannot wrap around the way
        # ``0.5 * (left + right)`` did when the sum overflowed.
        sound = sound[:, :2].mean(axis=1)
    if gammatones:
        tmp_snd = loadsound(wavfname)
        gamma_cf = erbspace(20*Hz, 20*kHz, N_GAMMATONES_FILTERS)
        gamma_fb = Gammatone(tmp_snd, gamma_cf)
        # .npy is a binary format: the file must be opened in binary mode
        with open(stem+'_gamma.npy', 'wb') as o_f:
            npsave(o_f, gamma_fb.process())
    if spectrograms:
        powerspec, _, _, _ = specgram(sound,
                NFFT=int(srate * SPECGRAM_WINDOW),
                Fs=srate,
                noverlap=int(srate * SPECGRAM_OVERLAP)) # TODO
        with open(stem+'_specgram.npy', 'wb') as o_f:
            npsave(o_f, powerspec.T)
    if filterbanks:
        # convert to Mel filterbanks
        fbanks = Spectral(nfilt=N_FBANKS,      # nb of filters in mel bank
                     alpha=0.97,               # pre-emphasis
                     do_dct=False,             # we do not want MFCCs
                     compression='log',
                     fs=srate,                 # sampling rate
                     lowerf=50,                # lower frequency
                     frate=FBANKS_RATE,        # frame rate
                     wlen=FBANKS_WINDOW,       # window length
                     nfft=1024,                # length of dft
                     do_deltas=False,          # speed
                     do_deltasdeltas=False     # acceleration
                     )
        # peak-normalise in place
        # NOTE(review): a silent file (peak 0) divides by zero here, and an
        # integer-typed `sound` may not support in-place true division --
        # confirm what readwav returns.
        sound /= np.abs(sound).max(axis=0)  # TODO put that as option
        fbank = fbanks.transform(sound)
        with open(stem+'_fbanks.npy', 'wb') as o_f:
            npsave(o_f, fbank)
    # TODO wavelets scattergrams / scalograms
    print("dealt with file %s" % wavfname)
def process(folder,
        debug=False,
        htk_mfc=False,
        forcemfcext=False,
        stereo_wav=False,
        gammatones=False,
        spectrograms=False):
    """Convert every *.wav under `folder` and extract the requested features.

    Each file is re-encoded with sox (the original is kept as
    '<name>.rawaudio'), then optionally processed:

    debug : unused in this function
    htk_mfc : run HCopy (configured by 'wav_config') to produce MFCC files
    forcemfcext : always use the '.mfc' extension for the HCopy output
    stereo_wav : wav files are stereo; channel 1 is used
    gammatones : save gammatone filterbank output to '<name>_gamma.npy'
    spectrograms : save the power spectrogram to '<name>_specgram.npy'
    """

    # first find if we produce normalized MFCC, otherwise note it in the ext
    # because we can then normalize on the whole corpus with another py script
    mfc_extension = '.mfc_unnorm'
    with open('wav_config', 'r') as wcfg:
        if any("ENORMALISE" in line for line in wcfg):
            mfc_extension = '.mfc'
    if forcemfcext:
        mfc_extension = '.mfc'
    print("MFC extension: %s" % mfc_extension)

    # Optional dependencies are imported once, before any file is modified,
    # instead of on every iteration of the loop below.
    if gammatones:
        from brian import Hz, kHz
        from brian.hears import loadsound, erbspace, Gammatone
    if spectrograms:
        from pylab import specgram

    # run through all the folders and files in the path "folder"
    # and put a header to the waves, save the originals as .rawaudio
    # use HCopy to produce MFCC files according to "wav_config" file
    for d, ds, fs in os.walk(folder):
        for fname in fs:
            if fname[-4:] != '.wav':
                continue
            stem = d+'/'+fname[:-4]
            wavfname = d+'/'+fname
            tempfname = stem+'_temp.wav' # temp fname with .wav for sox
            shutil.move(wavfname, tempfname)
            call(['sox', tempfname, wavfname]) # w/o headers, sox uses extension
            shutil.move(tempfname, stem+'.rawaudio')
            if htk_mfc:
                call(['HCopy', '-C', 'wav_config', wavfname,
                      stem+mfc_extension])
            sr, sound = wavfile.read(wavfname)
            if stereo_wav and len(sound.shape) == 2: # in mono sound is a list
                sound = sound[:,1] # for stereo wav, arbitrarily take channel 1
            if gammatones:
                tmp_snd = loadsound(wavfname)
                cf = erbspace(20*Hz, 20*kHz, N_GAMMATONES_FILTERS)
                fb = Gammatone(tmp_snd, cf)
                # .npy is binary: open in 'wb', not text mode
                with open(stem+'_gamma.npy', 'wb') as of:
                    numpy.save(of, fb.process())
            if spectrograms:
                Pxx, freqs, bins, im = specgram(sound,
                        NFFT=int(sr * SPECGRAM_WINDOW),
                        Fs=sr,
                        noverlap=int(sr * SPECGRAM_OVERLAP))
                with open(stem+'_specgram.npy', 'wb') as of:
                    numpy.save(of, Pxx.T)
            print("dealt with file %s" % wavfname)
Example #8
0
def drnl(sound, n_channels=50, uncompressed=True):
  """Run `sound` through the predefined DRNL cochlear model.

  See Lopez-Poveda et al 2001.  The filterbank has `n_channels` centre
  frequencies on the ERB (equivalent rectangular bandwidth) scale between
  100 and 8000 Hz.  When `uncompressed` is false the channel activations
  are clipped at zero and resampled to sound.nsamples / 1000 samples.
  """
  centre_freqs = erbspace(100*Hz, 8000*Hz, n_channels)
  cochlea = DRNL(sound, centre_freqs, type='human')  # see DRNL documentation
  print('processing sound')
  activations = cochlea.process()       # array of channel activations
  if uncompressed:
      return activations
  # fast oscillations can't be downsampled unless rectified first
  rectified = activations.clip(0.0)
  # downsample for memory reasons
  target_len = int(round(sound.nsamples/1000.0))
  return resample(rectified, target_len)
Example #9
0
def drnl(sound, n_channels=50, uncompressed=True):
    """Compute DRNL cochlear-model channel activations for `sound`.

    Uses the predefined 'human' model (see Lopez-Poveda et al 2001) with
    `n_channels` ERB-scale centre frequencies between 100 and 8000 Hz.
    Unless `uncompressed` is true, the output is clipped at zero and then
    resampled down to sound.nsamples / 1000 samples to save memory.
    """
    erb_centres = erbspace(100 * Hz, 8000 * Hz, n_channels)
    model = DRNL(sound, erb_centres, type='human')
    print('processing sound')
    channels = model.process()
    if not uncompressed:
        # rectify first: fast oscillations can't be downsampled otherwise
        channels = channels.clip(0.0)
        n_out = int(round(sound.nsamples / 1000.0))
        channels = resample(channels, n_out)
    return channels
Example #10
0
def process(folder,
            debug=False,
            htk_mfc=False,
            forcemfcext=False,
            stereo_wav=False,
            gammatones=False,
            spectrograms=False,
            filterbanks=False,
            sox=True):
    """Extract features for all *.wav files found under `folder`.

    debug : unused in this function
    htk_mfc : run HCopy (configured by 'wav_config') to produce MFCC files
    forcemfcext : always use the '.mfc' extension for the HCopy output
    stereo_wav : wav files are stereo; the first two channels are summed
    gammatones : save gammatone filterbank output to '<name>_gamma.npy'
    spectrograms : save the power spectrogram to '<name>_specgram.npy'
    filterbanks : save Mel filterbanks to '<name>_fbanks.npy'
    sox : re-encode each file with sox first, keeping the original as
        '<name>.rawaudio'

    Exits the interpreter with status -1 when an optional dependency
    needed for a requested feature cannot be imported.
    """

    # first find if we produce normalized MFCC, otherwise note it in the ext
    # because we can then normalize on the whole corpus with another py script
    mfc_extension = '.mfc_unnorm'
    with open('wav_config', 'r') as wcfg:
        if any("ENORMALISE" in line for line in wcfg):
            mfc_extension = '.mfc'
    if forcemfcext:
        mfc_extension = '.mfc'
    print("MFC extension: %s" % mfc_extension)
    if gammatones:
        try:
            from brian import Hz, kHz
            from brian.hears import loadsound, erbspace, Gammatone
        except ImportError:
            sys.stderr.write("You need Brian Hears\n")
            # the URL used to be split with a backslash inside the string
            # literal, which embedded the next line's indentation in it
            sys.stderr.write(
                "http://www.briansimulator.org/docs/hears.html\n")
            sys.exit(-1)
    if spectrograms:
        try:
            from pylab import specgram
        except ImportError:
            sys.stderr.write("You need Pylab\n")
            sys.exit(-1)
    fbanks = None
    if filterbanks:
        try:
            sys.path.append('../spectral')
            from spectral import Spectral
        except ImportError:
            sys.stderr.write("You need spectral (in the parent folder)\n")
            sys.stderr.write("https://github.com/mwv/spectral\n")
            sys.exit(-1)

    # run through all the folders and files in the path "folder"
    # and put a header to the waves, save the originals as .rawaudio
    # use HCopy to produce MFCC files according to "wav_config" file
    for bdir, _, files in os.walk(folder):
        for fname in files:
            if fname[-4:] != '.wav':
                continue
            stem = bdir + '/' + fname[:-4]
            wavfname = bdir + '/' + fname
            if sox:
                # temp fname with .wav: w/o headers, sox uses the extension
                tempfname = stem + '_temp.wav'
                shutil.move(wavfname, tempfname)
                call(['sox', tempfname, wavfname])
                shutil.move(tempfname, stem + '.rawaudio')
            if htk_mfc:
                call(['HCopy', '-C', 'wav_config', wavfname,
                      stem + mfc_extension])
            srate, sound = wavfile.read(wavfname)
            if stereo_wav and len(sound.shape) == 2:  # in mono sound is a list
                # for stereo wav, sum both channels.  sum() accumulates
                # small integer dtypes in the platform integer, so int16
                # samples no longer wrap around as with ``left + right``.
                sound = sound[:, :2].sum(axis=1)
            if gammatones:
                tmp_snd = loadsound(wavfname)
                gamma_cf = erbspace(20 * Hz, 20 * kHz, N_GAMMATONES_FILTERS)
                gamma_fb = Gammatone(tmp_snd, gamma_cf)
                # .npy is a binary format: open in 'wb', not text mode
                with open(stem + '_gamma.npy', 'wb') as o_f:
                    npsave(o_f, gamma_fb.process())
            if spectrograms:
                powerspec, _, _, _ = specgram(
                    sound,
                    NFFT=int(srate * SPECGRAM_WINDOW),
                    Fs=srate,
                    noverlap=int(srate * SPECGRAM_OVERLAP))  # TODO
                with open(stem + '_specgram.npy', 'wb') as o_f:
                    npsave(o_f, powerspec.T)
            if filterbanks:
                # convert to Mel filterbanks
                if fbanks is None:  # assume parameters are fixed
                    fbanks = Spectral(
                        nfilt=N_FBANKS,  # nb of filters in mel bank
                        alpha=0.97,  # pre-emphasis
                        do_dct=False,  # we do not want MFCCs
                        fs=srate,  # sampling rate
                        frate=FBANKS_RATE,  # frame rate
                        wlen=FBANKS_WINDOW,  # window length
                        nfft=1024,  # length of dft
                        do_deltas=False,  # speed
                        do_deltasdeltas=False  # acceleration
                    )
                fbank = fbanks.transform(sound)[0]  # first dimension is for
                # deltas & deltasdeltas
                with open(stem + '_fbanks.npy', 'wb') as o_f:
                    npsave(o_f, fbank)
            # TODO wavelets scattergrams / scalograms
            print("dealt with file %s" % wavfname)
Example #11
0
def process(folder, debug=False, htk_mfcc=False, forcemfcext=False,
            stereo_wave=False, gammatones=False, spectograms=False,
            filterbanks=False, sox=True):
    """Extract features for all *.WAV files found under `folder`.

    debug : unused in this function
    htk_mfcc : run HCopy (configured by 'wav_config'); output goes to
        '<name>.txt'
    forcemfcext : force the reported MFC extension to '.mfc'
    stereo_wave : wav files are stereo; the first two channels are summed
    gammatones : save gammatone filterbank output to '<name>_gamma.npy'
    spectograms : save the power spectrogram to '<name>_specgram.npy'
    filterbanks : save Mel filterbanks to '<name>_fbanks.npy'
    sox : re-encode each file with sox first, keeping the original as
        '<name>.rawaudio'

    Exits the interpreter with status -1 when an optional dependency
    needed for a requested feature cannot be imported.
    """
    # NOTE(review): mfc_extension is only reported here; the HCopy output
    # below always uses '.txt'.
    mfc_extension = '.mfc_unnorm'
    with open('wav_config', 'r') as wcfg:
        if any("ENORMALISE" in line for line in wcfg):
            mfc_extension = '.mfc'
    if forcemfcext:
        mfc_extension = '.mfc'
    print("MFC Extension is %s" % mfc_extension)

    if gammatones:
        try:
            from brian import Hz, kHz
            from brian.hears import loadsound, erbspace, Gammatone
        except ImportError:
            sys.stderr.write("You need Brian Hears\n")
            sys.exit(-1)

    if spectograms:
        try:
            from pylab import specgram
        except ImportError:
            sys.stderr.write('You need Pylab\n')
            sys.exit(-1)

    fbanks = None
    if filterbanks:
        try:
            sys.path.append('../spectral')
            from spectral import Spectral
        except ImportError:
            sys.stderr.write('you need spectral (in the parent folder)\n')
            # without this exit the loop below failed later with a
            # NameError on Spectral
            sys.exit(-1)

    for bdir, _, files in os.walk(folder):
        for fname in files:
            if fname[-4:] != '.WAV':
                continue
            stem = bdir + '/' + fname[:-4]
            wavfname = bdir + '/' + fname
            mfccfname = stem + '.txt'
            if sox:
                tempfname = stem + '_temp.wav'
                shutil.move(wavfname, tempfname)
                call(['sox', tempfname, wavfname])
                # Keep the original under .rawaudio.  It used to be moved
                # back onto wavfname, overwriting the file sox had just
                # written and leaving rawfname unused.
                shutil.move(tempfname, stem + '.rawaudio')

            if htk_mfcc:
                call(['HCopy', '-C', 'wav_config', wavfname, mfccfname])

            srate, sound = wavfile.read(wavfname)
            # was ``len(sound.shape == 2)``, which raises TypeError
            if stereo_wave and len(sound.shape) == 2:
                # sum both channels; sum() accumulates small integer dtypes
                # in the platform integer so int16 samples cannot wrap
                sound = sound[:, :2].sum(axis=1)
            if gammatones:
                tmp_snd = loadsound(wavfname)
                gamma_cf = erbspace(20*Hz, 20*kHz, n_gmammatones_filters)
                gamma_fb = Gammatone(tmp_snd, gamma_cf)
                # .npy is a binary format: open in 'wb', not text mode
                with open(stem + '_gamma.npy', 'wb') as o_f:
                    npsave(o_f, gamma_fb.process())

            if spectograms:
                # NOTE(review): noverlap equals NFFT here, which specgram
                # is expected to reject; a separate overlap constant is
                # probably intended -- confirm and replace.
                powerspec, _, _, _ = specgram(
                    sound,
                    NFFT=int(srate * specgram_window),
                    Fs=srate,
                    noverlap=int(srate * specgram_window))
                with open(stem + '_specgram.npy', 'wb') as o_f:
                    npsave(o_f, powerspec.T)
            if filterbanks:
                if fbanks is None:  # built once, from the first file's rate
                    fbanks = Spectral(nfilt=n_fbanks,
                                      alpha=0.97,
                                      do_dct=False,
                                      fs=srate,
                                      frate=fbanks_rate,
                                      wlen=fbanks_window,
                                      nfft=1024,
                                      do_deltas=False,
                                      do_deltasdeltas=False)
                fbank = fbanks.transform(sound)[0]
                with open(stem + '_fbanks.npy', 'wb') as o_f:
                    npsave(o_f, fbank)
            print("Dealt with the file  %s" % wavfname)
Example #12
0
# Module-level configuration for the TIMIT-to-numpy conversion script.
# TODO load the following parameters from wav_config
SAMPLING_RATE = 16000  # Hz
MFCC_TIMESTEP = 10  # 10 ms
HAMMING_SIZE = 25  # 25 ms
N_MFCC_COEFFS = 39  # as in Mohamed et al. / Dahl et al. (Hinton group) papers
N_FILTERBANK_COEFFS = 40  # as in Acoustic Modeling using Deep Belief Networks
# Mohamed et al.
# talkbox's mfcc is only imported when this flag is switched on
TALKBOX_FBANKS = False
if TALKBOX_FBANKS:
    from scikits.talkbox.features import mfcc as tbmfcc
DEBUG = False

N_GAMMATONES = 50  # c.f. http://www.briansimulator.org/docs/hears.html
# and http://www.briansimulator.org/docs/examples-hears_approximate_gammatone.html#example-hears-approximate-gammatone
# N_GAMMATONES centre frequencies from erbspace between 100 Hz and 1000 Hz,
# computed once at import time
center_frequencies = erbspace(100 * Hz, 1000 * Hz, N_GAMMATONES)

TEST = True  # test numpy serialization

# command-line help text
usage = """
    python timit_to_numpy.py MLF_FILENAME.mlf [--gamma]
output files are MLF_FILENAME_xdata.npy, MLF_FILENAME_xfbank.npy,
MLF_FILENAME_xgamma.npy, and MLF_FILENAME_ylabels.npy
    """


def compute_speed_and_accel(x):
    tmp_diff = np.pad(np.diff(x, axis=0), ((0, 1), (0, 0)),
                      'constant',
                      constant_values=(0.0, 0.0))
    tmp_accel = np.pad(np.diff(tmp_diff, axis=0), ((0, 1), (0, 0)),
Example #13
0
# Module-level configuration for the TIMIT-to-numpy conversion script.
# TODO load the following parameters from wav_config
SAMPLING_RATE = 16000 # Hz
MFCC_TIMESTEP = 10 # 10 ms
HAMMING_SIZE = 25 # 25 ms
N_MFCC_COEFFS = 39 # as in Mohamed et al. / Dahl et al. (Hinton group) papers
N_FILTERBANK_COEFFS = 40 # as in Acoustic Modeling using Deep Belief Networks
                         # Mohamed et al.
# talkbox's mfcc is only imported when this flag is switched on
TALKBOX_FBANKS = False
if TALKBOX_FBANKS:
    from scikits.talkbox.features import mfcc as tbmfcc
DEBUG = False

N_GAMMATONES = 50 # c.f. http://www.briansimulator.org/docs/hears.html
                  # and http://www.briansimulator.org/docs/examples-hears_approximate_gammatone.html#example-hears-approximate-gammatone
# N_GAMMATONES centre frequencies from erbspace between 100 Hz and 1000 Hz,
# computed once at import time
center_frequencies = erbspace(100*Hz, 1000*Hz, N_GAMMATONES)

TEST = True # test numpy serialization

# command-line help text
usage = """
    python timit_to_numpy.py MLF_FILENAME.mlf [--gamma]
output files are MLF_FILENAME_xdata.npy, MLF_FILENAME_xfbank.npy,
MLF_FILENAME_xgamma.npy, and MLF_FILENAME_ylabels.npy
    """


def compute_speed_and_accel(x):
    """Compute zero-padded first and second differences of `x` along axis 0.

    NOTE(review): no return statement is visible in this excerpt, so as
    shown the function returns None; the definition looks cut off here --
    confirm against the complete source.
    """
    # first difference along axis 0, padded with one trailing row of zeros
    # so that it has as many rows as x (the pad widths assume x is 2-D)
    tmp_diff = np.pad(np.diff(x, axis=0), ((0, 1), (0, 0)), 
            'constant', constant_values=(0.0, 0.0))
    # difference of the padded first difference, padded the same way
    tmp_accel = np.pad(np.diff(tmp_diff, axis=0), ((0, 1), (0, 0)), 
            'constant', constant_values=(0.0, 0.0))
from brian.hears import erbspace, Gammatone, Sound
from brian import Hz
from scipy.signal import hilbert
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics.pairwise import paired_distances
from collections import OrderedDict
import sys
import os
import scipy
import numpy as np

# (fmin, fmax, number of bands) used to build cfG below
kGfmin, kGfmax, kGbands = 26, 6950, 12
# (fmin, fmax, number of bands) used to build cfM below
kMfmin, kMfmax, kMbands = 0.5, 100, 12
# NOTE(review): presumably sample rates in Hz (kEsr for envelopes, kSR for
# audio) -- neither is used in this excerpt; confirm against the callers.
kEsr = 400
kSR = 16000
# ERB-spaced frequencies, computed once at import time
cfG = erbspace(kGfmin * Hz, kGfmax * Hz, kGbands)
cfM = erbspace(kMfmin * Hz, kMfmax * Hz, kMbands)

# Earlier hand-picked band edges, kept for reference:
# kEfmin = [0.5, 4.5, 10.5, 20.5]
# kEfmax = [4. , 10., 20., 100. ]

# Consecutive cfM values form the (lower, upper) edges of kMbands - 1 bands.
kEfmin = cfM[:-1]
kEfmax = cfM[1:]


def getGammatone(x, fmin, fmax, bands, sr):
    """Filter `x` with an ERB-spaced gammatone filterbank.

    `bands` centre frequencies are taken from ``erbspace`` between `fmin`
    and `fmax` (Hz); `x` is wrapped in a ``Sound`` sampled at `sr` Hz and
    the filterbank output is returned.
    """
    centre_freqs = erbspace(fmin * Hz, fmax * Hz, bands)
    signal = Sound(x, samplerate=sr * Hz)
    return Gammatone(signal, centre_freqs).process()
Example #15
0
def process(
    folder,
    debug=False,
    htk_mfc=False,
    forcemfcext=False,
    stereo_wav=False,
    gammatones=False,
    spectrograms=False,
    filterbanks=False,
    sox=True,
):
    """Extract features for all *.wav files found under `folder`.

    debug : unused in this function
    htk_mfc : run HCopy (configured by "wav_config") to produce MFCC files
    forcemfcext : always use the ".mfc" extension for the HCopy output
    stereo_wav : wav files are stereo; the first two channels are summed
    gammatones : save gammatone filterbank output to "<name>_gamma.npy"
    spectrograms : save the power spectrogram to "<name>_specgram.npy"
    filterbanks : save Mel filterbanks to "<name>_fbanks.npy"
    sox : re-encode each file with sox first, keeping the original as
        "<name>.rawaudio"

    Exits the interpreter with status -1 when an optional dependency
    needed for a requested feature cannot be imported.
    """

    # first find if we produce normalized MFCC, otherwise note it in the ext
    # because we can then normalize on the whole corpus with another py script
    mfc_extension = ".mfc_unnorm"
    with open("wav_config", "r") as wcfg:
        if any("ENORMALISE" in line for line in wcfg):
            mfc_extension = ".mfc"
    if forcemfcext:
        mfc_extension = ".mfc"
    print("MFC extension: %s" % mfc_extension)
    if gammatones:
        try:
            from brian import Hz, kHz
            from brian.hears import loadsound, erbspace, Gammatone
        except ImportError:
            sys.stderr.write("You need Brian Hears\n")
            # the URL used to be split with a backslash inside the string
            # literal, which embedded the next line's indentation in it
            sys.stderr.write(
                "http://www.briansimulator.org/docs/hears.html\n")
            sys.exit(-1)
    if spectrograms:
        try:
            from pylab import specgram
        except ImportError:
            sys.stderr.write("You need Pylab\n")
            sys.exit(-1)
    fbanks = None
    if filterbanks:
        try:
            sys.path.append("../spectral")
            from spectral import Mel
        except ImportError:
            sys.stderr.write("You need spectral (in the parent folder)\n")
            sys.stderr.write("https://github.com/mwv/spectral\n")
            sys.exit(-1)

    # run through all the folders and files in the path "folder"
    # and put a header to the waves, save the originals as .rawaudio
    # use HCopy to produce MFCC files according to "wav_config" file
    for bdir, _, files in os.walk(folder):
        for fname in files:
            if fname[-4:] != ".wav":
                continue
            stem = bdir + "/" + fname[:-4]
            wavfname = bdir + "/" + fname
            if sox:
                # temp fname with .wav: w/o headers, sox uses the extension
                tempfname = stem + "_temp.wav"
                shutil.move(wavfname, tempfname)
                call(["sox", tempfname, wavfname])
                shutil.move(tempfname, stem + ".rawaudio")
            if htk_mfc:
                call(["HCopy", "-C", "wav_config", wavfname,
                      stem + mfc_extension])
            srate, sound = wavfile.read(wavfname)
            if stereo_wav and len(sound.shape) == 2:  # in mono sound is a list
                # for stereo wav, sum both channels.  sum() accumulates
                # small integer dtypes in the platform integer, so int16
                # samples no longer wrap around as with ``left + right``.
                sound = sound[:, :2].sum(axis=1)
            if gammatones:
                tmp_snd = loadsound(wavfname)
                gamma_cf = erbspace(20 * Hz, 20 * kHz, N_GAMMATONES_FILTERS)
                gamma_fb = Gammatone(tmp_snd, gamma_cf)
                # .npy is a binary format: open in "wb", not text mode
                with open(stem + "_gamma.npy", "wb") as o_f:
                    npsave(o_f, gamma_fb.process())
            if spectrograms:
                powerspec, _, _, _ = specgram(
                    sound,
                    NFFT=int(srate * SPECGRAM_WINDOW),
                    Fs=srate,
                    noverlap=int(srate * SPECGRAM_OVERLAP),
                )  # TODO
                with open(stem + "_specgram.npy", "wb") as o_f:
                    npsave(o_f, powerspec.T)
            if filterbanks:
                # convert to Mel filterbanks
                if fbanks is None:  # assume parameters are fixed
                    fbanks = Mel(
                        nfilt=N_FBANKS,  # nb of filters in mel bank
                        alpha=0.97,  # pre-emphasis
                        fs=srate,  # sampling rate
                        frate=FBANKS_RATE,  # frame rate
                        wlen=FBANKS_WINDOW,  # window length
                        nfft=1024,  # length of dft
                        mel_deltas=False,  # speed
                        mel_deltasdeltas=False,  # acceleration
                    )
                fbank = fbanks.transform(sound)[0]  # first dimension is for
                # deltas & deltasdeltas
                with open(stem + "_fbanks.npy", "wb") as o_f:
                    npsave(o_f, fbank)
            # TODO wavelets scattergrams / scalograms
            print("dealt with file %s" % wavfname)
def getGammatone(x, fmin, fmax, bands, sr):
    """Run ``x`` through a gammatone filterbank and return its output.

    The centre frequencies are obtained from ``erbspace`` between ``fmin``
    and ``fmax`` (both multiplied by ``Hz``), ``bands`` of them in total.

    Parameters
    ----------
    x : array-like
        Signal handed to ``Sound``.
    fmin, fmax : number
        Lower and upper ends of the centre-frequency range; scaled by ``Hz``
        before being passed to ``erbspace``.
    bands : int
        Number of centre frequencies requested from ``erbspace``.
    sr : number
        Sample rate given to ``Sound``; scaled by ``Hz``.

    Returns
    -------
    The result of ``Gammatone(...).process()`` for the filterbank built
    from the signal and the centre frequencies.
    """
    centre_freqs = erbspace(fmin * Hz, fmax * Hz, bands)
    signal = Sound(x, samplerate=sr * Hz)
    return Gammatone(signal, centre_freqs).process()
Example #17
0
    def __init__(self, which_set, frame_length, overlap=0,
                 n_channels=64, frames_per_example=1, start=0,
                 stop=None, audio_only=False, n_prev_phones=0,
                 n_next_phones=0, samples_to_predict=1,
                 filter_fn=None, rng=_default_seed, gtfb_data_path='/home/jfsantos/data/pylearn2data/timit/readable'):
        """
        Load the sequences of one split, standardize them, attach phone
        context to every frame and build the data specs / map functions.

        Parameters
        ----------
        which_set : str
            Either "train", "valid" or "test"
        frame_length : int
            Number of acoustic samples contained in a frame
        overlap : int, optional
            Number of overlapping acoustic samples for two consecutive frames.
            Defaults to 0, meaning frames don't overlap.
        n_channels : int, optional
            Number of ERB-spaced centre frequencies stored in `self.fc`; it
            is also the dimension of one frame in the features and targets
            spaces. Defaults to 64.
        frames_per_example : int, optional
            Number of frames in a training example. Defaults to 1.
        start : int, optional
            Starting index of the sequences to use. Defaults to 0.
        stop : int, optional
            Ending index of the sequences to use. Defaults to `None`, meaning
            sequences are selected all the way to the end of the array.
        audio_only : bool, optional
            Whether to load only the raw audio and no auxiliary information.
            Defaults to `False`.
        n_prev_phones : int, optional
            Number of preceding phones attached to each frame's phone label.
            Defaults to 0.
        n_next_phones : int, optional
            Number of following phones attached to each frame's phone label.
            Defaults to 0.
        samples_to_predict : int, optional
            Stored on the instance; not otherwise used by this constructor.
            Defaults to 1.
        filter_fn : str, optional
            Source text of a Python expression evaluating to a callable. The
            callable receives `self.speaker_info_list[self.speaker_id]` and
            must return indexes used to select sequences. The string is
            passed to `eval`, so it must come from a trusted source.
            Defaults to `None` (no filtering).
        rng : object, optional
            A random number generator used for picking random indices into the
            design matrix when choosing minibatches. An object without a
            `random_integers` attribute is used as the seed of a new
            `numpy.random.RandomState`.
        gtfb_data_path : str, optional
            Location handed to `self._load_data` together with `which_set`.
        """
        self.frame_length = frame_length
        self.overlap = overlap
        self.frames_per_example = frames_per_example
        self.offset = self.frame_length - self.overlap
        self.audio_only = audio_only
        self.n_prev_phones = n_prev_phones
        self.n_next_phones = n_next_phones
        self.samples_to_predict = samples_to_predict
        self.n_channels = n_channels
        # RNG initialization
        if hasattr(rng, 'random_integers'):
            self.rng = rng
        else:
            self.rng = numpy.random.RandomState(rng)

        self.fc = erbspace(80*hertz, 5*khertz, self.n_channels)

        # Load data from disk
        self._load_data(which_set, gtfb_data_path)
        # Standardize data
        for i, sequence in enumerate(self.raw_wav):
            self.raw_wav[i] = (sequence - TIMITGTFB._mean) / TIMITGTFB._std

        if filter_fn is not None:
            # SECURITY: eval() executes arbitrary code; filter_fn must only
            # ever come from a trusted configuration.
            filter_fn = eval(filter_fn)
            indexes = filter_fn(self.speaker_info_list[self.speaker_id])
            self.raw_wav = self.raw_wav[indexes]
            if not self.audio_only:
                self.phones = self.phones[indexes]

        # Slice data (a `stop` of None selects everything from `start` on)
        self.raw_wav = self.raw_wav[start:stop]
        if not self.audio_only:
            self.phones = self.phones[start:stop]

        # Leading 0 so that the cumulative sum below yields, for every
        # sequence, the global index of its first example.
        examples_per_sequence = [0]
        self.phone_rel_dur = []

        for sequence_id, samples_sequence in enumerate(self.raw_wav):
            if not self.audio_only:
                tot_n_frames = samples_sequence.shape[0]
                # Phones segmentation: collapse the label sequence into runs
                # of (phone, run length).
                phones_sequence = self.phones[sequence_id]
                runs = [(phone, len(list(group)))
                        for phone, group in itertools.groupby(phones_sequence)]
                phone_list = numpy.asarray([phone for phone, _ in runs])
                phone_duration = [duration for _, duration in runs]
                # phone_position[i] is the index one past the end of run i
                phone_position = numpy.cumsum(phone_duration)
                # NOTE(review): the step here is self.overlap, which is 0 by
                # default and is not a usable arange step; confirm whether
                # self.offset (frame_length - overlap) was intended.
                frame_position = numpy.arange(0, tot_n_frames*self.overlap, self.overlap)
                # Column layout: [prev phones..., current phone, next phones...]
                seq_phones = numpy.empty((tot_n_frames, 1+self.n_prev_phones+self.n_next_phones), dtype=int)
                phone_rel_dur = numpy.empty(tot_n_frames, dtype=float)
                for frame in range(tot_n_frames):
                    # First run whose end lies beyond this frame's position
                    cur_phone_idx = (frame_position[frame] < phone_position).argmax()
                    # Position of the frame inside its phone, relative to
                    # the phone's duration
                    if cur_phone_idx == 0:
                        phone_rel_dur[frame] = frame_position[frame]/float(phone_duration[cur_phone_idx])
                    else:
                        phone_rel_dur[frame] = (frame_position[frame] - phone_position[cur_phone_idx-1])/float(phone_duration[cur_phone_idx])
                    if self.n_prev_phones > 0:
                        if cur_phone_idx - self.n_prev_phones < 0:
                            seq_phones[frame, 0:self.n_prev_phones] = 5  # code for silent frame
                        else:
                            seq_phones[frame, 0:self.n_prev_phones] = phone_list[cur_phone_idx-self.n_prev_phones:cur_phone_idx]  # prev phones
                    if self.n_next_phones > 0:
                        if cur_phone_idx + self.n_next_phones >= len(phone_list):
                            seq_phones[frame, -self.n_next_phones:] = 5  # code for silent frame
                        else:
                            # Fill the whole trailing slice (not the single
                            # element at -n_next_phones) with the next phones.
                            seq_phones[frame, -self.n_next_phones:] = phone_list[cur_phone_idx+1:cur_phone_idx+self.n_next_phones+1]  # next phones
                    seq_phones[frame, self.n_prev_phones] = phone_list[cur_phone_idx]
                self.phone_rel_dur.append(phone_rel_dur)
                self.phones[sequence_id] = seq_phones

            # TODO: look at this, does it force copying the data?
            # Sequence segmentation
            # s = Sound(samples_sequence, samplerate=16*khertz)
            # fb = Gammatone(s, self.fc)
            # y = fb.process()
            # channel_energy = []
            # Compute energy per channel
            # for ch in range(self.n_channels):
            #     y_ch = segment_axis(y[:,ch], frame_length, overlap)
            #     y_energy = numpy.sum(y_ch**2, axis=1)
            #     channel_energy.append(y_energy)

            # channel_energy = numpy.vstack(channel_energy).T
            # channel_energy = samples_sequence
            # if self.n_next_phones == 0:
            #     self.raw_wav[sequence_id] = channel_energy[self.n_prev_phones:]
            # else:
            #     self.raw_wav[sequence_id] = channel_energy[self.n_prev_phones:-self.n_next_phones]

            # TODO: change me
            # Generate features/targets/phones/phonemes/words map
            num_frames = samples_sequence.shape[0]-(self.n_prev_phones+self.n_next_phones)
            # The frame right after each example window is its target (see
            # targets_map_fn), hence one window fewer than frames.
            num_examples = num_frames - self.frames_per_example
            examples_per_sequence.append(num_examples)

        self.cumulative_example_indexes = numpy.cumsum(examples_per_sequence)
        self.samples_sequences = self.raw_wav
        # numpy.save('%s_gtfb_%sch.npy'%(which_set, str(self.n_channels)), self.samples_sequences)
        if not self.audio_only:
            self.phones_sequences = self.phones
        self.num_examples = self.cumulative_example_indexes[-1]

        # DataSpecs
        features_space = VectorSpace(
            dim=self.n_channels * self.frames_per_example
        )
        features_source = 'features'

        def features_map_fn(indexes):
            # One flattened window of frames_per_example frames per example
            rval = []
            for sequence_index, example_index in self._fetch_index(indexes):
                rval.append(self.samples_sequences[sequence_index][example_index:example_index + self.frames_per_example].ravel())
            return rval

        targets_space = VectorSpace(dim=self.n_channels)
        targets_source = 'targets'

        def targets_map_fn(indexes):
            # The frame immediately following the example window
            rval = []
            for sequence_index, example_index in self._fetch_index(indexes):
                rval.append(self.samples_sequences[sequence_index][example_index + self.frames_per_example])
            return rval

        space_components = [features_space, targets_space]
        source_components = [features_source, targets_source]
        map_fn_components = [features_map_fn, targets_map_fn]
        batch_components = [None, None]

        if not self.audio_only:
            num_phones = numpy.max([numpy.max(sequence) for sequence
                                    in self.phones]) + 1
            phones_space = IndexSpace(max_labels=num_phones, dim=1+self.n_prev_phones+self.n_next_phones,
                                      dtype=str(self.phones_sequences[0].dtype))
            phones_source = 'phones'

            def phones_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(indexes):
                    rval.append(self.phones_sequences[sequence_index][example_index].ravel())
                return rval

            phone_rel_dur_space = VectorSpace(dim=1)
            phone_rel_dur_source = 'phone_rel_dur'

            def phone_rel_dur_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(indexes):
                    rval.append(self.phone_rel_dur[sequence_index][example_index])
                return rval

            space_components.extend([phones_space, phone_rel_dur_space])
            source_components.extend([phones_source, phone_rel_dur_source])
            map_fn_components.extend([phones_map_fn, phone_rel_dur_map_fn])
            batch_components.extend([None, None])

        space = CompositeSpace(space_components)
        source = tuple(source_components)
        self.data_specs = (space, source)
        self.map_functions = tuple(map_fn_components)
        self.batch_buffers = batch_components

        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('shuffled_sequential')
        self._iter_data_specs = (CompositeSpace((features_space,
                                                 targets_space)),
                                 (features_source, targets_source))