Example #1
def load_data_shared(ind):

    #Training and testing data
    timit_data_train = genfromtxt('timit_data_1280_train.csv', delimiter=',')
    timit_vwlname_train = genfromtxt('timit_vwlname_1280_train.csv', delimiter=',')
    timit_vwlname_train[:] = [x - 1 for x in timit_vwlname_train]
    timit_data_test = genfromtxt('timit_data_1280_test.csv', delimiter=',')
    timit_vwlname_test = genfromtxt('timit_vwlname_1280_test.csv', delimiter=',')   
    timit_vwlname_test[:] = [x - 1 for x in timit_vwlname_test]

    fs = 16000
    datalen = 1280
    narr = np.array([13, 26, 39])  # number of features in each frame
    i = 0
    j = 0
    
    trainfeature=np.zeros((len(timit_data_train), (datalen*100/fs - 1)*narr[ind]))
    for x in timit_data_train:
        fbank_flat = logfbank(x,fs).flatten()
        mfcc_flat = mfcc(x,fs).flatten()
        if ind == 0:
            trainfeature[i,:] = mfcc_flat
        elif ind == 1:
            trainfeature[i,:] = fbank_flat
        else:
            trainfeature[i,:] = np.concatenate((mfcc_flat, fbank_flat))
        i = i+1
        
    testfeature=np.zeros((len(timit_data_test), (datalen*100/fs - 1)*narr[ind]))
    for x in timit_data_test:
        fbank_flat = logfbank(x,fs).flatten()
        mfcc_flat = mfcc(x,fs).flatten()
        if ind == 0:
            testfeature[j,:] = mfcc_flat
        elif ind == 1:
            testfeature[j,:] = fbank_flat
        else:
            testfeature[j,:] = np.concatenate((mfcc_flat, fbank_flat))
        j = j+1

    training_data = (trainfeature, timit_vwlname_train)
    test_data = (testfeature, timit_vwlname_test)

    # For now, I am using test data as validating data. Should change later.
    validation_data = test_data

    def shared(data):
        """Place the data into shared variables.  This allows Theano to copy
        the data to the GPU, if one is available.

        """
        shared_x = theano.shared(
            np.asarray(data[0], dtype=theano.config.floatX), borrow=True)
        shared_y = theano.shared(
            np.asarray(data[1], dtype=theano.config.floatX), borrow=True)
        return shared_x, T.cast(shared_y, "int32")
    
    return [shared(training_data), shared(validation_data), shared(test_data)]
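A minimal usage sketch (not part of the original example), assuming the four TIMIT CSV files named above exist and Theano is importable; ind selects the feature type (0 for MFCC, 1 for filterbank, 2 for both):

# Hypothetical call: load MFCC features (ind=0) as Theano shared variables.
training_data, validation_data, test_data = load_data_shared(ind=0)
shared_x, shared_y = training_data  # floatX feature matrix, int32 label vector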
Example #2
def svm_baseline():

    #### Change here
    ind = 0  # 0 for mfcc, 1 for filterbank, 2 for both
    narr = np.array([13, 26, 39])  # corresponding length of feature in a frame


    #Training and testing data
    timit_data_train = genfromtxt('timit_data_1280_train.csv', delimiter=',')
    timit_vwlname_train = genfromtxt('timit_vwlname_1280_train.csv', delimiter=',')
    timit_vwlname_train[:] = [x - 1 for x in timit_vwlname_train]
    timit_data_test = genfromtxt('timit_data_1280_test.csv', delimiter=',')
    timit_vwlname_test = genfromtxt('timit_vwlname_1280_test.csv', delimiter=',')     
    timit_vwlname_test[:] = [x - 1 for x in timit_vwlname_test]

    fs = 16000
    datalen = 1280
    i = 0
    j = 0
    trainfeature=np.zeros((len(timit_data_train), (datalen*100/fs - 1)*narr[ind]))
    for x in timit_data_train:
        fbank_flat = logfbank(x,fs).flatten()
        mfcc_flat = mfcc(x,fs).flatten()
        if ind == 0:
            trainfeature[i,:] = mfcc_flat
        elif ind == 1:
            trainfeature[i,:] = fbank_flat
        else:
            trainfeature[i,:] = np.concatenate((mfcc_flat, fbank_flat))
        i = i+1
        
    testfeature=np.zeros((len(timit_data_test), (datalen*100/fs - 1)*narr[ind]))
    for x in timit_data_test:
        fbank_flat = logfbank(x,fs).flatten()
        mfcc_flat = mfcc(x,fs).flatten()
        if ind == 0:
            testfeature[j,:] = mfcc_flat
        elif ind == 1:
            testfeature[j,:] = fbank_flat
        else:
            testfeature[j,:] = np.concatenate((mfcc_flat, fbank_flat))
        j = j+1

    training_data = (list(trainfeature), timit_vwlname_train)
    test_data = (list(testfeature), timit_vwlname_test)


    # train
    clf = svm.SVC()
    clf.fit(training_data[0], training_data[1])
    # test
    predictions = [int(a) for a in clf.predict(test_data[0])]
    num_correct = sum(int(a == y) for a, y in zip(predictions, test_data[1]))
    print "Using svm_baseline classifier:"
    print "%s of %s values correct.  %s percent " % (num_correct, len(test_data[1]),
        (num_correct*100)/len(test_data[1]))
Example #3
def getFeatures(signal, rate):
	"""
	Extracts Important Vocal Features

	author: chris
	"""
	if signal.shape[0] > mem_cut_off:
		mfcc,fbank = getFeatures(signal[mem_cut_off:], rate)
		return np.concatenate((fs.mfcc(signal[:mem_cut_off],rate),mfcc)), np.concatenate((fs.logfbank(signal[:mem_cut_off],rate),fbank))
	else:
		return fs.mfcc(signal,rate), fs.logfbank(signal,rate)
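A short usage sketch for the recursive chunking above, assuming fs is the python_speech_features module and mem_cut_off is a module-level sample count (both assumptions; neither is shown in this example):

import scipy.io.wavfile as wav
import python_speech_features as fs  # assumed alias for the example's fs

mem_cut_off = 160000  # hypothetical chunk size: 10 s at 16 kHz
rate, signal = wav.read('speech.wav')  # hypothetical mono input file
mfcc_feat, fbank_feat = getFeatures(signal, rate)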
Example #4
def getFeatures(signal, rate):
    """
	Extracts Important Vocal Features

	author: chris
	"""
    if signal.shape[0] > mem_cut_off:
        mfcc, fbank = getFeatures(signal[mem_cut_off:], rate)
        return np.concatenate(
            (fs.mfcc(signal[:mem_cut_off], rate), mfcc)), np.concatenate(
                (fs.logfbank(signal[:mem_cut_off], rate), fbank))
    else:
        return fs.mfcc(signal, rate), fs.logfbank(signal, rate)
Example #5
def compute_mfb(filename):
    '''Compute Mel filterbank features on a song and store them in a binary file with the Numpy format.
    
    Argument :
        filename: filename of the wav file located in settings.DIR_SONGS (without path, without wav extension)
    
    Returns: 0 if success
    
    The output file is located in settings.DIR_MEL_FEATURES.
    '''
    tmax = settings.TMAX

    (rate, sig) = wav.read(settings.DIR_SONGS + filename + '.wav')

    if rate != 44100:
        print 'Warning : the rate is not 44100.'

    nSamples, nChannels = sig.shape
    if nChannels != 2:
        print 'Warning : the number of channels is not 2.'
    if nSamples > rate * tmax:
        sig = sig[:rate * tmax, :]  # keep only the first tmax seconds (for memory)

    sig = sig.mean(1)

    #mfcc_feat = mfcc(sig,rate)
    fbank_feat = logfbank(sig, rate)

    numpy.save(settings.DIR_MEL_FEATURES + filename + '.npy', fbank_feat)
    numpy.save(settings.DIR_SAMPLE_RATE + filename + '.npy', rate)

    return 0
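A hypothetical call, assuming a settings module that defines DIR_SONGS, DIR_MEL_FEATURES, DIR_SAMPLE_RATE and TMAX as used above:

status = compute_mfb('mysong')  # reads DIR_SONGS/mysong.wav, writes DIR_MEL_FEATURES/mysong.npy
assert status == 0  # per the docstring, 0 signals success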
Example #6
def compute_dynamic_selected_features(filename):
    
    melFeatures = numpy.load(settings.DIR_MEL_FEATURES + filename + '.npy')
    tmax = settings.TMAX
    
    nPoints, nChannels = melFeatures.shape
    if nChannels != 26:
        print "Warning : 26 channels expected"
    
    nChannelsPerChannel = 13
    
    #timeSize = stft.stft_time_size(melFeatures[:,0], settings.FFT_SIZE, settings.OVERLAP)
    #dynamic_selected_features = numpy.zeros((timeSize, nChannels / 2 * nChannelsPerChannel))
    dynamic_selected_features = []
    
    for i in range(nChannels / 2):
        #dynamic_selected_features[:, i*nChannelsPerChannel:(i+1)*nChannelsPerChannel] = logfbank(melFeatures[:,i],100,settings.FFT_SIZE, settings.FFT_SIZE / settings.OVERLAP)[:,:13]
        A = logfbank(melFeatures[:,i],nPoints/tmax,settings.FFT_SIZE, float(settings.FFT_SIZE) / settings.OVERLAP)[:,:13]
        if i == 0:
            dynamic_selected_features = A
        else:
            dynamic_selected_features = numpy.append(dynamic_selected_features,A,axis=1)
            
    dynamic_selected_features = numpy.transpose(dynamic_selected_features)
    
    nFeatures, timeSize = dynamic_selected_features.shape
    
    featureVar = numpy.sqrt(abs(dynamic_selected_features*dynamic_selected_features).mean(1))
    dynamic_selected_features = dynamic_selected_features/numpy.tile(featureVar.reshape((nFeatures,1)), (1,timeSize))
        
    numpy.save(settings.DIR_SELECTED_FEATURES + filename + '.npy', dynamic_selected_features)   
    return dynamic_selected_features
        
Example #7
def compute_mfb(filename):
    '''Compute Mel filterbank features on a song and store them in a binary file with the Numpy format.
    
    Argument :
        filename: filename of the wav file located in settings.DIR_SONGS (without path, without wav extension)
    
    Returns: 0 if success
    
    The output file is located in settings.DIR_MEL_FEATURES.
    '''
    tmax = settings.TMAX
    
    (rate,sig) = wav.read(settings.DIR_SONGS + filename + '.wav')
    
    if rate != 44100:
        print 'Warning : the rate is not 44100.'

    nSamples, nChannels = sig.shape
    if nChannels != 2:
        print 'Warning : the number of channels is not 2.'
    if nSamples > rate*tmax:
        sig = sig[:rate*tmax,:]  # keep only the first tmax seconds (for memory)
        
    sig = sig.mean(1)
    
    #mfcc_feat = mfcc(sig,rate)
    fbank_feat = logfbank(sig,rate)
    
    numpy.save(settings.DIR_MEL_FEATURES + filename + '.npy', fbank_feat)
    numpy.save(settings.DIR_SAMPLE_RATE + filename + '.npy', rate)
    
    return 0
Example #8
def training():
    '''
    Takes input signal and searches current dataset for hit.
    If hit, then add to correct dataset.
    If miss, asks user for correct input and adds to dataset.
    '''

    print("please speak a word into the microphone")
    record_to_file('training.wav')
    print("done - result written to training.wav")

    (rate, sig) = wav.read("training.wav")

    mfcc_feat = mfcc(sig, rate)
    fbank_feat = logfbank(sig, rate)

    recording = fbank_feat[1:3, :]

    testing = check_for_match(recording)

    verify = raw_input("did you say " + testing + " ")

    if verify == 'y':
        parse_array(recording, testing)

    if verify == 'n':
        correct_word = raw_input("what word did you mean? ")
        print correct_word
        parse_array(recording, correct_word)
Example #9
def LogFBank(data, samp):

    mfcc_feat = logfbank(data,samp)
    mMin = mfcc_feat.min()
    mMax = mfcc_feat.max()
    mfcc_feat -= mMin
    mfcc_feat *= 255/mfcc_feat.max()
    outImg = np.array(mfcc_feat, np.uint8)
    return outImg
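Because the function returns a uint8 array, the result can be written out directly as a grayscale image; a sketch assuming Pillow is installed (a visualization-only dependency, not part of the example):

import scipy.io.wavfile as wav
from PIL import Image  # assumed dependency for visualization

rate, data = wav.read('speech.wav')  # hypothetical input file
Image.fromarray(LogFBank(data, rate)).save('logfbank.png')  # frames x 26 grayscale image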
Example #10
    def generate(self,testsample):
        (rate,audio) = wav.read(testsample.path)

        # grab first channel
        one_channel = _extract_single_channel(audio)
        N = len(audio)
        fbank_feat = logfbank(one_channel,samplerate=rate) #winlen=1.0
        cols=fbank_feat.shape[0]*fbank_feat.shape[1]
        return fbank_feat.reshape((1,cols))[0]
Example #11
def LogFBank(data, samp):

    mfcc_feat = logfbank(data, samp)
    mMin = mfcc_feat.min()
    mMax = mfcc_feat.max()
    mfcc_feat -= mMin
    mfcc_feat *= 255 / mfcc_feat.max()
    outImg = np.array(mfcc_feat, np.uint8)
    return outImg
Example #12
def fbank_feature_extractor(wav_file_path):
    '''
    Extracts mfcc features for the wav file
    '''

    # Extracts mfcc features every 1/200th of a second.
    (rate, sig) = wav.read(wav_file_path)
    fbank_feat = logfbank(sig, rate)

    return fbank_feat
Example #13
def analyzeLogBinergy(grain):
    windowSize = int(float(grain["frameCount"]))
    (rate, sig) = wav.read(grain["file"])
    windowedSignal = numpy.multiply(signal.hamming(windowSize), sig)  # note: computed but unused below
    energies = logfbank(signal=sig,
                        samplerate=rate,
                        winlen=.020,
                        winstep=.020,
                        nfilt=13,
                        nfft=windowSize)
    return energies.tolist()[0]
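The grain argument is expected to be a dict with at least "file" and "frameCount" keys; a hypothetical call:

grain = {"file": "grain.wav", "frameCount": "512"}  # hypothetical grain record
binergies = analyzeLogBinergy(grain)  # the 13 log filterbank energies of the first window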
Example #14
def sndFeature(snd, graph = False):
    #normalize rms here
    snd /= float(np.linalg.norm(snd))
    ft_mfcc = mfcc(snd, samplerate=sampFreq, nfilt=26, numcep=13)[0]
    ft_logf = logfbank(snd, sampFreq)[0]
    #print '{}\n*******\n{}'.format(ft_mfcc, ft_logf)
    #raw_input()
    ft = np.hstack((ft_mfcc, ft_logf))
    #print ft
    #raw_input()
    return ft
Example #15
def sndFeature(snd, graph=False):
    #normalize rms here
    snd /= float(np.linalg.norm(snd))
    ft_mfcc = mfcc(snd, samplerate=sampFreq, nfilt=26, numcep=13)[0]
    ft_logf = logfbank(snd, sampFreq)[0]
    #print '{}\n*******\n{}'.format(ft_mfcc, ft_logf)
    #raw_input()
    ft = np.hstack((ft_mfcc, ft_logf))
    #print ft
    #raw_input()
    return ft
Example #16
def get_track_features(track_name):
    (rate,sig) = wav.read(track_name)
    mfcc_feat = mfcc(sig,rate)
    fbank_feat = logfbank(sig,rate)

    num_segments = len(mfcc_feat)
    num_features = len(mfcc_feat[0])

    features_mean = _get_features_mean(mfcc_feat)
    cov_mat = _get_covariance_matrix(mfcc_feat)

    return (features_mean, cov_mat)
Example #17
def get_track_features(track_name):
    (rate, sig) = wav.read(track_name)
    mfcc_feat = mfcc(sig, rate)
    fbank_feat = logfbank(sig, rate)

    num_segments = len(mfcc_feat)
    num_features = len(mfcc_feat[0])

    features_mean = _get_features_mean(mfcc_feat)
    cov_mat = _get_covariance_matrix(mfcc_feat)

    return (features_mean, cov_mat)
Example #18
 def frequency_banks(self, blockSize=600):
     if self.signal is None:
         self.read_recording()
     fbanks = numpy.zeros((0, 1, 26))
     start = 0
     while start < len(self.signal):
         end = start + blockSize * self.samplerate
         end = end if end < len(self.signal) else len(self.signal)
         block = self.signal[start:end]
         fbank = logfbank(block, self.samplerate, winlen=0.05, winstep=0.025)
         fbanks = numpy.concatenate((fbanks, numpy.reshape(fbank, (len(fbank), 1, 26))))
         start = end
     return fbanks
Example #19
def make_dataset():
    files = sorted(os.listdir(file_path))
    data_mfcc, data_lmfb, target = [], [], []
    for file_name in files:
        target += [int(x) for x in file_name[:-4].split('_')]
        segments = segmentation(file_name, play=False, display=False)
        for segment in segments:
            data_mfcc.append(mfcc(segment, samplerate=Fs))
            data_lmfb.append(logfbank(segment, samplerate=Fs))
    f = open('dataset.pkl', 'wb')  # 'wr+' is not a valid mode; pickle needs binary write
    pickle.dump({'data_mfcc': np.array(data_mfcc),
                'data_lmfb': np.array(data_lmfb),
                'target': np.array(target)}, f)
    f.close()
    return {'data_mfcc': np.array(data_mfcc),
            'data_lmfb': np.array(data_lmfb),
            'target': np.array(target)}
Example #20
def compute_fbank(sig, rate, winlen=0.025,winstep=0.01, nfilt=39, nfft=512, lowfreq=0, highfreq=None, preemph=0.97, include_energy=True, snip_edges = True):
	
	if snip_edges:
		#snip the edges
		sig = snip(sig, rate, winlen, winstep)
	
	#compute fbank features and energy: logfbank returns only the log features,
	#so use fbank (assumed imported from the same package), which also returns
	#the per-frame energies, and take the log
	(feat, energy) = fbank(sig, rate, winlen, winstep, nfilt, nfft, lowfreq, highfreq, preemph)
	feat = np.log(feat)
	
	if include_energy:
		#append the energy
		fbank_feat = np.ndarray(shape=(feat.shape[0], feat.shape[1] + 1))
		fbank_feat[:,0:feat.shape[1]] = feat
		fbank_feat[:,feat.shape[1]] = energy
	else:
		fbank_feat = feat		
	
	return fbank_feat
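A usage sketch for compute_fbank, assuming the snip helper used above is importable (it is not shown in this example) and a 16 kHz mono wav file:

import scipy.io.wavfile as wav

rate, sig = wav.read('utterance.wav')  # hypothetical input file
feat = compute_fbank(sig, rate)  # shape (num_frames, 40): 39 log filterbank values plus energy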
Example #21
def build_codebook(
        trgfile,
        codesize=32,
        fname=None
):  # given a training file constructs the codebook using kmeans
    (rate, sig) = wav.read(trgfile)
    print rate, sig.shape
    #get the spectral vectors
    print("MFCC generation begins")
    mfcc_feat = mfcc(sig, rate)
    print("MFCC generation ends")
    print mfcc_feat.shape
    print("Fbank creation begins")
    fbank_feat = logfbank(sig, rate)  #this has the spectral vectors now
    print("Fbank creation ends")
    print fbank_feat.shape
    print "codesize = ", codesize
    km = KMeans(n_clusters=codesize)
    km.fit(fbank_feat)
    if fname is not None:
        pickle.dump(km, open(fname, 'wb'))
    return km
Example #22
def short_to_mfcc(signal):

    global sampling

    mfcc_features = mfcc(signal,
                         samplerate=sampling,
                         winlen=0.025,
                         winstep=0.01,
                         numcep=13,
                         nfilt=26,
                         nfft=512,
                         lowfreq=0,
                         highfreq=1000,
                         preemph=0.97,
                         ceplifter=22,
                         appendEnergy=True)
    fbank_features = logfbank(signal, sampling)

    #print(fbank_features[1:3,:])

    #return fbank_features[1:3,:]
    return fbank_features[1:2, :]
Example #23
def compute_dynamic_selected_features(filename):

    melFeatures = numpy.load(settings.DIR_MEL_FEATURES + filename + '.npy')
    tmax = settings.TMAX

    nPoints, nChannels = melFeatures.shape
    if nChannels != 26:
        print "Warning : 26 channels expected"

    nChannelsPerChannel = 13

    #timeSize = stft.stft_time_size(melFeatures[:,0], settings.FFT_SIZE, settings.OVERLAP)
    #dynamic_selected_features = numpy.zeros((timeSize, nChannels / 2 * nChannelsPerChannel))
    dynamic_selected_features = []

    for i in range(nChannels / 2):
        #dynamic_selected_features[:, i*nChannelsPerChannel:(i+1)*nChannelsPerChannel] = logfbank(melFeatures[:,i],100,settings.FFT_SIZE, settings.FFT_SIZE / settings.OVERLAP)[:,:13]
        A = logfbank(melFeatures[:, i], nPoints / tmax, settings.FFT_SIZE,
                     float(settings.FFT_SIZE) / settings.OVERLAP)[:, :13]
        if i == 0:
            dynamic_selected_features = A
        else:
            dynamic_selected_features = numpy.append(dynamic_selected_features,
                                                     A,
                                                     axis=1)

    dynamic_selected_features = numpy.transpose(dynamic_selected_features)

    nFeatures, timeSize = dynamic_selected_features.shape

    featureVar = numpy.sqrt(
        abs(dynamic_selected_features * dynamic_selected_features).mean(1))
    dynamic_selected_features = dynamic_selected_features / numpy.tile(
        featureVar.reshape((nFeatures, 1)), (1, timeSize))

    numpy.save(settings.DIR_SELECTED_FEATURES + filename + '.npy',
               dynamic_selected_features)
    return dynamic_selected_features
Example #24
def vector_quantize(
    myfiles, outdir, model
):  #given a list of files transform them to spectral vectors and compute the KMeans VQ
    for f in myfiles:
        print "Quantizing: ", f
        (rate, sig) = wav.read(f)
        print rate, sig.shape
        #get the spectral vectors
        mfcc_feat = mfcc(sig, rate)
        print mfcc_feat.shape
        fbank_feat = logfbank(sig, rate)  #this has the spectral vectors now
        print fbank_feat.shape
        val = model.predict(fbank_feat)
        fcomps = os.path.split(f)  #file components path, filename
        fn = fcomps[-1].split('.')[0] + '_vq.txt'
        #outpath = os.path.join(fcomps[0], 'outputs')
        fn = os.path.join(outdir, fn)
        f = open(fn, 'wb')
        for v in val:
            f.write(str(v) + '\n')
        f.close()
        print 'output vector quantized file: ', fn, ' written'
    return
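build_codebook and vector_quantize form a pipeline: fit a KMeans codebook on one recording, then quantize other files with it. A hypothetical run:

km = build_codebook('train.wav', codesize=32, fname='codebook.pkl')
vector_quantize(['a.wav', 'b.wav'], 'outputs', km)  # writes a_vq.txt and b_vq.txt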
Example #25
def logfbank_feature(sig,rate):
	'''
	Collapses the logfbank features of every frame into summary statistics.

	Output features:
	1. average of the 26 features
	2. maximum ...
	3. minimum ...
	4. variance ...

	INPUT: logfbank_feat (FRAMENUM, 26)
	OUTPUT: ave_logfbank (26, )
			max_logfbank (26, )
			min_logfbank (26, )
			var_logfbank (26, )
	'''
	logfbank_feat = logfbank(sig,rate)
	ave_logfbank = np.mean(logfbank_feat, axis = 0)
	max_logfbank = np.max(logfbank_feat, axis = 0)
	min_logfbank = np.min(logfbank_feat, axis = 0)
	var_logfbank = np.var(logfbank_feat, axis = 0)

	return [ave_logfbank, max_logfbank, min_logfbank, var_logfbank]
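A short usage sketch for the statistics above, assuming a mono wav file readable with scipy:

import scipy.io.wavfile as wav

rate, sig = wav.read('speech.wav')  # hypothetical input file
ave, mx, mn, var = logfbank_feature(sig, rate)  # four (26,) statistic vectors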
Example #26
def _speechFeatures():

    filename = sorted(glob.glob(outputDir + '/*.' + audioTargetFormat))[2]
    (rate, sig) = wav.read(filename)

    sig = sig[0:(rate * 10)]

    mfcc_feat = mfcc(sig, rate)
    fbank_feat = logfbank(sig, rate)
    print(fbank_feat[1:3, :])
    print(fbank_feat.shape)
    print(mfcc_feat.shape)

    fileoutName = filename.replace('.' + audioTargetFormat, '.png')
    fileoutName = 'test.png'
    print(fileoutName)
    fig = plt.figure(figsize=(12, 4))
    ax = fig.add_subplot(211)
    ax.contourf(np.transpose(mfcc_feat))
    plt.tight_layout()

    ax = fig.add_subplot(212)
    mfcc_sum = np.sum(np.transpose(np.sqrt(mfcc_feat * mfcc_feat)), 0)

    n = 6
    mfcc_sum_ref = mfcc_sum.copy()  # [:] on a numpy array is a view, not a copy
    for i in range(len(mfcc_sum_ref)):
        minidx = max(0, i - int(n / 2))
        maxidx = min(len(mfcc_sum_ref), i + (n - int(n / 2)))
        mfcc_sum[i] = np.sum(mfcc_sum_ref[minidx:maxidx]) / (maxidx - minidx)

    ax.plot(mfcc_sum)
    #ax.set_yscale('log')
    plt.tight_layout()

    plt.savefig(fileoutName, format='png', dpi=300)
Example #27
def _speechFeatures():
    
    filename=sorted(glob.glob(outputDir+'/*.'+audioTargetFormat))[2]
    (rate,sig) = wav.read(filename)

    sig = sig[0:(rate*10)]

    mfcc_feat = mfcc(sig,rate)
    fbank_feat = logfbank(sig,rate)
    print(fbank_feat[1:3,:])
    print(fbank_feat.shape)
    print(mfcc_feat.shape)

    fileoutName=filename.replace('.'+audioTargetFormat,'.png')
    fileoutName='test.png'
    print(fileoutName)
    fig = plt.figure(figsize=(12,4))
    ax = fig.add_subplot(211)
    ax.contourf(np.transpose(mfcc_feat))
    plt.tight_layout()

    ax = fig.add_subplot(212)
    mfcc_sum = np.sum(np.transpose(np.sqrt(mfcc_feat*mfcc_feat)),0)
    
    n=6
    mfcc_sum_ref = mfcc_sum.copy()  # [:] on a numpy array is a view, not a copy
    for i in range(len(mfcc_sum_ref)):
        minidx=max(0,i-int(n/2))
        maxidx=min(len(mfcc_sum_ref),i+(n-int(n/2)))
        mfcc_sum[i]=np.sum(mfcc_sum_ref[minidx:maxidx])/(maxidx-minidx)

    ax.plot(mfcc_sum)
    #ax.set_yscale('log')
    plt.tight_layout()

    plt.savefig(fileoutName,format='png',dpi=300)
Example #28
def analyzeLogBinergy(grain):
    windowSize = int(float(grain["frameCount"]))
    (rate,sig) = wav.read(grain["file"])
    windowedSignal = numpy.multiply(signal.hamming(windowSize), sig)  # note: computed but unused below
    energies = logfbank(signal=sig, samplerate=rate, winlen=.020, winstep=.020, nfilt=13, nfft=windowSize)   
    return energies.tolist()[0]
Example #29
from features import mfcc
from features import logfbank
import scipy.io.wavfile as wav
import pickle
from sklearn.cross_validation import train_test_split
from sklearn.svm import SVC
import numpy as np
from sklearn import metrics
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV

X = []
y = []

#	Feature extraction using mfcc features

a = "C:\\Project\\speech\\corpus\\"
for num in range(1, 6):
    for ite in range(1, 11):
        (rate, sig) = wav.read(a + str(num) + "\\" + str(ite) + ".wav")
        mfcc_feat = mfcc(sig, rate)
        print(len(logfbank(sig, rate).flatten()))
        X.append(logfbank(sig, rate)[:10000, :].flatten())
        print(num, ite)

        #y.append(num)
pickle.dump(X, open("XX.pkl", "wb"))
#pickle.dump( y, open( "y.pkl", "wb" ))
# print(fbank_feat[:500,:].flatten())
Example #30
def get_data(path_to_audio, path_to_labels, delimiter_char, nb_features=13):
    (rate, sig) = wav.read(path_to_audio)
    target = np.genfromtxt(path_to_labels, dtype=long, delimiter=delimiter_char)
    labels = target[:, 1]
    if 1:  # Change Window Size
        window_size = 0.1  # window size in seconds (100 ms)
        if 1:
            mfcc_feat = mfcc(sig, rate, window_size, 0.1, nb_features)
        if 0:
            mfcc_feat = mfcc(sig, rate, window_size, 0.1, nb_features/2)
            fbank_feat = logfbank(sig, rate, window_size, 0.1, nb_features/2)
            #ssc_feat = ssc(sig, rate, window_size, 0.1, nb_features/2)
            temp = np.empty([mfcc_feat.shape[0], nb_features])
            for i in range(len(mfcc_feat)):
                temp1 = np.append(mfcc_feat[i], fbank_feat[i])
                temp[i] = temp1  # np.append returns a copy; assign the row in place
            mfcc_feat = temp
    if 0: # Change Window Size and step size when aggregation is done on labels
        window_size = 0.1
        window_step = 1
        mfcc_feat = mfcc(sig, rate, window_size, window_step, nb_features)
    if 1: # Normalize features
        print "Normalizing Features"
        for col in range(nb_features):
            min_col = np.amin(mfcc_feat[:, col])
            max_col = np.amax(mfcc_feat[:, col])
            range_col = max_col - min_col
            mfcc_feat[:, col] = (mfcc_feat[:, col] - min_col) / range_col
    if 1: # Low pass features
        print "Low Pass Filtering features"
        convolute_size = 4
        count = mfcc_feat.shape[0]
        new_feat = np.empty([count, nb_features])
        for i in range(count):
            if (i < convolute_size) or (i > count - 1 - convolute_size):
                new_feat[i, :] = mfcc_feat[i, :]
            else:
                row_ = mfcc_feat[i, :]
                for row_dex in range(1, 1 + convolute_size):
                    row_ = row_ + mfcc_feat[i + row_dex, :]
                    row_ = row_ + mfcc_feat[i - row_dex, :]
                new_feat[i, :] = row_ / (convolute_size * 2 + 1)
        mfcc_feat = new_feat
    if 0: # Aggregating labels by block
        print "Aggregation of labels on", window_step,"sec"
        count = labels.shape[0]
        aggregate_size = int(window_step*10)
        size_labels = ceil(float(count)/aggregate_size)
        modified_size = min(mfcc_feat.shape[0], size_labels)
        mfcc_feat = mfcc_feat[0:modified_size,:]
        new_labels = np.empty([modified_size])
        new_index = 0
        for i in range(0,int((modified_size-1)*aggregate_size+1),aggregate_size):
            (new_labels[new_index], count_) = stats.mode(labels[i:i+aggregate_size])
            new_index += 1
        labels = new_labels
    if 0: # Low pass labels
        print "Low Pass Filtering labels"
        convolute_size = 4
        count = labels.shape[0]
        new_labels = np.empty([count])
        for i in range(count):
            if (i < convolute_size) or (i > count - 1 - convolute_size):
                new_labels[i] = labels[i]
            else:
                row_ = labels[i]
                for row_dex in range(1, 1 + convolute_size):
                    row_ = row_ + labels[i + row_dex]
                    row_ = row_ + labels[i - row_dex]
                new_labels[i] = row_ / (convolute_size * 2 + 1)
        labels = new_labels
    if 1:  # get rid of background points = class 5
        print "Removing speaking parts"
        all_sound_count = 0
        non_verbal_count = 0
        for row in labels:
            if row != 5:
                non_verbal_count += 1
            all_sound_count += 1
        new_feat = np.empty([non_verbal_count, nb_features])
        new_target = np.empty([non_verbal_count, 2])
        new_labels = np.empty([non_verbal_count])
        count = 0
        dex = 0
        for row in labels:
            if row != 5:
                new_target[count, :] = target[dex, :]
                new_feat[count, :] = mfcc_feat[dex, :]
                new_labels[count] = labels[dex]
                count += 1
            dex += 1
        mfcc_feat = new_feat
        labels = new_labels
    labels = labels.astype('int')
    n_classes = np.unique(labels)
    return [mfcc_feat, labels, n_classes]
Example #31
def main(setname):

  print "    Extract SIG features for ", setname, " ..."

  if setname == "train" :
    waveChannels = config['train_waveChannels'].strip().split()
    transcChannels = config['train_transcChannels_ctm'].strip().split()
  elif setname == "test" :
    waveChannels = config['test_waveChannels'].strip().split()
    transcChannels = config['test_transcChannels_ctm'].strip().split()
  else :
    print "Error!!! Define the set name train or test\n"
    return

  CHANNELS = int(config['CHANNELS'])
  

  for ch in range(CHANNELS):
    
    print "      Extract SIG features for CHANNEL ", str(ch+1)

    # load the wave files for each recording channel
    wavfiles = "data/lists/"+setname+"_CH_"+str(ch+1)+"_wav.list"
    if not waveChannels[ch] or not transcChannels[ch]:
      print "ERROR!!! SIG features need .CTM format of transcriptions"
      return
    wavfiles = waveChannels[ch]
    ctmfile  = transcChannels[ch]

    # save the SIG features into this file
    sig_feat_file = config['BASEDIR']+"/data/features/"+setname+"_CH_"+str(ch+1)+"_SIG.feat"
  
    ctm_array = load_ctm (ctmfile)
    
    ind = -1
    feat_matrix = np.array([])

    wav_doc = open(wavfiles, 'r')

    # for each wave file, compute the frame mfcc/energy. then assign the frames to the recognized words
    for wavfile in wav_doc:

      ind += 1

      (rate,sig) = wav.read(wavfile.strip())
      mfcc_feat = mfcc(sig,rate,winlen=0.02,winstep=0.01,numcep=12)
      fbank_feat =logfbank(sig,rate,winlen=0.02,winstep=0.01, nfilt=1)
  
      w2fr = load_ctm_info (ctm_array[ind], fbank_feat)
    
      sil_no = 0; sil_e = 0; min_sil_e = 1000; max_sil_e = -1000; sil_dur = 0; std_sil_dur = 0
      wrd_no = 0; wrd_e= 0; min_wrd_e = 1000; max_wrd_e = -1000; wrd_dur = 0; std_wrd_dur = 0
  
      for elem in w2fr:
        w = elem[0]
        t1= elem[1]
        t2= elem[2]
        e = elem[3]
        if w == "@bg":  # if it's noise
          sil_no += 1
          sil_e += e
          sil_dur += t2-t1+1
          if e < min_sil_e:
            min_sil_e = e
          elif e > max_sil_e:
            max_sil_e = e      
        else :          # if it's word
          wrd_no += 1
          wrd_e += e
          wrd_dur += t2-t1+1
          if e < min_wrd_e:
            min_wrd_e = e
          elif e > max_wrd_e:
            max_wrd_e = e


      # compute the following Features   
      feat_vector = np.array([])
    
      feat_vector = np.append(feat_vector, mfcc_feat.shape[0]/100 ); # total seg duration
  
      feat_vector = np.append(feat_vector, mfcc_feat.mean (axis = 0) )  # mean of mfcc
  
      feat_vector = np.append(feat_vector, fbank_feat.mean (axis = 0) ) # mean of energy
      feat_vector = np.append(feat_vector, fbank_feat.min (axis = 0) ) # min of energy
      feat_vector = np.append(feat_vector, fbank_feat.max (axis = 0) ) # max of energy

      feat_vector = np.append(feat_vector, (sil_e/sil_no) ) # mean noise energy
      feat_vector = np.append(feat_vector, min_sil_e )  # min of noise energy
      feat_vector = np.append(feat_vector, max_sil_e )  # max of noise energy
  
      feat_vector = np.append(feat_vector, (wrd_e/wrd_no) )  # mean of word energies
      feat_vector = np.append(feat_vector, min_wrd_e )  # min of word energies
      feat_vector = np.append(feat_vector, max_wrd_e )  # max of word energies
  
      feat_vector = np.append(feat_vector, (wrd_e/wrd_no) / (sil_e/sil_no) ) # Signal to Noise ratio
    
      feat_vector = np.append(feat_vector, max_wrd_e - min_sil_e ) # max word energy - min noise energy
   
      feat_vector = np.append(feat_vector, sil_no ) # number of silences
   
      feat_vector = np.append(feat_vector, sil_no / wrd_no ) # silence to word ratio
  
      feat_vector = np.append(feat_vector, wrd_no / wrd_dur ) # number of words per second (frame)
 
      if sil_dur == 0:
         feat_vector = np.append(feat_vector, 0 )
      else:
         feat_vector = np.append(feat_vector, sil_no / sil_dur )  # number of silences per second (frame)
  
      feat_vector = np.append(feat_vector, wrd_dur )  # total words duration
  
      feat_vector = np.append(feat_vector, sil_dur )  # total silence duration
  
      feat_vector = np.append(feat_vector, wrd_dur / wrd_no )  # mean of words duration
  
      feat_vector = np.append(feat_vector, sil_dur / sil_no ) # mean of silence duration
  
      feat_vector = np.append(feat_vector, sil_dur / wrd_dur ) # silence to word duration ratio
  
      feat_vector = np.append(feat_vector, wrd_dur - sil_dur ) # word duration - silence duration

      for elem in w2fr:
        w = elem[0]
        t1= elem[1]
        t2= elem[2]
        if w == "@bg":  # if it's noise
          std_sil_dur += math.pow((t2-t1+1) - (sil_dur/sil_no) , 2)
        else:           # if it's word
          std_wrd_dur += math.pow((t2-t1+1) - (wrd_dur/wrd_no), 2 )
       
  
      feat_vector = np.append(feat_vector, math.sqrt ( std_wrd_dur ) / wrd_no ) # std of the words duration
   
      feat_vector = np.append(feat_vector, math.sqrt ( std_sil_dur ) / wrd_no )  # std of the silence duration
    
      if len(feat_matrix) < 1:
        feat_matrix = feat_vector
      else:
        feat_matrix = np.vstack([ feat_matrix, feat_vector] )

    np.savetxt( sig_feat_file, feat_matrix , fmt='%.4f')
Example #32
def extractLogFBank(path):
    os.system(sph2pipe + " -f wav " + path + " tmp.wav")
    (rate, sig) = wav.read("tmp.wav")
    feats = logfbank(sig, rate, window, step, nfilt, fftsize, 0, None, 0)
    os.remove("tmp.wav")
    return feats
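extractLogFBank relies on module-level globals (sph2pipe, window, step, nfilt, fftsize); values like those defined in Example #42 make it runnable. A sketch with hypothetical values:

sph2pipe = "/usr/local/bin/sph2pipe"  # hypothetical install path
window = 0.025
step = 0.01
nfilt = 40
fftsize = 512

feats = extractLogFBank("SA1.WAV")  # hypothetical NIST sphere file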
Example #33
 def getFBanks(self,waves):
     fbanks = []
     for wave in waves:
         fbanks.append(logfbank(wave[1],wave[0]))
     return fbanks
Example #34
sampling_freq, signal = wavfile.read('datas/sounds/random_sound.wav')

# Take the first 10,000 samples for analysis
signal = signal[:10000]

# Extract the MFCC features
features_mfcc = mfcc(signal, sampling_freq)

# Print the parameters for MFCC
print('\nMFCC:\nNumber of windows =', features_mfcc.shape[0])
print('Length of each feature =', features_mfcc.shape[1])

# Plot the features
features_mfcc = features_mfcc.T
plt.matshow(features_mfcc)
plt.title('MFCC')

# Extract the Filter Bank features
features_fb = logfbank(signal, sampling_freq)

# Print the parameters for Filter Bank
print('\nFilter bank:\nNumber of windows =', features_fb.shape[0])
print('Length of each feature =', features_fb.shape[1])

# Plot the features
features_fb = features_fb.T
plt.matshow(features_fb)
plt.title('Filter bank')

plt.show()
Example #35
import math
from scipy.signal import lfilter
from scikits.talkbox import lpc

path = "/home/ponco/devel/mel_cepstral_coeff_neural/vowels/"

# First set up the figure, the axis, and the plot element we want to animate
fig = plt.figure()
ax = plt.axes(xlim=(0, 25), ylim=(-84, 80))
#ax = plt.axes(xlim=(0, 25), ylim=(0, 20))
line, = ax.plot([], [], lw=2)

#MEL
(rate,sig) = wav.read("Ah.wav")
mfcc_feat = mfcc(sig,rate,numcep=30,appendEnergy=False)
fbank_feat = logfbank(sig,rate,nfilt=40)

# initialization function: plot the background of each frame
def init():
    line.set_data([], [])
    return line,

# animation function.  This is called sequentially
def animate(i):
    #x = np.linspace(0, 12,13)
    x = np.linspace(0, 25,26)
    y = mfcc_feat[i,:]
    #y = fbank_feat[i,:]
    #print("x:" , x.shape)
    #print("y:" , y.shape)
    
Example #36
def extractLogFBank(rate, sig):
    feats = logfbank(sig, rate, window, step, nfilt, fftsize, 0, None, 0)
    return feats
Example #37
from features import mfcc
from features import logfbank
import scipy.io.wavfile as wav

(rate, sig) = wav.read("audio/s1_an_1.wav")
mfcc_feat = mfcc(sig, rate)
fbank_feat = logfbank(sig, rate)

print fbank_feat[1:3, :]
Example #38
def main(setname):

    print "    Extract SIG features for ", setname, " ..."

    if setname == "train":
        waveChannels = config['train_waveChannels'].strip().split()
        transcChannels = config['train_transcChannels_ctm'].strip().split()
    elif setname == "test":
        waveChannels = config['test_waveChannels'].strip().split()
        transcChannels = config['test_transcChannels_ctm'].strip().split()
    else:
        print "Error!!! Define the set name train or test\n"
        return

    CHANNELS = int(config['CHANNELS'])

    for ch in range(CHANNELS):

        print "      Extract SIG features for CHANNEL ", str(ch + 1)

        # load the wave files for each recording channel
        wavfiles = "data/lists/" + setname + "_CH_" + str(ch + 1) + "_wav.list"
        if not waveChannels[ch] or not transcChannels[ch]:
            print "ERROR!!! SIG features need .CTM format of transcriptions"
            return
        wavfiles = waveChannels[ch]
        ctmfile = transcChannels[ch]

        # save the SIG features into this file
        sig_feat_file = config[
            'BASEDIR'] + "/data/features/" + setname + "_CH_" + str(
                ch + 1) + "_SIG.feat"

        ctm_array = load_ctm(ctmfile)

        ind = -1
        feat_matrix = np.array([])

        wav_doc = open(wavfiles, 'r')

        # for each wave file, compute the frame mfcc/energy. then assign the frames to the recognized words
        for wavfile in wav_doc:

            ind += 1

            (rate, sig) = wav.read(wavfile.strip())
            mfcc_feat = mfcc(sig, rate, winlen=0.02, winstep=0.01, numcep=12)
            fbank_feat = logfbank(sig,
                                  rate,
                                  winlen=0.02,
                                  winstep=0.01,
                                  nfilt=1)

            w2fr = load_ctm_info(ctm_array[ind], fbank_feat)

            sil_no = 0
            sil_e = 0
            min_sil_e = 1000
            max_sil_e = -1000
            sil_dur = 0
            std_sil_dur = 0
            wrd_no = 0
            wrd_e = 0
            min_wrd_e = 1000
            max_wrd_e = -1000
            wrd_dur = 0
            std_wrd_dur = 0

            for elem in w2fr:
                w = elem[0]
                t1 = elem[1]
                t2 = elem[2]
                e = elem[3]
                if w == "@bg":  # if it's noise
                    sil_no += 1
                    sil_e += e
                    sil_dur += t2 - t1 + 1
                    if e < min_sil_e:
                        min_sil_e = e
                    elif e > max_sil_e:
                        max_sil_e = e
                else:  # if it's word
                    wrd_no += 1
                    wrd_e += e
                    wrd_dur += t2 - t1 + 1
                    if e < min_wrd_e:
                        min_wrd_e = e
                    elif e > max_wrd_e:
                        max_wrd_e = e

            # compute the following Features
            feat_vector = np.array([])

            feat_vector = np.append(feat_vector, mfcc_feat.shape[0] / 100)
            # total seg duration

            feat_vector = np.append(feat_vector,
                                    mfcc_feat.mean(axis=0))  # mean of mfcc

            feat_vector = np.append(feat_vector,
                                    fbank_feat.mean(axis=0))  # mean of energy
            feat_vector = np.append(feat_vector,
                                    fbank_feat.min(axis=0))  # min of energy
            feat_vector = np.append(feat_vector,
                                    fbank_feat.max(axis=0))  # max of energy

            feat_vector = np.append(feat_vector,
                                    (sil_e / sil_no))  # mean noise energy
            feat_vector = np.append(feat_vector,
                                    min_sil_e)  # min of noise energy
            feat_vector = np.append(feat_vector,
                                    max_sil_e)  # max of noise energy

            feat_vector = np.append(feat_vector,
                                    (wrd_e / wrd_no))  # mean of word energies
            feat_vector = np.append(feat_vector,
                                    min_wrd_e)  # min of word energies
            feat_vector = np.append(feat_vector,
                                    max_wrd_e)  # max of word energies

            feat_vector = np.append(feat_vector, (wrd_e / wrd_no) /
                                    (sil_e / sil_no))  # Signal to Noise ratio

            feat_vector = np.append(
                feat_vector,
                max_wrd_e - min_sil_e)  # max word energy - min noise energy

            feat_vector = np.append(feat_vector, sil_no)  # number of silences

            feat_vector = np.append(feat_vector,
                                    sil_no / wrd_no)  # silence to word ratio

            feat_vector = np.append(
                feat_vector,
                wrd_no / wrd_dur)  # number of words per second (frame)

            if sil_dur == 0:
                feat_vector = np.append(feat_vector, 0)
            else:
                feat_vector = np.append(
                    feat_vector,
                    sil_no / sil_dur)  # number of silences per second (frame)

            feat_vector = np.append(feat_vector,
                                    wrd_dur)  # total words duration

            feat_vector = np.append(feat_vector,
                                    sil_dur)  # total silence duration

            feat_vector = np.append(feat_vector,
                                    wrd_dur / wrd_no)  # mean of words duration

            feat_vector = np.append(feat_vector, sil_dur /
                                    sil_no)  # mean of silence duration

            feat_vector = np.append(feat_vector, sil_dur /
                                    wrd_dur)  # silence to word duration ratio

            feat_vector = np.append(
                feat_vector,
                wrd_dur - sil_dur)  # word duration - silence duration

            for elem in w2fr:
                w = elem[0]
                t1 = elem[1]
                t2 = elem[2]
                if w == "@bg":  # if it's noise
                    std_sil_dur += math.pow((t2 - t1 + 1) - (sil_dur / sil_no),
                                            2)
                else:  # if it's word
                    std_wrd_dur += math.pow((t2 - t1 + 1) - (wrd_dur / wrd_no),
                                            2)

            feat_vector = np.append(feat_vector,
                                    math.sqrt(std_wrd_dur) /
                                    wrd_no)  # std of the words duration

            feat_vector = np.append(feat_vector,
                                    math.sqrt(std_sil_dur) /
                                    wrd_no)  # std of the silence duration

            if len(feat_matrix) < 1:
                feat_matrix = feat_vector
            else:
                feat_matrix = np.vstack([feat_matrix, feat_vector])

        np.savetxt(sig_feat_file, feat_matrix, fmt='%.4f')
Example #39
            samples = trim_or_pad(samples, max_len_seconds * fs)
            if len(np.shape(samples)) == 2:
              samples = samples[:, 0]
            norm = sqrt(np.dot(samples, samples))
            print 'appending', f
            sounds.append((fs, np.array(samples) / norm))
        except (ValueError, TypeError):
          "Couldn't read wav file"

features = []
for fs, s in sounds:
  mfcc_feat = mfcc(s, fs)
  mfcc_feat = np.reshape(mfcc_feat, (1, np.shape(mfcc_feat)[0] * np.shape(mfcc_feat)[1]))
  ssc_feat = ssc(s, fs)
  ssc_feat = np.reshape(ssc_feat, (1, np.shape(ssc_feat)[0] * np.shape(ssc_feat)[1]))
  lfbank_feat = logfbank(s, fs)
  lfbank_feat = np.reshape(lfbank_feat, (1, np.shape(lfbank_feat)[0] * np.shape(lfbank_feat)[1]))

  #import pdb; pdb.set_trace()

  #ceps, mspec, spec = mfcc(s, fs = fs)
  #ceps = np.reshape(ceps, (1, np.shape(ceps)[0] * np.shape(ceps)[1]))
  
  features.append(np.hstack([mfcc_feat, ssc_feat, lfbank_feat]))
  #features.append(np.hstack([ssc_feat]))

M = np.vstack(features)
print np.shape(M)

pca = PCA(n_components=500)
V = pca.fit_transform(M)
Example #40
#!/usr/bin/env python


from features import mfcc
from features import logfbank
import scipy.io.wavfile as wav
from sklearn.metrics import mean_squared_error

(rate,sig) = wav.read("./testfiles/energy.wav")
mfcc_feat = mfcc(sig,rate)
fbank_feat = logfbank(sig,rate)

(rate2,sig2) = wav.read("./testfiles/64energyfiltered.wav")
mfcc_feat2 = mfcc(sig2,rate2)
fbank_feat2 = logfbank(sig2,rate2)

print mean_squared_error(mfcc_feat, mfcc_feat2[:46])
Example #41
from features import mfcc
from features import logfbank
import scipy.io.wavfile as wav

(rate,data) = wav.read("demo.wav")
mfcc_feat = mfcc(data,rate)
fbank_feat = logfbank(data,rate)

print fbank_feat[1:3,:]
Example #42
from features import mfcc
from features import logfbank
import scipy.io.wavfile as wav
import os

keyword = ["1", "2"]

print keyword.index("1")
print keyword.index("2")
print keyword.index("3")

sph2pipe = "/Users/evgeny/kaldi3/tools/sph2pipe_v2.5/sph2pipe"

path = "/Users/evgeny/timit/TIMIT/TRAIN/DR1/FCJF0/SA1.WAV"

os.system(sph2pipe + " -f wav " + path + " tmp.wav")

window = 0.025
step = 0.01
nfilt = 40
fftsize = 512

(rate,sig) = wav.read("tmp.wav")

os.remove("tmp.wav")

mfcc_feat = mfcc(sig,rate)
fbank_feat = logfbank(sig, rate, window, step, nfilt, fftsize, 0, None, 0)
print sig, rate
print fbank_feat[1:3,:]
Example #43
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import wavfile
from features import mfcc, logfbank

# Read input sound file
sampling_freq, audio = wavfile.read("input_freq.wav")

# Extract MFCC and Filter bank features
mfcc_features = mfcc(audio, sampling_freq)
filterbank_features = logfbank(audio, sampling_freq)

# Print parameters
print('\nMFCC:\nNumber of windows =', mfcc_features.shape[0])
print('Length of each feature =', mfcc_features.shape[1])
print('\nFilter bank:\nNumber of windows =', filterbank_features.shape[0])
print('Length of each feature =', filterbank_features.shape[1])

# Plot the features
mfcc_features = mfcc_features.T
plt.matshow(mfcc_features)
plt.title('MFCC')

filterbank_features = filterbank_features.T
plt.matshow(filterbank_features)
plt.title('Filter bank')

plt.show()
Example #44
#get wav

(rate,sig) = wav.read("BF4.wav")


#MFCC
mfcc_feat_not_norm = mfcc(sig,rate)
max_mfcc = np.amax(mfcc_feat_not_norm)
mfcc_feat = (1/max_mfcc) * mfcc_feat_not_norm

mfcc_size = len(mfcc_feat[:,1]) # x dimensions MFCC


#Log Spec
fbank_feat_not_norm = logfbank(sig,rate)
max_log = np.amax(fbank_feat_not_norm)
fbank_feat = (1/max_log) * fbank_feat_not_norm
logSizeX = len(fbank_feat[1,:])# y dimensions log spec
logSizeY =len(fbank_feat[:,1])# x dimensions log spec


'''
#plotting Log Spec
fig = plt.figure(1)
ax = fig.add_subplot(2, 1, 1, projection='3d')
X = np.arange(0, logSizeX, 1)
Y = np.arange(0, logSizeY, 1)
X, Y = np.meshgrid(X, Y)
R = np.sqrt(X**2 + Y**2)
Z = fbank_feat
Example #45
def extractLogFBank(path):
    os.system(sph2pipe + " -f wav " + path + " tmp.wav")
    (rate, sig) = wav.read("tmp.wav")
    feats = logfbank(sig, rate, window, step, nfilt, fftsize, 0, None, 0)
    os.remove("tmp.wav")
    return feats
Example #46
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import wavfile 
from features import mfcc, logfbank

# Read input sound file
sampling_freq, audio = wavfile.read("input_freq.wav")

# Extract MFCC and Filter bank features
mfcc_features = mfcc(audio, sampling_freq)
filterbank_features = logfbank(audio, sampling_freq)

# Print parameters
print '\nMFCC:\nNumber of windows =', mfcc_features.shape[0]
print 'Length of each feature =', mfcc_features.shape[1]
print '\nFilter bank:\nNumber of windows =', filterbank_features.shape[0]
print 'Length of each feature =', filterbank_features.shape[1]

# Plot the features
mfcc_features = mfcc_features.T
plt.matshow(mfcc_features)
plt.title('MFCC')

filterbank_features = filterbank_features.T
plt.matshow(filterbank_features)
plt.title('Filter bank')

plt.show()
Example #47
print Sxx.shape
'''

with open('data/spectrogram_gabriel.pickle', 'rb') as f:
    (X_gab, y_gab) = pickle.load(f)

wav_file = 'data/SA1_RIFF.WAV'

spect_new = spectrogram_converter.spectrogram(wav_file)
(rate, sig) = wav.read(wav_file)
fbe = logfbank(sig,
               samplerate=rate,
               winlen=0.025,
               winstep=0.01,
               nfilt=26,
               nfft=512,
               lowfreq=0,
               highfreq=None,
               preemph=0.97)
fbe = np.fliplr(zip(*fbe[::-1]))

print 'Duration: %f' % duration(wav_file)
print 'Array Size Spect: %d' % spect_new.shape[2]
print 'Array Size FBE new: %d' % fbe.shape[1]
print 'Array Size FBE old: %d' % X_gab.shape[3]

f, (plt1, plt2, plt3) = plt.subplots(3, 1, sharey=False)
plt1.imshow(spect_new)
plt1.set_title('new spectrogram')
Example #48
def extractLogFBank(rate, sig):
    feats = logfbank(sig, rate, window, step, nfilt, fftsize, 0, None, 0)
    return feats
Example #49
__author__ = 'jasonboyer'
''' From https://github.com/jameslyons/python_speech_features example.py
'''
import sys
from features import mfcc
from features import logfbank
import scipy.io.wavfile as wav

if len(sys.argv) < 2:
    print("Plays a wave file.\n\nUsage: %s filename.wav" % sys.argv[0])
    sys.exit(-1)

(rate,sig) = wav.read(sys.argv[1])
mfcc_feat = mfcc(sig,rate)
fbank_feat = logfbank(sig,rate)

print(fbank_feat[1:3,:])
Example #50
from features import mfcc
from features import logfbank
import scipy.io.wavfile as wav

(rate, data) = wav.read("demo.wav")
mfcc_feat = mfcc(data, rate)
fbank_feat = logfbank(data, rate)

print fbank_feat[1:3, :]
"""features.mfcc() - Mel Frequency Cepstral Coefficients
features.fbank() - Filterbank Energies
features.logfbank() - Log Filterbank Energies
features.ssc() - Spectral Subband Centroids
"""
from features import mfcc
from features import logfbank
import scipy.io.wavfile as wav
coun = 0
kkk = 7
while (kkk == 7):
    (rate, sig) = wav.read("blues.0000" + str(kkk) + ".wav")
    mfcc_feat = mfcc(sig, rate)
    fbank_feat = logfbank(sig, rate, winlen=0.03, winstep=0.03)

    #print fbank_feat[0]
    normalised = []
    for i in fbank_feat:
        sublist = []
        for j in i:
            sublist.append(int(round(j / 22 * 7)))
        normalised.append(sublist)
    with open("blue.txt", "a") as myfile:
        for i in normalised:
            print i
            for j in i:
                myfile.write(str(j))
                coun += 1
    kkk = kkk + 1
'''
kkk=10
Example #52
from features import mfcc
from features import logfbank
import scipy.io.wavfile as wav
import os

keyword = ["1", "2"]

print keyword.index("1")
print keyword.index("2")
print keyword.index("3")

sph2pipe = "/Users/evgeny/kaldi3/tools/sph2pipe_v2.5/sph2pipe"

path = "/Users/evgeny/timit/TIMIT/TRAIN/DR1/FCJF0/SA1.WAV"

os.system(sph2pipe + " -f wav " + path + " tmp.wav")

window = 0.025
step = 0.01
nfilt = 40
fftsize = 512

(rate, sig) = wav.read("tmp.wav")

os.remove("tmp.wav")

mfcc_feat = mfcc(sig, rate)
fbank_feat = logfbank(sig, rate, window, step, nfilt, fftsize, 0, None, 0)
print sig, rate
print fbank_feat[1:3, :]
Example #53
def testmfcc(wavfile="../thecatsatonthemat.wav"):
    (rate,sig) = wav.read(wavfile)
    print "rate %s, len(sig) %s"%(rate, len(sig))
    mfcc_feat = mfcc(sig,rate)
    fbank_feat = logfbank(sig,rate)
    return fbank_feat
Example #54
#y = np.array(pickle.load( open( "yy.pkl", "rb" ) ))
for i in range(1, 17):
    for j in range(1, 10):
        if i != predicted:
            y.append(0)
        else:
            y.append(1)

y = np.array(y)

X1_test = []
(rate, sig) = wav.read("s.wav")
mfcc_feat = mfcc(sig, rate)
#print(len(X1))
#print(len(logfbank(sig,rate)[:10000].flatten()))
X1_test = (logfbank(sig, rate).flatten()[:10000])
X1_test = np.array(X1_test)
model = SVC(kernel="linear")
model.fit(X1, y)
ans = model.predict(X1_test)

print("the prediction from speech :")
if (ans == 1):
    print(predicted)
else:
    print("not")

if (ans == 1):
    print("Validated")
else:
    print("not validated")