Example #1
def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
    """Compute Spectral Subband Centroid features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)    
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)    
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. 
    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. 
    """          
    highfreq= highfreq or samplerate/2
    signal = sigproc.preemphasis(signal,preemph)
    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate)
    pspec = sigproc.powspec(frames,nfft)
    pspec = numpy.where(pspec == 0,numpy.finfo(float).eps,pspec) # if things are all zeros we get problems
    
    fb = get_filterbanks(nfilt,nfft,samplerate)
    feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
    R = numpy.tile(numpy.linspace(1,samplerate/2,numpy.size(pspec,1)),(numpy.size(pspec,0),1))
    
    return numpy.dot(pspec*R,fb.T) / feat
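
A minimal usage sketch for ssc (an assumption: the python_speech_features package is installed and its packaged ssc shares the signature above; the synthetic tone and variable names are illustrative only):

import numpy as np
from python_speech_features import ssc

rate = 16000
t = np.arange(0, 1.0, 1.0 / rate)            # one second of audio
sig = np.sin(2 * np.pi * 440 * t)            # a 440 Hz tone stands in for real speech

centroids = ssc(sig, samplerate=rate, nfilt=26, nfft=512)
print(centroids.shape)                       # (number_of_frames, 26), one centroid per subband
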
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
    """Compute Mel-filterbank energy features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)    
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. 
    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
        second return value is the energy in each frame (total energy, unwindowed)
    """          
    highfreq= highfreq or samplerate/2
    signal = sigproc.preemphasis(signal,preemph)
    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate)
    pspec = sigproc.powspec(frames,nfft)
    energy = numpy.sum(pspec,1) # this stores the total energy in each frame
    
    fb = get_filterbanks(nfilt,nfft,samplerate)
    feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
    return feat,energy
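
A hedged sketch of calling fbank and using both return values (assumes python_speech_features is installed; its packaged fbank exposes the same interface as the function above):

import numpy as np
from python_speech_features import fbank

rate = 16000
sig = np.random.randn(rate)                  # one second of noise as a stand-in signal

feat, energy = fbank(sig, samplerate=rate, nfilt=26)
print(feat.shape)                            # (number_of_frames, 26) filterbank energies
print(energy.shape)                          # (number_of_frames,) total energy per frame
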
def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
    """Compute Spectral Subband Centroid features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)    
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. 
    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. 
    """          
    highfreq= highfreq or samplerate/2
    signal = sigproc.preemphasis(signal,preemph)
    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate)
    pspec = sigproc.powspec(frames,nfft)
    pspec = numpy.where(pspec == 0,numpy.finfo(float).eps,pspec) # avoid division by zero in the centroid ratio below
    
    fb = get_filterbanks(nfilt,nfft,samplerate)
    feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
    R = numpy.tile(numpy.linspace(1,samplerate/2,numpy.size(pspec,1)),(numpy.size(pspec,0),1))
    
    return numpy.dot(pspec*R,fb.T) / feat
Example #4
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
    """Compute Mel-filterbank energy features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)    
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)    
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. 
    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
        second return value is the energy in each frame (total energy, unwindowed)
    """          
    highfreq= highfreq or samplerate/2
    signal = sigproc.preemphasis(signal,preemph)
    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate)
    pspec = sigproc.powspec(frames,nfft)
    energy = numpy.sum(pspec,1) # this stores the total energy in each frame
    energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log
        
    fb = get_filterbanks(nfilt,nfft,samplerate)
    feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
    feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log
    
    return feat,energy
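
The eps guards above exist so that a logarithm can be taken safely afterwards; a small follow-up sketch (assuming a package version that includes those guards) shows that usual next step:

import numpy as np
from python_speech_features import fbank

sig = np.random.randn(16000)                 # illustrative one-second signal at 16 kHz

feat, energy = fbank(sig, samplerate=16000, nfilt=26)
log_feat = np.log(feat)                      # safe: zero bins were replaced by eps inside fbank
log_energy = np.log(energy)
print(log_feat.shape, log_energy.shape)
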
def feature_extract(wav_name, winlen=0.025, winstep=0.01):
    """This function returns (mfcc) feature vectors extracted from wav_name"""
    rate, signal = wav.read(wav_name)
    signal = numpy.sum(signal, axis=1)/signal.shape[1]
    signal = sigproc.framesig(signal, rate*winlen, rate*winstep)
    signal = vad.vad_filter(signal)
    signal = sigproc.deframesig(signal, 0, rate*winlen, rate*winstep)
    mfcc_feat = mfcc(signal, rate)
    return mfcc_feat
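
The channel-averaging line above assumes a stereo input; a hedged stand-alone sketch of that downmix step (the array contents are made up for illustration):

import numpy as np

stereo = np.array([[100, 200],
                   [300, 400],
                   [500, 600]])              # shape (num_samples, num_channels)

mono = np.sum(stereo, axis=1) / stereo.shape[1]   # average the channels per sample
print(mono)                                  # [150. 350. 550.]
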
Example #7
def doDet(fileIn):
    (fsr,sig) = wav.read(fileIn)
    segChunks,segTimes = framesig(sig,seglen*fs,segstp*fs,'box',1,fs)
    
    if len(segChunks.shape) == 1:
        segChunks = np.reshape(segChunks,(1,segChunks.shape[0]))
        
    #allOut = np.zeros((segChunks.shape[0],1))
    allFeats = np.zeros((segChunks.shape[0],nComp))
    #print allOut.shape
    for t in range(segChunks.shape[0]):
        seg = segChunks[t,:]
        mfcc_feat,mspec,logmelspec = mfcc(seg,fs,winlen=wlen,winstep=wstep,numcep=ncep,nfilt=numfilt,nfft=fftsz,lowfreq=0,highfreq=fs/2,preemph=0.97,ceplifter=22,appendEnergy=True)

        #if (math.isnan(numpy.sum(numpy.sum(mfcc_feat)))) or (math.isinf(numpy.sum(numpy.sum(mfcc_feat)))):
        #    print 'Skipping this Seg -- NaN or Inf occurred'
        #else:
        #    numpy.savetxt(fltoread.replace('AllData',mfccset).rstrip('.wav')+'_POSITIVE_'+tlist[0]+'_'+tlist[1]+'_'+str(t)+'.mfcc',mfc\
        #                      c_feat,delimiter=' ') 

        cdist=gmmmixt.predict_proba(mfcc_feat)
        hist = np.sum(cdist,axis=0)
        histfeat = hist/float(hist.shape[0])
        histfeat = histfeat.reshape(1,histfeat.shape[0])
        allFeats[t,:] = histfeat
        
    histKer = computeKernel(allFeats,trdata,1.0)
    kerId = np.arange(histKer.shape[0])+1
    kerId = np.reshape(kerId,(kerId.shape[0],1))
    teKer = map(list,np.hstack((kerId,histKer)))
    telax = [0]*len(teKer)
    plb,acc,probab = svm_predict(telax,teKer,svmMod,'-b 1 -q')
            
    lbs = svmMod.get_labels()
    #print str(probab) + str(lbs)
    probab = np.array(probab)
    if lbs[0] == 1:
        prob_f=probab[:,0]
    elif lbs[1] == 1:
        prob_f=probab[:,1]
    else:
        print 'Not possible'
        sys.exit()

    prob_f = prob_f.reshape(prob_f.shape[0],1)
    if pOrc == 1:
        allOut = prob_f
    elif pOrc == 0:
        allOut = np.array(map(int,(prob_f > opPoint))).reshape(prob_f.shape[0],1)
        #print np.hstack((allOut,segTimes))
        np.savetxt(fileIn.rstrip('.wav')+'_res'+'.txt',allOut)
        return np.hstack((allOut,segTimes))
Example #8
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
          winfunc=lambda x:numpy.ones((x,))):
    highfreq= highfreq or samplerate/2
    signal = sigproc.preemphasis(signal,preemph)
    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, winfunc)
    pspec = sigproc.powspec(frames,nfft)
    energy = numpy.sum(pspec,1) # this stores the total energy in each frame
    energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log

    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
    feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
    feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log

    return feat,energy
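
The winfunc parameter above defaults to a rectangular window; a hedged sketch of passing a Hamming window instead (assumes python_speech_features and numpy; numpy.hamming returns an array of the requested length, which is what winfunc must produce):

import numpy as np
from python_speech_features import fbank

sig = np.random.randn(16000)                 # illustrative one-second signal at 16 kHz

feat, energy = fbank(sig, samplerate=16000, nfilt=26, winfunc=np.hamming)
print(feat.shape, energy.shape)
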
Example #9
def get_lpcc(filename):
    """
    gets lpccs for each frame in a wav file
    filename: name of wav file with .wav
    returns the lpcc features in each frame as a list of lists
    """
    print "Getting LPCC"
    (rate, sig) = wav.read(filename)
    frames = sigproc.framesig(sig, 0.025 * rate, 0.01 * rate)
    lpccs = [[]] * len(frames)
    for x in xrange(0, len(frames)):
        lpcc_feat = lpc(frames[x], 12)
        # cast each coefficient to a plain float before storing the frame's vector
        lpccs[x] = [float(feature) for feature in lpcc_feat[0]]

    return numpy.asarray(lpccs)
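
The lpc helper imported by the original code is not shown; as a hedged stand-in, this sketch frames a signal the same way and computes order-12 LPC coefficients per frame with librosa.lpc (a substitute, not the original implementation):

import numpy as np
import librosa
from python_speech_features import sigproc

rate = 16000
sig = np.random.randn(rate)                  # stand-in signal

frames = sigproc.framesig(sig, 0.025 * rate, 0.01 * rate)
lpccs = np.array([librosa.lpc(frame, order=12) for frame in frames])
print(lpccs.shape)                           # (number_of_frames, 13): order-12 LPC plus the leading 1
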
Example #10
def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,ceplifter=22,appendEnergy=True):
    """Compute MFCC features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)    
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)    
    :param numcep: the number of cepstrum to return, default 13    
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. 
    :param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22. 
    :param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy.
    :returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
    """           
    # In fbank changed to do things on unique part of spectrum only, i.e. from frequency bins 1 to nfft/2+1
    # change in sigproc to use hamming window by default
    # MAKE SURE THAT nfft is even or the next power of two after the window length, e.g. NFFT=2^(ceil(log(winpts)/log(2)))

    #feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph)
    
    #K = nfft/2 + 1 # unique part of spectrum  0 to nfft/2 -- Already taken care of by numpy.fft.rfft -- returns unique part only

    highfreq= highfreq or samplerate/2
    signal = sigproc.preemphasis(signal,preemph)
    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate,'hamm')
    pspec = sigproc.powspec(frames,nfft) # in this power spectrum computation normalization has been done..check 1/nfft factor..removed as of now
    mspec = sigproc.magspec(frames,nfft)
    energy = numpy.sum(pspec,1) # this stores the total energy in each frame
    energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log
        
    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq) # filter bank returned here is nfilt by nfft/2 + 1 
    featx = numpy.dot(pspec,fb.T) # compute the filterbank energies
    featx = numpy.where(featx == 0,numpy.finfo(float).eps,featx) # if feat is zero, we get problems with log
    
    feat = numpy.log(featx)
    logmelspec = feat
    feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep]
    feat = lifter(feat,ceplifter)
    if appendEnergy: feat[:,0] = numpy.log(energy) # replace first cepstral coefficient with log of frame energy
    return feat,mspec,logmelspec
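
This variant returns the magnitude spectrum and log mel spectrum alongside the cepstra; the packaged python_speech_features.mfcc returns only the cepstral matrix, as in this hedged usage sketch (synthetic signal for illustration):

import numpy as np
from python_speech_features import mfcc

rate = 16000
sig = np.random.randn(rate)                  # one second of noise as a stand-in signal

cepstra = mfcc(sig, samplerate=rate, numcep=13, nfilt=26, nfft=512, appendEnergy=True)
print(cepstra.shape)                         # (number_of_frames, 13); column 0 holds log frame energy
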
Example #11
def get_words(audio):

    winlen = 0.01
    winstep = 0.01
    sample_rate = 44100

    # in terms of number of 10 ms frames
    start_speech = 10
    end_silence = 5
    speech_leader = 5
    speech_trailer = 5

    (rate, sig) = wav.read(audio)
    frames = framesig(sig, winlen * sample_rate, winstep * sample_rate,
                      lambda x: numpy.ones((1, x)))
    word_list = []

    #calculate energy per frame and zcr per frame
    frame_energy = []
    zcr = []
    for i in range(0, len(frames)):
        energy = sum(1.0 * x * x for x in frames[i])
        zc = 0
        for j in range(1, len(frames[i])):
            if (frames[i][j] < 0
                    and frames[i][j - 1] > 0) or (frames[i][j] > 0
                                                  and frames[i][j - 1] < 0):
                zc = zc + 1
        frame_energy.append(energy)
        zcr.append(zc)

    #calculate final noise value
    avg_energy = sum(1.0 * x for x in frame_energy[0:10])
    avg_energy = avg_energy / 10
    avg_zcr = sum(1.0 * x for x in zcr[0:10])
    avg_zcr = avg_zcr / 10

    #calculate threshold
    upper_energy_threshold = 2 * avg_energy
    upper_zcr_threshold = 2 * avg_zcr
    lower_energy_threshold = 0.75 * avg_energy
    lower_zcr_threshold = 0.75 * avg_zcr
    '''
	print upper_energy_threshold
	print upper_zcr_threshold
	print lower_energy_threshold
	print lower_zcr_threshold
	'''
    started = False
    start_index = 0
    start_cnt = 0
    stop_index = 0
    stop_cnt = 0
    words = 0

    #print ">> START_FRAME END_FRAME LENGTH_IN_SECONDS"

    for i in range(0, len(frames[10:])):
        if not started:
            if frame_energy[i] > upper_energy_threshold or zcr[
                    i] > upper_zcr_threshold:
                start_cnt += 1
            else:
                start_cnt = 0
            if start_cnt == start_speech:
                started = True
                start_index = i - start_speech + 1 - speech_leader
                start_index = max(0, start_index)
        else:
            if frame_energy[i] > upper_energy_threshold or zcr[
                    i] > upper_zcr_threshold:
                stop_cnt = 0
            else:
                stop_cnt += 1
            if stop_cnt == end_silence:
                stop_index = i - end_silence + 1 + speech_trailer
                stop_index = min(len(frames) - 1, stop_index)

                #print ">>", start_index, stop_index, (stop_index - start_index + 1 ) * 10
                wav.write("word" + str(words) + ".wav", rate,
                          sig[441 * start_index:441 * (stop_index + 1)])
                words += 1

                started = False
                start_index = start_cnt = 0
                stop_index = stop_cnt = 0

    if started:
        stop_index = len(frames) - 1
        #print ">>", start_index, stop_index, (stop_index - start_index + 1 ) * 10
        wav.write("word" + str(words) + ".wav", rate,
                  sig[441 * start_index:441 * (stop_index + 1)])
        words += 1

    return words
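
The per-frame loops above compute frame energy and zero-crossing rate; a hedged vectorized numpy sketch of the same two measures (the function name is made up, frames is the matrix produced by framesig):

import numpy as np

def frame_energy_and_zcr(frames):
    frames = frames.astype(float)
    energy = np.sum(frames ** 2, axis=1)                     # per-frame energy
    signs = np.sign(frames)
    zcr = np.sum(signs[:, 1:] * signs[:, :-1] < 0, axis=1)   # strict sign flips, as in the loop above
    return energy, zcr
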
def chop(output_folder='chopped-words', audio_file='recording.wav'):
    global voiced, sig, lrms, min_signal

    (rate, sig) = wav.read(audio_file)

    frames = framesig(sig, winlen * sample_rate, winstep * sample_rate,
                      lambda x: np.ones((1, x)))

    lrms = [log_root_mean_square(x) for x in frames]
    min_signal = np.mean(lrms) / 2
    # min_signal = 0
    # print min_signal

    # dtype=int16 must for writing to wav file
    # result = np.array([], dtype=np.int16)

    for i in range(0, len(frames)):
        if classify(i):
            #result = np.append(result, sig[441*i : 441*(i+1)])
            voiced.append(True)
        else:
            voiced.append(False)

    started = False
    start_index = 0
    start_cnt = 0
    stop_index = 0
    stop_cnt = 0
    words = 0

    print ">> START_FRAME END_FRAME LENGTH_IN_SECONDS"

    for i in range(0, len(frames)):
        if not started:
            if voiced[i]:
                start_cnt += 1
            else:
                start_cnt = 0
            if start_cnt == start_speech:
                started = True
                start_index = i - start_speech + 1 - speech_leader
                start_index = max(0, start_index)
        else:
            if voiced[i]:
                stop_cnt = 0
            else:
                stop_cnt += 1
            if stop_cnt == end_silence:
                stop_index = i - end_silence + 1 + speech_trailer
                stop_index = min(len(frames) - 1, stop_index)

                print ">>", start_index, stop_index, (stop_index -
                                                      start_index + 1) * 10
                wav.write(output_folder + "/word" + str(words) + ".wav", rate,
                          sig[441 * start_index:441 * (stop_index + 1)])
                #wav.write("word" + str(words) + ".wav",rate, get_signal(start_index , stop_index))
                words += 1

                started = False
                start_index = start_cnt = 0
                stop_index = stop_cnt = 0

    if started:
        stop_index = len(frames) - 1
        print ">>", start_index, stop_index, (stop_index - start_index +
                                              1) * 10
        wav.write(output_folder + "/word" + str(words) + ".wav", rate,
                  sig[441 * start_index:441 * (stop_index + 1)])
        #wav.write("word" + str(words) + ".wav",rate, get_signal(start_index , stop_index))
        words += 1

    return words
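
classify() and log_root_mean_square() are defined elsewhere; a hedged guess at the frame-level voicing test they imply, based on the min_signal = mean(lrms) / 2 threshold used above (names and logic are assumptions, not the author's code):

import numpy as np

def log_root_mean_square(frame):
    frame = frame.astype(float)
    return np.log(np.sqrt(np.mean(frame ** 2)) + 1e-12)      # small offset guards against log(0)

def classify_frame(lrms_value, min_signal):
    # a frame counts as voiced when its log-RMS exceeds the threshold
    return lrms_value > min_signal
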
def logFilterbankFeatures(signal,
                          samplerate=16000,
                          winlen=0.0255,
                          winstep=0.01,
                          nfilt=40,
                          nfft=512,
                          lowfreq=133.3333,
                          highfreq=6855.4976,
                          preemph=0.97,
                          winSzForDelta=2):
    '''
    Computes log filterbank energies on a mel scale plus total energy,
    using code adapted from features.fbank, which does not accept a
    window function as a parameter.
    The original function comes from the package 'python_speech_features', see
    http://python-speech-features.readthedocs.org/en/latest/ or
    https://github.com/jameslyons/python_speech_features

    It calculates the FFT of the signal and sums the weighted bins,
    distributed on a mel scale. Weighting is done with triangular filters.
    For these filter energies plus total energy, deltas are calculated.
    
    :parameters:
        - signal : np.ndarray, dtype=float
            input vector of the speech signal
        - samplerate : int
        - winlen: float
            length of analysis window in seconds
        - winstep: float
            step size between successive windows in seconds
        - nfilt: int
             number of filter energies to compute (total energy not included).
             e.g. 40 --> Output dim = (40+1)*3
        - nfft: int
            FFT size
        - lowfreq: int
            lower end on mel frequency scale, on which filter banks are distributed
        - highfreq: int
            upper end on mel frequency scale, on which filter banks are distributed
        - preemph: float
            pre-emphasis coefficient
        - winSzForDelta: int
            window size for computing deltas. E.g. 2 --> t-2, t-1, t+1 and t+2 are
            for calculating the deltas
    :returns:
        - features: numpy.array: float
            feature-matrix. 1st dimension: time steps of 'winstep',
            2nd dim: feature dimension: (nfilt + 1)*3,
            +1 for energy, *3 because of deltas

    '''
    # Part of the following code is copied from function features.fbank
    # Unfortunately, one can't specify the window function in features.fbank
    # Hamming window is used here

    highfreq = highfreq or samplerate / 2
    signal = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(signal,
                              winlen * samplerate,
                              winstep * samplerate,
                              winfunc=hamming)
    pspec = sigproc.powspec(frames, nfft)
    energy = np.sum(pspec, 1)  # this stores the total energy in each frame
    energy = np.where(energy == 0,
                      np.finfo(float).eps,
                      energy)  # if energy is zero, we get problems with log
    fb = features.get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    feat = np.dot(pspec, fb.T)  # compute the filterbank energies
    feat = np.where(feat == 0,
                    np.finfo(float).eps,
                    feat)  # if feat is zero, we get problems with log

    # Use log feature bank and log energy
    feat = np.column_stack((np.log(energy), np.log(feat)))
    # calculate delta and acceleration
    deltaFeat = delta(feat, winSzForDelta)
    accFeat = delta(deltaFeat, winSzForDelta)
    # stack features + delta + acceleration
    return np.concatenate((feat, deltaFeat, accFeat), axis=1)
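
A hedged sketch of the same stacking built from the packaged helpers (fbank and delta); note the packaged fbank uses its default window rather than the Hamming window above, and the output width is (nfilt + 1) * 3 as stated in the docstring:

import numpy as np
from python_speech_features import fbank, delta

sig = np.random.randn(16000)                 # illustrative one-second signal at 16 kHz

feat, energy = fbank(sig, samplerate=16000, winlen=0.0255, nfilt=40, nfft=512,
                     lowfreq=133.3333, highfreq=6855.4976, preemph=0.97)
static = np.column_stack((np.log(energy), np.log(feat)))     # 41 columns: log energy + 40 log filterbank energies
d1 = delta(static, 2)                                        # deltas over a +/-2 frame window
d2 = delta(d1, 2)                                            # acceleration
stacked = np.concatenate((static, d1, d2), axis=1)
print(stacked.shape)                         # (number_of_frames, 123) == (40 + 1) * 3
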
def get_words(audio):
    (rate,sig) = wav.read(audio)
    frames = framesig(sig, winlen*sample_rate, winstep*sample_rate,lambda x:numpy.ones((1,x)))
    word_list = []
    
    #calculate energy per frame and zcr per frame
    frame_energy = []
    zcr = []
    for i in range(0,len(frames)):
        energy = sum(1.0*x*x for x in frames[i])
        zc = 0
        for j in range(1,len(frames[i])):
            if (frames[i][j]<0 and frames[i][j-1]>0) or (frames[i][j]>0 and frames[i][j-1]<0):
                zc = zc + 1
        frame_energy.append(energy)
        zcr.append(zc)
    
    #calculate final noise value
    avg_energy = sum(1.0*x for x in frame_energy[0:10])
    avg_energy = avg_energy/10
    avg_zcr = sum(1.0*x for x in zcr[0:10])
    avg_zcr = avg_zcr/10
    
    #calculate threshold
    upper_energy_threshold = 2*avg_energy
    upper_zcr_threshold = 2*avg_zcr
    lower_energy_threshold = 0.75*avg_energy
    lower_zcr_threshold = 0.75*avg_zcr

    print upper_energy_threshold
    print upper_zcr_threshold
    print lower_energy_threshold
    print lower_zcr_threshold
    
    started = False
    start_index = 0
    start_cnt = 0
    stop_index = 0
    stop_cnt = 0
    words = 0

    print ">> START_FRAME END_FRAME LENGTH_IN_SECONDS"

    for i in range(0,len(frames[10:])):
        if not started:
            if frame_energy[i]>upper_energy_threshold or zcr[i]>upper_zcr_threshold:
                start_cnt += 1
            else:
                start_cnt = 0
            if start_cnt == start_speech:
                started = True
                start_index = i - start_speech + 1 - speech_leader
                start_index = max(0, start_index)
        else:
            if frame_energy[i]>upper_energy_threshold or zcr[i]>upper_zcr_threshold:
                stop_cnt = 0
            else:
                stop_cnt += 1
            if stop_cnt ==  end_silence:
                stop_index = i - end_silence + 1 + speech_trailer
                stop_index = min(len(frames)-1, stop_index)

                print ">>", start_index, stop_index, (stop_index - start_index + 1 ) * 10
                wav.write("word" + str(words) + ".wav",rate,sig[441*start_index:441*(stop_index+1)])
                words += 1
                
                started = False
                start_index = start_cnt = 0
                stop_index = stop_cnt = 0
        
    if started:
        stop_index = len(frames)-1
        print ">>", start_index, stop_index, (stop_index - start_index + 1 ) * 10
        wav.write("word" + str(words) + ".wav",rate,sig[441*start_index:441*(stop_index+1)])
        words+=1

    return words
def chop(output_folder = 'chopped-words', audio_file = 'recording.wav'):
    global voiced, sig, lrms, min_signal

    (rate,sig) = wav.read(audio_file)

    frames = framesig(sig, winlen*sample_rate, winstep*sample_rate,lambda x:np.ones((1,x)))

    lrms = [log_root_mean_square(x) for x in frames]
    min_signal = np.mean(lrms) / 2
    # min_signal = 0
    # print min_signal

    # dtype=int16 must for writing to wav file
    # result = np.array([], dtype=np.int16)

    for i in range(0,len(frames)):
        if classify(i):
            #result = np.append(result, sig[441*i : 441*(i+1)])
            voiced.append(True)
        else:
            voiced.append(False)

    started = False
    start_index = 0
    start_cnt = 0
    stop_index = 0
    stop_cnt = 0
    words = 0

    print ">> START_FRAME END_FRAME LENGTH_IN_SECONDS"

    for i in range(0,len(frames)):
        if not started:
            if voiced[i]:
                start_cnt += 1
            else:
                start_cnt = 0
            if start_cnt == start_speech:
                started = True
                start_index = i - start_speech + 1 - speech_leader
                start_index = max(0, start_index)
        else:
            if voiced[i]:
                stop_cnt = 0
            else:
                stop_cnt += 1
            if stop_cnt ==  end_silence:
                stop_index = i - end_silence + 1 + speech_trailer
                stop_index = min(len(frames)-1, stop_index)

                print ">>", start_index, stop_index, (stop_index - start_index + 1 ) * 10
                wav.write(output_folder + "/word" + str(words) + ".wav",rate, sig[441*start_index : 441*(stop_index+1)])
                #wav.write("word" + str(words) + ".wav",rate, get_signal(start_index , stop_index))
                words += 1
                
                started = False
                start_index = start_cnt = 0
                stop_index = stop_cnt = 0
        
    if started:
        stop_index = len(frames)-1
        print ">>", start_index, stop_index, (stop_index - start_index + 1 ) * 10
        wav.write(output_folder + "/word" + str(words) + ".wav",rate, sig[441*start_index : 441*(stop_index+1)])
        #wav.write("word" + str(words) + ".wav",rate, get_signal(start_index , stop_index))
        words += 1

    return words
Example #16
# Start segment-level feature extraction
start = time.time()
window = 2
allUt = []
allLabels = []
utteranceByFeat = []
labelsByFeat = []
portionSelection = 1.0
for utterance in sessionTrain:
    features = []
    labels = []
    if np.random.uniform(0, 1) < portionSelection:
        label = utterance[1]
        channel1, channel2 = zip(*utterance[0])
        samplesSize = 1600
        samplesChn1 = sigproc.framesig(channel1, samplesSize, np.ceil(samplesSize * 0.1))
        samplesChn2 = sigproc.framesig(channel2, samplesSize, np.ceil(samplesSize * 0.1))
        allFeaturesVector = []
        for i in range(0, len(samplesChn1)):
            sampleLeft = samplesChn1[i]
            sampleRight = samplesChn2[i]
            currentFeaturesLeft = calcFeaturesVector(sampleLeft, 16000)
            currentFeaturesRight = calcFeaturesVector(sampleRight, 16000)
            allFeaturesVector.append(currentFeaturesLeft + currentFeaturesRight)
            bound = 2 * window + 1
            if i >= (bound):
                features.append(np.concatenate(allFeaturesVector[(i - bound) : i], axis=0))
                allUt.append(np.concatenate(allFeaturesVector[(i - bound) : i], axis=0))
                labels.append(encodeLabels(label))
                allLabels.append(encodeLabels(label))
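
A small hedged sketch of the context stacking done in the loop above: each segment-level vector is the concatenation of bound = 2 * window + 1 consecutive frame-level feature vectors (random vectors stand in for calcFeaturesVector output):

import numpy as np

window = 2
bound = 2 * window + 1                       # 5 consecutive frame vectors per segment
frame_feats = [np.random.rand(10) for _ in range(20)]        # stand-in per-frame features

segments = [np.concatenate(frame_feats[i - bound:i]) for i in range(bound, len(frame_feats))]
print(len(segments), segments[0].shape)      # 15 segments, each of length 50
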
def logFilterbankFeatures(signal,samplerate=16000,winlen=0.0255,winstep=0.01,
          nfilt=40,nfft=512,lowfreq=133.3333,highfreq=6855.4976,preemph=0.97,
          winSzForDelta=2):
    '''
    Computes log filterbank energies on a mel scale plus total energy,
    using code adapted from features.fbank, which does not accept a
    window function as a parameter.
    The original function comes from the package 'python_speech_features', see
    http://python-speech-features.readthedocs.org/en/latest/ or
    https://github.com/jameslyons/python_speech_features

    It calculates the FFT of the signal and sums the weighted bins,
    distributed on a mel scale. Weighting is done with triangular filters.
    For these filter energies plus total energy, deltas are calculated.
    
    :parameters:
        - signal : np.ndarray, dtype=float
            input vector of the speech signal
        - samplerate : int
        - winlen: float
            length of analysis window in seconds
        - winstep: float
            step size between successive windows in seconds
        - nfilt: int
             number of filter energies to compute (total energy not included).
             e.g. 40 --> Output dim = (40+1)*3
        - nfft: int
            FFT size
        - lowfreq: int
            lower end on mel frequency scale, on which filter banks are distributed
        - highfreq: int
            upper end on mel frequency scale, on which filter banks are distributed
        - preemph: float
            pre-emphasis coefficient
        - winSzForDelta: int
            window size for computing deltas. E.g. 2 --> t-2, t-1, t+1 and t+2 are
            for calculating the deltas
    :returns:
        - features: numpy.array: float
            feature-matrix. 1st dimension: time steps of 'winstep',
            2nd dim: feature dimension: (nfilt + 1)*3,
            +1 for energy, *3 because of deltas

    '''
    # Part of the following code is copied from function features.fbank
    # Unfortunately, one can't specify the window function in features.fbank
    # Hamming window is used here
    
    highfreq= highfreq or samplerate/2
    signal = sigproc.preemphasis(signal,preemph)
    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate,winfunc=hamming)
    pspec = sigproc.powspec(frames,nfft)
    energy = np.sum(pspec,1) # this stores the total energy in each frame
    energy = np.where(energy == 0,np.finfo(float).eps,energy) # if energy is zero, we get problems with log  
    fb = features.get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
    feat = np.dot(pspec,fb.T) # compute the filterbank energies
    feat = np.where(feat == 0,np.finfo(float).eps,feat) # if feat is zero, we get problems with log
    
    # Use log feature bank and log energy
    feat = np.column_stack((np.log(energy),np.log(feat)))
    # calculate delta and acceleration
    deltaFeat = delta(feat, winSzForDelta)
    accFeat = delta(deltaFeat, winSzForDelta)
    # stack features + delta + acceleration
    return np.concatenate((feat,deltaFeat,accFeat),axis=1)