Example #1
    def test_frame_sig(self):
        n = 10000124
        frame_len = 37
        frame_step = 13
        x = np.random.rand(n)
        t0 = time.time()
        y_old = sigproc.framesig(x, frame_len=frame_len, frame_step=frame_step, stride_trick=False)
        t1 = time.time()
        y_new = sigproc.framesig(x, frame_len=frame_len, frame_step=frame_step, stride_trick=True)
        t_new = time.time() - t1
        t_old = t1 - t0
        self.assertTupleEqual(y_old.shape, y_new.shape)
        np.testing.assert_array_equal(y_old, y_new)
        self.assertLess(t_new, t_old)
        print('new run time %3.2f < %3.2f sec' % (t_new, t_old))
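For reference, a minimal sketch of the stride-trick idea this test exercises, assuming a 1-D float signal: with stride_trick=True the framing step is expected to produce the frame matrix as a strided view of the signal instead of copying each frame, which is why it should be faster. The helper below is illustrative, not the library's actual implementation.

import numpy as np

def frame_with_strides(x, frame_len, frame_step):
    # Build a (num_frames, frame_len) view without copying the data.
    num_frames = 1 + (len(x) - frame_len) // frame_step
    shape = (num_frames, frame_len)
    strides = (frame_step * x.strides[0], x.strides[0])
    return np.lib.stride_tricks.as_strided(x, shape=shape, strides=strides)

x = np.arange(20, dtype=float)
frames = frame_with_strides(x, frame_len=5, frame_step=3)
assert np.array_equal(frames[1], x[3:8])  # each row starts frame_step samples after the previous one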
Example #2
def get_fft_spectrum(filename, buckets):
    signal = load_wav(filename, c.SAMPLE_RATE)
    signal *= 2**15

    # print(filename)
    print(buckets)

    while (len(signal) / (c.FRAME_STEP * c.SAMPLE_RATE) < 101):
        signal = np.append(signal, 0)

    # get FFT spectrum
    signal = remove_dc_and_dither(signal, c.SAMPLE_RATE)
    signal = sigproc.preemphasis(signal, coeff=c.PREEMPHASIS_ALPHA)
    frames = sigproc.framesig(signal,
                              frame_len=c.FRAME_LEN * c.SAMPLE_RATE,
                              frame_step=c.FRAME_STEP * c.SAMPLE_RATE,
                              winfunc=np.hamming)
    fft = abs(np.fft.fft(frames, n=c.NUM_FFT))
    # print(len(fft))
    fft_norm = normalize_frames(fft.T)
    # print(len(fft_norm.T))

    # truncate to max bucket sizes
    rsize = max(k for k in buckets if k <= len(fft_norm.T))
    # print(rsize)
    rstart = int((len(fft_norm.T) - rsize) / 2)
    # print(rstart)
    out = fft_norm[:, rstart:rstart + rsize]
    # print(len(out))

    return out
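The last few lines centre-crop the normalized spectrogram to the widest bucket that fits. A small worked example of that arithmetic (the bucket widths and frame count below are illustrative, not the original configuration):

buckets = [100, 200, 300, 400, 500]   # illustrative frame-width buckets
n_frames = 437                        # width of fft_norm (number of frames)

rsize = max(k for k in buckets if k <= n_frames)   # 400
rstart = (n_frames - rsize) // 2                    # 18: drop 18 frames at the front, 19 at the back
# out keeps columns 18..417, i.e. the centre 400 frames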
Example #3
    def ssc(signal,
            samplerate=16000,
            winlen=0.025,
            winstep=0.01,
            nfilt=26,
            nfft=512,
            lowfreq=0,
            highfreq=None,
            preemph=0.97,
            winfunc=lambda x: np.ones((x, ))):

        highfreq = highfreq or samplerate / 2
        signal = sigproc.preemphasis(signal, preemph)
        frames = sigproc.framesig(signal, winlen * samplerate,
                                  winstep * samplerate, winfunc)
        pspec = sigproc.powspec(frames, nfft)
        pspec = np.where(pspec == 0,
                         np.finfo(float).eps,
                         pspec)  # if things are all zeros we get problems

        fb = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
        feat = np.dot(pspec, fb.T)  # compute the filterbank energies
        R = np.tile(np.linspace(1, samplerate / 2, np.size(pspec, 1)),
                    (np.size(pspec, 0), 1))

        return np.dot(pspec * R, fb.T) / feat
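The final two lines compute, for every frame and filter, the power-weighted mean frequency seen through that filter. A toy single-frame, single-filter version of the same computation (values are purely illustrative):

import numpy as np

pspec_frame = np.array([0.0, 2.0, 4.0, 2.0, 0.0])   # toy power spectrum for one frame
freqs = np.linspace(1, 8000, pspec_frame.size)       # bin frequencies (the R matrix above), fs = 16 kHz
filt = np.array([0.0, 0.5, 1.0, 0.5, 0.0])           # one triangular mel filter (a row of fb)

ssc_value = np.dot(pspec_frame * freqs, filt) / np.dot(pspec_frame, filt)
# ssc_value is the energy-weighted mean frequency inside this filter (here 4000.5 Hz)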
Example #4
def read_and_process_audio(filename, buckets):
    signal = read_audio(filename, c.SAMPLE_RATE)

    # # Filter out non-speech frequencies
    # lowcut, highcut = c.FILTER_RANGE
    # signal = butter_bandpass_filter(signal, lowcut, highcut, c.SAMPLE_RATE, 1)

    # # Normalize signal
    # signal = normalize(signal)

    signal *= 2**15

    # Process signal to get FFT spectrum
    signal = rm_dc_n_dither(signal, c.SAMPLE_RATE)
    signal = sigproc.preemphasis(signal, coeff=c.PREEMPHASIS_ALPHA)
    frames = sigproc.framesig(signal,
                              frame_len=c.FRAME_LEN * c.SAMPLE_RATE,
                              frame_step=c.FRAME_STEP * c.SAMPLE_RATE,
                              winfunc=np.hamming)
    fft = abs(np.fft.fft(frames, n=c.NUM_FFT))
    fft_norm = normalize_frames(fft.T)

    # Truncate to middle MAX_SEC seconds
    rsize = max(k for k in buckets if k <= fft_norm.shape[1])
    rstart = int((fft_norm.shape[1] - rsize) / 2)
    out = fft_norm[:, rstart:rstart + rsize]

    return out
Example #5
def get_fft_spectrum(filename, buckets):
    signal = load_wav(filename, c.SAMPLE_RATE)
    signal *= 2**15

    # get FFT spectrum (applying hamming)
    signal = remove_dc_and_dither(signal, c.SAMPLE_RATE)
    signal = sigproc.preemphasis(signal, coeff=c.PREEMPHASIS_ALPHA)
    frames = sigproc.framesig(signal,
                              frame_len=c.FRAME_LEN * c.SAMPLE_RATE,
                              frame_step=c.FRAME_STEP * c.SAMPLE_RATE,
                              winfunc=np.hamming)
    fft = abs(np.fft.fft(frames, n=c.NUM_FFT))
    fft_norm = normalize_frames(
        fft.T
    )  #TODO may remove variance normalization to see other poorer results

    # truncate to max bucket sizes
    rsize = max(k for k in buckets if k <= fft_norm.shape[1])
    rstart = int((fft_norm.shape[1] - rsize) / 2)
    out = fft_norm[:, rstart:rstart + rsize]

    # print("fft_spectrum shape{}".format(out.shape))

    # if(out.shape[1] == c.MAX_SEC*100):
    # save_fft_spectrum(filename,out)

    return out
Example #6
def logspec(signal, samplerate, conf):
    '''
    Compute log magnitude spectrogram features from an audio signal.

    Args:
        signal: the audio signal from which to compute features. Should be an
            N*1 array
        samplerate: the samplerate of the signal we are working with.
        conf: feature configuration

    Returns:
        A numpy array of size (NUMFRAMES by numfreq) containing features. Each
        row holds 1 feature vector, a numpy vector containing the log magnitude
        spectrum of the corresponding frame
    '''
    signal = sigproc.preemphasis(signal, float(conf['preemph']))

    winfunc = _get_winfunc(conf['winfunc'])

    frames = sigproc.framesig(signal,
                              float(conf['winlen']) * samplerate,
                              float(conf['winstep']) * samplerate, winfunc)
    logspec = sigproc.logmagspec(frames, int(conf['nfft']))

    return logspec
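The _get_winfunc helper is not shown in these excerpts; a plausible, purely hypothetical sketch that maps the config string to a numpy window function might look like:

import numpy as np

def _get_winfunc(name):
    # Hypothetical mapping from the config string to a window function.
    if name == 'hamming':
        return np.hamming
    elif name == 'hanning':
        return np.hanning
    elif name == 'none':
        return lambda x: np.ones((x,))
    else:
        raise ValueError('unknown window function: %s' % name)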
Example #7
    def fbank(signal,
              samplerate=16000,
              winlen=0.025,
              winstep=0.01,
              nfilt=26,
              nfft=512,
              lowfreq=0,
              highfreq=None,
              preemph=0.97,
              winfunc=lambda x: np.ones((x, ))):

        highfreq = highfreq or samplerate / 2
        signal = sigproc.preemphasis(signal, preemph)
        frames = sigproc.framesig(signal, winlen * samplerate,
                                  winstep * samplerate, winfunc)
        pspec = sigproc.powspec(frames, nfft)
        energy = np.sum(pspec, 1)  # this stores the total energy in each frame
        energy = np.where(
            energy == 0,
            np.finfo(float).eps,
            energy)  # if energy is zero, we get problems with log

        fb = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
        feat = np.dot(pspec, fb.T)  # compute the filterbank energies
        feat = np.where(feat == 0,
                        np.finfo(float).eps,
                        feat)  # if feat is zero, we get problems with log

        return feat, energy
Example #8
def angspec(signal, samplerate, conf):
    """
    Compute angular spectrogram features from an audio signal.

    Args:
        signal: the audio signal from which to compute features. Should be an
            N*1 array
        samplerate: the samplerate of the signal we are working with.
        conf: feature configuration

    Returns:
        A numpy array of size (NUMFRAMES by numfreq) containing features. Each
        row holds 1 feature vector, a numpy vector containing the angular
        spectrum of the corresponding frame
    """

    raise NotImplementedError('Not yet implemented')
    signal = sigproc.preemphasis(signal, float(conf['preemph']))

    winfunc = _get_winfunc(conf['winfunc'])

    frames = sigproc.framesig(signal,
                              float(conf['winlen']) * samplerate,
                              float(conf['winstep']) * samplerate, winfunc)
    angspec = sigproc.angspec(frames, int(conf['nfft']))

    return angspec
Example #9
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
    """Compute Mel-filterbank energy features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)    
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. 
    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
        second return value is the energy in each frame (total energy, unwindowed)
    """          
    highfreq= highfreq or samplerate/2
    signal = sigproc.preemphasis(signal,preemph)
    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate)
    pspec = sigproc.powspec(frames,nfft)
    energy = numpy.sum(pspec,1) # this stores the total energy in each frame
    
    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
    feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
    return feat,energy
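A hedged usage sketch (sig stands for any mono 16 kHz signal as a 1-D numpy array; note that this variant does not clip zero energies, so completely silent frames would yield -inf under the log):

# sig: a mono 16 kHz signal as a 1-D numpy array (illustrative)
feat, energy = fbank(sig, samplerate=16000, winlen=0.025, winstep=0.01, nfilt=26, nfft=512)
log_fbank = numpy.log(feat)       # (NUMFRAMES, 26) log mel filterbank energies
log_energy = numpy.log(energy)    # per-frame log energy, often kept as an extra feature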
Example #10
def spec_sub(signal):
    NFFT = 1024
    frames = sigproc.framesig(signal, 256, 128)
    print(frames.shape)
    cspec = np.fft.fft(frames, NFFT)
    pspec = abs(cspec)
    print(pspec.shape)
    pspec *= pspec
    phase = np.angle(cspec)

    noise_est = np.mean(pspec[40:50])
    print(noise_est)
    clean_spec = pspec - noise_est
    print("1")
    #print (clean_spec)
    clean_spec[clean_spec < 0] = 0
    print("2")
    #print(clean_spec)
    clean_spec **= 0.5
    clean_spec = clean_spec * np.exp(1j * phase)  # reattach the noisy phase to the magnitude estimate
    print("3")
    #print(clean_spec)
    reconstructed_frames = np.fft.ifft(clean_spec, NFFT)
    reconstructed_frames = np.real(reconstructed_frames)
    print(reconstructed_frames.shape)
    reconstructed_frames = reconstructed_frames[
        0:reconstructed_frames.shape[0], 0:256]
    print(reconstructed_frames.shape)

    #print(reconstructed_frames.shape)
    #print(reconstructed_frames)
    enhanced_signal = sigproc.deframesig(reconstructed_frames, len(signal),
                                         256, 128)
    #print(enhanced_signal)
    return enhanced_signal
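A minimal usage sketch for the spectral-subtraction function above, assuming a mono 16-bit PCM WAV file (the file names are illustrative):

import numpy as np
from scipy.io import wavfile

rate, noisy = wavfile.read('noisy.wav')            # illustrative path
enhanced = spec_sub(noisy.astype(np.float64))
wavfile.write('enhanced.wav', rate, enhanced.astype(np.int16))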
Example #11
def extract(filtered_audio, Fs):
    nfft = 256
    num_bins = 40
    start_frequency = 150
    end_frequency = 3200
    c = filtered_audio.shape[0]
    features = []
    for i in range(c):
        framed_audio = sigproc.framesig(filtered_audio[i], 256, 128)
        features.append([])
        j = framed_audio.shape[0]

        # only extract features when there are enough frames; otherwise
        # req_frames would be undefined below
        if j > 10:
            req_frames = framed_audio[int(j / 2) - 5:int(j / 2) + 5]
            print(req_frames)
            for k in range(len(req_frames)):
                peak_amp, peak_freq = sp_peak_amp_freq.peakFreq(req_frames[k], 50)
                pitch_periods = sp_pitch_period.pitch_period(req_frames[k], Fs)
                form = formants.formant(req_frames[k])
                cep = LPCC.lpcc(req_frames[k])
                real_cc = RCC.rcc(req_frames[k])
                lsfs = lsf.LSF(req_frames[k])
                hjorth_parameters = hjorth.params(req_frames[k])
                wavelet = dwt.wenergy(req_frames[k], 'db7', 5)
                features[i].extend(lsfs)
                features[i].extend(hjorth_parameters)
                features[i].extend(wavelet)
    return features
Example #12
def RT_CNN():
    print("Loading model weights from [{}]....".format(c.WEIGHTS_FILE))
    model = vggvox_model()  # Creates a VGGVox model
    model.load_weights(
        c.WEIGHTS_FILE)  # Load the weights of the trained models
    model.summary()  # Print a summary of the loaded model

    print("Loading embeddings from enroll")
    toLoad = load("data/model/RTSP_CNN.out")
    enroll_embs = []
    speakers = []
    for spk, embs in toLoad.items():
        for e in embs:
            enroll_embs.append(e)
            speakers.append(spk)
        print(spk)

    count = 0
    buffer = AudioBuffer()

    start_time = time.time()
    while count < 3:
        count += 1
        buffer.record(chunk_size=c.SAMPLE_RATE)
        data = buffer.get_data()
        data = np.frombuffer(data, 'int16').astype(np.float64)  # copy to float: frombuffer is read-only and int16 would overflow when scaled
    buckets = build_buckets(c.MAX_SEC, c.BUCKET_STEP, c.FRAME_STEP)

    data *= 2**15

    while (len(data) / (c.FRAME_STEP * c.SAMPLE_RATE) < 101):
        data = np.append(data, 0)

    # get FFT spectrum
    data = remove_dc_and_dither(data, c.SAMPLE_RATE)
    data = sigproc.preemphasis(data, coeff=c.PREEMPHASIS_ALPHA)
    frames = sigproc.framesig(data,
                              frame_len=c.FRAME_LEN * c.SAMPLE_RATE,
                              frame_step=c.FRAME_STEP * c.SAMPLE_RATE,
                              winfunc=np.hamming)
    fft = abs(np.fft.fft(frames, n=c.NUM_FFT))
    fft_norm = normalize_frames(fft.T)

    # truncate to max bucket sizes
    rsize = max(k for k in buckets if k <= len(fft_norm.T))
    rstart = int((len(fft_norm.T) - rsize) / 2)
    x = fft_norm[:, rstart:rstart + rsize]

    test_embs = np.squeeze(model.predict(x.reshape(1, *x.shape, 1)))
    distances = []

    for embs in enroll_embs:
        distances.append(euclidean(test_embs, embs))

    print(len(speakers))

    idx = np.argmin(distances)

    print(speakers[idx])
    print("Ok, ", time.time() - start_time - 3, " seconds")
Example #13
def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
    """Compute Spectral Subband Centroid features from an audio signal.
    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
    """
    highfreq= highfreq or samplerate/2
    signal = sigproc.preemphasis(signal,preemph)
    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate)
    pspec = sigproc.powspec(frames,nfft)
    pspec = numpy.where(pspec == 0,numpy.finfo(float).eps,pspec) # if things are all zeros we get problems

    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
    feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
    R = numpy.tile(numpy.linspace(1,samplerate/2,numpy.size(pspec,1)),(numpy.size(pspec,0),1))

    return numpy.dot(pspec*R,fb.T) / feat
Example #14
def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
        nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
        winfunc=lambda x:np.ones((x,))):
    """Compute Spectral Subband Centroid features from an audio signal.
    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the sample rate of the signal we are working with, in Hz.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
    """
    highfreq= highfreq or samplerate/2
    signal = sigproc.preemphasis(signal,preemph)
    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, winfunc)
    pspec = sigproc.powspec(frames,nfft)
    pspec = np.where(pspec == 0,np.finfo(float).eps,pspec) # if things are all zeros we get problems

    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
    feat = np.dot(pspec,fb.T) # compute the filterbank energies
    R = np.tile(np.linspace(1,samplerate/2,np.size(pspec,1)),(np.size(pspec,0),1))

    return np.dot(pspec*R,fb.T) / feat
Example #15
def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
    """Compute Spectral Subband Centroid features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)    
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. 
    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. 
    """          
    highfreq= highfreq or samplerate/2
    signal = sigproc.preemphasis(signal,preemph)
    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate)
    pspec = sigproc.powspec(frames,nfft)
    
    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
    feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
    R = numpy.tile(numpy.linspace(1,samplerate/2,numpy.size(pspec,1)),(numpy.size(pspec,0),1))
    
    return numpy.dot(pspec*R,fb.T) / feat
Example #16
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
          winfunc=lambda x:np.ones((x,))):
    """Compute Mel-filterbank energy features from an audio signal.
    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the sample rate of the signal we are working with, in Hz.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
        second return value is the energy in each frame (total energy, unwindowed)
    """
    highfreq= highfreq or samplerate/2
    signal = sigproc.preemphasis(signal,preemph)
    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, winfunc)
    pspec = sigproc.powspec(frames,nfft)
    energy = np.sum(pspec,1) # this stores the total energy in each frame
    energy = np.where(energy == 0,np.finfo(float).eps,energy) # if energy is zero, we get problems with log

    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
    feat = np.dot(pspec,fb.T) # compute the filterbank energies
    feat = np.where(feat == 0,np.finfo(float).eps,feat) # if feat is zero, we get problems with log

    return feat,energy
Example #17
def get_fft_spectrum(filename, start, end):
    signal = load_wav(filename, c.SAMPLE_RATE)
    signal *= 2**15

    # get FFT spectrum
    signal = remove_dc_and_dither(signal, c.SAMPLE_RATE)  # digital filter: remove DC and dither components
    signal = sigproc.preemphasis(signal,
                                 coeff=c.PREEMPHASIS_ALPHA)  # pre-emphasize the input signal
    frames = sigproc.framesig(signal,
                              frame_len=c.FRAME_LEN * c.SAMPLE_RATE,
                              frame_step=c.FRAME_STEP * c.SAMPLE_RATE,
                              winfunc=np.hamming)  # split the signal into overlapping frames
    # print("===================")
    # print(frames.shape)
    # print("===================")
    # exit(0)
    spem = sigproc.logpowspec(frames, c.NUM_FFT)  # compute the log power spectrogram
    # print("===================")
    # print(spem)
    # print("===================")
    # print(spem.shape)
    # print("===================")
    # exit(0)

    spem_norm = normalize_frames(spem.T)  # subtract the mean, divide by the standard deviation

    length = spem_norm.shape[1]
    reserve_length = length - (length % 100)

    # out = fft_norm[:,0:reserve_length]    # test
    out = spem_norm[:, start:end]  # train

    return out
Example #18
def get_fft_spectrum(filename, buckets):
    # load the signal with librosa
    signal = load_wav(filename, c.SAMPLE_RATE)
    # scale the signal up to 16-bit integer range
    signal *= 2**15

    # get FFT spectrum

    # DC removal / dithering, then pre-emphasis
    signal = remove_dc_and_dither(signal, c.SAMPLE_RATE)
    signal = sigproc.preemphasis(signal, coeff=c.PREEMPHASIS_ALPHA)

    # build frames to pass in fft
    frames = sigproc.framesig(signal,
                              frame_len=c.FRAME_LEN * c.SAMPLE_RATE,
                              frame_step=c.FRAME_STEP * c.SAMPLE_RATE,
                              winfunc=np.hamming)

    # get FFT spectrum
    fft = abs(np.fft.fft(frames, n=c.NUM_FFT))

    # normalize each frame by mean and std
    fft_norm = normalize_frames(fft.T)
    # truncate to the largest bucket size that fits, centred in the spectrogram
    rsize = max(k for k in buckets if k <= fft_norm.shape[1])
    rstart = int((fft_norm.shape[1] - rsize) / 2)
    out = fft_norm[:, rstart:rstart + rsize]

    return out
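normalize_frames is not shown in any of these excerpts; based on the comment above ("normalize each frame by mean and std"), a plausible, hypothetical sketch is:

import numpy as np

def normalize_frames(m, epsilon=1e-12):
    # Per-row mean/variance normalization; epsilon guards against silent frames.
    return np.array([(v - np.mean(v)) / max(np.std(v), epsilon) for v in m])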
Example #19
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
    """Compute Mel-filterbank energy features from an audio signal.
    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
        second return value is the energy in each frame (total energy, unwindowed)
    """
    highfreq= highfreq or samplerate/2
    signal = sigproc.preemphasis(signal,preemph)
    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate)
    pspec = sigproc.powspec(frames,nfft)
    energy = numpy.sum(pspec,1) # this stores the total energy in each frame
    energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log

    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
    feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
    feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log

    return feat,energy
Example #20
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
    """Compute Mel-filterbank energy features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)    
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)    
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. 
    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
        second return value is the energy in each frame (total energy, unwindowed)
    """          
    highfreq= highfreq or samplerate/2
    print "preemph %s"%(preemph)
    signal = sigproc.preemphasis(signal,preemph)
    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate)
    matchframes(frames[0], frames[1])
    pspec = sigproc.powspec(frames,nfft)
    energy = pylab.sum(pspec,1) # this stores the total energy in each frame
    energy = pylab.where(energy == 0, pylab.finfo(float).eps, energy) # if energy is zero, we get problems with log
    fb = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    print "len(fb) %s"%(len(fb))
    colour = "k-"
    for i in range(len(fb)):
        if colour == "k-":
            colour = "r-"
        else:
            colour = "k-"
        startedplot = False
        midpoint = 0
        for j in range(len(fb[i])):
            if fb[i][j] > 0:
                if startedplot == False:
                    startedplot = j
                if j > 0:
                    pylab.plot([j-1, j], [fb[i][j-1], fb[i][j]], colour)
                    if fb[i][j] == 1.0:
                        midpoint = j
            else:
                if not startedplot == False:
                    pylab.plot([j-1, j], [fb[i][j-1], 0], colour)
                    try:
                        print("slope to midpoint %.3f, slope from midpoint %.3f" % (1.0/float(midpoint-startedplot), 1.0/float(midpoint-j+1)))
                    except ZeroDivisionError:
                        pass
                    break
    pylab.show()
    feat = pylab.dot(pspec, fb.T) # compute the filterbank energies
    feat = pylab.where(feat == 0, pylab.finfo(float).eps, feat) # if feat is zero, we get problems with log
    return feat, energy
Example #21
def vad(sig, rate, winlen, winstep):
    '''do voice activity detection

    args:
        sig: the input signal as a numpy array
        rate: the sampling rate
        winlen: the window length
        winstep: the window step

    Returns:
        a numpy array of indices containing speech frames
    '''

    #apply preemphasis
    sig = sigproc.preemphasis(sig, 0.97)

    #do windowing
    frames = sigproc.framesig(sig, winlen * rate, winstep * rate)

    #compute the squared frames and center them around zero mean
    sqframes = np.square(frames)
    sqframes = sqframes - sqframes.mean(1, keepdims=True)

    #compute the cross correlation between the frames and their square
    corr = np.array(list(map(partial(np.correlate, mode='same'), frames, sqframes)))

    #compute the mel power spectrum of the correlated signal
    corrfft = np.fft.rfft(corr, 512)
    fb = base.get_filterbanks(26, 512, rate, 0, rate / 2)
    E = np.absolute(np.square(corrfft).dot(fb.T))

    #do noise sniffing at the front and the back and select the lowest energy
    Efront = E[:20, :].mean(0)
    Eback = E[-20:, :].mean(0)
    if Efront.sum() < Eback.sum():
        Enoise = Efront
    else:
        Enoise = Eback

    #at every interval compute the mean ratio between the maximal energy in that
    #interval and the noise energy
    width = 12

    #apply max pooling to the energy
    Emax = maximum_filter(E, size=[width, 1], mode='constant')

    #compute the ratio between the smoothed energy and the noise energy
    ratio = np.log((Emax / Enoise).mean(axis=1))
    ratio = ratio / np.max(ratio)

    speechframes = np.where(ratio > 0.2)[0]

    return speechframes
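A hedged usage sketch of how the returned indices might be used (sig and rate stand for a loaded signal and its sample rate, which are assumptions here):

winlen, winstep = 0.025, 0.01
speech_idx = vad(sig, rate, winlen, winstep)

# frame the signal the same way vad does internally and keep only speech frames
frames = sigproc.framesig(sigproc.preemphasis(sig, 0.97), winlen * rate, winstep * rate)
speech_frames = frames[speech_idx]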
Example #22
def admission(x, samplerate=8000, frame_ms=25, overlap_ms=5):
    frame_len = int(samplerate / frame_ms)
    overlap = int(samplerate / overlap_ms)
    frames = sigutil.framesig(x, frame_len, overlap, signal.hanning)
    acpeaks = ac_peaks(frames)
    f0 = f0_acf(frames)
    rse = RSE_soundsense(x, samplerate, frame_ms)

    x_timebased = np.linspace(0, len(frames) * frame_len, len(frames))
    x_rse = np.linspace(0, len(frames) * frame_len, len(frames))

    return {"acpeaks": acpeaks, "f0": f0, "rse": rse}
Example #23
def admission(x, samplerate=8000, frame_ms=25, overlap_ms=5):
    frame_len = int(samplerate/frame_ms)
    overlap = int(samplerate/overlap_ms)
    frames = sigutil.framesig(x, frame_len, overlap, signal.hanning)
    acpeaks = ac_peaks(frames)
    f0 = f0_acf(frames)
    rse = RSE_soundsense(x, samplerate, frame_ms)

    x_timebased = np.linspace(0,len(frames)*frame_len, len(frames))
    x_rse = np.linspace(0,len(frames)*frame_len, len(frames))

    return {"acpeaks":acpeaks, "f0":f0, "rse":rse}
Example #24
def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97): 
    highfreq= highfreq or samplerate/2
    signal = sigproc.preemphasis(signal,preemph)
    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate)
    pspec = sigproc.powspec(frames,nfft)
    pspec = numpy.where(pspec == 0,numpy.finfo(float).eps,pspec) # if things are all zeros we get problems
    
    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
    feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
    R = numpy.tile(numpy.linspace(1,samplerate/2,numpy.size(pspec,1)),(numpy.size(pspec,0),1))
    
    return numpy.dot(pspec*R,fb.T) / feat
Example #25
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):       
    highfreq= highfreq or samplerate/2
    signal = sigproc.preemphasis(signal,preemph)
    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate)
    pspec = sigproc.powspec(frames,nfft)
    energy = numpy.sum(pspec,1) # this stores the total energy in each frame
    energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log
        
    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
    feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
    feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log
    
    return feat,energy
Example #26
def fbank(signal,
          samplerate=16000,
          winlen=0.025,
          winstep=0.01,
          nfilt=26,
          nfft=512,
          lowfreq=0,
          highfreq=None,
          preemph=0.95):
    """
    Compute Mel-filterbank energy features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)    
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)    
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.95. 
    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
        second return value is the energy in each frame (total energy, unwindowed)
    """
    highfreq = highfreq or samplerate / 2
    signal = sigproc.preemphasis(signal, preemph)
    # print type(signal[0])
    frames = sigproc.framesig(signal,
                              winlen * samplerate,
                              winstep * samplerate,
                              winfunc=hamming_window)
    powspec = sigproc.powspec(frames, nfft)
    # numpy.savetxt("result.txt", powspec, delimiter=",")
    energy = numpy.sum(powspec,
                       1)  # this stores the total energy in each frame
    energy = numpy.where(
        energy == 0,
        numpy.finfo(float).eps, energy
    )  # if energy is zero, we get problems with log, use numpy.finfo(float).eps to replace 0

    filterbanks = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    # print powspec.shape, filterbanks.shape
    feat = numpy.dot(powspec, filterbanks.T)  # compute the filterbank energies
    feat = numpy.where(feat == 0,
                       numpy.finfo(float).eps,
                       feat)  # if feat is zero, we get problems with logs
    # print feat.shape
    return feat, energy
Example #27
def get_fft_spectrum(signal):
    # padding zero
    n_sample = signal.shape[0]
    signal_len = int(c.DURA*c.SR)
    if n_sample < signal_len:
        signal = np.hstack((signal,np.zeros(signal_len-n_sample)))
    else:
        signal = signal[(n_sample-signal_len)//2:(n_sample+signal_len)//2]
    signal = np.array(signal)
    signal *= 2**15
    signal = remove_dc_and_dither(signal,c.SR)
    signal = sigproc.preemphasis(signal,coeff=c.PREEMPHASIS_ALPHA)
    frames = sigproc.framesig(signal,frame_len=c.FRAME_LEN*c.SR,frame_step=c.FRAME_STEP*c.SR,winfunc=np.hamming)
    fft = abs(np.fft.fft(frames,n= c.N_FFT))
    fft_norm = normalization_frames(fft.T)
    return fft_norm
Example #28
def fbankVTLP(signal,
              samplerate=16000,
              winlen=0.025,
              winstep=0.01,
              nfilt=26,
              nfft=512,
              lowfreq=0,
              highfreq=None,
              preemph=0.97,
              appendEnergy=False,
              alpha=1.0):
    """Compute Mel-filterbank energy features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
        second return value is the energy in each frame (total energy, unwindowed)
    """
    highfreq = highfreq or samplerate / 2
    signal = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(signal, winlen * samplerate,
                              winstep * samplerate)
    pspec = sigproc.powspec(frames, nfft)
    energy = numpy.sum(pspec, 1)  # this stores the total energy in each frame
    energy = numpy.where(energy == 0,
                         numpy.finfo(float).eps,
                         energy)  # if energy is zero, we get problems with log

    fb = get_filterbanksVTLP(nfilt, nfft, samplerate, lowfreq, highfreq, alpha)
    feat = numpy.dot(pspec, fb.T)  # compute the filterbank energies
    feat = numpy.where(feat == 0,
                       numpy.finfo(float).eps,
                       feat)  # if feat is zero, we get problems with log

    if appendEnergy:
        # append the log frame energy as an extra feature column
        feat = numpy.c_[feat, numpy.log(energy)]
    return feat, energy
Example #29
def get_fft_spectrum(signal):
    signal *= 2**15

    # get FFT spectrum
    signal = remove_dc_and_dither(signal, c.SAMPLE_RATE)
    signal = sigproc.preemphasis(signal, coeff=c.PREEMPHASIS_ALPHA)
    frames = sigproc.framesig(signal,
                              frame_len=c.FRAME_LEN * c.SAMPLE_RATE,
                              frame_step=c.FRAME_STEP * c.SAMPLE_RATE,
                              winfunc=np.hamming)
    fft = abs(np.fft.fft(frames, n=c.NUM_FFT))
    fft_norm = normalize_frames(fft.T)
    rsize = 500
    rstart = int((fft_norm.shape[1] - rsize) / 2)
    out = fft_norm[:, rstart:rstart + rsize]

    return out
Example #30
def fbank(signal, samplerate, conf):
    '''
    Compute fbank features from an audio signal.

    Args:
        signal: the audio signal from which to compute features. Should be an
            N*1 array
        samplerate: the samplerate of the signal we are working with.
        conf: feature configuration

    Returns:
        A numpy array of size (NUMFRAMES by nfilt) containing features, a numpy
        vector containing the signal energy
    '''

    highfreq = int(conf['highfreq'])
    if highfreq < 0:
        highfreq = samplerate/2

    signal = sigproc.preemphasis(signal, float(conf['preemph']))
    frames = sigproc.framesig(signal, float(conf['winlen'])*samplerate,
                              float(conf['winstep'])*samplerate)
    pspec = sigproc.powspec(frames, int(conf['nfft']))

    # this stores the total energy in each frame
    energy = numpy.sum(pspec, 1)

    # if energy is zero, we get problems with log
    energy = numpy.where(energy == 0, numpy.finfo(float).eps, energy)

    filterbank = get_filterbanks(int(conf['nfilt']), int(conf['nfft']),
                                 samplerate, int(conf['lowfreq']), highfreq)

    # compute the filterbank energies
    feat = numpy.dot(pspec, filterbank.T)

    # if feat is zero, we get problems with log
    feat = numpy.where(feat == 0, numpy.finfo(float).eps, feat)

    return feat, energy
Example #31
def get_fft_spectrum(filename, buckets):
	signal = load_wav(filename,c.SAMPLE_RATE)
	signal *= 2**15

	# get FFT spectrum
	signal = remove_dc_and_dither(signal, c.SAMPLE_RATE)
	signal = sigproc.preemphasis(signal, coeff=c.PREEMPHASIS_ALPHA)
	frames = sigproc.framesig(signal, frame_len=c.FRAME_LEN*c.SAMPLE_RATE, frame_step=c.FRAME_STEP*c.SAMPLE_RATE, winfunc=np.hamming)
	fft = abs(np.fft.fft(frames,n=c.NUM_FFT))
	fft_norm = normalize_frames(fft.T)

	# truncate to max bucket sizes
	rsize = max(k for k in buckets if k <= fft_norm.shape[1])
	rstart = int((fft_norm.shape[1]-rsize)/2)
	out = fft_norm[:,rstart:rstart+rsize]
	# print(out.shape)
	# exit(0)
	return out
Example #32
def fbank(signal, samplerate, conf):
    """
    Compute fbank features from an audio signal.

    Args:
        signal: the audio signal from which to compute features. Should be an
            N*1 array
        samplerate: the samplerate of the signal we are working with.
        conf: feature configuration

    Returns:
        A numpy array of size (NUMFRAMES by nfilt) containing features, a numpy
        vector containing the signal energy
    """

    raise NotImplementedError('Not yet implemented')
    highfreq = int(conf['highfreq'])
    if highfreq < 0:
        highfreq = samplerate / 2

    signal = sigproc.preemphasis(signal, float(conf['preemph']))
    frames = sigproc.framesig(signal,
                              float(conf['winlen']) * samplerate,
                              float(conf['winstep']) * samplerate)
    pspec = sigproc.powspec(frames, int(conf['nfft']))

    # this stores the total energy in each frame
    energy = numpy.sum(pspec, 1)

    # if energy is zero, we get problems with log
    energy = numpy.where(energy == 0, numpy.finfo(float).eps, energy)

    filterbank = get_filterbanks(int(conf['nfilt']), int(conf['nfft']),
                                 samplerate, int(conf['lowfreq']), highfreq)

    # compute the filterbank energies
    feat = numpy.dot(pspec, filterbank.T)

    # if feat is zero, we get problems with log
    feat = numpy.where(feat == 0, numpy.finfo(float).eps, feat)

    return feat, energy
Example #33
def get_fft_spectrum(filename, buckets):
    signal = load_wav(filename, 16000)
    signal *= 2**15

    # get FFT spectrum
    signal = remove_dc_and_dither(signal, 16000)
    signal = sigproc.preemphasis(signal, coeff=0.97)
    frames = sigproc.framesig(signal,
                              frame_len=0.025 * 16000,
                              frame_step=0.01 * 16000,
                              winfunc=np.hamming)
    fft = abs(np.fft.fft(frames, n=512))
    fft_norm = normalize_frames(fft.T)

    # truncate to max bucket sizes
    rsize = max(k for k in buckets if k <= fft_norm.shape[1])
    rstart = int((fft_norm.shape[1] - rsize) / 2)
    out = fft_norm[:, rstart:rstart + rsize]

    return out
Example #34
def ssc(signal, samplerate, conf):
    '''
    Compute ssc features from an audio signal.

    Args:
        signal: the audio signal from which to compute features. Should be an
            N*1 array
        samplerate: the samplerate of the signal we are working with.
        conf: feature configuration

    Returns:
        A numpy array of size (NUMFRAMES by nfilt) containing features, a numpy
        vector containing the signal log-energy
    '''

    highfreq = int(conf['highfreq'])
    if highfreq < 0:
        highfreq = samplerate / 2
    signal = sigproc.preemphasis(signal, float(conf['preemph']))
    frames = sigproc.framesig(signal,
                              float(conf['winlen']) * samplerate,
                              float(conf['winstep']) * samplerate)
    pspec = sigproc.powspec(frames, int(conf['nfft']))

    # this stores the total energy in each frame
    energy = numpy.sum(pspec, 1)

    # if energy is zero, we get problems with log
    energy = numpy.where(energy == 0, numpy.finfo(float).eps, energy)

    filterbank = get_filterbanks(int(conf['nfilt']), int(conf['nfft']),
                                 samplerate, int(conf['lowfreq']), highfreq)

    # compute the filterbank energies
    feat = numpy.dot(pspec, filterbank.T)
    tiles = numpy.tile(numpy.linspace(1, samplerate / 2, numpy.size(pspec, 1)),
                       (numpy.size(pspec, 0), 1))

    return numpy.dot(pspec * tiles, filterbank.T) / feat, numpy.log(energy)
Example #35
def frames(signal, samplerate, conf):
    """
    Compute frames from an audio signal.

    Args:
        signal: the audio signal from which to compute features. Should be an
            N*1 array
        samplerate: the samplerate of the signal we are working with.
        conf: feature configuration

    Returns:
        A numpy array of size (NUMFRAMES by winlen) containing features. Each
        row holds 1 feature vector
    """
    signal = sigproc.preemphasis(signal, float(conf['preemph']))

    winfunc = _get_winfunc(conf['winfunc'])

    frames = sigproc.framesig(signal,
                              float(conf['winlen']) * samplerate,
                              float(conf['winstep']) * samplerate, winfunc)

    return frames
glottal_flow = np.concatenate([raw_glottal_flow[segment['start']:segment['stop']]
                               for segment in voiced_segments if segment['is_speech']])

wav_samples = wav_samples / float(pow(2, 15))  # to float

assert len(glottal_flow) == len(wav_samples),\
    f"Inconsistent length: glottal flow ({len(glottal_flow):d}) / wav samples ({len(wav_samples):d})"

# Normalize
wav_samples = wav_samples / np.linalg.norm(wav_samples)
glottal_flow = glottal_flow / np.linalg.norm(glottal_flow)

# Frame
# sample_frames = framesig(wav_samples, 0.025 * fs, 0.01 * fs, winfunc=lambda x: np.ones((x,)), stride_trick=True)
# flow_frames = framesig(glottal_flow, 0.025 * fs, 0.01 * fs, winfunc=lambda x: np.ones((x,)), stride_trick=True)
sample_frames = framesig(wav_samples, 0.5 * fs, 0.5 * fs, winfunc=lambda x: np.ones((x,)), stride_trick=True)
flow_frames = framesig(glottal_flow, 0.5 * fs, 0.5 * fs, winfunc=lambda x: np.ones((x,)), stride_trick=True)

# Some constants
x0 = 0.1  # half glottal width at rest position, cm
tau = 1e-3  # time delay for surface wave to travel half glottal height T, 1 ms
eta = 1.  # nonlinear factor for energy dissipation at large amplitude
c = 5000  # air particle velocity, cm/s
d = 1.75  # length of vocal folds, cm
M = 0.5  # mass, g/cm^2
B = 100  # damping, dyne s/cm^3

# Initial conditions
alpha = 0.8  # if alpha > 0.5 * delta, stable-like oscillator
beta = 0.32
delta = 1.  # asymmetry parameter