def abs_preemph_fft(signal,
                    samplerate=16000,
                    winlen=0.08,
                    winstep=0.04,
                    nfft=2048,
                    preemph=0.97,
                    winfunc=lambda x: numpy.ones((x, ))):
    """Pre-emphasize the signal, frame it, and return the per-frame magnitude spectrum;
    a second preemphasis pass with coefficient 1 is then applied to the feature array."""
    signal = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(signal, winlen * samplerate,
                              winstep * samplerate, winfunc)
    feat = sigproc.magspec(frames, nfft)
    feat = sigproc.preemphasis(feat, 1)
    return feat
Example #2
def specdecomp(signal,
               samplerate=16000,
               winlen=0.025,
               winstep=0.01,
               nfft=512,
               lowfreq=0,
               highfreq=None,
               preemph=0.97,
               winfunc=lambda x: np.ones((x, )),
               decomp='complex'):
    """Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1). 
    :param frames: the array of frames. Each row is a frame.
    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. 
    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame.
    """

    highfreq = highfreq or samplerate / 2
    signal = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(signal, winlen * samplerate,
                              winstep * samplerate, winfunc)
    if decomp in ('time', 'frames'):
        return frames

    complex_spec = np.fft.rfft(frames, nfft)
    if decomp in ('magnitude', 'mag', 'abs'):
        return np.abs(complex_spec)
    elif decomp in ('phase', 'angle'):
        return np.angle(complex_spec)
    elif decomp in ('power', 'powspec'):
        return sigproc.powspec(frames, nfft)
    else:
        return complex_spec
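A minimal usage sketch for specdecomp above, assuming the np and sigproc imports these snippets rely on (numpy and python_speech_features.sigproc); the random array is a stand-in for real 16 kHz audio and is not from the original code:

# Hypothetical usage sketch: magnitude and phase decompositions of stand-in 16 kHz audio.
import numpy as np
from python_speech_features import sigproc

sig = np.random.randn(16000)                                # one second of stand-in audio
mag = specdecomp(sig, samplerate=16000, decomp='mag')       # magnitude spectrum
phase = specdecomp(sig, samplerate=16000, decomp='phase')   # phase spectrum
print(mag.shape, phase.shape)                               # both (num_frames, nfft // 2 + 1)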
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
          winfunc=lambda x:numpy.ones((x,))):
    """Compute Mel-filterbank energy features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)    
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :param winfunc: the analysis window to apply to each frame. By default no window is applied.
    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
        second return value is the energy in each frame (total energy, unwindowed)
    """          
    highfreq= highfreq or samplerate/2
    signal = sigproc.preemphasis(signal,preemph)
    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, winfunc)
    pspec = sigproc.powspec(frames,nfft)
    energy = numpy.sum(pspec,1) # this stores the total energy in each frame
    energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log
        
    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
    feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
    feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log
    
    return feat,energy
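A minimal usage sketch for the fbank function above, assuming python_speech_features and scipy are installed; 'speech.wav' is a placeholder path to a 16 kHz mono recording:

# Hypothetical usage sketch: 26 Mel-filterbank energies per frame from a WAV file.
import numpy
import scipy.io.wavfile as wav
from python_speech_features import fbank

rate, sig = wav.read("speech.wav")   # placeholder path
feat, energy = fbank(sig, samplerate=rate, nfilt=26, winfunc=numpy.hamming)
print(feat.shape)                    # (num_frames, 26)
print(energy.shape)                  # (num_frames,)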
Example #4
def ssc(signal,
        samplerate=16000,
        winlen=0.025,
        winstep=0.01,
        nfilt=55,
        nfft=2048,
        lowfreq=0,
        highfreq=None,
        preemph=0.97,
        winfunc=lambda x: numpy.ones((x, ))):
    """Compute Spectral Subband Centroid features from an audio signal.
    """
    highfreq = highfreq or samplerate / 2
    signal = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(signal, winlen * samplerate,
                              winstep * samplerate, winfunc)
    pspec = sigproc.powspec(frames, nfft)
    pspec = numpy.where(pspec == 0,
                        numpy.finfo(float).eps,
                        pspec)  # if things are all zeros we get problems

    fb = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    feat = numpy.dot(pspec, fb.T)  # compute the filterbank energies
    R = numpy.tile(numpy.linspace(1, samplerate / 2, numpy.size(pspec, 1)),
                   (numpy.size(pspec, 0), 1))

    return numpy.dot(pspec * R, fb.T) / feat
Example #5
def fbank(signal,
          samplerate=16000,
          winlen=0.025,
          winstep=0.01,
          nfilt=55,
          nfft=2048,
          lowfreq=0,
          highfreq=None,
          preemph=0.97,
          winfunc=lambda x: numpy.ones((x, ))):
    """Compute Mel-filterbank energy features from an audio signal.
    """
    highfreq = highfreq or samplerate / 2
    signal = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(signal, winlen * samplerate,
                              winstep * samplerate, winfunc)
    pspec = sigproc.powspec(frames, nfft)
    energy = numpy.sum(pspec, 1)  # this stores the total energy in each frame
    energy = numpy.where(energy == 0,
                         numpy.finfo(float).eps,
                         energy)  # if energy is zero, we get problems with log

    fb = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    feat = numpy.dot(pspec, fb.T)  # compute the filterbank energies
    feat = numpy.where(feat == 0,
                       numpy.finfo(float).eps,
                       feat)  # if feat is zero, we get problems with log

    return feat, energy
def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
        nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
        winfunc=lambda x:numpy.ones((x,))):
    """Compute Spectral Subband Centroid features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)    
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)    
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :param winfunc: the analysis window to apply to each frame. By default no window is applied.
    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. 
    """          
    highfreq= highfreq or samplerate/2
    signal = sigproc.preemphasis(signal,preemph)
    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, winfunc)
    pspec = sigproc.powspec(frames,nfft)
    pspec = numpy.where(pspec == 0,numpy.finfo(float).eps,pspec) # if things are all zeros we get problems
    
    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
    feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
    R = numpy.tile(numpy.linspace(1,samplerate/2,numpy.size(pspec,1)),(numpy.size(pspec,0),1))
    
    return numpy.dot(pspec*R,fb.T) / feat
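A minimal usage sketch for ssc, again assuming python_speech_features and a placeholder 16 kHz mono WAV file; the centroids come back in Hz, one per filterbank channel:

# Hypothetical usage sketch: spectral subband centroids per frame.
import numpy
import scipy.io.wavfile as wav
from python_speech_features import ssc

rate, sig = wav.read("speech.wav")   # placeholder path
centroids = ssc(sig, samplerate=rate, nfilt=26, winfunc=numpy.hamming)
print(centroids.shape)               # (num_frames, 26)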
Example #7
def cspec(signal,
          samplerate=16000,
          winlen=0.025,
          winstep=0.01,
          nfft=512,
          preemph=0.97,
          winfunc=lambda x: np.ones((x, ))):
    """Compute STFT coeeficients from an audio signal.
    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfft: the FFT size. Default is 512.
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :param winfunc: the analysis window to apply to each frame. By default no window is applied.
        You can use numpy window functions here e.g. winfunc=np.hamming
    :returns: A numpy array of shape (NUMFRAMES by nfft/2+1) containing the complex STFT
        coefficients. Each row holds the spectrum of one frame.
    """
    signal = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(signal, winlen * samplerate,
                              winstep * samplerate, winfunc)
    if np.shape(frames)[1] > nfft:
        logging.warning(
            'frame length (%d) is greater than FFT size (%d), frame will be truncated. '
            'Increase NFFT to avoid.',
            np.shape(frames)[1], nfft)

    return np.fft.rfft(frames, nfft)
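A minimal usage sketch for cspec above, assuming the np, sigproc and logging imports it relies on; random noise stands in for real 16 kHz audio:

# Hypothetical usage sketch: complex STFT coefficients of one second of stand-in audio.
import numpy as np

sig = np.random.randn(16000)
spec = cspec(sig, samplerate=16000, nfft=512, winfunc=np.hamming)
print(spec.shape, spec.dtype)   # (num_frames, 257), complex128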
Example #8
def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
        nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
        winfunc=lambda x:numpy.ones((x,))):
    """Compute Spectral Subband Centroid features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
    """
    highfreq= highfreq or samplerate/2
    signal = sigproc.preemphasis(signal,preemph)
    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, winfunc)
    pspec = sigproc.powspec(frames,nfft)
    pspec = numpy.where(pspec == 0,numpy.finfo(float).eps,pspec) # if things are all zeros we get problems

    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
    feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
    R = numpy.tile(numpy.linspace(1,samplerate/2,numpy.size(pspec,1)),(numpy.size(pspec,0),1))

    return numpy.dot(pspec*R,fb.T) / feat
Example #9
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
          winfunc=lambda x:numpy.ones((x,))):
    """Compute Mel-filterbank energy features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
        second return value is the energy in each frame (total energy, unwindowed)
    """
    highfreq= highfreq or samplerate/2
    signal = sigproc.preemphasis(signal,preemph)
    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, winfunc)
    pspec = sigproc.powspec(frames,nfft)
    energy = numpy.sum(pspec,1) # this stores the total energy in each frame
    energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log

    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
    feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
    feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log

    return feat,energy
def return_mfcc(signal):
    new_signal = sigproc.preemphasis(np.asarray(signal), coeff=0.95)
    mfcc_feat = mfcc(new_signal,
                     RATE,
                     winlen=WIN_LEN,
                     nfft=CHUNK_SAMPLES * 2,
                     winfunc=np.hamming)
    return mfcc_feat
Example #11
def load_frames(in_file, srate=16000):
    '''load frames from a single wav file
    @in_file: path to the input wav
    @return: raw frames with shape [num_frames, frame_len]
    '''
    signal, srate = librosa.load(in_file, sr=srate)
    pre_emphed = sigproc.preemphasis(signal, coeff=0.95)
    frames = sigproc.framesig(pre_emphed, 0.025 * srate, 0.01 * srate)
    return frames
Example #12
    def _powspec_cord(self, data, fs):
        kwargs = self._cording_params
        preemph = kwargs["preemph"]
        winlen = kwargs["winlen"]
        winstep = kwargs["winstep"]
        winfunc = kwargs["winfunc"]
        fft_size = int(winlen * fs)
        data = preemphasis(data, preemph)
        frames = framesig(data, winlen * fs, winstep * fs, winfunc)
        return powspec(frames, fft_size)
Example #13
def run():
    rate, signal = wav.read("../recordings/english.wav")
    signal = sigproc.preemphasis(signal)
    # filter_signal(signal)

    frame_size = 0.02  # second
    frame_step = 0.01  # second

    mfcc_feat = mfcc(signal, rate, winlen=frame_size, winstep=frame_step)

    n_components = 2
    n_components = get_min_n_components(mfcc_feat)
    print("No. of identified components: {0}".format(n_components))
    gmm = GMM(n_components=n_components, covariance_type='full')
    labels = gmm.fit(mfcc_feat).predict(mfcc_feat)

    plt.scatter(mfcc_feat[:, 0], mfcc_feat[:, 1], c=labels)
    plt.show()

    plt.plot(signal)
    plt.show()
Example #14
def Make_Spect(wav_path,
               windowsize,
               stride,
               window=np.hamming,
               bandpass=False,
               lowfreq=0,
               highfreq=0,
               preemph=0.97,
               duration=False,
               nfft=None,
               normalize=True):
    """
    read wav as float type. [-1.0 ,1.0]
    :param wav_path:
    :param windowsize:
    :param stride:
    :param window: default to np.hamming
    :return: spectrogram of shape (num_frames, windowsize * samplerate / 2 + 1),
        where num_frames is roughly len(wav) / (stride * samplerate).
    """

    # samplerate, samples = wavfile.read(wav_path)
    samples, samplerate = sf.read(wav_path, dtype='float32')

    if bandpass and highfreq > lowfreq:
        samples = butter_bandpass_filter(data=samples,
                                         cutoff=[lowfreq, highfreq],
                                         fs=samplerate)

    signal = sigproc.preemphasis(samples, preemph)
    frames = sigproc.framesig(signal,
                              windowsize * samplerate,
                              stride * samplerate,
                              winfunc=window)

    if nfft is None:
        nfft = int(windowsize * samplerate)

    pspec = sigproc.powspec(frames, nfft)
    pspec = np.where(pspec == 0, np.finfo(float).eps, pspec)
    # S = librosa.stft(samples, n_fft=int(windowsize * samplerate),
    #                  hop_length=int((windowsize-stride) * samplerate),
    #                  window=window(int(windowsize * samplerate)))  # short-time Fourier transform; parameter meanings are defined above
    # feature, _ = librosa.magphase(S)
    # feature = np.log1p(feature)  # log1p
    feature = np.log(pspec).astype(np.float32)
    # feature = feature.transpose()
    if normalize:
        feature = normalize_frames(feature)

    if duration:
        return feature, len(samples) / samplerate

    return feature
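A minimal usage sketch for Make_Spect above, assuming soundfile and python_speech_features are installed; 'speech.wav' is a placeholder 16 kHz recording, and normalize=False sidesteps the project-specific normalize_frames helper:

# Hypothetical usage sketch: log-power spectrogram with 25 ms windows and 10 ms stride.
import numpy as np

spect = Make_Spect('speech.wav', windowsize=0.025, stride=0.01,
                   window=np.hamming, normalize=False)
print(spect.shape)   # (num_frames, windowsize * samplerate / 2 + 1), e.g. 201 bins at 16 kHz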
def get_complex_spec(wav_, winstep, winlen, with_time_scaled=False):
    """Return complex spec
    """
    rate, sig = wav.read(wav_)

    sig = preemphasis(sig, PREEMPH)
    frames = framesig(sig, winlen * rate, winstep * rate, HAMMING_WINFUNC)
    complex_spec = np.fft.rfft(frames, NFFT)

    time_scaled_complex_spec = None
    if with_time_scaled:
        time_scaled_frames = np.arange(frames.shape[-1]) * frames
        time_scaled_complex_spec = np.fft.rfft(time_scaled_frames, NFFT)

    return complex_spec, time_scaled_complex_spec
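A minimal usage sketch for get_complex_spec above; the module-level constants it references are replaced by illustrative placeholders, and 'speech.wav' is a placeholder path. The time-scaled spectrum (each sample weighted by its index before the FFT) is the quantity commonly used for group-delay-style features:

# Hypothetical placeholder constants standing in for the module-level ones used above.
import numpy as np

PREEMPH = 0.97
NFFT = 512
HAMMING_WINFUNC = np.hamming

spec, t_spec = get_complex_spec('speech.wav', winstep=0.01, winlen=0.025,
                                with_time_scaled=True)
print(spec.shape, t_spec.shape)   # both (num_frames, NFFT // 2 + 1)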
def fft_sam(signal,
            samplerate=16000,
            winlen=0.08,
            winstep=0.04,
            nfft=2048,
            preemph=0.97,
            winfunc=lambda x: numpy.ones((x, ))):
    signal = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(signal, winlen * samplerate,
                              winstep * samplerate, winfunc)
    feat = numpy.float32(numpy.absolute(numpy.fft.fft(
        frames, nfft)))[:, 0:int(numpy.floor(nfft / 2)) + 1]
    for i in range(0, len(feat)):
        feat[i, 1:] = feat[i, 1:] - feat[i, :-1]
    return feat
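A minimal usage sketch for fft_sam above, assuming the numpy and sigproc imports these snippets rely on; the loop in fft_sam differences adjacent frequency bins within each frame, leaving bin 0 unchanged:

# Hypothetical usage sketch: frequency-differenced magnitude spectra of stand-in audio.
import numpy

sig = numpy.random.randn(16000)   # one second of stand-in 16 kHz audio
feat = fft_sam(sig, samplerate=16000, winlen=0.08, winstep=0.04, nfft=2048)
print(feat.shape)                 # (num_frames, nfft // 2 + 1) = (num_frames, 1025)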
Example #17
def job(input_name, output_name):
    audio, _ = librosa.load(input_name, mono=True, sr=samplerate)
    if len(audio) == 0:
        return False
    signal = sigproc.preemphasis(audio, 0.97)
    x = sigproc.framesig(signal, winlen, winstep, np.hanning)
    if len(x) == 0:
        return False
    x = sigproc.powspec(x, nfft)
    x = np.dot(x, banks)
    x = np.where(x == 0, np.finfo(float).eps, x)
    x = np.log(x).astype(dtype=np.float32)
    if np.isnan(np.sum(x)):
        return False
    np.save(output_name, x)
    return True
Example #18
def make_spectrogram():
    global RUNNING_SPECTOGRAM, FINISHED_SPECTOGRAM
    data = q.get()

    if len(RUNNING_SPECTOGRAM) < FEED_LENGTH:
        # pre-emphasize the signal to boost high frequencies
        signal = sigproc.preemphasis(data, coeff=0.95)
        # compute MFCC features on the pre-emphasized signal
        mfcc_feat = mfcc(signal, RATE, winlen=1 / (RATE / CHUNK_SAMPLES), nfft=CHUNK_SAMPLES * 2, winfunc=np.hamming)
        RUNNING_SPECTOGRAM = np.vstack([mfcc_feat, RUNNING_SPECTOGRAM])
        connection.send_spectogram(mfcc_feat, len(RUNNING_SPECTOGRAM))
        print(len(RUNNING_SPECTOGRAM))
    else:
        FINISHED_SPECTOGRAM = RUNNING_SPECTOGRAM
        RUNNING_SPECTOGRAM = np.empty([0, 13], dtype='int16')
        globals.EXAMPLE_READY = True
        globals.MIC_TRIGGER = False
Example #19
def get_complex_spec(wav_, winstep, winlen, with_time_scaled=False):
    """Return complex spec
    """
    sig, rate = librosa.load(wav_, sr=sr)
    #print(rate,sig)

    sig = preemphasis(sig, PREEMPH)
    frames = framesig(sig, winlen * rate, winstep * rate, HAMMING_WINFUNC)
    complex_spec = np.fft.rfft(frames, NFFT)

    time_scaled_complex_spec = None
    if with_time_scaled:
        time_scaled_frames = np.arange(frames.shape[-1]) * frames
        time_scaled_complex_spec = np.fft.rfft(time_scaled_frames, NFFT)

    if time_scaled_complex_spec is not None:
        print(complex_spec.shape, time_scaled_complex_spec.shape)
    return complex_spec, time_scaled_complex_spec
Example #20
def psf_compute_logpowspec(signal,
                           samplerate=16000,
                           winlen=0.025,
                           winstep=0.01,
                           nfilt=26,
                           nfft=512,
                           lowfreq=0,
                           highfreq=None,
                           preemph=0.97,
                           winfunc=lambda x: np.ones((x, ))):
    # Note: nfft must be at least the frame length (winlen * samplerate),
    # otherwise the frames are truncated and information is lost.

    # 1. Pre-emphasis
    signal = sigproc.preemphasis(signal, preemph)
    # e.g. signal.shape == (185876,), samplerate 16000

    # 2. Frame the signal
    # (sample_rate = 16000,  winlen=0.025,      winstep=0.01)
    # sample rate 16000, window 0.025 s (400 samples), step 0.01 s (160 samples)
    frames = sigproc.framesig(signal, winlen * samplerate,
                              winstep * samplerate, winfunc)
    # print(frames.shape)
    # (1161, 400)

    # With the default all-ones window, each frame is an exact copy of the
    # corresponding slice of the signal: no computation happens here, only framing.
    # With a 160-sample step, frames[10] starts at sample 160 * 10 = 1600.
    # print(signal[1600:1610])
    # print(frames[10,:10])

    # 3. Power spectrum
    # nfft: number of FFT points; each frame yields (nfft/2+1) magnitude values.
    # powspec computes 1.0 / NFFT * numpy.square(magspec(frames, NFFT)),
    # i.e. the squared magnitude spectrum normalized by NFFT.
    # pspec = sigproc.powspec(frames,nfft)

    # 3. Compute the log power spectrum directly;
    # sigproc.logpowspec uses sigproc.powspec internally.
    logpspec = sigproc.logpowspec(frames, nfft).T
    # energy = np.sum(pspec,1) # this stores the total energy in each frame
    # energy = np.where(energy == 0,np.finfo(float).eps,energy) # if energy is zero, we get problems with log
    return logpspec
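A minimal usage sketch for psf_compute_logpowspec above, assuming python_speech_features and scipy; 'speech.wav' is a placeholder 16 kHz mono file, and the result is transposed to (frequency bins, frames):

# Hypothetical usage sketch: log power spectrogram, transposed to (freq_bins, num_frames).
import scipy.io.wavfile as wav

rate, sig = wav.read("speech.wav")   # placeholder path
logpspec = psf_compute_logpowspec(sig, samplerate=rate)
print(logpspec.shape)                # (nfft // 2 + 1, num_frames) = (257, num_frames)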
Example #21
    def _vad_wav(self, wav_data):

        # Band-pass filtering
        frequence = 1600
        HZ_L = 20
        HZ_H = 600
        l_ = 2 * HZ_L / frequence
        h_ = 2 * HZ_H / frequence
        b, a = signal.butter(8, [l_, h_], 'bandpass')
        wav_data_f = signal.filtfilt(b, a,
                                     wav_data).astype(np.int16)  # wav_data is the signal to be filtered

        vad_wav = np.asarray([0], dtype=np.int16)
        status_wav = np.asarray([0], dtype=np.int16)
        wav_window = self.vad._load_wav(wav_data_f)
        max_en = wav_data.max()
        for wav_one in wav_window:
            # 0 = silence, 1 = possible speech onset, 2 = speech segment, 3 = end of speech
            status = self.vad.speech_status(wav_one, max_en)
            status_data = np.ones_like(wav_one) * status * 3000
            status_wav = np.concatenate((status_wav, status_data))
            if status != 0:
                vad_wav = np.concatenate((vad_wav, wav_one))

        if self.show_wav:
            x1 = np.linspace(0, len(wav_data) - 1, len(wav_data))
            x2 = np.linspace(0, len(vad_wav) - 1, len(vad_wav))
            plt.subplot(211)
            plt.plot(x1, wav_data)
            plt.plot(x1, status_wav[1:])
            plt.subplot(212)
            plt.plot(x2, vad_wav)
            plt.show()
        if self.save_vad_file:
            self._save_wav_file(wav_data)
            self._save_wav_file(wav_data_f)
            self._save_wav_file(vad_wav)
        # Pre-emphasis.
        vad_wav = sigproc.preemphasis(vad_wav, coeff=0.9).astype(np.int16)

        return vad_wav
Example #22
def compute_mfcc(signal,
                 sr=16000,
                 winlen=0.032,
                 winstep=0.01,
                 numcep=13,
                 nfilt=26,
                 nfft=512,
                 lowfreq=0,
                 highfreq=None,
                 preemph=0.97,
                 ceplifter=22,
                 appendEnergy=True,
                 window=np.hamming):

    signal = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(signal,
                              winlen * sr,
                              winstep * sr,
                              winfunc=window)
    magspec = np.absolute(np.fft.rfft(frames, nfft))
    powspec = 1.0 / nfft * np.square(magspec)
    energy = np.sum(powspec, 1)  # this stores the total energy in each frame
    energy = np.where(energy == 0,
                      np.finfo(float).eps,
                      energy)  # if energy is zero, we get problems with log
    fb = base.get_filterbanks(nfilt, nfft, sr, lowfreq, highfreq)
    feat = np.dot(powspec, fb.T)  # compute the filterbank energies
    feat = np.where(feat == 0,
                    np.finfo(float).eps,
                    feat)  # if feat is zero, we get problems with log
    feat = np.log10(feat)
    feat = dct(feat, type=2, axis=1, norm='ortho')[:, :numcep]
    feat = base.lifter(feat, ceplifter)
    if appendEnergy:
        feat[:, 0] = np.log10(
            energy
        )  # replace first cepstral coefficient with log of frame energy

    return feat
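A minimal usage sketch for compute_mfcc above, assuming the np, sigproc, base and dct imports the snippet relies on (numpy, python_speech_features and scipy.fftpack.dct); random noise stands in for real 16 kHz audio:

# Hypothetical usage sketch: 13 MFCCs per frame from one second of stand-in audio.
import numpy as np

sig = np.random.randn(16000)
mfcc_feat = compute_mfcc(sig, sr=16000, numcep=13)
print(mfcc_feat.shape)   # (num_frames, 13)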
Example #23
def generate(audiopath, binsize=320, numcontext=0, istxt=False):
    if istxt:
        samples = np.loadtxt(audiopath, dtype=np.int16, delimiter=',')
        print(samples.shape)
        samplerate = 16000
    else:
        samplerate, samples = wav.read(audiopath)
        if len(samples.shape) == 2:
            samples = samples[:, 0]

    #normalized db
    samples = data_rep.normalize_to_db(samples, 0)
    samples = sigproc.preemphasis(samples, 0.97)

    s = stft(samples, binsize)
    sshow, freq = logscale_spec(s, factor=20.0, sr=samplerate)
    ims = 20. * np.log10(np.abs(sshow) / 10e-6)  # amplitude to decibel

    _, numcep = ims.shape
    # For each time slice of the training set, we need to copy the context this makes
    # the numcep dimensions vector into a numcep + 2*numcep*numcontext dimensions
    # because of:
    #  - numcep dimensions for the current mfcc feature set
    #  - numcontext*numcep dimensions for each of the past and future (x2) mfcc feature set
    # => so numcep + 2*numcontext*numcep
    train_inputs = np.array([], np.float32)
    train_inputs.resize((ims.shape[0], numcep + 2 * numcep * numcontext))

    # Prepare pre-fix post fix context
    empty_spectrogram = np.array([])
    empty_spectrogram.resize((numcep))

    # Prepare train_inputs with past and future contexts
    time_slices = range(train_inputs.shape[0])
    context_past_min = time_slices[0] + numcontext
    context_future_max = time_slices[-1] - numcontext
    for time_slice in time_slices:
        # Reminder: array[start:stop:step]
        # slices from indice |start| up to |stop| (not included), every |step|

        # Add empty context data of the correct size to the start and end

        # Pick up to numcontext time slices in the past, and complete with empty
        need_empty_past = max(0, (context_past_min - time_slice))
        empty_source_past = list(empty_spectrogram
                                 for empty_slots in range(need_empty_past))
        data_source_past = ims[max(0, time_slice - numcontext):time_slice]

        # Pick up to numcontext time slices in the future, and complete with empty
        need_empty_future = max(0, (time_slice - context_future_max))
        empty_source_future = list(empty_spectrogram
                                   for empty_slots in range(need_empty_future))
        data_source_future = ims[time_slice + 1:time_slice + numcontext + 1]

        if need_empty_past:
            past = np.concatenate((empty_source_past, data_source_past))
        else:
            past = data_source_past

        if need_empty_future:
            future = np.concatenate((data_source_future, empty_source_future))
        else:
            future = data_source_future

        past = np.reshape(past, numcontext * numcep)
        now = ims[time_slice]
        future = np.reshape(future, numcontext * numcep)

        train_inputs[time_slice] = np.concatenate((past, now, future))
    return train_inputs
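To make the context stacking in generate concrete, a small arithmetic sketch; the numbers are purely illustrative and not taken from the surrounding code:

# Hypothetical shape check: with numcep frequency bins and numcontext past/future slices,
# each training row holds numcep + 2 * numcep * numcontext values.
numcep, numcontext = 161, 9
row_width = numcep + 2 * numcep * numcontext
print(row_width)  # 161 + 2 * 161 * 9 = 3059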
Example #25
def audio_features(params, img_audio, audio_path, append_name, node_list):
    output_file = params['output_file']
    # create pytable atom for the features
    f_atom = tables.Float32Atom()
    count = 1
    # keep track of the nodes for which no features could be made, places
    # database contains some empty audio files
    invalid = []
    for node in node_list:
        print(f'processing file: {count}')
        count += 1
        # create a group for the desired feature type
        audio_node = output_file.create_group(node, params['feat'])
        # get the base name of the node this feature will be appended to
        base_name = node._v_name.split(append_name)[1]
        # get the caption file names corresponding to the image of this node
        caption_files = img_audio[base_name][1]

        for cap in caption_files:
            # remove extension from the caption filename
            base_capt = cap.split('.')[0]
            # remove folder path from file names (Places/coco database)
            if '/' in base_capt:
                base_capt = base_capt.split('/')[-1]
            if '-' in base_capt:
                base_capt = base_capt.replace('-', '_')
            # read audio samples
            try:
                input_data, fs = librosa.load(os.path.join(audio_path, cap),
                                              sr=None)
                # in the places database some of the audiofiles are empty
                if len(input_data) == 0:
                    break
            except:
                # try to repair broken files, some files had a wrong header.
                # In Places I found some that could not be fixed however
                try:
                    fix_wav(os.path.join(audio_path, cap))
                    #input_data = read(os.path.join(audio_path, cap))
                except:
                    # the loop will break, if no valid audio features could
                    # be made for this image, the entire node is deleted.
                    break
            # set the fft size to the power of two equal to or greater than
            # the window size.
            window_size = int(fs * params['t_window'])
            exp = 1
            while True:
                if np.power(2, exp) - window_size >= 0:
                    fft_size = np.power(2, exp)
                    break
                else:
                    exp += 1

###############################################################################
# create audio features
            if params['feat'] == 'raw':
                # calculate the needed frame shift, premphasize and frame
                # the signal
                frame_shift = int(fs * params['t_shift'])
                input = sigproc.preemphasis(input_data, coeff=params['alpha'])
                features = sigproc.framesig(input,
                                            frame_len=window_size,
                                            frame_step=frame_shift,
                                            winfunc=params['windowing'])

            elif params['feat'] == 'freq_spectrum':
                # calculate the needed frame shift, premphasize and frame
                # the signal
                frame_shift = int(fs * params['t_shift'])
                input = sigproc.preemphasis(input_data, coeff=params['alpha'])
                frames = sigproc.framesig(input,
                                          frame_len=window_size,
                                          frame_step=frame_shift,
                                          winfunc=params['windowing'])
                # create the power spectrum
                features = sigproc.powspec(frames, fft_size)

            elif params['feat'] == 'fbanks':
                # create mel filterbank features
                [features, energy] = base.fbank(input_data,
                                                samplerate=fs,
                                                winlen=params['t_window'],
                                                winstep=params['t_shift'],
                                                nfilt=params['nfilters'],
                                                nfft=fft_size,
                                                lowfreq=0,
                                                highfreq=None,
                                                preemph=params['alpha'],
                                                winfunc=params['windowing'])

            elif params['feat'] == 'mfcc':
                # create mfcc features
                features = base.mfcc(input_data,
                                     samplerate=fs,
                                     winlen=params['t_window'],
                                     winstep=params['t_shift'],
                                     numcep=params['ncep'],
                                     nfilt=params['nfilters'],
                                     nfft=fft_size,
                                     lowfreq=0,
                                     highfreq=None,
                                     preemph=params['alpha'],
                                     ceplifter=0,
                                     appendEnergy=params['use_energy'],
                                     winfunc=params['windowing'])

            # apply cepstral mean variance normalisation
            if params['normalise']:
                features = (features - features.mean(0)) / features.std(0)
            # optionally add the deltas and double deltas
            if params['use_deltas']:

                single_delta = base.delta(features, params['delta_n'])
                double_delta = base.delta(single_delta, params['delta_n'])
                features = np.concatenate(
                    [features, single_delta, double_delta], 1)
###############################################################################
# create new leaf node in the feature node for the current audio
# file
            feature_shape = np.shape(features)[1]
            f_table = output_file.create_earray(audio_node,
                                                append_name + base_capt,
                                                f_atom, (0, feature_shape),
                                                expectedrows=5000)

            # append new data to the tables
            f_table.append(features)
        if audio_node._f_list_nodes() == []:
            # keep track of all the invalid nodes for which no features could
            # be made
            invalid.append(node._v_name)
            # remove the top node including all other features if no captions
            # features could be created
            output_file.remove_node(node, recursive=True)
    print(invalid)
    print(f'There were {len(invalid)} files that could not be processed')