コード例 #1
0
 def test_frame_sig(self):
     """Check that ``framesig`` with ``stride_trick=True`` matches and beats the plain path.

     The same random signal is framed twice, once with ``stride_trick=False``
     and once with ``stride_trick=True``. The test asserts that both results
     have the same shape and contents, and that the second call took less
     wall-clock time than the first.
     """
     num_samples = 10000124
     win_size = 37
     hop_size = 13
     samples = np.random.rand(num_samples)

     started = time.time()
     baseline = sigproc.framesig(
         samples, frame_len=win_size, frame_step=hop_size, stride_trick=False)
     switched = time.time()
     strided = sigproc.framesig(
         samples, frame_len=win_size, frame_step=hop_size, stride_trick=True)
     finished = time.time()

     t_new = finished - switched
     t_old = switched - started

     self.assertTupleEqual(baseline.shape, strided.shape)
     np.testing.assert_array_equal(baseline, strided)
     self.assertLess(t_new, t_old)
     print('new run time %3.2f < %3.2f sec' % (t_new, t_old))
コード例 #2
0
def specdecomp(signal,
               samplerate=16000,
               winlen=0.025,
               winstep=0.01,
               nfft=512,
               lowfreq=0,
               highfreq=None,
               preemph=0.97,
               winfunc=lambda x: np.ones((x, )),
               decomp='complex'):
    """Frame a signal and return the spectral decomposition selected by ``decomp``.

    The signal is passed through ``sigproc.preemphasis`` and
    ``sigproc.framesig``; what is returned then depends on ``decomp``.

    :param signal: the audio signal to decompose.
    :param samplerate: sample rate of ``signal``; scales ``winlen``/``winstep`` to samples.
    :param winlen: analysis window length in seconds.
    :param winstep: step between successive windows in seconds.
    :param nfft: the FFT length handed to ``np.fft.rfft`` / ``sigproc.powspec``.
    :param lowfreq: unused; kept so existing callers keep working.
    :param highfreq: unused; kept so existing callers keep working.
    :param preemph: coefficient handed to ``sigproc.preemphasis``.
    :param winfunc: window function handed to ``sigproc.framesig``.
    :param decomp: which representation to return:

        * ``'time'`` / ``'frames'`` -- the framed signal itself
        * ``'magnitude'`` / ``'mag'`` / ``'abs'`` -- ``np.abs`` of the rfft
        * ``'phase'`` / ``'angle'`` -- ``np.angle`` of the rfft
        * ``'power'`` / ``'powspec'`` -- ``sigproc.powspec(frames, nfft)``
        * anything else (default ``'complex'``) -- the complex rfft
    :returns: the array selected by ``decomp``.
    """
    signal = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(signal, winlen * samplerate,
                              winstep * samplerate, winfunc)
    if decomp in ('time', 'frames'):
        return frames

    # The power branch computes its own spectrum inside sigproc.powspec, so
    # handle it before running the FFT below.
    if decomp in ('power', 'powspec'):
        return sigproc.powspec(frames, nfft)

    complex_spec = np.fft.rfft(frames, nfft)
    if decomp in ('magnitude', 'mag', 'abs'):
        return np.abs(complex_spec)
    if decomp in ('phase', 'angle'):
        return np.angle(complex_spec)
    return complex_spec
コード例 #3
0
ファイル: Features.py プロジェクト: sdwfrost/ECMLDeepAudio
def specspreadcent_xtr_func(sig, fs, args, winlen, winstep):
    """Spectral spread and centroid of windows -  based on pyAudioAnalysis.audioFeatureExtraction library
    [github.com/tyiannak/pyAudioAnalysis]

    :param sig: signal to analyse; framed with ``framesig``.
    :param fs: sample rate; scales ``winlen``/``winstep`` to samples and
        defines the frequency axis.
    :param args: unused by this function.
    :param winlen: window length in seconds.
    :param winstep: window step in seconds.
    :returns: ``np.array`` built from one ``[centroid, spread]`` pair per
        frame, both divided by ``fs / 2``.
    """
    chnkd_sig = framesig(sig, winlen * fs, winstep * fs)
    res = []
    for chnk in chnkd_sig:
        spec = get_win_fft(chnk, winlen, fs)
        # Frequency assigned to each spectrum bin (1-based, up to fs / 2).
        ind = (np.arange(1, len(spec) + 1)) * (fs / (2.0 * len(spec)))
        # Peak-normalised spectrum; the division already yields a new array.
        Xt = spec / spec.max()
        NUM = np.sum(ind * Xt)
        DEN = np.sum(Xt) + eps

        # Centroid:
        C = (NUM / DEN)

        # Spread:
        S = np.sqrt(np.sum(((ind - C)**2) * Xt) / DEN)

        # Normalize:
        C = C / (fs / 2.0)
        S = S / (fs / 2.0)

        res.append([C, S])
    return np.array(res)
コード例 #4
0
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
          winfunc=lambda x:numpy.ones((x,))):
    """Compute Mel-filterbank energies and the total energy of every frame.

    The signal is pre-emphasised, framed, turned into a power spectrum and
    projected onto the filterbank returned by ``get_filterbanks``. Exact
    zeros in either output are replaced by machine epsilon so a later log
    does not fail.

    :param signal: the audio signal from which to compute features.
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s.
    :param winstep: the step between successive windows in seconds. Default is 0.01s.
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of the filters, default 0.
    :param highfreq: highest band edge of the filters; a falsy value means samplerate/2.
    :param preemph: pre-emphasis coefficient. Default is 0.97.
    :param winfunc: the analysis window to apply to each frame. Default is all ones.
    :returns: 2 values: the filterbank energies (``power spectrum . fb.T``)
        and the per-frame sum of the power spectrum.
    """
    if not highfreq:
        highfreq = samplerate/2
    tiny = numpy.finfo(float).eps

    emphasized = sigproc.preemphasis(signal, preemph)
    framed = sigproc.framesig(emphasized, winlen*samplerate, winstep*samplerate, winfunc)
    power = sigproc.powspec(framed, nfft)

    # Total energy per frame.
    frame_energy = numpy.sum(power, 1)
    frame_energy = numpy.where(frame_energy == 0, tiny, frame_energy)

    # Energy per filter.
    banks = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    band_energy = numpy.dot(power, banks.T)
    band_energy = numpy.where(band_energy == 0, tiny, band_energy)

    return band_energy, frame_energy
コード例 #5
0
def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
        nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
        winfunc=lambda x:numpy.ones((x,))):
    """Compute Spectral Subband Centroid features from an audio signal.

    For every frame and filter, the result is the filterbank projection of
    the frequency-weighted power spectrum divided by the filterbank
    projection of the plain power spectrum.

    :param signal: the audio signal from which to compute features.
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s.
    :param winstep: the step between successive windows in seconds. Default is 0.01s.
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of the filters, default 0.
    :param highfreq: highest band edge of the filters; a falsy value means samplerate/2.
    :param preemph: pre-emphasis coefficient. Default is 0.97.
    :param winfunc: the analysis window to apply to each frame. Default is all ones.
    :returns: an array with one row per frame and one column per filter.
    """
    if not highfreq:
        highfreq = samplerate/2

    emphasized = sigproc.preemphasis(signal, preemph)
    framed = sigproc.framesig(emphasized, winlen*samplerate, winstep*samplerate, winfunc)
    power = sigproc.powspec(framed, nfft)
    # Exact zeros are replaced by machine epsilon to keep the division finite.
    power = numpy.where(power == 0, numpy.finfo(float).eps, power)

    banks = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    band_energy = numpy.dot(power, banks.T)

    n_frames = numpy.size(power, 0)
    n_bins = numpy.size(power, 1)
    # Frequency weight per bin (1 .. samplerate/2), repeated for every frame.
    bin_weights = numpy.linspace(1, samplerate/2, n_bins)
    weight_grid = numpy.tile(bin_weights, (n_frames, 1))

    return numpy.dot(power*weight_grid, banks.T) / band_energy
コード例 #6
0
 def compute_rmfcc(self, pad, save=False):
     """Compute per-file LPC analysis filters and padded MFCC sequences.

     For every entry ``f`` of ``self.wavpaths`` the audio at ``f[0]`` is
     loaded with ``librosa.load`` and cut into frames of 500 samples with a
     step of 250. For each frame an order-12 ``lpc.autocor`` filter is
     stored, and the first row of ``python_speech_features.mfcc`` of the
     frame is collected. The collected rows are transposed, padded with
     ``self.pad_sequence_into_array(..., maxlen=pad)`` and stored in
     ``self.rmfcc[f[1]]``; the filters go to ``self.af[f[1]]``.

     Finally ``self.af`` is pickled to ``analysis_filt.pkl``.

     :param pad: ``maxlen`` handed to ``self.pad_sequence_into_array``.
     :param save: currently unused -- the pickle file is always written.
     """
     self.af = {}
     self.rmfcc = {}
     for fno, f in enumerate(self.wavpaths):
         # Progress output every tenth file. Single-argument print(...)
         # behaves the same on Python 2 and Python 3.
         if fno % 10 == 0:
             print(fno)
         sig, sr = librosa.load(f[0])
         frames = sigproc.framesig(sig, frame_len=500, frame_step=250)
         af = []
         rmfcc = []
         for frame in frames:
             analysis_filt = lpc.autocor(frame, 12)
             af.append(analysis_filt)
             # NOTE(review): the residual is computed but never used; the
             # MFCC below is taken from the raw frame. Confirm whether the
             # MFCC was meant to be computed on the residual instead.
             residual = np.array(list(analysis_filt(frame)))
             temp = list(python_speech_features.mfcc(frame, sr, winlen=0.022))
             rmfcc.append(temp[0])
         rmfcc, _ = self.pad_sequence_into_array(np.array(rmfcc).transpose(), maxlen=pad)
         print(rmfcc.shape)
         self.rmfcc[f[1]] = rmfcc
         self.af[f[1]] = af
     with open('analysis_filt.pkl', 'wb') as out:
         pickle.dump(self.af, out)
コード例 #7
0
def extract_features_file(filename):
    """Extract feature vectors for file

    Frames the second channel of the file into windows of ``winlen_samp``
    samples with a step of ``winlen_samp`` and calls
    ``extract_features_window`` on every frame.

    :param filename: filename of the IRMAS dataset

    :return: the per-frame results stacked with ``np.vstack``. NOTE: with a
        single frame the result of ``extract_features_window`` is returned
        as-is, and with no frames ``None`` is returned.

    """

    # read wav file; element 1 of the result is the sample data, of which
    # only the channel at index 1 is used
    data = wav.read(filename)[1][:, 1]

    frames = framesig(data, winlen_samp, winlen_samp)

    # Collect first and stack once: stacking inside the loop would re-copy
    # the accumulated array on every iteration.
    per_frame = [extract_features_window(frame) for frame in frames]

    if not per_frame:
        return None
    if len(per_frame) == 1:
        return per_frame[0]
    return np.vstack(per_frame)
コード例 #8
0
ファイル: MFCC.py プロジェクト: suhail511/speaker-to-chat
def ssc(signal,
        samplerate=16000,
        winlen=0.025,
        winstep=0.01,
        nfilt=55,
        nfft=2048,
        lowfreq=0,
        highfreq=None,
        preemph=0.97,
        winfunc=lambda x: numpy.ones((x, ))):
    """Compute Spectral Subband Centroid features from an audio signal.

    Returns, per frame and filter, the filterbank projection of the
    frequency-weighted power spectrum divided by the filterbank projection
    of the unweighted power spectrum. A falsy ``highfreq`` is replaced by
    ``samplerate / 2``.
    """
    if not highfreq:
        highfreq = samplerate / 2

    emphasized = sigproc.preemphasis(signal, preemph)
    framed = sigproc.framesig(emphasized, winlen * samplerate,
                              winstep * samplerate, winfunc)
    power = sigproc.powspec(framed, nfft)
    # Exact zeros become machine epsilon so the final division stays finite.
    power = numpy.where(power == 0, numpy.finfo(float).eps, power)

    banks = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    band_energy = numpy.dot(power, banks.T)

    n_frames = numpy.size(power, 0)
    n_bins = numpy.size(power, 1)
    # One weight per spectrum bin (1 .. samplerate / 2), tiled over frames.
    bin_weights = numpy.linspace(1, samplerate / 2, n_bins)
    weight_grid = numpy.tile(bin_weights, (n_frames, 1))

    return numpy.dot(power * weight_grid, banks.T) / band_energy
コード例 #9
0
ファイル: MFCC.py プロジェクト: suhail511/speaker-to-chat
def fbank(signal,
          samplerate=16000,
          winlen=0.025,
          winstep=0.01,
          nfilt=55,
          nfft=2048,
          lowfreq=0,
          highfreq=None,
          preemph=0.97,
          winfunc=lambda x: numpy.ones((x, ))):
    """Compute Mel-filterbank energy features from an audio signal.

    Returns ``(feat, energy)``: the power spectrum projected onto the
    filterbank from ``get_filterbanks``, and the per-frame sum of the power
    spectrum. Exact zeros in either array are replaced by machine epsilon.
    A falsy ``highfreq`` is replaced by ``samplerate / 2``.
    """
    if not highfreq:
        highfreq = samplerate / 2
    tiny = numpy.finfo(float).eps

    emphasized = sigproc.preemphasis(signal, preemph)
    framed = sigproc.framesig(emphasized, winlen * samplerate,
                              winstep * samplerate, winfunc)
    power = sigproc.powspec(framed, nfft)

    # Total energy in each frame; zeros would break a later log.
    frame_energy = numpy.sum(power, 1)
    frame_energy = numpy.where(frame_energy == 0, tiny, frame_energy)

    # Energy per filter; zeros would break a later log.
    banks = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    band_energy = numpy.dot(power, banks.T)
    band_energy = numpy.where(band_energy == 0, tiny, band_energy)

    return band_energy, frame_energy
コード例 #10
0
ファイル: compute-stft-feats.py プロジェクト: weiwchu/espnet
def cspec(signal,
          samplerate=16000,
          winlen=0.025,
          winstep=0.01,
          nfft=512,
          preemph=0.97,
          winfunc=lambda x: np.ones((x, ))):
    """Compute STFT coefficients from an audio signal.
    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfft: the FFT size. Default is 512.
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :param winfunc: the analysis window to apply to each frame. By default no window is applied.
        You can use numpy window functions here e.g. winfunc=np.hamming
    :returns: the complex result of ``np.fft.rfft(frames, nfft)``, one row per frame.
        A warning is logged when the frame length exceeds ``nfft``.
    """
    signal = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(signal, winlen * samplerate,
                              winstep * samplerate, winfunc)
    frame_len = np.shape(frames)[1]
    if frame_len > nfft:
        # logging.warning replaces the deprecated logging.warn alias.
        logging.warning(
            'frame length (%d) is greater than FFT size (%d), frame will be truncated. '
            'Increase NFFT to avoid.',
            frame_len, nfft)

    return np.fft.rfft(frames, nfft)
コード例 #11
0
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
          winfunc=lambda x:numpy.ones((x,))):
    """Compute Mel-filterbank energies and the total energy of every frame.

    Pipeline: pre-emphasis, framing, power spectrum, projection onto the
    filterbank from ``get_filterbanks``. Exact zeros in both outputs are
    swapped for machine epsilon so a later log does not fail.

    :param signal: the audio signal from which to compute features.
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s.
    :param winstep: the step between successive windows in seconds. Default is 0.01s.
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of the filters, default 0.
    :param highfreq: highest band edge of the filters; a falsy value means samplerate/2.
    :param preemph: pre-emphasis coefficient. Default is 0.97.
    :param winfunc: the analysis window to apply to each frame, e.g. numpy.hamming. Default is all ones.
    :returns: 2 values: the filterbank energies and the per-frame sum of
        the power spectrum.
    """
    if not highfreq:
        highfreq = samplerate/2
    floor = numpy.finfo(float).eps

    power = sigproc.powspec(
        sigproc.framesig(sigproc.preemphasis(signal, preemph),
                         winlen*samplerate, winstep*samplerate, winfunc),
        nfft)

    totals = numpy.sum(power, 1)
    totals = numpy.where(totals == 0, floor, totals)

    filters = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    projected = numpy.dot(power, filters.T)
    projected = numpy.where(projected == 0, floor, projected)

    return projected, totals
コード例 #12
0
def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
        nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
        winfunc=lambda x:numpy.ones((x,))):
    """Compute Spectral Subband Centroid features from an audio signal.

    Each output value is the filterbank projection of the frequency-weighted
    power spectrum divided by the filterbank projection of the plain power
    spectrum.

    :param signal: the audio signal from which to compute features.
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s.
    :param winstep: the step between successive windows in seconds. Default is 0.01s.
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of the filters, default 0.
    :param highfreq: highest band edge of the filters; a falsy value means samplerate/2.
    :param preemph: pre-emphasis coefficient. Default is 0.97.
    :param winfunc: the analysis window to apply to each frame, e.g. numpy.hamming. Default is all ones.
    :returns: an array with one row per frame and one column per filter.
    """
    if not highfreq:
        highfreq = samplerate/2

    power = sigproc.powspec(
        sigproc.framesig(sigproc.preemphasis(signal, preemph),
                         winlen*samplerate, winstep*samplerate, winfunc),
        nfft)
    # Replace exact zeros so the division below never sees an all-zero row.
    power = numpy.where(power == 0, numpy.finfo(float).eps, power)

    filters = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    denominator = numpy.dot(power, filters.T)

    rows = numpy.size(power, 0)
    cols = numpy.size(power, 1)
    weights = numpy.tile(numpy.linspace(1, samplerate/2, cols), (rows, 1))
    numerator = numpy.dot(power*weights, filters.T)

    return numerator / denominator
コード例 #13
0
def stft(sig, rate):
    """Return the log10 magnitude spectrum of every frame of ``sig``.

    Frames are ``FRAME_LENGTH * rate`` long, ``FRAME_SHIFT * rate`` apart and
    windowed with ``squared_hann``; the FFT size is ``int(FRAME_LENGTH * rate)``.
    """
    window_samples = FRAME_LENGTH*rate
    frames = sigproc.framesig(sig, window_samples, FRAME_SHIFT*rate,
                              winfunc=squared_hann)
    magnitude = np.absolute(np.fft.rfft(frames, int(FRAME_LENGTH*rate)))
    # The 1e-7 offset keeps log10 finite for zero-magnitude bins; log10 is
    # used for easier dB calculation.
    return np.log10(magnitude+1e-7)
コード例 #14
0
def prepare_tfrecord(example_paths,
                     destination_path,
                     max_len,
                     nfft=256,
                     noverlap=128):
    '''
    Converts a set of inputs into spectrograms and saves them to disk in
    a TensorFlow native format.

    Each file is loaded with librosa, zero-padded or truncated to ``max_len``
    samples, framed with a Hanning window and turned into the absolute value
    of its log power spectrum. The spectrum is flipped along axis 1,
    transposed, cast to float32 and written together with an integer label
    parsed from the file name (the second ``-``-separated field). Files that
    raise ``NoBackendError`` on load are reported and skipped.

    :param example_paths: paths of the audio files to convert.
    :param destination_path: path of the TFRecords file to write.
    :param max_len: number of samples every example is padded/cut to.
    :param nfft: frame length and FFT size.
    :param noverlap: passed to ``framesig`` as the frame step.
    :return: A tuple (min_val, max_val, mean) for the set.
    :raises ZeroDivisionError: if no example could be loaded, because the
        mean is then undefined.
    '''

    features_min, features_max = None, None
    features_count, features_sum = 0, 0
    # Open a TFRecords file for writing.
    writer = tf.python_io.TFRecordWriter(destination_path)
    try:
        for path in example_paths:
            # Load an audio file for preprocessing.
            try:
                samples, _ = librosa.load(path)
            except NoBackendError:
                print('Warning: Could not load {}.'.format(path))
                continue
            # Pad or shorten the number audio samples to max length.
            if samples.shape[0] < max_len:
                samples = np.pad(samples, (0, max_len - samples.shape[0]),
                                 'constant')
            elif samples.shape[0] > max_len:
                samples = samples[:max_len]
            # Generate a log power spectrum of the audio samples.
            spectrum = np.abs(
                logpowspec(framesig(samples, nfft, noverlap,
                                    winfunc=np.hanning),
                           nfft,
                           norm=0))
            spectrum = np.transpose(np.flip(spectrum, 1)).astype(np.float32)
            label = int(os.path.split(path)[-1].split('-')[1])
            # Keep track of the dataset statistics.
            new_min = np.min(spectrum)
            new_max = np.max(spectrum)
            if features_min is not None and features_max is not None:
                features_min = min(features_min, new_min)
                features_max = max(features_max, new_max)
            else:
                features_min = new_min
                features_max = new_max
            features_count += np.prod(spectrum.shape)
            features_sum += np.sum(spectrum)
            # Write the final spectrum and label to disk. tobytes() yields
            # the same bytes as the deprecated tostring().
            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'spectrum': bytes_feature(spectrum.flatten().tobytes()),
                    'label': int64_feature(label)
                }))
            writer.write(example.SerializeToString())
    finally:
        # Close the writer even when an example fails part-way through.
        writer.close()
    # Return the dataset statistics.
    return features_min, features_max, float(features_sum) / features_count
コード例 #15
0
def load_frames(in_file, srate=16000):
    '''Load an audio file and return its pre-emphasised frames.

    @in_file: input audio file, read with ``librosa.load``
    @srate: sample rate requested from ``librosa.load``
    @return: frames of 0.025 * srate samples taken every 0.01 * srate
        samples from the pre-emphasised (coeff 0.95) signal
    '''
    # The sample rate is passed by keyword: newer librosa releases accept it
    # as a keyword argument only, and older ones accept the keyword as well.
    signal, srate = librosa.load(in_file, sr=srate)
    pre_emphed = sigproc.preemphasis(signal, coeff=0.95)
    frames = sigproc.framesig(pre_emphed, 0.025 * srate, 0.01 * srate)
    return frames
コード例 #16
0
def calc_sp(mix, clean, data_type, Win_Length, Offset_Length):
    """
    Frame the mixture and the clean signal with identical framing settings.

    :param mix:   1-D vector
    :param clean:  1-D vector
    :param data_type: not used by this function
    :param Win_Length: frame length handed to ``framesig``
    :param Offset_Length: frame step handed to ``framesig``
    :return: ``(mix_frames, clean_frames)``
    """
    framing = dict(frame_len=Win_Length, frame_step=Offset_Length)
    mix_frames = framesig(mix, **framing)
    clean_frames = framesig(clean, **framing)
    return mix_frames, clean_frames
コード例 #17
0
ファイル: Features.py プロジェクト: sdwfrost/ECMLDeepAudio
def zcr_xtr_func(sig, fs, args, winlen, winstep):
    """Sign-change rate of signal per frame.

    :param sig: signal to analyse; framed with ``framesig``.
    :param fs: sample rate; scales ``winlen``/``winstep`` to samples.
    :param args: unused by this function.
    :param winlen: window length in seconds.
    :param winstep: window step in seconds.
    :returns: ``np.array`` with, per frame, the number of adjacent sample
        pairs whose product is negative.
    """
    chnkd_sig = framesig(sig, winlen * fs, winstep * fs)
    # A negative product of neighbouring samples marks a sign change.
    return np.array([np.sum(chnk[:-1] * chnk[1:] < 0) for chnk in chnkd_sig])
コード例 #18
0
 def _powspec_cord(self, data, fs):
     """Return the power spectrum of the framed, pre-emphasised ``data``.

     Framing parameters (``preemph``, ``winlen``, ``winstep``, ``winfunc``)
     are read from ``self._cording_params``; the FFT size is
     ``int(winlen * fs)``.
     """
     cfg = self._cording_params
     preemph, winlen, winstep, winfunc = (
         cfg[key] for key in ("preemph", "winlen", "winstep", "winfunc"))
     window_samples = winlen * fs
     fft_size = int(window_samples)
     emphasized = preemphasis(data, preemph)
     framed = framesig(emphasized, window_samples, winstep * fs, winfunc)
     return powspec(framed, fft_size)
コード例 #19
0
def extract(sig):
    """Build the per-frame feature matrix for ``sig``.

    Columns, in order: MFCCs (13 cepstra), their deltas, their
    delta-deltas, the zero-crossing rate and the energy of each frame. The
    assembled matrix is divided by 100000 before it is returned.

    The zero-crossing rate and energy are computed with vectorized numpy
    operations and explicit float division. Values match a per-sample loop
    under true division, up to floating-point summation order for the energy.

    :param sig: the audio signal.
    :returns: 2-D array with one row per frame.
    """
    # framing
    sig_frames = np.asarray(
        sigproc.framesig(sig=sig, frame_len=FRAME_LENGTH, frame_step=FRAME_STEP))
    frame_size = sig_frames.shape[1]

    # mfcc features and their first and second order deltas
    mfcc_feat = mfcc(signal=sig, samplerate=SAMPLE_RATE, winlen=WINDOW_LENGTH, winstep=WINDOW_STEP,
                     numcep=13, preemph=PRE_EMPH, winfunc=WINDOW_FUNCTION)
    mfcc_feat_delta = delta(mfcc_feat, 20)
    mfcc_feat_delta_delta = delta(mfcc_feat_delta, 20)

    # zero crossing rate: samples >= 0 count as positive, everything else as
    # negative; a crossing is any change of sign between neighbours
    signs = np.where(sig_frames >= 0, 1, -1)
    crossings = np.sum(signs[:, 1:] != signs[:, :-1], axis=1)
    zcrs = crossings / float(frame_size)

    # energy: lag-0 autocorrelation, i.e. the mean of the squared samples
    energys = (1.0 / frame_size) * np.sum(sig_frames * sig_frames, axis=1)

    frames_feats = np.concatenate(
        (mfcc_feat,
         mfcc_feat_delta,
         mfcc_feat_delta_delta,
         zcrs.reshape(-1, 1),
         energys.reshape(-1, 1)),
        axis=1)

    return frames_feats/100000
コード例 #20
0
 def parse_audio(self, audio_path):
     """Load an audio file and turn it into a feature tensor.

     The audio is loaded (randomly augmented when ``self.augment`` is set),
     optionally mixed with noise via ``self.noiseInjector`` with probability
     ``self.noise_prob``, and converted according to ``self.feature_type``:

     * ``'rawspeech'``   -- the signal reshaped to ``(1, len(y))``
     * ``'rawframes'``   -- the transposed frames from ``sigproc.framesig``
     * ``'spectrogram'`` -- the transposed ``sigproc.magspec`` of the frames
     * ``'mfcc'``        -- MFCCs with their deltas and delta-deltas
     * ``'logmel'``      -- log filterbank features with deltas and delta-deltas

     For any other ``feature_type`` the features stay ``None``. When
     ``self.normalize`` is set, the features are normalised with the mean
     and standard deviation taken along dimension 0.

     The signal is framed only for the two feature types that use frames.

     :param audio_path: path of the audio file to load.
     :returns: a ``torch.FloatTensor`` of features, or ``None`` for an
         unrecognised ``feature_type`` when ``self.normalize`` is falsy.
     """
     if self.augment:
         y = load_randomly_augmented_audio(audio_path, self.sample_rate)
     else:
         y = load_audio(audio_path)
     if self.noiseInjector:
         add_noise = np.random.binomial(1, self.noise_prob)
         if add_noise:
             y = self.noiseInjector.inject_noise(y)

     def split_into_frames():
         # Returns the frame length together with the frames.
         frame_len_ = self.sample_rate*self.window_size
         frame_step_ = self.sample_rate*self.window_stride
         return frame_len_, sigproc.framesig(y, frame_len=frame_len_, frame_step=frame_step_)

     def stack_with_deltas(base):
         # Stack base features, deltas and delta-deltas along dimension 0.
         delta = mfccdelta(base, 2)
         deltadelta = mfccdelta(delta, 2)
         return torch.cat(
             [torch.FloatTensor(part.transpose()) for part in (base, delta, deltadelta)], 0)

     # Compute features
     features = None
     if self.feature_type=='rawspeech':
         # Raw speech signal (dimension = 1 X # of samples)
         features = torch.FloatTensor(y.reshape((1,len(y))))
     elif self.feature_type=='rawframes':
         # Raw speech frames (dimension = # of frames X frame length)
         _, frames = split_into_frames()
         features = torch.FloatTensor(frames.transpose())
     elif self.feature_type=='spectrogram':
         # Spectrogram
         frame_len_, frames = split_into_frames()
         features = sigproc.magspec(frames,NFFT=int(frame_len_))
         features = torch.FloatTensor(features.transpose())
     elif self.feature_type=='mfcc':
         # MFCCs
         features = stack_with_deltas(
             mfcc(y,self.sample_rate,winlen=self.window_size,winstep=self.window_stride,
                  numcep=13,nfilt=26))
     elif self.feature_type=='logmel':
         # Log Mel-FB features
         features = stack_with_deltas(
             logfbank(y,self.sample_rate,winlen=self.window_size,winstep=self.window_stride,nfilt=26))
     if self.normalize:
         # keepdim leaves a leading dimension of 1, which broadcasts over
         # dimension 0 of the features.
         mean = torch.mean(features,0,keepdim=True)
         std = torch.std(features,0,keepdim=True)
         features = (features-mean)/std
     return features
コード例 #21
0
ファイル: Features.py プロジェクト: sdwfrost/ECMLDeepAudio
def energy_xtr_func(sig, fs, args, winlen, winstep):
    """Sum of squares of signal values, normalised by window length.

    :param sig: signal to analyse; framed with ``framesig``.
    :param fs: sample rate; scales ``winlen``/``winstep`` to samples.
    :param args: unused by this function.
    :param winlen: window length in seconds.
    :param winstep: window step in seconds.
    :returns: ``np.array`` with one normalised energy value per frame.
    """
    chnkd_sig = framesig(sig, winlen * fs, winstep * fs)
    return np.array(
        [1. / len(chnk) * np.sum(chnk**2) for chnk in chnkd_sig])
コード例 #22
0
def power_(signal,
           samplerate=16000,
           winlen=0.08,
           winstep=0.04,
           nfft=2048,
           preemph=0.97,
           winfunc=lambda x: numpy.ones((x, ))):
    """Return ``sigproc.powspec`` of the pre-emphasised, framed signal.

    ``winlen`` and ``winstep`` are given in seconds and scaled by
    ``samplerate``; ``nfft`` is the FFT size handed to ``sigproc.powspec``.
    """
    emphasized = sigproc.preemphasis(signal, preemph)
    window_samples = winlen * samplerate
    step_samples = winstep * samplerate
    framed = sigproc.framesig(emphasized, window_samples, step_samples,
                              winfunc)
    return sigproc.powspec(framed, nfft)
コード例 #23
0
ファイル: Utils.py プロジェクト: Amri95/snn-thesis
def get_features(file_name):
    """Return a flat list of band averages, six values per frame of the file.

    The signal is cut into half-overlapping frames whose length is
    ``len(sig) / (40 * .5 + .5)``. For each frame ``specgram`` is computed
    and every spectrum row is appended to one of six frequency bands
    (<=400, <=800, <=1600, <=2800, <=4400, >4400). After each frame the
    mean of every band is appended to the result.

    NOTE(review): the band accumulators are created once, before the frame
    loop, so each frame's means also include the rows of all earlier
    frames. Confirm whether per-frame means were intended.
    """
    (rate, sig) = wavfile.read(file_name)

    # The frame rate reported by the wave module replaces the wavfile one.
    with contextlib.closing(wave.open(file_name, 'r')) as wav_file:
        rate = wav_file.getframerate()

    # Frame our signal with 50% overlap
    number_of_frames = 40
    frame_len = len(sig) / (number_of_frames * (.5) + .5)
    frames = framesig(sig, frame_len, frame_len * .5)

    # Upper edges of the first five bands; the sixth band is everything above.
    upper_edges = (400, 800, 1600, 2800, 4400)
    band_rows = [[] for _ in range(len(upper_edges) + 1)]

    values = []
    for frame in frames:
        spectrum, freqs, _, _ = specgram(frame, Fs=rate)
        for row, freq in zip(spectrum, freqs):
            # The number of edges exceeded is the index of the band.
            band_index = sum(1 for edge in upper_edges if freq > edge)
            band_rows[band_index].extend(row)
        values.extend(sum(rows) / len(rows) for rows in band_rows)

    return values
コード例 #24
0
def Make_Spect(wav_path,
               windowsize,
               stride,
               window=np.hamming,
               bandpass=False,
               lowfreq=0,
               highfreq=0,
               preemph=0.97,
               duration=False,
               nfft=None,
               normalize=True):
    """
    Compute the log power spectrogram of a wav file.

    The file is read as float32. It is optionally band-pass filtered, then
    pre-emphasised, framed and converted to a power spectrum. Exact zeros
    are replaced by machine epsilon before the natural log is taken.

    :param wav_path: path of the audio file, read with ``sf.read``.
    :param windowsize: window length in seconds.
    :param stride: window step in seconds.
    :param window: window function, default np.hamming.
    :param bandpass: apply ``butter_bandpass_filter`` when truthy and
        ``highfreq > lowfreq``.
    :param lowfreq: lower cutoff of the band-pass filter.
    :param highfreq: upper cutoff of the band-pass filter.
    :param preemph: pre-emphasis coefficient.
    :param duration: when truthy, also return ``len(samples) / samplerate``.
    :param nfft: FFT size; ``None`` means ``int(windowsize * samplerate)``.
    :param normalize: pass the result through ``normalize_frames`` when truthy.
    :return: the float32 log power spectrogram, or ``(spectrogram, duration)``
        when ``duration`` is truthy.
    """

    samples, samplerate = sf.read(wav_path, dtype='float32')

    if bandpass and highfreq > lowfreq:
        samples = butter_bandpass_filter(data=samples,
                                         cutoff=[lowfreq, highfreq],
                                         fs=samplerate)

    signal = sigproc.preemphasis(samples, preemph)
    frames = sigproc.framesig(signal,
                              windowsize * samplerate,
                              stride * samplerate,
                              winfunc=window)

    if nfft is None:
        nfft = int(windowsize * samplerate)

    # An earlier librosa.stft based variant (magnitude followed by log1p) was
    # replaced by the power-spectrum path below.
    pspec = sigproc.powspec(frames, nfft)
    pspec = np.where(pspec == 0, np.finfo(float).eps, pspec)
    feature = np.log(pspec).astype(np.float32)
    if normalize:
        feature = normalize_frames(feature)

    if duration:
        return feature, len(samples) / samplerate

    return feature
コード例 #25
0
 def test_frame_sig(self):
     """Verify the ``stride_trick`` framing path is equivalent and faster.

     A random signal is framed with ``stride_trick=False`` and then with
     ``stride_trick=True``. Shapes and values must be equal, and the second
     call must take less wall-clock time than the first.
     """
     signal = np.random.rand(10000124)
     framing = dict(frame_len=37, frame_step=13)

     def timed_framesig(use_stride_trick):
         # Returns the frames together with the elapsed wall-clock seconds.
         begin = time.time()
         result = sigproc.framesig(signal,
                                   stride_trick=use_stride_trick,
                                   **framing)
         return result, time.time() - begin

     y_old, t_old = timed_framesig(False)
     y_new, t_new = timed_framesig(True)

     self.assertTupleEqual(y_old.shape, y_new.shape)
     np.testing.assert_array_equal(y_old, y_new)
     self.assertLess(t_new, t_old)
     print('new run time %3.2f < %3.2f sec' % (t_new, t_old))
コード例 #26
0
ファイル: ue4_net.py プロジェクト: johndpope/motion_dance
def datafeed():
    """Stream log power spectra of the configured track to ``data_w``.

    The track (``args.track``) is resampled to ``resampled.wav`` when it is
    not a ``.wav`` file or when its sample rate is not 16000, then
    peak-normalised. Each second is cut into 30 consecutive slices; every
    slice is framed (256 samples, step 80, Hamming window), converted with
    ``logpowspec``, rescaled linearly from ``[audio_min, audio_max]`` to
    ``rng`` and put on ``data_w`` as ``(0, array)``. A slice that does not
    yield 5 frames is replaced by a constant ``rng[0]`` array and ends the
    stream. ``'start'`` is put on the queue twice before streaming and
    ``'end'`` once after it.
    """
    data_w.put('start')
    loc, sec, i = 0, 0, 0
    if '.wav' not in args.track:
        rsmpfile = 'resampled.wav'
        resampling(args.track, rsmpfile)
    else:
        rsmpfile = args.track
    data_wav, fs = soundfile.read(rsmpfile)
    if fs != 16000:
        rsmpfile = 'resampled.wav'
        resampling(args.track, rsmpfile)
        data_wav, fs = soundfile.read(rsmpfile)
    data_wav /= np.amax(np.abs(data_wav))
    # Slice boundaries within one second. The builtin int is used because
    # the np.int alias no longer exists in current NumPy releases.
    idxs = np.linspace(0, fs, 31, endpoint=True, dtype=int)
    # Sleep durations cycled through after each slice.
    rest = [0.0325, 0.0335, 0.0325]

    # Linear map from [audio_min, audio_max] onto [rng[0], rng[1]].
    slope = (rng[1] - rng[0]) / (audio_max - audio_min)
    intersec = rng[1] - slope * audio_max
    data_w.put('start')

    if vlclib:
        vlcplayer.play()
    if enable_record:
        ws.call(requests.StartRecording())
    NFFT = int(2**(np.ceil(np.log2(256))))
    while loc < data_wav.shape[0]:
        prv = idxs[i] + fs * sec
        loc = idxs[i + 1] + fs * sec
        frames = framesig(data_wav[prv:loc],
                          256,
                          80,
                          winfunc=np.hamming)
        stft_data = logpowspec(frames, NFFT)
        stft_data = (stft_data * slope) + intersec
        stft_data = np.swapaxes(stft_data, 0, 1).astype(np.float32)
        if stft_data.shape[1] != 5:
            # Incomplete final slice: send a constant block and stop.
            stft_data = np.ones((129, 5), dtype=np.float32) * rng[0]
            data_w.put((0, stft_data.copy()))
            break
        data_w.put((0, stft_data.copy()))
        sleep(rest[i % 3])
        if i >= 29:
            i = 0
            sec += 1
        else:
            i += 1
    # NOTE(review): when args.track is already a 16 kHz .wav file, rsmpfile
    # is args.track itself, so this removes the original input file.
    # Confirm that this is intended.
    os.remove(rsmpfile)
    data_w.put('end')
コード例 #27
0
def fft_sam(signal,
            samplerate=16000,
            winlen=0.08,
            winstep=0.04,
            nfft=2048,
            preemph=0.97,
            winfunc=lambda x: numpy.ones((x, ))):
    """Return the bin-to-bin difference of each frame's FFT magnitude.

    :param signal: the audio samples to analyse.
    :param samplerate: sample rate used to convert ``winlen``/``winstep``
        to a sample count.
    :param winlen: analysis window length, multiplied by ``samplerate``.
    :param winstep: step between windows, multiplied by ``samplerate``.
    :param nfft: FFT length passed to ``numpy.fft.fft``.
    :param preemph: coefficient handed to ``sigproc.preemphasis``.
    :param winfunc: window function handed to ``sigproc.framesig``.
    :returns: a float32 array with ``nfft // 2 + 1`` columns per frame.
        Column 0 holds the magnitude of bin 0; column ``k > 0`` holds
        ``|X[k]| - |X[k - 1]|``.
    """
    signal = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(signal, winlen * samplerate,
                              winstep * samplerate, winfunc)
    n_bins = nfft // 2 + 1
    feat = numpy.float32(numpy.absolute(numpy.fft.fft(frames, nfft)))[:, :n_bins]
    # First difference along the frequency axis for all frames at once.  The
    # right-hand side is evaluated into a temporary before the assignment, so
    # every difference uses the original (undifferenced) magnitudes.
    feat[:, 1:] = feat[:, 1:] - feat[:, :-1]
    return feat
コード例 #28
0
def get_complex_spec(wav_, winstep, winlen, with_time_scaled=False):
    """Read ``wav_`` and return the complex spectrum of its frames.

    The signal is pre-emphasised with ``PREEMPH``, cut into frames of
    ``winlen * rate`` samples every ``winstep * rate`` samples using
    ``HAMMING_WINFUNC``, and transformed with a real FFT of length ``NFFT``.

    Returns a pair ``(complex_spec, time_scaled_complex_spec)``.  The second
    item is ``None`` unless ``with_time_scaled`` is true, in which case it is
    the real FFT of the frames after multiplying every sample by its index
    within the frame.
    """
    rate, samples = wav.read(wav_)

    emphasized = preemphasis(samples, PREEMPH)
    frames = framesig(emphasized, winlen * rate, winstep * rate,
                      HAMMING_WINFUNC)
    complex_spec = np.fft.rfft(frames, NFFT)

    if not with_time_scaled:
        return complex_spec, None

    # Weight each sample by its position inside the frame before the FFT.
    sample_positions = np.arange(frames.shape[-1])
    return complex_spec, np.fft.rfft(sample_positions * frames, NFFT)
コード例 #29
0
ファイル: features.py プロジェクト: songtaoshi/open_stt_e2e
def job(input_name, output_name):
    """Extract log spectral features from one audio file and save them.

    The audio is loaded as mono at the module-level ``samplerate``,
    pre-emphasised (0.97), framed with a Hann window using the module-level
    ``winlen``/``winstep``, converted to a power spectrum of size ``nfft``
    and projected onto the module-level ``banks`` matrix.  The natural log
    of the result is stored as float32 via ``np.save``.

    Returns ``False`` without writing anything when the audio is empty, when
    framing yields no frames, or when the features contain NaN; returns
    ``True`` after the array has been saved to ``output_name``.
    """
    audio, _ = librosa.load(input_name, mono=True, sr=samplerate)
    if not len(audio):
        return False

    emphasized = sigproc.preemphasis(audio, 0.97)
    frames = sigproc.framesig(emphasized, winlen, winstep, np.hanning)
    if not len(frames):
        return False

    energies = np.dot(sigproc.powspec(frames, nfft), banks)
    # Substitute machine epsilon for exact zeros so the logarithm stays finite.
    energies = np.where(energies == 0, np.finfo(float).eps, energies)
    log_energies = np.log(energies).astype(dtype=np.float32)

    # A NaN anywhere propagates into the total, so one check covers all cells.
    if np.isnan(np.sum(log_energies)):
        return False

    np.save(output_name, log_energies)
    return True
コード例 #30
0
ファイル: Features.py プロジェクト: sdwfrost/ECMLDeepAudio
def energyentropy_xtr_func(sig, fs, args, winlen, winstep):
    """Entropy of energy, computed per frame of ``sig``.

    Based on the pyAudioAnalysis.audioFeatureExtraction library
    [github.com/tyiannak/pyAudioAnalysis].

    ``sig`` is cut into frames of ``winlen * fs`` samples every
    ``winstep * fs`` samples.  Each frame is split into ``args["n_subwins"]``
    sub-windows.  The energy of each sub-window is divided by the energy of
    the whole frame, and the base-2 entropy of those ratios is the frame's
    feature value.

    :param sig: the audio samples.
    :param fs: sample rate used to convert ``winlen``/``winstep`` to samples.
    :param args: mapping that must contain the key ``"n_subwins"``.
    :param winlen: frame length, multiplied by ``fs``.
    :param winstep: frame step, multiplied by ``fs``.
    :returns: a 1-D array with one entropy value per frame.
    """
    n_subwins = args["n_subwins"]
    entropies = []
    for chnk in framesig(sig, winlen * fs, winstep * fs):
        # Total energy is taken over the full frame, before any truncation.
        tot_enrgy = np.sum(chnk**2)
        subwin_len = int(np.floor(len(chnk) / n_subwins))
        # Drop trailing samples that do not fill a whole sub-window (a no-op
        # when the frame length is already a multiple of n_subwins).
        chnk = chnk[:subwin_len * n_subwins]
        # Column-major reshape: each column is one contiguous sub-window.
        subwins = chnk.reshape(subwin_len, n_subwins, order='F').copy()
        subwin_enrgy = np.sum(subwins**2, axis=0) / float(tot_enrgy + eps)
        entropies.append(-np.sum(subwin_enrgy * np.log2(subwin_enrgy + eps)))
    return np.array(entropies)
コード例 #31
0
 def compute_lpcc(self, pad, save=False):
     """Compute padded LPC coefficient arrays for every entry of ``self.wavpaths``.

     Each entry of ``self.wavpaths`` is indexed as ``entry[0]`` (the path
     given to ``librosa.load``) and ``entry[1]`` (the key under which results
     are stored).  For each entry the method resets and then fills:

     * ``self.sr[key]``     -- the sample rate returned by ``librosa.load``
     * ``self.frames[key]`` -- frames of 2200 samples taken every 1100 samples
     * ``self.lpcc[key]``   -- the per-frame ``lpc.kautocor(frame, 12)``
       numerators, transposed and passed through
       ``self.pad_sequence_into_array`` with ``maxlen=pad``

     :param pad: forwarded as ``maxlen`` to ``self.pad_sequence_into_array``.
     :param save: when true, pickle ``self.lpcc`` to ``feat/lpcc.pkl``.
     """
     self.lpcc = {}
     self.sr = {}
     self.frames = {}
     for entry in self.wavpaths:
         path, key = entry[0], entry[1]
         sig, self.sr[key] = librosa.load(path)
         frames = sigproc.framesig(sig, frame_len=2200, frame_step=1100)
         self.frames[key] = frames
         arr = np.array([np.array(lpc.kautocor(frame, 12).numerator)
                         for frame in frames])
         # Function-call form prints the same on Python 2 and Python 3.
         print(arr.shape)
         self.lpcc[key] = self.pad_sequence_into_array(arr.transpose(), maxlen=pad)
     if save:
         with open('feat/lpcc.pkl', 'wb') as out:
             pickle.dump(self.lpcc, out)
コード例 #32
0
ファイル: Features.py プロジェクト: sdwfrost/ECMLDeepAudio
def specflux_xtr_func(sig, fs, args, winlen, winstep):
    """Spectral flux as the sum of squared differences between frames.

    Based on the pyAudioAnalysis.audioFeatureExtraction library
    [github.com/tyiannak/pyAudioAnalysis].

    ``sig`` is cut into frames of ``winlen * fs`` samples every
    ``winstep * fs`` samples.  Each frame's spectrum comes from
    ``get_win_fft`` and is normalised by its own sum (with ``eps`` added to
    every bin).  The flux of a frame is the sum of squared differences
    between its normalised spectrum and that of the previous frame.

    :param sig: the audio samples.
    :param fs: sample rate used to convert ``winlen``/``winstep`` to samples.
    :param args: unused here; kept for a uniform extractor signature.
    :param winlen: frame length, multiplied by ``fs``.
    :param winstep: frame step, multiplied by ``fs``.
    :returns: a 1-D array with one flux value per frame; the first is 0.
    """
    fluxs = []
    # ``None`` marks "no previous frame yet".  Comparing an array against
    # ``[]`` is not a reliable emptiness test: NumPy attempts an elementwise
    # comparison, which raises on the shape mismatch in recent releases.
    prev_spec = None
    prev_spec_sum = None
    for chnk in framesig(sig, winlen * fs, winstep * fs):
        spec = get_win_fft(chnk, winlen, fs)
        specsum = np.sum(spec + eps)
        if prev_spec is None:
            fluxs.append(0.)
        else:
            fluxs.append(
                np.sum((spec / specsum - prev_spec / prev_spec_sum)**2))
        prev_spec, prev_spec_sum = spec, specsum
    return np.array(fluxs)
コード例 #33
0
def get_complex_spec(wav_, winstep, winlen, with_time_scaled=False):
    """Load ``wav_`` with librosa and return the complex spectrum of its frames.

    The signal is loaded at the module-level ``sr``, pre-emphasised with
    ``PREEMPH``, cut into frames of ``winlen * rate`` samples every
    ``winstep * rate`` samples using ``HAMMING_WINFUNC``, and transformed
    with a real FFT of length ``NFFT``.  The shapes of both results are
    printed (``None`` for a result that was not computed).

    :param wav_: path handed to ``librosa.load``.
    :param winstep: frame step, multiplied by the loaded sample rate.
    :param winlen: frame length, multiplied by the loaded sample rate.
    :param with_time_scaled: also compute the spectrum of the frames after
        multiplying every sample by its index within the frame.
    :returns: ``(complex_spec, time_scaled_complex_spec)``; the second item
        is ``None`` when ``with_time_scaled`` is false.
    """
    sig, rate = librosa.load(wav_, sr=sr)

    sig = preemphasis(sig, PREEMPH)
    frames = framesig(sig, winlen * rate, winstep * rate, HAMMING_WINFUNC)
    complex_spec = np.fft.rfft(frames, NFFT)

    time_scaled_complex_spec = None
    if with_time_scaled:
        time_scaled_frames = np.arange(frames.shape[-1]) * frames
        time_scaled_complex_spec = np.fft.rfft(time_scaled_frames, NFFT)

    # The time-scaled spectrum is None on the default path, so its shape must
    # not be dereferenced unconditionally.
    time_scaled_shape = (None if time_scaled_complex_spec is None
                         else time_scaled_complex_spec.shape)
    print(complex_spec.shape, time_scaled_shape)
    return complex_spec, time_scaled_complex_spec