Example #1
def create_mfcc(filename: str) -> np.ndarray:
    sample_rate, signal = wav.read(filename)  # scipy returns (sample_rate, samples), not a bitrate
    mfcc_data = mfcc(signal,
                     sample_rate,
                     numcep=lingua_franca_config.num_cepstra,
                     nfft=1200)
    return mfcc_data
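A minimal driver for the snippet above (a sketch, assuming the imports the snippet implies; the file name is made up and `lingua_franca_config.num_cepstra` is project-specific):

# assumed imports and a hypothetical call
import numpy as np
import scipy.io.wavfile as wav
from python_speech_features import mfcc

features = create_mfcc('recording.wav')   # hypothetical file
print(features.shape)                     # (num_frames, num_cepstra)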
Example #2
def mfcc_features(
    wavarr,
    win_len=5,      # window length for feature extraction, in seconds - run_orig.m
    win_overlap=0,  # overlap between adjacent windows, subtracted from win_len - run_orig.m
    nfft=0,
    lowfreq=5,
    highfreq=1000,
    kDelta=False,
    logging=False
):
    # rate, aud_data = scipy.io.wavfile.read(file)
    rate = wavarr[0]
    signal = wavarr[1]
    d_mfcc_feat = None

    if nfft == 0:
        nfft = fft.calculate_nfft(signal.size)  # FFT size padded to the next power of two

    mfcc_feat = base.mfcc(signal, rate,
        winlen=win_len,                      # window_length*1000 in extractFeatures.m
        winstep=win_len - win_overlap,       # 10 ms shift; Ts = 10 in extractFeatures.m
        numcep=13,                           # C = 12 in extractFeatures.m
        nfilt=20,                            # M = 20 in extractFeatures.m
        nfft=nfft,                           # pad to next power of two
        lowfreq=lowfreq, highfreq=highfreq,  # LF = 5; HF = 1000 in extractFeatures.m
        preemph=0.97, ceplifter=22,          # alpha = 0.97; L = 22 in extractFeatures.m
        winfunc=np.hamming,                  # @hamming
        appendEnergy=False                   # True would replace the first coefficient with log frame energy
    )
    if kDelta:
        d_mfcc_feat = base.delta(mfcc_feat, 2)  # delta features computed from the static coefficients
    # fbank_feat = sigproc.logfbank(signal, rate)  # log mel-filterbank energies from the signal

    return mfcc_feat, d_mfcc_feat
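The `fft.calculate_nfft` helper above is project-specific; a stand-in that matches the "next power of two" comment (an assumption, not the project's actual code) might be:

# hypothetical stand-in for fft.calculate_nfft: smallest power of two
# that covers the given sample count, so the FFT needs no truncation
def calculate_nfft(winlen_samples):
    nfft = 1
    while nfft < winlen_samples:
        nfft *= 2
    return nfft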
Example #3
def get_mfcc_pca(sample_rate, signal, num_components):
    '''
    Returns the N largest principal components of the input multivariate time series.
    Required input format: each time series arranged as a column vector.
    '''
    mfccs = mfcc(signal, samplerate=sample_rate, appendEnergy=False)
    pca = PCA(n_components = num_components)
    pca.fit(mfccs)
    components = pca.components_  # each row is a component
    return components.flatten()
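A hypothetical call, assuming `mfcc` comes from `python_speech_features` and `PCA` from `sklearn.decomposition`; with the default 13 cepstra, the flattened result has `num_components * 13` entries:

import scipy.io.wavfile as wav   # assumed import

rate, sig = wav.read('clip.wav')               # hypothetical file
vec = get_mfcc_pca(rate, sig, num_components=3)
print(vec.shape)                               # (39,) = 3 components x 13 cepstra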
Example #4
def mfcc(signal,
         rate=default_rate,
         filters_number=default_filters_number,
         augmented=default_augmented):
    # this wrapper shadows the library's mfcc, so the call is qualified
    # (assuming python_speech_features is imported as base, as in the other
    # examples); a bare mfcc() here would recurse into this function forever
    mfcc_features = base.mfcc(signal, rate, numcep=filters_number)
    if not augmented:
        return mfcc_features
    d_mfcc_features = delta(mfcc_features, 2)
    a_mfcc_features = delta(d_mfcc_features, 2)
    concatenated_features = np.concatenate(
        (mfcc_features, d_mfcc_features, a_mfcc_features), axis=1)
    return concatenated_features
Example #5
def extract_feature(wav_path):
    """Extract 39-dim mfcc feature."""
    fs, audio = wav.read(wav_path)
    mfcc = base.mfcc(audio,
                     fs,
                     winlen=0.025,
                     winstep=0.01,
                     numcep=13,
                     nfilt=26,
                     preemph=0.97,
                     appendEnergy=True)
    mfcc_d = base.delta(mfcc, N=2)
    mfcc_dd = base.delta(mfcc_d, N=2)
    feat = np.concatenate([mfcc, mfcc_d, mfcc_dd], axis=1)
    return feat
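The 39 dimensions come from 13 static cepstra plus 13 deltas plus 13 delta-deltas; a quick hypothetical sanity check (file name is an assumption):

feat = extract_feature('utterance.wav')   # hypothetical file
assert feat.shape[1] == 39                # 13 static + 13 delta + 13 delta-delta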
Example #6
    def Features(self, data, rate, dim):
        spec = np.abs(np.fft.rfft(data))
        freq = np.fft.rfftfreq(len(data), d=1 / dim)
        a = spec / spec.sum()
        meaN = (freq * a).sum()
        std = np.sqrt(np.sum(a * ((freq - meaN) ** 2)))
        a_cumsum = np.cumsum(a)
        mediaN = freq[len(a_cumsum[a_cumsum <= 0.5])]
        modE = freq[a.argmax()]
        q25 = freq[len(a_cumsum[a_cumsum <= 0.25])]
        q75 = freq[len(a_cumsum[a_cumsum <= 0.75])]
        IQR = q75 - q25
        z = a - a.mean()
        w = a.std()
        skewnesS = ((z ** 3).sum() / (len(spec) - 1)) / w ** 3
        kurtosiS = ((z ** 4).sum() / (len(spec) - 1)) / w ** 4
    
        m = speech.mfcc(data, rate)      # MFCC matrix (frames x cepstra)
        f = speech.fbank(data, rate)     # tuple: (mel filterbank energies, total frame energy)
        l = speech.logfbank(data, rate)  # log mel filterbank energies
        s = speech.ssc(data, rate)       # spectral subband centroids
    
        data = pd.DataFrame(data)
        desc = data.describe()
        mean = desc.loc["mean"].get(0)
        mad = (data - data.mean()).abs().mean().get(0)  # mean absolute deviation; DataFrame.mad() was removed in pandas 2.0
        sd = desc.loc["std"].get(0)
        median = data.median().get(0)
        minimum = desc.loc["min"].get(0)
        maximum = desc.loc["max"].get(0)
        Q25 = desc.loc["25%"].get(0)
        Q75 = desc.loc["75%"].get(0)
        interquartileR = Q75 - Q25
        skewness = data.skew().get(0)
        kurtosis = data.kurtosis().get(0)
    
        result = {
            "Mean": mean, "Mad": mad, "deviation": sd, "Median": median, "Min": minimum, "Max": maximum, 
            "interquartileR": interquartileR, "Skewness": skewness, "Q25": Q25, "Q75": Q75, "Kurtosis": kurtosis,
            "mfcc_mean": np.mean(m), "mfcc_max": np.max(m), "mfcc_min": np.min(m),
            "fbank_mean": np.mean(f[0]), "fbank_max": np.max(f[0]), "fbank_min": np.min(f[0]),
            "energy_mean": np.mean(f[1]), "energy_max": np.max(f[1]), "energy_min": np.min(f[1]),
            "lfbank_mean": np.mean(l), "lfbank_max": np.max(l), "lfbank_min": np.min(l),
            "ssc_mean": np.mean(s), "ssc_max": np.max(s), "ssc_min": np.min(s),
            "meaN": meaN, "deviatioN": std, "mediaN": mediaN, "modE": modE, "IQR": IQR,
            "skewnesS": skewnesS, "q25": q25, "q75": q75, "kurtosiS": kurtosiS}

        return result
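A hypothetical driver for the method above; `extractor` stands for whatever class instance hosts `Features`, and `dim` appears to act as the sampling rate (it is used as a `1/dim` sample spacing in `rfftfreq`):

import scipy.io.wavfile as wav   # assumed import

rate, samples = wav.read('voice.wav')              # hypothetical file
row = extractor.Features(samples, rate, dim=rate)  # extractor is an assumption
df = pd.DataFrame([row])   # one row of time- and frequency-domain summary statistics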
Example #7
def audio_read(datafs):
    (rate, data) = wav.read(datafs)  # scipy.io.wavfile.read returns (sample_rate, samples)
    ceps = mfcc(data, samplerate=rate, numcep=cepCount)
    feat2 = ssc(data,
                samplerate=rate,
                winlen=0.025,
                winstep=0.01,
                nfilt=26,
                nfft=512,
                lowfreq=0,
                highfreq=None,
                preemph=0.97)
    ls = []
    for i in range(ceps.shape[1]):
        temp = ceps[:, i]
        dtemp = np.gradient(temp)
        lfeatures = [
            np.mean(temp),
            np.var(temp),
            np.amax(temp),
            np.amin(temp),
            np.var(dtemp),
            np.mean(temp[0:temp.shape[0] // 2]),
            np.mean(temp[temp.shape[0] // 2:temp.shape[0]])
        ]
        temp2 = np.array(lfeatures)
        ls.append(temp2)

    ls2 = []
    for i in range(feat2.shape[1]):
        temp = feat2[:, i]
        dtemp = np.gradient(temp)
        lfeatures = [
            np.mean(temp),
            np.var(temp),
            np.amax(temp),
            np.amin(temp),
            np.var(dtemp),
            np.mean(temp[0:temp.shape[0] // 2]),
            np.mean(temp[temp.shape[0] // 2:temp.shape[0]])
        ]
        temp2 = np.array(lfeatures)
        ls2.append(temp2)

    source = np.array(ls).flatten()
    source = np.append(source, np.array(ls2).flatten())
    return source
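The two loops above apply the same seven statistics to every column of `ceps` and `feat2`; a hedged refactoring sketch (the helper name is my own) that either loop could call:

# hypothetical helper equivalent to either summary loop above
def column_stats(mat):
    rows = []
    for i in range(mat.shape[1]):
        col = mat[:, i]
        dcol = np.gradient(col)
        half = col.shape[0] // 2
        rows.append([np.mean(col), np.var(col), np.amax(col), np.amin(col),
                     np.var(dcol), np.mean(col[:half]), np.mean(col[half:])])
    return np.array(rows).flatten()

With this, audio_read would reduce to np.append(column_stats(ceps), column_stats(feat2)).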
Example #8
def get_mfcc(x):
    y = np.concatenate([
        mfcc(x, numcep=12, winlen=0.01, winstep=0.005),
        logfbank(x, nfilt=1, winlen=0.01, winstep=0.005)
    ],
                       axis=-1)
    derivatives = []
    previousf = np.zeros((13, ))
    for i in range(len(y)):
        if (i + 1) == len(y):
            nextf = np.zeros((13, ))
        else:
            nextf = y[i + 1]
        derivatives.append(((nextf - previousf) / 2).reshape((1, 13)))
        previousf = y[i]
    derivatives = np.concatenate(derivatives, axis=0)
    y = np.concatenate([y, derivatives], axis=1)
    return y
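The loop computes a central difference, d[i] = (y[i+1] - y[i-1]) / 2, with zero vectors beyond the edges; a vectorized sketch of the same computation (`feats` stands for the (num_frames, 13) array built before the loop):

# vectorized equivalent of the derivative loop (zero-padded central difference)
def central_differences(feats):
    zeros = np.zeros((1, feats.shape[1]))
    padded = np.vstack([zeros, feats, zeros])
    return (padded[2:] - padded[:-2]) / 2   # d[i] = (y[i+1] - y[i-1]) / 2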
Example #9
def mel(self, name):
    y, sr = librosa.load(name, sr=None)
    y = self.norm(y)
    # plt.figure()
    # plt.plot([x for x in range(y.shape[0])], y)
    # plt.show()
    zero, ener = self.get_feature(y)
    new_y = self.detect(y, zero, ener, name)
    mfcc_feature = mfcc(signal=new_y,
                        samplerate=sr,
                        winlen=self.len_frame,
                        winstep=(1 - self.ratio) * self.len_frame,
                        numcep=self.n_mfcc,
                        nfilt=26,
                        nfft=2000,
                        winfunc=np.hamming)
    # plt.matshow(mfcc_feature)
    # plt.show()
    return mfcc_feature
Example #10
def compute_mfcc(wav_path, winstep=0.01):

    (rate, sig) = wav.read(wav_path)

    mfcc_feat = mfcc(signal=sig,
                     samplerate=rate,
                     appendEnergy=True,
                     winstep=winstep)
    # Deltas
    d_mfcc_feat = delta(mfcc_feat, 2)
    # Deltas-Deltas
    dd_mfcc_feat = delta(d_mfcc_feat, 2)
    # transpose
    mfcc_feat = np.transpose(mfcc_feat)
    d_mfcc_feat = np.transpose(d_mfcc_feat)
    dd_mfcc_feat = np.transpose(dd_mfcc_feat)
    # concat above three features
    concat_mfcc_feat = np.concatenate((mfcc_feat, d_mfcc_feat, dd_mfcc_feat))
    return concat_mfcc_feat
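Because each matrix is transposed before the axis-0 concatenation, the result is coefficient-major: (3 × numcep) rows by num_frames columns. A quick hypothetical check:

feat = compute_mfcc('clip.wav')   # hypothetical file
assert feat.shape[0] == 39        # 3 * 13 rows with the default numcep=13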
Example #11
def extract_mfcc(wave_files, encoded_labels, files_destination, labels_destination, mfcc_type):
    labels_df = pd.DataFrame(columns=['file', 'label'])
    files_num = len(wave_files)

    for i, (wave_file, label) in enumerate(zip(wave_files, encoded_labels)):
        wave_file_name = wave_file.split('/')[-1]
        mfcc_file_path = files_destination + wave_file_name.split('.')[0] + '.npy'

        print('{}/{}\t{}'.format(i + 1, files_num, wave_file_name))
        wave_data, sample_rate = sf.read(wave_file)
        # save mfcc
        if mfcc_type == 'cnn':
            # newer librosa versions require the signal as a keyword argument
            mfcc = librosa.feature.mfcc(y=wave_data, sr=sample_rate)
            features = mfcc
        elif mfcc_type == 'rnn':
            mfcc = base.mfcc(wave_data,
                             samplerate=sample_rate,
                             numcep=13,
                             winstep=0.01,
                             winfunc=np.hamming)
            deltas = base.delta(mfcc, 2)

            # normalize mfcc over all frames
            mfcc_mean = np.mean(mfcc, axis=0)
            mfcc_std = np.std(mfcc, axis=0)
            mfcc = (mfcc - mfcc_mean) / mfcc_std

            # normalize deltas over all frames
            delta_mean = np.mean(deltas, axis=0)
            delta_std = np.std(deltas, axis=0)
            deltas = (deltas - delta_mean) / delta_std

            # only the 'rnn' path produces deltas, so the concatenation
            # happens here rather than at save time
            features = np.concatenate((mfcc, deltas), axis=1)

        np.save(mfcc_file_path, features, allow_pickle=False)

        labels_df.loc[i] = [wave_file_name, label]

    labels_df.to_csv(labels_destination,
                     sep='\t',
                     index=False)
Example #12
def get_mfcc(x):
    y = np.concatenate([
        mfcc(x, numcep=12, winlen=0.01, winstep=0.005),
        logfbank(x, nfilt=1, winlen=0.01, winstep=0.005)
    ],
                       axis=-1)
    derivatives = []
    previousf = np.zeros((13, ))
    for i in range(len(y)):
        if (i + 1) == len(y):
            nextf = np.zeros((13, ))
        else:
            nextf = y[i + 1]
        derivatives.append(((nextf - previousf) / 2).reshape((1, 13)))
        previousf = y[i]
    derivatives = np.concatenate(derivatives, axis=0)
    y = np.concatenate([y, derivatives], axis=1)
    ynoise = np.random.normal(0, 0.6, y.shape)
    orig_len = len(y)
    # zero-pad to a fixed 3150 frames (no-op if the clip is already longer)
    pad = [np.zeros((1, 26))] * max(0, 3150 - y.shape[0])
    ypad = np.concatenate([y] + pad, axis=0)
    noisepad = np.concatenate([ynoise] + pad, axis=0)
    return orig_len, ypad, ypad + noisepad
Example #13
def get_mfcc(sample_rate, signal):
    '''
    Returns Mel Frequency Cepstral Coefficients.
    Provides information about the sinusoids that constitute the sound wave,
    adjusted to account for the way humans perceive sound.
    '''
    mfccs = mfcc(signal, samplerate=sample_rate, appendEnergy=False)
    mfcc_cov = np.cov(mfccs.T)
    dim = mfcc_cov.shape[0]

    # Get means
    mean = mfccs.mean(axis=0)

    # Get variances (i.e. diagonal of covariance matrix)
    var_mask = np.nonzero(np.eye(dim))
    var = mfcc_cov[var_mask]

    # Get off-diagonal covariances
    cov_mask = np.nonzero(np.tri(dim) - np.eye(dim))
    cov = mfcc_cov[cov_mask]

    # NOTE: librosa also provides an MFCC function, but I believe it
    # requires passing as input some complicated information
    return mean, var, cov
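Only the diagonal and the strict lower triangle are returned, so the full symmetric covariance matrix can be rebuilt from `var` and `cov`; a sketch (the input arrays are assumed loaded elsewhere):

# hypothetical reassembly of the full covariance matrix
mean, var, cov = get_mfcc(rate, sig)                # rate, sig assumed available
dim = var.size
full = np.zeros((dim, dim))
full[np.nonzero(np.eye(dim))] = var                 # diagonal: variances
full[np.nonzero(np.tri(dim) - np.eye(dim))] = cov   # strict lower triangle
full = full + full.T - np.diag(var)                 # mirror into the upper triangle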
Example #14
    window_size = int(fs * params['t_window'])
    exp = 1
    while True:
        if np.power(2, exp) - window_size >= 0:
            fft_size = np.power(2, exp)
            break
        else:
            exp += 1
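    # note: an equivalent closed form for the loop above (for window_size >= 2)
    # would be: fft_size = 2 ** int(np.ceil(np.log2(window_size)))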

    prime_features = base.mfcc(prime_data[1],
                               samplerate=fs,
                               winlen=params['t_window'],
                               winstep=params['t_shift'],
                               numcep=params['ncep'],
                               nfilt=params['nfilters'],
                               nfft=fft_size,
                               lowfreq=0,
                               highfreq=None,
                               preemph=params['alpha'],
                               ceplifter=0,
                               appendEnergy=params['use_energy'],
                               winfunc=params['windowing'])

    target_features = base.mfcc(target_data[1],
                                samplerate=fs,
                                winlen=params['t_window'],
                                winstep=params['t_shift'],
                                numcep=params['ncep'],
                                nfilt=params['nfilters'],
                                nfft=fft_size,
                                lowfreq=0,
                                highfreq=None,
                                preemph=params['alpha'],
                                ceplifter=0,
                                appendEnergy=params['use_energy'],
                                winfunc=params['windowing'])
Example #15
# coding: utf-8

from pydub.audio_segment import AudioSegment  # pydub is a Python library for working with audio files
from scipy.io import wavfile
from python_speech_features.base import mfcc  # Fourier transform + mel cepstrum
import pandas as pd
import numpy as np
import sys


# MFCC involves two steps: a Fourier transform, then the mel-frequency cepstral coefficients
song = AudioSegment.from_file('./data/灰姑娘.mp3', format='mp3')  # load the song
# song_split = song[-30*1000:]  # slice the song
song.export('./data/灰姑娘.wav', format='wav')  # convert MP3 to WAV
rate, data = wavfile.read('./data/灰姑娘.wav')  # sample rate and sample data
mf_feat = mfcc(data, rate, numcep=13, nfft=2048)  # frames per second for the Fourier transform
# larger numcep values are slower
# 108 keys; below 1/4 is upbeat, above 1/4 is sad

print(mf_feat)
print(mf_feat.shape)
sys.exit(0)  # note: everything below is unreachable while this exit is in place
# df = pd.DataFrame(mf_feat)
# df.to_csv('./mfFeat.csv')
# print(mf_feat)
# print(mf_feat.shape)
mm = np.mean(mf_feat, axis=0)  # implicitly captures correlation in the time domain
mf = np.transpose(mf_feat)
mc = np.cov(mf)  # covariance matrix of the columns of the original mf_feat
# print(mc)
result = mm
Example #16
def calc_mfcc(pathname):
    samprate, samples = wavfile.read(pathname)
    return mfcc(samples, samplerate=samprate, appendEnergy=False)
Example #17
def audio_features(params, img_audio, audio_path, append_name, node_list):
    output_file = params['output_file']
    # create pytable atom for the features
    f_atom = tables.Float32Atom()
    count = 1
    # keep track of the nodes for which no features could be made, places
    # database contains some empty audio files
    invalid = []
    for node in node_list:
        print(f'processing file: {count}')
        count += 1
        # create a group for the desired feature type
        audio_node = output_file.create_group(node, params['feat'])
        # get the base name of the node this feature will be appended to
        base_name = node._v_name.split(append_name)[1]
        # get the caption file names corresponding to the image of this node
        caption_files = img_audio[base_name][1]

        for cap in caption_files:
            # remove extension from the caption filename
            base_capt = cap.split('.')[0]
            # remove folder path from file names (Places/coco database)
            if '/' in base_capt:
                base_capt = base_capt.split('/')[-1]
            if '-' in base_capt:
                base_capt = base_capt.replace('-', '_')
            # read audio samples
            try:
                input_data, fs = librosa.load(os.path.join(audio_path, cap),
                                              sr=None)
                # in the places database some of the audiofiles are empty
                if len(input_data) == 0:
                    break
            except Exception:
                # try to repair broken files, some files had a wrong header.
                # In Places I found some that could not be fixed however
                try:
                    fix_wav(os.path.join(audio_path, cap))
                    # re-load the repaired file, otherwise input_data would
                    # be left undefined for the feature extraction below
                    input_data, fs = librosa.load(
                        os.path.join(audio_path, cap), sr=None)
                except Exception:
                    # the loop breaks here; if no valid audio features could
                    # be made for this image, the entire node is deleted.
                    break
            # set the fft size to the power of two equal to or greater than
            # the window size.
            window_size = int(fs * params['t_window'])
            exp = 1
            while True:
                if np.power(2, exp) - window_size >= 0:
                    fft_size = np.power(2, exp)
                    break
                else:
                    exp += 1

###############################################################################
# create audio features
            if params['feat'] == 'raw':
                # calculate the needed frame shift, pre-emphasize and frame
                # the signal
                frame_shift = int(fs * params['t_shift'])
                input = sigproc.preemphasis(input_data, coeff=params['alpha'])
                features = sigproc.framesig(input,
                                            frame_len=window_size,
                                            frame_step=frame_shift,
                                            winfunc=params['windowing'])

            elif params['feat'] == 'freq_spectrum':
                # calculate the needed frame shift, pre-emphasize and frame
                # the signal
                frame_shift = int(fs * params['t_shift'])
                input = sigproc.preemphasis(input_data, coeff=params['alpha'])
                frames = sigproc.framesig(input,
                                          frame_len=window_size,
                                          frame_step=frame_shift,
                                          winfunc=params['windowing'])
                # create the power spectrum
                features = sigproc.powspec(frames, fft_size)

            elif params['feat'] == 'fbanks':
                # create mel filterbank features
                [features, energy] = base.fbank(input_data,
                                                samplerate=fs,
                                                winlen=params['t_window'],
                                                winstep=params['t_shift'],
                                                nfilt=params['nfilters'],
                                                nfft=fft_size,
                                                lowfreq=0,
                                                highfreq=None,
                                                preemph=params['alpha'],
                                                winfunc=params['windowing'])

            elif params['feat'] == 'mfcc':
                # create mfcc features
                features = base.mfcc(input_data,
                                     samplerate=fs,
                                     winlen=params['t_window'],
                                     winstep=params['t_shift'],
                                     numcep=params['ncep'],
                                     nfilt=params['nfilters'],
                                     nfft=fft_size,
                                     lowfreq=0,
                                     highfreq=None,
                                     preemph=params['alpha'],
                                     ceplifter=0,
                                     appendEnergy=params['use_energy'],
                                     winfunc=params['windowing'])

            # apply cepstral mean variance normalisation
            if params['normalise']:
                features = (features - features.mean(0)) / features.std(0)
            # optionally add the deltas and double deltas
            if params['use_deltas']:

                single_delta = base.delta(features, params['delta_n'])
                double_delta = base.delta(single_delta, params['delta_n'])
                features = np.concatenate(
                    [features, single_delta, double_delta], 1)
###############################################################################
# create new leaf node in the feature node for the current audio
# file
            feature_shape = np.shape(features)[1]
            f_table = output_file.create_earray(audio_node,
                                                append_name + base_capt,
                                                f_atom, (0, feature_shape),
                                                expectedrows=5000)

            # append new data to the tables
            f_table.append(features)
        if audio_node._f_list_nodes() == []:
            # keep track of all the invalid nodes for which no features could
            # be made
            invalid.append(node._v_name)
            # remove the top node including all other features if no captions
            # features could be created
            output_file.remove_node(node, recursive=True)
    print(invalid)
    print(f'There were {len(invalid)} files that could not be processed')
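For concreteness, a hypothetical `params` dictionary exercising the 'mfcc' path; every key below is read somewhere in the function, but the values are assumptions:

# hypothetical parameter set for the 'mfcc' branch of audio_features
params = {
    'output_file': output_file,   # open pytables file, created elsewhere
    'feat': 'mfcc',
    't_window': 0.025,            # 25 ms analysis window
    't_shift': 0.010,             # 10 ms frame shift
    'ncep': 13,
    'nfilters': 26,
    'alpha': 0.97,                # pre-emphasis coefficient
    'use_energy': True,
    'windowing': np.hamming,
    'normalise': True,
    'delta_n': 2,
}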
Example #18
def calcAcousticFeatures(sound,
                         fs,
                         featureMode,
                         speakerType='male',
                         tmpDir=".",
                         speech_sound_type='vowel',
                         octave_binding=None):
    """
        Calculates acoustic features with given featureMode for sound
        with audio sampling rate fs.

        sound: 1D np.array or 2D array of horizontal vector.

        Returns a tuple containing a 2D np.array of numTimeSteps x numFeatureParams
        and a 2 x numFeatureParams array that contains scaling factors that
        can be used to ensure equal contribution of each feature type.
    """

    # how many Hz the first formant may change within mergeFactor ms – little variation is expected for vowels
    if speech_sound_type == 'vowel':
        maxFormantChange = 50
    elif speech_sound_type == 'syllable':
        maxFormantChange = 800

    if np.ndim(sound) == 2:
        sound = sound[0, :]

    if featureMode == 'formants':
        formants = getPraatFormantsMean(sound, fs, speakerType, tmpDir)
        return (np.array(formants).reshape((1, -1)), None)

    elif featureMode == 'formants_full':
        (timePos, formants) = getPraatFormants(sound, fs, speakerType, tmpDir)

        # downsample
        mergeFactor = 10  # how many time steps (=ms) should be merged to one value
        newFormants = np.array(np.mean(formants[0:mergeFactor, :], 0), ndmin=2)
        for t in range(mergeFactor, len(timePos), mergeFactor):
            new = np.mean(formants[t:t + mergeFactor, :], 0)

            if abs(newFormants[-1, 0] - new[0]) > maxFormantChange:
                # TODO: this is dangerous if the first detected formant is incorrect!
                pass
            else:
                newFormants = np.vstack((newFormants, new))

        return (newFormants, None)

    elif featureMode == 'mfcc':
        # sound as 1d
        if sound.ndim == 2:
            sound = np.reshape(sound, (-1))

        # returns (numFrames x numCeps) np array
        window_length = 0.02  # 0.02 * 22050 ≈ 441 samples
        window_step = 0.01  # 0.01 * 22050 ≈ 221 samples
        num_cepstrals = 13
        features = mfcc(sound, fs, window_length, window_step, num_cepstrals)
        return (features, None)

    elif featureMode == 'mfcc_formants':

        # sound as 1d
        if sound.ndim == 2:
            sound = np.reshape(sound, (-1))

        # returns (numFrames x numCeps) np array
        window_length = 0.02  # 0.02 * 22050 ≈ 441 samples
        window_step = 0.005  # 0.005 * 22050 ≈ 110 samples
        num_cepstrals = 13
        features = mfcc(sound, fs, window_length, window_step, num_cepstrals)

        (timePos, formants) = getPraatFormants(sound, fs, speakerType, tmpDir)
        # downsample
        mergeFactor = 10  # how many time steps (=ms) should be merged to one value

        # get a good estimate for initial formants (ignoring initial perturbations):
        initialFormants = np.median(formants[0:5, :], axis=0)
        newFormants = None
        i = 0
        while newFormants is None:  # safer than truth-testing, which raises on numpy arrays
            if abs(formants[i, 0] - initialFormants[0]) < maxFormantChange:
                newFormants = formants[i, :]
                break
            else:
                i += 1

        newFormants = np.array(np.mean(formants[0:mergeFactor, :], 0), ndmin=2)
        for t in range(mergeFactor, len(timePos), mergeFactor):
            new = np.mean(formants[t:t + mergeFactor, :], 0)
            if abs(newFormants[-1, 0] - new[0]) > maxFormantChange:
                pass
            else:
                newFormants = np.vstack((newFormants, new))

        # resample formants according to mfccs
        # TODO Warning: interp just copies the last element to make trajectories longer!!!
        # alternative: https://docs.scipy.org/doc/scipy/reference/tutorial/interpolate.html
        resampledFormants = np.zeros(
            (np.shape(features)[0], np.shape(newFormants)[1]))
        for i in range(np.shape(newFormants)[1]):
            resampledFormants[:,
                              i] = np.interp(range(np.shape(features)[0]),
                                             range(np.shape(newFormants)[0]),
                                             newFormants[:, i])

        minmax = np.array([
            np.concatenate((np.repeat([-1 / np.shape(resampledFormants)[1]],
                                      np.shape(resampledFormants)[1]),
                            np.repeat([-1 / np.shape(features)[1]],
                                      np.shape(features)[1]))),
            np.concatenate((np.repeat([1 / np.shape(resampledFormants)[1]],
                                      np.shape(resampledFormants)[1]),
                            np.repeat([1 / np.shape(features)[1]],
                                      np.shape(features)[1])))
        ])
        return (np.concatenate((resampledFormants, features), axis=1), minmax)

    elif featureMode == "fbank":

        # sound as 1d
        if sound.ndim == 2:
            sound = np.reshape(sound, (-1))
        fbank_feat = logfbank(sound, fs, nfft=1024)

        return (fbank_feat,
                np.concatenate(([-1 * np.ones(fbank_feat.shape[1])],
                                [np.ones(fbank_feat.shape[1])])))

    elif featureMode == "logfbank":

        # sound as 1d
        if sound.ndim == 2:
            sound = np.reshape(sound, (-1))
        fbank_feat = logfbank(sound, fs, nfft=1024)

        return (fbank_feat,
                np.concatenate(([-1 * np.ones(fbank_feat.shape[1])],
                                [np.ones(fbank_feat.shape[1])])))

    elif featureMode == 'gbfb':  # Gabor filter bank features, requires Octave installed

        # scaledAudio = np.int16(copiedAudio/maxAmplitude * 32767)
        soundNorm = sound / 32767

        #features = octave_binding.gbfb_feature_extraction(soundNorm, fs)
        features = octave_binding.heq(
            octave_binding.gbfb(
                octave_binding.log_mel_spectrogram(soundNorm, fs)))

        features = features.transpose()
        return (features,
                np.concatenate(([-1 * np.ones(features.shape[1])],
                                [np.ones(features.shape[1])])))

    else:
        print("Feature mode " + featureMode +
              " not yet defined in calcAcousticFeatures()!")

    return None
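A hypothetical invocation of the 'mfcc' path (file name and rate are assumptions; per the code above, the second return value is None for this mode):

import scipy.io.wavfile as wav   # assumed import

fs, sound = wav.read('vowel_a.wav')            # hypothetical recording
features, scaling = calcAcousticFeatures(sound, fs, 'mfcc')
print(features.shape)                          # (num_frames, 13)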