import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from python_speech_features import logfbank
from scipy.io.wavfile import read


def draw_logmel(wav_file, label, feature_name, logmelband_nums=[4, 5]):
    # librosa.load resamples (22050 Hz by default) for the spectrogram;
    # scipy's read keeps the native sample rate for the filterbank features.
    (y, sr) = librosa.load(wav_file)
    rate, data = read(wav_file)
    plt.figure()
    plt.subplot(2, 1, 1)
    D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
    librosa.display.specshow(D, y_axis='linear', cmap="viridis")
    plt.colorbar(format='%+2.0f dB')
    plt.title('Linear-frequency power spectrogram')
    plt.ylim([0, 8192])
    plt.subplot(2, 1, 2)
    # logfbank returns (num_frames, num_filters); transpose so that each
    # row is one mel band across all frames.
    logfbank_energy = logfbank(data, samplerate=rate, nfft=2048).T
    colors = ["r", "g", "b"]
    for i, logmelband_num in enumerate(logmelband_nums):
        X = np.arange(len(logfbank_energy[logmelband_num]))
        plt.plot(X,
                 logfbank_energy[logmelband_num],
                 'o',
                 markersize=5,
                 color=colors[i],
                 label="logMelFreqBands[{}]".format(logmelband_num))
    #quantile_value=np.quantile(logfbank_energy[logmelband_num],0.25*2)
    #plt.plot(X,[quantile_value]*len(X),markersize=2,color="r",label="quartile2")
    #quantile_value=np.quantile(logfbank_energy[logmelband_num],0.25*3)
    #plt.plot(X,[quantile_value]*len(X),markersize=2,color="g",label="quartile3")
    plt.title('logMelFrequencyBands(de)')
    plt.ylabel("Filterbank")
    plt.xlabel("Frame Idx")
    plt.legend(loc="upper right", prop={"size": 8})
    plt.savefig("/home/jialu/voice_quality_plots/v2/logMelFreqBand/" + label +
                "_" + feature_name + ".png")
Example #2
def tiqu(path, weidu, logenergy, energy_1):
    """Extract (log) mel filterbank features, with `weidu` filters, for
    every wav file under each <speaker>/wav directory below `path`."""
    basedir = path
    for mulu in os.listdir(basedir):

        input_dir = os.path.join(basedir, mulu, "wav")
        # path to the speech files

        output_dir2 = os.path.join(basedir, mulu, 'log_yuan')
        # log mel filterbank coefficients

        # output_dir3 = r"C:\Users\a7825\Desktop\工作空间\语音数据\UUDB\第一次实验\打标签\第三批\C063L\mfcc"
        # mfcc

        muluz.mkdir(output_dir2)  # muluz: project-local helper (assumed to create the directory)

        for ad_file in os.listdir(input_dir):
            print(input_dir + "/" + ad_file)
            (fs, audio) = wav.read(input_dir + "/" + ad_file)

            if energy_1:
                feat, energy = fbank(audio, fs, nfilt=weidu)
                np.savetxt(output_dir2 + "/" + ad_file + ".csv",
                           feat,
                           delimiter=',')

            # NOTE: both branches write to the same CSV path, so when both
            # flags are set the log filterbank output overwrites the fbank
            # output above.
            if logenergy:
                log = logfbank(audio, fs, nfilt=weidu)
                np.savetxt(output_dir2 + "/" + ad_file + ".csv",
                           log,
                           delimiter=',')
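A hedged usage sketch, assuming a corpus laid out as <base>/<speaker>/wav (the base path is a placeholder):

# Hypothetical call: write 26-filter log mel features for every wav file.
tiqu(r"C:\data\corpus", weidu=26, logenergy=True, energy_1=False)
Example #3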
def logfbank_features(fname):
    """
    Compute log Mel-filterbank energy features
    """
    (rate, signal) = wavfile.read(fname)
    fbank_beat = logfbank(signal, rate)
    # take mean of all rows
    features = fbank_beat.mean(axis=0)
    return features
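Averaging over frames (axis=0) yields one fixed-length vector per file; with the python_speech_features default of 26 filters the result has 26 entries. A usage sketch with a placeholder filename:

features = logfbank_features("heartbeat.wav")  # placeholder filename
print(features.shape)  # (26,) with the library default nfilt=26
Example #4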
def logfbank(signal,
             rate=default_rate,
             filters_number=default_filters_number,
             augmented=default_augmented):
    # NOTE: this wrapper shadows python_speech_features' logfbank, so the
    # library function must be referenced through its module here (the bare
    # name would recurse); `psf` is assumed to be the imported library module.
    logfbank_features = psf.logfbank(signal, rate, nfilt=filters_number)
    if not augmented:
        return logfbank_features
    d_logfbank_features = delta(logfbank_features, 2)
    a_logfbank_features = delta(d_logfbank_features, 2)
    concatenated_features = np.concatenate(
        (logfbank_features, d_logfbank_features, a_logfbank_features),
        axis=1)
    return concatenated_features
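Since the deltas and delta-deltas are concatenated along axis 1, augmentation triples the feature width. A sketch assuming default_filters_number is 26:

import numpy as np

signal = np.random.randn(16000)  # placeholder: one second at 16 kHz
static = logfbank(signal, rate=16000, augmented=False)  # (num_frames, 26)
full = logfbank(signal, rate=16000, augmented=True)     # (num_frames, 78)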
Example #5
    def Features(self, data, rate, dim):
        # magnitude spectrum; `dim` is the sampling rate that scales the
        # frequency axis
        spec = np.abs(np.fft.rfft(data))
        freq = np.fft.rfftfreq(len(data), d=1 / dim)
        a = spec / spec.sum()
        meaN = (freq * a).sum()
        std = np.sqrt(np.sum(a * ((freq - meaN) ** 2)))
        a_cumsum = np.cumsum(a)
        mediaN = freq[len(a_cumsum[a_cumsum <= 0.5])]
        modE = freq[a.argmax()]
        q25 = freq[len(a_cumsum[a_cumsum <= 0.25])]
        q75 = freq[len(a_cumsum[a_cumsum <= 0.75])]
        IQR = q75 - q25
        z = a - a.mean()
        w = a.std()
        skewnesS = ((z ** 3).sum() / (len(spec) - 1)) / w ** 3
        kurtosiS = ((z ** 4).sum() / (len(spec) - 1)) / w ** 4
    
        # frame-level features from python_speech_features
        m = speech.mfcc(data, rate)
        f = speech.fbank(data, rate)
        l = speech.logfbank(data, rate)
        s = speech.ssc(data, rate)

        data = pd.DataFrame(data)
        desc = data.describe()
        mean = desc.loc["mean"].get(0)
        mad = data.mad().get(0)
        sd = desc.loc["std"].get(0)
        median = data.median().get(0)
        minimum = desc.loc["min"].get(0)
        maximum = desc.loc["max"].get(0)
        Q25 = desc.loc["25%"].get(0)
        Q75 = desc.loc["75%"].get(0)
        interquartileR = Q75 - Q25
        skewness = data.skew().get(0)
        kurtosis = data.kurtosis().get(0)
    
        result = {
            "Mean": mean, "Mad": mad, "deviation": sd, "Median": median, "Min": minimum, "Max": maximum, 
            "interquartileR": interquartileR, "Skewness": skewness, "Q25": Q25, "Q75": Q75, "Kurtosis": kurtosis,
            "mfcc_mean": np.mean(m), "mfcc_max": np.max(m), "mfcc_min": np.min(m),
            "fbank_mean": np.mean(f[0]), "fbank_max": np.max(f[0]), "fbank_min": np.min(f[0]),
            "energy_mean": np.mean(f[1]), "energy_max": np.max(f[1]), "energy_min": np.min(f[1]),
            "lfbank_mean": np.mean(l), "lfbank_max": np.max(l), "lfbank_min": np.min(l),
            "ssc_mean": np.mean(s), "ssc_max": np.max(s), "ssc_min": np.min(s),
            "meaN": meaN, "deviatioN": std, "mediaN": mediaN, "modE": modE, "IQR": IQR,
            "skewnesS": skewnesS, "q25": q25, "q75": q75, "kurtosiS": kurtosiS}

        return result
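Example #6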
def get_fbank_feature(wavsignal, fs):
    """
    Input: the wav signal as an array and its sampling rate; output: the log
    FBANK features concatenated with their first- and second-order deltas.
    :param wavsignal: audio samples
    :param fs: sampling rate in Hz
    :return: (num_frames, nfilt * 3) feature array
    """
    feat_fbank = logfbank(wavsignal,
                          fs,
                          nfilt=40,
                          nfft=2048,
                          winstep=0.025,
                          winlen=0.05)
    feat_fbank_d = delta(feat_fbank, 2)
    feat_fbank_dd = delta(feat_fbank_d, 2)
    wav_feature = np.column_stack((feat_fbank, feat_fbank_d, feat_fbank_dd))
    return wav_feature
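Column-stacking the static features with their deltas gives nfilt * 3 = 40 * 3 = 120 columns per frame. A usage sketch with placeholder input:

import numpy as np

fs = 16000
wavsignal = np.random.randn(fs)  # placeholder: one second of noise
feat = get_fbank_feature(wavsignal, fs)
print(feat.shape)  # (num_frames, 120): 40 filters x (static + delta + delta-delta)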
Example #7
def get_mfcc(x):
    # 12 cepstra plus one log mel filterbank energy -> 13 static features
    y = np.concatenate([
        mfcc(x, numcep=12, winlen=0.01, winstep=0.005),
        logfbank(x, nfilt=1, winlen=0.01, winstep=0.005)
    ],
                       axis=-1)
    # first derivatives via central differences, zero-padded at the edges
    derivatives = []
    previousf = np.zeros((13, ))
    for i in range(len(y)):
        if (i + 1) == len(y):
            nextf = np.zeros((13, ))
        else:
            nextf = y[i + 1]
        derivatives.append(((nextf - previousf) / 2).reshape((1, 13)))
        previousf = y[i]
    derivatives = np.concatenate(derivatives, axis=0)
    y = np.concatenate([y, derivatives], axis=1)
    return y
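The 12 cepstra plus the single log filterbank energy give 13 static columns, and the central-difference loop appends 13 derivative columns. A sketch with synthetic input:

import numpy as np

x = np.random.randn(16000)  # placeholder: one second at the 16 kHz default rate
feats = get_mfcc(x)
print(feats.shape)  # (num_frames, 26): 13 static + 13 derivative columns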
Example #8
    def _compute_sample(self, file, sliceLength, segmentLength):
        """computes the feature and label vector for single audio slice"""
        filename = self.dataset_path + file['audio_file_path']
        fs, sound_data = wavfile.read(filename)
        sound_data = sound_data.astype('float32')
        label = file['is_hotword']
        label_vec = np.full(int(sliceLength / segmentLength - 1),
                            0,
                            dtype='int8')
        if label:
            # detect audio and subsample to a single label per segment
            is_audio_subsampled = self._detect_audio(
                sound_data)[0::segmentLength]
            num_segments = int(sliceLength / segmentLength)
            if len(is_audio_subsampled) >= num_segments:
                # truncate if longer than the target number of segments
                is_audio_subsampled = is_audio_subsampled[:num_segments]
            else:
                # pad if shorter than the target number of segments
                is_audio_subsampled = np.pad(
                    is_audio_subsampled,
                    (num_segments - len(is_audio_subsampled), 0))
            # overlay the frames where the label is true
            label_vec = label_vec + is_audio_subsampled[1:]

        if len(sound_data) >= sliceLength:
            sound_data = sound_data[:sliceLength]
        else:
            sound_data = np.pad(sound_data, (sliceLength - len(sound_data), 0))

        #compute log mel filterbank energies
        feature_vec = logfbank(sound_data,
                               samplerate=self.sampling_frequency,
                               winlen=self.window_size,
                               winstep=self.time_step,
                               nfilt=self.num_features)
        return feature_vec, label_vec
Example #9
def get_mfcc(x):
    # 12 cepstra plus one log mel filterbank energy -> 13 static features
    y = np.concatenate([
        mfcc(x, numcep=12, winlen=0.01, winstep=0.005),
        logfbank(x, nfilt=1, winlen=0.01, winstep=0.005)
    ],
                       axis=-1)
    # first derivatives via central differences, zero-padded at the edges
    derivatives = []
    previousf = np.zeros((13, ))
    for i in range(len(y)):
        if (i + 1) == len(y):
            nextf = np.zeros((13, ))
        else:
            nextf = y[i + 1]
        derivatives.append(((nextf - previousf) / 2).reshape((1, 13)))
        previousf = y[i]
    derivatives = np.concatenate(derivatives, axis=0)
    y = np.concatenate([y, derivatives], axis=1)
    ynoise = np.random.normal(0, 0.6, y.shape)
    orig_len = len(y)
    # zero-pad the features (and the noise) to a fixed length of 3150 frames
    pad = [np.zeros((1, 26))] * (3150 - y.shape[0])
    ypad = np.concatenate([y] + pad, axis=0)
    noisepad = np.concatenate([ynoise] + pad, axis=0)
    return orig_len, ypad, ypad + noisepad
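This variant additionally zero-pads each utterance to a fixed 3150 frames and returns a noisy copy for augmentation:

import numpy as np

# Hypothetical call; signals longer than 3150 frames would break the padding.
orig_len, clean, noisy = get_mfcc(np.random.randn(16000))
print(orig_len, clean.shape, noisy.shape)  # e.g. 199 (3150, 26) (3150, 26)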
Example #10
import scipy.io.wavfile as wav
from python_speech_features.base import mfcc, fbank, logfbank
import numpy as np
import os

input_dir = r"C:\Users\a7825\Desktop\工作空间\跑代码\打标签\F1\B\wav_16"
output_dir1 = r"C:\Users\a7825\Desktop\工作空间\跑代码\打标签\F1\B\melbank"
output_dir2 = r"C:\Users\a7825\Desktop\工作空间\跑代码\打标签\F1\B\log"
if __name__ == "__main__":

    for ad_file in os.listdir(input_dir):
        (fs, audio) = wav.read(input_dir + "/" + ad_file)
        feature_m, feature_n = fbank(audio, fs, winfunc=np.hamming)
        log = logfbank(audio, fs)
        np.savetxt(output_dir1 + "/" + ad_file + ".fbank.csv",
                   feature_m,
                   delimiter=',')
        np.savetxt(output_dir2 + "/" + ad_file + ".log.csv",
                   log,
                   delimiter=',')
        # string concatenation; oddly, switching the single quotes to double
        # quotes made this work

# fs, audio = wav.read(r"C:\Users\a7825\Desktop\新しいフォルダー/a.wav")
# # feature_mfcc = mfcc(audio, samplerate=fs, numcep=40, nfilt=40)
# feature_m,feature_n = fbank(audio, fs)
# feature_log = logfbank(audio, fs)
# # print(feature_mel[0].shape)
# # print(feature_m)
# # print(feature_n.shape)
# np.savetxt('fbank.csv', feature_m, delimiter = ',')
# np.savetxt('energy.csv', feature_n, delimiter= ',')
Example #11
import matplotlib.pyplot as plt
import librosa as li
import librosa.display as ds
import librosa
from python_speech_features.base import mfcc, logfbank
import scipy.io.wavfile as wav
import numpy as np
import os

indir = r'C:\Users\a7825\Desktop\工作空间\セミナー\语音\wav/C001L_061.wav'
# indir_1 =r'C:\Users\a7825\Desktop\工作空间\杂物\临时\这个就对了'

# plot the mel filterbank features
x, fs = li.load(indir, sr=16000)
# (fs, x) = wav.read(indir)
log = logfbank(x, fs)

# np.savetxt(indir_1 + ".csv", log, delimiter=',')
# print(log.shape)
# os.system('pause')

# for e in range(11,21):
#     for i in range(26):
#         log[e][i] = log[e][i]+3.5  # to brighten the plot a little

# ig, ax = plt.subplots()
# plt.xlim(0, 130)  # set the x range
plt.ylim(0, 25.6)  # set the y range

# new_ticks = np.linspace(0.00,5.00,4)
# plt.xticks(new_ticks)
Example #12
import matplotlib.pyplot as plt
import librosa as li
import librosa.display as ds
import librosa
from python_speech_features.base import mfcc, logfbank, fbank
import scipy.io.wavfile as wav
import numpy as np
import zhengguihua  # project-local normalization helper (unused in this excerpt)
import os

indir = r'C:\Users\a7825\Desktop\工作空间\セミナー\语音\wav/C001L_061.wav'
# indir_1 =r'C:\Users\a7825\Desktop\工作空间\杂物\临时\这个就对了'

# plot the log mel filterbank features
(fs, x) = wav.read(indir)
log = logfbank(x, fs, nfilt=40)

np.savetxt(indir + ".csv", log, delimiter=',')

fig, ax = plt.subplots()
log = np.swapaxes(log, 0, 1)
cax = ax.imshow(log, interpolation='nearest', origin='lower', aspect='auto')

plt.show()
Example #13
def calcAcousticFeatures(sound,
                         fs,
                         featureMode,
                         speakerType='male',
                         tmpDir=".",
                         speech_sound_type='vowel',
                         octave_binding=None):
    """
        Calculates acoustic features with given featureMode for sound
        with audio sampling rate fs.

        sound: 1D np.array, or a 2D array containing a single row vector.

        Returns a tuple containing a 2D np.array of numTimeSteps x numFeatureParams
        and a 2 x numFeatureParams array that contains scaling factors that
        can be used to ensure equal contribution of each feature type.
    """

    # maximum allowed change (Hz) of the first formant within mergeFactor ms;
    # vowels are expected to vary little
    if speech_sound_type == 'vowel':
        maxFormantChange = 50
    elif speech_sound_type == 'syllable':
        maxFormantChange = 800

    if np.ndim(sound) == 2:
        sound = sound[0, :]

    if featureMode == 'formants':
        formants = getPraatFormantsMean(sound, fs, speakerType, tmpDir)
        return (np.array(formants).reshape((1, -1)), None)

    elif featureMode == 'formants_full':
        (timePos, formants) = getPraatFormants(sound, fs, speakerType, tmpDir)

        # downsample
        mergeFactor = 10  # how many time steps (=ms) should be merged to one value
        newFormants = np.array(np.mean(formants[0:mergeFactor, :], 0), ndmin=2)
        for t in range(mergeFactor, len(timePos), mergeFactor):
            new = np.mean(formants[t:t + mergeFactor, :], 0)

            if abs(newFormants[-1, 0] - new[0]) > maxFormantChange:
                # TODO: this is dangerous if the first detected formant is incorrect!
                pass
            else:
                newFormants = np.vstack((newFormants, new))

        return (newFormants, None)

    elif featureMode == 'mfcc':
        # sound as 1d
        if sound.ndim == 2:
            sound = np.reshape(sound, (-1))

        # returns (numFrames x numCeps) np array
        window_length = 0.02  # 0.02 * 22050 ≈ 441 samples
        window_step = 0.01  # 0.01 * 22050 ≈ 221 samples
        num_cepstrals = 13
        features = mfcc(sound, fs, window_length, window_step, num_cepstrals)
        return (features, None)

    elif featureMode == 'mfcc_formants':

        # sound as 1d
        if sound.ndim == 2:
            sound = np.reshape(sound, (-1))

        # returns (numFrames x numCeps) np array
        window_length = 0.02  # 0.02 * 22050 ≈ 441 samples
        window_step = 0.005  # 0.005 * 22050 ≈ 110 samples
        num_cepstrals = 13
        features = mfcc(sound, fs, window_length, window_step, num_cepstrals)

        (timePos, formants) = getPraatFormants(sound, fs, speakerType, tmpDir)
        # downsample
        mergeFactor = 10  # how many time steps (=ms) should be merged to one value

        # get a good estimate for initial formants (ignoring initial perturbations):
        initialFormants = np.median(formants[0:5, :], axis=0)
        newFormants = None
        i = 0
        while newFormants is None:
            if abs(formants[i, 0] - initialFormants[0]) < maxFormantChange:
                newFormants = formants[i, :]
                break
            else:
                i += 1

        # NOTE: this overwrites the initial estimate found above
        newFormants = np.array(np.mean(formants[0:mergeFactor, :], 0), ndmin=2)
        for t in range(mergeFactor, len(timePos), mergeFactor):
            new = np.mean(formants[t:t + mergeFactor, :], 0)
            if abs(newFormants[-1, 0] - new[0]) > maxFormantChange:
                pass
            else:
                newFormants = np.vstack((newFormants, new))

        # resample formants according to mfccs
        # TODO Warning: interp just copies the last element to make trajectories longer!!!
        # alternative: https://docs.scipy.org/doc/scipy/reference/tutorial/interpolate.html
        resampledFormants = np.zeros(
            (np.shape(features)[0], np.shape(newFormants)[1]))
        for i in range(np.shape(newFormants)[1]):
            resampledFormants[:, i] = np.interp(
                range(np.shape(features)[0]),
                range(np.shape(newFormants)[0]),
                newFormants[:, i])

        minmax = np.array([
            np.concatenate((np.repeat([-1 / np.shape(resampledFormants)[1]],
                                      np.shape(resampledFormants)[1]),
                            np.repeat([-1 / np.shape(features)[1]],
                                      np.shape(features)[1]))),
            np.concatenate((np.repeat([1 / np.shape(resampledFormants)[1]],
                                      np.shape(resampledFormants)[1]),
                            np.repeat([1 / np.shape(features)[1]],
                                      np.shape(features)[1])))
        ])
        return (np.concatenate((resampledFormants, features), axis=1), minmax)

    elif featureMode == "fbank":

        # sound as 1d
        if sound.ndim == 2:
            sound = np.reshape(sound, (-1))
        fbank_feat = logfbank(sound, fs, nfft=1024)

        return (fbank_feat,
                np.concatenate(([-1 * np.ones(fbank_feat.shape[1])],
                                [np.ones(fbank_feat.shape[1])])))

    elif featureMode == "logfbank":

        # sound as 1d
        if sound.ndim == 2:
            sound = np.reshape(sound, (-1))
        fbank_feat = logfbank(sound, fs, nfft=1024)

        return (fbank_feat,
                np.concatenate(([-1 * np.ones(fbank_feat.shape[1])],
                                [np.ones(fbank_feat.shape[1])])))

    elif featureMode == 'gbfb':  # Gabor filter bank features, requires Octave installed

        # scaledAudio = np.int16(copiedAudio/maxAmplitude * 32767)
        soundNorm = sound / 32767

        #features = octave_binding.gbfb_feature_extraction(soundNorm, fs)
        features = octave_binding.heq(
            octave_binding.gbfb(
                octave_binding.log_mel_spectrogram(soundNorm, fs)))

        features = features.transpose()
        return (features,
                np.concatenate(([-1 * np.ones(features.shape[1])],
                                [np.ones(features.shape[1])])))

    else:
        print("Feature mode " + featureMode +
              " not yet defined in calcAcousticFeatures()!")

    return None
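A usage sketch for the filterbank modes (the wav file is a placeholder; the formant modes additionally require Praat, and 'gbfb' an Octave binding passed as octave_binding):

from scipy.io import wavfile

fs, sound = wavfile.read("vowel.wav")  # placeholder wav file
features, scaling = calcAcousticFeatures(sound, fs, "logfbank")
print(features.shape)  # (num_frames, 26) with the library default filter count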