Ejemplo n.º 1
0
def tiqu(path, weidu, logenergy, energy_1):
    """Write filterbank features for every audio file below ``path`` to CSV.

    Each entry of ``path`` is expected to contain a ``wav`` sub-directory
    holding the audio files. Features are written to a sibling ``log_yuan``
    directory, one ``<audio file name>.csv`` per input file.

    Args:
        path: Root directory whose entries are processed one by one.
        weidu: Number of filters, forwarded as ``nfilt``.
        logenergy: When equal to ``True``, save the ``logfbank`` output.
        energy_1: When equal to ``True``, save the ``fbank`` features.

    Note:
        Both outputs share the same CSV path. When both flags are ``True``
        the ``logfbank`` output is written last and replaces the ``fbank``
        output.
    """
    for entry in os.listdir(path):
        # Directory holding the audio files of this entry.
        wav_dir = os.path.join(path, entry, "wav")
        # Directory receiving the (log) mel filterbank coefficients.
        feature_dir = os.path.join(path, entry, 'log_yuan')

        # `muluz.mkdir` is a helper defined outside this block; presumably it
        # creates the output directory -- TODO confirm.
        muluz.mkdir(feature_dir)

        for wav_name in os.listdir(wav_dir):
            wav_path = f"{wav_dir}/{wav_name}"
            csv_path = f"{feature_dir}/{wav_name}.csv"

            print(wav_path)
            fs, audio = wav.read(wav_path)

            if energy_1 == True:
                feat, _ = fbank(audio, fs, nfilt=weidu)
                np.savetxt(csv_path, feat, delimiter=',')

            if logenergy == True:
                log_feat = logfbank(audio, fs, nfilt=weidu)
                np.savetxt(csv_path, log_feat, delimiter=',')
def draw_logmel(wav_file, label, feature_name, logmelband_nums=(4, 5)):
    """Plot a spectrogram and selected log filterbank rows, then save a PNG.

    The figure has two panels: the top one shows the dB-scaled magnitude of
    the STFT of the audio, the bottom one plots the rows of the ``logfbank``
    output selected by ``logmelband_nums``.

    Args:
        wav_file: Path of the audio file to read.
        label: First part of the output file name.
        feature_name: Second part of the output file name.
        logmelband_nums: Indices into the first axis of the ``logfbank``
            output to plot. Any indexable sequence of ints is accepted.

    The image is saved as
    ``/home/jialu/voice_quality_plots/v2/logMelFreqBand/<label>_<feature_name>.png``.
    """
    # The sample rate returned by librosa is not needed; the rate reported by
    # `read` is the one passed to `logfbank` below.
    y, _ = librosa.load(wav_file)
    rate, data = read(wav_file)

    plt.figure()

    # Top panel: linear-frequency spectrogram in dB relative to the peak.
    plt.subplot(2, 1, 1)
    D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
    librosa.display.specshow(D, y_axis='linear', cmap="viridis")
    plt.colorbar(format='%+2.0f dB')
    plt.title('Linear-frequency power spectrogram')
    plt.ylim([0, 8192])

    # Bottom panel: selected rows of the log filterbank output.
    plt.subplot(2, 1, 2)
    # NOTE(review): if `logfbank` returns an array shaped (frames, filters),
    # indexing its first axis selects a frame rather than a filterbank band,
    # which would not match the axis labels below -- confirm whether a
    # transpose is intended.
    logfbank_energy = logfbank(data, samplerate=rate, nfft=2048)
    colors = ["r", "g", "b"]
    for i, logmelband_num in enumerate(logmelband_nums):
        row = logfbank_energy[logmelband_num]
        X = np.linspace(0, len(row), len(row))
        plt.plot(X,
                 row,
                 'o',
                 markersize=5,
                 # Cycle through the palette so more than three requested
                 # rows do not raise IndexError.
                 color=colors[i % len(colors)],
                 label="logMelFreqBands[{}]".format(logmelband_num))
    plt.title('logMelFrequencyBands(de)')
    plt.ylabel("Filterbank")
    plt.xlabel("Frame Idx")
    plt.legend(loc="upper right", prop={"size": 8})
    plt.savefig("/home/jialu/voice_quality_plots/v2/logMelFreqBand/" + label +
                "_" + feature_name + ".png")
Ejemplo n.º 3
0
def dscc(signal,
         samplerate=16000,
         winlen=0.025,
         winstep=0.01,
         numcep=13,
         nfilt=26,
         nfft=512,
         lowfreq=0,
         highfreq=None,
         preemph=0.97,
         ceplifter=22,
         appendEnergy=True,
         winfunc=lambda x: numpy.ones((x, ))):
    """Compute cepstral features from the delta of filterbank features.

    Processing chain, in order: ``base.fbank`` -> ``base.delta`` (N=2) ->
    ``boxcox`` -> natural log -> orthonormal type-2 DCT along axis 1,
    truncated to ``numcep`` columns -> ``base.lifter`` -> optional
    replacement of column 0 with the log frame energy -> ``base.delta``
    (N=2).

    Args:
        signal: Audio signal handed to ``base.fbank``.
        samplerate, winlen, winstep, nfilt, nfft, lowfreq, highfreq,
        preemph, winfunc: Forwarded positionally to ``base.fbank``.
        numcep: Number of DCT columns to keep.
        ceplifter: Lifter parameter handed to ``base.lifter``.
        appendEnergy: If true, column 0 of the liftered cepstra is replaced
            with the log of the frame energies returned by ``base.fbank``.

    Returns:
        The result of the final ``base.delta`` call.
    """
    filterbank, frame_energy = base.fbank(signal, samplerate, winlen, winstep,
                                          nfilt, nfft, lowfreq, highfreq,
                                          preemph, winfunc)

    # Delta of the filterbank features, then Box-Cox followed by a log.
    filterbank_delta = base.delta(filterbank, 2)
    compressed = numpy.log(boxcox(filterbank_delta))

    # Decorrelate with the DCT and keep the leading `numcep` coefficients.
    spectrum = dct(compressed, type=2, axis=1, norm='ortho')
    cepstra = base.lifter(spectrum[:, :numcep], ceplifter)

    if appendEnergy:
        # Replace the first cepstral coefficient with the log frame energy.
        cepstra[:, 0] = numpy.log(frame_energy)

    # The original author left a note to verify that N=2 is correct here.
    return base.delta(cepstra, 2)
Ejemplo n.º 4
0
def gen_fft_features(wav, step=512, nfft=(2048, 4096), n_bands=80, log=True,
                     samplerate=44100):
    """Stack mel filterbank features computed with several FFT sizes.

    Args:
        wav: Audio signal handed to ``fbank``.
        step: Hop between successive frames, in samples. It is converted to
            seconds with ``samplerate`` and passed as ``winstep``.
        nfft: Iterable of FFT sizes; one feature map is produced per size.
        n_bands: Number of mel filters (``nfilt``).
        log: If true, return ``log10(features + 1e-4)`` instead of the raw
            filterbank features.
        samplerate: Sample rate of ``wav`` in Hz.

    Returns:
        ``np.stack`` of the per-FFT-size feature maps, i.e. an array of shape
        ``[Channels, Time, Frequency]`` with one channel per entry of
        ``nfft``.
    """
    features = []
    # Ignoring warnings here.
    # Will warn about issues calculating MEL filters when nfft = 1024.
    # Causes a strange black band at band 5ish. Will ignore for now.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        for fft_size in nfft:
            # mel_features is of shape [T, F]; the frame energies returned
            # alongside are not used.
            mel_features, _ = fbank(wav,
                                    nfft=fft_size,
                                    samplerate=samplerate,
                                    nfilt=n_bands,
                                    winfunc=np.hamming,
                                    lowfreq=27.5,
                                    highfreq=8000.0,
                                    # Honour `step`; it was previously
                                    # ignored in favour of a literal 512.
                                    winstep=step / samplerate)

            if log:
                # The small offset keeps log10 finite for zero-valued bins.
                features.append(np.log10(mel_features + 1e-4))
            else:
                features.append(mel_features)

    # Returns shape [Channels, Time, Frequency]
    return np.stack(features)
Ejemplo n.º 5
0
def get_features(audio_file):
    """Return log filterbank features with deltas and energy for a file.

    The audio is read with ``sf.read`` and converted to ``FLAGS.nfilt``
    filterbank features. The returned array concatenates, along axis 1:
    the natural log of the filterbank features, their delta, the delta of
    that delta (both with N=2), and the frame energy as a final column.
    """
    samples, rate = sf.read(audio_file)
    filterbank, frame_energy = fbank(samples, rate, nfilt=FLAGS.nfilt)

    log_filterbank = np.log(filterbank)
    first_delta = delta(log_filterbank, 2)
    second_delta = delta(first_delta, 2)
    # Turn the per-frame energy vector into a single column.
    energy_column = np.expand_dims(frame_energy, 1)

    stacked = [log_filterbank, first_delta, second_delta, energy_column]
    return np.concatenate(stacked, axis=1)
Ejemplo n.º 6
0
def get_features(audio_file):
    '''Return log filterbank features with deltas and energy for a file.

    The file is opened in binary mode through ``tf.gfile.FastGFile`` and
    decoded with ``sf.read``. The returned array concatenates, along axis 1:
    the natural log of the ``FLAGS.nfilt`` filterbank features, their delta,
    the delta of that delta (both with N=2), and the frame energy as a final
    column.
    '''
    # Use a context manager so the file handle is closed once the samples
    # have been read; it was previously left open.
    with tf.gfile.FastGFile(audio_file, 'rb') as audio_handle:
        signal, sample_rate = sf.read(audio_handle)

    feat, energy = fbank(signal, sample_rate, nfilt=FLAGS.nfilt)
    feat = np.log(feat)
    dfeat = delta(feat, 2)
    ddfeat = delta(dfeat, 2)
    return np.concatenate([feat, dfeat, ddfeat, np.expand_dims(energy, 1)],
                          axis=1)
Ejemplo n.º 7
0
    def Features(self, data, rate, dim):
        """Build a dictionary of summary statistics for an audio signal.

        Three groups of values are returned:

        * statistics of the normalised magnitude spectrum of ``data``
          (keys ending in a capital letter, plus ``IQR``, ``q25``, ``q75``),
        * mean / max / min of the ``speech.mfcc``, ``speech.fbank``,
          ``speech.logfbank`` and ``speech.ssc`` outputs,
        * descriptive statistics of the samples themselves, taken from
          column 0 of a ``pandas.DataFrame`` built from ``data``.

        Args:
            data: Audio samples.
            rate: Sample rate passed to the ``speech`` feature functions.
            dim: Value used to build the frequency axis
                (``d = 1 / dim``).
                NOTE(review): this looks like it should be the sample rate;
                confirm with the caller why ``rate`` is not used here.

        Returns:
            dict mapping statistic names to values.
        """
        # --- Statistics of the normalised magnitude spectrum -------------
        spec = np.abs(np.fft.rfft(data))
        freq = np.fft.rfftfreq(len(data), d=1 / dim)
        a = spec / spec.sum()
        meaN = (freq * a).sum()
        std = np.sqrt(np.sum(a * ((freq - meaN) ** 2)))
        a_cumsum = np.cumsum(a)
        # Counting the cumulative bins at or below a threshold gives the
        # index of the first bin that exceeds it.
        mediaN = freq[len(a_cumsum[a_cumsum <= 0.5])]
        modE = freq[a.argmax()]
        q25 = freq[len(a_cumsum[a_cumsum <= 0.25])]
        q75 = freq[len(a_cumsum[a_cumsum <= 0.75])]
        IQR = q75 - q25
        z = a - a.mean()
        w = a.std()
        skewnesS = ((z ** 3).sum() / (len(spec) - 1)) / w ** 3
        kurtosiS = ((z ** 4).sum() / (len(spec) - 1)) / w ** 4

        # --- Speech features ---------------------------------------------
        m = speech.mfcc(data, rate)
        f = speech.fbank(data, rate)  # (features, energy)
        l = speech.logfbank(data, rate)
        s = speech.ssc(data, rate)

        # --- Descriptive statistics of the samples -----------------------
        frame = pd.DataFrame(data)
        desc = frame.describe()
        mean = desc.loc["mean"].get(0)
        # DataFrame.mad() was removed in pandas 2.0; compute the mean
        # absolute deviation around the mean explicitly instead.
        mad = (frame - frame.mean()).abs().mean().get(0)
        sd = desc.loc["std"].get(0)
        median = frame.median().get(0)
        minimum = desc.loc["min"].get(0)
        maximum = desc.loc["max"].get(0)
        Q25 = desc.loc["25%"].get(0)
        Q75 = desc.loc["75%"].get(0)
        interquartileR = Q75 - Q25
        skewness = frame.skew().get(0)
        kurtosis = frame.kurtosis().get(0)

        result = {
            "Mean": mean, "Mad": mad, "deviation": sd, "Median": median, "Min": minimum, "Max": maximum,
            "interquartileR": interquartileR, "Skewness": skewness, "Q25": Q25, "Q75": Q75, "Kurtosis": kurtosis,
            "mfcc_mean": np.mean(m), "mfcc_max": np.max(m), "mfcc_min": np.min(m),
            "fbank_mean": np.mean(f[0]), "fbank_max": np.max(f[0]), "fbank_min": np.min(f[0]),
            "energy_mean": np.mean(f[1]), "energy_max": np.max(f[1]), "energy_min": np.min(f[1]),
            "lfbank_mean": np.mean(l), "lfbank_max": np.max(l), "lfbank_min": np.min(l),
            "ssc_mean": np.mean(s), "ssc_max": np.max(s), "ssc_min": np.min(s),
            "meaN": meaN, "deviatioN": std, "mediaN": mediaN, "modE": modE, "IQR": IQR,
            "skewnesS": skewnesS, "q25": q25, "q75": q75, "kurtosiS": kurtosiS}

        return result
Ejemplo n.º 8
0
import scipy.io.wavfile as wav
from python_speech_features.base import mfcc, fbank, logfbank
import numpy as np
import os

# Directory that is scanned for input audio files.
input_dir = r"C:\Users\a7825\Desktop\工作空间\跑代码\打标签\F1\B\wav_16"
# Destination for the filterbank features (one CSV per audio file).
output_dir1 = r"C:\Users\a7825\Desktop\工作空间\跑代码\打标签\F1\B\melbank"
# Destination for the log filterbank features (one CSV per audio file).
output_dir2 = r"C:\Users\a7825\Desktop\工作空间\跑代码\打标签\F1\B\log"

if __name__ == "__main__":
    for wav_name in os.listdir(input_dir):
        rate, samples = wav.read(f"{input_dir}/{wav_name}")

        # Filterbank features use a Hamming window; the log filterbank call
        # keeps the library's default window.
        mel_feat, _frame_energy = fbank(samples, rate, winfunc=np.hamming)
        log_feat = logfbank(samples, rate)

        # Original author's note: when concatenating the path strings,
        # switching from single to double quotes unexpectedly made it work.
        np.savetxt(f"{output_dir1}/{wav_name}.fbank.csv",
                   mel_feat,
                   delimiter=',')
        np.savetxt(f"{output_dir2}/{wav_name}.log.csv",
                   log_feat,
                   delimiter=',')

# fs, audio = wav.read(r"C:\Users\a7825\Desktop\新しいフォルダー/a.wav")
# # feature_mfcc = mfcc(audio, samplerate=fs, numcep=40, nfilt=40)
# feature_m,feature_n = fbank(audio, fs)
# feature_log = logfbank(audio, fs)
# # print(feature_mel[0].shape)
# # print(feature_m)
# # print(feature_n.shape)
# np.savetxt('fbank.csv', feature_m, delimiter = ',')
# np.savetxt('energy.csv', feature_n, delimiter= ',')
Ejemplo n.º 9
0
def _smallest_fft_size(window_size):
    """Return the smallest power of two (at least 2) that is >= window_size."""
    fft_size = 2
    while fft_size < window_size:
        fft_size *= 2
    return fft_size


def _load_caption_audio(path):
    """Load an audio file at its native sample rate, with one repair attempt.

    Returns ``(samples, fs)``, or ``None`` when the file cannot be read (even
    after ``fix_wav``) or contains no samples.
    """
    try:
        samples, fs = librosa.load(path, sr=None)
    except Exception:
        # Some files have a wrong header: try to repair the file and read it
        # again. Without the reload, `samples`/`fs` would be undefined or
        # left over from the previous file.
        try:
            fix_wav(path)
            samples, fs = librosa.load(path, sr=None)
        except Exception:
            return None
    # Some audio files in the Places database are empty.
    if len(samples) == 0:
        return None
    return samples, fs


def _extract_features(params, samples, fs):
    """Compute the feature matrix selected by ``params['feat']``.

    Applies the optional mean/variance normalisation and delta stacking
    configured in ``params``. Raises ``ValueError`` for an unknown feature
    type.
    """
    feat_type = params['feat']
    window_size = int(fs * params['t_window'])
    # FFT size: the power of two equal to or greater than the window size.
    fft_size = _smallest_fft_size(window_size)

    if feat_type == 'raw':
        # Pre-emphasise, then cut the signal into windowed frames.
        frame_shift = int(fs * params['t_shift'])
        emphasised = sigproc.preemphasis(samples, coeff=params['alpha'])
        # Frame the pre-emphasised signal; the un-emphasised samples were
        # framed before, silently discarding the pre-emphasis step.
        features = sigproc.framesig(emphasised,
                                    frame_len=window_size,
                                    frame_step=frame_shift,
                                    winfunc=params['windowing'])

    elif feat_type == 'freq_spectrum':
        # Pre-emphasise, frame, then take the power spectrum of each frame.
        frame_shift = int(fs * params['t_shift'])
        emphasised = sigproc.preemphasis(samples, coeff=params['alpha'])
        frames = sigproc.framesig(emphasised,
                                  frame_len=window_size,
                                  frame_step=frame_shift,
                                  winfunc=params['windowing'])
        features = sigproc.powspec(frames, fft_size)

    elif feat_type == 'fbanks':
        # Mel filterbank features; the frame energies are not used.
        features, _ = base.fbank(samples,
                                 samplerate=fs,
                                 winlen=params['t_window'],
                                 winstep=params['t_shift'],
                                 nfilt=params['nfilters'],
                                 nfft=fft_size,
                                 lowfreq=0,
                                 highfreq=None,
                                 preemph=params['alpha'],
                                 winfunc=params['windowing'])

    elif feat_type == 'mfcc':
        features = base.mfcc(samples,
                             samplerate=fs,
                             winlen=params['t_window'],
                             winstep=params['t_shift'],
                             numcep=params['ncep'],
                             nfilt=params['nfilters'],
                             nfft=fft_size,
                             lowfreq=0,
                             highfreq=None,
                             preemph=params['alpha'],
                             ceplifter=0,
                             appendEnergy=params['use_energy'],
                             winfunc=params['windowing'])

    else:
        # Fail with a clear message instead of a NameError or, worse,
        # silently reusing the features of a previous file.
        raise ValueError(f'unsupported feature type: {feat_type!r}')

    # Apply mean variance normalisation per feature dimension.
    if params['normalise']:
        features = (features - features.mean(0)) / features.std(0)

    # Optionally add the deltas and double deltas.
    if params['use_deltas']:
        single_delta = base.delta(features, params['delta_n'])
        double_delta = base.delta(single_delta, params['delta_n'])
        features = np.concatenate([features, single_delta, double_delta], 1)

    return features


def audio_features(params, img_audio, audio_path, append_name, node_list):
    """Create audio features for every node and store them in the output file.

    For each node a group named ``params['feat']`` is created. Every caption
    audio file belonging to the node is loaded, converted to features and
    stored as an extendable array named ``append_name + <caption base name>``
    inside that group. Processing of a node's captions stops at the first
    caption that cannot be loaded or is empty. Nodes that end up without any
    feature array are removed from the output file, and their names are
    printed at the end.

    Args:
        params: Configuration dictionary. Keys read here: ``output_file``,
            ``feat`` (``'raw'``, ``'freq_spectrum'``, ``'fbanks'`` or
            ``'mfcc'``), ``t_window``, ``t_shift``, ``alpha``, ``windowing``,
            ``nfilters``, ``ncep``, ``use_energy``, ``normalise``,
            ``use_deltas`` and ``delta_n``.
        img_audio: Mapping from a node base name to a sequence whose element
            1 is the list of caption audio file names.
        audio_path: Directory the caption file names are relative to.
        append_name: Prefix that is split off the node name to obtain the
            base name, and prepended to the created array names.
        node_list: Nodes of the output file to process.

    Raises:
        ValueError: If ``params['feat']`` is not a supported feature type.
    """
    output_file = params['output_file']
    # Create the pytable atom for the features.
    f_atom = tables.Float32Atom()
    # Names of the nodes for which no features could be made; the Places
    # database contains some empty audio files.
    invalid = []

    for count, node in enumerate(node_list, start=1):
        print(f'processing file: {count}')
        # Create a group for the desired feature type.
        audio_node = output_file.create_group(node, params['feat'])
        # Base name of the node this feature will be appended to.
        base_name = node._v_name.split(append_name)[1]
        # Caption file names corresponding to the image of this node.
        caption_files = img_audio[base_name][1]

        for cap in caption_files:
            # Remove the extension from the caption file name.
            base_capt = cap.split('.')[0]
            # Remove the folder path from file names (Places/coco database).
            if '/' in base_capt:
                base_capt = base_capt.split('/')[-1]
            base_capt = base_capt.replace('-', '_')

            loaded = _load_caption_audio(os.path.join(audio_path, cap))
            if loaded is None:
                # Stop processing this node's captions; if nothing was
                # stored for the node at all it is deleted below.
                break
            input_data, fs = loaded

            features = _extract_features(params, input_data, fs)

            # Create a new leaf node in the feature group for the current
            # audio file and append the data to it.
            feature_shape = np.shape(features)[1]
            f_table = output_file.create_earray(audio_node,
                                                append_name + base_capt,
                                                f_atom, (0, feature_shape),
                                                expectedrows=5000)
            f_table.append(features)

        if not audio_node._f_list_nodes():
            # Keep track of all the invalid nodes for which no features
            # could be made.
            invalid.append(node._v_name)
            # Remove the top node, including all other features, if no
            # caption features could be created.
            output_file.remove_node(node, recursive=True)

    print(invalid)
    print(f'There were {len(invalid)} files that could not be processed')