Example #1
0
    def extract_features_file(self,
                              audio,
                              static=True,
                              plots=False,
                              fmt="npy",
                              kaldi_file=""):
        """Extract the glottal features from an audio file

        :param audio: .wav audio file.
        :param static: whether to compute and return statistic functionals over the feature matrix, or return the feature matrix computed over frames
        :param plots: whether to plot the estimated glottal signals for each voiced frame
        :param fmt: format to return the features (npy, dataframe, torch, kaldi)
        :param kaldi_file: file to store kaldi features, only valid when fmt=="kaldi"
        :returns: features computed from the audio file.
        :raises ValueError: if audio is not a wav file, if fmt is not a supported format, or if fmt=="kaldi" is combined with static=True

        >>> glottal=Glottal()
        >>> file_audio="../audios/001_a1_PCGITA.wav"
        >>> features1=glottal.extract_features_file(file_audio, static=True, plots=True, fmt="npy")
        >>> features2=glottal.extract_features_file(file_audio, static=True, plots=True, fmt="dataframe")
        >>> features3=glottal.extract_features_file(file_audio, static=False, plots=True, fmt="torch")
        >>> glottal.extract_features_file(file_audio, static=False, plots=False, fmt="kaldi", kaldi_file="./test.ark")
        """
        if audio.find('.wav') == -1 and audio.find('.WAV') == -1:
            raise ValueError(audio + " is not a valid wav file")
        fs, data_audio = read(audio)
        # remove DC offset and normalize the amplitude to [-1, 1]
        data_audio = data_audio - np.mean(data_audio)
        data_audio = data_audio / float(np.max(np.abs(data_audio)))
        size_frameS = self.size_frame * float(fs)
        size_stepS = self.size_step * float(fs)
        overlap = size_stepS / size_frameS
        nF = int((len(data_audio) / size_frameS / overlap)) - 1
        # RAPT pitch tracking on 16-bit-scaled samples; hop of 0.01 s
        # gives one f0 value every 10 ms
        data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
        f0 = pysptk.sptk.rapt(data_audiof,
                              fs,
                              int(0.01 * fs),
                              min=20,
                              max=500,
                              voice_bias=-0.2,
                              otype='f0')
        # number of 10-ms f0 values covered by one analysis frame / step
        sizef0 = int(self.size_frame / 0.01)
        stepf0 = int(self.size_step / 0.01)
        startf0 = 0
        stopf0 = sizef0
        avgGCIt = np.zeros(nF)
        varGCIt = np.zeros(nF)
        avgNAQt = np.zeros(nF)
        varNAQt = np.zeros(nF)
        avgQOQt = np.zeros(nF)
        varQOQt = np.zeros(nF)
        avgH1H2t = np.zeros(nF)
        varH1H2t = np.zeros(nF)
        avgHRFt = np.zeros(nF)
        varHRFt = np.zeros(nF)
        # indices of mostly-unvoiced frames, removed from the output at the end
        rmwin = []
        for l in range(nF):
            data_frame = data_audio[int(l * size_stepS):int(l * size_stepS +
                                                            size_frameS)]
            f0_frame = f0[startf0:stopf0]
            pf0framez = np.where(f0_frame != 0)[0]
            f0nzframe = f0_frame[pf0framez]
            # skip frames with fewer than 10 voiced f0 samples
            if len(f0nzframe) < 10:
                startf0 = startf0 + stepf0
                stopf0 = stopf0 + stepf0
                rmwin.append(l)
                continue
            GCI = SE_VQ_varF0(data_frame, fs, f0=f0_frame)
            # IAIF estimates the glottal flow derivative; integrating it
            # (cumtrapz) yields the glottal flow, both normalized to [-1, 1]
            g_iaif = IAIF(data_frame, fs, GCI)
            g_iaif = g_iaif - np.mean(g_iaif)
            g_iaif = g_iaif / max(abs(g_iaif))
            glottal = cumtrapz(g_iaif)
            glottal = glottal - np.mean(glottal)
            glottal = glottal / max(abs(glottal))
            startf0 = startf0 + stepf0
            stopf0 = stopf0 + stepf0

            # differences between consecutive GCIs (in seconds) describe
            # the glottal cycle durations
            gci_s = GCI[:]
            GCId = np.diff(gci_s)
            avgGCIt[l] = np.mean(GCId / fs)
            varGCIt[l] = np.std(GCId / fs)
            NAQ, QOQ, T1, T2, H1H2, HRF = get_vq_params(
                glottal, g_iaif, fs, GCI)
            avgNAQt[l] = np.mean(NAQ)
            varNAQt[l] = np.std(NAQ)
            avgQOQt[l] = np.mean(QOQ)
            varQOQt[l] = np.std(QOQ)
            avgH1H2t[l] = np.mean(H1H2)
            varH1H2t[l] = np.std(H1H2)
            avgHRFt[l] = np.mean(HRF)
            varHRFt[l] = np.std(HRF)
            if plots:
                self.plot_glottal(data_frame, fs, GCI, g_iaif, glottal,
                                  avgGCIt[l], varGCIt[l])

        # drop the unvoiced frames from every descriptor returned in feat
        if len(rmwin) > 0:
            varGCIt = np.delete(varGCIt, rmwin)
            avgNAQt = np.delete(avgNAQt, rmwin)
            varNAQt = np.delete(varNAQt, rmwin)
            avgQOQt = np.delete(avgQOQt, rmwin)
            varQOQt = np.delete(varQOQt, rmwin)
            avgH1H2t = np.delete(avgH1H2t, rmwin)
            varH1H2t = np.delete(varH1H2t, rmwin)
            avgHRFt = np.delete(avgHRFt, rmwin)
            varHRFt = np.delete(varHRFt, rmwin)

        feat = np.stack((varGCIt, avgNAQt, varNAQt, avgQOQt, varQOQt, avgH1H2t,
                         varH1H2t, avgHRFt, varHRFt),
                        axis=1)
        if fmt == "npy" or fmt == "txt":
            if static:
                return dynamic2static(feat)
            else:
                return feat

        elif fmt == "dataframe" or fmt == "csv":
            if static:
                feat_st = dynamic2static(feat)
                head_st = []
                df = {}
                for k in [
                        "global avg", "global std", "global skewness",
                        "global kurtosis"
                ]:
                    for h in self.head:
                        head_st.append(k + " " + h)
                for e, k in enumerate(head_st):
                    df[k] = [feat_st[e]]

                return pd.DataFrame(df)
            else:
                df = {}
                for e, k in enumerate(self.head):
                    df[k] = feat[:, e]
                return pd.DataFrame(df)
        elif fmt == "torch":
            if static:
                feat_s = dynamic2static(feat)
                feat_t = torch.from_numpy(feat_s)
                return feat_t
            else:
                return torch.from_numpy(feat)
        elif fmt == "kaldi":
            if static:
                raise ValueError(
                    "Kaldi is only supported for dynamic features")
            else:
                name_all = audio.split('/')
                dictX = {name_all[-1]: feat}
                save_dict_kaldimat(dictX, kaldi_file)
        else:
            # previously an unknown fmt fell through and silently returned None
            raise ValueError(fmt + " is not a valid format")
Example #2
0
def glottal_features(audio, flag_plots, size_frame=0.2, size_step=0.1):
    """Compute frame-wise glottal descriptors from a wav file.

    :param audio: path to the input wav file
    :param flag_plots: whether to plot the glottal signals per frame
    :param size_frame: analysis frame length in seconds
    :param size_step: analysis step in seconds
    :returns: nine arrays (one value per voiced frame): std of the glottal
        cycle durations, then mean/std of NAQ, QOQ, H1H2, and HRF.
        Frames with fewer than 10 voiced f0 samples are dropped.
    """
    fs, signal = read(audio)
    # remove DC offset and scale the amplitude into [-1, 1]
    signal = signal - np.mean(signal)
    signal = signal / float(np.max(np.abs(signal)))

    frame_len = size_frame * float(fs)
    step_len = size_step * float(fs)
    overlap = step_len / frame_len
    num_frames = int((len(signal) / frame_len / overlap)) - 1

    # RAPT pitch track over 16-bit-scaled samples, one value every 10 ms
    scaled = np.asarray(signal * (2**15), dtype=np.float32)
    f0 = pysptk.sptk.rapt(scaled,
                          fs,
                          int(0.01 * fs),
                          min=20,
                          max=500,
                          voice_bias=-0.2,
                          otype='f0')
    # how many 10-ms f0 values one analysis frame / step spans
    f0_per_frame = int(size_frame / 0.01)
    f0_per_step = int(size_step / 0.01)

    stats = [np.zeros(num_frames) for _ in range(10)]
    (gci_mean, gci_std, naq_mean, naq_std, qoq_mean, qoq_std,
     h1h2_mean, h1h2_std, hrf_mean, hrf_std) = stats
    skipped = []

    def show_progress(i):
        # single place for the in-place progress line printed each frame
        print("frame " + str(i) + " from " + str(num_frames) +
              "-" * int(100 * i / num_frames) + ">" +
              str(int(100 * (i + 1) / num_frames)) + "%",
              sep=' ',
              end='\r')

    for idx in range(num_frames):
        begin = idx * step_len
        frame = signal[int(begin):int(begin + frame_len)]
        f0_start = idx * f0_per_step
        f0_seg = f0[f0_start:f0_start + f0_per_frame]
        voiced = f0_seg[f0_seg != 0]
        # mostly-unvoiced frame: remember it so it can be removed later
        if len(voiced) < 10:
            skipped.append(idx)
            show_progress(idx)
            continue

        gci = SE_VQ_varF0(frame, fs, f0=f0_seg)
        # IAIF returns the glottal flow derivative; integrate to get the flow,
        # both normalized to [-1, 1]
        flow_derivative = IAIF(frame, fs, gci)
        flow_derivative = flow_derivative - np.mean(flow_derivative)
        flow_derivative = flow_derivative / max(abs(flow_derivative))
        flow = cumtrapz(flow_derivative)
        flow = flow - np.mean(flow)
        flow = flow / max(abs(flow))

        # glottal cycle durations in seconds from consecutive GCIs
        cycle_durations = np.diff(gci[:]) / fs
        gci_mean[idx] = np.mean(cycle_durations)
        gci_std[idx] = np.std(cycle_durations)
        naq, qoq, _t1, _t2, h1h2, hrf = get_vq_params(flow, flow_derivative,
                                                      fs, gci)
        naq_mean[idx] = np.mean(naq)
        naq_std[idx] = np.std(naq)
        qoq_mean[idx] = np.mean(qoq)
        qoq_std[idx] = np.std(qoq)
        h1h2_mean[idx] = np.mean(h1h2)
        h1h2_std[idx] = np.std(h1h2)
        hrf_mean[idx] = np.mean(hrf)
        hrf_std[idx] = np.std(hrf)
        show_progress(idx)
        if flag_plots:
            plot_glottal(frame, fs, gci, flow_derivative, flow,
                         gci_mean[idx], gci_std[idx])

    results = (gci_std, naq_mean, naq_std, qoq_mean, qoq_std,
               h1h2_mean, h1h2_std, hrf_mean, hrf_std)
    if skipped:
        # drop the unvoiced frames from every returned descriptor
        return tuple(np.delete(arr, skipped) for arr in results)
    return results