Example 1
    def extract_features_path(self, path_audio, static=True, plots=False, fmt="npy", kaldi_file=""):
        """
        Extract the representation learning features for all audio files inside a path
        
        :param path_audio: directory with (.wav) audio files inside, sampled at 16 kHz
        :param static: whether to compute and return statistic functionals over the feature matrix, or return the feature matrix computed over frames
        :param plots: whether to plot the extracted features
        :param fmt: format to return the features (npy, dataframe, torch, kaldi)
        :param kaldi_file: file to store kaldi features, only valid when fmt=="kaldi"
        :returns: features computed from the audio files found in path_audio.

        >>> replearning=RepLearning('CAE')
        >>> path_audio="../audios/"
        >>> features1=replearning.extract_features_path(path_audio, static=True, plots=False, fmt="npy")
        >>> features2=replearning.extract_features_path(path_audio, static=True, plots=False, fmt="csv")
        >>> features3=replearning.extract_features_path(path_audio, static=False, plots=True, fmt="torch")
        >>> replearning.extract_features_path(path_audio, static=False, plots=False, fmt="kaldi", kaldi_file="./test.ark")
        """

        hf = os.listdir(path_audio)
        hf.sort()

        pbar = tqdm(range(len(hf)))
        ids = []

        Features = []
        for j in pbar:
            pbar.set_description("Processing %s" % hf[j])
            audio_file = os.path.join(path_audio, hf[j])
            feat = self.extract_features_file(audio_file, static=static, plots=plots, fmt="npy")
            Features.append(feat)
            if static:
                ids.append(hf[j])
            else:
                # one id per frame so frame-level rows stay traceable to their file
                ids.append(np.repeat(hf[j], feat.shape[0]))

        Features = np.vstack(Features)
        ids = np.hstack(ids)
        if fmt in("npy","txt"):
            return Features
        if fmt in("dataframe","csv"):
            if static:
                df={}
                for e, k in enumerate(self.head_st):
                    df[k]=Features[:,e]
            else:
                df={}
                for e, k in enumerate(self.head_dyn):
                    df[k]=Features[:,e]
            df["id"]=ids
            return pd.DataFrame(df)
        if fmt=="torch":
            return torch.from_numpy(Features)
        if fmt=="kaldi":
            if static:
                raise ValueError("Kaldi is only supported for dynamic features")
            dictX=get_dict(Features, ids)
            save_dict_kaldimat(dictX, kaldi_file)
        else:
            raise ValueError(fmt+" is not supported")
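For the "kaldi" branch, get_dict and save_dict_kaldimat are helpers provided elsewhere in the library. A minimal sketch of the grouping get_dict is assumed to perform (get_dict_sketch is a hypothetical name, for illustration only):

    import numpy as np

    def get_dict_sketch(features, ids):
        # Hypothetical illustration: group the stacked frame-level rows by
        # utterance id so each .ark entry holds one [n_frames x n_feats] matrix.
        return {u: features[ids == u] for u in np.unique(ids)}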
Example 2
    def extract_features_file(self,
                              audio,
                              static=True,
                              plots=False,
                              fmt="npy",
                              kaldi_file=""):
        """Extract the prosody features from an audio file

        :param audio: .wav audio file.
        :param static: whether to compute and return statistic functionals over the feature matrix, or return the feature matrix computed over frames
        :param plots: whether to plot the extracted features
        :param fmt: format to return the features (npy, dataframe, torch, kaldi)
        :param kaldi_file: file to store kaldi features, only valid when fmt=="kaldi"
        :returns: features computed from the audio file.

        >>> prosody=Prosody()
        >>> file_audio="../audios/001_ddk1_PCGITA.wav"
        >>> features1=prosody.extract_features_file(file_audio, static=True, plots=True, fmt="npy")
        >>> features2=prosody.extract_features_file(file_audio, static=True, plots=True, fmt="dataframe")
        >>> features3=prosody.extract_features_file(file_audio, static=False, plots=True, fmt="torch")
        >>> prosody.extract_features_file(file_audio, static=False, plots=False, fmt="kaldi", kaldi_file="./test")
        """
        if static:
            features = self.prosody_static(audio, plots)
            if fmt in ("npy", "txt"):
                return features
            if fmt in ("dataframe", "csv"):
                df = {}
                for e, k in enumerate(self.head_st):
                    df[k] = [features[e]]
                return pd.DataFrame(df)
            if fmt == "torch":
                feat_t = torch.from_numpy(features)
                return feat_t
            if fmt == "kaldi":
                raise ValueError(
                    "Kaldi is only supported for dynamic features")
            raise ValueError("format" + fmt + " is not supported")

        else:
            features = self.prosody_dynamic(audio)
            if fmt in ("npy", "txt"):
                return features
            if fmt in ("dataframe", "csv"):
                df = {}
                for e, k in enumerate(self.head_dyn):
                    df[k] = features[:, e]
                return pd.DataFrame(df)
            if fmt == "torch":
                feat_t = torch.from_numpy(features)
                return feat_t
            if fmt == "kaldi":
                name_all = audio.split('/')
                dictX = {name_all[-1]: features}
                save_dict_kaldimat(dictX, kaldi_file)
            else:
                raise ValueError("format" + fmt + " is not supported")
Example 3
    def extract_features_file(self,
                              audio,
                              static=True,
                              plots=False,
                              fmt="npy",
                              kaldi_file=""):
        """Extract the glottal features from an audio file

        :param audio: .wav audio file.
        :param static: whether to compute and return statistic functionals over the feature matrix, or return the feature matrix computed over frames
        :param plots: whether to plot the extracted features
        :param fmt: format to return the features (npy, dataframe, torch, kaldi)
        :param kaldi_file: file to store kaldi features, only valid when fmt=="kaldi"
        :returns: features computed from the audio file.

        >>> glottal=Glottal()
        >>> file_audio="../audios/001_a1_PCGITA.wav"
        >>> features1=glottal.extract_features_file(file_audio, static=True, plots=True, fmt="npy")
        >>> features2=glottal.extract_features_file(file_audio, static=True, plots=True, fmt="dataframe")
        >>> features3=glottal.extract_features_file(file_audio, static=False, plots=True, fmt="torch")
        >>> glottal.extract_features_file(file_audio, static=False, plots=False, fmt="kaldi", kaldi_file="./test.ark")
        """
        if not audio.lower().endswith('.wav'):
            raise ValueError(audio + " is not a valid wav file")
        fs, data_audio = read(audio)
        data_audio = data_audio - np.mean(data_audio)
        data_audio = data_audio / float(np.max(np.abs(data_audio)))
        size_frameS = self.size_frame * float(fs)
        size_stepS = self.size_step * float(fs)
        overlap = size_stepS / size_frameS
        nF = int((len(data_audio) / size_frameS / overlap)) - 1
        data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
        f0 = pysptk.sptk.rapt(data_audiof,
                              fs,
                              int(0.01 * fs),
                              min=20,
                              max=500,
                              voice_bias=-0.2,
                              otype='f0')
        sizef0 = int(self.size_frame / 0.01)
        stepf0 = int(self.size_step / 0.01)
        startf0 = 0
        stopf0 = sizef0
        avgGCIt = np.zeros(nF)
        varGCIt = np.zeros(nF)
        avgNAQt = np.zeros(nF)
        varNAQt = np.zeros(nF)
        avgQOQt = np.zeros(nF)
        varQOQt = np.zeros(nF)
        avgH1H2t = np.zeros(nF)
        varH1H2t = np.zeros(nF)
        avgHRFt = np.zeros(nF)
        varHRFt = np.zeros(nF)
        rmwin = []
        for l in range(nF):
            data_frame = data_audio[int(l * size_stepS):int(l * size_stepS +
                                                            size_frameS)]
            f0_frame = f0[startf0:stopf0]
            pf0framez = np.where(f0_frame != 0)[0]
            f0nzframe = f0_frame[pf0framez]
            if len(f0nzframe) < 10:
                startf0 = startf0 + stepf0
                stopf0 = stopf0 + stepf0
                rmwin.append(l)
                continue
            GCI = SE_VQ_varF0(data_frame, fs, f0=f0_frame)
            g_iaif = IAIF(data_frame, fs, GCI)
            g_iaif = g_iaif - np.mean(g_iaif)
            g_iaif = g_iaif / max(abs(g_iaif))
            glottal = cumtrapz(g_iaif)
            glottal = glottal - np.mean(glottal)
            glottal = glottal / max(abs(glottal))
            startf0 = startf0 + stepf0
            stopf0 = stopf0 + stepf0

            gci_s = GCI[:]
            GCId = np.diff(gci_s)
            avgGCIt[l] = np.mean(GCId / fs)
            varGCIt[l] = np.std(GCId / fs)
            NAQ, QOQ, T1, T2, H1H2, HRF = get_vq_params(
                glottal, g_iaif, fs, GCI)
            avgNAQt[l] = np.mean(NAQ)
            varNAQt[l] = np.std(NAQ)
            avgQOQt[l] = np.mean(QOQ)
            varQOQt[l] = np.std(QOQ)
            avgH1H2t[l] = np.mean(H1H2)
            varH1H2t[l] = np.std(H1H2)
            avgHRFt[l] = np.mean(HRF)
            varHRFt[l] = np.std(HRF)
            if plots:
                self.plot_glottal(data_frame, fs, GCI, g_iaif, glottal,
                                  avgGCIt[l], varGCIt[l])

        if len(rmwin) > 0:
            varGCIt = np.delete(varGCIt, rmwin)
            avgNAQt = np.delete(avgNAQt, rmwin)
            varNAQt = np.delete(varNAQt, rmwin)
            avgQOQt = np.delete(avgQOQt, rmwin)
            varQOQt = np.delete(varQOQt, rmwin)
            avgH1H2t = np.delete(avgH1H2t, rmwin)
            varH1H2t = np.delete(varH1H2t, rmwin)
            avgHRFt = np.delete(avgHRFt, rmwin)
            varHRFt = np.delete(varHRFt, rmwin)

        feat = np.stack((varGCIt, avgNAQt, varNAQt, avgQOQt, varQOQt, avgH1H2t,
                         varH1H2t, avgHRFt, varHRFt),
                        axis=1)
        if fmt == "npy" or fmt == "txt":
            if static:
                return dynamic2static(feat)
            else:
                return feat

        elif fmt == "dataframe" or fmt == "csv":
            if static:
                feat_st = dynamic2static(feat)
                head_st = []
                df = {}
                for k in [
                        "global avg", "global std", "global skewness",
                        "global kurtosis"
                ]:
                    for h in self.head:
                        head_st.append(k + " " + h)
                for e, k in enumerate(head_st):
                    df[k] = [feat_st[e]]

                return pd.DataFrame(df)
            else:
                df = {}
                for e, k in enumerate(self.head):
                    df[k] = feat[:, e]
                return pd.DataFrame(df)
        elif fmt == "torch":
            if static:
                feat_s = dynamic2static(feat)
                feat_t = torch.from_numpy(feat_s)
                return feat_t
            else:
                return torch.from_numpy(feat)
        elif fmt == "kaldi":
            if static:
                raise ValueError(
                    "Kaldi is only supported for dynamic features")
            else:
                name_all = audio.split('/')
                dictX = {name_all[-1]: feat}
                save_dict_kaldimat(dictX, kaldi_file)
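The frame bookkeeping above can be checked numerically. A minimal sketch, with the window, hop, and audio duration assumed purely for illustration (the real values come from the class constructor):

    fs = 16000                               # sampling rate of the input audio
    size_frame, size_step = 0.2, 0.05        # assumed window and hop, in seconds
    size_frameS = size_frame * fs            # 3200 samples per window
    size_stepS = size_step * fs              # 800 samples per hop
    overlap = size_stepS / size_frameS       # 0.25
    n_samples = 3 * fs                       # a 3-second recording
    nF = int(n_samples / size_frameS / overlap) - 1   # 59 analysis frames

    # f0 from pysptk.sptk.rapt uses a 10 ms hop, so each analysis window
    # spans size_frame/0.01 = 20 f0 values, advanced by size_step/0.01 = 5.
    sizef0, stepf0 = int(size_frame / 0.01), int(size_step / 0.01)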
Example 4
    def extract_features_file(self,
                              audio,
                              static=True,
                              plots=False,
                              fmt="npy",
                              kaldi_file=""):
        """Extract the phonation features from an audio file
        
        :param audio: .wav audio file.
        :param static: whether to compute and return statistic functionals over the feature matrix, or return the feature matrix computed over frames
        :param plots: whether to plot the extracted features
        :param fmt: format to return the features (npy, dataframe, torch, kaldi)
        :param kaldi_file: file to store kaldi features, only valid when fmt=="kaldi"
        :returns: features computed from the audio file.

        >>> phonation=Phonation()
        >>> file_audio="../audios/001_a1_PCGITA.wav"
        >>> features1=phonation.extract_features_file(file_audio, static=True, plots=True, fmt="npy")
        >>> features2=phonation.extract_features_file(file_audio, static=True, plots=True, fmt="dataframe")
        >>> features3=phonation.extract_features_file(file_audio, static=False, plots=True, fmt="torch")
        >>> phonation.extract_features_file(file_audio, static=False, plots=False, fmt="kaldi", kaldi_file="./test")
        """
        fs, data_audio = read(audio)
        data_audio = data_audio - np.mean(data_audio)
        data_audio = data_audio / float(np.max(np.abs(data_audio)))
        size_frameS = self.size_frame * float(fs)
        size_stepS = self.size_step * float(fs)
        overlap = size_stepS / size_frameS
        if self.pitch_method == 'praat':
            name_audio = audio.split('/')
            temp_uuid = 'phon' + name_audio[-1][0:-4]
            if not os.path.exists(self.PATH + '/../tempfiles/'):
                os.makedirs(self.PATH + '/../tempfiles/')
            temp_filename_vuv = self.PATH + '/../tempfiles/tempVUV' + temp_uuid + '.txt'
            temp_filename_f0 = self.PATH + '/../tempfiles/tempF0' + temp_uuid + '.txt'
            praat_functions.praat_vuv(audio,
                                      temp_filename_f0,
                                      temp_filename_vuv,
                                      time_stepF0=self.size_step,
                                      minf0=self.minf0,
                                      maxf0=self.maxf0)
            F0, _ = praat_functions.decodeF0(temp_filename_f0,
                                             len(data_audio) / float(fs),
                                             self.size_step)
            os.remove(temp_filename_vuv)
            os.remove(temp_filename_f0)
        elif self.pitch_method == 'rapt':
            data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
            F0 = pysptk.sptk.rapt(data_audiof,
                                  fs,
                                  int(size_stepS),
                                  min=self.minf0,
                                  max=self.maxf0,
                                  voice_bias=self.voice_bias,
                                  otype='f0')
        F0nz = F0[F0 != 0]
        Jitter = jitter_env(F0nz, len(F0nz))
        nF = int((len(data_audio) / size_frameS / overlap)) - 1
        Amp = []
        logE = []
        apq = []
        ppq = []
        DF0 = np.diff(F0nz, 1)
        DDF0 = np.diff(DF0, 1)
        F0z = F0[F0 == 0]
        totaldurU = len(F0z)
        thresholdE = 10 * logEnergy([self.energy_thr_percent])
        degreeU = 100 * float(totaldurU) / len(F0)
        lnz = 0
        for l in range(nF):
            data_frame = data_audio[int(l * size_stepS):int(l * size_stepS +
                                                            size_frameS)]
            energy = 10 * logEnergy(data_frame)
            if F0[l] != 0:
                Amp.append(np.max(np.abs(data_frame)))
                logE.append(energy)
                if lnz >= 12:
                    # APQ is computed once 12 voiced amplitudes are available
                    amp_arr = np.asarray(
                        [Amp[j] for j in range(lnz - 12, lnz)])
                    apq.append(APQ(amp_arr))
                if lnz >= 6:
                    # PPQ is computed once 6 voiced periods are available
                    f0arr = np.asarray([F0nz[j] for j in range(lnz - 6, lnz)])
                    ppq.append(PPQ(1 / f0arr))
                lnz = lnz + 1

        Shimmer = shimmer_env(Amp, len(Amp))
        apq = np.asarray(apq)
        ppq = np.asarray(ppq)
        logE = np.asarray(logE)

        if len(apq) == 0:
            print(
                "warning: not enough long voiced segments to compute APQ; using shimmer as APQ instead"
            )
            apq = Shimmer

        if plots:
            self.plot_phon(data_audio, fs, F0, logE)

        # np.diff and the APQ/PPQ warm-up lags shorten each stream differently,
        # so the longer streams are trimmed from the front to a common length
        if len(Shimmer) == len(apq):
            feat_mat = np.vstack((DF0[5:], DDF0[4:], Jitter[6:], Shimmer[6:],
                                  apq[6:], ppq, logE[6:])).T
        else:
            feat_mat = np.vstack((DF0[11:], DDF0[10:], Jitter[12:],
                                  Shimmer[12:], apq, ppq[6:], logE[12:])).T

        feat_v = dynamic2statict([DF0, DDF0, Jitter, Shimmer, apq, ppq, logE])

        if fmt == "npy" or fmt == "txt":
            if static:
                return feat_v
            else:
                return feat_mat
        elif fmt == "dataframe" or fmt == "csv":
            if static:
                head_st = []
                df = {}
                for k in ["avg", "std", "skewness", "kurtosis"]:
                    for h in self.head:
                        head_st.append(k + " " + h)
                for e, k in enumerate(head_st):
                    df[k] = [feat_v[e]]

                return pd.DataFrame(df)
            else:
                df = {}
                for e, k in enumerate(self.head):
                    df[k] = feat_mat[:, e]
                return pd.DataFrame(df)
        elif fmt == "torch":
            if static:
                feat_t = torch.from_numpy(feat_v)
                return feat_t
            else:
                return torch.from_numpy(feat_mat)

        elif fmt == "kaldi":
            if static:
                raise ValueError(
                    "Kaldi is only supported for dynamic features")
            else:
                name_all = audio.split('/')
                dictX = {name_all[-1]: feat_mat}
                save_dict_kaldimat(dictX, kaldi_file)
        else:
            raise ValueError(fmt + " is not supported")
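The slicing offsets before np.vstack exist because np.diff shortens a series and APQ/PPQ only start once enough history has accumulated. A minimal length check, with the number of voiced values assumed for illustration:

    import numpy as np

    n = 100                                  # voiced f0 values, assumed
    F0nz = np.linspace(120, 180, n)
    DF0 = np.diff(F0nz, 1)                   # n-1 values
    DDF0 = np.diff(DF0, 1)                   # n-2 values
    # PPQ lags 6 frames behind the other streams, so trimming DF0[5:]
    # and DDF0[4:] brings everything down to the same n-6 frames.
    assert len(DF0[5:]) == len(DDF0[4:]) == n - 6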
Example 5
    def extract_features_file(self,
                              audio,
                              static=True,
                              plots=False,
                              fmt="npy",
                              kaldi_file=""):
        """Extract the phonological features from an audio file

        :param audio: .wav audio file.
        :param static: whether to compute and return statistic functionals over the feature matrix, or return the feature matrix computed over frames
        :param plots: timeshift to extract the features
        :param fmt: format to return the features (npy, dataframe, torch, kaldi)
        :param kaldi_file: file to store kaldi features, only valid when fmt=="kaldi"
        :returns: features computed from the audio file.

        >>> phonological=Phonological()
        >>> file_audio="../audios/001_ddk1_PCGITA.wav"
        >>> features1=phonological.extract_features_file(file_audio, static=True, plots=True, fmt="npy")
        >>> features2=phonological.extract_features_file(file_audio, static=True, plots=True, fmt="dataframe")
        >>> features3=phonological.extract_features_file(file_audio, static=False, plots=True, fmt="torch")
        >>> phonological.extract_features_file(file_audio, static=False, plots=False, fmt="kaldi", kaldi_file="./test")

        >>> phonological=Phonological()
        >>> path_audio="../audios/"
        >>> features1=phonological.extract_features_path(path_audio, static=True, plots=False, fmt="npy")
        >>> features2=phonological.extract_features_path(path_audio, static=True, plots=False, fmt="csv")
        >>> features3=phonological.extract_features_path(path_audio, static=False, plots=True, fmt="torch")
        >>> phonological.extract_features_path(path_audio, static=False, plots=False, fmt="kaldi", kaldi_file="./test.ark")

        """

        df = self.phon.get_PLLR(audio, plot_flag=plots)

        keys = df.keys().tolist()
        keys.remove('time')

        if static:
            dff = {}
            feat_vec = []
            functions = [np.mean, np.std, st.skew, st.kurtosis, np.max, np.min]

            for j in keys:
                for l, function in zip(self.statistics, functions):
                    if fmt in ("npy", "txt", "torch"):
                        feat_vec.append(function(df[j]))
                    if fmt in ("dataframe", "csv"):
                        feat_name = j + "_" + l
                        dff[feat_name] = [function(df[j])]
            if fmt in ("npy", "txt"):
                return np.hstack(feat_vec)
            if fmt in ("dataframe", "csv"):
                return pd.DataFrame(dff)
            if fmt == "torch":
                return torch.from_numpy(np.hstack(feat_vec))
            if fmt == "kaldi":
                raise ValueError(
                    "Kaldi is only supported for dynamic features")
            raise ValueError(fmt + " is not supported")

        else:

            if fmt in ("npy", "txt"):
                featmat = np.stack([df[k] for k in keys], axis=1)
                return featmat
            if fmt in ("dataframe", "csv"):
                return df
            if fmt == "torch":
                featmat = np.stack([df[k] for k in keys], axis=1)
                return torch.from_numpy(featmat)
            if fmt == "kaldi":
                featmat = np.stack([df[k] for k in keys], axis=1)
                name_all = audio.split('/')
                dictX = {name_all[-1]: featmat}
                save_dict_kaldimat(dictX, kaldi_file)
            else:
                raise ValueError(fmt + " is not supported")
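The static branch pairs every PLLR column with six functionals and names each cell column_statistic. A minimal sketch of the one-row result (the statistic labels are assumed to mirror self.statistics):

    import numpy as np
    import pandas as pd
    import scipy.stats as st

    statistics = ["mean", "std", "skewness", "kurtosis", "max", "min"]  # assumed labels
    functions = [np.mean, np.std, st.skew, st.kurtosis, np.max, np.min]

    pllr = {"vocalic": np.random.randn(50)}  # one hypothetical PLLR stream
    row = {k + "_" + l: [f(v)] for k, v in pllr.items()
           for l, f in zip(statistics, functions)}
    pd.DataFrame(row)  # columns vocalic_mean, vocalic_std, ..., one row per file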
Example 6
    def extract_features_file(self, audio, static=True, plots=False, fmt="npy", kaldi_file=""):
        """
        Extract the representation learning features from an audio file

        :param audio: .wav audio file.
        :param static: whether to compute and return statistic functionals over the feature matrix, or return the feature matrix computed over frames
        :param plots: whether to plot the extracted features
        :param fmt: format to return the features (npy, dataframe, torch, kaldi)
        :param kaldi_file: file to store kaldi features, only valid when fmt=="kaldi"
        :returns: features computed from the audio file.

        >>> replearning=RepLearning('CAE')
        >>> file_audio="../audios/001_ddk1_PCGITA.wav"
        >>> features1=replearning.extract_features_file(file_audio, static=True, plots=True, fmt="npy")
        >>> features2=replearning.extract_features_file(file_audio, static=True, plots=True, fmt="dataframe")
        >>> features3=replearning.extract_features_file(file_audio, static=False, plots=True, fmt="torch")
        >>> replearning.extract_features_file(file_audio, static=False, plots=False, fmt="kaldi", kaldi_file="./test")

        >>> replearning=RepLearning('CAE')
        >>> path_audio="../audios/"
        >>> features1=replearning.extract_features_path(path_audio, static=True, plots=False, fmt="npy")
        >>> features2=replearning.extract_features_path(path_audio, static=True, plots=False, fmt="csv")
        >>> features3=replearning.extract_features_path(path_audio, static=False, plots=True, fmt="torch")
        >>> replearning.extract_features_path(path_audio, static=False, plots=False, fmt="kaldi", kaldi_file="./test.ark")

        """
        hb = self.AEspeech.compute_bottleneck_features(audio)
        err = self.AEspeech.compute_rec_error_features(audio)

        if plots:
            self.AEspeech.plot_spectrograms(audio)

        if static:
            # four statistical functionals over the bottleneck and the
            # reconstruction-error streams, concatenated into one vector
            bottle_feat = np.hstack((np.mean(hb, 0), np.std(hb, 0), st.skew(hb, 0), st.kurtosis(hb, 0)))
            error_feat = np.hstack((np.mean(err, 0), np.std(err, 0), st.skew(err, 0), st.kurtosis(err, 0)))
            feat_vec = np.hstack((bottle_feat, error_feat))

            if fmt in("npy","txt"):
                return feat_vec
            if fmt in("dataframe","csv"):
                dff={key: [value] for (key, value) in zip(self.head_st, feat_vec)}
                return pd.DataFrame(dff)
            if fmt=="torch":
                return torch.from_numpy(feat_vec)
            if fmt=="kaldi":
                raise ValueError("Kaldi is only supported for dynamic features")
            raise ValueError(fmt+" is not supported")

        else:
            featmat = np.concatenate((hb, err), axis=1)
            if fmt in ("npy", "txt"):
                return featmat
            if fmt in ("dataframe", "csv"):
                dff = {}
                for e, key in enumerate(self.head_dyn):
                    dff[key] = featmat[:, e]
                return pd.DataFrame(dff)
            if fmt == "torch":
                return torch.from_numpy(featmat)
            if fmt == "kaldi":
                name_all = audio.split('/')
                dictX = {name_all[-1]: featmat}
                save_dict_kaldimat(dictX, kaldi_file)
            else:
                raise ValueError(fmt + " is not supported")
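With static=True the vector concatenates four functionals over the bottleneck stream and four over the reconstruction error, so its length is 4*(bottleneck units + error bands). A minimal shape check, with both sizes assumed for illustration:

    import numpy as np
    import scipy.stats as st

    hb = np.random.randn(120, 256)   # frames x bottleneck units (assumed)
    err = np.random.randn(120, 128)  # frames x reconstruction-error bands (assumed)

    feat_vec = np.hstack([f(x, 0) for x in (hb, err)
                          for f in (np.mean, np.std, st.skew, st.kurtosis)])
    assert feat_vec.shape == (4 * 256 + 4 * 128,)  # 1536 values here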
Example 7
    def extract_features_file(self,
                              audio,
                              static=True,
                              plots=False,
                              fmt="npy",
                              kaldi_file=""):
        """Extract the articulation features from an audio file

        :param audio: .wav audio file.
        :param static: whether to compute and return statistic functionals over the feature matrix, or return the feature matrix computed over frames
        :param plots: whether to plot the extracted features
        :param fmt: format to return the features (npy, dataframe, torch, kaldi)
        :param kaldi_file: file to store kaldi features, only valid when fmt=="kaldi"
        :returns: features computed from the audio file.

        >>> articulation=Articulation()
        >>> file_audio="../audios/001_ddk1_PCGITA.wav"
        >>> features1=articulation.extract_features_file(file_audio, static=True, plots=True, fmt="npy")
        >>> features2=articulation.extract_features_file(file_audio, static=True, plots=True, fmt="dataframe")
        >>> features3=articulation.extract_features_file(file_audio, static=False, plots=True, fmt="torch")
        >>> articulation.extract_features_file(file_audio, static=False, plots=False, fmt="kaldi", kaldi_file="./test")
        
        >>> path_audio="../audios/"
        >>> features1=articulation.extract_features_path(path_audio, static=True, plots=False, fmt="npy")
        >>> features2=articulation.extract_features_path(path_audio, static=True, plots=False, fmt="csv")
        >>> features3=articulation.extract_features_path(path_audio, static=False, plots=True, fmt="torch")
        >>> articulation.extract_features_path(path_audio, static=False, plots=False, fmt="kaldi", kaldi_file="./test.ark")

        """
        fs, data_audio = read(audio)
        data_audio = data_audio - np.mean(data_audio)
        data_audio = data_audio / float(np.max(np.abs(data_audio)))
        size_frameS = self.sizeframe * float(fs)
        size_stepS = self.step * float(fs)

        if self.pitch_method == 'praat':
            name_audio = audio.split('/')
            temp_uuid = 'articulation' + name_audio[-1][0:-4]
            if not os.path.exists(self.PATH + '/../tempfiles/'):
                os.makedirs(self.PATH + '/../tempfiles/')
            temp_filename_vuv = self.PATH + '/../tempfiles/tempVUV' + temp_uuid + '.txt'
            temp_filename_f0 = self.PATH + '/../tempfiles/tempF0' + temp_uuid + '.txt'
            praat_functions.praat_vuv(audio,
                                      temp_filename_f0,
                                      temp_filename_vuv,
                                      time_stepF0=self.step,
                                      minf0=self.minf0,
                                      maxf0=self.maxf0)
            F0, _ = praat_functions.decodeF0(temp_filename_f0,
                                             len(data_audio) / float(fs),
                                             self.step)
            segmentsFull, segmentsOn, segmentsOff = praat_functions.read_textgrid_trans(
                temp_filename_vuv, data_audio, fs, self.sizeframe)
            os.remove(temp_filename_vuv)
            os.remove(temp_filename_f0)
        elif self.pitch_method == 'rapt':
            data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
            F0 = pysptk.sptk.rapt(data_audiof,
                                  fs,
                                  int(size_stepS),
                                  min=self.minf0,
                                  max=self.maxf0,
                                  voice_bias=self.voice_bias,
                                  otype='f0')

            segmentsOn = V_UV(F0, data_audio, fs, 'onset')
            segmentsOff = V_UV(F0, data_audio, fs, 'offset')

        BBEon, MFCCon = extractTrans(segmentsOn, fs, size_frameS, size_stepS,
                                     self.nB, self.nMFCC)
        BBEoff, MFCCoff = extractTrans(segmentsOff, fs, size_frameS,
                                       size_stepS, self.nB, self.nMFCC)

        DMFCCon = np.asarray(
            [np.diff(MFCCon[:, nf], n=1) for nf in range(MFCCon.shape[1])]).T
        DDMFCCon = np.asarray(
            [np.diff(MFCCon[:, nf], n=2) for nf in range(MFCCon.shape[1])]).T

        DMFCCoff = np.asarray(
            [np.diff(MFCCoff[:, nf], n=1) for nf in range(MFCCoff.shape[1])]).T
        DDMFCCoff = np.asarray(
            [np.diff(MFCCoff[:, nf], n=2) for nf in range(MFCCoff.shape[1])]).T

        name_audio = audio.split('/')
        temp_uuid = 'artic' + name_audio[-1][0:-4]
        if not os.path.exists(self.PATH + '/../tempfiles/'):
            os.makedirs(self.PATH + '/../tempfiles/')
        temp_filename = self.PATH + '/../tempfiles/tempFormants' + temp_uuid + '.txt'
        praat_functions.praat_formants(audio, temp_filename, self.sizeframe,
                                       self.step)
        [F1, F2] = praat_functions.decodeFormants(temp_filename)
        os.remove(temp_filename)

        if len(F0) < len(F1):
            F0 = np.hstack((F0, np.zeros(len(F1) - len(F0))))
            F1nz = np.zeros((0, 1))
            F2nz = np.zeros((0, 1))
            DF1 = np.zeros((0, 1))
            DDF1 = np.zeros((0, 1))
            DF2 = np.zeros((0, 1))
            DDF2 = np.zeros((0, 1))
        else:
            F1 = np.hstack((F1, np.zeros(len(F0) - len(F1))))
            F2 = np.hstack((F2, np.zeros(len(F0) - len(F2))))

            pos0 = np.where(F0 == 0)[0]
            dpos0 = np.hstack(([1], np.diff(pos0)))
            f0u = np.split(pos0, np.where(dpos0 > 1)[0])

            thr_sil = int(self.len_thr_miliseconds / self.step)

            sil_seg = []
            for l in range(len(f0u)):
                # zero the formants over sufficiently long unvoiced stretches
                if len(f0u[l]) >= thr_sil:
                    F1[f0u[l]] = 0
                    F2[f0u[l]] = 0
                sil_seg.append(f0u[l])

            sil_seg = np.hstack(sil_seg)

            F1nz = F1[F1 != 0]
            F2nz = F2[F2 != 0]
            DF1 = np.diff(F1, n=1)
            DF2 = np.diff(F2, n=1)
            DDF1 = np.diff(F1, n=2)
            DDF2 = np.diff(F2, n=2)

            if plots:
                self.plot_art(data_audio, fs, F0, F1, F2, segmentsOn,
                              segmentsOff)

            if len(F1nz) == 0:
                F1nz = np.zeros((0, 1))
            if len(F2nz) == 0:
                F2nz = np.zeros((0, 1))
            if len(DF1) == 0:
                DF1 = np.zeros((0, 1))
            if len(DDF1) == 0:
                DDF1 = np.zeros((0, 1))
            if len(DF2) == 0:
                DF2 = np.zeros((0, 1))
            if len(DDF2) == 0:
                DDF2 = np.zeros((0, 1))

        feat_v = dynamic2statict_artic([
            BBEon, MFCCon, DMFCCon, DDMFCCon, BBEoff, MFCCoff, DMFCCoff,
            DDMFCCoff, F1nz, DF1, DDF1, F2nz, DF2, DDF2
        ])
        feat_mat = np.hstack(
            (BBEon[2:, :], MFCCon[2:, :], DMFCCon[1:, :], DDMFCCon))

        if fmt in ("npy", "txt"):
            if static:
                return feat_v
            return feat_mat
        if fmt in ("dataframe", "csv"):
            if static:
                head_st = []
                df = {}
                for k in ["avg", "std", "skewness", "kurtosis"]:
                    for h in self.head:
                        head_st.append(k + " " + h)
                for e, k in enumerate(head_st):
                    df[k] = [feat_v[e]]

                return pd.DataFrame(df)
            else:
                df = {}
                for e, k in enumerate(self.head_dyn):
                    df[k] = feat_mat[:, e]
                return pd.DataFrame(df)
        if fmt == "torch":
            if static:
                feat_t = torch.from_numpy(feat_v)
                return feat_t
            return torch.from_numpy(feat_mat)

        if fmt == "kaldi":
            if static:
                raise ValueError(
                    "Kaldi is only supported for dynamic features")
            name_all = audio.split('/')
            dictX = {name_all[-1]: feat_mat}
            save_dict_kaldimat(dictX, kaldi_file)
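The row offsets in feat_mat compensate for np.diff: the first and second differences of the onset MFCCs are one and two frames shorter than their source, so the longer streams are trimmed from the front. A minimal check, with the frame count assumed:

    import numpy as np

    T = 50                                   # onset frames, assumed
    MFCCon = np.random.randn(T, 12)
    DMFCCon = np.diff(MFCCon, n=1, axis=0)   # T-1 rows
    DDMFCCon = np.diff(MFCCon, n=2, axis=0)  # T-2 rows
    feat_mat = np.hstack((MFCCon[2:], DMFCCon[1:], DDMFCCon))
    assert feat_mat.shape == (T - 2, 3 * 12)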