Beispiel #1
0
    def prosody_static(self, audio, plots):
        """Extract the static prosody features from an audio file

        The signal is mean-removed and peak-normalized, the F0 contour is
        estimated with the method selected in ``self.pitch_method``, and the
        signal is split into voiced, unvoiced and pause segments from which
        F0, energy and duration features are computed.

        :param audio: .wav audio file.
        :param plots: if true, the results are plotted with ``self.plot_pros``
        :returns: array with the 103 prosody features
        :raises ValueError: if ``self.pitch_method`` is neither 'praat' nor 'rapt'

        >>> prosody=Prosody()
        >>> file_audio="../audios/001_ddk1_PCGITA.wav"
        >>> features=prosody.prosody_static(file_audio, plots=True)

        """
        # Fail early with a clear message: an unknown method used to leave F0
        # undefined and surface later as an unrelated NameError.
        if self.pitch_method not in ('praat', 'rapt'):
            raise ValueError(
                str(self.pitch_method) +
                " is not a supported pitch method, use 'praat' or 'rapt'")

        fs, data_audio = read(audio)
        # Remove the mean and scale so that the peak absolute amplitude is 1
        data_audio = data_audio - np.mean(data_audio)
        data_audio = data_audio / float(np.max(np.abs(data_audio)))
        # Frame size, step and pause-length threshold expressed in samples
        size_frameS = self.size_frame * float(fs)
        size_stepS = self.step * float(fs)
        thr_len_pause = self.thr_len * float(fs)

        if self.pitch_method == 'praat':
            name_audio = audio.split('/')
            temp_uuid = 'prosody' + name_audio[-1][0:-4]
            if not os.path.exists(self.PATH + '/../tempfiles/'):
                os.makedirs(self.PATH + '/../tempfiles/')
            temp_filename_f0 = self.PATH + '/../tempfiles/tempF0' + temp_uuid + '.txt'
            temp_filename_vuv = self.PATH + '/../tempfiles/tempVUV' + temp_uuid + '.txt'
            try:
                praat_functions.praat_vuv(audio,
                                          temp_filename_f0,
                                          temp_filename_vuv,
                                          time_stepF0=self.step,
                                          minf0=self.minf0,
                                          maxf0=self.maxf0)
                F0, _ = praat_functions.decodeF0(temp_filename_f0,
                                                 len(data_audio) / float(fs),
                                                 self.step)
            finally:
                # Always clean up, even when Praat or the decoding fails
                for temp_file in (temp_filename_f0, temp_filename_vuv):
                    if os.path.exists(temp_file):
                        os.remove(temp_file)
        else:
            # 'rapt': the normalized signal is rescaled by 2**15 and passed
            # as float32
            data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
            F0 = pysptk.sptk.rapt(data_audiof,
                                  fs,
                                  int(size_stepS),
                                  min=self.minf0,
                                  max=self.maxf0,
                                  voice_bias=self.voice_bias,
                                  otype='f0')

        segmentsV = V_UV(F0, data_audio, fs, type_seg="Voiced", size_stepS=size_stepS)
        segmentsUP = V_UV(F0, data_audio, fs, type_seg="Unvoiced", size_stepS=size_stepS)

        # Non-voiced segments longer than the threshold are pauses, the rest
        # are unvoiced segments
        segmentsP = []
        segmentsU = []
        for segment in segmentsUP:
            if len(segment) > thr_len_pause:
                segmentsP.append(segment)
            else:
                segmentsU.append(segment)

        F0_features = F0feat(F0)
        energy_featuresV = energy_feat(segmentsV, fs, size_frameS, size_stepS)
        energy_featuresU = energy_feat(segmentsU, fs, size_frameS, size_stepS)
        duration_features = duration_feat(segmentsV, segmentsU, segmentsP, data_audio, fs)

        if plots:
            self.plot_pros(data_audio, fs, F0, segmentsV, segmentsU, F0_features)

        features = np.hstack((F0_features, energy_featuresV, energy_featuresU, duration_features))

        return features
Beispiel #2
0
    def extract_features_file(self,
                              audio,
                              static=True,
                              plots=False,
                              fmt="npy",
                              kaldi_file=""):
        """Extract the phonation features from an audio file

        :param audio: .wav audio file.
        :param static: whether to compute and return statistic functionals over the feature matrix, or return the feature matrix computed over frames
        :param plots: if true, the signal, F0 contour and log-energy are plotted with ``self.plot_phon``
        :param fmt: format to return the features (npy, dataframe, torch, kaldi); "txt" is handled like "npy" and "csv" like "dataframe"
        :param kaldi_file: file to store kaldi features, only valid when fmt=="kaldi"
        :returns: features computed from the audio file (None when fmt=="kaldi", the features are written to ``kaldi_file`` instead).
        :raises ValueError: if ``fmt`` is not supported, if fmt=="kaldi" is combined with static=True, or if ``self.pitch_method`` is neither 'praat' nor 'rapt'

        >>> phonation=Phonation()
        >>> file_audio="../audios/001_a1_PCGITA.wav"
        >>> features1=phonation.extract_features_file(file_audio, static=True, plots=True, fmt="npy")
        >>> features2=phonation.extract_features_file(file_audio, static=True, plots=True, fmt="dataframe")
        >>> features3=phonation.extract_features_file(file_audio, static=False, plots=True, fmt="torch")
        >>> phonation.extract_features_file(file_audio, static=False, plots=False, fmt="kaldi", kaldi_file="./test")
        """
        # Validate the request before doing any processing so that a bad
        # argument does not cost a full feature extraction first.
        if fmt not in ("npy", "txt", "dataframe", "csv", "torch", "kaldi"):
            raise ValueError(fmt + " is not supported")
        if fmt == "kaldi" and static:
            raise ValueError("Kaldi is only supported for dynamic features")
        # An unknown method used to leave F0 undefined (NameError later on)
        if self.pitch_method not in ('praat', 'rapt'):
            raise ValueError(
                str(self.pitch_method) +
                " is not a supported pitch method, use 'praat' or 'rapt'")

        fs, data_audio = read(audio)
        # Remove the mean and scale so that the peak absolute amplitude is 1
        data_audio = data_audio - np.mean(data_audio)
        data_audio = data_audio / float(np.max(np.abs(data_audio)))
        # Frame size and step expressed in samples
        size_frameS = self.size_frame * float(fs)
        size_stepS = self.size_step * float(fs)
        overlap = size_stepS / size_frameS

        if self.pitch_method == 'praat':
            name_audio = audio.split('/')
            temp_uuid = 'phon' + name_audio[-1][0:-4]
            if not os.path.exists(self.PATH + '/../tempfiles/'):
                os.makedirs(self.PATH + '/../tempfiles/')
            temp_filename_vuv = self.PATH + '/../tempfiles/tempVUV' + temp_uuid + '.txt'
            temp_filename_f0 = self.PATH + '/../tempfiles/tempF0' + temp_uuid + '.txt'
            try:
                praat_functions.praat_vuv(audio,
                                          temp_filename_f0,
                                          temp_filename_vuv,
                                          time_stepF0=self.size_step,
                                          minf0=self.minf0,
                                          maxf0=self.maxf0)
                F0, _ = praat_functions.decodeF0(temp_filename_f0,
                                                 len(data_audio) / float(fs),
                                                 self.size_step)
            finally:
                # Always clean up, even when Praat or the decoding fails
                for temp_file in (temp_filename_vuv, temp_filename_f0):
                    if os.path.exists(temp_file):
                        os.remove(temp_file)
        else:
            # 'rapt': the normalized signal is rescaled by 2**15 and passed
            # as float32
            data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
            F0 = pysptk.sptk.rapt(data_audiof,
                                  fs,
                                  int(size_stepS),
                                  min=self.minf0,
                                  max=self.maxf0,
                                  voice_bias=self.voice_bias,
                                  otype='f0')

        # Pitch-based contours are computed over the voiced (non-zero) frames
        F0nz = F0[F0 != 0]
        Jitter = jitter_env(F0nz, len(F0nz))
        DF0 = np.diff(F0nz, 1)
        DDF0 = np.diff(DF0, 1)

        nF = int((len(data_audio) / size_frameS / overlap)) - 1
        Amp = []
        logE = []
        apq = []
        ppq = []
        lnz = 0  # number of voiced frames seen so far
        # NOTE(review): F0 is indexed with the frame counter, so this assumes
        # F0 holds at least nF values - confirm for both pitch methods.
        for l in range(nF):
            start = int(l * size_stepS)
            data_frame = data_audio[start:int(l * size_stepS + size_frameS)]
            energy = 10 * logEnergy(data_frame)
            if F0[l] != 0:
                Amp.append(np.max(np.abs(data_frame)))
                logE.append(energy)
                if lnz >= 12:
                    # APQ over the 12 voiced frames preceding the current one
                    apq.append(APQ(np.asarray(Amp[lnz - 12:lnz])))
                if lnz >= 6:
                    # PPQ over the periods (1/F0) of the 6 voiced frames
                    # preceding the current one
                    ppq.append(PPQ(1 / np.asarray(F0nz[lnz - 6:lnz])))
                lnz = lnz + 1

        Shimmer = shimmer_env(Amp, len(Amp))
        apq = np.asarray(apq)
        ppq = np.asarray(ppq)
        logE = np.asarray(logE)

        if len(apq) == 0:
            print(
                "warning, there is not enough long voiced segments to compute the APQ, in this case APQ=shimmer"
            )
            apq = Shimmer

        if plots:
            self.plot_phon(data_audio, fs, F0, logE)

        if static:
            feat_v = dynamic2statict(
                [DF0, DDF0, Jitter, Shimmer, apq, ppq, logE])
        else:
            # The frame-level matrix is only built when it is requested, so a
            # static request cannot fail on the alignment below.
            # ppq starts at the 7th voiced frame and apq at the 13th; the
            # other contours are trimmed so that all rows have equal length.
            if len(Shimmer) == len(apq):
                # APQ was replaced by the shimmer: align everything to ppq
                feat_mat = np.vstack((DF0[5:], DDF0[4:], Jitter[6:],
                                      Shimmer[6:], apq[6:], ppq, logE[6:])).T
            else:
                # Align everything to apq
                feat_mat = np.vstack((DF0[11:], DDF0[10:], Jitter[12:],
                                      Shimmer[12:], apq, ppq[6:],
                                      logE[12:])).T

        if fmt in ("npy", "txt"):
            return feat_v if static else feat_mat

        if fmt in ("dataframe", "csv"):
            if static:
                # One column per (functional, feature) pair, holding one row
                head_st = [
                    k + " " + h
                    for k in ["avg", "std", "skewness", "kurtosis"]
                    for h in self.head
                ]
                return pd.DataFrame(
                    {k: [feat_v[e]] for e, k in enumerate(head_st)})
            return pd.DataFrame(
                {k: feat_mat[:, e] for e, k in enumerate(self.head)})

        if fmt == "torch":
            return torch.from_numpy(feat_v if static else feat_mat)

        # fmt == "kaldi" (dynamic features only, checked at the top): the
        # matrix is stored under the audio file name
        name_all = audio.split('/')
        dictX = {name_all[-1]: feat_mat}
        save_dict_kaldimat(dictX, kaldi_file)
Beispiel #3
0
    def prosody_dynamic(self, audio):
        """Extract the dynamic prosody features from an audio file

        The F0 contour is split into runs of consecutive voiced (non-zero)
        frames. For every run whose audio is longer than one frame, a
        polynomial of degree ``self.P`` is fitted to its F0 contour and to
        its energy contour. Each row of the result holds the ``P+1`` F0
        coefficients, the ``P+1`` energy coefficients (both highest degree
        first) and the segment duration (number of samples divided by the
        sampling frequency), in that order.

        :param audio: .wav audio file.
        :returns: array (N,13) with the prosody features extracted from an audio file.  N= number of voiced segments. The 13 columns correspond to P=5; in general there are 2*(P+1)+1 columns. An empty array is returned when no voiced segment is long enough.
        :raises ValueError: if ``self.pitch_method`` is neither 'praat' nor 'rapt'

        >>> prosody=Prosody()
        >>> file_audio="../audios/001_ddk1_PCGITA.wav"
        >>> features=prosody.prosody_dynamic(file_audio)

        """
        # Fail early with a clear message: an unknown method used to leave F0
        # undefined and surface later as an unrelated NameError.
        if self.pitch_method not in ('praat', 'rapt'):
            raise ValueError(
                str(self.pitch_method) +
                " is not a supported pitch method, use 'praat' or 'rapt'")

        fs, data_audio = read(audio)
        # Remove the mean and scale so that the peak absolute amplitude is 1
        data_audio = data_audio - np.mean(data_audio)
        data_audio = data_audio / float(np.max(np.abs(data_audio)))
        # Frame size and step expressed in samples
        size_frameS = self.size_frame * float(fs)
        size_stepS = self.step * float(fs)
        overlap = size_stepS / size_frameS

        if self.pitch_method == 'praat':
            name_audio = audio.split('/')
            temp_uuid = 'prosody' + name_audio[-1][0:-4]
            if not os.path.exists(self.PATH + '/../tempfiles/'):
                os.makedirs(self.PATH + '/../tempfiles/')
            temp_filename_f0 = self.PATH + '/../tempfiles/tempF0' + temp_uuid + '.txt'
            temp_filename_vuv = self.PATH + '/../tempfiles/tempVUV' + temp_uuid + '.txt'
            try:
                praat_functions.praat_vuv(audio,
                                          temp_filename_f0,
                                          temp_filename_vuv,
                                          time_stepF0=self.step,
                                          minf0=self.minf0,
                                          maxf0=self.maxf0)
                F0, _ = praat_functions.decodeF0(temp_filename_f0,
                                                 len(data_audio) / float(fs),
                                                 self.step)
            finally:
                # Always clean up, even when Praat or the decoding fails
                for temp_file in (temp_filename_f0, temp_filename_vuv):
                    if os.path.exists(temp_file):
                        os.remove(temp_file)
        else:
            # 'rapt': the normalized signal is rescaled by 2**15 and passed
            # as float32
            data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
            F0 = pysptk.sptk.rapt(data_audiof,
                                  fs,
                                  int(size_stepS),
                                  min=self.minf0,
                                  max=self.maxf0,
                                  voice_bias=self.voice_bias,
                                  otype='f0')

        # Indices of the voiced frames
        pitchON = np.where(F0 != 0)[0]
        if len(pitchON) == 0:
            # No voiced frame at all: same result as when every segment is
            # too short (this used to raise an IndexError)
            return np.asarray([])

        def segment_features(f0_contour, voiced_audio):
            """Return the feature row of one voiced segment, or None when the
            segment is not longer than one frame."""
            if len(voiced_audio) <= int(size_frameS):
                return None
            duration = len(voiced_audio) / float(fs)
            # np.polyfit always returns P+1 coefficients. Wrapping the result
            # in np.poly1d would drop leading zero coefficients and could
            # produce rows of different lengths.
            f0_coeffs = np.polyfit(np.arange(0, len(f0_contour)), f0_contour,
                                   self.P)
            energy = E_cont(voiced_audio, size_frameS, size_stepS, overlap)
            energy_coeffs = np.polyfit(np.arange(0, len(energy)), energy,
                                       self.P)
            return list(f0_coeffs) + list(energy_coeffs) + [duration]

        # Positions in pitchON after which the voiced frames stop being
        # consecutive, i.e. the last frame of every segment but the final one
        change = np.where(np.diff(pitchON) > 1)[0]

        featvec = []
        iniV = pitchON[0]
        iniVoiced = (pitchON[0] * size_stepS) + size_stepS  # in samples
        for indx in change:
            finV = pitchON[indx] + 1
            finVoiced = (pitchON[indx] * size_stepS) + size_stepS
            row = segment_features(
                F0[iniV:finV], data_audio[int(iniVoiced):int(finVoiced)])
            if row is not None:
                featvec.append(row)
            # The next segment starts at the following voiced frame
            iniV = pitchON[indx + 1]
            iniVoiced = (pitchON[indx + 1] * size_stepS) + size_stepS

        # Add the last voiced segment. The end index is exclusive (+1), as for
        # the other segments, and the row uses the same column order.
        finV = pitchON[-1] + 1
        finVoiced = (pitchON[-1] * size_stepS) + size_stepS
        row = segment_features(
            F0[iniV:finV], data_audio[int(iniVoiced):int(finVoiced)])
        if row is not None:
            featvec.append(row)

        return np.asarray(featvec)
Beispiel #4
0
    def extract_features_file(self,
                              audio,
                              static=True,
                              plots=False,
                              fmt="npy",
                              kaldi_file=""):
        """Extract the articulation features from an audio file

        :param audio: .wav audio file.
        :param static: whether to compute and return statistic functionals over the feature matrix, or return the feature matrix computed over frames
        :param plots: if true, the results are plotted with ``self.plot_art``
        :param fmt: format to return the features (npy, dataframe, torch, kaldi); "txt" is handled like "npy" and "csv" like "dataframe"
        :param kaldi_file: file to store kaldi features, only valid when fmt=="kaldi"
        :returns: features computed from the audio file (None when fmt=="kaldi", the features are written to ``kaldi_file`` instead).
        :raises ValueError: if ``fmt`` is not supported, if fmt=="kaldi" is combined with static=True, or if ``self.pitch_method`` is neither 'praat' nor 'rapt'

        >>> articulation=Articulation()
        >>> file_audio="../audios/001_ddk1_PCGITA.wav"
        >>> features1=articulation.extract_features_file(file_audio, static=True, plots=True, fmt="npy")
        >>> features2=articulation.extract_features_file(file_audio, static=True, plots=True, fmt="dataframe")
        >>> features3=articulation.extract_features_file(file_audio, static=False, plots=True, fmt="torch")
        >>> articulation.extract_features_file(file_audio, static=False, plots=False, fmt="kaldi", kaldi_file="./test")
        
        >>> path_audio="../audios/"
        >>> features1=articulation.extract_features_path(path_audio, static=True, plots=False, fmt="npy")
        >>> features2=articulation.extract_features_path(path_audio, static=True, plots=False, fmt="csv")
        >>> features3=articulation.extract_features_path(path_audio, static=False, plots=True, fmt="torch")
        >>> articulation.extract_features_path(path_audio, static=False, plots=False, fmt="kaldi", kaldi_file="./test.ark")

        """
        # Validate the request before doing any processing. An unsupported
        # format used to run the whole extraction and silently return None.
        if fmt not in ("npy", "txt", "dataframe", "csv", "torch", "kaldi"):
            raise ValueError(fmt + " is not supported")
        if fmt == "kaldi" and static:
            raise ValueError("Kaldi is only supported for dynamic features")
        # An unknown method used to leave F0 undefined (NameError later on)
        if self.pitch_method not in ('praat', 'rapt'):
            raise ValueError(
                str(self.pitch_method) +
                " is not a supported pitch method, use 'praat' or 'rapt'")

        fs, data_audio = read(audio)
        # Remove the mean and scale so that the peak absolute amplitude is 1
        data_audio = data_audio - np.mean(data_audio)
        data_audio = data_audio / float(np.max(np.abs(data_audio)))
        # Frame size and step expressed in samples
        size_frameS = self.sizeframe * float(fs)
        size_stepS = self.step * float(fs)

        # Directory for the temporary Praat files (pitch and formants)
        if not os.path.exists(self.PATH + '/../tempfiles/'):
            os.makedirs(self.PATH + '/../tempfiles/')
        name_audio = audio.split('/')

        if self.pitch_method == 'praat':
            temp_uuid = 'articulation' + name_audio[-1][0:-4]
            temp_filename_vuv = self.PATH + '/../tempfiles/tempVUV' + temp_uuid + '.txt'
            temp_filename_f0 = self.PATH + '/../tempfiles/tempF0' + temp_uuid + '.txt'
            try:
                praat_functions.praat_vuv(audio,
                                          temp_filename_f0,
                                          temp_filename_vuv,
                                          time_stepF0=self.step,
                                          minf0=self.minf0,
                                          maxf0=self.maxf0)
                F0, _ = praat_functions.decodeF0(temp_filename_f0,
                                                 len(data_audio) / float(fs),
                                                 self.step)
                # Only the onset and offset segments are used below
                _, segmentsOn, segmentsOff = praat_functions.read_textgrid_trans(
                    temp_filename_vuv, data_audio, fs, self.sizeframe)
            finally:
                # Always clean up, even when Praat or the decoding fails
                for temp_file in (temp_filename_vuv, temp_filename_f0):
                    if os.path.exists(temp_file):
                        os.remove(temp_file)
        else:
            # 'rapt': the normalized signal is rescaled by 2**15 and passed
            # as float32
            data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
            F0 = pysptk.sptk.rapt(data_audiof,
                                  fs,
                                  int(size_stepS),
                                  min=self.minf0,
                                  max=self.maxf0,
                                  voice_bias=self.voice_bias,
                                  otype='f0')

            segmentsOn = V_UV(F0, data_audio, fs, 'onset')
            segmentsOff = V_UV(F0, data_audio, fs, 'offset')

        BBEon, MFCCon = extractTrans(segmentsOn, fs, size_frameS, size_stepS,
                                     self.nB, self.nMFCC)
        BBEoff, MFCCoff = extractTrans(segmentsOff, fs, size_frameS,
                                       size_stepS, self.nB, self.nMFCC)

        # First and second differences of every MFCC column along the rows
        DMFCCon = np.asarray(
            [np.diff(MFCCon[:, nf], n=1) for nf in range(MFCCon.shape[1])]).T
        DDMFCCon = np.asarray(
            [np.diff(MFCCon[:, nf], n=2) for nf in range(MFCCon.shape[1])]).T

        DMFCCoff = np.asarray(
            [np.diff(MFCCoff[:, nf], n=1) for nf in range(MFCCoff.shape[1])]).T
        DDMFCCoff = np.asarray(
            [np.diff(MFCCoff[:, nf], n=2) for nf in range(MFCCoff.shape[1])]).T

        # Formant contours computed with Praat
        temp_uuid = 'artic' + name_audio[-1][0:-4]
        temp_filename = self.PATH + '/../tempfiles/tempFormants' + temp_uuid + '.txt'
        try:
            praat_functions.praat_formants(audio, temp_filename,
                                           self.sizeframe, self.step)
            [F1, F2] = praat_functions.decodeFormants(temp_filename)
        finally:
            if os.path.exists(temp_filename):
                os.remove(temp_filename)

        if len(F0) < len(F1):
            # F0 is zero-padded to the formant length and the formant
            # features are left empty in this case
            F0 = np.hstack((F0, np.zeros(len(F1) - len(F0))))
            F1nz = np.zeros((0, 1))
            F2nz = np.zeros((0, 1))
            DF1 = np.zeros((0, 1))
            DDF1 = np.zeros((0, 1))
            DF2 = np.zeros((0, 1))
            DDF2 = np.zeros((0, 1))
        else:
            # Zero-pad the formants to the length of F0
            F1 = np.hstack((F1, np.zeros(len(F0) - len(F1))))
            F2 = np.hstack((F2, np.zeros(len(F0) - len(F2))))

            # Group the frames without pitch into runs of consecutive indices
            pos0 = np.where(F0 == 0)[0]
            dpos0 = np.hstack(([1], np.diff(pos0)))
            f0u = np.split(pos0, np.where(dpos0 > 1)[0])

            # NOTE(review): this divides an attribute named in milliseconds
            # by self.step - confirm that both use the same time unit.
            thr_sil = int(self.len_thr_miliseconds / self.step)

            # Zero the formants over the runs that reach the threshold. The
            # former `sil_seg` accumulator was dropped: its result was never
            # used, and stacking the nested lists of unequal-length arrays
            # fails on recent NumPy versions.
            for unvoiced_run in f0u:
                if len(unvoiced_run) >= thr_sil:
                    F1[unvoiced_run] = 0
                    F2[unvoiced_run] = 0

            F1nz = F1[F1 != 0]
            F2nz = F2[F2 != 0]
            DF1 = np.diff(F1, n=1)
            DF2 = np.diff(F2, n=1)
            DDF1 = np.diff(F1, n=2)
            DDF2 = np.diff(F2, n=2)

            if plots:
                self.plot_art(data_audio, fs, F0, F1, F2, segmentsOn,
                              segmentsOff)

            # Empty contours are replaced by empty (0, 1) arrays
            if len(F1nz) == 0:
                F1nz = np.zeros((0, 1))
            if len(F2nz) == 0:
                F2nz = np.zeros((0, 1))
            if len(DF1) == 0:
                DF1 = np.zeros((0, 1))
            if len(DDF1) == 0:
                DDF1 = np.zeros((0, 1))
            if len(DF2) == 0:
                DF2 = np.zeros((0, 1))
            if len(DDF2) == 0:
                DDF2 = np.zeros((0, 1))

        # Only the representation that was requested is built
        if static:
            feat_v = dynamic2statict_artic([
                BBEon, MFCCon, DMFCCon, DDMFCCon, BBEoff, MFCCoff, DMFCCoff,
                DDMFCCoff, F1nz, DF1, DDF1, F2nz, DF2, DDF2
            ])
        else:
            # Onset features only; rows are trimmed to the length of the
            # second differences
            feat_mat = np.hstack(
                (BBEon[2:, :], MFCCon[2:, :], DMFCCon[1:, :], DDMFCCon))

        if fmt in ("npy", "txt"):
            return feat_v if static else feat_mat
        if fmt in ("dataframe", "csv"):
            if static:
                # One column per (functional, feature) pair, holding one row
                head_st = [
                    k + " " + h
                    for k in ["avg", "std", "skewness", "kurtosis"]
                    for h in self.head
                ]
                return pd.DataFrame(
                    {k: [feat_v[e]] for e, k in enumerate(head_st)})
            return pd.DataFrame(
                {k: feat_mat[:, e] for e, k in enumerate(self.head_dyn)})
        if fmt == "torch":
            return torch.from_numpy(feat_v if static else feat_mat)

        if fmt == "kaldi":
            # Dynamic features only (checked at the top): the matrix is
            # stored under the audio file name
            name_all = audio.split('/')
            dictX = {name_all[-1]: feat_mat}
            save_dict_kaldimat(dictX, kaldi_file)