Beispiel #1
0
    def extract_features_file(self,
                              audio,
                              static=True,
                              plots=False,
                              fmt="npy",
                              kaldi_file=""):
        """Extract the phonation features from an audio file

        The signal is mean-removed and scaled by its peak absolute value, an
        F0 contour is estimated with the pitch method configured in
        ``self.pitch_method`` ('praat' or 'rapt'), and the contours (F0
        derivatives, jitter, shimmer, APQ, PPQ, log-energy) are computed over
        the voiced frames (frames whose F0 is not zero).

        :param audio: .wav audio file.
        :param static: whether to compute and return statistic functionals over the feature matrix, or return the feature matrix computed over frames
        :param plots: whether to call ``self.plot_phon`` with the signal, the F0 contour and the log-energy contour
        :param fmt: format to return the features (npy, dataframe, torch, kaldi); "txt" is treated as "npy" and "csv" as "dataframe"
        :param kaldi_file: file to store kaldi features, only valid when fmt=="kaldi"
        :returns: features computed from the audio file. Nothing is returned when fmt=="kaldi"; the feature matrix is stored in kaldi_file instead.
        :raises ValueError: if ``self.pitch_method`` or fmt is not supported, or if fmt=="kaldi" is requested together with static=True.

        >>> phonation=Phonation()
        >>> file_audio="../audios/001_a1_PCGITA.wav"
        >>> features1=phonation.extract_features_file(file_audio, static=True, plots=True, fmt="npy")
        >>> features2=phonation.extract_features_file(file_audio, static=True, plots=True, fmt="dataframe")
        >>> features3=phonation.extract_features_file(file_audio, static=False, plots=True, fmt="torch")
        >>> phonation.extract_features_file(file_audio, static=False, plots=False, fmt="kaldi", kaldi_file="./test")
        """
        # Reject unsupported configurations before doing any expensive work
        # (reading the audio, running the pitch tracker, plotting).
        if self.pitch_method not in ('praat', 'rapt'):
            raise ValueError(
                str(self.pitch_method) + " is not a supported pitch method")
        if fmt not in ("npy", "txt", "dataframe", "csv", "torch", "kaldi"):
            raise ValueError(fmt + " is not supported")
        if fmt == "kaldi" and static:
            raise ValueError("Kaldi is only supported for dynamic features")

        fs, data_audio = read(audio)
        # Remove the DC offset and scale to a peak absolute amplitude of 1.
        data_audio = data_audio - np.mean(data_audio)
        data_audio = data_audio / float(np.max(np.abs(data_audio)))
        # Frame length and hop expressed in samples (kept as floats).
        size_frameS = self.size_frame * float(fs)
        size_stepS = self.size_step * float(fs)
        overlap = size_stepS / size_frameS
        base_name = os.path.basename(audio)

        if self.pitch_method == 'praat':
            temp_uuid = 'phon' + os.path.splitext(base_name)[0]
            temp_dir = self.PATH + '/../tempfiles/'
            os.makedirs(temp_dir, exist_ok=True)
            temp_filename_vuv = temp_dir + 'tempVUV' + temp_uuid + '.txt'
            temp_filename_f0 = temp_dir + 'tempF0' + temp_uuid + '.txt'
            try:
                praat_functions.praat_vuv(audio,
                                          temp_filename_f0,
                                          temp_filename_vuv,
                                          time_stepF0=self.size_step,
                                          minf0=self.minf0,
                                          maxf0=self.maxf0)
                F0, _ = praat_functions.decodeF0(temp_filename_f0,
                                                 len(data_audio) / float(fs),
                                                 self.size_step)
            finally:
                # Always clean up, even when Praat or the decoder fails.
                for temp_filename in (temp_filename_vuv, temp_filename_f0):
                    if os.path.exists(temp_filename):
                        os.remove(temp_filename)
        else:  # 'rapt' (the only other value accepted above)
            data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
            F0 = pysptk.sptk.rapt(data_audiof,
                                  fs,
                                  int(size_stepS),
                                  min=self.minf0,
                                  max=self.maxf0,
                                  voice_bias=self.voice_bias,
                                  otype='f0')

        # Voiced part of the contour: entries whose F0 is not zero.
        F0nz = F0[F0 != 0]
        Jitter = jitter_env(F0nz, len(F0nz))
        nF = int((len(data_audio) / size_frameS / overlap)) - 1
        Amp = []
        logE = []
        apq = []
        ppq = []
        DF0 = np.diff(F0nz, 1)
        DDF0 = np.diff(DF0, 1)

        lnz = 0  # number of voiced frames processed so far
        for l in range(nF):
            if F0[l] == 0:
                continue
            data_frame = data_audio[int(l * size_stepS):int(l * size_stepS +
                                                            size_frameS)]
            Amp.append(np.max(np.abs(data_frame)))
            logE.append(10 * logEnergy(data_frame))
            if lnz >= 12:
                # APQ is computed over the 12 previous voiced-frame amplitudes.
                apq.append(APQ(np.asarray(Amp[lnz - 12:lnz])))
            if lnz >= 6:
                # PPQ is computed over the periods (1/F0) of the 6 previous
                # voiced frames.
                ppq.append(PPQ(1 / np.asarray(F0nz[lnz - 6:lnz])))
            lnz = lnz + 1

        Shimmer = shimmer_env(Amp, len(Amp))
        apq = np.asarray(apq)
        ppq = np.asarray(ppq)
        logE = np.asarray(logE)

        if len(apq) == 0:
            print(
                "warning, there is not enough long voiced segments to compute the APQ, in this case APQ=shimmer"
            )
            apq = Shimmer

        if plots:
            self.plot_phon(data_audio, fs, F0, logE)

        # The leading entries of each contour are dropped so that all rows
        # have the same length. NOTE(review): the offsets presumably account
        # for the 12-frame APQ / 6-frame PPQ warm-up -- confirm.
        if len(Shimmer) == len(apq):
            feat_mat = np.vstack((DF0[5:], DDF0[4:], Jitter[6:], Shimmer[6:],
                                  apq[6:], ppq, logE[6:])).T
        else:
            feat_mat = np.vstack((DF0[11:], DDF0[10:], Jitter[12:],
                                  Shimmer[12:], apq, ppq[6:], logE[12:])).T

        feat_v = dynamic2statict([DF0, DDF0, Jitter, Shimmer, apq, ppq, logE])

        if fmt in ("npy", "txt"):
            return feat_v if static else feat_mat

        if fmt in ("dataframe", "csv"):
            if static:
                # One column per (functional, feature) pair, single row.
                head_st = [
                    k + " " + h
                    for k in ["avg", "std", "skewness", "kurtosis"]
                    for h in self.head
                ]
                df = {k: [feat_v[e]] for e, k in enumerate(head_st)}
            else:
                df = {k: feat_mat[:, e] for e, k in enumerate(self.head)}
            return pd.DataFrame(df)

        if fmt == "torch":
            return torch.from_numpy(feat_v if static else feat_mat)

        # fmt == "kaldi": static=True was rejected at the top, so only the
        # frame-level matrix is stored, keyed by the audio file name.
        dictX = {base_name: feat_mat}
        save_dict_kaldimat(dictX, kaldi_file)
Beispiel #2
0
def phonationVowels(audio, flag_plots, size_frame=0.04, size_step=0.02, minf0=60, maxf0=350, voice_bias=-0.2, energy_thr_percent=0.025, pitch_method='praat'):
    """Compute phonation contours from an audio file.

    The signal is mean-removed and scaled by its peak absolute value, an F0
    contour is estimated with Praat or RAPT, and the remaining contours are
    computed over the voiced frames (frames whose F0 is not zero). Progress
    and the length of every contour are printed to standard output.

    :param audio: path of the audio file to analyze.
    :param flag_plots: whether to call ``plot_phon`` with the signal, the F0 contour and the log-energy contour.
    :param size_frame: frame length; it is multiplied by the sampling frequency to obtain the length in samples.
    :param size_step: frame hop, in the same unit as size_frame; also used as the time step of the pitch tracker.
    :param minf0: minimum F0 passed to the pitch tracker.
    :param maxf0: maximum F0 passed to the pitch tracker.
    :param voice_bias: voice bias passed to RAPT (only used when pitch_method=='rapt').
    :param energy_thr_percent: kept for backward compatibility; it does not affect the returned values.
    :param pitch_method: 'praat' or 'rapt'.
    :returns: tuple (F0, DF0, DDF0, F0semi, Jitter, Shimmer, apq, ppq, logE, degreeU), where degreeU is the percentage of entries of F0 equal to zero.
    :raises ValueError: if pitch_method is neither 'praat' nor 'rapt'.
    """
    # Fail early instead of hitting an unbound F0 after reading the audio.
    if pitch_method not in ('praat', 'rapt'):
        raise ValueError(str(pitch_method) + " is not a supported pitch method")

    fs, data_audio = read(audio)
    # Remove the DC offset and scale to a peak absolute amplitude of 1.
    data_audio = data_audio - np.mean(data_audio)
    data_audio = data_audio / float(np.max(np.abs(data_audio)))
    # Frame length and hop expressed in samples (kept as floats).
    size_frameS = size_frame * float(fs)
    size_stepS = size_step * float(fs)
    overlap = size_stepS / size_frameS

    if pitch_method == 'praat':
        temp_uuid = 'phon' + os.path.splitext(os.path.basename(audio))[0]
        os.makedirs('../tempfiles/', exist_ok=True)
        temp_filename_vuv = '../tempfiles/tempVUV' + temp_uuid + '.txt'
        temp_filename_f0 = '../tempfiles/tempF0' + temp_uuid + '.txt'
        try:
            praat_functions.praat_vuv(audio, temp_filename_f0, temp_filename_vuv, time_stepF0=size_step, minf0=minf0, maxf0=maxf0)
            F0, _ = praat_functions.decodeF0(temp_filename_f0, len(data_audio) / float(fs), size_step)
        finally:
            # The intermediate Praat files are not needed once F0 is decoded;
            # remove them even when Praat or the decoder fails.
            for temp_filename in (temp_filename_vuv, temp_filename_f0):
                if os.path.exists(temp_filename):
                    os.remove(temp_filename)
    else:  # 'rapt' (the only other value accepted above)
        data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
        F0 = pysptk.sptk.rapt(data_audiof, fs, int(size_stepS), min=minf0, max=maxf0, voice_bias=voice_bias, otype='f0')

    # Voiced part of the contour: entries whose F0 is not zero.
    F0nz = F0[F0 != 0]
    Jitter = jitter_env(F0nz, len(F0nz))

    nF = int((len(data_audio) / size_frameS / overlap)) - 1
    Amp = []
    logE = []
    apq = []
    ppq = []

    DF0 = np.diff(F0nz, 1)
    DDF0 = np.diff(DF0, 1)

    # Percentage of entries of the F0 contour that are zero (unvoiced).
    totaldurU = len(F0[F0 == 0])
    degreeU = 100 * float(totaldurU) / len(F0)

    lnz = 0  # number of voiced frames processed so far
    for l in range(nF):
        if F0[l] != 0:
            data_frame = data_audio[int(l * size_stepS):int(l * size_stepS + size_frameS)]
            Amp.append(np.max(np.abs(data_frame)))
            logE.append(10 * logEnergy(data_frame))
            if lnz >= 12:
                # APQ is computed over the 12 previous voiced-frame amplitudes.
                apq.append(APQ(np.asarray(Amp[lnz - 12:lnz])))
            if lnz >= 6:
                # PPQ is computed over the periods (1/F0) of the 6 previous
                # voiced frames.
                ppq.append(PPQ(1 / np.asarray(F0nz[lnz - 6:lnz])))
            lnz = lnz + 1
        # Single-line progress indicator, rewritten in place via '\r'.
        print("frame "+str(l) +" from "+str(nF)+"-"*int(100*l/nF)+">"+str(int(100*(l+1)/nF))+"%", sep=' ', end='\r', flush=True)

    Shimmer = shimmer_env(Amp, len(Amp))
    apq = np.asarray(apq)
    ppq = np.asarray(ppq)
    logE = np.asarray(logE)
    F0semi = np.asarray([Hz2semitones(f0) for f0 in F0nz])

    if flag_plots:
        plot_phon(data_audio, fs, F0, logE)

    print("Jitter=", len(Jitter))
    print("Shimmer", len(Shimmer))
    print("APQ", len(apq))
    print("PPQ", len(ppq))
    print("DF0", len(DF0))
    print("DDF0", len(DDF0))
    print("Energy", len(logE))
    print("degree unvoiced",degreeU)

    return F0, DF0, DDF0, F0semi, Jitter, Shimmer, apq, ppq, logE, degreeU