def extract_features_file(self, audio, static=True, plots=False, fmt="npy", kaldi_file=""):
    """Extract the phonation features from an audio file

    :param audio: .wav audio file.
    :param static: whether to compute and return statistic functionals over the feature matrix, or return the feature matrix computed over frames
    :param plots: whether to plot the waveform, F0 contour, and energy contour
    :param fmt: format to return the features (npy, dataframe, torch, kaldi)
    :param kaldi_file: file to store kaldi features, only valid when fmt=="kaldi"
    :returns: features computed from the audio file.

    >>> phonation=Phonation()
    >>> file_audio="../audios/001_a1_PCGITA.wav"
    >>> features1=phonation.extract_features_file(file_audio, static=True, plots=True, fmt="npy")
    >>> features2=phonation.extract_features_file(file_audio, static=True, plots=True, fmt="dataframe")
    >>> features3=phonation.extract_features_file(file_audio, static=False, plots=True, fmt="torch")
    >>> phonation.extract_features_file(file_audio, static=False, plots=False, fmt="kaldi", kaldi_file="./test")
    """
    fs, data_audio = read(audio)
    # Remove DC offset and normalize the amplitude to [-1, 1]
    data_audio = data_audio - np.mean(data_audio)
    data_audio = data_audio / float(np.max(np.abs(data_audio)))
    size_frameS = self.size_frame * float(fs)   # frame size in samples
    size_stepS = self.size_step * float(fs)     # step size in samples
    overlap = size_stepS / size_frameS

    # F0 contour: either from Praat (via temporary files) or RAPT (pysptk)
    if self.pitch_method == 'praat':
        name_audio = audio.split('/')
        temp_uuid = 'phon' + name_audio[-1][0:-4]
        if not os.path.exists(self.PATH + '/../tempfiles/'):
            os.makedirs(self.PATH + '/../tempfiles/')
        temp_filename_vuv = self.PATH + '/../tempfiles/tempVUV' + temp_uuid + '.txt'
        temp_filename_f0 = self.PATH + '/../tempfiles/tempF0' + temp_uuid + '.txt'
        praat_functions.praat_vuv(audio, temp_filename_f0, temp_filename_vuv,
                                  time_stepF0=self.size_step, minf0=self.minf0, maxf0=self.maxf0)
        F0, _ = praat_functions.decodeF0(temp_filename_f0, len(data_audio) / float(fs), self.size_step)
        os.remove(temp_filename_vuv)
        os.remove(temp_filename_f0)
    elif self.pitch_method == 'rapt':
        data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
        F0 = pysptk.sptk.rapt(data_audiof, fs, int(size_stepS), min=self.minf0,
                              max=self.maxf0, voice_bias=self.voice_bias, otype='f0')

    F0nz = F0[F0 != 0]  # F0 values of the voiced frames only
    Jitter = jitter_env(F0nz, len(F0nz))
    nF = int((len(data_audio) / size_frameS / overlap)) - 1
    Amp = []
    logE = []
    apq = []
    ppq = []
    DF0 = np.diff(F0nz, 1)   # first derivative of the F0 contour
    DDF0 = np.diff(DF0, 1)   # second derivative of the F0 contour
    F0z = F0[F0 == 0]
    totaldurU = len(F0z)
    thresholdE = 10 * logEnergy([self.energy_thr_percent])
    degreeU = 100 * float(totaldurU) / len(F0)  # degree of unvoiced frames (%)
    lnz = 0
    for l in range(nF):
        data_frame = data_audio[int(l * size_stepS):int(l * size_stepS + size_frameS)]
        energy = 10 * logEnergy(data_frame)
        if F0[l] != 0:  # voiced frame
            Amp.append(np.max(np.abs(data_frame)))
            logE.append(energy)
            if lnz >= 12:
                # APQ over a sliding window of the last 12 voiced frames
                amp_arr = np.asarray([Amp[j] for j in range(lnz - 12, lnz)])
                apq.append(APQ(amp_arr))
            if lnz >= 6:
                # PPQ over a sliding window of the last 6 voiced frames
                f0arr = np.asarray([F0nz[j] for j in range(lnz - 6, lnz)])
                ppq.append(PPQ(1 / f0arr))
            lnz = lnz + 1
    Shimmer = shimmer_env(Amp, len(Amp))
    apq = np.asarray(apq)
    ppq = np.asarray(ppq)
    logE = np.asarray(logE)

    if len(apq) == 0:
        print("warning: not enough voiced segments to compute APQ; using Shimmer as APQ instead")
        apq = Shimmer

    if plots:
        self.plot_phon(data_audio, fs, F0, logE)

    # Trim the leading frames so every descriptor has the same number of rows
    if len(Shimmer) == len(apq):
        feat_mat = np.vstack((DF0[5:], DDF0[4:], Jitter[6:], Shimmer[6:], apq[6:], ppq, logE[6:])).T
    else:
        feat_mat = np.vstack((DF0[11:], DDF0[10:], Jitter[12:], Shimmer[12:], apq, ppq[6:], logE[12:])).T

    feat_v = dynamic2statict([DF0, DDF0, Jitter, Shimmer, apq, ppq, logE])

    if fmt in ("npy", "txt"):
        return feat_v if static else feat_mat
    elif fmt in ("dataframe", "csv"):
        if static:
            head_st = []
            df = {}
            for k in ["avg", "std", "skewness", "kurtosis"]:
                for h in self.head:
                    head_st.append(k + " " + h)
            for e, k in enumerate(head_st):
                df[k] = [feat_v[e]]
            return pd.DataFrame(df)
        else:
            df = {}
            for e, k in enumerate(self.head):
                df[k] = feat_mat[:, e]
            return pd.DataFrame(df)
    elif fmt == "torch":
        return torch.from_numpy(feat_v) if static else torch.from_numpy(feat_mat)
    elif fmt == "kaldi":
        if static:
            raise ValueError("Kaldi is only supported for dynamic features")
        name_all = audio.split('/')
        dictX = {name_all[-1]: feat_mat}
        save_dict_kaldimat(dictX, kaldi_file)
    else:
        raise ValueError(fmt + " is not supported")
def phonationVowels(audio, flag_plots, size_frame=0.04, size_step=0.02, minf0=60, maxf0=350,
                    voice_bias=-0.2, energy_thr_percent=0.025, pitch_method='praat'):
    fs, data_audio = read(audio)
    # Remove DC offset and normalize the amplitude to [-1, 1]
    data_audio = data_audio - np.mean(data_audio)
    data_audio = data_audio / float(np.max(np.abs(data_audio)))
    size_frameS = size_frame * float(fs)   # frame size in samples
    size_stepS = size_step * float(fs)     # step size in samples
    overlap = size_stepS / size_frameS

    # F0 contour: either from Praat (via temporary files) or RAPT (pysptk)
    if pitch_method == 'praat':
        name_audio = audio.split('/')
        temp_uuid = 'phon' + name_audio[-1][0:-4]
        if not os.path.exists('../tempfiles/'):
            os.makedirs('../tempfiles/')
        temp_filename_vuv = '../tempfiles/tempVUV' + temp_uuid + '.txt'
        temp_filename_f0 = '../tempfiles/tempF0' + temp_uuid + '.txt'
        praat_functions.praat_vuv(audio, temp_filename_f0, temp_filename_vuv,
                                  time_stepF0=size_step, minf0=minf0, maxf0=maxf0)
        F0, _ = praat_functions.decodeF0(temp_filename_f0, len(data_audio) / float(fs), size_step)
        # Clean up the temporary Praat outputs
        os.remove(temp_filename_vuv)
        os.remove(temp_filename_f0)
    elif pitch_method == 'rapt':
        data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
        F0 = pysptk.sptk.rapt(data_audiof, fs, int(size_stepS), min=minf0, max=maxf0,
                              voice_bias=voice_bias, otype='f0')

    F0nz = F0[F0 != 0]  # F0 values of the voiced frames only
    Jitter = jitter_env(F0nz, len(F0nz))
    nF = int((len(data_audio) / size_frameS / overlap)) - 1
    Amp = []
    logE = []
    apq = []
    ppq = []
    DF0 = np.diff(F0nz, 1)   # first derivative of the F0 contour
    DDF0 = np.diff(DF0, 1)   # second derivative of the F0 contour
    F0z = F0[F0 == 0]
    totaldurU = len(F0z)
    thresholdE = 10 * logEnergy([energy_thr_percent])
    degreeU = 100 * float(totaldurU) / len(F0)  # degree of unvoiced frames (%)
    lnz = 0
    for l in range(nF):
        data_frame = data_audio[int(l * size_stepS):int(l * size_stepS + size_frameS)]
        energy = 10 * logEnergy(data_frame)
        if F0[l] != 0:  # voiced frame
            Amp.append(np.max(np.abs(data_frame)))
            logE.append(energy)
            if lnz >= 12:
                # APQ over a sliding window of the last 12 voiced frames
                amp_arr = np.asarray([Amp[j] for j in range(lnz - 12, lnz)])
                apq.append(APQ(amp_arr))
            if lnz >= 6:
                # PPQ over a sliding window of the last 6 voiced frames
                f0arr = np.asarray([F0nz[j] for j in range(lnz - 6, lnz)])
                ppq.append(PPQ(1 / f0arr))
            lnz = lnz + 1
        print("frame " + str(l) + " from " + str(nF) + "-" * int(100 * l / nF) + ">" +
              str(int(100 * (l + 1) / nF)) + "%", end='\r', flush=True)
    Shimmer = shimmer_env(Amp, len(Amp))
    apq = np.asarray(apq)
    ppq = np.asarray(ppq)
    logE = np.asarray(logE)
    F0semi = np.asarray([Hz2semitones(F0nz[l]) for l in range(len(F0nz))])

    if flag_plots:
        plot_phon(data_audio, fs, F0, logE)

    print("Jitter=", len(Jitter))
    print("Shimmer", len(Shimmer))
    print("APQ", len(apq))
    print("PPQ", len(ppq))
    print("DF0", len(DF0))
    print("DDF0", len(DDF0))
    print("Energy", len(logE))
    print("degree unvoiced", degreeU)

    return F0, DF0, DDF0, F0semi, Jitter, Shimmer, apq, ppq, logE, degreeU
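# Minimal command-line driver: a sketch, not part of the original API. It
# assumes this module is executed directly with a .wav path as the first
# argument; the file name "phonation.py" in the usage string is a placeholder.
if __name__ == "__main__":
    import sys
    if len(sys.argv) < 2:
        print("usage: python phonation.py <file.wav> [plots(0|1)]")
        sys.exit(1)
    audio_file = sys.argv[1]
    flag_plots = len(sys.argv) > 2 and sys.argv[2] == "1"
    F0, DF0, DDF0, F0semi, Jitter, Shimmer, apq, ppq, logE, degreeU = phonationVowels(audio_file, flag_plots)
    print("mean F0 (Hz):", np.mean(F0[F0 != 0]))
    print("degree of unvoiced frames (%):", degreeU)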