def prosody_static(self, audio, plots):
    """Extract the static prosody features from an audio file

    :param audio: .wav audio file.
    :param plots: boolean flag; when True, plot the extracted features
    :returns: array with the 103 prosody features

    >>> prosody=Prosody()
    >>> file_audio="../audios/001_ddk1_PCGITA.wav"
    >>> features=prosody.prosody_static(file_audio, plots=True)
    """
    fs, data_audio = read(audio)
    # Remove the DC offset and normalize the amplitude
    data_audio = data_audio - np.mean(data_audio)
    data_audio = data_audio / float(np.max(np.abs(data_audio)))
    size_frameS = self.size_frame * float(fs)
    size_stepS = self.step * float(fs)
    thr_len_pause = self.thr_len * float(fs)
    overlap = size_stepS / size_frameS
    nF = int((len(data_audio) / size_frameS / overlap)) - 1

    if self.pitch_method == 'praat':
        name_audio = audio.split('/')
        temp_uuid = 'prosody' + name_audio[-1][0:-4]
        if not os.path.exists(self.PATH + '/../tempfiles/'):
            os.makedirs(self.PATH + '/../tempfiles/')
        temp_filename_f0 = self.PATH + '/../tempfiles/tempF0' + temp_uuid + '.txt'
        temp_filename_vuv = self.PATH + '/../tempfiles/tempVUV' + temp_uuid + '.txt'
        praat_functions.praat_vuv(audio, temp_filename_f0, temp_filename_vuv,
                                  time_stepF0=self.step, minf0=self.minf0, maxf0=self.maxf0)
        F0, _ = praat_functions.decodeF0(temp_filename_f0, len(data_audio) / float(fs), self.step)
        os.remove(temp_filename_f0)
        os.remove(temp_filename_vuv)
    elif self.pitch_method == 'rapt':
        data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
        F0 = pysptk.sptk.rapt(data_audiof, fs, int(size_stepS), min=self.minf0,
                              max=self.maxf0, voice_bias=self.voice_bias, otype='f0')

    # Split the signal into voiced and unvoiced segments
    segmentsV = V_UV(F0, data_audio, fs, type_seg="Voiced", size_stepS=size_stepS)
    segmentsUP = V_UV(F0, data_audio, fs, type_seg="Unvoiced", size_stepS=size_stepS)

    # Unvoiced segments longer than the pause-length threshold are treated as pauses
    segmentsP = []
    segmentsU = []
    for k in range(len(segmentsUP)):
        if len(segmentsUP[k]) > thr_len_pause:
            segmentsP.append(segmentsUP[k])
        else:
            segmentsU.append(segmentsUP[k])

    F0_features = F0feat(F0)
    energy_featuresV = energy_feat(segmentsV, fs, size_frameS, size_stepS)
    energy_featuresU = energy_feat(segmentsU, fs, size_frameS, size_stepS)
    duration_features = duration_feat(segmentsV, segmentsU, segmentsP, data_audio, fs)

    if plots:
        self.plot_pros(data_audio, fs, F0, segmentsV, segmentsU, F0_features)

    features = np.hstack((F0_features, energy_featuresV, energy_featuresU, duration_features))
    return features
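# A minimal, self-contained sketch of the framing arithmetic used above, so
# the nF formula is easier to follow. The sampling rate, frame size, and step
# below are illustrative assumptions, not the class defaults.
def _framing_sketch():
    fs = 16000                              # assumed sampling rate (Hz)
    size_frame, step = 0.04, 0.01           # assumed frame and step, in seconds
    size_frameS = size_frame * fs           # 640 samples per frame
    size_stepS = step * fs                  # 160 samples per hop
    overlap = size_stepS / size_frameS      # 0.25
    n_samples = 2 * fs                      # a hypothetical 2-second signal
    # dividing by overlap converts the frame count into a hop count
    nF = int(n_samples / size_frameS / overlap) - 1
    return nF                               # 199 analysis frames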
def extract_features_file(self, audio, static=True, plots=False, fmt="npy", kaldi_file=""):
    """Extract the phonation features from an audio file

    :param audio: .wav audio file.
    :param static: whether to compute and return statistical functionals over the feature matrix, or return the feature matrix computed over frames
    :param plots: boolean flag; when True, plot the extracted features
    :param fmt: format to return the features (npy, dataframe, torch, kaldi)
    :param kaldi_file: file to store kaldi features, only valid when fmt=="kaldi"
    :returns: features computed from the audio file.

    >>> phonation=Phonation()
    >>> file_audio="../audios/001_a1_PCGITA.wav"
    >>> features1=phonation.extract_features_file(file_audio, static=True, plots=True, fmt="npy")
    >>> features2=phonation.extract_features_file(file_audio, static=True, plots=True, fmt="dataframe")
    >>> features3=phonation.extract_features_file(file_audio, static=False, plots=True, fmt="torch")
    >>> phonation.extract_features_file(file_audio, static=False, plots=False, fmt="kaldi", kaldi_file="./test")
    """
    fs, data_audio = read(audio)
    # Remove the DC offset and normalize the amplitude
    data_audio = data_audio - np.mean(data_audio)
    data_audio = data_audio / float(np.max(np.abs(data_audio)))
    size_frameS = self.size_frame * float(fs)
    size_stepS = self.size_step * float(fs)
    overlap = size_stepS / size_frameS

    if self.pitch_method == 'praat':
        name_audio = audio.split('/')
        temp_uuid = 'phon' + name_audio[-1][0:-4]
        if not os.path.exists(self.PATH + '/../tempfiles/'):
            os.makedirs(self.PATH + '/../tempfiles/')
        temp_filename_vuv = self.PATH + '/../tempfiles/tempVUV' + temp_uuid + '.txt'
        temp_filename_f0 = self.PATH + '/../tempfiles/tempF0' + temp_uuid + '.txt'
        praat_functions.praat_vuv(audio, temp_filename_f0, temp_filename_vuv,
                                  time_stepF0=self.size_step, minf0=self.minf0, maxf0=self.maxf0)
        F0, _ = praat_functions.decodeF0(temp_filename_f0, len(data_audio) / float(fs), self.size_step)
        os.remove(temp_filename_vuv)
        os.remove(temp_filename_f0)
    elif self.pitch_method == 'rapt':
        data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
        F0 = pysptk.sptk.rapt(data_audiof, fs, int(size_stepS), min=self.minf0,
                              max=self.maxf0, voice_bias=self.voice_bias, otype='f0')

    F0nz = F0[F0 != 0]  # F0 over voiced frames only
    Jitter = jitter_env(F0nz, len(F0nz))
    nF = int((len(data_audio) / size_frameS / overlap)) - 1
    Amp = []
    logE = []
    apq = []
    ppq = []
    DF0 = np.diff(F0nz, 1)
    DDF0 = np.diff(DF0, 1)
    F0z = F0[F0 == 0]
    totaldurU = len(F0z)
    thresholdE = 10 * logEnergy([self.energy_thr_percent])
    degreeU = 100 * float(totaldurU) / len(F0)  # percentage of unvoiced frames
    lnz = 0
    for l in range(nF):
        data_frame = data_audio[int(l * size_stepS):int(l * size_stepS + size_frameS)]
        energy = 10 * logEnergy(data_frame)
        if F0[l] != 0:
            Amp.append(np.max(np.abs(data_frame)))
            logE.append(energy)
            if lnz >= 12:
                # APQ is computed over a sliding window of the last 12 voiced frames
                amp_arr = np.asarray([Amp[j] for j in range(lnz - 12, lnz)])
                apq.append(APQ(amp_arr))
            if lnz >= 6:
                # PPQ is computed over a sliding window of the last 6 voiced frames
                f0arr = np.asarray([F0nz[j] for j in range(lnz - 6, lnz)])
                ppq.append(PPQ(1 / f0arr))
            lnz = lnz + 1

    Shimmer = shimmer_env(Amp, len(Amp))
    apq = np.asarray(apq)
    ppq = np.asarray(ppq)
    logE = np.asarray(logE)

    if len(apq) == 0:
        print("warning: there are not enough long voiced segments to compute the APQ; setting APQ=shimmer")
        apq = Shimmer

    if plots:
        self.plot_phon(data_audio, fs, F0, logE)

    # Trim the descriptors so that all columns span the same number of frames
    if len(Shimmer) == len(apq):
        feat_mat = np.vstack((DF0[5:], DDF0[4:], Jitter[6:], Shimmer[6:], apq[6:], ppq, logE[6:])).T
    else:
        feat_mat = np.vstack((DF0[11:], DDF0[10:], Jitter[12:], Shimmer[12:], apq, ppq[6:], logE[12:])).T

    feat_v = dynamic2statict([DF0, DDF0, Jitter, Shimmer, apq, ppq, logE])

    if fmt in ("npy", "txt"):
        if static:
            return feat_v
        return feat_mat
    elif fmt in ("dataframe", "csv"):
        if static:
            head_st = []
            df = {}
            for k in ["avg", "std", "skewness", "kurtosis"]:
                for h in self.head:
                    head_st.append(k + " " + h)
            for e, k in enumerate(head_st):
                df[k] = [feat_v[e]]
            return pd.DataFrame(df)
        df = {}
        for e, k in enumerate(self.head):
            df[k] = feat_mat[:, e]
        return pd.DataFrame(df)
    elif fmt == "torch":
        if static:
            return torch.from_numpy(feat_v)
        return torch.from_numpy(feat_mat)
    elif fmt == "kaldi":
        if static:
            raise ValueError("Kaldi is only supported for dynamic features")
        name_all = audio.split('/')
        dictX = {name_all[-1]: feat_mat}
        save_dict_kaldimat(dictX, kaldi_file)
    else:
        raise ValueError(fmt + " is not supported")
def prosody_dynamic(self, audio):
    """Extract the dynamic prosody features from an audio file

    :param audio: .wav audio file.
    :returns: array (N, 13) with the prosody features extracted from an audio file. N = number of voiced segments

    >>> prosody=Prosody()
    >>> file_audio="../audios/001_ddk1_PCGITA.wav"
    >>> features=prosody.prosody_dynamic(file_audio)
    """
    fs, data_audio = read(audio)
    # Remove the DC offset and normalize the amplitude
    data_audio = data_audio - np.mean(data_audio)
    data_audio = data_audio / float(np.max(np.abs(data_audio)))
    size_frameS = self.size_frame * float(fs)
    size_stepS = self.step * float(fs)
    overlap = size_stepS / size_frameS

    if self.pitch_method == 'praat':
        name_audio = audio.split('/')
        temp_uuid = 'prosody' + name_audio[-1][0:-4]
        if not os.path.exists(self.PATH + '/../tempfiles/'):
            os.makedirs(self.PATH + '/../tempfiles/')
        temp_filename_f0 = self.PATH + '/../tempfiles/tempF0' + temp_uuid + '.txt'
        temp_filename_vuv = self.PATH + '/../tempfiles/tempVUV' + temp_uuid + '.txt'
        praat_functions.praat_vuv(audio, temp_filename_f0, temp_filename_vuv,
                                  time_stepF0=self.step, minf0=self.minf0, maxf0=self.maxf0)
        F0, _ = praat_functions.decodeF0(temp_filename_f0, len(data_audio) / float(fs), self.step)
        os.remove(temp_filename_f0)
        os.remove(temp_filename_vuv)
    elif self.pitch_method == 'rapt':
        data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
        F0 = pysptk.sptk.rapt(data_audiof, fs, int(size_stepS), min=self.minf0,
                              max=self.maxf0, voice_bias=self.voice_bias, otype='f0')

    # Find the pitch contour of EACH voiced segment
    pitchON = np.where(F0 != 0)[0]
    dchange = np.diff(pitchON)
    change = np.where(dchange > 1)[0]
    iniV = pitchON[0]

    featvec = []
    iniVoiced = (pitchON[0] * size_stepS) + size_stepS  # segment start in samples, to compute energy
    seg_voiced = []
    f0v = []
    Ev = []
    for indx in change:
        finV = pitchON[indx] + 1
        finVoiced = (pitchON[indx] * size_stepS) + size_stepS  # segment end in samples, to compute energy
        VoicedSeg = data_audio[int(iniVoiced):int(finVoiced)]
        temp = F0[iniV:finV]
        tempvec = []
        if len(VoicedSeg) > int(size_frameS):  # take only segments longer than the frame size
            seg_voiced.append(VoicedSeg)
            # Duration of the voiced segment
            dur = len(VoicedSeg) / float(fs)
            # Pitch coefficients: degree-P polynomial fit of the F0 contour
            x = np.arange(0, len(temp))
            z = np.poly1d(np.polyfit(x, temp, self.P))
            f0v.append(temp)
            tempvec.extend(z.coeffs)
            # Energy coefficients: degree-P polynomial fit of the energy contour
            temp = E_cont(VoicedSeg, size_frameS, size_stepS, overlap)
            Ev.append(temp)
            x = np.arange(0, len(temp))
            z = np.poly1d(np.polyfit(x, temp, self.P))
            tempvec.extend(z.coeffs)
            tempvec.append(dur)
            featvec.append(tempvec)
        iniV = pitchON[indx + 1]
        iniVoiced = (pitchON[indx + 1] * size_stepS) + size_stepS

    # Add the last voiced segment, keeping the same feature order as in the
    # loop above (pitch coefficients, energy coefficients, duration) so all
    # rows of the returned array are aligned
    finV = pitchON[len(pitchON) - 1] + 1
    finVoiced = (pitchON[len(pitchON) - 1] * size_stepS) + size_stepS
    VoicedSeg = data_audio[int(iniVoiced):int(finVoiced)]
    temp = F0[iniV:finV]
    tempvec = []
    if len(VoicedSeg) > int(size_frameS):  # take only segments longer than the frame size
        dur = len(VoicedSeg) / float(fs)
        x = np.arange(0, len(temp))
        z = np.poly1d(np.polyfit(x, temp, self.P))
        tempvec.extend(z.coeffs)
        temp = E_cont(VoicedSeg, size_frameS, size_stepS, overlap)
        x = np.arange(0, len(temp))
        z = np.poly1d(np.polyfit(x, temp, self.P))
        tempvec.extend(z.coeffs)
        tempvec.append(dur)
        featvec.append(tempvec)

    return np.asarray(featvec)
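# A minimal sketch of the per-segment contour modeling used above: fit a
# degree-P polynomial to a contour and keep its P+1 coefficients as features.
# P=5 and the synthetic contour are assumptions for illustration (the method
# uses self.P); with P=5, each segment yields 6 pitch coefficients, 6 energy
# coefficients, and 1 duration, i.e. the 13 features per row.
import numpy as np

def _contour_fit_sketch(P=5):
    f0_contour = 120 + 10 * np.sin(np.linspace(0, np.pi, 40))  # synthetic voiced segment
    x = np.arange(len(f0_contour))
    z = np.poly1d(np.polyfit(x, f0_contour, P))
    return z.coeffs  # (P+1,) coefficients, highest degree first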
def extract_features_file(self, audio, static=True, plots=False, fmt="npy", kaldi_file=""):
    """Extract the articulation features from an audio file

    :param audio: .wav audio file.
    :param static: whether to compute and return statistical functionals over the feature matrix, or return the feature matrix computed over frames
    :param plots: boolean flag; when True, plot the extracted features
    :param fmt: format to return the features (npy, dataframe, torch, kaldi)
    :param kaldi_file: file to store kaldi features, only valid when fmt=="kaldi"
    :returns: features computed from the audio file.

    >>> articulation=Articulation()
    >>> file_audio="../audios/001_ddk1_PCGITA.wav"
    >>> features1=articulation.extract_features_file(file_audio, static=True, plots=True, fmt="npy")
    >>> features2=articulation.extract_features_file(file_audio, static=True, plots=True, fmt="dataframe")
    >>> features3=articulation.extract_features_file(file_audio, static=False, plots=True, fmt="torch")
    >>> articulation.extract_features_file(file_audio, static=False, plots=False, fmt="kaldi", kaldi_file="./test")

    >>> path_audio="../audios/"
    >>> features1=articulation.extract_features_path(path_audio, static=True, plots=False, fmt="npy")
    >>> features2=articulation.extract_features_path(path_audio, static=True, plots=False, fmt="csv")
    >>> features3=articulation.extract_features_path(path_audio, static=False, plots=True, fmt="torch")
    >>> articulation.extract_features_path(path_audio, static=False, plots=False, fmt="kaldi", kaldi_file="./test.ark")
    """
    fs, data_audio = read(audio)
    # Remove the DC offset and normalize the amplitude
    data_audio = data_audio - np.mean(data_audio)
    data_audio = data_audio / float(np.max(np.abs(data_audio)))
    size_frameS = self.sizeframe * float(fs)
    size_stepS = self.step * float(fs)

    if self.pitch_method == 'praat':
        name_audio = audio.split('/')
        temp_uuid = 'articulation' + name_audio[-1][0:-4]
        if not os.path.exists(self.PATH + '/../tempfiles/'):
            os.makedirs(self.PATH + '/../tempfiles/')
        temp_filename_vuv = self.PATH + '/../tempfiles/tempVUV' + temp_uuid + '.txt'
        temp_filename_f0 = self.PATH + '/../tempfiles/tempF0' + temp_uuid + '.txt'
        praat_functions.praat_vuv(audio, temp_filename_f0, temp_filename_vuv,
                                  time_stepF0=self.step, minf0=self.minf0, maxf0=self.maxf0)
        F0, _ = praat_functions.decodeF0(temp_filename_f0, len(data_audio) / float(fs), self.step)
        segmentsFull, segmentsOn, segmentsOff = praat_functions.read_textgrid_trans(
            temp_filename_vuv, data_audio, fs, self.sizeframe)
        os.remove(temp_filename_vuv)
        os.remove(temp_filename_f0)
    elif self.pitch_method == 'rapt':
        data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
        F0 = pysptk.sptk.rapt(data_audiof, fs, int(size_stepS), min=self.minf0,
                              max=self.maxf0, voice_bias=self.voice_bias, otype='f0')
        segmentsOn = V_UV(F0, data_audio, fs, 'onset')
        segmentsOff = V_UV(F0, data_audio, fs, 'offset')

    # Bark-band energies and MFCCs over the onset and offset transitions
    BBEon, MFCCon = extractTrans(segmentsOn, fs, size_frameS, size_stepS, self.nB, self.nMFCC)
    BBEoff, MFCCoff = extractTrans(segmentsOff, fs, size_frameS, size_stepS, self.nB, self.nMFCC)

    # First and second derivatives of the MFCCs, computed column-wise
    DMFCCon = np.asarray([np.diff(MFCCon[:, nf], n=1) for nf in range(MFCCon.shape[1])]).T
    DDMFCCon = np.asarray([np.diff(MFCCon[:, nf], n=2) for nf in range(MFCCon.shape[1])]).T
    DMFCCoff = np.asarray([np.diff(MFCCoff[:, nf], n=1) for nf in range(MFCCoff.shape[1])]).T
    DDMFCCoff = np.asarray([np.diff(MFCCoff[:, nf], n=2) for nf in range(MFCCoff.shape[1])]).T

    # Formants F1 and F2, computed with Praat
    name_audio = audio.split('/')
    temp_uuid = 'artic' + name_audio[-1][0:-4]
    if not os.path.exists(self.PATH + '/../tempfiles/'):
        os.makedirs(self.PATH + '/../tempfiles/')
    temp_filename = self.PATH + '/../tempfiles/tempFormants' + temp_uuid + '.txt'
    praat_functions.praat_formants(audio, temp_filename, self.sizeframe, self.step)
    [F1, F2] = praat_functions.decodeFormants(temp_filename)
    os.remove(temp_filename)

    if len(F0) < len(F1):
        F0 = np.hstack((F0, np.zeros(len(F1) - len(F0))))
        F1nz = np.zeros((0, 1))
        F2nz = np.zeros((0, 1))
        DF1 = np.zeros((0, 1))
        DDF1 = np.zeros((0, 1))
        DF2 = np.zeros((0, 1))
        DDF2 = np.zeros((0, 1))
    else:
        F1 = np.hstack((F1, np.zeros(len(F0) - len(F1))))
        F2 = np.hstack((F2, np.zeros(len(F0) - len(F2))))

        # Zero the formant tracks over silences longer than the threshold
        pos0 = np.where(F0 == 0)[0]
        dpos0 = np.hstack(([1], np.diff(pos0)))
        f0u = np.split(pos0, np.where(dpos0 > 1)[0])
        thr_sil = int(self.len_thr_miliseconds / self.step)

        sil_seg = []
        for l in range(len(f0u)):
            if len(f0u[l]) >= thr_sil:
                F1[f0u[l]] = 0
                F2[f0u[l]] = 0
                sil_seg.append(f0u[l])
        sil_seg = np.hstack(sil_seg)

        F1nz = F1[F1 != 0]
        F2nz = F2[F2 != 0]
        DF1 = np.diff(F1, n=1)
        DF2 = np.diff(F2, n=1)
        DDF1 = np.diff(F1, n=2)
        DDF2 = np.diff(F2, n=2)

    if plots:
        self.plot_art(data_audio, fs, F0, F1, F2, segmentsOn, segmentsOff)

    # Guard against empty formant tracks so the stacking below is well defined
    if len(F1nz) == 0:
        F1nz = np.zeros((0, 1))
    if len(F2nz) == 0:
        F2nz = np.zeros((0, 1))
    if len(DF1) == 0:
        DF1 = np.zeros((0, 1))
    if len(DDF1) == 0:
        DDF1 = np.zeros((0, 1))
    if len(DF2) == 0:
        DF2 = np.zeros((0, 1))
    if len(DDF2) == 0:
        DDF2 = np.zeros((0, 1))

    feat_v = dynamic2statict_artic([BBEon, MFCCon, DMFCCon, DDMFCCon,
                                    BBEoff, MFCCoff, DMFCCoff, DDMFCCoff,
                                    F1nz, DF1, DDF1, F2nz, DF2, DDF2])
    feat_mat = np.hstack((BBEon[2:, :], MFCCon[2:, :], DMFCCon[1:, :], DDMFCCon))

    if fmt in ("npy", "txt"):
        if static:
            return feat_v
        return feat_mat
    if fmt in ("dataframe", "csv"):
        if static:
            head_st = []
            df = {}
            for k in ["avg", "std", "skewness", "kurtosis"]:
                for h in self.head:
                    head_st.append(k + " " + h)
            for e, k in enumerate(head_st):
                df[k] = [feat_v[e]]
            return pd.DataFrame(df)
        df = {}
        for e, k in enumerate(self.head_dyn):
            df[k] = feat_mat[:, e]
        return pd.DataFrame(df)
    if fmt == "torch":
        if static:
            return torch.from_numpy(feat_v)
        return torch.from_numpy(feat_mat)
    if fmt == "kaldi":
        if static:
            raise ValueError("Kaldi is only supported for dynamic features")
        name_all = audio.split('/')
        dictX = {name_all[-1]: feat_mat}
        save_dict_kaldimat(dictX, kaldi_file)
    else:
        raise ValueError(fmt + " is not supported")
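# A brief sketch showing that the per-column np.diff loops above are
# equivalent to a single vectorized call with axis=0; the matrix shape is a
# synthetic assumption (frames x cepstral coefficients).
import numpy as np

def _delta_sketch():
    mfcc = np.random.randn(100, 12)     # hypothetical (frames, nMFCC) matrix
    d1 = np.diff(mfcc, n=1, axis=0)     # first derivative, shape (99, 12)
    d2 = np.diff(mfcc, n=2, axis=0)     # second derivative, shape (98, 12)
    return d1, d2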