def articulation_continuous(audio_filename, flag_plots, sizeframe=0.04, step=0.02, nB=22, nMFCC=12, minf0=60,
                            maxf0=350, voice_bias=-0.5, len_thr_miliseconds=270.0, pitch_method='praat'):
    """Compute frame-level articulation features from an audio file.

    The signal is mean-removed and peak-normalised, an F0 contour is
    estimated (Praat or RAPT), onset/offset transition segments are passed to
    ``extractTrans`` to obtain BBE and MFCC matrices, and first/second order
    differences of every MFCC column are computed. F1/F2 formant tracks are
    obtained through Praat, zero-padded to the length of the F0 contour, and
    zeroed inside long runs of unvoiced (F0 == 0) frames.

    :param audio_filename: path of the audio file (read with ``read`` and
        handed to the Praat helpers).
    :param flag_plots: when truthy, ``plot_art`` is called with the signal,
        F0, F1, F2 and the onset/offset segments.
    :param sizeframe: analysis frame length; multiplied by the sampling
        frequency to obtain samples, i.e. expressed in seconds.
    :param step: frame step, in the same unit as ``sizeframe``.
    :param nB: forwarded to ``extractTrans`` (presumably the number of Bark
        bands -- confirm against ``extractTrans``).
    :param nMFCC: forwarded to ``extractTrans`` (presumably the number of
        MFCCs -- confirm against ``extractTrans``).
    :param minf0: lower F0 bound forwarded to the pitch extractor.
    :param maxf0: upper F0 bound forwarded to the pitch extractor.
    :param voice_bias: forwarded to ``pysptk.sptk.rapt`` (``'rapt'`` only).
    :param len_thr_miliseconds: minimum length of an unvoiced run for the
        formants inside it to be zeroed (see the NOTE in the body about units).
    :param pitch_method: ``'praat'`` or ``'rapt'``.
    :returns: tuple ``(BBEon, MFCCon, DMFCCon, DDMFCCon, BBEoff, MFCCoff,
        DMFCCoff, DDMFCCoff, F1nz, DF1, DDF1, F2nz, DF2, DDF2)``.
    :raises ValueError: if ``pitch_method`` is not ``'praat'`` or ``'rapt'``.
    """
    # Fail fast: an unknown method would otherwise leave F0 unbound.
    if pitch_method not in ('praat', 'rapt'):
        raise ValueError(
            "pitch_method must be 'praat' or 'rapt', got {!r}".format(pitch_method))

    fs, data_audio = read(audio_filename)
    data_audio = data_audio - np.mean(data_audio)
    data_audio = data_audio / float(np.max(np.abs(data_audio)))
    size_frameS = sizeframe * float(fs)
    size_stepS = step * float(fs)

    # Tag used to build temp-file names from the audio file's base name.
    temp_uuid = 'artic' + os.path.splitext(os.path.basename(audio_filename))[0]

    if pitch_method == 'praat':
        temp_filename_vuv = '../tempfiles/tempVUV' + temp_uuid + '.txt'
        temp_filename_f0 = '../tempfiles/tempF0' + temp_uuid + '.txt'
        try:
            praat_functions.praat_vuv(audio_filename, temp_filename_f0, temp_filename_vuv,
                                      time_stepF0=step, minf0=minf0, maxf0=maxf0)
            F0, _ = praat_functions.decodeF0(temp_filename_f0, len(data_audio) / float(fs), step)
            _, segmentsOn, segmentsOff = praat_functions.read_textgrid_trans(
                temp_filename_vuv, data_audio, fs, sizeframe)
        finally:
            # Remove the temp files even when Praat or the decoding fails.
            for tmp_path in (temp_filename_vuv, temp_filename_f0):
                if os.path.exists(tmp_path):
                    os.remove(tmp_path)
    else:  # 'rapt'
        data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
        F0 = pysptk.sptk.rapt(data_audiof, fs, int(size_stepS), min=minf0, max=maxf0,
                              voice_bias=voice_bias, otype='f0')
        segmentsOn = V_UV(F0, data_audio, fs, 'onset')
        segmentsOff = V_UV(F0, data_audio, fs, 'offset')

    BBEon, MFCCon = extractTrans(segmentsOn, fs, size_frameS, size_stepS, nB, nMFCC)
    BBEoff, MFCCoff = extractTrans(segmentsOff, fs, size_frameS, size_stepS, nB, nMFCC)

    # First and second order differences, taken down each MFCC column.
    DMFCCon = np.asarray(
        [np.diff(MFCCon[:, nf], n=1) for nf in range(MFCCon.shape[1])]).T
    DDMFCCon = np.asarray(
        [np.diff(MFCCon[:, nf], n=2) for nf in range(MFCCon.shape[1])]).T
    DMFCCoff = np.asarray(
        [np.diff(MFCCoff[:, nf], n=1) for nf in range(MFCCoff.shape[1])]).T
    DDMFCCoff = np.asarray(
        [np.diff(MFCCoff[:, nf], n=2) for nf in range(MFCCoff.shape[1])]).T

    # TODO: Make parameters configurable. (If worth it)
    temp_filename = '../tempfiles/tempFormants' + temp_uuid + '.txt'
    try:
        praat_functions.praat_formants(audio_filename, temp_filename, sizeframe, step)
        F1, F2 = praat_functions.decodeFormants(temp_filename)
    finally:
        if os.path.exists(temp_filename):
            os.remove(temp_filename)

    # Zero-pad the shorter contour so F0 and the formants can be indexed together.
    if len(F0) < len(F1):
        F0 = np.hstack((F0, np.zeros(len(F1) - len(F0))))
    else:
        F1 = np.hstack((F1, np.zeros(len(F0) - len(F1))))
        F2 = np.hstack((F2, np.zeros(len(F0) - len(F2))))

    # Group the indices of unvoiced frames (F0 == 0) into contiguous runs.
    pos0 = np.where(F0 == 0)[0]
    dpos0 = np.hstack(([1], np.diff(pos0)))
    f0u = np.split(pos0, np.where(dpos0 > 1)[0])

    # NOTE(review): a millisecond value is divided by a step expressed in
    # seconds, so the defaults give 270.0 / 0.02 = 13500 frames. A /1000
    # conversion looks to be missing; left unchanged so feature values stay
    # identical to previous runs -- confirm the intended unit.
    thr_sil = int(len_thr_miliseconds / step)

    for unvoiced_run in f0u:
        if len(unvoiced_run) >= thr_sil:
            F1[unvoiced_run] = 0
            F2[unvoiced_run] = 0

    F1nz = F1[F1 != 0]
    F2nz = F2[F2 != 0]
    DF1 = np.diff(F1, n=1)
    DF2 = np.diff(F2, n=1)
    DDF1 = np.diff(F1, n=2)
    DDF2 = np.diff(F2, n=2)

    if flag_plots:
        plot_art(data_audio, fs, F0, F1, F2, segmentsOn, segmentsOff)

    return (BBEon, MFCCon, DMFCCon, DDMFCCon, BBEoff, MFCCoff, DMFCCoff, DDMFCCoff,
            F1nz, DF1, DDF1, F2nz, DF2, DDF2)
def extract_features_file(self, audio, static=True, plots=False, fmt="npy", kaldi_file=""):
    """Extract the articulation features from an audio file

    :param audio: .wav audio file.
    :param static: whether to compute and return statistic functionals over the feature matrix, or return the feature matrix computed over frames
    :param plots: whether to plot the signal, F0, formants and onset/offset segments (calls ``self.plot_art``)
    :param fmt: format to return the features (npy, dataframe, torch, kaldi); "txt" is treated like "npy" and "csv" like "dataframe"
    :param kaldi_file: file to store kaldi features, only valid when fmt=="kaldi"
    :returns: features computed from the audio file (``None`` when fmt=="kaldi", where the features are written to ``kaldi_file`` instead).
    :raises ValueError: if ``fmt`` is not supported, if ``static`` is requested together with fmt=="kaldi", or if ``self.pitch_method`` is neither 'praat' nor 'rapt'.

    >>> articulation=Articulation()
    >>> file_audio="../audios/001_ddk1_PCGITA.wav"
    >>> features1=articulation.extract_features_file(file_audio, static=True, plots=True, fmt="npy")
    >>> features2=articulation.extract_features_file(file_audio, static=True, plots=True, fmt="dataframe")
    >>> features3=articulation.extract_features_file(file_audio, static=False, plots=True, fmt="torch")
    >>> articulation.extract_features_file(file_audio, static=False, plots=False, fmt="kaldi", kaldi_file="./test")

    >>> path_audio="../audios/"
    >>> features1=articulation.extract_features_path(path_audio, static=True, plots=False, fmt="npy")
    >>> features2=articulation.extract_features_path(path_audio, static=True, plots=False, fmt="csv")
    >>> features3=articulation.extract_features_path(path_audio, static=False, plots=True, fmt="torch")
    >>> articulation.extract_features_path(path_audio, static=False, plots=False, fmt="kaldi", kaldi_file="./test.ark")
    """
    # Validate the request before doing any expensive audio/Praat work.
    if fmt not in ("npy", "txt", "dataframe", "csv", "torch", "kaldi"):
        raise ValueError("{!r} is not a supported format".format(fmt))
    if fmt == "kaldi" and static:
        raise ValueError(
            "Kaldi is only supported for dynamic features")
    if self.pitch_method not in ('praat', 'rapt'):
        raise ValueError(
            "pitch_method must be 'praat' or 'rapt', got {!r}".format(self.pitch_method))

    fs, data_audio = read(audio)
    data_audio = data_audio - np.mean(data_audio)
    data_audio = data_audio / float(np.max(np.abs(data_audio)))
    size_frameS = self.sizeframe * float(fs)
    size_stepS = self.step * float(fs)

    # Directory for the intermediate Praat output; exist_ok avoids the
    # check-then-create race of a separate os.path.exists() test.
    temp_dir = self.PATH + '/../tempfiles/'
    os.makedirs(temp_dir, exist_ok=True)
    base_name = os.path.splitext(os.path.basename(audio))[0]

    if self.pitch_method == 'praat':
        temp_filename_vuv = temp_dir + 'tempVUV' + 'articulation' + base_name + '.txt'
        temp_filename_f0 = temp_dir + 'tempF0' + 'articulation' + base_name + '.txt'
        try:
            praat_functions.praat_vuv(audio, temp_filename_f0, temp_filename_vuv,
                                      time_stepF0=self.step, minf0=self.minf0, maxf0=self.maxf0)
            F0, _ = praat_functions.decodeF0(temp_filename_f0, len(data_audio) / float(fs), self.step)
            _, segmentsOn, segmentsOff = praat_functions.read_textgrid_trans(
                temp_filename_vuv, data_audio, fs, self.sizeframe)
        finally:
            # Remove the temp files even when Praat or the decoding fails.
            for tmp_path in (temp_filename_vuv, temp_filename_f0):
                if os.path.exists(tmp_path):
                    os.remove(tmp_path)
    else:  # 'rapt'
        data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
        F0 = pysptk.sptk.rapt(data_audiof, fs, int(size_stepS), min=self.minf0, max=self.maxf0,
                              voice_bias=self.voice_bias, otype='f0')
        segmentsOn = V_UV(F0, data_audio, fs, 'onset')
        segmentsOff = V_UV(F0, data_audio, fs, 'offset')

    BBEon, MFCCon = extractTrans(segmentsOn, fs, size_frameS, size_stepS, self.nB, self.nMFCC)
    BBEoff, MFCCoff = extractTrans(segmentsOff, fs, size_frameS, size_stepS, self.nB, self.nMFCC)

    # First and second order differences, taken down each MFCC column.
    DMFCCon = np.asarray(
        [np.diff(MFCCon[:, nf], n=1) for nf in range(MFCCon.shape[1])]).T
    DDMFCCon = np.asarray(
        [np.diff(MFCCon[:, nf], n=2) for nf in range(MFCCon.shape[1])]).T
    DMFCCoff = np.asarray(
        [np.diff(MFCCoff[:, nf], n=1) for nf in range(MFCCoff.shape[1])]).T
    DDMFCCoff = np.asarray(
        [np.diff(MFCCoff[:, nf], n=2) for nf in range(MFCCoff.shape[1])]).T

    temp_filename = temp_dir + 'tempFormants' + 'artic' + base_name + '.txt'
    try:
        praat_functions.praat_formants(audio, temp_filename, self.sizeframe, self.step)
        F1, F2 = praat_functions.decodeFormants(temp_filename)
    finally:
        if os.path.exists(temp_filename):
            os.remove(temp_filename)

    # NOTE(review): the branch layout below (formant processing, plotting and
    # empty-array checks all inside the ``else``) is reconstructed from the
    # pre-assignments in the ``if`` branch -- confirm against the original.
    if len(F0) < len(F1):
        # F0 contour shorter than the formant tracks: formant features are
        # reported as empty and nothing is plotted.
        F0 = np.hstack((F0, np.zeros(len(F1) - len(F0))))
        F1nz = np.zeros((0, 1))
        F2nz = np.zeros((0, 1))
        DF1 = np.zeros((0, 1))
        DDF1 = np.zeros((0, 1))
        DF2 = np.zeros((0, 1))
        DDF2 = np.zeros((0, 1))
    else:
        F1 = np.hstack((F1, np.zeros(len(F0) - len(F1))))
        F2 = np.hstack((F2, np.zeros(len(F0) - len(F2))))

        # Group the indices of unvoiced frames (F0 == 0) into contiguous runs.
        pos0 = np.where(F0 == 0)[0]
        dpos0 = np.hstack(([1], np.diff(pos0)))
        f0u = np.split(pos0, np.where(dpos0 > 1)[0])

        # NOTE(review): a millisecond value is divided by a step expressed in
        # seconds; a /1000 conversion looks to be missing. Left unchanged so
        # feature values stay identical to previous runs -- confirm the unit.
        thr_sil = int(self.len_thr_miliseconds / self.step)

        for unvoiced_run in f0u:
            if len(unvoiced_run) >= thr_sil:
                F1[unvoiced_run] = 0
                F2[unvoiced_run] = 0

        F1nz = F1[F1 != 0]
        F2nz = F2[F2 != 0]
        DF1 = np.diff(F1, n=1)
        DF2 = np.diff(F2, n=1)
        DDF1 = np.diff(F1, n=2)
        DDF2 = np.diff(F2, n=2)

        if plots:
            self.plot_art(data_audio, fs, F0, F1, F2, segmentsOn, segmentsOff)

        # Give empty results the same (0, 1) shape used in the branch above.
        if len(F1nz) == 0:
            F1nz = np.zeros((0, 1))
        if len(F2nz) == 0:
            F2nz = np.zeros((0, 1))
        if len(DF1) == 0:
            DF1 = np.zeros((0, 1))
        if len(DDF1) == 0:
            DDF1 = np.zeros((0, 1))
        if len(DF2) == 0:
            DF2 = np.zeros((0, 1))
        if len(DDF2) == 0:
            DDF2 = np.zeros((0, 1))

    feat_v = dynamic2statict_artic([
        BBEon, MFCCon, DMFCCon, DDMFCCon, BBEoff, MFCCoff, DMFCCoff,
        DDMFCCoff, F1nz, DF1, DDF1, F2nz, DF2, DDF2
    ])

    # Drop leading rows so every onset matrix has as many rows as DDMFCCon.
    feat_mat = np.hstack(
        (BBEon[2:, :], MFCCon[2:, :], DMFCCon[1:, :], DDMFCCon))

    if fmt in ("npy", "txt"):
        return feat_v if static else feat_mat

    if fmt in ("dataframe", "csv"):
        if static:
            head_st = [k + " " + h
                       for k in ["avg", "std", "skewness", "kurtosis"]
                       for h in self.head]
            return pd.DataFrame({k: [feat_v[e]] for e, k in enumerate(head_st)})
        return pd.DataFrame({k: feat_mat[:, e] for e, k in enumerate(self.head_dyn)})

    if fmt == "torch":
        return torch.from_numpy(feat_v if static else feat_mat)

    # fmt == "kaldi" (static was rejected above): write the matrix keyed by file name.
    name_all = audio.split('/')
    dictX = {name_all[-1]: feat_mat}
    save_dict_kaldimat(dictX, kaldi_file)