def extract_features_file(self, audio, static=True, plots=False, fmt="npy", kaldi_file=""):
    """Extract the glottal features from an audio file.

    The signal is mean-removed and peak-normalised, then analysed in frames of
    ``self.size_frame`` seconds taken every ``self.size_step`` seconds. Frames
    with fewer than 10 non-zero F0 estimates are skipped. Every remaining frame
    produces one row with nine values, in this column order: std of the
    differences between consecutive GCI values (divided by the sampling
    rate), followed by the mean and the std of NAQ, QOQ, H1H2 and HRF as
    returned by ``get_vq_params``.

    :param audio: .wav audio file.
    :param static: whether to compute and return statistic functionals over the feature matrix, or return the feature matrix computed over frames
    :param plots: whether to call ``self.plot_glottal`` for every analysed frame
    :param fmt: format to return the features (npy/txt, dataframe/csv, torch, kaldi)
    :param kaldi_file: file to store kaldi features, only valid when fmt=="kaldi"
    :returns: features computed from the audio file; ``None`` when fmt=="kaldi" (the features are written to ``kaldi_file`` instead).
    :raises ValueError: if ``audio`` does not contain ".wav"/".WAV", if ``fmt`` is not one of the supported formats, if fmt=="kaldi" is combined with static=True, or if the audio is shorter than one analysis step.

    >>> glottal=Glottal()
    >>> file_audio="../audios/001_a1_PCGITA.wav"
    >>> features1=glottal.extract_features_file(file_audio, static=True, plots=True, fmt="npy")
    >>> features2=glottal.extract_features_file(file_audio, static=True, plots=True, fmt="dataframe")
    >>> features3=glottal.extract_features_file(file_audio, static=False, plots=True, fmt="torch")
    >>> glottal.extract_features_file(file_audio, static=False, plots=False, fmt="kaldi", kaldi_file="./test.ark")
    """
    # Validate every argument up front so that a bad call fails before the
    # (expensive) frame-by-frame analysis instead of after it.
    if '.wav' not in audio and '.WAV' not in audio:
        raise ValueError(audio + " is not a valid wav file")
    if fmt not in ("npy", "txt", "dataframe", "csv", "torch", "kaldi"):
        raise ValueError(str(fmt) + " is not a supported format")
    if fmt == "kaldi" and static:
        raise ValueError("Kaldi is only supported for dynamic features")

    fs, data_audio = read(audio)
    data_audio = data_audio - np.mean(data_audio)
    peak = float(np.max(np.abs(data_audio)))
    if peak > 0:
        # An all-zero signal is left untouched rather than divided by zero,
        # which would turn every sample into NaN.
        data_audio = data_audio / peak

    # Frame and step lengths expressed in samples (kept as floats; the slice
    # bounds are truncated to int per frame below).
    size_frameS = self.size_frame * float(fs)
    size_stepS = self.size_step * float(fs)
    overlap = size_stepS / size_frameS
    nF = int((len(data_audio) / size_frameS / overlap)) - 1
    if nF < 0:
        raise ValueError(audio + " is too short to extract glottal features")

    # F0 contour estimated every 10 ms; each analysis frame therefore spans
    # sizef0 contour values and consecutive frames are stepf0 values apart.
    f0_hop = 0.01
    data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
    f0 = pysptk.sptk.rapt(data_audiof, fs, int(f0_hop * fs), min=20,
                          max=500, voice_bias=-0.2, otype='f0')
    sizef0 = int(self.size_frame / f0_hop)
    stepf0 = int(self.size_step / f0_hop)

    rows = []  # one 9-value row per analysed frame
    for l in range(nF):
        data_frame = data_audio[int(l * size_stepS):int(l * size_stepS + size_frameS)]
        startf0 = l * stepf0
        f0_frame = f0[startf0:startf0 + sizef0]
        if np.count_nonzero(f0_frame != 0) < 10:
            # Too few non-zero F0 estimates: the frame contributes no row.
            continue

        GCI = SE_VQ_varF0(data_frame, fs, f0=f0_frame)
        g_iaif = IAIF(data_frame, fs, GCI)
        g_iaif = g_iaif - np.mean(g_iaif)
        g_iaif = g_iaif / max(abs(g_iaif))

        glottal = cumtrapz(g_iaif)
        glottal = glottal - np.mean(glottal)
        glottal = glottal / max(abs(glottal))

        GCId = np.diff(GCI)
        avgGCI = np.mean(GCId / fs)  # only used for the plot, not a feature
        varGCI = np.std(GCId / fs)

        NAQ, QOQ, _T1, _T2, H1H2, HRF = get_vq_params(glottal, g_iaif, fs, GCI)
        rows.append([
            varGCI,
            np.mean(NAQ), np.std(NAQ),
            np.mean(QOQ), np.std(QOQ),
            np.mean(H1H2), np.std(H1H2),
            np.mean(HRF), np.std(HRF),
        ])

        if plots:
            self.plot_glottal(data_frame, fs, GCI, g_iaif, glottal, avgGCI, varGCI)

    # Explicit shape so that "no analysed frame" still yields a (0, 9) matrix.
    feat = np.asarray(rows, dtype=np.float64).reshape(len(rows), 9)

    if fmt in ("npy", "txt"):
        return dynamic2static(feat) if static else feat

    if fmt in ("dataframe", "csv"):
        if static:
            feat_st = dynamic2static(feat)
            head_st = [
                k + " " + h
                for k in ("global avg", "global std", "global skewness", "global kurtosis")
                for h in self.head
            ]
            return pd.DataFrame({k: [feat_st[e]] for e, k in enumerate(head_st)})
        return pd.DataFrame({k: feat[:, e] for e, k in enumerate(self.head)})

    if fmt == "torch":
        return torch.from_numpy(dynamic2static(feat) if static else feat)

    # fmt == "kaldi" with static=False (every other case was handled above):
    # store the matrix under the file name, i.e. the text after the last '/'.
    name_all = audio.split('/')
    save_dict_kaldimat({name_all[-1]: feat}, kaldi_file)
    return None
def glottal_features(audio, flag_plots, size_frame=0.2, size_step=0.1):
    """Compute frame-level glottal descriptors for one audio file.

    The signal returned by ``read(audio)`` is mean-removed and
    peak-normalised, then analysed in frames of ``size_frame`` seconds taken
    every ``size_step`` seconds. Frames with fewer than 10 non-zero F0
    estimates are skipped and do not appear in the output. A progress line is
    printed (terminated with a carriage return) after every frame.

    :param audio: audio file handed to ``read``.
    :param flag_plots: whether to call ``plot_glottal`` for every analysed frame.
    :param size_frame: frame length; multiplied by the sampling rate to get samples.
    :param size_step: step between frames; multiplied by the sampling rate to get samples.
    :returns: tuple of nine 1-D arrays with one value per analysed frame, in
        this order: std of the differences between consecutive GCI values
        (divided by the sampling rate), then mean and std of NAQ, QOQ, H1H2
        and HRF as returned by ``get_vq_params``.
    :raises ValueError: if the audio is shorter than one analysis step.
    """
    fs, data_audio = read(audio)
    data_audio = data_audio - np.mean(data_audio)
    peak = float(np.max(np.abs(data_audio)))
    if peak > 0:
        # An all-zero signal is left untouched rather than divided by zero,
        # which would turn every sample into NaN.
        data_audio = data_audio / peak

    # Frame and step lengths expressed in samples (kept as floats; the slice
    # bounds are truncated to int per frame below).
    size_frameS = size_frame * float(fs)
    size_stepS = size_step * float(fs)
    overlap = size_stepS / size_frameS
    nF = int((len(data_audio) / size_frameS / overlap)) - 1
    if nF < 0:
        raise ValueError(str(audio) + " is too short to extract glottal features")

    # F0 contour estimated every 10 ms; each analysis frame therefore spans
    # sizef0 contour values and consecutive frames are stepf0 values apart.
    f0_hop = 0.01
    data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
    f0 = pysptk.sptk.rapt(data_audiof, fs, int(f0_hop * fs), min=20,
                          max=500, voice_bias=-0.2, otype='f0')
    sizef0 = int(size_frame / f0_hop)
    stepf0 = int(size_step / f0_hop)

    rows = []  # one 9-value row per analysed frame
    for l in range(nF):
        data_frame = data_audio[int(l * size_stepS):int(l * size_stepS + size_frameS)]
        startf0 = l * stepf0
        f0_frame = f0[startf0:startf0 + sizef0]

        # Same progress text whether the frame is analysed or skipped.
        progress = ("frame " + str(l) + " from " + str(nF)
                    + "-" * int(100 * l / nF) + ">"
                    + str(int(100 * (l + 1) / nF)) + "%")

        if np.count_nonzero(f0_frame != 0) < 10:
            # Too few non-zero F0 estimates: the frame contributes no row.
            print(progress, end='\r')
            continue

        GCI = SE_VQ_varF0(data_frame, fs, f0=f0_frame)
        g_iaif = IAIF(data_frame, fs, GCI)
        g_iaif = g_iaif - np.mean(g_iaif)
        g_iaif = g_iaif / max(abs(g_iaif))

        glottal = cumtrapz(g_iaif)
        glottal = glottal - np.mean(glottal)
        glottal = glottal / max(abs(glottal))

        GCId = np.diff(GCI)
        avgGCI = np.mean(GCId / fs)  # only used for the plot, not returned
        varGCI = np.std(GCId / fs)

        NAQ, QOQ, _T1, _T2, H1H2, HRF = get_vq_params(glottal, g_iaif, fs, GCI)
        rows.append([
            varGCI,
            np.mean(NAQ), np.std(NAQ),
            np.mean(QOQ), np.std(QOQ),
            np.mean(H1H2), np.std(H1H2),
            np.mean(HRF), np.std(HRF),
        ])

        print(progress, end='\r')
        if flag_plots:
            plot_glottal(data_frame, fs, GCI, g_iaif, glottal, avgGCI, varGCI)

    # Explicit shape so that "no analysed frame" still yields nine empty arrays.
    feat = np.asarray(rows, dtype=np.float64).reshape(len(rows), 9)
    return tuple(np.ascontiguousarray(feat[:, k]) for k in range(9))