import os

import numpy as np
import pysptk
from matplotlib import pyplot as plt
from scipy.io.wavfile import read

# The remaining helpers (praat_functions, V_UV, logEnergy, F0feat, energy_feat,
# duration_feat, Hz2semitones, plot_pros, path_app) are assumed to be importable
# from the surrounding project modules.


def prosody_static(self, audio, plots):
    """Extract the static prosody features from an audio file

    :param audio: .wav audio file.
    :param plots: boolean; if True, plot the waveform, F0 contour, and
        voiced/unvoiced segmentation.
    :returns: array with the 103 prosody features

    >>> prosody=Prosody()
    >>> file_audio="../audios/001_ddk1_PCGITA.wav"
    >>> features=prosody.prosody_static(file_audio, plots=True)
    """
    fs, data_audio = read(audio)
    # Remove the DC offset and normalize the amplitude to [-1, 1]
    data_audio = data_audio - np.mean(data_audio)
    data_audio = data_audio / float(np.max(np.abs(data_audio)))
    size_frameS = self.size_frame * float(fs)
    size_stepS = self.step * float(fs)
    thr_len_pause = self.thr_len * float(fs)

    if self.pitch_method == 'praat':
        # Estimate F0 and the voiced/unvoiced decision with Praat,
        # exchanging the results through temporary text files
        name_audio = audio.split('/')
        temp_uuid = 'prosody' + name_audio[-1][0:-4]
        if not os.path.exists(self.PATH + '/../tempfiles/'):
            os.makedirs(self.PATH + '/../tempfiles/')
        temp_filename_f0 = self.PATH + '/../tempfiles/tempF0' + temp_uuid + '.txt'
        temp_filename_vuv = self.PATH + '/../tempfiles/tempVUV' + temp_uuid + '.txt'
        praat_functions.praat_vuv(audio, temp_filename_f0, temp_filename_vuv,
                                  time_stepF0=self.step, minf0=self.minf0,
                                  maxf0=self.maxf0)
        F0, _ = praat_functions.decodeF0(temp_filename_f0,
                                         len(data_audio) / float(fs), self.step)
        os.remove(temp_filename_f0)
        os.remove(temp_filename_vuv)
    elif self.pitch_method == 'rapt':
        # RAPT expects samples scaled to the 16-bit integer range
        data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
        F0 = pysptk.sptk.rapt(data_audiof, fs, int(size_stepS),
                              min=self.minf0, max=self.maxf0,
                              voice_bias=self.voice_bias, otype='f0')

    segmentsV = V_UV(F0, data_audio, fs, type_seg="Voiced", size_stepS=size_stepS)
    segmentsUP = V_UV(F0, data_audio, fs, type_seg="Unvoiced", size_stepS=size_stepS)

    # Split the unvoiced segments: those longer than thr_len_pause are pauses,
    # the rest are unvoiced speech
    segmentsP = []
    segmentsU = []
    for k in range(len(segmentsUP)):
        if len(segmentsUP[k]) > thr_len_pause:
            segmentsP.append(segmentsUP[k])
        else:
            segmentsU.append(segmentsUP[k])

    F0_features = F0feat(F0)
    energy_featuresV = energy_feat(segmentsV, fs, size_frameS, size_stepS)
    energy_featuresU = energy_feat(segmentsU, fs, size_frameS, size_stepS)
    duration_features = duration_feat(segmentsV, segmentsU, segmentsP,
                                      data_audio, fs)

    if plots:
        self.plot_pros(data_audio, fs, F0, segmentsV, segmentsU, F0_features)

    features = np.hstack((F0_features, energy_featuresV,
                          energy_featuresU, duration_features))
    return features
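# A minimal usage sketch for the class-based variant above, assuming a Prosody
# class that exposes prosody_static; the audio path follows the docstring
# example, and the CSV output file name is hypothetical.
prosody = Prosody()
features = prosody.prosody_static("../audios/001_ddk1_PCGITA.wav", plots=False)
print(features.shape)  # expected: (103,)
np.savetxt("prosody_features.csv", features[np.newaxis, :], delimiter=",")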
def prosody_static(audio, flag_plots):
    """Compute static prosody statistics (F0, energy, and duration measures)."""
    fs, data_audio = read(audio)
    # Keep only the first channel when the file is stereo
    if data_audio.ndim > 1:
        data_audio = data_audio[:, 0]
    # Remove the DC offset and normalize the amplitude to [-1, 1]
    data_audio = data_audio - np.mean(data_audio)
    data_audio = data_audio / float(np.max(np.abs(data_audio)))
    size_frameS = 0.02 * float(fs)
    size_stepS = 0.01 * float(fs)
    thr_len_pause = 0.14 * float(fs)
    thr_en_pause = 0.2
    overlap = size_stepS / size_frameS
    nF = int((len(data_audio) / size_frameS / overlap)) - 1

    # F0 contour with RAPT; the signal is scaled to the 16-bit range first
    data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
    F0 = pysptk.sptk.rapt(data_audiof, fs, int(size_stepS),
                          min=60, max=350, voice_bias=-0.2, otype='f0')

    # Short-time log-energy contour
    logE = []
    for l in range(nF):
        data_frame = data_audio[int(l * size_stepS):int(l * size_stepS + size_frameS)]
        logE.append(logEnergy(data_frame))
    logE = np.asarray(logE)

    segmentsV = V_UV(F0, data_audio, fs, type_seg="Voiced", size_stepS=size_stepS)
    segmentsU = V_UV(F0, data_audio, fs, type_seg="Unvoiced", size_stepS=size_stepS)

    # Voiced-rate and voiced-duration statistics (durations in ms)
    Nvoiced = len(segmentsV)
    Nunvoiced = len(segmentsU)
    Vrate = fs * float(Nvoiced) / len(data_audio)
    avgdurv = 1000 * np.mean([len(segmentsV[k]) for k in range(Nvoiced)]) / float(fs)
    stddurv = 1000 * np.std([len(segmentsV[k]) for k in range(Nvoiced)]) / float(fs)

    # Unvoiced segments that are quiet enough or long enough count as silence
    silence = []
    for k in range(Nunvoiced):
        eu = logEnergy(segmentsU[k])
        if eu < thr_en_pause or len(segmentsU[k]) > thr_len_pause:
            silence.append(segmentsU[k])
    Silrate = fs * float(len(silence)) / len(data_audio)
    avgdurs = 1000 * np.mean([len(silence[k]) for k in range(len(silence))]) / float(fs)
    stddurs = 1000 * np.std([len(silence[k]) for k in range(len(silence))]) / float(fs)

    if flag_plots:
        plt.figure(1)
        # Waveform
        plt.subplot(311)
        t = np.arange(0, float(len(data_audio)) / fs, 1.0 / fs)
        if len(t) != len(data_audio):
            t = np.arange(1.0 / fs, float(len(data_audio)) / fs, 1.0 / fs)
        plt.plot(t, data_audio, 'k')
        plt.ylabel('Amplitude')
        plt.xlabel('Time (s)')
        plt.xlim([0, t[-1]])
        plt.grid(True)
        # F0 contour, aligned to the waveform's time axis
        plt.subplot(312)
        fsp = len(F0) / t[-1]
        t2 = np.arange(0.0, t[-1], 1.0 / fsp)
        if len(t2) > len(F0):
            t2 = t2[:len(F0)]
        elif len(F0) > len(t2):
            F0 = F0[:len(t2)]
        plt.plot(t2, F0, color='k', linewidth=2.0)
        plt.xlabel('Time (s)')
        plt.ylabel('F0 (Hz)')
        plt.ylim([0, np.max(F0) + 10])
        plt.xlim([0, t[-1]])
        plt.grid(True)
        # Log-energy contour
        plt.subplot(313)
        fse = len(logE) / t[-1]
        t3 = np.arange(0.0, t[-1], 1.0 / fse)
        if len(t3) > len(logE):
            t3 = t3[:len(logE)]
        elif len(logE) > len(t3):
            logE = logE[:len(t3)]
        plt.plot(t3, logE, color='k', linewidth=2.0)
        plt.xlabel('Time (s)')
        plt.ylabel('Energy (dB)')
        plt.xlim([0, t[-1]])
        plt.grid(True)
        plt.show()

    # Global F0 statistics over voiced frames only (F0 == 0 marks unvoiced)
    F0std = np.std(F0[F0 != 0])
    F0varsemi = Hz2semitones(F0std**2)

    return (F0, logE, np.mean(F0[F0 != 0]), np.std(F0[F0 != 0]), np.max(F0),
            np.mean(logE), np.std(logE), np.max(logE),
            Vrate, avgdurv, stddurv, Silrate, avgdurs, stddurs, F0varsemi)
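# A minimal usage sketch for the script-style variant above; "audio.wav" is a
# hypothetical path, and the 15 return values are unpacked in the order the
# function returns them.
(F0, logE, avgF0, stdF0, maxF0,
 avgE, stdE, maxE,
 Vrate, avgdurv, stddurv,
 Silrate, avgdurs, stddurs, F0varsemi) = prosody_static("audio.wav", flag_plots=False)
print("mean F0 (Hz):", avgF0)
print("voiced rate (segments/s):", Vrate)
print("F0 variance (semitones):", F0varsemi)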
def prosody_static(audio, flag_plots, pitch_method='praat'):
    """Compute the static prosody feature vector with a selectable pitch back-end."""
    fs, data_audio = read(audio)
    # Remove the DC offset and normalize the amplitude to [-1, 1]
    data_audio = data_audio - np.mean(data_audio)
    data_audio = data_audio / float(np.max(np.abs(data_audio)))
    size_frameS = 0.02 * float(fs)
    size_stepS = 0.01 * float(fs)
    thr_len_pause = 0.14 * float(fs)

    if pitch_method == 'praat':
        # Estimate F0 with Praat through temporary files; both temp files live
        # under path_app so they can be cleaned up afterwards
        temp_uuid = audio.split('/')[-1][0:-4]
        temp_filename_f0 = path_app + '/../tempfiles/tempF0' + temp_uuid + '.txt'
        temp_filename_vuv = path_app + '/../tempfiles/tempVUV' + temp_uuid + '.txt'
        praat_functions.praat_vuv(audio, temp_filename_f0, temp_filename_vuv,
                                  time_stepF0=0.01, minf0=60, maxf0=350)
        F0, _ = praat_functions.decodeF0(temp_filename_f0,
                                         len(data_audio) / float(fs), 0.01)
        os.remove(temp_filename_f0)
        os.remove(temp_filename_vuv)
    elif pitch_method == 'rapt':
        # RAPT expects samples scaled to the 16-bit integer range
        data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
        F0 = pysptk.sptk.rapt(data_audiof, fs, int(size_stepS),
                              min=60, max=350, voice_bias=-0.2, otype='f0')

    segmentsV = V_UV(F0, data_audio, fs, type_seg="Voiced", size_stepS=size_stepS)
    segmentsUP = V_UV(F0, data_audio, fs, type_seg="Unvoiced", size_stepS=size_stepS)

    # Split the unvoiced segments: those longer than thr_len_pause are pauses,
    # the rest are unvoiced speech
    segmentsP = []
    segmentsU = []
    for k in range(len(segmentsUP)):
        if len(segmentsUP[k]) > thr_len_pause:
            segmentsP.append(segmentsUP[k])
        else:
            segmentsU.append(segmentsUP[k])

    F0_features = F0feat(F0)
    energy_featuresV = energy_feat(segmentsV, fs, size_frameS, size_stepS)
    energy_featuresU = energy_feat(segmentsU, fs, size_frameS, size_stepS)
    duration_features = duration_feat(segmentsV, segmentsU, segmentsP,
                                      data_audio, fs)

    if flag_plots:
        plot_pros(data_audio, fs, F0, segmentsV, segmentsU)

    features = np.hstack((F0_features, energy_featuresV,
                          energy_featuresU, duration_features))
    return features
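# A minimal sketch contrasting the two pitch back-ends of the variant above,
# assuming both Praat and pysptk are available; "audio.wav" is hypothetical.
feats_praat = prosody_static("audio.wav", flag_plots=False, pitch_method='praat')
feats_rapt = prosody_static("audio.wav", flag_plots=False, pitch_method='rapt')
# Both calls return the same feature layout; only the F0-dependent values differ.
print(feats_praat.shape, feats_rapt.shape)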