def prosody_static(self, audio, plots):
    """Extract the static prosody features from an audio file.

    :param audio: .wav audio file.
    :param plots: if True, plot the audio signal, the F0 contour, and the voiced/unvoiced segments.
    :returns: array with the 103 static prosody features.

    >>> prosody=Prosody()
    >>> file_audio="../audios/001_ddk1_PCGITA.wav"
    >>> features=prosody.prosody_static(file_audio, plots=True)
    """
    fs, data_audio = read(audio)
    # remove the DC offset and normalize the amplitude
    data_audio = data_audio - np.mean(data_audio)
    data_audio = data_audio / float(np.max(np.abs(data_audio)))
    size_frameS = self.size_frame * float(fs)
    size_stepS = self.step * float(fs)
    thr_len_pause = self.thr_len * float(fs)

    if self.pitch_method == 'praat':
        name_audio = audio.split('/')
        temp_uuid = 'prosody' + name_audio[-1][0:-4]
        if not os.path.exists(self.PATH + '/../tempfiles/'):
            os.makedirs(self.PATH + '/../tempfiles/')
        temp_filename_f0 = self.PATH + '/../tempfiles/tempF0' + temp_uuid + '.txt'
        temp_filename_vuv = self.PATH + '/../tempfiles/tempVUV' + temp_uuid + '.txt'
        praat_functions.praat_vuv(audio, temp_filename_f0, temp_filename_vuv,
                                  time_stepF0=self.step, minf0=self.minf0, maxf0=self.maxf0)
        F0, _ = praat_functions.decodeF0(temp_filename_f0, len(data_audio) / float(fs), self.step)
        os.remove(temp_filename_f0)
        os.remove(temp_filename_vuv)
    elif self.pitch_method == 'rapt':
        data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
        F0 = pysptk.sptk.rapt(data_audiof, fs, int(size_stepS), min=self.minf0,
                              max=self.maxf0, voice_bias=self.voice_bias, otype='f0')

    segmentsV = V_UV(F0, data_audio, fs, type_seg="Voiced", size_stepS=size_stepS)
    segmentsUP = V_UV(F0, data_audio, fs, type_seg="Unvoiced", size_stepS=size_stepS)
    # split the unvoiced segments into pauses (longer than thr_len_pause) and short unvoiced segments
    segmentsP = []
    segmentsU = []
    for k in range(len(segmentsUP)):
        if len(segmentsUP[k]) > thr_len_pause:
            segmentsP.append(segmentsUP[k])
        else:
            segmentsU.append(segmentsUP[k])

    F0_features = F0feat(F0)
    energy_featuresV = energy_feat(segmentsV, fs, size_frameS, size_stepS)
    energy_featuresU = energy_feat(segmentsU, fs, size_frameS, size_stepS)
    duration_features = duration_feat(segmentsV, segmentsU, segmentsP, data_audio, fs)

    if plots:
        self.plot_pros(data_audio, fs, F0, segmentsV, segmentsU, F0_features)

    features = np.hstack((F0_features, energy_featuresV, energy_featuresU, duration_features))
    return features
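# Minimal usage sketch for the Prosody.prosody_static method above, following
# its docstring example (the audio path comes from the docstring and may need
# to be adapted; Prosody is assumed to be constructible with its defaults).
def _demo_prosody_static_method():
    prosody = Prosody()
    file_audio = "../audios/001_ddk1_PCGITA.wav"
    features = prosody.prosody_static(file_audio, plots=False)
    print(features.shape)  # the docstring documents 103 static features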
def intonation_duration(audio, size_step=0.01, minf0=60, maxf0=350, stol=0.150, flag_plots=False):
    """Compute intonation and duration measures from an audio file.

    :param audio: .wav audio file.
    :param size_step: time step (s) used for the F0 extraction.
    :param minf0: minimum F0 (Hz) accepted as voiced.
    :param maxf0: maximum F0 (Hz) accepted as voiced.
    :param stol: silence tolerance (s); unvoiced segments longer than this count as silences.
    :param flag_plots: if True, plot the F0 contour and its linear regression.
    :returns: tuple with the intonation/duration measures, the reconstructed and
        original F0 contours, and the voiced/unvoiced segment energies.
    """
    fs, data_audio = read(audio)
    # remove the DC offset and normalize the amplitude
    data_audio = data_audio - np.mean(data_audio)
    data_audio = data_audio / float(np.max(np.abs(data_audio)))
    temp_filename_f0 = path_app + '/../tempfiles/pitchtemp.txt'
    temp_filename_vuv = path_app + '/../tempfiles/voicetemp.txt'
    praat_functions.praat_vuv(audio, temp_filename_f0, temp_filename_vuv,
                              time_stepF0=size_step, minf0=minf0, maxf0=maxf0,
                              path_praat_script=path_app + "/../praat")
    pitch_z, ttotal = praat_functions.decodeF0(temp_filename_f0, len(data_audio) / fs, size_step)
    # slopes of the voiced F0 segments
    slopes = []
    # buffers for voiced and unvoiced segments
    vbuffer = []
    ubuffer = []
    # energies of the voiced and unvoiced segments
    venergy = []
    uenergy = []
    # arrays for storing durations
    voicedtimes = []
    unvoicedtimes = []
    silencetimes = []
    # flags for the starting points of voiced and unvoiced segments
    startvoicedflag = True
    startUNvoicedflag = True
    # flag to compare each voiced segment with the previous one
    recordneighbor = True
    energydifflocalneighbors = []
    F0_rec = np.zeros(len(pitch_z))
    for i in range(0, len(pitch_z) - 1):
        # condition for a voiced frame
        if pitch_z[i] >= minf0 and pitch_z[i] <= maxf0:
            vbuffer.append(pitch_z[i])
            # voiced segment starting time
            if startvoicedflag:
                t_start_venergy = ttotal[i]
                startvoicedflag = False
                frameF0start = i
            if len(ubuffer) != 0:
                # unvoiced duration from the F0 time step and the buffered frames
                samples = len(ubuffer)
                t = float(samples * size_step)
                # silence condition
                if t > stol:
                    silencetimes.append(t)
                else:
                    unvoicedtimes.append(t)
                ubuffer = []
                # final time of the unvoiced segment
                t_end_uenergy = ttotal[i]
                startUNvoicedflag = True
                # sample boundaries from the obtained time stamps
                n_start_unvoiced = fs * t_start_uenergy
                n_end_unvoiced = fs * t_end_uenergy
                # energy of the audio segment, located through the F0 time stamps
                uenergy.append(logEnergy(data_audio[int(n_start_unvoiced):int(n_end_unvoiced)]))
        else:
            if len(vbuffer) != 0:
                # voiced duration from the F0 time step and the buffered frames
                samples = len(vbuffer)
                t = float(samples * size_step)
                voicedtimes.append(t)
                # slope of the voiced segment: fit a line over a temporal axis
                xtemp_slope = []
                tempslope = np.array(vbuffer)
                for j in range(0, len(vbuffer)):
                    xtemp_slope.append(j)
                if len(xtemp_slope) > 1:
                    pol = np.polyfit(xtemp_slope, tempslope, 1)
                    if not np.isnan(pol[0]):
                        slopes.append(pol[0])
                else:
                    pol = [np.nan, np.nan]
                    print("detected short voiced segment", len(xtemp_slope))
                vbuffer = []
                # final time of the voiced segment
                t_end_venergy = ttotal[i]
                frameF0end = i
                if np.isnan(pol[0]):
                    F0_rec[int(frameF0start):int(frameF0end)] = tempslope
                else:
                    F0_rec[int(frameF0start):int(frameF0end)] = pol[0] * np.asarray(xtemp_slope) + pol[1]
                tempslope = []
                xtemp_slope = []
                startvoicedflag = True
                # sample boundaries of the voiced segment from the start/end time stamps
                n_start_voiced = fs * t_start_venergy
                n_end_voiced = fs * t_end_venergy
                # energy of the voiced segment
                envoiced = logEnergy(data_audio[int(n_start_voiced):int(n_end_voiced)])
                venergy.append(envoiced)
                # store the energy of this segment as neighbor; at the next voiced
                # segment, compute the local energy and compare the two
                if recordneighbor:
                    recordneighbor = False
                    neighbor = logEnergy(data_audio[int(n_start_voiced):int(n_end_voiced)])
                else:
                    recordneighbor = True
                    local = logEnergy(data_audio[int(n_start_voiced):int(n_end_voiced)])
                    local = np.array(local)
                    neighbor = np.array(neighbor)
                    # energy difference between the current segment and the previous one
                    energydifflocalneighbors.append(abs(np.mean(local) - np.mean(neighbor)))
            else:
                ubuffer.append(pitch_z[i])
                # initial time of the unvoiced segment
                if startUNvoicedflag:
                    t_start_uenergy = ttotal[i]
                    startUNvoicedflag = False

    # if the last voiced segment was not compared with a following one,
    # compare it with the previous one
    start = True
    end = False
    if recordneighbor == False:
        # scan backwards over the F0 contour to find the last voiced segment
        for i in range(len(pitch_z) - 1, 0, -1):
            if pitch_z[i] >= minf0 and pitch_z[i] <= maxf0:
                if start == True:
                    startseg = i
                    start = False
            else:
                if end == False and not start:
                    endseg = i
                    end = True
            if end == True:
                # sample boundaries from the F0 time stamps; scanning backwards,
                # endseg precedes startseg in time
                n_start = int(fs * ttotal[endseg])
                n_end = int(fs * ttotal[startseg])
                # energy of the last voiced segment
                lastseg = logEnergy(data_audio[n_start:n_end])
                local = np.array(lastseg)
                neighbor = np.array(neighbor)
                # mean energy difference between the two segments
                energydifflocalneighbors.append(abs(np.mean(local) - np.mean(neighbor)))
                break

    energydifflocalneighbors = np.array(energydifflocalneighbors)
    voicedtimes = np.array(voicedtimes)
    unvoicedtimes = np.array(unvoicedtimes)
    silencetimes = np.array(silencetimes)
    uenergy = np.array(uenergy)
    venergy = np.array(venergy)

    """Measures"""
    """Intonation"""
    avgF0slopes = np.average(slopes)  # 1. average F0 slope
    stdF0slopes = np.std(slopes)      # 2. std F0 slope
    """Duration"""
    if silencetimes.size > 0:
        SVU = np.sum(silencetimes) / (np.sum(voicedtimes) + np.sum(unvoicedtimes))  # 3. S/(V+U)
    else:
        SVU = 0
    VU = np.sum(voicedtimes) / np.sum(unvoicedtimes)                             # 4. V/U
    UVU = np.sum(unvoicedtimes) / (np.sum(voicedtimes) + np.sum(unvoicedtimes))  # 5. U/(V+U)
    VVU = np.sum(voicedtimes) / (np.sum(voicedtimes) + np.sum(unvoicedtimes))    # 6. V/(V+U)
    # if there are no silences, avoid dividing by zero
    if silencetimes.size > 0:
        VS = np.sum(voicedtimes) / np.sum(silencetimes)    # 7. V/S
        US = np.sum(unvoicedtimes) / np.sum(silencetimes)  # 8. U/S
    else:
        VS = 0
        US = 0
    URD = np.std(unvoicedtimes)  # 9. std U
    VRD = np.std(voicedtimes)    # 10. std V
    URE = np.std(uenergy)        # 11. std energy U
    VRE = np.std(venergy)        # 12. std energy V
    MSEF0 = np.mean((np.asarray(pitch_z) - np.asarray(F0_rec))**2)
    if silencetimes.size > 0:
        PR = np.std(silencetimes)  # 13. std S
    else:
        PR = 0
    os.remove(temp_filename_f0)
    os.remove(temp_filename_vuv)

    # additional measures
    maxvoicedlen = np.max(voicedtimes)      # max voiced duration
    maxunvoicedlen = np.max(unvoicedtimes)  # max unvoiced duration
    minvoicedlen = np.min(voicedtimes)      # min voiced duration
    minunvoicedlen = np.min(unvoicedtimes)  # min unvoiced duration
    rvuv = len(voicedtimes) / len(unvoicedtimes)  # ratio of voiced to unvoiced segments
    # linear fit of the voiced energy contour over the segment index, its
    # regression coefficient, and the mean squared error of the fit
    t = np.arange(len(venergy))
    energyslope, intercept, RegCoefenergy, p_value, std_err = st.linregress(t, venergy)
    energyslope1 = np.polyval([energyslope, intercept], t)
    msqerrenergy = mean_squared_error(energyslope1, venergy)
    # regression coefficient of the F0 contour, discarding frames at the minf0 floor
    pitch_znz = pitch_z[pitch_z != minf0]
    F0slope, intercept, RegCoeff0, p_value, std_err = st.linregress(np.arange(len(pitch_znz)), pitch_znz)
    # neighbor segment measures
    meanNeighborenergydiff = np.mean(energydifflocalneighbors)
    stdNeighborenergydiff = np.std(energydifflocalneighbors)

    if flag_plots:
        plt.figure(1)
        plt.plot(ttotal, pitch_z, label="F0 (Hz)", linewidth=2.0)
        plt.plot(ttotal, F0_rec, label="Linear regression F0", linewidth=2.0)
        plt.text(min(ttotal), max(pitch_z) - 5, "MSE=" + str(np.round(MSEF0, 3)))
        plt.text(min(ttotal), max(pitch_z) - 10, "Avg. tilt=" + str(np.round(avgF0slopes, 3)))
        plt.text(min(ttotal), max(pitch_z) - 15, "Std. tilt=" + str(np.round(stdF0slopes, 3)))
        plt.text(min(ttotal), max(pitch_z) - 20, "R^2=" + str(np.round(RegCoeff0, 3)))
        plt.xlabel("Time (s)")
        plt.ylabel("Frequency (Hz)")
        plt.legend()
        plt.grid(True)
        plt.show()

    return (avgF0slopes, stdF0slopes, MSEF0, SVU, VU, UVU, VVU, VS, US, URD, VRD,
            URE, VRE, PR, maxvoicedlen, maxunvoicedlen, minvoicedlen, minunvoicedlen,
            rvuv, energyslope, RegCoefenergy, msqerrenergy, RegCoeff0,
            meanNeighborenergydiff, stdNeighborenergydiff, F0_rec, pitch_z, venergy, uenergy)
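# Minimal usage sketch for intonation_duration above (hypothetical audio path;
# requires Praat and the tempfiles directory resolved from path_app). The
# measures are unpacked by position, following the return statement.
def _demo_intonation_duration():
    file_audio = "../audios/001_ddk1_PCGITA.wav"
    results = intonation_duration(file_audio, size_step=0.01, flag_plots=False)
    avgF0slopes, stdF0slopes, MSEF0 = results[0], results[1], results[2]
    print("avg F0 tilt:", avgF0slopes)
    print("std F0 tilt:", stdF0slopes)
    print("MSE of the F0 regression:", MSEF0)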
def prosody_static(audio, flag_plots):
    """Compute static prosody features (F0, energy, and duration) from an audio file.

    :param audio: .wav audio file (stereo; only the first channel is used).
    :param flag_plots: if True, plot the waveform, the F0 contour, and the energy contour.
    :returns: F0 contour, log-energy contour, and 13 scalar F0/energy/duration measures.
    """
    fs, data_audio = read(audio)
    # keep only the first channel and drop the last sample (assumes a stereo file)
    data_audio = data_audio[:-1, 0]
    # remove the DC offset and normalize the amplitude
    data_audio = data_audio - np.mean(data_audio)
    data_audio = data_audio / float(np.max(np.abs(data_audio)))
    size_frameS = 0.02 * float(fs)
    size_stepS = 0.01 * float(fs)
    thr_len_pause = 0.14 * float(fs)
    thr_en_pause = 0.2
    overlap = size_stepS / size_frameS
    nF = int((len(data_audio) / size_frameS / overlap)) - 1
    data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
    F0 = pysptk.sptk.rapt(data_audiof, fs, int(size_stepS), min=60, max=350,
                          voice_bias=-0.2, otype='f0')
    # short-term log-energy contour
    logE = []
    for l in range(nF):
        data_frame = data_audio[int(l * size_stepS):int(l * size_stepS + size_frameS)]
        logE.append(logEnergy(data_frame))
    logE = np.asarray(logE)
    segmentsV = V_UV(F0, data_audio, fs, type_seg="Voiced", size_stepS=size_stepS)
    segmentsU = V_UV(F0, data_audio, fs, type_seg="Unvoiced", size_stepS=size_stepS)
    Nvoiced = len(segmentsV)
    Nunvoiced = len(segmentsU)
    # number of voiced segments per second
    Vrate = fs * float(Nvoiced) / len(data_audio)
    # average and std of the voiced segment durations (ms)
    avgdurv = 1000 * np.mean([len(segmentsV[k]) for k in range(Nvoiced)]) / float(fs)
    stddurv = 1000 * np.std([len(segmentsV[k]) for k in range(Nvoiced)]) / float(fs)
    # unvoiced segments with low energy or long duration are counted as silences (pauses)
    silence = []
    for k in range(Nunvoiced):
        eu = logEnergy(segmentsU[k])
        if eu < thr_en_pause or len(segmentsU[k]) > thr_len_pause:
            silence.append(segmentsU[k])
    Silrate = fs * float(len(silence)) / len(data_audio)
    avgdurs = 1000 * np.mean([len(silence[k]) for k in range(len(silence))]) / float(fs)
    stddurs = 1000 * np.std([len(silence[k]) for k in range(len(silence))]) / float(fs)

    if flag_plots:
        plt.figure(1)
        plt.subplot(311)
        t = np.arange(0, float(len(data_audio)) / fs, 1.0 / fs)
        if len(t) != len(data_audio):
            t = np.arange(1.0 / fs, float(len(data_audio)) / fs, 1.0 / fs)
        plt.plot(t, data_audio, 'k')
        plt.ylabel('Amplitude')
        plt.xlabel('Time (s)')
        plt.xlim([0, t[-1]])
        plt.grid(True)
        plt.subplot(312)
        fsp = len(F0) / t[-1]
        t2 = np.arange(0.0, t[-1], 1.0 / fsp)
        if len(t2) > len(F0):
            t2 = t2[:len(F0)]
        elif len(F0) > len(t2):
            F0 = F0[:len(t2)]
        plt.plot(t2, F0, color='k', linewidth=2.0)
        plt.xlabel('Time (s)')
        plt.ylabel('F0 (Hz)')
        plt.ylim([0, np.max(F0) + 10])
        plt.xlim([0, t[-1]])
        plt.grid(True)
        plt.subplot(313)
        fse = len(logE) / t[-1]
        t3 = np.arange(0.0, t[-1], 1.0 / fse)
        if len(t3) > len(logE):
            t3 = t3[:len(logE)]
        elif len(logE) > len(t3):
            logE = logE[:len(t3)]
        plt.plot(t3, logE, color='k', linewidth=2.0)
        plt.xlabel('Time (s)')
        plt.ylabel('Energy (dB)')
        plt.xlim([0, t[-1]])
        plt.grid(True)
        plt.show()

    F0std = np.std(F0[F0 != 0])
    F0varsemi = Hz2semitones(F0std**2)
    return (F0, logE, np.mean(F0[F0 != 0]), np.std(F0[F0 != 0]), np.max(F0),
            np.mean(logE), np.std(logE), np.max(logE),
            Vrate, avgdurv, stddurv, Silrate, avgdurs, stddurs, F0varsemi)
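# Minimal usage sketch for the module-level prosody_static above. This version
# keeps only the first channel, so it assumes a stereo .wav file; the path is
# hypothetical.
def _demo_prosody_static_rapt():
    file_audio = "../audios/001_ddk1_PCGITA.wav"
    out = prosody_static(file_audio, flag_plots=False)
    F0, logE = out[0], out[1]
    meanF0, stdF0, maxF0 = out[2], out[3], out[4]
    print("mean F0 (Hz):", meanF0, "std F0 (Hz):", stdF0, "max F0 (Hz):", maxF0)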
def intonation_duration(audio, size_step=0.01, minf0=60, maxf0=350, stol=0.150, flag_plots=False):
    """Compute 14 intonation and duration measures from an audio file.

    :param audio: .wav audio file.
    :param size_step: time step (s) used for the F0 extraction.
    :param minf0: minimum F0 (Hz) accepted as voiced.
    :param maxf0: maximum F0 (Hz) accepted as voiced.
    :param stol: silence tolerance (s); unvoiced segments longer than this count as silences.
    :param flag_plots: if True, plot the F0 contour and its linear regression.
    """
    fs, data_audio = read(audio)
    temp_filename_f0 = '../tempfiles/pitchtemp.txt'
    temp_filename_vuv = '../tempfiles/voicetemp.txt'
    praat_functions.praat_vuv(audio, temp_filename_f0, temp_filename_vuv,
                              time_stepF0=size_step, minf0=minf0, maxf0=maxf0)
    pitch_z, ttotal = praat_functions.decodeF0(temp_filename_f0, len(data_audio) / fs, size_step)
    # slopes of the voiced F0 segments
    slopes = []
    # buffers for voiced and unvoiced segments
    vbuffer = []
    ubuffer = []
    # energies of the voiced and unvoiced segments
    venergy = []
    uenergy = []
    # arrays for storing durations
    voicedtimes = []
    unvoicedtimes = []
    silencetimes = []
    # flags for the starting points of voiced and unvoiced segments
    startvoicedflag = True
    startUNvoicedflag = True
    F0_rec = np.zeros(len(pitch_z))
    for i in range(0, len(pitch_z) - 1):
        # condition for a voiced frame
        if pitch_z[i] >= minf0 and pitch_z[i] <= maxf0:
            vbuffer.append(pitch_z[i])
            # voiced segment starting time
            if startvoicedflag:
                t_start_venergy = ttotal[i]
                startvoicedflag = False
                frameF0start = i
            if len(ubuffer) != 0:
                # unvoiced duration from the F0 time step and the buffered frames
                samples = len(ubuffer)
                t = float(samples * size_step)
                # silence condition
                if t > stol:
                    silencetimes.append(t)
                else:
                    unvoicedtimes.append(t)
                ubuffer = []
                # final time of the unvoiced segment
                t_end_uenergy = ttotal[i]
                startUNvoicedflag = True
                # sample boundaries from the obtained time stamps
                n_start_unvoiced = fs * t_start_uenergy
                n_end_unvoiced = fs * t_end_uenergy
                # energy of the audio segment, located through the F0 time stamps
                uenergy.append(logEnergy(data_audio[int(n_start_unvoiced):int(n_end_unvoiced)]))
        else:
            if len(vbuffer) != 0:
                # voiced duration from the F0 time step and the buffered frames
                samples = len(vbuffer)
                t = float(samples * size_step)
                voicedtimes.append(t)
                # slope of the voiced segment: fit a line over a temporal axis
                xtemp_slope = []
                tempslope = np.array(vbuffer)
                for j in range(0, len(vbuffer)):
                    xtemp_slope.append(j)
                pol = np.polyfit(xtemp_slope, tempslope, 1)
                if np.isnan(pol[0]):
                    print("detected short voiced segment")
                else:
                    slopes.append(pol[0])
                vbuffer = []
                # final time of the voiced segment
                t_end_venergy = ttotal[i]
                frameF0end = i
                if np.isnan(pol[0]):
                    F0_rec[int(frameF0start):int(frameF0end)] = tempslope
                else:
                    F0_rec[int(frameF0start):int(frameF0end)] = pol[0] * np.asarray(xtemp_slope) + pol[1]
                tempslope = []
                xtemp_slope = []
                startvoicedflag = True
                # sample boundaries of the voiced segment from the start/end time stamps
                n_start_voiced = fs * t_start_venergy
                n_end_voiced = fs * t_end_venergy
                venergy.append(logEnergy(data_audio[int(n_start_voiced):int(n_end_voiced)]))
            else:
                ubuffer.append(pitch_z[i])
                # initial time of the unvoiced segment
                if startUNvoicedflag:
                    t_start_uenergy = ttotal[i]
                    startUNvoicedflag = False

    voicedtimes = np.array(voicedtimes)
    unvoicedtimes = np.array(unvoicedtimes)
    silencetimes = np.array(silencetimes)
    uenergy = np.array(uenergy)
    venergy = np.array(venergy)

    """Measures"""
    """Intonation"""
    avgF0slopes = np.average(slopes)  # 1. average F0 slope
    stdF0slopes = np.std(slopes)      # 2. std F0 slope
    """Duration"""
    if silencetimes.size > 0:
        SVU = np.sum(silencetimes) / (np.sum(voicedtimes) + np.sum(unvoicedtimes))  # 3. S/(V+U)
    else:
        SVU = 0
    VU = np.sum(voicedtimes) / np.sum(unvoicedtimes)                             # 4. V/U
    UVU = np.sum(unvoicedtimes) / (np.sum(voicedtimes) + np.sum(unvoicedtimes))  # 5. U/(V+U)
    VVU = np.sum(voicedtimes) / (np.sum(voicedtimes) + np.sum(unvoicedtimes))    # 6. V/(V+U)
    # if there are no silences, avoid dividing by zero
    if silencetimes.size > 0:
        VS = np.sum(voicedtimes) / np.sum(silencetimes)    # 7. V/S
        US = np.sum(unvoicedtimes) / np.sum(silencetimes)  # 8. U/S
    else:
        VS = 0
        US = 0
    URD = np.std(unvoicedtimes)  # 9. std U
    VRD = np.std(voicedtimes)    # 10. std V
    URE = np.std(uenergy)        # 11. std energy U
    VRE = np.std(venergy)        # 12. std energy V
    MSEF0 = np.mean((np.asarray(pitch_z) - np.asarray(F0_rec))**2)
    if silencetimes.size > 0:
        PR = np.std(silencetimes)  # 13. std S
    else:
        PR = 0
    os.remove(temp_filename_f0)
    os.remove(temp_filename_vuv)

    if flag_plots:
        plt.figure(1)
        plt.plot(ttotal, pitch_z, label="F0 (Hz)", linewidth=2.0)
        plt.plot(ttotal, F0_rec, label="Linear regression F0", linewidth=2.0)
        plt.text(min(ttotal), max(pitch_z) - 5, "MSE=" + str(np.round(MSEF0, 3)))
        plt.text(min(ttotal), max(pitch_z) - 10, "Avg. tilt=" + str(np.round(avgF0slopes, 3)))
        plt.text(min(ttotal), max(pitch_z) - 15, "Std. tilt=" + str(np.round(stdF0slopes, 3)))
        plt.xlabel("Time (s)")
        plt.ylabel("Frequency (Hz)")
        plt.legend()
        plt.grid(True)
        plt.show()

    return (avgF0slopes, stdF0slopes, MSEF0, SVU, VU, UVU, VVU,
            VS, US, URD, VRD, URE, VRE, PR)
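# Minimal usage sketch for this version of intonation_duration, which returns
# 14 measures (hypothetical audio path; the relative ../tempfiles/ directory
# must exist, since the temporary Praat files are written there).
def _demo_intonation_duration_simple():
    file_audio = "../audios/001_ddk1_PCGITA.wav"
    (avgF0slopes, stdF0slopes, MSEF0, SVU, VU, UVU, VVU,
     VS, US, URD, VRD, URE, VRE, PR) = intonation_duration(file_audio)
    print("V/U ratio:", VU)
    print("std of silence durations (PR):", PR)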
def prosody_static(audio, flag_plots, pitch_method='praat'):
    """Compute the static prosody features (F0, energy, and duration based) from an audio file.

    :param audio: .wav audio file.
    :param flag_plots: if True, plot the prosody contours.
    :param pitch_method: 'praat' or 'rapt' for the F0 extraction.
    :returns: array with the static prosody features.
    """
    fs, data_audio = read(audio)
    # remove the DC offset and normalize the amplitude
    data_audio = data_audio - np.mean(data_audio)
    data_audio = data_audio / float(np.max(np.abs(data_audio)))
    size_frameS = 0.02 * float(fs)
    size_stepS = 0.01 * float(fs)
    thr_len_pause = 0.14 * float(fs)

    if pitch_method == 'praat':
        temp_uuid = audio.split('/')[-1][0:-4]
        temp_filename_f0 = path_app + '/../tempfiles/tempF0' + temp_uuid + '.txt'
        temp_filename_vuv = path_app + '/../tempfiles/tempVUV' + temp_uuid + '.txt'
        praat_functions.praat_vuv(audio, temp_filename_f0, temp_filename_vuv,
                                  time_stepF0=0.01, minf0=60, maxf0=350)
        F0, _ = praat_functions.decodeF0(temp_filename_f0, len(data_audio) / float(fs), 0.01)
        os.remove(temp_filename_f0)
        os.remove(temp_filename_vuv)
    elif pitch_method == 'rapt':
        data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
        F0 = pysptk.sptk.rapt(data_audiof, fs, int(size_stepS), min=60, max=350,
                              voice_bias=-0.2, otype='f0')

    segmentsV = V_UV(F0, data_audio, fs, type_seg="Voiced", size_stepS=size_stepS)
    segmentsUP = V_UV(F0, data_audio, fs, type_seg="Unvoiced", size_stepS=size_stepS)
    # split the unvoiced segments into pauses (longer than thr_len_pause) and short unvoiced segments
    segmentsP = []
    segmentsU = []
    for k in range(len(segmentsUP)):
        if len(segmentsUP[k]) > thr_len_pause:
            segmentsP.append(segmentsUP[k])
        else:
            segmentsU.append(segmentsUP[k])

    F0_features = F0feat(F0)
    energy_featuresV = energy_feat(segmentsV, fs, size_frameS, size_stepS)
    energy_featuresU = energy_feat(segmentsU, fs, size_frameS, size_stepS)
    duration_features = duration_feat(segmentsV, segmentsU, segmentsP, data_audio, fs)

    if flag_plots:
        plot_pros(data_audio, fs, F0, segmentsV, segmentsU)

    features = np.hstack((F0_features, energy_featuresV, energy_featuresU, duration_features))
    return features
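# Minimal usage sketch comparing the two pitch back-ends of the variant above
# (hypothetical audio path; 'praat' requires Praat and the tempfiles directory
# resolved from path_app, while 'rapt' only needs pysptk).
def _demo_prosody_static_backends():
    file_audio = "../audios/001_ddk1_PCGITA.wav"
    feats_praat = prosody_static(file_audio, flag_plots=False, pitch_method='praat')
    feats_rapt = prosody_static(file_audio, flag_plots=False, pitch_method='rapt')
    print(feats_praat.shape, feats_rapt.shape)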