import os

import numpy as np
import matplotlib.pyplot as plt
import pysptk
import scipy.stats as st
from scipy.io.wavfile import read
from sklearn.metrics import mean_squared_error

import praat_functions

# The remaining helpers used below (logEnergy, V_UV, extractTrans, E_cont,
# jitter_env, shimmer_env, APQ, PPQ, Hz2semitones, F0feat, energy_feat,
# duration_feat and the plot_* functions) are assumed to come from this
# repository's utility modules, which are not shown in this excerpt.

# Base path of this script, used to resolve the shared temp-file directory.
path_app = os.path.dirname(os.path.abspath(__file__))


def getF0(audio_file, size_step=0.02, minF0=60, maxF0=500):
    """Extract the F0 contour of audio_file with Praat."""
    # Read the signal (needed below to give decodeF0 the file duration).
    fs, data_audio = read(audio_file)
    name_audio = audio_file.split('/')
    temp_uuid = 'phon' + name_audio[-1][0:-4]
    temp_filename_vuv = '../tempfiles/tempVUV' + temp_uuid + '.txt'
    temp_filename_f0 = '../tempfiles/tempF0' + temp_uuid + '.txt'
    praat_functions.praat_vuv(audio_file, temp_filename_f0, temp_filename_vuv,
                              time_stepF0=size_step, minf0=minF0, maxf0=maxF0)
    F0, _ = praat_functions.decodeF0(temp_filename_f0,
                                     len(data_audio) / float(fs), size_step)
    os.remove(temp_filename_vuv)
    os.remove(temp_filename_f0)
    return F0
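# Usage sketch for getF0. The WAV path is hypothetical; a mono 16-bit PCM file
# and an existing ../tempfiles directory are assumed:
#
#     F0 = getF0('audio/sample.wav', size_step=0.02, minF0=60, maxF0=500)
#     print(np.mean(F0[F0 > 0]))  # mean F0 over voiced frames only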
def articulation_continuous(audio_filename, flag_plots, sizeframe=0.04,
                            step=0.02, nB=22, nMFCC=12, minf0=60, maxf0=350,
                            voice_bias=-0.5, len_thr_miliseconds=270.0,
                            pitch_method='praat'):
    """Compute articulation features from voiced/unvoiced transitions."""
    fs, data_audio = read(audio_filename)
    data_audio = data_audio - np.mean(data_audio)
    data_audio = data_audio / float(np.max(np.abs(data_audio)))
    size_frameS = sizeframe * float(fs)
    size_stepS = step * float(fs)
    overlap = size_stepS / size_frameS
    if pitch_method == 'praat':
        name_audio = audio_filename.split('/')
        temp_uuid = 'artic' + name_audio[-1][0:-4]
        temp_filename_vuv = '../tempfiles/tempVUV' + temp_uuid + '.txt'
        temp_filename_f0 = '../tempfiles/tempF0' + temp_uuid + '.txt'
        praat_functions.praat_vuv(audio_filename, temp_filename_f0,
                                  temp_filename_vuv, time_stepF0=step,
                                  minf0=minf0, maxf0=maxf0)
        F0, _ = praat_functions.decodeF0(temp_filename_f0,
                                         len(data_audio) / float(fs), step)
        segmentsFull, segmentsOn, segmentsOff = praat_functions.read_textgrid_trans(
            temp_filename_vuv, data_audio, fs, sizeframe)
        os.remove(temp_filename_vuv)
        os.remove(temp_filename_f0)
    elif pitch_method == 'rapt':
        data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
        F0 = pysptk.sptk.rapt(data_audiof, fs, int(size_stepS), min=minf0,
                              max=maxf0, voice_bias=voice_bias, otype='f0')
        segmentsOn = V_UV(F0, data_audio, fs, 'onset')
        segmentsOff = V_UV(F0, data_audio, fs, 'offset')
    BBEon, MFCCon = extractTrans(segmentsOn, fs, size_frameS, size_stepS, nB, nMFCC)
    BBEoff, MFCCoff = extractTrans(segmentsOff, fs, size_frameS, size_stepS, nB, nMFCC)
    # First- and second-order derivatives of the MFCCs.
    DMFCCon = np.asarray(
        [np.diff(MFCCon[:, nf], n=1) for nf in range(MFCCon.shape[1])]).T
    DDMFCCon = np.asarray(
        [np.diff(MFCCon[:, nf], n=2) for nf in range(MFCCon.shape[1])]).T
    DMFCCoff = np.asarray(
        [np.diff(MFCCoff[:, nf], n=1) for nf in range(MFCCoff.shape[1])]).T
    DDMFCCoff = np.asarray(
        [np.diff(MFCCoff[:, nf], n=2) for nf in range(MFCCoff.shape[1])]).T
    # TODO: Make parameters configurable (if worth it).
    name_audio = audio_filename.split('/')
    temp_uuid = 'artic' + name_audio[-1][0:-4]
    temp_filename = '../tempfiles/tempFormants' + temp_uuid + '.txt'
    praat_functions.praat_formants(audio_filename, temp_filename, sizeframe, step)
    [F1, F2] = praat_functions.decodeFormants(temp_filename)
    os.remove(temp_filename)
    # Zero-pad F0, F1, and F2 to the same length.
    if len(F0) < len(F1):
        F0 = np.hstack((F0, np.zeros(len(F1) - len(F0))))
    else:
        F1 = np.hstack((F1, np.zeros(len(F0) - len(F1))))
        F2 = np.hstack((F2, np.zeros(len(F0) - len(F2))))
    # Suppress formants inside unvoiced runs longer than the length threshold.
    pos0 = np.where(F0 == 0)[0]
    dpos0 = np.hstack(([1], np.diff(pos0)))
    f0u = np.split(pos0, np.where(dpos0 > 1)[0])
    thr_sil = int((len_thr_miliseconds / 1000.) / step)  # threshold in frames (ms -> s -> frames)
    sil_seg = []
    for l in range(len(f0u)):
        if len(f0u[l]) >= thr_sil:
            F1[f0u[l]] = 0
            F2[f0u[l]] = 0
            sil_seg.append(f0u[l])  # collect the silenced frame indices
    sil_seg = np.hstack(sil_seg) if sil_seg else np.array([])
    F1nz = F1[F1 != 0]
    F2nz = F2[F2 != 0]
    DF1 = np.diff(F1, n=1)
    DF2 = np.diff(F2, n=1)
    DDF1 = np.diff(F1, n=2)
    DDF2 = np.diff(F2, n=2)
    if flag_plots:
        plot_art(data_audio, fs, F0, F1, F2, segmentsOn, segmentsOff)
    return (BBEon, MFCCon, DMFCCon, DDMFCCon, BBEoff, MFCCoff, DMFCCoff,
            DDMFCCoff, F1nz, DF1, DDF1, F2nz, DF2, DDF2)
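# Usage sketch for articulation_continuous. The path is hypothetical; the 14
# returned arrays follow the order of the return statement above:
#
#     feats = articulation_continuous('audio/sample.wav', flag_plots=False)
#     BBEon, MFCCon = feats[0], feats[1]  # Bark-band energies and MFCCs at onsets
#     F1nz = feats[8]                     # nonzero first-formant trajectory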
def intonation_duration(audio, size_step=0.01, minf0=60, maxf0=350, stol=0.150,
                        flag_plots=False):
    """Compute intonation and duration measures from the F0 contour."""
    fs, data_audio = read(audio)
    data_audio = data_audio - np.mean(data_audio)
    data_audio = data_audio / float(np.max(np.abs(data_audio)))
    temp_filename_f0 = path_app + '/../tempfiles/pitchtemp.txt'
    temp_filename_vuv = path_app + '/../tempfiles/voicetemp.txt'
    praat_functions.praat_vuv(audio, temp_filename_f0, temp_filename_vuv,
                              time_stepF0=size_step, minf0=minf0, maxf0=maxf0,
                              path_praat_script=path_app + "/../praat")
    pitch_z, ttotal = praat_functions.decodeF0(temp_filename_f0,
                                               len(data_audio) / fs, size_step)
    # Slopes of the F0 contour within each voiced segment.
    slopes = []
    # Buffers for the current voiced and unvoiced segments.
    vbuffer = []
    ubuffer = []
    # Log-energy of each voiced and unvoiced segment.
    venergy = []
    uenergy = []
    # Durations of voiced, unvoiced, and silence segments.
    voicedtimes = []
    unvoicedtimes = []
    silencetimes = []
    # Flags marking the start of a voiced or unvoiced segment.
    startvoicedflag = True
    startUNvoicedflag = True
    # Flag to compare the current voiced segment with the previous one.
    recordneighbor = True
    energydifflocalneighbors = []
    F0_rec = np.zeros(len(pitch_z))
    for i in range(0, len(pitch_z) - 1):
        if pitch_z[i] >= minf0 and pitch_z[i] <= maxf0:
            # Voiced frame.
            vbuffer.append(pitch_z[i])
            if startvoicedflag:
                t_start_venergy = ttotal[i]  # starting time of the voiced segment
                startvoicedflag = False
                frameF0start = i
            if len(ubuffer) != 0:
                # Close the pending unvoiced segment; its duration follows from
                # the number of buffered frames and the step size.
                samples = len(ubuffer)
                t = float(samples * size_step)
                if t > stol:
                    silencetimes.append(t)  # longer than the tolerance: silence
                else:
                    unvoicedtimes.append(t)
                ubuffer = []
                t_end_uenergy = ttotal[i]
                startUNvoicedflag = True
                # Energy of the raw audio segment delimited by the F0 timestamps.
                n_start_unvoiced = fs * t_start_uenergy
                n_end_unvoiced = fs * t_end_uenergy
                uenergy.append(logEnergy(
                    data_audio[int(n_start_unvoiced):int(n_end_unvoiced)]))
        else:
            if len(vbuffer) != 0:
                # Close the pending voiced segment.
                samples = len(vbuffer)
                t = float(samples * size_step)
                voicedtimes.append(t)
                # Fit a line to the F0 contour of the voiced segment.
                xtemp_slope = list(range(len(vbuffer)))
                tempslope = np.array(vbuffer)
                if len(xtemp_slope) > 1:
                    pol = np.polyfit(xtemp_slope, tempslope, 1)
                    if not np.isnan(pol[0]):
                        slopes.append(pol[0])
                else:
                    pol = [np.nan, np.nan]
                    print("detected short voiced segment", len(xtemp_slope))
                vbuffer = []
                t_end_venergy = ttotal[i]
                frameF0end = i
                # Reconstruct the F0 contour from the linear fit.
                if np.isnan(pol[0]):
                    F0_rec[int(frameF0start):int(frameF0end)] = tempslope
                else:
                    F0_rec[int(frameF0start):int(frameF0end)] = (
                        pol[0] * np.asarray(xtemp_slope) + pol[1])
                tempslope = []
                xtemp_slope = []
                startvoicedflag = True
                # Energy of the voiced segment from its start/end timestamps.
                n_start_voiced = fs * t_start_venergy
                n_end_voiced = fs * t_end_venergy
                envoiced = logEnergy(
                    data_audio[int(n_start_voiced):int(n_end_voiced)])
                venergy.append(envoiced)
                # Store every other segment's energy in `neighbor`; on the next
                # voiced segment, compute the local energy difference.
                if recordneighbor:
                    recordneighbor = False
                    neighbor = logEnergy(
                        data_audio[int(n_start_voiced):int(n_end_voiced)])
                else:
                    recordneighbor = True
                    local = np.array(logEnergy(
                        data_audio[int(n_start_voiced):int(n_end_voiced)]))
                    neighbor = np.array(neighbor)
                    # Energy difference between the current and previous segments.
                    energydifflocalneighbors.append(
                        abs(np.mean(local) - np.mean(neighbor)))
            ubuffer.append(pitch_z[i])
            if startUNvoicedflag:
                t_start_uenergy = ttotal[i]  # starting time of the unvoiced segment
                startUNvoicedflag = False
    # If the last voiced segment was not compared with a following one, compare
    # it with the previous one by scanning backwards for its boundaries.
    start = True
    end = False
    if recordneighbor == False:
        for i in range(len(pitch_z) - 1, 0, -1):
            if pitch_z[i] >= minf0 and pitch_z[i] <= maxf0:
                if start:
                    startseg = i  # last voiced frame of the signal
                    start = False
            else:
                if not start and not end:
                    endseg = i  # frame just before the segment starts
                    end = True
            if end:
                # Convert the F0 frame timestamps to sample indices; scanning
                # backwards, endseg precedes startseg in time.
                n_start = int(fs * ttotal[endseg])
                n_end = int(fs * ttotal[startseg])
                lastseg = logEnergy(data_audio[n_start:n_end])
                local = np.array(lastseg)
                neighbor = np.array(neighbor)
                # Mean energy difference between the two segments.
                energydifflocalneighbors.append(
                    abs(np.mean(local) - np.mean(neighbor)))
                break
    energydifflocalneighbors = np.array(energydifflocalneighbors)
    voicedtimes = np.array(voicedtimes)
    unvoicedtimes = np.array(unvoicedtimes)
    silencetimes = np.array(silencetimes)
    uenergy = np.array(uenergy)
    venergy = np.array(venergy)

    # --- Measures ---
    # Intonation
    avgF0slopes = np.average(slopes)  # 1. average F0 slope
    stdF0slopes = np.std(slopes)      # 2. std of the F0 slopes
    # Duration (guard against division by zero when there are no silences)
    if silencetimes.size > 0:
        SVU = np.sum(silencetimes) / (np.sum(voicedtimes) + np.sum(unvoicedtimes))  # 3. S/(V+U)
    else:
        SVU = 0
    VU = np.sum(voicedtimes) / np.sum(unvoicedtimes)  # 4. V/U
    UVU = np.sum(unvoicedtimes) / (np.sum(voicedtimes) + np.sum(unvoicedtimes))  # 5. U/(V+U)
    VVU = np.sum(voicedtimes) / (np.sum(voicedtimes) + np.sum(unvoicedtimes))    # 6. V/(V+U)
    if silencetimes.size > 0:
        VS = np.sum(voicedtimes) / np.sum(silencetimes)    # 7. V/S
        US = np.sum(unvoicedtimes) / np.sum(silencetimes)  # 8. U/S
    else:
        VS = 0
        US = 0
    URD = np.std(unvoicedtimes)  # 9. std U
    VRD = np.std(voicedtimes)    # 10. std V
    URE = np.std(uenergy)        # 11. std energy of unvoiced segments
    VRE = np.std(venergy)        # 12. std energy of voiced segments
    # Mean squared error between the F0 contour and its linear reconstruction.
    MSEF0 = np.mean((np.asarray(pitch_z) - np.asarray(F0_rec))**2)
    if silencetimes.size > 0:
        PR = np.std(silencetimes)  # 13. std S
    else:
        PR = 0
    os.remove(temp_filename_f0)
    os.remove(temp_filename_vuv)
    # Additional measures
    maxvoicedlen = np.max(voicedtimes)      # max voiced duration
    maxunvoicedlen = np.max(unvoicedtimes)  # max unvoiced duration
    minvoicedlen = np.min(voicedtimes)      # min voiced duration
    minunvoicedlen = np.min(unvoicedtimes)  # min unvoiced duration
    rvuv = len(voicedtimes) / len(unvoicedtimes)  # ratio of voiced to unvoiced segments
    # MSE of the voiced-energy contour against its linear regression, plus the
    # regression coefficient (energy regressed on the segment index).
    t = np.arange(len(venergy))
    energyslope, intercept, RegCoefenergy, p_value, std_err = st.linregress(
        t, venergy)
    energyslope1 = np.polyval([energyslope, intercept], t)
    msqerrenergy = mean_squared_error(energyslope1, venergy)
    # Regression coefficient of the voiced (nonzero) F0 values over time.
    pitch_znz = pitch_z[pitch_z != 0]
    F0slope, intercept, RegCoeff0, p_value, std_err = st.linregress(
        np.arange(len(pitch_znz)), pitch_znz)
    # Energy differences between neighboring voiced segments.
    meanNeighborenergydiff = np.mean(energydifflocalneighbors)
    stdNeighborenergydiff = np.std(energydifflocalneighbors)
    if flag_plots:
        plt.figure(1)
        plt.plot(ttotal, pitch_z, label="F0 (Hz)", linewidth=2.0)
        plt.plot(ttotal, F0_rec, label="Linear regression of F0", linewidth=2.0)
        plt.text(min(ttotal), max(pitch_z) - 5, "MSE=" + str(np.round(MSEF0, 3)))
        plt.text(min(ttotal), max(pitch_z) - 10,
                 "Avg. tilt=" + str(np.round(avgF0slopes, 3)))
        plt.text(min(ttotal), max(pitch_z) - 15,
                 "Std. tilt=" + str(np.round(stdF0slopes, 3)))
        plt.text(min(ttotal), max(pitch_z) - 20,
                 "R^2=" + str(np.round(RegCoeff0, 3)))
        plt.xlabel("Time (s)")
        plt.ylabel("Frequency (Hz)")
        plt.legend()
        plt.grid(True)
        plt.show()
    return (avgF0slopes, stdF0slopes, MSEF0, SVU, VU, UVU, VVU, VS, US, URD,
            VRD, URE, VRE, PR, maxvoicedlen, maxunvoicedlen, minvoicedlen,
            minunvoicedlen, rvuv, energyslope, RegCoefenergy, msqerrenergy,
            RegCoeff0, meanNeighborenergydiff, stdNeighborenergydiff, F0_rec,
            pitch_z, venergy, uenergy)
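# Usage sketch for intonation_duration. The path is hypothetical; the Praat
# scripts are assumed to live under path_app/../praat, as passed above:
#
#     measures = intonation_duration('audio/sample.wav', size_step=0.01)
#     avgF0slopes, stdF0slopes, MSEF0 = measures[0], measures[1], measures[2]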
def phonationVowels(audio, flag_plots, size_frame=0.04, size_step=0.02,
                    minf0=60, maxf0=350, voice_bias=-0.2,
                    energy_thr_percent=0.025, pitch_method='praat'):
    """Compute phonation features (jitter, shimmer, APQ, PPQ, energy)."""
    fs, data_audio = read(audio)
    data_audio = data_audio - np.mean(data_audio)
    data_audio = data_audio / float(np.max(np.abs(data_audio)))
    size_frameS = size_frame * float(fs)
    size_stepS = size_step * float(fs)
    overlap = size_stepS / size_frameS
    if pitch_method == 'praat':
        name_audio = audio.split('/')
        temp_uuid = 'phon' + name_audio[-1][0:-4]
        if not os.path.exists('../tempfiles/'):
            os.makedirs('../tempfiles/')
        temp_filename_vuv = '../tempfiles/tempVUV' + temp_uuid + '.txt'
        temp_filename_f0 = '../tempfiles/tempF0' + temp_uuid + '.txt'
        praat_functions.praat_vuv(audio, temp_filename_f0, temp_filename_vuv,
                                  time_stepF0=size_step, minf0=minf0, maxf0=maxf0)
        F0, _ = praat_functions.decodeF0(temp_filename_f0,
                                         len(data_audio) / float(fs), size_step)
        #os.remove(temp_filename_vuv)
        #os.remove(temp_filename_f0)
    elif pitch_method == 'rapt':
        data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
        F0 = pysptk.sptk.rapt(data_audiof, fs, int(size_stepS), min=minf0,
                              max=maxf0, voice_bias=voice_bias, otype='f0')
    F0nz = F0[F0 != 0]
    Jitter = jitter_env(F0nz, len(F0nz))
    nF = int((len(data_audio) / size_frameS / overlap)) - 1
    Amp = []
    logE = []
    apq = []
    ppq = []
    DF0 = np.diff(F0nz, 1)
    DDF0 = np.diff(DF0, 1)
    F0z = F0[F0 == 0]
    totaldurU = len(F0z)
    thresholdE = 10 * logEnergy([energy_thr_percent])
    degreeU = 100 * float(totaldurU) / len(F0)  # percentage of unvoiced frames
    lnz = 0
    for l in range(nF):
        data_frame = data_audio[int(l * size_stepS):int(l * size_stepS + size_frameS)]
        energy = 10 * logEnergy(data_frame)
        if F0[l] != 0:
            Amp.append(np.max(np.abs(data_frame)))
            logE.append(10 * logEnergy(data_frame))
            if lnz >= 12:  # TODO: make the APQ window size configurable
                amp_arr = np.asarray([Amp[j] for j in range(lnz - 12, lnz)])
                apq.append(APQ(amp_arr))
            if lnz >= 6:  # TODO: make the PPQ window size configurable
                f0arr = np.asarray([F0nz[j] for j in range(lnz - 6, lnz)])
                ppq.append(PPQ(1 / f0arr))
            lnz = lnz + 1
        print("frame " + str(l) + " from " + str(nF)
              + "-" * int(100 * l / nF) + ">" + str(int(100 * (l + 1) / nF)) + "%",
              sep=' ', end='\r', flush=True)
    Shimmer = shimmer_env(Amp, len(Amp))
    apq = np.asarray(apq)
    ppq = np.asarray(ppq)
    logE = np.asarray(logE)
    F0semi = np.asarray([Hz2semitones(F0nz[l]) for l in range(len(F0nz))])
    if flag_plots:
        plot_phon(data_audio, fs, F0, logE)
    print("Jitter=", len(Jitter))
    print("Shimmer", len(Shimmer))
    print("APQ", len(apq))
    print("PPQ", len(ppq))
    print("DF0", len(DF0))
    print("DDF0", len(DDF0))
    print("Energy", len(logE))
    print("degree unvoiced", degreeU)
    return F0, DF0, DDF0, F0semi, Jitter, Shimmer, apq, ppq, logE, degreeU
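# Usage sketch for phonationVowels over a sustained-vowel recording
# (hypothetical path):
#
#     F0, DF0, DDF0, F0semi, Jitter, Shimmer, apq, ppq, logE, degreeU = \
#         phonationVowels('audio/vowel_a.wav', flag_plots=False)
#     print("jitter mean:", np.mean(Jitter), "shimmer mean:", np.mean(Shimmer))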
def prosody_dynamic(audio, size_frame=0.03, size_step=0.01, minf0=60,
                    maxf0=350, voice_bias=-0.2, energy_thr_percent=0.025, P=5,
                    pitch_method='praat', flag_plots=False):
    """
    Based on:
    Najim Dehak, "Modeling Prosodic Features With Joint Factor Analysis for
    Speaker Verification", 2007
    """
    fs, data_audio = read(audio)
    data_audio = data_audio - np.mean(data_audio)
    data_audio = data_audio / float(np.max(np.abs(data_audio)))
    size_frameS = size_frame * float(fs)
    size_stepS = size_step * float(fs)
    overlap = size_stepS / size_frameS
    nF = int((len(data_audio) / size_frameS / overlap)) - 1
    data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
    if pitch_method == 'praat':
        name_audio = audio.split('/')
        temp_uuid = 'pros' + name_audio[-1][0:-4]
        temp_filename_vuv = path_app + '/../tempfiles/tempVUV' + temp_uuid + '.txt'
        temp_filename_f0 = path_app + '/../tempfiles/tempF0' + temp_uuid + '.txt'
        praat_functions.praat_vuv(audio, temp_filename_f0, temp_filename_vuv,
                                  time_stepF0=size_step, minf0=minf0, maxf0=maxf0)
        F0, _ = praat_functions.decodeF0(temp_filename_f0,
                                         len(data_audio) / float(fs), size_step)
        os.remove(temp_filename_vuv)
        os.remove(temp_filename_f0)
    elif pitch_method == 'rapt':
        F0 = pysptk.sptk.rapt(data_audiof, fs, int(size_stepS), min=minf0,
                              max=maxf0, voice_bias=voice_bias, otype='f0')

    # Find the pitch contour of each voiced segment.
    pitchON = np.where(F0 != 0)[0]
    dchange = np.diff(pitchON)
    change = np.where(dchange > 1)[0]
    iniV = pitchON[0]
    featvec = []
    iniVoiced = (pitchON[0] * size_stepS) + size_stepS  # start sample, for energy
    seg_voiced = []
    f0v = []
    Ev = []
    for indx in change:
        finV = pitchON[indx] + 1
        finVoiced = (pitchON[indx] * size_stepS) + size_stepS  # end sample, for energy
        VoicedSeg = data_audio[int(iniVoiced):int(finVoiced)]
        temp = F0[iniV:finV]
        tempvec = []
        if len(VoicedSeg) > int(size_frameS):  # keep only segments longer than one frame
            seg_voiced.append(VoicedSeg)
            # Duration of the voiced segment.
            dur = len(VoicedSeg) / float(fs)
            tempvec.append(dur)
            # Coefficients of a P-th order polynomial fit to the pitch contour.
            x = np.arange(0, len(temp))
            z = np.poly1d(np.polyfit(x, temp, P))
            f0v.append(temp)
            tempvec.extend(z.coeffs)
            # Coefficients of a P-th order polynomial fit to the energy contour.
            temp = E_cont(VoicedSeg, size_frameS, size_stepS, overlap)
            Ev.append(temp)
            x = np.arange(0, len(temp))
            z = np.poly1d(np.polyfit(x, temp, P))
            tempvec.extend(z.coeffs)
            featvec.append(tempvec)
        iniV = pitchON[indx + 1]
        iniVoiced = (pitchON[indx + 1] * size_stepS) + size_stepS

    # Add the last voiced segment.
    finV = pitchON[len(pitchON) - 1]
    finVoiced = (pitchON[len(pitchON) - 1] * size_stepS) + size_stepS
    VoicedSeg = data_audio[int(iniVoiced):int(finVoiced)]
    temp = F0[iniV:finV]
    tempvec = []
    if len(VoicedSeg) > int(size_frameS):
        # Duration of the voiced segment.
        dur = len(VoicedSeg) / float(fs)
        tempvec.append(dur)
        # Pitch coefficients.
        x = np.arange(0, len(temp))
        z = np.poly1d(np.polyfit(x, temp, P))
        tempvec.extend(z.coeffs)
        # Energy coefficients.
        temp = E_cont(VoicedSeg, size_frameS, size_stepS, overlap)
        x = np.arange(0, len(temp))
        z = np.poly1d(np.polyfit(x, temp, P))
        tempvec.extend(z.coeffs)
        featvec.append(tempvec)

    if flag_plots:
        plot_pros(data_audio, fs, F0, seg_voiced, Ev, featvec, f0v)
    return np.asarray(featvec)
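# Usage sketch for prosody_dynamic. Each row of the returned matrix describes
# one voiced segment: its duration, P+1 polynomial coefficients for the pitch
# contour, and P+1 for the energy contour (13 values with the default P=5).
# The path is hypothetical:
#
#     V = prosody_dynamic('audio/sample.wav', P=5, pitch_method='praat')
#     print(V.shape)  # (number of voiced segments, 2*(P+1)+1)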
def intonation_duration(audio, size_step=0.01, minf0=60, maxf0=350, stol=0.150,
                        flag_plots=False):
    """Compute the 14 base intonation and duration measures."""
    fs, data_audio = read(audio)
    temp_filename_f0 = '../tempfiles/pitchtemp.txt'
    temp_filename_vuv = '../tempfiles/voicetemp.txt'
    praat_functions.praat_vuv(audio, temp_filename_f0, temp_filename_vuv,
                              time_stepF0=size_step, minf0=minf0, maxf0=maxf0)
    pitch_z, ttotal = praat_functions.decodeF0(temp_filename_f0,
                                               len(data_audio) / fs, size_step)
    # Slopes of the F0 contour within each voiced segment.
    slopes = []
    # Buffers for the current voiced and unvoiced segments.
    vbuffer = []
    ubuffer = []
    # Log-energy of each voiced and unvoiced segment.
    venergy = []
    uenergy = []
    # Durations of voiced, unvoiced, and silence segments.
    voicedtimes = []
    unvoicedtimes = []
    silencetimes = []
    # Flags marking the start of a voiced or unvoiced segment.
    startvoicedflag = True
    startUNvoicedflag = True
    F0_rec = np.zeros(len(pitch_z))
    for i in range(0, len(pitch_z) - 1):
        if pitch_z[i] >= minf0 and pitch_z[i] <= maxf0:
            # Voiced frame.
            vbuffer.append(pitch_z[i])
            if startvoicedflag:
                t_start_venergy = ttotal[i]  # starting time of the voiced segment
                startvoicedflag = False
                frameF0start = i
            if len(ubuffer) != 0:
                # Close the pending unvoiced segment; its duration follows from
                # the number of buffered frames and the step size.
                samples = len(ubuffer)
                t = float(samples * size_step)
                if t > stol:
                    silencetimes.append(t)  # longer than the tolerance: silence
                else:
                    unvoicedtimes.append(t)
                ubuffer = []
                t_end_uenergy = ttotal[i]
                startUNvoicedflag = True
                # Energy of the raw audio segment delimited by the F0 timestamps.
                n_start_unvoiced = fs * t_start_uenergy
                n_end_unvoiced = fs * t_end_uenergy
                uenergy.append(logEnergy(
                    data_audio[int(n_start_unvoiced):int(n_end_unvoiced)]))
        else:
            if len(vbuffer) != 0:
                # Close the pending voiced segment.
                samples = len(vbuffer)
                t = float(samples * size_step)
                voicedtimes.append(t)
                # Fit a line to the F0 contour of the voiced segment.
                xtemp_slope = list(range(len(vbuffer)))
                tempslope = np.array(vbuffer)
                pol = np.polyfit(xtemp_slope, tempslope, 1)
                if np.isnan(pol[0]):
                    print("#################################")
                    print("detected short voiced segment")
                else:
                    slopes.append(pol[0])
                vbuffer = []
                t_end_venergy = ttotal[i]
                frameF0end = i
                # Reconstruct the F0 contour from the linear fit.
                if np.isnan(pol[0]):
                    F0_rec[int(frameF0start):int(frameF0end)] = tempslope
                else:
                    F0_rec[int(frameF0start):int(frameF0end)] = (
                        pol[0] * np.asarray(xtemp_slope) + pol[1])
                tempslope = []
                xtemp_slope = []
                startvoicedflag = True
                # Energy of the voiced segment from its start/end timestamps.
                n_start_voiced = fs * t_start_venergy
                n_end_voiced = fs * t_end_venergy
                venergy.append(logEnergy(
                    data_audio[int(n_start_voiced):int(n_end_voiced)]))
            ubuffer.append(pitch_z[i])
            if startUNvoicedflag:
                t_start_uenergy = ttotal[i]  # starting time of the unvoiced segment
                startUNvoicedflag = False

    voicedtimes = np.array(voicedtimes)
    unvoicedtimes = np.array(unvoicedtimes)
    silencetimes = np.array(silencetimes)
    uenergy = np.array(uenergy)
    venergy = np.array(venergy)

    # --- Measures ---
    # Intonation
    avgF0slopes = np.average(slopes)  # 1. average F0 slope
    stdF0slopes = np.std(slopes)      # 2. std of the F0 slopes
    # Duration (guard against division by zero when there are no silences)
    if silencetimes.size > 0:
        SVU = np.sum(silencetimes) / (np.sum(voicedtimes) + np.sum(unvoicedtimes))  # 3. S/(V+U)
    else:
        SVU = 0
    VU = np.sum(voicedtimes) / np.sum(unvoicedtimes)  # 4. V/U
    UVU = np.sum(unvoicedtimes) / (np.sum(voicedtimes) + np.sum(unvoicedtimes))  # 5. U/(V+U)
    VVU = np.sum(voicedtimes) / (np.sum(voicedtimes) + np.sum(unvoicedtimes))    # 6. V/(V+U)
    if silencetimes.size > 0:
        VS = np.sum(voicedtimes) / np.sum(silencetimes)    # 7. V/S
        US = np.sum(unvoicedtimes) / np.sum(silencetimes)  # 8. U/S
    else:
        VS = 0
        US = 0
    URD = np.std(unvoicedtimes)  # 9. std U
    VRD = np.std(voicedtimes)    # 10. std V
    URE = np.std(uenergy)        # 11. std energy of unvoiced segments
    VRE = np.std(venergy)        # 12. std energy of voiced segments
    # Mean squared error between the F0 contour and its linear reconstruction.
    MSEF0 = np.mean((np.asarray(pitch_z) - np.asarray(F0_rec))**2)
    if silencetimes.size > 0:
        PR = np.std(silencetimes)  # 13. std S
    else:
        PR = 0
    os.remove(temp_filename_f0)
    os.remove(temp_filename_vuv)
    if flag_plots:
        plt.figure(1)
        plt.plot(ttotal, pitch_z, label="F0 (Hz)", linewidth=2.0)
        plt.plot(ttotal, F0_rec, label="Linear regression of F0", linewidth=2.0)
        plt.text(min(ttotal), max(pitch_z) - 5, "MSE=" + str(np.round(MSEF0, 3)))
        plt.text(min(ttotal), max(pitch_z) - 10,
                 "Avg. tilt=" + str(np.round(avgF0slopes, 3)))
        plt.text(min(ttotal), max(pitch_z) - 15,
                 "Std. tilt=" + str(np.round(stdF0slopes, 3)))
        plt.xlabel("Time (s)")
        plt.ylabel("Frequency (Hz)")
        plt.legend()
        plt.grid(True)
        plt.show()
    return (avgF0slopes, stdF0slopes, MSEF0, SVU, VU, UVU, VVU, VS, US, URD,
            VRD, URE, VRE, PR)
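# Usage sketch for this simpler intonation_duration variant, which returns only
# the 14 base measures (hypothetical path):
#
#     (avgF0slopes, stdF0slopes, MSEF0, SVU, VU, UVU, VVU, VS, US, URD, VRD,
#      URE, VRE, PR) = intonation_duration('audio/sample.wav')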
def prosody_static(audio, flag_plots, pitch_method='praat'):
    """Compute static F0, energy, and duration features for an utterance."""
    fs, data_audio = read(audio)
    data_audio = data_audio - np.mean(data_audio)
    data_audio = data_audio / float(np.max(np.abs(data_audio)))
    size_frameS = 0.02 * float(fs)
    size_stepS = 0.01 * float(fs)
    thr_len_pause = 0.14 * float(fs)
    thr_en_pause = 10 * np.log10(0.02)
    overlap = size_stepS / size_frameS
    nF = int((len(data_audio) / size_frameS / overlap)) - 1
    if pitch_method == 'praat':
        temp_uuid = audio.split('/')[-1][0:-4]
        temp_filename_f0 = path_app + '/../tempfiles/tempF0' + temp_uuid + '.txt'
        temp_filename_vuv = path_app + '/../tempfiles/tempVUV' + temp_uuid + '.txt'
        praat_functions.praat_vuv(audio, temp_filename_f0, temp_filename_vuv,
                                  time_stepF0=0.01, minf0=60, maxf0=350)
        F0, _ = praat_functions.decodeF0(temp_filename_f0,
                                         len(data_audio) / float(fs), 0.01)
        os.remove(temp_filename_f0)
        os.remove(temp_filename_vuv)
    elif pitch_method == 'rapt':
        data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
        F0 = pysptk.sptk.rapt(data_audiof, fs, int(size_stepS), min=60,
                              max=350, voice_bias=-0.2, otype='f0')
    segmentsV = V_UV(F0, data_audio, fs, type_seg="Voiced", size_stepS=size_stepS)
    segmentsUP = V_UV(F0, data_audio, fs, type_seg="Unvoiced", size_stepS=size_stepS)
    # Split the unvoiced segments into pauses and short unvoiced sounds by length.
    segmentsP = []
    segmentsU = []
    for k in range(len(segmentsUP)):
        eu = logEnergy(segmentsUP[k])
        if len(segmentsUP[k]) > thr_len_pause:
            segmentsP.append(segmentsUP[k])
        else:
            segmentsU.append(segmentsUP[k])
    F0_features = F0feat(F0)
    energy_featuresV = energy_feat(segmentsV, fs, size_frameS, size_stepS)
    energy_featuresU = energy_feat(segmentsU, fs, size_frameS, size_stepS)
    duration_features = duration_feat(segmentsV, segmentsU, segmentsP,
                                      data_audio, fs)
    if flag_plots:
        plot_pros(data_audio, fs, F0, segmentsV, segmentsU)
    features = np.hstack((F0_features, energy_featuresV, energy_featuresU,
                          duration_features))
    return features
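# Usage sketch for prosody_static. The path is hypothetical; the returned
# vector stacks the F0, voiced/unvoiced energy, and duration statistics
# computed by F0feat, energy_feat, and duration_feat:
#
#     feats = prosody_static('audio/sample.wav', flag_plots=False)
#     print(feats.shape)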