Example #1
def getF0(audio_file, size_step=0.02, minF0=60, maxF0=500):
    # Read the signal so its duration (len(data_audio)/fs) is available for decodeF0 below.
    fs, data_audio = read(audio_file)
    name_audio = audio_file.split('/')
    temp_uuid = 'phon' + name_audio[-1][0:-4]
    temp_filename_vuv = '../tempfiles/tempVUV' + temp_uuid + '.txt'
    temp_filename_f0 = '../tempfiles/tempF0' + temp_uuid + '.txt'
    praat_functions.praat_vuv(audio_file,
                              temp_filename_f0,
                              temp_filename_vuv,
                              time_stepF0=size_step,
                              minf0=minF0,
                              maxf0=maxF0)
    F0, _ = praat_functions.decodeF0(temp_filename_f0,
                                     len(data_audio) / float(fs), size_step)
    os.remove(temp_filename_vuv)
    os.remove(temp_filename_f0)
    return F0
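A minimal usage sketch for getF0 (the wav path is illustrative only; numpy is assumed to be imported as np, and praat_functions, a working Praat installation, and the ../tempfiles/ directory are required):

F0 = getF0('../audios/sample.wav', size_step=0.02, minF0=60, maxF0=500)  # hypothetical path
F0 = np.asarray(F0)
print(len(F0), np.mean(F0[F0 > 0]))  # number of analysis frames and mean voiced F0 in Hz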
Example #2
def articulation_continuous(audio_filename,
                            flag_plots,
                            sizeframe=0.04,
                            step=0.02,
                            nB=22,
                            nMFCC=12,
                            minf0=60,
                            maxf0=350,
                            voice_bias=-0.5,
                            len_thr_miliseconds=270.0,
                            pitch_method='praat'):

    fs, data_audio = read(audio_filename)
    data_audio = data_audio - np.mean(data_audio)
    data_audio = data_audio / float(np.max(np.abs(data_audio)))
    size_frameS = sizeframe * float(fs)
    size_stepS = step * float(fs)
    overlap = size_stepS / size_frameS

    if pitch_method == 'praat':
        name_audio = audio_filename.split('/')
        temp_uuid = 'artic' + name_audio[-1][0:-4]
        temp_filename_vuv = '../tempfiles/tempVUV' + temp_uuid + '.txt'
        temp_filename_f0 = '../tempfiles/tempF0' + temp_uuid + '.txt'
        praat_functions.praat_vuv(audio_filename,
                                  temp_filename_f0,
                                  temp_filename_vuv,
                                  time_stepF0=step,
                                  minf0=minf0,
                                  maxf0=maxf0)
        F0, _ = praat_functions.decodeF0(temp_filename_f0,
                                         len(data_audio) / float(fs), step)
        segmentsFull, segmentsOn, segmentsOff = praat_functions.read_textgrid_trans(
            temp_filename_vuv, data_audio, fs, sizeframe)
        os.remove(temp_filename_vuv)
        os.remove(temp_filename_f0)
    elif pitch_method == 'rapt':
        data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
        F0 = pysptk.sptk.rapt(data_audiof,
                              fs,
                              int(size_stepS),
                              min=minf0,
                              max=maxf0,
                              voice_bias=voice_bias,
                              otype='f0')
        # segments = read_Textgrid(path_base + 'vuv.txt', file_audio, win_trans)  # path_base, file_audio and win_trans are undefined here; V_UV below yields the segments
        segmentsOn = V_UV(F0, data_audio, fs, 'onset')
        segmentsOff = V_UV(F0, data_audio, fs, 'offset')

    BBEon, MFCCon = extractTrans(segmentsOn, fs, size_frameS, size_stepS, nB,
                                 nMFCC)
    BBEoff, MFCCoff = extractTrans(segmentsOff, fs, size_frameS, size_stepS,
                                   nB, nMFCC)

    DMFCCon = np.asarray(
        [np.diff(MFCCon[:, nf], n=1) for nf in range(MFCCon.shape[1])]).T
    DDMFCCon = np.asarray(
        [np.diff(MFCCon[:, nf], n=2) for nf in range(MFCCon.shape[1])]).T

    DMFCCoff = np.asarray(
        [np.diff(MFCCoff[:, nf], n=1) for nf in range(MFCCoff.shape[1])]).T
    DDMFCCoff = np.asarray(
        [np.diff(MFCCoff[:, nf], n=2) for nf in range(MFCCoff.shape[1])]).T

    # TODO: Make parameters configurable. (If worth it)
    name_audio = audio_filename.split('/')
    temp_uuid = 'artic' + name_audio[-1][0:-4]
    temp_filename = '../tempfiles/tempFormants' + temp_uuid + '.txt'
    praat_functions.praat_formants(audio_filename, temp_filename, sizeframe,
                                   step)
    [F1, F2] = praat_functions.decodeFormants(temp_filename)
    os.remove(temp_filename)

    if len(F0) < len(F1):
        F0 = np.hstack((F0, np.zeros(len(F1) - len(F0))))
    else:
        F1 = np.hstack((F1, np.zeros(len(F0) - len(F1))))
        F2 = np.hstack((F2, np.zeros(len(F0) - len(F2))))

    # Frame indices where F0 == 0 (unvoiced/silence), grouped into contiguous runs.
    pos0 = np.where(F0 == 0)[0]
    dpos0 = np.hstack(([1], np.diff(pos0)))
    f0u = np.split(pos0, np.where(dpos0 > 1)[0])

    # Minimum run length (in frames) for an unvoiced stretch to count as silence.
    thr_sil = int(len_thr_miliseconds / step)

    sil_seg = []
    for l in range(len(f0u)):
        if len(f0u[l]) >= thr_sil:
            # Zero the formant tracks over long pauses so they are excluded from F1nz/F2nz below.
            F1[f0u[l]] = 0
            F2[f0u[l]] = 0
        sil_seg.append(f0u[l])

    sil_seg = np.hstack(sil_seg)

    F1nz = F1[F1 != 0]
    F2nz = F2[F2 != 0]
    DF1 = np.diff(F1, n=1)
    DF2 = np.diff(F2, n=1)
    DDF1 = np.diff(F1, n=2)
    DDF2 = np.diff(F2, n=2)

    if flag_plots:
        plot_art(data_audio, fs, F0, F1, F2, segmentsOn, segmentsOff)

    return BBEon, MFCCon, DMFCCon, DDMFCCon, BBEoff, MFCCoff, DMFCCoff, DDMFCCoff, F1nz, DF1, DDF1, F2nz, DF2, DDF2
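The function returns frame-level matrices (BBE and MFCC over voiced onsets and offsets, plus first and second MFCC differences) together with the non-zero formant tracks and their derivatives. A minimal sketch of turning these into one static vector per recording, assuming the BBE/MFCC outputs are frame-by-coefficient arrays (the path is illustrative only):

feats = articulation_continuous('../audios/sample.wav', flag_plots=False)  # hypothetical path
static = np.hstack([np.hstack((np.mean(f, axis=0), np.std(f, axis=0)))
                    if np.ndim(f) > 1 else np.array([np.mean(f), np.std(f)])
                    for f in feats])
print(static.shape)  # mean and std of every returned descriptor, stacked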
Example #3
def intonation_duration(audio,
                        size_step=0.01,
                        minf0=60,
                        maxf0=350,
                        stol=0.150,
                        flag_plots=False):
    fs, data_audio = read(audio)
    data_audio = data_audio - np.mean(data_audio)
    data_audio = data_audio / float(np.max(np.abs(data_audio)))

    temp_filename_f0 = path_app + '/../tempfiles/pitchtemp.txt'
    temp_filename_vuv = path_app + '/../tempfiles/voicetemp.txt'

    praat_functions.praat_vuv(audio,
                              temp_filename_f0,
                              temp_filename_vuv,
                              time_stepF0=size_step,
                              minf0=minf0,
                              maxf0=maxf0,
                              path_praat_script=path_app + "/../praat")
    pitch_z, ttotal = praat_functions.decodeF0(temp_filename_f0,
                                               len(data_audio) / fs, size_step)

    #Slopes
    slopes = []
    #buffers for voiced and unvoiced segments
    vbuffer = []
    ubuffer = []
    #energy for total voiced and unvoiced segments
    venergy = []
    uenergy = []
    #arrays for time-storing
    voicedtimes = []
    unvoicedtimes = []
    silencetimes = []
    #flag for starting point voiced time and unvoiced time
    startvoicedflag = True
    startUNvoicedflag = True
    #flag to compare with last segment
    recordneighbor = True
    energydifflocalneighbors = []

    F0_rec = np.zeros(len(pitch_z))
    slopesE = []
    for i in range(0, len(pitch_z) - 1):
        #condition for voiced segment
        if pitch_z[i] >= minf0 and pitch_z[i] <= maxf0:
            vbuffer.append(pitch_z[i])
            #voiced segment starting time
            if (startvoicedflag):
                t_start_venergy = ttotal[i]
                startvoicedflag = False
                frameF0start = i

            if len(ubuffer) != 0:
                samples = len(ubuffer)

                t = float(samples * size_step
                          )  #unvoiced time based on F0 Fs and actual samples
                #silence condition
                if t > stol:
                    silencetimes.append(t)
                else:
                    unvoicedtimes.append(t)

                #clear the mess
                ubuffer = []
                #final time for unvoiced
                t_end_uenergy = ttotal[i]
                startUNvoicedflag = True
                #calculate segments with obtained times
                n_start_unvoiced = fs * t_start_uenergy
                n_end_unvoiced = fs * t_end_uenergy
                #energy of real audio segment based on fs and timestamp from F0
                #store
                uenergy.append(
                    logEnergy(
                        data_audio[int(n_start_unvoiced):int(n_end_unvoiced)]))
        #start appending unvoiced segments
        else:
            if (len(vbuffer) != 0):
                #based on F0 Fs and in buffer length, actual time is calculated
                samples = len(vbuffer)
                t = float(samples * size_step)
                #pick up voiced times
                voicedtimes.append(t)
                #voiced segment slope process
                #temporal x axis vector for slope calculation
                xtemp_slope = []
                tempslope = np.array(vbuffer)
                for j in range(0, len(vbuffer)):
                    xtemp_slope.append(j)
                #get slopes of voiced segments

                if len(xtemp_slope) > 1:
                    pol = np.polyfit(xtemp_slope, tempslope, 1)
                    if not np.isnan(pol[0]):
                        slopes.append(pol[0])
                else:
                    pol = [np.nan, np.nan]
                    print("detected short voiced segment", len(xtemp_slope))
                    #print(xtemp_slope, tempslope)
                #slopes.append(np.average(np.diff(tempslope)) / np.average(np.diff(xtemp_slope)))

                #clear the mess

                vbuffer = []

                #final time of voiced segment
                t_end_venergy = ttotal[i]
                frameF0end = i
                if np.isnan(pol[0]):
                    F0_rec[int(frameF0start):int(frameF0end)] = tempslope
                else:
                    F0_rec[int(frameF0start):int(frameF0end)] = pol[
                        0] * np.asarray(xtemp_slope) + pol[1]

                tempslope = []
                xtemp_slope = []
                startvoicedflag = True
                #calculate how many segments are in voiced time on the original audio file, based on start-end time stamps
                n_start_voiced = fs * t_start_venergy
                n_end_voiced = fs * t_end_venergy
                #calculate energy and make venergy append the result
                envoiced = logEnergy(
                    data_audio[int(n_start_voiced):int(n_end_voiced)])
                venergy.append(envoiced)

                #store last element energy in neighbor, at next iteration calculate local  and operate
                if recordneighbor:
                    recordneighbor = False
                    neighbor = logEnergy(
                        data_audio[int(n_start_voiced):int(n_end_voiced)])
                else:
                    recordneighbor = True
                    local = logEnergy(
                        data_audio[int(n_start_voiced):int(n_end_voiced)])
                    local = np.array(local)
                    neighbor = np.array(neighbor)
                    # energy difference between the current and the previous voiced segment
                    energydifflocalneighbors.append(
                        abs(np.mean(local) - np.mean(neighbor)))

            else:
                ubuffer.append(pitch_z[i])
                #initial time of unvoiced segment
                if (startUNvoicedflag):
                    t_start_uenergy = ttotal[i]
                    startUNvoicedflag = False

    #if last segment was not computed with the next one then
    #compute it with the previous one
    start = True
    end = False
    #record last segment
    if recordneighbor == False:
        for i in range(len(pitch_z) - 1, 0, -1):  # walk backwards from the last frame
            if pitch_z[i] >= minf0 and pitch_z[i] <= maxf0:
                if start == True:
                    startseg = i
                    start = False
            else:
                if end == False:
                    endseg = i
                    end = True
            if (end == True):
                #retrieve from timestamp in F0 the actual time segments
                startseg = fs * ttotal[startseg]
                endseg = fs * ttotal[endseg]
                #compute energy
                lastseg = logEnergy(data_audio[int(startseg):int(endseg)])
                #cast as array
                local = np.array(lastseg)
                neighbor = np.array(neighbor)
                #take mean difference between them
                energydifflocalneighbors.append(
                    abs(np.mean(local) - np.mean(neighbor)))
                break

    energydifflocalneighbors = np.array(energydifflocalneighbors)

    voicedtimes = np.array(voicedtimes)
    unvoicedtimes = np.array(unvoicedtimes)

    silencetimes = np.array(silencetimes)
    #print(unvoicedtimes, silencetimes)
    uenergy = np.array(uenergy)
    venergy = np.array(venergy)
    """Measures"""
    """Intonation"""
    avgF0slopes = np.average(slopes)  # 1. average F0 slope
    stdF0slopes = np.std(slopes)  # 2. std F0 slope
    """Duration"""
    if ((silencetimes.size > 0)):
        SVU = (np.sum(silencetimes)) / (
            np.sum(voicedtimes) + np.sum(unvoicedtimes))  #  3.S/(V+U)
    else:
        SVU = 0
    VU = (np.sum(voicedtimes)) / np.sum(unvoicedtimes)  #  4.V/U
    UVU = np.sum(unvoicedtimes) / (np.sum(voicedtimes) + np.sum(unvoicedtimes)
                                   )  #  5.U/(V+U)
    VVU = np.sum(voicedtimes) / (np.sum(voicedtimes) + np.sum(unvoicedtimes)
                                 )  #  6.V/V+U

    if ((silencetimes.size > 0)):
        VS = np.sum(voicedtimes) / np.sum(silencetimes)  # 7. V/S
        US = np.sum(unvoicedtimes) / np.sum(silencetimes)  # 8. U/S
    else:
        VS = 0
        US = 0

    URD = np.std(unvoicedtimes)  # 9. (std U)
    VRD = np.std(voicedtimes)  # 10. (std V)

    URE = np.std(uenergy)  # 11. (std Energy U)
    VRE = np.std(venergy)  # 12. (std Energy V)
    MSEF0 = np.mean((np.asarray(pitch_z) - np.asarray(F0_rec))**2)
    if ((silencetimes.size > 0)):  # 13. (std S)
        PR = np.std(silencetimes)
    else:
        PR = 0

    os.remove(temp_filename_f0)
    os.remove(temp_filename_vuv)

    #nextmeasures
    maxvoicedlen = np.max(voicedtimes)  #max voiced duration
    maxunvoicedlen = np.max(unvoicedtimes)  #max unvoiced duration
    minvoicedlen = np.min(voicedtimes)  #min voiced duration
    minunvoicedlen = np.min(unvoicedtimes)  #min unvoiced duration
    rvuv = len(voicedtimes) / len(
        unvoicedtimes)  #ratio voiced unvoiced segments
    #meansqrd error voiced energy segments and voiced energy segments regression coefficient
    energyslope, intercept, RegCoefenergy, p_value, std_err = st.linregress(
        venergy, np.arange(len(venergy)))
    t = np.arange(len(venergy))
    energyslope1 = np.polyval([energyslope, intercept], t)
    msqerrenergy = mean_squared_error(energyslope1, venergy)
    #mean sqrd error voiced f0 and f0 regression coefficient
    pitch_znz = pitch_z[pitch_z > 0]  # voiced (non-zero) F0 values only
    F0slope, intercept, RegCoeff0, p_value, std_err = st.linregress(
        pitch_znz, np.arange(len(pitch_znz)))
    #neighbor segment measures
    meanNeighborenergydiff = np.mean(energydifflocalneighbors)
    stdNeighborenergydiff = np.std(energydifflocalneighbors)

    if flag_plots:
        plt.figure(1)
        plt.plot(ttotal, pitch_z, label="F0 (Hz)", linewidth=2.0)
        plt.plot(ttotal, F0_rec, label="Linear regression F0", linewidth=2.0)
        plt.text(min(ttotal),
                 max(pitch_z) - 5, "MSE=" + str(np.round(MSEF0, 3)))
        plt.text(min(ttotal),
                 max(pitch_z) - 10,
                 "Avg. tilt=" + str(np.round(avgF0slopes, 3)))
        plt.text(min(ttotal),
                 max(pitch_z) - 15,
                 "Std. tilt=" + str(np.round(stdF0slopes, 3)))
        plt.text(min(ttotal),
                 max(pitch_z) - 20, "R^2=" + str(np.round(RegCoeff0, 3)))

        plt.xlabel("Time (s)")
        plt.ylabel("Frequency (Hz)")
        plt.legend()

        plt.grid(True)
        plt.show()

    return avgF0slopes, stdF0slopes, MSEF0, SVU, VU, UVU, VVU, VS, US, URD, VRD, URE, VRE, PR, maxvoicedlen, maxunvoicedlen, minvoicedlen, minunvoicedlen, rvuv, energyslope, RegCoefenergy, msqerrenergy, RegCoeff0, meanNeighborenergydiff, stdNeighborenergydiff, F0_rec, pitch_z, venergy, uenergy
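The intonation measures above rest on fitting a first-order polynomial to the F0 values of each voiced segment: the slope pol[0] is collected into slopes, and the fitted line is written into F0_rec so that MSEF0 compares the observed contour with its piecewise-linear reconstruction. A standalone sketch of that single step, with purely illustrative numbers:

import numpy as np
segment_f0 = np.array([120., 125., 131., 136., 140.])  # hypothetical voiced F0 buffer (Hz)
x = np.arange(len(segment_f0))
slope, intercept = np.polyfit(x, segment_f0, 1)  # degree-1 fit, as in the loop above
f0_reconstructed = slope * x + intercept
mse = np.mean((segment_f0 - f0_reconstructed) ** 2)
print(slope, mse)  # rising tilt of about 5 Hz per frame, small residual error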
Example #4
def phonationVowels(audio, flag_plots, size_frame=0.04,size_step=0.02,minf0=60,maxf0=350, voice_bias=-0.2,energy_thr_percent=0.025, pitch_method='praat'):



    fs, data_audio=read(audio)
    data_audio=data_audio-np.mean(data_audio)
    data_audio=data_audio/float(np.max(np.abs(data_audio)))
    size_frameS=size_frame*float(fs)
    size_stepS=size_step*float(fs)
    overlap=size_stepS/size_frameS
    if pitch_method == 'praat':
        name_audio=audio.split('/')
        temp_uuid='phon'+name_audio[-1][0:-4]
        if not os.path.exists('../tempfiles/'):
            os.makedirs('../tempfiles/')
        temp_filename_vuv='../tempfiles/tempVUV'+temp_uuid+'.txt'
        temp_filename_f0='../tempfiles/tempF0'+temp_uuid+'.txt'
        praat_functions.praat_vuv(audio, temp_filename_f0, temp_filename_vuv, time_stepF0=size_step, minf0=minf0, maxf0=maxf0)
        F0,_=praat_functions.decodeF0(temp_filename_f0,len(data_audio)/float(fs),size_step)
        #os.remove(temp_filename_vuv)
        #os.remove(temp_filename_f0)
    elif pitch_method == 'rapt':
        data_audiof=np.asarray(data_audio*(2**15), dtype=np.float32)
        F0=pysptk.sptk.rapt(data_audiof, fs, int(size_stepS), min=minf0, max=maxf0, voice_bias=voice_bias, otype='f0')
    F0nz=F0[F0!=0]
    Jitter=jitter_env(F0nz, len(F0nz))

    nF=int((len(data_audio)/size_frameS/overlap))-1
    Amp=[]
    logE=[]
    apq=[]
    ppq=[]

    DF0=np.diff(F0nz, 1)
    DDF0=np.diff(DF0,1)

    F0z=F0[F0==0]
    totaldurU=len(F0z)

    thresholdE=10*logEnergy([energy_thr_percent])
    degreeU=100*float(totaldurU)/len(F0)
    lnz=0
    for l in range(nF):
        data_frame=data_audio[int(l*size_stepS):int(l*size_stepS+size_frameS)]
        energy=10*logEnergy(data_frame)
        if F0[l]!=0:
            Amp.append(np.max(np.abs(data_frame)))
            logE.append(10*logEnergy(data_frame))
            if lnz>=12: # TODO:
                amp_arr=np.asarray([Amp[j] for j in range(lnz-12, lnz)])
                #print(amp_arr)
                apq.append(APQ(amp_arr))
            if lnz>=6: # TODO:
                f0arr=np.asarray([F0nz[j] for j in range(lnz-6, lnz)])
                ppq.append(PPQ(1/f0arr))
            lnz=lnz+1
        print("frame "+str(l) +" from "+str(nF)+"-"*int(100*l/nF)+">"+str(int(100*(l+1)/nF))+"%", sep=' ', end='\r', flush=True)

    Shimmer=shimmer_env(Amp, len(Amp))
    apq=np.asarray(apq)
    ppq=np.asarray(ppq)
    logE=np.asarray(logE)
    F0semi=np.asarray([Hz2semitones(F0nz[l]) for l in range(len(F0nz))])

    if flag_plots:
        plot_phon(data_audio,fs,F0,logE)

    print("Jitter=", len(Jitter))
    print("Shimmer", len(Shimmer))
    print("APQ", len(apq))
    print("PPQ", len(ppq))
    print("DF0", len(DF0))
    print("DDF0", len(DDF0))
    print("Energy", len(logE))
    print("degree unvoiced",degreeU)

    return F0, DF0, DDF0, F0semi, Jitter, Shimmer, apq, ppq, logE, degreeU
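A minimal usage sketch, assuming a short sustained-vowel recording at a hypothetical path; the per-frame measures are reduced to means for a compact summary:

out = phonationVowels('../audios/vowel_a.wav', False)  # hypothetical path, no plots
F0, DF0, DDF0, F0semi, Jitter, Shimmer, apq, ppq, logE, degreeU = out
print(np.mean(Jitter), np.mean(Shimmer), np.mean(apq), np.mean(ppq), degreeU)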
Example #5
def prosody_dynamic(audio,
                    size_frame=0.03,
                    size_step=0.01,
                    minf0=60,
                    maxf0=350,
                    voice_bias=-0.2,
                    energy_thr_percent=0.025,
                    P=5,
                    pitch_method='praat',
                    flag_plots=False):
    """
    Based on:
    Najim Dehak, "Modeling Prosodic Features With Joint Factor Analysis for Speaker Verification", 2007
    """
    fs, data_audio = read(audio)
    data_audio = data_audio - np.mean(data_audio)
    data_audio = data_audio / float(np.max(np.abs(data_audio)))
    size_frameS = size_frame * float(fs)
    size_stepS = size_step * float(fs)
    overlap = size_stepS / size_frameS
    nF = int((len(data_audio) / size_frameS / overlap)) - 1
    data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
    if pitch_method == 'praat':
        name_audio = audio.split('/')
        temp_uuid = 'pros' + name_audio[-1][0:-4]
        temp_filename_vuv = path_app + '/../tempfiles/tempVUV' + temp_uuid + '.txt'
        temp_filename_f0 = path_app + '/../tempfiles/tempF0' + temp_uuid + '.txt'
        praat_functions.praat_vuv(audio,
                                  temp_filename_f0,
                                  temp_filename_vuv,
                                  time_stepF0=size_step,
                                  minf0=minf0,
                                  maxf0=maxf0)
        F0, _ = praat_functions.decodeF0(temp_filename_f0,
                                         len(data_audio) / float(fs),
                                         size_step)
        os.remove(temp_filename_vuv)
        os.remove(temp_filename_f0)
    elif pitch_method == 'rapt':
        F0 = pysptk.sptk.rapt(data_audiof,
                              fs,
                              int(size_stepS),
                              min=minf0,
                              max=maxf0,
                              voice_bias=voice_bias,
                              otype='f0')

    #Find pitch contour of EACH voiced segment
    pitchON = np.where(F0 != 0)[0]
    dchange = np.diff(pitchON)
    change = np.where(dchange > 1)[0]
    iniV = pitchON[0]

    featvec = []
    iniVoiced = (pitchON[0] * size_stepS) + size_stepS  #To compute energy
    seg_voiced = []
    f0v = []
    Ev = []
    for indx in change:
        finV = pitchON[indx] + 1
        finVoiced = (pitchON[indx] *
                     size_stepS) + size_stepS  #To compute energy
        VoicedSeg = data_audio[int(iniVoiced):int(
            finVoiced)]  #To compute energy
        temp = F0[iniV:finV]
        tempvec = []
        if len(VoicedSeg) > int(
                size_frameS):  #Take only segments greater than frame size
            seg_voiced.append(VoicedSeg)
            #Compute duration
            dur = len(VoicedSeg) / float(fs)
            tempvec.append(dur)
            #Pitch coefficients
            x = np.arange(0, len(temp))
            z = np.poly1d(np.polyfit(x, temp, P))
            f0v.append(temp)
            #fitCoeff.append(z.coeffs)
            tempvec.extend(z.coeffs)
            #Energy coefficients
            temp = E_cont(VoicedSeg, size_frameS, size_stepS, overlap)
            Ev.append(temp)
            x = np.arange(0, len(temp))
            z = np.poly1d(np.polyfit(x, temp, P))
            tempvec.extend(z.coeffs)
            featvec.append(tempvec)
        iniV = pitchON[indx + 1]
        iniVoiced = (pitchON[indx + 1] *
                     size_stepS) + size_stepS  #To compute energy

    #Add the last voiced segment
    finV = (pitchON[len(pitchON) - 1])
    finVoiced = (pitchON[len(pitchON) - 1] *
                 size_stepS) + size_stepS  #To compute energy
    VoicedSeg = data_audio[int(iniVoiced):int(finVoiced)]  #To compute energy
    temp = F0[iniV:finV]
    tempvec = []
    if len(VoicedSeg) > int(
            size_frameS):  #Take only segments greater than frame size
        #Compute duration
        dur = len(VoicedSeg) / float(fs)
        tempvec.append(dur)
        x = np.arange(0, len(temp))
        z = np.poly1d(np.polyfit(x, temp, P))
        tempvec.extend(z.coeffs)
        #Energy coefficients
        temp = E_cont(VoicedSeg, size_frameS, size_stepS, overlap)
        x = np.arange(0, len(temp))
        z = np.poly1d(np.polyfit(x, temp, P))
        tempvec.extend(z.coeffs)
        #Compute duration
        featvec.append(tempvec)

    if flag_plots:
        plot_pros(data_audio, fs, F0, seg_voiced, Ev, featvec, f0v)

    return np.asarray(featvec)
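Each row of the returned matrix describes one voiced segment: its duration, followed by the P+1 coefficients of the degree-P polynomial fitted to the F0 contour and the P+1 coefficients fitted to the energy contour, i.e. 2P+3 values per segment (13 with the default P=5). A hypothetical call:

V = prosody_dynamic('../audios/sample.wav', P=5)  # path is illustrative only
print(V.shape)  # (number of voiced segments, 2*P + 3), i.e. (n, 13) for P=5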
Example #6
def intonation_duration(audio,
                        size_step=0.01,
                        minf0=60,
                        maxf0=350,
                        stol=0.150,
                        flag_plots=False):
    fs, data_audio = read(audio)
    temp_filename_f0 = '../tempfiles/pitchtemp.txt'
    temp_filename_vuv = '../tempfiles/voicetemp.txt'

    praat_functions.praat_vuv(audio,
                              temp_filename_f0,
                              temp_filename_vuv,
                              time_stepF0=size_step,
                              minf0=minf0,
                              maxf0=maxf0)
    pitch_z, ttotal = praat_functions.decodeF0(temp_filename_f0,
                                               len(data_audio) / fs, size_step)

    #Slopes
    slopes = []
    #buffers for voiced and unvoiced segments
    vbuffer = []
    ubuffer = []
    #energy for total voiced and unvoiced segments
    venergy = []
    uenergy = []
    #arrays for time-storing
    voicedtimes = []
    unvoicedtimes = []
    silencetimes = []
    #flag for starting point voiced time and unvoiced time
    startvoicedflag = True
    startUNvoicedflag = True
    F0_rec = np.zeros(len(pitch_z))
    for i in range(0, len(pitch_z) - 1):
        #condition for voiced segment
        if pitch_z[i] >= minf0 and pitch_z[i] <= maxf0:
            vbuffer.append(pitch_z[i])
            #voiced segment starting time
            if (startvoicedflag):
                t_start_venergy = ttotal[i]
                startvoicedflag = False
                frameF0start = i

            if len(ubuffer) != 0:
                samples = len(ubuffer)

                t = float(samples * size_step
                          )  #unvoiced time based on F0 Fs and actual samples
                #silence condition
                if t > stol:
                    silencetimes.append(t)
                else:
                    unvoicedtimes.append(t)

                #clear the mess
                ubuffer = []
                #final time for unvoiced
                t_end_uenergy = ttotal[i]
                startUNvoicedflag = True
                #calculate segments with obtained times
                n_start_unvoiced = fs * t_start_uenergy
                n_end_unvoiced = fs * t_end_uenergy
                #energy of real audio segment based on fs and timestamp from F0
                #store
                uenergy.append(
                    logEnergy(
                        data_audio[int(n_start_unvoiced):int(n_end_unvoiced)]))
        #start appending unvoiced segments
        else:
            if (len(vbuffer) != 0):
                #based on F0 Fs and in buffer length, actual time is calculated
                samples = len(vbuffer)
                t = float(samples * size_step)
                #pick up voiced times
                voicedtimes.append(t)
                #voiced segment slope process
                #temporal x axis vector for slope calculation
                xtemp_slope = []
                tempslope = np.array(vbuffer)
                for j in range(0, len(vbuffer)):
                    xtemp_slope.append(j)
                #get slopes of voiced segments

                pol = np.polyfit(xtemp_slope, tempslope, 1)
                if np.isnan(pol[0]):
                    print("#################################")
                    print("detected short voiced segment")
                    #print(xtemp_slope, tempslope)
                else:
                    slopes.append(pol[0])
                #slopes.append(np.average(np.diff(tempslope)) / np.average(np.diff(xtemp_slope)))

                #clear the mess

                vbuffer = []

                #final time of voiced segment
                t_end_venergy = ttotal[i]
                frameF0end = i
                if np.isnan(pol[0]):
                    F0_rec[int(frameF0start):int(frameF0end)] = tempslope
                else:
                    F0_rec[int(frameF0start):int(frameF0end)] = pol[
                        0] * np.asarray(xtemp_slope) + pol[1]

                tempslope = []
                xtemp_slope = []
                startvoicedflag = True
                #calculate how many segments are in voiced time on the original audio file, based on start-end time stamps
                n_start_voiced = fs * t_start_venergy
                n_end_voiced = fs * t_end_venergy
                #calculate energy and make venergy append the result
                venergy.append(
                    logEnergy(
                        data_audio[int(n_start_voiced):int(n_end_voiced)]))
            else:
                ubuffer.append(pitch_z[i])
                #initial time of unvoiced segment
                if (startUNvoicedflag):
                    t_start_uenergy = ttotal[i]
                    startUNvoicedflag = False

    voicedtimes = np.array(voicedtimes)
    unvoicedtimes = np.array(unvoicedtimes)

    silencetimes = np.array(silencetimes)
    #print(unvoicedtimes, silencetimes)
    uenergy = np.array(uenergy)
    venergy = np.array(venergy)
    """Measures"""
    """Intonation"""
    avgF0slopes = np.average(slopes)  # 1. average F0 slope
    stdF0slopes = np.std(slopes)  # 2. std F0 slope
    """Duration"""
    if ((silencetimes.size > 0)):
        SVU = (np.sum(silencetimes)) / (
            np.sum(voicedtimes) + np.sum(unvoicedtimes))  #  3.S/(V+U)
    else:
        SVU = 0
    VU = (np.sum(voicedtimes)) / np.sum(unvoicedtimes)  #  4.V/U
    UVU = np.sum(unvoicedtimes) / (np.sum(voicedtimes) + np.sum(unvoicedtimes)
                                   )  #  5.U/(V+U)
    VVU = np.sum(voicedtimes) / (np.sum(voicedtimes) + np.sum(unvoicedtimes)
                                 )  #  6.V/V+U
    # if there are no silence segments, avoid dividing by zero
    if ((silencetimes.size > 0)):
        VS = np.sum(voicedtimes) / np.sum(silencetimes)  # 7. V/S
        US = np.sum(unvoicedtimes) / np.sum(silencetimes)  # 8. U/S
    else:
        VS = 0
        US = 0

    URD = np.std(unvoicedtimes)  # 9. (std U)
    VRD = np.std(voicedtimes)  # 10. (std V)

    URE = np.std(uenergy)  # 11. (std Energy U)
    VRE = np.std(venergy)  # 12. (std Energy V)
    MSEF0 = np.mean((np.asarray(pitch_z) - np.asarray(F0_rec))**2)
    if ((silencetimes.size > 0)):  # 13. (std S)
        PR = np.std(silencetimes)
    else:
        PR = 0

    os.remove(temp_filename_f0)
    os.remove(temp_filename_vuv)

    if flag_plots:
        plt.figure(1)
        plt.plot(ttotal, pitch_z, label="F0 (Hz)", linewidth=2.0)
        plt.plot(ttotal, F0_rec, label="Linear regression F0", linewidth=2.0)
        plt.text(min(ttotal),
                 max(pitch_z) - 5, "MSE=" + str(np.round(MSEF0, 3)))
        plt.text(min(ttotal),
                 max(pitch_z) - 10,
                 "Avg. tilt=" + str(np.round(avgF0slopes, 3)))
        plt.text(min(ttotal),
                 max(pitch_z) - 15,
                 "Std. tilt=" + str(np.round(stdF0slopes, 3)))
        plt.xlabel("Time (s)")
        plt.ylabel("Frequency (Hz)")
        plt.legend()

        plt.grid(True)
        plt.show()

    return avgF0slopes, stdF0slopes, MSEF0, SVU, VU, UVU, VVU, VS, US, URD, VRD, URE, VRE, PR
Example #7
def prosody_static(audio, flag_plots, pitch_method='praat'):

    fs, data_audio = read(audio)
    data_audio = data_audio - np.mean(data_audio)
    data_audio = data_audio / float(np.max(np.abs(data_audio)))
    size_frameS = 0.02 * float(fs)
    size_stepS = 0.01 * float(fs)
    thr_len_pause = 0.14 * float(fs)
    thr_en_pause = 10 * np.log10(0.02)
    overlap = size_stepS / size_frameS
    nF = int((len(data_audio) / size_frameS / overlap)) - 1

    if pitch_method == 'praat':
        temp_uuid = audio.split('/')[-1][0:-4]
        temp_filename_f0 = path_app + '/../tempfiles/tempF0' + temp_uuid + '.txt'
        temp_filename_vuv = '../tempfiles/tempVUV' + temp_uuid + '.txt'
        praat_functions.praat_vuv(audio,
                                  temp_filename_f0,
                                  temp_filename_vuv,
                                  time_stepF0=0.01,
                                  minf0=60,
                                  maxf0=350)

        F0, _ = praat_functions.decodeF0(temp_filename_f0,
                                         len(data_audio) / float(fs), 0.01)
        os.remove(temp_filename_f0)

    elif pitch_method == 'rapt':
        data_audiof = np.asarray(data_audio * (2**15), dtype=np.float32)
        F0 = pysptk.sptk.rapt(data_audiof,
                              fs,
                              int(size_stepS),
                              min=60,
                              max=350,
                              voice_bias=-0.2,
                              otype='f0')

    segmentsV = V_UV(F0,
                     data_audio,
                     fs,
                     type_seg="Voiced",
                     size_stepS=size_stepS)
    segmentsUP = V_UV(F0,
                      data_audio,
                      fs,
                      type_seg="Unvoiced",
                      size_stepS=size_stepS)

    segmentsP = []
    segmentsU = []
    for k in range(len(segmentsUP)):
        eu = logEnergy(segmentsUP[k])
        if (len(segmentsUP[k]) > thr_len_pause):
            segmentsP.append(segmentsUP[k])
        else:
            segmentsU.append(segmentsUP[k])

    F0_features = F0feat(F0)
    energy_featuresV = energy_feat(segmentsV, fs, size_frameS, size_stepS)
    energy_featuresU = energy_feat(segmentsU, fs, size_frameS, size_stepS)

    duration_features = duration_feat(segmentsV, segmentsU, segmentsP,
                                      data_audio, fs)

    if flag_plots:

        plot_pros(data_audio, fs, F0, segmentsV, segmentsU)

    features = np.hstack(
        (F0_features, energy_featuresV, energy_featuresU, duration_features))
    return features
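A minimal usage sketch; the returned vector concatenates the F0, voiced-energy, unvoiced-energy and duration features computed above (the wav path is hypothetical, and Praat plus the ../tempfiles/ directory are assumed to be available):

features = prosody_static('../audios/sample.wav', flag_plots=False)  # hypothetical path
print(features.shape)  # one static prosody descriptor per recording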