def compute_coherence_from_timefreq(tf1, tf2, sample_rate, window_size, gauss_window=False, nstd=6):
    """ Compute the time-varying coherence between two complex-valued time-frequency representations.

    :param tf1: The first time-frequency representation.
    :param tf2: The second time-frequency representation.
    :param sample_rate: The temporal sample rate of the time-frequency representations (units=Hz)
    :param window_size: The size of the window used to average across samples for computing coherence (units=seconds)
    :param gauss_window: If True, use a Gaussian weighting when averaging (default=False)
    :param nstd: The number of standard deviations wide the Gaussian weighting is (default=6)

    :return: coherence: An array of shape (nfreqs, T), where nfreqs is the number of frequencies in the
        time-frequency representations and T is their temporal length.
    """

    N = tf1.shape[1]

    # compute the power spectrum of each individual spectrogram
    tf1_conj = np.conj(tf1)
    tf2_conj = np.conj(tf2)
    tf1_ps = (tf1 * tf1_conj).real
    tf2_ps = (tf2 * tf2_conj).real

    # compute the sufficient statistics for the cross spectrum, i.e. the quantities that will be
    # averaged when computing the coherence
    cross_spec12 = tf1 * tf2_conj
    cross_spec21 = tf1_conj * tf2
    del tf1_conj
    del tf2_conj

    # the length of the averaging window, in samples
    nwinlen = int(sample_rate * window_size)

    # generate a normalized window for computing the weighted mean around a point in time
    if gauss_window:
        gauss_t, average_window = gaussian_window(nwinlen, nstd)
        average_window /= np.abs(average_window).sum()
    else:
        average_window = np.ones(nwinlen) / float(nwinlen)

    nfreqs = tf1.shape[0]

    # compute the coherence at each frequency
    coherence = np.zeros([nfreqs, N])
    for k in range(nfreqs):

        # convolve the averaging window with each frequency band of the power spectra
        tf1_mean = convolve1d(tf1_ps[k, :], average_window, mode='mirror')
        tf2_mean = convolve1d(tf2_ps[k, :], average_window, mode='mirror')
        denom = tf1_mean * tf2_mean
        del tf1_mean
        del tf2_mean

        # compute the windowed average of the cross-spectral magnitudes
        cs12_mean_r = convolve1d(cross_spec12[k, :].real, average_window, mode='mirror')
        cs12_mean_i = convolve1d(cross_spec12[k, :].imag, average_window, mode='mirror')
        cs12_mean = np.sqrt(cs12_mean_r**2 + cs12_mean_i**2)
        del cs12_mean_r
        del cs12_mean_i

        cs21_mean_r = convolve1d(cross_spec21[k, :].real, average_window, mode='mirror')
        cs21_mean_i = convolve1d(cross_spec21[k, :].imag, average_window, mode='mirror')
        cs21_mean = np.sqrt(cs21_mean_r**2 + cs21_mean_i**2)
        del cs21_mean_r
        del cs21_mean_i

        coherence[k, :] = (cs12_mean * cs21_mean) / denom

    return coherence
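
# A minimal, self-contained sketch of compute_coherence_from_timefreq on synthetic data (not part
# of the original API). The complex "time-frequency representations" here are just random arrays
# that share a common component, not real spectrograms; only numpy and the convolve1d already used
# by this module are assumed.
def _demo_timefreq_coherence():
    sample_rate = 1000.0  # assumed temporal sample rate of the time-frequency representation (Hz)
    nfreqs, T = 8, 5000
    rng = np.random.RandomState(0)

    # build two complex arrays that share a common component plus independent noise
    shared = rng.randn(nfreqs, T) + 1j * rng.randn(nfreqs, T)
    tf1 = shared + 0.5 * (rng.randn(nfreqs, T) + 1j * rng.randn(nfreqs, T))
    tf2 = shared + 0.5 * (rng.randn(nfreqs, T) + 1j * rng.randn(nfreqs, T))

    # with a 100 ms averaging window, each band's coherence should sit well above zero
    c = compute_coherence_from_timefreq(tf1, tf2, sample_rate, window_size=0.100)
    print('mean coherence per band:', c.mean(axis=1))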
def fundEstimator(soundIn, fs, t=None, debugFig=0, maxFund=1500, minFund=300, lowFc=200, highFc=6000, minSaliency=0.5):
    """
    Estimates the fundamental frequency of a complex sound.

    soundIn is the sound pressure waveform.
    fs is the sampling rate.
    t is a vector of time values in s at which the fundamental will be estimated.
    The sound must include at least 1024 sample points.

    The optional parameters with defaults are:
        debugFig = 0       Set to zero to eliminate figures.
        maxFund = 1500     Maximum fundamental frequency
        minFund = 300      Minimum fundamental frequency
        lowFc = 200        Low frequency cut-off for band-passing the signal prior to auto-correlation.
        highFc = 6000      High frequency cut-off
        minSaliency = 0.5  Threshold in the auto-correlation for minimum saliency - returns NaN for
                           pitch values if saliency is below this number

    Returns
        sal - the time-varying pitch saliency - a number between 0 and 1 corresponding to the
              relative size of the first auto-correlation peak
        fund - the time-varying fundamental in Hz at the same resolution as the spectrogram.
        fund2 - a second peak in the spectrum - not a multiple of the fundamental, a sign of a second voice
        form1 - the first formant, if it exists
        form2 - the second formant, if it exists
        form3 - the third formant, if it exists
        soundlen - the number of time points actually analyzed (windows above the RMS threshold)
    """

    # Band-pass filter the signal prior to auto-correlation
    soundLen = len(soundIn)
    nfilt = 1024

    if soundLen < 1024:
        print('Error in fundEstimator: sound too short for bandpass filtering, len(soundIn)=%d' % soundLen)
        return (0, 0, 0, 0, 0, 0, 0)

    # high-pass filter the signal
    highpassFilter = firwin(nfilt - 1, 2.0 * lowFc / fs, pass_zero=False)
    padlen = min(soundLen - 10, 3 * len(highpassFilter))
    soundIn = filtfilt(highpassFilter, [1.0], soundIn, padlen=padlen)

    # low-pass filter the signal
    lowpassFilter = firwin(nfilt, 2.0 * highFc / fs)
    padlen = min(soundLen - 10, 3 * len(lowpassFilter))
    soundIn = filtfilt(lowpassFilter, [1.0], soundIn, padlen=padlen)

    # Plot a spectrogram?
    if debugFig:
        plt.figure(9)
        (tDebug, freqDebug, specDebug, rms) = spectrogram(soundIn, fs, 1000.0, 50, min_freq=0, max_freq=10000,
                                                          nstd=6, log=True, noise_level_db=50, rectify=True)
        plot_spectrogram(tDebug, freqDebug, specDebug)

    # Initializations and useful variables
    if t is None:
        # initialize t to be spaced by 1 ms increments
        sound_dur = len(soundIn) / fs
        _si = 1e-3
        npts = int(sound_dur / _si)
        t = np.arange(npts) * _si

    nt = len(t)
    soundRMS = np.zeros(nt)
    fund = np.zeros(nt)
    fund2 = np.zeros(nt)
    sal = np.zeros(nt)
    form1 = np.zeros(nt)
    form2 = np.zeros(nt)
    form3 = np.zeros(nt)

    # Calculate the size of the window for the auto-correlation
    alpha = 5  # Number of sd in the Gaussian window
    winLen = int(np.fix((2.0 * alpha / minFund) * fs))  # Length of Gaussian window based on minFund
    if winLen % 2 == 0:  # Make a symmetric window
        winLen += 1

    winLen2 = 2**12 + 1  # This looks like a good size for LPC - 4097 points

    gt, w = gaussian_window(winLen, alpha)
    gt2, w2 = gaussian_window(winLen2, alpha)
    maxlags = int(2 * ceil(float(fs) / minFund))

    # First calculate the rms in each window
    for it in range(nt):
        tval = t[it]                   # Center of window in time
        tind = int(np.fix(tval * fs))  # Center of window in samples
        tstart = tind - (winLen - 1) // 2
        tend = tind + (winLen - 1) // 2

        if tstart < 0:
            winstart = -tstart
            tstart = 0
        else:
            winstart = 0

        if tend >= soundLen:
            windend = winLen - (tend - soundLen + 1) - 1
            tend = soundLen - 1
        else:
            windend = winLen - 1

        soundWin = soundIn[tstart:tend] * w[winstart:windend]
        soundRMS[it] = np.std(soundWin)

    soundRMSMax = max(soundRMS)

    # Calculate the auto-correlation in windowed segments and obtain 4 guess values of the fundamental:
    #   fundCorrGuess - guess from the auto-correlation function
    #   fundCorrAmpGuess - guess from the amplitude of the auto-correlation function
    #   fundCepGuess - guess from the cepstrum
    #   fundStackGuess - guess taken from a fit of the power spectrum with a harmonic stack, using
    #                    fundCepGuess as a starting point
    # The current version uses fundStackGuess as the best estimate...
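
    # In the loop below, pitch saliency at each frame is the ratio of the largest non-central
    # auto-correlation peak to the zero-lag value, a number between 0 and 1:
    #
    #     pitchSaliency = autoCorr[indMax] / autoCorr[ind0]
    #
    # Frames whose saliency falls below minSaliency keep fund[it] = NaN.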
    soundlen = 0
    for it in range(nt):
        fund[it] = float('nan')
        sal[it] = float('nan')
        fund2[it] = float('nan')
        form1[it] = float('nan')
        form2[it] = float('nan')
        form3[it] = float('nan')

        # Skip windows that are too quiet
        if soundRMS[it] < soundRMSMax * 0.1:
            continue

        soundlen += 1
        tval = t[it]                   # Center of window in time
        tind = int(np.fix(tval * fs))  # Center of window in samples
        tstart = tind - (winLen - 1) // 2
        tend = tind + (winLen - 1) // 2

        if tstart < 0:
            winstart = -tstart
            tstart = 0
        else:
            winstart = 0

        if tend >= soundLen:
            windend = winLen - (tend - soundLen + 1) - 1
            tend = soundLen - 1
        else:
            windend = winLen - 1

        tstart2 = tind - (winLen2 - 1) // 2
        tend2 = tind + (winLen2 - 1) // 2

        if tstart2 < 0:
            winstart2 = -tstart2
            tstart2 = 0
        else:
            winstart2 = 0

        if tend2 >= soundLen:
            windend2 = winLen2 - (tend2 - soundLen + 1) - 1
            tend2 = soundLen - 1
        else:
            windend2 = winLen2 - 1

        soundWin = soundIn[tstart:tend] * w[winstart:windend]
        soundWin2 = soundIn[tstart2:tend2] * w2[winstart2:windend2]

        # Apply LPC to get time-varying formants and one additional guess for the fundamental frequency
        A, E, K = talkbox.lpc(soundWin2, 8)  # 8th-order LPC
        rts = np.roots(A)                    # Find the roots of A
        rts = rts[np.imag(rts) >= 0]         # Keep only half of them
        angz = np.arctan2(np.imag(rts), np.real(rts))

        # Calculate the frequencies and bandwidths of the formants
        frqsFormants = angz * (fs / (2 * np.pi))
        indices = np.argsort(frqsFormants)
        bw = -0.5 * (fs / (2 * np.pi)) * np.log(np.abs(rts))

        # Keep formants above 1000 Hz and with bandwidth < 1000 Hz
        formants = []
        for kk in indices:
            if frqsFormants[kk] > 1000 and bw[kk] < 1000:
                formants.append(frqsFormants[kk])
        formants = np.array(formants)

        if len(formants) > 0:
            form1[it] = formants[0]
        if len(formants) > 1:
            form2[it] = formants[1]
        if len(formants) > 2:
            form3[it] = formants[2]

        # Calculate the auto-correlation
        lags = np.arange(-maxlags, maxlags + 1, 1)
        autoCorr = correlation_function(soundWin, soundWin, lags)
        ind0 = int(np.where(lags == 0)[0][0])  # index of lag zero

        # find peaks
        indPeaksCorr = detect_peaks(autoCorr, mph=max(autoCorr) / 10)

        # Eliminate the center peak and all peaks too close to the middle
        indPeaksCorr = np.delete(indPeaksCorr, np.where((indPeaksCorr - ind0) < fs / maxFund)[0])
        pksCorr = autoCorr[indPeaksCorr]

        # Find the max peak
        if len(pksCorr) == 0:
            pitchSaliency = 0.1  # 0.1 goes with the detection of peaks greater than max/10
        else:
            indIndMax = np.where(pksCorr == max(pksCorr))[0][0]
            indMax = indPeaksCorr[indIndMax]
            fundCorrGuess = fs / abs(lags[indMax])
            pitchSaliency = autoCorr[indMax] / autoCorr[ind0]

        sal[it] = pitchSaliency

        if sal[it] < minSaliency:
            continue

        # Calculate the envelope of the auto-correlation after rectification
        envCorr = temporal_envelope(autoCorr, fs, cutoff_freq=maxFund, resample_rate=None)
        locsEnvCorr = detect_peaks(envCorr, mph=max(envCorr) / 10)
        pksEnvCorr = envCorr[locsEnvCorr]

        # The max peak should be around zero
        indIndEnvMax = np.where(pksEnvCorr == max(pksEnvCorr))[0][0]

        # Take the first peak that is not in the middle
        if indIndEnvMax + 2 > len(locsEnvCorr):
            fundCorrAmpGuess = fundCorrGuess
            indEnvMax = indMax
        else:
            indEnvMax = locsEnvCorr[indIndEnvMax + 1]
            fundCorrAmpGuess = fs / lags[indEnvMax]

        # Calculate the power spectrum and cepstrum
        Y = fft(soundWin, n=winLen + 1)
        f = (fs / 2.0) * (np.array(range((winLen + 1) // 2 + 1), dtype=float) / float((winLen + 1) // 2))
        fhigh = np.where(f >= highFc)[0][0]

        powSound = 20.0 * np.log10(np.abs(Y[0:(winLen + 1) // 2 + 1]))  # This is the power spectrum
        powSoundGood = powSound[0:fhigh]
        maxPow = max(powSoundGood)
        powSoundGood = powSoundGood - maxPow  # Set zero as the peak amplitude
        powSoundGood[powSoundGood < -60] = -60

        # Calculate the coarse spectral envelope
        p = np.polyfit(f[0:fhigh], powSoundGood, 3)
        powAmp = np.polyval(p, f[0:fhigh])

        # Cepstrum
        CY = dct(powSoundGood - powAmp, norm='ortho')

        tCY = 2000.0 * np.array(range(len(CY))) / fs  # Units of cepstrum in ms
        fCY = 1000.0 / tCY  # Corresponding fundamental frequency in Hz

        lowInd = np.where(fCY < lowFc)[0]
        if lowInd.size > 0:
            flowCY = lowInd[0]
        else:
            flowCY = fCY.size

        fhighCY = np.where(fCY < highFc)[0][0]

        # Find the peak of the cepstrum
        indPk = np.where(CY[fhighCY:flowCY] == max(CY[fhighCY:flowCY]))[0][-1]
        indPk = fhighCY + indPk

        # Compute the center of mass of the cepstral peak
        fmass = 0
        mass = 0
        indTry = indPk
        while CY[indTry] > 0:
            fmass = fmass + fCY[indTry] * CY[indTry]
            mass = mass + CY[indTry]
            indTry = indTry + 1
            if indTry >= len(CY):
                break

        indTry = indPk - 1
        if indTry >= 0:
            while CY[indTry] > 0:
                fmass = fmass + fCY[indTry] * CY[indTry]
                mass = mass + CY[indTry]
                indTry = indTry - 1
                if indTry < 0:
                    break

        fGuess = fmass / mass

        if fGuess == 0 or np.isnan(fGuess) or np.isinf(fGuess):
            # Failure of the cepstral method
            fGuess = fundCorrGuess

        fundCepGuess = fGuess

        # Force the fundamental to be bounded
        if fundCepGuess > maxFund:
            i = 2
            while fundCepGuess > maxFund:
                fundCepGuess = fGuess / i
                i += 1
        elif fundCepGuess < minFund:
            i = 2
            while fundCepGuess < minFund:
                fundCepGuess = fGuess * i
                i += 1

        # Fit a Gaussian harmonic stack
        maxPow = max(powSoundGood - powAmp)

        # This is the Matlab code...
        # fundFitCep = NonLinearModel.fit(f(1:fhigh)', powSoundGood'-powAmp, @synSpect, [fundCepGuess ones(1,9).*log(maxPow)])
        # modelPowCep = synSpect(double(fundFitCep.Coefficients(:,1)), f(1:fhigh))

        vars = np.concatenate(([fundCepGuess], np.ones(9) * np.log(maxPow)))
        bout = leastsq(residualSyn, vars, args=(f[0:fhigh], powSoundGood - powAmp))
        modelPowCep = synSpect(bout[0], f[0:fhigh])
        errCep = sum((powSoundGood - powAmp - modelPowCep)**2)

        # Try again with twice the cepstral guess and keep the better fit
        vars = np.concatenate(([fundCepGuess * 2], np.ones(9) * np.log(maxPow)))
        bout2 = leastsq(residualSyn, vars, args=(f[0:fhigh], powSoundGood - powAmp))
        modelPowCep2 = synSpect(bout2[0], f[0:fhigh])
        errCep2 = sum((powSoundGood - powAmp - modelPowCep2)**2)

        if errCep2 < errCep:
            bout = bout2
            modelPowCep = modelPowCep2

        fundStackGuess = bout[0][0]
        if fundStackGuess > maxFund or fundStackGuess < minFund:
            fundStackGuess = float('nan')

        # A second cepstrum for the second voice
        # CY2 = dct(powSoundGood - powAmp - modelPowCep)

        fund[it] = fundStackGuess

        # Look for a second peak in the residual spectrum as an indicator of a second voice
        f2 = 0
        if not np.isnan(fundStackGuess):
            powLeft = powSoundGood - powAmp - modelPowCep
            maxPow2 = max(powLeft)
            if maxPow2 > maxPow * 0.5:
                # Possible second peak in the central area as an indicator of a second voice.
                f2 = f[np.where(powLeft == maxPow2)[0][0]]
                if f2 > 1000 and f2 < 4000:
                    if pitchSaliency > minSaliency:
                        fund2[it] = f2

        # Matlab code for the alternative model-comparison approach:
        #% modelPowCorrAmp = synSpect(double(fundFitCorrAmp.Coefficients(:,1)), f(1:fhigh))
        #%
        #% errCorr = sum((powSoundGood - powAmp' - modelPowCorr).^2)
        #% errCorrAmp = sum((powSoundGood - powAmp' - modelPowCorrAmp).^2)
        #% errCorrSum = sum((powSoundGood - powAmp' - (modelPowCorr+modelPowCorrAmp) ).^2)
        #%
        #% f1 = double(fundFitCorr.Coefficients(1,1))
        #% f2 = double(fundFitCorrAmp.Coefficients(1,1))
        #%
        #% if (pitchSaliency > minSaliency)
        #%     if (errCorr < errCorrAmp)
        #%         fund(it) = f1
        #%         if errCorrSum < errCorr
        #%             fund2(it) = f2
        #%         end
        #%     else
        #%         fund(it) = f2
        #%         if errCorrSum < errCorrAmp
        #%             fund2(it) = f1
        #%         end
        #%     end
        #% end

        if debugFig:
            plt.figure(10)

            plt.subplot(4, 1, 1)
            plt.cla()
            plt.plot(soundWin)
            titleStr = 'Saliency = %.2f Pitch AC = %.2f (Hz) Pitch ACA = %.2f Pitch C %.2f (Hz)' % (pitchSaliency, fundCorrGuess, fundCorrAmpGuess, fundStackGuess)
            plt.title(titleStr)

            plt.subplot(4, 1, 2)
            plt.cla()
            plt.plot(1000.0 * lags / fs, autoCorr)
            plt.plot([1000.0 * lags[indMax] / fs, 1000.0 * lags[indMax] / fs], [0, autoCorr[ind0]], 'k')
            plt.plot(1000.0 * lags / fs, envCorr, 'r', linewidth=2)
            plt.plot([1000.0 * lags[indEnvMax] / fs, 1000.0 * lags[indEnvMax] / fs], [0, autoCorr[ind0]], 'g')
            plt.xlabel('Time (ms)')

            plt.subplot(4, 1, 3)
            plt.cla()
            plt.plot(f[0:fhigh], powSoundGood)
            plt.axis([0, highFc, -60, 0])
            plt.plot(f[0:fhigh], powAmp, 'b--')
            plt.plot(f[0:fhigh], modelPowCep + powAmp, 'k')
            for ih in range(1, 6):
                plt.plot([fundCorrGuess * ih, fundCorrGuess * ih], [-60, 0], 'r')
                plt.plot([fundStackGuess * ih, fundStackGuess * ih], [-60, 0], 'k')
            if f2 != 0:
                plt.plot([f2, f2], [-60, 0], 'g')
            plt.xlabel('Frequency (Hz)')

            plt.subplot(4, 1, 4)
            plt.cla()
            plt.plot(tCY, CY)
            plt.plot([1000.0 / fundCorrGuess, 1000.0 / fundCorrGuess], [0, max(CY)], 'r')
            plt.plot([1000.0 / fundStackGuess, 1000.0 / fundStackGuess], [0, max(CY)], 'k')
            plt.axis([0, 1000.0 * np.size(CY) / (2 * fs), 0, max(CY)])
            plt.xlabel('Time (ms)')

            plt.pause(1)

    # Fix formants.
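    # The LPC formant tracks can get mislabeled from frame to frame: a value stored in form1 may
    # really belong to the second or third formant track. The pass below compares each value to the
    # mean of every track and moves it to the closest track, clearing its original slot when moved.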
    meanf1 = np.mean(form1[~np.isnan(form1)])
    meanf2 = np.mean(form2[~np.isnan(form2)])
    meanf3 = np.mean(form3[~np.isnan(form3)])

    for it in range(nt):
        if not np.isnan(form1[it]):
            df11 = np.abs(form1[it] - meanf1)
            df12 = np.abs(form1[it] - meanf2)
            df13 = np.abs(form1[it] - meanf3)
            if df12 < df11:
                if df13 < df12:
                    if not np.isnan(form3[it]):
                        df33 = np.abs(form3[it] - meanf3)
                        if df13 < df33:
                            form3[it] = form1[it]
                    else:
                        form3[it] = form1[it]
                else:
                    if not np.isnan(form2[it]):
                        df22 = np.abs(form2[it] - meanf2)
                        if df12 < df22:
                            form2[it] = form1[it]
                    else:
                        form2[it] = form1[it]
                form1[it] = float('nan')

        if not np.isnan(form2[it]):
            df21 = np.abs(form2[it] - meanf1)
            df22 = np.abs(form2[it] - meanf2)
            df23 = np.abs(form2[it] - meanf3)
            if df21 < df22:
                if not np.isnan(form1[it]):
                    df11 = np.abs(form1[it] - meanf1)
                    if df21 < df11:
                        form1[it] = form2[it]
                else:
                    form1[it] = form2[it]
                form2[it] = float('nan')
            elif df23 < df22:
                if not np.isnan(form3[it]):
                    df33 = np.abs(form3[it] - meanf3)
                    if df23 < df33:
                        form3[it] = form2[it]
                else:
                    form3[it] = form2[it]
                form2[it] = float('nan')

        if not np.isnan(form3[it]):
            df31 = np.abs(form3[it] - meanf1)
            df32 = np.abs(form3[it] - meanf2)
            df33 = np.abs(form3[it] - meanf3)
            if df32 < df33:
                if df31 < df32:
                    if not np.isnan(form1[it]):
                        df11 = np.abs(form1[it] - meanf1)
                        if df31 < df11:
                            form1[it] = form3[it]
                    else:
                        form1[it] = form3[it]
                else:
                    if not np.isnan(form2[it]):
                        df22 = np.abs(form2[it] - meanf2)
                        if df32 < df22:
                            form2[it] = form3[it]
                    else:
                        form2[it] = form3[it]
                form3[it] = float('nan')

    return (sal, fund, fund2, form1, form2, form3, soundlen)
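
# A minimal usage sketch for fundEstimator (illustration only; 'mysound.wav' is a hypothetical
# file, and scipy.io.wavfile is assumed). The sound must contain at least 1024 samples; when t is
# omitted, the fundamental is estimated on a 1 ms grid over the duration of the sound.
#
#     from scipy.io import wavfile
#     fs, soundIn = wavfile.read('mysound.wav')
#     sal, fund, fund2, form1, form2, form3, nframes = fundEstimator(soundIn.astype(float), float(fs))
#     voiced = ~np.isnan(fund)  # time points where a fundamental could be estimated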