def writeExampleFiles():
    """
    A convenience function: writes out example files, some of them with optimal
    parameters found by exploreSineModelMultiRes()

    Two parameter sets (a single-band 'blackmanharris' set and a three-band
    'hamming' set) are each applied to two input sounds. Every result is
    written to the input path with its last four characters ('.wav') replaced
    by the suffix listed in the job table below.
    """
    Ns = 512  # third argument handed to Best.calculateAndUpdate

    # (W, M, N, B, T) values, converted to numpy arrays right before use.
    singleBand = (['blackmanharris'], [1001], [4096], [], [-90])
    threeBands = (['hamming', 'hamming', 'hamming'],
                  [3001, 1501, 751],
                  [16384, 8192, 4096],
                  [2756.25, 5512.5],
                  [-90, -90, -90])

    orchestra = '../../sounds/orchestra.wav'
    strings = '../../sounds/121061__thirsk__160-link-strings-2-mono.wav'

    # (input file, parameter set, output suffix) in the order they are written.
    jobs = [
        (orchestra, singleBand, '_optimizedSineModel.wav'),
        (strings, threeBands, '_optimizedSineModel.wav'),
        (orchestra, threeBands, '_nonOptimizedSineModel.wav'),
        (strings, singleBand, '_nonOptimizedSineModel.wav'),
    ]

    for inputFile, settings, suffix in jobs:
        fs, x = UF.wavread(inputFile)
        W, M, N, B, T = [np.array(values) for values in settings]
        best = Best()  # a fresh Best instance per file, as before
        y = best.calculateAndUpdate(x, fs, Ns, W, M, N, B, T)
        outputFile = inputFile[:-4] + suffix
        print('-> %s' % outputFile)
        UF.wavwrite(y, fs, outputFile)
def computeSNR(inputFile, window, M, N, H):
    """
    Compute two signal-to-noise ratios between a sound and the output of stft.stft.

    Input:
        inputFile (string): wav file name including the path
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window length (odd positive integer)
        N (integer): fft size (power of two, > M)
        H (integer): hop size for the stft computation
    Output:
        A python tuple (SNR1, SNR2) of floats in dB. SNR1 covers every sample;
        SNR2 covers the samples with index in [M+1, len - M - 1).
    """
    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)
    y = stft.stft(x, fs, w, N, H)
    residual = np.subtract(y, x)

    def segmentEnergy(sig, first, last):
        # sum of squared samples over sig[first:last]
        return np.sum(np.power(sig, 2)[first:last])

    def ratioDb(numerator, denominator):
        return 10*np.log10(numerator/denominator)

    SNR1 = ratioDb(segmentEnergy(x, 0, len(x)),
                   segmentEnergy(residual, 0, len(residual)))

    # NOTE(review): this drops M+1 samples at each end, not M - confirm which
    # one the caller expects.
    SNR2 = ratioDb(segmentEnergy(x, M+1, len(x)-M-1),
                   segmentEnergy(residual, M+1, len(residual)-M-1))
    return SNR1, SNR2
def computeODF(inputFile, window, M, N, H):
    """
    Compute an onset detection function (ODF) in two frequency bands.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd integer value)
        N (integer): fft size (power of two, bigger or equal than M)
        H (integer): hop size for the STFT computation
    Output:
        A numpy array with two columns:
        ODF[:,0]: ODF computed in band 0 < f < 3000 Hz
        ODF[:,1]: ODF computed in band 3000 < f < 10000 Hz
    """
    fs, x = UF.wavread(inputFile)
    analysisWindow = get_window(window, M)
    analysis = stft.stftAnal(x, fs, analysisWindow, N, H)

    # first element of the analysis result, converted from dB to linear magnitude
    linearMag = 10 ** (analysis[0] / 20.0)

    # highest bin index at or below each band edge
    lowEdge = int(N*3000.0/fs)
    highEdge = int(N*10000.0/fs)

    # bin 0 is left out of the low band; odf() is the helper defined elsewhere
    lowBand = odf(linearMag[:, 1:lowEdge+1])
    highBand = odf(linearMag[:, lowEdge+1:highEdge+1])
    return np.column_stack((lowBand, highBand))
def sineModelOriginalTest1(inputFile, M):
    """
    Run sineModelOriginal (no multiresolution) on a sound file.

    Inputs:
        inputFile (string): wav file name including the path
        M (integer): analysis window size
    Output:
        Whatever sineModelOriginal returns for a 'blackman' window of size M,
        an FFT size equal to the next power of two >= M and a threshold of -80.0.
    """
    print("\n\n\n############### RUN THE ORIGINAL TEST (without multiresolution) ###############\n")
    print("M: ")
    print(M)

    # FFT size: smallest power of two that is not below the window size
    fftSize = int(2 ** np.ceil(np.log2(M)))
    print("N1: ")
    print(fftSize)

    threshold = -80.0
    fs, x = UF.wavread(inputFile)
    analysisWindow = get_window('blackman', M)
    return sineModelOriginal(x, fs, analysisWindow, fftSize, threshold)
def phaseFlux(inFile, window, M, bins, H, passes, th, inhibTh, inhibRel, plot=False):
    """
    Onset detection from the magnitude-weighted second derivative of the STFT phase.

    Inputs:
        inFile (string): wav file name including the path
        window (string): analysis window type handed to get_window
        M (integer): analysis window length
        bins (integer): size handed to UF.fft; also the number of columns per frame
        H (integer): hop size in samples
        passes (integer): number of detection passes (one output row per pass)
        th (float): a frame is marked when its summed value divided by bins exceeds th
        inhibTh (float): per-bin threshold on the second phase derivative that
            triggers inhibition of that bin
        inhibRel (integer): number of frames over which an inhibited bin's
            multiplier ramps from 0 towards 1
        plot (bool): when True, plot the negated result of transient(x, 10)
    Output:
        normalise() applied to an array of shape (passes, len(x)) that is zero
        everywhere except at sample n*H of every marked frame n.
    """
    fs, x = UF.wavread(inFile)              # read file
    x = normalise(x)                        # normalise
    if plot:
        plt.plot(-transient(x, 10))

    N = len(x)                              # length of file
    numFrames = N // H
    win = get_window(window, M)             # create window

    # STFT, laid out as X[frame][bin]
    X = np.zeros((numFrames, bins), dtype='complex')
    for n in range(numFrames):
        Xpart = x[n * H:n * H + M]
        if len(Xpart) < len(win):
            Xpart = zeropad(Xpart, len(win))
        X[n] = UF.fft(Xpart * win, bins)

    mX = normalise(np.abs(X))               # magnitude
    pX = normalise(np.angle(X))             # phase
    pX_uw = nd_unwrapPhase(pX, 0)           # unwrapped phase
    derv1 = nd_derivative(pX_uw, 0)
    derv2 = nd_derivative(derv1, 0)

    # per-bin multipliers start at 1 (no inhibition)
    binmul = np.ones((numFrames, bins), dtype='float')
    # zeros, not uninitialised memory: only the marked samples are ever written
    onsets = np.zeros((passes, N), dtype='float')

    # NOTE(review): the nesting below was reconstructed from a source whose
    # indentation was lost - confirm that inhibition is meant to run only for
    # marked frames and that mX is rescaled once per pass.
    for p in range(passes):
        for n in range(numFrames):
            val = np.sum(derv2[n] * mX[n])
            if val / bins > th:
                onsets[p][n * H] = val
                for k in range(bins):
                    if derv2[n][k] > inhibTh:
                        for m in range(inhibRel):
                            # stop before writing past the last frame
                            if n + m >= numFrames:
                                break
                            # true division so the ramp is 0, 1/inhibRel, ...
                            binmul[n + m][k] = float(m) / inhibRel
        mX *= binmul

    return normalise(onsets)
def computeSNR(inputFile, window, M, N, H):
    """
    Compute two signal-to-noise ratios of an STFT analysis/synthesis round trip,
    with all energies measured on magnitude spectrograms.

    Input:
        inputFile (string): wav file name including the path
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window length (odd positive integer)
        N (integer): fft size (power of two, > M)
        H (integer): hop size for the stft computation
    Output:
        A python tuple (SNR1, SNR2) of floats in dB. SNR1 uses the whole signal;
        SNR2 uses the signal and the noise with the first and last M samples
        removed.
    """
    def energy(mag):
        # spectrogram values are treated as dB: convert to linear, sum the squares
        return np.sum((10 ** (mag / 20.0)) ** 2)

    (fs, x) = UF.wavread(inputFile)
    w = get_window(window, M)
    mX, pX = STFT.stftAnal(x, fs, w, N, H)
    y = STFT.stftSynth(mX, pX, M, H)[:x.size]

    noise = x - y
    xTrim = x[w.size:-w.size]
    noiseTrim = noise[w.size:-w.size]

    mN, _ = STFT.stftAnal(noise, fs, w, N, H)
    # the trimmed noise must be compared with the trimmed signal, not with the
    # full-length spectrogram mX as before
    mXTrim, _ = STFT.stftAnal(xTrim, fs, w, N, H)
    mNTrim, _ = STFT.stftAnal(noiseTrim, fs, w, N, H)

    snr1 = 10 * np.log10(energy(mX) / energy(mN))
    snr2 = 10 * np.log10(energy(mXTrim) / energy(mNTrim))
    return snr1, snr2
def minFreqEstErr(inputFile, f):
    """
    Grow the analysis window until the estimated sinusoid frequency is within
    0.05 Hz of the known one.

    Inputs:
        inputFile (string) = wav file including the path
        f (float) = frequency of the sinusoid present in the input audio signal (Hz)
    Output:
        fEst = Estimated frequency of the sinusoid (Hz), as computed from the
            interpolated peak locations
        M (int) = Window size
        N (int) = FFT size
    """
    # analysis parameters:
    window = 'blackman'
    t = -40

    fs, x = UF.wavread(inputFile)
    centre = len(x) // 2

    k = 1
    while True:
        # window sizes tried: 101, 201, 301, ...
        M = 100 * k + 1
        start = centre - M // 2
        w = get_window(window, M)
        N = int(2 ** np.ceil(np.log2(M)))      # next power of two >= M
        mX, pX = DFT.dftAnal(x[start:start + M], w, N)
        ploc = UF.peakDetection(mX, t)
        iploc, ipmag, ipphase = UF.peakInterp(mX, pX, ploc)
        fEst = iploc * fs / N
        k += 1
        if not (np.abs(f - fEst) > 0.05):      # Hz
            break
    return (fEst, M, N)
def computeSNR(inputFile, window, M, N, H):
    """
    Compute two signal-to-noise ratios between a sound and the output of stft.stft.

    Input:
        inputFile (string): wav file name including the path
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window length (odd positive integer)
        N (integer): fft size (power of two, > M)
        H (integer): hop size for the stft computation
    Output:
        A python tuple (SNR1, SNR2) of floats in dB. SNR1 is taken over all
        samples, SNR2 over samples M .. x.size - M - 1.
    """
    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)
    y = stft.stft(x, fs, w, N, H)
    residual = np.array(x - y)

    def energy(sig):
        return np.sum(abs(sig)**2)

    # samples kept for SNR2: everything but the first and last M
    inner = slice(M, x.size-M)

    SNR1 = 10 * np.log10(energy(x) / energy(residual))
    SNR2 = 10 * np.log10(energy(x[inner]) / energy(residual[inner]))
    return (SNR1, SNR2)
def computeEngEnv(inputFile, window, M, N, H):
    """
    Compute the per-frame energy envelope of a sound in two frequency bands.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd positive integer)
        N (integer): FFT size (power of 2, such that N > M)
        H (integer): hop size for the stft computation
    Output:
        A numpy array engEnv with shape Kx2, K = number of frames, holding the
        energy envelope of the signal in decibels (dB)
        engEnv[:,0]: Energy envelope in band 0 < f < 3000 Hz (in dB)
        engEnv[:,1]: Energy envelope in band 3000 < f < 10000 Hz (in dB)
    """
    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)
    # NOTE(review): stftAnal is called without fs here - confirm this matches
    # the signature of the stft module in use.
    mX, pX = stft.stftAnal(x, w, N, H)

    # dB -> linear magnitude, then squared
    power = np.power(10, mX / 20.) ** 2

    # Bin centre frequencies from the file's own sampling rate, in floating
    # point so a bin just above a band edge is not truncated onto the edge.
    binFreq = np.arange(power.shape[1]) * float(fs) / N
    lowBand = (binFreq > 0) & (binFreq < 3000)
    highBand = (binFreq > 3000) & (binFreq < 10000)

    band_energy = np.column_stack((np.sum(power[:, lowBand], axis=1),
                                   np.sum(power[:, highBand], axis=1)))
    return 10.0*np.log10(band_energy)
def computeModel(inputFile, B, M, window = 'hanning', t = -90):
    """
    Run SMMR.sineModelMultiRes on a sound, write the input and the result to
    'output_sounds/' and plot both.

    Inputs:
        inputFile (string): wav file name including the path
        B: one value per band; its length sets the number of bands and it is
            passed on to sineModelMultiRes
        M: window size per band (indexed by band number)
        window (string): analysis window type used for every band
        t: threshold passed on to sineModelMultiRes
    """
    numBands = len(B)
    fs, x = UF.wavread(inputFile)
    w = [get_window(window, M[band]) for band in range(numBands)]

    # NOTE(review): the FFT sizes are derived from B, not from the window
    # sizes M - confirm this is intended.
    N = (2**np.ceil(np.log2(B))).astype(int)
    y_combined = SMMR.sineModelMultiRes(x, fs, w, N, t, B)

    # output sound file names
    baseName = os.path.basename(inputFile)
    outputFileInputFile = 'output_sounds/' + baseName
    outputFile_combined = 'output_sounds/' + baseName[:-4] + '_sineModelMultiRes.wav'

    # write a copy of the input and the synthesized sound
    UF.wavwrite(x, fs, outputFileInputFile)
    UF.wavwrite(y_combined, fs, outputFile_combined)

    plt.figure()
    for signal in (x, y_combined):
        plt.plot(signal)
    plt.show()
def computeSNR(inputFile, window, M, N, H):
    """
    Compute two signal-to-noise ratios between a sound and the output of
    stft.stft, using the energy() helper defined elsewhere.

    Input:
        inputFile (string): wav file name including the path
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window length (odd positive integer)
        N (integer): fft size (power of two, > M)
        H (integer): hop size for the stft computation
    Output:
        A python tuple (SNR1, SNR2) of floats in dB: SNR1 over the whole signal,
        SNR2 with the first and last M samples left out.
    """
    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)
    residual = x - stft.stft(x, fs, w, N, H)
    inner = slice(M, -M)

    eSignal = energy(x)
    eSignal_part = energy(x[inner])
    eNoise = energy(residual)
    eNoise_part = energy(residual[inner])

    return (10 * np.log10(eSignal / eNoise),
            10 * np.log10(eSignal_part / eNoise_part))
def computeSNR(inputFile, window, M, N, H):
    """
    Compute two signal-to-noise ratios between a sound and the output of stft.stft.

    Input:
        inputFile (string): wav file name including the path
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window length (odd positive integer)
        N (integer): fft size (power of two, > M)
        H (integer): hop size for the stft computation
    Output:
        A python tuple (SNR1, SNR2) of floats in dB. SNR1 covers every sample;
        SNR2 covers the samples with index i such that M < i < x.size - M.
    """
    w = get_window(window, M)  # get the window
    (fs, x) = UF.wavread(inputFile)
    y = stft.stft(x, fs, w, N, H)

    signal = np.real(x)
    # The noise is the sample-wise difference. Subtracting absolute values (as
    # before) under-reports the error wherever the two signals differ in sign.
    noise = signal - np.real(y[:x.size])

    # NOTE(review): boundaries kept as they were (strict inequalities, so M+1
    # samples are dropped at the start) - confirm against the intended region.
    inner = slice(M + 1, max(x.size - M, 0))

    energysignal = np.sum(signal ** 2)
    energynoise = np.sum(noise ** 2)
    energysignal2 = np.sum(signal[inner] ** 2)
    energynoise2 = np.sum(noise[inner] ** 2)

    SNR1 = 10 * np.log10(energysignal / energynoise)
    SNR2 = 10 * np.log10(energysignal2 / energynoise2)
    return SNR1, SNR2
def computeEngEnv(inputFile, window, M, N, H):
    """
    Compute the per-frame energy envelope of a sound in two frequency bands.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd positive integer)
        N (integer): FFT size (power of 2, such that N > M)
        H (integer): hop size for the stft computation
    Output:
        A numpy array engEnv with shape Kx2, K = number of frames, holding the
        energy envelope of the signal in decibels (dB)
        engEnv[:,0]: Energy envelope in band 0 < f < 3000 Hz (in dB)
        engEnv[:,1]: Energy envelope in band 3000 < f < 10000 Hz (in dB)
    """
    (fs, x) = UF.wavread(inputFile)
    w = get_window(window, M)
    (xmX, xpX) = stft.stftAnal(x, fs, w, N, H)

    def lastBinBelow(freq):
        # Largest bin index whose frequency is strictly below freq. Closed form
        # of the former linear search, which never terminated when N*freq/fs
        # was an integer or not greater than 1.
        return int(np.ceil(N * freq / float(fs))) - 1

    # NOTE(review): fLow2, fHigh1 and fHigh2 are not defined in this function;
    # presumably module-level band edges in Hz - verify.
    kLow1 = 0
    kLow2 = lastBinBelow(fLow2)
    kHigh1 = lastBinBelow(fHigh1)
    kHigh2 = lastBinBelow(fHigh2)

    # dB -> linear magnitude, then squared
    power = (10 ** (xmX / 20.0)) ** 2

    out = np.zeros((int(xmX.shape[0]), 2))
    out[:, 0] = 10.0*np.log10(np.sum(power[:, kLow1+1:kLow2+1], axis=1))
    out[:, 1] = 10.0*np.log10(np.sum(power[:, kHigh1+1:kHigh2+1], axis=1))
    return out
def computeSNR(inputFile, window, M, N, H):
    """
    Compute two signal-to-noise ratios between a sound and the output of stft.stft.

    Input:
        inputFile (string): wav file name including the path
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window length (odd positive integer)
        N (integer): fft size (power of two, > M)
        H (integer): hop size for the stft computation
    Output:
        A python tuple (SNR1, SNR2) of floats in dB: SNR1 over the complete
        signal, SNR2 with the first and last M samples left out.
    """
    FS, x = UF.wavread(inputFile)
    w = get_window(window, M)
    y = stft.stft(x, FS, w, N, H)
    diff = y - x

    def snrDb(sig, err):
        return 10 * np.log10((sig**2).sum()/(err**2).sum())

    # NOTE(review): the signal energy is measured on the stft output y, not on
    # the input x - confirm this is intended.
    SNR1 = snrDb(y, diff)
    SNR2 = snrDb(y[M:-M], diff[M:-M])
    return (SNR1, SNR2)
def computeODF(inputFile, window, M, N, H):
    """
    Compute an onset detection function (ODF) in two frequency bands as the
    half-wave rectified frame-to-frame difference of the band energy in dB.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd integer value)
        N (integer): fft size (power of two, bigger or equal than M)
        H (integer): hop size for the STFT computation
    Output:
        A numpy array with two columns:
        ODF[:,0]: ODF computed in band 0 < f < 3000 Hz
        ODF[:,1]: ODF computed in band 3000 < f < 10000 Hz
    """
    windowing = get_window(window, M)
    (fs, x) = UF.wavread(inputFile)
    mX, pX = stft.stftAnal(x, fs, windowing, N, H)

    # Slice bounds must be integers; np.floor / np.ceil return floats.
    bin3000 = int(np.floor(3000.0*N/fs))
    bin10000 = int(np.floor(10000.0*N/fs))
    bin3000up = int(np.ceil(3000.0*N/fs))

    # dB -> linear magnitude, then squared
    power = np.square(10 ** (mX / 20.0))
    bands = (slice(1, bin3000 + 1), slice(bin3000up, bin10000 + 1))

    ODF = np.zeros((mX.shape[0], 2))
    for column, band in enumerate(bands):
        envDb = 10 * np.log10(np.sum(power[:, band], axis=1))
        # the first frame is compared against 0.0, as in the original loop
        rise = np.diff(np.concatenate(([0.0], envDb)))
        # half-wave rectification: keep increases only
        ODF[:, column] = np.where(rise <= 0.0, 0.0, rise)
    return ODF
def computeSNR(inputFile, window, M, N, H):
    """
    Compute two signal-to-noise ratios between a sound and the output of stft().

    Input:
        inputFile (string): wav file name including the path
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window length (odd positive integer)
        N (integer): fft size (power of two, > M)
        H (integer): hop size for the stft computation
    Output:
        A python tuple (SNR1, SNR2) of floats in dB: SNR1 over the whole signal,
        SNR2 with the first and last M samples left out.
    """
    def ratioDb(sig, approx):
        # energy of sig over energy of (sig - approx), in dB
        return 10.0*np.log10(sum(sig**2)/sum((sig-approx)**2))

    audio = UF.wavread(inputFile)
    x = audio[1]
    w = get_window(window, M)
    # the sampling rate is passed as 1.0, exactly as before
    xSynth = stft(x, 1.0, w, N, H)

    SNR1 = ratioDb(x, xSynth)
    SNR2 = ratioDb(x[M:len(x)-M], xSynth[M:len(xSynth)-M])
    return (SNR1,SNR2)
def getJawaab(ipFile = '../dataset/testInputs/testInput_1.wav', ipulsePos = getPulsePosFromAnn('../dataset/testInputs/testInput_1.csv'), strokeModels = None, oFile = './tablaOutput.wav', randomFlag = 1):
    """
    Generate a stroke sequence in response to an input recording and
    optionally write its audio to a file.

    Inputs:
        ipFile (string): input wav file including the path
        ipulsePos: pulse positions of the input; the default is read from the
            annotation file once, when this function is defined
        strokeModels: trained stroke models; nothing is generated when None
        oFile (string or None): output wav file; no audio is written when None
        randomFlag: 1 selects genRandomComposition, anything else selects
            genSimilarComposition
    Output:
        (opulsePos, strokeSeq, tStamps, oFile). All four are None when
        strokeModels is None.
    """
    # Without models there is nothing to generate. Returning here also avoids
    # the old failure where tStamps was never bound on this path.
    if strokeModels is None:
        print("Train models first before calling getJawaab() ...")
        return None, None, None, None

    print("Getting jawaab...")
    pulsePeriod = np.median(np.diff(ipulsePos))
    print(pulsePeriod)
    fss, audioIn = UF.wavread(ipFile)
    pieceDur = len(audioIn)/params.Fs

    if randomFlag == 1:
        strokeSeq, tStamps, opulsePos = genRandomComposition(pulsePeriod, pieceDur = pieceDur, strokeModels = strokeModels)
    else:
        invCmat = getInvCovarianceMatrix(strokeModels)
        strokeSeq, tStamps, opulsePos = genSimilarComposition(pulsePeriod, pieceDur = pieceDur, strokeModels = strokeModels, iAudioFile = ipFile, iPos = ipulsePos, invC = invCmat)

    print(strokeSeq)
    print(tStamps)
    print(opulsePos)

    if oFile is not None:
        audio = genAudioFromStrokeSeq(strokeModels, strokeSeq, tStamps)
        # scale by the peak; the 0.01 keeps the divisor away from zero
        audio = audio/(np.max(audio) + 0.01)
        UF.wavwrite(audio, params.Fs, oFile)
    return opulsePos, strokeSeq, tStamps, oFile
def main(inputFile = '../../sounds/piano.wav', window = 'hamming', M = 1024, N = 1024, H = 512):
    """
    Analysis/synthesis using the STFT.

    inputFile: input sound file (monophonic with sampling rate of 44100)
    window: analysis window type (choice of rectangular, hanning, hamming,
        blackman, blackmanharris)
    M: analysis window size
    N: fft size (power of two, bigger or equal than M)
    H: hop size (at least 1/2 of analysis window size to have good overlap-add)

    Writes the resynthesized sound to 'output_sounds/<input name>_stft.wav' and
    returns (x, fs, mX, pX, y).
    """
    # read the input sound and build the analysis window
    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)

    # magnitude and phase spectrograms, then the inverse stft
    mX, pX = STFT.stftAnal(x, fs, w, N, H)
    y = STFT.stftSynth(mX, pX, M, H)

    # output name: input base name without its last four characters + suffix
    stem = os.path.basename(inputFile)[:-4]
    UF.wavwrite(y, fs, 'output_sounds/' + stem + '_stft.wav')

    return x, fs, mX, pX, y
def sineODF(file='../../../../../audioDSP_course/assignments/sms-tools/sounds/piano.wav'):
    """
    Estimate sinusoidal tracks of a sound and plot them over its spectrogram.

    Input:
        file (string): wav file name including the path
    Output:
        fTrackEst: the frequency tracks returned by SM.sineModelAnal
    """
    fs, x = UF.wavread(file)

    # set params:
    M = 1024                                        # window size
    H = int(M/3)                                    # hop size
    t = -80.0                                       # threshold
    window = 'blackman'                             # window type
    fftSize = int(pow(2, np.ceil(np.log2(M))))      # size of FFT
    N = fftSize
    maxnSines = 10                                  # maximum simultaneous sines
    minSineDur = 0.1                                # minimal duration of sines
    freqDevOffset = 30                              # frequency deviation at 0Hz
    freqDevSlope = 0.001                            # slope increase of freq dev.

    w = get_window(window, M)                       # get analysis window
    tStamps = genTimeStamps(len(x), M, fs, H)       # generate timestamps
    fTrackEst, mTrackEst, pTrackEst = SM.sineModelAnal(x, fs, w, fftSize, H, t, maxnSines, minSineDur, freqDevOffset, freqDevSlope)
    fTrackTrue = genTrueFreqTracks(tStamps)         # true freq. tracks (only used by the disabled plot)

    # plotting:
    mX, pX = stft.stftAnal(x, fs, w, fftSize, H)
    maxplotfreq = 1500.0
    # slice bounds must be integers, so truncate the bin count explicitly
    maxplotbin = int(N*maxplotfreq/fs)
    binFreq = fs*np.arange(N*maxplotfreq/fs)/N
    plt.pcolormesh(tStamps, binFreq, np.transpose(mX[:, :maxplotbin+1]), cmap='hot_r')
    # plt.plot(fTrackTrue, 'o-', color = 'c', linewidth=3.0)
    plt.plot(tStamps, fTrackEst, color='y', linewidth=2.0)
    # plt.legend(('True f1', 'True f2', 'Estimated f1', 'Estimated f2'))
    plt.xlabel('Time (s)')
    plt.ylabel('Frequency (Hz)')
    plt.autoscale(tight=True)
    return fTrackEst
def computeEngEnv(inputFile, window, M, N, H):
    """
    Compute the per-frame energy envelope of a sound in two frequency bands,
    and show the spectrogram and both envelopes in a figure.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd positive integer)
        N (integer): FFT size (power of 2, such that N > M)
        H (integer): hop size for the stft computation
    Output:
        A numpy array engEnv with shape Kx2, K = number of frames, holding the
        energy envelope of the signal in decibels (dB)
        engEnv[:,0]: Energy envelope in band 0 < f < 3000 Hz (in dB)
        engEnv[:,1]: Energy envelope in band 3000 < f < 10000 Hz (in dB)
    """
    def bandEnergyDb(magDb):
        # dB magnitude -> linear, squared, summed over the bins of each frame,
        # then back to dB
        linear = 10 ** (magDb / 20)
        return 10 * np.log10(np.sum(linear ** 2, axis=1))

    (fs, x) = UF.wavread(inputFile)
    border_bin = int(np.ceil(float(3000) * N / fs))
    max_bin = int(np.ceil(float(10000) * N / fs))
    w = get_window(window, M)
    mX, pX = STFT.stftAnal(x, fs, w, N, H)

    # bin 0 is left out of the low band
    e_low = bandEnergyDb(mX[:, 1:border_bin])
    e_high = bandEnergyDb(mX[:, border_bin:max_bin])
    envs = np.column_stack((e_low, e_high))

    # axes for the figure
    frmTime = H*np.arange(mX.shape[0])/float(fs)
    binFreq = np.arange(mX.shape[1])*float(fs)/N

    # draw graph: spectrogram on top, envelopes below
    plt.figure(1, figsize=(9.5, 6))

    plt.subplot(211)
    plt.pcolormesh(frmTime, binFreq, np.transpose(mX))
    plt.title('mX ({0}), M={1}, N={2}, H={3}'.format(inputFile, M, N, H))
    plt.autoscale(tight=True)

    plt.subplot(212)
    for envelope, colour, name in ((e_low, "blue", "row"), (e_high, "red", "high")):
        plt.plot(frmTime, envelope, color=colour, label=name)
    plt.title('Energy of Envelopes')
    plt.autoscale(tight=True)

    plt.tight_layout()
    plt.show()
    return envs
def readAudio(inputFile='../../sounds/piano.wav'):
    """
    Return samples 50000..50009 of a wav file.

    Input:
        inputFile (string): the path to the wav file
    Output:
        The slice [50000:50010] of the samples returned by utilFunctions.wavread.
    """
    # utilFunctions is imported from a fixed sms-tools checkout location
    sys.path.append('/home/vagrant/sms-tools/software/models')
    from utilFunctions import wavread

    print("Input File: ", inputFile)
    _, samples = wavread(inputFile)
    return samples[50000:50010]
def testModuleLive(inputFile = '../dataset/testInputs/testInput_3.wav', pulsePos = getPulsePosFromAnn('../dataset/testInputs/testInput_3.csv')):
    """
    Run getJawaabLive on a recording and synthesize audio for the resulting
    stroke sequence using the module-level strokeModelsG.

    Inputs:
        inputFile (string): input wav file including the path
        pulsePos: pulse positions of the input; the default is read from the
            annotation file once, when this function is defined
    Output:
        (testFeatFull, audioOut, strokeSeq, strokeTime, strokeAmp, opulsePer)
    """
    global strokeModelsG

    # median spacing of the pulse positions, divided by 10
    pulseSpacing = np.median(np.diff(pulsePos))
    ipulsePer = pulseSpacing/10

    fss, ipAudio = UF.wavread(inputFile)
    print("Analysing input...")
    liveResult = getJawaabLive(ipAudio, ipulsePer)
    testFeatFull, strokeSeq, strokeTime, strokeAmp, opulsePer = liveResult

    audioOut = genAudioFromStrokeSeq(strokeModelsG, strokeSeq, strokeAmp, strokeTime)
    return testFeatFull, audioOut, strokeSeq, strokeTime, strokeAmp, opulsePer
def minMaxAudio(inputFile):
    """
    Find the smallest and largest sample value of a wav file.

    Input:
        inputFile: file name of the wav file (including path)
    Output:
        A tuple of the minimum and the maximum value of the audio samples, like:
        (min_val, max_val)
    """
    # wavread's result is indexed at position 1 to get the samples
    samples = wavread(inputFile)[1]
    min_val = samples.min()
    max_val = samples.max()
    return (min_val, max_val)
def main(inputFile='../../sounds/ocean.wav', H=256, stocf=.1):
    """
    Analysis/synthesis using the stochastic model, with plots.

    inputFile: input sound file (monophonic with sampling rate of 44100)
    H: hop size
    stocf: decimation factor used for the stochastic approximation

    Writes the synthesized sound to
    'output_sounds/<input name>_stochasticModel.wav' and shows a figure with
    the input, the stochastic representation and the output.
    """
    # --------- computation -----------------
    (fs, x) = UF.wavread(inputFile)
    mYst = STM.stochasticModelAnal(x, H, stocf)
    y = STM.stochasticModelSynth(mYst, H)

    stem = os.path.basename(inputFile)[:-4]
    outputFile = 'output_sounds/' + stem + '_stochasticModel.wav'
    UF.wavwrite(y, fs, outputFile)

    # --------- plotting --------------------
    rate = float(fs)

    def plotWaveform(row, samples):
        # waveform against time in seconds, axes fitted to the data
        plt.subplot(3, 1, row)
        plt.plot(np.arange(samples.size)/rate, samples)
        plt.axis([0, samples.size/rate, min(samples), max(samples)])
        plt.ylabel('amplitude')
        plt.xlabel('time (sec)')

    plt.figure(figsize=(12, 9))

    # input sound
    plotWaveform(1, x)
    plt.title('input sound: x')

    # stochastic representation
    plt.subplot(3, 1, 2)
    numFrames = int(mYst[:, 0].size)
    frmTime = H*np.arange(numFrames)/rate
    binFreq = np.arange(stocf*H)*rate/(stocf*2*H)
    plt.pcolormesh(frmTime, binFreq, np.transpose(mYst))
    plt.autoscale(tight=True)
    plt.xlabel('time (sec)')
    plt.ylabel('frequency (Hz)')
    plt.title('stochastic approximation')

    # output sound
    plotWaveform(3, y)

    plt.tight_layout()
    plt.show()
def readAudio(inputFile):
    """
    Return 10 consecutive samples of a wav file, starting at sample 50000.

    Input:
        inputFile: the path to the wav file
    Output:
        A numpy array that contains 10 samples of the audio (indices 50000 to
        50009).
    """
    start = 50000
    _, samples = wavread(inputFile)
    return samples[start:start + 10]
def downsampleAudio(inputFile, M):
    """
    Downsample a wav file with hopSamples and write the result to 'test.wav'.

    Inputs:
        inputFile: file name of the wav file (including path)
        M: downsampling factor (positive integer)

    The output is written with sampling rate fs // M. Writing it at the
    original rate fs, as before, makes the file play M times too fast.
    """
    fs, x = wavread(inputFile)
    y = hopSamples(x, M)
    wavwrite(y, int(fs // M), 'test.wav')
def minMaxAudio(inputFile):
    """
    Find the smallest and largest sample value of a wav file.

    Input:
        inputFile: file name of the wav file (including path)
    Output:
        A tuple of the minimum and the maximum value of the audio samples, like:
        (min_val, max_val)
    """
    _, samples = wavread(inputFile)
    min_val = min(samples)
    max_val = max(samples)
    return (min_val, max_val)
def readAudio(inputFile):
    """
    Return 10 consecutive samples of a wav file, starting at sample 50000.

    Input:
        inputFile: the path to the wav file
    Output:
        A numpy array that contains 10 samples of the audio (indices 50000 to
        50009).
    """
    firstSample = 50000
    numSamples = 10
    rate, audio = wavread(inputFile)
    return audio[firstSample:firstSample + numSamples]
def segmentStableNotesRegions(inputFile = '../../sounds/sax-phrase-short.wav', stdThsld=10, minNoteDur=0.1,
                              winStable = 3, window='hamming', M=1024, N=2048, H=256, f0et=5.0, t=-100,
                              minf0=310, maxf0=650):
    """
    Function to segment the stable note regions in an audio signal
    Input:
        inputFile (string): wav file including the path
        stdThsld (float): threshold for detecting stable regions in the f0 contour
        minNoteDur (float): minimum allowed segment length (note duration)
        winStable (integer): number of samples used for computing standard deviation
        window (string): analysis window
        M (integer): window size used for computing f0 contour
        N (integer): FFT size used for computing f0 contour
        H (integer): Hop size used for computing f0 contour
        f0et (float): error threshold used for the f0 computation
        t (float): magnitude threshold in dB used in spectral peak picking
        minf0 (float): minimum fundamental frequency in Hz
        maxf0 (float): maximum fundamental frequency in Hz
    Output:
        segments (np.ndarray): Numpy array containing starting and ending frame
            indices of every segment.
    """
    fs, x = UF.wavread(inputFile)                               # reading inputFile
    w = get_window(window, M)                                   # obtaining analysis window
    f0 = HM.f0Detection(x, fs, w, N, H, t, minf0, maxf0, f0et)  # estimating F0

    # 1. Hz -> cents relative to 55 Hz; values below eps are raised to eps
    #    first so the logarithm stays finite
    f0[f0 < eps] = eps
    f0_cents = 1200 * np.log2(f0 / 55.0)

    # 2. standard deviation over the winStable frames that end at each frame
    lastFrames = np.arange(winStable - 1, len(f0_cents))
    sds = np.array([np.std(f0_cents[end + 1 - winStable:end + 1]) for end in lastFrames])

    # 3. frames whose deviation is under the threshold (the offset maps
    #    positions in sds back to frame indices)
    stableF0Indices = winStable - 1 + np.where(sds < stdThsld)[0]

    # 4. consecutive stable frames form one segment
    runs = groupConsecutiveRuns(stableF0Indices)

    # 5. keep the runs that are long enough, reduced to [first, last] frame
    minNoteDurFrames = int(minNoteDur * fs / H)
    segments = np.array([[run[0], run[-1]] for run in runs if len(run) >= minNoteDurFrames])
    return segments
def segmentStableNotesRegions(inputFile = '../../sounds/sax-phrase-short.wav', stdThsld=10, minNoteDur=0.1,
                              winStable = 3, window='hamming', M=1024, N=2048, H=256, f0et=5.0, t=-100,
                              minf0=310, maxf0=650):
    """
    Function to segment the stable note regions in an audio signal
    Input:
        inputFile (string): wav file including the path
        stdThsld (float): threshold for detecting stable regions in the f0 contour (in cents)
        minNoteDur (float): minimum allowed segment length (note duration)
        winStable (integer): number of samples used for computing standard deviation
        window (string): analysis window
        M (integer): window size used for computing f0 contour
        N (integer): FFT size used for computing f0 contour
        H (integer): Hop size used for computing f0 contour
        f0et (float): error threshold used for the f0 computation
        t (float): magnitude threshold in dB used in spectral peak picking
        minf0 (float): minimum fundamental frequency in Hz
        maxf0 (float): maximum fundamental frequency in Hz
    Output:
        segments (np.ndarray): Numpy array containing starting and ending frame
            indexes of every segment.
    """
    fs, x = UF.wavread(inputFile)                               # reading inputFile
    w = get_window(window, M)                                   # obtaining analysis window
    f0 = HM.f0Detection(x, fs, w, N, H, t, minf0, maxf0, f0et)  # estimating F0

    # 1. convert f0 values from Hz to cents (relative to 55 Hz)
    f0Cents = 1200. * np.log2(f0 / 55.)

    # 2./3. A frame is stable when the standard deviation of the winStable
    # frames ending at it is below the threshold. The loop now includes the
    # final frame, which was never evaluated before.
    numFrames = len(f0)
    stable = np.zeros(numFrames, dtype=bool)
    for i in range(winStable - 1, numFrames):
        stable[i] = np.std(f0Cents[i - winStable + 1:i + 1]) < stdThsld

    # 4. group consecutive stable frames into [start, end] segments
    segments = []
    start = None
    for i in range(numFrames):
        if stable[i]:
            if start is None:
                start = i
        elif start is not None:
            segments.append([start, i - 1])
            start = None
    # a run that reaches the last frame must be closed too; it used to be dropped
    if start is not None:
        segments.append([start, numFrames - 1])

    # 5. remove segments shorter than minNoteDur
    # NOTE(review): the comparison uses end - start (frame count minus one),
    # unchanged from before - confirm that is the intended length measure.
    minNoteDurFrames = 1. * fs * minNoteDur / H
    segments = np.array([seg for seg in segments if seg[1] - seg[0] >= minNoteDurFrames])
    return segments
def main(inputFile='../../sounds/sax-phrase-short.wav', window='blackman', M=601, N=1024, t=-100,
         minSineDur=0.1, nH=100, minf0=350, maxf0=700, f0et=5, harmDevSlope=0.01, stocf=0.1):
    """
    Analyse a sound with the harmonic plus stochastic model, resynthesize it, write the
    harmonic, stochastic and summed sounds to 'output_sounds/' and plot the result.

    inputFile: input sound file (monophonic with sampling rate of 44100)
    window: analysis window type (rectangular, hanning, hamming, blackman, blackmanharris)
    M: analysis window size; N: fft size (power of two, bigger or equal than M)
    t: magnitude threshold of spectral peaks; minSineDur: minimum duration of sinusoidal tracks
    nH: maximum number of harmonics; minf0: minimum fundamental frequency in sound
    maxf0: maximum fundamental frequency in sound; f0et: maximum error accepted in f0 detection algorithm
    harmDevSlope: allowed deviation of harmonic tracks, higher harmonics have higher allowed deviation
    stocf: decimation factor used for the stochastic approximation
    """
    # size of fft used in synthesis
    Ns = 512

    # hop size (has to be 1/4 of Ns)
    H = 128

    # read input sound
    (fs, x) = UF.wavread(inputFile)

    # compute analysis window
    w = get_window(window, M)

    # compute the harmonic plus stochastic model of the whole sound
    hfreq, hmag, hphase, stocEnv = HPS.hpsModelAnal(x, fs, w, N, H, t, nH, minf0, maxf0, f0et,
                                                    harmDevSlope, minSineDur, Ns, stocf)

    # synthesize a sound from the harmonic plus stochastic representation
    y, yh, yst = HPS.hpsModelSynth(hfreq, hmag, hphase, stocEnv, Ns, H, fs)

    # output sound file (monophonic with sampling rate of 44100)
    outputFileSines = 'output_sounds/' + os.path.basename(inputFile)[:-4] + '_hpsModel_sines.wav'
    outputFileStochastic = 'output_sounds/' + os.path.basename(inputFile)[:-4] + '_hpsModel_stochastic.wav'
    outputFile = 'output_sounds/' + os.path.basename(inputFile)[:-4] + '_hpsModel.wav'

    # write sounds files for harmonics, stochastic, and the sum
    UF.wavwrite(yh, fs, outputFileSines)
    UF.wavwrite(yst, fs, outputFileStochastic)
    UF.wavwrite(y, fs, outputFile)

    # create figure to plot
    plt.figure(figsize=(12, 9))

    # frequency range to plot
    maxplotfreq = 15000.0

    # plot the input sound
    plt.subplot(3,1,1)
    plt.plot(np.arange(x.size)/float(fs), x)
    plt.axis([0, x.size/float(fs), min(x), max(x)])
    plt.ylabel('amplitude')
    plt.xlabel('time (sec)')
    plt.title('input sound: x')

    # plot spectrogram stochastic component
    plt.subplot(3,1,2)
    numFrames = int(stocEnv[:,0].size)
    sizeEnv = int(stocEnv[0,:].size)
    frmTime = H*np.arange(numFrames)/float(fs)
    # The highest envelope bin to show must be an integer before it is used as a slice
    # index; it is also clamped to the envelope size so that the frequency axis and the
    # plotted columns always have the same length.
    maxplotbin = min(int(sizeEnv*maxplotfreq/(.5*fs)), sizeEnv - 1)
    binFreq = (.5*fs)*np.arange(maxplotbin + 1)/sizeEnv
    plt.pcolormesh(frmTime, binFreq, np.transpose(stocEnv[:,:maxplotbin + 1]))
    plt.autoscale(tight=True)

    # plot harmonic on top of stochastic spectrogram
    if (hfreq.shape[1] > 0):
        harms = hfreq*np.less(hfreq,maxplotfreq)
        harms[harms==0] = np.nan          # NaN values are not drawn
        numFrames = harms.shape[0]
        frmTime = H*np.arange(numFrames)/float(fs)
        plt.plot(frmTime, harms, color='k', ms=3, alpha=1)
        plt.xlabel('time (sec)')
        plt.ylabel('frequency (Hz)')
        plt.autoscale(tight=True)
        plt.title('harmonics + stochastic spectrogram')

    # plot the output sound
    plt.subplot(3,1,3)
    plt.plot(np.arange(y.size)/float(fs), y)
    plt.axis([0, y.size/float(fs), min(y), max(y)])
    plt.ylabel('amplitude')
    plt.xlabel('time (sec)')
    plt.title('output sound: y')

    plt.tight_layout()
    plt.show()
mXenv = resample(np.maximum(-200, mX), mX.size*stocf) # decimate the mag spectrum pX = np.angle(X[:hN]) #-----synthesis----- mY = resample(mXenv, hN) # interpolate to original size pY = 2*np.pi*np.random.rand(hN) # generate phase random values Y = np.zeros(N, dtype = complex) Y[:hN] = 10**(mY/20) * np.exp(1j*pY) # generate positive freq. Y[hN:] = 10**(mY[-2:0:-1]/20) * np.exp(-1j*pY[-2:0:-1]) # generate negative freq. fftbuffer = np.real( ifft(Y) ) # inverse FFT y = fftbuffer*N/2 return mX, pX, mY, pY, y # example call of stochasticModel function if __name__ == '__main__': (fs, x) = UF.wavread('../../../sounds/ocean.wav') w = np.hanning(1024) N = 1024 stocf = 0.2 maxFreq = 10000.0 lastbin = N*maxFreq/fs first = 1000 last = first+w.size mX, pX, mY, pY, y = stochasticModelFrame(x[first:last], w, N, stocf) plt.figure(1, figsize=(9, 7)) plt.subplot(4,1,1) plt.plot(np.arange(first, last)/float(fs), x[first:last]) plt.axis([first/float(fs), last/float(fs), min(x[first:last]), max(x[first:last])]) plt.title('x (ocean.wav)') plt.subplot(4,1,2)
# Script fragment: analyse a sound with HPR.hprModelAnal, transform the harmonics with
# HT.harmonicFreqScaling and resynthesize with HPR.hprModelSynth.

# input sound and analysis parameters passed to the calls below
inputFile = '../../../sounds/flute-A4.wav'
window = 'blackman'
M = 801
N = 2048
t = -90
minSineDur = 0.1
nH = 40
minf0 = 350
maxf0 = 700
f0et = 8
harmDevSlope = 0.1

# synthesis parameters (H is Ns / 4)
Ns = 512
H = 128

# read the sound and build the analysis window of size M
(fs, x) = UF.wavread(inputFile)
w = get_window(window, M)

# analysis: harmonic frequencies, magnitudes, phases and the residual signal xr
hfreq, hmag, hphase, xr = HPR.hprModelAnal(x, fs, w, N, H, t, minSineDur, nH, minf0, maxf0, f0et, harmDevSlope)

# spectrogram of the residual
mXr, pXr = STFT.stftAnal(xr, fs, w, N, H)

# transformation envelopes
# NOTE(review): presumably flat lists of (time, factor) pairs as expected by
# HT.harmonicFreqScaling -- confirm against that function.
freqScaling = np.array([0, 1.5, 1, 1.5])
freqStretching = np.array([0, 1.1, 1, 1.1])
timbrePreservation = 1
hfreqt, hmagt = HT.harmonicFreqScaling(hfreq, hmag, freqScaling, freqStretching, timbrePreservation, fs)

# synthesis from the transformed harmonics plus the residual; an empty array is passed
# in the phase position
y, yh = HPR.hprModelSynth(hfreqt, hmagt, np.array([]), xr, Ns, H, fs)
# Script fragment: morph two sounds with STFTT.stftMorph and compute the spectrograms
# used for plotting.
import numpy as np
import time, os, sys
from scipy.signal import hamming, resample
import matplotlib.pyplot as plt

# make the project's models/ and transformations/ directories importable
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../../software/models/'))
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../../software/transformations/'))

import dftModel as DFT
import utilFunctions as UF
import stftTransformations as STFTT
import stochasticModel as STOC
import math
import stft as STFT

# read both sounds; fs is overwritten by the second read
# NOTE(review): this assumes both files share the same sampling rate -- confirm.
(fs, x1) = UF.wavread('../../../sounds/orchestra.wav')
(fs, x2) = UF.wavread('../../../sounds/speech-male.wav')

# analysis windows, FFT sizes and hop size of the first sound
w1 = np.hamming(1024)
N1 = 1024
H1 = 256
w2 = np.hamming(1024)
N2 = 1024

# parameters handed to stftMorph
smoothf = .2
balancef = 0.5
y = STFTT.stftMorph(x1, x2, fs, w1, N1, w2, N2, H1, smoothf, balancef)

# L: number of whole hops of size H1 in x1; H2: hop size that splits x2 into L hops
L = int(x1.size/H1)
H2 = int(x2.size/L)
mX2 = STOC.stochasticModelAnal(x2,H2,H2*2, smoothf)

# spectrograms of the first input and of the morphed output
mX,pX = STFT.stftAnal(x1, fs, w1, N1, H1)
mY,pY = STFT.stftAnal(y, fs, w1, N1, H1)

# upper frequency limit (Hz), presumably for the plots that follow
maxplotfreq = 10000.0
def estimateInharmonicity(inputFile='../../sounds/piano.wav', t1=0.1, t2=0.5, window='hamming',
                          M=2048, N=2048, H=128, f0et=5.0, t=-90, minf0=130, maxf0=180, nH=10):
    """
    Function to estimate the extent of inharmonicity present in a sound

    Input:
        inputFile (string): wav file including the path
        t1 (float): start time of the segment considered for computing inharmonicity
        t2 (float): end time of the segment considered for computing inharmonicity
        window (string): analysis window
        M (integer): window size used for computing f0 contour
        N (integer): FFT size used for computing f0 contour
        H (integer): Hop size used for computing f0 contour
        f0et (float): error threshold used for the f0 computation
        t (float): magnitude threshold in dB used in spectral peak picking
        minf0 (float): minimum fundamental frequency in Hz
        maxf0 (float): maximum fundamental frequency in Hz
        nH (integer): number of harmonics considered for computing inharmonicity
    Output:
        meanInharm (float or np.float): mean inharmonicity over all the frames between the time
            interval t1 and t2; 0.0 when the interval contains no frame.
    """
    # 0. Read the audio file and obtain an analysis window
    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)

    # 1. Use harmonic model to compute the harmonic frequencies and magnitudes
    xhfreq, xhmag, xhphase = HM.harmonicModelAnal(x, fs, w, N, H, t, nH, minf0, maxf0, f0et,
                                                  harmDevSlope=0.01, minSineDur=0.0)

    # 2. Extract the time segment in which you need to compute the inharmonicity.
    interval_start = int(math.ceil(t1 * fs / float(H)))
    interval_end = int(math.ceil(t2 * fs / float(H)))

    # 3. Compute the mean inharmonicity of the segment
    #    The reference fundamental comes from a separate f0 detection pass.
    f0 = HM.f0Detection(x, fs, w, N, H, t, minf0, maxf0, f0et)
    f0_slice = np.asarray(f0[interval_start:interval_end], dtype=float)
    sliced = np.asarray(xhfreq[interval_start:interval_end], dtype=float)

    # Only frames present in both contours can be evaluated (slices are clipped at the end
    # of the sound, so the real frame count may be smaller than interval_end - interval_start).
    n_frames = min(f0_slice.shape[0], sliced.shape[0])
    if n_frames == 0:
        return 0.0
    f0_slice = f0_slice[:n_frames]
    sliced = sliced[:n_frames]

    # deviation of every harmonic r from r * f0, normalised by r
    harm_number = np.arange(1, sliced.shape[1] + 1, dtype=float)
    deviation = np.abs(sliced - np.outer(f0_slice, harm_number)) / harm_number
    deviation[:, 0] = 0.0          # the fundamental itself is not part of the sum
    deviation[sliced <= 0] = 0.0   # harmonics with no detected frequency carry no information

    # one value per frame (not per frame*harmonic), scaled by 1/nH
    inharmon = deviation.sum(axis=1) / float(nH)

    # average over the number of frames actually summed
    mean_inharmon = inharmon.sum() / n_frames
    return mean_inharmon
def sineModelAnalEnhanced(inputFile='../../sounds/sines-440-602-transient.wav'):
    """
    Frame-by-frame estimation of two sinusoidal components, keeping only spectral peaks
    accepted by selectFlatPhasePeak(), followed by a plot of the estimated tracks.

    Input:
        inputFile (string): wav file including the path
    Output:
        tStamps: A Kx1 numpy array of time stamps at which the frequency components were estimated
        tfreq: A Kx2 numpy array of frequency values, one column per component
               (an empty array when the sound is too short to hold a single frame)
    """
    phaseDevThres = 1e-2                 # Allowed deviation in phase
    M = 2047                             # window size
    N = 4096                             # FFT size
    t = -80                              # threshold in negative dB
    H = 128                              # hop-size
    window = 'blackman'                  # window type

    fs, x = UF.wavread(inputFile)        # Read input file
    w = get_window(window, M)            # Get the window
    hM1 = int(np.floor((w.size + 1) / 2))    # half analysis window size by rounding
    hM2 = int(np.floor(w.size / 2))          # half analysis window size by floor
    x = np.append(np.zeros(hM2), x)      # add zeros at beginning to center first window at sample 0
    x = np.append(x, np.zeros(hM2))      # add zeros at the end to analyze last sample
    pin = hM1                            # initialize sound pointer in middle of analysis window
    pend = x.size - hM1                  # last sample to start a frame
    tStamps = np.arange(pin, pend, H) / float(fs)   # Generate time stamps
    w = w / sum(w)                       # normalize analysis window

    # One [f1, f2] row per frame is collected in a list and converted once at the end; this
    # avoids re-copying the whole track on every frame and gives a K x 2 array even for K = 1.
    frames = []
    while pin < pend:                    # while input sound pointer is within sound
        x1 = x[pin - hM1:pin + hM2]      # select frame
        mX, pX = SM.DFT.dftAnal(x1, w, N)    # compute dft
        ploc = UF.peakDetection(mX, t)   # detect locations of peaks

        ###### CODE DIFFERENT FROM sineModelAnal() #########
        # Phase based mainlobe tracking
        plocSelMask = np.zeros(len(ploc))
        for pindex, p in enumerate(ploc):
            if p > 2 and p < (len(pX) - 2):
                # keep the peak only if selectFlatPhasePeak() accepts it
                if selectFlatPhasePeak(pX, p, phaseDevThres):
                    plocSelMask[pindex] = 1
            else:
                # peaks within two bins of either end of the spectrum are kept untested
                plocSelMask[pindex] = 1
        plocSel = ploc[plocSelMask.nonzero()[0]]    # Select the ones chosen
        if len(plocSel) != 2:            # Ignoring frames that don't return two selected peaks
            ipfreq = [0.0, 0.0]
        else:
            # Only selected peaks to refine peak values by interpolation
            iploc, ipmag, ipphase = UF.peakInterp(mX, pX, plocSel)
            ipfreq = fs * iploc / float(N)   # convert peak locations to Hertz
        ###### CODE DIFFERENT FROM sineModelAnal() #########

        frames.append(ipfreq)
        pin += H
    tfreq = np.array(frames, dtype=float) if frames else np.array([])

    # Plot the estimated frequency tracks
    mXs, pXs = stft.stftAnal(x, w, N, H)
    maxplotfreq = 1500.0
    # the bin limit must be an integer to be usable as a slice index
    maxplotbin = min(int(N * maxplotfreq / fs), mXs.shape[1] - 1)
    binFreq = fs * np.arange(maxplotbin + 1) / float(N)
    numFrames = int(mXs[:, 0].size)
    frmTime = H * np.arange(numFrames) / float(fs)
    plt.pcolormesh(frmTime, binFreq, np.transpose(mXs[:, :maxplotbin + 1]), cmap='hot_r')
    if tfreq.ndim == 2:                  # nothing to draw when no frame was analysed
        plt.plot(tStamps, tfreq[:, 0], color='y', linewidth=2.0)
        plt.plot(tStamps, tfreq[:, 1], color='c', linewidth=2.0)
        plt.legend(('Estimated f1', 'Estimated f2'))
    plt.xlabel('Time (s)')
    plt.ylabel('Frequency (Hz)')
    plt.autoscale(tight=True)
    plt.show()
    return tStamps, tfreq
# Script fragment: STFT and sinusoidal-model analysis of a flute sound, followed by the
# set-up of the axes for a plot limited to maxplotfreq.
import numpy as np
import matplotlib.pyplot as plt
import sys, os, time

# make the project's models/ directory importable
sys.path.append(
    os.path.join(os.path.dirname(os.path.realpath(__file__)),
                 '../../../software/models/'))

import stft as STFT
import sineModel as SM
import utilFunctions as UF

# the sound path is resolved relative to this file, not to the working directory
(fs, x) = UF.wavread(
    os.path.join(os.path.dirname(os.path.realpath(__file__)),
                 '../../../sounds/flute-A4.wav'))

# analysis parameters: 601-sample blackman window, FFT size, hop size, peak threshold
w = np.blackman(601)
N = 1024
H = 150
t = -80
minSineDur = .1
maxnSines = 150

# spectrogram and sinusoidal tracks of the same sound
mX, pX = STFT.stftAnal(x, w, N, H)
tfreq, tmag, tphase = SM.sineModelAnal(x, fs, w, N, H, t, maxnSines, minSineDur)

plt.figure(1, figsize=(9.5, 5))
maxplotfreq = 5000.0
maxplotbin = int(N * maxplotfreq / fs)           # FFT bin corresponding to maxplotfreq
numFrames = int(mX[:, 0].size)                   # number of rows of mX
frmTime = H * np.arange(numFrames) / float(fs)   # frame index -> seconds
binFreq = np.arange(maxplotbin + 1) * float(fs) / N   # bin index -> Hz, for bins 0..maxplotbin
def estimateInharmonicity(inputFile='../../sounds/piano.wav', t1=0.1, t2=0.5, window='hamming',
                          M=2048, N=2048, H=128, f0et=5.0, t=-90, minf0=130, maxf0=180, nH=10):
    """
    Function to estimate the extent of inharmonicity present in a sound

    Input:
        inputFile (string): wav file including the path
        t1 (float): start time of the segment considered for computing inharmonicity
        t2 (float): end time of the segment considered for computing inharmonicity
        window (string): analysis window
        M (integer): window size used for computing f0 contour
        N (integer): FFT size used for computing f0 contour
        H (integer): Hop size used for computing f0 contour
        f0et (float): error threshold used for the f0 computation
        t (float): magnitude threshold in dB used in spectral peak picking
        minf0 (float): minimum fundamental frequency in Hz
        maxf0 (float): maximum fundamental frequency in Hz
        nH (integer): number of harmonics considered for computing inharmonicity
    Output:
        meanInharm (float or np.float): mean inharmonicity over all the frames between the time
            interval t1 and t2; 0.0 when the interval contains no frame.
    """
    # 0. Read the audio file and obtain an analysis window
    fs, x = UF.wavread(inputFile)    # reading inputFile
    w = get_window(window, M)        # obtaining analysis window

    # 1. Use harmonic model to compute the harmonic frequencies and magnitudes
    xhfreq, xhmag, xhphase = HM.harmonicModelAnal(x, fs, w, N, H, t, nH, minf0, maxf0, f0et,
                                                  harmDevSlope=0.01, minSineDur=0.0)

    # 2. Extract the time segment in which you need to compute the inharmonicity.
    lt1 = int(np.ceil(fs * t1 / float(H)))
    lt2 = int(np.floor(fs * t2 / float(H)))
    xSeg = np.asarray(xhfreq[lt1:lt2], dtype=float)

    # An empty segment (t2 <= t1, or an interval beyond the end of the sound) has no
    # inharmonicity to report; returning here also avoids dividing by zero below.
    nFrames = xSeg.shape[0]
    if nFrames == 0:
        return 0.0

    # 3. Compute the mean inharmonicity of the segment
    #    Column 0 holds the fundamental; harmonic number of column r is r + 1.
    harmNumber = np.arange(1, xSeg.shape[1] + 1, dtype=float)
    deviation = np.abs(xSeg - xSeg[:, :1] * harmNumber) / harmNumber

    # Only detected harmonics above the fundamental contribute. The fundamental column is
    # excluded explicitly, so frames with no detected harmonic at all (silence) simply
    # yield zero instead of raising while trying to drop a non-existent entry.
    counted = xSeg > 0.0
    counted[:, 0] = False
    counted[xSeg[:, 0] <= 0.0, :] = False     # no fundamental -> nothing to compare against
    I = np.where(counted, deviation, 0.0).sum(axis=1) / float(nH)

    # average over the frames actually present in the segment
    meanInharm = np.sum(I) / nFrames
    return meanInharm
# Script fragment: DFT of one frame of an oboe sound, spectral peak detection and
# interpolation, and the first panel of the resulting plot.
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import hamming, triang, blackmanharris
import math
import sys, os, functools, time

# make the project's models/ directory importable
sys.path.append(
    os.path.join(os.path.dirname(os.path.realpath(__file__)),
                 '../../../software/models/'))

import dftModel as DFT
import utilFunctions as UF

(fs, x) = UF.wavread('../../../sounds/oboe-A4.wav')
N = 512 * 2
M = 511
t = -60
w = np.hamming(M)

# Sample positions and half sizes are kept as integers: a float cannot be used as a
# slice index, and true division would turn hN / hM into floats on Python 3.
start = int(.8 * fs)
hN = N // 2
hM = (M + 1) // 2

# frame of M samples starting 0.8 s into the sound
x1 = x[start:start + M]
mX, pX = DFT.dftAnal(x1, w, N)

# peak locations above threshold t, then refined location / magnitude / phase
ploc = UF.peakDetection(mX, t)
iploc, ipmag, ipphase = UF.peakInterp(mX, pX, ploc)
pmag = mX[ploc]

# frequency in Hz of every bin of mX
freqaxis = fs * np.arange(mX.size) / float(N)

plt.figure(1, figsize=(9, 6))
plt.subplot(2, 1, 1)
plt.plot(freqaxis, mX, 'r', lw=1.5)
def main(inputFile='../../sounds/bendir.wav', window='hamming', M=2001, N=2048, t=-80,
         minSineDur=0.02, maxnSines=150, freqDevOffset=10, freqDevSlope=0.001):
    """
    Run the sinusoidal plus residual analysis/synthesis on a sound, write the sinusoidal,
    residual and summed sounds to 'output_sounds/' and plot input, residual spectrogram
    with sinusoidal tracks, and output.

    inputFile: input sound file (monophonic with sampling rate of 44100)
    window: analysis window type (rectangular, hanning, hamming, blackman, blackmanharris)
    M: analysis window size
    N: fft size (power of two, bigger or equal than M)
    t: magnitude threshold of spectral peaks
    minSineDur: minimum duration of sinusoidal tracks
    maxnSines: maximum number of parallel sinusoids
    freqDevOffset: frequency deviation allowed in the sinusoids from frame to frame at frequency 0
    freqDevSlope: slope of the frequency deviation, higher frequencies have bigger deviation
    """
    Ns = 512   # size of fft used in synthesis
    H = 128    # hop size (has to be 1/4 of Ns)

    def plotWaveform(position, signal, label):
        # one time-domain panel: waveform against seconds, axes fitted to the data
        duration = signal.size / float(fs)
        plt.subplot(3, 1, position)
        plt.plot(np.arange(signal.size) / float(fs), signal)
        plt.axis([0, duration, min(signal), max(signal)])
        plt.ylabel('amplitude')
        plt.xlabel('time (sec)')
        plt.title(label)

    # read the sound and build the analysis window
    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)

    # sinusoidal plus residual analysis, residual spectrogram, and resynthesis
    tfreq, tmag, tphase, xr = SPR.sprModelAnal(x, fs, w, N, H, t, minSineDur, maxnSines,
                                               freqDevOffset, freqDevSlope)
    mXr, pXr = STFT.stftAnal(xr, w, N, H)
    y, ys = SPR.sprModelSynth(tfreq, tmag, tphase, xr, Ns, H, fs)

    # write sinusoidal, residual and summed sounds (in that order)
    outputStem = 'output_sounds/' + os.path.basename(inputFile)[:-4]
    outputs = (
        (ys, '_sprModel_sines.wav'),
        (xr, '_sprModel_residual.wav'),
        (y, '_sprModel.wav'),
    )
    for signal, suffix in outputs:
        UF.wavwrite(signal, fs, outputStem + suffix)

    plt.figure(figsize=(12, 9))
    maxplotfreq = 5000.0   # frequency range to plot

    # panel 1: input sound
    plotWaveform(1, x, 'input sound: x')

    # panel 2: magnitude spectrogram of the residual up to maxplotfreq
    plt.subplot(3, 1, 2)
    maxplotbin = int(N * maxplotfreq / fs)
    frameCount = int(mXr[:, 0].size)
    frmTime = H * np.arange(frameCount) / float(fs)
    binFreq = np.arange(maxplotbin + 1) * float(fs) / N
    plt.pcolormesh(frmTime, binFreq, np.transpose(mXr[:, :maxplotbin + 1]))
    plt.autoscale(tight=True)

    # sinusoidal tracks on top; values outside the plotted range become NaN (not drawn)
    if tfreq.shape[1] > 0:
        tracks = tfreq * np.less(tfreq, maxplotfreq)
        tracks[tracks <= 0] = np.nan
        plt.plot(frmTime, tracks, color='k')
        plt.title('sinusoidal tracks + residual spectrogram')
        plt.autoscale(tight=True)

    # panel 3: output sound
    plotWaveform(3, y, 'output sound: y')

    plt.tight_layout()
    plt.ion()
    plt.show()
# Script: magnitude and unwrapped phase spectrum of one hamming-windowed frame of a
# soprano sound.
import numpy as np
import sys, os, math
from scipy.fftpack import fft

# make the project's models/ directory importable
sys.path.append(
    os.path.join(os.path.dirname(os.path.realpath(__file__)),
                 '../software/models/'))

import utilFunctions as UF

M = 501                               # frame / window length in samples
hM1 = int(math.floor((M + 1) / 2))    # floor((M + 1) / 2)
hM2 = int(math.floor(M / 2))          # floor(M / 2)

(fs, x) = UF.wavread('../sounds/soprano-E4.wav')

# M samples starting at sample 5000, multiplied by a hamming window
x1 = x[5000:5000 + M] * np.hamming(M)

N = 1024                              # FFT size
fftbuffer = np.zeros(N)
# the samples from index hM2 onwards go to the start of the buffer and the first hM2
# samples go to its end; the positions in between stay zero
fftbuffer[:hM1] = x1[hM2:]
fftbuffer[N - hM2:] = x1[:hM2]

X = fft(fftbuffer)
mX = 20 * np.log10(abs(X))            # magnitude in dB
pX = np.unwrap(np.angle(X))           # unwrapped phase in radians
def main(inputFile='../../sounds/piano.wav', window='blackman', M=511, N=1024, time=.2):
    """
    Compute the DFT of one fragment of a sound, resynthesize it with the inverse DFT and
    plot the fragment, its magnitude and phase spectra, and the resynthesized fragment.

    inputFile: input sound file (monophonic with sampling rate of 44100)
    window: analysis window type (choice of rectangular, hanning, hamming, blackman, blackmanharris)
    M: analysis window size (odd integer value)
    N: fft size (power of two, bigger or equal than M)
    time: time to start analysis (in seconds)

    Raises ValueError when the M-sample fragment starting at `time` does not fit in the sound.
    """
    # read input sound (monophonic with sampling rate of 44100)
    fs, x = UF.wavread(inputFile)

    # compute analysis window
    w = get_window(window, M)

    # get a fragment of the input sound of size M
    sample = int(time * fs)
    # A fragment ending exactly on the last sample (sample + M == x.size) is still complete,
    # so only fragments that would run past the end are rejected.
    if sample + M > x.size or sample < 0:
        raise ValueError("Time outside sound boundaries")
    x1 = x[sample:sample + M]

    # compute the dft of the sound fragment
    mX, pX = DFT.dftAnal(x1, w, N)

    # compute the inverse dft of the spectrum; multiplied by sum(w)
    # NOTE(review): presumably this undoes a window normalisation done in dftAnal -- confirm
    y = DFT.dftSynth(mX, pX, w.size) * sum(w)

    # create figure
    plt.figure(figsize=(12, 9))

    # plot the sound fragment
    plt.subplot(4, 1, 1)
    plt.plot(time + np.arange(M) / float(fs), x1)
    plt.axis([time, time + M / float(fs), min(x1), max(x1)])
    plt.ylabel('amplitude')
    plt.xlabel('time (sec)')
    plt.title('input sound: x')

    # plot the magnitude spectrum
    plt.subplot(4, 1, 2)
    plt.plot(float(fs) * np.arange(mX.size) / float(N), mX, 'r')
    plt.axis([0, fs / 2.0, min(mX), max(mX)])
    plt.title('magnitude spectrum: mX')
    plt.ylabel('amplitude (dB)')
    plt.xlabel('frequency (Hz)')

    # plot the phase spectrum
    plt.subplot(4, 1, 3)
    plt.plot(float(fs) * np.arange(pX.size) / float(N), pX, 'c')
    plt.axis([0, fs / 2.0, min(pX), max(pX)])
    plt.title('phase spectrum: pX')
    plt.ylabel('phase (radians)')
    plt.xlabel('frequency (Hz)')

    # plot the sound resulting from the inverse dft
    plt.subplot(4, 1, 4)
    plt.plot(time + np.arange(M) / float(fs), y)
    plt.axis([time, time + M / float(fs), min(y), max(y)])
    plt.ylabel('amplitude')
    plt.xlabel('time (sec)')
    plt.title('output sound: y')

    plt.tight_layout()
    plt.ion()
    plt.show()
# Script fragment: filter a sound in the STFT domain with STFTT.stftFiltering and compute
# the spectrograms of the input and of the filtered output for plotting.
import numpy as np
import time, os, sys
import matplotlib.pyplot as plt

# make the project's models/ and transformations/ directories importable
sys.path.append(
    os.path.join(os.path.dirname(os.path.realpath(__file__)),
                 '../../../software/models/'))
sys.path.append(
    os.path.join(os.path.dirname(os.path.realpath(__file__)),
                 '../../../software/transformations/'))

import utilFunctions as UF
import stftTransformations as STFTT
import stft as STFT

(fs, x) = UF.wavread('../../../sounds/orchestra.wav')
w = np.hamming(2048)
N = 2048
H = 512

# design the filter (values in dB): -60 everywhere, except for a hanning-shaped band
# starting at the bin of 500 Hz and spanning the number of bins equivalent to 2000 Hz,
# which rises from -60 up to +5
startBin = int(N * 500.0 / fs)
nBins = int(N * 2000.0 / fs)
bandpass = (np.hanning(nBins) * 65.0) - 60
# the array length must be an integer, hence floor division
filt = np.zeros(N // 2) - 60
filt[startBin:startBin + nBins] = bandpass

y = STFTT.stftFiltering(x, fs, w, N, H, filt)

# spectrograms of input and output
mX, pX = STFT.stftAnal(x, fs, w, N, H)
mY, pY = STFT.stftAnal(y, fs, w, N, H)

plt.figure(1, figsize=(12, 9))
plt.subplot(311)
numFrames = int(mX[:, 0].size)
frmTime = H * np.arange(numFrames) / float(fs)   # frame index -> seconds
# Script: detect and interpolate the spectral peaks of one frame of a sound and plot them
# on top of the magnitude spectrum.
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import get_window
import sys, os

# NOTE: the second argument is an absolute path, so os.path.join() discards the first one
# and the appended entry is simply "/home/pvardanis/sms-tools/software/models/". The
# script therefore only runs on a machine where that directory exists.
sys.path.append(
    os.path.join(os.path.dirname(os.path.realpath(__file__)),
                 "/home/pvardanis/sms-tools/software/models/"))

import dftModel as DFT
import utilFunctions as UF

(fs, x) = UF.wavread("/home/pvardanis/sms-tools/sounds/sine-440.wav")

M = 501      # window size
N = 2048     # FFT size
t = -20      # threshold passed to the peak detection
w = get_window('hamming', M)

# frame of M samples starting 0.8 s into the sound
x1 = x[int(.8 * fs):int(.8 * fs + M)]
mX, pX = DFT.dftAnal(x1, w, N)

# peak locations, then refined location / magnitude / phase
ploc = UF.peakDetection(mX, t)
iploc, ipmag, ipphase = UF.peakInterp(mX, pX, ploc)
pmag = mX[ploc]

# frequency in Hz of bins 0 .. N/2
freqaxis = fs * np.arange(N / 2 + 1) / float(N)

# spectrum as a line, interpolated peaks as 'x' markers without connecting line
plt.plot(freqaxis, mX)
plt.plot(fs * iploc / float(N), ipmag, marker='x', linestyle='')
plt.show()
# Script fragment: imports, input sound and parameter set-up; the code that uses these
# values is not part of this fragment.
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import hamming, triang, blackmanharris
import math
from scipy.fftpack import fft, ifft, fftshift
import sys, os, functools, time

# make the project's models/ directory importable
sys.path.append(
    os.path.join(os.path.dirname(os.path.realpath(__file__)),
                 '../../../software/models/'))

import dftModel as DFT
import utilFunctions as UF
import harmonicModel as HM

(fs, x) = UF.wavread('../../../sounds/flute-A4.wav')
pos = int(.8 * fs)        # sample index 0.8 s into the sound

# analysis window: M-sample hamming, with its two integer half sizes
M = 601
hM1 = (M + 1) // 2
hM2 = M // 2
w = np.hamming(M)

# parameter values; names match those used by the harmonic-model functions elsewhere in
# this file (FFT size, peak threshold, number of harmonics, f0 range, f0 error threshold)
N = 1024
t = -100
nH = 40
minf0 = 420
maxf0 = 460
f0et = 5
maxnpeaksTwm = 5
minSineDur = .1
harmDevSlope = 0.01

# synthesis FFT size and hop size (a quarter of Ns)
Ns = 512
H = Ns // 4
def extractHarmSpec(inputFile='../../sounds/piano.wav', window='hamming', M=1024, N=1024, H=512):
    """
    analysis/synthesis using the STFT: compute the magnitude and phase spectrograms of a
    sound, resynthesize it, write the result to 'output_sounds/' and plot everything.

    inputFile: input sound file (monophonic with sampling rate of 44100)
    window: analysis window type (choice of rectangular, hanning, hamming, blackman, blackmanharris)
    M: analysis window size
    N: fft size (power of two, bigger or equal than M)
    H: hop size (at least 1/2 of analysis window size to have good overlap-add)
    """
    # read input sound (monophonic with sampling rate of 44100)
    fs, x = UF.wavread(inputFile)

    # compute analysis window
    w = get_window(window, M)

    # compute the magnitude and phase spectrogram
    mX, pX = STFT.stftAnal(x, fs, w, N, H)

    # perform the inverse stft
    y = STFT.stftSynth(mX, pX, M, H)

    # output sound file (monophonic with sampling rate of 44100)
    outputFile = 'output_sounds/' + os.path.basename(inputFile)[:-4] + '_stft.wav'

    # write the sound resulting from the inverse stft
    UF.wavwrite(y, fs, outputFile)

    # create figure to plot
    plt.figure(figsize=(12, 9))

    # frequency range to plot
    maxplotfreq = 5000.0

    # Highest bin to show. It has to be an integer to be usable as a slice index, and it is
    # clamped to the available bins so that axes and data always have matching lengths.
    maxplotbin = min(int(N * maxplotfreq / fs), mX.shape[1] - 1)
    binFreq = fs * np.arange(maxplotbin + 1) / float(N)

    # plot the input sound
    plt.subplot(4, 1, 1)
    plt.plot(np.arange(x.size) / float(fs), x)
    plt.axis([0, x.size / float(fs), min(x), max(x)])
    plt.ylabel('amplitude')
    plt.xlabel('time (sec)')
    plt.title('input sound: x')

    # plot magnitude spectrogram
    plt.subplot(4, 1, 2)
    numFrames = int(mX[:, 0].size)
    frmTime = H * np.arange(numFrames) / float(fs)
    plt.pcolormesh(frmTime, binFreq, np.transpose(mX[:, :maxplotbin + 1]))
    plt.xlabel('time (sec)')
    plt.ylabel('frequency (Hz)')
    plt.title('magnitude spectrogram')
    plt.autoscale(tight=True)

    # plot the phase spectrogram (difference between neighbouring bins)
    plt.subplot(4, 1, 3)
    numFrames = int(pX[:, 0].size)
    frmTime = H * np.arange(numFrames) / float(fs)
    # np.diff along the bin axis yields one column less, so the last frequency is dropped
    plt.pcolormesh(frmTime, binFreq[:-1],
                   np.transpose(np.diff(pX[:, :maxplotbin + 1], axis=1)))
    plt.xlabel('time (sec)')
    plt.ylabel('frequency (Hz)')
    plt.title('phase spectrogram (derivative)')
    plt.autoscale(tight=True)

    # plot the output sound
    plt.subplot(4, 1, 4)
    plt.plot(np.arange(y.size) / float(fs), y)
    plt.axis([0, y.size / float(fs), min(y), max(y)])
    plt.ylabel('amplitude')
    plt.xlabel('time (sec)')
    plt.title('output sound: y')

    plt.tight_layout()
    plt.show()
def main(inputFile1='../../sounds/ocean.wav', inputFile2='../../sounds/speech-male.wav',
         window1='hamming', window2='hamming', M1=1024, M2=1024, N1=1024, N2=1024, H1=256,
         smoothf=.5, balancef=0.2):
    """
    Function to perform a morph between two sounds; the morphed sound is written to
    'output_sounds/' and plotted together with the first input.

    inputFile1: name of input sound file to be used as source
    inputFile2: name of input sound file to be used as filter
    window1 and window2: windows for both files
    M1 and M2: window sizes for both files
    N1 and N2: fft sizes for both sounds
    H1: hop size for sound 1 (the one for sound 2 is computed automatically)
    smoothf: smoothing factor to be applied to magnitude spectrum of sound 2 before morphing
    balancef: balance factor between both sounds, 0 is sound 1 and 1 is sound 2
    """
    def showWaveform(position, signal, label):
        # time-domain panel: waveform against seconds, axes fitted to the data
        plt.subplot(4, 1, position)
        plt.plot(np.arange(signal.size) / float(fs), signal)
        plt.axis([0, signal.size / float(fs), min(signal), max(signal)])
        plt.ylabel('amplitude')
        plt.xlabel('time (sec)')
        plt.title(label)

    def showSpectrogram(position, mag, label):
        # magnitude spectrogram panel limited to maxplotfreq
        plt.subplot(4, 1, position)
        frameCount = int(mag[:, 0].size)
        frmTime = H1 * np.arange(frameCount) / float(fs)
        binFreq = fs * np.arange(N1 * maxplotfreq / fs) / N1
        lastBin = int(N1 * maxplotfreq / fs) + 1
        plt.pcolormesh(frmTime, binFreq, np.transpose(mag[:, :lastBin]))
        plt.xlabel('time (sec)')
        plt.ylabel('frequency (Hz)')
        plt.title(label)
        plt.autoscale(tight=True)

    # read input sounds (fs ends up holding the rate of the second file)
    fs, x1 = UF.wavread(inputFile1)
    fs, x2 = UF.wavread(inputFile2)

    # analysis windows for both sounds
    w1 = get_window(window1, M1)
    w2 = get_window(window2, M2)

    # morph, then analyse input 1 and the morphed output for plotting
    y = STFTT.stftMorph(x1, x2, fs, w1, N1, w2, N2, H1, smoothf, balancef)
    mX1, pX1 = STFT.stftAnal(x1, w1, N1, H1)
    mY, pY = STFT.stftAnal(y, w1, N1, H1)

    # write output sound
    stem = os.path.basename(inputFile1)[:-4]
    outputFile = 'output_sounds/' + stem + '_stftMorph.wav'
    UF.wavwrite(y, fs, outputFile)

    plt.figure(figsize=(12, 9))
    maxplotfreq = 10000.0   # frequency range to plot

    showWaveform(1, x1, 'input sound: x')
    showSpectrogram(2, mX1, 'magnitude spectrogram of x')
    showSpectrogram(3, mY, 'magnitude spectrogram of y')
    showWaveform(4, y, 'output sound: y')

    plt.tight_layout()
    plt.show()
def computeODF(inputFile, window, M, N, H):
    """
    Compute an energy-based onset detection function (ODF) in two frequency bands and plot
    it below the spectrogram of the sound.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular, hanning,
            hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd integer value)
        N (integer): fft size (power of two, bigger or equal than M)
        H (integer): hop size for the STFT computation
    Output:
        The function returns a numpy array with two columns, where the first column is the
        ODF computed on the low frequency band and the second column is the ODF computed on
        the high frequency band.
        ODF[:,0]: ODF computed in band 0 < f < 3000 Hz
        ODF[:,1]: ODF computed in band 3000 < f < 10000 Hz
        The first row is always zero because there is no previous frame to compare with.
    """
    # read input sound (monophonic with sampling rate of 44100)
    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)
    mX, pX = stft.stftAnal(x, w, N, H)

    # bin = (f * N) / fs ; both bands exclude their limit frequencies
    lowBins = np.arange(1, int(np.ceil((3000.0 * N) / fs)))
    highBins = np.arange(int(np.floor((3000.0 * N) / fs)) + 1,
                         int(np.ceil((10000.0 * N) / fs)))

    nFrames = mX[:, 0].size               # number of frames
    engEnv = np.zeros((nFrames, 2))       # energy envelope (dB) per frame and band
    ODF = np.zeros((nFrames, 2))          # onset detection function per frame and band

    for band, bins in enumerate((lowBins, highBins)):
        linear = 10.0 ** (mX[:, bins] / 20)          # transform dB to linear
        energy = np.sum(linear ** 2, axis=1)         # energy of every frame in the band
        engEnv[:, band] = 10 * np.log10(energy)      # transform linear to dB
        # half-wave rectified difference between consecutive frames; a comparison with
        # NaN is False, so undefined differences end up as 0
        delta = engEnv[1:, band] - engEnv[:-1, band]
        ODF[1:, band] = np.where(delta > 0.0, delta, 0.0)

    #----plot the spectrum and low/high frequencies energy
    plt.figure(1, figsize=(9.5, 6))

    plt.subplot(211)
    numFrames = int(mX[:, 0].size)
    frmTime = H * np.arange(numFrames) / float(fs)
    binFreq = np.arange(N / 2 + 1) * float(fs) / N
    plt.pcolormesh(frmTime, binFreq, np.transpose(mX))
    plt.title('mX (' + inputFile + '), M=' + str(M) + ', N=' + str(N) + ', H=' + str(H) + '')
    plt.autoscale(tight=True)

    plt.subplot(212)
    # every bar is one hop wide (the previous per-bar width grew from zero with time)
    barWidth = H / float(fs)
    plt.bar(frmTime, ODF[:, 0], width=barWidth, label='ODF low', color='blue')
    plt.bar(frmTime, ODF[:, 1], width=barWidth, label='ODF high', color='green')
    plt.title('ODF low and high (' + inputFile + '), M=' + str(M) + ', N=' + str(N) + ', H=' + str(H) + '')
    plt.autoscale(tight=True)
    plt.tight_layout()
    plt.legend()
    plt.grid(True)
    #plt.savefig('spectrogram.png')
    plt.show()

    return ODF
def segment_stable_notes_monophonic(inputFile='../../sounds/sax-phrase-short.wav',
                                    stdThsld=10, minNoteDur=0.1, winStable=3,
                                    window='hamming', M=1024, N=2048, H=256,
                                    f0et=5.0, t=-100, minf0=310, maxf0=650):
    """
    Function to segment the stable note regions in an audio signal

    A frame is "stable" when the standard deviation of the f0 contour (in
    cents) over the last winStable frames, the frame itself included, is
    below stdThsld. Runs of consecutive stable frames form a segment, and
    segments shorter than minNoteDur are discarded.

    Input:
        inputFile (string): wav file including the path
        stdThsld (float): threshold for detecting stable regions in the f0 contour (in cents)
        minNoteDur (float): minimum allowed segment length (note duration, in seconds)
        winStable (integer): number of samples used for computing standard deviation
        window (string): analysis window
        M (integer): window size used for computing f0 contour
        N (integer): FFT size used for computing f0 contour
        H (integer): Hop size used for computing f0 contour
        f0et (float): error threshold used for the f0 computation
        t (float): magnitude threshold in dB used in spectral peak picking
        minf0 (float): minimum fundamental frequency in Hz
        maxf0 (float): maximum fundamental frequency in Hz
    Output:
        segments (np.ndarray): array of shape (numSegments, 2) containing the
            starting and ending frame indexes of every segment (both inclusive).
            The shape is (0, 2) when no segment is found.
    """
    fs, x = UF.wavread(inputFile)  # reading inputFile
    w = get_window(window, M)  # obtaining analysis window
    f0 = HM.f0Detection(x, fs, w, N, H, t, minf0, maxf0, f0et)  # estimating F0

    # 1. convert f0 values from Hz to cents; non-positive values are clamped
    #    to eps so that log2 stays finite
    f0[f0 < eps] = eps
    tuning = 55.0  # A4=440 Hz -> tuning=A1=55 Hz
    cent_f0 = 1200 * np.log2(f0 / tuning)

    # 2. standard deviation of the last winStable samples; entry j describes
    #    the window that ends at frame j + winStable - 1
    std_winStable = np.array([
        np.std(cent_f0[end - winStable:end])
        for end in range(winStable, cent_f0.size + 1)
    ])

    # 3. threshold the deviations and express the stable points as frame
    #    indexes of the f0 contour (hence the winStable - 1 offset)
    stable_frames = np.where(std_winStable < stdThsld)[0] + (winStable - 1)
    if stable_frames.size == 0:
        return np.empty((0, 2), dtype=int)

    # 4. group consecutive stable frames: a new segment starts right after
    #    every position where the gap to the next stable frame is not 1
    breaks = np.where(np.diff(stable_frames) != 1)[0]
    starts = stable_frames[np.concatenate(([0], breaks + 1))]
    ends = stable_frames[np.concatenate((breaks, [stable_frames.size - 1]))]

    # 5. remove segments which are < minNoteDur in length
    samples_minNoteDur = int(minNoteDur * fs / H)
    keep = (ends - starts) >= samples_minNoteDur
    segments = np.column_stack((starts[keep], ends[keep]))

    # plotSpectogramF0Segments(x, fs, w, N, H, f0, segments)  # Plot spectrogram and F0 if needed
    return segments
def stft_onset(inputFile, window, M, N, H, freq_thresholds, show_plot=False, debug=False):
    """
    Compute an energy-based onset detection function (ODF) per frequency band.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd integer value)
        N (integer): fft size (power of two, bigger or equal than M)
        H (integer): hop size for the STFT computation
        freq_thresholds (list): Contains frequency tuples (initial, end) of the
            different chunks, these frequencies are excluded from the chunk.
        show_plot (boolean): enable/disable spectrogram, energy envelope and
            onset function visualization
        debug (boolean): enable/disable debug messages during execution
    Output:
        numpy array with one row per STFT frame and one column per band
        returned by dsp_toolbox.slice_spectrum (e.g. column 0 for
        0 < f < 3000 Hz and column 1 for 3000 < f < 10000 Hz). Each column is
        the half-wave rectified frame-to-frame difference of the band energy
        in dB, with a leading zero so it has as many rows as there are frames.
        An empty 1-D array is returned when there are no bands.
    """
    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)
    dBX, pX = stft.stftAnal(x, w, N, H)
    X = 10**(dBX/20.0)  # dB -> linear magnitude
    dft_size = X[0, :].size
    if debug:
        print("Spectrogram shape = {}".format(X.shape))

    band_index = dsp_toolbox.slice_spectrum(X, freq_thresholds, fs, N, dft_size)
    if debug:
        print("band_index elements = {}".format(len(band_index)))

    energy_columns = []
    onset_columns = []
    for band in band_index:
        # presumably the per-frame energy of the selected bins; see dsp_toolbox
        energy = dsp_toolbox.energy(X[:, band], axis=-1)
        energy[energy < eps] = eps  # keep log10 finite
        db_energy = 10*np.log10(energy)
        odf_band = db_energy[1:] - db_energy[:-1]
        odf_band[odf_band < 0] = 0  # Half wave rectification
        # add extra sample in the beginning to match with energy envelope size
        odf_band = np.insert(odf_band, 0, 0)
        energy_columns.append(np.reshape(db_energy, (db_energy.size, 1)))
        onset_columns.append(np.reshape(odf_band, (odf_band.size, 1)))

    # Bands are stacked side by side (axis=1) so that column i belongs to
    # band i; stacking along axis 0 would chain all bands into one column.
    if energy_columns:
        energy_result = np.concatenate(energy_columns, axis=1)
        onset_result = np.concatenate(onset_columns, axis=1)
    else:
        energy_result = np.array([])
        onset_result = np.array([])

    if debug:
        print("energy_result shape = {}".format(energy_result.shape))
        print("onset_result shape = {}".format(onset_result.shape))

    if show_plot:
        num_bands = len(energy_columns)

        plt.figure(1, figsize=(9.5, 6))
        plt.subplot(311)
        numFrames = int(dBX[:, 0].size)
        frmTime = H*np.arange(numFrames)/float(fs)
        binFreq = np.arange(N/2+1)*float(fs)/N
        plt.pcolormesh(frmTime, binFreq, np.transpose(dBX))
        plt.title('Spectrogram')
        plt.autoscale(tight=True)

        plt.subplot(312)
        for i in range(num_bands):
            plt.plot(energy_result[:, i], label='band {}'.format(i))
        plt.title('Energy envelopes')
        plt.legend()
        plt.autoscale(tight=True)
        plt.show(block=False)

        plt.subplot(313)
        for i in range(num_bands):
            plt.plot(onset_result[:, i], label='band {}'.format(i))
        plt.title('Onset detection function')
        plt.legend()
        plt.autoscale(tight=True)
        plt.show(block=False)

    return onset_result
from scipy.interpolate import interp1d sys.path.append( os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../../software/models/')) sys.path.append( os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../../software/transformations/')) import sineModel as SM import stft as STFT import sineModel as SM import utilFunctions as UF import sineTransformations as SMT (fs, x) = UF.wavread('../../../sounds/mridangam.wav') w = np.hamming(801) N = 2048 t = -90 minSineDur = .005 maxnSines = 150 freqDevOffset = 20 freqDevSlope = 0.02 Ns = 512 H = Ns // 4 mX, pX = STFT.stftAnal(x, w, N, H) tfreq, tmag, tphase = SM.sineModelAnal(x, fs, w, N, H, t, maxnSines, minSineDur, freqDevOffset, freqDevSlope) timeScale = np.array([ .01, .0, .03, .03, .335, .4, .355, .42, .671, .8, .691, .82, .858, 1.2, .878, 1.22, 1.185, 1.6, 1.205, 1.62, 1.497, 2.0, 1.517, 2.02, 1.686, 2.4,
def main(inputFile='../../sounds/bendir.wav', window='hamming', M=2001, N=2048, t=-80,
         minSineDur=0.02, maxnSines=150, freqDevOffset=10, freqDevSlope=0.001, stocf=0.2):
    """
    Perform analysis/synthesis using the sinusoidal plus stochastic model,
    write the three resulting sounds under output_sounds/ and plot the result.

    inputFile: input sound file (monophonic with sampling rate of 44100)
    window: analysis window type (rectangular, hanning, hamming, blackman, blackmanharris)
    M: analysis window size; N: fft size (power of two, bigger or equal than M)
    t: magnitude threshold of spectral peaks; minSineDur: minimum duration of sinusoidal tracks
    maxnSines: maximum number of parallel sinusoids
    freqDevOffset: frequency deviation allowed in the sinusoids from frame to frame at frequency 0
    freqDevSlope: slope of the frequency deviation, higher frequencies have bigger deviation
    stocf: decimation factor used for the stochastic approximation
    """
    # size of fft used in synthesis
    Ns = 512
    # hop size (has to be 1/4 of Ns)
    H = 128

    # read input sound
    (fs, x) = UF.wavread(inputFile)
    # compute analysis window
    w = get_window(window, M)

    # perform sinusoidal+stochastic analysis
    tfreq, tmag, tphase, stocEnv = SPS.spsModelAnal(x, fs, w, N, H, t, minSineDur, maxnSines,
                                                    freqDevOffset, freqDevSlope, stocf)
    # synthesize sinusoidal+stochastic model
    y, ys, yst = SPS.spsModelSynth(tfreq, tmag, tphase, stocEnv, Ns, H, fs)

    # output sound files (the input name without its 4-character extension)
    baseName = os.path.basename(inputFile)[:-4]
    outputFileSines = 'output_sounds/' + baseName + '_spsModel_sines.wav'
    outputFileStochastic = 'output_sounds/' + baseName + '_spsModel_stochastic.wav'
    outputFile = 'output_sounds/' + baseName + '_spsModel.wav'

    # write sounds files for sinusoidal, residual, and the sum
    UF.wavwrite(ys, fs, outputFileSines)
    UF.wavwrite(yst, fs, outputFileStochastic)
    UF.wavwrite(y, fs, outputFile)

    # create figure to plot
    plt.figure(figsize=(12, 9))
    # frequency range to plot
    maxplotfreq = 10000.0

    # plot the input sound
    plt.subplot(3, 1, 1)
    plt.plot(np.arange(x.size)/float(fs), x)
    plt.axis([0, x.size/float(fs), min(x), max(x)])
    plt.ylabel('amplitude')
    plt.xlabel('time (sec)')
    plt.title('input sound: x')

    # plot the stochastic envelope up to maxplotfreq
    plt.subplot(3, 1, 2)
    numFrames = int(stocEnv[:, 0].size)
    sizeEnv = int(stocEnv[0, :].size)
    frmTime = H*np.arange(numFrames)/float(fs)
    # Slice bounds must be integers; the same count sizes the frequency axis
    # so that the axis and the plotted columns always agree in length.
    nPlotBins = min(int(sizeEnv*maxplotfreq/(.5*fs)) + 1, sizeEnv)
    binFreq = (.5*fs)*np.arange(nPlotBins)/sizeEnv
    plt.pcolormesh(frmTime, binFreq, np.transpose(stocEnv[:, :nPlotBins]))
    plt.autoscale(tight=True)

    # plot sinusoidal frequencies on top of stochastic component
    if (tfreq.shape[1] > 0):
        sines = tfreq*np.less(tfreq, maxplotfreq)
        sines[sines == 0] = np.nan  # NaN values are not drawn
        numFrames = int(sines[:, 0].size)
        frmTime = H*np.arange(numFrames)/float(fs)
        plt.plot(frmTime, sines, color='k', ms=3, alpha=1)
        plt.xlabel('time(s)')
        plt.ylabel('Frequency(Hz)')
        plt.autoscale(tight=True)
        plt.title('sinusoidal + stochastic spectrogram')

    # plot the output sound
    plt.subplot(3, 1, 3)
    plt.plot(np.arange(y.size)/float(fs), y)
    plt.axis([0, y.size/float(fs), min(y), max(y)])
    plt.ylabel('amplitude')
    plt.xlabel('time (sec)')
    plt.title('output sound: y')

    plt.tight_layout()
    plt.ion()
    plt.show()
def analysis(inputFile='../../sounds/vignesh.wav', window='blackman', M=1201, N=2048, t=-90,
             minSineDur=0.1, nH=100, minf0=130, maxf0=300, f0et=7, harmDevSlope=0.01):
    """
    Analyze a sound with the harmonic model, resynthesize it without the
    original phases, write the result under output_sounds/ and plot the
    input, the harmonic tracks and the output.

    inputFile: input sound file (monophonic with sampling rate of 44100)
    window: analysis window type (rectangular, hanning, hamming, blackman, blackmanharris)
    M: analysis window size
    N: fft size (power of two, bigger or equal than M)
    t: magnitude threshold of spectral peaks
    minSineDur: minimum duration of sinusoidal tracks
    nH: maximum number of harmonics
    minf0: minimum fundamental frequency in sound
    maxf0: maximum fundamental frequency in sound
    f0et: maximum error accepted in f0 detection algorithm
    harmDevSlope: allowed deviation of harmonic tracks, higher harmonics have higher allowed deviation
    returns inputFile: input file name; fs: sampling rate of input file,
            hfreq, hmag: harmonic frequencies and magnitudes
    """
    Ns = 512  # size of fft used in synthesis
    H = 128   # hop size (has to be 1/4 of Ns)

    def _plot_waveform(row, signal, title):
        # one time-domain panel with the axes fitted to the signal
        duration = signal.size / float(fs)
        plt.subplot(3, 1, row)
        plt.plot(np.arange(signal.size) / float(fs), signal)
        plt.axis([0, duration, min(signal), max(signal)])
        plt.ylabel('amplitude')
        plt.xlabel('time (sec)')
        plt.title(title)

    # read the sound and build the analysis window
    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)

    # harmonic analysis of the whole sound
    hfreq, hmag, hphase = HM.harmonicModelAnal(x, fs, w, N, H, t, nH, minf0, maxf0,
                                               f0et, harmDevSlope, minSineDur)

    # synthesis from frequencies and magnitudes only (empty phase array)
    noPhases = np.array([])
    y = SM.sineModelSynth(hfreq, hmag, noPhases, Ns, H, fs)

    # write the synthesized sound next to the other outputs
    stem = os.path.basename(inputFile)[:-4]
    outputFile = 'output_sounds/' + stem + '_harmonicModel.wav'
    UF.wavwrite(y, fs, outputFile)

    plt.figure(figsize=(12, 9))
    maxplotfreq = 5000.0  # upper frequency limit of the tracks panel

    _plot_waveform(1, x, 'input sound: x')

    if hfreq.shape[1] > 0:
        plt.subplot(3, 1, 2)
        # mask a copy so that the returned hfreq keeps its values
        visible = np.copy(hfreq)
        frameTimes = H * np.arange(visible.shape[0]) / float(fs)
        visible[visible <= 0] = np.nan
        plt.plot(frameTimes, visible)
        plt.axis([0, x.size / float(fs), 0, maxplotfreq])
        plt.title('frequencies of harmonic tracks')

    _plot_waveform(3, y, 'output sound: y')

    plt.tight_layout()
    plt.show(block=False)

    return inputFile, fs, hfreq, hmag
import numpy as np
from scipy.signal import get_window, resample
from scipy.fftpack import fft
import sys, os, math
sys.path.append(
    os.path.join(os.path.dirname(os.path.realpath(__file__)),
                 '../../software/models/'))
import utilFunctions as UF
import dftModel as DFT

# Morph one frame of rain.wav with the smoothed spectral envelope of one
# frame of soprano-E4.wav.
fs, x1 = UF.wavread('../../sounds/rain.wav')
fs, x2 = UF.wavread('../../sounds/soprano-E4.wav')
M = N = 512
w = get_window('hanning', M)
# NOTE(review): the frames are windowed here and w is passed to dftAnal as
# well; if dftAnal applies w itself the window is applied twice - confirm.
x1w = x1[10000:10000 + M] * w
x2w = x2[10000:10000 + M] * w
mX1, pX1 = DFT.dftAnal(x1w, w, N)
mX2, pX2 = DFT.dftAnal(x2w, w, N)

# smooth the second magnitude spectrum by decimating it and interpolating it
# back; resample needs integer sample counts
smoothf = 0.2
mX2smooth1 = resample(np.maximum(-200.0, mX2), int(mX2.size * smoothf))
mX2smooth2 = resample(mX2smooth1, N // 2 + 1)

# weighted mix of the smoothed envelope and the first spectrum, synthesized
# with the phases of the first sound
balancef = 0.5
mY = balancef * mX2smooth2 + (1.0 - balancef) * mX1
y = DFT.dftSynth(mY, pX1, N) * sum(w)
import matplotlib.pyplot as plt
def estimateF0(inputFile='../../sounds/cello-double-2.wav'):
    """
    Function to estimate fundamental frequency (f0) in an audio signal. The
    f0 contour is kept only between 0.5 and 4.0 seconds (zero elsewhere),
    synthesized into 'synthF0Contour.wav' and plotted on top of the
    spectrogram.

    Input:
        inputFile (string): wav file including the path
    Output:
        f0 (numpy array): array of the estimated fundamental frequency (f0) values
    """
    # analysis parameters chosen for this sound
    window = 'blackman'
    M, N = 21000, 8192 * 4
    f0et, t = 5, -70
    minf0, maxf0 = 140, 210
    H = 256  # fixed hop size

    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)

    # f0 contour, silenced outside the [0.5 s, 4.0 s] region
    f0 = HM.f0Detection(x, fs, w, N, H, t, minf0, maxf0, f0et)
    firstFrame = int(np.floor(0.5 * fs / H))
    lastFrame = int(np.ceil(4.0 * fs / H))
    f0[:firstFrame] = 0
    f0[lastFrame:] = 0

    # sonify the contour
    y = UF.sinewaveSynth(f0, 0.8, H, fs)
    UF.wavwrite(y, fs, 'synthF0Contour.wav')

    # ---- f0 contour on top of the spectrogram
    maxplotfreq = 500.0  # frequency range to plot
    fontSize = 16
    showInteractively = True  # False saves the figure to disk instead

    fig = plt.figure()
    ax = fig.add_subplot(111)

    # spectrogram with the same parameters as the analysis, cropped in frequency
    mX, pX = stft.stftAnal(x, w, N, H)
    topBin = int(N * (maxplotfreq / fs)) + 1
    mX = np.transpose(mX[:, :topBin])
    timeStamps = np.arange(mX.shape[1]) * H / float(fs)
    binFreqs = np.arange(mX.shape[0]) * fs / float(N)

    plt.pcolormesh(timeStamps, binFreqs, mX)
    plt.plot(timeStamps, f0, color='k', linewidth=1.5)
    # vertical markers at the limits of the kept region
    for limit in (0.5, 4.0):
        plt.plot([limit, limit], [0, maxplotfreq], color='b', linewidth=1.5)

    plt.autoscale(tight=True)
    plt.ylabel('Frequency (Hz)', fontsize=fontSize)
    plt.xlabel('Time (s)', fontsize=fontSize)
    plt.legend(('f0', ))

    xLim = ax.get_xlim()
    yLim = ax.get_ylim()
    xSpan = xLim[1] - xLim[0]
    ySpan = yLim[1] - yLim[0]
    ax.set_aspect(xSpan / (2.0 * ySpan))

    if showInteractively:
        plt.autoscale(tight=True)
        plt.show()
    else:
        fig.tight_layout()
        fig.savefig('f0_over_Spectrogram.png', dpi=150, bbox_inches='tight')

    return f0
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import hamming, triang, blackmanharris
import math
import sys, os, functools, time
sys.path.append(
    os.path.join(os.path.dirname(os.path.realpath(__file__)),
                 '../../../software/models/'))
import dftModel as DFT
import utilFunctions as UF

# Plot one frame of piano.wav centered at 0.3 seconds and its magnitude spectrum.
(fs, x) = UF.wavread('../../../sounds/piano.wav')
M = 1100
w = np.blackman(M)
N = 2048
pin = int(.3 * fs)  # frame center in samples; slice bounds must be integers
hM1 = int(math.floor((w.size + 1) / 2))  # half window, rounded up
hM2 = int(math.floor(w.size / 2))        # half window, rounded down
x1 = x[pin - hM1:pin + hM2]
mX, pX = DFT.dftAnal(x1, w, N)

plt.figure(1, figsize=(9, 7))
plt.subplot(311)
plt.plot(np.arange(-hM1, hM2) / float(fs), x1, lw=1.5)
plt.axis([-hM1 / float(fs), hM2 / float(fs), min(x1), max(x1)])
plt.title('x (piano.wav)')

plt.subplot(3, 1, 2)
plt.plot(fs * np.arange(mX.size) / float(N), mX, 'r', lw=1.5)
plt.axis([0, fs / 4, -90, max(mX)])
def main(inputFile='../../sounds/Dark Guitar String.wav', window='blackmanharris', M=3001,
         N=4096, t=-100, minSineDur=0.02, maxnSines=30, freqDevOffset=10, freqDevSlope=0.001):
    """
    Perform analysis/synthesis using the sinusoidal model, write the
    synthesized sound under output_sounds/ and plot input, tracks and output.

    inputFile: input sound file (monophonic with sampling rate of 44100)
    window: analysis window type (rectangular, hanning, hamming, blackman, blackmanharris)
    M: analysis window size; N: fft size (power of two, bigger or equal than M)
    t: magnitude threshold of spectral peaks; minSineDur: minimum duration of sinusoidal tracks
    maxnSines: maximum number of parallel sinusoids
    freqDevOffset: frequency deviation allowed in the sinusoids from frame to frame at frequency 0
    freqDevSlope: slope of the frequency deviation, higher frequencies have bigger deviation
    returns tfreq, tmag, tphase: frequencies, magnitudes and phases of the
            sinusoidal tracks exactly as produced by the analysis
    """
    # size of fft used in synthesis
    Ns = 512
    # hop size (has to be 1/4 of Ns)
    H = 128

    # read input sound
    fs, x = UF.wavread(inputFile)
    # compute analysis window
    w = get_window(window, M)

    # analyze the sound with the sinusoidal model
    tfreq, tmag, tphase = SM.sineModelAnal(x, fs, w, N, H, t, maxnSines, minSineDur,
                                           freqDevOffset, freqDevSlope)
    # synthesize the output sound from the sinusoidal representation
    y = SM.sineModelSynth(tfreq, tmag, tphase, Ns, H, fs)

    # output sound file name
    outputFile = 'output_sounds/' + os.path.basename(inputFile)[:-4] + '_sineModel_test.wav'
    # write the synthesized sound obtained from the sinusoidal synthesis
    UF.wavwrite(y, fs, outputFile)

    # create figure to show plots
    plt.figure(figsize=(9, 6))
    # frequency range to plot
    maxplotfreq = 5000.0

    # plot the input sound
    plt.subplot(3, 1, 1)
    plt.plot(np.arange(x.size) / float(fs), x)
    plt.axis([0, x.size / float(fs), min(x), max(x)])
    plt.ylabel('amplitude')
    plt.xlabel('time (sec)')
    plt.title('input sound: x')

    # plot the sinusoidal frequencies
    plt.subplot(3, 1, 2)
    if (tfreq.shape[1] > 0):
        numFrames = tfreq.shape[0]
        frmTime = H * np.arange(numFrames) / float(fs)
        # Mask a copy: tfreq is returned to the caller and must not be
        # overwritten with NaN just to hide inactive tracks in the plot.
        tracks = np.copy(tfreq)
        tracks[tracks <= 0] = np.nan
        plt.plot(frmTime, tracks)
        plt.axis([0, x.size / float(fs), 0, maxplotfreq])
        plt.title('frequencies of sinusoidal tracks')

    # plot the output sound
    plt.subplot(3, 1, 3)
    plt.plot(np.arange(y.size) / float(fs), y)
    plt.axis([0, y.size / float(fs), min(y), max(y)])
    plt.ylabel('amplitude')
    plt.xlabel('time (sec)')
    plt.title('output sound: y')

    plt.tight_layout()
    plt.ion()
    plt.show()
    return tfreq, tmag, tphase
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import hamming, triang, blackmanharris
import math
import sys, os, functools, time
sys.path.append(
    os.path.join(os.path.dirname(os.path.realpath(__file__)),
                 '../../../software/models/'))
import dftModel as DFT
import utilFunctions as UF

# Plot one frame of carnatic.wav centered at 1.4 seconds and its magnitude spectrum.
(fs, x) = UF.wavread('../../../sounds/carnatic.wav')
pin = int(1.4 * fs)  # frame center in samples; slice bounds must be integers
w = np.blackman(1601)
N = 4096
hM1 = int(math.floor((w.size + 1) / 2))  # half window, rounded up
hM2 = int(math.floor(w.size / 2))        # half window, rounded down
x1 = x[pin - hM1:pin + hM2]
mX, pX = DFT.dftAnal(x1, w, N)

plt.figure(1, figsize=(9, 7))
plt.subplot(311)
plt.plot(np.arange(-hM1, hM2) / float(fs), x1, lw=1.5)
plt.axis([-hM1 / float(fs), hM2 / float(fs), min(x1), max(x1)])
plt.title('x (carnatic.wav)')

plt.subplot(3, 1, 2)
plt.plot(fs * np.arange(mX.size) / float(N), mX, 'r', lw=1.5)
plt.axis([0, fs / 4, -100, max(mX)])
import sys, os
sys.path.append(
    os.path.join(os.path.dirname(os.path.realpath(__file__)),
                 '../../software/models'))
import dftModel as DFT
import utilFunctions as UF

# Alternative settings for sine-440.wav:
# (fs, x) = UF.wavread('../../sounds/sine-440.wav')
# M = 501
# N = 512
# N = 2048  # better freq resolution for sine-440.wav

# Settings for sine-440-490.wav
(fs, x) = UF.wavread('../../sounds/sine-440-490.wav')
M = 3528  # M = 4 bins * 44100 / (490-440)
N = 4096  # N > M
t = -20   # threshold

w = get_window('hamming', M)
start = int(0.8 * fs)  # first sample of the frame; slice bounds must be integers
x1 = x[start:start + M]
mX, pX = DFT.dftAnal(x1, w, N)

# spectral peaks above the threshold and their magnitudes
ploc = UF.peakDetection(mX, t)
pmag = mX[ploc]

freqaxis = fs * np.arange(N / 2 + 1) / float(N)
plt.plot(freqaxis, mX)
plt.plot(fs * ploc / float(N), pmag, marker='x', linestyle='')
# quadratic interpolation:
def analysis(inputFile1='../../sounds/violin-B3.wav', window1='blackman', M1=1001, N1=1024,
             t1=-100, minSineDur1=0.05, nH=60, minf01=200, maxf01=300, f0et1=10,
             harmDevSlope1=0.01, stocf=0.1,
             inputFile2='../../sounds/soprano-E4.wav', window2='blackman', M2=901, N2=1024,
             t2=-100, minSineDur2=0.05, minf02=250, maxf02=500, f0et2=10, harmDevSlope2=0.01):
    """
    Analyze two sounds with the harmonic plus stochastic model and plot, for
    each of them, the harmonic tracks on top of the stochastic spectrogram.

    The parameters ending in 1 apply to the first sound and those ending in 2
    to the second one; nH and stocf are shared by both analyses.

    inputFile: input sound file (monophonic with sampling rate of 44100)
    window: analysis window type (rectangular, hanning, hamming, blackman, blackmanharris)
    M: analysis window size
    N: fft size (power of two, bigger or equal than M)
    t: magnitude threshold of spectral peaks
    minSineDur: minimum duration of sinusoidal tracks
    nH: maximum number of harmonics
    minf0: minimum fundamental frequency in sound
    maxf0: maximum fundamental frequency in sound
    f0et: maximum error accepted in f0 detection algorithm
    harmDevSlope: allowed deviation of harmonic tracks, higher harmonics have higher allowed deviation
    stocf: decimation factor used for the stochastic approximation
    returns inputFile1, fs1, hfreq1, hmag1, stocEnv1, inputFile2, hfreq2, hmag2, stocEnv2:
            file name, sampling rate (first sound only), harmonic frequencies,
            harmonic magnitudes and stochastic envelope of each sound
    """
    Ns = 512  # size of fft used in synthesis
    H = 128   # hop size (has to be 1/4 of Ns)
    maxplotfreq = 15000.0  # frequency range to plot

    def _plot_sound(row, stocEnv, hfreq, fs, title):
        # stochastic spectrogram of one sound with its harmonics drawn on top
        plt.subplot(2, 1, row)
        frameCount = int(stocEnv[:, 0].size)
        envSize = int(stocEnv[0, :].size)
        nyquist = .5*fs
        times = H*np.arange(frameCount)/float(fs)
        freqs = nyquist*np.arange(envSize*maxplotfreq/nyquist)/envSize
        lastCol = int(envSize*maxplotfreq/nyquist)+1
        plt.pcolormesh(times, freqs, np.transpose(stocEnv[:, :lastCol]))
        plt.autoscale(tight=True)

        if hfreq.shape[1] <= 0:
            return
        # hide harmonics at or above maxplotfreq; NaN values are not drawn
        shown = np.copy(hfreq)
        shown = shown*np.less(shown, maxplotfreq)
        shown[shown == 0] = np.nan
        times = H*np.arange(int(shown[:, 0].size))/float(fs)
        plt.plot(times, shown, color='k', ms=3, alpha=1)
        plt.xlabel('time (sec)')
        plt.ylabel('frequency (Hz)')
        plt.autoscale(tight=True)
        plt.title(title)

    # read both sounds and build their analysis windows
    (fs1, x1) = UF.wavread(inputFile1)
    (fs2, x2) = UF.wavread(inputFile2)
    w1 = get_window(window1, M1)
    w2 = get_window(window2, M2)

    # harmonic plus stochastic analysis of each sound
    hfreq1, hmag1, hphase1, stocEnv1 = HPS.hpsModelAnal(
        x1, fs1, w1, N1, H, t1, nH, minf01, maxf01, f0et1, harmDevSlope1,
        minSineDur1, Ns, stocf)
    hfreq2, hmag2, hphase2, stocEnv2 = HPS.hpsModelAnal(
        x2, fs2, w2, N2, H, t2, nH, minf02, maxf02, f0et2, harmDevSlope2,
        minSineDur2, Ns, stocf)

    plt.figure(figsize=(12, 9))
    _plot_sound(1, stocEnv1, hfreq1, fs1, 'harmonics + stochastic spectrogram of sound 1')
    _plot_sound(2, stocEnv2, hfreq2, fs2, 'harmonics + stochastic spectrogram of sound 2')

    plt.tight_layout()
    plt.show(block=False)

    return inputFile1, fs1, hfreq1, hmag1, stocEnv1, inputFile2, hfreq2, hmag2, stocEnv2