# Module-level imports assumed by the code below. (The original file header
# is not part of this excerpt; stft, istft, sinebell, nextpow2,
# generate_WF0_chirped, generateHannBasis, viterbiTrackingArray and the
# SIMM module are provided elsewhere in the package.)
import os
import time
import warnings

import numpy as np
from numpy.random import randn
from scipy.io import wavfile as wav


def main():
    import optparse

    usage = "usage: %prog [options] inputAudioFile"
    parser = optparse.OptionParser(usage)
    # Name of the output files:
    parser.add_option("-v", "--vocal-output-file",
                      dest="voc_output_file", type="string",
                      help="name of the audio output file for the estimated\n"
                           "solo (vocal) part",
                      default="estimated_solo.wav")
    parser.add_option("-m", "--music-output-file",
                      dest="mus_output_file", type="string",
                      help="name of the audio output file for the estimated\n"
                           "music part",
                      default="estimated_music.wav")
    parser.add_option("-p", "--pitch-output-file",
                      dest="pitch_output_file", type="string",
                      help="name of the output file for the estimated pitches",
                      default="pitches.txt")
    # Some more optional options:
    parser.add_option("-d", "--with-display", dest="displayEvolution",
                      action="store_true", help="display the figures",
                      default=False)
    parser.add_option("-q", "--quiet", dest="verbose",
                      action="store_false",
                      help="use to quiet all verbose output",
                      default=True)
    parser.add_option("--nb-iterations", dest="nbiter",
                      help="number of iterations", type="int",
                      default=100)
    parser.add_option("--window-size", dest="windowSize", type="float",
                      default=0.04644,
                      help="size of analysis windows, in s.")
    parser.add_option("--Fourier-size", dest="fourierSize", type="int",
                      default=2048,
                      help="size of Fourier transforms, in samples.")
    parser.add_option("--hopsize", dest="hopsize", type="float",
                      default=0.0058,
                      help="size of the hop between analysis windows, in s.")
    parser.add_option("--nb-accElements", dest="R", type="float",
                      default=40.0,
                      help="number of elements for the accompaniment.")
    parser.add_option("--with-melody", dest="melody", type="string",
                      default=None,
                      help="provide the melody in a file named MELODY, "
                           "with at each line: <time (s)> <F0 (Hz)>.")

    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("incorrect number of arguments, use option -h for help.")

    displayEvolution = options.displayEvolution
    if displayEvolution:
        import matplotlib.pyplot as plt
        import imageMatlab

        ## plt.rc('text', usetex=True)
        plt.rc('image', cmap='jet')  ## gray_r
        plt.ion()

    # Compulsory option: name of the input file:
    inputAudioFile = args[0]
    fs, data = wav.read(inputAudioFile)
    # data = np.double(data) / 32768.0 # makes data vary from -1 to 1
    scaleData = 1.2 * data.max()  # to rescale the data.
    dataType = data.dtype
    data = np.double(data) / scaleData  # makes data vary from -1 to 1

    if data.shape[0] == data.size:  # data is mono
        # Duplicate the channel so that the stereo algorithm still runs:
        print("The audio file is not stereo. Duplicating the mono channel "
              "(you could also try separateLead.py instead).")
        tmp = np.zeros((data.size, 2))
        tmp[:, 0] = data
        tmp[:, 1] = data
        data = tmp
    if data.shape[1] != 2:
        print("The data is multichannel, but not stereo...")
        print("Unfortunately this program does not scale well. Data is")
        print("reduced to its 2 first channels.")
        data = data[:, 0:2]

    # Processing the options:
    windowSizeInSamples = np.round(options.windowSize * fs)
    hopsize = np.round(options.hopsize * fs)
    NFT = options.fourierSize
    niter = options.nbiter
    R = options.R

    if options.verbose:
        print("Some parameter settings:")
        print("    Size of analysis windows: ", windowSizeInSamples)
        print("    Hopsize: ", hopsize)
        print("    Size of Fourier transforms: ", NFT)
        print("    Number of iterations to be done: ", niter)
        print("    Number of elements in WM: ", R)

    XR, F, N = stft(data[:, 0], fs=fs, hopsize=hopsize,
                    window=sinebell(windowSizeInSamples), nfft=NFT)
    XL, F, N = stft(data[:, 1], fs=fs, hopsize=hopsize,
                    window=sinebell(windowSizeInSamples), nfft=NFT)
    # SX is the power spectrogram:
    ## SXR = np.maximum(np.abs(XR) ** 2, 10 ** -8)
    ## SXL = np.maximum(np.abs(XL) ** 2, 10 ** -8)
    SXR = np.abs(XR) ** 2
    SXL = np.abs(XL) ** 2

    del data, F, N

    # TODO: also process these as options:
    eps = 10 ** -9
    minF0 = 100
    maxF0 = 800
    Fs = fs
    F, N = SXR.shape
    stepNotes = 20  # this is the number of F0s within one semitone
    # until 17/09/2010 : stepNotes = 20
    # 17/09/2010 : trying stepNotes = 8, checking for less artefacts

    K = 10  # number of spectral shapes for the filter part
    # R = 40 # number of spectral shapes for the accompaniment
    P = 30  # number of elements in dictionary of smooth filters
    chirpPerF0 = 1  # number of chirped spectral shapes between each F0
    # this feature should be further studied before
    # we find a good way of doing that.

    # Create the harmonic combs, for each F0 between minF0 and maxF0:
    F0Table, WF0 = \
        generate_WF0_chirped(minF0, maxF0, Fs, Nfft=NFT,
                             stepNotes=stepNotes,
                             lengthWindow=windowSizeInSamples, Ot=0.25,
                             perF0=chirpPerF0,
                             depthChirpInSemiTone=.15, loadWF0=True,
                             analysisWindow='sinebell')
    WF0 = WF0[0:F, :]  # ensure same size as SX
    NF0 = F0Table.size  # number of harmonic combs
    # Normalization:
    WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0))

    # Create the dictionary of smooth filters, for the filter part of
    # the lead instrument:
    WGAMMA = generateHannBasis(F, NFT, Fs=fs, frequencyScale='linear',
                               numberOfBasis=P, overlap=.75)

    if displayEvolution:
        plt.figure(1); plt.clf()
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Leading source number $u$', fontsize=16)
        plt.ion()
        # plt.show()
        ## the following seems superfluous if mpl's backend is macosx...
        ## raw_input("Press Return to resume the program. Be sure that the"
        ##           " figure has already been displayed, so that the"
        ##           " evolution of HF0 will be visible.")

    if options.melody is None:
        ## section to estimate the melody, with the monophonic algo:
        SX = np.maximum(np.abs((XR + XL) / 2.0) ** 2, 10 ** -8)

        # First round of parameter estimation:
        HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM(
            # the data to be fitted to:
            SX,
            # the basis matrices for the spectral combs
            WF0,
            # and for the elementary filters:
            WGAMMA,
            # number of desired filters, accompaniment spectra:
            numberOfFilters=K,
            numberOfAccompanimentSpectralShapes=R,
            # if any, initial amplitude matrices for
            HGAMMA0=None,
            HPHI0=None,
            HF00=None,
            WM0=None,
            HM0=None,
            # Some more optional arguments, to control the "convergence"
            # of the algo
            numberOfIterations=niter,
            updateRulePower=1.,
            stepNotes=stepNotes,
            lambdaHF0=0.0 / (1.0 * SX.max()),
            alphaHF0=0.9,
            verbose=options.verbose,
            displayEvolution=displayEvolution)

        if displayEvolution:
            h2 = plt.figure(2); plt.clf()
            imageMatlab.imageM(20 * np.log10(HF0))
            matMax = (20 * np.log10(HF0)).max()
            matMed = np.median(20 * np.log10(HF0))
            plt.clim([matMed - 100, matMax])

        # Viterbi decoding to estimate the predominant fundamental
        # frequency line:
        scale = 1.0
        transitions = np.exp(-np.floor(np.arange(0, NF0) / stepNotes)
                             * scale)
        cutoffnote = 2 * 5 * stepNotes
        transitions[cutoffnote:] = transitions[cutoffnote - 1]

        transitionMatrixF0 = np.zeros([NF0 + 1, NF0 + 1])  # toeplitz matrix
        b = np.arange(NF0)
        transitionMatrixF0[0:NF0, 0:NF0] = \
            transitions[np.array(np.abs(np.outer(np.ones(NF0), b)
                                        - np.outer(b, np.ones(NF0))),
                                 dtype=int)]
        pf_0 = transitions[cutoffnote - 1] * 10 ** (-90)
        p0_0 = transitions[cutoffnote - 1] * 10 ** (-100)
        p0_f = transitions[cutoffnote - 1] * 10 ** (-80)
        transitionMatrixF0[0:NF0, NF0] = pf_0
        transitionMatrixF0[NF0, 0:NF0] = p0_f
        transitionMatrixF0[NF0, NF0] = p0_0

        sumTransitionMatrixF0 = np.sum(transitionMatrixF0, axis=1)
        transitionMatrixF0 = transitionMatrixF0 \
                             / np.outer(sumTransitionMatrixF0,
                                        np.ones(NF0 + 1))

        priorProbabilities = 1 / (NF0 + 1.0) * np.ones([NF0 + 1])
        logHF0 = np.zeros([NF0 + 1, N])
        normHF0 = np.amax(HF0, axis=0)
        barHF0 = np.array(HF0)

        logHF0[0:NF0, :] = np.log(barHF0)
        logHF0[0:NF0, normHF0 == 0] = np.amin(logHF0[logHF0 > -np.Inf])
        logHF0[NF0, :] = np.maximum(np.amin(logHF0[logHF0 > -np.Inf]), -100)

        indexBestPath = viterbiTrackingArray(
            logHF0, np.log(priorProbabilities),
            np.log(transitionMatrixF0), verbose=options.verbose)

        if displayEvolution:
            h2.hold(True)
            plt.plot(indexBestPath, '-b')
            h2.hold(False)
            plt.axis('tight')
            ## raw_input("Press Return to resume the program.")

        del logHF0

        # detection of silences:
        HF00 = np.zeros([NF0 * chirpPerF0, N])
        scopeAllowedHF0 = 2.0 / 1.0
        dim1index = np.array(
            np.maximum(
                np.minimum(
                    np.outer(chirpPerF0 * indexBestPath,
                             np.ones(int(chirpPerF0
                                         * (2 * np.floor(stepNotes
                                                         / scopeAllowedHF0)
                                            + 1))))
                    + np.outer(np.ones(N),
                               np.arange(-chirpPerF0
                                         * np.floor(stepNotes
                                                    / scopeAllowedHF0),
                                         chirpPerF0
                                         * (np.floor(stepNotes
                                                     / scopeAllowedHF0)
                                            + 1))),
                    chirpPerF0 * NF0 - 1),
                0),
            dtype=int).reshape(1, int(N * chirpPerF0
                                      * (2 * np.floor(stepNotes
                                                      / scopeAllowedHF0)
                                         + 1)))
        dim2index = np.outer(np.arange(N),
                             np.ones(int(chirpPerF0
                                         * (2 * np.floor(stepNotes
                                                         / scopeAllowedHF0)
                                            + 1)),
                                     dtype=int)
                             ).reshape(1, int(N * chirpPerF0
                                              * (2 * np.floor(stepNotes
                                                              / scopeAllowedHF0)
                                                 + 1)))
        HF00[dim1index, dim2index] = HF0[dim1index, dim2index]  # HF0.max()

        HF00[:, indexBestPath == (NF0 - 1)] = 0.0
        HF00[:, indexBestPath == 0] = 0.0

        thres_energy = 0.000584
        SF0 = np.maximum(np.dot(WF0, HF00), eps)
        SPHI = np.maximum(np.dot(WGAMMA, np.dot(HGAMMA, HPHI)), eps)
        SM = np.maximum(np.dot(WM, HM), eps)
        hatSX = np.maximum(SPHI * SF0 + SM, eps)
        energyMel = np.sum(np.abs((SPHI * SF0) / hatSX
                                  * (XR + XL) * 0.5) ** 2, axis=0)
        energyMelSorted = np.sort(energyMel)
        energyMelCumul = np.cumsum(energyMelSorted)
        energyMelCumulNorm = energyMelCumul / max(energyMelCumul[-1], eps)
        # normalized to the maximum of energy:
        # expressed in 0.01 times the percentage
        # (guard: np.nonzero never returns None, so check for an empty
        # result instead of the original `is None` test)
        ind_999 = np.nonzero(energyMelCumulNorm > thres_energy)[0]
        ind_999 = ind_999[0] if ind_999.size > 0 else N - 1

        melNotPresent = (energyMel <= energyMelCumulNorm[ind_999])
        indexBestPath[melNotPresent] = 0
    else:
        ## take the provided melody line:
        # load melody from file:
        melodyFromFile = np.loadtxt(options.melody)
        sizeProvidedMel = melodyFromFile.shape
        if len(sizeProvidedMel) == 1:
            print("The melody should be provided as <Time (s)> <F0 (Hz)>.")
            raise ValueError("Bad melody format")
        melTimeStamps = melodyFromFile[:, 0]  # + 1024 / np.double(Fs)
        melFreqHz = melodyFromFile[:, 1]
        if minF0 > melFreqHz[melFreqHz > 40.0].min() \
                or maxF0 < melFreqHz.max():
            minF0 = melFreqHz[melFreqHz > 40.0].min() * .97
            maxF0 = np.maximum(melFreqHz.max() * 1.03, 2 * minF0 * 1.03)
            print("Recomputing the source basis for")
            print("minF0 = ", minF0, "Hz and maxF0 = ", maxF0, "Hz.")
            # Create the harmonic combs, for each F0 between minF0 and
            # maxF0:
            F0Table, WF0 = \
                generate_WF0_chirped(minF0, maxF0, Fs, Nfft=NFT,
                                     stepNotes=stepNotes,
                                     lengthWindow=windowSizeInSamples,
                                     Ot=0.25, perF0=chirpPerF0,
                                     depthChirpInSemiTone=.15)
            WF0 = WF0[0:F, :]  # ensure same size as SX
            NF0 = F0Table.size  # number of harmonic combs
            # Normalization:
            WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0))

        sigTimeStamps = np.arange(N) * hopsize / np.double(Fs)
        distMatTimeStamps = np.abs(
            np.outer(np.ones(sizeProvidedMel[0]), sigTimeStamps)
            - np.outer(melTimeStamps, np.ones(N)))
        minDistTimeStamps = distMatTimeStamps.argmin(axis=0)
        f0BestPath = melFreqHz[minDistTimeStamps]
        distMatF0 = np.abs(np.outer(np.ones(NF0), f0BestPath)
                           - np.outer(F0Table, np.ones(N)))
        indexBestPath = distMatF0.argmin(axis=0)
        # setting silences to 0, with tolerance = 1/2 window length
        indexBestPath[distMatTimeStamps[minDistTimeStamps, range(N)]
                      >= 0.5 * options.windowSize] = 0
        indexBestPath[f0BestPath <= 0] = 0

    freqMelody = F0Table[np.array(indexBestPath, dtype=int)]
    freqMelody[indexBestPath == 0] = -freqMelody[indexBestPath == 0]
    np.savetxt(options.pitch_output_file,
               np.array([np.arange(N) * hopsize / np.double(Fs),
                         freqMelody]).T)

    # Second round of parameter estimation, with specific initial HF00:
    HF00 = np.zeros([NF0 * chirpPerF0, N])

    scopeAllowedHF0 = 2.0 / 1.0

    # indexes for HF00:
    # TODO: reprogram this with a 'where'?...
    dim1index = np.array(
        np.maximum(
            np.minimum(
                np.outer(chirpPerF0 * indexBestPath,
                         np.ones(int(chirpPerF0
                                     * (2 * np.floor(stepNotes
                                                     / scopeAllowedHF0)
                                        + 1))))
                + np.outer(np.ones(N),
                           np.arange(-chirpPerF0
                                     * np.floor(stepNotes
                                                / scopeAllowedHF0),
                                     chirpPerF0
                                     * (np.floor(stepNotes
                                                 / scopeAllowedHF0)
                                        + 1))),
                chirpPerF0 * NF0 - 1),
            0),
        dtype=int)
    dim1index = dim1index[indexBestPath != 0, :]
    dim1index = dim1index.reshape(1, dim1index.size)

    dim2index = np.outer(np.arange(N),
                         np.ones(int(chirpPerF0
                                     * (2 * np.floor(stepNotes
                                                     / scopeAllowedHF0)
                                        + 1)),
                                 dtype=int))
    dim2index = dim2index[indexBestPath != 0, :]
    dim2index = dim2index.reshape(1, dim2index.size)

    HF00[dim1index, dim2index] = 1  # HF0.max()

    HF00[:, indexBestPath == (NF0 - 1)] = 0.0
    HF00[:, indexBestPath == 0] = 0.0

    WF0effective = WF0
    HF00effective = HF00
    if options.melody is None:
        del HF0, HGAMMA, HPHI, HM, WM, HF00, SX

    alphaR, alphaL, HGAMMA, HPHI, HF0, \
        betaR, betaL, HM, WM, recoError2 = SIMM.Stereo_SIMM(
            # the data to be fitted to:
            SXR, SXL,
            # the basis matrices for the spectral combs
            WF0effective,
            # and for the elementary filters:
            WGAMMA,
            # number of desired filters, accompaniment spectra:
            numberOfFilters=K,
            numberOfAccompanimentSpectralShapes=R,
            # if any, initial amplitude matrices for
            HGAMMA0=None,
            HPHI0=None,
            HF00=HF00effective,
            WM0=None,
            HM0=None,
            # Some more optional arguments, to control the "convergence"
            # of the algo
            numberOfIterations=niter,
            updateRulePower=1.0,
            stepNotes=stepNotes,
            lambdaHF0=0.0 / (1.0 * SXR.max()),
            alphaHF0=0.9,
            verbose=options.verbose,
            displayEvolution=displayEvolution)

    WPHI = np.dot(WGAMMA, HGAMMA)
    SPHI = np.dot(WPHI, HPHI)
    SF0 = np.dot(WF0effective, HF0)

    hatSXR = (alphaR ** 2) * SF0 * SPHI \
             + np.dot(np.dot(WM, betaR ** 2), HM)
    hatSXL = (alphaL ** 2) * SF0 * SPHI \
             + np.dot(np.dot(WM, betaL ** 2), HM)

    hatVR = (alphaR ** 2) * SPHI * SF0 / hatSXR * XR
    vestR = istft(hatVR, hopsize=hopsize, nfft=NFT,
                  window=sinebell(windowSizeInSamples)) / 4.0
    hatVR = (alphaL ** 2) * SPHI * SF0 / hatSXL * XL
    vestL = istft(hatVR, hopsize=hopsize, nfft=NFT,
                  window=sinebell(windowSizeInSamples)) / 4.0

    vestR = np.array(np.round(vestR * scaleData), dtype=dataType)
    vestL = np.array(np.round(vestL * scaleData), dtype=dataType)
    wav.write(options.voc_output_file, fs, np.array([vestR, vestL]).T)

    hatMR = (np.dot(np.dot(WM, betaR ** 2), HM)) / hatSXR * XR
    mestR = istft(hatMR, hopsize=hopsize, nfft=NFT,
                  window=sinebell(windowSizeInSamples)) / 4.0
    hatMR = (np.dot(np.dot(WM, betaL ** 2), HM)) / hatSXL * XL
    mestL = istft(hatMR, hopsize=hopsize, nfft=NFT,
                  window=sinebell(windowSizeInSamples)) / 4.0

    mestR = np.array(np.round(mestR * scaleData), dtype=dataType)
    mestL = np.array(np.round(mestL * scaleData), dtype=dataType)
    wav.write(options.mus_output_file, fs, np.array([mestR, mestL]).T)

    del hatMR, mestL, vestL, vestR, mestR, hatVR, hatSXR, hatSXL, SPHI, SF0

    # adding the unvoiced part in the source basis:
    WUF0 = np.hstack([WF0, np.ones([WF0.shape[0], 1])])
    HUF0 = np.vstack([HF0, np.ones([1, HF0.shape[1]])])
    ## HUF0[-1, :] = HF0.sum(axis=0) # should we do this?

    alphaR, alphaL, HGAMMA, HPHI, HF0, \
        betaR, betaL, HM, WM, recoError3 = SIMM.Stereo_SIMM(
            # the data to be fitted to:
            SXR, SXL,
            # the basis matrices for the spectral combs
            WUF0,
            # and for the elementary filters:
            WGAMMA,
            # number of desired filters, accompaniment spectra:
            numberOfFilters=K,
            numberOfAccompanimentSpectralShapes=R,
            # if any, initial amplitude matrices for
            HGAMMA0=HGAMMA,
            HPHI0=HPHI,
            HF00=HUF0,
            WM0=None,  # WM,
            HM0=None,  # HM,
            # Some more optional arguments, to control the "convergence"
            # of the algo
            numberOfIterations=niter,
            updateRulePower=1.0,
            stepNotes=stepNotes,
            lambdaHF0=0.0 / (1.0 * SXR.max()),
            alphaHF0=0.9,
            verbose=options.verbose,
            displayEvolution=displayEvolution,
            updateHGAMMA=False)

    WPHI = np.dot(WGAMMA, HGAMMA)
    SPHI = np.dot(WPHI, HPHI)
    SF0 = np.dot(WUF0, HF0)

    hatSXR = (alphaR ** 2) * SF0 * SPHI \
             + np.dot(np.dot(WM, betaR ** 2), HM)
    hatSXL = (alphaL ** 2) * SF0 * SPHI \
             + np.dot(np.dot(WM, betaL ** 2), HM)

    hatVR = (alphaR ** 2) * SPHI * SF0 / hatSXR * XR
    vestR = istft(hatVR, hopsize=hopsize, nfft=NFT,
                  window=sinebell(windowSizeInSamples)) / 4.0
    hatVR = (alphaL ** 2) * SPHI * SF0 / hatSXL * XL
    vestL = istft(hatVR, hopsize=hopsize, nfft=NFT,
                  window=sinebell(windowSizeInSamples)) / 4.0

    outputFileName = options.voc_output_file[:-4] + '_VUIMM.wav'
    vestR = np.array(np.round(vestR * scaleData), dtype=dataType)
    vestL = np.array(np.round(vestL * scaleData), dtype=dataType)
    wav.write(outputFileName, fs, np.array([vestR, vestL]).T)

    hatMR = (np.dot(np.dot(WM, betaR ** 2), HM)) / hatSXR * XR
    mestR = istft(hatMR, hopsize=hopsize, nfft=NFT,
                  window=sinebell(windowSizeInSamples)) / 4.0
    hatMR = (np.dot(np.dot(WM, betaL ** 2), HM)) / hatSXL * XL
    mestL = istft(hatMR, hopsize=hopsize, nfft=NFT,
                  window=sinebell(windowSizeInSamples)) / 4.0

    outputFileName = options.mus_output_file[:-4] + '_VUIMM.wav'
    mestR = np.array(np.round(mestR * scaleData), dtype=dataType)
    mestL = np.array(np.round(mestL * scaleData), dtype=dataType)
    wav.write(outputFileName, fs, np.array([mestR, mestL]).T)

    if displayEvolution:
        plt.close('all')
        ## raw_input("Press Return to end the program...")

    print("Done!")
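
# main() above calls stft/istft with a `sinebell` analysis window that is
# defined elsewhere in the package. As a reference, here is a minimal
# sketch of the assumed definition (a half-sine window, whose overlap-add
# property makes the STFT invertible at these hopsizes):

def sinebell(lengthWindow):
    # Sketch (assumption): window[n] = sin(pi * n / lengthWindow),
    # for n = 0 .. lengthWindow - 1.
    return np.sin(np.pi * np.arange(lengthWindow)
                  / np.double(lengthWindow))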
def stereo_NMF(SXR, SXL, numberOfAccompanimentSpectralShapes,
               WM0=None, HM0=None, numberOfIterations=50,
               updateRulePower=1.0, verbose=False, displayEvolution=False):
    eps = 10 ** (-20)

    if displayEvolution:
        import matplotlib.pyplot as plt
        from imageMatlab import imageM
        plt.ion()
        print("Is the display interactive? ", plt.isinteractive())

    R = numberOfAccompanimentSpectralShapes
    omega = updateRulePower

    F, N = SXR.shape
    if (F, N) != SXL.shape:
        print("The input STFT matrices do not have the same dimension.")
        print("Please check what happened...")
        raise ValueError("Dimension of STFT matrices must be the same.")

    if HM0 is None:
        HM0 = np.abs(randn(R, N))
    else:
        if np.array(HM0).shape[0] == R and np.array(HM0).shape[1] == N:
            HM0 = np.array(HM0)
        else:
            print("Wrong dimensions for given HM0,")
            print("random initialization used instead")
            HM0 = np.abs(randn(R, N))
    HM = np.copy(HM0)

    if WM0 is None:
        WM0 = np.abs(randn(F, R))
    else:
        if np.array(WM0).shape[0] == F and np.array(WM0).shape[1] == R:
            WM0 = np.array(WM0)
        else:
            print("Wrong dimensions for given WM0,")
            print("random initialization used instead")
            WM0 = np.abs(randn(F, R))
    WM = np.copy(WM0)

    betaR = np.diag(np.random.rand(R))
    betaL = np.eye(R) - betaR
    hatSXR = np.maximum(np.dot(np.dot(WM, betaR ** 2), HM), eps)
    hatSXL = np.maximum(np.dot(np.dot(WM, betaL ** 2), HM), eps)

    # temporary matrices
    tempNumFbyN = np.zeros([F, N])
    tempDenFbyN = np.zeros([F, N])

    recoError = np.zeros([numberOfIterations * 3 + 1])
    recoError[0] = ISDistortion(SXR, hatSXR) + ISDistortion(SXL, hatSXL)
    if verbose:
        print("Reconstruction error at beginning: ", recoError[0])
    counterError = 1
    if displayEvolution:
        h1 = plt.figure(1)

    for n in np.arange(numberOfIterations):
        # order of re-estimation: HM, WM, betaR/betaL
        if verbose:
            print("iteration ", n, " over ", numberOfIterations)
        if displayEvolution:
            h1.clf()
            imageM(db(hatSXR))
            plt.clim([np.amax(db(hatSXR)) - 100, np.amax(db(hatSXR))])
            plt.draw()

        # updating HM
        HM = HM * \
             ((np.dot(np.dot((betaR ** 2), WM.T),
                      SXR / np.maximum(hatSXR ** 2, eps))
               + np.dot(np.dot((betaL ** 2), WM.T),
                        SXL / np.maximum(hatSXL ** 2, eps)))
              / np.maximum(np.dot(np.dot((betaR ** 2), WM.T),
                                  1 / np.maximum(hatSXR, eps))
                           + np.dot(np.dot((betaL ** 2), WM.T),
                                    1 / np.maximum(hatSXL, eps)),
                           eps)) ** omega

        hatSXR = np.maximum(np.dot(np.dot(WM, betaR ** 2), HM), eps)
        hatSXL = np.maximum(np.dot(np.dot(WM, betaL ** 2), HM), eps)
        recoError[counterError] = ISDistortion(SXR, hatSXR) \
                                  + ISDistortion(SXL, hatSXL)
        if verbose:
            print("Reconstruction error difference after HM   : ",
                  recoError[counterError] - recoError[counterError - 1])
        counterError += 1

        # updating WM
        WM = WM * \
             ((np.dot(SXR / np.maximum(hatSXR ** 2, eps),
                      np.dot(HM.T, betaR ** 2))
               + np.dot(SXL / np.maximum(hatSXL ** 2, eps),
                        np.dot(HM.T, betaL ** 2)))
              / (np.dot(1 / np.maximum(hatSXR, eps),
                        np.dot(HM.T, betaR ** 2))
                 + np.dot(1 / np.maximum(hatSXL, eps),
                          np.dot(HM.T, betaL ** 2)))) ** omega

        sumWM = np.sum(WM, axis=0)
        WM[:, sumWM > 0] = (WM[:, sumWM > 0]
                            / np.outer(np.ones(F), sumWM[sumWM > 0]))
        HM = HM * np.outer(sumWM, np.ones(N))

        hatSXR = np.maximum(np.dot(np.dot(WM, betaR ** 2), HM), eps)
        hatSXL = np.maximum(np.dot(np.dot(WM, betaL ** 2), HM), eps)
        recoError[counterError] = ISDistortion(SXR, hatSXR) \
                                  + ISDistortion(SXL, hatSXL)
        if verbose:
            print("Reconstruction error difference after WM   : ",
                  recoError[counterError] - recoError[counterError - 1])
        counterError += 1

        # updating betaR and betaL
        betaR = np.diag(np.diag(np.maximum(
            betaR * ((np.dot(np.dot(WM.T,
                                    SXR / np.maximum(hatSXR ** 2, eps)),
                             HM.T))
                     / (np.dot(np.dot(WM.T,
                                      1 / np.maximum(hatSXR, eps)),
                               HM.T))) ** (omega * .1),
            eps)))
        betaL = np.diag(np.diag(np.maximum(
            betaL * ((np.dot(np.dot(WM.T,
                                    SXL / np.maximum(hatSXL ** 2, eps)),
                             HM.T))
                     / (np.dot(np.dot(WM.T,
                                      1 / np.maximum(hatSXL, eps)),
                               HM.T))) ** (omega * .1),
            eps)))
        betaR = betaR / np.maximum(betaR + betaL, eps)
        betaL = np.copy(np.eye(R) - betaR)

        hatSXR = np.maximum(np.dot(np.dot(WM, betaR ** 2), HM), eps)
        hatSXL = np.maximum(np.dot(np.dot(WM, betaL ** 2), HM), eps)
        recoError[counterError] = ISDistortion(SXR, hatSXR) \
                                  + ISDistortion(SXL, hatSXL)
        if verbose:
            print("Reconstruction error difference after BETA : ",
                  recoError[counterError] - recoError[counterError - 1])
        counterError += 1

    return betaR, betaL, HM, WM
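
# stereo_NMF (and the SIMM functions below) monitor convergence with the
# Itakura-Saito divergence, computed by ISDistortion, which is defined
# elsewhere in the package. A minimal sketch of the assumed definition:

def ISDistortion(X, Y):
    # Sketch (assumption): Itakura-Saito divergence between the power
    # spectrograms X and Y, summed over all time-frequency bins:
    #     d_IS(X | Y) = sum(X / Y - log(X / Y) - 1)
    ratio = X / Y
    return np.sum(ratio - np.log(ratio) - 1.)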
def main():
    import optparse

    usage = "usage: %prog [options] inputAudioFile"
    parser = optparse.OptionParser(usage)
    # Name of the output files:
    parser.add_option("-v", "--vocal-output-file",
                      dest="voc_output_file", type="string",
                      help="name of the audio output file for the estimated\n"
                           "solo (vocal) part.\n"
                           "If None, appends _lead to inputAudioFile.",
                      default=None)
    parser.add_option("-m", "--music-output-file",
                      dest="mus_output_file", type="string",
                      help="name of the audio output file for the estimated\n"
                           "music part.\n"
                           "If None, appends _acc to inputAudioFile.",
                      default=None)
    parser.add_option("-p", "--pitch-output-file",
                      dest="pitch_output_file", type="string",
                      help="name of the output file for the estimated "
                           "pitches.\n"
                           "If None, appends _pitches to inputAudioFile.",
                      default=None)
    # Some more optional options:
    parser.add_option("-d", "--with-display", dest="displayEvolution",
                      action="store_true", help="display the figures",
                      default=False)
    parser.add_option("-q", "--quiet", dest="verbose",
                      action="store_false",
                      help="use to quiet all verbose output",
                      default=True)
    parser.add_option("-n", "--dontseparate", dest="separateSignals",
                      action="store_false",
                      help="Trigger this option if you only desire to "
                           "estimate the melody",
                      default=True)
    parser.add_option("--nb-iterations", dest="nbiter",
                      help="number of iterations", type="int",
                      default=30)
    parser.add_option("--window-size", dest="windowSize", type="float",
                      default=0.04644,
                      help="size of analysis windows, in s.")
    parser.add_option("--Fourier-size", dest="fourierSize", type="int",
                      default=None,
                      help="size of Fourier transforms, in samples.")
    parser.add_option("--hopsize", dest="hopsize", type="float",
                      default=0.0058,
                      help="size of the hop between analysis windows, in s.")
    parser.add_option("--nb-accElements", dest="R", type="float",
                      default=40.0,
                      help="number of elements for the accompaniment.")
    parser.add_option("--with-melody", dest="melody", type="string",
                      default=None,
                      help="provide the melody in a file named MELODY, "
                           "with at each line: <time (s)> <F0 (Hz)>.")
    parser.add_option("--numAtomFilters", dest="P_numAtomFilters",
                      type="int", default=30,
                      help="Number of atomic filters - in WGAMMA.")
    parser.add_option("--numFilters", dest="K_numFilters", type="int",
                      default=10,
                      help="Number of filters for decomposition - in WPHI.")
    parser.add_option("--min-F0-Freq", dest="minF0", type="float",
                      default=100.0,
                      help="Minimum of fundamental frequency F0.")
    parser.add_option("--max-F0-Freq", dest="maxF0", type="float",
                      default=800.0,
                      help="Maximum of fundamental frequency F0.")
    parser.add_option("--step-F0s", dest="stepNotes", type="int",
                      default=20,
                      help="Number of F0s in dictionary for each semitone.")

    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("incorrect number of arguments, use option -h for help.")

    displayEvolution = options.displayEvolution
    if displayEvolution:
        import matplotlib.pyplot as plt
        import imageMatlab

        ## plt.rc('text', usetex=True)
        plt.rc('image', cmap='jet')  ## gray_r
        plt.ion()

    # Compulsory option: name of the input file:
    inputAudioFile = args[0]
    if inputAudioFile[-4:] != ".wav":
        raise ValueError("File not a WAV file? Only the WAV format is "
                         "supported, for now...")

    if options.mus_output_file is None:
        options.mus_output_file = inputAudioFile[:-4] + '_acc.wav'
    if options.voc_output_file is None:
        options.voc_output_file = inputAudioFile[:-4] + '_lead.wav'
    if options.pitch_output_file is None:
        options.pitch_output_file = inputAudioFile[:-4] + '_pitches.txt'

    print("Writing the following output files:")
    print("    separated lead          in", options.voc_output_file)
    print("    separated accompaniment in", options.mus_output_file)
    print("    separated lead + unvoc  in",
          options.voc_output_file[:-4] + '_VUIMM.wav')
    print("    separated acc - unvoc   in",
          options.mus_output_file[:-4] + '_VUIMM.wav')
    print("    estimated pitches       in", options.pitch_output_file)

    Fs, data = wav.read(inputAudioFile)
    # data = np.double(data) / 32768.0 # makes data vary from -1 to 1
    scaleData = 1.2 * data.max()  # to rescale the data.
    dataType = data.dtype
    data = np.double(data) / scaleData  # makes data vary from -1 to 1

    is_stereo = True
    if data.shape[0] == data.size:  # data is mono
        print("The audio file is not stereo. Processing it as mono.")
        print("(You could also try the older separateLead.py...)")
        is_stereo = False
        # data = np.vstack([data, data]).T
        # raise ValueError("number of dimensions of the input not 2")
    if is_stereo and data.shape[1] != 2:
        print("The data is multichannel, but not stereo...")
        print("Unfortunately this program does not scale well. Data is")
        print("reduced to its 2 first channels.")
        data = data[:, 0:2]

    # Processing the options:
    windowSizeInSamples = int(nextpow2(np.round(options.windowSize * Fs)))

    hopsize = np.round(options.hopsize * Fs)
    if hopsize != windowSizeInSamples / 8:
        # print("Overriding given hopsize to use 1/8th of window size")
        # hopsize = windowSizeInSamples / 8
        warnings.warn("Chosen hopsize: " + str(hopsize) +
                      ", while windowsize: " + str(windowSizeInSamples))

    if options.fourierSize is None:
        NFT = windowSizeInSamples
    else:
        NFT = options.fourierSize

    # number of iterations for each parameter estimation step:
    niter = options.nbiter
    # number of spectral shapes for the accompaniment:
    R = options.R

    eps = 10 ** -9

    if options.verbose:
        print("Some parameter settings:")
        print("    Size of analysis windows: ", windowSizeInSamples)
        print("    Hopsize: ", hopsize)
        print("    Size of Fourier transforms: ", NFT)
        print("    Number of iterations to be done: ", niter)
        print("    Number of elements in WM: ", R)

    if is_stereo:
        XR, F, N = stft(data[:, 0], fs=Fs, hopsize=hopsize,
                        window=sinebell(windowSizeInSamples), nfft=NFT)
        XL, F, N = stft(data[:, 1], fs=Fs, hopsize=hopsize,
                        window=sinebell(windowSizeInSamples), nfft=NFT)
        # SX is the power spectrogram:
        ## SXR = np.maximum(np.abs(XR) ** 2, 10 ** -8)
        ## SXL = np.maximum(np.abs(XL) ** 2, 10 ** -8)
        # SXR = np.abs(XR) ** 2
        # SXL = np.abs(XL) ** 2
        SX = np.maximum((0.5 * np.abs(XR + XL)) ** 2, eps)
    else:  # data is mono
        X, F, N = stft(data, fs=Fs, hopsize=hopsize,
                       window=sinebell(windowSizeInSamples), nfft=NFT)
        SX = np.maximum(np.abs(X) ** 2, eps)

    del data, F, N

    # TODO: also process these as options:
    # minimum and maximum F0 in glottal source spectra dictionary
    minF0 = options.minF0
    maxF0 = options.maxF0
    F, N = SX.shape
    stepNotes = options.stepNotes  # number of F0s within one semitone

    K = options.K_numFilters  # number of spectral shapes for the filter part
    P = options.P_numAtomFilters  # number of elements in the dictionary
                                  # of smooth filters
    chirpPerF0 = 1  # number of chirped spectral shapes between each F0
    # this feature should be further studied before
    # we find a good way of doing that.

    # Create the harmonic combs, for each F0 between minF0 and maxF0:
    F0Table, WF0 = \
        generate_WF0_chirped(minF0, maxF0, Fs, Nfft=NFT,
                             stepNotes=stepNotes,
                             lengthWindow=windowSizeInSamples, Ot=0.25,
                             perF0=chirpPerF0,
                             depthChirpInSemiTone=.15, loadWF0=True,
                             analysisWindow='sinebell')
    WF0 = WF0[0:F, :]  # ensure same size as SX
    NF0 = F0Table.size  # number of harmonic combs
    # Normalization:
    WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0))

    # Create the dictionary of smooth filters, for the filter part of
    # the lead instrument:
    WGAMMA = generateHannBasis(F, NFT, Fs=Fs, frequencyScale='linear',
                               numberOfBasis=P, overlap=.75)

    if displayEvolution:
        plt.figure(1); plt.clf()
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Leading source number $u$', fontsize=16)
        plt.ion()
        # plt.show()
        ## the following seems superfluous if mpl's backend is macosx...
        ## raw_input("Press Return to resume the program. Be sure that the"
        ##           " figure has already been displayed, so that the"
        ##           " evolution of HF0 will be visible.")

    if options.melody is None:
        ## section to estimate the melody, with the monophonic algo:

        # First round of parameter estimation:
        HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM(
            # the data to be fitted to:
            SX,
            # the basis matrices for the spectral combs
            WF0,
            # and for the elementary filters:
            WGAMMA,
            # number of desired filters, accompaniment spectra:
            numberOfFilters=K,
            numberOfAccompanimentSpectralShapes=R,
            # if any, initial amplitude matrices for
            HGAMMA0=None,
            HPHI0=None,
            HF00=None,
            WM0=None,
            HM0=None,
            # Some more optional arguments, to control the "convergence"
            # of the algo
            numberOfIterations=niter,
            updateRulePower=1.,
            stepNotes=stepNotes,
            lambdaHF0=0.0 / (1.0 * SX.max()),
            alphaHF0=0.9,
            verbose=options.verbose,
            displayEvolution=displayEvolution)

        if displayEvolution:
            h2 = plt.figure(2); plt.clf()
            imageMatlab.imageM(20 * np.log10(HF0))
            matMax = (20 * np.log10(HF0)).max()
            matMed = np.median(20 * np.log10(HF0))
            plt.clim([matMed - 100, matMax])

        # Viterbi decoding to estimate the predominant fundamental
        # frequency line.
        # Create the transition probability matrix - ad hoc parameter
        # 'scale'.
        # TODO: use "learned" parameter scale (NB: after many trials,
        # the provided scale and parameterization seem robust)
        scale = 1.0
        transitions = np.exp(-np.floor(np.arange(0, NF0) / stepNotes)
                             * scale)
        cutoffnote = 2 * 5 * stepNotes
        transitions[cutoffnote:] = transitions[cutoffnote - 1]

        transitionMatrixF0 = np.zeros([NF0 + 1, NF0 + 1])  # toeplitz matrix
        b = np.arange(NF0)
        transitionMatrixF0[0:NF0, 0:NF0] = \
            transitions[np.array(np.abs(np.outer(np.ones(NF0), b)
                                        - np.outer(b, np.ones(NF0))),
                                 dtype=int)]
        pf_0 = transitions[cutoffnote - 1] * 10 ** (-90)
        p0_0 = transitions[cutoffnote - 1] * 10 ** (-100)
        p0_f = transitions[cutoffnote - 1] * 10 ** (-80)
        transitionMatrixF0[0:NF0, NF0] = pf_0
        transitionMatrixF0[NF0, 0:NF0] = p0_f
        transitionMatrixF0[NF0, NF0] = p0_0

        sumTransitionMatrixF0 = np.sum(transitionMatrixF0, axis=1)
        transitionMatrixF0 = transitionMatrixF0 \
                             / np.outer(sumTransitionMatrixF0,
                                        np.ones(NF0 + 1))

        # prior probabilities, and setting the array for Viterbi tracking:
        priorProbabilities = 1 / (NF0 + 1.0) * np.ones([NF0 + 1])
        logHF0 = np.zeros([NF0 + 1, N])
        normHF0 = np.amax(HF0, axis=0)
        barHF0 = np.array(HF0)

        logHF0[0:NF0, :] = np.log(barHF0)
        logHF0[0:NF0, normHF0 == 0] = np.amin(logHF0[logHF0 > -np.Inf])
        logHF0[NF0, :] = np.maximum(np.amin(logHF0[logHF0 > -np.Inf]), -100)

        indexBestPath = viterbiTrackingArray(
            logHF0, np.log(priorProbabilities),
            np.log(transitionMatrixF0), verbose=options.verbose)

        if displayEvolution:
            h2.hold(True)
            plt.plot(indexBestPath, '-b')
            h2.hold(False)
            plt.axis('tight')

        del logHF0

        # detection of silences:
        # computing the melody-restricted F0 amplitude matrix HF00
        # (which will be used as initial HF0 for the further algo):
        HF00 = np.zeros([NF0 * chirpPerF0, N])

        scopeAllowedHF0 = 2.0 / 1.0

        # computing indices for and around the melody indices:
        # dim1index are indices along axis 0, and dim2index along axis 1
        # of HF0.
        # TODO: use numpy broadcasting to make this "clearer" (if
        # possible...)
        dim1index = np.array(
            np.maximum(
                np.minimum(
                    np.outer(chirpPerF0 * indexBestPath,
                             np.ones(int(chirpPerF0
                                         * (2 * np.floor(stepNotes
                                                         / scopeAllowedHF0)
                                            + 1))))
                    + np.outer(np.ones(N),
                               np.arange(-chirpPerF0
                                         * np.floor(stepNotes
                                                    / scopeAllowedHF0),
                                         chirpPerF0
                                         * (np.floor(stepNotes
                                                     / scopeAllowedHF0)
                                            + 1))),
                    chirpPerF0 * NF0 - 1),
                0),
            dtype=int).reshape(1, int(N * chirpPerF0
                                      * (2 * np.floor(stepNotes
                                                      / scopeAllowedHF0)
                                         + 1)))
        dim2index = np.outer(np.arange(N),
                             np.ones(int(chirpPerF0
                                         * (2 * np.floor(stepNotes
                                                         / scopeAllowedHF0)
                                            + 1)),
                                     dtype=int)
                             ).reshape(1, int(N * chirpPerF0
                                              * (2 * np.floor(stepNotes
                                                              / scopeAllowedHF0)
                                                 + 1)))
        HF00[dim1index, dim2index] = HF0[dim1index, dim2index]  # HF0.max()

        HF00[:, indexBestPath == (NF0 - 1)] = 0.0
        HF00[:, indexBestPath == 0] = 0.0

        # remove frames with less than (100 * thres_energy)% of the total
        # energy:
        thres_energy = 0.000584
        SF0 = np.maximum(np.dot(WF0, HF00), eps)
        SPHI = np.maximum(np.dot(WGAMMA, np.dot(HGAMMA, HPHI)), eps)
        SM = np.maximum(np.dot(WM, HM), eps)
        hatSX = np.maximum(SPHI * SF0 + SM, eps)
        energyMel = np.sum((((SPHI * SF0) / hatSX) ** 2) * SX, axis=0)
        energyMelSorted = np.sort(energyMel)
        energyMelCumul = np.cumsum(energyMelSorted)
        energyMelCumulNorm = energyMelCumul / max(energyMelCumul[-1], eps)
        # normalized to the maximum of energy:
        # expressed in 0.01 times the percentage
        # (guard: np.nonzero never returns None, so check for an empty
        # result instead of the original `is None` test)
        ind_999 = np.nonzero(energyMelCumulNorm > thres_energy)[0]
        ind_999 = ind_999[0] if ind_999.size > 0 else N - 1

        melNotPresent = (energyMel <= energyMelCumulNorm[ind_999])
        indexBestPath[melNotPresent] = 0
    else:
        ## take the provided melody line:
        # load melody from file:
        melodyFromFile = np.loadtxt(options.melody)
        sizeProvidedMel = melodyFromFile.shape
        if len(sizeProvidedMel) == 1:
            print("The melody should be provided as <Time (s)> <F0 (Hz)>.")
            raise ValueError("Bad melody format")
        melTimeStamps = melodyFromFile[:, 0]  # + 1024 / np.double(Fs)
        melFreqHz = melodyFromFile[:, 1]
        if minF0 > melFreqHz[melFreqHz > 40.0].min() \
                or maxF0 < melFreqHz.max():
            minF0 = melFreqHz[melFreqHz > 40.0].min() * .97
            maxF0 = np.maximum(melFreqHz.max() * 1.03, 2 * minF0 * 1.03)
            print("Recomputing the source basis for")
            print("minF0 = ", minF0, "Hz and maxF0 = ", maxF0, "Hz.")
            # Create the harmonic combs, for each F0 between minF0 and
            # maxF0:
            F0Table, WF0 = \
                generate_WF0_chirped(minF0, maxF0, Fs, Nfft=NFT,
                                     stepNotes=stepNotes,
                                     lengthWindow=windowSizeInSamples,
                                     Ot=0.25, perF0=chirpPerF0,
                                     depthChirpInSemiTone=.15)
            WF0 = WF0[0:F, :]  # ensure same size as SX
            NF0 = F0Table.size  # number of harmonic combs
            # Normalization:
            WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0))

        sigTimeStamps = np.arange(N) * hopsize / np.double(Fs)
        distMatTimeStamps = np.abs(
            np.outer(np.ones(sizeProvidedMel[0]), sigTimeStamps)
            - np.outer(melTimeStamps, np.ones(N)))
        minDistTimeStamps = distMatTimeStamps.argmin(axis=0)
        f0BestPath = melFreqHz[minDistTimeStamps]
        distMatF0 = np.abs(np.outer(np.ones(NF0), f0BestPath)
                           - np.outer(F0Table, np.ones(N)))
        indexBestPath = distMatF0.argmin(axis=0)
        # setting silences to 0, with tolerance = 1/2 window length
        indexBestPath[distMatTimeStamps[minDistTimeStamps, range(N)]
                      >= 0.5 * options.windowSize] = 0
        indexBestPath[f0BestPath <= 0] = 0

    freqMelody = F0Table[np.array(indexBestPath, dtype=int)]
    freqMelody[indexBestPath == 0] = -freqMelody[indexBestPath == 0]
    np.savetxt(options.pitch_output_file,
               np.array([np.arange(N) * hopsize / np.double(Fs),
                         freqMelody]).T)

    # If separation is required:
    if options.separateSignals:
        # Second round of parameter estimation, with specific initial
        # HF00:
        HF00 = np.zeros([NF0 * chirpPerF0, N])

        scopeAllowedHF0 = 2.0 / 1.0

        # indexes for HF00:
        # TODO: reprogram this with a 'where'?...
        dim1index = np.array(
            np.maximum(
                np.minimum(
                    np.outer(chirpPerF0 * indexBestPath,
                             np.ones(int(chirpPerF0
                                         * (2 * np.floor(stepNotes
                                                         / scopeAllowedHF0)
                                            + 1))))
                    + np.outer(np.ones(N),
                               np.arange(-chirpPerF0
                                         * np.floor(stepNotes
                                                    / scopeAllowedHF0),
                                         chirpPerF0
                                         * (np.floor(stepNotes
                                                     / scopeAllowedHF0)
                                            + 1))),
                    chirpPerF0 * NF0 - 1),
                0),
            dtype=int)
        dim1index = dim1index[indexBestPath != 0, :]
        dim1index = dim1index.reshape(1, dim1index.size)

        dim2index = np.outer(np.arange(N),
                             np.ones(int(chirpPerF0
                                         * (2 * np.floor(stepNotes
                                                         / scopeAllowedHF0)
                                            + 1)),
                                     dtype=int))
        dim2index = dim2index[indexBestPath != 0, :]
        dim2index = dim2index.reshape(1, dim2index.size)

        HF00[dim1index, dim2index] = 1  # HF0.max()

        HF00[:, indexBestPath == (NF0 - 1)] = 0.0
        HF00[:, indexBestPath == 0] = 0.0

        WF0effective = WF0
        HF00effective = HF00
        if options.melody is None:
            del HF0, HGAMMA, HPHI, HM, WM, HF00

        if is_stereo:
            del SX
            SXR = np.maximum(np.abs(XR) ** 2, eps)
            SXL = np.maximum(np.abs(XL) ** 2, eps)
            alphaR, alphaL, HGAMMA, HPHI, HF0, \
                betaR, betaL, HM, WM, recoError2 = SIMM.Stereo_SIMM(
                    # the data to be fitted to:
                    SXR, SXL,
                    # the basis matrices for the spectral combs
                    WF0effective,
                    # and for the elementary filters:
                    WGAMMA,
                    # number of desired filters, accompaniment spectra:
                    numberOfFilters=K,
                    numberOfAccompanimentSpectralShapes=R,
                    # if any, initial amplitude matrices for
                    HGAMMA0=None,
                    HPHI0=None,
                    HF00=HF00effective,
                    WM0=None,
                    HM0=None,
                    # Some more optional arguments, to control the
                    # "convergence" of the algo
                    numberOfIterations=niter,
                    updateRulePower=1.0,
                    stepNotes=stepNotes,
                    lambdaHF0=0.0 / (1.0 * SXR.max()),
                    alphaHF0=0.9,
                    verbose=options.verbose,
                    displayEvolution=displayEvolution)

            WPHI = np.dot(WGAMMA, HGAMMA)
            SPHI = np.dot(WPHI, HPHI)
            SF0 = np.dot(WF0effective, HF0)

            hatSXR = (alphaR ** 2) * SF0 * SPHI \
                     + np.dot(np.dot(WM, betaR ** 2), HM)
            hatSXL = (alphaL ** 2) * SF0 * SPHI \
                     + np.dot(np.dot(WM, betaL ** 2), HM)

            hatVR = (alphaR ** 2) * SPHI * SF0 / hatSXR * XR
            vestR = istft(hatVR, hopsize=int(hopsize), nfft=int(NFT),
                          window=sinebell(windowSizeInSamples)) / 4.0
            hatVR = (alphaL ** 2) * SPHI * SF0 / hatSXL * XL
            vestL = istft(hatVR, hopsize=int(hopsize), nfft=int(NFT),
                          window=sinebell(windowSizeInSamples)) / 4.0

            vestR = np.array(np.round(vestR * scaleData), dtype=dataType)
            vestL = np.array(np.round(vestL * scaleData), dtype=dataType)
            wav.write(options.voc_output_file, Fs,
                      np.array([vestR, vestL]).T)

            hatMR = (np.dot(np.dot(WM, betaR ** 2), HM)) / hatSXR * XR
            mestR = istft(hatMR, hopsize=int(hopsize), nfft=int(NFT),
                          window=sinebell(windowSizeInSamples)) / 4.0
            hatMR = (np.dot(np.dot(WM, betaL ** 2), HM)) / hatSXL * XL
            mestL = istft(hatMR, hopsize=int(hopsize), nfft=int(NFT),
                          window=sinebell(windowSizeInSamples)) / 4.0

            mestR = np.array(np.round(mestR * scaleData), dtype=dataType)
            mestL = np.array(np.round(mestL * scaleData), dtype=dataType)
            wav.write(options.mus_output_file, Fs,
                      np.array([mestR, mestL]).T)

            del hatMR, mestL, vestL, vestR, mestR, hatVR, hatSXR, hatSXL, \
                SPHI, SF0

            # adding the unvoiced part in the source basis:
            WUF0 = np.hstack([WF0, np.ones([WF0.shape[0], 1])])
            HUF0 = np.vstack([HF0, np.ones([1, HF0.shape[1]])])
            ## HUF0[-1, :] = HF0.sum(axis=0) # should we do this?

            alphaR, alphaL, HGAMMA, HPHI, HF0, \
                betaR, betaL, HM, WM, recoError3 = SIMM.Stereo_SIMM(
                    # the data to be fitted to:
                    SXR, SXL,
                    # the basis matrices for the spectral combs
                    WUF0,
                    # and for the elementary filters:
                    WGAMMA,
                    # number of desired filters, accompaniment spectra:
                    numberOfFilters=K,
                    numberOfAccompanimentSpectralShapes=R,
                    # if any, initial amplitude matrices for
                    HGAMMA0=HGAMMA,
                    HPHI0=HPHI,
                    HF00=HUF0,
                    WM0=None,  # WM,
                    HM0=None,  # HM,
                    # Some more optional arguments, to control the
                    # "convergence" of the algo
                    numberOfIterations=niter,
                    updateRulePower=1.0,
                    stepNotes=stepNotes,
                    lambdaHF0=0.0 / (1.0 * SXR.max()),
                    alphaHF0=0.9,
                    verbose=options.verbose,
                    displayEvolution=displayEvolution,
                    updateHGAMMA=False)

            WPHI = np.dot(WGAMMA, HGAMMA)
            SPHI = np.dot(WPHI, HPHI)
            SF0 = np.dot(WUF0, HF0)

            hatSXR = (alphaR ** 2) * SF0 * SPHI \
                     + np.dot(np.dot(WM, betaR ** 2), HM)
            hatSXL = (alphaL ** 2) * SF0 * SPHI \
                     + np.dot(np.dot(WM, betaL ** 2), HM)

            hatVR = (alphaR ** 2) * SPHI * SF0 / hatSXR * XR
            vestR = istft(hatVR, hopsize=int(hopsize), nfft=int(NFT),
                          window=sinebell(windowSizeInSamples)) / 4.0
            hatVR = (alphaL ** 2) * SPHI * SF0 / hatSXL * XL
            vestL = istft(hatVR, hopsize=int(hopsize), nfft=int(NFT),
                          window=sinebell(windowSizeInSamples)) / 4.0

            outputFileName = options.voc_output_file[:-4] + '_VUIMM.wav'
            vestR = np.array(np.round(vestR * scaleData), dtype=dataType)
            vestL = np.array(np.round(vestL * scaleData), dtype=dataType)
            wav.write(outputFileName, Fs, np.array([vestR, vestL]).T)

            hatMR = (np.dot(np.dot(WM, betaR ** 2), HM)) / hatSXR * XR
            mestR = istft(hatMR, hopsize=int(hopsize), nfft=int(NFT),
                          window=sinebell(windowSizeInSamples)) / 4.0
            hatMR = (np.dot(np.dot(WM, betaL ** 2), HM)) / hatSXL * XL
            mestL = istft(hatMR, hopsize=int(hopsize), nfft=int(NFT),
                          window=sinebell(windowSizeInSamples)) / 4.0

            outputFileName = options.mus_output_file[:-4] + '_VUIMM.wav'
            mestR = np.array(np.round(mestR * scaleData), dtype=dataType)
            mestL = np.array(np.round(mestL * scaleData), dtype=dataType)
            wav.write(outputFileName, Fs, np.array([mestR, mestL]).T)
        else:
            # running on monophonic data:
            HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM(
                # the data to be fitted to:
                SX,
                # the basis matrices for the spectral combs
                WF0effective,
                # and for the elementary filters:
                WGAMMA,
                # number of desired filters, accompaniment spectra:
                numberOfFilters=K,
                numberOfAccompanimentSpectralShapes=R,
                # if any, initial amplitude matrices for
                HGAMMA0=None,
                HPHI0=None,
                HF00=HF00effective,
                WM0=None,
                HM0=None,
                # Some more optional arguments, to control the
                # "convergence" of the algo
                numberOfIterations=niter,
                updateRulePower=1.,
                stepNotes=stepNotes,
                lambdaHF0=0.0 / (1.0 * SX.max()),
                alphaHF0=0.9,
                verbose=options.verbose,
                displayEvolution=displayEvolution)

            WPHI = np.dot(WGAMMA, HGAMMA)
            SPHI = np.dot(WPHI, HPHI)
            SF0 = np.dot(WF0effective, HF0)
            SM = np.dot(WM, HM)
            hatSX = SF0 * SPHI + SM

            hatV = SPHI * SF0 / hatSX * X
            vest = istft(hatV, hopsize=int(hopsize), nfft=int(NFT),
                         window=sinebell(windowSizeInSamples)) / 4.0
            vest = np.array(np.round(vest * scaleData), dtype=dataType)
            wav.write(options.voc_output_file, Fs, vest)

            hatM = SM / hatSX * X
            mest = istft(hatM, hopsize=int(hopsize), nfft=int(NFT),
                         window=sinebell(windowSizeInSamples)) / 4.0
            mest = np.array(np.round(mest * scaleData), dtype=dataType)
            wav.write(options.mus_output_file, Fs, mest)

            del hatM, vest, mest, hatV, hatSX, SPHI, SF0

            # adding the unvoiced part in the source basis:
            WUF0 = np.hstack([WF0, np.ones([WF0.shape[0], 1])])
            HUF0 = np.vstack([HF0, np.ones([1, HF0.shape[1]])])
            ## HUF0[-1, :] = HF0.sum(axis=0) # should we do this?

            HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM(
                # the data to be fitted to:
                SX,
                # the basis matrices for the spectral combs
                WUF0,
                # and for the elementary filters:
                WGAMMA,
                # number of desired filters, accompaniment spectra:
                numberOfFilters=K,
                numberOfAccompanimentSpectralShapes=R,
                # if any, initial amplitude matrices for
                HGAMMA0=HGAMMA,
                HPHI0=HPHI,
                HF00=HUF0,
                WM0=None,
                HM0=None,
                # Some more optional arguments, to control the
                # "convergence" of the algo
                numberOfIterations=niter,
                updateRulePower=1.,
                stepNotes=stepNotes,
                lambdaHF0=0.0 / (1.0 * SX.max()),
                alphaHF0=0.9,
                verbose=options.verbose,
                displayEvolution=displayEvolution,
                updateHGAMMA=False)

            WPHI = np.dot(WGAMMA, HGAMMA)
            SPHI = np.dot(WPHI, HPHI)
            SF0 = np.dot(WUF0, HF0)
            SM = np.dot(WM, HM)
            hatSX = SF0 * SPHI + SM

            hatV = SPHI * SF0 / hatSX * X
            vest = istft(hatV, hopsize=int(hopsize), nfft=int(NFT),
                         window=sinebell(windowSizeInSamples)) / 4.0
            vest = np.array(np.round(vest * scaleData), dtype=dataType)
            outputFileName = options.voc_output_file[:-4] + '_VUIMM.wav'
            wav.write(outputFileName, Fs, vest)

            hatM = SM / hatSX * X
            mest = istft(hatM, hopsize=int(hopsize), nfft=int(NFT),
                         window=sinebell(windowSizeInSamples)) / 4.0
            mest = np.array(np.round(mest * scaleData), dtype=dataType)
            outputFileName = options.mus_output_file[:-4] + '_VUIMM.wav'
            wav.write(outputFileName, Fs, mest)

    if displayEvolution:
        plt.close('all')

    print("Done!")
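
# The newer main() above rounds the analysis window length up to a power
# of two with nextpow2 (so that the FFT size can default to the window
# size). The helper is defined elsewhere in the package; a minimal sketch
# of the assumed behaviour:

def nextpow2(i):
    # Sketch (assumption): smallest power of 2 greater than or equal
    # to i, e.g. nextpow2(2049) == 4096.
    n = 2
    while n < i:
        n = n * 2
    return n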
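
# Neither main() is wired to run on import; note also that this file
# defines main() twice, so the later (more configurable) definition is
# the one that is actually bound. A typical entry point, usually placed
# at the end of the file, might look as follows (a sketch; the option
# names come from the parser in the second main(), and the file name in
# the example is hypothetical):

if __name__ == '__main__':
    main()

# Example invocation:
#   python separateLeadStereo.py --nb-iterations 30 -d myStereoSong.wav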
def SIMM(# the data to be fitted to:
         SX,
         # the basis matrices for the spectral combs
         WF0,
         # and for the elementary filters:
         WGAMMA,
         # number of desired filters, accompaniment spectra:
         numberOfFilters=4,
         numberOfAccompanimentSpectralShapes=10,
         # if any, initial amplitude matrices for
         HGAMMA0=None, HPHI0=None,
         HF00=None,
         WM0=None, HM0=None,
         # Some more optional arguments, to control the "convergence"
         # of the algo
         numberOfIterations=1000, updateRulePower=1.0,
         stepNotes=4,
         lambdaHF0=0.00, alphaHF0=0.99,
         displayEvolution=False, verbose=True,
         makeMovie=False,
         updateHGAMMA=True,
         computeISDistortion=False):
    """
    HGAMMA, HPHI, HF0, HM, WM, recoError =
        SIMM(SX, WF0, WGAMMA, numberOfFilters=4,
             numberOfAccompanimentSpectralShapes=10, HGAMMA0=None,
             HPHI0=None, HF00=None, WM0=None, HM0=None,
             numberOfIterations=1000, updateRulePower=1.0,
             stepNotes=4, lambdaHF0=0.00, alphaHF0=0.99,
             displayEvolution=False, verbose=True)

    Implementation of the Smooth-filters Instantaneous Mixture Model
    (SIMM). This model can be used to estimate the main melody of a song,
    and to separate the lead voice from the accompaniment, provided that
    the basis WF0 is made of elements associated with particular pitches.

    Inputs:
        SX
            the F x N power spectrogram to be approximated.
            F is the number of frequency bins, and
            N is the number of analysis frames
        WF0
            the F x NF0 basis matrix containing the NF0 source elements
        WGAMMA
            the F x P basis matrix of P smooth elementary filters
        numberOfFilters
            the number of filters K to be considered
        numberOfAccompanimentSpectralShapes
            the number of spectral shapes R for the accompaniment
        HGAMMA0
            the P x K decomposition matrix of WPHI on WGAMMA
        HPHI0
            the K x N amplitude matrix of the filter part of the
            lead instrument
        HF00
            the NF0 x N amplitude matrix for the source part of the
            lead instrument
        WM0
            the F x R matrix of spectral shapes for the accompaniment
        HM0
            the R x N amplitude matrix associated with each of the R
            accompaniment spectral shapes
        numberOfIterations
            the number of iterations for the estimation algorithm
        updateRulePower
            the power to which the multiplicative gradient is raised
        stepNotes
            the number of elements in WF0 per semitone. stepNotes=4
            means that there are 48 elements per octave in WF0.
        lambdaHF0
            Lagrangian multiplier for the octave control
        alphaHF0
            parameter that controls how much influence a lower octave
            can have on the upper octave's amplitude.

    Outputs:
        HGAMMA
            the estimated P x K decomposition matrix of WPHI on WGAMMA
        HPHI
            the estimated K x N amplitude matrix of the filter part
        HF0
            the estimated NF0 x N amplitude matrix for the source part
        HM
            the estimated R x N amplitude matrix for the accompaniment
        WM
            the estimated F x R spectral shapes for the accompaniment
        recoError
            the successive values of the Itakura-Saito divergence
            between the power spectrogram and the spectrogram computed
            with the updated estimates of the matrices.

    Please also refer to the following article for more details about
    the algorithm within this function, as well as the meaning of the
    different matrices that are involved:
        J.-L. Durrieu, G. Richard, B. David and C. Fevotte,
        "Source/Filter Model for Unsupervised Main Melody Extraction
        From Polyphonic Audio Signals",
        IEEE Transactions on Audio, Speech and Language Processing,
        Vol. 18, No. 3, March 2010.
    """
    eps = 10 ** (-20)

    if displayEvolution:
        import matplotlib.pyplot as plt
        from imageMatlab import imageM
        plt.ion()
        print("Is the display interactive? ", plt.isinteractive())

    # renamed for convenience:
    K = numberOfFilters
    R = int(numberOfAccompanimentSpectralShapes)
    omega = updateRulePower

    F, N = SX.shape
    Fwf0, NF0 = WF0.shape
    Fwgamma, P = WGAMMA.shape

    # Checking the sizes of the matrices:
    if Fwf0 != F:
        # (the original returned False here, which the callers cannot
        # unpack; raising is more informative)  # TO BE REVISED!!!
        raise ValueError("WF0 and SX must have the same number of rows "
                         "(frequency bins).")

    if HGAMMA0 is None:
        HGAMMA0 = np.abs(randn(P, K))
    else:
        if not isinstance(HGAMMA0, np.ndarray):  # default behaviour
            HGAMMA0 = np.array(HGAMMA0)
        Phgamma0, Khgamma0 = HGAMMA0.shape
        if Phgamma0 != P or Khgamma0 != K:
            print("Wrong dimensions for given HGAMMA0,")
            print("random initialization used instead")
            HGAMMA0 = np.abs(randn(P, K))
    HGAMMA = np.copy(HGAMMA0)

    if HPHI0 is None:  # default behaviour
        HPHI = np.abs(randn(K, N))
    else:
        Khphi0, Nhphi0 = np.array(HPHI0).shape
        if Khphi0 != K or Nhphi0 != N:
            print("Wrong dimensions for given HPHI0,")
            print("random initialization used instead")
            HPHI = np.abs(randn(K, N))
        else:
            HPHI = np.copy(np.array(HPHI0))

    if HF00 is None:
        HF00 = np.abs(randn(NF0, N))
    else:
        if np.array(HF00).shape[0] == NF0 and np.array(HF00).shape[1] == N:
            HF00 = np.array(HF00)
        else:
            print("Wrong dimensions for given HF00,")
            print("random initialization used instead")
            HF00 = np.abs(randn(NF0, N))
    HF0 = np.copy(HF00)

    if HM0 is None:
        HM0 = np.abs(randn(R, N))
    else:
        if np.array(HM0).shape[0] == R and np.array(HM0).shape[1] == N:
            HM0 = np.array(HM0)
        else:
            print("Wrong dimensions for given HM0,")
            print("random initialization used instead")
            HM0 = np.abs(randn(R, N))
    HM = np.copy(HM0)

    if WM0 is None:
        WM0 = np.abs(randn(F, R))
    else:
        if np.array(WM0).shape[0] == F and np.array(WM0).shape[1] == R:
            WM0 = np.array(WM0)
        else:
            print("Wrong dimensions for given WM0,")
            print("random initialization used instead")
            WM0 = np.abs(randn(F, R))
    WM = np.copy(WM0)

    # Iterations to estimate the SIMM parameters:
    WPHI = np.dot(WGAMMA, HGAMMA)
    SF0 = np.dot(WF0, HF0)
    SPHI = np.dot(WPHI, HPHI)
    SM = np.dot(WM, HM)
    hatSX = SF0 * SPHI + SM

    ## SX = SX + np.abs(randn(F, N)) ** 2
    # should not need this line, which ensures that data is not 0 everywhere
# temporary matrices tempNumFbyN = np.zeros([F, N]) tempDenFbyN = np.zeros([F, N]) # Array containing the reconstruction error after the update of each # of the parameter matrices: recoError = np.zeros([numberOfIterations * 5 * 2 + NF0 * 2 + 1]) recoError[0] = ISDistortion(SX, hatSX) if verbose: print("Reconstruction error at beginning: ", recoError[0]) counterError = 1 if displayEvolution: h1 = plt.figure(1) if makeMovie: dirName = 'tmp%s/' %time.strftime("%Y%m%d%H%M%S") os.system('mkdir %s' %dirName) # Main loop for multiplicative updating rules: for n in np.arange(numberOfIterations): # order of re-estimation: HF0, HPHI, HM, HGAMMA, WM if verbose: print("iteration ", n, " over ", numberOfIterations) if displayEvolution: h1.clf();imageM(db(HF0)); plt.clim([np.amax(db(HF0))-100, np.amax(db(HF0))]);plt.draw(); ## h1.clf(); ## imageM(HF0 * np.outer(np.ones([NF0, 1]), ## 1 / (HF0.max(axis=0)))); if makeMovie: filename = dirName + '%04d' % n + '.png' plt.savefig(filename, dpi=100) # updating HF0: tempNumFbyN = (SPHI * SX) / np.maximum(hatSX ** 2, eps) tempDenFbyN = SPHI / np.maximum(hatSX, eps) # This to enable octave control HF0[np.arange(12 * stepNotes, NF0), :] \ = HF0[np.arange(12 * stepNotes, NF0), :] \ * (np.dot(WF0[:, np.arange(12 * stepNotes, NF0)].T, tempNumFbyN) \ / np.maximum( np.dot(WF0[:, np.arange(12 * stepNotes, NF0)].T, tempDenFbyN) \ + lambdaHF0 * (- (alphaHF0 - 1.0) \ / np.maximum(HF0[ np.arange(12 * stepNotes, NF0), :], eps) \ + HF0[ np.arange(NF0 - 12 * stepNotes), :]), eps)) ** omega HF0[np.arange(12 * stepNotes), :] \ = HF0[np.arange(12 * stepNotes), :] \ * (np.dot(WF0[:, np.arange(12 * stepNotes)].T, tempNumFbyN) / np.maximum( np.dot(WF0[:, np.arange(12 * stepNotes)].T, tempDenFbyN), eps)) ** omega ### normal update rules without checking octaves: ##HF0 = HF0 * (np.dot(WF0.T, tempNumFbyN) / ## np.maximum(np.dot(WF0.T, tempDenFbyN), eps)) ** omega SF0 = np.maximum(np.dot(WF0, HF0),eps) hatSX = np.maximum(SF0 * SPHI + SM,eps) if computeISDistortion: recoError[counterError] = ISDistortion(SX, hatSX) if verbose: print("Reconstruction error difference after HF0 : ",) print(recoError[counterError] - recoError[counterError - 1]) counterError += 1 # updating HPHI tempNumFbyN = (SF0 * SX) / np.maximum(hatSX ** 2, eps) tempDenFbyN = SF0 / np.maximum(hatSX, eps) HPHI = HPHI * (np.dot(WPHI.T, tempNumFbyN) / \ np.maximum(np.dot(WPHI.T, tempDenFbyN), eps)) ** omega sumHPHI = np.sum(HPHI, axis=0) HPHI[:, sumHPHI>0] = HPHI[:, sumHPHI>0] / \ np.outer(np.ones(K), sumHPHI[sumHPHI>0]) HF0 = HF0 * np.outer(np.ones(NF0), sumHPHI) SF0 = np.maximum(np.dot(WF0, HF0), eps) SPHI = np.maximum(np.dot(WPHI, HPHI), eps) hatSX = np.maximum(SF0 * SPHI + SM, eps) if computeISDistortion: recoError[counterError] = ISDistortion(SX, hatSX) if verbose: print("Reconstruction error difference after HPHI : ", \ recoError[counterError] - recoError[counterError - 1]) counterError += 1 # updating HM tempNumFbyN = SX / np.maximum(hatSX ** 2, eps) tempDenFbyN = 1 / np.maximum(hatSX, eps) HM = np.maximum(HM * (np.dot(WM.T, tempNumFbyN) / \ np.maximum(np.dot(WM.T, tempDenFbyN), eps)) ** \ omega, eps) SM = np.maximum(np.dot(WM, HM), eps) hatSX = np.maximum(SF0 * SPHI + SM, eps) if computeISDistortion: recoError[counterError] = ISDistortion(SX, hatSX) if verbose: print("Reconstruction error difference after HM : ", \ recoError[counterError] - recoError[counterError - 1]) counterError += 1 # updating HGAMMA if updateHGAMMA: tempNumFbyN = (SF0 * SX) / np.maximum(hatSX ** 2, eps) tempDenFbyN = SF0 / np.maximum(hatSX, eps) 
HGAMMA = np.maximum(\ HGAMMA * (np.dot(WGAMMA.T, \ np.dot(tempNumFbyN, HPHI.T)) / \ np.maximum(\ np.dot(WGAMMA.T, \ np.dot(tempDenFbyN, HPHI.T)), eps)) ** \ omega, eps) sumHGAMMA = np.sum(HGAMMA, axis=0) HGAMMA[:, sumHGAMMA>0] = HGAMMA[:, sumHGAMMA>0] / \ np.outer(np.ones(P), \ sumHGAMMA[sumHGAMMA>0]) HPHI = HPHI * np.outer(sumHGAMMA, np.ones(N)) sumHPHI = np.sum(HPHI, axis=0) HPHI[:, sumHPHI>0] = HPHI[:, sumHPHI>0] / np.outer(np.ones(K), sumHPHI[sumHPHI>0]) HF0 = HF0 * np.outer(np.ones(NF0), sumHPHI) WPHI = np.maximum(np.dot(WGAMMA, HGAMMA), eps) SF0 = np.maximum(np.dot(WF0, HF0), eps) SPHI = np.maximum(np.dot(WPHI, HPHI), eps) hatSX = np.maximum(SF0 * SPHI + SM, eps) if computeISDistortion: recoError[counterError] = ISDistortion(SX, hatSX) if verbose: print("Reconstruction error difference after HGAMMA: ",) print(recoError[counterError] - recoError[counterError - 1]) counterError += 1 # updating WM, after a certain number of iterations (here, after 1 iteration) if n > -1: # this test can be used such that WM is updated only # after a certain number of iterations tempNumFbyN = SX / np.maximum(hatSX ** 2, eps) tempDenFbyN = 1 / np.maximum(hatSX, eps) WM = np.maximum(WM * (np.dot(tempNumFbyN, HM.T) / np.maximum(np.dot(tempDenFbyN, HM.T), eps)) ** omega, eps) sumWM = np.sum(WM, axis=0) WM[:, sumWM>0] = (WM[:, sumWM>0] / np.outer(np.ones(F),sumWM[sumWM>0])) HM = HM * np.outer(sumWM, np.ones(N)) SM = np.maximum(np.dot(WM, HM), eps) hatSX = np.maximum(SF0 * SPHI + SM, eps) if computeISDistortion: recoError[counterError] = ISDistortion(SX, hatSX) if verbose: print("Reconstruction error difference after WM : ",) print(recoError[counterError] - recoError[counterError - 1]) counterError += 1 return HGAMMA, HPHI, HF0, HM, WM, recoError
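# ----------------------------------------------------------------------------
# Note: the updates in SIMM above all follow the same multiplicative pattern
# for the Itakura-Saito divergence: with the model hatV = np.dot(W, H), each
# amplitude matrix is multiplied by
#     (np.dot(W.T, V / hatV**2) / np.dot(W.T, 1 / hatV)) ** omega
# where omega is updateRulePower. The function below is a minimal,
# self-contained sketch of that rule on a generic factorization; it is not
# called by this script, and its name and the toy dimensions are illustrative
# only.
import numpy as np


def is_nmf_update_H(V, W, H, omega=1.0, eps=1e-20):
    """One multiplicative Itakura-Saito update of H in V ~ np.dot(W, H)."""
    hatV = np.maximum(np.dot(W, H), eps)
    num = np.dot(W.T, V / np.maximum(hatV ** 2, eps))
    den = np.maximum(np.dot(W.T, 1.0 / hatV), eps)
    return H * (num / den) ** omega

# Example with toy sizes: repeated calls typically drive down the IS
# divergence between V and np.dot(W, H), just as the loop above does:
#     V = np.abs(np.random.randn(513, 100)) ** 2
#     W = np.abs(np.random.randn(513, 8))
#     H = np.abs(np.random.randn(8, 100))
#     for _ in range(50):
#         H = is_nmf_update_H(V, W, H)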
def Stereo_SIMM(# the data to be fitted to:
                SXR, SXL,
                # the basis matrices for the spectral combs
                WF0,
                # and for the elementary filters:
                WGAMMA,
                # number of desired filters, accompaniment spectra:
                numberOfFilters=4, numberOfAccompanimentSpectralShapes=10,
                # if any, initial amplitude matrices for
                HGAMMA0=None, HPHI0=None,
                HF00=None,
                WM0=None, HM0=None,
                # Some more optional arguments, to control the "convergence"
                # of the algo
                numberOfIterations=1000, updateRulePower=1.0,
                stepNotes=4, lambdaHF0=0.00, alphaHF0=0.99,
                displayEvolution=False, verbose=True,
                updateHGAMMA=True):
    """
    alphaR, alphaL, HGAMMA, HPHI, HF0, betaR, betaL, HM, WM, recoError =
        Stereo_SIMM(SXR, SXL, WF0, WGAMMA, numberOfFilters=4,
                    numberOfAccompanimentSpectralShapes=10, HGAMMA0=None,
                    HPHI0=None, HF00=None, WM0=None, HM0=None,
                    numberOfIterations=1000, updateRulePower=1.0,
                    stepNotes=4, lambdaHF0=0.00, alphaHF0=0.99,
                    displayEvolution=False, verbose=True)

    Implementation of the Smooth-filters Instantaneous Mixture Model
    (SIMM) for stereophonic signals. This model can be used to estimate
    the main melody of a song, and to separate the lead voice from the
    accompaniment, provided that the basis WF0 is constituted of elements
    associated to particular pitches.

    Inputs:
        SXR, SXL
            the F x N power spectrograms (right and left channels) to be
            approximated. F is the number of frequency bins, N the number
            of analysis frames.
        WF0
            the F x NF0 basis matrix containing the NF0 source elements.
        WGAMMA
            the F x P basis matrix of P smooth elementary filters.
        numberOfFilters
            the number of filters K to be considered.
        numberOfAccompanimentSpectralShapes
            the number of spectral shapes R for the accompaniment.
        HGAMMA0
            the P x K decomposition matrix of WPHI on WGAMMA.
        HPHI0
            the K x N amplitude matrix of the filter part of the lead
            instrument.
        HF00
            the NF0 x N amplitude matrix for the source part of the lead
            instrument.
        WM0
            the F x R matrix of spectral shapes for the accompaniment.
        HM0
            the R x N amplitude matrix associated with each of the R
            accompaniment spectral shapes.
        numberOfIterations
            the number of iterations of the estimation algorithm.
        updateRulePower
            the power to which the multiplicative gradient is raised.
        stepNotes
            the number of elements in WF0 per semitone. stepNotes=4 means
            that there are 48 elements per octave in WF0.
        lambdaHF0
            Lagrangian multiplier for the octave control.
        alphaHF0
            parameter that controls how much influence a lower octave
            can have on the upper octave's amplitude.

    Outputs:
        alphaR, alphaL
            the estimated panning gains of the lead part (constrained
            during the updates such that alphaR + alphaL = 1).
        HGAMMA
            the estimated P x K decomposition matrix of WPHI on WGAMMA.
        HPHI
            the estimated K x N amplitude matrix of the filter part.
        HF0
            the estimated NF0 x N amplitude matrix for the source part.
        betaR, betaL
            the estimated diagonal R x R panning matrices of the
            accompaniment (constrained such that betaR + betaL = eye(R)).
        HM
            the estimated R x N amplitude matrix for the accompaniment.
        WM
            the estimated F x R spectral shapes for the accompaniment.
        recoError
            the successive values of the Itakura-Saito divergence between
            the power spectrograms and the spectrograms computed thanks
            to the updated estimations of the matrices.

    Please also refer to the following article for more details about
    the algorithm within this function, as well as the meaning of the
    different matrices that are involved:
        J.-L. Durrieu, G. Richard, B. David and C. Fevotte,
        "Source/Filter Model for Unsupervised Main Melody Extraction
        From Polyphonic Audio Signals",
        IEEE Transactions on Audio, Speech and Language Processing,
        Vol. 18, No. 3, March 2010.
    """
    eps = 10 ** (-20)

    if displayEvolution:
        import matplotlib.pyplot as plt
        from imageMatlab import imageM
        plt.ion()
        print "Is the display interactive? ", plt.isinteractive()

    # renamed for convenience:
    K = numberOfFilters
    R = numberOfAccompanimentSpectralShapes
    omega = updateRulePower

    F, N = SXR.shape
    if (F, N) != SXL.shape:
        print "The input STFT matrices do not have the same dimension.\n"
        print "Please check what happened..."
        raise ValueError("Dimension of STFT matrices must be the same.")

    Fwf0, NF0 = WF0.shape
    Fwgamma, P = WGAMMA.shape

    # Checking the sizes of the matrices:
    if Fwf0 != F:
        # TODO: to be reviewed
        raise ValueError("WF0 and the input spectrograms do not have the "
                         "same number of frequency bins.")

    if HGAMMA0 is None:
        HGAMMA0 = np.abs(randn(P, K))
    else:
        if not isinstance(HGAMMA0, np.ndarray):
            HGAMMA0 = np.array(HGAMMA0)
        Phgamma0, Khgamma0 = HGAMMA0.shape
        if Phgamma0 != P or Khgamma0 != K:
            print "Wrong dimensions for given HGAMMA0, \n"
            print "random initialization used instead"
            HGAMMA0 = np.abs(randn(P, K))
    HGAMMA = np.copy(HGAMMA0)

    if HPHI0 is None:  # default behaviour
        HPHI = np.abs(randn(K, N))
    else:
        Khphi0, Nhphi0 = np.array(HPHI0).shape
        if Khphi0 != K or Nhphi0 != N:
            print "Wrong dimensions for given HPHI0, \n"
            print "random initialization used instead"
            HPHI = np.abs(randn(K, N))
        else:
            HPHI = np.copy(np.array(HPHI0))

    if HF00 is None:
        HF00 = np.abs(randn(NF0, N))
    else:
        if np.array(HF00).shape[0] == NF0 and np.array(HF00).shape[1] == N:
            HF00 = np.array(HF00)
        else:
            print "Wrong dimensions for given HF00, \n"
            print "random initialization used instead"
            HF00 = np.abs(randn(NF0, N))
    HF0 = np.copy(HF00)

    if HM0 is None:
        HM0 = np.abs(randn(R, N))
    else:
        if np.array(HM0).shape[0] == R and np.array(HM0).shape[1] == N:
            HM0 = np.array(HM0)
        else:
            print "Wrong dimensions for given HM0, \n"
            print "random initialization used instead"
            HM0 = np.abs(randn(R, N))
    HM = np.copy(HM0)

    if WM0 is None:
        WM0 = np.abs(randn(F, R))
    else:
        if np.array(WM0).shape[0] == F and np.array(WM0).shape[1] == R:
            WM0 = np.array(WM0)
        else:
            print "Wrong dimensions for given WM0, \n"
            print "random initialization used instead"
            WM0 = np.abs(randn(F, R))
    WM = np.copy(WM0)

    # panning parameters for the lead (scalar gains) and for the
    # accompaniment (diagonal matrices, one gain per spectral shape):
    alphaR = 0.5
    alphaL = 0.5
    betaR = np.diag(np.random.rand(R))
    betaL = np.eye(R) - betaR

    # Iterations to estimate the SIMM parameters:
    WPHI = np.dot(WGAMMA, HGAMMA)
    SF0 = np.dot(WF0, HF0)
    SPHI = np.dot(WPHI, HPHI)
    # SM = np.dot(WM, HM)
    hatSXR = (alphaR ** 2) * SF0 * SPHI + np.dot(np.dot(WM, betaR ** 2), HM)
    hatSXL = (alphaL ** 2) * SF0 * SPHI + np.dot(np.dot(WM, betaL ** 2), HM)

    # SX = SX + np.abs(randn(F, N)) ** 2
    #     # should not need this line, which ensures that data is not
    #     # 0 everywhere.
    # temporary matrices
    tempNumFbyN = np.zeros([F, N])
    tempDenFbyN = np.zeros([F, N])

    # Array containing the reconstruction error after the update of each
    # of the parameter matrices:
    recoError = np.zeros([numberOfIterations * 5 * 2 + NF0 * 2 + 1])
    recoError[0] = ISDistortion(SXR, hatSXR) + ISDistortion(SXL, hatSXL)
    if verbose:
        print "Reconstruction error at beginning: ", recoError[0]
    counterError = 1
    if displayEvolution:
        h1 = plt.figure(1)

    # Main loop for multiplicative updating rules:
    for n in np.arange(numberOfIterations):
        # order of re-estimation: HF0, HPHI, HM, HGAMMA, WM
        if verbose:
            print "iteration ", n, " over ", numberOfIterations

        if displayEvolution:
            h1.clf()
            imageM(db(HF0))
            plt.clim([np.amax(db(HF0)) - 100, np.amax(db(HF0))])
            plt.draw()
            # h1.clf()
            # imageM(HF0 * np.outer(np.ones([NF0, 1]),
            #                       1 / (HF0.max(axis=0))))

        # updating HF0:
        tempNumFbyN = (
            ((alphaR ** 2) * SPHI * SXR) / np.maximum(hatSXR ** 2, eps)
            + ((alphaL ** 2) * SPHI * SXL) / np.maximum(hatSXL ** 2, eps))
        tempDenFbyN = (
            (alphaR ** 2) * SPHI / np.maximum(hatSXR, eps)
            + (alphaL ** 2) * SPHI / np.maximum(hatSXL, eps))

        # This to enable octave control
        HF0[np.arange(12 * stepNotes, NF0), :] = (
            HF0[np.arange(12 * stepNotes, NF0), :]
            * (np.dot(WF0[:, np.arange(12 * stepNotes, NF0)].T, tempNumFbyN)
               / np.maximum(
                   np.dot(WF0[:, np.arange(12 * stepNotes, NF0)].T,
                          tempDenFbyN)
                   + lambdaHF0
                   * (- (alphaHF0 - 1.0)
                      / np.maximum(HF0[np.arange(12 * stepNotes, NF0), :],
                                   eps)
                      + HF0[np.arange(NF0 - 12 * stepNotes), :]),
                   eps)) ** omega)
        HF0[np.arange(12 * stepNotes), :] = (
            HF0[np.arange(12 * stepNotes), :]
            * (np.dot(WF0[:, np.arange(12 * stepNotes)].T, tempNumFbyN)
               / np.maximum(np.dot(WF0[:, np.arange(12 * stepNotes)].T,
                                   tempDenFbyN), eps)) ** omega)

        ## # normal update rules:
        ## HF0 = HF0 * (np.dot(WF0.T, tempNumFbyN) /
        ##              np.maximum(np.dot(WF0.T, tempDenFbyN), eps)) ** omega

        SF0 = np.maximum(np.dot(WF0, HF0), eps)
        hatSXR = np.maximum(
            (alphaR ** 2) * SF0 * SPHI + np.dot(np.dot(WM, betaR ** 2), HM),
            eps)
        hatSXL = np.maximum(
            (alphaL ** 2) * SF0 * SPHI + np.dot(np.dot(WM, betaL ** 2), HM),
            eps)

        ## recoError[counterError] = ISDistortion(SXR, hatSXR) \
        ##                           + ISDistortion(SXL, hatSXL)
        ## if verbose:
        ##     print "Reconstruction error difference after HF0    : ",
        ##     print recoError[counterError] - recoError[counterError - 1]
        counterError += 1

        # updating HPHI
        if updateHGAMMA or True:  # NB: this condition is always True
            tempNumFbyN = (
                ((alphaR ** 2) * SF0 * SXR) / np.maximum(hatSXR ** 2, eps)
                + ((alphaL ** 2) * SF0 * SXL) / np.maximum(hatSXL ** 2, eps))
            tempDenFbyN = (
                (alphaR ** 2) * SF0 / np.maximum(hatSXR, eps)
                + (alphaL ** 2) * SF0 / np.maximum(hatSXL, eps))
            HPHI = HPHI * (np.dot(WPHI.T, tempNumFbyN) /
                           np.maximum(np.dot(WPHI.T, tempDenFbyN),
                                      eps)) ** omega
            sumHPHI = np.sum(HPHI, axis=0)
            HPHI[:, sumHPHI > 0] = (HPHI[:, sumHPHI > 0] /
                                    np.outer(np.ones(K),
                                             sumHPHI[sumHPHI > 0]))
            HF0 = HF0 * np.outer(np.ones(NF0), sumHPHI)

        SF0 = np.maximum(np.dot(WF0, HF0), eps)
        SPHI = np.maximum(np.dot(WPHI, HPHI), eps)
        hatSXR = np.maximum(
            (alphaR ** 2) * SF0 * SPHI + np.dot(np.dot(WM, betaR ** 2), HM),
            eps)
        hatSXL = np.maximum(
            (alphaL ** 2) * SF0 * SPHI + np.dot(np.dot(WM, betaL ** 2), HM),
            eps)

        ## recoError[counterError] = ISDistortion(SXR, hatSXR) \
        ##                           + ISDistortion(SXL, hatSXL)
        ## if verbose:
        ##     print "Reconstruction error difference after HPHI   : ", \
        ##           recoError[counterError] - recoError[counterError - 1]
        ## counterError += 1

        # updating HM
        # tempNumFbyN = SXR / np.maximum(hatSXR ** 2, eps)\
        #               + SXL / np.maximum(hatSXL ** 2, eps)
        # tempDenFbyN = 1 / np.maximum(hatSXR, eps)\
        #               + 1 / np.maximum(hatSXL, eps)
        # HM = np.maximum(HM * (np.dot(WM.T, tempNumFbyN) /
        #                       np.maximum(np.dot(WM.T, tempDenFbyN),
        #                                  eps)) ** omega, eps)
        HM = HM * (
            (np.dot(np.dot(betaR ** 2, WM.T),
                    SXR / np.maximum(hatSXR ** 2, eps))
             + np.dot(np.dot(betaL ** 2, WM.T),
                      SXL / np.maximum(hatSXL ** 2, eps)))
            / np.maximum(
                np.dot(np.dot(betaR ** 2, WM.T),
                       1 / np.maximum(hatSXR, eps))
                + np.dot(np.dot(betaL ** 2, WM.T),
                         1 / np.maximum(hatSXL, eps)),
                eps)) ** omega

        hatSXR = np.maximum(
            (alphaR ** 2) * SF0 * SPHI + np.dot(np.dot(WM, betaR ** 2), HM),
            eps)
        hatSXL = np.maximum(
            (alphaL ** 2) * SF0 * SPHI + np.dot(np.dot(WM, betaL ** 2), HM),
            eps)

        ## recoError[counterError] = ISDistortion(SXR, hatSXR) \
        ##                           + ISDistortion(SXL, hatSXL)
        ## if verbose:
        ##     print "Reconstruction error difference after HM     : ", \
        ##           recoError[counterError] - recoError[counterError - 1]
        counterError += 1

        # updating HGAMMA
        if updateHGAMMA:
            tempNumFbyN = (
                ((alphaR ** 2) * SF0 * SXR) / np.maximum(hatSXR ** 2, eps)
                + ((alphaL ** 2) * SF0 * SXL) / np.maximum(hatSXL ** 2, eps))
            tempDenFbyN = (
                (alphaR ** 2) * SF0 / np.maximum(hatSXR, eps)
                + (alphaL ** 2) * SF0 / np.maximum(hatSXL, eps))

            HGAMMA = np.maximum(
                HGAMMA
                * (np.dot(WGAMMA.T, np.dot(tempNumFbyN, HPHI.T))
                   / np.maximum(np.dot(WGAMMA.T,
                                       np.dot(tempDenFbyN, HPHI.T)),
                                eps)) ** omega,
                eps)

            sumHGAMMA = np.sum(HGAMMA, axis=0)
            HGAMMA[:, sumHGAMMA > 0] = (HGAMMA[:, sumHGAMMA > 0] /
                                        np.outer(np.ones(P),
                                                 sumHGAMMA[sumHGAMMA > 0]))
            HPHI = HPHI * np.outer(sumHGAMMA, np.ones(N))
            sumHPHI = np.sum(HPHI, axis=0)
            HPHI[:, sumHPHI > 0] = (HPHI[:, sumHPHI > 0] /
                                    np.outer(np.ones(K),
                                             sumHPHI[sumHPHI > 0]))
            HF0 = HF0 * np.outer(np.ones(NF0), sumHPHI)

            WPHI = np.maximum(np.dot(WGAMMA, HGAMMA), eps)
            SF0 = np.maximum(np.dot(WF0, HF0), eps)
            SPHI = np.maximum(np.dot(WPHI, HPHI), eps)
            hatSXR = np.maximum(
                (alphaR ** 2) * SF0 * SPHI +
                np.dot(np.dot(WM, betaR ** 2), HM), eps)
            hatSXL = np.maximum(
                (alphaL ** 2) * SF0 * SPHI +
                np.dot(np.dot(WM, betaL ** 2), HM), eps)

            ## recoError[counterError] = ISDistortion(SXR, hatSXR) \
            ##                           + ISDistortion(SXL, hatSXL)
            ## if verbose:
            ##     print "Reconstruction error difference after HGAMMA : ",
            ##     print recoError[counterError] - recoError[counterError - 1]
            ## counterError += 1

        # updating WM, after a certain number of iterations
        # (here, after 1 iteration)
        if n > -1:  # this test can be used such that WM is updated only
                    # after a certain number of iterations
            ## tempNumFbyN = SX / np.maximum(hatSX ** 2, eps)
            ## tempDenFbyN = 1 / np.maximum(hatSX, eps)
            ## WM = np.maximum(WM * (np.dot(tempNumFbyN, HM.T) /
            ##                       np.maximum(np.dot(tempDenFbyN, HM.T),
            ##                                  eps)) ** omega, eps)
            WM = WM * (
                (np.dot(SXR / np.maximum(hatSXR ** 2, eps),
                        np.dot(HM.T, betaR ** 2))
                 + np.dot(SXL / np.maximum(hatSXL ** 2, eps),
                          np.dot(HM.T, betaL ** 2)))
                / (np.dot(1 / np.maximum(hatSXR, eps),
                          np.dot(HM.T, betaR ** 2))
                   + np.dot(1 / np.maximum(hatSXL, eps),
                            np.dot(HM.T, betaL ** 2)))) ** omega

            sumWM = np.sum(WM, axis=0)
            WM[:, sumWM > 0] = (WM[:, sumWM > 0] /
                                np.outer(np.ones(F), sumWM[sumWM > 0]))
            HM = HM * np.outer(sumWM, np.ones(N))

            hatSXR = np.maximum(
                (alphaR ** 2) * SF0 * SPHI +
                np.dot(np.dot(WM, betaR ** 2), HM), eps)
            hatSXL = np.maximum(
                (alphaL ** 2) * SF0 * SPHI +
                np.dot(np.dot(WM, betaL ** 2), HM), eps)

            ## recoError[counterError] = ISDistortion(SXR, hatSXR) \
            ##                           + ISDistortion(SXL, hatSXL)
            ## if verbose:
            ##     print "Reconstruction error difference after WM     : ",
            ##     print recoError[counterError] - recoError[counterError - 1]
            counterError += 1

        # updating alphaR and alphaL:
        tempNumFbyN = SF0 * SPHI * SXR / np.maximum(hatSXR ** 2, eps)
        tempDenFbyN = SF0 * SPHI / np.maximum(hatSXR, eps)
        alphaR = np.maximum(alphaR *
(np.sum(tempNumFbyN) / np.sum(tempDenFbyN)) ** (omega*.1), eps) tempNumFbyN = SF0 * SPHI * SXL / np.maximum(hatSXL ** 2, eps) tempDenFbyN = SF0 * SPHI / np.maximum(hatSXL, eps) alphaL = np.maximum(alphaL * (np.sum(tempNumFbyN) / np.sum(tempDenFbyN)) ** (omega*.1), eps) alphaR = alphaR / np.maximum(alphaR + alphaL, .001) alphaL = np.copy(1 - alphaR) hatSXR = np.maximum((alphaR**2) * SF0 * SPHI + \ np.dot(np.dot(WM, betaR**2),HM), eps) hatSXL = np.maximum((alphaL**2) * SF0 * SPHI + \ np.dot(np.dot(WM, betaL**2),HM), eps) ## recoError[counterError] = ISDistortion(SXR, hatSXR) \ ## + ISDistortion(SXL, hatSXL) ## ## if verbose: ## print "Reconstruction error difference after ALPHA : ", ## print recoError[counterError] - recoError[counterError - 1] counterError += 1 # updating betaR and betaL betaR = np.diag(np.diag(np.maximum(betaR * ((np.dot(np.dot(WM.T, SXR / np.maximum(hatSXR ** 2, eps)), HM.T)) / (np.dot(np.dot(WM.T, 1 / np.maximum(hatSXR, eps)), HM.T))) ** (omega*.1), eps))) betaL = np.diag(np.diag(np.maximum(betaL * ((np.dot(np.dot(WM.T, SXL / np.maximum(hatSXL ** 2, eps)), HM.T)) / (np.dot(np.dot(WM.T, 1 / np.maximum(hatSXL, eps)), HM.T))) ** (omega*.1), eps))) betaR = betaR / np.maximum(betaR + betaL, eps) betaL = np.copy(np.eye(R) - betaR) hatSXR = np.maximum((alphaR**2) * SF0 * SPHI + \ np.dot(np.dot(WM, betaR**2),HM), eps) hatSXL = np.maximum((alphaL**2) * SF0 * SPHI + \ np.dot(np.dot(WM, betaL**2),HM), eps) ## recoError[counterError] = ISDistortion(SXR, hatSXR) \ ## + ISDistortion(SXL, hatSXL) ## ## if verbose: ## print "Reconstruction error difference after BETA : ", ## print recoError[counterError] - recoError[counterError - 1] counterError += 1 return alphaR, alphaL, HGAMMA, HPHI, HF0, betaR, betaL, HM, WM, recoError
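# ----------------------------------------------------------------------------
# Illustration of how the parameters returned by Stereo_SIMM combine into the
# two channel spectrograms. This helper is a readability sketch only (it is
# not used by the script, and its name is illustrative); it mirrors the
# hatSXR / hatSXL expressions used throughout the updates above.
import numpy as np


def stereo_simm_model(SF0, SPHI, WM, HM, alphaR, betaR):
    """Return (hatSXR, hatSXL) from the stereo SIMM parameters."""
    R = WM.shape[1]
    alphaL = 1.0 - alphaR        # lead panning gains sum to 1
    betaL = np.eye(R) - betaR    # per-shape accompaniment panning
    hatSXR = (alphaR ** 2) * SF0 * SPHI + np.dot(np.dot(WM, betaR ** 2), HM)
    hatSXL = (alphaL ** 2) * SF0 * SPHI + np.dot(np.dot(WM, betaL ** 2), HM)
    return hatSXR, hatSXL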
def main(inputAudioFile): import optparse usage = "usage: %prog [options] inputAudioFile" parser = optparse.OptionParser(usage) # Name of the output files: parser.add_option("-v", "--vocal-output-file", dest="voc_output_file", type="string", help="name of the audio output file for the estimated\n"\ "solo (vocal) part", default="estimated_solo.wav") parser.add_option("-m", "--music-output-file", dest="mus_output_file", type="string", help="name of the audio output file for the estimated\n"\ "music part", default="estimated_music.wav") parser.add_option("-p", "--pitch-output-file", dest="pitch_output_file", type="string", help="name of the output file for the estimated pitches", default="pitches.txt") # Some more optional options: parser.add_option("-d", "--with-display", dest="displayEvolution", action="store_true", help="display the figures", default=False) parser.add_option("-q", "--quiet", dest="verbose", action="store_false", help="use to quiet all output verbose", default=True) #Number of iterations parser.add_option("--nb-iterations", dest="nbiter", help="number of iterations", type="int", default=50) parser.add_option("--window-size", dest="windowSize", type="float", default=0.04644, help="size of analysis windows, in s.") parser.add_option("--Fourier-size", dest="fourierSize", type="int", default=2048, help="size of Fourier transforms, "\ "in samples.") parser.add_option("--hopsize", dest="hopsize", type="float", default=0.0058, help="size of the hop between analysis windows, in s.") parser.add_option("--nb-accElements", dest="R", type="float", default=40.0, help="number of elements for the accompaniment.") parser.add_option("--with-melody", dest="melody", type="string", default=None, help="provide the melody in a file named MELODY, "\ "with at each line: <time (s)><F0 (Hz)>.") (options, args) = parser.parse_args() #if len(args) != 1: #parser.error("incorrect number of arguments, use option -h for help.") displayEvolution = options.displayEvolution if displayEvolution: import matplotlib.pyplot as plt import imageMatlab ## plt.rc('text', usetex=True) plt.rc('image', cmap='jet') ## gray_r plt.ion() # Compulsory option: name of the input file: #inputAudioFile = args[0] fs, data = wav.read(inputAudioFile) # data = np.double(data) / 32768.0 # makes data vary from -1 to 1 scaleData = 1.2 * data.max() # to rescale the data. dataType = data.dtype data = np.double(data) / scaleData # makes data vary from -1 to 1 if data.shape[0] == data.size: # data is multi-channel print("The audio file is not stereo. Try separateLead.py instead.") raise ValueError("number of dimensions of the input not 2") if data.shape[1] != 2: print("The data is multichannel, but not stereo... \n") print("Unfortunately this program does not scale well. 
Data is \n") print("reduced to its 2 first channels.\n") data = data[:, 0:2] # Processing the options: windowSizeInSamples = np.round(options.windowSize * fs) hopsize = np.round(options.hopsize * fs) NFT = options.fourierSize niter = options.nbiter R = options.R if options.verbose: print("Some parameter settings:") print(" Size of analysis windows: ", windowSizeInSamples) print(" Hopsize: ", hopsize) print(" Size of Fourier transforms: ", NFT) print(" Number of iterations to be done: ", niter) print(" Number of elements in WM: ", R) XR, F, N = stft(data[:, 0], fs=fs, hopsize=hopsize, window=sinebell(windowSizeInSamples), nfft=NFT) XL, F, N = stft(data[:, 1], fs=fs, hopsize=hopsize, window=sinebell(windowSizeInSamples), nfft=NFT) # SX is the power spectrogram: ## SXR = np.maximum(np.abs(XR) ** 2, 10 ** -8) ## SXL = np.maximum(np.abs(XL) ** 2, 10 ** -8) SXR = np.abs(XR)**2 SXL = np.abs(XL)**2 del data, F, N # TODO: also process these as options: eps = 10**-9 minF0 = 100 maxF0 = 800 Fs = fs F, N = SXR.shape stepNotes = 20 # this is the number of F0s within one semitone # until 17/09/2010 : stepNotes = 20 # 17/09/2010 : trying stepNotes = 8, checking for less artefacts K = 10 # number of spectral shapes for the filter part # R = 40 # number of spectral shapes for the accompaniment P = 30 # number of elements in dictionary of smooth filters chirpPerF0 = 1 # number of chirped spectral shapes between each F0 # this feature should be further studied before # we find a good way of doing that. # Create the harmonic combs, for each F0 between minF0 and maxF0: F0Table, WF0 = \ generate_WF0_chirped(minF0, maxF0, Fs, Nfft=NFT, \ stepNotes=stepNotes, \ lengthWindow=windowSizeInSamples, Ot=0.25, \ perF0=chirpPerF0, \ depthChirpInSemiTone=.15, loadWF0=True,\ analysisWindow='sinebell') WF0 = WF0[0:F, :] # ensure same size as SX NF0 = F0Table.size # number of harmonic combs # Normalization: WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0)) # Create the dictionary of smooth filters, for the filter part of # the lead isntrument: WGAMMA = generateHannBasis(F, NFT, Fs=fs, frequencyScale='linear', \ numberOfBasis=P, overlap=.75) if displayEvolution: plt.figure(1) plt.clf() plt.xticks(fontsize=16) plt.yticks(fontsize=16) plt.xlabel(r'Frame number $n$', fontsize=16) plt.ylabel(r'Leading source number $u$', fontsize=16) plt.ion() # plt.show() ## the following seems superfluous if mpl's backend is macosx... ## raw_input("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"\ ## "!! Press Return to resume the program. !!\n"\ ## "!! Be sure that the figure has been !!\n"\ ## "!! already displayed, so that the !!\n"\ ## "!! evolution of HF0 will be visible. !!\n"\ ## "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") if options.melody is None: ## section to estimate the melody, on monophonic algo: SX = np.maximum(np.abs((XR + XL) / 2.0)**2, 10**-8) # First round of parameter estimation: HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM( # the data to be fitted to: SX, # the basis matrices for the spectral combs WF0, # and for the elementary filters: WGAMMA, # number of desired filters, accompaniment spectra: numberOfFilters=K, numberOfAccompanimentSpectralShapes=R, # putting only 2 elements in accompaniment for a start... 
# if any, initial amplitude matrices for HGAMMA0=None, HPHI0=None, HF00=None, WM0=None, HM0=None, # Some more optional arguments, to control the "convergence" # of the algo numberOfIterations=niter, updateRulePower=1., stepNotes=stepNotes, lambdaHF0=0.0 / (1.0 * SX.max()), alphaHF0=0.9, verbose=options.verbose, displayEvolution=displayEvolution) if displayEvolution: h2 = plt.figure(2) plt.clf() imageMatlab.imageM(20 * np.log10(HF0)) matMax = (20 * np.log10(HF0)).max() matMed = np.median(20 * np.log10(HF0)) plt.clim([matMed - 100, matMax]) # Viterbi decoding to estimate the predominant fundamental # frequency line scale = 1.0 transitions = np.exp(-np.floor(np.arange(0, NF0) / stepNotes) * scale) cutoffnote = 2 * 5 * stepNotes transitions[cutoffnote:] = transitions[cutoffnote - 1] transitionMatrixF0 = np.zeros([NF0 + 1, NF0 + 1]) # toeplitz matrix b = np.arange(NF0) transitionMatrixF0[0:NF0, 0:NF0] = \ transitions[\ np.array(np.abs(np.outer(np.ones(NF0), b) \ - np.outer(b, np.ones(NF0))), dtype=int)] pf_0 = transitions[cutoffnote - 1] * 10**(-90) p0_0 = transitions[cutoffnote - 1] * 10**(-100) p0_f = transitions[cutoffnote - 1] * 10**(-80) transitionMatrixF0[0:NF0, NF0] = pf_0 transitionMatrixF0[NF0, 0:NF0] = p0_f transitionMatrixF0[NF0, NF0] = p0_0 sumTransitionMatrixF0 = np.sum(transitionMatrixF0, axis=1) transitionMatrixF0 = transitionMatrixF0 \ / np.outer(sumTransitionMatrixF0, \ np.ones(NF0 + 1)) priorProbabilities = 1 / (NF0 + 1.0) * np.ones([NF0 + 1]) logHF0 = np.zeros([NF0 + 1, N]) normHF0 = np.amax(HF0, axis=0) barHF0 = np.array(HF0) logHF0[0:NF0, :] = np.log(barHF0) logHF0[0:NF0, normHF0 == 0] = np.amin(logHF0[logHF0 > -np.Inf]) logHF0[NF0, :] = np.maximum(np.amin(logHF0[logHF0 > -np.Inf]), -100) indexBestPath = viterbiTrackingArray(\ logHF0, np.log(priorProbabilities), np.log(transitionMatrixF0), verbose=options.verbose) if displayEvolution: h2.hold(True) plt.plot(indexBestPath, '-b') h2.hold(False) plt.axis('tight') ## raw_input("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"\ ## "!! 
Press Return to resume the program !!\n"\ ## "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") del logHF0 # detection of silences: HF00 = np.zeros([NF0 * chirpPerF0, N]) scopeAllowedHF0 = 2.0 / 1.0 dim1index = np.array(\ np.maximum(\ np.minimum(\ np.outer(chirpPerF0 * indexBestPath, np.ones(chirpPerF0 \ * (2 \ * np.floor(stepNotes / scopeAllowedHF0) \ + 1))) \ + np.outer(np.ones(N), np.arange(-chirpPerF0 \ * np.floor(stepNotes / scopeAllowedHF0), chirpPerF0 \ * (np.floor(stepNotes / scopeAllowedHF0) \ + 1))), chirpPerF0 * NF0 - 1), 0), dtype=int).reshape(1, N * chirpPerF0 \ * (2 * np.floor(stepNotes / scopeAllowedHF0) \ + 1)) dim2index = np.outer(np.arange(N), np.ones(chirpPerF0 \ * (2 * np.floor(stepNotes \ / scopeAllowedHF0) + 1), \ dtype=int)\ ).reshape(1, N * chirpPerF0 \ * (2 * np.floor(stepNotes \ / scopeAllowedHF0) \ + 1)) HF00[dim1index, dim2index] = HF0[dim1index, dim2index] # HF0.max() HF00[:, indexBestPath == (NF0 - 1)] = 0.0 HF00[:, indexBestPath == 0] = 0.0 thres_energy = 0.000584 SF0 = np.maximum(np.dot(WF0, HF00), eps) SPHI = np.maximum(np.dot(WGAMMA, np.dot(HGAMMA, HPHI)), eps) SM = np.maximum(np.dot(WM, HM), eps) hatSX = np.maximum(SPHI * SF0 + SM, eps) energyMel = np.sum(np.abs((SPHI * SF0)/hatSX * \ (XR+XL) * 0.5) \ ** 2, axis=0) energyMelSorted = np.sort(energyMel) energyMelCumul = np.cumsum(energyMelSorted) energyMelCumulNorm = energyMelCumul / max(energyMelCumul[-1], eps) # normalized to the maximum of energy: # expressed in 0.01 times the percentage ind_999 = np.nonzero(energyMelCumulNorm > thres_energy)[0][0] if ind_999 is None: ind_999 = N melNotPresent = (energyMel <= energyMelCumulNorm[ind_999]) indexBestPath[melNotPresent] = 0 else: ## take the provided melody line: # load melody from file: melodyFromFile = np.loadtxt(options.melody) sizeProvidedMel = melodyFromFile.shape if len(sizeProvidedMel) == 1: print("The melody should be provided as <Time (s)><F0 (Hz)>.") raise ValueError("Bad melody format") melTimeStamps = melodyFromFile[:, 0] # + 1024 / np.double(Fs) melFreqHz = melodyFromFile[:, 1] if minF0 > melFreqHz[ melFreqHz > 40.0].min() or maxF0 < melFreqHz.max(): minF0 = melFreqHz[melFreqHz > 40.0].min() * .97 maxF0 = np.maximum(melFreqHz.max() * 1.03, 2 * minF0 * 1.03) print("Recomputing the source basis for ") print("minF0 = ", minF0, "Hz and maxF0 = ", maxF0, "Hz.") # Create the harmonic combs, for each F0 between minF0 and maxF0: F0Table, WF0 = \ generate_WF0_chirped(minF0, maxF0, Fs, Nfft=NFT, \ stepNotes=stepNotes, \ lengthWindow=windowSizeInSamples, Ot=0.25, \ perF0=chirpPerF0, \ depthChirpInSemiTone=.15) WF0 = WF0[0:F, :] # ensure same size as SX NF0 = F0Table.size # number of harmonic combs # Normalization: WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0)) sigTimeStamps = np.arange(N) * hopsize / np.double(Fs) distMatTimeStamps = np.abs( np.outer(np.ones(sizeProvidedMel[0]), sigTimeStamps) - np.outer(melTimeStamps, np.ones(N))) minDistTimeStamps = distMatTimeStamps.argmin(axis=0) f0BestPath = melFreqHz[minDistTimeStamps] distMatF0 = np.abs( np.outer(np.ones(NF0), f0BestPath) - np.outer(F0Table, np.ones(N))) indexBestPath = distMatF0.argmin(axis=0) # setting silences to 0, with tolerance = 1/2 window length indexBestPath[distMatTimeStamps[minDistTimeStamps,range(N)] >= \ 0.5 * options.windowSize] = 0 indexBestPath[f0BestPath <= 0] = 0 freqMelody = F0Table[np.array(indexBestPath, dtype=int)] freqMelody[indexBestPath == 0] = -freqMelody[indexBestPath == 0] np.savetxt( options.pitch_output_file, np.array([np.arange(N) * hopsize / np.double(Fs), 
freqMelody]).T) # Second round of parameter estimation, with specific # initial HF00: HF00 = np.zeros([NF0 * chirpPerF0, N]) scopeAllowedHF0 = 2.0 / 1.0 # indexes for HF00: # TODO: reprogram this with a 'where'?... dim1index = np.array(\ np.maximum(\ np.minimum(\ np.outer(chirpPerF0 * indexBestPath, np.ones(chirpPerF0 \ * (2 \ * np.floor(stepNotes / scopeAllowedHF0) \ + 1))) \ + np.outer(np.ones(N), np.arange(-chirpPerF0 \ * np.floor(stepNotes / scopeAllowedHF0), chirpPerF0 \ * (np.floor(stepNotes / scopeAllowedHF0) \ + 1))), chirpPerF0 * NF0 - 1), 0), dtype=int) dim1index = dim1index[indexBestPath != 0, :] ## dim1index = dim1index.reshape(1, N * chirpPerF0 \ ## * (2 * np.floor(stepNotes / scopeAllowedHF0) \ ## + 1)) dim1index = dim1index.reshape(1, dim1index.size) dim2index = np.outer(np.arange(N), np.ones(chirpPerF0 \ * (2 * np.floor(stepNotes \ / scopeAllowedHF0) + 1), \ dtype=int)\ ) dim2index = dim2index[indexBestPath != 0, :] dim2index = dim2index.reshape(1, dim2index.size) ## dim2index.reshape(1, N * chirpPerF0 \ ## * (2 * np.floor(stepNotes \ ## / scopeAllowedHF0) \ ## + 1)) HF00[dim1index, dim2index] = 1 # HF0.max() HF00[:, indexBestPath == (NF0 - 1)] = 0.0 HF00[:, indexBestPath == 0] = 0.0 WF0effective = WF0 HF00effective = HF00 if options.melody is None: del HF0, HGAMMA, HPHI, HM, WM, HF00, SX alphaR, alphaL, HGAMMA, HPHI, HF0, \ betaR, betaL, HM, WM, recoError2 = SIMM.Stereo_SIMM( # the data to be fitted to: SXR, SXL, # the basis matrices for the spectral combs WF0effective, # and for the elementary filters: WGAMMA, # number of desired filters, accompaniment spectra: numberOfFilters=K, numberOfAccompanimentSpectralShapes=R, # if any, initial amplitude matrices for HGAMMA0=None, HPHI0=None, HF00=HF00effective, WM0=None, HM0=None, # Some more optional arguments, to control the "convergence" # of the algo numberOfIterations=niter, updateRulePower=1.0, stepNotes=stepNotes, lambdaHF0 = 0.0 / (1.0 * SXR.max()), alphaHF0=0.9, verbose=options.verbose, displayEvolution=displayEvolution) WPHI = np.dot(WGAMMA, HGAMMA) SPHI = np.dot(WPHI, HPHI) SF0 = np.dot(WF0effective, HF0) hatSXR = (alphaR**2) * SF0 * SPHI + np.dot(np.dot(WM, betaR**2), HM) hatSXL = (alphaL**2) * SF0 * SPHI + np.dot(np.dot(WM, betaL**2), HM) hatVR = (alphaR**2) * SPHI * SF0 / hatSXR * XR vestR = istft( hatVR, hopsize=hopsize, nfft=NFT, window=sinebell(windowSizeInSamples)) / 4.0 hatVR = (alphaL**2) * SPHI * SF0 / hatSXL * XL vestL = istft( hatVR, hopsize=hopsize, nfft=NFT, window=sinebell(windowSizeInSamples)) / 4.0 #scikits.audiolab.wavwrite(np.array([vestR,vestL]).T, \ # options.voc_output_file, fs) vestR = np.array(np.round(vestR * scaleData), dtype=dataType) vestL = np.array(np.round(vestL * scaleData), dtype=dataType) # wav.write(options.voc_output_file, fs, \ # np.array([vestR,vestL]).T) #wav.write(options.voc_output_file, fs, \ # np.int16(32768.0 * np.array([vestR,vestL]).T)) hatMR = (np.dot(np.dot(WM, betaR**2), HM)) / hatSXR * XR mestR = istft( hatMR, hopsize=hopsize, nfft=NFT, window=sinebell(windowSizeInSamples)) / 4.0 hatMR = (np.dot(np.dot(WM, betaL**2), HM)) / hatSXL * XL mestL = istft( hatMR, hopsize=hopsize, nfft=NFT, window=sinebell(windowSizeInSamples)) / 4.0 #scikits.audiolab.wavwrite(np.array([mestR,mestL]).T, \ # options.mus_output_file, fs) mestR = np.array(np.round(mestR * scaleData), dtype=dataType) mestL = np.array(np.round(mestL * scaleData), dtype=dataType) # wav.write(options.mus_output_file, fs, \ # np.array([mestR,mestL]).T) #wav.write(options.mus_output_file, fs, \ # np.int16(32768.0 * 
np.array([mestR,mestL]).T)) del hatMR, mestL, vestL, vestR, mestR, hatVR, hatSXR, hatSXL, SPHI, SF0 # adding the unvoiced part in the source basis: WUF0 = np.hstack([WF0, np.ones([WF0.shape[0], 1])]) HUF0 = np.vstack([HF0, np.ones([1, HF0.shape[1]])]) ## HUF0[-1,:] = HF0.sum(axis=0) # should we do this? alphaR, alphaL, HGAMMA, HPHI, HF0, \ betaR, betaL, HM, WM, recoError3 = SIMM.Stereo_SIMM( # the data to be fitted to: SXR, SXL, # the basis matrices for the spectral combs WUF0, # and for the elementary filters: WGAMMA, # number of desired filters, accompaniment spectra: numberOfFilters=K, numberOfAccompanimentSpectralShapes=R, # if any, initial amplitude matrices for HGAMMA0=HGAMMA, HPHI0=HPHI, HF00=HUF0, WM0=None,#WM, HM0=None,#HM, # Some more optional arguments, to control the "convergence" # of the algo numberOfIterations=niter, updateRulePower=1.0, stepNotes=stepNotes, lambdaHF0 = 0.0 / (1.0 * SXR.max()), alphaHF0=0.9, verbose=options.verbose, displayEvolution=displayEvolution, updateHGAMMA=False) WPHI = np.dot(WGAMMA, HGAMMA) SPHI = np.dot(WPHI, HPHI) SF0 = np.dot(WUF0, HF0) hatSXR = (alphaR**2) * SF0 * SPHI + np.dot(np.dot(WM, betaR**2), HM) hatSXL = (alphaL**2) * SF0 * SPHI + np.dot(np.dot(WM, betaL**2), HM) hatVR = (alphaR**2) * SPHI * SF0 / hatSXR * XR vestR = istft( hatVR, hopsize=hopsize, nfft=NFT, window=sinebell(windowSizeInSamples)) / 4.0 hatVR = (alphaL**2) * SPHI * SF0 / hatSXL * XL vestL = istft( hatVR, hopsize=hopsize, nfft=NFT, window=sinebell(windowSizeInSamples)) / 4.0 outputFileName = options.voc_output_file[:-4] + '_VUIMM.wav' # scikits.audiolab.wavwrite(np.array([vestR,vestL]).T, outputFileName, fs) vestR = np.array(np.round(vestR * scaleData), dtype=dataType) vestL = np.array(np.round(vestL * scaleData), dtype=dataType) # wav.write(outputFileName, fs, \ # np.array([vestR,vestL]).T) hatMR = (np.dot(np.dot(WM, betaR**2), HM)) / hatSXR * XR mestR = istft( hatMR, hopsize=hopsize, nfft=NFT, window=sinebell(windowSizeInSamples)) / 4.0 hatMR = (np.dot(np.dot(WM, betaL**2), HM)) / hatSXL * XL mestL = istft( hatMR, hopsize=hopsize, nfft=NFT, window=sinebell(windowSizeInSamples)) / 4.0 #This is the required file outputFileName = options.mus_output_file[:-4] + '_VUIMM.wav' #scikits.audiolab.wavwrite(np.array([mestR,mestL]).T, outputFileName, fs) os.chdir('media/karaoke') mestR = np.array(np.round(mestR * scaleData), dtype=dataType) mestL = np.array(np.round(mestL * scaleData), dtype=dataType) wav.write(outputFileName, fs, \ np.array([mestR,mestL]).T) if displayEvolution: plt.close('all') ## raw_input("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"\ ## "!! Press Return to end the program... !!\n"\ ## "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") print("Done!") print outputFileName os.chdir('..') os.chdir('..')
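# ----------------------------------------------------------------------------
# The resynthesis in main() above is Wiener-style filtering: each source
# estimate is the mixture STFT weighted by the ratio of that source's model
# power to the total model power, then inverted with istft. Below is a
# compact sketch of the same idea for one channel; the helper is illustrative
# only and assumes this module's istft signature. Here S_lead would be e.g.
# (alphaR**2) * SF0 * SPHI and S_acc would be np.dot(np.dot(WM, betaR**2), HM).
def wiener_separate_channel(X, S_lead, S_acc, hopsize, NFT, window):
    """Return (lead, accompaniment) time signals from one channel STFT X."""
    hatSX = np.maximum(S_lead + S_acc, 1e-20)   # total modelled power
    lead = istft(S_lead / hatSX * X, hopsize=hopsize, nfft=NFT,
                 window=window) / 4.0
    acc = istft(S_acc / hatSX * X, hopsize=hopsize, nfft=NFT,
                window=window) / 4.0
    return lead, acc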
def main(args, options): stereoEstimation = True # Median filtering in spectrogram HPS = False displayEvolution = options.displayEvolution if displayEvolution: import matplotlib.pyplot as plt import imageMatlab ## plt.rc('text', usetex=True) plt.rc('image', cmap='jet') ## gray_r plt.ion() # Compulsory option: name of the input file: inputAudioFile = '' if len(args) >= 2: inputAudioFile = args[0] options.pitch_output_file = args[1] if len(args) == 1: inputAudioFile = args[0] if len(args) == 0: inputAudioFile = options.input_file if inputAudioFile[-4:] != ".wav": raise ValueError( "File not WAV file? Only WAV format support, for now...") #print "Writing the different following output files:" if not (options.vit_pitch_output_file is None): print " estimated pitches in", options.vit_pitch_output_file if not (options.sal_output_file is None): print " salience file in ", options.sal_output_file if options.pitch_output_file is None: options.pitch_output_file = inputAudioFile[:-4] + '_pitches.txt' try: from essentia.standard import AudioLoader loaded = AudioLoader(filename=inputAudioFile)() audio = loaded[0] Fs = loaded[1] nchan = loaded[2] loaded = AudioLoader(filename=inputAudioFile)() audio = loaded[0] if nchan == 1: data = audio[:, 0].transpose() else: data = audio.transpose() data = np.double(data) / (1.2 * abs(data).max()) except: # Using scipy to import wav import scipy.io.wavfile as wav Fs, data = wav.read(inputAudioFile) # data = np.double(data) / 32768.0 # makes data vary from -1 to 1 scaleData = 1.2 * data.max() # to rescale the data. data = np.double(data) / scaleData # makes data vary from -1 to 1 options.Fs = Fs is_stereo = True if data.shape[0] == data.size: # data is multi-channel #print "The audio file is not stereo." #print "The audio file is not stereo. Making stereo out of mono." #print "(You could also try the older separateLead.py...)" is_stereo = False # data = np.vstack([data,data]).T # raise ValueError("number of dimensions of the input not 2") if is_stereo and data.shape[1] != 2: print "The data is multichannel, but not stereo... \n" print "Unfortunately this program does not scale well. 
Data is \n" print "reduced to its 2 first channels.\n" data = data[:, 0:2] # Processing the options: windowSizeInSamples = nextpow2(np.round(options.windowSize * Fs)) hopsize = np.round(options.hopsize * Fs) #if hopsize != windowSizeInSamples/8: # #print "Overriding given hopsize to use 1/8th of window size" # #hopsize = windowSizeInSamples/8 # warnings.warn("Chosen hopsize: "+str(hopsize)+\ # ", while windowsize: "+str(windowSizeInSamples)) options.hopsizeInSamples = hopsize if options.fourierSize is None: NFT = windowSizeInSamples else: NFT = options.fourierSize # number of iterations for each parameter estimation step: niter = options.nbiter # number of spectral shapes for the accompaniment R = int(options.R) eps = 10**-9 if options.verbose: print "Some parameter settings:" print " Size of analysis windows: ", windowSizeInSamples print " Hopsize: ", hopsize print " Size of Fourier transforms: ", NFT print " Number of iterations to be done: ", niter print " Number of elements in WM: ", R if is_stereo: XR, F, N = stft(data[:, 0], fs=Fs, hopsize=hopsize, window=sinebell(windowSizeInSamples), nfft=NFT) XL, F, N = stft(data[:, 1], fs=Fs, hopsize=hopsize, window=sinebell(windowSizeInSamples), nfft=NFT) # SX is the power spectrogram: ## SXR = np.maximum(np.abs(XR) ** 2, 10 ** -8) ## SXL = np.maximum(np.abs(XL) ** 2, 10 ** -8) #SXR = np.abs(XR) ** 2 #SXL = np.abs(XL) ** 2 SX = np.maximum((0.5 * np.abs(XR + XL))**2, eps) else: # data is mono X, F, N = stft(data, fs=Fs, hopsize=hopsize, window=sinebell(windowSizeInSamples), nfft=NFT) SX = np.maximum(np.abs(X)**2, eps) del data, F, N # minimum and maximum F0 in glottal source spectra dictionary minF0 = options.minF0 maxF0 = options.maxF0 F, N = SX.shape stepNotes = options.stepNotes # this is the number of F0s within one semitone K = int( options.K_numFilters) # number of spectral shapes for the filter part P = int(options.P_numAtomFilters ) # number of elements in dictionary of smooth filters chirpPerF0 = 1 # number of chirped spectral shapes between each F0 # this feature should be further studied before # we find a good way of doing that. # Create the harmonic combs, for each F0 between minF0 and maxF0: F0Table, WF0 = \ generate_WF0_chirped(minF0, maxF0, Fs, Nfft=NFT, \ stepNotes=stepNotes, \ lengthWindow=windowSizeInSamples, Ot=0.25, \ perF0=chirpPerF0, \ depthChirpInSemiTone=.15, loadWF0=True,\ analysisWindow='sinebell') WF0 = WF0[0:F, :] # ensure same size as SX NF0 = F0Table.size # number of harmonic combs # Normalization: WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0)) # Create the dictionary of smooth filters, for the filter part of # the lead isntrument: WGAMMA = generateHannBasis(F, NFT, Fs=Fs, frequencyScale='linear', \ numberOfBasis=P, overlap=.75) if options.sal_output_file is None or not os.path.exists( options.sal_output_file): if displayEvolution: plt.figure(1) plt.clf() plt.xticks(fontsize=16) plt.yticks(fontsize=16) plt.xlabel(r'Frame number $n$', fontsize=16) plt.ylabel(r'Leading source number $u$', fontsize=16) plt.ion() # plt.show() ## the following seems superfluous if mpl's backend is macosx... ## raw_input("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"\ ## "!! Press Return to resume the program. !!\n"\ ## "!! Be sure that the figure has been !!\n"\ ## "!! already displayed, so that the !!\n"\ ## "!! evolution of HF0 will be visible. 
!!\n"\ ## "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") ## section to estimate the melody, on monophonic algo: # First round of parameter estimation: if (HPS): from scipy.signal import medfilt if (is_stereo & stereoEstimation): SXR = np.maximum(np.abs(XR)**2, eps) SXL = np.maximum(np.abs(XL)**2, eps) if (HPS): SXR = medfilt(SXR, 3) SXL = medfilt(SXL, 3) alphaR, alphaL, HGAMMA, HPHI, HF0, betaR, betaL, HM, WM, recoError1 = SIMM.Stereo_SIMM( # the data to be fitted to: SXR, SXL, # the basis matrices for the spectral combs WF0, # and for the elementary filters: WGAMMA, # number of desired filters, accompaniment spectra: numberOfFilters=K, numberOfAccompanimentSpectralShapes=R, # putting only 2 elements in accompaniment for a start... # if any, initial amplitude matrices for HGAMMA0=None, HPHI0=None, HF00=None, WM0=None, HM0=None, # Some more optional arguments, to control the "convergence" # of the algo numberOfIterations=niter, updateRulePower=1., stepNotes=stepNotes, lambdaHF0=0.0 / (1.0 * SX.max()), alphaHF0=0.9, verbose=options.verbose, displayEvolution=displayEvolution) else: if (HPS): SX = medfilt(SX, 3) HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM( # the data to be fitted to: SX, # the basis matrices for the spectral combs WF0, # and for the elementary filters: WGAMMA, # number of desired filters, accompaniment spectra: numberOfFilters=K, numberOfAccompanimentSpectralShapes=R, # putting only 2 elements in accompaniment for a start... # if any, initial amplitude matrices for HGAMMA0=None, HPHI0=None, HF00=None, WM0=None, HM0=None, # Some more optional arguments, to control the "convergence" # of the algo numberOfIterations=niter, updateRulePower=1., stepNotes=stepNotes, lambdaHF0=0.0 / (1.0 * SX.max()), alphaHF0=0.9, verbose=options.verbose, displayEvolution=displayEvolution) if displayEvolution: h2 = plt.figure(2) plt.clf() imageMatlab.imageM(20 * np.log10(HF0)) matMax = (20 * np.log10(HF0)).max() matMed = np.median(20 * np.log10(HF0)) plt.clim([matMed - 100, matMax]) else: print "Loading Salience from file to calculate Melody: " + options.sal_output_file loaded = np.loadtxt(options.sal_output_file).T times = [loaded[0, :]] HF0 = loaded[1:, :] # If vit_pitch_output_file is not null, do melody extraction with Viterbi if not (options.vit_pitch_output_file is None): print "Viterbi decoding" # Viterbi decoding to estimate the predominant fundamental # frequency line # create transition probability matrix - adhoc parameter 'scale' # TODO: use "learned" parameter scale (NB: after many trials, # provided scale and parameterization seems robust) scale = 1.0 transitions = np.exp(-np.floor(np.arange(0, NF0) / stepNotes) * scale) cutoffnote = 2 * 5 * stepNotes transitions[cutoffnote:] = transitions[cutoffnote - 1] transitionMatrixF0 = np.zeros([NF0 + 1, NF0 + 1]) # toeplitz matrix b = np.arange(NF0) transitionMatrixF0[0:NF0, 0:NF0] = \ transitions[\ np.array(np.abs(np.outer(np.ones(NF0), b) \ - np.outer(b, np.ones(NF0))), dtype=int)] pf_0 = transitions[cutoffnote - 1] * 10**(-90) p0_0 = transitions[cutoffnote - 1] * 10**(-100) p0_f = transitions[cutoffnote - 1] * 10**(-80) transitionMatrixF0[0:NF0, NF0] = pf_0 transitionMatrixF0[NF0, 0:NF0] = p0_f transitionMatrixF0[NF0, NF0] = p0_0 sumTransitionMatrixF0 = np.sum(transitionMatrixF0, axis=1) transitionMatrixF0 = transitionMatrixF0 \ / np.outer(sumTransitionMatrixF0, \ np.ones(NF0 + 1)) # prior probabilities, and setting the array for Viterbi tracking: priorProbabilities = 1 / (NF0 + 1.0) * np.ones([NF0 + 1]) logHF0 = np.zeros([NF0 + 1, N]) 
normHF0 = np.amax(HF0, axis=0) barHF0 = np.array(HF0) logHF0[0:NF0, :] = np.log(barHF0) logHF0[0:NF0, normHF0 == 0] = np.amin(logHF0[logHF0 > -np.Inf]) logHF0[NF0, :] = np.maximum(np.amin(logHF0[logHF0 > -np.Inf]), -100) indexBestPath = viterbiTrackingArray(\ logHF0, np.log(priorProbabilities), np.log(transitionMatrixF0), verbose=options.verbose) if displayEvolution: h2.hold(True) plt.plot(indexBestPath, '-b') h2.hold(False) plt.axis('tight') del logHF0 # detection of silences: # computing the melody restricted F0 amplitude matrix HF00 # (which will be used as initial HF0 for further algo): HF00 = np.zeros([NF0 * chirpPerF0, N]) scopeAllowedHF0 = 2.0 / 1.0 # computing indices for and around the melody indices, # dim1index are indices along axis 0, and dim2index along axis 1 # of HF0: # TODO: use numpy broadcasting to make this "clearer" (if possible...) dim1index = np.array(\ np.maximum(\ np.minimum(\ np.outer(chirpPerF0 * indexBestPath, np.ones(chirpPerF0 \ * (2 \ * int(np.floor(stepNotes / scopeAllowedHF0)) \ + 1))) \ + np.outer(np.ones(N), np.arange(-chirpPerF0 \ * int(np.floor(stepNotes / scopeAllowedHF0)), chirpPerF0 \ * int((np.floor(stepNotes / scopeAllowedHF0))) \ + 1)), chirpPerF0 * NF0 - 1), 0), dtype=int).reshape(1, N * chirpPerF0 \ * (2 * int(np.floor(stepNotes / scopeAllowedHF0)) \ + 1)) dim2index = np.outer(np.arange(N), np.ones(chirpPerF0 \ * (2 * int(np.floor(stepNotes \ / scopeAllowedHF0)) + 1), \ dtype=int)\ ).reshape(1, N * chirpPerF0 \ * (2 * int(np.floor(stepNotes \ / scopeAllowedHF0)) \ + 1)) HF00[dim1index, dim2index] = HF0[dim1index, dim2index] # HF0.max() HF00[:, indexBestPath == (NF0 - 1)] = 0.0 HF00[:, indexBestPath == 0] = 0.0 # remove frames with less than (100 thres_energy) % of total energy. thres_energy = 0.000584 SF0 = np.maximum(np.dot(WF0, HF00), eps) SPHI = np.maximum(np.dot(WGAMMA, np.dot(HGAMMA, HPHI)), eps) SM = np.maximum(np.dot(WM, HM), eps) hatSX = np.maximum(SPHI * SF0 + SM, eps) energyMel = np.sum((((SPHI * SF0) / hatSX)**2) * SX, axis=0) energyMelSorted = np.sort(energyMel) energyMelCumul = np.cumsum(energyMelSorted) energyMelCumulNorm = energyMelCumul / max(energyMelCumul[-1], eps) # normalized to the maximum of energy: # expressed in 0.01 times the percentage ind_999 = np.nonzero(energyMelCumulNorm > thres_energy)[0][0] if ind_999 is None: ind_999 = N if not os.path.isdir(os.path.dirname((options.vit_pitch_output_file))): os.mkdir(os.path.dirname((options.vit_pitch_output_file))) np.savetxt(options.vit_pitch_output_file + '.egy', np.array( [np.arange(N) * hopsize / np.double(Fs), energyMel]).T, fmt='%10.5f') # energyMel <= energyMelCumul[ind_999]? 
melNotPresent = (energyMel <= energyMelCumulNorm[ind_999]) # edit: frames predicted as unvoiced will be given negative values # indexBestPath[melNotPresent] = 0 freqMelody = F0Table[np.array(np.minimum(indexBestPath, len(F0Table) - 1), dtype=int)] freqMelody[melNotPresent] = -freqMelody[melNotPresent] if not os.path.exists(os.path.dirname(options.vit_pitch_output_file)): os.makedirs(os.path.dirname(options.vit_pitch_output_file)) np.savetxt(options.vit_pitch_output_file, np.array( [np.arange(N) * hopsize / np.double(Fs), freqMelody]).T, fmt='%10.7f') times = np.array([np.arange(N) * hopsize / np.double(Fs)]) # Save salience file: if not (options.sal_output_file is None): if not os.path.exists(os.path.dirname(options.sal_output_file)): os.makedirs(os.path.dirname(options.sal_output_file)) np.savetxt(options.sal_output_file, np.concatenate((times, HF0), axis=0).T, fmt='%10.6f') # saveSPHI (timbre related) saveSPHI = 0 if saveSPHI: if not os.path.exists( os.path.dirname(options.sal_output_file + '.SPHI')): os.makedirs(os.path.dirname(options.sal_output_file)) WPHI = np.dot(WGAMMA, HGAMMA) SPHI = np.dot(WPHI, HPHI) np.savetxt(options.sal_output_file + '.SPHI', np.concatenate((times, SPHI), axis=0).T, fmt='%10.4f') #np.savetxt(options.sal_output_file+'.WGAMMA',np.concatenate((times,WGAMMA),axis=0).T,fmt='%10.4f') # return times[0],freqMelody,HF0 print "Done!" return times[0], HF0, options
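# ----------------------------------------------------------------------------
# Sketch of the F0 transition model used for the Viterbi decodings above: the
# probability of jumping between F0 bins decays exponentially with the pitch
# distance (in semitones, via the adhoc 'scale' parameter), and one extra
# state (index NF0) models silence. The values mirror the script; the helper
# name is illustrative only, and NF0 must exceed 10 * stepNotes.
import numpy as np


def make_f0_transition_matrix(NF0, stepNotes, scale=1.0):
    transitions = np.exp(-np.floor(np.arange(0, NF0) / stepNotes) * scale)
    cutoffnote = 2 * 5 * stepNotes
    transitions[cutoffnote:] = transitions[cutoffnote - 1]
    A = np.zeros([NF0 + 1, NF0 + 1])
    b = np.arange(NF0)
    # Toeplitz block: probability depends only on |distance| between bins.
    A[0:NF0, 0:NF0] = transitions[
        np.array(np.abs(np.outer(np.ones(NF0), b) - np.outer(b, np.ones(NF0))),
                 dtype=int)]
    A[0:NF0, NF0] = transitions[cutoffnote - 1] * 10 ** (-90)  # pitch -> silence
    A[NF0, 0:NF0] = transitions[cutoffnote - 1] * 10 ** (-80)  # silence -> pitch
    A[NF0, NF0] = transitions[cutoffnote - 1] * 10 ** (-100)   # stay silent
    return A / np.sum(A, axis=1)[:, np.newaxis]                # row-normalize

# e.g., for a 100-800 Hz dictionary with 20 F0s per semitone:
#     A = make_f0_transition_matrix(NF0=720, stepNotes=20)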
def main():
    import optparse

    usage = "usage: %prog [options] inputAudioFile"
    parser = optparse.OptionParser(usage)
    # Name of the output files:
    parser.add_option(
        "-v",
        "--vocal-output-file",
        dest="voc_output_file",
        type="string",
        help="name of the audio output file for the estimated\n"
        "solo (vocal) part. \n"
        "If None, appends _lead to inputAudioFile.",
        default=None,
    )
    parser.add_option(
        "-m",
        "--music-output-file",
        dest="mus_output_file",
        type="string",
        help="name of the audio output file for the estimated\n"
        "music part.\n"
        "If None, appends _acc to inputAudioFile.",
        default=None,
    )
    parser.add_option(
        "-p",
        "--pitch-output-file",
        dest="pitch_output_file",
        type="string",
        help="name of the output file for the estimated pitches.\n"
        "If None, appends _pitches to inputAudioFile",
        default=None,
    )
    # Some more optional options:
    parser.add_option(
        "-d", "--with-display", dest="displayEvolution", action="store_true", help="display the figures", default=False
    )
    parser.add_option(
        "-q", "--quiet", dest="verbose", action="store_false", help="use to quiet all verbose output", default=True
    )
    parser.add_option(
        "-n",
        "--dontseparate",
        dest="separateSignals",
        action="store_false",
        help="Trigger this option if you only desire to " + "estimate the melody",
        default=True,
    )
    parser.add_option("--nb-iterations", dest="nbiter", help="number of iterations", type="int", default=30)
    parser.add_option(
        "--window-size", dest="windowSize", type="float", default=0.04644, help="size of analysis windows, in s."
    )
    parser.add_option(
        "--Fourier-size",
        dest="fourierSize",
        type="int",
        default=None,
        help="size of Fourier transforms, " "in samples.",
    )
    parser.add_option(
        "--hopsize",
        dest="hopsize",
        type="float",
        default=0.0058,
        help="size of the hop between analysis windows, in s.",
    )
    parser.add_option(
        "--nb-accElements", dest="R", type="float", default=40.0, help="number of elements for the accompaniment."
    )
    parser.add_option(
        "--with-melody",
        dest="melody",
        type="string",
        default=None,
        help="provide the melody in a file named MELODY, " "with at each line: <time (s)><F0 (Hz)>.",
    )
    parser.add_option(
        "--numAtomFilters",
        dest="P_numAtomFilters",
        type="int",
        default=30,
        help="Number of atomic filters - in WGAMMA.",
    )
    parser.add_option(
        "--numFilters",
        dest="K_numFilters",
        type="int",
        default=10,
        help="Number of filters for decomposition - in WPHI",
    )
    parser.add_option(
        "--min-F0-Freq", dest="minF0", type="float", default=100.0, help="Minimum of fundamental frequency F0."
    )
    parser.add_option(
        "--max-F0-Freq", dest="maxF0", type="float", default=800.0, help="Maximum of fundamental frequency F0."
    )
    parser.add_option(
        "--step-F0s", dest="stepNotes", type="int", default=20, help="Number of F0s in dictionary for each semitone."
    )

    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("incorrect number of arguments, use option -h for help.")

    displayEvolution = options.displayEvolution
    if displayEvolution:
        import matplotlib.pyplot as plt
        import imageMatlab

        ## plt.rc('text', usetex=True)
        plt.rc("image", cmap="jet")  ## gray_r
        plt.ion()

    # Compulsory option: name of the input file:
    inputAudioFile = args[0]
    if inputAudioFile[-4:] != ".wav":
        raise ValueError("File not WAV file? Only WAV format supported, for now...")

    if options.mus_output_file is None:
        options.mus_output_file = inputAudioFile[:-4] + "_acc.wav"
    if options.voc_output_file is None:
        options.voc_output_file = inputAudioFile[:-4] + "_lead.wav"
    if options.pitch_output_file is None:
        options.pitch_output_file = inputAudioFile[:-4] + "_pitches.txt"

    print "Writing the following output files:"
    print "    separated lead          in", options.voc_output_file
    print "    separated accompaniment in", options.mus_output_file
    print "    separated lead + unvoc  in", options.voc_output_file[:-4] + "_VUIMM.wav"
    print "    separated acc  - unvoc  in", options.mus_output_file[:-4] + "_VUIMM.wav"
    print "    estimated pitches       in", options.pitch_output_file

    Fs, data = wav.read(inputAudioFile)
    # data = np.double(data) / 32768.0 # makes data vary from -1 to 1
    scaleData = 1.2 * data.max()  # to rescale the data.
    dataType = data.dtype
    data = np.double(data) / scaleData  # makes data vary from -1 to 1

    is_stereo = True
    if data.shape[0] == data.size:  # data is mono (1-D)
        print "The audio file is not stereo. Making stereo out of mono."
        print "(You could also try the older separateLead.py...)"
        is_stereo = False
        # data = np.vstack([data,data]).T
        # raise ValueError("number of dimensions of the input not 2")

    if is_stereo and data.shape[1] != 2:
        print "The data is multichannel, but not stereo... \n"
        print "Unfortunately this program does not scale well. Data is \n"
        print "reduced to its 2 first channels.\n"
        data = data[:, 0:2]

    # Processing the options:
    windowSizeInSamples = nextpow2(np.round(options.windowSize * Fs))
    hopsize = np.round(options.hopsize * Fs)
    if hopsize != windowSizeInSamples / 8:
        # print "Overriding given hopsize to use 1/8th of window size"
        # hopsize = windowSizeInSamples/8
        warnings.warn("Chosen hopsize: " + str(hopsize) + ", while windowsize: " + str(windowSizeInSamples))

    if options.fourierSize is None:
        NFT = windowSizeInSamples
    else:
        NFT = options.fourierSize

    # number of iterations for each parameter estimation step:
    niter = options.nbiter
    # number of spectral shapes for the accompaniment
    R = options.R

    eps = 10 ** -9

    if options.verbose:
        print "Some parameter settings:"
        print "    Size of analysis windows        : ", windowSizeInSamples
        print "    Hopsize                         : ", hopsize
        print "    Size of Fourier transforms      : ", NFT
        print "    Number of iterations to be done : ", niter
        print "    Number of elements in WM        : ", R

    if is_stereo:
        XR, F, N = stft(data[:, 0], fs=Fs, hopsize=hopsize, window=sinebell(windowSizeInSamples), nfft=NFT)
        XL, F, N = stft(data[:, 1], fs=Fs, hopsize=hopsize, window=sinebell(windowSizeInSamples), nfft=NFT)
        # SX is the power spectrogram:
        ## SXR = np.maximum(np.abs(XR) ** 2, 10 ** -8)
        ## SXL = np.maximum(np.abs(XL) ** 2, 10 ** -8)
        # SXR = np.abs(XR) ** 2
        # SXL = np.abs(XL) ** 2
        SX = np.maximum((0.5 * np.abs(XR + XL)) ** 2, eps)
    else:  # data is mono
        X, F, N = stft(data, fs=Fs, hopsize=hopsize, window=sinebell(windowSizeInSamples), nfft=NFT)
        SX = np.maximum(np.abs(X) ** 2, eps)

    del data, F, N

    # TODO: also process these as options:
    # minimum and maximum F0 in glottal source spectra dictionary
    minF0 = options.minF0
    maxF0 = options.maxF0
    F, N = SX.shape
    stepNotes = options.stepNotes  # this is the number of F0s within one semitone

    K = options.K_numFilters  # number of spectral shapes for the filter part
    P = options.P_numAtomFilters  # number of elements in dictionary of smooth filters
    chirpPerF0 = 1  # number of chirped spectral shapes between each F0
    # this feature should be further studied before
    # we find a good way of doing that.
# Create the harmonic combs, for each F0 between minF0 and maxF0: F0Table, WF0 = generate_WF0_chirped( minF0, maxF0, Fs, Nfft=NFT, stepNotes=stepNotes, lengthWindow=windowSizeInSamples, Ot=0.25, perF0=chirpPerF0, depthChirpInSemiTone=0.15, loadWF0=True, analysisWindow="sinebell", ) WF0 = WF0[0:F, :] # ensure same size as SX NF0 = F0Table.size # number of harmonic combs # Normalization: WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0)) # Create the dictionary of smooth filters, for the filter part of # the lead isntrument: WGAMMA = generateHannBasis(F, NFT, Fs=Fs, frequencyScale="linear", numberOfBasis=P, overlap=0.75) if displayEvolution: plt.figure(1) plt.clf() plt.xticks(fontsize=16) plt.yticks(fontsize=16) plt.xlabel(r"Frame number $n$", fontsize=16) plt.ylabel(r"Leading source number $u$", fontsize=16) plt.ion() # plt.show() ## the following seems superfluous if mpl's backend is macosx... ## raw_input("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"\ ## "!! Press Return to resume the program. !!\n"\ ## "!! Be sure that the figure has been !!\n"\ ## "!! already displayed, so that the !!\n"\ ## "!! evolution of HF0 will be visible. !!\n"\ ## "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") if options.melody is None: ## section to estimate the melody, on monophonic algo: # First round of parameter estimation: HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM( # the data to be fitted to: SX, # the basis matrices for the spectral combs WF0, # and for the elementary filters: WGAMMA, # number of desired filters, accompaniment spectra: numberOfFilters=K, numberOfAccompanimentSpectralShapes=R, # putting only 2 elements in accompaniment for a start... # if any, initial amplitude matrices for HGAMMA0=None, HPHI0=None, HF00=None, WM0=None, HM0=None, # Some more optional arguments, to control the "convergence" # of the algo numberOfIterations=niter, updateRulePower=1.0, stepNotes=stepNotes, lambdaHF0=0.0 / (1.0 * SX.max()), alphaHF0=0.9, verbose=options.verbose, displayEvolution=displayEvolution, ) if displayEvolution: h2 = plt.figure(2) plt.clf() imageMatlab.imageM(20 * np.log10(HF0)) matMax = (20 * np.log10(HF0)).max() matMed = np.median(20 * np.log10(HF0)) plt.clim([matMed - 100, matMax]) # Viterbi decoding to estimate the predominant fundamental # frequency line # create transition probability matrix - adhoc parameter 'scale' # TODO: use "learned" parameter scale (NB: after many trials, # provided scale and parameterization seems robust) scale = 1.0 transitions = np.exp(-np.floor(np.arange(0, NF0) / stepNotes) * scale) cutoffnote = 2 * 5 * stepNotes transitions[cutoffnote:] = transitions[cutoffnote - 1] transitionMatrixF0 = np.zeros([NF0 + 1, NF0 + 1]) # toeplitz matrix b = np.arange(NF0) transitionMatrixF0[0:NF0, 0:NF0] = transitions[ np.array(np.abs(np.outer(np.ones(NF0), b) - np.outer(b, np.ones(NF0))), dtype=int) ] pf_0 = transitions[cutoffnote - 1] * 10 ** (-90) p0_0 = transitions[cutoffnote - 1] * 10 ** (-100) p0_f = transitions[cutoffnote - 1] * 10 ** (-80) transitionMatrixF0[0:NF0, NF0] = pf_0 transitionMatrixF0[NF0, 0:NF0] = p0_f transitionMatrixF0[NF0, NF0] = p0_0 sumTransitionMatrixF0 = np.sum(transitionMatrixF0, axis=1) transitionMatrixF0 = transitionMatrixF0 / np.outer(sumTransitionMatrixF0, np.ones(NF0 + 1)) # prior probabilities, and setting the array for Viterbi tracking: priorProbabilities = 1 / (NF0 + 1.0) * np.ones([NF0 + 1]) logHF0 = np.zeros([NF0 + 1, N]) normHF0 = np.amax(HF0, axis=0) barHF0 = np.array(HF0) logHF0[0:NF0, :] = np.log(barHF0) logHF0[0:NF0, normHF0 == 0] = 
        # prior probabilities, and setting the array for Viterbi tracking:
        priorProbabilities = 1 / (NF0 + 1.0) * np.ones([NF0 + 1])
        logHF0 = np.zeros([NF0 + 1, N])
        normHF0 = np.amax(HF0, axis=0)
        barHF0 = np.array(HF0)

        logHF0[0:NF0, :] = np.log(barHF0)
        logHF0[0:NF0, normHF0 == 0] = np.amin(logHF0[logHF0 > -np.Inf])
        logHF0[NF0, :] = np.maximum(np.amin(logHF0[logHF0 > -np.Inf]), -100)

        indexBestPath = viterbiTrackingArray(
            logHF0, np.log(priorProbabilities),
            np.log(transitionMatrixF0), verbose=options.verbose)

        if displayEvolution:
            h2.hold(True)
            plt.plot(indexBestPath, '-b')
            h2.hold(False)
            plt.axis('tight')

        del logHF0

        # detection of silences:
        # computing the melody-restricted F0 amplitude matrix HF00
        # (which will be used as initial HF0 for the further algo):
        HF00 = np.zeros([NF0 * chirpPerF0, N])
        scopeAllowedHF0 = 2.0 / 1.0
        # half-width (in F0 bins) of the band kept around the melody, and
        # total number of kept bins per frame:
        halfWidth = int(np.floor(stepNotes / scopeAllowedHF0))
        combWidth = chirpPerF0 * (2 * halfWidth + 1)
        # computing indices for and around the melody indices:
        # dim1index are indices along axis 0, and dim2index along axis 1
        # of HF0.
        # TODO: use numpy broadcasting to make this "clearer" (if possible...)
        dim1index = np.array(
            np.maximum(
                np.minimum(
                    np.outer(chirpPerF0 * indexBestPath,
                             np.ones(combWidth))
                    + np.outer(np.ones(N),
                               np.arange(-chirpPerF0 * halfWidth,
                                         chirpPerF0 * (halfWidth + 1))),
                    chirpPerF0 * NF0 - 1),
                0),
            dtype=int).reshape(1, N * combWidth)
        dim2index = np.outer(np.arange(N),
                             np.ones(combWidth, dtype=int)
                             ).reshape(1, N * combWidth)
        HF00[dim1index, dim2index] = HF0[dim1index, dim2index]  # HF0.max()

        HF00[:, indexBestPath == (NF0 - 1)] = 0.0
        HF00[:, indexBestPath == 0] = 0.0

        # remove frames with less than (100 * thres_energy)% of the
        # total energy:
        thres_energy = 0.000584
        SF0 = np.maximum(np.dot(WF0, HF00), eps)
        SPHI = np.maximum(np.dot(WGAMMA, np.dot(HGAMMA, HPHI)), eps)
        SM = np.maximum(np.dot(WM, HM), eps)
        hatSX = np.maximum(SPHI * SF0 + SM, eps)
        energyMel = np.sum((((SPHI * SF0) / hatSX) ** 2) * SX, axis=0)
        energyMelSorted = np.sort(energyMel)
        energyMelCumul = np.cumsum(energyMelSorted)
        energyMelCumulNorm = energyMelCumul / max(energyMelCumul[-1], eps)
        # normalized to the maximum of energy:
        # expressed in 0.01 times the percentage
        # NB: np.nonzero never returns None, so the former
        # "if ind_999 is None" test could not fire; guard on an empty
        # index array instead:
        indices_999 = np.nonzero(energyMelCumulNorm > thres_energy)[0]
        if indices_999.size == 0:
            ind_999 = N - 1
        else:
            ind_999 = indices_999[0]
        melNotPresent = energyMel <= energyMelCumulNorm[ind_999]
        indexBestPath[melNotPresent] = 0
    else:
        ## take the provided melody line:
        # load melody from file:
        melodyFromFile = np.loadtxt(options.melody)
        sizeProvidedMel = melodyFromFile.shape
        if len(sizeProvidedMel) == 1:
            print "The melody should be provided as <Time (s)><F0 (Hz)>."
            raise ValueError("Bad melody format")
        melTimeStamps = melodyFromFile[:, 0]  # + 1024 / np.double(Fs)
        melFreqHz = melodyFromFile[:, 1]
        if minF0 > melFreqHz[melFreqHz > 40.0].min() \
                or maxF0 < melFreqHz.max():
            minF0 = melFreqHz[melFreqHz > 40.0].min() * 0.97
            maxF0 = np.maximum(melFreqHz.max() * 1.03, 2 * minF0 * 1.03)
            print "Recomputing the source basis for"
            print "minF0 =", minF0, "Hz and maxF0 =", maxF0, "Hz."
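            # NB: the 3% margins above amount to about half a semitone,
            # since one semitone is a frequency ratio of
            # 2 ** (1 / 12.) ~ 1.059; this keeps the provided melody
            # strictly inside the rebuilt F0 grid.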
            # Create the harmonic combs, for each F0 between minF0 and
            # maxF0:
            F0Table, WF0 = \
                generate_WF0_chirped(minF0, maxF0, Fs, Nfft=NFT,
                                     stepNotes=stepNotes,
                                     lengthWindow=windowSizeInSamples,
                                     Ot=0.25, perF0=chirpPerF0,
                                     depthChirpInSemiTone=0.15)
            WF0 = WF0[0:F, :]  # ensure same size as SX
            NF0 = F0Table.size  # number of harmonic combs
            # Normalization:
            WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0))

        sigTimeStamps = np.arange(N) * hopsize / np.double(Fs)
        distMatTimeStamps = np.abs(
            np.outer(np.ones(sizeProvidedMel[0]), sigTimeStamps)
            - np.outer(melTimeStamps, np.ones(N)))
        minDistTimeStamps = distMatTimeStamps.argmin(axis=0)
        f0BestPath = melFreqHz[minDistTimeStamps]
        distMatF0 = np.abs(np.outer(np.ones(NF0), f0BestPath)
                           - np.outer(F0Table, np.ones(N)))
        indexBestPath = distMatF0.argmin(axis=0)
        # setting silences to 0, with tolerance = 1/2 window length:
        indexBestPath[distMatTimeStamps[minDistTimeStamps, range(N)]
                      >= 0.5 * options.windowSize] = 0
        indexBestPath[f0BestPath <= 0] = 0

    # map the Viterbi "no pitch" state (index NF0) to silence, so that it
    # cannot overflow F0Table below:
    indexBestPath[indexBestPath == NF0] = 0
    freqMelody = F0Table[np.array(indexBestPath, dtype=int)]
    freqMelody[indexBestPath == 0] = -freqMelody[indexBestPath == 0]
    np.savetxt(options.pitch_output_file,
               np.array([np.arange(N) * hopsize / np.double(Fs),
                         freqMelody]).T)

    # If separation is required:
    if options.separateSignals:
        # Second round of parameter estimation, with specific
        # initial HF00:
        HF00 = np.zeros([NF0 * chirpPerF0, N])
        scopeAllowedHF0 = 2.0 / 1.0
        halfWidth = int(np.floor(stepNotes / scopeAllowedHF0))
        combWidth = chirpPerF0 * (2 * halfWidth + 1)

        # indexes for HF00:
        # TODO: reprogram this with a 'where'?...
        dim1index = np.array(
            np.maximum(
                np.minimum(
                    np.outer(chirpPerF0 * indexBestPath,
                             np.ones(combWidth))
                    + np.outer(np.ones(N),
                               np.arange(-chirpPerF0 * halfWidth,
                                         chirpPerF0 * (halfWidth + 1))),
                    chirpPerF0 * NF0 - 1),
                0),
            dtype=int)
        dim1index = dim1index[indexBestPath != 0, :]
        ## dim1index = dim1index.reshape(1, N * combWidth)
        dim1index = dim1index.reshape(1, dim1index.size)

        dim2index = np.outer(np.arange(N),
                             np.ones(combWidth, dtype=int))
        dim2index = dim2index[indexBestPath != 0, :]
        dim2index = dim2index.reshape(1, dim2index.size)

        HF00[dim1index, dim2index] = 1  # HF0.max()

        HF00[:, indexBestPath == (NF0 - 1)] = 0.0
        HF00[:, indexBestPath == 0] = 0.0

        WF0effective = WF0
        HF00effective = HF00
        if options.melody is None:
            del HF0, HGAMMA, HPHI, HM, WM, HF00

        if is_stereo:
            del SX
            SXR = np.maximum(np.abs(XR) ** 2, eps)
            SXL = np.maximum(np.abs(XL) ** 2, eps)
            alphaR, alphaL, HGAMMA, HPHI, HF0, betaR, betaL, HM, WM, \
                recoError2 = SIMM.Stereo_SIMM(
                    # the data to be fitted to:
                    SXR, SXL,
                    # the basis matrices for the spectral combs:
                    WF0effective,
                    # and for the elementary filters:
                    WGAMMA,
                    # number of desired filters, accompaniment spectra:
                    numberOfFilters=K,
                    numberOfAccompanimentSpectralShapes=R,
                    # if any, initial amplitude matrices:
                    HGAMMA0=None, HPHI0=None,
                    HF00=HF00effective,
                    WM0=None, HM0=None,
                    # Some more optional arguments, to control the
                    # "convergence" of the algo:
                    numberOfIterations=niter, updateRulePower=1.0,
                    stepNotes=stepNotes,
                    lambdaHF0=0.0 / (1.0 * SXR.max()), alphaHF0=0.9,
                    verbose=options.verbose,
                    displayEvolution=displayEvolution)

            WPHI = np.dot(WGAMMA, HGAMMA)
            SPHI = np.dot(WPHI, HPHI)
            SF0 = np.dot(WF0effective, HF0)
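            # The stereo model fitted above, for each channel c in {R, L}:
            #
            #     hatSX_c = alpha_c**2 * SPHI * SF0
            #               + np.dot(np.dot(WM, beta_c**2), HM)
            #
            # where SPHI * SF0 is the lead's source/filter PSD, WM.HM the
            # accompaniment PSD, and alpha_c, beta_c per-channel (panning)
            # gains. The sources below are recovered by Wiener-style
            # masking of the complex STFTs XR and XL with these PSD ratios.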
            hatSXR = (alphaR ** 2) * SF0 * SPHI \
                     + np.dot(np.dot(WM, betaR ** 2), HM)
            hatSXL = (alphaL ** 2) * SF0 * SPHI \
                     + np.dot(np.dot(WM, betaL ** 2), HM)

            hatVR = (alphaR ** 2) * SPHI * SF0 / hatSXR * XR
            vestR = istft(hatVR, hopsize=hopsize, nfft=NFT,
                          window=sinebell(windowSizeInSamples)) / 4.0
            # (the left-channel mask gets its own name, instead of
            # silently reusing hatVR:)
            hatVL = (alphaL ** 2) * SPHI * SF0 / hatSXL * XL
            vestL = istft(hatVL, hopsize=hopsize, nfft=NFT,
                          window=sinebell(windowSizeInSamples)) / 4.0

            # scikits.audiolab.wavwrite(np.array([vestR, vestL]).T,
            #                           options.voc_output_file, Fs)
            vestR = np.array(np.round(vestR * scaleData), dtype=dataType)
            vestL = np.array(np.round(vestL * scaleData), dtype=dataType)
            wav.write(options.voc_output_file, Fs,
                      np.array([vestR, vestL]).T)
            # wav.write(options.voc_output_file, Fs,
            #           np.int16(32768.0 * np.array([vestR, vestL]).T))

            hatMR = np.dot(np.dot(WM, betaR ** 2), HM) / hatSXR * XR
            mestR = istft(hatMR, hopsize=hopsize, nfft=NFT,
                          window=sinebell(windowSizeInSamples)) / 4.0
            hatML = np.dot(np.dot(WM, betaL ** 2), HM) / hatSXL * XL
            mestL = istft(hatML, hopsize=hopsize, nfft=NFT,
                          window=sinebell(windowSizeInSamples)) / 4.0

            # scikits.audiolab.wavwrite(np.array([mestR, mestL]).T,
            #                           options.mus_output_file, Fs)
            mestR = np.array(np.round(mestR * scaleData), dtype=dataType)
            mestL = np.array(np.round(mestL * scaleData), dtype=dataType)
            wav.write(options.mus_output_file, Fs,
                      np.array([mestR, mestL]).T)
            # wav.write(options.mus_output_file, Fs,
            #           np.int16(32768.0 * np.array([mestR, mestL]).T))

            del hatMR, hatML, mestL, vestL, vestR, mestR, hatVR, hatVL, \
                hatSXR, hatSXL, SPHI, SF0

            # adding the unvoiced part in the source basis: one extra
            # flat column in WF0 (and a matching row in HF0), to catch
            # the unvoiced/noisy part of the lead:
            WUF0 = np.hstack([WF0, np.ones([WF0.shape[0], 1])])
            HUF0 = np.vstack([HF0, np.ones([1, HF0.shape[1]])])
            ## HUF0[-1, :] = HF0.sum(axis=0) # should we do this?

            alphaR, alphaL, HGAMMA, HPHI, HF0, betaR, betaL, HM, WM, \
                recoError3 = SIMM.Stereo_SIMM(
                    # the data to be fitted to:
                    SXR, SXL,
                    # the basis matrices for the spectral combs:
                    WUF0,
                    # and for the elementary filters:
                    WGAMMA,
                    # number of desired filters, accompaniment spectra:
                    numberOfFilters=K,
                    numberOfAccompanimentSpectralShapes=R,
                    # if any, initial amplitude matrices:
                    HGAMMA0=HGAMMA, HPHI0=HPHI,
                    HF00=HUF0,
                    WM0=None,  # WM,
                    HM0=None,  # HM,
                    # Some more optional arguments, to control the
                    # "convergence" of the algo:
                    numberOfIterations=niter, updateRulePower=1.0,
                    stepNotes=stepNotes,
                    lambdaHF0=0.0 / (1.0 * SXR.max()), alphaHF0=0.9,
                    verbose=options.verbose,
                    displayEvolution=displayEvolution,
                    updateHGAMMA=False)

            WPHI = np.dot(WGAMMA, HGAMMA)
            SPHI = np.dot(WPHI, HPHI)
            SF0 = np.dot(WUF0, HF0)

            hatSXR = (alphaR ** 2) * SF0 * SPHI \
                     + np.dot(np.dot(WM, betaR ** 2), HM)
            hatSXL = (alphaL ** 2) * SF0 * SPHI \
                     + np.dot(np.dot(WM, betaL ** 2), HM)

            hatVR = (alphaR ** 2) * SPHI * SF0 / hatSXR * XR
            vestR = istft(hatVR, hopsize=hopsize, nfft=NFT,
                          window=sinebell(windowSizeInSamples)) / 4.0
            hatVL = (alphaL ** 2) * SPHI * SF0 / hatSXL * XL
            vestL = istft(hatVL, hopsize=hopsize, nfft=NFT,
                          window=sinebell(windowSizeInSamples)) / 4.0

            outputFileName = options.voc_output_file[:-4] + "_VUIMM.wav"
            vestR = np.array(np.round(vestR * scaleData), dtype=dataType)
            vestL = np.array(np.round(vestL * scaleData), dtype=dataType)
            wav.write(outputFileName, Fs, np.array([vestR, vestL]).T)
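            # NB: the division by 4.0 after each istft call compensates
            # the overlap-add gain: with a sinebell (sine) window used
            # for both analysis and synthesis, and assuming the
            # recommended hop of 1/8th of the window, the squared-window
            # overlap sum is 8 * mean(sin ** 2) = 4.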
            hatMR = np.dot(np.dot(WM, betaR ** 2), HM) / hatSXR * XR
            mestR = istft(hatMR, hopsize=hopsize, nfft=NFT,
                          window=sinebell(windowSizeInSamples)) / 4.0
            hatML = np.dot(np.dot(WM, betaL ** 2), HM) / hatSXL * XL
            mestL = istft(hatML, hopsize=hopsize, nfft=NFT,
                          window=sinebell(windowSizeInSamples)) / 4.0

            outputFileName = options.mus_output_file[:-4] + "_VUIMM.wav"
            mestR = np.array(np.round(mestR * scaleData), dtype=dataType)
            mestL = np.array(np.round(mestL * scaleData), dtype=dataType)
            wav.write(outputFileName, Fs, np.array([mestR, mestL]).T)
        else:
            # running on monophonic data:
            HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM(
                # the data to be fitted to:
                SX,
                # the basis matrices for the spectral combs:
                WF0effective,
                # and for the elementary filters:
                WGAMMA,
                # number of desired filters, accompaniment spectra:
                numberOfFilters=K,
                numberOfAccompanimentSpectralShapes=R,
                # if any, initial amplitude matrices:
                HGAMMA0=None, HPHI0=None,
                HF00=HF00effective,
                WM0=None, HM0=None,
                # Some more optional arguments, to control the
                # "convergence" of the algo:
                numberOfIterations=niter, updateRulePower=1.0,
                stepNotes=stepNotes,
                lambdaHF0=0.0 / (1.0 * SX.max()), alphaHF0=0.9,
                verbose=options.verbose,
                displayEvolution=displayEvolution)

            WPHI = np.dot(WGAMMA, HGAMMA)
            SPHI = np.dot(WPHI, HPHI)
            SF0 = np.dot(WF0effective, HF0)
            SM = np.dot(WM, HM)
            hatSX = SF0 * SPHI + SM

            hatV = SPHI * SF0 / hatSX * X
            vest = istft(hatV, hopsize=hopsize, nfft=NFT,
                         window=sinebell(windowSizeInSamples)) / 4.0
            vest = np.array(np.round(vest * scaleData), dtype=dataType)
            wav.write(options.voc_output_file, Fs, vest)

            hatM = SM / hatSX * X
            mest = istft(hatM, hopsize=hopsize, nfft=NFT,
                         window=sinebell(windowSizeInSamples)) / 4.0
            mest = np.array(np.round(mest * scaleData), dtype=dataType)
            wav.write(options.mus_output_file, Fs, mest)

            del hatM, vest, mest, hatV, hatSX, SPHI, SF0

            # adding the unvoiced part in the source basis:
            WUF0 = np.hstack([WF0, np.ones([WF0.shape[0], 1])])
            HUF0 = np.vstack([HF0, np.ones([1, HF0.shape[1]])])
            ## HUF0[-1, :] = HF0.sum(axis=0) # should we do this?

            HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM(
                # the data to be fitted to:
                SX,
                # the basis matrices for the spectral combs:
                WUF0,
                # and for the elementary filters:
                WGAMMA,
                # number of desired filters, accompaniment spectra:
                numberOfFilters=K,
                numberOfAccompanimentSpectralShapes=R,
                # if any, initial amplitude matrices:
                HGAMMA0=HGAMMA, HPHI0=HPHI,
                HF00=HUF0,
                WM0=None, HM0=None,
                # Some more optional arguments, to control the
                # "convergence" of the algo:
                numberOfIterations=niter, updateRulePower=1.0,
                stepNotes=stepNotes,
                lambdaHF0=0.0 / (1.0 * SX.max()), alphaHF0=0.9,
                verbose=options.verbose,
                displayEvolution=displayEvolution,
                updateHGAMMA=False)

            WPHI = np.dot(WGAMMA, HGAMMA)
            SPHI = np.dot(WPHI, HPHI)
            SF0 = np.dot(WUF0, HF0)
            SM = np.dot(WM, HM)
            hatSX = SF0 * SPHI + SM

            hatV = SPHI * SF0 / hatSX * X
            vest = istft(hatV, hopsize=hopsize, nfft=NFT,
                         window=sinebell(windowSizeInSamples)) / 4.0
            vest = np.array(np.round(vest * scaleData), dtype=dataType)
            outputFileName = options.voc_output_file[:-4] + "_VUIMM.wav"
            wav.write(outputFileName, Fs, vest)

            hatM = SM / hatSX * X
            mest = istft(hatM, hopsize=hopsize, nfft=NFT,
                         window=sinebell(windowSizeInSamples)) / 4.0
            mest = np.array(np.round(mest * scaleData), dtype=dataType)
            outputFileName = options.mus_output_file[:-4] + "_VUIMM.wav"
            wav.write(outputFileName, Fs, mest)

    if displayEvolution:
        plt.close('all')

    print "Done!"
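# A hypothetical invocation of the function above, assuming this module
# is run as a script (the file name here is a placeholder; the options
# are the ones declared by the optparse parser earlier in this file):
#
#     python separateLeadStereo.py --nb-iterations 30 inputAudioFile.wav
#
# With the display option enabled, the evolution of HF0 is plotted while
# the multiplicative updates run.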
def main():
    import optparse

    usage = "usage: %prog [options] inputAudioFile"
    parser = optparse.OptionParser(usage)
    # Name of the output files:
    parser.add_option("-v", "--vocal-output-file",
                      dest="voc_output_file", type="string",
                      help="name of the audio output file for the estimated\n"
                           "solo (vocal) part",
                      default="estimated_solo.wav")
    parser.add_option("-m", "--music-output-file",
                      dest="mus_output_file", type="string",
                      help="name of the audio output file for the estimated\n"
                           "music part",
                      default="estimated_music.wav")
    parser.add_option("-p", "--pitch-output-file",
                      dest="pitch_output_file", type="string",
                      help="name of the output file for the estimated pitches",
                      default="pitches.txt")
    # Some more optional options:
    parser.add_option("-d", "--with-display", dest="displayEvolution",
                      action="store_true", help="display the figures",
                      default=False)
    parser.add_option("-q", "--quiet", dest="verbose",
                      action="store_false",
                      help="use to mute all the verbose output",
                      default=True)
    parser.add_option("--nb-iterations", dest="nbiter",
                      help="number of iterations", type="int", default=50)
    parser.add_option("--window-size", dest="windowSize", type="float",
                      default=0.04644,
                      help="size of analysis windows, in s.")
    parser.add_option("--Fourier-size", dest="fourierSize", type="int",
                      default=2048,
                      help="size of Fourier transforms, in samples.")
    parser.add_option("--hopsize", dest="hopsize", type="float",
                      default=0.0058,
                      help="size of the hop between analysis windows, in s.")
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("incorrect number of arguments, use option -h for help.")

    displayEvolution = options.displayEvolution
    if displayEvolution:
        import matplotlib.pyplot as plt
        import imageMatlab

        plt.rc('text', usetex=True)
        plt.rc('image', cmap='gray_r')
        plt.ion()

    # Compulsory option: name of the input file:
    inputAudioFile = args[0]
    fs, data = wav.read(inputAudioFile)
    # data, fs, enc = scikits.audiolab.wavread(inputAudioFile)
    if data.shape[0] != data.size:  # data is multi-channel
        data = np.mean(data, axis=1)  # downmix to mono

    # Processing the options:
    windowSizeInSamples = np.round(options.windowSize * fs)
    hopsize = np.round(options.hopsize * fs)
    NFT = options.fourierSize
    niter = options.nbiter

    if options.verbose:
        print "Size of analysis windows: ", windowSizeInSamples
        print "Hopsize: ", hopsize
        print "Size of Fourier transforms: ", NFT
        print "Number of iterations to be done: ", niter

    X, F, N = stft(data, fs=fs, hopsize=hopsize,
                   window=sinebell(windowSizeInSamples), nfft=NFT)
    # SX is the power spectrogram:
    SX = np.maximum(np.abs(X) ** 2, 10 ** -8)

    del data, F, N

    # TODO: also process these as options:
    minF0 = 100
    maxF0 = 800
    Fs = fs
    F, N = SX.shape
    stepNotes = 20  # this is the number of F0s within one semitone
    K = 50  # number of spectral shapes for the filter part
    R = 40  # number of spectral shapes for the accompaniment
    P = 30  # number of elements in dictionary of smooth filters
    chirpPerF0 = 1  # number of chirped spectral shapes between each F0
                    # this feature should be further studied before
                    # we find a good way of doing that.
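    # The decomposition fitted below is the SIMM model (smoothed
    # instantaneous mixture model) used throughout this file:
    #
    #     SX ~ (WGAMMA . HGAMMA . HPHI) * (WF0 . HF0) + WM . HM
    #
    # i.e. an elementwise product of a smooth-filter part and a harmonic
    # source part for the lead, plus a low-rank spectrogram for the
    # accompaniment.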
    # Create the harmonic combs, for each F0 between minF0 and maxF0:
    # NB: Nfft and lengthWindow are hardcoded to 2048 here; NFT and
    # windowSizeInSamples above only match for the default option values.
    F0Table, WF0 = \
        generate_WF0_chirped(minF0, maxF0, Fs, Nfft=2048,
                             stepNotes=stepNotes,
                             lengthWindow=2048, Ot=0.25,
                             perF0=chirpPerF0,
                             depthChirpInSemiTone=.15)
    WF0 = WF0[0:F, :]  # ensure same size as SX
    NF0 = F0Table.size  # number of harmonic combs
    # Normalization:
    WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0))

    # Create the dictionary of smooth filters, for the filter part of
    # the lead instrument:
    WGAMMA = generateHannBasis(F, 2048, Fs=fs, frequencyScale='linear',
                               numberOfBasis=P, overlap=.75)

    if displayEvolution:
        plt.figure(1)
        plt.clf()
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Leading source number $u$', fontsize=16)
        plt.ion()
        # plt.show()
        raw_input("Press Return to resume the program.\n"
                  "Be sure that the figure has been already displayed,\n"
                  "so that the evolution of HF0 will be visible.")

    # First round of parameter estimation:
    HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM(
        # the data to be fitted to:
        SX,
        # the basis matrices for the spectral combs:
        WF0,
        # and for the elementary filters:
        WGAMMA,
        # number of desired filters, accompaniment spectra:
        numberOfFilters=K,
        numberOfAccompanimentSpectralShapes=R,
        # if any, initial amplitude matrices:
        HGAMMA0=None, HPHI0=None,
        HF00=None,
        WM0=None, HM0=None,
        # Some more optional arguments, to control the "convergence"
        # of the algo:
        numberOfIterations=niter, updateRulePower=1.,
        stepNotes=stepNotes,
        lambdaHF0=0.0 / (1.0 * SX.max()), alphaHF0=0.9,
        verbose=options.verbose, displayEvolution=displayEvolution)

    if displayEvolution:
        # display the first 4 estimated filter shapes:
        plt.figure(3)
        plt.clf()
        for nf in range(4):
            plt.subplot(2, 2, nf + 1)
            plt.plot(db(np.dot(WGAMMA, HGAMMA[:, nf])))
            plt.xticks(fontsize=16)
            plt.yticks(fontsize=16)
            plt.ylim([-30, 0])
            plt.axis("tight")

        # estimated filter spectrogram of the lead:
        plt.figure(4)
        plt.clf()
        imageMatlab.imageM(db(np.dot(np.dot(WGAMMA, HGAMMA), HPHI)))
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Frequency bin number $f$', fontsize=16)
        cb = plt.colorbar(fraction=0.04)
        plt.axes(cb.ax)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)

        # estimated F0 activations:
        plt.figure(5)
        plt.clf()
        imageMatlab.imageM(db(HF0), vmin=-100, cmap=plt.cm.gray_r)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Leading source number $u$', fontsize=16)
        # plt.xlim([3199.5, 3500.5]) # For detailed picture of HF0
        cb = plt.colorbar(fraction=0.04)
        plt.axes(cb.ax)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)

        # estimated accompaniment spectrogram:
        plt.figure(6)
        plt.clf()
        imageMatlab.imageM(db(np.dot(WM, HM)), vmin=-50)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Frequency bin number $f$', fontsize=16)
        cb = plt.colorbar(fraction=0.04)
        plt.axes(cb.ax)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)

        # estimated accompaniment basis:
        plt.figure(7)
        plt.clf()
        imageMatlab.imageM(db(WM), vmin=-50)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Element number $r$', fontsize=16)
        plt.ylabel(r'Frequency bin number $f$', fontsize=16)
        cb = plt.colorbar(fraction=0.04)
        plt.axes(cb.ax)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)

    if displayEvolution:
        h2 = plt.figure(2)
        plt.clf()
        imageMatlab.imageM(20 * np.log10(HF0))
        matMax = (20 * np.log10(HF0)).max()
        matMed = np.median(20 * np.log10(HF0))
        plt.clim([matMed - 100, matMax])

    # Viterbi decoding to estimate the predominant fundamental
    # frequency line:
    scale = 1.0
    transitions = np.exp(-np.floor(np.arange(0, NF0) / stepNotes) * scale)
    cutoffnote = 2 * 5 * stepNotes
    transitions[cutoffnote:] = transitions[cutoffnote - 1]

    transitionMatrixF0 = np.zeros([NF0 + 1, NF0 + 1])  # toeplitz matrix
    b = np.arange(NF0)
    transitionMatrixF0[0:NF0, 0:NF0] = \
        transitions[np.array(np.abs(np.outer(np.ones(NF0), b)
                                    - np.outer(b, np.ones(NF0))),
                             dtype=int)]
    pf_0 = transitions[cutoffnote - 1] * 10 ** (-90)
    p0_0 = transitions[cutoffnote - 1] * 10 ** (-100)
    p0_f = transitions[cutoffnote - 1] * 10 ** (-80)
    transitionMatrixF0[0:NF0, NF0] = pf_0
    transitionMatrixF0[NF0, 0:NF0] = p0_f
    transitionMatrixF0[NF0, NF0] = p0_0
    sumTransitionMatrixF0 = np.sum(transitionMatrixF0, axis=1)
    transitionMatrixF0 = transitionMatrixF0 \
        / np.outer(sumTransitionMatrixF0, np.ones(NF0 + 1))

    priorProbabilities = 1 / (NF0 + 1.0) * np.ones([NF0 + 1])
    logHF0 = np.zeros([NF0 + 1, N])
    normHF0 = np.amax(HF0, axis=0)
    barHF0 = np.array(HF0)

    logHF0[0:NF0, :] = np.log(barHF0)
    logHF0[0:NF0, normHF0 == 0] = np.amin(logHF0[logHF0 > -np.Inf])
    logHF0[NF0, :] = np.maximum(np.amin(logHF0[logHF0 > -np.Inf]), -100)

    indexBestPath = viterbiTrackingArray(
        logHF0, np.log(priorProbabilities), np.log(transitionMatrixF0))

    # save the estimated pitches; state NF0 is the "no pitch" state and
    # would overflow F0Table, so those frames are written out as 0 Hz:
    pitches = np.zeros(N)
    voicedFrames = indexBestPath < NF0
    pitches[voicedFrames] = \
        F0Table[np.array(indexBestPath[voicedFrames], dtype=int)]
    np.savetxt(options.pitch_output_file,
               np.array([np.arange(N) * options.hopsize, pitches]).T)

    if displayEvolution:
        h2.hold(True)
        plt.plot(indexBestPath, '-b')
        h2.hold(False)
        plt.axis('tight')
        raw_input("Press Return to resume the program...")

    del logHF0

    # Second round of parameter estimation, with specific
    # initial HF00:
    HF00 = np.zeros([NF0 * chirpPerF0, N])
    scopeAllowedHF0 = 1.0
    # half-width (in F0 bins) of the band kept around the melody, and
    # total number of kept bins per frame:
    halfWidth = int(np.floor(stepNotes / scopeAllowedHF0))
    combWidth = chirpPerF0 * (2 * halfWidth + 1)

    # indexes for HF00:
    # TODO: reprogram this with a 'where'?...
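    # The two index arrays built below select, for every frame, the rows
    # of HF00 within stepNotes / scopeAllowedHF0 bins of the decoded
    # path, i.e. about one semitone on each side of the melody, so that
    # the second estimation round is restricted to the tracked melody
    # line.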
    dim1index = np.array(
        np.maximum(
            np.minimum(
                np.outer(chirpPerF0 * indexBestPath,
                         np.ones(combWidth))
                + np.outer(np.ones(N),
                           np.arange(-chirpPerF0 * halfWidth,
                                     chirpPerF0 * (halfWidth + 1))),
                chirpPerF0 * NF0 - 1),
            0),
        dtype=int).reshape(1, N * combWidth)
    dim2index = np.outer(np.arange(N),
                         np.ones(combWidth, dtype=int)
                         ).reshape(1, N * combWidth)

    HF00[dim1index, dim2index] = 1  # HF0.max()
    HF00[:, indexBestPath == (NF0 - 1)] = 0.0

    WF0effective = WF0
    HF00effective = HF00
    del HF0, HGAMMA, HPHI, HM, WM, HF00

    HGAMMA, HPHI, HF0, HM, WM, recoError2 = SIMM.SIMM(
        # the data to be fitted to:
        SX,
        # the basis matrices for the spectral combs:
        WF0effective,
        # and for the elementary filters:
        WGAMMA,
        # number of desired filters, accompaniment spectra:
        numberOfFilters=K,
        numberOfAccompanimentSpectralShapes=R,
        # if any, initial amplitude matrices:
        HGAMMA0=None, HPHI0=None,
        HF00=HF00effective,
        WM0=None, HM0=None,
        # Some more optional arguments, to control the "convergence"
        # of the algo:
        numberOfIterations=niter, updateRulePower=1.0,
        stepNotes=stepNotes,
        lambdaHF0=0.0 / (1.0 * SX.max()), alphaHF0=0.9,
        verbose=options.verbose, displayEvolution=displayEvolution)

    WPHI = np.dot(WGAMMA, HGAMMA)
    SPHI = np.dot(WPHI, HPHI)
    SF0 = np.dot(WF0effective, HF0)
    SM = np.dot(WM, HM)
    hatSX = SPHI * SF0 + SM

    hatV = SPHI * SF0 / hatSX * X
    vest = istft(hatV, hopsize=hopsize, nfft=NFT,
                 window=sinebell(windowSizeInSamples)) / 4.0
    # scikits.audiolab.wavwrite(vest, options.voc_output_file, fs)
    wav.write(options.voc_output_file, fs, vest)

    hatM = SM / hatSX * X
    mest = istft(hatM, hopsize=hopsize, nfft=NFT,
                 window=sinebell(windowSizeInSamples)) / 4.0
    # scikits.audiolab.wavwrite(mest, options.mus_output_file, fs)
    wav.write(options.mus_output_file, fs, mest)

    if displayEvolution:
        # display the first 4 re-estimated filter shapes:
        plt.figure(13)
        plt.clf()
        for nf in range(4):
            plt.subplot(2, 2, nf + 1)
            plt.plot(db(np.dot(WGAMMA, HGAMMA[:, nf])))
            plt.xticks(fontsize=16)
            plt.yticks(fontsize=16)
            plt.ylim([-30, 0])
            plt.axis("tight")

        # re-estimated filter spectrogram of the lead:
        plt.figure(14)
        plt.clf()
        imageMatlab.imageM(db(np.dot(np.dot(WGAMMA, HGAMMA), HPHI)))
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Frequency bin number $f$', fontsize=16)
        cb = plt.colorbar(fraction=0.04)
        plt.axes(cb.ax)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)

        # estimated lead spectrogram (filter part + source part, in dB):
        plt.figure(141)
        plt.clf()
        SVhat = db(np.dot(np.dot(WGAMMA, HGAMMA), HPHI)) \
                + db(np.dot(WF0, HF0))
        imageMatlab.imageM(SVhat, vmax=SVhat.max(), vmin=SVhat.max() - 50)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Frequency bin number $f$', fontsize=16)
        cb = plt.colorbar(fraction=0.04)
        plt.axes(cb.ax)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        # re-estimated F0 activations:
        plt.figure(15)
        plt.clf()
        imageMatlab.imageM(db(HF0), vmin=-100, cmap=plt.cm.gray_r)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Leading source number $u$', fontsize=16)
        cb = plt.colorbar(fraction=0.04)
        plt.axes(cb.ax)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        # plt.xlim([3199.5, 3500.5]) # For detailed picture of HF0

        # re-estimated accompaniment spectrogram:
        plt.figure(16)
        plt.clf()
        imageMatlab.imageM(db(np.dot(WM, HM)),
                           vmin=np.maximum(-50, db(SM.min())))
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Frequency bin number $f$', fontsize=16)
        cb = plt.colorbar(fraction=0.04)
        plt.axes(cb.ax)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)

        # re-estimated accompaniment basis:
        plt.figure(17)
        plt.clf()
        imageMatlab.imageM(db(WM), vmin=-50)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Element number $r$', fontsize=16)
        plt.ylabel(r'Frequency bin number $f$', fontsize=16)
        cb = plt.colorbar(fraction=0.04)
        plt.axes(cb.ax)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)

        raw_input("Press Return to end the program...")

    print "Done!"
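# NB: db(), used by the display code above, is not defined in this part
# of the file; it is assumed to convert spectrogram values to decibels.
# A minimal sketch of such a helper (using the 20x convention for
# amplitudes; power spectra would use 10x instead):
#
#     def db(val):
#         """Amplitude values mapped to decibels, 20 * log10(|val|)."""
#         return 20 * np.log10(np.abs(np.asarray(val)) + 1e-300)
#
# where the small additive constant only avoids log10(0) warnings.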