def _f0_neighbourhood_indices(indexBestPath, nbFrames, nbF0, chirpPerF0,
                              stepNotes, scopeAllowedHF0):
    """Return (rows, cols) index arrays around a tracked F0 path.

    For every frame, `rows` lists the source-basis indices lying within
    floor(stepNotes / scopeAllowedHF0) steps of the tracked index, clipped
    to [0, chirpPerF0 * nbF0 - 1]; `cols` holds the matching frame number.
    Both arrays have shape (nbFrames, neighbourhood width).
    """
    # np.floor returns a float; sizes and ranges must be true integers.
    halfWidth = int(np.floor(stepNotes / scopeAllowedHF0))
    offsets = np.arange(-chirpPerF0 * halfWidth,
                        chirpPerF0 * (halfWidth + 1))
    path = np.asarray(indexBestPath, dtype=int)
    rows = np.clip(chirpPerF0 * path[:, np.newaxis]
                   + offsets[np.newaxis, :],
                   0, chirpPerF0 * nbF0 - 1)
    cols = np.repeat(np.arange(nbFrames)[:, np.newaxis],
                     offsets.size, axis=1)
    return rows, cols


def _stereo_wiener_masks(WF0, WGAMMA, alphaR, alphaL, HGAMMA, HPHI, HF0,
                         betaR, betaL, HM, WM):
    """Return the lead and accompaniment masks of both channels.

    The result is (leadMaskR, leadMaskL, accMaskR, accMaskL): each mask is
    the ratio of one component's model spectrum to the full model spectrum
    of that channel.
    """
    SPHI = np.dot(np.dot(WGAMMA, HGAMMA), HPHI)
    SF0 = np.dot(WF0, HF0)
    leadR = (alphaR ** 2) * SF0 * SPHI
    leadL = (alphaL ** 2) * SF0 * SPHI
    accR = np.dot(np.dot(WM, betaR ** 2), HM)
    accL = np.dot(np.dot(WM, betaL ** 2), HM)
    hatSXR = leadR + accR
    hatSXL = leadL + accL
    return leadR / hatSXR, leadL / hatSXL, accR / hatSXR, accL / hatSXL


def _resynthesize(spectrum, hopsize, NFT, windowSizeInSamples):
    """Invert one STFT with a sinebell window, scaled by 1/4 as before."""
    return istft(spectrum, hopsize=hopsize, nfft=NFT,
                 window=sinebell(windowSizeInSamples)) / 4.0


def _to_pcm_stereo(first, second, scaleData, dataType):
    """Rescale two channel estimates and stack them as two columns.

    The estimates are multiplied back by `scaleData`, rounded and cast to
    `dataType`. For integer sample types the values are clipped to the
    representable range first, so an overshoot cannot wrap around.
    """
    samples = np.round(np.array([first, second]).T * scaleData)
    if np.issubdtype(dataType, np.integer):
        limits = np.iinfo(dataType)
        samples = np.clip(samples, limits.min, limits.max)
    return np.array(samples, dtype=dataType)


def main():
    """Separate the lead (solo/vocal) part from the accompaniment.

    Command-line entry point: parses the options and the single positional
    argument (the input WAV file) from sys.argv, then

    1. estimates the melody line (SIMM.SIMM followed by Viterbi tracking),
       or reads it from the file given with --with-melody;
    2. writes the time/F0 track to the pitch output file (the F0 is
       negated on frames where no melody is retained);
    3. runs SIMM.Stereo_SIMM and writes the lead and accompaniment
       estimates to the vocal/music output files;
    4. runs SIMM.Stereo_SIMM again with an extra all-ones "unvoiced"
       element in the source basis and writes the '_VUIMM.wav' variants.

    A mono input is duplicated on two channels; an input with more than
    two channels is reduced to its first two.

    Raises:
        ValueError: if the melody file does not hold two columns.
    """
    import optparse
    import os

    usage = "usage: %prog [options] inputAudioFile"
    parser = optparse.OptionParser(usage)
    # Name of the output files:
    parser.add_option("-v", "--vocal-output-file",
                      dest="voc_output_file", type="string",
                      help="name of the audio output file for the estimated\n"
                           "solo (vocal) part",
                      default="estimated_solo.wav")
    parser.add_option("-m", "--music-output-file",
                      dest="mus_output_file", type="string",
                      help="name of the audio output file for the estimated\n"
                           "music part",
                      default="estimated_music.wav")
    parser.add_option("-p", "--pitch-output-file",
                      dest="pitch_output_file", type="string",
                      help="name of the output file for the estimated pitches",
                      default="pitches.txt")
    # Some more optional options:
    parser.add_option("-d", "--with-display", dest="displayEvolution",
                      action="store_true", help="display the figures",
                      default=False)
    parser.add_option("-q", "--quiet", dest="verbose",
                      action="store_false",
                      help="use to quiet all output verbose",
                      default=True)
    parser.add_option("--nb-iterations", dest="nbiter",
                      help="number of iterations", type="int",
                      default=100)
    parser.add_option("--window-size", dest="windowSize", type="float",
                      default=0.04644,
                      help="size of analysis windows, in s.")
    parser.add_option("--Fourier-size", dest="fourierSize", type="int",
                      default=2048,
                      help="size of Fourier transforms, "
                           "in samples.")
    parser.add_option("--hopsize", dest="hopsize", type="float",
                      default=0.0058,
                      help="size of the hop between analysis windows, in s.")
    parser.add_option("--nb-accElements", dest="R", type="float",
                      default=40.0,
                      help="number of elements for the accompaniment.")
    parser.add_option("--with-melody", dest="melody", type="string",
                      default=None,
                      help="provide the melody in a file named MELODY, "
                           "with at each line: <time (s)><F0 (Hz)>.")

    (options, args) = parser.parse_args()
    if len(args) != 1:
        parser.error("incorrect number of arguments, use option -h for help.")

    displayEvolution = options.displayEvolution
    if displayEvolution:
        import matplotlib.pyplot as plt
        import imageMatlab
        plt.rc('image', cmap='jet')
        plt.ion()

    # Compulsory option: name of the input file:
    inputAudioFile = args[0]
    fs, data = wav.read(inputAudioFile)
    scaleData = 1.2 * data.max()  # to rescale the data.
    dataType = data.dtype
    data = np.double(data) / scaleData

    if data.ndim == 1 or data.shape[1] == 1:
        # Mono input: duplicate the single channel. Doing this
        # unconditionally would fail on input that is already stereo.
        mono = data.reshape(-1)
        data = np.column_stack((mono, mono))
    elif data.shape[1] != 2:
        print("The data is multichannel, but not stereo... \n")
        print("Unfortunately this program does not scale well. Data is \n")
        print("reduced to its 2 first channels.\n")
        data = data[:, 0:2]

    # Processing the options:
    windowSizeInSamples = np.round(options.windowSize * fs)
    hopsize = np.round(options.hopsize * fs)
    NFT = options.fourierSize
    niter = options.nbiter
    R = options.R

    if options.verbose:
        print("Some parameter settings:")
        print(" Size of analysis windows:  %s" % (windowSizeInSamples,))
        print(" Hopsize:  %s" % (hopsize,))
        print(" Size of Fourier transforms:  %s" % (NFT,))
        print(" Number of iterations to be done:  %s" % (niter,))
        print(" Number of elements in WM:  %s" % (R,))

    # Channel 0 is called "R" and channel 1 "L" throughout; the outputs
    # are written back in that same column order.
    XR = stft(data[:, 0], fs=fs, hopsize=hopsize,
              window=sinebell(windowSizeInSamples), nfft=NFT)[0]
    XL = stft(data[:, 1], fs=fs, hopsize=hopsize,
              window=sinebell(windowSizeInSamples), nfft=NFT)[0]
    del data

    # SX is the power spectrogram:
    SXR = np.abs(XR) ** 2
    SXL = np.abs(XL) ** 2

    # TODO: also process these as options:
    eps = 10 ** -9
    minF0 = 100
    maxF0 = 800
    Fs = fs
    F, N = SXR.shape
    stepNotes = 20  # this is the number of F0s within one semitone
    K = 10  # number of spectral shapes for the filter part
    P = 30  # number of elements in dictionary of smooth filters
    chirpPerF0 = 1  # number of chirped spectral shapes between each F0
    scopeAllowedHF0 = 2.0 / 1.0

    # Create the harmonic combs, for each F0 between minF0 and maxF0:
    F0Table, WF0 = generate_WF0_chirped(minF0, maxF0, Fs, Nfft=NFT,
                                        stepNotes=stepNotes,
                                        lengthWindow=windowSizeInSamples,
                                        Ot=0.25,
                                        perF0=chirpPerF0,
                                        depthChirpInSemiTone=.15,
                                        loadWF0=True,
                                        analysisWindow='sinebell')
    WF0 = WF0[0:F, :]  # ensure same size as SX
    NF0 = F0Table.size  # number of harmonic combs
    # Normalization: every comb is scaled to a maximum of 1.
    WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0))

    # Create the dictionary of smooth filters, for the filter part of
    # the lead instrument:
    WGAMMA = generateHannBasis(F, NFT, Fs=fs, frequencyScale='linear',
                               numberOfBasis=P, overlap=.75)

    if displayEvolution:
        plt.figure(1)
        plt.clf()
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Leading source number $u$', fontsize=16)
        plt.ion()

    if options.melody is None:
        # Section to estimate the melody, on the monophonic algorithm:
        SX = np.maximum(np.abs((XR + XL) / 2.0) ** 2, 10 ** -8)
        # First round of parameter estimation:
        HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM(
            # the data to be fitted to:
            SX,
            # the basis matrices for the spectral combs
            WF0,
            # and for the elementary filters:
            WGAMMA,
            # number of desired filters, accompaniment spectra:
            numberOfFilters=K, numberOfAccompanimentSpectralShapes=R,
            # no initial amplitude matrices:
            HGAMMA0=None, HPHI0=None,
            HF00=None,
            WM0=None, HM0=None,
            # Some more optional arguments, to control the "convergence"
            # of the algo
            numberOfIterations=niter, updateRulePower=1.,
            stepNotes=stepNotes,
            lambdaHF0=0.0 / (1.0 * SX.max()), alphaHF0=0.9,
            verbose=options.verbose, displayEvolution=displayEvolution)

        if displayEvolution:
            h2 = plt.figure(2)
            plt.clf()
            imageMatlab.imageM(20 * np.log10(HF0))
            matMax = (20 * np.log10(HF0)).max()
            matMed = np.median(20 * np.log10(HF0))
            plt.clim([matMed - 100, matMax])

        # Viterbi decoding to estimate the predominant fundamental
        # frequency line
        scale = 1.0
        transitions = np.exp(-np.floor(np.arange(0, NF0) / stepNotes)
                             * scale)
        cutoffnote = 2 * 5 * stepNotes
        transitions[cutoffnote:] = transitions[cutoffnote - 1]

        # Toeplitz block for the F0 states, plus one extra state (index
        # NF0) that is given far smaller transition weights.
        transitionMatrixF0 = np.zeros([NF0 + 1, NF0 + 1])
        b = np.arange(NF0)
        transitionMatrixF0[0:NF0, 0:NF0] = \
            transitions[np.abs(b[np.newaxis, :] - b[:, np.newaxis])]
        pf_0 = transitions[cutoffnote - 1] * 10 ** (-90)
        p0_0 = transitions[cutoffnote - 1] * 10 ** (-100)
        p0_f = transitions[cutoffnote - 1] * 10 ** (-80)
        transitionMatrixF0[0:NF0, NF0] = pf_0
        transitionMatrixF0[NF0, 0:NF0] = p0_f
        transitionMatrixF0[NF0, NF0] = p0_0

        # Each row is normalized to sum to one.
        sumTransitionMatrixF0 = np.sum(transitionMatrixF0, axis=1)
        transitionMatrixF0 = transitionMatrixF0 \
            / sumTransitionMatrixF0[:, np.newaxis]

        priorProbabilities = 1 / (NF0 + 1.0) * np.ones([NF0 + 1])
        logHF0 = np.zeros([NF0 + 1, N])
        normHF0 = np.amax(HF0, axis=0)
        logHF0[0:NF0, :] = np.log(np.array(HF0))
        # np.inf is used because the np.Inf alias no longer exists in
        # NumPy 2.
        logHF0[0:NF0, normHF0 == 0] = np.amin(logHF0[logHF0 > -np.inf])
        logHF0[NF0, :] = np.maximum(np.amin(logHF0[logHF0 > -np.inf]),
                                    -100)

        indexBestPath = viterbiTrackingArray(
            logHF0, np.log(priorProbabilities),
            np.log(transitionMatrixF0), verbose=options.verbose)

        if displayEvolution:
            # Figure.hold was removed in Matplotlib 3.0; only call it
            # where it still exists.
            legacyHold = hasattr(h2, 'hold')
            if legacyHold:
                h2.hold(True)
            plt.plot(indexBestPath, '-b')
            if legacyHold:
                h2.hold(False)
            plt.axis('tight')

        del logHF0

        # Detection of silences: keep only the activations close to the
        # tracked path, and measure the melody energy they explain.
        HF00 = np.zeros([NF0 * chirpPerF0, N])
        rows, cols = _f0_neighbourhood_indices(
            indexBestPath, N, NF0, chirpPerF0, stepNotes, scopeAllowedHF0)
        HF00[rows, cols] = HF0[rows, cols]
        HF00[:, indexBestPath == (NF0 - 1)] = 0.0
        HF00[:, indexBestPath == 0] = 0.0

        thres_energy = 0.000584
        SF0 = np.maximum(np.dot(WF0, HF00), eps)
        SPHI = np.maximum(np.dot(WGAMMA, np.dot(HGAMMA, HPHI)), eps)
        SM = np.maximum(np.dot(WM, HM), eps)
        hatSX = np.maximum(SPHI * SF0 + SM, eps)

        energyMel = np.sum(np.abs((SPHI * SF0) / hatSX
                                  * (XR + XL) * 0.5) ** 2, axis=0)
        energyMelSorted = np.sort(energyMel)
        energyMelCumul = np.cumsum(energyMelSorted)
        energyMelCumulNorm = energyMelCumul / max(energyMelCumul[-1], eps)

        # First position whose normalized cumulative energy exceeds the
        # threshold. When none does, fall back to the last valid index
        # instead of raising IndexError.
        aboveThreshold = np.nonzero(energyMelCumulNorm > thres_energy)[0]
        if aboveThreshold.size:
            ind_999 = aboveThreshold[0]
        else:
            ind_999 = N - 1

        # NOTE(review): energyMel is compared with a value read from the
        # normalized cumulative curve, as in the original code - confirm
        # that this is the intended threshold.
        melNotPresent = (energyMel <= energyMelCumulNorm[ind_999])
        indexBestPath[melNotPresent] = 0
    else:
        # Take the provided melody line; load it from file:
        melodyFromFile = np.loadtxt(options.melody)
        sizeProvidedMel = melodyFromFile.shape
        if len(sizeProvidedMel) == 1:
            print("The melody should be provided as <Time (s)><F0 (Hz)>.")
            raise ValueError("Bad melody format")

        melTimeStamps = melodyFromFile[:, 0]
        melFreqHz = melodyFromFile[:, 1]
        lowestF0 = melFreqHz[melFreqHz > 40.0].min()
        if minF0 > lowestF0 or maxF0 < melFreqHz.max():
            minF0 = lowestF0 * .97
            maxF0 = np.maximum(melFreqHz.max() * 1.03, 2 * minF0 * 1.03)
            print("Recomputing the source basis for ")
            print("minF0 =  %s Hz and maxF0 =  %s Hz." % (minF0, maxF0))

            # Create the harmonic combs, for each F0 between minF0 and
            # maxF0:
            F0Table, WF0 = generate_WF0_chirped(
                minF0, maxF0, Fs, Nfft=NFT,
                stepNotes=stepNotes,
                lengthWindow=windowSizeInSamples, Ot=0.25,
                perF0=chirpPerF0,
                depthChirpInSemiTone=.15)
            WF0 = WF0[0:F, :]  # ensure same size as SX
            NF0 = F0Table.size  # number of harmonic combs
            # Normalization:
            WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0))

        # For every analysis frame, pick the closest melody time stamp,
        # then the closest entry of F0Table.
        sigTimeStamps = np.arange(N) * hopsize / np.double(Fs)
        distMatTimeStamps = np.abs(sigTimeStamps[np.newaxis, :]
                                   - melTimeStamps[:, np.newaxis])
        minDistTimeStamps = distMatTimeStamps.argmin(axis=0)
        f0BestPath = melFreqHz[minDistTimeStamps]
        distMatF0 = np.abs(f0BestPath[np.newaxis, :]
                           - np.ravel(F0Table)[:, np.newaxis])
        indexBestPath = distMatF0.argmin(axis=0)
        # setting silences to 0, with tolerance = 1/2 window length
        indexBestPath[distMatTimeStamps[minDistTimeStamps, np.arange(N)]
                      >= 0.5 * options.windowSize] = 0
        indexBestPath[f0BestPath <= 0] = 0

    # Frames mapped to index 0 are written with a negated frequency.
    freqMelody = F0Table[np.array(indexBestPath, dtype=int)]
    freqMelody[indexBestPath == 0] = -freqMelody[indexBestPath == 0]
    np.savetxt(options.pitch_output_file,
               np.array([np.arange(N) * hopsize / np.double(Fs),
                         freqMelody]).T)

    # Second round of parameter estimation, with specific initial HF00:
    # ones around the melody on the retained frames, zeros elsewhere.
    HF00 = np.zeros([NF0 * chirpPerF0, N])
    rows, cols = _f0_neighbourhood_indices(
        indexBestPath, N, NF0, chirpPerF0, stepNotes, scopeAllowedHF0)
    voiced = indexBestPath != 0
    HF00[rows[voiced], cols[voiced]] = 1
    HF00[:, indexBestPath == (NF0 - 1)] = 0.0
    HF00[:, indexBestPath == 0] = 0.0

    WF0effective = WF0
    HF00effective = HF00

    if options.melody is None:
        # Drop the first-round results before the stereo estimation.
        del HF0, HGAMMA, HPHI, HM, WM, HF00, SX

    alphaR, alphaL, HGAMMA, HPHI, HF0, \
        betaR, betaL, HM, WM, recoError2 = SIMM.Stereo_SIMM(
            # the data to be fitted to:
            SXR, SXL,
            # the basis matrices for the spectral combs
            WF0effective,
            # and for the elementary filters:
            WGAMMA,
            # number of desired filters, accompaniment spectra:
            numberOfFilters=K, numberOfAccompanimentSpectralShapes=R,
            # initial amplitude matrices:
            HGAMMA0=None, HPHI0=None,
            HF00=HF00effective,
            WM0=None, HM0=None,
            # Some more optional arguments, to control the "convergence"
            # of the algo
            numberOfIterations=niter, updateRulePower=1.0,
            stepNotes=stepNotes,
            lambdaHF0=0.0 / (1.0 * SXR.max()), alphaHF0=0.9,
            verbose=options.verbose, displayEvolution=displayEvolution)

    leadMaskR, leadMaskL, accMaskR, accMaskL = _stereo_wiener_masks(
        WF0effective, WGAMMA, alphaR, alphaL, HGAMMA, HPHI, HF0,
        betaR, betaL, HM, WM)

    vestR = _resynthesize(leadMaskR * XR, hopsize, NFT, windowSizeInSamples)
    vestL = _resynthesize(leadMaskL * XL, hopsize, NFT, windowSizeInSamples)
    wav.write(options.voc_output_file, fs,
              _to_pcm_stereo(vestR, vestL, scaleData, dataType))

    mestR = _resynthesize(accMaskR * XR, hopsize, NFT, windowSizeInSamples)
    mestL = _resynthesize(accMaskL * XL, hopsize, NFT, windowSizeInSamples)
    wav.write(options.mus_output_file, fs,
              _to_pcm_stereo(mestR, mestL, scaleData, dataType))

    del vestR, vestL, mestR, mestL
    del leadMaskR, leadMaskL, accMaskR, accMaskL

    # Adding the unvoiced part in the source basis: one extra all-ones
    # spectral shape, with an all-ones initial activation.
    WUF0 = np.hstack([WF0, np.ones([WF0.shape[0], 1])])
    HUF0 = np.vstack([HF0, np.ones([1, HF0.shape[1]])])

    alphaR, alphaL, HGAMMA, HPHI, HF0, \
        betaR, betaL, HM, WM, recoError3 = SIMM.Stereo_SIMM(
            # the data to be fitted to:
            SXR, SXL,
            # the basis matrices for the spectral combs
            WUF0,
            # and for the elementary filters:
            WGAMMA,
            # number of desired filters, accompaniment spectra:
            numberOfFilters=K, numberOfAccompanimentSpectralShapes=R,
            # initial amplitude matrices come from the previous round:
            HGAMMA0=HGAMMA, HPHI0=HPHI,
            HF00=HUF0,
            WM0=None, HM0=None,
            # Some more optional arguments, to control the "convergence"
            # of the algo
            numberOfIterations=niter, updateRulePower=1.0,
            stepNotes=stepNotes,
            lambdaHF0=0.0 / (1.0 * SXR.max()), alphaHF0=0.9,
            verbose=options.verbose, displayEvolution=displayEvolution,
            updateHGAMMA=False)

    leadMaskR, leadMaskL, accMaskR, accMaskL = _stereo_wiener_masks(
        WUF0, WGAMMA, alphaR, alphaL, HGAMMA, HPHI, HF0,
        betaR, betaL, HM, WM)

    vestR = _resynthesize(leadMaskR * XR, hopsize, NFT, windowSizeInSamples)
    vestL = _resynthesize(leadMaskL * XL, hopsize, NFT, windowSizeInSamples)
    # splitext (rather than [:-4]) copes with any extension length.
    outputFileName = os.path.splitext(options.voc_output_file)[0] \
        + '_VUIMM.wav'
    wav.write(outputFileName, fs,
              _to_pcm_stereo(vestR, vestL, scaleData, dataType))

    mestR = _resynthesize(accMaskR * XR, hopsize, NFT, windowSizeInSamples)
    mestL = _resynthesize(accMaskL * XL, hopsize, NFT, windowSizeInSamples)
    outputFileName = os.path.splitext(options.mus_output_file)[0] \
        + '_VUIMM.wav'
    wav.write(outputFileName, fs,
              _to_pcm_stereo(mestR, mestL, scaleData, dataType))

    if displayEvolution:
        plt.close('all')

    print("Done!")
def _f0_neighbourhood_indices(indexBestPath, nbFrames, nbF0, chirpPerF0,
                              stepNotes, scopeAllowedHF0):
    """Return (rows, cols) index arrays around a tracked F0 path.

    For every frame, `rows` lists the source-basis indices lying within
    floor(stepNotes / scopeAllowedHF0) steps of the tracked index, clipped
    to [0, chirpPerF0 * nbF0 - 1]; `cols` holds the matching frame number.
    Both arrays have shape (nbFrames, neighbourhood width).
    """
    # np.floor returns a float; sizes and ranges must be true integers.
    halfWidth = int(np.floor(stepNotes / scopeAllowedHF0))
    offsets = np.arange(-chirpPerF0 * halfWidth,
                        chirpPerF0 * (halfWidth + 1))
    path = np.asarray(indexBestPath, dtype=int)
    rows = np.clip(chirpPerF0 * path[:, np.newaxis]
                   + offsets[np.newaxis, :],
                   0, chirpPerF0 * nbF0 - 1)
    cols = np.repeat(np.arange(nbFrames)[:, np.newaxis],
                     offsets.size, axis=1)
    return rows, cols


def _stereo_wiener_masks(WF0, WGAMMA, alphaR, alphaL, HGAMMA, HPHI, HF0,
                         betaR, betaL, HM, WM):
    """Return the lead and accompaniment masks of both channels.

    The result is (leadMaskR, leadMaskL, accMaskR, accMaskL): each mask is
    the ratio of one component's model spectrum to the full model spectrum
    of that channel.
    """
    SPHI = np.dot(np.dot(WGAMMA, HGAMMA), HPHI)
    SF0 = np.dot(WF0, HF0)
    leadR = (alphaR ** 2) * SF0 * SPHI
    leadL = (alphaL ** 2) * SF0 * SPHI
    accR = np.dot(np.dot(WM, betaR ** 2), HM)
    accL = np.dot(np.dot(WM, betaL ** 2), HM)
    hatSXR = leadR + accR
    hatSXL = leadL + accL
    return leadR / hatSXR, leadL / hatSXL, accR / hatSXR, accL / hatSXL


def _resynthesize(spectrum, hopsize, NFT, windowSizeInSamples):
    """Invert one STFT with a sinebell window, scaled by 1/4 as before."""
    return istft(spectrum, hopsize=hopsize, nfft=NFT,
                 window=sinebell(windowSizeInSamples)) / 4.0


def _to_pcm_stereo(first, second, scaleData, dataType):
    """Rescale two channel estimates and stack them as two columns.

    The estimates are multiplied back by `scaleData`, rounded and cast to
    `dataType`. For integer sample types the values are clipped to the
    representable range first, so an overshoot cannot wrap around.
    """
    samples = np.round(np.array([first, second]).T * scaleData)
    if np.issubdtype(dataType, np.integer):
        limits = np.iinfo(dataType)
        samples = np.clip(samples, limits.min, limits.max)
    return np.array(samples, dtype=dataType)


def main(inputAudioFile):
    """Extract the accompaniment of a stereo WAV file.

    The melody line is estimated (SIMM.SIMM followed by Viterbi tracking)
    or read from the --with-melody file, the time/F0 track is saved to the
    pitch output file, and two SIMM.Stereo_SIMM rounds are run, the second
    one with an extra all-ones "unvoiced" element in the source basis.

    Only the accompaniment estimate of the last round is written. It goes
    to 'media/karaoke/<music output base name>_VUIMM.wav', relative to the
    current working directory, and that file name is printed at the end.
    The working directory is never changed.

    Args:
        inputAudioFile: path of the WAV file to process. It must hold at
            least two channels; only the first two are used.

    Raises:
        ValueError: if the input is not multi-channel, or if the melody
            file does not hold two columns.
    """
    import optparse
    import os

    usage = "usage: %prog [options] inputAudioFile"
    parser = optparse.OptionParser(usage)
    # Name of the output files:
    parser.add_option("-v", "--vocal-output-file",
                      dest="voc_output_file", type="string",
                      help="name of the audio output file for the estimated\n"
                           "solo (vocal) part",
                      default="estimated_solo.wav")
    parser.add_option("-m", "--music-output-file",
                      dest="mus_output_file", type="string",
                      help="name of the audio output file for the estimated\n"
                           "music part",
                      default="estimated_music.wav")
    parser.add_option("-p", "--pitch-output-file",
                      dest="pitch_output_file", type="string",
                      help="name of the output file for the estimated pitches",
                      default="pitches.txt")
    # Some more optional options:
    parser.add_option("-d", "--with-display", dest="displayEvolution",
                      action="store_true", help="display the figures",
                      default=False)
    parser.add_option("-q", "--quiet", dest="verbose",
                      action="store_false",
                      help="use to quiet all output verbose",
                      default=True)
    # Number of iterations
    parser.add_option("--nb-iterations", dest="nbiter",
                      help="number of iterations", type="int",
                      default=50)
    parser.add_option("--window-size", dest="windowSize", type="float",
                      default=0.04644,
                      help="size of analysis windows, in s.")
    parser.add_option("--Fourier-size", dest="fourierSize", type="int",
                      default=2048,
                      help="size of Fourier transforms, "
                           "in samples.")
    parser.add_option("--hopsize", dest="hopsize", type="float",
                      default=0.0058,
                      help="size of the hop between analysis windows, in s.")
    parser.add_option("--nb-accElements", dest="R", type="float",
                      default=40.0,
                      help="number of elements for the accompaniment.")
    parser.add_option("--with-melody", dest="melody", type="string",
                      default=None,
                      help="provide the melody in a file named MELODY, "
                           "with at each line: <time (s)><F0 (Hz)>.")

    # NOTE(review): the options are still read from sys.argv although the
    # input file is a parameter; positional arguments are ignored. An
    # option unknown to this parser on the host program's command line
    # would make optparse exit - confirm this is acceptable for callers.
    (options, args) = parser.parse_args()

    displayEvolution = options.displayEvolution
    if displayEvolution:
        import matplotlib.pyplot as plt
        import imageMatlab
        plt.rc('image', cmap='jet')
        plt.ion()

    fs, data = wav.read(inputAudioFile)
    scaleData = 1.2 * data.max()  # to rescale the data.
    dataType = data.dtype
    data = np.double(data) / scaleData

    if data.shape[0] == data.size:
        # A single channel holds all the samples: the data is mono.
        print("The audio file is not stereo. Try separateLead.py instead.")
        raise ValueError("number of dimensions of the input not 2")
    if data.shape[1] != 2:
        print("The data is multichannel, but not stereo... \n")
        print("Unfortunately this program does not scale well. Data is \n")
        print("reduced to its 2 first channels.\n")
        data = data[:, 0:2]

    # Processing the options:
    windowSizeInSamples = np.round(options.windowSize * fs)
    hopsize = np.round(options.hopsize * fs)
    NFT = options.fourierSize
    niter = options.nbiter
    R = options.R

    if options.verbose:
        # One formatted string per line: print("a", b) would display a
        # tuple under Python 2.
        print("Some parameter settings:")
        print(" Size of analysis windows:  %s" % (windowSizeInSamples,))
        print(" Hopsize:  %s" % (hopsize,))
        print(" Size of Fourier transforms:  %s" % (NFT,))
        print(" Number of iterations to be done:  %s" % (niter,))
        print(" Number of elements in WM:  %s" % (R,))

    # Channel 0 is called "R" and channel 1 "L" throughout; the output
    # is written back in that same column order.
    XR = stft(data[:, 0], fs=fs, hopsize=hopsize,
              window=sinebell(windowSizeInSamples), nfft=NFT)[0]
    XL = stft(data[:, 1], fs=fs, hopsize=hopsize,
              window=sinebell(windowSizeInSamples), nfft=NFT)[0]
    del data

    # SX is the power spectrogram:
    SXR = np.abs(XR) ** 2
    SXL = np.abs(XL) ** 2

    # TODO: also process these as options:
    eps = 10 ** -9
    minF0 = 100
    maxF0 = 800
    Fs = fs
    F, N = SXR.shape
    stepNotes = 20  # this is the number of F0s within one semitone
    K = 10  # number of spectral shapes for the filter part
    P = 30  # number of elements in dictionary of smooth filters
    chirpPerF0 = 1  # number of chirped spectral shapes between each F0
    scopeAllowedHF0 = 2.0 / 1.0

    # Create the harmonic combs, for each F0 between minF0 and maxF0:
    F0Table, WF0 = generate_WF0_chirped(minF0, maxF0, Fs, Nfft=NFT,
                                        stepNotes=stepNotes,
                                        lengthWindow=windowSizeInSamples,
                                        Ot=0.25,
                                        perF0=chirpPerF0,
                                        depthChirpInSemiTone=.15,
                                        loadWF0=True,
                                        analysisWindow='sinebell')
    WF0 = WF0[0:F, :]  # ensure same size as SX
    NF0 = F0Table.size  # number of harmonic combs
    # Normalization: every comb is scaled to a maximum of 1.
    WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0))

    # Create the dictionary of smooth filters, for the filter part of
    # the lead instrument:
    WGAMMA = generateHannBasis(F, NFT, Fs=fs, frequencyScale='linear',
                               numberOfBasis=P, overlap=.75)

    if displayEvolution:
        plt.figure(1)
        plt.clf()
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Leading source number $u$', fontsize=16)
        plt.ion()

    if options.melody is None:
        # Section to estimate the melody, on the monophonic algorithm:
        SX = np.maximum(np.abs((XR + XL) / 2.0) ** 2, 10 ** -8)
        # First round of parameter estimation:
        HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM(
            # the data to be fitted to:
            SX,
            # the basis matrices for the spectral combs
            WF0,
            # and for the elementary filters:
            WGAMMA,
            # number of desired filters, accompaniment spectra:
            numberOfFilters=K, numberOfAccompanimentSpectralShapes=R,
            # no initial amplitude matrices:
            HGAMMA0=None, HPHI0=None,
            HF00=None,
            WM0=None, HM0=None,
            # Some more optional arguments, to control the "convergence"
            # of the algo
            numberOfIterations=niter, updateRulePower=1.,
            stepNotes=stepNotes,
            lambdaHF0=0.0 / (1.0 * SX.max()), alphaHF0=0.9,
            verbose=options.verbose, displayEvolution=displayEvolution)

        if displayEvolution:
            h2 = plt.figure(2)
            plt.clf()
            imageMatlab.imageM(20 * np.log10(HF0))
            matMax = (20 * np.log10(HF0)).max()
            matMed = np.median(20 * np.log10(HF0))
            plt.clim([matMed - 100, matMax])

        # Viterbi decoding to estimate the predominant fundamental
        # frequency line
        scale = 1.0
        transitions = np.exp(-np.floor(np.arange(0, NF0) / stepNotes)
                             * scale)
        cutoffnote = 2 * 5 * stepNotes
        transitions[cutoffnote:] = transitions[cutoffnote - 1]

        # Toeplitz block for the F0 states, plus one extra state (index
        # NF0) that is given far smaller transition weights.
        transitionMatrixF0 = np.zeros([NF0 + 1, NF0 + 1])
        b = np.arange(NF0)
        transitionMatrixF0[0:NF0, 0:NF0] = \
            transitions[np.abs(b[np.newaxis, :] - b[:, np.newaxis])]
        pf_0 = transitions[cutoffnote - 1] * 10 ** (-90)
        p0_0 = transitions[cutoffnote - 1] * 10 ** (-100)
        p0_f = transitions[cutoffnote - 1] * 10 ** (-80)
        transitionMatrixF0[0:NF0, NF0] = pf_0
        transitionMatrixF0[NF0, 0:NF0] = p0_f
        transitionMatrixF0[NF0, NF0] = p0_0

        # Each row is normalized to sum to one.
        sumTransitionMatrixF0 = np.sum(transitionMatrixF0, axis=1)
        transitionMatrixF0 = transitionMatrixF0 \
            / sumTransitionMatrixF0[:, np.newaxis]

        priorProbabilities = 1 / (NF0 + 1.0) * np.ones([NF0 + 1])
        logHF0 = np.zeros([NF0 + 1, N])
        normHF0 = np.amax(HF0, axis=0)
        logHF0[0:NF0, :] = np.log(np.array(HF0))
        # np.inf is used because the np.Inf alias no longer exists in
        # NumPy 2.
        logHF0[0:NF0, normHF0 == 0] = np.amin(logHF0[logHF0 > -np.inf])
        logHF0[NF0, :] = np.maximum(np.amin(logHF0[logHF0 > -np.inf]),
                                    -100)

        indexBestPath = viterbiTrackingArray(
            logHF0, np.log(priorProbabilities),
            np.log(transitionMatrixF0), verbose=options.verbose)

        if displayEvolution:
            # Figure.hold was removed in Matplotlib 3.0; only call it
            # where it still exists.
            legacyHold = hasattr(h2, 'hold')
            if legacyHold:
                h2.hold(True)
            plt.plot(indexBestPath, '-b')
            if legacyHold:
                h2.hold(False)
            plt.axis('tight')

        del logHF0

        # Detection of silences: keep only the activations close to the
        # tracked path, and measure the melody energy they explain.
        HF00 = np.zeros([NF0 * chirpPerF0, N])
        rows, cols = _f0_neighbourhood_indices(
            indexBestPath, N, NF0, chirpPerF0, stepNotes, scopeAllowedHF0)
        HF00[rows, cols] = HF0[rows, cols]
        HF00[:, indexBestPath == (NF0 - 1)] = 0.0
        HF00[:, indexBestPath == 0] = 0.0

        thres_energy = 0.000584
        SF0 = np.maximum(np.dot(WF0, HF00), eps)
        SPHI = np.maximum(np.dot(WGAMMA, np.dot(HGAMMA, HPHI)), eps)
        SM = np.maximum(np.dot(WM, HM), eps)
        hatSX = np.maximum(SPHI * SF0 + SM, eps)

        energyMel = np.sum(np.abs((SPHI * SF0) / hatSX
                                  * (XR + XL) * 0.5) ** 2, axis=0)
        energyMelSorted = np.sort(energyMel)
        energyMelCumul = np.cumsum(energyMelSorted)
        energyMelCumulNorm = energyMelCumul / max(energyMelCumul[-1], eps)

        # First position whose normalized cumulative energy exceeds the
        # threshold. When none does, fall back to the last valid index
        # instead of raising IndexError.
        aboveThreshold = np.nonzero(energyMelCumulNorm > thres_energy)[0]
        if aboveThreshold.size:
            ind_999 = aboveThreshold[0]
        else:
            ind_999 = N - 1

        # NOTE(review): energyMel is compared with a value read from the
        # normalized cumulative curve, as in the original code - confirm
        # that this is the intended threshold.
        melNotPresent = (energyMel <= energyMelCumulNorm[ind_999])
        indexBestPath[melNotPresent] = 0
    else:
        # Take the provided melody line; load it from file:
        melodyFromFile = np.loadtxt(options.melody)
        sizeProvidedMel = melodyFromFile.shape
        if len(sizeProvidedMel) == 1:
            print("The melody should be provided as <Time (s)><F0 (Hz)>.")
            raise ValueError("Bad melody format")

        melTimeStamps = melodyFromFile[:, 0]
        melFreqHz = melodyFromFile[:, 1]
        lowestF0 = melFreqHz[melFreqHz > 40.0].min()
        if minF0 > lowestF0 or maxF0 < melFreqHz.max():
            minF0 = lowestF0 * .97
            maxF0 = np.maximum(melFreqHz.max() * 1.03, 2 * minF0 * 1.03)
            print("Recomputing the source basis for ")
            print("minF0 =  %s Hz and maxF0 =  %s Hz." % (minF0, maxF0))

            # Create the harmonic combs, for each F0 between minF0 and
            # maxF0:
            F0Table, WF0 = generate_WF0_chirped(
                minF0, maxF0, Fs, Nfft=NFT,
                stepNotes=stepNotes,
                lengthWindow=windowSizeInSamples, Ot=0.25,
                perF0=chirpPerF0,
                depthChirpInSemiTone=.15)
            WF0 = WF0[0:F, :]  # ensure same size as SX
            NF0 = F0Table.size  # number of harmonic combs
            # Normalization:
            WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0))

        # For every analysis frame, pick the closest melody time stamp,
        # then the closest entry of F0Table.
        sigTimeStamps = np.arange(N) * hopsize / np.double(Fs)
        distMatTimeStamps = np.abs(sigTimeStamps[np.newaxis, :]
                                   - melTimeStamps[:, np.newaxis])
        minDistTimeStamps = distMatTimeStamps.argmin(axis=0)
        f0BestPath = melFreqHz[minDistTimeStamps]
        distMatF0 = np.abs(f0BestPath[np.newaxis, :]
                           - np.ravel(F0Table)[:, np.newaxis])
        indexBestPath = distMatF0.argmin(axis=0)
        # setting silences to 0, with tolerance = 1/2 window length
        indexBestPath[distMatTimeStamps[minDistTimeStamps, np.arange(N)]
                      >= 0.5 * options.windowSize] = 0
        indexBestPath[f0BestPath <= 0] = 0

    # Frames mapped to index 0 are written with a negated frequency.
    freqMelody = F0Table[np.array(indexBestPath, dtype=int)]
    freqMelody[indexBestPath == 0] = -freqMelody[indexBestPath == 0]
    np.savetxt(options.pitch_output_file,
               np.array([np.arange(N) * hopsize / np.double(Fs),
                         freqMelody]).T)

    # Second round of parameter estimation, with specific initial HF00:
    # ones around the melody on the retained frames, zeros elsewhere.
    HF00 = np.zeros([NF0 * chirpPerF0, N])
    rows, cols = _f0_neighbourhood_indices(
        indexBestPath, N, NF0, chirpPerF0, stepNotes, scopeAllowedHF0)
    voiced = indexBestPath != 0
    HF00[rows[voiced], cols[voiced]] = 1
    HF00[:, indexBestPath == (NF0 - 1)] = 0.0
    HF00[:, indexBestPath == 0] = 0.0

    WF0effective = WF0
    HF00effective = HF00

    if options.melody is None:
        # Drop the first-round results before the stereo estimation.
        del HF0, HGAMMA, HPHI, HM, WM, HF00, SX

    # Nothing is written after this round, so no signal is resynthesized
    # here: only HGAMMA, HPHI and HF0 are reused below.
    alphaR, alphaL, HGAMMA, HPHI, HF0, \
        betaR, betaL, HM, WM, recoError2 = SIMM.Stereo_SIMM(
            # the data to be fitted to:
            SXR, SXL,
            # the basis matrices for the spectral combs
            WF0effective,
            # and for the elementary filters:
            WGAMMA,
            # number of desired filters, accompaniment spectra:
            numberOfFilters=K, numberOfAccompanimentSpectralShapes=R,
            # initial amplitude matrices:
            HGAMMA0=None, HPHI0=None,
            HF00=HF00effective,
            WM0=None, HM0=None,
            # Some more optional arguments, to control the "convergence"
            # of the algo
            numberOfIterations=niter, updateRulePower=1.0,
            stepNotes=stepNotes,
            lambdaHF0=0.0 / (1.0 * SXR.max()), alphaHF0=0.9,
            verbose=options.verbose, displayEvolution=displayEvolution)

    # Adding the unvoiced part in the source basis: one extra all-ones
    # spectral shape, with an all-ones initial activation.
    WUF0 = np.hstack([WF0, np.ones([WF0.shape[0], 1])])
    HUF0 = np.vstack([HF0, np.ones([1, HF0.shape[1]])])

    alphaR, alphaL, HGAMMA, HPHI, HF0, \
        betaR, betaL, HM, WM, recoError3 = SIMM.Stereo_SIMM(
            # the data to be fitted to:
            SXR, SXL,
            # the basis matrices for the spectral combs
            WUF0,
            # and for the elementary filters:
            WGAMMA,
            # number of desired filters, accompaniment spectra:
            numberOfFilters=K, numberOfAccompanimentSpectralShapes=R,
            # initial amplitude matrices come from the previous round:
            HGAMMA0=HGAMMA, HPHI0=HPHI,
            HF00=HUF0,
            WM0=None, HM0=None,
            # Some more optional arguments, to control the "convergence"
            # of the algo
            numberOfIterations=niter, updateRulePower=1.0,
            stepNotes=stepNotes,
            lambdaHF0=0.0 / (1.0 * SXR.max()), alphaHF0=0.9,
            verbose=options.verbose, displayEvolution=displayEvolution,
            updateHGAMMA=False)

    # Only the accompaniment of this last round is the required output.
    accMaskR, accMaskL = _stereo_wiener_masks(
        WUF0, WGAMMA, alphaR, alphaL, HGAMMA, HPHI, HF0,
        betaR, betaL, HM, WM)[2:]
    mestR = _resynthesize(accMaskR * XR, hopsize, NFT, windowSizeInSamples)
    mestL = _resynthesize(accMaskL * XL, hopsize, NFT, windowSizeInSamples)

    # This is the required file. splitext (rather than [:-4]) copes with
    # any extension length.
    outputFileName = os.path.splitext(options.mus_output_file)[0] \
        + '_VUIMM.wav'
    # Build the target path instead of changing directory, so that a
    # failure cannot leave the process in another working directory.
    wav.write(os.path.join('media', 'karaoke', outputFileName), fs,
              _to_pcm_stereo(mestR, mestL, scaleData, dataType))

    if displayEvolution:
        plt.close('all')

    print("Done!")
    print(outputFileName)
def _melody_band_indices(indexBestPath, numFrames, NF0, stepNotes,
                         chirpPerF0=1, scopeAllowedHF0=2.0):
    """Return the (row, column) indices of the HF0 band around a melody path.

    For each frame, the band spans ``floor(stepNotes / scopeAllowedHF0)``
    dictionary entries on each side of that frame's melody index. Row
    indices are clipped to ``[0, chirpPerF0 * NF0 - 1]``.

    Returns two integer arrays of shape ``(numFrames, bandWidth)``: the
    indices along axis 0 (F0 entries) and along axis 1 (frames) of HF0.
    """
    halfWidth = np.floor(stepNotes / scopeAllowedHF0)
    bandWidth = int(chirpPerF0 * (2 * halfWidth + 1))
    offsets = np.arange(-chirpPerF0 * halfWidth,
                        chirpPerF0 * (halfWidth + 1))
    rows = (np.outer(chirpPerF0 * indexBestPath, np.ones(bandWidth))
            + np.outer(np.ones(numFrames), offsets))
    rows = np.array(np.maximum(np.minimum(rows, chirpPerF0 * NF0 - 1), 0),
                    dtype=int)
    cols = np.outer(np.arange(numFrames), np.ones(bandWidth, dtype=int))
    return rows, cols


def main():
    """Estimate the lead melody of a WAV file and separate lead/accompaniment.

    Command-line entry point (options are parsed with ``optparse`` from
    ``sys.argv``; exactly one positional argument, the input WAV file, is
    required). The processing steps are:

    1. read the WAV file and compute its STFT (per channel when stereo);
    2. build the harmonic-comb dictionary ``WF0`` and the smooth-filter
       dictionary ``WGAMMA``;
    3. obtain the melody line, either by a first ``SIMM.SIMM`` fit followed
       by Viterbi tracking, or from the file given with ``--with-melody``;
    4. save the melody to ``options.pitch_output_file`` (unvoiced frames are
       written with a negative frequency);
    5. unless ``--dontseparate`` is given, fit the model twice (voiced
       source only, then voiced + one flat "unvoiced" source element) and
       write the lead/accompaniment WAV files of each pass (the second pass
       uses the ``_VUIMM.wav`` suffix).

    Raises:
        ValueError: if the input file name does not end in ``.wav`` or the
            provided melody file is not a two-column table.
    """
    import optparse
    import warnings

    usage = "usage: %prog [options] inputAudioFile"
    parser = optparse.OptionParser(usage)
    # Name of the output files:
    parser.add_option("-v", "--vocal-output-file",
                      dest="voc_output_file", type="string",
                      help="name of the audio output file for the estimated\n"
                           "solo (vocal) part. \n"
                           "If None, appends _lead to inputAudioFile.",
                      default=None)
    parser.add_option("-m", "--music-output-file",
                      dest="mus_output_file", type="string",
                      help="name of the audio output file for the estimated\n"
                           "music part.\n"
                           "If None, appends _acc to inputAudioFile.",
                      default=None)
    parser.add_option("-p", "--pitch-output-file",
                      dest="pitch_output_file", type="string",
                      help="name of the output file for the estimated pitches.\n"
                           "If None, appends _pitches to inputAudioFile",
                      default=None)
    # Some more optional options:
    parser.add_option("-d", "--with-display", dest="displayEvolution",
                      action="store_true", help="display the figures",
                      default=False)
    parser.add_option("-q", "--quiet", dest="verbose",
                      action="store_false",
                      help="use to quiet all output verbose",
                      default=True)
    parser.add_option("-n", "--dontseparate", dest="separateSignals",
                      action="store_false",
                      help="Trigger this option if you only desire to "
                           "estimate the melody",
                      default=True)
    parser.add_option("--nb-iterations", dest="nbiter",
                      help="number of iterations", type="int",
                      default=30)
    parser.add_option("--window-size", dest="windowSize", type="float",
                      default=0.04644,
                      help="size of analysis windows, in s.")
    parser.add_option("--Fourier-size", dest="fourierSize", type="int",
                      default=None,
                      help="size of Fourier transforms, "
                           "in samples.")
    parser.add_option("--hopsize", dest="hopsize", type="float",
                      default=0.0058,
                      help="size of the hop between analysis windows, in s.")
    parser.add_option("--nb-accElements", dest="R", type="float",
                      default=40.0,
                      help="number of elements for the accompaniment.")
    parser.add_option("--with-melody", dest="melody", type="string",
                      default=None,
                      help="provide the melody in a file named MELODY, "
                           "with at each line: <time (s)><F0 (Hz)>.")
    parser.add_option("--numAtomFilters", dest="P_numAtomFilters",
                      type="int", default=30,
                      help="Number of atomic filters - in WGAMMA.")
    parser.add_option("--numFilters", dest="K_numFilters", type="int",
                      default=10,
                      help="Number of filters for decomposition - in WPHI")
    parser.add_option("--min-F0-Freq", dest="minF0", type="float",
                      default=100.0,
                      help="Minimum of fundamental frequency F0.")
    parser.add_option("--max-F0-Freq", dest="maxF0", type="float",
                      default=800.0,
                      help="Maximum of fundamental frequency F0.")
    parser.add_option("--step-F0s", dest="stepNotes", type="int",
                      default=20,
                      help="Number of F0s in dictionary for each semitone.")

    (options, args) = parser.parse_args()
    if len(args) != 1:
        parser.error("incorrect number of arguments, use option -h for help.")

    displayEvolution = options.displayEvolution
    if displayEvolution:
        # plotting modules are only needed (and imported) on request
        import matplotlib.pyplot as plt
        import imageMatlab
        plt.rc('image', cmap='jet')
        plt.ion()

    # Compulsory option: name of the input file:
    inputAudioFile = args[0]
    if not inputAudioFile.endswith(".wav"):
        raise ValueError("File not WAV file? Only WAV format support, for now...")

    # Default output names are derived from the input name (minus ".wav"):
    if options.mus_output_file is None:
        options.mus_output_file = inputAudioFile[:-4] + '_acc.wav'
    if options.voc_output_file is None:
        options.voc_output_file = inputAudioFile[:-4] + '_lead.wav'
    if options.pitch_output_file is None:
        options.pitch_output_file = inputAudioFile[:-4] + '_pitches.txt'

    print("Writing the different following output files:")
    print(" separated lead in", options.voc_output_file)
    print(" separated accompaniment in", options.mus_output_file)
    print(" separated lead + unvoc in",
          options.voc_output_file[:-4] + '_VUIMM.wav')
    print(" separated acc - unvoc in",
          options.mus_output_file[:-4] + '_VUIMM.wav')
    print(" estimated pitches in", options.pitch_output_file)

    Fs, data = wav.read(inputAudioFile)
    # The same factor is used to normalise the input and to rescale the
    # separated signals back to the input sample format before writing.
    scaleData = 1.2 * data.max()
    dataType = data.dtype
    data = np.double(data) / scaleData

    is_stereo = True
    if data.shape[0] == data.size:  # 1-D array: the file is mono
        print("The audio file is not stereo. Making stereo out of mono.")
        print("(You could also try the older separateLead.py...)")
        is_stereo = False
    if is_stereo and data.shape[1] != 2:
        print("The data is multichannel, but not stereo... \n")
        print("Unfortunately this program does not scale well. Data is \n")
        print("reduced to its 2 first channels.\n")
        data = data[:, 0:2]

    # Processing the options:
    windowSizeInSamples = int(nextpow2(np.round(options.windowSize * Fs)))
    # A hop is a whole number of samples; cast once here (the value is
    # already integral after rounding) instead of at every istft call.
    hopsize = int(np.round(options.hopsize * Fs))
    if hopsize != windowSizeInSamples / 8:
        warnings.warn("Chosen hopsize: " + str(hopsize) +
                      ", while windowsize: " + str(windowSizeInSamples))

    if options.fourierSize is None:
        NFT = windowSizeInSamples
    else:
        NFT = options.fourierSize

    # number of iterations for each parameter estimation step:
    niter = options.nbiter
    # number of spectral shapes for the accompaniment; the option is parsed
    # as a float but it is a count, so make it an integer:
    R = int(options.R)

    eps = 10 ** -9

    if options.verbose:
        print("Some parameter settings:")
        print(" Size of analysis windows: ", windowSizeInSamples)
        print(" Hopsize: ", hopsize)
        print(" Size of Fourier transforms: ", NFT)
        print(" Number of iterations to be done: ", niter)
        print(" Number of elements in WM: ", R)

    if is_stereo:
        XR, F, N = stft(data[:, 0], fs=Fs, hopsize=hopsize,
                        window=sinebell(windowSizeInSamples), nfft=NFT)
        XL, F, N = stft(data[:, 1], fs=Fs, hopsize=hopsize,
                        window=sinebell(windowSizeInSamples), nfft=NFT)
        # SX is the power spectrogram of the channel average:
        SX = np.maximum((0.5 * np.abs(XR + XL)) ** 2, eps)
    else:  # data is mono
        X, F, N = stft(data, fs=Fs, hopsize=hopsize,
                       window=sinebell(windowSizeInSamples), nfft=NFT)
        SX = np.maximum(np.abs(X) ** 2, eps)

    del data, F, N

    def resynthesize(spectrogram):
        """Invert a masked STFT and map it back to the input sample format."""
        signal = istft(spectrogram, hopsize=hopsize, nfft=int(NFT),
                       window=sinebell(windowSizeInSamples)) / 4.0
        return np.array(np.round(signal * scaleData), dtype=dataType)

    def write_stereo_separation(SF0, SPHI, alphaR, alphaL, betaR, betaL,
                                WM, HM, leadFileName, accFileName):
        """Wiener-filter both channels; write lead then accompaniment."""
        SMR = np.dot(np.dot(WM, betaR ** 2), HM)
        SML = np.dot(np.dot(WM, betaL ** 2), HM)
        hatSXR = (alphaR ** 2) * SF0 * SPHI + SMR
        hatSXL = (alphaL ** 2) * SF0 * SPHI + SML

        vestR = resynthesize((alphaR ** 2) * SPHI * SF0 / hatSXR * XR)
        vestL = resynthesize((alphaL ** 2) * SPHI * SF0 / hatSXL * XL)
        wav.write(leadFileName, Fs, np.array([vestR, vestL]).T)

        mestR = resynthesize(SMR / hatSXR * XR)
        mestL = resynthesize(SML / hatSXL * XL)
        wav.write(accFileName, Fs, np.array([mestR, mestL]).T)

    def write_mono_separation(SF0, SPHI, WM, HM, leadFileName, accFileName):
        """Wiener-filter the mono signal; write lead then accompaniment."""
        SM = np.dot(WM, HM)
        hatSX = SF0 * SPHI + SM
        wav.write(leadFileName, Fs, resynthesize(SPHI * SF0 / hatSX * X))
        wav.write(accFileName, Fs, resynthesize(SM / hatSX * X))

    # minimum and maximum F0 in glottal source spectra dictionary
    minF0 = options.minF0
    maxF0 = options.maxF0
    F, N = SX.shape
    stepNotes = options.stepNotes  # number of F0s within one semitone

    K = options.K_numFilters  # number of spectral shapes for the filter part
    P = options.P_numAtomFilters  # size of the dictionary of smooth filters
    # number of chirped spectral shapes between each F0
    # (this feature should be further studied before it is used):
    chirpPerF0 = 1

    # Create the harmonic combs, for each F0 between minF0 and maxF0:
    F0Table, WF0 = generate_WF0_chirped(
        minF0, maxF0, Fs, Nfft=NFT,
        stepNotes=stepNotes,
        lengthWindow=windowSizeInSamples, Ot=0.25,
        perF0=chirpPerF0,
        depthChirpInSemiTone=.15, loadWF0=True,
        analysisWindow='sinebell')
    WF0 = WF0[0:F, :]  # ensure same size as SX
    NF0 = F0Table.size  # number of harmonic combs
    # Normalization (each comb is scaled to a maximum of 1):
    WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0))

    # Create the dictionary of smooth filters, for the filter part of
    # the lead instrument:
    WGAMMA = generateHannBasis(F, NFT, Fs=Fs, frequencyScale='linear',
                               numberOfBasis=P, overlap=.75)

    if displayEvolution:
        plt.figure(1)
        plt.clf()
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Leading source number $u$', fontsize=16)
        plt.ion()

    if options.melody is None:
        # Section to estimate the melody, on the monophonic algorithm.
        # First round of parameter estimation:
        HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM(
            # the data to be fitted to:
            SX,
            # the basis matrices for the spectral combs
            WF0,
            # and for the elementary filters:
            WGAMMA,
            # number of desired filters, accompaniment spectra:
            numberOfFilters=K, numberOfAccompanimentSpectralShapes=R,
            # no initial amplitude matrices for this first round:
            HGAMMA0=None, HPHI0=None,
            HF00=None,
            WM0=None, HM0=None,
            # Some more optional arguments, to control the "convergence"
            # of the algo
            numberOfIterations=niter, updateRulePower=1.,
            stepNotes=stepNotes,
            lambdaHF0=0.0 / (1.0 * SX.max()), alphaHF0=0.9,
            verbose=options.verbose, displayEvolution=displayEvolution)

        if displayEvolution:
            plt.figure(2)
            plt.clf()
            salienceDB = 20 * np.log10(HF0)
            imageMatlab.imageM(salienceDB)
            matMax = salienceDB.max()
            matMed = np.median(salienceDB)
            plt.clim([matMed - 100, matMax])

        # Viterbi decoding to estimate the predominant fundamental
        # frequency line.
        # Create transition probability matrix - adhoc parameter 'scale'
        # TODO: use "learned" parameter scale (NB: after many trials,
        # provided scale and parameterization seems robust)
        scale = 1.0
        transitions = np.exp(-np.floor(np.arange(0, NF0) / stepNotes) * scale)
        # Jumps beyond 10 semitones all share the same probability. Capped
        # at NF0 so that a dictionary narrower than 10 semitones does not
        # index past the end of `transitions`.
        cutoffnote = min(2 * 5 * stepNotes, NF0)
        transitions[cutoffnote:] = transitions[cutoffnote - 1]

        # Toeplitz matrix over the NF0 pitched states, plus one extra
        # (last) state with its own, much smaller, transition weights:
        transitionMatrixF0 = np.zeros([NF0 + 1, NF0 + 1])
        b = np.arange(NF0)
        transitionMatrixF0[0:NF0, 0:NF0] = \
            transitions[np.abs(b[np.newaxis, :] - b[:, np.newaxis])]
        pf_0 = transitions[cutoffnote - 1] * 10 ** (-90)
        p0_0 = transitions[cutoffnote - 1] * 10 ** (-100)
        p0_f = transitions[cutoffnote - 1] * 10 ** (-80)
        transitionMatrixF0[0:NF0, NF0] = pf_0
        transitionMatrixF0[NF0, 0:NF0] = p0_f
        transitionMatrixF0[NF0, NF0] = p0_0

        # each row is normalised to sum to one:
        sumTransitionMatrixF0 = np.sum(transitionMatrixF0, axis=1)
        transitionMatrixF0 = (transitionMatrixF0
                              / sumTransitionMatrixF0[:, np.newaxis])

        # prior probabilities, and setting the array for Viterbi tracking:
        priorProbabilities = 1 / (NF0 + 1.0) * np.ones([NF0 + 1])
        logHF0 = np.zeros([NF0 + 1, N])
        normHF0 = np.amax(HF0, axis=0)

        logHF0[0:NF0, :] = np.log(np.array(HF0))
        # `np.inf` (not the `np.Inf` alias, removed in NumPy 2.0).
        # Frames whose HF0 column is all zero are set to the smallest finite
        # log-amplitude, which leaves that minimum unchanged.
        minFiniteLogHF0 = np.amin(logHF0[logHF0 > -np.inf])
        logHF0[0:NF0, normHF0 == 0] = minFiniteLogHF0
        logHF0[NF0, :] = np.maximum(minFiniteLogHF0, -100)

        indexBestPath = viterbiTrackingArray(
            logHF0, np.log(priorProbabilities),
            np.log(transitionMatrixF0), verbose=options.verbose)

        if displayEvolution:
            # No Figure.hold() calls: that API was removed in
            # Matplotlib 3.0, and plot() adds to the current axes.
            plt.plot(indexBestPath, '-b')
            plt.axis('tight')

        del logHF0

        # Detection of silences:
        # computing the melody restricted F0 amplitude matrix HF00, i.e.
        # HF0 kept only in a band around the tracked melody:
        HF00 = np.zeros([NF0 * chirpPerF0, N])
        dim1index, dim2index = _melody_band_indices(
            indexBestPath, N, NF0, stepNotes,
            chirpPerF0=chirpPerF0, scopeAllowedHF0=2.0)
        HF00[dim1index, dim2index] = HF0[dim1index, dim2index]
        HF00[:, indexBestPath == (NF0 - 1)] = 0.0
        HF00[:, indexBestPath == 0] = 0.0

        # remove frames with less than (100 thres_energy) % of total energy.
        thres_energy = 0.000584
        SF0 = np.maximum(np.dot(WF0, HF00), eps)
        SPHI = np.maximum(np.dot(WGAMMA, np.dot(HGAMMA, HPHI)), eps)
        SM = np.maximum(np.dot(WM, HM), eps)
        hatSX = np.maximum(SPHI * SF0 + SM, eps)
        energyMel = np.sum((((SPHI * SF0) / hatSX) ** 2) * SX, axis=0)
        energyMelSorted = np.sort(energyMel)
        energyMelCumul = np.cumsum(energyMelSorted)
        energyMelCumulNorm = energyMelCumul / max(energyMelCumul[-1], eps)
        # normalized to the maximum of energy:
        # expressed in 0.01 times the percentage
        aboveThreshold = np.nonzero(energyMelCumulNorm > thres_energy)[0]
        # If nothing exceeds the threshold, fall back to the last valid
        # index (indexing an empty result would raise IndexError).
        ind_999 = aboveThreshold[0] if aboveThreshold.size else N - 1
        melNotPresent = (energyMel <= energyMelCumulNorm[ind_999])
        indexBestPath[melNotPresent] = 0
    else:
        # Take the provided melody line; load melody from file:
        melodyFromFile = np.loadtxt(options.melody)
        sizeProvidedMel = melodyFromFile.shape
        if len(sizeProvidedMel) == 1:
            print("The melody should be provided as <Time (s)><F0 (Hz)>.")
            raise ValueError("Bad melody format")
        melTimeStamps = melodyFromFile[:, 0]
        melFreqHz = melodyFromFile[:, 1]
        if minF0 > melFreqHz[melFreqHz > 40.0].min() \
                or maxF0 < melFreqHz.max():
            # the provided melody does not fit in [minF0, maxF0]: widen the
            # range and rebuild the source dictionary accordingly
            minF0 = melFreqHz[melFreqHz > 40.0].min() * .97
            maxF0 = np.maximum(melFreqHz.max() * 1.03, 2 * minF0 * 1.03)
            print("Recomputing the source basis for ")
            print("minF0 = ", minF0, "Hz and maxF0 = ", maxF0, "Hz.")
            # Create the harmonic combs, for each F0 between minF0 and maxF0:
            F0Table, WF0 = generate_WF0_chirped(
                minF0, maxF0, Fs, Nfft=NFT,
                stepNotes=stepNotes,
                lengthWindow=windowSizeInSamples, Ot=0.25,
                perF0=chirpPerF0,
                depthChirpInSemiTone=.15)
            WF0 = WF0[0:F, :]  # ensure same size as SX
            NF0 = F0Table.size  # number of harmonic combs
            # Normalization:
            WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0))

        # For each analysis frame, pick the closest annotated time stamp,
        # then the dictionary entry closest to the annotated frequency:
        sigTimeStamps = np.arange(N) * hopsize / np.double(Fs)
        distMatTimeStamps = np.abs(
            np.outer(np.ones(sizeProvidedMel[0]), sigTimeStamps)
            - np.outer(melTimeStamps, np.ones(N)))
        minDistTimeStamps = distMatTimeStamps.argmin(axis=0)
        f0BestPath = melFreqHz[minDistTimeStamps]
        distMatF0 = np.abs(np.outer(np.ones(NF0), f0BestPath)
                           - np.outer(F0Table, np.ones(N)))
        indexBestPath = distMatF0.argmin(axis=0)
        # setting silences to 0, with tolerance = 1/2 window length
        indexBestPath[distMatTimeStamps[minDistTimeStamps, range(N)]
                      >= 0.5 * options.windowSize] = 0
        indexBestPath[f0BestPath <= 0] = 0

    # Save the melody; frames with index 0 (unvoiced) get a negative
    # frequency. The index is clamped to the table, since the Viterbi path
    # can in principle end up in the extra state (index NF0).
    freqMelody = F0Table[np.minimum(np.array(indexBestPath, dtype=int),
                                    NF0 - 1)]
    freqMelody[indexBestPath == 0] = - freqMelody[indexBestPath == 0]
    np.savetxt(options.pitch_output_file,
               np.array([np.arange(N) * hopsize / np.double(Fs),
                         freqMelody]).T)

    # If separation is required:
    if options.separateSignals:
        # Second round of parameter estimation, with a specific initial
        # HF0: ones in a band around the melody of the voiced frames,
        # zeros everywhere else.
        dim1index, dim2index = _melody_band_indices(
            indexBestPath, N, NF0, stepNotes,
            chirpPerF0=chirpPerF0, scopeAllowedHF0=2.0)
        voicedFrames = indexBestPath != 0
        HF00effective = np.zeros([NF0 * chirpPerF0, N])
        HF00effective[dim1index[voicedFrames], dim2index[voicedFrames]] = 1
        HF00effective[:, indexBestPath == (NF0 - 1)] = 0.0
        HF00effective[:, indexBestPath == 0] = 0.0

        WF0effective = WF0

        if options.melody is None:
            # release the first-round estimates before fitting again
            del HF0, HGAMMA, HPHI, HM, WM, HF00

        if is_stereo:
            del SX
            SXR = np.maximum(np.abs(XR) ** 2, eps)
            SXL = np.maximum(np.abs(XL) ** 2, eps)

            # First pass: voiced source only.
            alphaR, alphaL, HGAMMA, HPHI, HF0, \
                betaR, betaL, HM, WM, recoError2 = SIMM.Stereo_SIMM(
                    # the data to be fitted to:
                    SXR, SXL,
                    # the basis matrices for the spectral combs
                    WF0effective,
                    # and for the elementary filters:
                    WGAMMA,
                    # number of desired filters, accompaniment spectra:
                    numberOfFilters=K,
                    numberOfAccompanimentSpectralShapes=R,
                    # initial amplitude matrices (melody-restricted HF0):
                    HGAMMA0=None, HPHI0=None,
                    HF00=HF00effective,
                    WM0=None, HM0=None,
                    # Some more optional arguments, to control the
                    # "convergence" of the algo
                    numberOfIterations=niter, updateRulePower=1.0,
                    stepNotes=stepNotes,
                    lambdaHF0=0.0 / (1.0 * SXR.max()), alphaHF0=0.9,
                    verbose=options.verbose,
                    displayEvolution=displayEvolution)

            WPHI = np.dot(WGAMMA, HGAMMA)
            write_stereo_separation(
                np.dot(WF0effective, HF0), np.dot(WPHI, HPHI),
                alphaR, alphaL, betaR, betaL, WM, HM,
                options.voc_output_file, options.mus_output_file)

            # Second pass: adding the unvoiced part in the source basis
            # (one extra, flat spectral element and its activation row).
            # NOTE: the original author left open whether the extra row
            # should instead be initialised with HF0.sum(axis=0).
            WUF0 = np.hstack([WF0, np.ones([WF0.shape[0], 1])])
            HUF0 = np.vstack([HF0, np.ones([1, HF0.shape[1]])])
            alphaR, alphaL, HGAMMA, HPHI, HF0, \
                betaR, betaL, HM, WM, recoError3 = SIMM.Stereo_SIMM(
                    # the data to be fitted to:
                    SXR, SXL,
                    # the basis matrices for the spectral combs
                    WUF0,
                    # and for the elementary filters:
                    WGAMMA,
                    # number of desired filters, accompaniment spectra:
                    numberOfFilters=K,
                    numberOfAccompanimentSpectralShapes=R,
                    # initial amplitude matrices, from the first pass:
                    HGAMMA0=HGAMMA, HPHI0=HPHI,
                    HF00=HUF0,
                    WM0=None,
                    HM0=None,
                    # Some more optional arguments, to control the
                    # "convergence" of the algo
                    numberOfIterations=niter, updateRulePower=1.0,
                    stepNotes=stepNotes,
                    lambdaHF0=0.0 / (1.0 * SXR.max()), alphaHF0=0.9,
                    verbose=options.verbose,
                    displayEvolution=displayEvolution,
                    updateHGAMMA=False)

            WPHI = np.dot(WGAMMA, HGAMMA)
            write_stereo_separation(
                np.dot(WUF0, HF0), np.dot(WPHI, HPHI),
                alphaR, alphaL, betaR, betaL, WM, HM,
                options.voc_output_file[:-4] + '_VUIMM.wav',
                options.mus_output_file[:-4] + '_VUIMM.wav')
        else:
            # Running on monophonic data. First pass: voiced source only.
            HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM(
                # the data to be fitted to:
                SX,
                # the basis matrices for the spectral combs
                WF0effective,
                # and for the elementary filters:
                WGAMMA,
                # number of desired filters, accompaniment spectra:
                numberOfFilters=K, numberOfAccompanimentSpectralShapes=R,
                # initial amplitude matrices (melody-restricted HF0):
                HGAMMA0=None, HPHI0=None,
                HF00=HF00effective,
                WM0=None, HM0=None,
                # Some more optional arguments, to control the "convergence"
                # of the algo
                numberOfIterations=niter, updateRulePower=1.,
                stepNotes=stepNotes,
                lambdaHF0=0.0 / (1.0 * SX.max()), alphaHF0=0.9,
                verbose=options.verbose, displayEvolution=displayEvolution)

            WPHI = np.dot(WGAMMA, HGAMMA)
            write_mono_separation(
                np.dot(WF0effective, HF0), np.dot(WPHI, HPHI), WM, HM,
                options.voc_output_file, options.mus_output_file)

            # Second pass: adding the unvoiced part in the source basis
            # (one extra, flat spectral element and its activation row).
            WUF0 = np.hstack([WF0, np.ones([WF0.shape[0], 1])])
            HUF0 = np.vstack([HF0, np.ones([1, HF0.shape[1]])])
            HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM(
                # the data to be fitted to:
                SX,
                # the basis matrices for the spectral combs
                WUF0,
                # and for the elementary filters:
                WGAMMA,
                # number of desired filters, accompaniment spectra:
                numberOfFilters=K, numberOfAccompanimentSpectralShapes=R,
                # initial amplitude matrices, from the first pass:
                HGAMMA0=HGAMMA, HPHI0=HPHI,
                HF00=HUF0,
                WM0=None, HM0=None,
                # Some more optional arguments, to control the "convergence"
                # of the algo
                numberOfIterations=niter, updateRulePower=1.,
                stepNotes=stepNotes,
                lambdaHF0=0.0 / (1.0 * SX.max()), alphaHF0=0.9,
                verbose=options.verbose, displayEvolution=displayEvolution,
                updateHGAMMA=False)

            WPHI = np.dot(WGAMMA, HGAMMA)
            write_mono_separation(
                np.dot(WUF0, HF0), np.dot(WPHI, HPHI), WM, HM,
                options.voc_output_file[:-4] + '_VUIMM.wav',
                options.mus_output_file[:-4] + '_VUIMM.wav')

    if displayEvolution:
        plt.close('all')

    print("Done!")
def main(args, options): stereoEstimation = True # Median filtering in spectrogram HPS = False displayEvolution = options.displayEvolution if displayEvolution: import matplotlib.pyplot as plt import imageMatlab ## plt.rc('text', usetex=True) plt.rc('image', cmap='jet') ## gray_r plt.ion() # Compulsory option: name of the input file: inputAudioFile = '' if len(args) >= 2: inputAudioFile = args[0] options.pitch_output_file = args[1] if len(args) == 1: inputAudioFile = args[0] if len(args) == 0: inputAudioFile = options.input_file if inputAudioFile[-4:] != ".wav": raise ValueError( "File not WAV file? Only WAV format support, for now...") #print "Writing the different following output files:" if not (options.vit_pitch_output_file is None): print " estimated pitches in", options.vit_pitch_output_file if not (options.sal_output_file is None): print " salience file in ", options.sal_output_file if options.pitch_output_file is None: options.pitch_output_file = inputAudioFile[:-4] + '_pitches.txt' try: from essentia.standard import AudioLoader loaded = AudioLoader(filename=inputAudioFile)() audio = loaded[0] Fs = loaded[1] nchan = loaded[2] loaded = AudioLoader(filename=inputAudioFile)() audio = loaded[0] if nchan == 1: data = audio[:, 0].transpose() else: data = audio.transpose() data = np.double(data) / (1.2 * abs(data).max()) except: # Using scipy to import wav import scipy.io.wavfile as wav Fs, data = wav.read(inputAudioFile) # data = np.double(data) / 32768.0 # makes data vary from -1 to 1 scaleData = 1.2 * data.max() # to rescale the data. data = np.double(data) / scaleData # makes data vary from -1 to 1 options.Fs = Fs is_stereo = True if data.shape[0] == data.size: # data is multi-channel #print "The audio file is not stereo." #print "The audio file is not stereo. Making stereo out of mono." 
#print "(You could also try the older separateLead.py...)" is_stereo = False # data = np.vstack([data,data]).T # raise ValueError("number of dimensions of the input not 2") if is_stereo and data.shape[1] != 2: print "The data is multichannel, but not stereo... \n" print "Unfortunately this program does not scale well. Data is \n" print "reduced to its 2 first channels.\n" data = data[:, 0:2] # Processing the options: windowSizeInSamples = nextpow2(np.round(options.windowSize * Fs)) hopsize = np.round(options.hopsize * Fs) #if hopsize != windowSizeInSamples/8: # #print "Overriding given hopsize to use 1/8th of window size" # #hopsize = windowSizeInSamples/8 # warnings.warn("Chosen hopsize: "+str(hopsize)+\ # ", while windowsize: "+str(windowSizeInSamples)) options.hopsizeInSamples = hopsize if options.fourierSize is None: NFT = windowSizeInSamples else: NFT = options.fourierSize # number of iterations for each parameter estimation step: niter = options.nbiter # number of spectral shapes for the accompaniment R = int(options.R) eps = 10**-9 if options.verbose: print "Some parameter settings:" print " Size of analysis windows: ", windowSizeInSamples print " Hopsize: ", hopsize print " Size of Fourier transforms: ", NFT print " Number of iterations to be done: ", niter print " Number of elements in WM: ", R if is_stereo: XR, F, N = stft(data[:, 0], fs=Fs, hopsize=hopsize, window=sinebell(windowSizeInSamples), nfft=NFT) XL, F, N = stft(data[:, 1], fs=Fs, hopsize=hopsize, window=sinebell(windowSizeInSamples), nfft=NFT) # SX is the power spectrogram: ## SXR = np.maximum(np.abs(XR) ** 2, 10 ** -8) ## SXL = np.maximum(np.abs(XL) ** 2, 10 ** -8) #SXR = np.abs(XR) ** 2 #SXL = np.abs(XL) ** 2 SX = np.maximum((0.5 * np.abs(XR + XL))**2, eps) else: # data is mono X, F, N = stft(data, fs=Fs, hopsize=hopsize, window=sinebell(windowSizeInSamples), nfft=NFT) SX = np.maximum(np.abs(X)**2, eps) del data, F, N # minimum and maximum F0 in glottal source spectra dictionary minF0 = 
options.minF0 maxF0 = options.maxF0 F, N = SX.shape stepNotes = options.stepNotes # this is the number of F0s within one semitone K = int( options.K_numFilters) # number of spectral shapes for the filter part P = int(options.P_numAtomFilters ) # number of elements in dictionary of smooth filters chirpPerF0 = 1 # number of chirped spectral shapes between each F0 # this feature should be further studied before # we find a good way of doing that. # Create the harmonic combs, for each F0 between minF0 and maxF0: F0Table, WF0 = \ generate_WF0_chirped(minF0, maxF0, Fs, Nfft=NFT, \ stepNotes=stepNotes, \ lengthWindow=windowSizeInSamples, Ot=0.25, \ perF0=chirpPerF0, \ depthChirpInSemiTone=.15, loadWF0=True,\ analysisWindow='sinebell') WF0 = WF0[0:F, :] # ensure same size as SX NF0 = F0Table.size # number of harmonic combs # Normalization: WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0)) # Create the dictionary of smooth filters, for the filter part of # the lead isntrument: WGAMMA = generateHannBasis(F, NFT, Fs=Fs, frequencyScale='linear', \ numberOfBasis=P, overlap=.75) if options.sal_output_file is None or not os.path.exists( options.sal_output_file): if displayEvolution: plt.figure(1) plt.clf() plt.xticks(fontsize=16) plt.yticks(fontsize=16) plt.xlabel(r'Frame number $n$', fontsize=16) plt.ylabel(r'Leading source number $u$', fontsize=16) plt.ion() # plt.show() ## the following seems superfluous if mpl's backend is macosx... ## raw_input("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"\ ## "!! Press Return to resume the program. !!\n"\ ## "!! Be sure that the figure has been !!\n"\ ## "!! already displayed, so that the !!\n"\ ## "!! evolution of HF0 will be visible. 
!!\n"\ ## "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") ## section to estimate the melody, on monophonic algo: # First round of parameter estimation: if (HPS): from scipy.signal import medfilt if (is_stereo & stereoEstimation): SXR = np.maximum(np.abs(XR)**2, eps) SXL = np.maximum(np.abs(XL)**2, eps) if (HPS): SXR = medfilt(SXR, 3) SXL = medfilt(SXL, 3) alphaR, alphaL, HGAMMA, HPHI, HF0, betaR, betaL, HM, WM, recoError1 = SIMM.Stereo_SIMM( # the data to be fitted to: SXR, SXL, # the basis matrices for the spectral combs WF0, # and for the elementary filters: WGAMMA, # number of desired filters, accompaniment spectra: numberOfFilters=K, numberOfAccompanimentSpectralShapes=R, # putting only 2 elements in accompaniment for a start... # if any, initial amplitude matrices for HGAMMA0=None, HPHI0=None, HF00=None, WM0=None, HM0=None, # Some more optional arguments, to control the "convergence" # of the algo numberOfIterations=niter, updateRulePower=1., stepNotes=stepNotes, lambdaHF0=0.0 / (1.0 * SX.max()), alphaHF0=0.9, verbose=options.verbose, displayEvolution=displayEvolution) else: if (HPS): SX = medfilt(SX, 3) HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM( # the data to be fitted to: SX, # the basis matrices for the spectral combs WF0, # and for the elementary filters: WGAMMA, # number of desired filters, accompaniment spectra: numberOfFilters=K, numberOfAccompanimentSpectralShapes=R, # putting only 2 elements in accompaniment for a start... 
# if any, initial amplitude matrices for HGAMMA0=None, HPHI0=None, HF00=None, WM0=None, HM0=None, # Some more optional arguments, to control the "convergence" # of the algo numberOfIterations=niter, updateRulePower=1., stepNotes=stepNotes, lambdaHF0=0.0 / (1.0 * SX.max()), alphaHF0=0.9, verbose=options.verbose, displayEvolution=displayEvolution) if displayEvolution: h2 = plt.figure(2) plt.clf() imageMatlab.imageM(20 * np.log10(HF0)) matMax = (20 * np.log10(HF0)).max() matMed = np.median(20 * np.log10(HF0)) plt.clim([matMed - 100, matMax]) else: print "Loading Salience from file to calculate Melody: " + options.sal_output_file loaded = np.loadtxt(options.sal_output_file).T times = [loaded[0, :]] HF0 = loaded[1:, :] # If vit_pitch_output_file is not null, do melody extraction with Viterbi if not (options.vit_pitch_output_file is None): print "Viterbi decoding" # Viterbi decoding to estimate the predominant fundamental # frequency line # create transition probability matrix - adhoc parameter 'scale' # TODO: use "learned" parameter scale (NB: after many trials, # provided scale and parameterization seems robust) scale = 1.0 transitions = np.exp(-np.floor(np.arange(0, NF0) / stepNotes) * scale) cutoffnote = 2 * 5 * stepNotes transitions[cutoffnote:] = transitions[cutoffnote - 1] transitionMatrixF0 = np.zeros([NF0 + 1, NF0 + 1]) # toeplitz matrix b = np.arange(NF0) transitionMatrixF0[0:NF0, 0:NF0] = \ transitions[\ np.array(np.abs(np.outer(np.ones(NF0), b) \ - np.outer(b, np.ones(NF0))), dtype=int)] pf_0 = transitions[cutoffnote - 1] * 10**(-90) p0_0 = transitions[cutoffnote - 1] * 10**(-100) p0_f = transitions[cutoffnote - 1] * 10**(-80) transitionMatrixF0[0:NF0, NF0] = pf_0 transitionMatrixF0[NF0, 0:NF0] = p0_f transitionMatrixF0[NF0, NF0] = p0_0 sumTransitionMatrixF0 = np.sum(transitionMatrixF0, axis=1) transitionMatrixF0 = transitionMatrixF0 \ / np.outer(sumTransitionMatrixF0, \ np.ones(NF0 + 1)) # prior probabilities, and setting the array for Viterbi tracking: 
priorProbabilities = 1 / (NF0 + 1.0) * np.ones([NF0 + 1]) logHF0 = np.zeros([NF0 + 1, N]) normHF0 = np.amax(HF0, axis=0) barHF0 = np.array(HF0) logHF0[0:NF0, :] = np.log(barHF0) logHF0[0:NF0, normHF0 == 0] = np.amin(logHF0[logHF0 > -np.Inf]) logHF0[NF0, :] = np.maximum(np.amin(logHF0[logHF0 > -np.Inf]), -100) indexBestPath = viterbiTrackingArray(\ logHF0, np.log(priorProbabilities), np.log(transitionMatrixF0), verbose=options.verbose) if displayEvolution: h2.hold(True) plt.plot(indexBestPath, '-b') h2.hold(False) plt.axis('tight') del logHF0 # detection of silences: # computing the melody restricted F0 amplitude matrix HF00 # (which will be used as initial HF0 for further algo): HF00 = np.zeros([NF0 * chirpPerF0, N]) scopeAllowedHF0 = 2.0 / 1.0 # computing indices for and around the melody indices, # dim1index are indices along axis 0, and dim2index along axis 1 # of HF0: # TODO: use numpy broadcasting to make this "clearer" (if possible...) dim1index = np.array(\ np.maximum(\ np.minimum(\ np.outer(chirpPerF0 * indexBestPath, np.ones(chirpPerF0 \ * (2 \ * int(np.floor(stepNotes / scopeAllowedHF0)) \ + 1))) \ + np.outer(np.ones(N), np.arange(-chirpPerF0 \ * int(np.floor(stepNotes / scopeAllowedHF0)), chirpPerF0 \ * int((np.floor(stepNotes / scopeAllowedHF0))) \ + 1)), chirpPerF0 * NF0 - 1), 0), dtype=int).reshape(1, N * chirpPerF0 \ * (2 * int(np.floor(stepNotes / scopeAllowedHF0)) \ + 1)) dim2index = np.outer(np.arange(N), np.ones(chirpPerF0 \ * (2 * int(np.floor(stepNotes \ / scopeAllowedHF0)) + 1), \ dtype=int)\ ).reshape(1, N * chirpPerF0 \ * (2 * int(np.floor(stepNotes \ / scopeAllowedHF0)) \ + 1)) HF00[dim1index, dim2index] = HF0[dim1index, dim2index] # HF0.max() HF00[:, indexBestPath == (NF0 - 1)] = 0.0 HF00[:, indexBestPath == 0] = 0.0 # remove frames with less than (100 thres_energy) % of total energy. 
thres_energy = 0.000584 SF0 = np.maximum(np.dot(WF0, HF00), eps) SPHI = np.maximum(np.dot(WGAMMA, np.dot(HGAMMA, HPHI)), eps) SM = np.maximum(np.dot(WM, HM), eps) hatSX = np.maximum(SPHI * SF0 + SM, eps) energyMel = np.sum((((SPHI * SF0) / hatSX)**2) * SX, axis=0) energyMelSorted = np.sort(energyMel) energyMelCumul = np.cumsum(energyMelSorted) energyMelCumulNorm = energyMelCumul / max(energyMelCumul[-1], eps) # normalized to the maximum of energy: # expressed in 0.01 times the percentage ind_999 = np.nonzero(energyMelCumulNorm > thres_energy)[0][0] if ind_999 is None: ind_999 = N if not os.path.isdir(os.path.dirname((options.vit_pitch_output_file))): os.mkdir(os.path.dirname((options.vit_pitch_output_file))) np.savetxt(options.vit_pitch_output_file + '.egy', np.array( [np.arange(N) * hopsize / np.double(Fs), energyMel]).T, fmt='%10.5f') # energyMel <= energyMelCumul[ind_999]? melNotPresent = (energyMel <= energyMelCumulNorm[ind_999]) # edit: frames predicted as unvoiced will be given negative values # indexBestPath[melNotPresent] = 0 freqMelody = F0Table[np.array(np.minimum(indexBestPath, len(F0Table) - 1), dtype=int)] freqMelody[melNotPresent] = -freqMelody[melNotPresent] if not os.path.exists(os.path.dirname(options.vit_pitch_output_file)): os.makedirs(os.path.dirname(options.vit_pitch_output_file)) np.savetxt(options.vit_pitch_output_file, np.array( [np.arange(N) * hopsize / np.double(Fs), freqMelody]).T, fmt='%10.7f') times = np.array([np.arange(N) * hopsize / np.double(Fs)]) # Save salience file: if not (options.sal_output_file is None): if not os.path.exists(os.path.dirname(options.sal_output_file)): os.makedirs(os.path.dirname(options.sal_output_file)) np.savetxt(options.sal_output_file, np.concatenate((times, HF0), axis=0).T, fmt='%10.6f') # saveSPHI (timbre related) saveSPHI = 0 if saveSPHI: if not os.path.exists( os.path.dirname(options.sal_output_file + '.SPHI')): os.makedirs(os.path.dirname(options.sal_output_file)) WPHI = np.dot(WGAMMA, HGAMMA) SPHI = 
np.dot(WPHI, HPHI) np.savetxt(options.sal_output_file + '.SPHI', np.concatenate((times, SPHI), axis=0).T, fmt='%10.4f') #np.savetxt(options.sal_output_file+'.WGAMMA',np.concatenate((times,WGAMMA),axis=0).T,fmt='%10.4f') # return times[0],freqMelody,HF0 print "Done!" return times[0], HF0, options
def _say(*items):
    """Print ``items`` separated by single spaces.

    Produces the same text as a Python 2 ``print a, b, c`` statement, but is
    written as a one-argument ``print(...)`` call so that this block parses
    under both Python 2 and Python 3.
    """
    print(" ".join(str(item) for item in items))


def _melody_band_indices(indexBestPath, numFrames, NF0, chirpPerF0, halfBand):
    """Return the (row, column) indices of the HF0 band around a melody path.

    For frame ``n`` the selected rows are ``chirpPerF0 * indexBestPath[n]``
    plus every offset in ``[-chirpPerF0 * halfBand, chirpPerF0 * (halfBand + 1))``,
    clipped to ``[0, chirpPerF0 * NF0 - 1]``.

    ``halfBand`` must be a Python ``int``: it is used to build array sizes,
    and NumPy refuses float sizes.

    Returns two integer arrays of shape
    ``(numFrames, chirpPerF0 * (2 * halfBand + 1))``: the row indices and the
    matching column (frame) indices.
    """
    bandWidth = chirpPerF0 * (2 * halfBand + 1)
    offsets = np.arange(-chirpPerF0 * halfBand, chirpPerF0 * (halfBand + 1))
    rows = np.outer(chirpPerF0 * indexBestPath, np.ones(bandWidth)) + np.outer(
        np.ones(numFrames), offsets
    )
    rows = np.array(np.maximum(np.minimum(rows, chirpPerF0 * NF0 - 1), 0), dtype=int)
    cols = np.outer(np.arange(numFrames), np.ones(bandWidth, dtype=int))
    return rows, cols


def _write_stereo_estimates(
    leadFile, accFile, Fs, resynthesize, XR, XL, alphaR, alphaL, betaR, betaL, SF0, SPHI, WM, HM
):
    """Filter both channels into lead / accompaniment and write two WAV files.

    ``resynthesize`` maps a masked STFT to a sample array ready for
    ``wav.write``. The masks are the ratios of the lead (resp. accompaniment)
    power model to the full model of each channel.
    """
    hatSXR = (alphaR ** 2) * SF0 * SPHI + np.dot(np.dot(WM, betaR ** 2), HM)
    hatSXL = (alphaL ** 2) * SF0 * SPHI + np.dot(np.dot(WM, betaL ** 2), HM)

    vestR = resynthesize((alphaR ** 2) * SPHI * SF0 / hatSXR * XR)
    vestL = resynthesize((alphaL ** 2) * SPHI * SF0 / hatSXL * XL)
    wav.write(leadFile, Fs, np.array([vestR, vestL]).T)

    mestR = resynthesize((np.dot(np.dot(WM, betaR ** 2), HM)) / hatSXR * XR)
    mestL = resynthesize((np.dot(np.dot(WM, betaL ** 2), HM)) / hatSXL * XL)
    wav.write(accFile, Fs, np.array([mestR, mestL]).T)


def _write_mono_estimates(leadFile, accFile, Fs, resynthesize, X, SF0, SPHI, SM):
    """Filter the mono STFT ``X`` into lead / accompaniment and write two WAV files."""
    hatSX = SF0 * SPHI + SM
    vest = resynthesize(SPHI * SF0 / hatSX * X)
    wav.write(leadFile, Fs, vest)
    mest = resynthesize(SM / hatSX * X)
    wav.write(accFile, Fs, mest)


def main():
    """Command-line entry point: estimate the melody of a WAV file and,
    unless ``--dontseparate`` is given, separate lead and accompaniment.

    Steps, in order:

    1. parse the command line (exactly one positional argument, the input
       ``.wav`` file) and derive default output names from the input name;
    2. read the audio, rescale it and compute its STFT (two channels when
       the file is stereo, one otherwise);
    3. build the source dictionary ``WF0`` and the smooth-filter dictionary
       ``WGAMMA``;
    4. obtain the melody path, either from a first ``SIMM.SIMM`` estimation
       followed by Viterbi tracking, or from the file given with
       ``--with-melody``;
    5. write the pitch file: one line per frame with the time in seconds and
       the F0 taken from ``F0Table``, negated for frames whose path index
       is 0;
    6. if separation is requested, re-estimate the model with ``HF0``
       initialised around the melody path and write the lead/accompaniment
       files, then re-estimate with one extra all-ones source element and
       write the ``*_VUIMM.wav`` files.

    Raises:
        ValueError: if the input file name does not end with ``.wav``, or if
            the provided melody file loads as a one-dimensional array.
    """
    import optparse

    usage = "usage: %prog [options] inputAudioFile"
    parser = optparse.OptionParser(usage)
    # Name of the output files:
    parser.add_option(
        "-v",
        "--vocal-output-file",
        dest="voc_output_file",
        type="string",
        help="name of the audio output file for the estimated\n"
        "solo (vocal) part. \n"
        "If None, appends _lead to inputAudioFile.",
        default=None,
    )
    parser.add_option(
        "-m",
        "--music-output-file",
        dest="mus_output_file",
        type="string",
        help="name of the audio output file for the estimated\n"
        "music part.\n"
        "If None, appends _acc to inputAudioFile.",
        default=None,
    )
    parser.add_option(
        "-p",
        "--pitch-output-file",
        dest="pitch_output_file",
        type="string",
        help="name of the output file for the estimated pitches.\n"
        "If None, appends _pitches to inputAudioFile",
        default=None,
    )
    # Some more optional options:
    parser.add_option(
        "-d", "--with-display", dest="displayEvolution", action="store_true", help="display the figures", default=False
    )
    parser.add_option(
        "-q", "--quiet", dest="verbose", action="store_false", help="use to quiet all output verbose", default=True
    )
    parser.add_option(
        "-n",
        "--dontseparate",
        dest="separateSignals",
        action="store_false",
        help="Trigger this option if you only desire to " + "estimate the melody",
        default=True,
    )
    parser.add_option("--nb-iterations", dest="nbiter", help="number of iterations", type="int", default=30)
    parser.add_option(
        "--window-size", dest="windowSize", type="float", default=0.04644, help="size of analysis windows, in s."
    )
    parser.add_option(
        "--Fourier-size",
        dest="fourierSize",
        type="int",
        default=None,
        help="size of Fourier transforms, " "in samples.",
    )
    parser.add_option(
        "--hopsize",
        dest="hopsize",
        type="float",
        default=0.0058,
        help="size of the hop between analysis windows, in s.",
    )
    parser.add_option(
        "--nb-accElements", dest="R", type="float", default=40.0, help="number of elements for the accompaniment."
    )
    parser.add_option(
        "--with-melody",
        dest="melody",
        type="string",
        default=None,
        help="provide the melody in a file named MELODY, " "with at each line: <time (s)><F0 (Hz)>.",
    )
    parser.add_option(
        "--numAtomFilters",
        dest="P_numAtomFilters",
        type="int",
        default=30,
        help="Number of atomic filters - in WGAMMA.",
    )
    parser.add_option(
        "--numFilters",
        dest="K_numFilters",
        type="int",
        default=10,
        help="Number of filters for decomposition - in WPHI",
    )
    parser.add_option(
        "--min-F0-Freq", dest="minF0", type="float", default=100.0, help="Minimum of fundamental frequency F0."
    )
    parser.add_option(
        "--max-F0-Freq", dest="maxF0", type="float", default=800.0, help="Maximum of fundamental frequency F0."
    )
    parser.add_option(
        "--step-F0s", dest="stepNotes", type="int", default=20, help="Number of F0s in dictionary for each semitone."
    )

    (options, args) = parser.parse_args()
    if len(args) != 1:
        parser.error("incorrect number of arguments, use option -h for help.")

    displayEvolution = options.displayEvolution
    if displayEvolution:
        import matplotlib.pyplot as plt
        import imageMatlab

        plt.rc("image", cmap="jet")  # alternative colormap: gray_r
        plt.ion()

    # Compulsory option: name of the input file:
    inputAudioFile = args[0]
    if inputAudioFile[-4:] != ".wav":
        raise ValueError("File not WAV file? Only WAV format support, for now...")

    # Default output names are derived from the input name (extension dropped).
    if options.mus_output_file is None:
        options.mus_output_file = inputAudioFile[:-4] + "_acc.wav"
    if options.voc_output_file is None:
        options.voc_output_file = inputAudioFile[:-4] + "_lead.wav"
    if options.pitch_output_file is None:
        options.pitch_output_file = inputAudioFile[:-4] + "_pitches.txt"

    _say("Writing the different following output files:")
    _say(" separated lead in", options.voc_output_file)
    _say(" separated accompaniment in", options.mus_output_file)
    _say(" separated lead + unvoc in", options.voc_output_file[:-4] + "_VUIMM.wav")
    _say(" separated acc - unvoc in", options.mus_output_file[:-4] + "_VUIMM.wav")
    _say(" estimated pitches in", options.pitch_output_file)

    Fs, data = wav.read(inputAudioFile)
    scaleData = 1.2 * data.max()  # to rescale the data.
    dataType = data.dtype  # remembered so the outputs use the input sample type
    data = np.double(data) / scaleData  # divide by 1.2 * max so the peak is 1 / 1.2

    is_stereo = True
    if data.shape[0] == data.size:  # single-channel (mono) data
        _say("The audio file is not stereo. Making stereo out of mono.")
        _say("(You could also try the older separateLead.py...)")
        is_stereo = False
    if is_stereo and data.shape[1] != 2:
        _say("The data is multichannel, but not stereo... \n")
        _say("Unfortunately this program does not scale well. Data is \n")
        _say("reduced to its 2 first channels.\n")
        data = data[:, 0:2]

    # Processing the options:
    windowSizeInSamples = nextpow2(np.round(options.windowSize * Fs))
    hopsize = np.round(options.hopsize * Fs)
    if hopsize != windowSizeInSamples / 8:
        # Only warn: the hopsize chosen by the user is kept as is.
        warnings.warn("Chosen hopsize: " + str(hopsize) + ", while windowsize: " + str(windowSizeInSamples))

    if options.fourierSize is None:
        NFT = windowSizeInSamples
    else:
        NFT = options.fourierSize

    # number of iterations for each parameter estimation step:
    niter = options.nbiter
    # number of spectral shapes for the accompaniment
    R = options.R

    eps = 10 ** -9

    if options.verbose:
        _say("Some parameter settings:")
        _say(" Size of analysis windows: ", windowSizeInSamples)
        _say(" Hopsize: ", hopsize)
        _say(" Size of Fourier transforms: ", NFT)
        _say(" Number of iterations to be done: ", niter)
        _say(" Number of elements in WM: ", R)

    def resynthesize(maskedSTFT):
        """Inverse-STFT ``maskedSTFT`` and convert it back to the input sample type."""
        signal = istft(maskedSTFT, hopsize=hopsize, nfft=NFT, window=sinebell(windowSizeInSamples)) / 4.0
        return np.array(np.round(signal * scaleData), dtype=dataType)

    if is_stereo:
        XR, F, N = stft(data[:, 0], fs=Fs, hopsize=hopsize, window=sinebell(windowSizeInSamples), nfft=NFT)
        XL, F, N = stft(data[:, 1], fs=Fs, hopsize=hopsize, window=sinebell(windowSizeInSamples), nfft=NFT)
        # SX is the power spectrogram of the channel average:
        SX = np.maximum((0.5 * np.abs(XR + XL)) ** 2, eps)
    else:  # data is mono
        X, F, N = stft(data, fs=Fs, hopsize=hopsize, window=sinebell(windowSizeInSamples), nfft=NFT)
        SX = np.maximum(np.abs(X) ** 2, eps)

    del data, F, N

    # minimum and maximum F0 in glottal source spectra dictionary
    minF0 = options.minF0
    maxF0 = options.maxF0

    F, N = SX.shape
    stepNotes = options.stepNotes  # this is the number of F0s within one semitone
    K = options.K_numFilters  # number of spectral shapes for the filter part
    P = options.P_numAtomFilters  # number of elements in dictionary of smooth filters
    chirpPerF0 = 1  # number of chirped spectral shapes between each F0
    # this feature should be further studied before
    # we find a good way of doing that.

    # Half-width (in F0 indices, before the chirpPerF0 factor) of the band kept
    # around the melody path. Cast to int because it is used to build array
    # sizes, which NumPy refuses as floats.
    scopeAllowedHF0 = 2.0 / 1.0
    halfBand = int(np.floor(stepNotes / scopeAllowedHF0))

    # Keyword arguments shared by every SIMM estimation below.
    simmSettings = dict(
        numberOfFilters=K,
        numberOfAccompanimentSpectralShapes=R,
        numberOfIterations=niter,
        updateRulePower=1.0,
        stepNotes=stepNotes,
        alphaHF0=0.9,
        verbose=options.verbose,
        displayEvolution=displayEvolution,
    )

    # Create the harmonic combs, for each F0 between minF0 and maxF0:
    F0Table, WF0 = generate_WF0_chirped(
        minF0,
        maxF0,
        Fs,
        Nfft=NFT,
        stepNotes=stepNotes,
        lengthWindow=windowSizeInSamples,
        Ot=0.25,
        perF0=chirpPerF0,
        depthChirpInSemiTone=0.15,
        loadWF0=True,
        analysisWindow="sinebell",
    )
    WF0 = WF0[0:F, :]  # ensure same size as SX
    NF0 = F0Table.size  # number of harmonic combs
    # Normalization: each column is divided by its maximum
    WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0))

    # Create the dictionary of smooth filters, for the filter part of
    # the lead instrument:
    WGAMMA = generateHannBasis(F, NFT, Fs=Fs, frequencyScale="linear", numberOfBasis=P, overlap=0.75)

    if displayEvolution:
        plt.figure(1)
        plt.clf()
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r"Frame number $n$", fontsize=16)
        plt.ylabel(r"Leading source number $u$", fontsize=16)
        plt.ion()

    if options.melody is None:
        ## section to estimate the melody, on monophonic algo:
        # First round of parameter estimation (no initial matrices given):
        HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM(
            # the data to be fitted to:
            SX,
            # the basis matrices for the spectral combs
            WF0,
            # and for the elementary filters:
            WGAMMA,
            HGAMMA0=None,
            HPHI0=None,
            HF00=None,
            WM0=None,
            HM0=None,
            lambdaHF0=0.0 / (1.0 * SX.max()),
            **simmSettings
        )

        if displayEvolution:
            h2 = plt.figure(2)
            plt.clf()
            imageMatlab.imageM(20 * np.log10(HF0))
            matMax = (20 * np.log10(HF0)).max()
            matMed = np.median(20 * np.log10(HF0))
            plt.clim([matMed - 100, matMax])

        # Viterbi decoding to estimate the predominant fundamental
        # frequency line
        # create transition probability matrix - adhoc parameter 'scale'
        # TODO: use "learned" parameter scale (NB: after many trials,
        # provided scale and parameterization seems robust)
        scale = 1.0
        transitions = np.exp(-np.floor(np.arange(0, NF0) / stepNotes) * scale)
        # Jumps of 10 semitones or more all get the same probability. The
        # cutoff is capped at NF0 so that a dictionary narrower than 10
        # semitones does not index past the end of `transitions`.
        cutoffnote = min(2 * 5 * stepNotes, NF0)
        transitions[cutoffnote:] = transitions[cutoffnote - 1]

        # State NF0 is one extra state on top of the NF0 F0 states.
        transitionMatrixF0 = np.zeros([NF0 + 1, NF0 + 1])  # toeplitz matrix
        b = np.arange(NF0)
        transitionMatrixF0[0:NF0, 0:NF0] = transitions[
            np.array(np.abs(np.outer(np.ones(NF0), b) - np.outer(b, np.ones(NF0))), dtype=int)
        ]
        pf_0 = transitions[cutoffnote - 1] * 10 ** (-90)
        p0_0 = transitions[cutoffnote - 1] * 10 ** (-100)
        p0_f = transitions[cutoffnote - 1] * 10 ** (-80)
        transitionMatrixF0[0:NF0, NF0] = pf_0
        transitionMatrixF0[NF0, 0:NF0] = p0_f
        transitionMatrixF0[NF0, NF0] = p0_0

        # Normalize each row so that it sums to one.
        sumTransitionMatrixF0 = np.sum(transitionMatrixF0, axis=1)
        transitionMatrixF0 = transitionMatrixF0 / np.outer(sumTransitionMatrixF0, np.ones(NF0 + 1))

        # prior probabilities, and setting the array for Viterbi tracking:
        priorProbabilities = 1 / (NF0 + 1.0) * np.ones([NF0 + 1])
        logHF0 = np.zeros([NF0 + 1, N])
        normHF0 = np.amax(HF0, axis=0)
        barHF0 = np.array(HF0)
        logHF0[0:NF0, :] = np.log(barHF0)
        # np.inf (lowercase) is used: the np.Inf alias no longer exists in NumPy 2.
        logHF0[0:NF0, normHF0 == 0] = np.amin(logHF0[logHF0 > -np.inf])
        logHF0[NF0, :] = np.maximum(np.amin(logHF0[logHF0 > -np.inf]), -100)

        indexBestPath = viterbiTrackingArray(
            logHF0, np.log(priorProbabilities), np.log(transitionMatrixF0), verbose=options.verbose
        )

        if displayEvolution:
            # Figure.hold only exists on old matplotlib releases; newer ones
            # always overlay, so the calls are skipped when it is missing.
            if hasattr(h2, "hold"):
                h2.hold(True)
            plt.plot(indexBestPath, "-b")
            if hasattr(h2, "hold"):
                h2.hold(False)
            plt.axis("tight")

        del logHF0

        # detection of silences:
        # computing the melody restricted F0 amplitude matrix HF00: a copy of
        # HF0 restricted to the band around the tracked path.
        HF00 = np.zeros([NF0 * chirpPerF0, N])
        rows, cols = _melody_band_indices(indexBestPath, N, NF0, chirpPerF0, halfBand)
        HF00[rows, cols] = HF0[rows, cols]
        HF00[:, indexBestPath == (NF0 - 1)] = 0.0
        HF00[:, indexBestPath == 0] = 0.0

        # remove frames with less than (100 thres_energy) % of total energy.
        thres_energy = 0.000584
        SF0 = np.maximum(np.dot(WF0, HF00), eps)
        SPHI = np.maximum(np.dot(WGAMMA, np.dot(HGAMMA, HPHI)), eps)
        SM = np.maximum(np.dot(WM, HM), eps)
        hatSX = np.maximum(SPHI * SF0 + SM, eps)

        energyMel = np.sum((((SPHI * SF0) / hatSX) ** 2) * SX, axis=0)
        energyMelSorted = np.sort(energyMel)
        energyMelCumul = np.cumsum(energyMelSorted)
        energyMelCumulNorm = energyMelCumul / max(energyMelCumul[-1], eps)
        # normalized to the maximum of energy:
        # expressed in 0.01 times the percentage
        aboveThreshold = np.nonzero(energyMelCumulNorm > thres_energy)[0]
        if aboveThreshold.size > 0:
            melNotPresent = energyMel <= energyMelCumulNorm[aboveThreshold[0]]
        else:
            # The normalized cumulative energy never exceeds the threshold
            # (e.g. near-silent input): flag every frame as melody-free
            # instead of failing on an empty index array.
            melNotPresent = np.ones(N, dtype=bool)
        indexBestPath[melNotPresent] = 0
    else:
        ## take the provided melody line:
        # load melody from file:
        melodyFromFile = np.loadtxt(options.melody)
        sizeProvidedMel = melodyFromFile.shape
        if len(sizeProvidedMel) == 1:
            _say("The melody should be provided as <Time (s)><F0 (Hz)>.")
            raise ValueError("Bad melody format")
        melTimeStamps = melodyFromFile[:, 0]
        melFreqHz = melodyFromFile[:, 1]
        if minF0 > melFreqHz[melFreqHz > 40.0].min() or maxF0 < melFreqHz.max():
            # The provided melody does not fit in [minF0, maxF0]: widen the
            # range and rebuild the source dictionary accordingly.
            minF0 = melFreqHz[melFreqHz > 40.0].min() * 0.97
            maxF0 = np.maximum(melFreqHz.max() * 1.03, 2 * minF0 * 1.03)
            _say("Recomputing the source basis for ")
            _say("minF0 = ", minF0, "Hz and maxF0 = ", maxF0, "Hz.")
            # Create the harmonic combs, for each F0 between minF0 and maxF0:
            F0Table, WF0 = generate_WF0_chirped(
                minF0,
                maxF0,
                Fs,
                Nfft=NFT,
                stepNotes=stepNotes,
                lengthWindow=windowSizeInSamples,
                Ot=0.25,
                perF0=chirpPerF0,
                depthChirpInSemiTone=0.15,
            )
            WF0 = WF0[0:F, :]  # ensure same size as SX
            NF0 = F0Table.size  # number of harmonic combs
            # Normalization:
            WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0))

        # For each analysis frame, pick the melody entry closest in time,
        # then the dictionary F0 closest to that entry's frequency.
        sigTimeStamps = np.arange(N) * hopsize / np.double(Fs)
        distMatTimeStamps = np.abs(
            np.outer(np.ones(sizeProvidedMel[0]), sigTimeStamps) - np.outer(melTimeStamps, np.ones(N))
        )
        minDistTimeStamps = distMatTimeStamps.argmin(axis=0)
        f0BestPath = melFreqHz[minDistTimeStamps]
        distMatF0 = np.abs(np.outer(np.ones(NF0), f0BestPath) - np.outer(F0Table, np.ones(N)))
        indexBestPath = distMatF0.argmin(axis=0)
        # setting silences to 0, with tolerance = 1/2 window length
        indexBestPath[distMatTimeStamps[minDistTimeStamps, range(N)] >= 0.5 * options.windowSize] = 0
        indexBestPath[f0BestPath <= 0] = 0

    # The Viterbi path may contain the extra state NF0, which has no entry in
    # F0Table: clamp the lookup to the last valid F0 index.
    freqMelody = F0Table[np.array(np.minimum(indexBestPath, NF0 - 1), dtype=int)]
    # Frames with path index 0 are written with a negative frequency.
    freqMelody[indexBestPath == 0] = -freqMelody[indexBestPath == 0]
    np.savetxt(options.pitch_output_file, np.array([np.arange(N) * hopsize / np.double(Fs), freqMelody]).T)

    # If separation is required:
    if options.separateSignals:
        # Second round of parameter estimation, with specific
        # initial HF00: ones in the band around the melody path, restricted
        # to the frames whose path index is not 0, zeros elsewhere.
        HF00 = np.zeros([NF0 * chirpPerF0, N])
        rows, cols = _melody_band_indices(indexBestPath, N, NF0, chirpPerF0, halfBand)
        melodyFrames = indexBestPath != 0
        HF00[rows[melodyFrames, :], cols[melodyFrames, :]] = 1
        HF00[:, indexBestPath == (NF0 - 1)] = 0.0
        HF00[:, indexBestPath == 0] = 0.0

        WF0effective = WF0
        HF00effective = HF00

        if options.melody is None:
            # Results of the first estimation round are no longer needed.
            del HF0, HGAMMA, HPHI, HM, WM, HF00

        if is_stereo:
            del SX
            SXR = np.maximum(np.abs(XR) ** 2, eps)
            SXL = np.maximum(np.abs(XL) ** 2, eps)

            alphaR, alphaL, HGAMMA, HPHI, HF0, betaR, betaL, HM, WM, recoError2 = SIMM.Stereo_SIMM(
                # the data to be fitted to:
                SXR,
                SXL,
                # the basis matrices for the spectral combs
                WF0effective,
                # and for the elementary filters:
                WGAMMA,
                HGAMMA0=None,
                HPHI0=None,
                HF00=HF00effective,
                WM0=None,
                HM0=None,
                lambdaHF0=0.0 / (1.0 * SXR.max()),
                **simmSettings
            )

            WPHI = np.dot(WGAMMA, HGAMMA)
            SPHI = np.dot(WPHI, HPHI)
            SF0 = np.dot(WF0effective, HF0)
            _write_stereo_estimates(
                options.voc_output_file,
                options.mus_output_file,
                Fs,
                resynthesize,
                XR,
                XL,
                alphaR,
                alphaL,
                betaR,
                betaL,
                SF0,
                SPHI,
                WM,
                HM,
            )
            del SPHI, SF0

            # adding the unvoiced part in the source basis: one extra
            # all-ones column in WF0 and the matching all-ones row in HF0.
            WUF0 = np.hstack([WF0, np.ones([WF0.shape[0], 1])])
            HUF0 = np.vstack([HF0, np.ones([1, HF0.shape[1]])])
            alphaR, alphaL, HGAMMA, HPHI, HF0, betaR, betaL, HM, WM, recoError3 = SIMM.Stereo_SIMM(
                # the data to be fitted to:
                SXR,
                SXL,
                # the basis matrices for the spectral combs
                WUF0,
                # and for the elementary filters:
                WGAMMA,
                # initial amplitude matrices, from the previous round:
                HGAMMA0=HGAMMA,
                HPHI0=HPHI,
                HF00=HUF0,
                WM0=None,
                HM0=None,
                lambdaHF0=0.0 / (1.0 * SXR.max()),
                updateHGAMMA=False,
                **simmSettings
            )

            WPHI = np.dot(WGAMMA, HGAMMA)
            SPHI = np.dot(WPHI, HPHI)
            SF0 = np.dot(WUF0, HF0)
            _write_stereo_estimates(
                options.voc_output_file[:-4] + "_VUIMM.wav",
                options.mus_output_file[:-4] + "_VUIMM.wav",
                Fs,
                resynthesize,
                XR,
                XL,
                alphaR,
                alphaL,
                betaR,
                betaL,
                SF0,
                SPHI,
                WM,
                HM,
            )
        else:
            # running on monophonic data:
            HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM(
                # the data to be fitted to:
                SX,
                # the basis matrices for the spectral combs
                WF0effective,
                # and for the elementary filters:
                WGAMMA,
                HGAMMA0=None,
                HPHI0=None,
                HF00=HF00effective,
                WM0=None,
                HM0=None,
                lambdaHF0=0.0 / (1.0 * SX.max()),
                **simmSettings
            )

            WPHI = np.dot(WGAMMA, HGAMMA)
            SPHI = np.dot(WPHI, HPHI)
            SF0 = np.dot(WF0effective, HF0)
            SM = np.dot(WM, HM)
            _write_mono_estimates(
                options.voc_output_file, options.mus_output_file, Fs, resynthesize, X, SF0, SPHI, SM
            )
            del SPHI, SF0

            # adding the unvoiced part in the source basis: one extra
            # all-ones column in WF0 and the matching all-ones row in HF0.
            WUF0 = np.hstack([WF0, np.ones([WF0.shape[0], 1])])
            HUF0 = np.vstack([HF0, np.ones([1, HF0.shape[1]])])
            HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM(
                # the data to be fitted to:
                SX,
                # the basis matrices for the spectral combs
                WUF0,
                # and for the elementary filters:
                WGAMMA,
                # initial amplitude matrices, from the previous round:
                HGAMMA0=HGAMMA,
                HPHI0=HPHI,
                HF00=HUF0,
                WM0=None,
                HM0=None,
                lambdaHF0=0.0 / (1.0 * SX.max()),
                updateHGAMMA=False,
                **simmSettings
            )

            WPHI = np.dot(WGAMMA, HGAMMA)
            SPHI = np.dot(WPHI, HPHI)
            SF0 = np.dot(WUF0, HF0)
            SM = np.dot(WM, HM)
            _write_mono_estimates(
                options.voc_output_file[:-4] + "_VUIMM.wav",
                options.mus_output_file[:-4] + "_VUIMM.wav",
                Fs,
                resynthesize,
                X,
                SF0,
                SPHI,
                SM,
            )

    if displayEvolution:
        plt.close("all")

    _say("Done!")
def runViterbi(self):
    """Track the melody line through ``self.SIMMParams['HF0']`` with Viterbi.

    Reads ``self.SIMMParams`` (keys ``HF0``, ``NF0``, ``stepNotes``,
    ``chirpPerF0``, ``WF0``, ``WGAMMA``, ``HGAMMA``, ``HPHI``, ``WM``,
    ``HM``, ``F0Table``), ``self.N``, ``self.scopeAllowedHF0``, ``self.XR``,
    ``self.XL``, ``self.files['pitch_output_file']``,
    ``self.stftParams['hopsize']`` and ``self.fs``.

    Side effects:
        * writes one line per frame to ``self.files['pitch_output_file']``:
          ``frame * hopsize / fs`` and the F0 looked up in ``F0Table``,
          negated for frames whose path index is 0;
        * sets ``self.indexBestPath`` and ``self.freqMelody``.

    Raises:
        AttributeError: if ``'HF0'`` is not a key of ``self.SIMMParams``.
    """
    if 'HF0' not in self.SIMMParams.keys():
        raise AttributeError("HF0 has probably not been estimated yet.")

    NF0 = self.SIMMParams['NF0']
    stepNotes = self.SIMMParams['stepNotes']
    chirpPerF0 = self.SIMMParams['chirpPerF0']

    # Viterbi decoding to estimate the predominant fundamental
    # frequency line
    scale = 1.0
    transitions = np.exp(-np.floor(np.arange(0, NF0) / stepNotes) * scale)
    # Jumps of 10 semitones or more all get the same probability. The cutoff
    # is capped at NF0 so that a dictionary narrower than 10 semitones does
    # not index past the end of `transitions`.
    cutoffnote = min(2 * 5 * stepNotes, NF0)
    transitions[cutoffnote:] = transitions[cutoffnote - 1]

    # State NF0 is one extra state on top of the NF0 F0 states.
    transitionMatrixF0 = np.zeros([NF0 + 1, NF0 + 1])  # toeplitz matrix
    b = np.arange(NF0)
    transitionMatrixF0[0:NF0, 0:NF0] = transitions[
        np.array(np.abs(np.outer(np.ones(NF0), b)
                        - np.outer(b, np.ones(NF0))), dtype=int)]
    pf_0 = transitions[cutoffnote - 1] * 10 ** (-90)
    p0_0 = transitions[cutoffnote - 1] * 10 ** (-100)
    p0_f = transitions[cutoffnote - 1] * 10 ** (-80)
    transitionMatrixF0[0:NF0, NF0] = pf_0
    transitionMatrixF0[NF0, 0:NF0] = p0_f
    transitionMatrixF0[NF0, NF0] = p0_0

    # Normalize each row so that it sums to one.
    sumTransitionMatrixF0 = np.sum(transitionMatrixF0, axis=1)
    transitionMatrixF0 = transitionMatrixF0 \
        / np.outer(sumTransitionMatrixF0, np.ones(NF0 + 1))

    priorProbabilities = 1 / (NF0 + 1.0) * np.ones([NF0 + 1])
    logHF0 = np.zeros([NF0 + 1, self.N])
    normHF0 = np.amax(self.SIMMParams['HF0'], axis=0)
    barHF0 = np.array(self.SIMMParams['HF0'])
    logHF0[0:NF0, :] = np.log(barHF0)
    # np.inf (lowercase) is used: the np.Inf alias no longer exists in NumPy 2.
    logHF0[0:NF0, normHF0 == 0] = np.amin(logHF0[logHF0 > -np.inf])
    logHF0[NF0, :] = np.maximum(np.amin(logHF0[logHF0 > -np.inf]), -100)

    print("Running Viterbi algorithm to track the melody, " +
          str(self.N) + " frames.")
    indexBestPath = viterbiTrackingArray(
        logHF0, np.log(priorProbabilities),
        np.log(transitionMatrixF0), verbose=False)
    print("Viterbi algorithm done...")

    # drawing this as a line is actually a bit confusing, on the image
    # TODO: think of a better representation (is contour good enough?)

    del logHF0

    # detection of silences:
    # HF00 is a copy of HF0 restricted to a band around the tracked path.
    HF00 = np.zeros([NF0 * chirpPerF0, self.N])

    scopeAllowedHF0 = self.scopeAllowedHF0
    # Cast to int: the value is used to build array sizes, which NumPy
    # refuses as floats.
    halfBand = int(np.floor(stepNotes / scopeAllowedHF0))
    bandWidth = chirpPerF0 * (2 * halfBand + 1)
    bandOffsets = np.arange(-chirpPerF0 * halfBand,
                            chirpPerF0 * (halfBand + 1))

    # dim1index: rows of HF0 around the path, clipped to the valid range;
    # dim2index: the matching frame (column) indices.
    dim1index = np.outer(chirpPerF0 * indexBestPath, np.ones(bandWidth)) \
        + np.outer(np.ones(self.N), bandOffsets)
    dim1index = np.array(
        np.maximum(np.minimum(dim1index, chirpPerF0 * NF0 - 1), 0),
        dtype=int).reshape(1, self.N * bandWidth)
    dim2index = np.outer(
        np.arange(self.N),
        np.ones(bandWidth, dtype=int)).reshape(1, self.N * bandWidth)

    HF00[dim1index, dim2index] = self.SIMMParams['HF0'][dim1index, dim2index]
    HF00[:, indexBestPath == (NF0 - 1)] = 0.0
    HF00[:, indexBestPath == 0] = 0.0

    # remove frames with less than (100 thres_energy) % of total energy.
    thres_energy = 0.000584
    SF0 = np.maximum(np.dot(self.SIMMParams['WF0'], HF00), eps)
    SPHI = np.maximum(
        np.dot(self.SIMMParams['WGAMMA'],
               np.dot(self.SIMMParams['HGAMMA'], self.SIMMParams['HPHI'])),
        eps)
    SM = np.maximum(np.dot(self.SIMMParams['WM'], self.SIMMParams['HM']), eps)
    hatSX = np.maximum(SPHI * SF0 + SM, eps)

    # Per-frame energy of the masked average of the two channel STFTs.
    energyMel = np.sum(
        np.abs((SPHI * SF0) / hatSX * (self.XR + self.XL) * 0.5) ** 2,
        axis=0)
    energyMelSorted = np.sort(energyMel)
    energyMelCumul = np.cumsum(energyMelSorted)
    energyMelCumulNorm = energyMelCumul / max(energyMelCumul[-1], eps)
    # normalized to the maximum of energy:
    # expressed in 0.01 times the percentage
    aboveThreshold = np.nonzero(energyMelCumulNorm > thres_energy)[0]
    if aboveThreshold.size > 0:
        melNotPresent = (energyMel <= energyMelCumulNorm[aboveThreshold[0]])
    else:
        # The normalized cumulative energy never exceeds the threshold
        # (e.g. near-silent input): flag every frame as melody-free instead
        # of failing on an empty index array.
        melNotPresent = np.ones(self.N, dtype=bool)
    indexBestPath[melNotPresent] = 0

    # The Viterbi path may contain the extra state NF0; clamp the lookup to
    # the last valid entry of F0Table.
    F0Table = self.SIMMParams['F0Table']
    freqMelody = F0Table[np.array(
        np.minimum(indexBestPath, len(F0Table) - 1), dtype=int)]
    # Frames with path index 0 are written with a negative frequency.
    freqMelody[indexBestPath == 0] = - freqMelody[indexBestPath == 0]

    np.savetxt(self.files['pitch_output_file'],
               np.array([np.arange(self.N) *
                         self.stftParams['hopsize'] / np.double(self.fs),
                         freqMelody]).T)

    self.indexBestPath = indexBestPath
    self.freqMelody = freqMelody
def main():
    """Command-line entry point: separate the lead from the accompaniment.

    Parses the command line (exactly one positional argument, the input
    audio file), reads that file with ``wav.read`` and averages its
    channels if it has several, then:

    1. computes the power spectrogram of the signal,
    2. runs a first ``SIMM.SIMM`` estimation without initial values,
    3. tracks a single path through the estimated ``HF0`` with
       ``viterbiTrackingArray`` and saves (time, F0) pairs to the pitch
       output file,
    4. runs a second ``SIMM.SIMM`` estimation whose ``HF00`` is non-zero
       only in a band around the tracked path,
    5. writes the estimated lead and accompaniment signals to the two
       audio output files.

    With ``--with-display`` the intermediate and final parameters are also
    plotted, and the program pauses on ``raw_input`` at several points.

    The function takes no argument and returns nothing; invalid command
    lines are reported through ``parser.error``.
    """
    import optparse

    usage = "usage: %prog [options] inputAudioFile"
    parser = optparse.OptionParser(usage)
    # Name of the output files:
    parser.add_option("-v", "--vocal-output-file",
                      dest="voc_output_file", type="string",
                      help="name of the audio output file for the estimated\n"
                           "solo (vocal) part",
                      default="estimated_solo.wav")
    parser.add_option("-m", "--music-output-file",
                      dest="mus_output_file", type="string",
                      help="name of the audio output file for the estimated\n"
                           "music part",
                      default="estimated_music.wav")
    parser.add_option("-p", "--pitch-output-file",
                      dest="pitch_output_file", type="string",
                      help="name of the output file for the estimated pitches",
                      default="pitches.txt")
    # Some more optional options:
    parser.add_option("-d", "--with-display", dest="displayEvolution",
                      action="store_true", help="display the figures",
                      default=False)
    parser.add_option("-q", "--quiet", dest="verbose",
                      action="store_false",
                      help="use to quiet all output verbose",
                      default=True)
    parser.add_option("--nb-iterations", dest="nbiter",
                      help="number of iterations", type="int",
                      default=50)
    parser.add_option("--window-size", dest="windowSize", type="float",
                      default=0.04644,
                      help="size of analysis windows, in s.")
    parser.add_option("--Fourier-size", dest="fourierSize", type="int",
                      default=2048,
                      help="size of Fourier transforms, in samples.")
    parser.add_option("--hopsize", dest="hopsize", type="float",
                      default=0.0058,
                      help="size of the hop between analysis windows, in s.")

    (options, args) = parser.parse_args()
    if len(args) != 1:
        parser.error("incorrect number of arguments, use option -h for help.")

    displayEvolution = options.displayEvolution
    if displayEvolution:
        import matplotlib.pyplot as plt
        import imageMatlab
        plt.rc('text', usetex=True)
        plt.rc('image', cmap='gray_r')
        plt.ion()

        frameLabel = r'Frame number $n$'
        freqBinLabel = r'Frequency bin number $f$'
        leadLabel = r'Leading source number $u$'
        elementLabel = r'Element number $r$'

        # The helpers below take the matrices as arguments instead of
        # closing over them: Python 2 refuses to ``del`` a local that is
        # referenced from a nested scope, and several of them are deleted
        # further down.
        def setTickFonts():
            """Use the common font size for the ticks of the current axes."""
            plt.xticks(fontsize=16)
            plt.yticks(fontsize=16)

        def plotFirstFilters(figureNumber, filterBasis, filterAmplitudes):
            """Plot, in dB, the first four columns of the filter dictionary."""
            plt.figure(figureNumber)
            plt.clf()
            for k in range(4):
                plt.subplot(2, 2, k + 1)
                plt.plot(db(np.dot(filterBasis, filterAmplitudes[:, k])))
                setTickFonts()
                plt.ylim([-30, 0])
                plt.axis("tight")

        def showImage(figureNumber, image, xLabel, yLabel, **imageOptions):
            """Display `image` in its own figure, with labels and colorbar."""
            plt.figure(figureNumber)
            plt.clf()
            imageMatlab.imageM(image, **imageOptions)
            setTickFonts()
            plt.xlabel(xLabel, fontsize=16)
            plt.ylabel(yLabel, fontsize=16)
            cb = plt.colorbar(fraction=0.04)
            plt.axes(cb.ax)
            setTickFonts()

    # Compulsory option: name of the input file:
    inputAudioFile = args[0]
    fs, data = wav.read(inputAudioFile)
    # data, fs, enc = scikits.audiolab.wavread(inputAudioFile)
    if data.shape[0] != data.size:  # data is multi-channel
        data = np.mean(data, axis=1)

    # Processing the options.  The two sizes are deliberately left as the
    # floating point values returned by np.round: they are handed to
    # helpers (stft, istft, sinebell) defined elsewhere in this file.
    windowSizeInSamples = np.round(options.windowSize * fs)
    hopsize = np.round(options.hopsize * fs)
    NFT = options.fourierSize
    niter = options.nbiter

    if options.verbose:
        # Single-argument print calls: same output as the former print
        # statements, and the same syntax is accepted by Python 2 and 3.
        print("Size of analysis windows:  %s \n" % (windowSizeInSamples,))
        print("Hopsize:  %s \n" % (hopsize,))
        print("Size of Fourier transforms:  %s \n" % (NFT,))
        print("Number of iterations to be done:  %s \n" % (niter,))

    X, F, N = stft(data, fs=fs, hopsize=hopsize,
                   window=sinebell(windowSizeInSamples), nfft=NFT)
    # SX is the power spectrogram:
    SX = np.maximum(np.abs(X) ** 2, 10 ** -8)

    del data, F, N

    # TODO: also process these as options:
    minF0 = 100
    maxF0 = 800
    Fs = fs
    F, N = SX.shape
    stepNotes = 20  # this is the number of F0s within one semitone
    K = 50  # number of spectral shapes for the filter part
    R = 40  # number of spectral shapes for the accompaniment
    P = 30  # number of elements in dictionary of smooth filters
    chirpPerF0 = 1  # number of chirped spectral shapes between each F0
                    # this feature should be further studied before
                    # we find a good way of doing that.

    # Create the harmonic combs, for each F0 between minF0 and maxF0:
    F0Table, WF0 = generate_WF0_chirped(minF0, maxF0, Fs, Nfft=2048,
                                        stepNotes=stepNotes,
                                        lengthWindow=2048, Ot=0.25,
                                        perF0=chirpPerF0,
                                        depthChirpInSemiTone=.15)
    WF0 = WF0[0:F, :]  # ensure same size as SX
    NF0 = F0Table.size  # number of harmonic combs
    # Normalization:
    WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0))

    # Create the dictionary of smooth filters, for the filter part of
    # the lead instrument:
    WGAMMA = generateHannBasis(F, 2048, Fs=fs, frequencyScale='linear',
                               numberOfBasis=P, overlap=.75)

    if displayEvolution:
        plt.figure(1)
        plt.clf()
        setTickFonts()
        plt.xlabel(frameLabel, fontsize=16)
        plt.ylabel(leadLabel, fontsize=16)
        plt.ion()
        # plt.show()
        raw_input("Press Return to resume the program. "
                  "\nBe sure that the figure has been already displayed, "
                  "so that the evolution of HF0 will be visible. ")

    # First round of parameter estimation:
    HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM(
        # the data to be fitted to:
        SX,
        # the basis matrices for the spectral combs
        WF0,
        # and for the elementary filters:
        WGAMMA,
        # number of desired filters, accompaniment spectra:
        numberOfFilters=K, numberOfAccompanimentSpectralShapes=R,
        # putting only 2 elements in accompaniment for a start...
        # if any, initial amplitude matrices for
        HGAMMA0=None, HPHI0=None,
        HF00=None,
        WM0=None, HM0=None,
        # Some more optional arguments, to control the "convergence"
        # of the algo
        numberOfIterations=niter, updateRulePower=1.,
        stepNotes=stepNotes,
        lambdaHF0=0.0 / (1.0 * SX.max()), alphaHF0=0.9,
        verbose=options.verbose, displayEvolution=displayEvolution)

    if displayEvolution:
        plotFirstFilters(3, WGAMMA, HGAMMA)
        showImage(4, db(np.dot(np.dot(WGAMMA, HGAMMA), HPHI)),
                  frameLabel, freqBinLabel)
        # plt.xlim([3199.5, 3500.5]) # For detailed picture of HF0
        showImage(5, db(HF0), frameLabel, leadLabel,
                  vmin=-100, cmap=plt.cm.gray_r)
        showImage(6, db(np.dot(WM, HM)), frameLabel, freqBinLabel,
                  vmin=-50)
        showImage(7, db(WM), elementLabel, freqBinLabel, vmin=-50)

        # Figure 2 later receives the tracked path on top of HF0.
        h2 = plt.figure(2)
        plt.clf()
        HF0dB = 20 * np.log10(HF0)
        imageMatlab.imageM(HF0dB)
        matMax = HF0dB.max()
        matMed = np.median(HF0dB)
        plt.clim([matMed - 100, matMax])

    # Viterbi decoding to estimate the predominant fundamental
    # frequency line
    scale = 1.0
    transitions = np.exp(-np.floor(np.arange(0, NF0) / stepNotes) * scale)
    cutoffnote = 2 * 5 * stepNotes
    transitions[cutoffnote:] = transitions[cutoffnote - 1]

    # NF0 pitch states plus one extra state (index NF0).
    transitionMatrixF0 = np.zeros([NF0 + 1, NF0 + 1])  # toeplitz matrix
    b = np.arange(NF0)
    # Entry (i, j) only depends on the distance |i - j| between the states.
    transitionMatrixF0[0:NF0, 0:NF0] = \
        transitions[np.abs(b[np.newaxis, :] - b[:, np.newaxis])]
    pf_0 = transitions[cutoffnote - 1] * 10 ** (-90)
    p0_0 = transitions[cutoffnote - 1] * 10 ** (-100)
    p0_f = transitions[cutoffnote - 1] * 10 ** (-80)
    transitionMatrixF0[0:NF0, NF0] = pf_0
    transitionMatrixF0[NF0, 0:NF0] = p0_f
    transitionMatrixF0[NF0, NF0] = p0_0

    # Each row is normalised so that it sums to one.
    sumTransitionMatrixF0 = np.sum(transitionMatrixF0, axis=1)
    transitionMatrixF0 = transitionMatrixF0 \
                         / sumTransitionMatrixF0[:, np.newaxis]

    priorProbabilities = 1 / (NF0 + 1.0) * np.ones([NF0 + 1])
    logHF0 = np.zeros([NF0 + 1, N])
    normHF0 = np.amax(HF0, axis=0)
    barHF0 = np.array(HF0)

    # np.inf rather than np.Inf: the capitalised alias no longer exists in
    # NumPy 2.
    logHF0[0:NF0, :] = np.log(barHF0)
    logHF0[0:NF0, normHF0 == 0] = np.amin(logHF0[logHF0 > -np.inf])
    logHF0[NF0, :] = np.maximum(np.amin(logHF0[logHF0 > -np.inf]), -100)

    indexBestPath = viterbiTrackingArray(
        logHF0, np.log(priorProbabilities), np.log(transitionMatrixF0))

    # Time stamps use the hop size actually used by the STFT (rounded to a
    # whole number of samples) instead of the requested value in seconds,
    # which would slowly drift away from the real frame positions.
    # NOTE(review): the tracker has NF0 + 1 states; should it ever return
    # the extra state NF0, this lookup in the NF0-entry F0Table raises
    # IndexError -- confirm against viterbiTrackingArray.
    np.savetxt(options.pitch_output_file,
               np.array([np.arange(N) * hopsize / np.double(fs),
                         F0Table[np.array(indexBestPath, dtype=int)]]).T)

    if displayEvolution:
        h2.hold(True)
        plt.plot(indexBestPath, '-b')
        h2.hold(False)
        plt.axis('tight')
        raw_input("Press Return to resume the program...")

    del logHF0

    # Second round of parameter estimation, with specific
    # initial HF00:
    HF00 = np.zeros([NF0 * chirpPerF0, N])

    scopeAllowedHF0 = 1.0 / 1.0

    # indexes for HF00: for every frame, a band of rows centred on the
    # tracked path.  The sizes are converted to int because NumPy rejects
    # floating point shapes.
    # TODO: reprogram this with a 'where'?...
    halfBand = int(np.floor(stepNotes / scopeAllowedHF0))
    bandWidth = chirpPerF0 * (2 * halfBand + 1)
    bandOffsets = np.arange(-chirpPerF0 * halfBand,
                            chirpPerF0 * (halfBand + 1))
    # Rows falling outside HF00 are clamped onto its first or last row.
    dim1index = np.maximum(
        np.minimum(
            np.outer(chirpPerF0 * indexBestPath, np.ones(bandWidth))
            + bandOffsets,
            chirpPerF0 * NF0 - 1),
        0).astype(int)
    dim2index = np.outer(np.arange(N), np.ones(bandWidth, dtype=int))
    HF00[dim1index, dim2index] = 1  # HF0.max()

    HF00[:, indexBestPath == (NF0 - 1)] = 0.0

    WF0effective = WF0
    HF00effective = HF00

    del HF0, HGAMMA, HPHI, HM, WM, HF00

    HGAMMA, HPHI, HF0, HM, WM, recoError2 = SIMM.SIMM(
        # the data to be fitted to:
        SX,
        # the basis matrices for the spectral combs
        WF0effective,
        # and for the elementary filters:
        WGAMMA,
        # number of desired filters, accompaniment spectra:
        numberOfFilters=K, numberOfAccompanimentSpectralShapes=R,
        # if any, initial amplitude matrices for
        HGAMMA0=None, HPHI0=None,
        HF00=HF00effective,
        WM0=None, HM0=None,
        # Some more optional arguments, to control the "convergence"
        # of the algo
        numberOfIterations=niter, updateRulePower=1.0,
        stepNotes=stepNotes,
        lambdaHF0=0.0 / (1.0 * SX.max()), alphaHF0=0.9,
        verbose=options.verbose, displayEvolution=displayEvolution)

    WPHI = np.dot(WGAMMA, HGAMMA)
    SPHI = np.dot(WPHI, HPHI)
    SF0 = np.dot(WF0effective, HF0)
    SM = np.dot(WM, HM)

    # Each output is the input STFT weighted by the share of the model
    # attributed to that source.
    hatSX = SPHI * SF0 + SM

    hatV = SPHI * SF0 / hatSX * X

    vest = istft(hatV, hopsize=hopsize, nfft=NFT,
                 window=sinebell(windowSizeInSamples)) / 4.0
    # scikits.audiolab.wavwrite(vest, options.voc_output_file, fs)
    wav.write(options.voc_output_file, fs, vest)

    hatM = SM / hatSX * X

    mest = istft(hatM, hopsize=hopsize, nfft=NFT,
                 window=sinebell(windowSizeInSamples)) / 4.0
    # scikits.audiolab.wavwrite(mest, options.mus_output_file, fs)
    wav.write(options.mus_output_file, fs, mest)

    if displayEvolution:
        plotFirstFilters(13, WGAMMA, HGAMMA)
        showImage(14, db(np.dot(np.dot(WGAMMA, HGAMMA), HPHI)),
                  frameLabel, freqBinLabel)

        SVhat = db(np.dot(np.dot(WGAMMA, HGAMMA), HPHI)) \
                + db(np.dot(WF0, HF0))
        showImage(141, SVhat, frameLabel, freqBinLabel,
                  vmax=SVhat.max(), vmin=SVhat.max() - 50)

        showImage(15, db(HF0), frameLabel, leadLabel,
                  vmin=-100, cmap=plt.cm.gray_r)
        # plt.xlim([3199.5, 3500.5]) # For detailed picture of HF0

        showImage(16, db(np.dot(WM, HM)), frameLabel, freqBinLabel,
                  vmin=np.maximum(-50, db(SM.min())))
        showImage(17, db(WM), elementLabel, freqBinLabel, vmin=-50)

        raw_input("Press Return to end the program...")
        print("Done!")
def runViterbi(self):
    """Track the melody through the estimated HF0 and save it to a file.

    Steps:

    1. build a transition matrix over ``NF0`` pitch states plus one extra
       state and run ``viterbiTrackingArray`` on the logarithm of
       ``self.SIMMParams['HF0']``;
    2. keep only the HF0 amplitudes in a band around the tracked path
       (band size controlled by ``self.scopeAllowedHF0``), compute the
       per-frame energy of the corresponding Wiener-like estimate of the
       mean of ``self.XR`` and ``self.XL``, and set the path to 0 on the
       frames whose energy is below a threshold;
    3. save (time in seconds, frequency) pairs to
       ``self.files['pitch_output_file']``; frames whose path index is 0
       get the negated frequency.

    Reads from ``self.SIMMParams``: 'HF0', 'NF0', 'stepNotes',
    'chirpPerF0', 'WF0', 'WGAMMA', 'HGAMMA', 'HPHI', 'WM', 'HM' and
    'F0Table'.  Also reads ``self.N``, ``self.fs`` and
    ``self.stftParams['hopsize']``.

    Sets ``self.indexBestPath`` and ``self.freqMelody``; returns nothing.

    Raises:
        AttributeError: if ``self.SIMMParams`` has no 'HF0' entry.
    """
    if 'HF0' not in self.SIMMParams:
        raise AttributeError("HF0 has probably not been estimated yet.")

    # Viterbi decoding to estimate the predominant fundamental
    # frequency line
    scale = 1.0
    NF0 = self.SIMMParams['NF0']
    stepNotes = self.SIMMParams['stepNotes']
    HF0 = self.SIMMParams['HF0']

    transitions = np.exp(-np.floor(np.arange(0, NF0) / stepNotes) * scale)
    cutoffnote = 2 * 5 * stepNotes
    transitions[cutoffnote:] = transitions[cutoffnote - 1]

    transitionMatrixF0 = np.zeros([NF0 + 1, NF0 + 1])  # toeplitz matrix
    b = np.arange(NF0)
    # Entry (i, j) only depends on the distance |i - j| between the states.
    transitionMatrixF0[0:NF0, 0:NF0] = \
        transitions[np.abs(b[np.newaxis, :] - b[:, np.newaxis])]
    pf_0 = transitions[cutoffnote - 1] * 10 ** (-90)
    p0_0 = transitions[cutoffnote - 1] * 10 ** (-100)
    p0_f = transitions[cutoffnote - 1] * 10 ** (-80)
    transitionMatrixF0[0:NF0, NF0] = pf_0
    transitionMatrixF0[NF0, 0:NF0] = p0_f
    transitionMatrixF0[NF0, NF0] = p0_0

    # Each row is normalised so that it sums to one.
    sumTransitionMatrixF0 = np.sum(transitionMatrixF0, axis=1)
    transitionMatrixF0 = transitionMatrixF0 \
                         / sumTransitionMatrixF0[:, np.newaxis]

    priorProbabilities = 1 / (NF0 + 1.0) * np.ones([NF0 + 1])
    logHF0 = np.zeros([NF0 + 1, self.N])
    normHF0 = np.amax(HF0, axis=0)
    barHF0 = np.array(HF0)

    # np.inf rather than np.Inf: the capitalised alias no longer exists in
    # NumPy 2.
    logHF0[0:NF0, :] = np.log(barHF0)
    logHF0[0:NF0, normHF0 == 0] = np.amin(logHF0[logHF0 > -np.inf])
    logHF0[NF0, :] = np.maximum(np.amin(logHF0[logHF0 > -np.inf]), -100)

    # Single-argument print calls: same output as the former print
    # statements, and the same syntax is accepted by Python 2 and 3.
    print("Running Viterbi algorithm to track the melody, " +
          str(self.N) + " frames.")
    indexBestPath = viterbiTrackingArray(
        logHF0, np.log(priorProbabilities),
        np.log(transitionMatrixF0), verbose=False)
    print("Viterbi algorithm done...")

    # drawing this as a line is actually a bit confusing, on the image
    # TODO: think of a better representation (is contour good enough?)
    ##if self.displayEvolution and not(self.imageCanvas is None):
    ##    self.imageCanvas.ax.plot(indexBestPath, '-b')
    ##    self.imageCanvas.ax.axis('tight')
    ##    self.imageCanvas.draw()

    del logHF0

    # detection of silences:
    chirpPerF0 = self.SIMMParams['chirpPerF0']
    HF00 = np.zeros([NF0 * chirpPerF0, self.N])

    scopeAllowedHF0 = self.scopeAllowedHF0  # 4.0 / 1.0 # 2.0 / 1.0

    # For every frame, a band of rows centred on the tracked path.  The
    # sizes are converted to int because NumPy rejects floating point
    # shapes.
    halfBand = int(np.floor(stepNotes / scopeAllowedHF0))
    bandWidth = int(chirpPerF0 * (2 * halfBand + 1))
    bandOffsets = np.arange(-chirpPerF0 * halfBand,
                            chirpPerF0 * (halfBand + 1))
    # Rows falling outside HF00 are clamped onto its first or last row.
    dim1index = np.maximum(
        np.minimum(
            np.outer(chirpPerF0 * indexBestPath, np.ones(bandWidth))
            + bandOffsets,
            chirpPerF0 * NF0 - 1),
        0).astype(int)
    dim2index = np.outer(np.arange(self.N), np.ones(bandWidth, dtype=int))
    HF00[dim1index, dim2index] = HF0[dim1index, dim2index]
    HF00[:, indexBestPath == (NF0 - 1)] = 0.0
    HF00[:, indexBestPath == 0] = 0.0

    thres_energy = 0.000584

    SF0 = np.maximum(np.dot(self.SIMMParams['WF0'], HF00), eps)
    SPHI = np.maximum(
        np.dot(self.SIMMParams['WGAMMA'],
               np.dot(self.SIMMParams['HGAMMA'],
                      self.SIMMParams['HPHI'])),
        eps)
    SM = np.maximum(np.dot(self.SIMMParams['WM'],
                           self.SIMMParams['HM']), eps)
    hatSX = np.maximum(SPHI * SF0 + SM, eps)

    # Energy, per frame, of the melody estimate.
    energyMel = np.sum(
        np.abs((SPHI * SF0) / hatSX * (self.XR + self.XL) * 0.5) ** 2,
        axis=0)
    energyMelSorted = np.sort(energyMel)
    energyMelCumul = np.cumsum(energyMelSorted)
    energyMelCumulNorm = energyMelCumul / max(energyMelCumul[-1], eps)
    # normalized to the maximum of energy:
    # expressed in 0.01 times the percentage
    aboveThreshold = np.nonzero(energyMelCumulNorm > thres_energy)[0]
    if aboveThreshold.size:
        ind_999 = aboveThreshold[0]
        # NOTE(review): raw frame energies are compared with a value of
        # the *normalised cumulative* energy; kept unchanged -- confirm
        # whether energyMelSorted[ind_999] was intended.
        melNotPresent = (energyMel <= energyMelCumulNorm[ind_999])
    else:
        # Nothing reaches the threshold (the total energy is negligible):
        # no frame is considered to contain the melody.  Indexing the
        # empty result used to raise IndexError here.
        melNotPresent = np.ones(energyMel.shape, dtype=bool)
    indexBestPath[melNotPresent] = 0

    freqMelody = self.SIMMParams['F0Table'][np.array(indexBestPath,
                                                     dtype=int)]
    # Frames without melody are written with a negated frequency.
    freqMelody[indexBestPath == 0] = -freqMelody[indexBestPath == 0]
    np.savetxt(self.files['pitch_output_file'],
               np.array([np.arange(self.N)
                         * self.stftParams['hopsize'] / np.double(self.fs),
                         freqMelody]).T)

    self.indexBestPath = indexBestPath
    self.freqMelody = freqMelody