def main():
    """Separate the lead (solo/vocal) part of a WAV file from its accompaniment.

    Command-line entry point. Options are parsed from ``sys.argv`` with
    ``optparse``; exactly one positional argument (the input audio file) is
    required, otherwise ``parser.error`` terminates the program.

    Processing steps:

    1. Read the audio, rescale it by ``1.2 * data.max()`` and compute one
       STFT per channel.  A single-channel file is duplicated onto both
       channels; files with more than two channels are cut to the first two.
    2. Obtain a melody line: either estimated on the mean of both channels
       (``SIMM.SIMM`` followed by ``viterbiTrackingArray``), or read from
       the file given with ``--with-melody``.
    3. Write time stamps and F0 values to the pitch output file; frames
       whose path index is 0 are written with a negated frequency.
    4. Run ``SIMM.Stereo_SIMM`` with the source activations restricted to a
       neighbourhood of the melody, and write the lead and accompaniment
       estimates.
    5. Run ``SIMM.Stereo_SIMM`` once more with one additional, flat source
       element appended to the source basis, and write a second pair of
       files whose names end in ``_VUIMM.wav``.

    The function relies on names that are expected at module level and are
    not defined here: ``np``, ``wav``, ``SIMM``, ``stft``, ``istft``,
    ``sinebell``, ``generate_WF0_chirped``, ``generateHannBasis`` and
    ``viterbiTrackingArray``.

    Raises:
        ValueError: if the melody file does not contain two columns.
    """
    import optparse
    import os

    usage = "usage: %prog [options] inputAudioFile"
    parser = optparse.OptionParser(usage)
    # Names of the output files:
    parser.add_option("-v", "--vocal-output-file",
                      dest="voc_output_file", type="string",
                      help="name of the audio output file for the estimated\n"
                           "solo (vocal) part",
                      default="estimated_solo.wav")
    parser.add_option("-m", "--music-output-file",
                      dest="mus_output_file", type="string",
                      help="name of the audio output file for the estimated\n"
                           "music part",
                      default="estimated_music.wav")
    parser.add_option("-p", "--pitch-output-file",
                      dest="pitch_output_file", type="string",
                      help="name of the output file for the estimated pitches",
                      default="pitches.txt")

    # Some more optional options:
    parser.add_option("-d", "--with-display", dest="displayEvolution",
                      action="store_true", help="display the figures",
                      default=False)
    parser.add_option("-q", "--quiet", dest="verbose",
                      action="store_false",
                      help="use to quiet all output verbose",
                      default=True)
    parser.add_option("--nb-iterations", dest="nbiter",
                      help="number of iterations", type="int",
                      default=100)
    parser.add_option("--window-size", dest="windowSize", type="float",
                      default=0.04644, help="size of analysis windows, in s.")
    parser.add_option("--Fourier-size", dest="fourierSize", type="int",
                      default=2048,
                      help="size of Fourier transforms, "
                           "in samples.")
    parser.add_option("--hopsize", dest="hopsize", type="float",
                      default=0.0058,
                      help="size of the hop between analysis windows, in s.")
    parser.add_option("--nb-accElements", dest="R", type="float",
                      default=40.0,
                      help="number of elements for the accompaniment.")

    parser.add_option("--with-melody", dest="melody", type="string",
                      default=None,
                      help="provide the melody in a file named MELODY, "
                           "with at each line: <time (s)><F0 (Hz)>.")

    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("incorrect number of arguments, use option -h for help.")

    displayEvolution = options.displayEvolution
    if displayEvolution:
        # Plotting modules are only imported when figures are requested.
        import matplotlib.pyplot as plt
        import imageMatlab

        plt.rc('image', cmap='jet')
        plt.ion()

    # Compulsory option: name of the input file:
    inputAudioFile = args[0]
    fs, data = wav.read(inputAudioFile)
    scaleData = 1.2 * data.max()  # to rescale the data.
    dataType = data.dtype  # remembered so the outputs use the same type
    data = np.double(data) / scaleData

    if data.ndim == 1 or data.shape[1] == 1:
        # Single channel: feed the same signal to both channels.
        mono = data.reshape(-1)
        data = np.column_stack((mono, mono))
    elif data.shape[1] != 2:
        print("The data is multichannel, but not stereo... \n")
        print("Unfortunately this program does not scale well. Data is \n")
        print("reduced to its 2 first channels.\n")
        data = data[:, 0:2]

    # Processing the options.  The sizes are deliberately left as the
    # floating-point values returned by np.round: they are handed to
    # helpers that are not visible from here.
    windowSizeInSamples = np.round(options.windowSize * fs)
    hopsize = np.round(options.hopsize * fs)
    NFT = options.fourierSize
    niter = options.nbiter
    R = options.R

    if options.verbose:
        print("Some parameter settings:")
        print("    Size of analysis windows:  %s" % (windowSizeInSamples,))
        print("    Hopsize:  %s" % (hopsize,))
        print("    Size of Fourier transforms:  %s" % (NFT,))
        print("    Number of iterations to be done:  %s" % (niter,))
        print("    Number of elements in WM:  %s" % (R,))

    XR, _, _ = stft(data[:, 0], fs=fs, hopsize=hopsize,
                    window=sinebell(windowSizeInSamples), nfft=NFT)
    XL, _, _ = stft(data[:, 1], fs=fs, hopsize=hopsize,
                    window=sinebell(windowSizeInSamples), nfft=NFT)
    # SXR / SXL are the power spectrograms of the two channels:
    SXR = np.abs(XR) ** 2
    SXL = np.abs(XL) ** 2

    del data

    # TODO: also process these as options:
    eps = 10 ** -9
    minF0 = 100
    maxF0 = 800
    F, N = SXR.shape
    stepNotes = 20  # this is the number of F0s within one semitone
    K = 10  # number of spectral shapes for the filter part
    P = 30  # number of elements in dictionary of smooth filters
    chirpPerF0 = 1  # number of chirped spectral shapes between each F0
    # Scope divisor: the allowed band around the melody spans
    # floor(stepNotes / scopeAllowedHF0) F0 steps on each side.
    scopeAllowedHF0 = 2.0 / 1.0

    def writeStereo(fileName, spectrumR, spectrumL):
        """Invert two spectrograms and write them as one 2-channel file."""
        channels = []
        for spectrum in (spectrumR, spectrumL):
            signal = istft(spectrum, hopsize=hopsize, nfft=NFT,
                           window=sinebell(windowSizeInSamples)) / 4.0
            channels.append(_to_sample_type(signal * scaleData, dataType))
        wav.write(fileName, fs, np.array(channels).T)

    def writeSeparation(WSource, alphaR, alphaL, HGAMMA, HPHI, HF0,
                        betaR, betaL, HM, WM, leadFile, musicFile):
        """Mask XR / XL with the fitted model and write lead and music."""
        SPHI = np.dot(np.dot(WGAMMA, HGAMMA), HPHI)
        SF0 = np.dot(WSource, HF0)
        accompanimentR = np.dot(np.dot(WM, betaR ** 2), HM)
        accompanimentL = np.dot(np.dot(WM, betaL ** 2), HM)
        hatSXR = (alphaR ** 2) * SF0 * SPHI + accompanimentR
        hatSXL = (alphaL ** 2) * SF0 * SPHI + accompanimentL
        writeStereo(leadFile,
                    (alphaR ** 2) * SPHI * SF0 / hatSXR * XR,
                    (alphaL ** 2) * SPHI * SF0 / hatSXL * XL)
        writeStereo(musicFile,
                    accompanimentR / hatSXR * XR,
                    accompanimentL / hatSXL * XL)

    # Create the harmonic combs, for each F0 between minF0 and maxF0:
    F0Table, WF0 = generate_WF0_chirped(minF0, maxF0, fs, Nfft=NFT,
                                        stepNotes=stepNotes,
                                        lengthWindow=windowSizeInSamples,
                                        Ot=0.25,
                                        perF0=chirpPerF0,
                                        depthChirpInSemiTone=.15,
                                        loadWF0=True,
                                        analysisWindow='sinebell')
    WF0 = WF0[0:F, :]  # ensure same size as SX
    NF0 = F0Table.size  # number of harmonic combs
    # Normalization: every comb is divided by its own maximum.
    WF0 = WF0 / np.amax(WF0, axis=0)

    # Create the dictionary of smooth filters, for the filter part of
    # the lead instrument:
    WGAMMA = generateHannBasis(F, NFT, Fs=fs, frequencyScale='linear',
                               numberOfBasis=P, overlap=.75)

    if displayEvolution:
        plt.figure(1)
        plt.clf()
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Leading source number $u$', fontsize=16)
        plt.ion()

    if options.melody is None:
        # Estimate the melody with the monophonic algorithm, on the mean
        # of the two channels.
        SX = np.maximum(np.abs((XR + XL) / 2.0) ** 2, 10 ** -8)
        # First round of parameter estimation:
        HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM(
            # the data to be fitted to:
            SX,
            # the basis matrices for the spectral combs
            WF0,
            # and for the elementary filters:
            WGAMMA,
            # number of desired filters, accompaniment spectra:
            numberOfFilters=K, numberOfAccompanimentSpectralShapes=R,
            # no initial amplitude matrices:
            HGAMMA0=None, HPHI0=None,
            HF00=None,
            WM0=None, HM0=None,
            # Some more optional arguments, to control the "convergence"
            # of the algo
            numberOfIterations=niter, updateRulePower=1.,
            stepNotes=stepNotes,
            lambdaHF0=0.0 / (1.0 * SX.max()), alphaHF0=0.9,
            verbose=options.verbose, displayEvolution=displayEvolution)

        if displayEvolution:
            plt.figure(2)
            plt.clf()
            imageMatlab.imageM(20 * np.log10(HF0))
            matMax = (20 * np.log10(HF0)).max()
            matMed = np.median(20 * np.log10(HF0))
            plt.clim([matMed - 100, matMax])

        # Viterbi decoding to estimate the predominant fundamental
        # frequency line.  State NF0 is one extra state appended after
        # the NF0 pitch states.
        scale = 1.0
        transitions = np.exp(-np.floor(np.arange(0, NF0) / stepNotes) * scale)
        cutoffnote = 2 * 5 * stepNotes
        transitions[cutoffnote:] = transitions[cutoffnote - 1]

        transitionMatrixF0 = np.zeros([NF0 + 1, NF0 + 1])  # toeplitz matrix
        b = np.arange(NF0)
        # Entry (i, j) only depends on the distance |i - j|.
        transitionMatrixF0[0:NF0, 0:NF0] = \
            transitions[np.abs(b[np.newaxis, :] - b[:, np.newaxis])]
        pf_0 = transitions[cutoffnote - 1] * 10 ** (-90)
        p0_0 = transitions[cutoffnote - 1] * 10 ** (-100)
        p0_f = transitions[cutoffnote - 1] * 10 ** (-80)
        transitionMatrixF0[0:NF0, NF0] = pf_0
        transitionMatrixF0[NF0, 0:NF0] = p0_f
        transitionMatrixF0[NF0, NF0] = p0_0

        # Normalise every row by its sum.
        sumTransitionMatrixF0 = np.sum(transitionMatrixF0, axis=1)
        transitionMatrixF0 = transitionMatrixF0 \
                             / sumTransitionMatrixF0[:, np.newaxis]

        priorProbabilities = 1 / (NF0 + 1.0) * np.ones([NF0 + 1])
        logHF0 = np.zeros([NF0 + 1, N])
        normHF0 = np.amax(HF0, axis=0)

        logHF0[0:NF0, :] = np.log(np.array(HF0))
        # Frames in which every activation is zero would be -inf
        # everywhere; give them the smallest finite value instead.
        logHF0[0:NF0, normHF0 == 0] = np.amin(logHF0[logHF0 > -np.inf])
        logHF0[NF0, :] = np.maximum(np.amin(logHF0[logHF0 > -np.inf]), -100)

        indexBestPath = viterbiTrackingArray(
            logHF0, np.log(priorProbabilities),
            np.log(transitionMatrixF0), verbose=options.verbose)

        if displayEvolution:
            # Drawn on top of the image of figure 2 (the current figure).
            plt.plot(indexBestPath, '-b')
            plt.axis('tight')

        del logHF0

        # Detection of silences: keep the estimated activations only in
        # a band around the decoded path.
        rows, cols = _f0_neighbourhood_indices(
            indexBestPath, N, NF0, chirpPerF0, stepNotes, scopeAllowedHF0)
        HF00 = np.zeros([NF0 * chirpPerF0, N])
        HF00[rows, cols] = HF0[rows, cols]
        HF00[:, indexBestPath == (NF0 - 1)] = 0.0
        HF00[:, indexBestPath == 0] = 0.0

        thres_energy = 0.000584
        SF0 = np.maximum(np.dot(WF0, HF00), eps)
        SPHI = np.maximum(np.dot(WGAMMA, np.dot(HGAMMA, HPHI)), eps)
        SM = np.maximum(np.dot(WM, HM), eps)
        hatSX = np.maximum(SPHI * SF0 + SM, eps)
        # Per-frame energy of the masked mean-channel spectrum.
        energyMel = np.sum(np.abs((SPHI * SF0) / hatSX *
                                  (XR + XL) * 0.5)
                           ** 2, axis=0)
        energyMelSorted = np.sort(energyMel)
        energyMelCumul = np.cumsum(energyMelSorted)
        energyMelCumulNorm = energyMelCumul / max(energyMelCumul[-1], eps)
        # First position at which the normalised cumulative energy
        # exceeds the threshold; the last position is used when none does
        # (this happens when the total energy is below eps).
        aboveThreshold = np.nonzero(energyMelCumulNorm > thres_energy)[0]
        if aboveThreshold.size:
            ind_999 = aboveThreshold[0]
        else:
            ind_999 = N - 1

        # NOTE(review): frame energies are compared with a value of the
        # *normalised cumulative* curve, as in the original code; confirm
        # that energyMelSorted[ind_999] was not the intended threshold.
        melNotPresent = (energyMel <= energyMelCumulNorm[ind_999])
        indexBestPath[melNotPresent] = 0

        # Release the first-round results before the stereo estimation.
        del HF0, HGAMMA, HPHI, HM, WM, HF00, SX, SF0, SPHI, SM, hatSX

    else:
        # Take the provided melody line: load melody from file.
        melodyFromFile = np.loadtxt(options.melody)
        sizeProvidedMel = melodyFromFile.shape
        if len(sizeProvidedMel) == 1:
            print("The melody should be provided as <Time (s)><F0 (Hz)>.")
            raise ValueError("Bad melody format")
        melTimeStamps = melodyFromFile[:, 0]
        melFreqHz = melodyFromFile[:, 1]
        if minF0 > melFreqHz[melFreqHz > 40.0].min() \
                or maxF0 < melFreqHz.max():
            # The default F0 range does not cover the given melody.
            minF0 = melFreqHz[melFreqHz > 40.0].min() * .97
            maxF0 = np.maximum(melFreqHz.max() * 1.03, 2 * minF0 * 1.03)
            print("Recomputing the source basis for ")
            print("minF0 =  %s Hz and maxF0 =  %s Hz." % (minF0, maxF0))
            # Create the harmonic combs, for each F0 between minF0 and
            # maxF0:
            F0Table, WF0 = generate_WF0_chirped(
                minF0, maxF0, fs, Nfft=NFT,
                stepNotes=stepNotes,
                lengthWindow=windowSizeInSamples,
                Ot=0.25,
                perF0=chirpPerF0,
                depthChirpInSemiTone=.15)
            WF0 = WF0[0:F, :]  # ensure same size as SX
            NF0 = F0Table.size  # number of harmonic combs
            # Normalization:
            WF0 = WF0 / np.amax(WF0, axis=0)

        # For every analysis frame, pick the melody entry closest in
        # time, then the F0 table entry closest in frequency.
        sigTimeStamps = np.arange(N) * hopsize / np.double(fs)
        distMatTimeStamps = np.abs(sigTimeStamps[np.newaxis, :] -
                                   melTimeStamps[:, np.newaxis])
        minDistTimeStamps = distMatTimeStamps.argmin(axis=0)
        f0BestPath = melFreqHz[minDistTimeStamps]
        distMatF0 = np.abs(f0BestPath[np.newaxis, :] -
                           F0Table[:, np.newaxis])
        indexBestPath = distMatF0.argmin(axis=0)
        # setting silences to 0, with tolerance = 1/2 window length
        indexBestPath[distMatTimeStamps[minDistTimeStamps, np.arange(N)] >=
                      0.5 * options.windowSize] = 0
        indexBestPath[f0BestPath <= 0] = 0

    freqMelody = F0Table[np.array(indexBestPath, dtype=int)]
    freqMelody[indexBestPath == 0] = - freqMelody[indexBestPath == 0]
    np.savetxt(options.pitch_output_file,
               np.array([np.arange(N) * hopsize / np.double(fs),
                         freqMelody]).T)

    # Second round of parameter estimation, with specific initial HF00:
    # ones in a band around the melody for every frame whose path index
    # is not 0, zeros everywhere else.
    rows, cols = _f0_neighbourhood_indices(
        indexBestPath, N, NF0, chirpPerF0, stepNotes, scopeAllowedHF0)
    voiced = indexBestPath != 0
    HF00 = np.zeros([NF0 * chirpPerF0, N])
    HF00[rows[voiced], cols[voiced]] = 1
    HF00[:, indexBestPath == (NF0 - 1)] = 0.0
    HF00[:, indexBestPath == 0] = 0.0

    alphaR, alphaL, HGAMMA, HPHI, HF0, \
        betaR, betaL, HM, WM, recoError2 = SIMM.Stereo_SIMM(
            # the data to be fitted to:
            SXR, SXL,
            # the basis matrices for the spectral combs
            WF0,
            # and for the elementary filters:
            WGAMMA,
            # number of desired filters, accompaniment spectra:
            numberOfFilters=K, numberOfAccompanimentSpectralShapes=R,
            # initial amplitude matrices: only HF00 is provided
            HGAMMA0=None, HPHI0=None,
            HF00=HF00,
            WM0=None, HM0=None,
            # Some more optional arguments, to control the "convergence"
            # of the algo
            numberOfIterations=niter, updateRulePower=1.0,
            stepNotes=stepNotes,
            lambdaHF0=0.0 / (1.0 * SXR.max()), alphaHF0=0.9,
            verbose=options.verbose, displayEvolution=displayEvolution)

    writeSeparation(WF0, alphaR, alphaL, HGAMMA, HPHI, HF0,
                    betaR, betaL, HM, WM,
                    options.voc_output_file, options.mus_output_file)

    # Adding the unvoiced part in the source basis: one extra all-ones
    # spectral shape, with an all-ones activation row.
    WUF0 = np.hstack([WF0, np.ones([WF0.shape[0], 1])])
    HUF0 = np.vstack([HF0, np.ones([1, HF0.shape[1]])])

    alphaR, alphaL, HGAMMA, HPHI, HF0, \
        betaR, betaL, HM, WM, recoError3 = SIMM.Stereo_SIMM(
            # the data to be fitted to:
            SXR, SXL,
            # the basis matrices for the spectral combs
            WUF0,
            # and for the elementary filters:
            WGAMMA,
            # number of desired filters, accompaniment spectra:
            numberOfFilters=K, numberOfAccompanimentSpectralShapes=R,
            # initial amplitude matrices: reuse the previous filter
            # estimates and the extended source activations
            HGAMMA0=HGAMMA, HPHI0=HPHI,
            HF00=HUF0,
            WM0=None,
            HM0=None,
            # Some more optional arguments, to control the "convergence"
            # of the algo
            numberOfIterations=niter, updateRulePower=1.0,
            stepNotes=stepNotes,
            lambdaHF0=0.0 / (1.0 * SXR.max()), alphaHF0=0.9,
            verbose=options.verbose, displayEvolution=displayEvolution,
            updateHGAMMA=False)

    # splitext instead of [:-4], so that names whose extension is not
    # exactly three characters long are handled as well.
    writeSeparation(
        WUF0, alphaR, alphaL, HGAMMA, HPHI, HF0, betaR, betaL, HM, WM,
        os.path.splitext(options.voc_output_file)[0] + '_VUIMM.wav',
        os.path.splitext(options.mus_output_file)[0] + '_VUIMM.wav')

    if displayEvolution:
        plt.close('all')

    print("Done!")


def _f0_neighbourhood_indices(indexBestPath, numberOfFrames, numberOfF0,
                              chirpPerF0, stepNotes, scopeAllowedHF0):
    """Return index grids selecting a band of F0 rows around a melody path.

    Both returned arrays are integer arrays with one row per frame and
    ``chirpPerF0 * (2 * h + 1)`` columns, ``h`` being
    ``floor(stepNotes / scopeAllowedHF0)``.  ``rows[n, :]`` holds the row
    indices from ``chirpPerF0 * (indexBestPath[n] - h)`` up to
    ``chirpPerF0 * (indexBestPath[n] + h)``, clipped to
    ``[0, chirpPerF0 * numberOfF0 - 1]``; ``cols[n, :]`` is filled with
    ``n``.  All sizes are computed as Python integers, because NumPy does
    not accept floating-point values as array dimensions.
    """
    halfWidth = int(np.floor(stepNotes / scopeAllowedHF0))
    offsets = np.arange(-chirpPerF0 * halfWidth,
                        chirpPerF0 * (halfWidth + 1))
    centres = chirpPerF0 * np.array(indexBestPath, dtype=int)
    rows = np.clip(centres[:, np.newaxis] + offsets[np.newaxis, :],
                   0, chirpPerF0 * numberOfF0 - 1)
    cols = np.repeat(np.arange(numberOfFrames)[:, np.newaxis],
                     offsets.size, axis=1)
    return rows, cols


def _to_sample_type(signal, dataType):
    """Convert a rescaled signal to the sample type of the input file.

    For integer sample types the values are rounded and clipped to the
    representable range, so that an overshoot cannot wrap around in the
    cast.  For any other type the values are cast without rounding, which
    would otherwise destroy floating-point audio.
    """
    if np.issubdtype(dataType, np.integer):
        limits = np.iinfo(dataType)
        signal = np.clip(np.round(signal), limits.min, limits.max)
    return np.array(signal, dtype=dataType)
# Example no. 2
# 0
def stereo_NMF(SXR, SXL,
               numberOfAccompanimentSpectralShapes,
               WM0=None, HM0=None,
               numberOfIterations=50, updateRulePower=1.0,
               verbose=False, displayEvolution=False):
    """Fit one dictionary with per-channel gains to two spectrograms.

    The two inputs are approximated as::

        SXR ~ WM . betaR**2 . HM
        SXL ~ WM . betaL**2 . HM

    where ``betaR`` and ``betaL`` are diagonal and sum to the identity.
    ``HM``, ``WM`` and the gains are updated in turn with multiplicative
    rules for ``numberOfIterations`` iterations; after the ``WM`` update
    the columns of ``WM`` are rescaled to sum to one and ``HM`` is scaled
    accordingly.  The reconstruction error is measured with
    ``ISDistortion`` (defined elsewhere; presumably the Itakura-Saito
    divergence) and is only used for the verbose output.

    Args:
        SXR, SXL: 2-D arrays of identical shape ``(F, N)``.
        numberOfAccompanimentSpectralShapes: number ``R`` of dictionary
            elements; converted with ``int`` because it is used as an
            array dimension.
        WM0: optional initial dictionary of shape ``(F, R)``.
        HM0: optional initial activations of shape ``(R, N)``.
            An initial value of any other shape is reported and replaced
            by a random initialisation.
        numberOfIterations: number of update sweeps.
        updateRulePower: exponent applied to the multiplicative updates
            (one tenth of it is used for the gain updates).
        verbose: print the error after every update.
        displayEvolution: plot the current model of ``SXR`` at every
            iteration (needs ``matplotlib``, ``imageMatlab`` and a
            module-level ``db``).

    Returns:
        Tuple ``(betaR, betaL, HM, WM)``.

    Raises:
        ValueError: if ``SXR`` and ``SXL`` have different shapes.
    """
    eps = 10 ** (-20)

    if displayEvolution:
        import matplotlib.pyplot as plt
        from imageMatlab import imageM
        plt.ion()
        print("Is the display interactive? ", plt.isinteractive())

    R = int(numberOfAccompanimentSpectralShapes)
    omega = updateRulePower

    F, N = SXR.shape
    if (F, N) != SXL.shape:
        print("The input STFT matrices do not have the same dimension.\n")
        print("Please check what happened...")
        raise ValueError("Dimension of STFT matrices must be the same.")

    # Same order of random draws as before: HM, then WM, then the gains.
    HM = _initial_factor(HM0, (R, N), "HM0")
    WM = _initial_factor(WM0, (F, R), "WM0")

    betaR = np.diag(np.random.rand(R))
    betaL = np.eye(R) - betaR

    hatSXR, hatSXL = _stereo_model(WM, HM, betaR, betaL, eps)

    # One error value per update (3 per iteration) plus the initial one.
    recoError = np.zeros([numberOfIterations * 3 + 1])
    recoError[0] = ISDistortion(SXR, hatSXR) + ISDistortion(SXL, hatSXL)
    if verbose:
        print("Reconstruction error at beginning: ", recoError[0])
    counterError = 1
    if displayEvolution:
        h1 = plt.figure(1)

    for n in range(numberOfIterations):
        # order of re-estimation: HM, WM, then betaR / betaL
        if verbose:
            print("iteration ", n, " over ", numberOfIterations)

        if displayEvolution:
            h1.clf()
            imageM(db(hatSXR))
            plt.clim([np.amax(db(hatSXR)) - 100, np.amax(db(hatSXR))])
            plt.draw()

        # updating HM
        weightedR = np.dot(betaR ** 2, WM.T)
        weightedL = np.dot(betaL ** 2, WM.T)
        numerator = (np.dot(weightedR, SXR / np.maximum(hatSXR ** 2, eps)) +
                     np.dot(weightedL, SXL / np.maximum(hatSXL ** 2, eps)))
        denominator = (np.dot(weightedR, 1 / np.maximum(hatSXR, eps)) +
                       np.dot(weightedL, 1 / np.maximum(hatSXL, eps)))
        HM = HM * (numerator / np.maximum(denominator, eps)) ** omega

        hatSXR, hatSXL = _stereo_model(WM, HM, betaR, betaL, eps)
        recoError[counterError] = ISDistortion(SXR, hatSXR) \
                                  + ISDistortion(SXL, hatSXL)
        if verbose:
            print("Reconstruction error difference after HM    : ",
                  recoError[counterError] - recoError[counterError - 1])
        counterError += 1

        # updating WM
        activationsR = np.dot(HM.T, betaR ** 2)
        activationsL = np.dot(HM.T, betaL ** 2)
        numerator = (np.dot(SXR / np.maximum(hatSXR ** 2, eps),
                            activationsR) +
                     np.dot(SXL / np.maximum(hatSXL ** 2, eps),
                            activationsL))
        denominator = (np.dot(1 / np.maximum(hatSXR, eps), activationsR) +
                       np.dot(1 / np.maximum(hatSXL, eps), activationsL))
        # The denominator is floored like the one of the HM update, so
        # that an element whose activations vanished gives 0, not NaN.
        WM = WM * (numerator / np.maximum(denominator, eps)) ** omega

        # Columns of WM are normalised to unit sum; the scale moves to HM.
        sumWM = np.sum(WM, axis=0)
        alive = sumWM > 0
        WM[:, alive] = WM[:, alive] / sumWM[alive]
        HM = HM * sumWM[:, np.newaxis]

        hatSXR, hatSXL = _stereo_model(WM, HM, betaR, betaL, eps)
        recoError[counterError] = ISDistortion(SXR, hatSXR) \
                                  + ISDistortion(SXL, hatSXL)
        if verbose:
            print("Reconstruction error difference after WM    : ",
                  recoError[counterError] - recoError[counterError - 1])
        counterError += 1

        # updating betaR and betaL, then renormalising them so that
        # betaR + betaL is the identity again
        betaR = _updated_gain(betaR, WM, HM, SXR, hatSXR, omega * .1, eps)
        betaL = _updated_gain(betaL, WM, HM, SXL, hatSXL, omega * .1, eps)
        betaR = betaR / np.maximum(betaR + betaL, eps)
        betaL = np.copy(np.eye(R) - betaR)

        hatSXR, hatSXL = _stereo_model(WM, HM, betaR, betaL, eps)
        recoError[counterError] = ISDistortion(SXR, hatSXR) \
                                  + ISDistortion(SXL, hatSXL)
        if verbose:
            print("Reconstruction error difference after BETA  : ",
                  recoError[counterError] - recoError[counterError - 1])
        counterError += 1

    return betaR, betaL, HM, WM


def _initial_factor(given, shape, label):
    """Return a copy of ``given`` if it has ``shape``, else random values.

    A value of the wrong shape (including a wrong number of dimensions)
    is reported on standard output before falling back to ``|randn|``.
    """
    if given is not None:
        candidate = np.array(given)  # np.array already makes a copy
        if candidate.shape == shape:
            return candidate
        print("Wrong dimensions for given %s, \n" % label)
        print("random initialization used instead")
    return np.abs(randn(*shape))


def _stereo_model(WM, HM, betaR, betaL, eps):
    """Return the two model spectrograms, floored at ``eps``."""
    hatSXR = np.maximum(np.dot(np.dot(WM, betaR ** 2), HM), eps)
    hatSXL = np.maximum(np.dot(np.dot(WM, betaL ** 2), HM), eps)
    return hatSXR, hatSXL


def _updated_gain(beta, WM, HM, SX, hatSX, exponent, eps):
    """Apply one multiplicative update to a diagonal gain matrix.

    Only the diagonal of the result is kept; it is floored at ``eps``.
    """
    numerator = np.dot(np.dot(WM.T, SX / np.maximum(hatSX ** 2, eps)), HM.T)
    denominator = np.dot(np.dot(WM.T, 1 / np.maximum(hatSX, eps)), HM.T)
    ratio = numerator / np.maximum(denominator, eps)
    return np.diag(np.diag(np.maximum(beta * ratio ** exponent, eps)))
def _build_option_parser():
    """Return the ``optparse`` parser describing the command-line interface.

    The parser expects exactly one positional argument (the input WAV file);
    that check is done by the caller after ``parse_args``.
    """
    import optparse

    usage = "usage: %prog [options] inputAudioFile"
    parser = optparse.OptionParser(usage)

    # Names of the output files:
    parser.add_option("-v", "--vocal-output-file",
                      dest="voc_output_file", type="string",
                      help="name of the audio output file for the estimated\n"
                           "solo (vocal) part. \n"
                           "If None, appends _lead to inputAudioFile.",
                      default=None)
    parser.add_option("-m", "--music-output-file",
                      dest="mus_output_file", type="string",
                      help="name of the audio output file for the estimated\n"
                           "music part.\n"
                           "If None, appends _acc to inputAudioFile.",
                      default=None)
    parser.add_option("-p", "--pitch-output-file",
                      dest="pitch_output_file", type="string",
                      help="name of the output file for the estimated pitches.\n"
                           "If None, appends _pitches to inputAudioFile",
                      default=None)

    # Some more optional options:
    parser.add_option("-d", "--with-display", dest="displayEvolution",
                      action="store_true", help="display the figures",
                      default=False)
    parser.add_option("-q", "--quiet", dest="verbose",
                      action="store_false",
                      help="use to quiet all output verbose",
                      default=True)
    parser.add_option("-n", "--dontseparate", dest="separateSignals",
                      action="store_false",
                      help="Trigger this option if you only desire to "
                           "estimate the melody",
                      default=True)
    parser.add_option("--nb-iterations", dest="nbiter",
                      help="number of iterations", type="int",
                      default=30)
    parser.add_option("--window-size", dest="windowSize", type="float",
                      default=0.04644, help="size of analysis windows, in s.")
    parser.add_option("--Fourier-size", dest="fourierSize", type="int",
                      default=None,
                      help="size of Fourier transforms, "
                           "in samples.")
    parser.add_option("--hopsize", dest="hopsize", type="float",
                      default=0.0058,
                      help="size of the hop between analysis windows, in s.")
    # Kept as a float option for backward compatibility with existing
    # command lines; main() converts the value to an integer count.
    parser.add_option("--nb-accElements", dest="R", type="float",
                      default=40.0,
                      help="number of elements for the accompaniment.")

    parser.add_option("--with-melody", dest="melody", type="string",
                      default=None,
                      help="provide the melody in a file named MELODY, "
                           "with at each line: <time (s)><F0 (Hz)>.")

    parser.add_option("--numAtomFilters", dest="P_numAtomFilters",
                      type="int", default=30,
                      help="Number of atomic filters - in WGAMMA.")
    parser.add_option("--numFilters", dest="K_numFilters", type="int",
                      default=10,
                      help="Number of filters for decomposition - in WPHI")
    parser.add_option("--min-F0-Freq", dest="minF0", type="float",
                      default=100.0,
                      help="Minimum of fundamental frequency F0.")
    parser.add_option("--max-F0-Freq", dest="maxF0", type="float",
                      default=800.0,
                      help="Maximum of fundamental frequency F0.")
    parser.add_option("--step-F0s", dest="stepNotes", type="int",
                      default=20,
                      help="Number of F0s in dictionary for each semitone.")
    return parser


def _build_source_basis(minF0, maxF0, Fs, NFT, stepNotes, windowSizeInSamples,
                        chirpPerF0, F, **generatorOptions):
    """Create the harmonic-comb dictionary, cropped to F rows and normalised.

    Extra keyword arguments are forwarded untouched to
    ``generate_WF0_chirped``. Returns ``(F0Table, WF0)`` where every column
    of ``WF0`` has been divided by its own maximum.
    """
    F0Table, WF0 = generate_WF0_chirped(minF0, maxF0, Fs, Nfft=NFT,
                                        stepNotes=stepNotes,
                                        lengthWindow=windowSizeInSamples,
                                        Ot=0.25,
                                        perF0=chirpPerF0,
                                        depthChirpInSemiTone=.15,
                                        **generatorOptions)
    WF0 = WF0[0:F, :]  # ensure same number of rows as the spectrogram
    WF0 = WF0 / np.amax(WF0, axis=0)  # column-wise normalisation
    return F0Table, WF0


def _melody_band_indices(indexBestPath, N, NF0, stepNotes, chirpPerF0,
                         scopeAllowedHF0=2.0):
    """Return row/column indices of the band surrounding the melody path.

    For each of the N frames, the rows span ``floor(stepNotes /
    scopeAllowedHF0)`` dictionary entries (times ``chirpPerF0``) on either
    side of ``indexBestPath``, clipped to ``[0, chirpPerF0 * NF0 - 1]``.
    Both returned integer arrays have shape ``(N, bandWidth)`` and can be
    used together as a fancy index into an ``(NF0 * chirpPerF0, N)`` array.
    """
    halfWidth = int(np.floor(stepNotes / scopeAllowedHF0))
    offsets = np.arange(-chirpPerF0 * halfWidth,
                        chirpPerF0 * (halfWidth + 1))
    centres = chirpPerF0 * np.asarray(indexBestPath, dtype=int)
    rows = np.clip(centres[:, None] + offsets[None, :],
                   0, chirpPerF0 * NF0 - 1)
    cols = np.arange(N)[:, None] + np.zeros_like(rows)
    return rows, cols


def _viterbi_transition_matrix(NF0, stepNotes, scale=1.0):
    """Build the row-normalised (NF0 + 1) x (NF0 + 1) transition matrix.

    The first NF0 states are the F0 dictionary entries; the transition
    weight between two of them decays exponentially with their distance in
    semitones (``scale`` is an ad hoc decay parameter) and is held constant
    beyond 10 semitones. The last state is the extra state appended to the
    observation array by the caller, reached with very small weights.
    """
    transitions = np.exp(-np.floor(np.arange(0, NF0) / stepNotes) * scale)
    # Clamp so that dictionaries shorter than 10 semitones do not index
    # past the end of `transitions`.
    cutoffnote = min(2 * 5 * stepNotes, NF0)
    transitions[cutoffnote:] = transitions[cutoffnote - 1]
    floorWeight = transitions[cutoffnote - 1]

    transitionMatrixF0 = np.zeros([NF0 + 1, NF0 + 1])
    b = np.arange(NF0)
    # Toeplitz structure: weight depends only on |i - j|.
    transitionMatrixF0[0:NF0, 0:NF0] = \
        transitions[np.abs(b[None, :] - b[:, None])]
    transitionMatrixF0[0:NF0, NF0] = floorWeight * 10 ** (-90)
    transitionMatrixF0[NF0, 0:NF0] = floorWeight * 10 ** (-80)
    transitionMatrixF0[NF0, NF0] = floorWeight * 10 ** (-100)

    return transitionMatrixF0 / np.sum(transitionMatrixF0, axis=1)[:, None]


def _estimate_melody_path(SX, WF0, WGAMMA, NF0, stepNotes, chirpPerF0, eps,
                          simmSettings, verbose=True, displayEvolution=False):
    """Estimate the per-frame index of the predominant F0 in the dictionary.

    Runs a first ``SIMM.SIMM`` decomposition of ``SX``, tracks the best path
    through the resulting ``HF0`` with ``viterbiTrackingArray`` and sets the
    path to 0 on frames whose estimated melody energy is below the energy
    threshold. Returns the path, one entry per frame; 0 marks frames
    without melody.
    """
    N = SX.shape[1]
    HGAMMA, HPHI, HF0, HM, WM, _ = SIMM.SIMM(
        # the data to be fitted to, then the spectral-comb basis and the
        # elementary-filter basis:
        SX, WF0, WGAMMA,
        # no initial amplitude matrices for this first round:
        HGAMMA0=None, HPHI0=None, HF00=None,
        lambdaHF0=0.0 / (1.0 * SX.max()),
        **simmSettings)

    if displayEvolution:
        import matplotlib.pyplot as plt
        import imageMatlab

        plt.figure(2)
        plt.clf()
        logImage = 20 * np.log10(HF0)
        imageMatlab.imageM(logImage)
        plt.clim([np.median(logImage) - 100, logImage.max()])

    # Viterbi decoding to estimate the predominant fundamental frequency
    # line. TODO: use a "learned" scale parameter for the transitions.
    transitionMatrixF0 = _viterbi_transition_matrix(NF0, stepNotes)

    # prior probabilities, and setting the array for Viterbi tracking:
    priorProbabilities = 1 / (NF0 + 1.0) * np.ones([NF0 + 1])
    logHF0 = np.zeros([NF0 + 1, N])
    normHF0 = np.amax(HF0, axis=0)
    logHF0[0:NF0, :] = np.log(np.array(HF0))
    # np.inf instead of the np.Inf alias, which NumPy 2.0 removed.
    logHF0[0:NF0, normHF0 == 0] = np.amin(logHF0[logHF0 > -np.inf])
    logHF0[NF0, :] = np.maximum(np.amin(logHF0[logHF0 > -np.inf]), -100)

    indexBestPath = np.asarray(viterbiTrackingArray(
        logHF0, np.log(priorProbabilities),
        np.log(transitionMatrixF0), verbose=verbose))
    del logHF0

    if displayEvolution:
        # Figure.hold() was removed from Matplotlib; plotting on top of the
        # current image is the default behaviour.
        plt.plot(indexBestPath, '-b')
        plt.axis('tight')

    # detection of silences: HF00 is HF0 restricted to the band around the
    # melody path.
    HF00 = np.zeros([NF0 * chirpPerF0, N])
    rows, cols = _melody_band_indices(indexBestPath, N, NF0, stepNotes,
                                      chirpPerF0, scopeAllowedHF0=2.0)
    HF00[rows, cols] = HF0[rows, cols]
    HF00[:, indexBestPath == (NF0 - 1)] = 0.0
    HF00[:, indexBestPath == 0] = 0.0

    # remove frames with less than (100 thres_energy) % of total energy.
    thres_energy = 0.000584
    SF0 = np.maximum(np.dot(WF0, HF00), eps)
    SPHI = np.maximum(np.dot(WGAMMA, np.dot(HGAMMA, HPHI)), eps)
    SM = np.maximum(np.dot(WM, HM), eps)
    hatSX = np.maximum(SPHI * SF0 + SM, eps)
    energyMel = np.sum((((SPHI * SF0) / hatSX) ** 2) * SX, axis=0)
    energyMelCumul = np.cumsum(np.sort(energyMel))
    energyMelCumulNorm = energyMelCumul / max(energyMelCumul[-1], eps)

    aboveThreshold = np.nonzero(energyMelCumulNorm > thres_energy)[0]
    if aboveThreshold.size:
        # NOTE(review): the threshold is read from the *normalised*
        # cumulative energy while energyMel is not normalised; kept as in
        # the original algorithm - confirm this is intended.
        melNotPresent = (energyMel <= energyMelCumulNorm[aboveThreshold[0]])
    else:
        # Nothing exceeds the threshold (virtually no melody energy):
        # treat every frame as unvoiced instead of failing on an empty
        # index array.
        melNotPresent = np.ones(N, dtype=bool)
    indexBestPath[melNotPresent] = 0

    # The extra Viterbi state (index NF0) has no entry in the F0 table;
    # report it as "no melody" so later table look-ups stay in range.
    indexBestPath[indexBestPath >= NF0] = 0
    return indexBestPath


def _read_melody_file(melodyFile):
    """Load a two-column ``<time (s)> <F0 (Hz)>`` text file.

    Returns ``(timeStamps, frequencies)``. Raises ``ValueError`` when the
    loaded array is one-dimensional.
    """
    melodyFromFile = np.loadtxt(melodyFile)
    if len(melodyFromFile.shape) == 1:
        print("The melody should be provided as <Time (s)><F0 (Hz)>.")
        raise ValueError("Bad melody format")
    return melodyFromFile[:, 0], melodyFromFile[:, 1]


def _melody_to_frame_indices(melTimeStamps, melFreqHz, F0Table, N, hopsize,
                             Fs, windowSize):
    """Map a provided melody onto the analysis frames and the F0 dictionary.

    Each of the N frames takes the melody entry closest in time, then the
    dictionary entry closest in frequency. Frames further than half a
    window (``windowSize`` in seconds) from any melody time stamp, or whose
    F0 is not positive, get index 0.
    """
    sigTimeStamps = np.arange(N) * hopsize / np.double(Fs)
    distMatTimeStamps = np.abs(sigTimeStamps[None, :]
                               - np.ravel(melTimeStamps)[:, None])
    minDistTimeStamps = distMatTimeStamps.argmin(axis=0)
    f0BestPath = melFreqHz[minDistTimeStamps]
    distMatF0 = np.abs(f0BestPath[None, :] - np.ravel(F0Table)[:, None])
    indexBestPath = distMatF0.argmin(axis=0)
    # setting silences to 0, with tolerance = 1/2 window length
    indexBestPath[distMatTimeStamps[minDistTimeStamps, np.arange(N)]
                  >= 0.5 * windowSize] = 0
    indexBestPath[f0BestPath <= 0] = 0
    return indexBestPath


def _to_output_samples(signal, scaleData, dataType):
    """Undo the input scaling and convert ``signal`` to the input dtype.

    Integer formats are rounded and clipped to the representable range so
    that overshoots saturate instead of wrapping around; floating-point
    formats are converted without rounding.
    """
    rescaled = signal * scaleData
    if np.issubdtype(dataType, np.integer):
        limits = np.iinfo(dataType)
        rescaled = np.clip(np.round(rescaled), limits.min, limits.max)
    return np.array(rescaled, dtype=dataType)


def _write_stereo_estimates(vocalFile, musicFile, Fs, XR, XL,
                            alphaR, alphaL, betaR, betaL,
                            SF0, SPHI, WM, HM, synthesize):
    """Wiener-filter both channels and write the lead and accompaniment.

    ``synthesize`` turns a masked STFT into output samples. Each file is
    written with the channel derived from ``XR`` first, then ``XL``.
    """
    SMR = np.dot(np.dot(WM, betaR ** 2), HM)
    SML = np.dot(np.dot(WM, betaL ** 2), HM)
    hatSXR = (alphaR ** 2) * SF0 * SPHI + SMR
    hatSXL = (alphaL ** 2) * SF0 * SPHI + SML

    vestR = synthesize((alphaR ** 2) * SPHI * SF0 / hatSXR * XR)
    vestL = synthesize((alphaL ** 2) * SPHI * SF0 / hatSXL * XL)
    wav.write(vocalFile, Fs, np.array([vestR, vestL]).T)
    del vestR, vestL

    mestR = synthesize(SMR / hatSXR * XR)
    mestL = synthesize(SML / hatSXL * XL)
    wav.write(musicFile, Fs, np.array([mestR, mestL]).T)


def _write_mono_estimates(vocalFile, musicFile, Fs, X, SF0, SPHI, SM,
                          synthesize):
    """Wiener-filter the mono STFT and write the lead and accompaniment."""
    hatSX = SF0 * SPHI + SM
    wav.write(vocalFile, Fs, synthesize(SPHI * SF0 / hatSX * X))
    wav.write(musicFile, Fs, synthesize(SM / hatSX * X))


def main():
    """Command-line entry point: estimate the melody and separate the lead.

    Reads one WAV file, estimates (or loads, with ``--with-melody``) the
    melody line and writes it to the pitch file. Unless ``--dontseparate``
    is given, it then runs two separation passes (without, then with an
    added "unvoiced" element in the source basis) and writes the lead and
    accompaniment of each pass to WAV files.

    Raises:
        ValueError: if the input name does not end in ".wav" or the
            provided melody file is not two-column.
    """
    parser = _build_option_parser()
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("incorrect number of arguments, use option -h for help.")

    displayEvolution = options.displayEvolution
    if displayEvolution:
        import matplotlib.pyplot as plt

        plt.rc('image', cmap='jet')
        plt.ion()

    # Compulsory option: name of the input file:
    inputAudioFile = args[0]
    if inputAudioFile[-4:] != ".wav":
        raise ValueError("File not WAV file? Only WAV format support, for now...")

    baseName = inputAudioFile[:-4]
    if options.mus_output_file is None:
        options.mus_output_file = baseName + '_acc.wav'
    if options.voc_output_file is None:
        options.voc_output_file = baseName + '_lead.wav'
    if options.pitch_output_file is None:
        options.pitch_output_file = baseName + '_pitches.txt'
    vocUnvoicedFile = options.voc_output_file[:-4] + '_VUIMM.wav'
    musUnvoicedFile = options.mus_output_file[:-4] + '_VUIMM.wav'

    print("Writing the different following output files:")
    print("    separated lead          in", options.voc_output_file)
    print("    separated accompaniment in", options.mus_output_file)
    print("    separated lead + unvoc  in", vocUnvoicedFile)
    print("    separated acc  - unvoc  in", musUnvoicedFile)
    print("    estimated pitches       in", options.pitch_output_file)

    Fs, data = wav.read(inputAudioFile)
    dataType = data.dtype
    scaleData = 1.2 * data.max()  # to rescale the data.
    if scaleData == 0:
        # A zero peak would divide by zero below: fall back on the absolute
        # peak, or on 1.0 for digital silence.
        scaleData = 1.2 * np.abs(np.double(data)).max()
        if scaleData == 0:
            scaleData = 1.0
    data = np.double(data) / scaleData

    is_stereo = True
    if data.shape[0] == data.size:  # a single channel: data is mono
        print("The audio file is not stereo. Making stereo out of mono.")
        print("(You could also try the older separateLead.py...)")
        is_stereo = False
        data = data.ravel()  # also flattens an (n, 1) array
    if is_stereo and data.shape[1] != 2:
        print("The data is multichannel, but not stereo... \n")
        print("Unfortunately this program does not scale well. Data is \n")
        print("reduced to its 2 first channels.\n")
        data = data[:, 0:2]

    # Processing the options:
    windowSizeInSamples = int(nextpow2(np.round(options.windowSize * Fs)))

    # A sample count: cast once here rather than at every use.
    hopsize = int(np.round(options.hopsize * Fs))
    if hopsize != windowSizeInSamples / 8:
        warnings.warn("Chosen hopsize: " + str(hopsize) +
                      ", while windowsize: " + str(windowSizeInSamples))

    if options.fourierSize is None:
        NFT = windowSizeInSamples
    else:
        NFT = options.fourierSize

    # number of iterations for each parameter estimation step:
    niter = options.nbiter
    # number of spectral shapes for the accompaniment; it is a count, so
    # the float-typed option value is converted to an integer.
    R = int(round(options.R))

    eps = 10 ** -9

    if options.verbose:
        print("Some parameter settings:")
        print("    Size of analysis windows: ", windowSizeInSamples)
        print("    Hopsize: ", hopsize)
        print("    Size of Fourier transforms: ", NFT)
        print("    Number of iterations to be done: ", niter)
        print("    Number of elements in WM: ", R)

    if is_stereo:
        XR, _, _ = stft(data[:, 0], fs=Fs, hopsize=hopsize,
                        window=sinebell(windowSizeInSamples), nfft=NFT)
        XL, _, _ = stft(data[:, 1], fs=Fs, hopsize=hopsize,
                        window=sinebell(windowSizeInSamples), nfft=NFT)
        # SX is the power spectrogram of the channel average:
        SX = np.maximum((0.5 * np.abs(XR + XL)) ** 2, eps)
    else:  # data is mono
        X, _, _ = stft(data, fs=Fs, hopsize=hopsize,
                       window=sinebell(windowSizeInSamples), nfft=NFT)
        SX = np.maximum(np.abs(X) ** 2, eps)

    del data

    # minimum and maximum F0 in glottal source spectra dictionary
    minF0 = options.minF0
    maxF0 = options.maxF0
    F, N = SX.shape
    stepNotes = options.stepNotes  # number of F0s within one semitone

    K = options.K_numFilters  # number of spectral shapes for the filter part
    P = options.P_numAtomFilters  # number of elements in dictionary of smooth filters
    chirpPerF0 = 1  # number of chirped spectral shapes between each F0

    # Create the harmonic combs, for each F0 between minF0 and maxF0:
    F0Table, WF0 = _build_source_basis(minF0, maxF0, Fs, NFT, stepNotes,
                                       windowSizeInSamples, chirpPerF0, F,
                                       loadWF0=True,
                                       analysisWindow='sinebell')
    NF0 = F0Table.size  # number of harmonic combs

    # Create the dictionary of smooth filters, for the filter part of
    # the lead instrument:
    WGAMMA = generateHannBasis(F, NFT, Fs=Fs, frequencyScale='linear',
                               numberOfBasis=P, overlap=.75)

    if displayEvolution:
        plt.figure(1)
        plt.clf()
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Leading source number $u$', fontsize=16)
        plt.ion()

    # Keyword arguments shared by every SIMM / Stereo_SIMM call below.
    simmSettings = dict(
        numberOfFilters=K, numberOfAccompanimentSpectralShapes=R,
        WM0=None, HM0=None,
        numberOfIterations=niter, updateRulePower=1.0,
        stepNotes=stepNotes, alphaHF0=0.9,
        verbose=options.verbose, displayEvolution=displayEvolution)

    if options.melody is None:
        # estimate the melody on the (averaged) single-channel spectrogram
        indexBestPath = _estimate_melody_path(
            SX, WF0, WGAMMA, NF0, stepNotes, chirpPerF0, eps, simmSettings,
            verbose=options.verbose, displayEvolution=displayEvolution)
    else:
        # take the provided melody line:
        melTimeStamps, melFreqHz = _read_melody_file(options.melody)
        voicedFreqs = melFreqHz[melFreqHz > 40.0]
        # Skip the range check when the file has no F0 above 40 Hz, which
        # would otherwise fail on the minimum of an empty array.
        if voicedFreqs.size and (minF0 > voicedFreqs.min()
                                 or maxF0 < melFreqHz.max()):
            minF0 = voicedFreqs.min() * .97
            maxF0 = np.maximum(melFreqHz.max() * 1.03, 2 * minF0 * 1.03)
            print("Recomputing the source basis for ")
            print("minF0 = ", minF0, "Hz and maxF0 = ", maxF0, "Hz.")
            F0Table, WF0 = _build_source_basis(minF0, maxF0, Fs, NFT,
                                               stepNotes,
                                               windowSizeInSamples,
                                               chirpPerF0, F)
            NF0 = F0Table.size

        indexBestPath = _melody_to_frame_indices(
            melTimeStamps, melFreqHz, F0Table, N, hopsize, Fs,
            options.windowSize)

    # Write the pitch track; frames without melody get a negated frequency.
    freqMelody = F0Table[np.array(indexBestPath, dtype=int)]
    freqMelody[indexBestPath == 0] = - freqMelody[indexBestPath == 0]
    np.savetxt(options.pitch_output_file,
               np.array([np.arange(N) * hopsize / np.double(Fs),
                         freqMelody]).T)

    # If separation is required:
    if options.separateSignals:
        def synthesize(maskedSTFT):
            """Invert a masked STFT and convert it to the input format."""
            signal = istft(maskedSTFT, hopsize=int(hopsize), nfft=int(NFT),
                           window=sinebell(windowSizeInSamples)) / 4.0
            return _to_output_samples(signal, scaleData, dataType)

        # Second round of parameter estimation, with a specific initial
        # HF00: ones in the band around the melody on voiced frames.
        HF00 = np.zeros([NF0 * chirpPerF0, N])
        rows, cols = _melody_band_indices(indexBestPath, N, NF0, stepNotes,
                                          chirpPerF0, scopeAllowedHF0=2.0)
        voicedFrames = indexBestPath != 0
        HF00[rows[voicedFrames, :], cols[voicedFrames, :]] = 1
        HF00[:, indexBestPath == (NF0 - 1)] = 0.0
        HF00[:, indexBestPath == 0] = 0.0

        if is_stereo:
            del SX
            SXR = np.maximum(np.abs(XR) ** 2, eps)
            SXL = np.maximum(np.abs(XL) ** 2, eps)
            (alphaR, alphaL, HGAMMA, HPHI, HF0,
             betaR, betaL, HM, WM, _) = SIMM.Stereo_SIMM(
                # the data to be fitted to, the spectral-comb basis and the
                # elementary-filter basis:
                SXR, SXL, WF0, WGAMMA,
                HGAMMA0=None, HPHI0=None,
                HF00=HF00,
                lambdaHF0=0.0 / (1.0 * SXR.max()),
                **simmSettings)

            SPHI = np.dot(np.dot(WGAMMA, HGAMMA), HPHI)
            SF0 = np.dot(WF0, HF0)
            _write_stereo_estimates(options.voc_output_file,
                                    options.mus_output_file, Fs, XR, XL,
                                    alphaR, alphaL, betaR, betaL,
                                    SF0, SPHI, WM, HM, synthesize)
            del SPHI, SF0

            # adding the unvoiced part in the source basis:
            WUF0 = np.hstack([WF0, np.ones([WF0.shape[0], 1])])
            HUF0 = np.vstack([HF0, np.ones([1, HF0.shape[1]])])

            (alphaR, alphaL, HGAMMA, HPHI, HF0,
             betaR, betaL, HM, WM, _) = SIMM.Stereo_SIMM(
                SXR, SXL, WUF0, WGAMMA,
                # start from the filters estimated in the previous pass:
                HGAMMA0=HGAMMA, HPHI0=HPHI,
                HF00=HUF0,
                lambdaHF0=0.0 / (1.0 * SXR.max()),
                updateHGAMMA=False,
                **simmSettings)

            SPHI = np.dot(np.dot(WGAMMA, HGAMMA), HPHI)
            SF0 = np.dot(WUF0, HF0)
            _write_stereo_estimates(vocUnvoicedFile, musUnvoicedFile, Fs,
                                    XR, XL, alphaR, alphaL, betaR, betaL,
                                    SF0, SPHI, WM, HM, synthesize)
        else:
            # running on monophonic data:
            HGAMMA, HPHI, HF0, HM, WM, _ = SIMM.SIMM(
                SX, WF0, WGAMMA,
                HGAMMA0=None, HPHI0=None,
                HF00=HF00,
                lambdaHF0=0.0 / (1.0 * SX.max()),
                **simmSettings)

            SPHI = np.dot(np.dot(WGAMMA, HGAMMA), HPHI)
            SF0 = np.dot(WF0, HF0)
            SM = np.dot(WM, HM)
            _write_mono_estimates(options.voc_output_file,
                                  options.mus_output_file, Fs, X,
                                  SF0, SPHI, SM, synthesize)
            del SPHI, SF0, SM

            # adding the unvoiced part in the source basis:
            WUF0 = np.hstack([WF0, np.ones([WF0.shape[0], 1])])
            HUF0 = np.vstack([HF0, np.ones([1, HF0.shape[1]])])

            HGAMMA, HPHI, HF0, HM, WM, _ = SIMM.SIMM(
                SX, WUF0, WGAMMA,
                # start from the filters estimated in the previous pass:
                HGAMMA0=HGAMMA, HPHI0=HPHI,
                HF00=HUF0,
                lambdaHF0=0.0 / (1.0 * SX.max()),
                updateHGAMMA=False,
                **simmSettings)

            SPHI = np.dot(np.dot(WGAMMA, HGAMMA), HPHI)
            SF0 = np.dot(WUF0, HF0)
            SM = np.dot(WM, HM)
            _write_mono_estimates(vocUnvoicedFile, musUnvoicedFile, Fs, X,
                                  SF0, SPHI, SM, synthesize)

        if displayEvolution:
            plt.close('all')

    print("Done!")
# Example no. 4
# 0
def stereo_NMF(SXR, SXL,
               numberOfAccompanimentSpectralShapes,
               WM0=None, HM0=None,
               numberOfIterations=50, updateRulePower=1.0,
               verbose=False, displayEvolution=False):
    """
    betaR, betaL, HM, WM =
        stereo_NMF(SXR, SXL, numberOfAccompanimentSpectralShapes,
                   WM0=None, HM0=None, numberOfIterations=50,
                   updateRulePower=1.0, verbose=False,
                   displayEvolution=False)

    Joint non-negative factorisation of two F x N matrices that share
    the same spectral shapes WM and the same activations HM, and only
    differ by per-component diagonal gains:

        hatSXR = WM . betaR**2 . HM
        hatSXL = WM . betaL**2 . HM

    At each iteration HM, then WM, then (betaR, betaL) are refined with
    multiplicative updates, and the models are recomputed after each of
    these three steps.

    Inputs:
        SXR, SXL
            the two F x N non-negative matrices to approximate. They
            must have the same shape. (Presumably the power
            spectrograms of the right and left channels - to be
            confirmed against the caller.)
        numberOfAccompanimentSpectralShapes
            the number R of columns of WM / rows of HM
        WM0
            optional F x R initial spectral shapes. If missing or of
            the wrong shape, a random non-negative matrix is used.
        HM0
            optional R x N initial activations. If missing or of the
            wrong shape, a random non-negative matrix is used.
        numberOfIterations
            the number of passes over the three updates
        updateRulePower
            the power to which the multiplicative update ratios are
            raised (the gain update uses a tenth of that power)
        verbose
            if True, print the reconstruction error and its variation
            after each update
        displayEvolution
            if True, plot db(hatSXR) at every iteration (requires
            matplotlib and the imageMatlab module)

    Outputs:
        betaR, betaL
            the R x R diagonal gain matrices, with betaR + betaL equal
            to the identity
        HM
            the estimated R x N activation matrix
        WM
            the estimated F x R spectral shapes; every column with a
            positive sum is scaled to sum to one, the scale being
            moved into the corresponding row of HM

    Raises:
        ValueError if SXR and SXL do not have the same shape.

    The reconstruction error (sum of ISDistortion over both matrices)
    is tracked internally for the verbose output only: it is not
    returned.
    """
    eps = 10 ** (-20)

    if displayEvolution:
        import matplotlib.pyplot as plt
        from imageMatlab import imageM
        plt.ion()
        print("Is the display interactive?  %s" % (plt.isinteractive(),))

    R = numberOfAccompanimentSpectralShapes
    omega = updateRulePower

    F, N = SXR.shape
    if (F, N) != SXL.shape:
        print("The input STFT matrices do not have the same dimension.\n")
        print("Please check what happened...")
        raise ValueError("Dimension of STFT matrices must be the same.")

    def _initial_matrix(given, nrows, ncols, label):
        # Private copy of `given` when it has the expected shape,
        # random non-negative matrix otherwise.
        if given is not None:
            given = np.array(given)
            if given.shape == (nrows, ncols):
                return given
            print("Wrong dimensions for given %s, \n" % label)
            print("random initialization used instead")
        return np.abs(randn(nrows, ncols))

    def _model(W, beta, H):
        # Model of one input matrix, floored to keep the divisions safe.
        return np.maximum(np.dot(np.dot(W, beta ** 2), H), eps)

    def _total_error(estimateR, estimateL):
        return ISDistortion(SXR, estimateR) + ISDistortion(SXL, estimateL)

    def _report(label, index):
        if verbose:
            print("Reconstruction error difference after %s:  %s"
                  % (label, recoError[index] - recoError[index - 1]))

    def _gain_ratio(SX, hatSX, W, H):
        # R x R ratio whose diagonal drives the update of one gain
        # matrix; the denominator is floored so that an all-zero
        # activation row cannot produce 0/0.
        numerator = np.dot(np.dot(W.T, SX / np.maximum(hatSX ** 2, eps)),
                           H.T)
        denominator = np.dot(np.dot(W.T, 1 / np.maximum(hatSX, eps)),
                             H.T)
        return numerator / np.maximum(denominator, eps)

    # The order of the random draws (HM, WM, then betaR) is significant
    # for reproducibility when the random generator is seeded.
    HM = _initial_matrix(HM0, R, N, "HM0")
    WM = _initial_matrix(WM0, F, R, "WM0")

    betaR = np.diag(np.random.rand(R))
    betaL = np.eye(R) - betaR

    hatSXR = _model(WM, betaR, HM)
    hatSXL = _model(WM, betaL, HM)

    # one slot for the initial error, then three per iteration:
    recoError = np.zeros([numberOfIterations * 3 + 1])
    recoError[0] = _total_error(hatSXR, hatSXL)
    if verbose:
        print("Reconstruction error at beginning:  %s" % (recoError[0],))
    counterError = 1
    if displayEvolution:
        h1 = plt.figure(1)

    for n in np.arange(numberOfIterations):
        # order of re-estimation: HM, WM, then (betaR, betaL)
        if verbose:
            print("iteration  %s  over  %s" % (n, numberOfIterations))

        if displayEvolution:
            h1.clf()
            imageM(db(hatSXR))
            plt.clim([np.amax(db(hatSXR))-100, np.amax(db(hatSXR))])
            plt.draw()

        # updating HM
        gainedWMR = np.dot(betaR ** 2, WM.T)
        gainedWML = np.dot(betaL ** 2, WM.T)
        numerator = (np.dot(gainedWMR, SXR / np.maximum(hatSXR ** 2, eps)) +
                     np.dot(gainedWML, SXL / np.maximum(hatSXL ** 2, eps)))
        denominator = (np.dot(gainedWMR, 1 / np.maximum(hatSXR, eps)) +
                       np.dot(gainedWML, 1 / np.maximum(hatSXL, eps)))
        HM = HM * (numerator / np.maximum(denominator, eps)) ** omega

        hatSXR = _model(WM, betaR, HM)
        hatSXL = _model(WM, betaL, HM)

        recoError[counterError] = _total_error(hatSXR, hatSXL)
        _report("HM    ", counterError)
        counterError += 1

        # updating WM
        gainedHMR = np.dot(HM.T, betaR ** 2)
        gainedHML = np.dot(HM.T, betaL ** 2)
        numerator = (np.dot(SXR / np.maximum(hatSXR ** 2, eps), gainedHMR) +
                     np.dot(SXL / np.maximum(hatSXL ** 2, eps), gainedHML))
        denominator = (np.dot(1 / np.maximum(hatSXR, eps), gainedHMR) +
                       np.dot(1 / np.maximum(hatSXL, eps), gainedHML))
        # Floored like the HM update: a zero row in HM would otherwise
        # give 0/0 = NaN in the matching column of WM.
        WM = WM * (numerator / np.maximum(denominator, eps)) ** omega

        # Columns of WM are scaled to unit sum, and the scale is moved
        # to HM so that the product WM . HM is left unchanged.
        sumWM = np.sum(WM, axis=0)
        WM[:, sumWM>0] = (WM[:, sumWM>0] /
                          np.outer(np.ones(F), sumWM[sumWM>0]))
        HM = HM * np.outer(sumWM, np.ones(N))

        hatSXR = _model(WM, betaR, HM)
        hatSXL = _model(WM, betaL, HM)

        recoError[counterError] = _total_error(hatSXR, hatSXL)
        _report("WM    ", counterError)
        counterError += 1

        # updating betaR and betaL: only the diagonal is kept, and the
        # exponent is a tenth of the one used for HM and WM.
        betaR = np.diag(np.diag(np.maximum(
            betaR * _gain_ratio(SXR, hatSXR, WM, HM) ** (omega*.1), eps)))
        betaL = np.diag(np.diag(np.maximum(
            betaL * _gain_ratio(SXL, hatSXL, WM, HM) ** (omega*.1), eps)))
        # renormalising such that betaR + betaL is the identity:
        betaR = betaR / np.maximum(betaR + betaL, eps)
        betaL = np.copy(np.eye(R) - betaR)

        hatSXR = _model(WM, betaR, HM)
        hatSXL = _model(WM, betaL, HM)

        recoError[counterError] = _total_error(hatSXR, hatSXL)
        _report("BETA  ", counterError)
        counterError += 1

    return betaR, betaL, HM, WM
Ejemplo n.º 5
0
def SIMM(# the data to be fitted to:
         SX,
         # the basis matrices for the spectral combs
         WF0,
         # and for the elementary filters:
         WGAMMA,
         # number of desired filters, accompaniment spectra:
         numberOfFilters=4, numberOfAccompanimentSpectralShapes=10,
         # if any, initial amplitude matrices for 
         HGAMMA0=None, HPHI0=None,
         HF00=None,
         WM0=None, HM0=None,
         # Some more optional arguments, to control the "convergence"
         # of the algo
         numberOfIterations=1000, updateRulePower=1.0,
         stepNotes=4, 
         lambdaHF0=0.00,alphaHF0=0.99,
         displayEvolution=False, verbose=True, makeMovie=False,
         updateHGAMMA=True,
         computeISDistortion=False):
    """
    HGAMMA, HPHI, HF0, HM, WM, recoError =
        SIMM(SX, WF0, WGAMMA, numberOfFilters=4,
             numberOfAccompanimentSpectralShapes=10, HGAMMA0=None, HPHI0=None,
             HF00=None, WM0=None, HM0=None, numberOfIterations=1000,
             updateRulePower=1.0, stepNotes=4, 
             lambdaHF0=0.00, alphaHF0=0.99, displayEvolution=False,
             verbose=True, makeMovie=False, updateHGAMMA=True,
             computeISDistortion=False)

    Implementation of the Smooth-filters Instantaneous Mixture Model
    (SIMM). This model can be used to estimate the main melody of a
    song, and separate the lead voice from the accompaniment, provided
    that the basis WF0 is constituted of elements associated to
    particular pitches.

    The spectrogram is modelled as

        hatSX = (WF0 . HF0) * (WGAMMA . HGAMMA . HPHI) + WM . HM

    and, at each iteration, HF0, HPHI, HM, HGAMMA (optional) and WM are
    updated in that order with multiplicative rules.

    Inputs:
        SX
            the F x N power spectrogram to be approximated.
            F is the number of frequency bins, while N is the number of
            analysis frames
        WF0
            the F x NF0 basis matrix containing the NF0 source elements
        WGAMMA
            the F x P basis matrix of P smooth elementary filters
        numberOfFilters
            the number of filters K to be considered
        numberOfAccompanimentSpectralShapes
            the number of spectral shapes R for the accompaniment
        HGAMMA0
            the P x K decomposition matrix of WPHI on WGAMMA
        HPHI0
            the K x N amplitude matrix of the filter part of the lead
            instrument
        HF00
            the NF0 x N amplitude matrix for the source part of the lead
            instrument
        WM0
            the F x R the matrix for spectral shapes of the
            accompaniment
        HM0
            the R x N amplitude matrix associated with each of the R
            accompaniment spectral shapes
        numberOfIterations
            the number of iterations for the estimation algorithm
        updateRulePower
            the power to which the multiplicative gradient is elevated to
        stepNotes
            the number of elements in WF0 per semitone. stepNotes=4 means
            that there are 48 elements per octave in WF0.
        lambdaHF0
            Lagrangian multiplier for the octave control
        alphaHF0
            parameter that controls how much influence a lower octave
            can have on the upper octave's amplitude.
        displayEvolution
            if True, db(HF0) is plotted at each iteration (requires
            matplotlib and the imageMatlab module)
        verbose
            if True, the variation of the stored reconstruction error
            is printed after each update
        makeMovie
            if True, the figure of each iteration is saved as
            'NNNN.png' in a directory 'tmp<timestamp>/' created in the
            current working directory. The figure is rendered for that
            purpose even if displayEvolution is False.
        updateHGAMMA
            if False, HGAMMA (hence WPHI) is left untouched
        computeISDistortion
            if True, the reconstruction error is evaluated after each
            update. Otherwise only recoError[0] is computed and the
            other entries stay at 0.

    Any initial matrix that is missing, or whose shape does not match
    the sizes above, is replaced by a random non-negative matrix.

    Outputs:
        HGAMMA
            the estimated P x K decomposition matrix of WPHI on WGAMMA
        HPHI
            the estimated K x N amplitude matrix of the filter part 
        HF0
            the estimated NF0 x N amplitude matrix for the source part
        HM
            the estimated R x N amplitude matrix for the accompaniment
        WM
            the estimate F x R spectral shapes for the accompaniment
        recoError
            the successive values of the Itakura Saito divergence
            between the power spectrogram and the spectrogram
            computed thanks to the updated estimations of the matrices.
            Five entries are reserved per iteration, after the initial
            one; the array is longer than what is actually used.

    If WF0 does not have as many rows as SX, the single value False is
    returned instead of the tuple above (to be reviewed).

    Please also refer to the following article for more details about
    the algorithm within this function, as well as the meaning of the
    different matrices that are involved:
        J.-L. Durrieu, G. Richard, B. David and C. Fevotte
        Source/Filter Model for Unsupervised Main Melody
        Extraction From Polyphonic Audio Signals
        IEEE Transactions on Audio, Speech and Language Processing
        Vol. 18, No. 3, March 2010
    """
    eps = 10 ** (-20)

    # The figure is needed for the on-screen display and for the movie
    # frames alike. Importing only under displayEvolution would leave
    # `plt` unbound when makeMovie is requested alone.
    renderFigure = displayEvolution or makeMovie
    if renderFigure:
        import matplotlib.pyplot as plt
        from imageMatlab import imageM
    if displayEvolution:
        plt.ion()
        print("Is the display interactive? ", plt.isinteractive())

    # renamed for convenience:
    K = int(numberOfFilters)
    R = int(numberOfAccompanimentSpectralShapes)
    omega = updateRulePower

    F, N = SX.shape
    Fwf0, NF0 = WF0.shape
    _, P = WGAMMA.shape

    # Checking the sizes of the matrices
    if Fwf0 != F:
        return False # to be reviewed: callers get False, not a tuple

    def _initial_matrix(given, nrows, ncols, label):
        # Private copy of `given` when it has the expected shape,
        # random non-negative matrix otherwise.
        if given is not None:
            given = np.array(given)
            if given.shape == (nrows, ncols):
                return given
            print("Wrong dimensions for given %s, \n" % label)
            print("random initialization used instead")
        return np.abs(randn(nrows, ncols))

    def _normalise_columns(matrix):
        # Scales, in place, each column with a positive sum to unit
        # sum, and returns the sums computed before the scaling.
        sums = np.sum(matrix, axis=0)
        positive = sums > 0
        matrix[:, positive] = (matrix[:, positive] /
                               np.outer(np.ones(matrix.shape[0]),
                                        sums[positive]))
        return sums

    def _track(index, message, estimate):
        # Stores the error (if requested) and reports its variation.
        if computeISDistortion:
            recoError[index] = ISDistortion(SX, estimate)
        if verbose:
            print(message, recoError[index] - recoError[index - 1])

    # The order of the random draws is significant for reproducibility
    # when the random generator is seeded.
    HGAMMA = _initial_matrix(HGAMMA0, P, K, "HGAMMA0")
    HPHI = _initial_matrix(HPHI0, K, N, "HPHI0")
    HF0 = _initial_matrix(HF00, NF0, N, "HF00")
    HM = _initial_matrix(HM0, R, N, "HM0")
    WM = _initial_matrix(WM0, F, R, "WM0")

    # Iterations to estimate the SIMM parameters:
    WPHI = np.dot(WGAMMA, HGAMMA)
    SF0 = np.dot(WF0, HF0)
    SPHI = np.dot(WPHI, HPHI)
    SM = np.dot(WM, HM)
    hatSX = SF0 * SPHI + SM

    # Array containing the reconstruction error after the update of each 
    # of the parameter matrices:
    recoError = np.zeros([numberOfIterations * 5 * 2 + NF0 * 2 + 1])
    recoError[0] = ISDistortion(SX, hatSX)
    if verbose:
        print("Reconstruction error at beginning: ", recoError[0])
    counterError = 1
    if renderFigure:
        h1 = plt.figure(1)

    if makeMovie:
        import os
        import time
        dirName = 'tmp%s/' %time.strftime("%Y%m%d%H%M%S")
        if not os.path.isdir(dirName):
            os.makedirs(dirName)

    # Rows of HF0 for the octave control: the first octave of WF0 has
    # no lower octave to be compared with and gets the plain update.
    octave = 12 * stepNotes
    firstOctave = np.arange(octave)
    upperOctaves = np.arange(octave, NF0)
    oneOctaveBelow = np.arange(NF0 - octave)
    WF0firstT = WF0[:, firstOctave].T
    WF0upperT = WF0[:, upperOctaves].T

    # WM is updated from this iteration on (0: from the start); raise
    # it to freeze WM during the first iterations.
    firstIterationForWM = 0

    # Main loop for multiplicative updating rules:
    for n in np.arange(numberOfIterations):
        # order of re-estimation: HF0, HPHI, HM, HGAMMA, WM
        if verbose:
            print("iteration ", n, " over ", numberOfIterations)
        if renderFigure:
            h1.clf()
            HF0dB = db(HF0)
            imageM(HF0dB)
            topOfScale = np.amax(HF0dB)
            plt.clim([topOfScale-100, topOfScale])
            if displayEvolution:
                plt.draw()
        if makeMovie:
            filename = dirName + '%04d' % n + '.png'
            plt.savefig(filename, dpi=100)

        # updating HF0:
        tempNumFbyN = (SPHI * SX) / np.maximum(hatSX ** 2, eps)
        tempDenFbyN = SPHI / np.maximum(hatSX, eps)

        # This to enable octave control: the penalty term involves the
        # amplitudes one octave below, as they are before this update.
        HF0[upperOctaves, :] = (
            HF0[upperOctaves, :]
            * (np.dot(WF0upperT, tempNumFbyN)
               / np.maximum(
                   np.dot(WF0upperT, tempDenFbyN)
                   + lambdaHF0 * (- (alphaHF0 - 1.0)
                                  / np.maximum(HF0[upperOctaves, :], eps)
                                  + HF0[oneOctaveBelow, :]),
                   eps)) ** omega)

        HF0[firstOctave, :] = (
            HF0[firstOctave, :]
            * (np.dot(WF0firstT, tempNumFbyN)
               / np.maximum(np.dot(WF0firstT, tempDenFbyN), eps)) ** omega)

        SF0 = np.maximum(np.dot(WF0, HF0),eps)
        hatSX = np.maximum(SF0 * SPHI + SM,eps)

        _track(counterError,
               "Reconstruction error difference after HF0   : ", hatSX)
        counterError += 1

        # updating HPHI
        tempNumFbyN = (SF0 * SX) / np.maximum(hatSX ** 2, eps)
        tempDenFbyN = SF0 / np.maximum(hatSX, eps)
        HPHI = HPHI * (np.dot(WPHI.T, tempNumFbyN) /
                       np.maximum(np.dot(WPHI.T, tempDenFbyN), eps)) ** omega
        # columns of HPHI sum to one, the energy being moved to HF0:
        sumHPHI = _normalise_columns(HPHI)
        HF0 = HF0 * np.outer(np.ones(NF0), sumHPHI)

        SF0 = np.maximum(np.dot(WF0, HF0), eps)
        SPHI = np.maximum(np.dot(WPHI, HPHI), eps)
        hatSX = np.maximum(SF0 * SPHI + SM, eps)

        _track(counterError,
               "Reconstruction error difference after HPHI  : ", hatSX)
        counterError += 1

        # updating HM
        tempNumFbyN = SX / np.maximum(hatSX ** 2, eps)
        tempDenFbyN = 1 / np.maximum(hatSX, eps)
        HM = np.maximum(HM * (np.dot(WM.T, tempNumFbyN) /
                              np.maximum(np.dot(WM.T, tempDenFbyN), eps)) **
                        omega, eps)

        SM = np.maximum(np.dot(WM, HM), eps)
        hatSX = np.maximum(SF0 * SPHI + SM, eps)

        _track(counterError,
               "Reconstruction error difference after HM    : ", hatSX)
        counterError += 1

        # updating HGAMMA
        if updateHGAMMA:
            tempNumFbyN = (SF0 * SX) / np.maximum(hatSX ** 2, eps)
            tempDenFbyN = SF0 / np.maximum(hatSX, eps)
            HGAMMA = np.maximum(
                HGAMMA * (np.dot(WGAMMA.T, np.dot(tempNumFbyN, HPHI.T)) /
                          np.maximum(
                              np.dot(WGAMMA.T, np.dot(tempDenFbyN, HPHI.T)),
                              eps)) ** omega,
                eps)

            # columns of HGAMMA sum to one, the scale goes to HPHI, whose
            # columns are in turn renormalised, the scale going to HF0:
            sumHGAMMA = _normalise_columns(HGAMMA)
            HPHI = HPHI * np.outer(sumHGAMMA, np.ones(N))
            sumHPHI = _normalise_columns(HPHI)
            HF0 = HF0 * np.outer(np.ones(NF0), sumHPHI)

            WPHI = np.maximum(np.dot(WGAMMA, HGAMMA), eps)
            SF0 = np.maximum(np.dot(WF0, HF0), eps)
            SPHI = np.maximum(np.dot(WPHI, HPHI), eps)
            hatSX = np.maximum(SF0 * SPHI + SM, eps)

            _track(counterError,
                   "Reconstruction error difference after HGAMMA: ", hatSX)

        # the slot is consumed even when HGAMMA is not updated, so that
        # the layout of recoError does not depend on updateHGAMMA:
        counterError += 1

        # updating WM, possibly only after a certain number of iterations
        if n >= firstIterationForWM:
            tempNumFbyN = SX / np.maximum(hatSX ** 2, eps)
            tempDenFbyN = 1 / np.maximum(hatSX, eps)
            WM = np.maximum(WM * (np.dot(tempNumFbyN, HM.T) /
                                  np.maximum(np.dot(tempDenFbyN, HM.T),
                                             eps)) ** omega, eps)

            # columns of WM sum to one, the scale being moved to HM:
            sumWM = _normalise_columns(WM)
            HM = HM * np.outer(sumWM, np.ones(N))

            SM = np.maximum(np.dot(WM, HM), eps)
            hatSX = np.maximum(SF0 * SPHI + SM, eps)

            _track(counterError,
                   "Reconstruction error difference after WM    : ", hatSX)
            counterError += 1

    return HGAMMA, HPHI, HF0, HM, WM, recoError
Ejemplo n.º 6
0
def Stereo_SIMM(# the data to be fitted to:
         SXR, SXL,
         # the basis matrices for the spectral combs
         WF0,
         # and for the elementary filters:
         WGAMMA,
         # number of desired filters, accompaniment spectra:
         numberOfFilters=4, numberOfAccompanimentSpectralShapes=10,
         # if any, initial amplitude matrices for 
         HGAMMA0=None, HPHI0=None,
         HF00=None,
         WM0=None, HM0=None,
         # Some more optional arguments, to control the "convergence"
         # of the algo
         numberOfIterations=1000, updateRulePower=1.0,
         stepNotes=4, 
         lambdaHF0=0.00,alphaHF0=0.99,
         displayEvolution=False, verbose=True,
         updateHGAMMA=True):
    """
    HGAMMA, HPHI, HF0, HM, WM, recoError =
        SIMM(SXR, SXL, WF0, WGAMMA, numberOfFilters=4,
             numberOfAccompanimentSpectralShapes=10, HGAMMA0=None, HPHI0=None,
             HF00=None, WM0=None, HM0=None, numberOfIterations=1000,
             updateRulePower=1.0, stepNotes=4, 
             lambdaHF0=0.00, alphaHF0=0.99, displayEvolution=False,
             verbose=True)

    Implementation of the Smooth-filters Instantaneous Mixture Model
    (SIMM). This model can be used to estimate the main melody of a
    song, and separate the lead voice from the accompaniment, provided
    that the basis WF0 is constituted of elements associated to
    particular pitches.

    Inputs:
        SX
            the F x N power spectrogram to be approximated.
            F is the number of frequency bins, while N is the number of
            analysis frames
        WF0
            the F x NF0 basis matrix containing the NF0 source elements
        WGAMMA
            the F x P basis matrix of P smooth elementary filters
        numberOfFilters
            the number of filters K to be considered
        numberOfAccompanimentSpectralShapes
            the number of spectral shapes R for the accompaniment
        HGAMMA0
            the P x K decomposition matrix of WPHI on WGAMMA
        HPHI0
            the K x N amplitude matrix of the filter part of the lead
            instrument
        HF00
            the NF0 x N amplitude matrix for the source part of the lead
            instrument
        WM0
            the F x R the matrix for spectral shapes of the
            accompaniment
        HM0
            the R x N amplitude matrix associated with each of the R
            accompaniment spectral shapes
        numberOfIterations
            the number of iterations for the estimatino algorithm
        updateRulePower
            the power to which the multiplicative gradient is elevated to
        stepNotes
            the number of elements in WF0 per semitone. stepNotes=4 means
            that there are 48 elements per octave in WF0.
        lambdaHF0
            Lagrangian multiplier for the octave control
        alphaHF0
            parameter that controls how much influence a lower octave
            can have on the upper octave's amplitude.

    Outputs:
        HGAMMA
            the estimated P x K decomposition matrix of WPHI on WGAMMA
        HPHI
            the estimated K x N amplitude matrix of the filter part 
        HF0
            the estimated NF0 x N amplitude matrix for the source part
        HM
            the estimated R x N amplitude matrix for the accompaniment
        WM
            the estimate F x R spectral shapes for the accompaniment
        recoError
            the successive values of the Itakura Saito divergence
            between the power spectrogram and the spectrogram
            computed thanks to the updated estimations of the matrices.

    Please also refer to the following article for more details about
    the algorithm within this function, as well as the meaning of the
    different matrices that are involved:
        J.-L. Durrieu, G. Richard, B. David and C. Fevotte
        Source/Filter Model for Unsupervised Main Melody
        Extraction From Polyphonic Audio Signals
        IEEE Transactions on Audio, Speech and Language Processing
        Vol. 18, No. 3, March 2010
    """
    eps = 10 ** (-20)

    if displayEvolution:
        import matplotlib.pyplot as plt
        from imageMatlab import imageM
        plt.ion()
        print "Is the display interactive? ", plt.isinteractive()

    # renamed for convenience:
    K = numberOfFilters
    R = numberOfAccompanimentSpectralShapes
    omega = updateRulePower
    
    F, N = SXR.shape
    if (F, N) != SXL.shape:
        print "The input STFT matrices do not have the same dimension.\n"
        print "Please check what happened..."
        raise ValueError("Dimension of STFT matrices must be the same.")
        
    Fwf0, NF0 = WF0.shape
    Fwgamma, P = WGAMMA.shape
    
    # Checking the sizes of the matrices
    if Fwf0 != F:
        return False # A REVOIR!!!
    if HGAMMA0 is None:
        HGAMMA0 = np.abs(randn(P, K))
    else:
        if not(isinstance(HGAMMA0,np.ndarray)): # default behaviour
            HGAMMA0 = np.array(HGAMMA0)
        Phgamma0, Khgamma0 = HGAMMA0.shape
        if Phgamma0 != P or Khgamma0 != K:
            print "Wrong dimensions for given HGAMMA0, \n"
            print "random initialization used instead"
            HGAMMA0 = np.abs(randn(P, K))

    HGAMMA = np.copy(HGAMMA0)
    
    if HPHI0 is None: # default behaviour
        HPHI = np.abs(randn(K, N))
    else:
        Khphi0, Nhphi0 = np.array(HPHI0).shape
        if Khphi0 != K or Nhphi0 != N:
            print "Wrong dimensions for given HPHI0, \n"
            print "random initialization used instead"
            HPHI = np.abs(randn(K, N))
        else:
            HPHI = np.copy(np.array(HPHI0))

    if HF00 is None:
        HF00 = np.abs(randn(NF0, N))
    else:
        if np.array(HF00).shape[0] == NF0 and np.array(HF00).shape[1] == N:
            HF00 = np.array(HF00)
        else:
            print "Wrong dimensions for given HF00, \n"
            print "random initialization used instead"
            HF00 = np.abs(randn(NF0, N))
    HF0 = np.copy(HF00)

    if HM0 is None:
        HM0 = np.abs(randn(R, N))
    else:
        if np.array(HM0).shape[0] == R and np.array(HM0).shape[1] == N:
            HM0 = np.array(HM0)
        else:
            print "Wrong dimensions for given HM0, \n"
            print "random initialization used instead"
            HM0 = np.abs(randn(R, N))
    HM = np.copy(HM0)

    if WM0 is None:
        WM0 = np.abs(randn(F, R))
    else:
        if np.array(WM0).shape[0] == F and np.array(WM0).shape[1] == R:
            WM0 = np.array(WM0)
        else:
            print "Wrong dimensions for given WM0, \n"
            print "random initialization used instead"
            WM0 = np.abs(randn(F, R))
    WM = np.copy(WM0)

    alphaR = 0.5
    alphaL = 0.5
    betaR = np.diag(np.random.rand(R))
    betaL = np.eye(R) - betaR
    
    # Iterations to estimate the SIMM parameters:
    WPHI = np.dot(WGAMMA, HGAMMA)
    SF0 = np.dot(WF0, HF0)
    SPHI = np.dot(WPHI, HPHI)
    # SM = np.dot(WM, HM)
    hatSXR = (alphaR**2) * SF0 * SPHI + np.dot(np.dot(WM, betaR**2),HM)
    hatSXL = (alphaL**2) * SF0 * SPHI + np.dot(np.dot(WM, betaL**2),HM)

    # SX = SX + np.abs(randn(F, N)) ** 2
                                       # should not need this line
                                       # which ensures that data is not
                                       # 0 everywhere. 
    # temporary matrices
    tempNumFbyN = np.zeros([F, N])
    tempDenFbyN = np.zeros([F, N])

    # Array containing the reconstruction error after the update of each 
    # of the parameter matrices:
    recoError = np.zeros([numberOfIterations * 5 * 2 + NF0 * 2 + 1])
    recoError[0] = ISDistortion(SXR, hatSXR) + ISDistortion(SXL, hatSXL)
    if verbose:
        print "Reconstruction error at beginning: ", recoError[0]
    counterError = 1
    if displayEvolution:
        h1 = plt.figure(1)

    # Main loop for multiplicative updating rules:
    for n in np.arange(numberOfIterations):
        # order of re-estimation: HF0, HPHI, HM, HGAMMA, WM
        if verbose:
            print "iteration ", n, " over ", numberOfIterations
        if displayEvolution:
            h1.clf();imageM(db(HF0));
            plt.clim([np.amax(db(HF0))-100, np.amax(db(HF0))]);plt.draw();
            # h1.clf();
            # imageM(HF0 * np.outer(np.ones([NF0, 1]),
            #                       1 / (HF0.max(axis=0))));

        # updating HF0:
        tempNumFbyN = ((alphaR**2) * SPHI * SXR) / np.maximum(hatSXR ** 2, eps)\
                      + ((alphaL**2) * SPHI * SXL) / np.maximum(hatSXL ** 2, eps)
        tempDenFbyN = (alphaR**2) * SPHI / np.maximum(hatSXR, eps)\
                      + (alphaL**2) * SPHI / np.maximum(hatSXL, eps)

        # This to enable octave control
        HF0[np.arange(12 * stepNotes, NF0), :] \
           = HF0[np.arange(12 * stepNotes, NF0), :] \
             * (np.dot(WF0[:, np.arange(12 * stepNotes,
                                        NF0)].T, tempNumFbyN) \
                / np.maximum(
            np.dot(WF0[:, np.arange(12 * stepNotes, NF0)].T,
                   tempDenFbyN) \
            + lambdaHF0 * (- (alphaHF0 - 1.0) \
                           / np.maximum(HF0[
            np.arange(12 * stepNotes, NF0), :], eps) \
                           + HF0[
            np.arange(NF0 - 12 * stepNotes), :]),
            eps)) ** omega

        HF0[np.arange(12 * stepNotes), :] \
           = HF0[np.arange(12 * stepNotes), :] \
             * (np.dot(WF0[:, np.arange(12 * stepNotes)].T,
                      tempNumFbyN) /
               np.maximum(
                np.dot(WF0[:, np.arange(12 * stepNotes)].T,
                       tempDenFbyN), eps)) ** omega

##        # normal update rules:
##        HF0 = HF0 * (np.dot(WF0.T, tempNumFbyN) /
##                     np.maximum(np.dot(WF0.T, tempDenFbyN), eps)) ** omega
        
        
        SF0 = np.maximum(np.dot(WF0, HF0), eps)
        hatSXR = np.maximum((alphaR**2) * SF0 * SPHI + \
                            np.dot(np.dot(WM, betaR**2),HM),
                            eps)
        hatSXL = np.maximum((alphaL**2) * SF0 * SPHI + \
                            np.dot(np.dot(WM, betaL**2),HM),
                            eps)
        
        ## recoError[counterError] = ISDistortion(SXR, hatSXR) \
        ##                           + ISDistortion(SXL, hatSXL)
        ## 
        ## if verbose:
        ##     print "Reconstruction error difference after HF0   : ",
        ##     print recoError[counterError] - recoError[counterError - 1]
        counterError += 1
    
        # updating HPHI
        if updateHGAMMA or True:
            tempNumFbyN = ((alphaR**2) * SF0 * SXR) / np.maximum(hatSXR ** 2, eps)\
                          + ((alphaL**2) * SF0 * SXL) / np.maximum(hatSXL ** 2, eps)
            tempDenFbyN = (alphaR**2) * SF0 / np.maximum(hatSXR, eps)\
                          + (alphaL**2) * SF0 / np.maximum(hatSXL, eps)
            HPHI = HPHI * (np.dot(WPHI.T, tempNumFbyN) / np.maximum(np.dot(WPHI.T, tempDenFbyN), eps)) ** omega
            sumHPHI = np.sum(HPHI, axis=0)
            HPHI[:, sumHPHI>0] = HPHI[:, sumHPHI>0] / np.outer(np.ones(K), sumHPHI[sumHPHI>0])
            HF0 = HF0 * np.outer(np.ones(NF0), sumHPHI)
            
            SF0 = np.maximum(np.dot(WF0, HF0), eps)
            SPHI = np.maximum(np.dot(WPHI, HPHI), eps)
            hatSXR = np.maximum((alphaR**2) * SF0 * SPHI + \
                                np.dot(np.dot(WM, betaR**2),HM),
                                eps)
            hatSXL = np.maximum((alphaL**2) * SF0 * SPHI + \
                                np.dot(np.dot(WM, betaL**2),HM),
                                eps)
            
            ## recoError[counterError] = ISDistortion(SXR, hatSXR) \
            ##                           + ISDistortion(SXL, hatSXL)
            ## 
            ## if verbose:
            ##     print "Reconstruction error difference after HPHI  : ", recoError[counterError] - recoError[counterError - 1]
            ##     
            counterError += 1
        
        
        # updating HM
        # tempNumFbyN = SXR / np.maximum(hatSXR ** 2, eps)\
        #               + SXL / np.maximum(hatSXL ** 2, eps)
        # tempDenFbyN = 1 / np.maximum(hatSXR, eps)\
        #               + 1 / np.maximum(hatSXL, eps)
        # HM = np.maximum(HM * (np.dot(WM.T, tempNumFbyN) / np.maximum(np.dot(WM.T, tempDenFbyN), eps)) ** omega, eps)
        HM = HM * \
             ((np.dot(np.dot((betaR**2), WM.T), SXR /
                      np.maximum(hatSXR ** 2, eps)) +
               np.dot(np.dot((betaL**2), WM.T), SXL /
                      np.maximum(hatSXL ** 2, eps))
               ) /
              np.maximum(np.dot(np.dot((betaR**2), WM.T), 1 /
                                np.maximum(hatSXR, eps)) +
                         np.dot(np.dot((betaL**2), WM.T), 1 /
                                np.maximum(hatSXL, eps)),
                         eps)) ** omega
        
        hatSXR = np.maximum((alphaR**2) * SF0 * SPHI + \
                            np.dot(np.dot(WM, betaR**2),HM), eps)
        hatSXL = np.maximum((alphaL**2) * SF0 * SPHI + \
                            np.dot(np.dot(WM, betaL**2),HM), eps)
        
        ## recoError[counterError] = ISDistortion(SXR, hatSXR) \
        ##                           + ISDistortion(SXL, hatSXL)
        ## 
        ## if verbose:
        ##     print "Reconstruction error difference after HM    : ", recoError[counterError] - recoError[counterError - 1]
        counterError += 1  

        # updating HGAMMA
        if updateHGAMMA:
            tempNumFbyN = ((alphaR ** 2) * SF0 * SXR) / np.maximum(hatSXR ** 2, eps)\
                          + ((alphaL ** 2) * SF0 * SXL) / np.maximum(hatSXL ** 2, eps)
            tempDenFbyN = (alphaR ** 2) * SF0 / np.maximum(hatSXR, eps) \
                          + (alphaL ** 2) * SF0 / np.maximum(hatSXL, eps)
            
            HGAMMA = np.maximum(HGAMMA * (np.dot(WGAMMA.T, np.dot(tempNumFbyN, HPHI.T)) / np.maximum(np.dot(WGAMMA.T, np.dot(tempDenFbyN, HPHI.T)), eps)) ** omega, eps)
            
            sumHGAMMA = np.sum(HGAMMA, axis=0)
            HGAMMA[:, sumHGAMMA>0] = HGAMMA[:, sumHGAMMA>0] / np.outer(np.ones(P), sumHGAMMA[sumHGAMMA>0])
            HPHI = HPHI * np.outer(sumHGAMMA, np.ones(N))
            sumHPHI = np.sum(HPHI, axis=0)
            HPHI[:, sumHPHI>0] = HPHI[:, sumHPHI>0] / np.outer(np.ones(K), sumHPHI[sumHPHI>0])
            HF0 = HF0 * np.outer(np.ones(NF0), sumHPHI)
            
            WPHI = np.maximum(np.dot(WGAMMA, HGAMMA), eps)
            SF0 = np.maximum(np.dot(WF0, HF0), eps)
            SPHI = np.maximum(np.dot(WPHI, HPHI), eps)
            
            hatSXR = np.maximum((alphaR**2) * SF0 * SPHI + \
                                np.dot(np.dot(WM, betaR**2),HM), eps)
            hatSXL = np.maximum((alphaL**2) * SF0 * SPHI + \
                                np.dot(np.dot(WM, betaL**2),HM), eps)
            
            ## recoError[counterError] = ISDistortion(SXR, hatSXR) \
            ##                           + ISDistortion(SXL, hatSXL)
            ## 
            ## if verbose:
            ##     print "Reconstruction error difference after HGAMMA: ",
            ##     print recoError[counterError] - recoError[counterError - 1]
            ## 
            counterError += 1
        
        # updating WM, after a certain number of iterations (here, after 1 iteration)
        if n > -1: # this test can be used such that WM is updated only
                  # after a certain number of iterations
##           tempNumFbyN = SX / np.maximum(hatSX ** 2, eps)
##            tempDenFbyN = 1 / np.maximum(hatSX, eps)
##            WM = np.maximum(WM * (np.dot(tempNumFbyN, HM.T) /
##                                  np.maximum(np.dot(tempDenFbyN, HM.T),
##                                             eps)) ** omega, eps)
            WM = WM * \
                 ((np.dot(SXR / np.maximum(hatSXR ** 2, eps),
                          np.dot(HM.T, betaR ** 2)) +
                   np.dot(SXL / np.maximum(hatSXL ** 2, eps),
                          np.dot(HM.T, betaL ** 2))
                   ) /
                  (np.dot(1 / np.maximum(hatSXR, eps),
                          np.dot(HM.T, betaR ** 2)) +
                   np.dot(1 / np.maximum(hatSXL, eps),
                          np.dot(HM.T, betaL ** 2))
                   )) ** omega
            
            sumWM = np.sum(WM, axis=0)
            WM[:, sumWM>0] = (WM[:, sumWM>0] /
                              np.outer(np.ones(F),sumWM[sumWM>0]))
            HM = HM * np.outer(sumWM, np.ones(N))
            
            hatSXR = np.maximum((alphaR**2) * SF0 * SPHI + \
                                np.dot(np.dot(WM, betaR**2),HM), eps)
            hatSXL = np.maximum((alphaL**2) * SF0 * SPHI + \
                                np.dot(np.dot(WM, betaL**2),HM), eps)
            
            ## recoError[counterError] = ISDistortion(SXR, hatSXR) \
            ##                       + ISDistortion(SXL, hatSXL)
            ## 
            ## if verbose:
            ##     print "Reconstruction error difference after WM    : ",
            ##     print recoError[counterError] - recoError[counterError - 1]
            counterError += 1

        # updating alphaR and alphaL:
        tempNumFbyN = SF0 * SPHI * SXR / np.maximum(hatSXR ** 2, eps)
        tempDenFbyN = SF0 * SPHI / np.maximum(hatSXR, eps)
        alphaR = np.maximum(alphaR *
                            (np.sum(tempNumFbyN) /
                            np.sum(tempDenFbyN)) ** (omega*.1), eps)
        tempNumFbyN = SF0 * SPHI * SXL / np.maximum(hatSXL ** 2, eps)
        tempDenFbyN = SF0 * SPHI / np.maximum(hatSXL, eps)
        alphaL = np.maximum(alphaL *
                            (np.sum(tempNumFbyN) /
                            np.sum(tempDenFbyN)) ** (omega*.1), eps)
        alphaR = alphaR / np.maximum(alphaR + alphaL, .001)
        alphaL = np.copy(1 - alphaR)

            
        hatSXR = np.maximum((alphaR**2) * SF0 * SPHI + \
                            np.dot(np.dot(WM, betaR**2),HM), eps)
        hatSXL = np.maximum((alphaL**2) * SF0 * SPHI + \
                            np.dot(np.dot(WM, betaL**2),HM), eps)
        
        ## recoError[counterError] = ISDistortion(SXR, hatSXR) \
        ##                           + ISDistortion(SXL, hatSXL)
        ## 
        ## if verbose:
        ##     print "Reconstruction error difference after ALPHA : ",
        ##     print recoError[counterError] - recoError[counterError - 1]
        counterError += 1
            

        # updating betaR and betaL
        betaR = np.diag(np.diag(np.maximum(betaR *
                                   ((np.dot(np.dot(WM.T, SXR / np.maximum(hatSXR ** 2, eps)), HM.T)) /
                                   (np.dot(np.dot(WM.T, 1 / np.maximum(hatSXR, eps)), HM.T))) ** (omega*.1), eps)))
        betaL = np.diag(np.diag(np.maximum(betaL *
                                   ((np.dot(np.dot(WM.T, SXL / np.maximum(hatSXL ** 2, eps)), HM.T)) /
                                   (np.dot(np.dot(WM.T, 1 / np.maximum(hatSXL, eps)), HM.T))) ** (omega*.1), eps)))
        betaR = betaR / np.maximum(betaR + betaL, eps)
        betaL = np.copy(np.eye(R) - betaR)

        hatSXR = np.maximum((alphaR**2) * SF0 * SPHI + \
                            np.dot(np.dot(WM, betaR**2),HM), eps)
        hatSXL = np.maximum((alphaL**2) * SF0 * SPHI + \
                            np.dot(np.dot(WM, betaL**2),HM), eps)
        
        ## recoError[counterError] = ISDistortion(SXR, hatSXR) \
        ##                           + ISDistortion(SXL, hatSXL)
        ## 
        ## if verbose:
        ##     print "Reconstruction error difference after BETA  : ",
        ##     print recoError[counterError] - recoError[counterError - 1]
        counterError += 1
        
    return alphaR, alphaL, HGAMMA, HPHI, HF0, betaR, betaL, HM, WM, recoError
# Example no. 7
# 0
def _path_neighbourhood_indices(indexBestPath, N, NF0, chirpPerF0,
                                stepNotes, scopeAllowedHF0):
    """Return (row, column) index grids around a tracked F0 path.

    For each of the ``N`` frames, the rows are the F0-candidate indices
    lying within ``floor(stepNotes / scopeAllowedHF0)`` steps on either
    side of ``chirpPerF0 * indexBestPath``, clipped to
    ``[0, chirpPerF0 * NF0 - 1]``; the columns repeat the frame number.

    Both returned arrays are integer arrays with ``N`` rows and
    ``chirpPerF0 * (2 * floor(stepNotes / scopeAllowedHF0) + 1)`` columns.
    """
    # np.floor returns a float; array shapes must be true integers.
    halfWidth = int(np.floor(stepNotes / scopeAllowedHF0))
    nbCandidates = chirpPerF0 * (2 * halfWidth + 1)
    offsets = np.arange(-chirpPerF0 * halfWidth,
                        chirpPerF0 * (halfWidth + 1))

    rows = np.outer(chirpPerF0 * indexBestPath, np.ones(nbCandidates)) \
           + np.outer(np.ones(N), offsets)
    rows = np.array(np.maximum(np.minimum(rows, chirpPerF0 * NF0 - 1), 0),
                    dtype=int)
    cols = np.outer(np.arange(N), np.ones(nbCandidates, dtype=int))
    return rows, cols


def main(inputAudioFile):
    """Estimate the accompaniment of a stereo WAV file and write it out.

    The tuning options are still parsed from the process command line
    with ``optparse`` (``parser.parse_args()`` with no argument list);
    only the input file name comes from ``inputAudioFile``.

    Processing steps:

    1. Read ``inputAudioFile`` with ``wav.read`` and compute the STFT of
       the two channels.
    2. Unless ``--with-melody`` is given, estimate the melody line with
       ``SIMM.SIMM`` on the channel average followed by
       ``viterbiTrackingArray``; otherwise map the melody read from the
       given file onto the F0 table.
    3. Save the pitch track (time in seconds, F0 in Hz; the F0 is negated
       for frames whose path index is 0) to ``options.pitch_output_file``.
    4. Run ``SIMM.Stereo_SIMM`` twice: first with an ``HF00`` restricted
       to the neighbourhood of the melody, then with one additional
       all-ones source column (the "unvoiced" element).
    5. Write the accompaniment estimate of the second run to
       ``media/karaoke/<mus_output_file minus its last 4 chars>_VUIMM.wav``
       and print ``Done!`` followed by that file name (without directory).

    The solo (vocal) estimates are not computed: no code path in this
    variant ever wrote them to disk.

    Raises:
        ValueError: if the audio data has a single channel, or if the
            melody file does not hold two columns.
    """
    import optparse

    usage = "usage: %prog [options] inputAudioFile"
    parser = optparse.OptionParser(usage)
    # Name of the output files:
    parser.add_option("-v", "--vocal-output-file",
                      dest="voc_output_file", type="string",
                      help="name of the audio output file for the estimated\n"\
                           "solo (vocal) part",
                      default="estimated_solo.wav")
    parser.add_option("-m", "--music-output-file",
                      dest="mus_output_file", type="string",
                      help="name of the audio output file for the estimated\n"\
                           "music part",
                      default="estimated_music.wav")
    parser.add_option("-p", "--pitch-output-file",
                      dest="pitch_output_file", type="string",
                      help="name of the output file for the estimated pitches",
                      default="pitches.txt")

    # Some more optional options:
    parser.add_option("-d", "--with-display", dest="displayEvolution",
                      action="store_true", help="display the figures",
                      default=False)
    parser.add_option("-q", "--quiet", dest="verbose",
                      action="store_false",
                      help="use to quiet all output verbose",
                      default=True)
    parser.add_option("--nb-iterations", dest="nbiter",
                      help="number of iterations", type="int",
                      default=50)
    parser.add_option("--window-size", dest="windowSize", type="float",
                      default=0.04644,
                      help="size of analysis windows, in s.")
    parser.add_option("--Fourier-size", dest="fourierSize", type="int",
                      default=2048,
                      help="size of Fourier transforms, "\
                           "in samples.")
    parser.add_option("--hopsize", dest="hopsize", type="float",
                      default=0.0058,
                      help="size of the hop between analysis windows, in s.")
    # The option keeps its historical float type so that "40.0" is still
    # accepted on the command line; the value is cast to int below.
    parser.add_option("--nb-accElements", dest="R", type="float",
                      default=40.0,
                      help="number of elements for the accompaniment.")
    parser.add_option("--with-melody", dest="melody", type="string",
                      default=None,
                      help="provide the melody in a file named MELODY, "\
                           "with at each line: <time (s)><F0 (Hz)>.")

    # Positional arguments are ignored: the input file is a parameter.
    options, _ = parser.parse_args()

    displayEvolution = options.displayEvolution
    if displayEvolution:
        import matplotlib.pyplot as plt
        import imageMatlab

        plt.rc('image', cmap='jet')
        plt.ion()

    fs, data = wav.read(inputAudioFile)
    scaleData = 1.2 * data.max()  # to rescale the data.
    dataType = data.dtype
    data = np.double(data) / scaleData  # makes data vary from -1 to 1
    if data.shape[0] == data.size:  # one sample per row: single channel
        print("The audio file is not stereo. Try separateLead.py instead.")
        raise ValueError("number of dimensions of the input not 2")
    if data.shape[1] != 2:
        print("The data is multichannel, but not stereo... \n")
        print("Unfortunately this program does not scale well. Data is \n")
        print("reduced to its 2 first channels.\n")
        data = data[:, 0:2]

    # Processing the options:
    windowSizeInSamples = np.round(options.windowSize * fs)
    hopsize = np.round(options.hopsize * fs)
    NFT = options.fourierSize
    niter = options.nbiter
    # R is a number of spectral shapes, i.e. a count: make it an integer.
    R = int(options.R)

    if options.verbose:
        print("Some parameter settings:")
        print("    Size of analysis windows: ", windowSizeInSamples)
        print("    Hopsize: ", hopsize)
        print("    Size of Fourier transforms: ", NFT)
        print("    Number of iterations to be done: ", niter)
        print("    Number of elements in WM: ", R)

    XR, F, N = stft(data[:, 0],
                    fs=fs,
                    hopsize=hopsize,
                    window=sinebell(windowSizeInSamples),
                    nfft=NFT)
    XL, F, N = stft(data[:, 1],
                    fs=fs,
                    hopsize=hopsize,
                    window=sinebell(windowSizeInSamples),
                    nfft=NFT)
    # SXR and SXL are the power spectrograms of the two channels:
    SXR = np.abs(XR)**2
    SXL = np.abs(XL)**2

    del data, F, N

    # TODO: also process these as options:
    eps = 10**-9
    minF0 = 100
    maxF0 = 800
    Fs = fs
    F, N = SXR.shape
    stepNotes = 20  # this is the number of F0s within one semitone

    K = 10  # number of spectral shapes for the filter part
    P = 30  # number of elements in dictionary of smooth filters
    chirpPerF0 = 1  # number of chirped spectral shapes between each F0
    scopeAllowedHF0 = 2.0 / 1.0

    # Create the harmonic combs, for each F0 between minF0 and maxF0:
    F0Table, WF0 = \
             generate_WF0_chirped(minF0, maxF0, Fs, Nfft=NFT, \
                                  stepNotes=stepNotes, \
                                  lengthWindow=windowSizeInSamples, Ot=0.25, \
                                  perF0=chirpPerF0, \
                                  depthChirpInSemiTone=.15, loadWF0=True,\
                                  analysisWindow='sinebell')
    WF0 = WF0[0:F, :]  # ensure same size as SX
    NF0 = F0Table.size  # number of harmonic combs
    # Normalization:
    WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0))

    # Create the dictionary of smooth filters, for the filter part of
    # the lead instrument:
    WGAMMA = generateHannBasis(F, NFT, Fs=fs, frequencyScale='linear', \
                               numberOfBasis=P, overlap=.75)

    if displayEvolution:
        plt.figure(1)
        plt.clf()
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Leading source number $u$', fontsize=16)
        plt.ion()

    if options.melody is None:
        ## section to estimate the melody, on monophonic algo:
        SX = np.maximum(np.abs((XR + XL) / 2.0)**2, 10**-8)
        # First round of parameter estimation:
        HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM(
            # the data to be fitted to:
            SX,
            # the basis matrices for the spectral combs
            WF0,
            # and for the elementary filters:
            WGAMMA,
            # number of desired filters, accompaniment spectra:
            numberOfFilters=K,
            numberOfAccompanimentSpectralShapes=R,
            # if any, initial amplitude matrices for
            HGAMMA0=None,
            HPHI0=None,
            HF00=None,
            WM0=None,
            HM0=None,
            # Some more optional arguments, to control the "convergence"
            # of the algo
            numberOfIterations=niter,
            updateRulePower=1.,
            stepNotes=stepNotes,
            lambdaHF0=0.0 / (1.0 * SX.max()),
            alphaHF0=0.9,
            verbose=options.verbose,
            displayEvolution=displayEvolution)

        if displayEvolution:
            h2 = plt.figure(2)
            plt.clf()
            imageMatlab.imageM(20 * np.log10(HF0))
            matMax = (20 * np.log10(HF0)).max()
            matMed = np.median(20 * np.log10(HF0))
            plt.clim([matMed - 100, matMax])

        # Viterbi decoding to estimate the predominant fundamental
        # frequency line
        scale = 1.0
        transitions = np.exp(-np.floor(np.arange(0, NF0) / stepNotes) * scale)
        cutoffnote = 2 * 5 * stepNotes
        transitions[cutoffnote:] = transitions[cutoffnote - 1]

        # The extra (NF0 + 1)-th state is handled separately from the F0
        # states, which get a Toeplitz transition block:
        transitionMatrixF0 = np.zeros([NF0 + 1, NF0 + 1])
        b = np.arange(NF0)
        transitionMatrixF0[0:NF0, 0:NF0] = \
                                  transitions[\
            np.array(np.abs(np.outer(np.ones(NF0), b) \
                            - np.outer(b, np.ones(NF0))), dtype=int)]
        pf_0 = transitions[cutoffnote - 1] * 10**(-90)
        p0_0 = transitions[cutoffnote - 1] * 10**(-100)
        p0_f = transitions[cutoffnote - 1] * 10**(-80)
        transitionMatrixF0[0:NF0, NF0] = pf_0
        transitionMatrixF0[NF0, 0:NF0] = p0_f
        transitionMatrixF0[NF0, NF0] = p0_0

        # Normalise each row so that it sums to one:
        sumTransitionMatrixF0 = np.sum(transitionMatrixF0, axis=1)
        transitionMatrixF0 = transitionMatrixF0 \
                             / np.outer(sumTransitionMatrixF0, \
                                        np.ones(NF0 + 1))

        priorProbabilities = 1 / (NF0 + 1.0) * np.ones([NF0 + 1])
        logHF0 = np.zeros([NF0 + 1, N])
        normHF0 = np.amax(HF0, axis=0)
        barHF0 = np.array(HF0)

        logHF0[0:NF0, :] = np.log(barHF0)
        logHF0[0:NF0, normHF0 == 0] = np.amin(logHF0[logHF0 > -np.Inf])
        logHF0[NF0, :] = np.maximum(np.amin(logHF0[logHF0 > -np.Inf]), -100)

        indexBestPath = viterbiTrackingArray(\
            logHF0, np.log(priorProbabilities),
            np.log(transitionMatrixF0), verbose=options.verbose)

        if displayEvolution:
            # NOTE(review): Figure.hold may be missing from recent
            # matplotlib releases - confirm against the installed version.
            h2.hold(True)
            plt.plot(indexBestPath, '-b')
            h2.hold(False)
            plt.axis('tight')

        del logHF0

        # detection of silences: keep only the activations of HF0 that
        # lie in the neighbourhood of the tracked path.
        HF00 = np.zeros([NF0 * chirpPerF0, N])
        dim1index, dim2index = _path_neighbourhood_indices(
            indexBestPath, N, NF0, chirpPerF0, stepNotes, scopeAllowedHF0)
        dim1index = dim1index.reshape(1, dim1index.size)
        dim2index = dim2index.reshape(1, dim2index.size)
        HF00[dim1index, dim2index] = HF0[dim1index, dim2index]

        HF00[:, indexBestPath == (NF0 - 1)] = 0.0
        HF00[:, indexBestPath == 0] = 0.0

        thres_energy = 0.000584
        SF0 = np.maximum(np.dot(WF0, HF00), eps)
        SPHI = np.maximum(np.dot(WGAMMA, np.dot(HGAMMA, HPHI)), eps)
        SM = np.maximum(np.dot(WM, HM), eps)
        hatSX = np.maximum(SPHI * SF0 + SM, eps)
        energyMel = np.sum(np.abs((SPHI * SF0)/hatSX * \
                                  (XR+XL) * 0.5) \
                           ** 2, axis=0)
        energyMelSorted = np.sort(energyMel)
        energyMelCumul = np.cumsum(energyMelSorted)
        energyMelCumulNorm = energyMelCumul / max(energyMelCumul[-1], eps)
        # First sorted frame whose normalised cumulative energy exceeds
        # the threshold. When no frame does (e.g. an all-zero melody
        # estimate), fall back to the last frame instead of raising
        # IndexError on an empty result.
        aboveThreshold = np.nonzero(energyMelCumulNorm > thres_energy)[0]
        if aboveThreshold.size > 0:
            ind_999 = aboveThreshold[0]
        else:
            ind_999 = N - 1

        # NOTE(review): this compares raw frame energies with a
        # *normalised* cumulative energy; kept as in the original code,
        # confirm that it is intended.
        melNotPresent = (energyMel <= energyMelCumulNorm[ind_999])
        indexBestPath[melNotPresent] = 0

    else:
        ## take the provided melody line:
        # load melody from file:
        melodyFromFile = np.loadtxt(options.melody)
        sizeProvidedMel = melodyFromFile.shape
        if len(sizeProvidedMel) == 1:
            print("The melody should be provided as <Time (s)><F0 (Hz)>.")
            raise ValueError("Bad melody format")
        melTimeStamps = melodyFromFile[:, 0]
        melFreqHz = melodyFromFile[:, 1]
        if minF0 > melFreqHz[
                melFreqHz > 40.0].min() or maxF0 < melFreqHz.max():
            minF0 = melFreqHz[melFreqHz > 40.0].min() * .97
            maxF0 = np.maximum(melFreqHz.max() * 1.03, 2 * minF0 * 1.03)
            print("Recomputing the source basis for ")
            print("minF0 = ", minF0, "Hz and maxF0 = ", maxF0, "Hz.")
            # Create the harmonic combs, for each F0 between minF0 and maxF0:
            F0Table, WF0 = \
                     generate_WF0_chirped(minF0, maxF0, Fs, Nfft=NFT, \
                                          stepNotes=stepNotes, \
                                          lengthWindow=windowSizeInSamples,
                                          Ot=0.25, \
                                          perF0=chirpPerF0, \
                                          depthChirpInSemiTone=.15)
            WF0 = WF0[0:F, :]  # ensure same size as SX
            NF0 = F0Table.size  # number of harmonic combs
            # Normalization:
            WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0))

        # For each analysis frame, pick the closest melody time stamp,
        # then the closest entry of the F0 table:
        sigTimeStamps = np.arange(N) * hopsize / np.double(Fs)
        distMatTimeStamps = np.abs(
            np.outer(np.ones(sizeProvidedMel[0]), sigTimeStamps) -
            np.outer(melTimeStamps, np.ones(N)))
        minDistTimeStamps = distMatTimeStamps.argmin(axis=0)
        f0BestPath = melFreqHz[minDistTimeStamps]
        distMatF0 = np.abs(
            np.outer(np.ones(NF0), f0BestPath) - np.outer(F0Table, np.ones(N)))
        indexBestPath = distMatF0.argmin(axis=0)
        # setting silences to 0, with tolerance = 1/2 window length
        indexBestPath[distMatTimeStamps[minDistTimeStamps,range(N)] >= \
                      0.5 * options.windowSize] = 0
        indexBestPath[f0BestPath <= 0] = 0

    freqMelody = F0Table[np.array(indexBestPath, dtype=int)]
    freqMelody[indexBestPath == 0] = -freqMelody[indexBestPath == 0]
    np.savetxt(
        options.pitch_output_file,
        np.array([np.arange(N) * hopsize / np.double(Fs), freqMelody]).T)

    # Second round of parameter estimation, with specific
    # initial HF00: ones in the neighbourhood of the melody path (frames
    # with path index 0 are left out), zeros elsewhere.
    HF00 = np.zeros([NF0 * chirpPerF0, N])
    dim1index, dim2index = _path_neighbourhood_indices(
        indexBestPath, N, NF0, chirpPerF0, stepNotes, scopeAllowedHF0)
    melodyPresent = indexBestPath != 0
    dim1index = dim1index[melodyPresent, :]
    dim1index = dim1index.reshape(1, dim1index.size)
    dim2index = dim2index[melodyPresent, :]
    dim2index = dim2index.reshape(1, dim2index.size)
    HF00[dim1index, dim2index] = 1

    HF00[:, indexBestPath == (NF0 - 1)] = 0.0
    HF00[:, indexBestPath == 0] = 0.0

    WF0effective = WF0
    HF00effective = HF00

    if options.melody is None:
        # Drop the first-round results (HF00effective keeps the initial
        # activations alive) before the next, memory-hungry estimation.
        del HF0, HGAMMA, HPHI, HM, WM, HF00, SX
        del SF0, SPHI, SM, hatSX

    alphaR, alphaL, HGAMMA, HPHI, HF0, \
            betaR, betaL, HM, WM, recoError2 = SIMM.Stereo_SIMM(
        # the data to be fitted to:
        SXR, SXL,
        # the basis matrices for the spectral combs
        WF0effective,
        # and for the elementary filters:
        WGAMMA,
        # number of desired filters, accompaniment spectra:
        numberOfFilters=K, numberOfAccompanimentSpectralShapes=R,
        # if any, initial amplitude matrices for
        HGAMMA0=None, HPHI0=None,
        HF00=HF00effective,
        WM0=None, HM0=None,
        # Some more optional arguments, to control the "convergence"
        # of the algo
        numberOfIterations=niter, updateRulePower=1.0,
        stepNotes=stepNotes,
        lambdaHF0 = 0.0 / (1.0 * SXR.max()), alphaHF0=0.9,
        verbose=options.verbose, displayEvolution=displayEvolution)

    # Only HGAMMA, HPHI and HF0 of this round are needed below: the
    # signals of this intermediate model were never written to disk, so
    # they are not synthesised.

    # adding the unvoiced part in the source basis:
    WUF0 = np.hstack([WF0, np.ones([WF0.shape[0], 1])])
    HUF0 = np.vstack([HF0, np.ones([1, HF0.shape[1]])])

    alphaR, alphaL, HGAMMA, HPHI, HF0, \
            betaR, betaL, HM, WM, recoError3 = SIMM.Stereo_SIMM(
        # the data to be fitted to:
        SXR, SXL,
        # the basis matrices for the spectral combs
        WUF0,
        # and for the elementary filters:
        WGAMMA,
        # number of desired filters, accompaniment spectra:
        numberOfFilters=K, numberOfAccompanimentSpectralShapes=R,
        # if any, initial amplitude matrices for
        HGAMMA0=HGAMMA, HPHI0=HPHI,
        HF00=HUF0,
        WM0=None,
        HM0=None,
        # Some more optional arguments, to control the "convergence"
        # of the algo
        numberOfIterations=niter, updateRulePower=1.0,
        stepNotes=stepNotes,
        lambdaHF0 = 0.0 / (1.0 * SXR.max()), alphaHF0=0.9,
        verbose=options.verbose, displayEvolution=displayEvolution,
        updateHGAMMA=False)

    WPHI = np.dot(WGAMMA, HGAMMA)
    SPHI = np.dot(WPHI, HPHI)
    SF0 = np.dot(WUF0, HF0)

    # Accompaniment model of each channel, and full model (lead + acc.):
    SMR = np.dot(np.dot(WM, betaR**2), HM)
    SML = np.dot(np.dot(WM, betaL**2), HM)
    hatSXR = (alphaR**2) * SF0 * SPHI + SMR
    hatSXL = (alphaL**2) * SF0 * SPHI + SML

    # Weight each channel's STFT by the share of the accompaniment in
    # the model, then go back to the time domain:
    hatMR = SMR / hatSXR * XR
    mestR = istft(
        hatMR, hopsize=hopsize, nfft=NFT,
        window=sinebell(windowSizeInSamples)) / 4.0

    hatMR = SML / hatSXL * XL
    mestL = istft(
        hatMR, hopsize=hopsize, nfft=NFT,
        window=sinebell(windowSizeInSamples)) / 4.0

    # This is the required file. The last 4 characters of the option are
    # dropped (presumably the ".wav" extension).
    outputFileName = options.mus_output_file[:-4] + '_VUIMM.wav'
    # Undo the input scaling and go back to the sample type of the input:
    mestR = np.array(np.round(mestR * scaleData), dtype=dataType)
    mestL = np.array(np.round(mestL * scaleData), dtype=dataType)
    # Write below media/karaoke through an explicit path rather than
    # os.chdir, so that a failure cannot leave the process in another
    # working directory.
    wav.write(os.path.join('media', 'karaoke', outputFileName), fs, \
              np.array([mestR,mestL]).T)

    if displayEvolution:
        plt.close('all')
    print("Done!")
    print(outputFileName)
def main(args, options):

    stereoEstimation = True

    # Median filtering in spectrogram
    HPS = False

    displayEvolution = options.displayEvolution
    if displayEvolution:
        import matplotlib.pyplot as plt
        import imageMatlab

        ## plt.rc('text', usetex=True)
        plt.rc('image', cmap='jet')  ## gray_r
        plt.ion()

    # Compulsory option: name of the input file:
    inputAudioFile = ''
    if len(args) >= 2:
        inputAudioFile = args[0]
        options.pitch_output_file = args[1]
    if len(args) == 1:
        inputAudioFile = args[0]
    if len(args) == 0:
        inputAudioFile = options.input_file

    if inputAudioFile[-4:] != ".wav":
        raise ValueError(
            "File not WAV file? Only WAV format support, for now...")

    #print "Writing the different following output files:"
    if not (options.vit_pitch_output_file is None):
        print "    estimated pitches in", options.vit_pitch_output_file
    if not (options.sal_output_file is None):
        print "    salience file in ", options.sal_output_file

    if options.pitch_output_file is None:
        options.pitch_output_file = inputAudioFile[:-4] + '_pitches.txt'

    try:
        from essentia.standard import AudioLoader
        loaded = AudioLoader(filename=inputAudioFile)()
        audio = loaded[0]
        Fs = loaded[1]
        nchan = loaded[2]
        loaded = AudioLoader(filename=inputAudioFile)()
        audio = loaded[0]
        if nchan == 1:
            data = audio[:, 0].transpose()
        else:
            data = audio.transpose()

        data = np.double(data) / (1.2 * abs(data).max())
    except:
        # Using scipy to import wav
        import scipy.io.wavfile as wav
        Fs, data = wav.read(inputAudioFile)
        # data = np.double(data) /  32768.0 # makes data vary from -1 to 1
        scaleData = 1.2 * data.max()  # to rescale the data.
        data = np.double(data) / scaleData  # makes data vary from -1 to 1
    options.Fs = Fs
    is_stereo = True
    if data.shape[0] == data.size:  # data is multi-channel
        #print "The audio file is not stereo."
        #print "The audio file is not stereo. Making stereo out of mono."
        #print "(You could also try the older separateLead.py...)"
        is_stereo = False
        # data = np.vstack([data,data]).T
        # raise ValueError("number of dimensions of the input not 2")
    if is_stereo and data.shape[1] != 2:
        print "The data is multichannel, but not stereo... \n"
        print "Unfortunately this program does not scale well. Data is \n"
        print "reduced to its 2 first channels.\n"
        data = data[:, 0:2]

    # Processing the options:
    windowSizeInSamples = nextpow2(np.round(options.windowSize * Fs))

    hopsize = np.round(options.hopsize * Fs)
    #if hopsize != windowSizeInSamples/8:
    #    #print "Overriding given hopsize to use 1/8th of window size"
    #    #hopsize = windowSizeInSamples/8
    #    warnings.warn("Chosen hopsize: "+str(hopsize)+\
    #                  ", while windowsize: "+str(windowSizeInSamples))

    options.hopsizeInSamples = hopsize
    if options.fourierSize is None:
        NFT = windowSizeInSamples
    else:
        NFT = options.fourierSize

    # number of iterations for each parameter estimation step:
    niter = options.nbiter
    # number of spectral shapes for the accompaniment
    R = int(options.R)

    eps = 10**-9

    if options.verbose:
        print "Some parameter settings:"
        print "    Size of analysis windows: ", windowSizeInSamples
        print "    Hopsize: ", hopsize
        print "    Size of Fourier transforms: ", NFT
        print "    Number of iterations to be done: ", niter
        print "    Number of elements in WM: ", R

    if is_stereo:
        XR, F, N = stft(data[:, 0],
                        fs=Fs,
                        hopsize=hopsize,
                        window=sinebell(windowSizeInSamples),
                        nfft=NFT)
        XL, F, N = stft(data[:, 1],
                        fs=Fs,
                        hopsize=hopsize,
                        window=sinebell(windowSizeInSamples),
                        nfft=NFT)
        # SX is the power spectrogram:
        ## SXR = np.maximum(np.abs(XR) ** 2, 10 ** -8)
        ## SXL = np.maximum(np.abs(XL) ** 2, 10 ** -8)
        #SXR = np.abs(XR) ** 2
        #SXL = np.abs(XL) ** 2
        SX = np.maximum((0.5 * np.abs(XR + XL))**2, eps)
    else:  # data is mono
        X, F, N = stft(data,
                       fs=Fs,
                       hopsize=hopsize,
                       window=sinebell(windowSizeInSamples),
                       nfft=NFT)
        SX = np.maximum(np.abs(X)**2, eps)

    del data, F, N

    # minimum and maximum F0 in glottal source spectra dictionary
    minF0 = options.minF0
    maxF0 = options.maxF0
    F, N = SX.shape
    stepNotes = options.stepNotes  # this is the number of F0s within one semitone

    K = int(
        options.K_numFilters)  # number of spectral shapes for the filter part
    P = int(options.P_numAtomFilters
            )  # number of elements in dictionary of smooth filters
    chirpPerF0 = 1  # number of chirped spectral shapes between each F0
    # this feature should be further studied before
    # we find a good way of doing that.

    # Create the harmonic combs, for each F0 between minF0 and maxF0:
    F0Table, WF0 = \
             generate_WF0_chirped(minF0, maxF0, Fs, Nfft=NFT, \
                                  stepNotes=stepNotes, \
                                  lengthWindow=windowSizeInSamples, Ot=0.25, \
                                  perF0=chirpPerF0, \
                                  depthChirpInSemiTone=.15, loadWF0=True,\
                                  analysisWindow='sinebell')
    WF0 = WF0[0:F, :]  # ensure same size as SX
    NF0 = F0Table.size  # number of harmonic combs
    # Normalization:
    WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0))

    # Create the dictionary of smooth filters, for the filter part of
    # the lead isntrument:
    WGAMMA = generateHannBasis(F, NFT, Fs=Fs, frequencyScale='linear', \
                               numberOfBasis=P, overlap=.75)

    if options.sal_output_file is None or not os.path.exists(
            options.sal_output_file):
        if displayEvolution:
            plt.figure(1)
            plt.clf()
            plt.xticks(fontsize=16)
            plt.yticks(fontsize=16)
            plt.xlabel(r'Frame number $n$', fontsize=16)
            plt.ylabel(r'Leading source number $u$', fontsize=16)
            plt.ion()
            # plt.show()
            ## the following seems superfluous if mpl's backend is macosx...
            ##        raw_input("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"\
            ##                  "!! Press Return to resume the program. !!\n"\
            ##                  "!! Be sure that the figure has been    !!\n"\
            ##                  "!! already displayed, so that the      !!\n"\
            ##                  "!! evolution of HF0 will be visible.   !!\n"\
            ##                  "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")

        ## section to estimate the melody, on monophonic algo:
        # First round of parameter estimation:
        if (HPS):
            from scipy.signal import medfilt
        if (is_stereo & stereoEstimation):
            SXR = np.maximum(np.abs(XR)**2, eps)
            SXL = np.maximum(np.abs(XL)**2, eps)
            if (HPS):
                SXR = medfilt(SXR, 3)
                SXL = medfilt(SXL, 3)

            alphaR, alphaL, HGAMMA, HPHI, HF0, betaR, betaL, HM, WM, recoError1 = SIMM.Stereo_SIMM(
                # the data to be fitted to:
                SXR,
                SXL,
                # the basis matrices for the spectral combs
                WF0,
                # and for the elementary filters:
                WGAMMA,
                # number of desired filters, accompaniment spectra:
                numberOfFilters=K,
                numberOfAccompanimentSpectralShapes=R,
                # putting only 2 elements in accompaniment for a start...
                # if any, initial amplitude matrices for
                HGAMMA0=None,
                HPHI0=None,
                HF00=None,
                WM0=None,
                HM0=None,
                # Some more optional arguments, to control the "convergence"
                # of the algo
                numberOfIterations=niter,
                updateRulePower=1.,
                stepNotes=stepNotes,
                lambdaHF0=0.0 / (1.0 * SX.max()),
                alphaHF0=0.9,
                verbose=options.verbose,
                displayEvolution=displayEvolution)
        else:
            if (HPS):
                SX = medfilt(SX, 3)

            HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM(
                # the data to be fitted to:
                SX,
                # the basis matrices for the spectral combs
                WF0,
                # and for the elementary filters:
                WGAMMA,
                # number of desired filters, accompaniment spectra:
                numberOfFilters=K,
                numberOfAccompanimentSpectralShapes=R,
                # putting only 2 elements in accompaniment for a start...
                # if any, initial amplitude matrices for
                HGAMMA0=None,
                HPHI0=None,
                HF00=None,
                WM0=None,
                HM0=None,
                # Some more optional arguments, to control the "convergence"
                # of the algo
                numberOfIterations=niter,
                updateRulePower=1.,
                stepNotes=stepNotes,
                lambdaHF0=0.0 / (1.0 * SX.max()),
                alphaHF0=0.9,
                verbose=options.verbose,
                displayEvolution=displayEvolution)
        if displayEvolution:
            h2 = plt.figure(2)
            plt.clf()
            imageMatlab.imageM(20 * np.log10(HF0))
            matMax = (20 * np.log10(HF0)).max()
            matMed = np.median(20 * np.log10(HF0))
            plt.clim([matMed - 100, matMax])

    else:
        print "Loading Salience from file to calculate Melody: " + options.sal_output_file
        loaded = np.loadtxt(options.sal_output_file).T
        times = [loaded[0, :]]
        HF0 = loaded[1:, :]

    # If vit_pitch_output_file is not null, do melody extraction with Viterbi
    if not (options.vit_pitch_output_file is None):
        print "Viterbi decoding"
        # Viterbi decoding to estimate the predominant fundamental
        # frequency line
        # create transition probability matrix - adhoc parameter 'scale'
        # TODO: use "learned" parameter scale (NB: after many trials,
        # provided scale and parameterization seems robust)
        scale = 1.0
        transitions = np.exp(-np.floor(np.arange(0, NF0) / stepNotes) * scale)
        cutoffnote = 2 * 5 * stepNotes
        transitions[cutoffnote:] = transitions[cutoffnote - 1]

        transitionMatrixF0 = np.zeros([NF0 + 1, NF0 + 1])  # toeplitz matrix
        b = np.arange(NF0)
        transitionMatrixF0[0:NF0, 0:NF0] = \
                                  transitions[\
            np.array(np.abs(np.outer(np.ones(NF0), b) \
                            - np.outer(b, np.ones(NF0))), dtype=int)]
        pf_0 = transitions[cutoffnote - 1] * 10**(-90)
        p0_0 = transitions[cutoffnote - 1] * 10**(-100)
        p0_f = transitions[cutoffnote - 1] * 10**(-80)
        transitionMatrixF0[0:NF0, NF0] = pf_0
        transitionMatrixF0[NF0, 0:NF0] = p0_f
        transitionMatrixF0[NF0, NF0] = p0_0

        sumTransitionMatrixF0 = np.sum(transitionMatrixF0, axis=1)
        transitionMatrixF0 = transitionMatrixF0 \
                             / np.outer(sumTransitionMatrixF0, \
                                        np.ones(NF0 + 1))

        # prior probabilities, and setting the array for Viterbi tracking:
        priorProbabilities = 1 / (NF0 + 1.0) * np.ones([NF0 + 1])
        logHF0 = np.zeros([NF0 + 1, N])
        normHF0 = np.amax(HF0, axis=0)
        barHF0 = np.array(HF0)

        logHF0[0:NF0, :] = np.log(barHF0)
        logHF0[0:NF0, normHF0 == 0] = np.amin(logHF0[logHF0 > -np.Inf])
        logHF0[NF0, :] = np.maximum(np.amin(logHF0[logHF0 > -np.Inf]), -100)

        indexBestPath = viterbiTrackingArray(\
            logHF0, np.log(priorProbabilities),
            np.log(transitionMatrixF0), verbose=options.verbose)

        if displayEvolution:
            h2.hold(True)
            plt.plot(indexBestPath, '-b')
            h2.hold(False)
            plt.axis('tight')

        del logHF0

        # detection of silences:
        # computing the melody restricted F0 amplitude matrix HF00
        # (which will be used as initial HF0 for further algo):
        HF00 = np.zeros([NF0 * chirpPerF0, N])
        scopeAllowedHF0 = 2.0 / 1.0
        # computing indices for and around the melody indices,
        # dim1index are indices along axis 0, and dim2index along axis 1
        # of HF0:
        #     TODO: use numpy broadcasting to make this "clearer" (if possible...)
        dim1index = np.array(\
            np.maximum(\
                np.minimum(\
                    np.outer(chirpPerF0 * indexBestPath,
                             np.ones(chirpPerF0 \
                                     * (2 \
                                        * int(np.floor(stepNotes / scopeAllowedHF0)) \
                                        + 1))) \
                    + np.outer(np.ones(N),
                               np.arange(-chirpPerF0 \
                                         * int(np.floor(stepNotes / scopeAllowedHF0)),
                                         chirpPerF0 \
                                         * int((np.floor(stepNotes / scopeAllowedHF0))) \
                                            + 1)),
                    chirpPerF0 * NF0 - 1),
                0),
            dtype=int).reshape(1, N * chirpPerF0 \
                               * (2 * int(np.floor(stepNotes / scopeAllowedHF0)) \
                                  + 1))
        dim2index = np.outer(np.arange(N),
                             np.ones(chirpPerF0 \
                                     * (2 * int(np.floor(stepNotes \
                                                     / scopeAllowedHF0)) + 1), \
                                     dtype=int)\
                             ).reshape(1, N * chirpPerF0 \
                                       * (2 * int(np.floor(stepNotes \
                                                       / scopeAllowedHF0)) \
                                          + 1))
        HF00[dim1index, dim2index] = HF0[dim1index, dim2index]  # HF0.max()

        HF00[:, indexBestPath == (NF0 - 1)] = 0.0
        HF00[:, indexBestPath == 0] = 0.0

        # remove frames with less than (100 thres_energy) % of total energy.
        thres_energy = 0.000584
        SF0 = np.maximum(np.dot(WF0, HF00), eps)
        SPHI = np.maximum(np.dot(WGAMMA, np.dot(HGAMMA, HPHI)), eps)
        SM = np.maximum(np.dot(WM, HM), eps)
        hatSX = np.maximum(SPHI * SF0 + SM, eps)

        energyMel = np.sum((((SPHI * SF0) / hatSX)**2) * SX, axis=0)
        energyMelSorted = np.sort(energyMel)
        energyMelCumul = np.cumsum(energyMelSorted)
        energyMelCumulNorm = energyMelCumul / max(energyMelCumul[-1], eps)
        # normalized to the maximum of energy:
        # expressed in 0.01 times the percentage
        ind_999 = np.nonzero(energyMelCumulNorm > thres_energy)[0][0]
        if ind_999 is None:
            ind_999 = N
        if not os.path.isdir(os.path.dirname((options.vit_pitch_output_file))):
            os.mkdir(os.path.dirname((options.vit_pitch_output_file)))

        np.savetxt(options.vit_pitch_output_file + '.egy',
                   np.array(
                       [np.arange(N) * hopsize / np.double(Fs), energyMel]).T,
                   fmt='%10.5f')

        # energyMel <= energyMelCumul[ind_999]?

        melNotPresent = (energyMel <= energyMelCumulNorm[ind_999])

        # edit: frames predicted as unvoiced will be given negative values
        # indexBestPath[melNotPresent] = 0

        freqMelody = F0Table[np.array(np.minimum(indexBestPath,
                                                 len(F0Table) - 1),
                                      dtype=int)]
        freqMelody[melNotPresent] = -freqMelody[melNotPresent]

        if not os.path.exists(os.path.dirname(options.vit_pitch_output_file)):
            os.makedirs(os.path.dirname(options.vit_pitch_output_file))

        np.savetxt(options.vit_pitch_output_file,
                   np.array(
                       [np.arange(N) * hopsize / np.double(Fs), freqMelody]).T,
                   fmt='%10.7f')

    times = np.array([np.arange(N) * hopsize / np.double(Fs)])

    # Save salience file:
    if not (options.sal_output_file is None):
        if not os.path.exists(os.path.dirname(options.sal_output_file)):
            os.makedirs(os.path.dirname(options.sal_output_file))
        np.savetxt(options.sal_output_file,
                   np.concatenate((times, HF0), axis=0).T,
                   fmt='%10.6f')
        # saveSPHI (timbre related)
        saveSPHI = 0
        if saveSPHI:
            if not os.path.exists(
                    os.path.dirname(options.sal_output_file + '.SPHI')):
                os.makedirs(os.path.dirname(options.sal_output_file))
            WPHI = np.dot(WGAMMA, HGAMMA)
            SPHI = np.dot(WPHI, HPHI)
            np.savetxt(options.sal_output_file + '.SPHI',
                       np.concatenate((times, SPHI), axis=0).T,
                       fmt='%10.4f')
        #np.savetxt(options.sal_output_file+'.WGAMMA',np.concatenate((times,WGAMMA),axis=0).T,fmt='%10.4f')

    # return times[0],freqMelody,HF0
    print "Done!"
    return times[0], HF0, options
def main():
    import optparse

    usage = "usage: %prog [options] inputAudioFile"
    parser = optparse.OptionParser(usage)
    # Name of the output files:
    parser.add_option(
        "-v",
        "--vocal-output-file",
        dest="voc_output_file",
        type="string",
        help="name of the audio output file for the estimated\n"
        "solo (vocal) part. \n"
        "If None, appends _lead to inputAudioFile.",
        default=None,
    )
    parser.add_option(
        "-m",
        "--music-output-file",
        dest="mus_output_file",
        type="string",
        help="name of the audio output file for the estimated\n"
        "music part.\n"
        "If None, appends _acc to inputAudioFile.",
        default=None,
    )
    parser.add_option(
        "-p",
        "--pitch-output-file",
        dest="pitch_output_file",
        type="string",
        help="name of the output file for the estimated pitches.\n" "If None, appends _pitches to inputAudioFile",
        default=None,
    )

    # Some more optional options:
    parser.add_option(
        "-d", "--with-display", dest="displayEvolution", action="store_true", help="display the figures", default=False
    )
    parser.add_option(
        "-q", "--quiet", dest="verbose", action="store_false", help="use to quiet all output verbose", default=True
    )
    parser.add_option(
        "-n",
        "--dontseparate",
        dest="separateSignals",
        action="store_false",
        help="Trigger this option if you only desire to " + "estimate the melody",
        default=True,
    )
    parser.add_option("--nb-iterations", dest="nbiter", help="number of iterations", type="int", default=30)
    parser.add_option(
        "--window-size", dest="windowSize", type="float", default=0.04644, help="size of analysis windows, in s."
    )
    parser.add_option(
        "--Fourier-size",
        dest="fourierSize",
        type="int",
        default=None,
        help="size of Fourier transforms, " "in samples.",
    )
    parser.add_option(
        "--hopsize",
        dest="hopsize",
        type="float",
        default=0.0058,
        help="size of the hop between analysis windows, in s.",
    )
    parser.add_option(
        "--nb-accElements", dest="R", type="float", default=40.0, help="number of elements for the accompaniment."
    )

    parser.add_option(
        "--with-melody",
        dest="melody",
        type="string",
        default=None,
        help="provide the melody in a file named MELODY, " "with at each line: <time (s)><F0 (Hz)>.",
    )

    parser.add_option(
        "--numAtomFilters",
        dest="P_numAtomFilters",
        type="int",
        default=30,
        help="Number of atomic filters - in WGAMMA.",
    )
    parser.add_option(
        "--numFilters",
        dest="K_numFilters",
        type="int",
        default=10,
        help="Number of filters for decomposition - in WPHI",
    )
    parser.add_option(
        "--min-F0-Freq", dest="minF0", type="float", default=100.0, help="Minimum of fundamental frequency F0."
    )
    parser.add_option(
        "--max-F0-Freq", dest="maxF0", type="float", default=800.0, help="Maximum of fundamental frequency F0."
    )
    parser.add_option(
        "--step-F0s", dest="stepNotes", type="int", default=20, help="Number of F0s in dictionary for each semitone."
    )

    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("incorrect number of arguments, use option -h for help.")

    displayEvolution = options.displayEvolution
    if displayEvolution:
        import matplotlib.pyplot as plt
        import imageMatlab

        ## plt.rc('text', usetex=True)
        plt.rc("image", cmap="jet")  ## gray_r
        plt.ion()

    # Compulsory option: name of the input file:
    inputAudioFile = args[0]
    if inputAudioFile[-4:] != ".wav":
        raise ValueError("File not WAV file? Only WAV format support, for now...")

    if options.mus_output_file is None:
        options.mus_output_file = inputAudioFile[:-4] + "_acc.wav"

    if options.voc_output_file is None:
        options.voc_output_file = inputAudioFile[:-4] + "_lead.wav"

    if options.pitch_output_file is None:
        options.pitch_output_file = inputAudioFile[:-4] + "_pitches.txt"

    print "Writing the different following output files:"
    print "    separated lead          in", options.voc_output_file
    print "    separated accompaniment in", options.mus_output_file
    print "    separated lead + unvoc  in", options.voc_output_file[:-4] + "_VUIMM.wav"
    print "    separated acc  - unvoc  in", options.mus_output_file[:-4] + "_VUIMM.wav"
    print "    estimated pitches       in", options.pitch_output_file

    Fs, data = wav.read(inputAudioFile)
    # data = np.double(data) /  32768.0 # makes data vary from -1 to 1
    scaleData = 1.2 * data.max()  # to rescale the data.
    dataType = data.dtype
    data = np.double(data) / scaleData  # makes data vary from -1 to 1
    is_stereo = True
    if data.shape[0] == data.size:  # data is multi-channel
        print "The audio file is not stereo. Making stereo out of mono."
        print "(You could also try the older separateLead.py...)"
        is_stereo = False
        # data = np.vstack([data,data]).T
        # raise ValueError("number of dimensions of the input not 2")
    if is_stereo and data.shape[1] != 2:
        print "The data is multichannel, but not stereo... \n"
        print "Unfortunately this program does not scale well. Data is \n"
        print "reduced to its 2 first channels.\n"
        data = data[:, 0:2]

    # Processing the options:
    windowSizeInSamples = nextpow2(np.round(options.windowSize * Fs))

    hopsize = np.round(options.hopsize * Fs)
    if hopsize != windowSizeInSamples / 8:
        # print "Overriding given hopsize to use 1/8th of window size"
        # hopsize = windowSizeInSamples/8
        warnings.warn("Chosen hopsize: " + str(hopsize) + ", while windowsize: " + str(windowSizeInSamples))

    if options.fourierSize is None:
        NFT = windowSizeInSamples
    else:
        NFT = options.fourierSize

    # number of iterations for each parameter estimation step:
    niter = options.nbiter
    # number of spectral shapes for the accompaniment
    R = options.R

    eps = 10 ** -9

    if options.verbose:
        print "Some parameter settings:"
        print "    Size of analysis windows: ", windowSizeInSamples
        print "    Hopsize: ", hopsize
        print "    Size of Fourier transforms: ", NFT
        print "    Number of iterations to be done: ", niter
        print "    Number of elements in WM: ", R

    if is_stereo:
        XR, F, N = stft(data[:, 0], fs=Fs, hopsize=hopsize, window=sinebell(windowSizeInSamples), nfft=NFT)
        XL, F, N = stft(data[:, 1], fs=Fs, hopsize=hopsize, window=sinebell(windowSizeInSamples), nfft=NFT)
        # SX is the power spectrogram:
        ## SXR = np.maximum(np.abs(XR) ** 2, 10 ** -8)
        ## SXL = np.maximum(np.abs(XL) ** 2, 10 ** -8)
        # SXR = np.abs(XR) ** 2
        # SXL = np.abs(XL) ** 2
        SX = np.maximum((0.5 * np.abs(XR + XL)) ** 2, eps)
    else:  # data is mono
        X, F, N = stft(data, fs=Fs, hopsize=hopsize, window=sinebell(windowSizeInSamples), nfft=NFT)
        SX = np.maximum(np.abs(X) ** 2, eps)

    del data, F, N

    # TODO: also process these as options:
    # minimum and maximum F0 in glottal source spectra dictionary
    minF0 = options.minF0
    maxF0 = options.maxF0
    F, N = SX.shape
    stepNotes = options.stepNotes  # this is the number of F0s within one semitone

    K = options.K_numFilters  # number of spectral shapes for the filter part
    P = options.P_numAtomFilters  # number of elements in dictionary of smooth filters
    chirpPerF0 = 1  # number of chirped spectral shapes between each F0
    # this feature should be further studied before
    # we find a good way of doing that.

    # Create the harmonic combs, for each F0 between minF0 and maxF0:
    F0Table, WF0 = generate_WF0_chirped(
        minF0,
        maxF0,
        Fs,
        Nfft=NFT,
        stepNotes=stepNotes,
        lengthWindow=windowSizeInSamples,
        Ot=0.25,
        perF0=chirpPerF0,
        depthChirpInSemiTone=0.15,
        loadWF0=True,
        analysisWindow="sinebell",
    )
    WF0 = WF0[0:F, :]  # ensure same size as SX
    NF0 = F0Table.size  # number of harmonic combs
    # Normalization:
    WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0))

    # Create the dictionary of smooth filters, for the filter part of
    # the lead isntrument:
    WGAMMA = generateHannBasis(F, NFT, Fs=Fs, frequencyScale="linear", numberOfBasis=P, overlap=0.75)

    if displayEvolution:
        plt.figure(1)
        plt.clf()
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r"Frame number $n$", fontsize=16)
        plt.ylabel(r"Leading source number $u$", fontsize=16)
        plt.ion()
        # plt.show()
        ## the following seems superfluous if mpl's backend is macosx...
        ##        raw_input("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"\
        ##                  "!! Press Return to resume the program. !!\n"\
        ##                  "!! Be sure that the figure has been    !!\n"\
        ##                  "!! already displayed, so that the      !!\n"\
        ##                  "!! evolution of HF0 will be visible.   !!\n"\
        ##                  "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")

    if options.melody is None:
        ## section to estimate the melody, on monophonic algo:
        # First round of parameter estimation:
        HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM(
            # the data to be fitted to:
            SX,
            # the basis matrices for the spectral combs
            WF0,
            # and for the elementary filters:
            WGAMMA,
            # number of desired filters, accompaniment spectra:
            numberOfFilters=K,
            numberOfAccompanimentSpectralShapes=R,
            # putting only 2 elements in accompaniment for a start...
            # if any, initial amplitude matrices for
            HGAMMA0=None,
            HPHI0=None,
            HF00=None,
            WM0=None,
            HM0=None,
            # Some more optional arguments, to control the "convergence"
            # of the algo
            numberOfIterations=niter,
            updateRulePower=1.0,
            stepNotes=stepNotes,
            lambdaHF0=0.0 / (1.0 * SX.max()),
            alphaHF0=0.9,
            verbose=options.verbose,
            displayEvolution=displayEvolution,
        )

        if displayEvolution:
            h2 = plt.figure(2)
            plt.clf()
            imageMatlab.imageM(20 * np.log10(HF0))
            matMax = (20 * np.log10(HF0)).max()
            matMed = np.median(20 * np.log10(HF0))
            plt.clim([matMed - 100, matMax])

        # Viterbi decoding to estimate the predominant fundamental
        # frequency line
        # create transition probability matrix - adhoc parameter 'scale'
        # TODO: use "learned" parameter scale (NB: after many trials,
        # provided scale and parameterization seems robust)
        scale = 1.0
        transitions = np.exp(-np.floor(np.arange(0, NF0) / stepNotes) * scale)
        cutoffnote = 2 * 5 * stepNotes
        transitions[cutoffnote:] = transitions[cutoffnote - 1]

        transitionMatrixF0 = np.zeros([NF0 + 1, NF0 + 1])  # toeplitz matrix
        b = np.arange(NF0)
        transitionMatrixF0[0:NF0, 0:NF0] = transitions[
            np.array(np.abs(np.outer(np.ones(NF0), b) - np.outer(b, np.ones(NF0))), dtype=int)
        ]
        pf_0 = transitions[cutoffnote - 1] * 10 ** (-90)
        p0_0 = transitions[cutoffnote - 1] * 10 ** (-100)
        p0_f = transitions[cutoffnote - 1] * 10 ** (-80)
        transitionMatrixF0[0:NF0, NF0] = pf_0
        transitionMatrixF0[NF0, 0:NF0] = p0_f
        transitionMatrixF0[NF0, NF0] = p0_0

        sumTransitionMatrixF0 = np.sum(transitionMatrixF0, axis=1)
        transitionMatrixF0 = transitionMatrixF0 / np.outer(sumTransitionMatrixF0, np.ones(NF0 + 1))

        # prior probabilities, and setting the array for Viterbi tracking:
        priorProbabilities = 1 / (NF0 + 1.0) * np.ones([NF0 + 1])
        logHF0 = np.zeros([NF0 + 1, N])
        normHF0 = np.amax(HF0, axis=0)
        barHF0 = np.array(HF0)

        logHF0[0:NF0, :] = np.log(barHF0)
        logHF0[0:NF0, normHF0 == 0] = np.amin(logHF0[logHF0 > -np.Inf])
        logHF0[NF0, :] = np.maximum(np.amin(logHF0[logHF0 > -np.Inf]), -100)

        indexBestPath = viterbiTrackingArray(
            logHF0, np.log(priorProbabilities), np.log(transitionMatrixF0), verbose=options.verbose
        )

        if displayEvolution:
            h2.hold(True)
            plt.plot(indexBestPath, "-b")
            h2.hold(False)
            plt.axis("tight")

        del logHF0

        # detection of silences:
        # computing the melody restricted F0 amplitude matrix HF00
        # (which will be used as initial HF0 for further algo):
        HF00 = np.zeros([NF0 * chirpPerF0, N])
        scopeAllowedHF0 = 2.0 / 1.0
        # computing indices for and around the melody indices,
        # dim1index are indices along axis 0, and dim2index along axis 1
        # of HF0:
        #     TODO: use numpy broadcasting to make this "clearer" (if possible...)
        dim1index = np.array(
            np.maximum(
                np.minimum(
                    np.outer(
                        chirpPerF0 * indexBestPath,
                        np.ones(chirpPerF0 * (2 * np.floor(stepNotes / scopeAllowedHF0) + 1)),
                    )
                    + np.outer(
                        np.ones(N),
                        np.arange(
                            -chirpPerF0 * np.floor(stepNotes / scopeAllowedHF0),
                            chirpPerF0 * (np.floor(stepNotes / scopeAllowedHF0) + 1),
                        ),
                    ),
                    chirpPerF0 * NF0 - 1,
                ),
                0,
            ),
            dtype=int,
        ).reshape(1, N * chirpPerF0 * (2 * np.floor(stepNotes / scopeAllowedHF0) + 1))
        dim2index = np.outer(
            np.arange(N), np.ones(chirpPerF0 * (2 * np.floor(stepNotes / scopeAllowedHF0) + 1), dtype=int)
        ).reshape(1, N * chirpPerF0 * (2 * np.floor(stepNotes / scopeAllowedHF0) + 1))
        HF00[dim1index, dim2index] = HF0[dim1index, dim2index]  # HF0.max()

        HF00[:, indexBestPath == (NF0 - 1)] = 0.0
        HF00[:, indexBestPath == 0] = 0.0

        # remove frames with less than (100 thres_energy) % of total energy.
        thres_energy = 0.000584
        SF0 = np.maximum(np.dot(WF0, HF00), eps)
        SPHI = np.maximum(np.dot(WGAMMA, np.dot(HGAMMA, HPHI)), eps)
        SM = np.maximum(np.dot(WM, HM), eps)
        hatSX = np.maximum(SPHI * SF0 + SM, eps)
        energyMel = np.sum((((SPHI * SF0) / hatSX) ** 2) * SX, axis=0)
        energyMelSorted = np.sort(energyMel)
        energyMelCumul = np.cumsum(energyMelSorted)
        energyMelCumulNorm = energyMelCumul / max(energyMelCumul[-1], eps)
        # normalized to the maximum of energy:
        # expressed in 0.01 times the percentage
        ind_999 = np.nonzero(energyMelCumulNorm > thres_energy)[0][0]
        if ind_999 is None:
            ind_999 = N

        melNotPresent = energyMel <= energyMelCumulNorm[ind_999]
        indexBestPath[melNotPresent] = 0

    else:
        ## take the provided melody line:
        # load melody from file:
        melodyFromFile = np.loadtxt(options.melody)
        sizeProvidedMel = melodyFromFile.shape
        if len(sizeProvidedMel) == 1:
            print "The melody should be provided as <Time (s)><F0 (Hz)>."
            raise ValueError("Bad melody format")
        melTimeStamps = melodyFromFile[:, 0]  # + 1024 / np.double(Fs)
        melFreqHz = melodyFromFile[:, 1]
        if minF0 > melFreqHz[melFreqHz > 40.0].min() or maxF0 < melFreqHz.max():
            minF0 = melFreqHz[melFreqHz > 40.0].min() * 0.97
            maxF0 = np.maximum(melFreqHz.max() * 1.03, 2 * minF0 * 1.03)
            print "Recomputing the source basis for "
            print "minF0 = ", minF0, "Hz and maxF0 = ", maxF0, "Hz."
            # Create the harmonic combs, for each F0 between minF0 and maxF0:
            F0Table, WF0 = generate_WF0_chirped(
                minF0,
                maxF0,
                Fs,
                Nfft=NFT,
                stepNotes=stepNotes,
                lengthWindow=windowSizeInSamples,
                Ot=0.25,
                perF0=chirpPerF0,
                depthChirpInSemiTone=0.15,
            )
            WF0 = WF0[0:F, :]  # ensure same size as SX
            NF0 = F0Table.size  # number of harmonic combs
            # Normalization:
            WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0))

        sigTimeStamps = np.arange(N) * hopsize / np.double(Fs)
        distMatTimeStamps = np.abs(
            np.outer(np.ones(sizeProvidedMel[0]), sigTimeStamps) - np.outer(melTimeStamps, np.ones(N))
        )
        minDistTimeStamps = distMatTimeStamps.argmin(axis=0)
        f0BestPath = melFreqHz[minDistTimeStamps]
        distMatF0 = np.abs(np.outer(np.ones(NF0), f0BestPath) - np.outer(F0Table, np.ones(N)))
        indexBestPath = distMatF0.argmin(axis=0)
        # setting silences to 0, with tolerance = 1/2 window length
        indexBestPath[distMatTimeStamps[minDistTimeStamps, range(N)] >= 0.5 * options.windowSize] = 0
        indexBestPath[f0BestPath <= 0] = 0

    freqMelody = F0Table[np.array(indexBestPath, dtype=int)]
    freqMelody[indexBestPath == 0] = -freqMelody[indexBestPath == 0]
    np.savetxt(options.pitch_output_file, np.array([np.arange(N) * hopsize / np.double(Fs), freqMelody]).T)

    # If separation is required:
    if options.separateSignals:
        # Second round of parameter estimation, with specific
        # initial HF00:
        HF00 = np.zeros([NF0 * chirpPerF0, N])

        scopeAllowedHF0 = 2.0 / 1.0

        # indexes for HF00:
        # TODO: reprogram this with a 'where'?...
        dim1index = np.array(
            np.maximum(
                np.minimum(
                    np.outer(
                        chirpPerF0 * indexBestPath,
                        np.ones(chirpPerF0 * (2 * np.floor(stepNotes / scopeAllowedHF0) + 1)),
                    )
                    + np.outer(
                        np.ones(N),
                        np.arange(
                            -chirpPerF0 * np.floor(stepNotes / scopeAllowedHF0),
                            chirpPerF0 * (np.floor(stepNotes / scopeAllowedHF0) + 1),
                        ),
                    ),
                    chirpPerF0 * NF0 - 1,
                ),
                0,
            ),
            dtype=int,
        )
        dim1index = dim1index[indexBestPath != 0, :]
        ## dim1index = dim1index.reshape(1, N * chirpPerF0 \
        ##                        * (2 * np.floor(stepNotes / scopeAllowedHF0) \
        ##                          + 1))
        dim1index = dim1index.reshape(1, dim1index.size)

        dim2index = np.outer(
            np.arange(N), np.ones(chirpPerF0 * (2 * np.floor(stepNotes / scopeAllowedHF0) + 1), dtype=int)
        )
        dim2index = dim2index[indexBestPath != 0, :]
        dim2index = dim2index.reshape(1, dim2index.size)
        ## dim2index.reshape(1, N * chirpPerF0 \
        ##                                * (2 * np.floor(stepNotes \
        ##                                                / scopeAllowedHF0) \
        ##                                   + 1))
        HF00[dim1index, dim2index] = 1  # HF0.max()

        HF00[:, indexBestPath == (NF0 - 1)] = 0.0
        HF00[:, indexBestPath == 0] = 0.0

        WF0effective = WF0
        HF00effective = HF00

        if options.melody is None:
            del HF0, HGAMMA, HPHI, HM, WM, HF00

        if is_stereo:
            del SX
            SXR = np.maximum(np.abs(XR) ** 2, eps)
            SXL = np.maximum(np.abs(XL) ** 2, eps)
            alphaR, alphaL, HGAMMA, HPHI, HF0, betaR, betaL, HM, WM, recoError2 = SIMM.Stereo_SIMM(
                # the data to be fitted to:
                SXR,
                SXL,
                # the basis matrices for the spectral combs
                WF0effective,
                # and for the elementary filters:
                WGAMMA,
                # number of desired filters, accompaniment spectra:
                numberOfFilters=K,
                numberOfAccompanimentSpectralShapes=R,
                # if any, initial amplitude matrices for
                HGAMMA0=None,
                HPHI0=None,
                HF00=HF00effective,
                WM0=None,
                HM0=None,
                # Some more optional arguments, to control the "convergence"
                # of the algo
                numberOfIterations=niter,
                updateRulePower=1.0,
                stepNotes=stepNotes,
                lambdaHF0=0.0 / (1.0 * SXR.max()),
                alphaHF0=0.9,
                verbose=options.verbose,
                displayEvolution=displayEvolution,
            )

            WPHI = np.dot(WGAMMA, HGAMMA)
            SPHI = np.dot(WPHI, HPHI)
            SF0 = np.dot(WF0effective, HF0)

            hatSXR = (alphaR ** 2) * SF0 * SPHI + np.dot(np.dot(WM, betaR ** 2), HM)
            hatSXL = (alphaL ** 2) * SF0 * SPHI + np.dot(np.dot(WM, betaL ** 2), HM)

            hatVR = (alphaR ** 2) * SPHI * SF0 / hatSXR * XR

            vestR = istft(hatVR, hopsize=hopsize, nfft=NFT, window=sinebell(windowSizeInSamples)) / 4.0

            hatVR = (alphaL ** 2) * SPHI * SF0 / hatSXL * XL

            vestL = istft(hatVR, hopsize=hopsize, nfft=NFT, window=sinebell(windowSizeInSamples)) / 4.0

            # scikits.audiolab.wavwrite(np.array([vestR,vestL]).T, \
            #                          options.voc_output_file, Fs)

            vestR = np.array(np.round(vestR * scaleData), dtype=dataType)
            vestL = np.array(np.round(vestL * scaleData), dtype=dataType)
            wav.write(options.voc_output_file, Fs, np.array([vestR, vestL]).T)

            # wav.write(options.voc_output_file, Fs, \
            #          np.int16(32768.0 * np.array([vestR,vestL]).T))

            hatMR = (np.dot(np.dot(WM, betaR ** 2), HM)) / hatSXR * XR

            mestR = istft(hatMR, hopsize=hopsize, nfft=NFT, window=sinebell(windowSizeInSamples)) / 4.0

            hatMR = (np.dot(np.dot(WM, betaL ** 2), HM)) / hatSXL * XL

            mestL = istft(hatMR, hopsize=hopsize, nfft=NFT, window=sinebell(windowSizeInSamples)) / 4.0

            # scikits.audiolab.wavwrite(np.array([mestR,mestL]).T, \
            #                          options.mus_output_file, Fs)

            mestR = np.array(np.round(mestR * scaleData), dtype=dataType)
            mestL = np.array(np.round(mestL * scaleData), dtype=dataType)
            wav.write(options.mus_output_file, Fs, np.array([mestR, mestL]).T)

            # wav.write(options.mus_output_file, Fs, \
            #          np.int16(32768.0 * np.array([mestR,mestL]).T))

            del hatMR, mestL, vestL, vestR, mestR, hatVR, hatSXR, hatSXL, SPHI, SF0

            # adding the unvoiced part in the source basis:
            WUF0 = np.hstack([WF0, np.ones([WF0.shape[0], 1])])
            HUF0 = np.vstack([HF0, np.ones([1, HF0.shape[1]])])
            ## HUF0[-1,:] = HF0.sum(axis=0) # should we do this?

            alphaR, alphaL, HGAMMA, HPHI, HF0, betaR, betaL, HM, WM, recoError3 = SIMM.Stereo_SIMM(
                # the data to be fitted to:
                SXR,
                SXL,
                # the basis matrices for the spectral combs
                WUF0,
                # and for the elementary filters:
                WGAMMA,
                # number of desired filters, accompaniment spectra:
                numberOfFilters=K,
                numberOfAccompanimentSpectralShapes=R,
                # if any, initial amplitude matrices for
                HGAMMA0=HGAMMA,
                HPHI0=HPHI,
                HF00=HUF0,
                WM0=None,  # WM,
                HM0=None,  # HM,
                # Some more optional arguments, to control the "convergence"
                # of the algo
                numberOfIterations=niter,
                updateRulePower=1.0,
                stepNotes=stepNotes,
                lambdaHF0=0.0 / (1.0 * SXR.max()),
                alphaHF0=0.9,
                verbose=options.verbose,
                displayEvolution=displayEvolution,
                updateHGAMMA=False,
            )

            WPHI = np.dot(WGAMMA, HGAMMA)
            SPHI = np.dot(WPHI, HPHI)
            SF0 = np.dot(WUF0, HF0)

            hatSXR = (alphaR ** 2) * SF0 * SPHI + np.dot(np.dot(WM, betaR ** 2), HM)
            hatSXL = (alphaL ** 2) * SF0 * SPHI + np.dot(np.dot(WM, betaL ** 2), HM)

            hatVR = (alphaR ** 2) * SPHI * SF0 / hatSXR * XR

            vestR = istft(hatVR, hopsize=hopsize, nfft=NFT, window=sinebell(windowSizeInSamples)) / 4.0

            hatVR = (alphaL ** 2) * SPHI * SF0 / hatSXL * XL

            vestL = istft(hatVR, hopsize=hopsize, nfft=NFT, window=sinebell(windowSizeInSamples)) / 4.0

            outputFileName = options.voc_output_file[:-4] + "_VUIMM.wav"

            vestR = np.array(np.round(vestR * scaleData), dtype=dataType)
            vestL = np.array(np.round(vestL * scaleData), dtype=dataType)
            wav.write(outputFileName, Fs, np.array([vestR, vestL]).T)

            hatMR = (np.dot(np.dot(WM, betaR ** 2), HM)) / hatSXR * XR

            mestR = istft(hatMR, hopsize=hopsize, nfft=NFT, window=sinebell(windowSizeInSamples)) / 4.0

            hatMR = (np.dot(np.dot(WM, betaL ** 2), HM)) / hatSXL * XL

            mestL = istft(hatMR, hopsize=hopsize, nfft=NFT, window=sinebell(windowSizeInSamples)) / 4.0

            outputFileName = options.mus_output_file[:-4] + "_VUIMM.wav"

            mestR = np.array(np.round(mestR * scaleData), dtype=dataType)
            mestL = np.array(np.round(mestL * scaleData), dtype=dataType)
            wav.write(outputFileName, Fs, np.array([mestR, mestL]).T)
        else:
            # running on monophonic data:
            HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM(
                # the data to be fitted to:
                SX,
                # the basis matrices for the spectral combs
                WF0effective,
                # and for the elementary filters:
                WGAMMA,
                # number of desired filters, accompaniment spectra:
                numberOfFilters=K,
                numberOfAccompanimentSpectralShapes=R,
                # putting only 2 elements in accompaniment for a start...
                # if any, initial amplitude matrices for
                HGAMMA0=None,
                HPHI0=None,
                HF00=HF00effective,
                WM0=None,
                HM0=None,
                # Some more optional arguments, to control the "convergence"
                # of the algo
                numberOfIterations=niter,
                updateRulePower=1.0,
                stepNotes=stepNotes,
                lambdaHF0=0.0 / (1.0 * SX.max()),
                alphaHF0=0.9,
                verbose=options.verbose,
                displayEvolution=displayEvolution,
            )

            WPHI = np.dot(WGAMMA, HGAMMA)
            SPHI = np.dot(WPHI, HPHI)
            SF0 = np.dot(WF0effective, HF0)
            SM = np.dot(WM, HM)

            hatSX = SF0 * SPHI + SM

            hatV = SPHI * SF0 / hatSX * X

            vest = istft(hatV, hopsize=hopsize, nfft=NFT, window=sinebell(windowSizeInSamples)) / 4.0

            vest = np.array(np.round(vest * scaleData), dtype=dataType)
            wav.write(options.voc_output_file, Fs, vest)

            hatM = SM / hatSX * X

            mest = istft(hatM, hopsize=hopsize, nfft=NFT, window=sinebell(windowSizeInSamples)) / 4.0

            mest = np.array(np.round(mest * scaleData), dtype=dataType)
            wav.write(options.mus_output_file, Fs, mest)

            del hatM, vest, mest, hatV, hatSX, SPHI, SF0

            # adding the unvoiced part in the source basis:
            WUF0 = np.hstack([WF0, np.ones([WF0.shape[0], 1])])
            HUF0 = np.vstack([HF0, np.ones([1, HF0.shape[1]])])
            ## HUF0[-1,:] = HF0.sum(axis=0) # should we do this?

            HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM(
                # the data to be fitted to:
                SX,
                # the basis matrices for the spectral combs
                WUF0,
                # and for the elementary filters:
                WGAMMA,
                # number of desired filters, accompaniment spectra:
                numberOfFilters=K,
                numberOfAccompanimentSpectralShapes=R,
                # putting only 2 elements in accompaniment for a start...
                # if any, initial amplitude matrices for
                HGAMMA0=HGAMMA,
                HPHI0=HPHI,
                HF00=HUF0,
                WM0=None,
                HM0=None,
                # Some more optional arguments, to control the "convergence"
                # of the algo
                numberOfIterations=niter,
                updateRulePower=1.0,
                stepNotes=stepNotes,
                lambdaHF0=0.0 / (1.0 * SX.max()),
                alphaHF0=0.9,
                verbose=options.verbose,
                displayEvolution=displayEvolution,
                updateHGAMMA=False,
            )

            WPHI = np.dot(WGAMMA, HGAMMA)
            SPHI = np.dot(WPHI, HPHI)
            SF0 = np.dot(WUF0, HF0)
            SM = np.dot(WM, HM)

            hatSX = SF0 * SPHI + SM

            hatV = SPHI * SF0 / hatSX * X

            vest = istft(hatV, hopsize=hopsize, nfft=NFT, window=sinebell(windowSizeInSamples)) / 4.0

            vest = np.array(np.round(vest * scaleData), dtype=dataType)
            outputFileName = options.voc_output_file[:-4] + "_VUIMM.wav"
            wav.write(outputFileName, Fs, vest)

            hatM = SM / hatSX * X

            mest = istft(hatM, hopsize=hopsize, nfft=NFT, window=sinebell(windowSizeInSamples)) / 4.0

            mest = np.array(np.round(mest * scaleData), dtype=dataType)

            outputFileName = options.mus_output_file[:-4] + "_VUIMM.wav"
            wav.write(outputFileName, Fs, mest)

        if displayEvolution:
            plt.close("all")

    print "Done!"
Ejemplo n.º 10
0
def main():
    import optparse
    
    usage = "usage: %prog [options] inputAudioFile"
    parser = optparse.OptionParser(usage)
    # Name of the output files:
    parser.add_option("-v", "--vocal-output-file",
                      dest="voc_output_file", type="string",
                      help="name of the audio output file for the estimated\nsolo (vocal) part",
                      default="estimated_solo.wav")
    parser.add_option("-m", "--music-output-file",
                      dest="mus_output_file", type="string",
                      help="name of the audio output file for the estimated\nmusic part",
                      default="estimated_music.wav")
    parser.add_option("-p", "--pitch-output-file",
                      dest="pitch_output_file", type="string",
                      help="name of the output file for the estimated pitches",
                      default="pitches.txt")

    # Some more optional options:
    parser.add_option("-d", "--with-display", dest="displayEvolution",
                      action="store_true",help="display the figures",
                      default=False)
    parser.add_option("-q", "--quiet", dest="verbose",
                      action="store_false",
                      help="use to quiet all output verbose",
                      default=True)
    parser.add_option("--nb-iterations", dest="nbiter",
                      help="number of iterations", type="int",
                      default=50)
    parser.add_option("--window-size", dest="windowSize", type="float",
                      default=0.04644,help="size of analysis windows, in s.")
    parser.add_option("--Fourier-size", dest="fourierSize", type="int",
                      default=2048, help="size of Fourier transforms, in samples.")
    parser.add_option("--hopsize", dest="hopsize", type="float",
                      default=0.0058,
                      help="size of the hop between analysis windows, in s.")

    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("incorrect number of arguments, use option -h for help.")

    displayEvolution = options.displayEvolution
    if displayEvolution:
        import matplotlib.pyplot as plt
        import imageMatlab

        plt.rc('text', usetex=True)
        plt.rc('image',cmap='gray_r')
        plt.ion()

    # Compulsory option: name of the input file:
    inputAudioFile = args[0]
    fs, data = wav.read(inputAudioFile)
    #data, fs, enc = scikits.audiolab.wavread(inputAudioFile)
    if data.shape[0] != data.size: # data is multi-channel
        data = np.mean(data,axis=1)

    # Processing the options:
    windowSizeInSamples = np.round(options.windowSize * fs)
    hopsize = np.round(options.hopsize * fs)
    NFT = options.fourierSize
    niter = options.nbiter

    if options.verbose:
        print "Size of analysis windows: ", windowSizeInSamples, "\n"
        print "Hopsize: ", hopsize, "\n"
        print "Size of Fourier transforms: ", NFT, "\n"
        print "Number of iterations to be done: ", niter, "\n"
    
    X, F, N = stft(data, fs=fs, hopsize=hopsize,
                   window=sinebell(windowSizeInSamples), nfft=NFT)
    # SX is the power spectrogram:
    SX = np.maximum(np.abs(X) ** 2, 10 ** -8)

    del data, F, N

    # TODO: also process these as options:
    minF0 = 100
    maxF0 = 800
    Fs = fs
    F, N = SX.shape
    stepNotes = 20 # this is the number of F0s within one semitone
    K = 50 # number of spectral shapes for the filter part
    R = 40 # number of spectral shapes for the accompaniment
    P = 30 # number of elements in dictionary of smooth filters
    chirpPerF0 = 1 # number of chirped spectral shapes between each F0
                   # this feature should be further studied before
                   # we find a good way of doing that.

    # Create the harmonic combs, for each F0 between minF0 and maxF0: 
    F0Table, WF0 = \
             generate_WF0_chirped(minF0, maxF0, Fs, Nfft=2048, \
                                  stepNotes=stepNotes, \
                                  lengthWindow=2048, Ot=0.25, \
                                  perF0=chirpPerF0, \
                                  depthChirpInSemiTone=.15)
    WF0 = WF0[0:F, :] # ensure same size as SX 
    NF0 = F0Table.size # number of harmonic combs
    # Normalization: 
    WF0 = WF0 / np.outer(np.ones(F), np.amax(WF0, axis=0))

    # Create the dictionary of smooth filters, for the filter part of
    # the lead isntrument:
    WGAMMA = generateHannBasis(F, 2048, Fs=fs, frequencyScale='linear', \
                               numberOfBasis=P, overlap=.75)

    if displayEvolution:
        plt.figure(1);plt.clf()
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Leading source number $u$', fontsize=16)
        plt.ion()
        # plt.show()
        raw_input("Press Return to resume the program. \nBe sure that the figure has been already displayed, so that the evolution of HF0 will be visible. ")

    # First round of parameter estimation:
    HGAMMA, HPHI, HF0, HM, WM, recoError1 = SIMM.SIMM(
        # the data to be fitted to:
        SX,
        # the basis matrices for the spectral combs
        WF0,
        # and for the elementary filters:
        WGAMMA,
        # number of desired filters, accompaniment spectra:
        numberOfFilters=K, numberOfAccompanimentSpectralShapes=R,
        # putting only 2 elements in accompaniment for a start...
        # if any, initial amplitude matrices for 
        HGAMMA0=None, HPHI0=None,
        HF00=None,
        WM0=None, HM0=None,
        # Some more optional arguments, to control the "convergence"
        # of the algo
        numberOfIterations=niter, updateRulePower=1.,
        stepNotes=stepNotes, 
        lambdaHF0 = 0.0 / (1.0 * SX.max()), alphaHF0=0.9,
        verbose=options.verbose, displayEvolution=displayEvolution)

    if displayEvolution:
        plt.figure(3);plt.clf()
        plt.subplot(221)
        plt.plot(db(np.dot(WGAMMA, HGAMMA[:, 0])))
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.ylim([-30, 0])
        plt.axis("tight")
        plt.subplot(222)
        plt.plot(db(np.dot(WGAMMA, HGAMMA[:,1])))
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.ylim([-30, 0])
        plt.axis("tight")
        plt.subplot(223)
        plt.plot(db(np.dot(WGAMMA, HGAMMA[:, 2])))
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.ylim([-30, 0])
        plt.axis("tight")
        plt.subplot(224)
        plt.plot(db(np.dot(WGAMMA, HGAMMA[:, 3])))
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.ylim([-30, 0])
        plt.axis("tight")

        plt.figure(4);plt.clf()
        imageMatlab.imageM(db(np.dot(np.dot(WGAMMA, HGAMMA), HPHI)))
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Frequency bin number $f$', fontsize=16)
        cb = plt.colorbar(fraction=0.04)
        plt.axes(cb.ax)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)

        plt.figure(5);plt.clf()
        imageMatlab.imageM(db(HF0), vmin=-100, cmap=plt.cm.gray_r)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Leading source number $u$', fontsize=16)
        # plt.xlim([3199.5, 3500.5]) # For detailed picture of HF0
        cb = plt.colorbar(fraction=0.04)
        plt.axes(cb.ax)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)

        plt.figure(6);plt.clf()
        imageMatlab.imageM(db(np.dot(WM, HM)), vmin=-50)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Frequency bin number $f$', fontsize=16)
        cb = plt.colorbar(fraction=0.04)
        plt.axes(cb.ax)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)

        plt.figure(7);plt.clf()
        imageMatlab.imageM(db(WM), vmin=-50)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Element number $r$', fontsize=16)
        plt.ylabel(r'Frequency bin number $f$', fontsize=16)
        cb = plt.colorbar(fraction=0.04)
        plt.axes(cb.ax)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        
    if displayEvolution:
        h2 = plt.figure(2);plt.clf();
        imageMatlab.imageM(20 * np.log10(HF0))
        matMax = (20 * np.log10(HF0)).max()
        matMed = np.median(20 * np.log10(HF0))
        plt.clim([matMed - 100, matMax])

    # Viterbi decoding to estimate the predominant fundamental
    # frequency line
    scale = 1.0
    transitions = np.exp(-np.floor(np.arange(0,NF0) / stepNotes) * scale)
    cutoffnote = 2 * 5 * stepNotes
    transitions[cutoffnote:] = transitions[cutoffnote - 1]

    transitionMatrixF0 = np.zeros([NF0 + 1, NF0 + 1]) # toeplitz matrix
    b = np.arange(NF0)
    transitionMatrixF0[0:NF0, 0:NF0] = \
                              transitions[\
        np.array(np.abs(np.outer(np.ones(NF0), b) \
                        - np.outer(b, np.ones(NF0))), dtype=int)]
    pf_0 = transitions[cutoffnote - 1] * 10 ** (-90)
    p0_0 = transitions[cutoffnote - 1] * 10 ** (-100)
    p0_f = transitions[cutoffnote - 1] * 10 ** (-80)
    transitionMatrixF0[0:NF0, NF0] = pf_0
    transitionMatrixF0[NF0, 0:NF0] = p0_f
    transitionMatrixF0[NF0, NF0] = p0_0

    sumTransitionMatrixF0 = np.sum(transitionMatrixF0, axis=1)
    transitionMatrixF0 = transitionMatrixF0 \
                         / np.outer(sumTransitionMatrixF0, \
                                    np.ones(NF0 + 1))

    priorProbabilities = 1 / (NF0 + 1.0) * np.ones([NF0 + 1])
    logHF0 = np.zeros([NF0 + 1, N])
    normHF0 = np.amax(HF0, axis=0)
    barHF0 = np.array(HF0)

    logHF0[0:NF0, :] = np.log(barHF0)
    logHF0[0:NF0, normHF0==0] = np.amin(logHF0[logHF0>-np.Inf])
    logHF0[NF0, :] = np.maximum(np.amin(logHF0[logHF0>-np.Inf]),-100)

    indexBestPath = viterbiTrackingArray(\
        logHF0, np.log(priorProbabilities), np.log(transitionMatrixF0))

    np.savetxt(options.pitch_output_file,
               np.array([np.arange(N)*options.hopsize,
                         F0Table[np.array(indexBestPath,dtype=int)]]).T)

    if displayEvolution:
        h2.hold(True)
        plt.plot(indexBestPath, '-b')
        h2.hold(False)
        plt.axis('tight')
        raw_input("Press Return to resume the program...")

    del logHF0

    # Second round of parameter estimation, with specific
    # initial HF00:
    HF00 = np.zeros([NF0 * chirpPerF0, N])

    scopeAllowedHF0 = 1.0 / 1.0

    # indexes for HF00:
    # TODO: reprogram this with a 'where'?...
    dim1index = np.array(\
        np.maximum(\
        np.minimum(\
        np.outer(chirpPerF0 * indexBestPath,
                 np.ones(chirpPerF0 \
                         * (2 \
                            * np.floor(stepNotes / scopeAllowedHF0) \
                            + 1))) \
        + np.outer(np.ones(N),
                   np.arange(-chirpPerF0 \
                             * np.floor(stepNotes / scopeAllowedHF0),
                             chirpPerF0 \
                             * (np.floor(stepNotes / scopeAllowedHF0) \
                                + 1))),
        chirpPerF0 * NF0 - 1),
        0),
        dtype=int).reshape(1, N * chirpPerF0 \
                           * (2 * np.floor(stepNotes / scopeAllowedHF0) \
                              + 1))
    dim2index = np.outer(np.arange(N),
                         np.ones(chirpPerF0 \
                                 * (2 * np.floor(stepNotes \
                                                 / scopeAllowedHF0) + 1), \
                                 dtype=int)\
                         ).reshape(1, N * chirpPerF0 \
                                   * (2 * np.floor(stepNotes \
                                                   / scopeAllowedHF0) \
                                      + 1))
    HF00[dim1index, dim2index] = 1 # HF0.max()

    HF00[:, indexBestPath == (NF0 - 1)] = 0.0

    WF0effective = WF0
    HF00effective = HF00

    del HF0, HGAMMA, HPHI, HM, WM, HF00

    HGAMMA, HPHI, HF0, HM, WM, recoError2 = SIMM.SIMM(
        # the data to be fitted to:
        SX,
        # the basis matrices for the spectral combs
        WF0effective,
        # and for the elementary filters:
        WGAMMA,
        # number of desired filters, accompaniment spectra:
        numberOfFilters=K, numberOfAccompanimentSpectralShapes=R,
        # if any, initial amplitude matrices for
        HGAMMA0=None, HPHI0=None,
        HF00=HF00effective,
        WM0=None, HM0=None,
        # Some more optional arguments, to control the "convergence"
        # of the algo
        numberOfIterations=niter, updateRulePower=1.0,
        stepNotes=stepNotes, 
        lambdaHF0 = 0.0 / (1.0 * SX.max()), alphaHF0=0.9,
        verbose=options.verbose, displayEvolution=displayEvolution)

    WPHI = np.dot(WGAMMA, HGAMMA)
    SPHI = np.dot(WPHI, HPHI)
    SF0 = np.dot(WF0effective, HF0)
    SM = np.dot(WM, HM)

    hatSX = SPHI * SF0 + SM

    hatV = SPHI * SF0 / hatSX * X

    vest = istft(hatV, hopsize=hopsize, nfft=NFT,
                 window=sinebell(windowSizeInSamples)) / 4.0

   # scikits.audiolab.wavwrite(vest, options.voc_output_file, fs)
    wav.write(options.voc_output_file, fs, \
              vest)
    hatM = SM / hatSX * X

    mest = istft(hatM, hopsize=hopsize, nfft=NFT,
                 window=sinebell(windowSizeInSamples)) / 4.0

    #scikits.audiolab.wavwrite(mest, options.mus_output_file, fs)
    wav.write(options.mus_output_file, fs, \
              mest)
    if displayEvolution:
        plt.figure(13);plt.clf()
        plt.subplot(221)
        plt.plot(db(np.dot(WGAMMA, HGAMMA[:, 0])))
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.ylim([-30, 0])
        plt.axis("tight")
        plt.subplot(222)
        plt.plot(db(np.dot(WGAMMA, HGAMMA[:, 1])))
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.ylim([-30, 0])
        plt.axis("tight")
        plt.subplot(223)
        plt.plot(db(np.dot(WGAMMA, HGAMMA[:, 2])))
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.ylim([-30, 0])
        plt.axis("tight")
        plt.subplot(224)
        plt.plot(db(np.dot(WGAMMA, HGAMMA[:, 3])))
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.ylim([-30, 0])
        plt.axis("tight")

        plt.figure(14);plt.clf()
        imageMatlab.imageM(db(np.dot(np.dot(WGAMMA, HGAMMA), HPHI)))
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Frequency bin number $f$', fontsize=16)
        cb = plt.colorbar(fraction=0.04)
        plt.axes(cb.ax)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)

        plt.figure(141);plt.clf()
        SVhat = db(np.dot(np.dot(WGAMMA, HGAMMA), HPHI)) \
                + db(np.dot(WF0, HF0))
        imageMatlab.imageM(SVhat, vmax=SVhat.max(),
                           vmin=SVhat.max() - 50)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Frequency bin number $f$', fontsize=16)
        cb = plt.colorbar(fraction=0.04)
        plt.axes(cb.ax)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)

        plt.figure(15);plt.clf()
        imageMatlab.imageM(db(HF0), vmin=-100, cmap=plt.cm.gray_r)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Leading source number $u$', fontsize=16)
        cb = plt.colorbar(fraction=0.04)
        plt.axes(cb.ax)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        # plt.xlim([3199.5, 3500.5]) # For detailed picture of HF0

        plt.figure(16)
        plt.clf()
        imageMatlab.imageM(db(np.dot(WM, HM)),
                           vmin=np.maximum(-50, db(SM.min())))
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Frame number $n$', fontsize=16)
        plt.ylabel(r'Frequency bin number $f$', fontsize=16)
        cb = plt.colorbar(fraction=0.04)
        plt.axes(cb.ax)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)

        plt.figure(17)
        plt.clf()
        imageMatlab.imageM(db(WM), vmin=-50)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xlabel(r'Element number $r$', fontsize=16)
        plt.ylabel(r'Frequency bin number $f$', fontsize=16)
        cb = plt.colorbar(fraction=0.04)
        plt.axes(cb.ax)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        raw_input("Press Return to end the program...")
        print "Done!"