Example #1
def lnp_Xw(X_w, x=None, method='gmm', n_comp_max=10, info_crit='bic', njobs=1):
    ''' Estimate the multi-dimensional log pdf at x for a given X_w using a
    nonparametric density estimate (Scott's rule KDE, cross-validated KDE, or GMM).
    '''
    if x is None: raise ValueError("x must be specified")
    if method not in ['kde', 'gkde', 'gmm']: raise ValueError("method must be 'kde', 'gkde', or 'gmm'")

    if method == 'gmm':
        # find best fit component using information criteria (BIC/AIC)
        gmms, ics = [], []
        for i_comp in range(1, n_comp_max + 1):
            gmm = GMix(n_components=i_comp)
            gmm.fit(X_w)
            gmms.append(gmm)
            if info_crit == 'bic':  # Bayesian Information Criterion
                ics.append(gmm.bic(X_w))
            elif info_crit == 'aic':  # Akaike information criterion
                ics.append(gmm.aic(X_w))
        ibest = np.array(ics).argmin()  # lower the better!
        kern = gmms[ibest]
    elif method == 'kde':
        kern = UT.KayDE(X_w)
    elif method == 'gkde':
        # find the best fit bandwidth using cross-validation grid search
        grid = GridSearchCV(skKDE(), {'bandwidth': np.linspace(0.1, 1.0, 30)},
                            cv=10,
                            n_jobs=njobs)  # 10-fold cross-validation
        grid.fit(X_w)
        kern = grid.best_estimator_

    if len(x.shape) == 1:
        return kern.score_samples(x[:, None])
    else:
        return kern.score_samples(x)
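The GMM branch above picks the number of components by minimizing an information criterion. Below is a minimal, self-contained sketch of the same BIC-based selection written directly against scikit-learn's GaussianMixture; the toy data and variable names are only for illustration.

import numpy as np
from sklearn.mixture import GaussianMixture

# toy 2D sample drawn from two Gaussian blobs (illustration only)
rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0., 1., size=(500, 2)),
               rng.normal(4., 0.5, size=(500, 2))])

# fit GMMs with 1..n_comp_max components and keep the lowest-BIC fit
n_comp_max = 10
gmms = [GaussianMixture(n_components=k).fit(X) for k in range(1, n_comp_max + 1)]
bics = [gmm.bic(X) for gmm in gmms]
best = gmms[int(np.argmin(bics))]

# score_samples returns the log pdf evaluated at the given points
x = np.array([[0., 0.], [4., 4.]])
print(best.score_samples(x))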
Example #2
def sfr_mstar_gmm(logmstar, logsfr, n_comp_max=30, silent=False): 
    ''' Fit a 2D Gaussian mixture model to the
    log(M*) and log(SFR) sample of galaxies.
    '''
    # only keep sensible logmstar and log sfr
    sense = (logmstar > 0.) & (logmstar < 13.) & (logsfr > -5.) & (logsfr < 4.) & ~np.isnan(logsfr)
    if (len(logmstar) - np.sum(sense) > 0) and not silent: 
        warnings.warn(str(len(logmstar) - np.sum(sense))+' galaxies have nonsensical logM* or logSFR values')  
    logmstar = logmstar[np.where(sense)]
    logsfr = logsfr[np.where(sense)]

    X = np.array([logmstar, logsfr]).T # (n_sample, n_features) 

    gmms, bics = [], []  
    for n in range(1, n_comp_max + 1): 
        gmm = GMix(n_components=n)
        gmm.fit(X)
        gmms.append(gmm)
        bics.append(gmm.bic(X))  # Bayesian information criterion
    ibest = np.array(bics).argmin() # lower the better!
    gbest = gmms[ibest]

    if not silent: 
        print(str(len(gbest.means_))+' components') 
    return gbest 
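A hypothetical call to the function above, using synthetic logM*/logSFR arrays in place of a real catalog; the imports mirror what the surrounding module is assumed to provide (numpy, warnings, and GMix as an alias for sklearn's GaussianMixture).

import warnings
import numpy as np
from sklearn.mixture import GaussianMixture as GMix  # assumed alias used by sfr_mstar_gmm

# synthetic sample: logM* between 9 and 11, logSFR loosely correlated with logM*
rng = np.random.default_rng(1)
logmstar = rng.uniform(9., 11., size=2000)
logsfr = (logmstar - 10.) + rng.normal(0., 0.3, size=2000)

gbest = sfr_mstar_gmm(logmstar, logsfr, n_comp_max=10)
print(gbest.means_)    # component means in (logM*, logSFR) space
print(gbest.weights_)  # mixing weights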
Example #3
    def _GMMfit_bins(self, logmstar, logsfr, max_comp=3): 
        ''' Fit GMM components to P(SSFR) of given data and return best-fit
        '''
        n_bin = self._mbins.shape[0] # number of stellar mass bins.
        assert n_bin > 0, 'no mass bins'
    
        # sort logM* into M* bins
        i_bins = np.digitize(logmstar, np.append(self._mbins[:,0], self._mbins[-1,1]))
        i_bins -= 1
    
        bin_mid, gbests, nbests, _gmms, _bics = [], [], [], [], [] 

        # fit GMM to p(SSFR) in each log M* bins  
        for i in range(n_bin): 
            # if there are not enough galaxies 
            if not self._has_nbinthresh[i]: continue 
            in_bin = (i_bins == i)  

            x = logsfr[in_bin] - logmstar[in_bin] # logSSFRs
            x = np.reshape(x, (-1,1))

            bin_mid.append(np.median(logmstar[in_bin])) 
            
            # fit GMMs with a range of components 
            ncomps = range(1, max_comp+1)
            gmms, bics = [], []  
            for n in ncomps: 
                gmm = GMix(n_components=n)
                gmm.fit(x)
                bics.append(gmm.bic(x))  # Bayesian information criterion
                gmms.append(gmm)

            # components with the lowest BIC (preferred)
            i_best = np.array(bics).argmin()
            n_best = ncomps[i_best] # number of components of the best-fit 
            gbest = gmms[i_best] # best fit GMM 
            
            # save the best gmm, all the gmms, and bics 
            nbests.append(n_best) 
            gbests.append(gbest)
            _gmms.append(gmms) 
            _bics.append(bics)
        
        assert len(bin_mid) > 0, 'no mass bin has enough galaxies '
        if bin_mid[0] > 10.: 
            warnings.warn("The lowest M* bin is greater than 10^10, this may compromise the SFS identification scheme") 
        return bin_mid, gbests, nbests, _gmms, _bics
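The mass binning above relies on np.digitize with edges built from the (lower, upper) rows of self._mbins. A small standalone sketch of that pattern, with invented bins and data:

import numpy as np

# contiguous stellar-mass bins, as (lower, upper) edges
mbins = np.array([[9.0, 9.5], [9.5, 10.0], [10.0, 10.5], [10.5, 11.0]])
edges = np.append(mbins[:, 0], mbins[-1, 1])   # [9.0, 9.5, 10.0, 10.5, 11.0]

rng = np.random.default_rng(2)
logmstar = rng.uniform(9.0, 11.0, size=1000)

# np.digitize returns 1-based bin indices for values inside the edges,
# so subtracting 1 maps them onto the rows of mbins
i_bins = np.digitize(logmstar, edges) - 1
for i in range(mbins.shape[0]):
    print('bin %i: %i galaxies' % (i, np.sum(i_bins == i)))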
Example #4
def _fit_pdf(samples, method='kde', range=None, debug=False, **method_kwargs):
    ''' Fit the probability distribution underlying the given samples using the
    method specified by `method`. This function is designed to fit p(F(theta)),
    the probability distribution of *derived* properties, but it works for any
    PDF.

    Parameters
    ----------
    samples : 2d array
        Nsample x Ndim array of samples from the PDF 

    method : string
        which method to use for estimating the PDF. Currently supports 'kde' and
        'gmm' (default: 'kde') 
    '''
    if debug: print('... fitting pdf using %s' % method)

    # whiten samples
    avg_samples = np.mean(samples, axis=0)
    std_samples = np.std(samples, axis=0)
    samples_w = (samples - avg_samples) / std_samples

    if method == 'kde':
        # fit PDF using Kernel Density Estimation
        #from scipy.stats import gaussian_kde as gkde
        from sklearn.neighbors import KernelDensity

        pdf_fit = KernelDensity(kernel='gaussian',
                                **method_kwargs).fit(samples_w)
    else:
        from sklearn.mixture import GaussianMixture as GMix

        if 'n_comp' not in method_kwargs.keys():
            raise ValueError("specify number of Gaussians `n_comp` in kwargs")

        gmm = GMix(n_components=method_kwargs['n_comp'])
        gmm.fit(samples_w)
        pdf_fit = gmm

    return _PDF(pdf_fit,
                method=method,
                range=range,
                avg=avg_samples,
                std=std_samples)
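Because the samples are whitened before the fit, the returned estimator scores points in whitened coordinates. A minimal sketch of that round trip with a fixed-bandwidth KernelDensity (the bandwidth and toy data are arbitrary); recovering the log density in the original coordinates requires subtracting sum(log std), the Jacobian of the whitening transform.

import numpy as np
from sklearn.neighbors import KernelDensity

rng = np.random.default_rng(3)
samples = rng.normal([1., 5.], [0.5, 2.0], size=(2000, 2))

# whiten: zero mean, unit variance per dimension
avg, std = samples.mean(axis=0), samples.std(axis=0)
samples_w = (samples - avg) / std

kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(samples_w)

# evaluate log p(x) at new points: whiten them the same way, then apply the
# Jacobian of the whitening transform, -sum(log std)
x = np.array([[1., 5.], [2., 7.]])
x_w = (x - avg) / std
lnp = kde.score_samples(x_w) - np.sum(np.log(std))
print(lnp)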
Example #5
    def _GMMfit_bins_nbest(self, logmstar, logsfr, nbests): 
        ''' Fit GMM components to P(SSFR) of given data and return best-fit
        '''
        n_bin = self._mbins.shape[0]
        i_bins = np.digitize(logmstar, np.append(self._mbins[:,0], self._mbins[-1,1]))
        i_bins -= 1 

        gmms = [] 
        ii = 0 
        for i in range(n_bin): 
            # if there are not enough galaxies 
            if not self._has_nbinthresh[i]: continue 
            in_bin = (i_bins == i)  

            x = logsfr[in_bin] - logmstar[in_bin] # logSSFRs
            x = np.reshape(x, (-1,1))
    
            gmm = GMix(n_components=nbests[ii])
            gmm.fit(x)
            # save the GMM refit with the pre-determined number of components
            gmms.append(gmm) 
            ii += 1
        return gmms
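Once the number of components per bin is fixed, each re-fit mixture can be inspected or sampled directly. A standalone sketch of what such a per-bin fit exposes, on a toy bimodal log SSFR distribution:

import numpy as np
from sklearn.mixture import GaussianMixture

# toy bimodal log SSFR distribution: a "star-forming" and a "quiescent" peak
rng = np.random.default_rng(4)
logssfr = np.concatenate([rng.normal(-10.0, 0.3, 700),
                          rng.normal(-11.8, 0.4, 300)]).reshape(-1, 1)

gmm = GaussianMixture(n_components=2).fit(logssfr)
print(gmm.weights_)            # mixing fractions of the two modes
print(gmm.means_.flatten())    # mode locations in log SSFR
draws, labels = gmm.sample(5)  # .sample returns (samples, component labels)
print(draws.flatten())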
Example #6
def divGMF(div_func='kl', Nref=1000, K=5, n_mc=10, n_comp_max=10, n_mocks=2000):
    ''' compare the divergence estimates between 
    D( gauss(C_gmf) || gauss(C_gmf) ),  D( gmfs || gauss(C_gmf) ), 
    D( gmfs || p(gmfs) KDE), D( gmfs || p(gmfs) GMM), 
    D( gmfs || PI p(gmfs^i_ICA) KDE), and D( gmfs || PI p(gmfs^i_ICA) GMM)
    '''
    if isinstance(Nref, float): 
        Nref = int(Nref)
    # read in mock GMFs from all HOD realizations (20,000 mocks)
    gmfs_mock = NG.X_gmf_all()[:n_mocks]
    n_mock = gmfs_mock.shape[0] # number of mocks 
    print("%i mocks" % n_mock) 

    gmfs_mock_meansub, _ = NG.meansub(gmfs_mock) # mean subtract
    X_w, W = NG.whiten(gmfs_mock_meansub)
    X_ica, _ = NG.Ica(X_w)  # ICA transformation 

    C_gmf = np.cov(X_w.T) # covariance matrix

    # p(gmfs) GMM
    gmms, bics = [], [] 
    for i_comp in range(1,n_comp_max+1):
        gmm = GMix(n_components=i_comp)
        gmm.fit(X_w) 
        gmms.append(gmm)
        bics.append(gmm.bic(X_w))
    ibest = np.array(bics).argmin() 
    kern_gmm = gmms[ibest]

    # p(gmfs) KDE 
    t0 = time.time() 
    grid = GridSearchCV(skKDE(),
            {'bandwidth': np.linspace(0.1, 1.0, 30)},
            cv=10) # 10-fold cross-validation
    grid.fit(X_w)
    kern_kde = grid.best_estimator_
    dt = time.time() - t0 
    print('%f sec' % dt) 
    
    # PI p(gmfs^i_ICA) GMM
    kern_gmm_ica = [] 
    for ibin in range(X_ica.shape[1]): 
        gmms, bics = [], [] 
        for i_comp in range(1,n_comp_max+1):
            gmm = GMix(n_components=i_comp)
            gmm.fit(X_ica[:,ibin][:,None]) 
            gmms.append(gmm)
            bics.append(gmm.bic(X_ica[:,ibin][:,None]))
        ibest = np.array(bics).argmin() 
        kern_gmm_ica.append(gmms[ibest])
    
    # PI p(gmfs^i_ICA) KDE  
    kern_kde_ica = [] 
    for ibin in range(X_ica.shape[1]): 
        t0 = time.time() 
        grid = GridSearchCV(skKDE(),
                {'bandwidth': np.linspace(0.1, 1.0, 30)},
                cv=10) # 10-fold cross-validation
        grid.fit(X_ica[:,ibin][:,None]) 
        kern_kde_ica.append(grid.best_estimator_) 
        dt = time.time() - t0 
        print('%f sec' % dt) 

    # calculate the divergences now 
    div_gauss_ref, div_gauss = [], []
    div_gmm, div_gmm_ica = [], [] 
    div_kde, div_kde_ica = [], [] 
    for i in range(n_mc): 
        print('%i montecarlo' % i)
        t_start = time.time() 
        # reference divergence in order to showcase the estimator's scatter
        # Gaussian distribution described by C_gmf with same n_mock mocks 
        gauss = mvn(np.zeros(gmfs_mock.shape[1]), C_gmf, size=n_mock)
        div_gauss_ref_i = NG.kNNdiv_gauss(gauss, C_gmf, Knn=K, div_func=div_func, Nref=Nref)
        div_gauss_ref.append(div_gauss_ref_i)
        # estimate divergence between gmfs_white and a 
        # Gaussian distribution described by C_gmf
        div_gauss_i = NG.kNNdiv_gauss(X_w, C_gmf, Knn=K, div_func=div_func, Nref=Nref)
        div_gauss.append(div_gauss_i)
        # D( gmfs || p(gmfs) GMM)
        div_gmm_i = NG.kNNdiv_Kernel(X_w, kern_gmm, Knn=K, div_func=div_func, 
                Nref=Nref, compwise=False) 
        div_gmm.append(div_gmm_i)
        # D( gmfs || p(gmfs) KDE)
        div_kde_i = NG.kNNdiv_Kernel(X_w, kern_kde, Knn=K, div_func=div_func, 
                Nref=Nref, compwise=False) 
        div_kde.append(div_kde_i)
        # D( gmfs || PI p(gmfs^i_ICA) GMM), 
        div_gmm_ica_i = NG.kNNdiv_Kernel(X_ica, kern_gmm_ica, Knn=K, div_func=div_func, 
                Nref=Nref, compwise=True)
        div_gmm_ica.append(div_gmm_ica_i)
        # D( gmfs || PI p(gmfs^i_ICA) KDE), 
        div_kde_ica_i = NG.kNNdiv_Kernel(X_ica, kern_kde_ica, Knn=K, div_func=div_func, 
                Nref=Nref, compwise=True)
        div_kde_ica.append(div_kde_ica_i)
        print('t = %.2f sec' % (time.time()-t_start))

    fig = plt.figure(figsize=(10,5))
    sub = fig.add_subplot(111)
    hrange = [-0.15, 0.6]
    nbins = 50
    
    divs = [div_gauss_ref, div_gauss, div_gmm, div_kde, div_gmm_ica, div_kde_ica]
    labels = ['Ref.', r'$D(\{\zeta_i^{(m)}\}\parallel \mathcal{N}({\bf C}^{(m)}))$', 
            r'$D(\{\zeta^{(m)}\}\parallel p_\mathrm{GMM}(\{\zeta^{m}\}))$',
            r'$D(\{\zeta^{(m)}\}\parallel p_\mathrm{KDE}(\{\zeta^{m}\}))$',
            r'$D(\{\zeta_\mathrm{ICA}^{(m)}\}\parallel \prod_{i} p^\mathrm{GMM}(\{\zeta_{i, \mathrm{ICA}}^{m}\}))$', 
            r'$D(\{\zeta_\mathrm{ICA}^{(m)}\}\parallel \prod_{i} p^\mathrm{KDE}(\{\zeta_{i, \mathrm{ICA}}^{m}\}))$']
    y_max = 0.
    for div, lbl in zip(divs, labels): 
        hh = np.histogram(np.array(div), density=True, range=hrange, bins=nbins)
        bp = UT.bar_plot(*hh) 
        sub.fill_between(bp[0], np.zeros(len(bp[0])), bp[1], edgecolor='none', 
                alpha=0.5, label=lbl) 
        y_max = max(y_max, bp[1].max()) 
        if (np.average(div) < hrange[0]) or (np.average(div) > hrange[1]): 
            print('divergence of %s (%f) is outside range' % (lbl, np.average(div)))
    sub.set_xlim(hrange) 
    sub.set_ylim([0., y_max*1.2]) 
    sub.legend(loc='upper left', prop={'size': 15})
    # xlabels
    if 'renyi' in div_func: 
        alpha = float(div_func.split(':')[-1])
        sub.set_xlabel(r'Renyi-$\alpha='+str(alpha)+'$ divergence', fontsize=20)
    elif 'kl' in div_func: 
        sub.set_xlabel(r'KL divergence', fontsize=20)
    if 'renyi' in div_func: str_div = 'renyi'+str(alpha) 
    elif div_func == 'kl': str_div = 'kl'
    f_fig = ''.join([UT.fig_dir(), 'tests/kNN_divergence.gmf.K', str(K), '.', str(n_mocks), 
        '.', str_div, '.png'])
    fig.savefig(f_fig, bbox_inches='tight') 
    return None
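The KDE fits above and in the other examples choose the bandwidth by 10-fold cross-validation. A self-contained sketch of that grid search (toy data and a coarser bandwidth grid to keep it fast):

import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.model_selection import GridSearchCV

rng = np.random.default_rng(5)
X = rng.normal(size=(500, 3))  # stand-in for the whitened mock GMFs

grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                    {'bandwidth': np.linspace(0.1, 1.0, 10)},
                    cv=10)  # 10-fold cross-validation on the held-out log-likelihood
grid.fit(X)
print(grid.best_params_)             # chosen bandwidth
kern_kde = grid.best_estimator_      # refit on the full sample
print(kern_kde.score_samples(X[:3]))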
Example #7
def diverge(obvs,
            diver,
            div_func='kl',
            Nref=1000,
            K=5,
            n_mc=10,
            n_comp_max=10,
            n_mocks=20000,
            pk_mock='patchy.z1',
            NorS='ngc',
            njobs=1):
    ''' calculate the divergences: 

    - D( gauss(C_X) || gauss(C_X) ) 
    - D( mock X || gauss(C_X))
    - D( mock X || p(X) KDE)
    - D( mock X || p(X) GMM) 
    - D( mock X || PI p(X^i_ICA) KDE)
    - D( mock X || PI p(X^i_ICA) GMM)
    '''
    if isinstance(Nref, float): Nref = int(Nref)
    if diver not in [
            'ref', 'pX_gauss', 'pX_gauss_hartlap', 'pX_GMM', 'pX_GMM_ref',
            'pX_KDE', 'pX_KDE_ref', 'pX_scottKDE', 'pX_scottKDE_ref',
            'pXi_ICA_GMM', 'pXi_ICA_GMM_ref', 'pXi_parICA_GMM',
            'pXi_parICA_GMM_ref', 'pXi_ICA_KDE', 'pXi_ICA_KDE_ref',
            'pXi_parICA_KDE', 'pXi_parICA_KDE_ref', 'pXi_ICA_scottKDE',
            'pXi_ICA_scottKDE_ref', 'pXi_parICA_scottKDE',
            'pXi_parICA_scottKDE_ref'
    ]:
        raise ValueError
    str_obvs = ''
    if obvs == 'pk': str_obvs = '.' + NorS
    if 'renyi' in div_func:
        alpha = float(div_func.split(':')[-1])
        str_div = 'renyi' + str(alpha)
    elif div_func == 'kl':
        str_div = 'kl'
    str_comp = ''
    if 'GMM' in diver: str_comp = '.ncomp' + str(n_comp_max)

    f_dat = ''.join([
        UT.dat_dir(), 'diverg/', 'diverg.', obvs, str_obvs, '.', diver, '.K',
        str(K), str_comp, '.Nref',
        str(Nref), '.', str_div, '.dat'
    ])
    if not os.path.isfile(f_dat):
        print('-- writing to -- \n %s' % f_dat)
        open(f_dat, 'w').close()  # create the empty output file; results are appended below
    else:
        print('-- appending to -- \n %s' % f_dat)

    # read in mock data X
    if obvs == 'pk':
        X_mock = NG.X_pk_all(pk_mock, NorS=NorS, sys='fc')
    elif obvs == 'gmf':
        if n_mocks is not None:
            X_mock = NG.X_gmf_all()[:n_mocks]
        else:
            X_mock = NG.X_gmf_all()
    else:
        raise ValueError("obvs = 'pk' or 'gmf'")
    n_mock = X_mock.shape[0]  # number of mocks
    print("%i mocks" % n_mock)

    X_mock_meansub, _ = NG.meansub(X_mock)  # mean subtract
    X_w, W = NG.whiten(X_mock_meansub)
    if '_ICA' in diver:
        X_ica, W_ica = NG.Ica(X_w)  # ICA transformation
        W_ica_inv = sp.linalg.pinv(W_ica.T)
    elif '_parICA' in diver:
        # FastICA transformation using parallel algorithm
        X_ica, W_ica = NG.Ica(X_w, algorithm='parallel')
        W_ica_inv = sp.linalg.pinv(W_ica.T)

    if diver in ['pX_gauss', 'ref']:
        C_X = np.cov(X_w.T)  # covariance matrix
    elif diver in ['pX_gauss_hartlap']:
        C_X = np.cov(X_w.T)  # covariance matrix
        f_hartlap = (n_mock - float(X_mock.shape[1]) - 2.) / (n_mock - 1.)
        print("hartlap factor = %f" % f_hartlap)
        C_X = C_X / f_hartlap  # scale covariance matrix by hartlap factor
    elif diver in ['pX_GMM', 'pX_GMM_ref']:  # p(mock X) GMM
        gmms, bics = [], []
        for i_comp in range(1, n_comp_max + 1):
            gmm = GMix(n_components=i_comp)
            gmm.fit(X_w)
            gmms.append(gmm)
            bics.append(gmm.bic(X_w))
        ibest = np.array(bics).argmin()
        kern_gmm = gmms[ibest]
    elif diver in ['pX_KDE', 'pX_KDE_ref']:  # p(mock X) KDE
        t0 = time.time()
        grid = GridSearchCV(skKDE(), {'bandwidth': np.linspace(0.1, 1.0, 30)},
                            cv=10,
                            n_jobs=njobs)  # 10-fold cross-validation
        grid.fit(X_w)
        kern_kde = grid.best_estimator_
        dt = time.time() - t0
        print('%f sec' % dt)
    elif diver in ['pX_scottKDE', 'pX_scottKDE_ref']:  # p(mock X) KDE
        # calculate Scott's Rule KDE
        t0 = time.time()
        kern_kde = UT.KayDE(X_w)
        dt = time.time() - t0
        print('%f sec' % dt)
    elif diver in [
            'pXi_ICA_GMM', 'pXi_ICA_GMM_ref', 'pXi_parICA_GMM',
            'pXi_parICA_GMM_ref'
    ]:
        # PI p(X^i_ICA) GMM
        kern_gmm_ica = []
        for ibin in range(X_ica.shape[1]):
            gmms, bics = [], []
            for i_comp in range(1, n_comp_max + 1):
                gmm = GMix(n_components=i_comp)
                gmm.fit(X_ica[:, ibin][:, None])
                gmms.append(gmm)
                bics.append(gmm.bic(X_ica[:, ibin][:, None]))
            ibest = np.array(bics).argmin()
            kern_gmm_ica.append(gmms[ibest])
    elif diver in [
            'pXi_ICA_KDE', 'pXi_ICA_KDE_ref', 'pXi_parICA_KDE',
            'pXi_parICA_KDE_ref'
    ]:
        # PI p(X^i_ICA) KDE
        kern_kde_ica = []
        for ibin in range(X_ica.shape[1]):
            t0 = time.time()
            grid = GridSearchCV(skKDE(),
                                {'bandwidth': np.linspace(0.1, 1.0, 30)},
                                cv=10,
                                n_jobs=njobs)  # 10-fold cross-validation
            grid.fit(X_ica[:, ibin][:, None])
            kern_kde_ica.append(grid.best_estimator_)
            dt = time.time() - t0
            print('%f sec' % dt)
    elif diver in [
            'pXi_ICA_scottKDE', 'pXi_ICA_scottKDE_ref', 'pXi_parICA_scottKDE',
            'pXi_parICA_scottKDE_ref'
    ]:
        # PI p(X^i_ICA) Scott's rule KDE
        kern_kde_ica = []
        for ibin in range(X_ica.shape[1]):
            kern_kde_i = UT.KayDE(X_ica[:, ibin])
            kern_kde_ica.append(kern_kde_i)

    # calculate the divergences now
    divs = []
    for i in range(n_mc):
        print('%i montecarlo' % i)
        t0 = time.time()
        if diver in ['pX_gauss', 'pX_gauss_hartlap']:
            # estimate divergence between gmfs_white and a
            # Gaussian distribution described by C_gmf
            div_i = NG.kNNdiv_gauss(X_w,
                                    C_X,
                                    Knn=K,
                                    div_func=div_func,
                                    Nref=Nref,
                                    njobs=njobs)
        elif diver == 'ref':
            # reference divergence in order to showcase the estimator's scatter
            # Gaussian distribution described by C_gmf with same n_mock mocks
            gauss = mvn(np.zeros(X_mock.shape[1]), C_X, size=n_mock)
            div_i = NG.kNNdiv_gauss(gauss,
                                    C_X,
                                    Knn=K,
                                    div_func=div_func,
                                    Nref=Nref,
                                    njobs=njobs)
        elif diver == 'pX_GMM':  # D( mock X || p(X) GMM)
            div_i = NG.kNNdiv_Kernel(X_w,
                                     kern_gmm,
                                     Knn=K,
                                     div_func=div_func,
                                     Nref=Nref,
                                     compwise=False,
                                     njobs=njobs)
        elif diver == 'pX_GMM_ref':  # D( sample from p(X) GMM || p(X) GMM)
            samp = kern_gmm.sample(n_mock)
            div_i = NG.kNNdiv_Kernel(samp[0],
                                     kern_gmm,
                                     Knn=K,
                                     div_func=div_func,
                                     Nref=Nref,
                                     compwise=False,
                                     njobs=njobs)
        elif diver in ['pX_KDE', 'pX_scottKDE']:  # D( mock X || p(X) KDE)
            div_i = NG.kNNdiv_Kernel(X_w,
                                     kern_kde,
                                     Knn=K,
                                     div_func=div_func,
                                     Nref=Nref,
                                     compwise=False,
                                     njobs=njobs)
            divs.append(div_i)
        elif diver in ['pX_KDE_ref', 'pX_scottKDE_ref'
                       ]:  # D( sample from p(X) KDE || p(X) KDE)
            samp = kern_kde.sample(n_mock)
            div_i = NG.kNNdiv_Kernel(samp,
                                     kern_kde,
                                     Knn=K,
                                     div_func=div_func,
                                     Nref=Nref,
                                     compwise=False,
                                     njobs=njobs)
            divs.append(div_i)
        elif diver in ['pXi_ICA_GMM',
                       'pXi_parICA_GMM']:  # D( mock X || PI p(X^i_ICA) GMM),
            div_i = NG.kNNdiv_Kernel(X_w,
                                     kern_gmm_ica,
                                     Knn=K,
                                     div_func=div_func,
                                     Nref=Nref,
                                     compwise=True,
                                     njobs=njobs,
                                     W_ica_inv=W_ica_inv)
        elif diver in ['pXi_ICA_GMM_ref', 'pXi_parICA_GMM_ref']:
            # D( ref. sample || PI p(X^i_ICA) GMM),
            samp = np.zeros((n_mock, X_ica.shape[1]))
            for icomp in range(X_ica.shape[1]):
                samp_i = kern_gmm_ica[icomp].sample(n_mock)
                samp[:, icomp] = samp_i[0].flatten()
            samp = np.dot(samp, W_ica_inv.T)
            div_i = NG.kNNdiv_Kernel(samp,
                                     kern_gmm_ica,
                                     Knn=K,
                                     div_func=div_func,
                                     Nref=Nref,
                                     compwise=True,
                                     njobs=njobs,
                                     W_ica_inv=W_ica_inv)
        elif diver in [
                'pXi_ICA_KDE', 'pXi_ICA_scottKDE', 'pXi_parICA_KDE',
                'pXi_parICA_scottKDE'
        ]:  # D( mock X || PI p(X^i_ICA) KDE),
            div_i = NG.kNNdiv_Kernel(X_w,
                                     kern_kde_ica,
                                     Knn=K,
                                     div_func=div_func,
                                     Nref=Nref,
                                     compwise=True,
                                     njobs=njobs,
                                     W_ica_inv=W_ica_inv)
        elif diver in [
                'pXi_ICA_KDE_ref', 'pXi_ICA_scottKDE_ref',
                'pXi_parICA_KDE_ref', 'pXi_parICA_scottKDE_ref'
        ]:
            # D( ref sample || PI p(X^i_ICA) KDE),
            samp = np.zeros((n_mock, X_ica.shape[1]))
            for icomp in range(X_ica.shape[1]):
                samp_i = kern_kde_ica[icomp].sample(n_mock)
                samp[:, icomp] = samp_i.flatten()
            samp = np.dot(samp, W_ica_inv.T)
            div_i = NG.kNNdiv_Kernel(samp,
                                     kern_kde_ica,
                                     Knn=K,
                                     div_func=div_func,
                                     Nref=Nref,
                                     compwise=True,
                                     njobs=njobs,
                                     W_ica_inv=W_ica_inv)
        print(div_i)
        f_out = open(f_dat, 'a')
        f_out.write('%f \n' % div_i)
        f_out.close()
    return None
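The 'pX_gauss_hartlap' branch rescales the sample covariance by the Hartlap factor f = (n_mock - n_dim - 2) / (n_mock - 1). A minimal sketch of that correction on stand-in data:

import numpy as np

rng = np.random.default_rng(6)
n_mock, n_dim = 2000, 30
X_w = rng.normal(size=(n_mock, n_dim))    # stand-in for whitened mocks

C_X = np.cov(X_w.T)                       # (n_dim x n_dim) sample covariance
f_hartlap = (n_mock - float(n_dim) - 2.) / (n_mock - 1.)
print('hartlap factor = %f' % f_hartlap)  # slightly below 1 for n_mock >> n_dim
C_X_corr = C_X / f_hartlap                # inflate the covariance, as in the code above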
Example #8
def lnp_Xw_i(X_w,
             i_bins,
             x=None,
             method='kde',
             n_comp_max=10,
             info_crit='bic',
             njobs=1):
    ''' Estimate the log pdf of X_w[:,i_bins] at x using a nonparametric 
    density estimation (either KDE or GMM). 
    
    parameters
    ----------
    X_w : np.ndarray 
        N_sample x N_feature matrix 

    i_bins : int or list of ints 
        specifies the feature bin(s) 

    x : np.ndarray or list of np.ndarray
        values to evaluate the pdf. Must be consistent with 
        i_bins!
    '''
    if x is None: raise ValueError("x must be specified")
    if method not in ['kde', 'gkde', 'gmm']: raise ValueError("method must be 'kde', 'gkde', or 'gmm'")
    if isinstance(i_bins, int): i_bins = [i_bins]
    if np.max(i_bins) >= X_w.shape[1] or np.min(i_bins) < 0: raise ValueError("i_bins out of range")
    if len(i_bins) > 1:  # more than one bin
        if not isinstance(x, list): raise ValueError
        else:
            if len(i_bins) != len(x): raise ValueError
    else: x = [x]

    lnpdfs = []
    for ii, i_bin in enumerate(i_bins):
        if method == 'gmm':
            # find best fit component using information criteria (BIC/AIC)
            gmms, ics = [], []
            for i_comp in range(1, n_comp_max + 1):
                gmm = GMix(n_components=i_comp)
                gmm.fit(X_w[:, i_bin][:, None])  # single feature as an (N, 1) array
                gmms.append(gmm)
                if info_crit == 'bic':  # Bayesian Information Criterion
                    ics.append(gmm.bic(X_w[:, i_bin][:, None]))
                elif info_crit == 'aic':  # Akaike information criterion
                    ics.append(gmm.aic(X_w[:, i_bin][:, None]))
            ibest = np.array(ics).argmin()  # lower the better!
            kern = gmms[ibest]
        elif method == 'kde':  # simple scott's rule KDE
            kern = UT.KayDE(X_w[:, i_bin])
        elif method == 'gkde':
            # find the best fit bandwidth using cross-validation grid search
            t0 = time.time()
            grid = GridSearchCV(skKDE(),
                                {'bandwidth': np.linspace(0.1, 1.0, 30)},
                                cv=10,
                                n_jobs=njobs)  # 10-fold cross-validation
            grid.fit(X_w[:, i_bin][:, None])
            kern = grid.best_estimator_
            dt = time.time() - t0
            print('%f sec' % dt)
        lnpdfs.append(kern.score_samples(x[ii][:, None]))
    if len(i_bins) == 1:
        return np.array(lnpdfs[0])
    else:
        return np.array(lnpdfs)
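The loop above builds one 1D density estimate per feature bin. A self-contained, component-wise sketch of the GMM branch using scikit-learn directly (toy data; UT.KayDE and the module's whitened X_w are not assumed here):

import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(7)
X_w = rng.normal(size=(1000, 4))   # toy whitened sample with 4 feature bins
i_bins = [0, 2]                    # feature bins to model
x = [np.linspace(-3, 3, 5), np.linspace(-3, 3, 5)]  # evaluation points per bin

lnpdfs = []
for ii, i_bin in enumerate(i_bins):
    col = X_w[:, i_bin][:, None]   # single feature as an (N, 1) array
    fits = [GaussianMixture(n_components=k).fit(col) for k in range(1, 4)]
    best = fits[int(np.argmin([g.bic(col) for g in fits]))]
    lnpdfs.append(best.score_samples(x[ii][:, None]))  # log pdf at the evaluation points
print(np.array(lnpdfs).shape)      # (n_bins, n_eval_points)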