Example #1
import sys

import numpy as np
from numpy import isnan, ones
from sklearn.model_selection import KFold

import mvn  # project-local module providing the KDE helpers used here


def getBinnedUnbiasedIMSE(cov, dataFrame, bin_means, bin_mean,
                          covariance_class, d, iu, selection):
    r"""    
    :math:`\mathcal{H}_1=\{h^2\mathbf{I}\}`
    
    :math:`\mathcal{H}_2=\{\mathrm{diag}(h_1^2,\dots,h_d^2)\}`
    
    :math:`\mathcal{H}_3=\{\mathbf{\Sigma}\}`
    
    Parameters
    ----------
    dataFrame:
        dataFrame, :math:`X`
    cov:
        the covariance matrix, :math:`H`
    bin_mean:
        the mean of the bin being optimised
    
    Returns
    -------
    IMSE:
        .. math:: \frac{1}{n^{2}}\sum_{i=1}^{n}\sum_{j=1}^{N}
            \bar{K}_{H_j}(X_{i},X_{j})-\frac{2}{n(n-1)}\sum_{i=1}^{n}
            \sum_{j=1,j\neq i}^{N}K_{H_j}(X_{i},X_{j})
    
    Where :math:`\bar{K}_{h}` is the multivariate convolution kernel    
    """
    if covariance_class == 'H3':
        cov = mvn.rollSigma(cov, d, iu)
    elif covariance_class == 'H2':
        cov = cov**2.
    else:  # covariance_class == 'H1'
        cov = ones(d) * cov**2.
    nj = selection.sum()  # number of samples in the bin (currently unused)
    n = selection.shape[0]
    Rf = mvn.getSamplePointDensity(dataFrame,
                                   cov,
                                   dataFrame,
                                   kernel='gaussian_convolve',
                                   maxjobs=1).mean()
    Ks = mvn.getSamplePointDensity(bin_mean, cov, dataFrame, maxjobs=1)
    f_1 = 2.0 * (Ks[selection].sum() + Ks[~selection].sum()) / (n - 1)
    return sys.float_info.max if isnan(f_1) else Rf - f_1
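

# Usage sketch (our addition, not part of the original module): the binned
# criterion returns a scalar loss, so it can be handed straight to a scipy
# optimiser. The helper name below is hypothetical; only
# getBinnedUnbiasedIMSE comes from above.
def exampleOptimiseBinBandwidth(df, bin_means, bin_mean, selection):
    from scipy.optimize import minimize

    d = df.shape[1]
    # optimise a single isotropic bandwidth (class 'H1'); `iu` is unused here
    res = minimize(getBinnedUnbiasedIMSE,
                   x0=np.array([1.0]),
                   args=(df, bin_means, bin_mean, 'H1', d, None, selection),
                   method='Nelder-Mead')
    return res.x[0]  # the optimised bandwidth h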


def KFoldMeanLikelyHood(dataFrame, H_diag, k=3):
    """Mean out-of-fold log-likelihood of a sample-point KDE (k-fold CV)."""
    kf = KFold(n_splits=k, shuffle=True)
    v = []
    for train, test in kf.split(dataFrame):
        v.append(
            np.log(
                mvn.getSamplePointDensity(dataFrame.iloc[train], H_diag[train],
                                          dataFrame.iloc[test])).mean())
    return np.mean(v)
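

# Usage sketch (hypothetical helper, not part of the original module): a
# higher out-of-fold log-likelihood indicates a better bandwidth choice, so
# the k-fold score can rank candidate per-sample bandwidth arrays H_a, H_b.
def examplePickBandwidth(df, H_a, H_b):
    score_a = KFoldMeanLikelyHood(df, H_a, k=3)
    score_b = KFoldMeanLikelyHood(df, H_b, k=3)
    return H_a if score_a > score_b else H_b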
Example #3
 def predict(self, data):
     """Evaluate the density model on the data.
     Parameters
     ----------
     X : array_like, shape (n_samples, n_features)
         An array of points to query.  Last dimension should match dimension
         of training data (n_features).
     Returns
     -------
     density : ndarray, shape (n_samples,)
         The array of density evaluations.
     """
     return mvn.getSamplePointDensity(self.dataFrame_, self.H_, pd.DataFrame(data))
Example #4
 def score(self, data):
     """Compute the mean log-likelihood under the model.
     Parameters
     ----------
     X : array_like, shape (n_samples, n_features)
         List of n_features-dimensional data points.  Each row
         corresponds to a single data point.
     Returns
     -------
     logprob : float
         mean log-likelihood of the data in X.
     """
     return np.mean( np.log( mvn.getSamplePointDensity(self.dataFrame_, self.H_, pd.DataFrame(data)) ) )
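
 # Usage sketch (our addition): predict and score follow the scikit-learn
 # estimator pattern. Assuming they belong to an estimator such as the
 # mvn.GlobalKDE used in Example #5, the intended workflow is:
 #
 #     estimator = mvn.GlobalKDE('rule-of-thumb', covariance='H3')
 #     estimator.fit(train)               # choose the bandwidth H_
 #     density = estimator.predict(test)  # per-point density estimates
 #     mean_ll = estimator.score(test)    # mean log-likelihood on test data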
Example #5
import numpy as np
from math import floor, ceil

from scipy.stats import chi2
from sklearn.neighbors import KDTree

import mvn          # project-local KDE helpers
import samplepoint  # project-local sample-point KDE
# Vk (volume of a d-ball of radius r) and mve (minimum enclosing ellipsoid)
# are assumed to be helpers defined elsewhere in this module.


def getDensity(samples,
               k,
               points,
               balloon='loftsgaarden-knn',
               tree=None,
               dist=None,
               nn=None,
               dist_loo=None,
               nn_loo=None,
               maxjobs=None,
               percentile=0.6826,
               leave_out_n=0,
               tolerance=0.001):
    # leave_out_n=0 means don't leave any sample out;
    # leave_out_n=1 gives a leave-one-out (loo) density estimate.
    if tree is None:
        kdt = KDTree(samples, leaf_size=30, metric='euclidean')
    else:
        kdt = tree
    n, d = samples.shape
    m, _ = points.shape
    M = 2
    if isinstance(k, int):
        pass
    elif k == 'sqrt':
        k = int(n**0.5)
    elif k == 'kung':
        d2 = np.max([d, 2])
        k = M * int(n**(1 / d2))
    elif k == 'hansen':  # Mack et al.; also called 'fukunaga'
        k = int(n**(4 / (4 + d)))
    elif k == 'loo_ml':
        if dist_loo is None or nn_loo is None:
            dist_loo, nn_loo = kdt.query(samples, k=n, return_distance=True)
        k = int(n**0.5) - 1
        k_old = k
        k_new = 0
        while k > 2:
            # fixed-point update for the leave-one-out ML choice of k
            k_new = n / (
                (dist_loo[:, k + 1] - dist_loo[:, k]) / dist_loo[:, k]).sum()
            print(k, k_new)
            k = int(round(k_new))
            if k_old - k_new < 1.0:  # stop once k no longer shrinks by >= 1
                break
            k_old = k_new

        print('Best k: %s' % k)

    elif k == 'loo_cv':
        if dist_loo is None or nn_loo is None:
            dist_loo, nn_loo = kdt.query(samples, k=n, return_distance=True)

        estimator = mvn.GlobalKDE('rule-of-thumb', covariance='H3')
        estimator.fit(samples)
        glo_res = estimator.predict(samples)
        if balloon != 'terrel-knn':
            k = 2
            min_res = getDensity(samples,
                                 k,
                                 samples,
                                 balloon=balloon,
                                 tree=kdt,
                                 dist=dist_loo,
                                 nn=nn_loo,
                                 maxjobs=maxjobs,
                                 percentile=percentile,
                                 leave_out_n=1)
            min_los = np.linalg.norm(glo_res - min_res)
            #            max_n = int(n**(4/(4+d)))
            max_n = int(n**0.5)
            strike = 0
            for loo_cv_k in range(3, max_n):
                res = getDensity(samples,
                                 loo_cv_k,
                                 samples,
                                 balloon=balloon,
                                 tree=kdt,
                                 dist=dist_loo,
                                 nn=nn_loo,
                                 maxjobs=maxjobs,
                                 percentile=percentile,
                                 leave_out_n=1)
                los = np.linalg.norm(glo_res - res)
                if los < min_los:
                    min_los = los
                    k = loo_cv_k
                    strike = 0
                elif strike < 3:
                    strike += 1
                else:
                    break
        else:
            min_k = 1
            max_k = int(n**0.5)
            k = floor((min_k + max_k) / 2.0)
            min_res = getDensity(samples,
                                 k,
                                 samples,
                                 balloon=balloon,
                                 tree=kdt,
                                 dist=dist_loo,
                                 nn=nn_loo,
                                 maxjobs=maxjobs,
                                 percentile=percentile,
                                 leave_out_n=1,
                                 tolerance=0.05)
            min_los = np.linalg.norm(glo_res - min_res)
            left_los = min_los
            right_los = min_los
            while True:
                #evaluate left half
                left_k = floor((min_k + k) / 2.0)
                if left_k > min_k:
                    min_res = getDensity(samples,
                                         left_k,
                                         samples,
                                         balloon=balloon,
                                         tree=kdt,
                                         dist=dist_loo,
                                         nn=nn_loo,
                                         maxjobs=maxjobs,
                                         percentile=percentile,
                                         leave_out_n=1,
                                         tolerance=0.05)
                    left_los = np.linalg.norm(glo_res - min_res)
                #evaluate right half
                right_k = ceil((k + max_k) / 2.0)
                if right_k < max_k:
                    min_res = getDensity(samples,
                                         right_k,
                                         samples,
                                         balloon=balloon,
                                         tree=kdt,
                                         dist=dist_loo,
                                         nn=nn_loo,
                                         maxjobs=maxjobs,
                                         percentile=percentile,
                                         leave_out_n=1,
                                         tolerance=0.05)
                    right_los = np.linalg.norm(glo_res - min_res)
                #debug
                print(min_k, left_k, k, right_k, max_k)
                print(left_los, min_los, right_los)
                # pick the larger descent, assuming convexity in k
                if left_los < min_los:
                    #go left
                    max_k = k
                    k = left_k
                    min_los = left_los
                elif right_los < min_los:
                    #go right
                    min_k = k
                    k = right_k
                    min_los = right_los
                else:
                    min_k = left_k
                    max_k = right_k

                if min_k + 1 >= k >= max_k - 1:
                    break

        print('Best k: %s' % k)

    if dist is None or nn is None:
        dist, nn = kdt.query(points, k=k, return_distance=True)

    if balloon == 'loftsgaarden-knn':  # first proposed by Mack et al., "Multivariate k-Nearest Neighbor Density Estimates"
        r = dist[:, k - 1 + leave_out_n]
        return k / (n * Vk(r, d))
    elif balloon == 'kung-knn':
        r = dist[:, k - 1 + leave_out_n]
        return (k - 1.) / (n * Vk(r, d))
    elif balloon == 'biau-knn':  #Biau et al. (2011)
        return (k * (1.0 + k)) / (
            2.0 * n * np.sum(Vk(dist[:, :k - 1 + leave_out_n], d), axis=1))
    elif balloon == 'loftsgaarden-kernel-knn':
        r = dist[:, k - 1 + leave_out_n]
        # \lambda_i = r^2 / c, where c is the chi-squared quantile at the
        # requested percentile
        lambde = r**2 / chi2.ppf(percentile, d)
        hk = np.repeat(lambde, d).reshape((m, d))
        return mvn.getBalloonDensity(samples.values, hk, points.values, True)
#    elif balloon=='terrel-pilot-knn': #Terrell and Scott (1992)
#        # Page 1258,
#        # Here Terrell et.al. suggest the optimal h_k
#        # We use the rule of thumb pilot for f(y)
    elif balloon == 'terrel-knn':
        # Page 1258:
        # here Terrell et al. suggest using the minimum enclosing ellipsoid
        hk = mve(samples.values, k, points.values, nn, m, d, leave_out_n,
                 tolerance)
        return (k / (n * Vk(1, d) * (hk.prod(axis=1))))
    elif balloon == 'biau-ellipse-knn':
        # Same minimum enclosing ellipsoid as 'terrel-knn' (Terrell et al.,
        # p. 1258), combined with the Biau et al. pairwise weighting
        hk = mve(samples.values, k, points.values, nn, m, d, leave_out_n)
        return (k * (1.0 + k)) / (2.0 * n * np.sum(Vk(hk, d), axis=1))
    elif balloon == 'mittal':  # supposedly a hybrid sample-point / balloon estimate
        estimator = samplepoint.SamplePointKDE(covariance='H2')
        estimator.fit(samples)
        hki = estimator.H_
        r = dist[:, k - 1 + leave_out_n]
        lambde = r**2 / chi2.ppf(0.80, d)
        hk = np.repeat(lambde, d).reshape((m, d))
        return mvn.getSamplePointDensity(
            samples, hki, points) * mvn.getBalloonDensity(
                samples.values, hk, points.values, True)
        #return np.array([mvn.getSamplePointDensity(samples, hk[i]+hki, points[i:i+1]) for i in range(m)])
    elif balloon == 'hall-pilot-knn':
        # get a rule-of-thumb pilot bandwidth from the k nearest neighbours
        # of each query point
        k = np.max([d + 1, k])
        _, nn_hall = kdt.query(points, k=k, return_distance=True)
        hk = np.array([
            mvn.getGlobalBandwidth('rule-of-thumb',
                                   samples.iloc[nn_hall[i]])[2]
            for i in range(m)
        ])
        return mvn.getBalloonDensity(samples.values, hk, points.values, True)
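

# Reference sketch (our addition, avoiding the project-local Vk/mve helpers):
# the simplest branch above, 'loftsgaarden-knn', estimates
# f(x) = k / (n * V_d(r_k(x))), where V_d(r) is the volume of the d-ball
# whose radius r_k(x) reaches the k-th nearest training sample.
def knnDensitySketch(samples, points, k):
    import numpy as np
    from scipy.special import gamma
    from sklearn.neighbors import KDTree

    n, d = samples.shape
    r = KDTree(samples).query(points, k=k)[0][:, k - 1]  # k-th NN distance
    unit_ball = np.pi**(d / 2) / gamma(d / 2 + 1)  # volume of the unit d-ball
    return k / (n * unit_ball * r**d)

# e.g. for X drawn from a 2-D standard normal, knnDensitySketch(X,
# np.zeros((1, 2)), k=int(len(X)**0.5)) approaches 1 / (2 * pi) ~= 0.159.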
Example #6
import numpy as np
import numpy.testing as npt
from scipy.spatial.distance import cdist
from scipy.stats import gaussian_kde

import mvn
from mvn import mahalanobisdist

# Placeholder point sets (the original values are not shown); any two point
# sets of matching dimension (here d = 2) exercise the distance check below:
x = np.array([[0.0, 0.0], [1.0, 2.0]], dtype=np.float64)
y = np.array([[0.5, 1.5], [2.0, 0.0], [1.0, 1.0]], dtype=np.float64)

VI = np.linalg.inv(np.array([[1.0, 0.2], [0.2, 1.0]]))

npt.assert_almost_equal(mahalanobisdist(x, y, VI),
                        cdist(x, y, 'mahalanobis', VI=VI))
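
# mahalanobisdist is project-local; for reference, a minimal NumPy equivalent
# of the pairwise Mahalanobis distance (our own sketch, not the module's
# implementation):
def mahalanobisSketch(x, y, VI):
    # sqrt((u - v)^T VI (u - v)) for every pair (u in x, v in y)
    diff = x[:, None, :] - y[None, :, :]
    return np.sqrt(np.einsum('ijk,kl,ijl->ij', diff, VI, diff))

npt.assert_almost_equal(mahalanobisSketch(x, y, VI),
                        cdist(x, y, 'mahalanobis', VI=VI))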

# dataFrame and test_set are assumed to be defined earlier in the test module.
# The bandwidths must be squared, since a covariance matrix is expected:
h, cov, H = mvn.getGlobalBandwidth('silverman', dataFrame)
f_ones = getDensity(dataFrame)

f_sil = mvn.getSamplePointDensity(dataFrame, cov, test_set)
f_sim = mvn.getSamplePointDensity(dataFrame, H, test_set)
k = gaussian_kde(dataFrame.values.T, 'silverman')
f_sci = k(test_set.T)
#l = sm.nonparametric.KDEMultivariate(data=dataFrame.values.T, var_type='c'*len(dataFrame.columns), bw='normal_reference')
#f_stm = l.pdf(test_set.T)

assert (abs(
    mvn.getSamplePointDensity(dataFrame, np.diag(cov), test_set) -
    f_sil) < 1e-10).all()
assert (abs(
    mvn.getSamplePointDensity(dataFrame, k.covariance, test_set) -
    f_sci) < 1e-10).all()
#assert ((abs(mvn.getSamplePointDensity(dataFrame, l.bw**2, test_set) - f_stm) < 1e-10).all())
# A balloon estimator with a constant bandwidth reduces to the fixed KDE:
assert (abs(
    mvn.getBalloonDensity(dataFrame.values, cov, test_set.values, True) -
    f_sil) < 1e-10).all()