Beispiel #1
0
def _invert_covariance(cov):
    """Invert a covariance matrix, falling back to a truncated pseudo-inverse.

    Parameters
    ----------
    cov : square 2D array
        Matrix to invert.

    Returns
    -------
    ndarray
        ``linalg.inv(cov)`` when the inversion succeeds.  When ``linalg.inv``
        reports a singular matrix, a pseudo-inverse is built from the SVD of
        `cov`, keeping the leading singular values whose cumulative sum stays
        below 95% of the total (the first one is always kept).
    """
    try:
        return linalg.inv(cov)
    except linalg.LinAlgError:
        # Only the "singular matrix" failure is handled here: a bare
        # ``except`` would also hide programming errors such as a NameError.
        u, s, vh = linalg.svd(cov)
        keep = (np.cumsum(s) < np.sum(s) * .95) | \
               ([True] + [False] * (len(s) - 1))
        # Invert only the retained singular values; computing 1/s on a
        # discarded zero value would give inf * 0 = nan.
        inv_s = np.zeros_like(s)
        inv_s[keep] = 1. / s[keep]
        return np.dot(np.dot(vh.T, np.diag(inv_s)), u.T)


def _chi2_fit_errors(data, location, covariance, label, color):
    """Compare the density of Mahalanobis distances with a fitted chi-2 pdf.

    Parameters
    ----------
    data : 2D array, one observation per row
    location : 1D array
        Location estimate subtracted from every row of `data`.
    covariance : square 2D array
        Scatter estimate used to compute the Mahalanobis distances.
    label : str
        Name of the estimate, inserted in the printed messages.
    color : str
        Color used for both curves when the module-level ``PLOT`` is true.

    Returns
    -------
    (mse, imse)
        Mean squared difference between the kernel density estimate and the
        chi-2 pdf on the evaluation grid, and half the sum of squared
        differences multiplied by the grid step.
    """
    p = data.shape[1]
    inv_sigma = _invert_covariance(covariance)
    # distribution of the data's Mahalanobis distances (NaNs are dropped)
    Y = data - location
    R = np.sqrt((np.dot(Y, inv_sigma) * Y).sum(1))
    R = R[~np.isnan(R)]
    # evaluation grid: about 1000 points from 0 to 1.2 * max distance
    r_max = np.amax(R)
    x = np.arange(0., 1.2 * r_max, 0.0012 * r_max)
    # gaussian kernel density estimate, bandwidth 1.05 * std * n^(-1/5)
    n = R.size
    sigma = 1.05 * np.std(R) * n ** (-0.2)
    kernel_arg = (x[:, np.newaxis] - R) / sigma
    fh = ((1 / np.sqrt(2 * np.pi))
          * np.exp(-0.5 * kernel_arg ** 2)).sum(1) / (n * sigma)
    if PLOT:
        plt.plot(x, fh, color=color)
    # chi-2 distribution with p degrees of freedom, shifted and scaled so
    # that its mean and variance match those of the distances
    diff_scale = np.sqrt(R.var() / float(chi2.stats(p, moments='v')))
    diff_loc = R.mean() - float(chi2.stats(p, scale=diff_scale, moments='m'))
    template_pdf = chi2(p, loc=diff_loc, scale=diff_scale).pdf(x)
    if PLOT:
        plt.plot(x, template_pdf, linestyle='--', color=color)
    squared_error = (fh - template_pdf) ** 2
    mse = squared_error.mean()
    imse = 0.5 * squared_error.sum() * (x[1] - x[0])
    if PLOT:
        # single-argument print: same output on Python 2 and Python 3
        print("MSE (%s case) = %s" % (label, mse))
        print("IMSE (%s case) = %s" % (label, imse))
    return mse, imse


def test_fast_mcd(data):
    """Compare naive and `fast_mcd` estimates through a chi-2 fit.

    For each (location, covariance) pair -- the empirical mean/covariance of
    `data`, then the pair returned by ``fast_mcd(data)`` -- the Mahalanobis
    distances of the observations are computed, their density is estimated
    with a gaussian kernel, and that density is compared with a chi-2 pdf
    matched to the distances' mean and variance.

    When the module-level ``PLOT`` flag is true, the four curves are drawn
    on a new figure and the error values are printed.

    Parameters
    ----------
    data : 2D array, shape (n_observations, n_features)

    Returns
    -------
    (mse_naive, mse_robust, imse_naive, imse_robust)
        Error measures of the chi-2 fit for the naive and robust estimates.
    """
    if PLOT:
        plt.figure()

    ### Naive location and scatter estimates
    mse_naive, imse_naive = _chi2_fit_errors(
        data, data.mean(0), np.cov(data.T), 'naive', 'blue')

    ### Robust location and scatter estimates
    robust_location, robust_covariance = fast_mcd(data)
    mse_robust, imse_robust = _chi2_fit_errors(
        data, robust_location, robust_covariance, 'robust', 'green')

    if PLOT:
        plt.legend(('empirical distribution (naive)', 'chi-2 (naive)',
                    'empirical distribution (robust)', 'chi-2 (robust)'),
                   loc='upper center', bbox_to_anchor=(0.5, 0.))
        plt.show()

    return mse_naive, mse_robust, imse_naive, imse_robust
    allsummary[i,:] = np.asarray([v[~np.isnan(v)].mean() for v in values])
    del img
# Drop every row of `allsummary` that contains at least one NaN, since NaNs
# confuse argsort.
nonan_mask = np.isnan(allsummary).sum(1) == 0 
fallsummary = allsummary[nonan_mask,:]
# FIXME: find a better way to trim.
# Per-ROI trimmed list of subjects: for each column, sort the rows and drop
# the 10 lowest and 10 highest; then keep only the rows that survive the
# trimming in every column (intersection of the per-column index lists).
# NOTE(review): `reduce` as a bare builtin implies Python 2; on Python 3 it
# must come from functools -- TODO confirm against the file's imports.
M = np.argsort(fallsummary, 0)[10:-10]
trimmed_ind = reduce(np.intersect1d, M.T)
trimmed_allsummary = fallsummary[trimmed_ind]
del M
# Alternative kept for reference: no trimming at all.
#trimmed_allsummary = fallsummary

# SVD decomposition of the covariance matrix.
# NOTE(review): `covariance` (computed from the trimmed rows) is not used in
# the lines visible below; the SVD is taken on the covariance returned by
# fast_mcd over all NaN-free rows instead -- confirm whether it is still
# needed further down.
covariance = np.cov(trimmed_allsummary.T)
robust_location, robust_covariance = fast_mcd(fallsummary)
u, s, vh = linalg.svd(robust_covariance)
# FIXME: review this criterion (an earlier comment mentioned 75%).
# Keep the leading singular values whose cumulative sum is below 95% of the
# total (the first one is always kept) and zero out the others, then
# assemble the corresponding pseudo-inverse.
inv_s = (1. / s) * \
        ((np.cumsum(s) < np.sum(s) * .95) | ([True]+[False]*(len(s)-1)))
inv_sigma = np.dot(np.dot(vh.T, np.diag(inv_s)), u.T)

# FIXME: center on the median instead?
# Compute the Mahalanobis distance of every NaN-free row to the robust
# location.
Y = fallsummary - robust_location
# Alternative kept for reference: center on the empirical mean.
#Y = fallsummary - np.mean(fallsummary, 0)
R = np.sqrt((np.dot(Y, inv_sigma) * Y).sum(1))
# Find the outliers threshold: start from an ascending-sorted copy of the
# non-NaN distances.
sortedR = R[~np.isnan(R)].copy()
sortedR.sort()