Example #1
0
def compute_mmd2u_and_null_distributions(Ks, m, n, iterations=1000,
                                         seed=0, parallel=True,
                                         permutation=None, n_jobs=-1,
                                         verbose=False):
    """Compute the MMD2u statistic and its null distribution for each unit.

    One kernel matrix per unit is expected in ``Ks``. Each null
    distribution is approximated with the given number of iterations,
    either sequentially or in parallel (multiprocess, with ``n_jobs``
    processes).

    Parameters
    ----------
    Ks : sequence of kernel matrices
        One kernel matrix per unit; must support ``len()`` and iteration.
    m, n : int
        Forwarded unchanged to ``MMD2u`` and to the null-distribution
        helpers (presumably the sizes of the two samples -- confirm
        against those helpers).
    iterations : int
        Number of permutations used to approximate each null distribution.
    seed : int
        Random seed forwarded (as ``random_state``) to
        ``compute_null_distribution`` when ``permutation`` is None. The
        same seed is deliberately used for every unit.
    parallel : bool
        If True, the null distributions are computed through ``Parallel``.
    permutation : array or None
        Precomputed permutations (array of size iterations x (m+n)). When
        given, they are used instead of randomly generated ones, which
        enforces reproducibility and keeps the same permutation schema
        for each kernel/unit. This is important during parallel
        computation.
    n_jobs : int
        Number of processes for the parallel computation; -1 means
        'use all available cores'.
    verbose : bool
        Forwarded to ``compute_null_distribution`` in the sequential
        branch only; the progress messages printed by this function are
        unconditional.

    Returns
    -------
    unit_statistic : ndarray of shape (n_units,)
        MMD2u value of each unit.
    unit_statistic_permutation : ndarray
        Null-distribution values, one row per unit. The sequential branch
        fills an (n_units, iterations) array; the parallel branch stacks
        the helper results with ``np.vstack``.
    """
    n_units = len(Ks)
    unit_statistic = np.zeros(n_units)
    unit_statistic_permutation = np.zeros((n_units, iterations))

    print("Computing MMD2u for each unit.")
    for i, K in enumerate(Ks):
        unit_statistic[i] = MMD2u(K, m, n)

    print("Computing MMD2u's null-distribution, for each unit.")
    if n_units == 0:
        # Nothing to permute. Returning here keeps the parallel branch
        # consistent with the sequential one: np.vstack() raises a
        # ValueError when it is handed an empty list of results.
        return unit_statistic, unit_statistic_permutation

    if not parallel:
        for i, K in enumerate(Ks):
            if permutation is None:
                # NOTE: IT IS FUNDAMENTAL THAT THE SAME SEED IS USED
                # FOR EACH UNIT!
                mmd2u_null = compute_null_distribution(K, m, n,
                                                       iterations=iterations,
                                                       verbose=verbose,
                                                       random_state=seed,
                                                       marker_interval=100)
            else:
                mmd2u_null = compute_null_distribution_given_permutations(
                    K, m, n, permutation, iterations=iterations)

            unit_statistic_permutation[i, :] = mmd2u_null
    else:
        print("Parallel computation!")
        if permutation is None:
            # NOTE: IT IS FUNDAMENTAL THAT THE SAME SEED IS USED FOR EACH
            # UNIT!
            results = Parallel(n_jobs=n_jobs, verbose=10)(
                delayed(compute_null_distribution)(
                    K, m, n, iterations=iterations, verbose=False,
                    random_state=seed)
                for K in Ks)
        else:
            results = Parallel(n_jobs=n_jobs, verbose=10)(
                delayed(compute_null_distribution_given_permutations)(
                    K, m, n, permutation, iterations=iterations)
                for K in Ks)

        unit_statistic_permutation = np.vstack(results)

    return unit_statistic, unit_statistic_permutation
Example #2
0
def apply_ktst(K, y, iterations=10000, subjects=False, verbose=True):
    """
    Compute MMD^2_u, its null distribution and the p-value of the
    kernel two-sample test.

    Parameters:
    ----------
    K: array-like
        Kernel matrix
    y: array_like
        Class labels; exactly two classes, encoded as 0 and 1.
    iterations: int
        Number of permutations used to approximate the null distribution.
        It also sets the resolution (1 / iterations) of the p-value.
    subjects: bool
        If True, the permutations are drawn with
        ``_permutation_subjects_ktst(y)`` instead of being generated
        inside ``compute_null_distribution``.
    verbose: bool
        Verbosity

    Returns:
    -------
    mmd2u: float
        MMD^2_u value.
    mmd2u_null: array
        Null distribution of the MMD^2_u
    p_value: float
         p-value, never smaller than 1 / iterations.

    Raises:
    ------
    AssertionError
        If ``y`` does not contain exactly two distinct labels.
    ValueError
        If the two labels are not 0 and 1.
    """
    # Work on an ndarray so that list/tuple labels can be compared
    # element-wise; the caller's ``y`` itself is left untouched.
    labels = np.asarray(y)
    assert len(np.unique(labels)) == 2, 'KTST only works on binary problems'

    # Assuming that the first m rows of the kernel matrix are from one
    # class and the other n rows from the second class.
    m = int(np.count_nonzero(labels == 0))
    n = int(np.count_nonzero(labels == 1))
    if m == 0 or n == 0:
        # Two distinct labels that are not {0, 1} would otherwise lead to
        # an empty class being handed to MMD2u.
        raise ValueError("KTST expects class labels 0 and 1, got %s"
                         % (np.unique(labels).tolist(),))

    mmd2u = MMD2u(K, m, n)
    if verbose:
        print("MMD^2_u = %s" % mmd2u)
        print("Computing the null distribution.")
    if subjects:
        perms = [_permutation_subjects_ktst(y) for _ in range(iterations)]
        mmd2u_null = compute_null_distribution_given_permutations(
            K, m, n, perms, iterations)
    else:
        mmd2u_null = compute_null_distribution(K,
                                               m,
                                               n,
                                               iterations,
                                               verbose=verbose)

    # The estimate is floored at the resolution of the permutation test
    # so that a p-value of exactly zero is never reported.
    p_value = max(1.0 / iterations,
                  (mmd2u_null > mmd2u).sum() / float(iterations))
    if verbose:
        print("p-value ~= %s \t (resolution : %s)" %
              (p_value, 1.0 / iterations))

    return mmd2u, mmd2u_null, p_value
Example #3
0
def apply_ktst(K, y, iterations=10000, subjects=False, verbose=True):
    """
    Compute MMD^2_u, its null distribution and the p-value of the
    kernel two-sample test.

    Parameters:
    ----------
    K: array-like
        Kernel matrix
    y: array_like
        Class labels; exactly two classes, encoded as 0 and 1.
    iterations: int
        Number of permutations used to approximate the null distribution;
        1 / iterations is the resolution of the p-value.
    subjects: bool
        When True, one permutation per iteration is obtained from
        ``_permutation_subjects_ktst(y)`` and handed to
        ``compute_null_distribution_given_permutations``; otherwise
        ``compute_null_distribution`` is used.
    verbose: bool
        Verbosity

    Returns:
    -------
    mmd2u: float
        MMD^2_u value.
    mmd2u_null: array
        Null distribution of the MMD^2_u
    p_value: float
         p-value, bounded below by 1 / iterations.

    Raises:
    ------
    AssertionError
        If ``y`` does not contain exactly two distinct labels.
    ValueError
        If the two labels are not 0 and 1.
    """
    # Element-wise comparison below needs an ndarray; a plain list would
    # compare as a whole. The original ``y`` is still what gets passed to
    # the permutation helper.
    labels = np.asarray(y)
    assert len(np.unique(labels)) == 2, 'KTST only works on binary problems'

    # Assuming that the first m rows of the kernel matrix are from one
    # class and the other n rows from the second class.
    m = int(np.count_nonzero(labels == 0))
    n = int(np.count_nonzero(labels == 1))
    if m == 0 or n == 0:
        # Binary labels other than {0, 1} would silently produce an
        # empty class; fail early with an explicit message instead.
        raise ValueError("KTST expects class labels 0 and 1, got %s"
                         % (np.unique(labels).tolist(),))

    mmd2u = MMD2u(K, m, n)
    if verbose:
        print("MMD^2_u = %s" % mmd2u)
        print("Computing the null distribution.")
    if subjects:
        perms = [_permutation_subjects_ktst(y) for _ in range(iterations)]
        mmd2u_null = compute_null_distribution_given_permutations(K, m, n,
                                                                  perms,
                                                                  iterations)
    else:
        mmd2u_null = compute_null_distribution(K, m, n, iterations,
                                               verbose=verbose)

    # Floor the estimate at the test resolution: a permutation test
    # cannot support a p-value below 1 / iterations.
    p_value = max(1.0/iterations, (mmd2u_null > mmd2u).sum()
                  / float(iterations))
    if verbose:
        print("p-value ~= %s \t (resolution : %s)" % (p_value, 1.0/iterations))

    return mmd2u, mmd2u_null, p_value
def MMD_single_modality(data_b6, data_btbr, modality='Structural',
                        iterations=100000, plot=True):
    """
    Run the kernel two-sample test on one modality: RBF kernel + KTST.

    The two groups are stacked, an RBF kernel matrix is built with
    gamma = 1 / sigma2, where sigma2 is the squared median of the
    pairwise Euclidean distances, and MMD^2_u is compared against a
    permutation-based null distribution.

    Parameters:
    -----------
    data_b6: array like
        Vectors of the first group, one row per sample (presumably
        already embedded -- no embedding is computed here).
    data_btbr: array like
        Vectors of the second group, one row per sample.
    modality: str
        Name of the modality, only used in the printed messages and in
        the plot legend.
    iterations: int
        Number of permutations used for the null distribution.
    plot: bool
        If True, plot the similarity matrix and the histogram of the
        null distribution together with the observed MMD^2_u.

    Return:
    ----------
        MMD distance, null_distribution, p-value
    """
    print('Analyzing %s data' % (modality))

    # Concatenating the data
    vectors = np.vstack((data_b6, data_btbr))
    n_b6 = len(data_b6)
    n_btbr = len(data_btbr)

    # Median heuristic for the RBF kernel bandwidth.
    sigma2 = np.median(pairwise_distances(vectors, metric='euclidean'))**2
    k_matrix = pairwise_kernels(vectors, metric='rbf', gamma=1.0/sigma2)

    if plot:
        plot_similarity_matrix(k_matrix)

    # Computing the MMD
    mmd2u = MMD2u(k_matrix, n_b6, n_btbr)
    print("MMD^2_u = %s" % mmd2u)

    # Computing the null-distribution
    mmd2u_null = compute_null_distribution(k_matrix, n_b6, n_btbr, iterations,
                                           seed=123, verbose=False)

    print(np.max(mmd2u_null))
    # Computing the p-value, floored at the resolution of the test.
    p_value = max(1.0/iterations, (mmd2u_null > mmd2u).sum() / float(iterations))
    print("p-value ~= %s \t (resolution : %s)" % (p_value, 1.0/iterations))
    print('Number of stds from MMD^2_u to mean value of null distribution: %s' % ((mmd2u - np.mean(mmd2u_null))/np.std(mmd2u_null)))

    if plot:
        fig = plt.figure()
        ax = fig.add_subplot(111)
        # ``density`` replaces the ``normed`` keyword, which is no longer
        # accepted by recent matplotlib releases.
        prob, _, _ = plt.hist(mmd2u_null, bins=50, density=True)
        # Marker heights are fractions of the tallest bar so that they
        # stay visible whatever the scale of the histogram.
        ax.plot(mmd2u, prob.max()/30, 'w*', markersize=15,
                markeredgecolor='k', markeredgewidth=2,
                label="$%s MMD^2_u = %s$" % (modality, mmd2u))

        ax.annotate('p-value: %s' % (p_value),
                    xy=(float(mmd2u), prob.max()/9.), xycoords='data',
                    xytext=(-105, 30), textcoords='offset points',
                    bbox=dict(boxstyle="round", fc="1."),
                    arrowprops=dict(arrowstyle="->",
                                    connectionstyle="angle,angleA=0,angleB=90,rad=10"),
                    )
        plt.xlabel('$MMD^2_u$')
        plt.ylabel('$p(MMD^2_u)$')
        plt.legend(numpoints=1)
        print('')

    # The docstring has always promised these values; hand them back.
    return mmd2u, mmd2u_null, p_value
def compute_mmd_struc_func(k_mat, struc_b6, struc_btbr, func_b6, func_btbr, iterations=100000, plot=True):
    """
    Computes the mmd values for the structural and functional problems and plot
    them with the null distributions.

    The kernel matrix is split into a structural block (its first
    len(struc_b6) + len(struc_btbr) rows/columns) and a functional block
    (the remaining rows/columns). Both MMD values are compared against a
    single null distribution computed on the whole kernel matrix. The
    results are printed (and optionally plotted); nothing is returned.

    Parameters:
    ----------
    k_mat: ndarray
           Kernel matrix
    struc_b6: array like
           Structural vectors for B6 class
    struc_btbr: array like
           Structural vectors for BTBR class
    func_b6: array like
           Functional vectors for B6 class
    func_btbr: array like
           Functional vectors for BTBR class
    iterations: int
           Number of permutations used for the null distribution.
    plot: bool
           If True, plot the histogram of the null distribution with
           both observed MMD values annotated with their p-values.
    """
    # Computing the number of samples belonging to structural data in order
    # to split the kernel matrix.
    l_struc = len(struc_b6) + len(struc_btbr)

    # Computing MMD values
    struc_mmd = MMD2u(k_mat[:l_struc][:, :l_struc], len(struc_b6), len(struc_btbr))
    func_mmd = MMD2u(k_mat[l_struc:][:, l_struc:], len(func_b6), len(func_btbr))
    print("struc_mmd = %s, func_mmd = %s" % (struc_mmd, func_mmd))

    # Computing the null-distribution
    mmd2u_null_all = compute_null_distribution(
        k_mat,
        struc_b6.shape[0] + func_b6.shape[0],
        struc_btbr.shape[0] + func_btbr.shape[0],
        iterations,
        seed=123,
        verbose=False,
    )
    # Computing the p-values, floored at the resolution of the test.
    struc_p_value = max(1.0 / iterations, (mmd2u_null_all > struc_mmd).sum() / float(iterations))
    print("struc_p-value ~= %s \t (resolution : %s)" % (struc_p_value, 1.0 / iterations))
    func_p_value = max(1.0 / iterations, (mmd2u_null_all > func_mmd).sum() / float(iterations))
    print("func_p-value ~= %s \t (resolution : %s)" % (func_p_value, 1.0 / iterations))

    if plot:
        fig = plt.figure()
        ax = fig.add_subplot(111)
        # ``density`` replaces the ``normed`` keyword, which is no longer
        # accepted by recent matplotlib releases.
        prob, _, _ = plt.hist(mmd2u_null_all, bins=50, density=True)
        ax.plot(
            struc_mmd,
            prob.max() / 30,
            "w*",
            markersize=15,
            markeredgecolor="k",
            markeredgewidth=2,
            label="$MMD^2_S = %s$" % struc_mmd,
        )
        ax.plot(
            func_mmd,
            prob.max() / 30,
            "w^",
            markersize=15,
            markeredgecolor="k",
            markeredgewidth=2,
            label="$MMD^2_F = %s$" % func_mmd,
        )
        plt.xlabel("$MMD^2_u$")
        plt.ylabel("$p(MMD^2_u)$")

        # NOTE(review): the arrows point at a fixed height of 4.0 in data
        # coordinates, independent of the histogram scale -- confirm that
        # this is intended.
        ax.annotate(
            "p-value: %s" % (struc_p_value),
            xy=(float(struc_mmd), 4.0),
            xycoords="data",
            xytext=(-105, 30),
            textcoords="offset points",
            bbox=dict(boxstyle="round", fc="1."),
            arrowprops=dict(arrowstyle="->", connectionstyle="angle,angleA=0,angleB=90,rad=10"),
        )

        ax.annotate(
            "p-value: %s" % (func_p_value),
            xy=(float(func_mmd), 4.0),
            xycoords="data",
            xytext=(10, 30),
            textcoords="offset points",
            bbox=dict(boxstyle="round", fc="1."),
            arrowprops=dict(arrowstyle="->", connectionstyle="angle,angleA=0,angleB=90,rad=10"),
        )

        plt.legend(numpoints=1)
Example #6
0
def compute_mmd2u_and_null_distributions(Ks,
                                         m,
                                         n,
                                         iterations=1000,
                                         seed=0,
                                         parallel=True,
                                         permutation=None,
                                         n_jobs=-1,
                                         verbose=False):
    """Compute the scaled MMD2u statistic and its null distribution per unit.

    One kernel matrix per unit is expected in ``Ks``. Each null
    distribution is approximated with the given number of iterations,
    either sequentially or in parallel (multiprocess, with ``n_jobs``
    processes).

    Parameters
    ----------
    Ks : sequence of kernel matrices
        One kernel matrix per unit; must support ``len()`` and iteration.
    m, n : int
        Forwarded unchanged to ``MMD2u`` and to the null-distribution
        helpers (presumably the sizes of the two samples -- confirm
        against those helpers). Their sum also scales the statistic.
    iterations : int
        Number of permutations used to approximate each null distribution.
    seed : int
        Random seed forwarded to ``compute_null_distribution`` when
        ``permutation`` is None. The same seed is deliberately used for
        every unit.
    parallel : bool
        If True, the null distributions are computed through ``Parallel``.
    permutation : array or None
        Precomputed permutations (array of size iterations x (m+n)). When
        given, they are used instead of randomly generated ones, which
        enforces reproducibility and keeps the same permutation schema
        for each kernel/unit. This is important during parallel
        computation.
    n_jobs : int
        Number of processes for the parallel computation; -1 means
        'use all available cores'.
    verbose : bool
        Forwarded to ``compute_null_distribution`` in the sequential
        branch only; the progress messages printed by this function are
        unconditional.

    Returns
    -------
    unit_statistic : ndarray of shape (n_units,)
        (n + m) * MMD2u of each unit.
    unit_statistic_permutation : ndarray
        Null-distribution values, one row per unit. The sequential branch
        fills an (n_units, iterations) array; the parallel branch stacks
        the helper results with ``np.vstack``.
    """
    n_units = len(Ks)
    unit_statistic = np.zeros(n_units)
    unit_statistic_permutation = np.zeros((n_units, iterations))

    print("Computing MMD2u for each unit.")
    for i, K in enumerate(Ks):
        # For using the asymptotic distribution of MMD the statistic is
        # (n+m)*mmd2u.
        # NOTE(review): the null values below are stored exactly as the
        # helpers return them; whether they are on the same (n+m) scale
        # is not visible here -- confirm before comparing the two.
        unit_statistic[i] = (n + m) * MMD2u(K, m, n)

    print("Computing MMD2u's null-distribution, for each unit.")
    if n_units == 0:
        # Nothing to permute. Returning here keeps the parallel branch
        # consistent with the sequential one: np.vstack() raises a
        # ValueError when it is handed an empty list of results.
        return unit_statistic, unit_statistic_permutation

    if not parallel:
        for i, K in enumerate(Ks):
            if permutation is None:
                # NOTE: IT IS FUNDAMENTAL THAT THE SAME SEED IS USED FOR
                # EACH UNIT!
                mmd2u_null = compute_null_distribution(
                    K,
                    m,
                    n,
                    iterations=iterations,
                    verbose=verbose,
                    seed=seed,
                    marker_interval=100)
            else:
                mmd2u_null = compute_null_distribution_given_permutations(
                    K, m, n, permutation, iterations=iterations)

            unit_statistic_permutation[i, :] = mmd2u_null
    else:
        print("Parallel computation!")
        if permutation is None:
            # NOTE: IT IS FUNDAMENTAL THAT THE SAME SEED IS USED FOR EACH
            # UNIT!
            results = Parallel(n_jobs=n_jobs, verbose=10)(
                delayed(compute_null_distribution)(K,
                                                   m,
                                                   n,
                                                   iterations=iterations,
                                                   verbose=False,
                                                   seed=seed) for K in Ks)
        else:
            results = Parallel(n_jobs=n_jobs, verbose=10)(
                delayed(compute_null_distribution_given_permutations)(
                    K, m, n, permutation, iterations=iterations) for K in Ks)

        unit_statistic_permutation = np.vstack(results)

    return unit_statistic, unit_statistic_permutation