Beispiel #1
0
def sigclust(X, mc_iters=100, method=2, verbose=True, scale=True,
             p_op=True, ngrains=100):
    """
    Return tuple (p, clust) where p is the p-value for k-means++ clustering of
        data matrix X with k==2, and clust is a (binary) array of length
        num_samples = X.shape[0] whose Nth value is the cluster assigned to
        the Nth sample (row) of X at the k-means step.
        Equivalently, clust == k_means(X,2)[1].

    p is computed as the fraction of the mc_iters simulated cluster indices
        that are less than or equal to the cluster index of the input data.

    mc_iters is an integer (>= 1) giving the number of iterations in
        the Monte Carlo step.

    method = 0 uses the sample covariance matrix eigenvalues directly
        for simulation.
    method = 1 applies hard thresholding.
    method = 2 applies soft thresholding.

    scale = True  applies mean centering and variance normalization
        (sigma = 1) preprocessing to the input X.
    verbose = True prints some additional statistics of input data.

    When method == 2 (soft thresholding), p_op indicates whether to perform
        some additional optimization on the parameter tau.  If p_op == False,
        the parameter tau is set to that which best preserves the trace of
        the sample covariance matrix (this is just the output of comp_sim_tau):
        sum_{i}lambda_i == sum_{i}({lambda_i - tau - sigma^2}_{+} + sigma^2).
        If p_op == True, tau is set to some value between 0 and the output
        of comp_sim_tau which maximizes the relative size of the largest
        simulation variance. ngrains is then the number of values checked in
        this optimization.  p_op and ngrains are ignored for method != 2.

    Raises ValueError if mc_iters < 1 (the p-value would be undefined).
    """
    # Fail early: with no iterations the final division has a zero
    # denominator, and that would only surface after all the work below.
    if mc_iters < 1:
        raise ValueError("mc_iters must be a positive integer, got %r"
                         % (mc_iters,))

    if scale:
        print("Scaling and centering input matrix.")
        X = pp_scale(X)
    num_samples, num_features = X.shape
    if verbose:
        print("""Number of samples: %d\nNumber of features: %d""" %
              (num_samples, num_features))

    # Cluster index and 2-means labels of the actual data.
    ci, labels = cluster_index_2(X)
    print("Cluster index of input data: %f" % ci)

    mad = MAD(X)
    if verbose:
        print("Median absolute deviation from the median of input data: %f"
              % mad)

    bg_noise_var = (mad * normalizer) ** 2
    print("Estimated variance for background noise: %f"
          % bg_noise_var)

    # np.cov returns a 0-d array when there is a single feature; lift it to
    # 1x1 so the eigen-solver accepts it.
    sample_cov_mat = np.atleast_2d(np.cov(X.T))

    # The covariance matrix is symmetric, so use the symmetric solver: it
    # always yields real eigenvalues, whereas the general solver may return
    # a complex array.  eigvalsh sorts ascending; reverse to largest-first.
    eig_vals = np.linalg.eigvalsh(sample_cov_mat)[::-1]

    sim_vars = comp_sim_vars(eig_vals, bg_noise_var, method, p_op, ngrains)

    if verbose:
        print("The %d variances for simulation have\nmean: %f\n"
              "standard deviation: %f."
              % (X.shape[1], np.mean(sim_vars), np.std(sim_vars)))

    # Null model: zero-mean Gaussian with diagonal covariance.
    sim_cov_mat = np.diag(sim_vars)

    # MONTE CARLO STEP

    # Counter for simulated cluster indices less than or equal to ci.
    lte = 0
    print("""Simulating %d cluster indices.  Please wait...""" %
          mc_iters)
    CIs = np.zeros(mc_iters)
    for i in range(mc_iters):
        # Generate mc_iters datasets
        # each of the same size as
        # the original input.
        sim_data = np.random.multivariate_normal(
            np.zeros(num_features), sim_cov_mat, num_samples)
        ci_sim = (cluster_index_2(sim_data))[0]
        CIs[i] = ci_sim
        if ci_sim <= ci:
            lte += 1
    print("Simulation complete.")
    print("The simulated cluster indices had\n"
          "mean: %f\nstandard deviation: %f." %
          (np.mean(CIs), np.std(CIs)))

    # Empirical p-value: share of simulations at least as "clustered"
    # (cluster index no larger) as the observed data.
    p = lte / mc_iters
    print("In %d iterations there were\n"
          "%d cluster indices <= the cluster index %f\n"
          "of the input data." % (mc_iters, lte, ci))
    print("p-value:  %f" % p)
    return p, labels
Beispiel #2
0
def sigclust(X, mc_iters=100, thresh = 2,
             verbose=True, scale = True):
    """
    Return tuple (p, clust) where p is the p-value for k-means++ clustering
        of array X with k==2, and clust is a (binary) array of length
        num_samples = X.shape[0] whose Nth value is the cluster assigned to
        the Nth sample (row) of X at the k-means step.
        (Equivalently, sigclust(X)[1] == k_means(X,2)[1].)

    p is computed as the fraction of the mc_iters simulated cluster indices
        that are less than or equal to the cluster index of the input data.

    mc_iters is an integer (>= 1) giving the number of iterations in the
        Monte Carlo step.
    thresh is passed unchanged to comp_sim_vars together with the sample
        covariance eigenvalues and the background noise variance;
        presumably it selects the thresholding applied to the eigenvalues
        (see comp_sim_vars for the accepted values).
    scale = True  applies mean centering and variance normalization
        (sigma = 1) preprocessing to the input X.
    verbose = True prints some additional statistics of input data.

    Raises ValueError if mc_iters < 1 (the p-value would be undefined).
    """
    # Fail early: with no iterations the final division has a zero
    # denominator, and that would only surface after all the work below.
    if mc_iters < 1:
        raise ValueError("mc_iters must be a positive integer, got %r"
                         % (mc_iters,))

    if scale:
        print("Scaling and centering input matrix.")
        X = pp_scale(X)
    num_samples, num_features = X.shape
    if verbose:
        print("\nNumber of samples: %d, \nNumber of features: %d" %
              (num_samples, num_features))

    # Cluster index and 2-means labels of the actual data.
    ci, labels = cluster_index_2(X)
    print("Cluster index of input data: %f" % ci)

    mad = MAD(X)
    if verbose:
        print("Median absolute deviation \n"
              "from the median of input data:  %f" % mad)

    bg_noise_var = (mad*normalizer)**2
    print("Estimated variance for \n"
          "background noise: %f" % bg_noise_var)

    # np.cov returns a 0-d array when there is a single feature; lift it to
    # 1x1 so the eigen-solver accepts it.
    sample_cov_mat = np.atleast_2d(np.cov(X.T))

    # Only the eigenvalues are needed.  The covariance matrix is symmetric,
    # so use the symmetric solver: it always yields real eigenvalues, whereas
    # the general solver may return a complex array.  eigvalsh sorts
    # ascending; reverse to largest-first.
    eig_vals = np.linalg.eigvalsh(sample_cov_mat)[::-1]

    sim_vars = comp_sim_vars(eig_vals, bg_noise_var, thresh)

    if verbose:
        print("The %d variances for simulation have\n"
              " mean: %f \n"
              "standard deviation: %f." %
              (X.shape[1],
               np.mean(sim_vars),
               np.std(sim_vars)))

    # Null model: zero-mean Gaussian with diagonal covariance.
    sim_cov_mat = np.diag(sim_vars)

    ## MONTE CARLO STEP

    # Counter for simulated cluster indices less than or equal to ci.
    lte = 0
    print("Simulating %d cluster indices.\n"
          "Please wait..." %
          mc_iters)
    CIs = np.zeros(mc_iters)
    for i in range(mc_iters):
        # Generate mc_iters datasets each of the same size as the original
        # input (sim_data.shape == X.shape).
        sim_data = np.random.multivariate_normal(
            np.zeros(num_features), sim_cov_mat, num_samples)

        ci_sim = (cluster_index_2(sim_data))[0]
        CIs[i] = ci_sim
        if ci_sim <= ci:
            lte += 1

    print("Simulation complete.")
    print("The simulated cluster indices had\n"
          "mean: %f\n"
          "standard deviation: %f." %
          (np.mean(CIs), np.std(CIs)))

    # Empirical p-value: share of simulations whose cluster index is no
    # larger than that of the observed data.
    p = lte / mc_iters
    print("In %d iterations there were \n"
          "%d cluster indices <= the cluster index %f\n"
          " of the input data." %
          (mc_iters, lte, ci))
    print("p-value:  %f" % p)
    return p, labels