def sigclust(X, mc_iters=100, method=2, verbose=True, scale=True, p_op=True,
             ngrains=100):
    """Return a tuple (p, clust) where p is the p-value for k-means++
    clustering of data matrix X with k == 2, and clust is a (binary) array
    of length num_samples = X.shape[0] whose Nth value is the cluster
    assigned to the Nth sample (row) of X at the k-means step.
    Equivalently, clust == k_means(X, 2)[1].

    mc_iters is an integer giving the number of iterations in the Monte
    Carlo step.

    method = 0 uses the sample covariance matrix eigenvalues directly for
    simulation.
    method = 1 applies hard thresholding.
    method = 2 applies soft thresholding.

    scale = True applies mean centering and variance normalization
    (sigma = 1) preprocessing to the input X.

    verbose = True prints some additional statistics of the input data.

    When method == 2 (soft thresholding), p_op indicates whether to perform
    some additional optimization on the parameter tau.  If p_op == False,
    tau is set to the value that best preserves the trace of the sample
    covariance matrix (this is just the output of comp_sim_tau):

        sum_i(lambda_i) == sum_i((lambda_i - tau - sigma^2)_+ + sigma^2).

    If p_op == True, tau is set to some value between 0 and the output of
    comp_sim_tau which maximizes the relative size of the largest
    simulation variance.  ngrains is the number of values checked in this
    optimization.

    p_op and ngrains are ignored for method != 2.
    """
    if scale:
        print("Scaling and centering input matrix.")
        X = pp_scale(X)
    num_samples, num_features = X.shape
    if verbose:
        print("Number of samples: %d\nNumber of features: %d"
              % (num_samples, num_features))
    ci, labels = cluster_index_2(X)
    print("Cluster index of input data: %f" % ci)
    mad = MAD(X)
    if verbose:
        print("Median absolute deviation from the median of input data: %f"
              % mad)
    # Estimate the background noise variance from the MAD.
    bg_noise_var = (mad * normalizer) ** 2
    print("Estimated variance for background noise: %f" % bg_noise_var)
    sample_cov_mat = np.cov(X.T)
    eig_vals = np.linalg.eigvals(sample_cov_mat)
    sim_vars = comp_sim_vars(eig_vals, bg_noise_var, method, p_op, ngrains)
    if verbose:
        print("The %d variances for simulation have\nmean: %f\n"
              "standard deviation: %f."
              % (X.shape[1], np.mean(sim_vars), np.std(sim_vars)))
    sim_cov_mat = np.diag(sim_vars)

    # MONTE CARLO STEP
    # Counter for simulated cluster indices less than or equal to ci.
    lte = 0
    print("Simulating %d cluster indices. Please wait..." % mc_iters)
    CIs = np.zeros(mc_iters)
    for i in np.arange(mc_iters):
        # Generate mc_iters datasets, each of the same size as the
        # original input.
        sim_data = np.random.multivariate_normal(
            np.zeros(num_features), sim_cov_mat, num_samples)
        ci_sim = cluster_index_2(sim_data)[0]
        CIs[i] = ci_sim
        if ci_sim <= ci:
            lte += 1
    print("Simulation complete.")
    print("The simulated cluster indices had\n"
          "mean: %f\nstandard deviation: %f."
          % (np.mean(CIs), np.std(CIs)))
    p = lte / mc_iters
    print("In %d iterations there were\n"
          "%d cluster indices <= the cluster index %f\n"
          "of the input data." % (mc_iters, lte, ci))
    print("p-value: %f" % p)
    return p, labels
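
# Illustrative sketch only (not the module's comp_sim_tau / comp_sim_vars;
# the helper name below is hypothetical): the soft-thresholding rule
# described in the sigclust docstring above.  Given sample eigenvalues
# lambda_i and a background noise variance sigma^2, each simulation
# variance is (lambda_i - tau - sigma^2)_+ + sigma^2 == max(lambda_i - tau,
# sigma^2), with tau chosen here by bisection so that the total variance
# sum_i(lambda_i) is preserved.  numpy is assumed imported as np at module
# level, as in the functions above.
def _soft_threshold_sketch(eig_vals, bg_noise_var, n_steps=100):
    eig_vals = np.asarray(eig_vals, dtype=float)
    total = eig_vals.sum()

    def trace_after(tau):
        # Trace of the simulation covariance for a given tau.
        return np.maximum(eig_vals - tau, bg_noise_var).sum()

    if eig_vals.size * bg_noise_var >= total:
        # Flooring at sigma^2 alone already meets or exceeds the trace,
        # so no positive eigenvalue mass survives thresholding.
        return np.full_like(eig_vals, bg_noise_var), eig_vals.max()

    # trace_after is non-increasing in tau, is >= total at tau = 0, and
    # drops to num_features * sigma^2 at tau = max(lambda_i); bisect for
    # the tau at which the trace is preserved.
    lo, hi = 0.0, eig_vals.max()
    for _ in range(n_steps):
        mid = 0.5 * (lo + hi)
        if trace_after(mid) > total:
            lo = mid
        else:
            hi = mid
    tau = 0.5 * (lo + hi)
    return np.maximum(eig_vals - tau, bg_noise_var), tau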
def sigclust(X, mc_iters=100, thresh=2, verbose=True, scale=True, floor=0):
    """Return a tuple whose first element is the p-value for k-means++
    clustering of array X with k == 2.  The second element of the returned
    tuple is a (binary) array of length num_samples = X.shape[0] whose Nth
    value is the cluster assigned to the Nth sample (row) of X at the
    k-means step.  (Equivalently, sigclust(X)[1] == k_means(X, 2)[1].)

    mc_iters is an integer giving the number of iterations in the Monte
    Carlo step.

    floor is an optional minimum on the simulation variances.

    scale = True applies mean centering and variance normalization
    (sigma = 1) preprocessing to the input X.

    verbose = True prints some additional statistics of the input data.
    """
    if scale:
        print("Scaling and centering input matrix.")
        X = pp_scale(X)
    num_samples, num_features = X.shape
    if verbose:
        print("Number of samples: %d\nNumber of features: %d"
              % (num_samples, num_features))
    ci, labels = cluster_index_2(X)
    print("Cluster index of input data: %f" % ci)
    mad = MAD(X)
    if verbose:
        print("Median absolute deviation from the median of input data: %f"
              % mad)
    bg_noise_var = (mad * normalizer) ** 2
    print("Estimated variance for background noise: %f" % bg_noise_var)
    # Simulation variances are not allowed below the user-supplied floor
    # or the estimated background noise variance.
    floor_final = max(floor, bg_noise_var)
    sample_cov_mat = np.cov(X.T)
    eig_vals, eig_vects = np.linalg.eig(sample_cov_mat)
    sim_vars = comp_sim_vars(eig_vals, bg_noise_var, thresh)
    sim_vars = np.maximum(sim_vars, floor_final)
    if verbose:
        print("The %d variances for simulation have\nmean: %f\n"
              "standard deviation: %f."
              % (X.shape[1], np.mean(sim_vars), np.std(sim_vars)))
    sim_cov_mat = np.diag(sim_vars)

    # MONTE CARLO STEP
    # Counter for simulated cluster indices less than or equal to ci.
    lte = 0
    print("Simulating %d cluster indices. Please wait..." % mc_iters)
    CIs = np.zeros(mc_iters)
    for i in np.arange(mc_iters):
        # Generate mc_iters datasets, each of the same size as the
        # original input.
        sim_data = np.random.multivariate_normal(
            np.zeros(num_features), sim_cov_mat, num_samples)
        # Now sim_data.shape == X.shape.
        ci_sim = cluster_index_2(sim_data)[0]
        CIs[i] = ci_sim
        if ci_sim <= ci:
            lte += 1
    print("Simulation complete.")
    print("The simulated cluster indices had\n"
          "mean: %f\nstandard deviation: %f."
          % (np.mean(CIs), np.std(CIs)))
    # P value
    p = lte / mc_iters
    print("In %d iterations there were\n"
          "%d cluster indices <= the cluster index %f\n"
          "of the input data." % (mc_iters, lte, ci))
    print("p-value: %f" % p)
    return p, labels
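
# Minimal usage sketch.  It assumes the module's helpers (pp_scale,
# cluster_index_2, MAD, comp_sim_vars and the constant `normalizer`) are
# defined elsewhere in this module and that numpy is imported as np at
# module level; the synthetic data below is purely illustrative.  A small
# p-value indicates significant two-cluster structure.
if __name__ == "__main__":
    rng = np.random.RandomState(0)
    # 50 samples around the origin and 50 samples shifted along the first
    # coordinate, in 10 dimensions.
    a = rng.normal(size=(50, 10))
    b = rng.normal(size=(50, 10)) + np.array([5.0] + [0.0] * 9)
    data = np.vstack([a, b])
    p_value, cluster_labels = sigclust(data, mc_iters=100, verbose=True)
    print("SigClust p-value: %f" % p_value)
    print("Cluster sizes: %d and %d"
          % (np.sum(cluster_labels == 0), np.sum(cluster_labels == 1)))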