def test_perform_kmeans(self):
    """ assert that k-means labels recovered from a consensus matrix built
        from a known clustering match that known clustering.
    """
    n_samples = 11
    n_clusters = 3
    cluster_set = np.int_(np.ones(n_samples))
    for r in range(0, n_samples):
        cluster_set[r] = int(np.random.randint(n_clusters))

    n_repeats = 33
    n_test_perm = 5
    n_test_rows = n_samples
    I = np.zeros((n_test_rows, n_test_rows))
    M = np.zeros((n_test_rows, n_test_rows))
    for r in range(0, n_repeats):
        f_perm = np.random.permutation(n_test_rows)
        f_perm = f_perm[0:n_test_perm]
        cluster_p = cluster_set[f_perm]
        I = kn.update_indicator_matrix(f_perm, I)
        M = kn.update_linkage_matrix(cluster_p, f_perm, M)

    CC = M / np.maximum(I, 1e-15)
    label_set = kn.perform_kmeans(CC, n_clusters)
    self.assertTrue(sets_a_eq_b(cluster_set, label_set),
                    msg='kmeans sets differ from cluster')
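
# `sets_a_eq_b` is used in the test above but not defined in this excerpt.
# A minimal sketch, assuming it checks that two label vectors induce the same
# partition of the samples (cluster numbering is arbitrary after k-means, so
# raw element-wise equality would be the wrong test):
def sets_a_eq_b(a, b):
    """ True if label vectors a and b group the samples identically. """
    import numpy as np
    a = np.asarray(a)
    b = np.asarray(b)
    if a.shape != b.shape:
        return False
    # the two labelings are equivalent iff label_a <-> label_b is a bijection
    pairs = set(zip(a.tolist(), b.tolist()))
    return len(pairs) == len(set(a.tolist())) == len(set(b.tolist()))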
def run_nmf(run_parameters):
    """ wrapper: call sequence to perform non-negative matrix factorization and
        write results.

    Args:
        run_parameters: parameter set dictionary.
    """
    number_of_clusters = run_parameters['number_of_clusters']
    spreadsheet_name_full_path = run_parameters['spreadsheet_name_full_path']

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    spreadsheet_mat = spreadsheet_df.values  # DataFrame.as_matrix() was removed in pandas 1.0
    spreadsheet_mat = kn.get_quantile_norm_matrix(spreadsheet_mat)

    h_mat = kn.perform_nmf(spreadsheet_mat, run_parameters)

    linkage_matrix = np.zeros((spreadsheet_mat.shape[1], spreadsheet_mat.shape[1]))
    sample_perm = np.arange(0, spreadsheet_mat.shape[1])
    linkage_matrix = kn.update_linkage_matrix(h_mat, sample_perm, linkage_matrix)
    labels = kn.perform_kmeans(linkage_matrix, number_of_clusters)

    sample_names = spreadsheet_df.columns
    save_consensus_clustering(linkage_matrix, sample_names, labels, run_parameters)
    save_final_samples_clustering(sample_names, labels, run_parameters)
    save_spreadsheet_and_variance_heatmap(spreadsheet_df, labels, run_parameters)
def run_kmeans(run_parameters):
    """ wrapper: call sequence to perform kmeans clustering and save the results.

    Args:
        run_parameters: parameter set dictionary.
    """
    number_of_clusters = run_parameters['number_of_clusters']
    spreadsheet_name_full_path = run_parameters['spreadsheet_name_full_path']

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    spreadsheet_mat_T = spreadsheet_df.values.T  # samples x genes
    distance_matrix = pairwise_distances(spreadsheet_mat_T)
    labels = kn.perform_kmeans(spreadsheet_mat_T, number_of_clusters)

    sample_names = spreadsheet_df.columns
    save_clustering_scores(distance_matrix, sample_names, labels, run_parameters)
    save_final_samples_clustering(sample_names, labels, run_parameters)
    save_spreadsheet_and_variance_heatmap(spreadsheet_df, labels, run_parameters)

    return labels
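
# Example (illustrative only): a minimal run_parameters dict for run_kmeans.
# The path and values below are hypothetical placeholders; run_kmeans itself
# reads only the two keys shown (the save_* helpers may require more).
#
#   run_parameters = {
#       'number_of_clusters': 3,
#       'spreadsheet_name_full_path': 'data/spreadsheet.tsv',
#   }
#   labels = run_kmeans(run_parameters)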
def run_cc_kmeans_clusters_worker(spreadsheet_mat, run_parameters, sample):
    """ worker to execute kmeans on one bootstrap sample in a single process.

    Args:
        spreadsheet_mat: genes x samples matrix.
        run_parameters: dictionary of run-time parameters.
        sample: bootstrap sample number (loop index); also used as the random
            seed and as the tmp-file identifier.

    Returns:
        None
    """
    # imports are local so the worker is self-contained when executed in a
    # separate process or shipped to a remote node
    import knpackage.toolbox as kn
    import numpy as np

    np.random.seed(sample)

    rows_sampling_fraction = run_parameters["rows_sampling_fraction"]
    cols_sampling_fraction = run_parameters["cols_sampling_fraction"]
    number_of_clusters = run_parameters["number_of_clusters"]

    spreadsheet_mat, sample_permutation = kn.sample_a_matrix(
        spreadsheet_mat, rows_sampling_fraction, cols_sampling_fraction)

    spreadsheet_mat_T = spreadsheet_mat.T
    labels = kn.perform_kmeans(spreadsheet_mat_T, number_of_clusters)
    h_mat = labels_to_hmat(labels, number_of_clusters)
    kn.save_a_clustering_to_tmp(h_mat, sample_permutation, run_parameters, sample)
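
# `labels_to_hmat` is referenced by the worker above but not defined in this
# excerpt. A minimal sketch, assuming it one-hot encodes the k-means labels
# into a clusters x samples indicator matrix (the h_matrix shape that
# kn.update_linkage_matrix expects):
def labels_to_hmat(labels, number_of_clusters):
    """ one-hot encode cluster labels into a clusters x samples matrix. """
    import numpy as np
    h_mat = np.zeros((number_of_clusters, len(labels)))
    h_mat[np.asarray(labels, dtype=int), np.arange(len(labels))] = 1
    return h_mat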
def run_cc_nmf(run_parameters):
    """ wrapper: call sequence to perform non-negative matrix factorization with
        consensus clustering and write results.

    Args:
        run_parameters: parameter set dictionary.
    """
    tmp_dir = 'tmp_cc_nmf'
    run_parameters = update_tmp_directory(run_parameters, tmp_dir)

    processing_method = run_parameters['processing_method']
    number_of_bootstraps = run_parameters['number_of_bootstraps']
    number_of_clusters = run_parameters['number_of_clusters']
    spreadsheet_name_full_path = run_parameters['spreadsheet_name_full_path']

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    spreadsheet_mat = spreadsheet_df.values  # DataFrame.as_matrix() was removed in pandas 1.0
    spreadsheet_mat = kn.get_quantile_norm_matrix(spreadsheet_mat)
    number_of_samples = spreadsheet_mat.shape[1]

    if processing_method == 'serial':
        for sample in range(0, number_of_bootstraps):
            run_cc_nmf_clusters_worker(spreadsheet_mat, run_parameters, sample)

    elif processing_method == 'parallel':
        find_and_save_cc_nmf_clusters_parallel(spreadsheet_mat, run_parameters,
                                               number_of_bootstraps)

    elif processing_method == 'distribute':
        func_args = [spreadsheet_mat, run_parameters]
        dependency_list = [run_cc_nmf_clusters_worker,
                           save_a_clustering_to_tmp,
                           dstutil.determine_parallelism_locally]
        dstutil.execute_distribute_computing_job(
            run_parameters['cluster_ip_address'], number_of_bootstraps,
            func_args, find_and_save_cc_nmf_clusters_parallel, dependency_list)

    else:
        raise ValueError('processing_method contains bad value.')

    consensus_matrix = form_consensus_matrix(run_parameters, number_of_samples)
    labels = kn.perform_kmeans(consensus_matrix, number_of_clusters)

    sample_names = spreadsheet_df.columns
    save_consensus_clustering(consensus_matrix, sample_names, labels, run_parameters)
    save_final_samples_clustering(sample_names, labels, run_parameters)
    save_spreadsheet_and_variance_heatmap(spreadsheet_df, labels, run_parameters)

    kn.remove_dir(run_parameters["tmp_directory"])
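
# `form_consensus_matrix` is called above but not defined in this excerpt.
# A minimal sketch of the consensus computation, assuming the per-bootstrap
# (h_mat, sample_permutation) pairs have already been re-loaded from the tmp
# directory (the on-disk format is not shown here). The M / max(I, eps)
# formula mirrors the one exercised in test_perform_kmeans above.
def form_consensus_from_bootstraps(bootstrap_results, number_of_samples):
    """ bootstrap_results: iterable of (h_mat, sample_permutation) pairs. """
    import numpy as np
    indicator_matrix = np.zeros((number_of_samples, number_of_samples))
    linkage_matrix = np.zeros((number_of_samples, number_of_samples))
    for h_mat, sample_permutation in bootstrap_results:
        # count how often each sample pair was drawn together ...
        indicator_matrix = kn.update_indicator_matrix(sample_permutation, indicator_matrix)
        # ... and how often it was clustered together
        linkage_matrix = kn.update_linkage_matrix(h_mat, sample_permutation, linkage_matrix)
    return linkage_matrix / np.maximum(indicator_matrix, 1e-15)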
def run_net_nmf(run_parameters):
    """ wrapper: call sequence to perform network based stratification and write
        results.

    Args:
        run_parameters: parameter set dictionary.
    """
    np.random.seed(0)

    number_of_clusters = run_parameters['number_of_clusters']
    gg_network_name_full_path = run_parameters['gg_network_name_full_path']
    spreadsheet_name_full_path = run_parameters['spreadsheet_name_full_path']

    network_mat, unique_gene_names = kn.get_sparse_network_matrix(gg_network_name_full_path)
    network_mat = kn.normalize_sparse_mat_by_diagonal(network_mat)
    lap_diag, lap_pos = kn.form_network_laplacian_matrix(network_mat)

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    spreadsheet_df = kn.update_spreadsheet_df(spreadsheet_df, unique_gene_names)
    sample_names = spreadsheet_df.columns

    spreadsheet_mat = spreadsheet_df.values
    spreadsheet_mat, iterations = kn.smooth_matrix_with_rwr(spreadsheet_mat,
                                                            network_mat,
                                                            run_parameters)
    spreadsheet_mat = kn.get_quantile_norm_matrix(spreadsheet_mat)

    h_mat = kn.perform_net_nmf(spreadsheet_mat, lap_pos, lap_diag, run_parameters)

    linkage_matrix = np.zeros((spreadsheet_mat.shape[1], spreadsheet_mat.shape[1]))
    sample_perm = np.arange(0, spreadsheet_mat.shape[1])
    linkage_matrix = kn.update_linkage_matrix(h_mat, sample_perm, linkage_matrix)
    labels = kn.perform_kmeans(linkage_matrix, number_of_clusters)

    distance_matrix = pairwise_distances(
        h_mat.T, n_jobs=-1)  # [n_samples, n_features]; use all available cores

    save_consensus_clustering(linkage_matrix, sample_names, labels, run_parameters)
    calculate_and_save_silhouette_scores(distance_matrix, sample_names, labels, run_parameters)
    save_final_samples_clustering(sample_names, labels, run_parameters)
    save_spreadsheet_and_variance_heatmap(spreadsheet_df, labels, run_parameters)
def form_consensus_matrix_graphic(consensus_matrix, k=3):
    """ use K-means to reorder the consensus matrix for graphic display.

    Args:
        consensus_matrix: calculated consensus matrix in samples x samples order.
        k: number of clusters estimate (inner dimension k of factored h_matrix).

    Returns:
        cc_cm: consensus_matrix with rows and columns in K-means sort order.
    """
    cc_cm = consensus_matrix.copy()
    labels = kn.perform_kmeans(consensus_matrix, k)
    sorted_labels = np.argsort(labels)
    # fancy-index both axes so each cluster forms a contiguous block
    cc_cm = cc_cm[sorted_labels[:, None], sorted_labels]
    return cc_cm
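
# Example (illustrative only, made-up values): reorder a tiny consensus matrix
# so that the strongly co-clustered samples 0 and 2 become adjacent blocks in
# the plot.
#
#   cm = np.array([[1.0, 0.1, 0.9],
#                  [0.1, 1.0, 0.2],
#                  [0.9, 0.2, 1.0]])
#   cm_sorted = form_consensus_matrix_graphic(cm, k=2)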
def run_nmf(run_parameters):
    """ wrapper: call sequence to perform non-negative matrix factorization and
        write results.

    Args:
        run_parameters: parameter set dictionary.
    """
    np.random.seed(0)

    number_of_clusters = run_parameters['number_of_clusters']
    spreadsheet_name_full_path = run_parameters['spreadsheet_name_full_path']

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    spreadsheet_mat = spreadsheet_df.values
    spreadsheet_mat = kn.get_quantile_norm_matrix(spreadsheet_mat)

    h_mat = kn.perform_nmf(spreadsheet_mat, run_parameters)

    linkage_matrix = np.zeros((spreadsheet_mat.shape[1], spreadsheet_mat.shape[1]))
    sample_perm = np.arange(0, spreadsheet_mat.shape[1])
    linkage_matrix = kn.update_linkage_matrix(h_mat, sample_perm, linkage_matrix)
    labels = kn.perform_kmeans(linkage_matrix, number_of_clusters)

    sample_names = spreadsheet_df.columns
    distance_matrix = pairwise_distances(
        h_mat.T, n_jobs=-1)  # [n_samples, n_features]; use all available cores

    save_consensus_clustering(linkage_matrix, sample_names, labels, run_parameters)
    calculate_and_save_silhouette_scores(distance_matrix, sample_names, labels, run_parameters)
    save_final_samples_clustering(sample_names, labels, run_parameters)
    save_spreadsheet_and_variance_heatmap(spreadsheet_df, labels, run_parameters)
def run_cc_net_nmf(run_parameters):
    """ wrapper: call sequence to perform network based stratification with
        consensus clustering and write results.

    Args:
        run_parameters: parameter set dictionary.
    """
    tmp_dir = 'tmp_cc_net_nmf'
    run_parameters = update_tmp_directory(run_parameters, tmp_dir)

    processing_method = run_parameters['processing_method']
    number_of_clusters = run_parameters['number_of_clusters']
    number_of_bootstraps = run_parameters['number_of_bootstraps']
    gg_network_name_full_path = run_parameters['gg_network_name_full_path']
    spreadsheet_name_full_path = run_parameters['spreadsheet_name_full_path']

    network_mat, unique_gene_names = kn.get_sparse_network_matrix(gg_network_name_full_path)
    network_mat = kn.normalize_sparse_mat_by_diagonal(network_mat)
    lap_diag, lap_pos = kn.form_network_laplacian_matrix(network_mat)

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    spreadsheet_df = kn.update_spreadsheet_df(spreadsheet_df, unique_gene_names)
    spreadsheet_mat = spreadsheet_df.values
    number_of_samples = spreadsheet_mat.shape[1]
    sample_names = spreadsheet_df.columns

    if processing_method == 'serial':
        for sample in range(0, number_of_bootstraps):
            run_cc_net_nmf_clusters_worker(network_mat, spreadsheet_mat,
                                           lap_diag, lap_pos, run_parameters, sample)

    elif processing_method == 'parallel':
        find_and_save_cc_net_nmf_clusters_parallel(network_mat, spreadsheet_mat,
                                                   lap_diag, lap_pos,
                                                   run_parameters, number_of_bootstraps)

    elif processing_method == 'distribute':
        func_args = [network_mat, spreadsheet_mat, lap_diag, lap_pos, run_parameters]
        dependency_list = [run_cc_net_nmf_clusters_worker,
                           save_a_clustering_to_tmp,
                           dstutil.determine_parallelism_locally]
        cluster_ip_address = run_parameters['cluster_ip_address']
        dstutil.execute_distribute_computing_job(
            cluster_ip_address, number_of_bootstraps, func_args,
            find_and_save_cc_net_nmf_clusters_parallel, dependency_list)

    else:
        raise ValueError('processing_method contains bad value.')

    consensus_matrix = form_consensus_matrix(run_parameters, number_of_samples)
    distance_matrix = pairwise_distances(
        consensus_matrix, n_jobs=-1)  # [n_samples, n_samples]; use all available cores
    labels = kn.perform_kmeans(consensus_matrix, number_of_clusters)

    save_consensus_clustering(consensus_matrix, sample_names, labels, run_parameters)
    calculate_and_save_silhouette_scores(distance_matrix, sample_names, labels, run_parameters)
    save_final_samples_clustering(sample_names, labels, run_parameters)
    save_spreadsheet_and_variance_heatmap(spreadsheet_df, labels, run_parameters, network_mat)

    kn.remove_dir(run_parameters["tmp_directory"])
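
# `update_tmp_directory` is used by both consensus-clustering wrappers above
# but is not defined in this excerpt. A minimal sketch, assuming it creates a
# per-method scratch subdirectory under the run directory and records its path
# in run_parameters['tmp_directory'] (the key that kn.remove_dir cleans up at
# the end of each wrapper). A distributed run would likely need this directory
# on a shared volume instead.
def update_tmp_directory(run_parameters, tmp_dir):
    """ sketch: register a scratch directory for bootstrap tmp files. """
    import os
    tmp_path = os.path.join(run_parameters.get('run_directory', '.'), tmp_dir)
    os.makedirs(tmp_path, exist_ok=True)
    run_parameters['tmp_directory'] = tmp_path
    return run_parameters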