def test_update_linkage_matrix(self):
    """ create a consensus matrix by sampling a synthesized set of clusters and
        assert that the recovered clustering is equivalent to the original one
    """
    n_samples = 11
    n_clusters = 3
    cluster_set = np.int_(np.ones(n_samples))
    for r in range(0, n_samples):
        cluster_set[r] = int(np.random.randint(n_clusters))

    n_repeats = 100
    n_test_perm = 5
    n_test_rows = n_samples
    I = np.zeros((n_test_rows, n_test_rows))
    M = np.zeros((n_test_rows, n_test_rows))
    for r in range(0, n_repeats):
        f_perm = np.random.permutation(n_test_rows)
        f_perm = f_perm[0:n_test_perm]
        cluster_p = cluster_set[f_perm]
        I = kn.update_indicator_matrix(f_perm, I)
        M = kn.update_linkage_matrix(cluster_p, f_perm, M)

    CC = M / np.maximum(I, 1e-15)
    for s in range(0, n_clusters):
        s_dex = cluster_set == s
        c_c = CC[s_dex, :]
        c_c = c_c[:, s_dex]
        n_check = c_c - 1
        self.assertEqual(n_check.sum(), 0, msg='cluster grouping exception')
def test_perform_kmeans(self):
    """ assert that the k-means labels recovered from the consensus matrix of a
        known clustering match that known clustering
    """
    n_samples = 11
    n_clusters = 3
    cluster_set = np.int_(np.ones(n_samples))
    for r in range(0, n_samples):
        cluster_set[r] = int(np.random.randint(n_clusters))

    n_repeats = 33
    n_test_perm = 5
    n_test_rows = n_samples
    I = np.zeros((n_test_rows, n_test_rows))
    M = np.zeros((n_test_rows, n_test_rows))
    for r in range(0, n_repeats):
        f_perm = np.random.permutation(n_test_rows)
        f_perm = f_perm[0:n_test_perm]
        cluster_p = cluster_set[f_perm]
        I = kn.update_indicator_matrix(f_perm, I)
        M = kn.update_linkage_matrix(cluster_p, f_perm, M)

    CC = M / np.maximum(I, 1e-15)
    label_set = kn.perform_kmeans(CC, n_clusters)
    self.assertTrue(sets_a_eq_b(cluster_set, label_set), msg='kmeans sets differ from cluster')
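# The two tests above rely on a sets_a_eq_b helper that is not shown in this section.
# A minimal sketch of what it likely checks -- that two labelings induce the same
# grouping of samples regardless of how the integer labels are named -- is given
# below; this implementation is an assumption for illustration, not the pipeline's own code.
def sets_a_eq_b(a, b):
    """ True if labelings a and b partition the samples identically (up to relabeling). """
    a = np.asarray(a)
    b = np.asarray(b)
    if a.shape != b.shape:
        return False
    # equivalent labelings pair each label in a with exactly one label in b and vice versa
    pairs = set(zip(a.tolist(), b.tolist()))
    return len(pairs) == len(set(a.tolist())) == len(set(b.tolist()))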
def get_linkage_matrix(run_parameters, linkage_matrix, indicator_matrix):
    """ read bootstrap tmp_h_* and tmp_p_* files, compute and add to the linkage
        and indicator matrices.

    Args:
        run_parameters:   parameter set dictionary.
        linkage_matrix:   connectivity matrix from initialization or previous call.
        indicator_matrix: sampling-count matrix from initialization or previous call.

    Returns:
        linkage_matrix:   summed with the "tmp_h_*" files in run_parameters["tmp_directory"].
        indicator_matrix: updated with the sample permutations in the "tmp_p_*" files.
    """
    if run_parameters['processing_method'] == 'distribute':
        tmp_dir = os.path.join(
            run_parameters['cluster_shared_volumn'],
            os.path.basename(os.path.normpath(run_parameters['tmp_directory'])))
    else:
        tmp_dir = run_parameters["tmp_directory"]

    dir_list = os.listdir(tmp_dir)
    for tmp_f in dir_list:
        if tmp_f[0:6] == 'tmp_p_':
            pname = os.path.join(tmp_dir, tmp_f)
            hname = os.path.join(tmp_dir, 'tmp_h_' + tmp_f[6:len(tmp_f)])
            sample_permutation = np.load(pname)
            h_mat = np.load(hname)
            linkage_matrix = kn.update_linkage_matrix(h_mat, sample_permutation, linkage_matrix)
            indicator_matrix = kn.update_indicator_matrix(sample_permutation, indicator_matrix)

    return linkage_matrix, indicator_matrix
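# A minimal sketch of how the accumulated matrices are typically combined, mirroring
# the test code above: each consensus entry is the co-clustering count divided by the
# number of bootstraps in which the pair was sampled together. The helper name below
# is hypothetical and is not part of the pipeline.
def _form_consensus_matrix_sketch(linkage_matrix, indicator_matrix):
    """ consensus matrix from accumulated linkage and indicator matrices. """
    return linkage_matrix / np.maximum(indicator_matrix, 1e-15)  # guard against divide-by-zero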
def run_nmf(run_parameters):
    """ wrapper: call sequence to perform non-negative matrix factorization and write results.

    Args:
        run_parameters: parameter set dictionary.
    """
    number_of_clusters = run_parameters['number_of_clusters']
    spreadsheet_name_full_path = run_parameters['spreadsheet_name_full_path']

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    spreadsheet_mat = spreadsheet_df.values
    spreadsheet_mat = kn.get_quantile_norm_matrix(spreadsheet_mat)

    h_mat = kn.perform_nmf(spreadsheet_mat, run_parameters)

    linkage_matrix = np.zeros((spreadsheet_mat.shape[1], spreadsheet_mat.shape[1]))
    sample_perm = np.arange(0, spreadsheet_mat.shape[1])
    linkage_matrix = kn.update_linkage_matrix(h_mat, sample_perm, linkage_matrix)
    labels = kn.perform_kmeans(linkage_matrix, number_of_clusters)

    sample_names = spreadsheet_df.columns

    save_consensus_clustering(linkage_matrix, sample_names, labels, run_parameters)
    save_final_samples_clustering(sample_names, labels, run_parameters)
    save_spreadsheet_and_variance_heatmap(spreadsheet_df, labels, run_parameters)
def run_net_nmf(run_parameters):
    """ wrapper: call sequence to perform network based stratification and write results.

    Args:
        run_parameters: parameter set dictionary.
    """
    np.random.seed(0)

    number_of_clusters = run_parameters['number_of_clusters']
    gg_network_name_full_path = run_parameters['gg_network_name_full_path']
    spreadsheet_name_full_path = run_parameters['spreadsheet_name_full_path']

    network_mat, unique_gene_names = kn.get_sparse_network_matrix(gg_network_name_full_path)
    network_mat = kn.normalize_sparse_mat_by_diagonal(network_mat)
    lap_diag, lap_pos = kn.form_network_laplacian_matrix(network_mat)

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    spreadsheet_df = kn.update_spreadsheet_df(spreadsheet_df, unique_gene_names)

    sample_names = spreadsheet_df.columns
    spreadsheet_mat = spreadsheet_df.values

    spreadsheet_mat, iterations = kn.smooth_matrix_with_rwr(spreadsheet_mat, network_mat, run_parameters)
    spreadsheet_mat = kn.get_quantile_norm_matrix(spreadsheet_mat)

    h_mat = kn.perform_net_nmf(spreadsheet_mat, lap_pos, lap_diag, run_parameters)

    linkage_matrix = np.zeros((spreadsheet_mat.shape[1], spreadsheet_mat.shape[1]))
    sample_perm = np.arange(0, spreadsheet_mat.shape[1])
    linkage_matrix = kn.update_linkage_matrix(h_mat, sample_perm, linkage_matrix)
    labels = kn.perform_kmeans(linkage_matrix, number_of_clusters)

    distance_matrix = pairwise_distances(h_mat.T, n_jobs=-1)  # [n_samples, n_features]; use all available cores

    save_consensus_clustering(linkage_matrix, sample_names, labels, run_parameters)
    calculate_and_save_silhouette_scores(distance_matrix, sample_names, labels, run_parameters)
    save_final_samples_clustering(sample_names, labels, run_parameters)
    save_spreadsheet_and_variance_heatmap(spreadsheet_df, labels, run_parameters)
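# Hedged example: a minimal run_parameters dictionary for run_net_nmf, showing only the
# keys read directly in the code above. kn.smooth_matrix_with_rwr, kn.perform_net_nmf and
# the save_* helpers read additional keys not shown here, and the file paths are placeholders.
example_net_nmf_parameters = {
    'number_of_clusters': 3,
    'gg_network_name_full_path': 'path/to/gene_gene_network.edge',
    'spreadsheet_name_full_path': 'path/to/genes_by_samples_spreadsheet.tsv',
}
# run_net_nmf(example_net_nmf_parameters)  # would also require the keys used by the kn helpers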
def run_nmf(run_parameters):
    """ wrapper: call sequence to perform non-negative matrix factorization and write results.

    Args:
        run_parameters: parameter set dictionary.
    """
    np.random.seed(0)

    number_of_clusters = run_parameters['number_of_clusters']
    spreadsheet_name_full_path = run_parameters['spreadsheet_name_full_path']

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    spreadsheet_mat = spreadsheet_df.values
    spreadsheet_mat = kn.get_quantile_norm_matrix(spreadsheet_mat)

    h_mat = kn.perform_nmf(spreadsheet_mat, run_parameters)

    linkage_matrix = np.zeros((spreadsheet_mat.shape[1], spreadsheet_mat.shape[1]))
    sample_perm = np.arange(0, spreadsheet_mat.shape[1])
    linkage_matrix = kn.update_linkage_matrix(h_mat, sample_perm, linkage_matrix)
    labels = kn.perform_kmeans(linkage_matrix, number_of_clusters)

    sample_names = spreadsheet_df.columns
    distance_matrix = pairwise_distances(h_mat.T, n_jobs=-1)  # [n_samples, n_features]; use all available cores

    save_consensus_clustering(linkage_matrix, sample_names, labels, run_parameters)
    calculate_and_save_silhouette_scores(distance_matrix, sample_names, labels, run_parameters)
    save_final_samples_clustering(sample_names, labels, run_parameters)
    save_spreadsheet_and_variance_heatmap(spreadsheet_df, labels, run_parameters)
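# Hedged sketch: calculate_and_save_silhouette_scores is not shown in this section. Since
# distance_matrix is already a pairwise [n_samples, n_samples] matrix, silhouette values
# could be computed from it with scikit-learn's precomputed metric, roughly as below; the
# helper name is hypothetical and the write-out step is omitted.
from sklearn.metrics import silhouette_score, silhouette_samples

def _silhouette_sketch(distance_matrix, labels):
    """ overall and per-sample silhouette scores from a precomputed distance matrix. """
    overall = silhouette_score(distance_matrix, labels, metric='precomputed')
    per_sample = silhouette_samples(distance_matrix, labels, metric='precomputed')
    return overall, per_sample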