def run_bootstrap_net_correlation(run_parameters):
    """ perform gene prioritization using bootstrap sampling and network smoothing

    The spreadsheet is passed through kn.update_spreadsheet_df (with the gene
    names returned for the network), zscore_dataframe and
    kn.smooth_matrix_with_rwr; one worker job is then dispatched for every
    row of the transposed phenotype dataframe.

    Args:
        run_parameters: parameter set dictionary. Keys read here:
            "results_directory", "gg_network_name_full_path",
            "phenotype_name_full_path", "spreadsheet_name_full_path".
            The key "results_tmp_directory" is written by this function.
    """
    run_parameters["results_tmp_directory"] = kn.create_dir(
        run_parameters["results_directory"], 'tmp')

    gg_network_name_full_path = run_parameters['gg_network_name_full_path']
    network_mat, unique_gene_names = kn.get_sparse_network_matrix(gg_network_name_full_path)
    network_mat = normalize(network_mat, norm="l1", axis=0)

    phenotype_df = kn.get_spreadsheet_df(run_parameters["phenotype_name_full_path"])
    spreadsheet_df = kn.get_spreadsheet_df(run_parameters["spreadsheet_name_full_path"])
    # captured before kn.update_spreadsheet_df replaces spreadsheet_df
    spreadsheet_genes_as_input = spreadsheet_df.index.values
    phenotype_df = phenotype_df.T

    spreadsheet_df = kn.update_spreadsheet_df(spreadsheet_df, unique_gene_names)
    spreadsheet_df = zscore_dataframe(spreadsheet_df)

    # DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same ndarray.
    # The second item returned by the smoother is not used here.
    sample_smooth, _ = kn.smooth_matrix_with_rwr(
        spreadsheet_df.values, network_mat.T, run_parameters)
    spreadsheet_df = pd.DataFrame(sample_smooth,
                                  index=spreadsheet_df.index,
                                  columns=spreadsheet_df.columns)

    # uniform vector (entries sum to 1) passed through the same smoother
    baseline_array = np.ones(network_mat.shape[0]) / network_mat.shape[0]
    baseline_array = kn.smooth_matrix_with_rwr(baseline_array, network_mat, run_parameters)[0]

    number_of_jobs = len(phenotype_df.index)
    jobs_id = range(0, number_of_jobs)
    zipped_arguments = dstutil.zip_parameters(run_parameters, spreadsheet_df, phenotype_df,
                                              network_mat, spreadsheet_genes_as_input,
                                              baseline_array, jobs_id)
    dstutil.parallelize_processes_locally(run_bootstrap_net_correlation_worker,
                                          zipped_arguments, number_of_jobs)

    write_phenotype_data_all(run_parameters)
    kn.remove_dir(run_parameters["results_tmp_directory"])
def run_correlation(run_parameters):
    """ perform feature prioritization

    One worker job is dispatched per row of the transposed phenotype
    dataframe, in consecutive batches of at most run_parameters["max_cpu"].

    Args:
        run_parameters: parameter set dictionary.
    """
    batch_size = run_parameters["max_cpu"]
    run_parameters["results_tmp_directory"] = kn.create_dir(
        run_parameters["results_directory"], 'tmp')

    phenotype_df = kn.get_spreadsheet_df(run_parameters["phenotype_name_full_path"])
    spreadsheet_df = kn.get_spreadsheet_df(run_parameters["spreadsheet_name_full_path"])
    phenotype_df = phenotype_df.T

    total_jobs = len(phenotype_df.index)
    all_job_ids = range(0, total_jobs)

    for batch_start in range(0, total_jobs, batch_size):
        batch_ids = all_job_ids[batch_start:batch_start + batch_size]
        worker_arguments = dstutil.zip_parameters(run_parameters, spreadsheet_df,
                                                  phenotype_df, batch_ids)
        dstutil.parallelize_processes_locally(run_correlation_worker,
                                              worker_arguments, len(batch_ids))

    # NOTE(review): indentation was lost in the reviewed copy; the final write is
    # assumed to run once, after every batch has finished - confirm.
    write_phenotype_data_all(run_parameters)
    kn.remove_dir(run_parameters["results_tmp_directory"])
def run_correlation(run_parameters):
    """ perform gene prioritization

    Dispatches one worker job per row of the transposed phenotype dataframe
    in a single call, then writes the results and removes the tmp directory.

    Args:
        run_parameters: parameter set dictionary.
    """
    tmp_directory = kn.create_dir(run_parameters["results_directory"], 'tmp')
    run_parameters["results_tmp_directory"] = tmp_directory

    phenotype_path = run_parameters["phenotype_name_full_path"]
    spreadsheet_path = run_parameters["spreadsheet_name_full_path"]

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_path)
    phenotype_df = kn.get_spreadsheet_df(phenotype_path).T

    job_count = len(phenotype_df.index)
    worker_arguments = dstutil.zip_parameters(run_parameters, spreadsheet_df,
                                              phenotype_df, range(0, job_count))
    dstutil.parallelize_processes_locally(run_correlation_worker,
                                          worker_arguments, job_count)

    write_phenotype_data_all(run_parameters)
    # remove the directory captured above, not a fresh lookup in run_parameters
    kn.remove_dir(tmp_directory)
def run_cc_net_similarity(run_parameters):
    """ wrapper: call sequence to perform signature analysis with
        random walk smoothing and bootstrapped similarity and save results.

    Args:
        run_parameters: parameter set dictionary. Keys read here:
            "spreadsheet_name_full_path", "signature_name_full_path",
            'gg_network_name_full_path', 'number_of_bootstraps',
            'processing_method', and "tmp_directory" (read after
            update_tmp_directory has been applied).

    Raises:
        ValueError: if run_parameters['processing_method'] is neither
            'serial' nor 'parallel'.
    """
    tmp_dir = 'tmp_cc_similarity_'
    run_parameters = update_tmp_directory(run_parameters, tmp_dir)

    expression_name = run_parameters["spreadsheet_name_full_path"]
    signature_name = run_parameters["signature_name_full_path"]
    gg_network_name = run_parameters['gg_network_name_full_path']
    number_of_bootstraps = run_parameters['number_of_bootstraps']
    processing_method = run_parameters['processing_method']

    expression_df = kn.get_spreadsheet_df(expression_name)
    signature_df = kn.get_spreadsheet_df(signature_name)

    samples_names = expression_df.columns
    # keep only the part of each signature column label before the first '.'
    signatures_names = [i.split('.')[0] for i in signature_df.columns]
    signature_df.columns = signatures_names

    network_mat, unique_gene_names = kn.get_sparse_network_matrix(gg_network_name)
    # NOTE(review): kn.normalize_sparse_mat_by_diagonal(network_mat) was commented
    # out in the original; the network is still used un-normalized here.

    expression_df = kn.update_spreadsheet_df(expression_df, unique_gene_names)
    signature_df = kn.update_spreadsheet_df(signature_df, unique_gene_names)

    # DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same ndarray.
    # The second item returned by the smoother is not used here.
    expression_mat, _ = kn.smooth_matrix_with_rwr(expression_df.values, network_mat,
                                                  run_parameters)
    signature_mat, _ = kn.smooth_matrix_with_rwr(signature_df.values, network_mat,
                                                 run_parameters)

    # write the smoothed values back in place, keeping index and columns
    expression_df.iloc[:] = expression_mat
    signature_df.iloc[:] = signature_mat

    if processing_method == 'serial':
        for sample in range(0, number_of_bootstraps):
            run_cc_similarity_signature_worker(expression_df, signature_df,
                                               run_parameters, sample)

    elif processing_method == 'parallel':
        find_and_save_cc_similarity_parallel(expression_df, signature_df,
                                             run_parameters, number_of_bootstraps)

    else:
        raise ValueError('processing_method contains bad value.')

    similarity_df = assemble_similarity_df(expression_df, signature_df, run_parameters)
    # relabel with the sample names and the shortened signature names
    similarity_df = pd.DataFrame(similarity_df.values,
                                 index=samples_names,
                                 columns=signatures_names)

    save_final_samples_signature(similarity_df, run_parameters)
    save_best_match_signature(similarity_df, run_parameters)

    kn.remove_dir(run_parameters["tmp_directory"])
def run_bootstrap_correlation(run_parameters):
    """ perform gene prioritization using bootstrap sampling

    One worker job is dispatched per row of the transposed phenotype
    dataframe, in consecutive batches of at most run_parameters["max_cpu"].

    Args:
        run_parameters: parameter set dictionary. Keys read here: "max_cpu",
            "results_directory", "number_of_bootstraps",
            "phenotype_name_full_path", "spreadsheet_name_full_path".
            The key "results_tmp_directory" is written by this function.
    """
    max_cpu = run_parameters["max_cpu"]
    results_tmp_directory = kn.create_dir(run_parameters["results_directory"], 'tmp')
    run_parameters["results_tmp_directory"] = results_tmp_directory
    n_bootstraps = run_parameters["number_of_bootstraps"]

    phenotype_df = kn.get_spreadsheet_df(run_parameters["phenotype_name_full_path"])
    spreadsheet_df = kn.get_spreadsheet_df(run_parameters["spreadsheet_name_full_path"])
    phenotype_df = phenotype_df.T

    #-----------------------------------------------------------------------------------------
    #   Partition the phenotype dataframe (partition size = MaxCPU)
    #-----------------------------------------------------------------------------------------
    len_phenotype = len(phenotype_df.index)
    array_of_jobs = range(0, len_phenotype)

    # One batched loop replaces the former if/else: when len_phenotype <= max_cpu
    # the loop body runs exactly once with every job, which is what the removed
    # special-case branch did.
    for i in range(0, len_phenotype, max_cpu):
        jobs_id = array_of_jobs[i:i + max_cpu]
        number_of_jobs = len(jobs_id)

        zipped_arguments = dstutil.zip_parameters(run_parameters, spreadsheet_df,
                                                  phenotype_df, n_bootstraps, jobs_id)
        dstutil.parallelize_processes_locally(run_bootstrap_correlation_worker,
                                              zipped_arguments, number_of_jobs)

    # NOTE(review): assumes write_phenotype_data_all only needs to run once, after
    # every batch has finished - confirm against its implementation.
    write_phenotype_data_all(run_parameters)

    kn.remove_dir(results_tmp_directory)
def run_cc_nmf(run_parameters):
    """ wrapper: call sequence to perform non-negative matrix factorization with
        consensus clustering and write results.

    Args:
        run_parameters: parameter set dictionary. Keys read here:
            'processing_method', 'number_of_bootstraps', 'number_of_clusters',
            'spreadsheet_name_full_path', "tmp_directory", and
            'cluster_ip_address' (only when processing_method is 'distribute').

    Raises:
        ValueError: if run_parameters['processing_method'] is not one of
            'serial', 'parallel' or 'distribute'.
    """
    tmp_dir = 'tmp_cc_nmf'
    run_parameters = update_tmp_directory(run_parameters, tmp_dir)

    processing_method = run_parameters['processing_method']
    number_of_bootstraps = run_parameters['number_of_bootstraps']
    number_of_clusters = run_parameters['number_of_clusters']
    spreadsheet_name_full_path = run_parameters['spreadsheet_name_full_path']

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same ndarray
    spreadsheet_mat = spreadsheet_df.values
    spreadsheet_mat = kn.get_quantile_norm_matrix(spreadsheet_mat)
    # number of columns of the spreadsheet matrix
    number_of_samples = spreadsheet_mat.shape[1]

    if processing_method == 'serial':
        for sample in range(0, number_of_bootstraps):
            run_cc_nmf_clusters_worker(spreadsheet_mat, run_parameters, sample)

    elif processing_method == 'parallel':
        find_and_save_cc_nmf_clusters_parallel(spreadsheet_mat, run_parameters,
                                               number_of_bootstraps)

    elif processing_method == 'distribute':
        func_args = [spreadsheet_mat, run_parameters]
        dependency_list = [
            run_cc_nmf_clusters_worker,
            save_a_clustering_to_tmp,
            dstutil.determine_parallelism_locally,
        ]
        dstutil.execute_distribute_computing_job(run_parameters['cluster_ip_address'],
                                                 number_of_bootstraps,
                                                 func_args,
                                                 find_and_save_cc_nmf_clusters_parallel,
                                                 dependency_list)
    else:
        raise ValueError('processing_method contains bad value.')

    consensus_matrix = form_consensus_matrix(run_parameters, number_of_samples)
    labels = kn.perform_kmeans(consensus_matrix, number_of_clusters)

    sample_names = spreadsheet_df.columns
    save_consensus_clustering(consensus_matrix, sample_names, labels, run_parameters)
    save_final_samples_clustering(sample_names, labels, run_parameters)
    save_spreadsheet_and_variance_heatmap(spreadsheet_df, labels, run_parameters)

    kn.remove_dir(run_parameters["tmp_directory"])
def run_cc_similarity(run_parameters):
    """ Performs similarity analysis with bootstraps and saves the similarity matrix.

    Args:
        run_parameters: parameter set dictionary. Keys read here:
            "spreadsheet_name_full_path", "signature_name_full_path",
            'number_of_bootstraps', 'processing_method', and "tmp_directory"
            (read after update_tmp_directory has been applied).

    Raises:
        ValueError: if run_parameters['processing_method'] is neither
            'serial' nor 'parallel'.
    """
    tmp_dir = 'tmp_cc_similarity'
    run_parameters = update_tmp_directory(run_parameters, tmp_dir)

    expression_name = run_parameters["spreadsheet_name_full_path"]
    signature_name = run_parameters["signature_name_full_path"]
    number_of_bootstraps = run_parameters['number_of_bootstraps']
    processing_method = run_parameters['processing_method']

    expression_df = kn.get_spreadsheet_df(expression_name)
    signature_df = kn.get_spreadsheet_df(signature_name)

    samples_names = expression_df.columns
    # keep only the part of each signature column label before the first '.'
    signatures_names = [i.split('.')[0] for i in signature_df.columns]
    signature_df.columns = signatures_names

    # The former expression_df.as_matrix() / signature_df.as_matrix() calls were
    # removed: their results were never used, and as_matrix() no longer exists
    # in pandas >= 1.0.

    if processing_method == 'serial':
        for sample in range(0, number_of_bootstraps):
            run_cc_similarity_signature_worker(expression_df, signature_df,
                                               run_parameters, sample)

    elif processing_method == 'parallel':
        find_and_save_cc_similarity_parallel(expression_df, signature_df,
                                             run_parameters, number_of_bootstraps)

    else:
        raise ValueError('processing_method contains bad value.')

    similarity_df = assemble_similarity_df(expression_df, signature_df, run_parameters)
    # relabel with the sample names and the shortened signature names
    similarity_df = pd.DataFrame(similarity_df.values,
                                 index=samples_names,
                                 columns=signatures_names)

    save_final_samples_signature(similarity_df, run_parameters)
    save_best_match_signature(similarity_df, run_parameters)

    kn.remove_dir(run_parameters["tmp_directory"])
def run_bootstrap_correlation(run_parameters):
    """ perform feature prioritization using bootstrap sampling

    Dispatches one worker job per row of the transposed phenotype dataframe
    in a single call, then writes the results and removes the tmp directory.

    Args:
        run_parameters: parameter set dictionary.
    """
    run_parameters["results_tmp_directory"] = kn.create_dir(
        run_parameters["results_directory"], 'tmp')

    phenotype_df = kn.get_spreadsheet_df(run_parameters["phenotype_name_full_path"])
    spreadsheet_df = kn.get_spreadsheet_df(run_parameters["spreadsheet_name_full_path"])
    phenotype_df = phenotype_df.T

    bootstrap_count = run_parameters["number_of_bootstraps"]
    job_count = len(phenotype_df.index)

    worker_arguments = dstutil.zip_parameters(run_parameters, spreadsheet_df, phenotype_df,
                                              bootstrap_count, range(0, job_count))
    dstutil.parallelize_processes_locally(run_bootstrap_correlation_worker,
                                          worker_arguments, job_count)

    write_phenotype_data_all(run_parameters)
    kn.remove_dir(run_parameters["results_tmp_directory"])
def test_create_dir_AND_remove_dir(self):
    """ assert that the functions work together to create and remove a directory
        after a file has been written to (and read back from) it

    NOTE(review): the file is deleted before kn.remove_dir is called, so this
    test does not exercise removal of a non-empty directory - confirm intent.
    """
    dir_name = 'tmp_test'
    dir_path = self.run_parameters['test_directory']
    new_directory_name = kn.create_dir(dir_path, dir_name)
    self.assertTrue(os.path.exists(new_directory_name), msg='create_dir function exception')

    A = np.random.rand(10, 10)
    time_stamp = '123456789'
    a_name = os.path.join(new_directory_name, 'temp_test' + time_stamp)
    with open(a_name, 'wb') as fh:
        A.dump(fh)

    # ndarray.dump writes a pickle; np.load refuses pickled data unless
    # allow_pickle=True (the default has been False since numpy 1.16.3)
    A_back = np.load(a_name, allow_pickle=True)
    if os.path.isfile(a_name):
        os.remove(a_name)

    # element-wise comparison: a sum of differences could cancel out to zero
    self.assertTrue(np.array_equal(A, A_back), msg='write / read directory exception')

    kn.remove_dir(new_directory_name)
    self.assertFalse(os.path.exists(new_directory_name), msg='remove_dir function exception')
def run_net_correlation(run_parameters):
    """ perform gene prioritization with network smoothing

    The spreadsheet is passed through kn.update_spreadsheet_df (with the gene
    names returned for the network), zscore_dataframe and
    kn.smooth_matrix_with_rwr; one worker job per row of the transposed
    phenotype dataframe is then dispatched in consecutive batches of at most
    run_parameters["max_cpu"].

    Args:
        run_parameters: parameter set dictionary. Keys read here: "max_cpu",
            "results_directory", 'gg_network_name_full_path',
            "phenotype_name_full_path", "spreadsheet_name_full_path".
            The key "results_tmp_directory" is written by this function.
    """
    max_cpu = run_parameters["max_cpu"]
    run_parameters["results_tmp_directory"] = kn.create_dir(
        run_parameters["results_directory"], 'tmp')

    gg_network_name_full_path = run_parameters['gg_network_name_full_path']
    network_mat, unique_gene_names = kn.get_sparse_network_matrix(gg_network_name_full_path)
    network_mat = normalize(network_mat, norm="l1", axis=0)

    phenotype_df = kn.get_spreadsheet_df(run_parameters["phenotype_name_full_path"])
    spreadsheet_df = kn.get_spreadsheet_df(run_parameters["spreadsheet_name_full_path"])
    # captured before kn.update_spreadsheet_df replaces spreadsheet_df
    spreadsheet_genes_as_input = spreadsheet_df.index.values
    phenotype_df = phenotype_df.T

    spreadsheet_df = kn.update_spreadsheet_df(spreadsheet_df, unique_gene_names)
    spreadsheet_df = zscore_dataframe(spreadsheet_df)

    # the second item returned by the smoother is not used here
    sample_smooth, _ = kn.smooth_matrix_with_rwr(
        spreadsheet_df.values, network_mat.T, run_parameters)
    spreadsheet_df = pd.DataFrame(sample_smooth,
                                  index=spreadsheet_df.index,
                                  columns=spreadsheet_df.columns)

    # uniform vector (entries sum to 1) passed through the same smoother
    baseline_array = np.ones(network_mat.shape[0]) / network_mat.shape[0]
    baseline_array = kn.smooth_matrix_with_rwr(baseline_array, network_mat, run_parameters)[0]

    #-----------------------------------------------------------------------------------------
    #   Partition the phenotype dataframe (partition size = MaxCPU)
    #-----------------------------------------------------------------------------------------
    len_phenotype = len(phenotype_df.index)
    array_of_jobs = range(0, len_phenotype)

    # One batched loop replaces the former if/else: when len_phenotype <= max_cpu
    # the loop body runs exactly once with every job, which is what the removed
    # special-case branch did.
    for i in range(0, len_phenotype, max_cpu):
        jobs_id = array_of_jobs[i:i + max_cpu]
        number_of_jobs = len(jobs_id)

        zipped_arguments = dstutil.zip_parameters(run_parameters, spreadsheet_df, phenotype_df,
                                                  network_mat, spreadsheet_genes_as_input,
                                                  baseline_array, jobs_id)
        dstutil.parallelize_processes_locally(run_net_correlation_worker,
                                              zipped_arguments, number_of_jobs)

    # NOTE(review): assumes write_phenotype_data_all only needs to run once, after
    # every batch has finished - confirm against its implementation.
    write_phenotype_data_all(run_parameters)

    kn.remove_dir(run_parameters["results_tmp_directory"])
def run_cc_net_nmf(run_parameters):
    """ wrapper: call sequence to perform network based stratification
        with consensus clustering and write results.

    Args:
        run_parameters: parameter set dictionary.

    Raises:
        ValueError: if run_parameters['processing_method'] is not one of
            'serial', 'parallel' or 'distribute'.
    """
    run_parameters = update_tmp_directory(run_parameters, 'tmp_cc_net_nmf')

    method = run_parameters['processing_method']
    cluster_count = run_parameters['number_of_clusters']
    bootstrap_count = run_parameters['number_of_bootstraps']
    network_path = run_parameters['gg_network_name_full_path']
    spreadsheet_path = run_parameters['spreadsheet_name_full_path']

    network_mat, unique_gene_names = kn.get_sparse_network_matrix(network_path)
    network_mat = kn.normalize_sparse_mat_by_diagonal(network_mat)
    lap_diag, lap_pos = kn.form_network_laplacian_matrix(network_mat)

    spreadsheet_df = kn.update_spreadsheet_df(kn.get_spreadsheet_df(spreadsheet_path),
                                              unique_gene_names)
    spreadsheet_mat = spreadsheet_df.values
    # number of columns of the spreadsheet matrix
    sample_count = spreadsheet_mat.shape[1]
    sample_names = spreadsheet_df.columns

    # positional arguments shared by every processing mode
    shared_inputs = (network_mat, spreadsheet_mat, lap_diag, lap_pos, run_parameters)

    if method == 'serial':
        for sample in range(0, bootstrap_count):
            run_cc_net_nmf_clusters_worker(*shared_inputs, sample)

    elif method == 'parallel':
        find_and_save_cc_net_nmf_clusters_parallel(*shared_inputs, bootstrap_count)

    elif method == 'distribute':
        dependencies = [
            run_cc_net_nmf_clusters_worker,
            save_a_clustering_to_tmp,
            dstutil.determine_parallelism_locally,
        ]
        dstutil.execute_distribute_computing_job(run_parameters['cluster_ip_address'],
                                                 bootstrap_count,
                                                 list(shared_inputs),
                                                 find_and_save_cc_net_nmf_clusters_parallel,
                                                 dependencies)
    else:
        raise ValueError('processing_method contains bad value.')

    consensus_matrix = form_consensus_matrix(run_parameters, sample_count)
    # n_jobs=-1: use all available cores
    distance_matrix = pairwise_distances(consensus_matrix, n_jobs=-1)
    labels = kn.perform_kmeans(consensus_matrix, cluster_count)

    save_consensus_clustering(consensus_matrix, sample_names, labels, run_parameters)
    calculate_and_save_silhouette_scores(distance_matrix, sample_names, labels, run_parameters)
    save_final_samples_clustering(sample_names, labels, run_parameters)
    save_spreadsheet_and_variance_heatmap(spreadsheet_df, labels, run_parameters, network_mat)

    kn.remove_dir(run_parameters["tmp_directory"])
def tearDown(self):
    """ remove the directory named by run_parameters["tmp_directory"] and
        drop the run_parameters attribute from the test instance
    """
    tmp_directory = self.run_parameters["tmp_directory"]
    kn.remove_dir(tmp_directory)
    del self.run_parameters
def run_cc_link_hclust(run_parameters):
    #-----------------------------------------------------
    """ wrapper: call sequence to perform hclust with
        consensus clustering and write results.

    Args:
        run_parameters: parameter set dictionary.

    Raises:
        ValueError: if run_parameters['processing_method'] is not one of
            'serial', 'parallel' or 'distribute'.
    """
    run_parameters = update_tmp_directory(run_parameters, 'tmp_cc_link_hclust')

    method = run_parameters['processing_method']
    bootstrap_count = run_parameters['number_of_bootstraps']
    cluster_count = run_parameters['number_of_clusters']
    nearest_neighbors = run_parameters['nearest_neighbors']
    affinity_metric = run_parameters['affinity_metric']
    linkage_criterion = run_parameters['linkage_criterion']
    spreadsheet_path = run_parameters['spreadsheet_name_full_path']

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_path)
    spreadsheet_mat = spreadsheet_df.values
    # number of columns of the spreadsheet matrix
    sample_count = spreadsheet_mat.shape[1]

    if method == 'serial':
        for sample in range(0, bootstrap_count):
            run_cc_link_hclust_clusters_worker(spreadsheet_mat, run_parameters, sample)

    elif method == 'parallel':
        find_and_save_cc_link_hclust_clusters_parallel(spreadsheet_mat, run_parameters,
                                                       bootstrap_count)

    elif method == 'distribute':
        dependencies = [
            run_cc_link_hclust_clusters_worker,
            kn.save_a_clustering_to_tmp,
            dstutil.determine_parallelism_locally,
        ]
        dstutil.execute_distribute_computing_job(run_parameters['cluster_ip_address'],
                                                 bootstrap_count,
                                                 [spreadsheet_mat, run_parameters],
                                                 find_and_save_cc_link_hclust_clusters_parallel,
                                                 dependencies)
    else:
        raise ValueError('processing_method contains bad value.')

    consensus_matrix = kn.form_consensus_matrix(run_parameters, sample_count)
    labels, distance_matrix = perform_link_hclust(consensus_matrix,
                                                  cluster_count,
                                                  nearest_neighbors,
                                                  affinity_metric,
                                                  linkage_criterion)

    sample_names = spreadsheet_df.columns

    save_consensus_matrix(consensus_matrix, sample_names, labels, run_parameters)
    save_clustering_scores(distance_matrix, sample_names, labels, run_parameters)
    save_final_samples_clustering(sample_names, labels, run_parameters)
    save_spreadsheet_and_variance_heatmap(spreadsheet_df, labels, run_parameters)

    kn.remove_dir(run_parameters["tmp_directory"])