def run_correlation(run_parameters):
    """ perform gene prioritization

    Args:
        run_parameters: parameter set dictionary.
    """
    run_parameters["results_tmp_directory"] = kn.create_dir(run_parameters["results_directory"], 'tmp')
    results_tmp_directory = run_parameters["results_tmp_directory"]

    phenotype_name_full_path = run_parameters["phenotype_name_full_path"]
    spreadsheet_name_full_path = run_parameters["spreadsheet_name_full_path"]

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    phenotype_df = kn.get_spreadsheet_df(phenotype_name_full_path)
    phenotype_df = phenotype_df.T

    number_of_jobs = len(phenotype_df.index)
    jobs_id = range(0, number_of_jobs)
    zipped_arguments = dstutil.zip_parameters(run_parameters, spreadsheet_df, phenotype_df, jobs_id)
    dstutil.parallelize_processes_locally(run_correlation_worker, zipped_arguments, number_of_jobs)

    write_phenotype_data_all(run_parameters)
    kn.remove_dir(results_tmp_directory)

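# A minimal sketch (hypothetical paths) of the run_parameters dictionary that
# run_correlation reads directly; the worker function may require further keys
# (e.g. the correlation measure), which are omitted here.
def _example_correlation_parameters():
    """Illustrative only: the keys run_correlation itself accesses."""
    return {
        'results_directory': './run_dir/results',
        'phenotype_name_full_path': './run_dir/phenotype.tsv',
        'spreadsheet_name_full_path': './run_dir/genes_x_samples.tsv',
    }
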
def run_bootstrap_net_correlation(run_parameters):
    """ perform gene prioritization using bootstrap sampling and network smoothing

    Args:
        run_parameters: parameter set dictionary.
    """
    run_parameters["results_tmp_directory"] = kn.create_dir(run_parameters["results_directory"], 'tmp')

    gg_network_name_full_path = run_parameters['gg_network_name_full_path']
    network_mat, unique_gene_names = kn.get_sparse_network_matrix(gg_network_name_full_path)
    network_mat = normalize(network_mat, norm="l1", axis=0)

    phenotype_df = kn.get_spreadsheet_df(run_parameters["phenotype_name_full_path"])
    spreadsheet_df = kn.get_spreadsheet_df(run_parameters["spreadsheet_name_full_path"])
    spreadsheet_genes_as_input = spreadsheet_df.index.values
    phenotype_df = phenotype_df.T

    spreadsheet_df = kn.update_spreadsheet_df(spreadsheet_df, unique_gene_names)
    spreadsheet_df = zscore_dataframe(spreadsheet_df)
    sample_smooth, iterations = kn.smooth_matrix_with_rwr(spreadsheet_df.values, network_mat.T,
                                                          run_parameters)
    spreadsheet_df = pd.DataFrame(sample_smooth, index=spreadsheet_df.index,
                                  columns=spreadsheet_df.columns)

    baseline_array = np.ones(network_mat.shape[0]) / network_mat.shape[0]
    baseline_array = kn.smooth_matrix_with_rwr(baseline_array, network_mat, run_parameters)[0]

    number_of_jobs = len(phenotype_df.index)
    jobs_id = range(0, number_of_jobs)
    zipped_arguments = dstutil.zip_parameters(run_parameters, spreadsheet_df, phenotype_df,
                                              network_mat, spreadsheet_genes_as_input,
                                              baseline_array, jobs_id)
    dstutil.parallelize_processes_locally(run_bootstrap_net_correlation_worker,
                                          zipped_arguments, number_of_jobs)

    write_phenotype_data_all(run_parameters)
    kn.remove_dir(run_parameters["results_tmp_directory"])

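# kn.smooth_matrix_with_rwr is called above but defined in knpackage; a minimal
# sketch of the random walk with restart iteration it presumably performs,
# F <- (1 - r) * W @ F + r * F0, with hypothetical restart probability and
# tolerance defaults (the real values come from run_parameters):
def _example_rwr(network_mat, f0, restart_prob=0.5, tol=1.0e-4, max_iter=100):
    """Illustrative only: network_mat is column-normalized, f0 the restart state."""
    f_current = f0.copy()
    for step in range(max_iter):
        f_next = (1 - restart_prob) * network_mat.dot(f_current) + restart_prob * f0
        if np.abs(f_next - f_current).sum() < tol:
            return f_next, step
        f_current = f_next
    return f_current, max_iter
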
def run_similarity(run_parameters):
    """ Performs similarity analysis and saves the similarity matrix.

    Args:
        run_parameters: parameter set dictionary.
    """
    expression_name = run_parameters["spreadsheet_name_full_path"]
    signature_name = run_parameters["signature_name_full_path"]
    similarity_measure = run_parameters["similarity_measure"]

    expression_df = kn.get_spreadsheet_df(expression_name)
    signature_df = kn.get_spreadsheet_df(signature_name)

    samples_names = expression_df.columns
    signatures_names = signature_df.columns
    # keep only the part of each signature name before the first '.'
    signatures_names = [i.split('.')[0] for i in signatures_names]
    signature_df.columns = signatures_names

    similarity_mat = generate_similarity_mat(expression_df, signature_df, similarity_measure)
    # similarity_mat = map_similarity_range(similarity_mat, 0)
    similarity_df = pd.DataFrame(similarity_mat, index=samples_names, columns=signatures_names)

    save_final_samples_signature(similarity_df, run_parameters)
    save_best_match_signature(similarity_df, run_parameters)

def run_select_subtype_df(run_parameters):
    """ Subset samples based on some row value, e.g., patients with longer survival.
        Output can be a smaller spreadsheet with fewer columns.
        From a genes x samples spreadsheet and a samples x phenotypes spreadsheet,
        return both spreadsheets with only the samples corresponding to a category
        in a phenotype.

    Args:
        run_parameters with keys:
            "results_directory", "spreadsheet_file_name", "phenotype_file_name",
            "phenotype_id", "select_category"
    """
    results_directory = run_parameters['results_directory']
    spreadsheet_file_name = run_parameters['spreadsheet_file_name']
    phenotype_file_name = run_parameters['phenotype_file_name']
    phenotype_id = run_parameters['phenotype_id']
    select_category = run_parameters['select_category']

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_file_name)
    phenotype_df = kn.get_spreadsheet_df(phenotype_file_name)

    spreadsheet_df, phenotype_df = select_subtype_df(spreadsheet_df, phenotype_df,
                                                     phenotype_id, select_category)

    transform_name = "phenotype_category"
    write_transform_df(spreadsheet_df, spreadsheet_file_name, transform_name, results_directory)
    write_transform_df(phenotype_df, phenotype_file_name, transform_name, results_directory)

def run_correlation(run_parameters):
    """ perform feature prioritization

    Args:
        run_parameters: parameter set dictionary.
    """
    max_cpu = run_parameters["max_cpu"]
    run_parameters["results_tmp_directory"] = kn.create_dir(run_parameters["results_directory"], 'tmp')

    phenotype_df = kn.get_spreadsheet_df(run_parameters["phenotype_name_full_path"])
    spreadsheet_df = kn.get_spreadsheet_df(run_parameters["spreadsheet_name_full_path"])
    phenotype_df = phenotype_df.T

    # process the phenotypes in chunks of at most max_cpu parallel jobs
    len_phenotype = len(phenotype_df.index)
    array_of_jobs = range(0, len_phenotype)
    for i in range(0, len_phenotype, max_cpu):
        jobs_id = array_of_jobs[i:i + max_cpu]
        number_of_jobs = len(jobs_id)
        zipped_arguments = dstutil.zip_parameters(run_parameters, spreadsheet_df,
                                                  phenotype_df, jobs_id)
        dstutil.parallelize_processes_locally(run_correlation_worker, zipped_arguments,
                                              number_of_jobs)

    write_phenotype_data_all(run_parameters)
    kn.remove_dir(run_parameters["results_tmp_directory"])

def run_lasso_predict(run_parameters):
    """ Train a cross-validated lasso model on training spreadsheet/response data
        and predict responses for a test spreadsheet.

    Args:
        run_parameters: dictionary of run parameters
    """
    gene_file = run_parameters['spreadsheet_name_full_path']
    sign_file = run_parameters['response_name_full_path']
    test_file = run_parameters['test_spreadsheet_name_full_path']

    gene_df = kn.get_spreadsheet_df(gene_file)
    sign_df = kn.get_spreadsheet_df(sign_file)
    test_df = kn.get_spreadsheet_df(test_file)

    row_names = test_df.columns
    gene_mat = gene_df.values
    sign_mat = sign_df.values[0]
    test_mat = test_df.values

    min_alpha = run_parameters['min_alpha']
    max_alpha = run_parameters['max_alpha']
    n_alpha = run_parameters['n_alpha']
    intercept = run_parameters['fit_intercept']
    normalization = run_parameters['normalize']
    max_iter = run_parameters['max_iter']
    tolerance = run_parameters['tolerance']

    alpha_grid = np.linspace(min_alpha, max_alpha, num=n_alpha)
    reg_model = linear_model.LassoCV(alphas=alpha_grid, fit_intercept=intercept,
                                     normalize=normalization, max_iter=max_iter,
                                     tol=tolerance, cv=5)
    reg_model.fit(gene_mat.T, sign_mat)

    filename = os.path.join(run_parameters['results_directory'], 'lasso_model.pkl')
    with open(filename, 'wb') as model_file:
        pickle.dump(reg_model, model_file)

    response_predict = reg_model.predict(test_mat.T)
    predict_df = pd.DataFrame(response_predict.T, index=row_names, columns=['predict'])
    write_predict_data(predict_df, run_parameters)

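# A minimal sketch (synthetic data, hypothetical sizes) of the fit/predict
# orientation used above: spreadsheets are genes x samples, so matrices are
# transposed to samples x genes before they reach scikit-learn.
def _example_lasso_orientation():
    """Illustrative only: shapes expected by the LassoCV calls above."""
    rng = np.random.RandomState(0)
    gene_mat = rng.rand(100, 30)        # 100 genes x 30 training samples
    sign_mat = rng.rand(30)             # one response value per training sample
    test_mat = rng.rand(100, 10)        # same 100 genes x 10 test samples
    model = linear_model.LassoCV(cv=5)
    model.fit(gene_mat.T, sign_mat)     # samples x genes
    return model.predict(test_mat.T)    # one prediction per test sample
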
def run_cc_net_similarity(run_parameters):
    """ wrapper: call sequence to perform signature analysis with random walk
        smoothing and bootstrapped similarity and save results.

    Args:
        run_parameters: parameter set dictionary.
    """
    tmp_dir = 'tmp_cc_similarity_'
    run_parameters = update_tmp_directory(run_parameters, tmp_dir)

    expression_name = run_parameters["spreadsheet_name_full_path"]
    signature_name = run_parameters["signature_name_full_path"]
    gg_network_name = run_parameters['gg_network_name_full_path']
    similarity_measure = run_parameters["similarity_measure"]
    number_of_bootstraps = run_parameters['number_of_bootstraps']
    processing_method = run_parameters['processing_method']

    expression_df = kn.get_spreadsheet_df(expression_name)
    signature_df = kn.get_spreadsheet_df(signature_name)

    samples_names = expression_df.columns
    signatures_names = signature_df.columns
    signatures_names = [i.split('.')[0] for i in signatures_names]
    signature_df.columns = signatures_names

    network_mat, unique_gene_names = kn.get_sparse_network_matrix(gg_network_name)
    # network_mat = kn.normalize_sparse_mat_by_diagonal(network_mat)

    expression_df = kn.update_spreadsheet_df(expression_df, unique_gene_names)
    signature_df = kn.update_spreadsheet_df(signature_df, unique_gene_names)

    expression_mat = expression_df.values
    signature_mat = signature_df.values

    expression_mat, iterations = kn.smooth_matrix_with_rwr(expression_mat, network_mat,
                                                           run_parameters)
    signature_mat, iterations = kn.smooth_matrix_with_rwr(signature_mat, network_mat,
                                                          run_parameters)

    expression_df.iloc[:] = expression_mat
    signature_df.iloc[:] = signature_mat

    if processing_method == 'serial':
        for sample in range(0, number_of_bootstraps):
            run_cc_similarity_signature_worker(expression_df, signature_df,
                                               run_parameters, sample)
    elif processing_method == 'parallel':
        find_and_save_cc_similarity_parallel(expression_df, signature_df,
                                             run_parameters, number_of_bootstraps)
    else:
        raise ValueError('processing_method contains bad value.')

    # consensus_df = form_consensus_df(run_parameters, expression_df, signature_df)
    similarity_df = assemble_similarity_df(expression_df, signature_df, run_parameters)
    similarity_df = pd.DataFrame(similarity_df.values, index=samples_names,
                                 columns=signatures_names)

    save_final_samples_signature(similarity_df, run_parameters)
    save_best_match_signature(similarity_df, run_parameters)

    kn.remove_dir(run_parameters["tmp_directory"])

def run_bootstrap_correlation(run_parameters):
    """ perform gene prioritization using bootstrap sampling

    Args:
        run_parameters: parameter set dictionary.
    """
    max_cpu = run_parameters["max_cpu"]
    run_parameters["results_tmp_directory"] = kn.create_dir(run_parameters["results_directory"], 'tmp')
    results_tmp_directory = run_parameters["results_tmp_directory"]
    n_bootstraps = run_parameters["number_of_bootstraps"]

    phenotype_df = kn.get_spreadsheet_df(run_parameters["phenotype_name_full_path"])
    spreadsheet_df = kn.get_spreadsheet_df(run_parameters["spreadsheet_name_full_path"])
    phenotype_df = phenotype_df.T

    # Partition the phenotype dataframe into chunks of at most max_cpu jobs.
    len_phenotype = len(phenotype_df.index)
    array_of_jobs = range(0, len_phenotype)

    if len_phenotype <= max_cpu:
        jobs_id = array_of_jobs
        number_of_jobs = len(jobs_id)
        zipped_arguments = dstutil.zip_parameters(run_parameters, spreadsheet_df, phenotype_df,
                                                  n_bootstraps, jobs_id)
        dstutil.parallelize_processes_locally(run_bootstrap_correlation_worker,
                                              zipped_arguments, number_of_jobs)
        write_phenotype_data_all(run_parameters)
    else:
        for i in range(0, len_phenotype, max_cpu):
            jobs_id = array_of_jobs[i:i + max_cpu]
            number_of_jobs = len(jobs_id)
            zipped_arguments = dstutil.zip_parameters(run_parameters, spreadsheet_df,
                                                      phenotype_df, n_bootstraps, jobs_id)
            dstutil.parallelize_processes_locally(run_bootstrap_correlation_worker,
                                                  zipped_arguments, number_of_jobs)
        write_phenotype_data_all(run_parameters)

    kn.remove_dir(results_tmp_directory)

def run_elastic_predict(run_parameters):
    ''' Use an elastic net model to predict response data from feature data

    Args:
        run_parameters: dictionary of run parameters
    '''
    gene_file = run_parameters['spreadsheet_name_full_path']
    sign_file = run_parameters['response_name_full_path']
    test_file = run_parameters['test_spreadsheet_name_full_path']

    gene_df = kn.get_spreadsheet_df(gene_file)
    sign_df = kn.get_spreadsheet_df(sign_file)
    test_df = kn.get_spreadsheet_df(test_file)

    row_names = test_df.columns
    gene_mat = gene_df.values
    sign_mat = sign_df.values[0]
    test_mat = test_df.values

    eps = run_parameters['eps']
    min_alpha = run_parameters['min_alpha']
    max_alpha = run_parameters['max_alpha']
    n_alpha = run_parameters['n_alpha']
    min_l1 = run_parameters['min_l1']
    max_l1 = run_parameters['max_l1']
    n_l1 = run_parameters['n_l1']
    intercept = run_parameters['fit_intercept']
    normalization = run_parameters['normalize']
    max_iter = run_parameters['max_iter']
    tolerance = run_parameters['tolerance']

    alpha_grid = np.linspace(min_alpha, max_alpha, num=n_alpha)
    l1_grid = np.linspace(min_l1, max_l1, num=n_l1)
    reg_model = linear_model.ElasticNetCV(l1_ratio=l1_grid, alphas=alpha_grid,
                                          fit_intercept=intercept, eps=eps,
                                          normalize=normalization, max_iter=max_iter,
                                          tol=tolerance, cv=5)
    reg_model.fit(gene_mat.T, sign_mat)

    filename = os.path.join(run_parameters['results_directory'], 'elastic_net_model.pkl')
    with open(filename, 'wb') as model_file:
        pickle.dump(reg_model, model_file)

    response_predict = reg_model.predict(test_mat.T)
    predict_df = pd.DataFrame(response_predict.T, index=row_names, columns=['predict'])
    write_predict_data(predict_df, run_parameters)

def run_cc_similarity(run_parameters):
    """ Performs similarity analysis with bootstraps and saves the similarity matrix.

    Args:
        run_parameters: parameter set dictionary.
    """
    tmp_dir = 'tmp_cc_similarity'
    run_parameters = update_tmp_directory(run_parameters, tmp_dir)

    expression_name = run_parameters["spreadsheet_name_full_path"]
    signature_name = run_parameters["signature_name_full_path"]
    similarity_measure = run_parameters["similarity_measure"]
    number_of_bootstraps = run_parameters['number_of_bootstraps']
    processing_method = run_parameters['processing_method']

    expression_df = kn.get_spreadsheet_df(expression_name)
    signature_df = kn.get_spreadsheet_df(signature_name)

    samples_names = expression_df.columns
    signatures_names = signature_df.columns
    signatures_names = [i.split('.')[0] for i in signatures_names]
    signature_df.columns = signatures_names

    if processing_method == 'serial':
        for sample in range(0, number_of_bootstraps):
            run_cc_similarity_signature_worker(expression_df, signature_df,
                                               run_parameters, sample)
    elif processing_method == 'parallel':
        find_and_save_cc_similarity_parallel(expression_df, signature_df,
                                             run_parameters, number_of_bootstraps)
    else:
        raise ValueError('processing_method contains bad value.')

    # consensus_df = form_consensus_df(run_parameters, expression_df, signature_df)
    similarity_df = assemble_similarity_df(expression_df, signature_df, run_parameters)
    similarity_df = pd.DataFrame(similarity_df.values, index=samples_names,
                                 columns=signatures_names)

    save_final_samples_signature(similarity_df, run_parameters)
    save_best_match_signature(similarity_df, run_parameters)

    kn.remove_dir(run_parameters["tmp_directory"])

def phenotype_expander(run_parameters):
    """ Run phenotype expander on the whole dataframe of phenotype data.
        Save the results to a tsv file.
    """
    phenotype_df = kn.get_spreadsheet_df(run_parameters['phenotype_name_full_path'])
    output_dict = run_pre_processing_phenotype_expander(phenotype_df,
                                                        run_parameters['threshold'])

    result_df = pd.DataFrame(index=phenotype_df.index)
    for key, df_list in output_dict.items():
        if key == ColumnType.CATEGORICAL:
            for item in df_list:
                col_df = phenotype_df.loc[:, item.columns[0]].dropna()
                uniq_array = np.unique(col_df.values)
                col_names = [item.columns[0] + '_' + str(i) for i in uniq_array]
                cur_df = pd.DataFrame(columns=col_names, index=col_df.index)
                cur_append_df = pd.DataFrame(columns=col_names, index=phenotype_df.index)

                # one-hot encode: one indicator column per unique category value
                for i, val in enumerate(uniq_array):
                    cur_df.loc[col_df == val, col_names[i]] = 1
                    cur_df.loc[col_df != val, col_names[i]] = 0

                cur_append_df.loc[cur_df.index, :] = cur_df
                result_df = pd.concat([result_df, cur_append_df], axis=1)

    file_name = kn.create_timestamped_filename("phenotype_expander_result", "tsv")
    file_path = os.path.join(run_parameters["results_directory"], file_name)
    result_df.index.name = "sample_id"
    result_df.to_csv(file_path, header=True, index=True, sep='\t', na_rep='NA')

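# A worked example (hypothetical data) of the categorical expansion above: a
# column 'stage' with values {'I', 'II'} becomes indicator columns 'stage_I'
# and 'stage_II'. pd.get_dummies produces essentially the same result as the
# explicit loop in phenotype_expander.
def _example_phenotype_expansion():
    """Illustrative only: expand one categorical column."""
    col = pd.Series(['I', 'II', 'I'], index=['s1', 's2', 's3'], name='stage')
    return pd.get_dummies(col, prefix='stage')  # 0/1 columns stage_I, stage_II
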
def run_kaplan_meier(button):
    """ callback for kaplan_meier_execute_button """
    if get_km_file_button.file_selector.value == LIST_BOX_UPDATE_MESSAGE:
        if get_km_file_button.description == 'Clear':
            get_km_file_button.view_box.value = ''
            get_km_file_button.view_box.description = ''
            get_km_file_button.description = 'View'
        refresh_files_list(get_km_file_button)
        return

    if button.description == 'Clear':
        button.description = button.original_description
        button.im_view_box.value = BLAK_IMAGE
        button.view_box.value = ''
        return
    else:
        button.description = 'Clear'

    phenotype_df = kn.get_spreadsheet_df(
        os.path.join(input_data_dir, get_km_file_button.file_selector.value))

    cluster_id_name = button.cluster_id_listbox.value
    event_name = button.event_id_listbox.value
    time_name = button.time_id_listbox.value

    disp_kaplan_meier(phenotype_df, cluster_id_name, event_name, time_name, button)

def run_merge_df(run_parameters):
    """ Merge two phenotype matrices that correspond to the same columns (union).

    Args:
        run_parameters with keys:
            "results_directory", "spreadsheet_1_file_name", "spreadsheet_2_file_name"
    """
    results_directory = run_parameters['results_directory']
    spreadsheet_1_file_name = run_parameters['spreadsheet_1_file_name']
    spreadsheet_2_file_name = run_parameters['spreadsheet_2_file_name']

    spreadsheet_1_df = kn.get_spreadsheet_df(spreadsheet_1_file_name)
    spreadsheet_2_df = kn.get_spreadsheet_df(spreadsheet_2_file_name)

    result_df = merge_df(spreadsheet_1_df, spreadsheet_2_df)

    transform_name = "merge"
    write_transform_df(result_df, spreadsheet_1_file_name, transform_name, results_directory)

def run_link_hclust(run_parameters):
    """ wrapper: call sequence to perform hierarchical clustering using linkage
        and save the results.

    Args:
        run_parameters: parameter set dictionary.
    """
    np.random.seed()
    nearest_neighbors = run_parameters['nearest_neighbors']
    number_of_clusters = run_parameters['number_of_clusters']
    affinity_metric = run_parameters['affinity_metric']
    linkage_criterion = run_parameters['linkage_criterion']
    spreadsheet_name_full_path = run_parameters['spreadsheet_name_full_path']

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    spreadsheet_mat = spreadsheet_df.values

    labels, distance_matrix = perform_link_hclust(spreadsheet_mat, number_of_clusters,
                                                  nearest_neighbors, affinity_metric,
                                                  linkage_criterion)

    sample_names = spreadsheet_df.columns
    save_clustering_scores(distance_matrix, sample_names, labels, run_parameters)
    save_final_samples_clustering(sample_names, labels, run_parameters)
    save_spreadsheet_and_variance_heatmap(spreadsheet_df, labels, run_parameters)

    return labels

def run_kmeans(run_parameters):
    """ wrapper: call sequence to perform kmeans clustering and save the results.

    Args:
        run_parameters: parameter set dictionary.
    """
    number_of_clusters = run_parameters['number_of_clusters']
    spreadsheet_name_full_path = run_parameters['spreadsheet_name_full_path']

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    spreadsheet_mat_T = spreadsheet_df.values.T

    distance_matrix = pairwise_distances(spreadsheet_mat_T)
    labels = kn.perform_kmeans(spreadsheet_mat_T, number_of_clusters)

    sample_names = spreadsheet_df.columns
    save_clustering_scores(distance_matrix, sample_names, labels, run_parameters)
    save_final_samples_clustering(sample_names, labels, run_parameters)
    save_spreadsheet_and_variance_heatmap(spreadsheet_df, labels, run_parameters)

    return labels

def run_nmf(run_parameters):
    """ wrapper: call sequence to perform non-negative matrix factorization and
        write results.

    Args:
        run_parameters: parameter set dictionary.
    """
    number_of_clusters = run_parameters['number_of_clusters']
    spreadsheet_name_full_path = run_parameters['spreadsheet_name_full_path']

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    spreadsheet_mat = spreadsheet_df.values
    spreadsheet_mat = kn.get_quantile_norm_matrix(spreadsheet_mat)

    h_mat = kn.perform_nmf(spreadsheet_mat, run_parameters)

    linkage_matrix = np.zeros((spreadsheet_mat.shape[1], spreadsheet_mat.shape[1]))
    sample_perm = np.arange(0, spreadsheet_mat.shape[1])
    linkage_matrix = kn.update_linkage_matrix(h_mat, sample_perm, linkage_matrix)
    labels = kn.perform_kmeans(linkage_matrix, number_of_clusters)

    sample_names = spreadsheet_df.columns
    save_consensus_clustering(linkage_matrix, sample_names, labels, run_parameters)
    save_final_samples_clustering(sample_names, labels, run_parameters)
    save_spreadsheet_and_variance_heatmap(spreadsheet_df, labels, run_parameters)

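# kn.update_linkage_matrix is defined in knpackage; a minimal sketch of the
# step it presumably performs here (an assumption, not the actual code):
# samples whose largest H-matrix entry falls in the same row co-cluster.
def _example_linkage_from_h(h_mat):
    """Illustrative only: h_mat is the clusters x samples NMF factor."""
    cluster_assignment = np.argmax(h_mat, axis=0)
    return (cluster_assignment[:, None] == cluster_assignment[None, :]).astype(float)
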
def reset_phenotype_cols_list(change):
    """ Reset the three parameters dropdown listboxes to a new file selection.

    Args:
        change: IPywidgets widget control change event
    """
    if get_km_file_button.file_selector.value == LIST_BOX_UPDATE_MESSAGE:
        if get_km_file_button.description == 'Clear':
            get_km_file_button.view_box.value = ''
            get_km_file_button.view_box.description = ''
            get_km_file_button.description = 'View'
        refresh_files_list(get_km_file_button)
        return

    options_df = kn.get_spreadsheet_df(
        os.path.join(input_data_dir, get_km_file_button.file_selector.value))
    sorted_options_list = sorted(list(options_df.columns.values))

    if len(sorted_options_list) > 0:
        def_val = sorted_options_list[0]
    else:
        def_val = ''

    cluster_id_listbox.options = sorted_options_list
    cluster_id_listbox.value = def_val
    event_id_listbox.options = sorted_options_list
    event_id_listbox.value = def_val
    time_id_listbox.options = sorted_options_list
    time_id_listbox.value = def_val

def combine_phenotype_data_and_clustering(run_parameters):
    """ Insert the sample clusters column into the phenotype dataframe.

    Returns:
        phenotype_df: phenotype dataframe with the first column as sample clusters.
    """
    phenotype_df = kn.get_spreadsheet_df(run_parameters['phenotype_name_full_path'])
    phenotype_df.insert(0, 'Cluster_ID', np.nan)  # pylint: disable=no-member

    cluster_labels_df = pd.read_csv(run_parameters['cluster_mapping_full_path'],
                                    index_col=0, header=None, sep='\t')
    cluster_labels_df.columns = ['Cluster_ID']

    common_samples = kn.find_common_node_names(phenotype_df.index, cluster_labels_df.index)
    phenotype_df.loc[common_samples, 'Cluster_ID'] = \
        cluster_labels_df.loc[common_samples, 'Cluster_ID']  # pylint: disable=no-member

    return phenotype_df

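# The cluster mapping file read above is assumed to be a headerless, two-column
# TSV of sample name and cluster label; a minimal sketch that writes one in the
# expected layout (hypothetical sample names):
def _example_write_cluster_mapping(path):
    """Illustrative only: create a cluster mapping file for the reader above."""
    labels = pd.Series([1, 3, 1], index=['sample_0', 'sample_1', 'sample_2'])
    labels.to_csv(path, sep='\t', header=False)
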
def run_GRN_lasso(run_parameters):
    """ Infer gene regulatory network coefficients: regress each gene's expression
        on the transcription factor rows with cross-validated lasso, then min-max
        scale each gene's coefficient vector and save a genes x TFs table.
    """
    spreadsheet = kn.get_spreadsheet_df(run_parameters['spreadsheet_name_full_path'])
    gene_list = spreadsheet.index

    # the first 20% of spreadsheet rows are taken to be the transcription factors
    tf_idx = range(int(spreadsheet.shape[0] * 0.2))
    tf_spreadsheet = spreadsheet.iloc[tf_idx, :]
    result_df = pd.DataFrame(index=gene_list, columns=tf_spreadsheet.index)

    param_dict = {
        'n_alphas': 1000,
        'fit_intercept': run_parameters['fit_intercept'],
        'normalize': run_parameters['normalize'],
        'max_iter': 2000,
        'cv': 5
    }

    for i in range(spreadsheet.shape[0]):
        # curr_response = spreadsheet.values[i, :].reshape(-1, 1)
        curr_response = spreadsheet.values[i, :].ravel()
        curr_model = algo_lasso(tf_spreadsheet.values.T, curr_response, param_dict)
        coef = curr_model.coef_.ravel()
        # min-max scale: (x - min(x)) / (max(x) - min(x))
        result_df.loc[gene_list[i], :] = (coef - min(coef)) / (max(coef) - min(coef))

    file_path = os.path.join(run_parameters['results_directory'],
                             'GRN_coefficient_result.tsv')
    result_df.to_csv(file_path, header=True, index=True, sep='\t')

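# algo_lasso is defined elsewhere in this module; a minimal sketch of what it
# presumably wraps, matching the param_dict keys built above (an assumption,
# not the actual implementation):
def _example_algo_lasso(x_train, y_train, param_dict):
    """Illustrative only: cross-validated lasso fit driven by param_dict."""
    model = linear_model.LassoCV(n_alphas=param_dict['n_alphas'],
                                 fit_intercept=param_dict['fit_intercept'],
                                 normalize=param_dict['normalize'],
                                 max_iter=param_dict['max_iter'],
                                 cv=param_dict['cv'])
    return model.fit(x_train, y_train)
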
def run_net_similarity(run_parameters):
    """ Run random walk first to smooth expression and signature,
        then perform similarity analysis and save the similarity matrix.

    Args:
        run_parameters: parameter set dictionary.
    """
    expression_name = run_parameters["spreadsheet_name_full_path"]
    signature_name = run_parameters["signature_name_full_path"]
    gg_network_name = run_parameters['gg_network_name_full_path']
    similarity_measure = run_parameters["similarity_measure"]

    expression_df = kn.get_spreadsheet_df(expression_name)
    signature_df = kn.get_spreadsheet_df(signature_name)

    samples_names = expression_df.columns
    signatures_names = signature_df.columns
    signatures_names = [i.split('.')[0] for i in signatures_names]
    signature_df.columns = signatures_names

    network_mat, unique_gene_names = kn.get_sparse_network_matrix(gg_network_name)
    # network_mat = kn.normalize_sparse_mat_by_diagonal(network_mat)

    expression_df = kn.update_spreadsheet_df(expression_df, unique_gene_names)
    signature_df = kn.update_spreadsheet_df(signature_df, unique_gene_names)

    expression_mat = expression_df.values
    signature_mat = signature_df.values

    expression_mat, iterations = kn.smooth_matrix_with_rwr(expression_mat, network_mat,
                                                           run_parameters)
    signature_mat, iterations = kn.smooth_matrix_with_rwr(signature_mat, network_mat,
                                                          run_parameters)

    expression_df.iloc[:] = expression_mat
    signature_df.iloc[:] = signature_mat

    similarity_mat = generate_similarity_mat(expression_df, signature_df, similarity_measure)
    # similarity_mat = map_similarity_range(similarity_mat, 0)
    similarity_df = pd.DataFrame(similarity_mat, index=samples_names, columns=signatures_names)

    save_final_samples_signature(similarity_df, run_parameters)
    save_best_match_signature(similarity_df, run_parameters)

def run_fisher(run_parameters):
    ''' wrapper: call sequence to perform fisher gene-set characterization

    Args:
        run_parameters: dictionary of run parameters
    '''
    # -----------------------------------
    # - Data read and extraction Section -
    # -----------------------------------
    spreadsheet_df = kn.get_spreadsheet_df(run_parameters['spreadsheet_name_full_path'])
    prop_gene_network_df = kn.get_network_df(run_parameters['pg_network_name_full_path'])

    spreadsheet_gene_names = kn.extract_spreadsheet_gene_names(spreadsheet_df)

    prop_gene_network_n1_names, \
    prop_gene_network_n2_names = kn.extract_network_node_names(prop_gene_network_df)

    # -----------------------------------------------------------------------
    # - limit the gene set to the intersection of network and user gene set -
    # -----------------------------------------------------------------------
    common_gene_names = kn.find_common_node_names(prop_gene_network_n2_names,
                                                  spreadsheet_gene_names)
    common_gene_names_dict = kn.create_node_names_dict(common_gene_names)
    prop_gene_network_n1_names_dict = kn.create_node_names_dict(prop_gene_network_n1_names)
    reverse_prop_dict = kn.create_reverse_node_names_dict(prop_gene_network_n1_names_dict)

    # ----------------------------------------------------------------------------
    # - restrict spreadsheet and network to common genes and drop everything else -
    # ----------------------------------------------------------------------------
    new_spreadsheet_df = kn.update_spreadsheet_df(spreadsheet_df, common_gene_names)
    prop_gene_network_df = kn.update_network_df(prop_gene_network_df, common_gene_names,
                                                "node_2")
    prop_gene_network_df['wt'] = 1

    # -----------------------------------------------------------------------------
    # - map every gene name to an integer index in sequential order starting at 0 -
    # -----------------------------------------------------------------------------
    prop_gene_network_df = kn.map_node_names_to_index(prop_gene_network_df,
                                                      prop_gene_network_n1_names_dict,
                                                      "node_1")
    prop_gene_network_df = kn.map_node_names_to_index(prop_gene_network_df,
                                                      common_gene_names_dict, "node_2")

    # --------------------------------------------
    # - store the network in a csr sparse format -
    # --------------------------------------------
    universe_count = len(common_gene_names)
    prop_gene_network_sparse = kn.convert_network_df_to_sparse(
        prop_gene_network_df, universe_count, len(prop_gene_network_n1_names))

    fisher_contingency_pval = get_fisher_exact_test(prop_gene_network_sparse,
                                                    reverse_prop_dict, new_spreadsheet_df)
    fisher_final_result = save_fisher_test_result(fisher_contingency_pval,
                                                  run_parameters['results_directory'],
                                                  spreadsheet_df.columns.values, 2)
    map_and_save_droplist(spreadsheet_df, common_gene_names, 'fisher_droplist', run_parameters)

    return fisher_final_result

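# get_fisher_exact_test is defined elsewhere in this module; a minimal sketch
# (hypothetical helper) of the 2x2 test it presumably runs for each pair of a
# user gene set and a property gene set over the common gene universe:
def _example_fisher_pair(user_genes, property_genes, universe_count):
    """Illustrative only: one-sided p-value for the overlap of two gene sets (sets)."""
    from scipy import stats
    overlap = len(user_genes & property_genes)
    contingency_table = [[overlap,
                          len(user_genes) - overlap],
                         [len(property_genes) - overlap,
                          universe_count - len(user_genes) - len(property_genes) + overlap]]
    return stats.fisher_exact(contingency_table, alternative='greater')[1]
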
def run_lasso_predict(run_parameters):
    """ Fit a lasso model with default parameters on training spreadsheet/response
        data and predict responses for a test spreadsheet.

    Args:
        run_parameters: dictionary of run parameters
    """
    gene_samples_train_df = kn.get_spreadsheet_df(run_parameters['spreadsheet_name_full_path'])
    response_train_df = kn.get_spreadsheet_df(run_parameters['response_name_full_path'])
    gene_samples_test_df = kn.get_spreadsheet_df(run_parameters['test_spreadsheet_name_full_path'])
    response_test_sample_names = list(gene_samples_test_df.columns)

    reg_model = linear_model.Lasso()
    response_predict = reg_model.fit(gene_samples_train_df.transpose().values,
                                     response_train_df.values[0]).predict(
                                         gene_samples_test_df.transpose().values)

    predict_df = pd.DataFrame(response_predict.T, index=response_test_sample_names,
                              columns=['predict'])
    write_predict_data(predict_df, run_parameters)

def run_net_similarity(run_parameters):
    """ Run random walk first to smooth expression and signature,
        then perform similarity analysis and save the similarity matrix.

    Args:
        run_parameters: parameter set dictionary.
    """
    expression_name = run_parameters["spreadsheet_name_full_path"]
    signature_name = run_parameters["signature_name_full_path"]
    gg_network_name = run_parameters['gg_network_name_full_path']
    similarity_measure = run_parameters["similarity_measure"]

    expression_df = kn.get_spreadsheet_df(expression_name)
    signature_df = kn.get_spreadsheet_df(signature_name)

    expression_col_names = expression_df.columns
    signature_col_names = signature_df.columns

    network_mat, unique_gene_names = kn.get_sparse_network_matrix(gg_network_name)

    expression_df = kn.update_spreadsheet_df(expression_df, unique_gene_names)
    signature_df = kn.update_spreadsheet_df(signature_df, unique_gene_names)

    expression_mat = expression_df.values
    signature_mat = signature_df.values

    expression_mat, iterations = kn.smooth_matrix_with_rwr(expression_mat, network_mat,
                                                           run_parameters)
    signature_mat, iterations = kn.smooth_matrix_with_rwr(signature_mat, network_mat,
                                                          run_parameters)

    expression_df.iloc[:] = expression_mat
    signature_df.iloc[:] = signature_mat

    similarity_mat = generate_similarity_mat(expression_df, signature_df, similarity_measure)
    similarity_df = pd.DataFrame(similarity_mat, index=expression_col_names,
                                 columns=signature_col_names)

    save_final_expression_signature(similarity_df, run_parameters)
    save_best_match_signature(similarity_df, run_parameters)

def view_spreadsheet_file_head(full_file_name):
    """ notebook convenience """
    if os.path.isfile(full_file_name):
        sp_df = kn.get_spreadsheet_df(full_file_name)
        _, f_name = os.path.split(full_file_name)
        print(f_name, ' size:', sp_df.shape)
        display(sp_df.head(10))
    else:
        print('file not found on local path')

def run_svr_predict(run_parameters):
    ''' Using SVR model to predict response data against feature data

    Args:
        run_parameters: dictionary of run parameters
    '''
    gene_file = run_parameters['spreadsheet_name_full_path']
    sign_file = run_parameters['response_name_full_path']
    test_file = run_parameters['test_spreadsheet_name_full_path']

    gene_df = kn.get_spreadsheet_df(gene_file)
    sign_df = kn.get_spreadsheet_df(sign_file)
    test_df = kn.get_spreadsheet_df(test_file)

    row_names = test_df.columns
    gene_mat = gene_df.values
    sign_mat = sign_df.values[0]
    test_mat = test_df.values

    svr_kernel = run_parameters['svr_kernel']
    p_grid = {'svr_degree': 3, 'svr_gamma': 'auto', 'svr_coef0': 0.0,
              'svr_tol': 0.001, 'svr_C': 1.0, 'svr_epsilon': 0.1,
              'svr_shrinking': True, 'svr_cache_size': 200,
              'svr_verbose': False, 'svr_max_iter': -1}

    # override the defaults with any values supplied in run_parameters
    for k in p_grid:
        if k in run_parameters:
            p_grid[k] = run_parameters[k]

    reg_model = SVR(kernel=svr_kernel, degree=p_grid['svr_degree'],
                    gamma=p_grid['svr_gamma'], coef0=p_grid['svr_coef0'],
                    tol=p_grid['svr_tol'], C=p_grid['svr_C'],
                    epsilon=p_grid['svr_epsilon'], shrinking=p_grid['svr_shrinking'],
                    cache_size=p_grid['svr_cache_size'], verbose=p_grid['svr_verbose'],
                    max_iter=p_grid['svr_max_iter'])
    reg_model.fit(gene_mat.T, sign_mat)

    filename = os.path.join(run_parameters['results_directory'], 'svr_model.pkl')
    with open(filename, 'wb') as model_file:
        pickle.dump(reg_model, model_file)

    response_predict = reg_model.predict(test_mat.T)
    predict_df = pd.DataFrame(response_predict.T, index=row_names, columns=['predict'])
    write_predict_data(predict_df, run_parameters)

def run_DRaWR(run_parameters):
    ''' wrapper: call sequence to perform random walk with restart

    Args:
        run_parameters: dictionary of run parameters
    '''
    network_sparse, unique_gene_names, \
    pg_network_n1_names = build_hybrid_sparse_matrix(run_parameters, True, True)

    unique_all_node_names = unique_gene_names + pg_network_n1_names
    spreadsheet_df = kn.get_spreadsheet_df(run_parameters['spreadsheet_name_full_path'])
    new_spreadsheet_df = kn.update_spreadsheet_df(spreadsheet_df, unique_all_node_names)

    unique_genes_length = len(unique_gene_names)
    property_length = len(set(pg_network_n1_names))

    # restart ('base') column: ones on gene nodes, zeros on property nodes
    base_col = np.append(np.ones(unique_genes_length, dtype=int),
                         np.zeros(property_length, dtype=int))
    new_spreadsheet_df = kn.append_column_to_spreadsheet(new_spreadsheet_df, base_col, 'base')

    hetero_network = normalize(network_sparse, norm='l1', axis=0)
    final_spreadsheet_matrix, step = kn.smooth_matrix_with_rwr(
        normalize(new_spreadsheet_df, norm='l1', axis=0), hetero_network, run_parameters)

    final_spreadsheet_df = pd.DataFrame(final_spreadsheet_matrix)
    final_spreadsheet_df.index = new_spreadsheet_df.index.values
    final_spreadsheet_df.columns = new_spreadsheet_df.columns.values

    prop_spreadsheet_df = rank_drawr_property(final_spreadsheet_df, pg_network_n1_names)

    spreadsheet_df_mask = final_spreadsheet_df.loc[
        final_spreadsheet_df.index.isin(spreadsheet_df.index)]
    gene_result_df = construct_drawr_result_df(spreadsheet_df_mask, 0,
                                               spreadsheet_df_mask.shape[0],
                                               True, run_parameters)
    prop_result_df = construct_drawr_result_df(final_spreadsheet_df, unique_genes_length,
                                               final_spreadsheet_df.shape[0],
                                               False, run_parameters)

    save_timestamped_df(prop_spreadsheet_df, run_parameters['results_directory'],
                        'DRaWR_ranked_by_property')
    save_timestamped_df(gene_result_df, run_parameters['results_directory'],
                        'DRaWR_sorted_by_gene_score')
    save_timestamped_df(prop_result_df, run_parameters['results_directory'],
                        'DRaWR_sorted_by_property_score')
    map_and_save_droplist(spreadsheet_df, unique_gene_names, 'DRaWR_droplist', run_parameters)

    return prop_spreadsheet_df

def run_bootstrap_correlation(run_parameters):
    """ perform feature prioritization using bootstrap sampling

    Args:
        run_parameters: parameter set dictionary.
    """
    run_parameters["results_tmp_directory"] = kn.create_dir(run_parameters["results_directory"], 'tmp')

    phenotype_df = kn.get_spreadsheet_df(run_parameters["phenotype_name_full_path"])
    spreadsheet_df = kn.get_spreadsheet_df(run_parameters["spreadsheet_name_full_path"])
    phenotype_df = phenotype_df.T
    n_bootstraps = run_parameters["number_of_bootstraps"]

    number_of_jobs = len(phenotype_df.index)
    jobs_id = range(0, number_of_jobs)
    zipped_arguments = dstutil.zip_parameters(run_parameters, spreadsheet_df, phenotype_df,
                                              n_bootstraps, jobs_id)
    dstutil.parallelize_processes_locally(run_bootstrap_correlation_worker, zipped_arguments,
                                          number_of_jobs)

    write_phenotype_data_all(run_parameters)
    kn.remove_dir(run_parameters["results_tmp_directory"])

def run_spreadsheet_numerical_transform(run_parameters):
    """ numerical transformation of dataframe

    Args:
        run_parameters with keys:
            "results_directory", "spreadsheet_name_full_path", "numeric_function",
            with corresponding options:
                (z_transform_axis, z_transform_ddof)
                (log_transform_log_base, log_transform_log_offset)
                (threshold_cut_off, threshold_substitution_value, threshold_scope)
    """
    results_directory = run_parameters['results_directory']
    spreadsheet_name_full_path = run_parameters['spreadsheet_name_full_path']
    numeric_function = run_parameters['numeric_function']

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_name_full_path)

    if numeric_function == 'abs':
        spreadsheet_df = abs_df(spreadsheet_df)
        transform_name = "absolute_value"

    elif numeric_function == 'z_transform':
        z_transform_axis = run_parameters['z_transform_axis']
        z_transform_ddof = run_parameters['z_transform_ddof']
        spreadsheet_df = z_transform_df(spreadsheet_df, axis=z_transform_axis,
                                        ddof=z_transform_ddof)
        transform_name = 'z_transform'

    elif numeric_function == 'log_transform':
        log_transform_log_base = run_parameters['log_transform_log_base']
        if log_transform_log_base == "e":
            log_transform_log_base = np.exp(1)
        log_transform_log_offset = run_parameters['log_transform_log_offset']
        spreadsheet_df = log_transform_df(spreadsheet_df, log_base=log_transform_log_base,
                                          log_offset=log_transform_log_offset)
        transform_name = 'log_transform'

    elif numeric_function == 'threshold':
        threshold_cut_off = run_parameters['threshold_cut_off']
        threshold_substitution_value = run_parameters['threshold_substitution_value']
        threshold_scope = run_parameters['threshold_scope']
        spreadsheet_df = threshold_df(spreadsheet_df, cut_off=threshold_cut_off,
                                      sub_val=threshold_substitution_value,
                                      scope=threshold_scope)
        transform_name = 'threshold'

    else:
        raise ValueError('numeric_function contains bad value.')

    write_transform_df(spreadsheet_df, spreadsheet_name_full_path, transform_name,
                       results_directory)

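# A minimal sketch (hypothetical paths) of run_parameters for one branch above:
# a log2(x + 1) transform of a spreadsheet. Keys mirror the ones read in
# run_spreadsheet_numerical_transform.
def _example_log_transform_parameters():
    """Illustrative only: parameters selecting the log_transform branch."""
    return {
        'results_directory': './results',
        'spreadsheet_name_full_path': './data/expression.tsv',
        'numeric_function': 'log_transform',
        'log_transform_log_base': 2,
        'log_transform_log_offset': 1,
    }
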
def run_common_samples_df(run_parameters):
    """ Make two spreadsheets consistent by samples: two new spreadsheets created
        with samples being the intersection of sample sets of given spreadsheets.

    Args:
        run_parameters with keys:
            "results_directory", "spreadsheet_1_file_name", "spreadsheet_2_file_name"
    """
    results_directory = run_parameters['results_directory']
    spreadsheet_1_file_name = run_parameters['spreadsheet_1_file_name']
    spreadsheet_2_file_name = run_parameters['spreadsheet_2_file_name']

    spreadsheet_1_df = kn.get_spreadsheet_df(spreadsheet_1_file_name)
    spreadsheet_2_df = kn.get_spreadsheet_df(spreadsheet_2_file_name)

    spreadsheet_1_df, spreadsheet_2_df = common_samples_df(spreadsheet_1_df, spreadsheet_2_df)

    transform_name = "common_samples"
    write_transform_df(spreadsheet_1_df, spreadsheet_1_file_name, transform_name,
                       results_directory)
    write_transform_df(spreadsheet_2_df, spreadsheet_2_file_name, transform_name,
                       results_directory)

def run_cc_nmf(run_parameters):
    """ wrapper: call sequence to perform non-negative matrix factorization with
        consensus clustering and write results.

    Args:
        run_parameters: parameter set dictionary.
    """
    tmp_dir = 'tmp_cc_nmf'
    run_parameters = update_tmp_directory(run_parameters, tmp_dir)

    processing_method = run_parameters['processing_method']
    number_of_bootstraps = run_parameters['number_of_bootstraps']
    number_of_clusters = run_parameters['number_of_clusters']
    spreadsheet_name_full_path = run_parameters['spreadsheet_name_full_path']

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    spreadsheet_mat = spreadsheet_df.values
    spreadsheet_mat = kn.get_quantile_norm_matrix(spreadsheet_mat)
    number_of_samples = spreadsheet_mat.shape[1]

    if processing_method == 'serial':
        for sample in range(0, number_of_bootstraps):
            run_cc_nmf_clusters_worker(spreadsheet_mat, run_parameters, sample)
    elif processing_method == 'parallel':
        find_and_save_cc_nmf_clusters_parallel(spreadsheet_mat, run_parameters,
                                               number_of_bootstraps)
    elif processing_method == 'distribute':
        func_args = [spreadsheet_mat, run_parameters]
        dependency_list = [run_cc_nmf_clusters_worker, save_a_clustering_to_tmp,
                           dstutil.determine_parallelism_locally]
        dstutil.execute_distribute_computing_job(run_parameters['cluster_ip_address'],
                                                 number_of_bootstraps, func_args,
                                                 find_and_save_cc_nmf_clusters_parallel,
                                                 dependency_list)
    else:
        raise ValueError('processing_method contains bad value.')

    consensus_matrix = form_consensus_matrix(run_parameters, number_of_samples)
    labels = kn.perform_kmeans(consensus_matrix, number_of_clusters)

    sample_names = spreadsheet_df.columns
    save_consensus_clustering(consensus_matrix, sample_names, labels, run_parameters)
    save_final_samples_clustering(sample_names, labels, run_parameters)
    save_spreadsheet_and_variance_heatmap(spreadsheet_df, labels, run_parameters)

    kn.remove_dir(run_parameters["tmp_directory"])

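# form_consensus_matrix is defined elsewhere in this module; a minimal sketch
# of the idea it presumably implements (ignoring that each bootstrap only sees
# a subset of samples): the fraction of bootstraps in which each sample pair
# lands in the same cluster.
def _example_consensus_from_labels(bootstrap_labels):
    """Illustrative only: bootstrap_labels is a list of 1-D numpy label arrays."""
    n_samples = len(bootstrap_labels[0])
    consensus = np.zeros((n_samples, n_samples))
    for labels in bootstrap_labels:
        consensus += (labels[:, None] == labels[None, :])
    return consensus / len(bootstrap_labels)
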