def run_bootstrap_net_correlation(run_parameters):
    """ perform gene prioritization using bootstrap sampling and network smoothing

    Args:
        run_parameters: parameter set dictionary.
    """
    run_parameters["results_tmp_directory"] = kn.create_dir(run_parameters["results_directory"], 'tmp')
    gg_network_name_full_path = run_parameters['gg_network_name_full_path']
    network_mat, unique_gene_names = kn.get_sparse_network_matrix(gg_network_name_full_path)

    network_mat = normalize(network_mat, norm="l1", axis=0)

    phenotype_df = kn.get_spreadsheet_df(run_parameters["phenotype_name_full_path"])
    spreadsheet_df = kn.get_spreadsheet_df(run_parameters["spreadsheet_name_full_path"])
    spreadsheet_genes_as_input = spreadsheet_df.index.values
    phenotype_df = phenotype_df.T

    spreadsheet_df = kn.update_spreadsheet_df(spreadsheet_df, unique_gene_names)
    spreadsheet_df = zscore_dataframe(spreadsheet_df)
    # .values replaces the removed DataFrame.as_matrix()
    sample_smooth, iterations = kn.smooth_matrix_with_rwr(spreadsheet_df.values, network_mat.T, run_parameters)
    spreadsheet_df = pd.DataFrame(sample_smooth, index=spreadsheet_df.index, columns=spreadsheet_df.columns)

    baseline_array = np.ones(network_mat.shape[0]) / network_mat.shape[0]
    baseline_array = kn.smooth_matrix_with_rwr(baseline_array, network_mat, run_parameters)[0]

    number_of_jobs = len(phenotype_df.index)
    jobs_id = range(0, number_of_jobs)
    zipped_arguments = dstutil.zip_parameters(run_parameters, spreadsheet_df, phenotype_df, network_mat,
                                              spreadsheet_genes_as_input, baseline_array, jobs_id)
    dstutil.parallelize_processes_locally(run_bootstrap_net_correlation_worker, zipped_arguments, number_of_jobs)

    write_phenotype_data_all(run_parameters)
    kn.remove_dir(run_parameters["results_tmp_directory"])

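# A minimal run_parameters sketch for run_bootstrap_net_correlation, assembled
# only from keys this module actually reads. Values are illustrative, not
# defaults, and any RWR-specific keys consumed inside kn.smooth_matrix_with_rwr
# belong to knpackage and are not listed here.
example_bootstrap_net_correlation_parameters = {
    'results_directory':          './results',
    'gg_network_name_full_path':  './data/gene_gene_network.edge',
    'phenotype_name_full_path':   './data/phenotype.tsv',
    'spreadsheet_name_full_path': './data/expression.tsv',
    'number_of_bootstraps':       5,      # read by the worker
    'cols_sampling_fraction':     0.8,    # read by the worker
    'top_beta_of_sort':           100,    # read by the worker
}
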
def run_cc_net_similarity(run_parameters):
    """ wrapper: call sequence to perform signature analysis with random walk
        smoothing and bootstrapped similarity, and save results.

    Args:
        run_parameters: parameter set dictionary.
    """
    tmp_dir = 'tmp_cc_similarity_'
    run_parameters = update_tmp_directory(run_parameters, tmp_dir)

    expression_name      = run_parameters["spreadsheet_name_full_path"]
    signature_name       = run_parameters["signature_name_full_path"]
    gg_network_name      = run_parameters['gg_network_name_full_path']
    similarity_measure   = run_parameters["similarity_measure"]
    number_of_bootstraps = run_parameters['number_of_bootstraps']
    processing_method    = run_parameters['processing_method']

    expression_df = kn.get_spreadsheet_df(expression_name)
    signature_df  = kn.get_spreadsheet_df(signature_name)

    samples_names    = expression_df.columns
    signatures_names = signature_df.columns
    # strip any trailing ".suffix" from the signature column names
    signatures_names = [i.split('.')[0] for i in signatures_names]
    signature_df.columns = signatures_names

    network_mat, unique_gene_names = kn.get_sparse_network_matrix(gg_network_name)
    # network_mat = kn.normalize_sparse_mat_by_diagonal(network_mat)

    expression_df = kn.update_spreadsheet_df(expression_df, unique_gene_names)
    signature_df  = kn.update_spreadsheet_df(signature_df, unique_gene_names)

    # .values replaces the removed DataFrame.as_matrix()
    expression_mat = expression_df.values
    signature_mat  = signature_df.values

    expression_mat, iterations = kn.smooth_matrix_with_rwr(expression_mat, network_mat, run_parameters)
    signature_mat,  iterations = kn.smooth_matrix_with_rwr(signature_mat, network_mat, run_parameters)

    expression_df.iloc[:] = expression_mat
    signature_df.iloc[:]  = signature_mat

    if processing_method == 'serial':
        for sample in range(0, number_of_bootstraps):
            run_cc_similarity_signature_worker(expression_df, signature_df, run_parameters, sample)
    elif processing_method == 'parallel':
        find_and_save_cc_similarity_parallel(expression_df, signature_df, run_parameters, number_of_bootstraps)
    else:
        raise ValueError('processing_method contains bad value.')

    # consensus_df = form_consensus_df(run_parameters, expression_df, signature_df)
    similarity_df = assemble_similarity_df(expression_df, signature_df, run_parameters)
    similarity_df = pd.DataFrame(similarity_df.values, index=samples_names, columns=signatures_names)

    save_final_samples_signature(similarity_df, run_parameters)
    save_best_match_signature(similarity_df, run_parameters)
    kn.remove_dir(run_parameters["tmp_directory"])

def run_cc_net_nmf_clusters_worker(network_mat, spreadsheet_mat, lap_dag, lap_val, run_parameters, sample):
    """Worker to execute net_nmf_clusters in a single process

    Args:
        network_mat:     genes x genes symmetric matrix.
        spreadsheet_mat: genes x samples matrix.
        lap_dag:         laplacian matrix component, L = lap_dag - lap_val.
        lap_val:         laplacian matrix component, L = lap_dag - lap_val.
        run_parameters:  dictionary of run-time parameters.
        sample:          bootstrap sample number (one per loop iteration).

    Returns:
        None
    """
    np.random.seed(sample)

    rows_sampling_fraction = run_parameters["rows_sampling_fraction"]
    cols_sampling_fraction = run_parameters["cols_sampling_fraction"]

    spreadsheet_mat, sample_permutation = kn.sample_a_matrix(
        spreadsheet_mat, rows_sampling_fraction, cols_sampling_fraction)
    spreadsheet_mat, iterations = kn.smooth_matrix_with_rwr(spreadsheet_mat, network_mat, run_parameters)
    spreadsheet_mat = kn.get_quantile_norm_matrix(spreadsheet_mat)

    h_mat = kn.perform_net_nmf(spreadsheet_mat, lap_val, lap_dag, run_parameters)

    save_a_clustering_to_tmp(h_mat, sample_permutation, run_parameters, sample)

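# Illustrative only: one way the bootstrap sampling step above could behave.
# kn.sample_a_matrix's actual contract lives in knpackage; this sketch assumes
# it keeps a random fraction of columns (returning their indices as the
# permutation) and zeroes out a random fraction of rows.
def _sample_a_matrix_sketch(mat, rows_fraction, cols_fraction):
    """Return (sampled matrix, kept-column indices)."""
    n_rows, n_cols = mat.shape
    kept_cols = np.random.permutation(n_cols)[:int(np.round(n_cols * cols_fraction))]
    sampled = mat[:, kept_cols].copy()
    # zero the rows that fall outside the kept fraction
    dropped_rows = np.random.permutation(n_rows)[int(np.round(n_rows * rows_fraction)):]
    sampled[dropped_rows, :] = 0
    return sampled, kept_cols
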
def run_net_similarity(run_parameters):
    """ Run random walk first to smooth expression and signature,
        then perform similarity analysis and save the similarity matrix.

    Args:
        run_parameters: parameter set dictionary.
    """
    expression_name    = run_parameters["spreadsheet_name_full_path"]
    signature_name     = run_parameters["signature_name_full_path"]
    gg_network_name    = run_parameters['gg_network_name_full_path']
    similarity_measure = run_parameters["similarity_measure"]

    expression_df = kn.get_spreadsheet_df(expression_name)
    signature_df  = kn.get_spreadsheet_df(signature_name)

    samples_names    = expression_df.columns
    signatures_names = signature_df.columns
    # strip any trailing ".suffix" from the signature column names
    signatures_names = [i.split('.')[0] for i in signatures_names]
    signature_df.columns = signatures_names

    network_mat, unique_gene_names = kn.get_sparse_network_matrix(gg_network_name)
    # network_mat = kn.normalize_sparse_mat_by_diagonal(network_mat)

    expression_df = kn.update_spreadsheet_df(expression_df, unique_gene_names)
    signature_df  = kn.update_spreadsheet_df(signature_df, unique_gene_names)

    # .values replaces the removed DataFrame.as_matrix()
    expression_mat = expression_df.values
    signature_mat  = signature_df.values

    expression_mat, iterations = kn.smooth_matrix_with_rwr(expression_mat, network_mat, run_parameters)
    signature_mat,  iterations = kn.smooth_matrix_with_rwr(signature_mat, network_mat, run_parameters)

    expression_df.iloc[:] = expression_mat
    signature_df.iloc[:]  = signature_mat

    similarity_mat = generate_similarity_mat(expression_df, signature_df, similarity_measure)
    # similarity_mat = map_similarity_range(similarity_mat, 0)
    similarity_df = pd.DataFrame(similarity_mat, index=samples_names, columns=signatures_names)

    save_final_samples_signature(similarity_df, run_parameters)
    save_best_match_signature(similarity_df, run_parameters)

def run_net_similarity(run_parameters):
    """ Run random walk first to smooth expression and signature,
        then perform similarity analysis and save the similarity matrix.

    Args:
        run_parameters: parameter set dictionary.
    """
    expression_name    = run_parameters["spreadsheet_name_full_path"]
    signature_name     = run_parameters["signature_name_full_path"]
    gg_network_name    = run_parameters['gg_network_name_full_path']
    similarity_measure = run_parameters["similarity_measure"]

    expression_df = kn.get_spreadsheet_df(expression_name)
    signature_df  = kn.get_spreadsheet_df(signature_name)

    expression_col_names = expression_df.columns
    signature_col_names  = signature_df.columns

    network_mat, unique_gene_names = kn.get_sparse_network_matrix(gg_network_name)

    expression_df = kn.update_spreadsheet_df(expression_df, unique_gene_names)
    signature_df  = kn.update_spreadsheet_df(signature_df, unique_gene_names)

    expression_mat = expression_df.values
    signature_mat  = signature_df.values

    expression_mat, iterations = kn.smooth_matrix_with_rwr(expression_mat, network_mat, run_parameters)
    signature_mat,  iterations = kn.smooth_matrix_with_rwr(signature_mat, network_mat, run_parameters)

    expression_df.iloc[:] = expression_mat
    signature_df.iloc[:]  = signature_mat

    similarity_mat = generate_similarity_mat(expression_df, signature_df, similarity_measure)
    similarity_df  = pd.DataFrame(similarity_mat, index=expression_col_names, columns=signature_col_names)

    save_final_expression_signature(similarity_df, run_parameters)
    save_best_match_signature(similarity_df, run_parameters)

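# A minimal sketch of what a "cosine" similarity_measure could compute in
# generate_similarity_mat: column-wise similarity between samples and
# signatures. The real dispatch (cosine, spearman, ...) is not shown in this
# module, so treat this as an assumption, not its implementation.
from sklearn.metrics.pairwise import cosine_similarity

def _cosine_similarity_sketch(expression_df, signature_df):
    # transpose so rows are samples/signatures, columns are genes
    return cosine_similarity(expression_df.T.values, signature_df.T.values)
    # result[i, j]: similarity of sample i to signature j
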
def run_DRaWR(run_parameters):
    ''' wrapper: call sequence to perform random walk with restart

    Args:
        run_parameters: dictionary of run parameters
    '''
    network_sparse, unique_gene_names, pg_network_n1_names = build_hybrid_sparse_matrix(
        run_parameters, True, True)

    unique_all_node_names = unique_gene_names + pg_network_n1_names
    spreadsheet_df = kn.get_spreadsheet_df(run_parameters['spreadsheet_name_full_path'])
    new_spreadsheet_df = kn.update_spreadsheet_df(spreadsheet_df, unique_all_node_names)

    unique_genes_length = len(unique_gene_names)
    property_length = len(set(pg_network_n1_names))

    # plain int replaces the removed np.int alias
    base_col = np.append(np.ones(unique_genes_length, dtype=int),
                         np.zeros(property_length, dtype=int))
    new_spreadsheet_df = kn.append_column_to_spreadsheet(new_spreadsheet_df, base_col, 'base')

    hetero_network = normalize(network_sparse, norm='l1', axis=0)
    final_spreadsheet_matrix, step = kn.smooth_matrix_with_rwr(
        normalize(new_spreadsheet_df, norm='l1', axis=0), hetero_network, run_parameters)

    final_spreadsheet_df = pd.DataFrame(final_spreadsheet_matrix)
    final_spreadsheet_df.index = new_spreadsheet_df.index.values
    final_spreadsheet_df.columns = new_spreadsheet_df.columns.values

    prop_spreadsheet_df = rank_drawr_property(final_spreadsheet_df, pg_network_n1_names)

    spreadsheet_df_mask = final_spreadsheet_df.loc[final_spreadsheet_df.index.isin(spreadsheet_df.index)]
    gene_result_df = construct_drawr_result_df(
        spreadsheet_df_mask, 0, spreadsheet_df_mask.shape[0], True, run_parameters)
    prop_result_df = construct_drawr_result_df(
        final_spreadsheet_df, unique_genes_length, final_spreadsheet_df.shape[0], False, run_parameters)

    save_timestamped_df(prop_spreadsheet_df, run_parameters['results_directory'], 'DRaWR_ranked_by_property')
    save_timestamped_df(gene_result_df, run_parameters['results_directory'], 'DRaWR_sorted_by_gene_score')
    save_timestamped_df(prop_result_df, run_parameters['results_directory'], 'DRaWR_sorted_by_property_score')
    map_and_save_droplist(spreadsheet_df, unique_gene_names, 'DRaWR_droplist', run_parameters)

    return prop_spreadsheet_df

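# Worked example of the "base" restart column built in run_DRaWR, assuming a
# hybrid network of 3 gene nodes and 2 property nodes: genes restart with
# weight 1, property nodes with 0, and the L1 normalization applied before
# smoothing turns the column into a probability vector.
#
#   np.append(np.ones(3, dtype=int), np.zeros(2, dtype=int))
#       -> array([1, 1, 1, 0, 0])
#   after normalize(..., norm='l1', axis=0)
#       -> [1/3, 1/3, 1/3, 0, 0]
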
def run_bootstrap_net_correlation_worker(run_parameters, spreadsheet_df, phenotype_df, network_mat,
                                         spreadsheet_genes_as_input, baseline_array, job_id):
    """ worker for bootstrap network parallelization.

    Args:
        run_parameters:             dict of parameters
        spreadsheet_df:             spreadsheet data frame
        phenotype_df:               phenotype data frame
        network_mat:                adjacency matrix
        spreadsheet_genes_as_input: list of genes
        baseline_array:             network-smoothed baseline array
        job_id:                     parallel iteration number
    """
    np.random.seed(job_id)

    n_bootstraps           = run_parameters["number_of_bootstraps"]
    cols_sampling_fraction = run_parameters["cols_sampling_fraction"]
    top_beta_of_sort       = run_parameters["top_beta_of_sort"]

    restart_accumulator = np.zeros(network_mat.shape[0])
    gm_accumulator      = np.ones(network_mat.shape[0])
    borda_count         = np.zeros(network_mat.shape[0])

    phenotype_df = phenotype_df.iloc[[job_id], :]
    spreadsheet_df, phenotype_df, msg = datacln.check_input_value_for_gene_prioritazion(
        spreadsheet_df, phenotype_df)

    # .values replaces the removed DataFrame.as_matrix()
    sample_smooth = spreadsheet_df.values
    pearson_array = get_correlation(sample_smooth, phenotype_df.values[0], run_parameters)

    for bootstrap_number in range(0, n_bootstraps):
        sample_random, sample_permutation = sample_a_matrix_pearson(
            sample_smooth, 1.0, cols_sampling_fraction)

        phenotype_response = phenotype_df.values[0, None]
        phenotype_response = phenotype_response[0, sample_permutation]
        pc_array = get_correlation(sample_random, phenotype_response, run_parameters)

        mask = np.in1d(spreadsheet_df.index, spreadsheet_genes_as_input)
        pc_array[~mask] = 0.0
        pc_array = np.abs(trim_to_top_beta(pc_array, top_beta_of_sort))
        restart_accumulator[pc_array != 0] += 1.0

        pc_array = pc_array / max(sum(pc_array), EPSILON_0)
        pc_array = kn.smooth_matrix_with_rwr(pc_array, network_mat, run_parameters)[0]
        pc_array = pc_array - baseline_array

        borda_count = sum_array_ranking_to_borda_count(borda_count, pc_array)
        gm_accumulator = (np.abs(pc_array) + EPSILON_0) * gm_accumulator

    restart_accumulator = restart_accumulator / n_bootstraps
    borda_count = borda_count / n_bootstraps
    viz_score = (borda_count - min(borda_count)) / (max(borda_count) - min(borda_count))

    phenotype_name = phenotype_df.index.values[0]
    gene_name_list = spreadsheet_df.index
    gene_orig_list = spreadsheet_genes_as_input
    quantitative_score = borda_count

    generate_net_correlation_output(pearson_array, quantitative_score, viz_score, restart_accumulator,
                                    phenotype_name, gene_name_list, gene_orig_list, run_parameters)

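# Illustrative only: the Borda-count aggregation above sums each gene's rank
# position across bootstraps, so genes that score consistently high win even
# if no single bootstrap puts them first. A plausible rank-accumulation step,
# assuming higher scores earn higher ranks (the exact tie handling inside
# sum_array_ranking_to_borda_count is not shown here):
def _borda_accumulate_sketch(borda_count, score_array):
    ranks = np.argsort(np.argsort(score_array))  # rank 0 = lowest score
    return borda_count + ranks
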
def run_net_correlation_worker(run_parameters, spreadsheet_df, phenotype_df, network_mat,
                               spreadsheet_genes_as_input, baseline_array, job_id):
    """ core function for parallel run_net_correlation

    Args:
        run_parameters:             dict of parameters
        spreadsheet_df:             spreadsheet data frame
        phenotype_df:               phenotype data frame
        network_mat:                adjacency matrix
        spreadsheet_genes_as_input: list of genes
        baseline_array:             network-smoothed baseline array
        job_id:                     parallel iteration number
    """
    np.random.seed(job_id)

    phenotype_df = phenotype_df.iloc[[job_id], :]
    spreadsheet_df, phenotype_df, msg = datacln.check_input_value_for_gene_prioritazion(
        spreadsheet_df, phenotype_df)

    sample_smooth = spreadsheet_df.values

    pc_array = get_correlation(sample_smooth, phenotype_df.values[0], run_parameters)
    pearson_array = pc_array.copy()

    mask = np.in1d(spreadsheet_df.index, spreadsheet_genes_as_input)
    pc_array[~mask] = 0.0
    pc_array = np.abs(trim_to_top_beta(pc_array, run_parameters["top_beta_of_sort"]))

    restart_accumulator = pc_array.copy()
    restart_accumulator[restart_accumulator != 0] = 1

    pc_array = pc_array / max(sum(pc_array), EPSILON_0)
    pc_array = kn.smooth_matrix_with_rwr(pc_array, network_mat, run_parameters)[0]
    pc_array = pc_array - baseline_array

    quantitative_score = pc_array
    viz_score = (pc_array - min(pc_array)) / (max(pc_array) - min(pc_array))

    phenotype_name = phenotype_df.index.values[0]
    gene_name_list = spreadsheet_df.index
    gene_orig_list = spreadsheet_genes_as_input

    generate_net_correlation_output(pearson_array, quantitative_score, viz_score, restart_accumulator,
                                    phenotype_name, gene_name_list, gene_orig_list, run_parameters)

def run_net_nmf(run_parameters):
    """ wrapper: call sequence to perform network based stratification and write results.

    Args:
        run_parameters: parameter set dictionary.
    """
    np.random.seed(0)

    number_of_clusters         = run_parameters['number_of_clusters']
    gg_network_name_full_path  = run_parameters['gg_network_name_full_path']
    spreadsheet_name_full_path = run_parameters['spreadsheet_name_full_path']

    network_mat, unique_gene_names = kn.get_sparse_network_matrix(gg_network_name_full_path)
    network_mat = kn.normalize_sparse_mat_by_diagonal(network_mat)
    lap_diag, lap_pos = kn.form_network_laplacian_matrix(network_mat)

    spreadsheet_df = kn.get_spreadsheet_df(spreadsheet_name_full_path)
    spreadsheet_df = kn.update_spreadsheet_df(spreadsheet_df, unique_gene_names)

    sample_names = spreadsheet_df.columns
    spreadsheet_mat = spreadsheet_df.values

    spreadsheet_mat, iterations = kn.smooth_matrix_with_rwr(spreadsheet_mat, network_mat, run_parameters)
    spreadsheet_mat = kn.get_quantile_norm_matrix(spreadsheet_mat)

    h_mat = kn.perform_net_nmf(spreadsheet_mat, lap_pos, lap_diag, run_parameters)

    linkage_matrix = np.zeros((spreadsheet_mat.shape[1], spreadsheet_mat.shape[1]))
    sample_perm = np.arange(0, spreadsheet_mat.shape[1])
    linkage_matrix = kn.update_linkage_matrix(h_mat, sample_perm, linkage_matrix)

    labels = kn.perform_kmeans(linkage_matrix, number_of_clusters)
    distance_matrix = pairwise_distances(h_mat.T, n_jobs=-1)  # [n_samples, n_features]; use all available cores

    save_consensus_clustering(linkage_matrix, sample_names, labels, run_parameters)
    calculate_and_save_silhouette_scores(distance_matrix, sample_names, labels, run_parameters)
    save_final_samples_clustering(sample_names, labels, run_parameters)
    save_spreadsheet_and_variance_heatmap(spreadsheet_df, labels, run_parameters)

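# How the linkage_matrix above is plausibly filled in: each sample's cluster is
# the argmax row of h_mat (k x n_samples), and co-clustered sample pairs get a
# 1. This is an assumption about kn.update_linkage_matrix, sketched for the
# no-bootstrap case where sample_perm is the identity permutation:
def _update_linkage_sketch(h_mat, linkage_matrix):
    cluster_ids = np.argmax(h_mat, axis=0)  # hard assignment per sample
    same = (cluster_ids[:, None] == cluster_ids[None, :]).astype(float)
    return linkage_matrix + same            # co-cluster indicator
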
def test_smooth_matrix_with_rwr_single_vector(self):
    """ Assert that a test matrix will converge to the precomputed answer in
        the predicted number of steps (iterations). Depends on run_parameters
        and the values set herein.
    """
    EXPECTED_STEPS = 31
    F0 = np.array([1.0, 0.0])
    A = (np.eye(2) + np.ones((2, 2))) / 3
    F_exact = np.array([0.5, 0.5])

    F_calculated, steps = kn.smooth_matrix_with_rwr(F0, A, self.run_parameters)

    self.assertEqual(steps, EXPECTED_STEPS, msg='minor difference')
    T = np.abs(F_exact - F_calculated)
    self.assertAlmostEqual(T.sum(), 0)

def test_smooth_matrix_with_rwr_non_sparse(self):
    """ Assert that a test matrix will converge to the precomputed answer in
        the predicted number of steps (iterations). Depends on run_parameters
        and the values set herein.
    """
    EXPECTED_STEPS = 32
    F0 = np.eye(2)
    A = (np.eye(2) + np.ones((2, 2))) / 3
    F_exact = np.ones((2, 2)) * 0.5

    F_calculated, steps = kn.smooth_matrix_with_rwr(F0, A, self.run_parameters)

    self.assertEqual(steps, EXPECTED_STEPS)
    T = np.abs(F_exact - F_calculated)
    self.assertAlmostEqual(T.sum(), 0)

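# Why both RWR tests above converge to the uniform answer: the transition
# matrix A = (I + J)/3 = [[2/3, 1/3], [1/3, 2/3]] is symmetric and doubly
# stochastic, so its dominant eigenvector is uniform and the second eigenvalue
# is 1/3. Assuming the restart probability in self.run_parameters is
# effectively 0 (the docstrings only say the answer depends on those values),
# the update reduces to the power iteration F_{k+1} = A @ F_k, whose error
# shrinks by a factor of 3 per step: from F0 = [1, 0] = 0.5*[1, 1] + 0.5*[1, -1],
# after k steps F_k = [0.5, 0.5] + 0.5 * (1/3)**k * [1, -1], which is ~1e-15
# at k ~ 31. A standalone check of that iteration:
#
#     A = (np.eye(2) + np.ones((2, 2))) / 3
#     F = np.array([1.0, 0.0])
#     for _ in range(31):
#         F = A @ F
#     # F is now ~[0.5, 0.5]
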
def run_net_path(run_parameters):
    ''' wrapper: call sequence to perform net path

    Args:
        run_parameters: dictionary of run parameters
    '''
    network_sparse, unique_gene_names, pg_network_n1_names = build_hybrid_sparse_matrix(
        run_parameters, False, False)

    spreadsheet_df = kn.get_spreadsheet_df(run_parameters['spreadsheet_name_full_path'])
    new_spreadsheet_df = kn.update_spreadsheet_df(spreadsheet_df, unique_gene_names)

    hetero_network = normalize(network_sparse, norm='l1', axis=0)
    final_rwr_matrix, step = kn.smooth_matrix_with_rwr(
        np.eye(hetero_network.shape[0]), hetero_network, run_parameters)

    smooth_rwr_matrix = smooth_final_spreadsheet_matrix(final_rwr_matrix, len(unique_gene_names))
    cosine_matrix = get_net_path_results(len(unique_gene_names), smooth_rwr_matrix, run_parameters)

    cosine_matrix_df = pd.DataFrame(cosine_matrix, index=unique_gene_names, columns=pg_network_n1_names)
    # save_cosine_matrix_df(cosine_matrix_df, run_parameters)

    property_rank_df = rank_netpath_property(new_spreadsheet_df, cosine_matrix_df)
    prop_result_df = construct_netpath_result_df(new_spreadsheet_df, cosine_matrix_df)

    save_timestamped_df(property_rank_df, run_parameters['results_directory'], 'net_path_ranked_by_property')
    save_timestamped_df(prop_result_df, run_parameters['results_directory'], 'net_path_sorted_by_property_score')
    map_and_save_droplist(spreadsheet_df, unique_gene_names, 'net_path_droplist', run_parameters)

    return property_rank_df

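# The np.eye(...) seed above runs RWR from every node at once, so column j of
# final_rwr_matrix is node j's diffusion profile over the hybrid network.
# get_net_path_results then compares gene profiles to property profiles; a
# minimal sketch of that comparison (an assumption, not this module's actual
# implementation), reusing sklearn's cosine_similarity:
def _gene_property_cosine_sketch(smooth_rwr_matrix, n_genes):
    gene_profiles     = smooth_rwr_matrix[:, :n_genes].T  # one row per gene
    property_profiles = smooth_rwr_matrix[:, n_genes:].T  # one row per property
    return cosine_similarity(gene_profiles, property_profiles)
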
def run_net_correlation(run_parameters):
    """ perform gene prioritization with network smoothing

    Args:
        run_parameters: parameter set dictionary.
    """
    max_cpu = run_parameters["max_cpu"]
    run_parameters["results_tmp_directory"] = kn.create_dir(run_parameters["results_directory"], 'tmp')

    gg_network_name_full_path = run_parameters['gg_network_name_full_path']
    network_mat, unique_gene_names = kn.get_sparse_network_matrix(gg_network_name_full_path)

    network_mat = normalize(network_mat, norm="l1", axis=0)

    phenotype_df = kn.get_spreadsheet_df(run_parameters["phenotype_name_full_path"])
    spreadsheet_df = kn.get_spreadsheet_df(run_parameters["spreadsheet_name_full_path"])

    spreadsheet_genes_as_input = spreadsheet_df.index.values
    phenotype_df = phenotype_df.T

    spreadsheet_df = kn.update_spreadsheet_df(spreadsheet_df, unique_gene_names)
    spreadsheet_df = zscore_dataframe(spreadsheet_df)

    sample_smooth, iterations = kn.smooth_matrix_with_rwr(spreadsheet_df.values, network_mat.T, run_parameters)
    spreadsheet_df = pd.DataFrame(sample_smooth, index=spreadsheet_df.index, columns=spreadsheet_df.columns)

    baseline_array = np.ones(network_mat.shape[0]) / network_mat.shape[0]
    baseline_array = kn.smooth_matrix_with_rwr(baseline_array, network_mat, run_parameters)[0]

    #-----------------------------------------------------------------------------------------
    # Partition the phenotypes into batches of at most max_cpu jobs
    #-----------------------------------------------------------------------------------------
    len_phenotype = len(phenotype_df.index)
    array_of_jobs = range(0, len_phenotype)

    if len_phenotype <= max_cpu:
        jobs_id = array_of_jobs
        number_of_jobs = len(jobs_id)
        zipped_arguments = dstutil.zip_parameters(run_parameters, spreadsheet_df, phenotype_df, network_mat,
                                                  spreadsheet_genes_as_input, baseline_array, jobs_id)
        dstutil.parallelize_processes_locally(run_net_correlation_worker, zipped_arguments, number_of_jobs)
        write_phenotype_data_all(run_parameters)
    else:
        for i in range(0, len_phenotype, max_cpu):
            jobs_id = array_of_jobs[i:i + max_cpu]
            number_of_jobs = len(jobs_id)
            zipped_arguments = dstutil.zip_parameters(run_parameters, spreadsheet_df, phenotype_df, network_mat,
                                                      spreadsheet_genes_as_input, baseline_array, jobs_id)
            dstutil.parallelize_processes_locally(run_net_correlation_worker, zipped_arguments, number_of_jobs)
            write_phenotype_data_all(run_parameters)

    kn.remove_dir(run_parameters["results_tmp_directory"])

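# Worked example of the batching above: with len_phenotype = 7 and max_cpu = 3,
# range(0, 7, 3) yields batch starts 0, 3, 6, so the job batches are
# array_of_jobs[0:3] (jobs 0, 1, 2), [3:6] (jobs 3, 4, 5), and [6:9] (job 6).
# Each batch is dispatched and written out before the next one starts, which
# caps the peak worker-process count at max_cpu.
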
def save_spreadsheet_and_variance_heatmap(spreadsheet_df, labels, run_parameters, network_mat=None):
    """ save the full genes x samples spreadsheet as processed, or smoothed if a
        network is provided; also save the per-gene variance in a separate file.

    Args:
        spreadsheet_df: the dataframe as processed
        labels:         cluster labels for the spreadsheet columns
        run_parameters: with keys for "results_directory", "method",
                        (optional - "top_number_of_genes")
        network_mat:    (if appropriate) normalized network adjacency matrix used in processing

    Output:
        genes_by_samples_heatmap_{method}_{timestamp}_viz.tsv
        genes_averages_by_cluster_{method}_{timestamp}_viz.tsv
        top_genes_by_cluster_{method}_{timestamp}_download.tsv
    """
    top_number_of_genes = run_parameters['top_number_of_genes']

    if network_mat is not None:
        sample_smooth, _ = kn.smooth_matrix_with_rwr(spreadsheet_df.values, network_mat, run_parameters)
        clusters_df = pd.DataFrame(sample_smooth,
                                   index=spreadsheet_df.index.values,
                                   columns=spreadsheet_df.columns.values)
    else:
        clusters_df = spreadsheet_df

    clusters_df.to_csv(get_output_file_name(run_parameters, 'genes_by_samples_heatmap', 'viz'), sep='\t')

    cluster_ave_df = pd.DataFrame(
        {i: spreadsheet_df.iloc[:, labels == i].mean(axis=1) for i in np.unique(labels)})
    col_labels = ['Cluster_%d' % cluster_number for cluster_number in np.unique(labels)]
    cluster_ave_df.columns = col_labels
    cluster_ave_df.to_csv(get_output_file_name(run_parameters, 'genes_averages_by_cluster', 'viz'), sep='\t')

    clusters_variance_df = pd.DataFrame(clusters_df.var(axis=1), columns=['variance'])
    clusters_variance_df.to_csv(get_output_file_name(run_parameters, 'genes_variance', 'viz'),
                                sep='\t', float_format='%g')

    top_number_of_genes_df = pd.DataFrame(data=np.zeros(cluster_ave_df.shape),
                                          columns=cluster_ave_df.columns,
                                          index=cluster_ave_df.index.values)
    for sample in top_number_of_genes_df.columns.values:
        top_index = np.argsort(cluster_ave_df[sample].values)[::-1]
        # positional .iloc assignment avoids chained indexing, which may not write through
        col = top_number_of_genes_df.columns.get_loc(sample)
        top_number_of_genes_df.iloc[top_index[0:top_number_of_genes], col] = 1

    top_number_of_genes_df.to_csv(get_output_file_name(run_parameters, 'top_genes_by_cluster', 'download'),
                                  sep='\t')

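# Worked example of the top-gene marking above: for one cluster-average column
# with scores [0.2, 0.9, 0.5] and top_number_of_genes = 2,
# np.argsort([0.2, 0.9, 0.5])[::-1] -> [1, 2, 0], so rows 1 and 2 (the two
# highest averages) get flag 1 and row 0 stays 0.
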