def _significant_go_rows(go_study, ensembl_ids, fdr_th=0.05):
    """Run ``go_study`` on ``ensembl_ids`` and keep the significant GO terms.

    The ids are converted with ``ensembl2entrez_convertor`` and cast to
    ``int`` before being passed to ``go_study.run_study``.

    Returns a list of ``(NS, GO, term name, p_uncorrected, p_fdr_bh)`` tuples
    for every result whose ``p_fdr_bh`` is ``<= fdr_th``.
    """
    study_results = go_study.run_study(
        [int(cur) for cur in ensembl2entrez_convertor(ensembl_ids)])
    return [(cur.NS, cur.GO, cur.goterm.name, cur.p_uncorrected, cur.p_fdr_bh)
            for cur in study_results if cur.p_fdr_bh <= fdr_th]


def _relabel_clusters_by_average(expression, km_clf, n_clusters):
    """Return a copy of ``km_clf`` relabelled from per-cluster average ranks.

    For every cluster ``j`` the average of all values in the rows assigned to
    it is ranked with ``rankdata``; rows whose original label equals
    ``ranks[j] - 1`` then receive label ``j``.  ``km_clf`` itself is not
    modified.
    """
    ranks = rankdata([
        np.average(np.delete(expression, np.where(km_clf != j)[0], axis=0))
        for j in range(n_clusters)
    ])
    cluster_labels = np.array(km_clf)
    for j in range(n_clusters):
        cluster_labels[np.where(km_clf == ranks[j] - 1)] = j
    return cluster_labels


def find_clusters_and_gene_enrichment(tested_gene_list_file_name,
                                      total_gene_list_file_name,
                                      gene_expression_file_name,
                                      phenotype_file_name,
                                      gene_filter_file_name=None,
                                      tested_gene_list_path=None,
                                      total_gene_list_path=None,
                                      gene_expression_path=None,
                                      phenotype_path=None,
                                      gene_filter_file_path=None,
                                      var_th_index=None,
                                      start_k=2,
                                      end_k=6,
                                      calc_go=True,
                                      enrichment_list_file_names=None,
                                      meta_groups=None,
                                      filter_expression=None,
                                      cluster_algorithm=None):
    """Cluster the tested genes by expression and report enrichment per cluster.

    Steps, in order:

    1. Load the expression profile of the tested genes and optionally keep
       only the patients selected by ``filter_expression``.
    2. Keep the genes whose variance is at least the variance found at
       position ``var_th_index`` of the descending-sorted variances.
    3. If ``calc_go`` is set, build a GO enrichment study (downloading the GO
       file when it is missing) and print the significant terms of the kept
       genes.
    4. ``cluster_algorithm == "kmeans"``: for every k in
       ``start_k..end_k`` (inclusive) cluster the genes, collect one output
       row per cluster (genes, enrichment URL, GO terms, hypergeometric tests
       against the extra enrichment lists) and plot heatmaps.
    5. ``cluster_algorithm == "hierarchical"``: draw a seaborn clustermap and
       save it under ``constants.BASE_PROFILE/output``.
    6. Plot one heatmap per patient label assignment (when ``meta_groups``
       was given) and write the collected rows with ``print_to_excel``.

    Args:
        tested_gene_list_file_name: gene list to cluster; also used to name
            the output.
        total_gene_list_file_name: background gene list of the GO study.
        gene_expression_file_name: expression dataset file name.
        phenotype_file_name: phenotype file used for patient filtering and
            for patient label assignment.
        gene_filter_file_name, tested_gene_list_path, gene_expression_path,
        gene_filter_file_path: forwarded unchanged to
            ``load_gene_expression_profile_by_genes``.
        total_gene_list_path, phenotype_path: accepted for interface
            compatibility; not used by this function.
        var_th_index: index into the descending-sorted gene variances that
            sets the variance threshold; ``None`` selects the last index.
        start_k, end_k: inclusive range of k for k-means.
        calc_go: whether to run GO enrichment.
        enrichment_list_file_names: optional extra gene-list files tested
            with ``calc_HG_test`` for every cluster.
        meta_groups: optional patient grouping passed to
            ``labels_assignments``.
        filter_expression: optional grouping passed to
            ``divided_patient_ids_by_label`` to select patients.
        cluster_algorithm: ``"kmeans"``, ``"hierarchical"`` or anything else
            to skip gene clustering.

    Returns:
        None.  Results are emitted through plots, printed messages and
        ``print_to_excel``.
    """
    # NOTE: single-argument print(...) yields the same output on Python 2 as
    # the former print statement and also parses on Python 3.

    # Fetch the expression of the tested genes and split off the headers.
    tested_gene_expression = load_gene_expression_profile_by_genes(
        tested_gene_list_file_name, gene_expression_file_name,
        gene_filter_file_name, tested_gene_list_path, gene_expression_path,
        gene_filter_file_path)
    (tested_gene_expression_headers_rows,
     tested_gene_expression_headers_columns,
     tested_gene_expression) = separate_headers(tested_gene_expression)

    if filter_expression is not None:
        filtered_patients = [
            y
            for x in divided_patient_ids_by_label(phenotype_file_name,
                                                  groups=filter_expression)
            for y in x
        ]
        print("number of filtered patients from phenotypes: {}".format(
            len(filtered_patients)))
    else:
        print("no filter applied")
        filtered_patients = tested_gene_expression_headers_columns

    tested_gene_expression, tested_gene_expression_headers_columns = \
        filter_genes_dataset_by_patients(
            filtered_patients, tested_gene_expression_headers_columns,
            tested_gene_expression)
    if np.shape(tested_gene_expression)[1] == 1:
        print("no expressions were found after filtering by labels {}. "
              "skipping...".format(filter_expression))
        return None

    total_gene_list = load_gene_list(total_gene_list_file_name)
    tested_gene_list = load_gene_list(tested_gene_list_file_name)
    row_var = np.var(tested_gene_expression, axis=1)
    row_var_sorted = np.sort(row_var)[::-1]

    labels_assignment_patients = None
    if meta_groups is not None:
        print("clustering patients by groups")
        labels_assignment_patients = labels_assignments(
            meta_groups, phenotype_file_name,
            tested_gene_expression_headers_columns)

    enrichment_lists = []
    if enrichment_list_file_names is not None:
        enrichment_lists = [
            load_gene_list(cur) for cur in enrichment_list_file_names
        ]

    # Drop every gene whose variance is below the threshold variance.
    if var_th_index is None:
        var_th_index = len(row_var_sorted) - 1
    row_var_th = row_var_sorted[var_th_index]
    row_var_masked_indices = np.where(row_var_th > row_var)[0]
    gene_expression_top_var = np.delete(tested_gene_expression,
                                        row_var_masked_indices, axis=0)
    gene_expression_top_var_header_rows = np.delete(
        tested_gene_expression_headers_rows, row_var_masked_indices, axis=0)
    gene_expression_top_var_header_columns = \
        tested_gene_expression_headers_columns

    clfs_results = {}
    output_rows = []
    go_study = None
    if calc_go:
        go_file = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
        if not os.path.exists(go_file):
            wget.download(constants.GO_OBO_URL, go_file)
        obo_dag = GODag(go_file)
        assoc = read_ncbi_gene2go(
            os.path.join(constants.GO_DIR,
                         constants.GO_ASSOCIATION_FILE_NAME),
            no_top=True)
        go_study = GOEnrichmentStudy(
            [int(cur) for cur in ensembl2entrez_convertor(total_gene_list)],
            assoc, obo_dag, methods=["bonferroni", "fdr_bh"])
        print(_significant_go_rows(go_study,
                                   gene_expression_top_var_header_rows))

    if cluster_algorithm == "kmeans":
        for n_clusters in range(start_k, end_k + 1):
            clfs_results[n_clusters] = []
            _, km_clf, _ = kmeanssample(X=gene_expression_top_var,
                                        k=n_clusters, metric="euclidean")

            # The relabelling does not depend on the cluster being reported,
            # so it is computed once per k rather than once per cluster.
            cluster_labels = _relabel_clusters_by_average(
                gene_expression_top_var, km_clf, n_clusters)
            labels_assignment = [cluster_labels + 1]

            for i in range(n_clusters):
                # Rows that do NOT belong to cluster i are the ones deleted.
                cluster_indices = np.where(km_clf != i)[0]
                gene_expression_cluster = np.delete(
                    gene_expression_top_var, cluster_indices, axis=0)
                gene_headers_row_cluster = np.delete(
                    gene_expression_top_var_header_rows, cluster_indices,
                    axis=0)
                clfs_results[n_clusters].append(
                    (gene_expression_cluster, gene_headers_row_cluster))
                desc = "k={} clustering cluster {} has {} genes".format(
                    n_clusters, i, len(gene_expression_cluster))
                url = check_enrichment(",".join(gene_headers_row_cluster))

                go_ns, go_terms, go_names = [], [], []
                uncorrectd_pvals, FDRs = [], []
                if calc_go:
                    go_rows = _significant_go_rows(go_study,
                                                   gene_headers_row_cluster)
                    if go_rows:
                        # zip() yields tuples; convert to lists so the
                        # enrichment-list results below can be appended.
                        (go_ns, go_terms, go_names, uncorrectd_pvals,
                         FDRs) = (list(col) for col in zip(*go_rows))

                for j, cur in enumerate(enrichment_lists):
                    # Gene ids are compared on the part before the first "."
                    # (presumably the version suffix -- confirm).
                    go_terms.append(
                        enrichment_list_file_names[j].split(".")[0])
                    uncorrectd_pvals.append(
                        calc_HG_test(
                            [x.split(".")[0] for x in tested_gene_list],
                            [x.split(".")[0] for x in cur],
                            [x.split(".")[0]
                             for x in gene_headers_row_cluster]))
                    FDRs.append(".")
                    go_names.append(".")
                    go_ns.append(".")

                output_rows.append((
                    desc,
                    "\r\n".join(
                        [x.split(".")[0] for x in gene_headers_row_cluster]),
                    url,
                    "\r\n".join(go_ns),
                    "\r\n".join(go_terms),
                    "\r\n".join(go_names),
                    "\r\n".join(map(str, uncorrectd_pvals)),
                    "\r\n".join(map(str, FDRs))))

            # NOTE(review): the original indentation was lost; these calls are
            # assumed to run once per k (they need cluster_labels and
            # labels_assignment of the current k) -- confirm against history.
            gene_sorted_heatmap = np.rot90(
                np.flip(gene_expression_top_var[cluster_labels.argsort(), :],
                        1),
                k=-1, axes=(1, 0))
            find_clusters(end_k, gene_sorted_heatmap,
                          gene_expression_top_var_header_columns, start_k,
                          e2g_convertor(gene_expression_top_var_header_rows),
                          tested_gene_list_file_name,
                          labels_assignment=labels_assignment_patients)
            plot_heatmap(gene_expression_top_var,
                         gene_expression_top_var_header_columns,
                         labels_assignment,
                         gene_expression_top_var_header_rows,
                         tested_gene_list_file_name,
                         n_clusters=None,
                         label_index=None,
                         phenotype_heatmap=None)

    gene_sorted_heatmap = np.rot90(np.flip(gene_expression_top_var, 1),
                                   k=-1, axes=(1, 0))

    if cluster_algorithm == "hierarchical":
        df = pd.DataFrame(data=gene_sorted_heatmap,
                          index=gene_expression_top_var_header_columns,
                          columns=gene_expression_top_var_header_rows)

        # Colour rows by the first label assignment when one is available;
        # without meta_groups there is nothing to colour by.
        row_colors = None
        if (labels_assignment_patients is not None
                and len(labels_assignment_patients) > 0):
            first_assignment = labels_assignment_patients[0]
            color_by_label = dict(zip(np.unique(first_assignment), "rbg"))
            row_colors = [color_by_label.get(x) for x in first_assignment]

        cluster_grid = sns.clustermap(df, row_colors=row_colors,
                                      metric="euclidean", robust=True,
                                      method="single")
        den_genes = scipy.cluster.hierarchy.dendrogram(
            cluster_grid.dendrogram_col.linkage,
            labels=df.columns,
            color_threshold=0.7)
        # Result is currently unused; the call is kept in case it has side
        # effects.
        get_cluster_classes(den_genes)
        cluster_grid.savefig(
            os.path.join(constants.BASE_PROFILE, "output",
                         "hierarchical_cluster_{}.png".format(time.time())))

    # Without meta_groups there are no patient labels to plot; skip instead of
    # failing before the results are written.
    if labels_assignment_patients is not None:
        for cur_labels_assignment_patient in labels_assignment_patients:
            plot_heatmap(gene_sorted_heatmap,
                         gene_expression_top_var_header_rows,
                         [cur_labels_assignment_patient],
                         gene_expression_top_var_header_columns,
                         tested_gene_list_file_name,
                         n_clusters=None,
                         label_index=None,
                         phenotype_heatmap=None)

    print_to_excel(
        output_rows=output_rows,
        gene_list_file_name=tested_gene_list_file_name.split(".")[0],
        gene_expression_file_name=gene_expression_file_name.split(".")[0],
        var_th_index=var_th_index)