def run(study, pop, assoc, alpha=0.05, p_value=0.05, compare=False, ratio=None,
        obo='go-basic.obo', no_propagate_counts=False,
        method='bonferroni,sidak,holm', pvalcalc='fisher'):
    '''
    Wrap the Goatools enrichment study and package the outcome as a ``GO`` object.

    :param study: the study genes, either an iterable or a file name
    :param pop: the population genes, either an iterable or a file name
    :param assoc: gene -> GO term association (dict, defaultdict or file name)
    :return: a ``GO`` instance built from the result table and the run settings
    '''
    module_dir = os.path.dirname(os.path.realpath(__file__))

    # Two strings mean "read both gene sets from files"; anything else is
    # treated as in-memory collections.
    if type(study) is str and type(pop) is str:
        study, pop = GO._read_geneset(study, pop, compare=compare)
    else:
        study, pop = frozenset(study), set(pop)

    methods = method.split(",")

    # The default ontology name is mapped onto the copy kept next to this module.
    if obo == 'go-basic.obo':
        obo = module_dir + "/obo/go.obo"
    if not os.path.exists(obo):
        print("obo file not found, start to download")
        wget.download('http://purl.obolibrary.org/obo/go/go-basic.obo', obo)
    obo_dag = GODag(obo)

    assoc_kind = type(assoc)
    if assoc_kind is dict:
        # Serialise the mapping to "<gene>\t<go;go;...>" rows and let the
        # association reader parse it back.
        rows = "".join(
            "{}\t{}\n".format(gene, ";".join([str(term) for term in terms if term]))
            for gene, terms in assoc.items()
            if terms
        )
        assoc_file = module_dir + "/assoc"
        with open(assoc_file, 'w') as handle:
            handle.write(rows)
        assoc = read_associations(assoc_file)
    elif assoc_kind is not defaultdict:
        # Anything that is neither a dict nor a defaultdict is a file name.
        assoc = read_associations(assoc)

    study_runner = GOEnrichmentStudy(pop, assoc, obo_dag,
                                     propagate_counts=not no_propagate_counts,
                                     alpha=alpha,
                                     pvalcalc=pvalcalc,
                                     methods=methods)
    records = study_runner.run_study(study)

    # Build a tab-separated text table (header + one row per record) and parse
    # it with pandas.
    header = 'GO\tNS\tenrichment\tname\tratio_in_study\tratio_in_pop\tp_uncorrected\tdepth\tstudy_count\tp_bonferroni\tp_sidak\tp_holm\thit\n'
    text = header + "".join(record.__str__() + "\n" for record in records)
    table = pd.read_table(StringIO(text))

    return GO(table, study, pop, assoc, alpha, p_value, compare, ratio, obo,
              no_propagate_counts, method, pvalcalc, obo_dag)
def perform_gene_enrichment_analysis(self, metagene_matrix, method='fdr'):
    """Run a GO enrichment study per component and write all results to a TSV.

    :param metagene_matrix: matrix whose second dimension (``shape[1]``) is the
        number of components to analyse
    :param method: multiple-test correction method handed to GOEnrichmentStudy

    The merged per-component dataframes are written to
    ``<cache_dir><prefix>_gea_all.tsv``; nothing is returned.
    """
    n_comps = metagene_matrix.shape[1]

    # Download ontology and annotations, if necessary
    self.download_and_cache_resources()
    gene_ontology = obo_parser.GODag('../DownloadedResources/go-basic.obo')

    # Build gene symbol -> set of GO ids straight from the GAF rows.  A gene
    # appears on one row per annotation, so every row must be accumulated;
    # keeping only the last row per symbol would leave a single GO id per gene.
    associations = {}
    with gzip.open('../DownloadedResources/goa_human.gaf.gz', 'rt') as gaf:
        for entry in GOA.gafiterator(gaf):
            symbol = entry['DB_Object_Symbol']
            associations.setdefault(symbol, set()).add(str(entry['GO_ID']))

    # Our population is the set of genes we are analysing
    population = self.gene_symbols()
    print("We have %d genes in our population" % len(population))

    gea = GOEnrichmentStudy(population,
                            associations,
                            gene_ontology,
                            propagate_counts=True,
                            alpha=0.05,
                            methods=[method])

    rankings = self.ranked_genes_by_component(metagene_matrix)
    gea_results_by_component = {}
    for ci in range(n_comps):
        study_genes = rankings[ci]
        print('\nComp. %d: %s...' % (ci, str(study_genes[:10])))
        gea_results_by_component[ci] = gea.run_study(study_genes)

    # Get results into a dataframe per component; components for which the
    # helper returns None are left out.
    frames = []
    for ci in range(n_comps):
        ge_df = self._perform_gene_enrichment_analysis_one_component(
            ci, gea_results_by_component, gea)
        if ge_df is not None:
            frames.append(ge_df)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement,
    # but it rejects an empty list, hence the explicit empty frame.
    gea_all_sig_results_df = pd.concat(frames) if frames else pd.DataFrame()
    gea_all_sig_results_df.to_csv(self.cache_dir + '%s_gea_all.tsv' % self.prefix,
                                  sep='\t')
class GoEnrich():
    """GO-slim enrichment of yeast gene sets against a fixed background list."""

    # Genes analysed when the caller does not pass a gene set.
    DEFAULT_GENE_SET = ('YML106W', 'YKL135C', 'YDR516C', 'YLR420W', 'YNL111C',
                        'YHR007C', 'YLR014C', 'YKL216W', 'YNL078W', 'YJR005W',
                        'YJL130C')

    def __init__(self):
        """Load the ontology, background genes and associations, build the study."""
        obodag = GODag("../Data/evaluation_reference/goslim_yeast.obo")
        with open('../Data/evaluation_reference/gene_list.txt') as handle:
            background = [line.strip() for line in handle]
        geneid2gos_yeast = read_associations(
            '../Data/evaluation_reference/geneid2gos_yeast.txt')
        # The keyword must be spelt ``propagate_counts``; a misspelt keyword is
        # not the option the study reads, so the default would stay in force.
        self.goeaobj = GOEnrichmentStudy(background,
                                         geneid2gos_yeast,
                                         obodag,
                                         propagate_counts=False,
                                         alpha=0.05,
                                         methods=['fdr_bh'])

    def measure_enrichment(self, gene_set=None, run_name='base', cluster_id=1):
        """Run the enrichment study for ``gene_set`` and write all results to disk.

        :param gene_set: iterable of gene ids; ``None`` selects DEFAULT_GENE_SET
        :param run_name: prefix of the output file name
        :param cluster_id: suffix of the output file name

        Results go to ``../Results/<run_name>_<cluster_id>.txt``.
        """
        gene_ids = list(self.DEFAULT_GENE_SET if gene_set is None else gene_set)
        goea_results_all = self.goeaobj.run_study(gene_ids)
        # All results are written; filter on r.p_fdr_bh to keep significant ones.
        self.goeaobj.wr_txt(
            "../Results/" + run_name + "_" + str(cluster_id) + ".txt",
            goea_results_all)
def check_group_enrichment(tested_gene_file_name, total_gene_file_name):
    """Test the tested genes for GO enrichment against the total genes and export.

    Terms with a Benjamini-Hochberg p-value of at most 0.05 are kept and
    passed, together with the tested gene names, to ``print_to_excel`` as a
    single row.
    """
    total_gene_list = load_gene_list(total_gene_file_name)
    tested_gene = load_gene_list(tested_gene_file_name)

    # Ontology: fetch it when the local copy is missing.
    obo_path = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    if not os.path.exists(obo_path):
        download(constants.GO_OBO_URL, constants.GO_DIR)
    obo_dag = GODag(obo_path)

    # Associations: fetch the gzip archive and unpack it when missing.
    assoc_path = os.path.join(constants.GO_DIR,
                              constants.GO_ASSOCIATION_FILE_NAME)
    if not os.path.exists(assoc_path):
        download(constants.GO_ASSOCIATION_GENE2GEO_URL, constants.GO_DIR)
        archive_path = os.path.join(
            constants.GO_DIR,
            os.path.basename(constants.GO_ASSOCIATION_GENE2GEO_URL))
        with gzip.open(archive_path, 'rb') as f_in:
            with open(assoc_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
    assoc = read_ncbi_gene2go(assoc_path, no_top=True)

    population_ids = [int(gene_id)
                      for gene_id in ensembl2entrez_convertor(total_gene_list)]
    study_runner = GOEnrichmentStudy(population_ids, assoc, obo_dag,
                                     methods=["bonferroni", "fdr_bh"])
    study_ids = [int(gene_id)
                 for gene_id in ensembl2entrez_convertor(tested_gene)]
    records = study_runner.run_study(study_ids)

    significant = [(rec.NS, rec.GO, rec.goterm.name, rec.p_uncorrected,
                    rec.p_fdr_bh)
                   for rec in records
                   if rec.p_fdr_bh <= 0.05]

    # Transpose the rows into columns; with no hits every column is empty.
    if significant:
        go_ns, go_terms, go_names, uncorrectd_pvals, FDRs = zip(*significant)
    else:
        go_ns, go_terms, go_names, uncorrectd_pvals, FDRs = [], [], [], [], []

    newline = "\r\n"
    row = (newline.join(e2g_convertor(tested_gene)),
           newline.join(go_ns),
           newline.join(go_terms),
           newline.join(go_names),
           newline.join(map(str, uncorrectd_pvals)),
           newline.join(map(str, FDRs)))
    print_to_excel([row], tested_gene_file_name, total_gene_file_name)
def test_goea():
    """Test GOEA with method, fdr."""
    obo_dag = GODag("go-basic.obo")
    assoc = read_associations("../data/association", no_top=True)
    # Read the id lists inside `with` so the file handles are closed promptly.
    with open("../data/population") as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    with open("../data/study") as ifstrm:
        study_ids = [line.rstrip() for line in ifstrm]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=['fdr'])
    goea_results = goeaobj.run_study(study_ids)
    goeaobj.print_summary(goea_results)
def get_goea_results(method="fdr_bh"):
    """Get GOEA results.

    Runs the enrichment study on the small study/population files found in
    the ``data`` directory beside this module, using the given multiple-test
    correction ``method``, and returns the list of results.
    """
    root_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
    obo_dag = GODag(os.path.join(root_dir, "goslim_generic.obo"))
    assoc = read_associations(os.path.join(root_dir, "slim_association"),
                              no_top=True)
    # Read the id lists inside `with` so the file handles are closed promptly.
    with open(os.path.join(root_dir, "small_population")) as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=[method])
    with open(os.path.join(root_dir, "small_study")) as ifstrm:
        study_ids = [line.rstrip() for line in ifstrm]
    goea_results = goeaobj.run_study(study_ids, methods=[method])
    return goea_results
def test_i96():
    """Test to re-produce issue#96: Passes currently."""
    # Target of the reproduction:
    #   ValueError("All values in table must be nonnegative.

    # Study and population gene ids
    study = _get_geneids()
    population = GeneID2nt.keys()

    # Human (taxid 9606) associations and the ontology from the working dir
    associations = get_assoc_ncbi_taxids([9606], loading_bar=None)
    dag = get_godag(os.path.join(os.getcwd(), "go-basic.obo"),
                    loading_bar=None)

    # Run the Gene Ontology Enrichment Analysis; only completion matters here
    study_runner = GOEnrichmentStudy(population, associations, dag,
                                     methods=['fdr_bh'])
    _results = study_runner.run_study(study)
def run_bonferroni():
    """Do Gene Ontology Enrichment Analysis w/Bonferroni multipletest. Print results 3 ways.

    Returns the tuple ``(results, goea)``: the study results and the
    GOEnrichmentStudy object that produced them.
    """
    # ---------------------------------------------------------------------
    # Run Gene Ontology Analysis (GOEA)
    #
    # 1. Initialize
    godag = get_godag(os.path.join(os.getcwd(), "go-basic.obo"), loading_bar=None)
    assoc = read_associations(os.path.join(REPO, "data/association"), no_top=True)
    # Read the id lists inside `with` so the file handles are closed promptly.
    with open(os.path.join(REPO, "data/population")) as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    with open(os.path.join(REPO, "data/study")) as ifstrm:
        study_ids = [line.rstrip() for line in ifstrm]
    # 2. Run enrichment analysis
    goea = GOEnrichmentStudy(popul_ids, assoc, godag, alpha=0.05, methods=['bonferroni'])
    results_nt = goea.run_study(study_ids)
    return results_nt, goea
def _get_results(godag, propagate_counts, relationships, prt=sys.stdout):
    """Run a GOEA. Return results"""
    mouse_taxid = 10090  # Mouse study
    population = set(GeneID2nt_mus.keys())
    gene2gos = get_assoc_ncbi_taxids([mouse_taxid], loading_bar=None)
    study = get_geneid2symbol("nbt.3102-S4_GeneIDs.xlsx")
    # Keyword settings are collected first; dict order keeps the call identical.
    settings = {
        'propagate_counts': propagate_counts,
        'relationships': relationships,
        'alpha': 0.05,
        'methods': ['fdr_bh'],
    }
    study_runner = GOEnrichmentStudy(population, gene2gos, godag, **settings)
    return study_runner.run_study(study, prt=prt)
def run_bonferroni(log):
    """Do Gene Ontology Enrichment Analysis w/Bonferroni multipletest. Print results 3 ways.

    ``log`` is accepted for interface compatibility; this function does not
    write to it.  Returns the tuple ``(results, goea)``.
    """
    # ---------------------------------------------------------------------
    # Run Gene Ontology Analysis (GOEA)
    #
    # 1. Initialize
    obo_dag = GODag("go-basic.obo")
    assoc = read_associations("../data/association", no_top=True)
    # Read the id lists inside `with` so the file handles are closed promptly.
    with open("../data/population") as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    with open("../data/study") as ifstrm:
        study_ids = [line.rstrip() for line in ifstrm]
    # 2. Run enrichment analysis
    goea = GOEnrichmentStudy(popul_ids, assoc, obo_dag, alpha=0.05, methods=['bonferroni'])
    results_nt = goea.run_study(study_ids)
    return results_nt, goea
def run_bonferroni():
    """Do Gene Ontology Enrichment Analysis w/Bonferroni multipletest. Print results 3 ways.

    Returns the tuple ``(results, goea)``: the study results and the
    GOEnrichmentStudy object that produced them.
    """
    # ---------------------------------------------------------------------
    # Run Gene Ontology Analysis (GOEA)
    #
    # 1. Initialize
    godag = get_godag(os.path.join(os.getcwd(), "go-basic.obo"), loading_bar=None)
    fin_assc = os.path.join(REPO, "data/association")
    assoc = read_associations(fin_assc, 'id2gos', no_top=True)
    # Read the id lists inside `with` so the file handles are closed promptly.
    with open(os.path.join(REPO, "data/population")) as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    with open(os.path.join(REPO, "data/study")) as ifstrm:
        study_ids = [line.rstrip() for line in ifstrm]
    # 2. Run enrichment analysis
    goea = GOEnrichmentStudy(popul_ids, assoc, godag, alpha=0.05, methods=['bonferroni'])
    results_nt = goea.run_study(study_ids)
    return results_nt, goea
def _get_results(godag, propagate_counts, relationships, prt=sys.stdout):
    """Run a GOEA. Return results"""
    # Inputs: mouse (taxid 10090) population, associations and study genes
    pop_ids = set(GeneID2nt_mus.keys())
    id2gos = get_assoc_ncbi_taxids([10090], loading_bar=None)
    study_ids = get_geneid2symbol("nbt.3102-S4_GeneIDs.xlsx")

    # Enrichment object configured with Benjamini-Hochberg correction
    enrichment = GOEnrichmentStudy(pop_ids,
                                   id2gos,
                                   godag,
                                   propagate_counts=propagate_counts,
                                   relationships=relationships,
                                   alpha=0.05,
                                   methods=['fdr_bh'])
    outcome = enrichment.run_study(study_ids, prt=prt)
    return outcome
def test_i96():
    """Test to re-produce issue#96: Passes currently."""
    # Target of the reproduction:
    #   ValueError("All values in table must be nonnegative.
    print('CWD', os.getcwd())

    # Study and population gene ids
    study = _get_geneids()
    population = GENEID2NT.keys()

    # Databases: download NCBI gene2go, read the human (9606) associations,
    # then load the ontology
    print(os.getcwd())
    gene2go_path = os.path.join(REPO, 'gene2go')
    dnld_ncbi_gene_file(gene2go_path, loading_bar=None)
    associations = read_ncbi_gene2go(gene2go_path, [9606])
    dag = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None)

    # Run the Gene Ontology Enrichment Analysis; only completion matters here
    study_runner = GOEnrichmentStudy(population, associations, dag,
                                     methods=['fdr_bh'])
    _results = study_runner.run_study(study)
min_dist = 5000 max_dist = 50000 else: sys.exit(-1) with open(snakemake.input.clusters) as f: for line in f: cols = line.strip().split() cluster = chr(int(cols[3]) + 65) if int(cols[-1]) <= max_dist and int(cols[-1]) >= min_dist: genes[cluster].add(cols[7]) background.add(cols[7]) obodag = GODag("go-basic.obo") id2go = read_associations("sym2go.txt") goeaobj = GOEnrichmentStudy(background, id2go, obodag, propagate_counts=False, alpha=0.05, methods=['fdr_bh']) outfile = open(snakemake.output.txt, 'w') for cluster, geneids in sorted(genes.items()): outfile.write("Cluster {}\n".format(cluster)) goea_results_all = goeaobj.run_study(geneids) for fdr, name, enrichment in sorted([(r.p_fdr_bh, r.name, r.enrichment) for r in goea_results_all if r.p_fdr_bh < 0.2]): outfile.write("\t{}\t{}\t{}\n".format(name, fdr, enrichment)) outfile.write("\n") #GOEnrichmentStudy.print_summary(goea_results_sig)
def find_clusters_and_gene_enrichment(tested_gene_list_file_name,
                                      total_gene_list_file_name,
                                      gene_expression_file_name,
                                      phenotype_file_name,
                                      gene_filter_file_name=None,
                                      tested_gene_list_path=None,
                                      total_gene_list_path=None,
                                      gene_expression_path=None,
                                      phenotype_path=None,
                                      gene_filter_file_path=None,
                                      var_th_index=None,
                                      start_k=2,
                                      end_k=6,
                                      calc_go=True,
                                      enrichment_list_file_names=None,
                                      meta_groups=None,
                                      filter_expression=None,
                                      cluster_algorithm=None):
    # Cluster the highest-variance tested genes (k-means and/or hierarchical),
    # optionally run GO enrichment per cluster, and export one row per cluster
    # through print_to_excel.  Returns None (explicitly when no expression data
    # is left after patient filtering).
    #
    # Python 2 source (print statements).
    # NOTE(review): the block nesting below was reconstructed from a flattened
    # copy of this function - confirm against the original file, in particular
    # where the heatmap calls sit relative to the k-means loops.

    # fetch gene expression by gene_id, divided by tumor type
    gene_sets = []
    expression_sets = []
    averaged_expression_sets = []
    tested_gene_expression = load_gene_expression_profile_by_genes(
        tested_gene_list_file_name, gene_expression_file_name,
        gene_filter_file_name, tested_gene_list_path, gene_expression_path,
        gene_filter_file_path)
    tested_gene_expression_headers_rows, tested_gene_expression_headers_columns, tested_gene_expression = separate_headers(
        tested_gene_expression)

    # Restrict the patient columns to the phenotype groups, when requested.
    if filter_expression is not None:
        filtered_patients = [
            y for x in divided_patient_ids_by_label(phenotype_file_name,
                                                    groups=filter_expression)
            for y in x
        ]
        print "number of filtered patients from phenotypes: {}".format(
            len(filtered_patients))
    else:
        print "no filter applied"
        filtered_patients = tested_gene_expression_headers_columns

    tested_gene_expression, tested_gene_expression_headers_columns = filter_genes_dataset_by_patients(
        filtered_patients, tested_gene_expression_headers_columns,
        tested_gene_expression)
    if np.shape(tested_gene_expression)[1] == 1:
        print "no expressions were found after filtering by labels {}. skipping...".format(
            filter_expression)
        return None

    total_gene_list = load_gene_list(total_gene_list_file_name)
    tested_gene_list = load_gene_list(tested_gene_list_file_name)
    # Per-gene variance across patients, sorted in descending order.
    row_var = np.var(tested_gene_expression, axis=1)
    row_var_sorted = np.sort(row_var)[::-1]

    labels_assignment_patients = None
    if meta_groups is not None:
        print "clustering patients by groups"
        labels_assignment_patients = labels_assignments(
            meta_groups, phenotype_file_name,
            tested_gene_expression_headers_columns)

    enrichment_lists = []
    if enrichment_list_file_names is not None:
        for cur in enrichment_list_file_names:
            enrichment_lists.append(load_gene_list(cur))

    # Keep the genes whose variance is at least the var_th_index-th largest;
    # by default the threshold is the smallest variance, so nothing is dropped.
    if var_th_index is None:
        var_th_index = len(row_var_sorted) - 1
    row_var_th = row_var_sorted[var_th_index]
    row_var_masked_indices = np.where(row_var_th > row_var)[0]
    gene_expression_top_var = np.delete(tested_gene_expression,
                                        row_var_masked_indices,
                                        axis=0)
    gene_expression_top_var_header_rows = np.delete(
        tested_gene_expression_headers_rows, row_var_masked_indices, axis=0)
    gene_expression_top_var_header_columns = tested_gene_expression_headers_columns

    clfs_results = {}
    output_rows = []
    if calc_go:
        # Download the ontology on first use, then test the whole
        # high-variance gene set against the total gene list.
        if not os.path.exists(
                os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)):
            wget.download(
                constants.GO_OBO_URL,
                os.path.join(constants.GO_DIR, constants.GO_FILE_NAME))
        # if not os.path.exists(os.path.join(constants.TCGA_DATA_DIR, 'goa_human.gaf')):
        #     wget.download(go_obo_url, os.path.join(constants.TCGA_DATA_DIR, 'goa_human.gaf'))
        obo_dag = GODag(os.path.join(constants.GO_DIR, constants.GO_FILE_NAME))
        assoc = read_ncbi_gene2go(os.path.join(
            constants.GO_DIR, constants.GO_ASSOCIATION_FILE_NAME),
                                  no_top=True)
        g = GOEnrichmentStudy(
            [int(cur) for cur in ensembl2entrez_convertor(total_gene_list)],
            assoc,
            obo_dag,
            methods=["bonferroni", "fdr_bh"])
        g_res = g.run_study([
            int(cur) for cur in ensembl2entrez_convertor(
                gene_expression_top_var_header_rows)
        ])
        GO_results = [(cur.NS, cur.GO, cur.goterm.name, cur.p_uncorrected,
                       cur.p_fdr_bh) for cur in g_res
                      if cur.p_fdr_bh <= 0.05]
        print GO_results

    if cluster_algorithm == "kmeans":
        for n_clusters in range(start_k, end_k + 1):
            clfs_results[n_clusters] = []
            centres, km_clf, dist = kmeanssample(X=gene_expression_top_var,
                                                 k=n_clusters,
                                                 metric="euclidean")
            for i in range(n_clusters):
                # Relabel clusters by the rank of their mean expression.
                # This does not depend on i, so it is recomputed identically
                # on every pass of the loop.
                ranks = []
                for j in range(n_clusters):
                    ranks.append(
                        np.average(
                            np.delete(gene_expression_top_var,
                                      np.where(km_clf != j)[0],
                                      axis=0)))
                ranks = rankdata(ranks)
                cluster_labels = np.array(km_clf)
                for j in range(n_clusters):
                    cluster_labels[np.where(km_clf == ranks[j] - 1)] = j
                labels_assignment = [cluster_labels + 1]

                # Rows that do NOT belong to cluster i are deleted.
                cluster_indices = np.where(km_clf != i)[0]
                # NOTE(review): both deletes operate on the header rows, so
                # gene_expression_cluster holds gene ids rather than
                # expression values - confirm this is intended.
                gene_expression_cluster = np.delete(
                    gene_expression_top_var_header_rows,
                    cluster_indices,
                    axis=0)
                gene_headers_row_cluster = np.delete(
                    gene_expression_top_var_header_rows,
                    cluster_indices,
                    axis=0)
                clfs_results[n_clusters].append(
                    (gene_headers_row_cluster, gene_headers_row_cluster))
                desc = "k={} clustering cluster {} has {} genes".format(
                    n_clusters, i, len(gene_expression_cluster))
                gene_list = ",".join(gene_headers_row_cluster)
                url = check_enrichment(gene_list)

                go_terms = []
                uncorrectd_pvals = []
                FDRs = []
                go_names = []
                go_ns = []
                if calc_go:
                    g_res = g.run_study([
                        int(cur) for cur in ensembl2entrez_convertor(
                            gene_headers_row_cluster)
                    ])
                    GO_results = [(cur.NS, cur.GO, cur.goterm.name,
                                   cur.p_uncorrected, cur.p_fdr_bh)
                                  for cur in g_res if cur.p_fdr_bh <= 0.05]
                    if len(GO_results) > 0:
                        # zip(*...) rebinds the five names to tuples.
                        go_ns, go_terms, go_names, uncorrectd_pvals, FDRs = zip(
                            *GO_results)

                if len(enrichment_lists) != 0:
                    # NOTE(review): when GO_results was non-empty the names
                    # below are tuples, which have no append() - confirm that
                    # calc_go and enrichment lists are never combined.
                    for j, cur in enumerate(enrichment_lists):
                        go_terms.append(
                            enrichment_list_file_names[j].split(".")[0])
                        uncorrectd_pvals.append(
                            calc_HG_test(
                                [x.split(".")[0] for x in tested_gene_list],
                                [x.split(".")[0] for x in cur], [
                                    x.split(".")[0]
                                    for x in gene_headers_row_cluster
                                ]))
                        FDRs.append(".")
                        go_names.append(".")
                        go_ns.append(".")

                # One export row per (k, cluster): description, gene ids
                # (text after the first "." removed), enrichment URL, GO columns.
                output_rows.append((desc, "\r\n".join([
                    x.split(".")[0] for x in gene_headers_row_cluster
                ]), url, "\r\n".join(go_ns),
                                    "\r\n".join(go_terms),
                                    "\r\n".join(go_names),
                                    "\r\n".join(map(str, uncorrectd_pvals)),
                                    "\r\n".join(map(str, FDRs))))

        # Uses cluster_labels / labels_assignment from the last k-means pass.
        gene_sorted_heatmap = np.rot90(np.flip(
            gene_expression_top_var[cluster_labels.argsort(), :], 1),
                                       k=-1,
                                       axes=(1, 0))
        find_clusters(end_k,
                      gene_sorted_heatmap,
                      gene_expression_top_var_header_columns,
                      start_k,
                      e2g_convertor(gene_expression_top_var_header_rows),
                      tested_gene_list_file_name,
                      labels_assignment=labels_assignment_patients)
        plot_heatmap(gene_expression_top_var,
                     gene_expression_top_var_header_columns,
                     labels_assignment,
                     gene_expression_top_var_header_rows,
                     tested_gene_list_file_name,
                     n_clusters=None,
                     label_index=None,
                     phenotype_heatmap=None)

    gene_sorted_heatmap = np.rot90(np.flip(gene_expression_top_var, 1),
                                   k=-1,
                                   axes=(1, 0))
    if cluster_algorithm == "hierarchical":
        df = pd.DataFrame(data=gene_sorted_heatmap,
                          index=gene_expression_top_var_header_columns,
                          columns=gene_expression_top_var_header_rows)
        # correlations = df.corr()
        # correlations_array = np.asarray(df.corr())
        #
        # row_linkage = hierarchy.linkage(
        #     distance.pdist(correlations_array), method='average')
        #
        # col_linkage = hierarchy.linkage(
        #     distance.pdist(correlations_array.T), method='average')
        # enrichment_gene_list = load_gene_list("uvm_mito_part.txt")
        # NOTE(review): labels_assignment_patients is None unless meta_groups
        # was given; indexing it here would then fail - confirm callers always
        # pass meta_groups with the hierarchical algorithm.
        dct = dict(zip(np.unique(labels_assignment_patients[0]), "rbg"))
        row_colors = map(dct.get, labels_assignment_patients[0])
        dct = {1: 'b', 2: 'r'}
        gene_expression_top_var_header_rows_trimmed = [
            x.split(".")[0] for x in gene_expression_top_var_header_rows
        ]
        # col_colors = map(dct.get, [2 if x in enrichment_gene_list else 1 for x in gene_expression_top_var_header_rows_trimmed])
        # From here on `g` is the seaborn cluster grid, no longer the
        # GOEnrichmentStudy created above.
        g = sns.clustermap(df,
                           row_colors=row_colors,
                           metric="euclidean",
                           robust=True,
                           method="single")
        # den_patients = scipy.cluster.hierarchy.dendrogram(g.dendrogram_row.linkage,
        #                                                   labels=df.index,
        #                                                   color_threshold=0.60)
        den_genes = scipy.cluster.hierarchy.dendrogram(
            g.dendrogram_col.linkage, labels=df.columns, color_threshold=0.7)
        clusters = get_cluster_classes(den_genes)
        # The file name carries a timestamp, so every run writes a new image.
        g.savefig(
            os.path.join(constants.BASE_PROFILE, "output",
                         "hierarchical_cluster_{}.png".format(time.time())))
        for cur_labels_assignment_patient in labels_assignment_patients:
            plot_heatmap(gene_sorted_heatmap,
                         gene_expression_top_var_header_rows,
                         [cur_labels_assignment_patient],
                         gene_expression_top_var_header_columns,
                         tested_gene_list_file_name,
                         n_clusters=None,
                         label_index=None,
                         phenotype_heatmap=None)

    print_to_excel(
        output_rows=output_rows,
        gene_list_file_name=tested_gene_list_file_name.split(".")[0],
        gene_expression_file_name=gene_expression_file_name.split(".")[0],
        var_th_index=var_th_index)
methods=['fdr_bh']) # defult multipletest correction method # I will also look for GO term enrichment in the group with the top 10 (abs value, but they are all negative) background fitness coeffs in the full model tp['abs_slope_in_full_model'] = np.abs(tp['full_model_x_slope']) fit_effect_genes = tp.loc[tp['model_comp_p_full_vs_qtl'] < 0.05].sort_values( by='abs_slope_in_full_model', ascending=False).iloc[:10]['Gene.Use'] #testing my groups results = dict() go_groups = list(mhq_dat.as_matrix(['QTL', 'Genes_with_interactions'])) go_groups.append( ['Top_x_effects', ';'.join([s.split(' ')[1] for s in fit_effect_genes])]) for entry in go_groups: study_set_names = [i for i in entry[1].split(';') if 'None' not in i] study_set = [genename_2_id.setdefault(i, 'NA') for i in study_set_names] goea_results_all = goeaobj.run_study( study_set, keep_if=lambda x: x.p_uncorrected < 0.05) results[entry[0]] = sorted(goea_results_all, key=lambda r: r.p_fdr_bh)[:10] with open('../../Analysis/GO_results.csv', 'w') as outfile: writer = csv.writer(outfile) writer.writerow([ 'Group', 'GO term', 'pval_uncorrected', 'pval_benjamini_hochberg', 'hits' ]) for r in results: for i in range(len(results[r])): writer.writerow([ r, results[r][i].name, results[r][i].p_uncorrected, results[r][i].p_fdr_bh, ';'.join([id_2_genename[i] for i in results[r][i].study_items]) ])
def goea(gene_ids, gene_symbols, trajectory, cluster, out_dir):
    """Run a GO enrichment analysis for one gene cluster and write the results.

    :param gene_ids: Ensembl ids of the genes (currently not used)
    :param gene_symbols: gene symbols of the cluster; they are mapped to NCBI
        gene ids through the human protein-coding gene table
    :param trajectory: trajectory name; its last 8 characters prefix the
        output file names
    :param cluster: cluster identifier, part of the output file names
    :param out_dir: output directory, created when missing

    Writes ``...genes.txt`` (the study gene ids) plus two xlsx reports of the
    terms with ``p_fdr_bh < 0.05``: one with gene symbols, one with gene ids.
    """
    if not os.path.exists(out_dir):
        # makedirs also creates missing parent directories.
        os.makedirs(out_dir)

    ## load ontologies
    from goatools.obo_parser import GODag
    obodag = GODag("goea/go-basic.obo")

    ## load associations
    from goatools.associations import read_ncbi_gene2go
    geneid2gos_human = read_ncbi_gene2go("goea/gene2go", taxids=[9606])

    ## background gene set
    from goea.genes_NCBI_9606_ProteinCoding import GENEID2NT as GeneID2nt_human

    ## GOEA object
    from goatools.go_enrichment import GOEnrichmentStudy
    goeaobj = GOEnrichmentStudy(
        GeneID2nt_human.keys(),  # List of human protein-coding genes
        geneid2gos_human,  # geneid/GO associations
        obodag,  # Ontologies
        propagate_counts=False,
        alpha=0.05,  # default significance cut-off
        methods=['fdr_bh'])  # default multipletest correction method

    # Index the gene table by symbol (field 5 of each record) once, instead of
    # scanning the whole table for every requested symbol.
    ids_by_symbol = {}
    for geneid, record in GeneID2nt_human.items():
        ids_by_symbol.setdefault(record[5], []).append(geneid)

    ## identify which ids correspond to the genes in the cluster
    geneid2symbol = {}
    for gene_symbol in gene_symbols:
        for geneid in ids_by_symbol.get(gene_symbol, ()):
            geneid2symbol[int(geneid)] = gene_symbol

    gene_names = np.array(list(geneid2symbol.keys()))
    print(gene_names)

    geneids_study = geneid2symbol.keys()
    prefix = out_dir + '/' + trajectory[-8:] + 'cluster ' + str(cluster)
    with open(prefix + 'genes.txt', 'w') as f:
        f.writelines("%s\n" % gene for gene in geneids_study)

    ## Run GOEA
    # 'p_' means "pvalue". 'fdr_bh' is the multipletest method we are currently using.
    goea_results_all = goeaobj.run_study(geneids_study)
    goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < 0.05]

    ## Write the result to file
    goeaobj.wr_xlsx(prefix + 'goea_symbols.xlsx',
                    goea_results_sig,
                    itemid2name=geneid2symbol)
    goeaobj.wr_xlsx(prefix + 'goea_geneids.xlsx', goea_results_sig)
"background population. Please check.\n".format(overlap)) assoc = read_associations(assoc_fn) methods = args.method.split(",") if args.fdr: methods.append("fdr") obo_dag = GODag(obo_file=args.obo) propagate_counts = not args.no_propagate_counts g = GOEnrichmentStudy(pop, assoc, obo_dag, propagate_counts=propagate_counts, alpha=args.alpha, methods=methods) results = g.run_study(study) if args.outfile is None: g.print_summary(results, min_ratio=min_ratio, indent=args.indent, pval=args.pval) else: # Users can print to both tab-separated file and xlsx file in one run. outfiles = args.outfile.split(",") prt_if = None # Print all values if args.pval is not None: # Only print out when uncorrected p-value < this value. prt_if = lambda nt: nt.p_uncorrected < args.pval for outfile in outfiles: if outfile.endswith(".xlsx"): g.wr_xlsx(outfile, results, prt_if=prt_if) else: g.wr_tsv(outfile, results, prt_if=prt_if)
class GOEnrichment(Experiment):
    """
    Class for running experiment that conducts enrichment of gene ontology terms in
    pathways in the PPI network.

    For every disease, the GO terms enriched among the disease proteins are
    compared with the terms enriched among each method's top predictions.

    NOTE(review): the nesting of a few statements was reconstructed from a
    flattened copy of this class; the affected spots are marked below.
    """
    def __init__(self, dir, params):
        """
        Constructor
        Args:
            dir (string) directory of the experiment to be run
            params (dict) experiment parameters; the keys read in this class
                are associations_path, disease_subset, ppi_network, go_path,
                gene_to_go_path, enrichment_params, method_to_preds, top_k,
                num_preds and n_processes
        """
        super().__init__(dir, params)

        # Set the logger
        set_logger(os.path.join(self.dir, 'experiment.log'),
                   level=logging.INFO,
                   console=True)

        # Log title
        logging.info("Disease Protein Prediction")
        logging.info("Sabri Eyuboglu  -- SNAP Group")
        logging.info("======================================")

        logging.info("Loading Disease Associations...")
        self.diseases_dict = load_diseases(self.params["associations_path"],
                                           self.params["disease_subset"],
                                           exclude_splits=['none'])

        logging.info("Loading Network...")
        self.network = Network(self.params["ppi_network"])

        logging.info("Loading enrichment study...")
        obodag = GODag(self.params["go_path"])
        geneid2go = read_ncbi_gene2go(self.params["gene_to_go_path"], taxids=[9606])
        # The population is whatever the network reports as its names.
        self.enrichment_study = GOEnrichmentStudy(self.network.get_names(),
                                                  geneid2go,
                                                  obodag,
                                                  log=None,
                                                  **self.params["enrichment_params"])

        logging.info("Loading predictions...")
        # One predictions.csv per method, read with its first column as index.
        self.method_to_preds = {name: pd.read_csv(os.path.join(preds, "predictions.csv"),
                                                  index_col=0)
                                for name, preds in self.params["method_to_preds"].items()}

        # Reuse enrichment outputs pickled by an earlier run, when present.
        outputs_path = os.path.join(self.dir, "outputs.pkl")
        if os.path.exists(outputs_path):
            logging.info("Loading outputs...")
            with open(outputs_path, 'rb') as f:
                self.outputs = pickle.load(f)
        else:
            self.outputs = {}

    def run_study(self, proteins):
        """
        Run the enrichment study on proteins and return a dict that maps each
        result's GO term name to its p_fdr_bh value.
        """
        results = self.enrichment_study.run_study(proteins)
        # Reads r.p_fdr_bh - presumably 'fdr_bh' must be among the methods in
        # params["enrichment_params"]; verify against the config.
        term_to_pval = {r.goterm.name: r.p_fdr_bh for r in results}

        return term_to_pval

    def compute_spearman_correlation(self, a_term_to_pval, b_term_to_pval):
        """
        Return (correlation, p-value) from spearmanr over the p-values of the
        two mappings, paired by the terms of a_term_to_pval.
        """
        terms = list(a_term_to_pval.keys())
        # Every term of `a` is looked up in `b`, so a term missing from `b`
        # raises KeyError.
        sp_corr, sp_pval = spearmanr([a_term_to_pval[term] for term in terms],
                                     [b_term_to_pval[term] for term in terms])
        return sp_corr, sp_pval

    def process_disease(self, disease):
        """
        Compare the enrichment of one disease with that of each method's
        predictions.

        Returns the tuple (disease, results, output): results holds the
        summary values for the results table, output the term -> p-value
        mappings keyed by "disease" and by method name.
        """
        results = {}
        output = {}
        # compute method scores for disease
        disease_proteins = set(self.diseases_dict[disease.id].proteins)

        # Use the cached enrichment when this disease was loaded from disk.
        if disease.id in self.outputs:
            disease_term_to_pval = self.outputs[disease.id]["disease"]
        else:
            disease_term_to_pval = self.run_study(disease_proteins)
        # NOTE(review): placed after the if/else so cached values are recorded
        # too; the flattened source does not show whether this line belonged
        # to the else branch - confirm.
        output["disease"] = disease_term_to_pval

        disease_terms = set([term for term, pval
                             in disease_term_to_pval.items() if pval < 0.05])
        top_disease_terms = set([term for term, _
                                 in sorted(disease_term_to_pval.items(),
                                           key=lambda x: x[1])[:self.params["top_k"]]])
        results = {"disease_name": disease.name,
                   "disease_num_significant": len(disease_terms),
                   "disease_top_{}".format(self.params['top_k']): top_disease_terms}

        # number of predictions to be made
        # (-1 means: as many predictions as the disease has proteins)
        num_preds = (len(disease_proteins)
                     if self.params["num_preds"] == -1
                     else self.params["num_preds"])

        for name, preds in self.method_to_preds.items():
            # Highest-scoring num_preds entries of this disease's row.
            pred_proteins = set(map(int, preds.loc[disease.id]
                                             .sort_values(ascending=False)
                                             .index[:num_preds]))

            if disease.id in self.outputs:
                pred_term_to_pval = self.outputs[disease.id][name]
            else:
                pred_term_to_pval = self.run_study(pred_proteins)
            # NOTE(review): same placement question as for output["disease"].
            output[name] = pred_term_to_pval

            pred_terms = set([term for term, pval
                              in pred_term_to_pval.items() if pval < 0.05])
            top_pred_terms = set([term for term, _
                                  in sorted(pred_term_to_pval.items(),
                                            key=lambda x: x[1])[:self.params["top_k"]]])

            # Jaccard similarity of the significant term sets; 0 when both
            # sets are empty.
            jaccard = (len(disease_terms & pred_terms) / len(disease_terms | pred_terms)
                       if len(disease_terms | pred_terms) != 0 else 0)
            sp_corr, sp_pval = self.compute_spearman_correlation(disease_term_to_pval,
                                                                 pred_term_to_pval)

            results[f"{name}_num_significant"] = len(pred_terms)
            results[f"{name}_top_{self.params['top_k']}"] = top_pred_terms
            results[f"{name}_jaccard_sim"] = jaccard
            results[f"{name}_sp_corr"] = sp_corr
            results[f"{name}_sp_pval"] = sp_pval

        return disease, results, output

    def _run(self):
        """
        Run the experiment.

        Processes every disease, in a process pool when n_processes > 1,
        and stores the outputs in self.outputs and the per-disease summary
        rows in the self.results DataFrame (indexed by disease id).
        """
        results = []
        indices = []
        outputs = {}

        diseases = list(self.diseases_dict.values())
        diseases.sort(key=lambda x: x.split)
        if self.params["n_processes"] > 1:
            with tqdm(total=len(diseases)) as t:
                # NOTE(review): the pool is never closed or joined here, and
                # process_disease_wrapper is defined outside this class.
                p = Pool(self.params["n_processes"])
                for disease, result, output in p.imap(process_disease_wrapper, diseases):
                    results.append(result)
                    indices.append(disease.id)
                    outputs[disease.id] = output
                    t.update()
        else:
            with tqdm(total=len(diseases)) as t:
                for disease in diseases:
                    disease, result, output = self.process_disease(disease)
                    results.append(result)
                    indices.append(disease.id)
                    outputs[disease.id] = output
                    t.update()

        self.outputs = outputs
        self.results = pd.DataFrame(results, index=indices)

    def save_results(self, summary=True):
        """
        Saves the results to a csv using a pandas Data Frame.
        The summary argument is not used.
        """
        print("Saving Results...")
        self.results.to_csv(os.path.join(self.dir, 'results.csv'))

        #if self.params["save_enrichment_results"]:
        #    with open(os.path.join(self.dir,'outputs.pkl'), 'wb') as f:
        #        pickle.dump(self.outputs, f)

    def load_results(self):
        """
        Loads the results from a csv to a pandas Data Frame.
        """
        print("Loading Results...")
        self.results = pd.read_csv(os.path.join(self.dir, 'results.csv'))
def check_group_enrichment(tested_gene_file_name, total_gene_file_name, go_folder, th=1):
    """Return a GO enrichment report of the tested genes against the total genes.

    :param tested_gene_file_name: gene-list file name (str) or the genes themselves
    :param total_gene_file_name: gene-list file name (str) or the genes themselves
    :param go_folder: directory that holds (or receives) the ontology and the
        gene2go association files
    :param th: currently not used
    :return: list of dicts keyed by HG_GO_ROOT, HG_GO_ID, HG_GO_NAME, HG_VALUE
        (population count) and HG_PVAL (uncorrected p-value), sorted by
        ascending p-value; an empty list when either input is empty
    """
    if len(tested_gene_file_name) == 0 or len(total_gene_file_name) == 0:
        return []

    # Each argument is either a file name to load or an in-memory gene list.
    if isinstance(total_gene_file_name, str):
        total_gene_list = load_gene_list(total_gene_file_name)
    else:
        total_gene_list = total_gene_file_name
    if isinstance(tested_gene_file_name, str):
        tested_gene_list = load_gene_list(tested_gene_file_name)
    else:
        tested_gene_list = tested_gene_file_name

    # Downloads go into go_folder, the same directory every read below uses;
    # fetching into another directory would leave these files missing.
    obo_path = os.path.join(go_folder, constants.GO_FILE_NAME)
    if not os.path.exists(obo_path):
        download(constants.GO_OBO_URL, go_folder)
    obo_dag = GODag(obo_path)

    assoc_path = os.path.join(go_folder, constants.GO_ASSOCIATION_FILE_NAME)
    if not os.path.exists(assoc_path):
        if not os.path.exists(assoc_path + ".gz"):
            download(constants.GO_ASSOCIATION_GENE2GEO_URL, go_folder)
        # Unpack the downloaded (or already present) archive.
        archive_path = os.path.join(
            go_folder, os.path.basename(constants.GO_ASSOCIATION_GENE2GEO_URL))
        with gzip.open(archive_path, 'rb') as f_in:
            with open(assoc_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

    # The parsed associations are cached in the module-level `assoc` so later
    # calls skip re-reading the file.
    global assoc
    if assoc is None:
        assoc = read_ncbi_gene2go(assoc_path, no_top=True)

    g = GOEnrichmentStudy(
        [int(cur) for cur in ensembl2entrez_convertor(total_gene_list)],
        assoc,
        obo_dag,
        log=None)
    g_res = g.run_study(
        [int(cur) for cur in ensembl2entrez_convertor(tested_gene_list)])

    hg_report = [{
        HG_GO_ROOT: cur.NS,
        HG_GO_ID: cur.GO,
        HG_GO_NAME: cur.goterm.name,
        HG_VALUE: cur.pop_count,
        HG_PVAL: cur.p_uncorrected
    } for cur in g_res]
    hg_report.sort(key=lambda x: x[HG_PVAL])
    return hg_report
def enrich(gene2go: str,
           study: str,
           obo: str,
           population: str = None,
           geneid2symbol: str = None,
           correct='fdr_bh',
           alpha=0.05,
           top=20,
           goea_out=None,
           dag_out=None,
           dpi=300,
           show_gene_limit=6,
           only_plot_sig=False):
    """GO enrichment based on goatools, followed by one DAG plot per namespace.

    :param gene2go: a file with two columns: gene_id and go_term_id
    :param study: a file with at least one column; the first column contains
        the gene id, the second column the regulation direction
    :param obo: go-basic file downloaded from GeneOntology
    :param population: a file with one gene per row; defaults to all genes
        in the gene2go file
    :param geneid2symbol: file with two columns: gene_id and gene_symbol,
        used for the DAG plot
    :param correct: p-value adjustment method passed to ``multipletests``.
        Can be either the full name or initial letters. Available methods:
        - `bonferroni` : one-step correction
        - `sidak` : one-step correction
        - `holm-sidak` : step down method using Sidak adjustments
        - `holm` : step-down method using Bonferroni adjustments
        - `simes-hochberg` : step-up method (independent)
        - `hommel` : closed method based on Simes tests (non-negative)
        - `fdr_bh` : Benjamini/Hochberg (non-negative)
        - `fdr_by` : Benjamini/Yekutieli (negative)
        - `fdr_tsbh` : two stage fdr correction (non-negative)
        - `fdr_tsbky` : two stage fdr correction (non-negative)
        The value ``'3'`` is accepted as an alias of ``'fdr_bh'``.
    :param alpha: fdr cutoff, default 0.05
    :param top: number of top go terms to plot, in table order
    :param goea_out: output enrichment result file; defaults to
        ``study + '.goea.xls'``
    :param dag_out: dag figure file; defaults to ``study + '.goea.dag.svg'``
    :param dpi: resolution of image, no effect for svg
    :param show_gene_limit: the max number of genes shown in a node
    :param only_plot_sig: only plot the dag for significantly enriched terms
    :return: None
    """
    if str(correct) == '3':
        correct = 'fdr_bh'

    if geneid2symbol:
        with open(geneid2symbol) as handle:
            geneid2symbol = dict(
                x.strip().split()[:2] for x in handle if x.strip())
    else:
        geneid2symbol = dict()

    obo = GODag(obo, optional_attrs=['relationship', 'is_a'])
    gene2go = read_associations(gene2go)

    # Read the study file once; blank lines carry no gene and are skipped.
    with open(study) as handle:
        study_lines = [x.strip() for x in handle if x.strip()]
    study_genes = [x.split()[0] for x in study_lines]
    try:
        reg_dict = dict(x.split()[:2] for x in study_lines)
    except ValueError:
        # Some line has no second column: fall back to "no regulation info",
        # keyed by the whole stripped line as before.
        reg_dict = {x: '' for x in study_lines}

    if not population:
        population = gene2go.keys()
    else:
        with open(population) as handle:
            population = [x.strip().split()[0] for x in handle if x.strip()]

    goea_obj = GOEnrichmentStudy(population,
                                 gene2go,
                                 obo,
                                 propagate_counts=False,
                                 alpha=alpha,
                                 methods=('fdr_bh', ))
    # Drop terms that no study gene is annotated with.
    keep_if = lambda r: r.ratio_in_study[0] != 0
    goea_results_all = goea_obj.run_study(study_genes, keep_if=keep_if)
    goea_out = goea_out or study + '.goea.xls'
    goea_obj.wr_tsv(goea_out, goea_results_all)

    def func(y):
        """Turn 'g1, g2' into 'g1|regulation|symbol;g2|...'."""
        results = []
        for gene in (x.strip() for x in y.split(',')):
            tmp = [gene]
            if gene in reg_dict:
                tmp.append(reg_dict[gene])
            if gene in geneid2symbol:
                tmp.append(geneid2symbol[gene])
            results.append('|'.join(tmp))
        return ';'.join(results)

    table = pd.read_table(goea_out, header=0, index_col=0)
    # Re-correct the p-values with the requested method and overwrite the
    # column written by goatools.
    corrected = multipletests(table['p_uncorrected'], method=correct)[1]
    table['p_fdr_bh'] = corrected
    # Patch the result records too so that the later plotting uses the
    # re-corrected values.
    for record, value in zip(goea_results_all, corrected):
        record.p_fdr_bh = value
    table.columns = [
        x if x != 'p_fdr_bh' else 'p_corrected' for x in table.columns
    ]
    table['enrichment'] = [
        'e' if x <= alpha else 'p' for x in table['p_corrected']
    ]
    table['study_items'] = table.loc[:, 'study_items'].map(func)
    table.to_csv(goea_out, header=True, index=True, sep='\t')

    # -------------------plot dag------------------------
    dag_out = dag_out or study + '.goea.dag.svg'
    for each in ['BP', 'MF', 'CC']:
        if only_plot_sig:
            goea_results_sig = table[table['enrichment'] == 'e']
        else:
            goea_results_sig = table.copy()
        goea_results_sig = goea_results_sig[goea_results_sig['NS'] == each]
        if not goea_results_sig.shape[0]:
            print(f"No significant term to plot for {each} ")
            # Only this namespace is empty; the others may still have terms.
            continue
        if goea_results_sig.shape[0] >= top:
            goea_results_sig = goea_results_sig.iloc[:top]
        goid_subset = list(goea_results_sig.index)
        plot_gos(
            # Insert the namespace before the 4-character extension.
            dag_out[:-4] + '.' + each + dag_out[-4:],
            # Source GO ids. A node missing from the analysis results would
            # be drawn pale green, but that cannot happen here.
            goid_subset,
            obo,
            goea_results=goea_results_all,  # use pvals for coloring
            id2symbol=geneid2symbol,  # print gene symbols, not gene ids
            study_items=show_gene_limit,  # max gene symbols per GO term
            items_p_line=3,  # print 3 genes per line
            dpi=0 if dag_out.endswith('svg') else dpi,
        )
'MEX3A' ] target_gene_prot_ids = get_prot_ids_of_genes(target_gene) gene_prot_id_dict = get_prot_ids_of_genes(gene_names) target_study = list(target_gene_prot_ids.values()) target_study = [x for inside_list in target_study for x in inside_list] study = list(gene_prot_id_dict.values()) study = [x for inside_list in study for x in inside_list] g = GOEnrichmentStudy(pop, assoc, go, propagate_counts=True, alpha=0.05, methods=methods) g_res_target = g.run_study(target_study) pathways_of_target = [x.GO for x in g_res_target if x.study_count > 0] target_assoc = {} for key, entry in assoc.items(): for go_id in pathways_of_target: if go_id in entry: target_assoc[key] = entry g = GOEnrichmentStudy(pop, assoc, go, propagate_counts=True, alpha=0.05, methods=methods) g_res = g.run_study(study) g.prt_txt(sys.stdout, g_res) x = 0
class FunctionalEnrichmentAnalysis(Experiment):
    """GO enrichment of the network nodes with the largest normalized weights."""

    def __init__(self, dir, params):
        """Load associations, network, model weights and the GO study.

        Args:
            dir: experiment directory, forwarded to ``Experiment.__init__``.
            params: experiment parameters, forwarded to
                ``Experiment.__init__``; ``model_path`` is read from it here.
        """
        super().__init__(dir, params)
        log_path = os.path.join(self.dir, 'experiment.log')
        set_logger(log_path, level=logging.INFO, console=True)

        logging.info("Loading disease associations...")
        self.diseases_dict = load_diseases(self.params["associations_path"],
                                           self.params["disease_subset"],
                                           exclude_splits=['none'])

        logging.info("Loading network...")
        self.network = Network(self.params["ppi_network"])
        node_degrees = dict(self.network.nx.degree())
        self.degrees = np.array(list(node_degrees.values()))

        logging.info("Loading weights...")
        models_path = os.path.join(params["model_path"], "models",
                                   "models.tar")
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only point model_path at trusted locations.
        with open(models_path, "rb") as f:
            split_to_model = pickle.load(f)
        # Average the [0, 0] slice of every model's 'ci_weight' over models.
        per_model_weights = [
            model['ci_weight'][0, 0].numpy()
            for model in split_to_model.values()
        ]
        self.ci_weights = np.mean(per_model_weights, axis=0)
        self.ci_weights_norm = self.ci_weights / np.sqrt(self.degrees)

        logging.info("Loading enrichment study...")
        geneid2go = read_ncbi_gene2go("data/go/gene2go.txt", taxids=[9606])
        obodag = GODag("data/go/go-basic.obo")
        self.go_study = GOEnrichmentStudy(self.network.get_names(),
                                          geneid2go,
                                          obodag,
                                          propagate_counts=True,
                                          alpha=0.05,
                                          methods=['fdr_bh'])

    def run_study(self):
        """Run the GO study on the ``top_k`` highest-weighted nodes."""
        top_k = self.params["top_k"]
        ranked_nodes = np.argsort(self.ci_weights_norm)
        top_proteins = self.network.get_names(ranked_nodes[-top_k:])
        self.raw_results = self.go_study.run_study(set(top_proteins))

    def to_csv(self):
        """Write all terms, sorted by corrected p-value, to all_terms.csv."""
        rows = [{
            "name": record.name,
            "pvalue": record.p_fdr_bh,
            "goterm_id": record.goterm.id
        } for record in self.raw_results]
        self.results = sorted(rows, key=lambda row: row["pvalue"])
        out_path = os.path.join(self.dir, "all_terms.csv")
        pd.DataFrame(self.results).to_csv(out_path)
class GeneOntology:
    """Wrapper that runs goatools GO enrichment on the canonical ORF set."""

    def __init__(self, go_obo_path='data/go.obo'):
        """Load the ontology and annotations and build the enrichment study.

        Args:
            go_obo_path: path of the GO ``.obo`` file.
        """
        import os

        canonical_orfs = paper_orfs
        self.obodag = GODag(go_obo_path)

        # read genes containing GO Ontology annotations
        orfs_with_go = read_sgd_orfs()

        # only use canonical orfs dataset: an inner join against the
        # column-less frame keeps just the rows whose index is canonical
        self.orfs_with_go = orfs_with_go.join(canonical_orfs[[]],
                                              how='inner')

        # create mapping of gene names to set of GO annotations
        assoc = defaultdict(set)
        for idx, gene in self.orfs_with_go.iterrows():
            assoc[gene['name']] = set(gene.ontology.split(','))
        self.assoc = assoc

        self.methods = ['fdr_bh', 'bonferroni']
        # Sink for goatools' progress output; os.devnull is the portable
        # spelling of '/dev/null'. Kept open for the object's lifetime.
        self.devnull = open(os.devnull, 'w')

        # create GO enrichment object to run GO
        self.goeaobj = GOEnrichmentStudy(
            assoc.keys(),  # population: every gene with an annotation
            assoc,  # geneid/GO associations
            self.obodag,  # Ontologies
            propagate_counts=False,
            alpha=0.05,  # default significance cut-off
            methods=self.methods,
            log=self.devnull)

    def run_go(self, geneids, sig=0.001):
        """Run gene ontology against set of genes.

        Args:
            geneids: study genes passed to ``run_study``.
            sig: cut-off applied to the ``fdr_bh`` corrected p-value.

        Sets ``goea_results_all``, ``goea_results_sig``, ``results_df``
        (all terms, sorted by ``fdr_bh``) and ``results_sig_df`` (terms
        with ``fdr_bh < sig`` and at least one study gene).
        """
        self.goea_results_all = self.goeaobj.run_study(geneids)
        self.goea_results_sig = [
            r for r in self.goea_results_all
            if (r.p_fdr_bh < sig and r.study_count > 0)
        ]

        cols = [
            'id', 'name', 'pop_count', 'pop_n', 'study_count', 'study_n',
            'pop_items', 'study_items'
        ] + self.methods
        results_dic = {c: [] for c in cols}

        for g in self.goea_results_all:
            results_dic['id'].append(g.GO)
            results_dic['name'].append(g.name)
            for method in self.methods:
                # goatools stores each correction as attribute p_<method>
                results_dic[method].append(
                    getattr(g, 'p_' + method.replace('-', '_')))
            results_dic['study_items'].append(','.join(g.study_items))
            results_dic['pop_items'].append(','.join(g.pop_items))
            results_dic['study_count'].append(g.study_count)
            results_dic['pop_count'].append(g.pop_count)
            results_dic['study_n'].append(g.study_n)
            results_dic['pop_n'].append(g.pop_n)

        results_df = pandas.DataFrame(results_dic)
        self.results_df = results_df[cols].sort_values('fdr_bh').reset_index(
            drop=True)
        self.results_sig_df = self.results_df[
            (self.results_df.fdr_bh < sig)
            & (self.results_df.study_count > 0)]

    def plot_sig(self):
        """Plot the significant results of the last ``run_go`` call."""
        plot_results("test_{NS}.pdf", self.goea_results_sig)
def goe(
        genelist,
        go_file,
        goa_file,
        bg=None,
        nmin=5,
        conversion=None,
        evidence_set={
            'EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'HTP', 'HDA', 'HMP', 'HGI',
            'IBA', 'IBD', 'IKR', 'IRD', 'ISS', 'ISO', 'ISA', 'ISM'
        }):
    """Finds GO enrichment with goatools (0.7.11 tested).

    **WARNING**: This method is inexact for multi-maps in gene name
    conversion. However, it has a negligible effect in top GO component
    removal in single-cell co-expression.

    Parameters
    ------------
    genelist: list of str
        Genes to search for enrichment. Must be a non-empty list.
    go_file: str
        File path for GO DAG (downloadable at
        http://geneontology.org/docs/download-ontology/).
    goa_file: str
        File path for GO associations. See parameter **conversion**.
    bg: list of str
        Background genes. Must contain every gene in genelist when
        conversion is used. Defaults to all genes in goa_file.
    nmin: int
        Minimum number of principal genes required in GO. Values below 1
        are treated as 1.
    conversion: tuple
        Conversion of `gene ID system
        <https://docs.mygene.info/en/latest/doc/data.html>`_ from gene list
        to the GO annotation, as (name_from, name_to, species).

        * name_from: Gene naming system of genelist. For gene names, use
          'symbol,alias'.
        * name_to: Gene naming system of goa_file. Examples:

          * Human: use 'uniprot.Swiss-Prot' (for GO annotations downloaded
            from http://geneontology.org/gene-associations/goa_human.gaf.gz).
          * Mouse: use 'MGI' (for GO annotations downloaded from
            http://current.geneontology.org/annotations/mgi.gaf.gz).

        * species: Species for gene name conversion. Examples: 'human',
          'mouse'.

    evidence_set: set of str
        `GO evidences <http://geneontology.org/docs/guide-go-evidence-codes/>`_
        to include. Defaults to non-expression based results to avoid
        circular reasoning bias.

    Returns
    ----------
    goe: pandas.DataFrame
        GO enrichment, sorted by uncorrected P-value.
    gotop: str
        Top enriched GO ID
    genes: list of str or None
        Intersection list of genes in gotop and also bg. None if bg is None.

    Raises
    ----------
    ValueError
        If goatools writes no result rows, or no GO term has odds ratio
        above 1 with at least nmin study genes.
    """
    from tempfile import NamedTemporaryFile
    from os import linesep
    from goatools.go_enrichment import GOEnrichmentStudy
    from goatools.obo_parser import GODag
    from goatools.associations import read_gaf
    from collections import defaultdict
    import itertools
    from biothings_client import get_client
    import pandas as pd
    import logging
    assert type(genelist) is list and len(genelist) > 0
    if nmin < 1:
        nmin = 1

    # Remember the caller's background: bg itself is overwritten below.
    bg0 = bg
    # Convert gene names
    if conversion is not None:
        assert len(conversion) == 3
        name_from, name_to, species = conversion
        mg = get_client('gene')
        # Query the union of study and background genes in one request.
        ans = set(genelist)
        if bg is not None:
            t1 = set(bg)
            assert len(ans - t1) == 0
            ans |= t1
        ans = list(ans)
        ans = mg.querymany(ans,
                           scopes=name_from,
                           fields=name_to,
                           species=species)
        # Keep only hits that carry the query, a score and the top-level
        # field of name_to.
        t1 = set(['query', '_score', name_to.split('.')[0]])
        ans = list(filter(lambda x: len(t1 - set(x)) == 0, ans))
        # Ascending score: when one query has several hits, the dict built
        # next keeps the last one, i.e. the highest-scoring hit.
        ans = sorted(ans, key=lambda x: x['_score'])
        convert = {x['query']: x for x in ans}
        # Walk down the dotted field path of name_to one level at a time,
        # dropping hits that lack the current level.
        for xi in name_to.split('.'):
            convert = filter(lambda x: xi in x[1], convert.items())
            convert = {x[0]: x[1][xi] for x in convert}
        # A field that is not a plain string is indexed with [0], i.e. only
        # its first entry is used.
        convert = {
            x[0]: x[1] if type(x[1]) is str else x[1][0]
            for x in convert.items()
        }
        genelist2 = list(
            set([convert[x] for x in filter(lambda x: x in convert, genelist)]))
        if bg is not None:
            bg = list(
                set([convert[x] for x in filter(lambda x: x in convert, bg)]))
        # converti: converted ID -> original names, study genes only.
        t1 = set(genelist)
        converti = list(filter(lambda x: x[0] in t1, convert.items()))
        t1 = defaultdict(list)
        for xi in converti:
            t1[xi[1]].append(xi[0])
        converti = dict(t1)
        # convertia: converted ID -> original names, all converted genes.
        t1 = defaultdict(list)
        for xi in convert.items():
            t1[xi[1]].append(xi[0])
        convertia = dict(t1)
    else:
        genelist2 = genelist

    # Load GO DAG and association files
    logging.debug('Reading GO DAG file ' + go_file)
    godag = GODag(go_file)
    logging.debug('Reading GO association file ' + goa_file)
    goa = read_gaf(goa_file, evidence_set=evidence_set)
    if bg is None:
        bg = list(goa.keys())

    # Compute enrichment
    goe = GOEnrichmentStudy(bg, goa, godag)
    ans = goe.run_study(genelist2)
    # Format output
    # goatools writes the table to a temporary file by name; it is then
    # read back through the still-open handle and parsed by hand.
    with NamedTemporaryFile() as f:
        goe.wr_tsv(f.name, ans)
        ans = f.read()
    ans = ans.decode()
    ans = [x.split('\t') for x in ans.split(linesep)]
    # Drop the last row when it has fewer than two fields (trailing line).
    if len(ans[-1]) < 2:
        ans = ans[:-1]
    if len(ans) == 0 or len(ans[0]) == 0:
        raise ValueError('No enrichment found. Check your input ID type.')
    # Strip leading/trailing '#' and spaces from the first header cell.
    ans[0][0] = ans[0][0].strip('# ')
    ans = pd.DataFrame(ans[1:], columns=ans[0])
    ans.drop(['NS', 'enrichment', 'study_count', 'p_sidak', 'p_holm'],
             axis=1,
             inplace=True)
    for xj in ['p_uncorrected', 'p_bonferroni']:
        ans[xj] = pd.to_numeric(ans[xj], errors='raise')
    ans['depth'] = pd.to_numeric(ans['depth'],
                                 errors='raise',
                                 downcast='unsigned')
    # Odds ratio column and sort column
    # NOTE(review): toratio is defined elsewhere in this file; presumably it
    # turns the 'a/b' ratio strings into numbers -- confirm.
    ans['odds_ratio'] = toratio(ans['ratio_in_study']) / toratio(
        ans['ratio_in_pop'])
    ans = ans[[
        'name', 'depth', 'p_uncorrected', 'p_bonferroni', 'odds_ratio',
        'ratio_in_study', 'ratio_in_pop', 'GO', 'study_items'
    ]]
    ans['study_items'] = ans['study_items'].apply(lambda x: x.replace(' ', ''))
    # Convert back study_items
    if conversion is not None:
        ans['study_items'] = ans['study_items'].apply(lambda x: ','.join(
            list(
                itertools.chain.from_iterable(
                    [converti[y] for y in x.split(',')])))
                                                      if len(x) > 0 else x)
    ans.sort_values('p_uncorrected', inplace=True)

    # Get top enriched GO by P-value
    # Candidates must be over-represented (odds ratio > 1) and have at least
    # nmin study genes (the numerator of ratio_in_study).
    gotop = ans[
        (ans['odds_ratio'] > 1)
        & ans['ratio_in_study'].apply(lambda x: int(x.split('/')[0]) >= nmin)]
    if len(gotop) == 0:
        raise ValueError('No GO enrichment found for given criteria.')
    gotop = str(gotop.iloc[0]['GO'])
    if bg0 is not None:
        # Children GOs
        gos = set([gotop] + list(godag.query_term(gotop).get_all_children()))
        # Look for genes
        # Every annotated gene with at least one GO in the top term's subtree.
        genes = list(
            filter(lambda x: len(list(filter(lambda y: y in gos, goa[x]))) > 0,
                   goa))
        if conversion is not None:
            # Map back to the caller's naming system (one ID may map to
            # several original names).
            genes = [
                convertia[x] for x in filter(lambda x: x in convertia, genes)
            ]
            genes = list(set(list(itertools.chain.from_iterable(genes))))
        # Return them in the order of the caller's background list.
        genes = set(genes)
        genes = list(filter(lambda x: x in genes, bg0))
    else:
        genes = None
    return (ans, gotop, genes)
# Abort when too few study genes are present in the background population.
if overlap <= 0.7:
    exit("\nERROR: only {} of genes/proteins in the study are found in the "
         "background population. Please check.\n".format(overlap))

# Build the enrichment study from the command-line arguments.
assoc = read_associations(assoc_fn)
methods = args.method.split(",")
obo_dag = GODag(obo_file=args.obo)
propagate_counts = not args.no_propagate_counts
g = GOEnrichmentStudy(pop,
                      assoc,
                      obo_dag,
                      propagate_counts=propagate_counts,
                      alpha=args.alpha,
                      pvalcalc=args.pvalcalc,
                      methods=methods)
results = g.run_study(study)

if args.outfile is not None:
    # Only write records whose uncorrected p-value is below args.pval;
    # with no threshold every record is written.
    prt_if = ((lambda nt: nt.p_uncorrected < args.pval)
              if args.pval is not None else None)
    # Several comma-separated targets may be given, e.g. one .tsv and one
    # .xlsx file in the same run.
    for outfile in args.outfile.split(","):
        write = g.wr_xlsx if outfile.endswith(".xlsx") else g.wr_tsv
        write(outfile, results, prt_if=prt_if)
else:
    g.print_summary(results,
                    min_ratio=min_ratio,
                    indent=args.indent,
                    pval=args.pval)
def check_group_enrichment_goatools(tested_gene_file_name,
                                    total_gene_file_name,
                                    th=1):
    """Run a goatools GO enrichment of a tested gene set against a background.

    Args:
        tested_gene_file_name: path of a gene-list file (read with
            ``load_gene_list``) or an already loaded sequence of gene ids.
        total_gene_file_name: the background genes, given the same way.
        th: only terms with uncorrected p-value ``<= th`` are reported.

    Returns:
        A list of dicts keyed by ``HG_GO_ROOT``, ``HG_GO_ID``,
        ``HG_GO_NAME``, ``HG_VALUE``, ``HG_PVAL`` and ``HG_QVAL``, sorted by
        ascending uncorrected p-value. ``HG_QVAL`` is always 1 because no
        multiple-test correction is run. An empty list when either input is
        empty.
    """
    if len(tested_gene_file_name) == 0 or len(total_gene_file_name) == 0:
        return []

    if isinstance(total_gene_file_name, str):
        total_gene_list = load_gene_list(total_gene_file_name)
    else:
        total_gene_list = total_gene_file_name
    if isinstance(tested_gene_file_name, str):
        tested_gene_list = load_gene_list(tested_gene_file_name)
    else:
        tested_gene_list = tested_gene_file_name

    obo_path = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    if not os.path.exists(obo_path):
        download(constants.GO_OBO_URL, constants.GO_DIR)
    obo_dag = GODag(obo_path)

    assoc_path = os.path.join(constants.GO_DIR,
                              constants.GO_ASSOCIATION_FILE_NAME)
    if not os.path.exists(assoc_path):
        # Fetch the gzipped association file and unpack it next to itself.
        download(constants.GO_ASSOCIATION_GENE2GEO_URL, constants.GO_DIR)
        gz_path = os.path.join(
            constants.GO_DIR,
            os.path.basename(constants.GO_ASSOCIATION_GENE2GEO_URL))
        with gzip.open(gz_path, 'rb') as f_in, \
                open(assoc_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    assoc = read_ncbi_gene2go(assoc_path, no_top=True)

    sw = Stopwatch()
    sw.start()
    # methods=[] : no multiple-test correction is computed.
    g = GOEnrichmentStudy(
        [int(cur) for cur in ensembl2entrez_convertor(total_gene_list)],
        assoc,
        obo_dag,
        methods=[],
        log=None)
    g_res = g.run_study(
        [int(cur) for cur in ensembl2entrez_convertor(tested_gene_list)])
    # Function-call form is valid on both Python 2 and Python 3.
    print(sw.stop("done GO analysis in "))

    hg_report = [{
        HG_GO_ROOT: cur.NS,
        HG_GO_ID: cur.GO,
        HG_GO_NAME: cur.goterm.name,
        HG_VALUE: cur.pop_count,
        HG_PVAL: cur.p_uncorrected,
        HG_QVAL: 1
    } for cur in g_res if cur.p_uncorrected <= th]
    hg_report.sort(key=lambda x: x[HG_PVAL])
    return hg_report
def get_GO(gene_query, species='mouse'):
    """
    Get Gene Ontologies (GOs).

    Args:
        gene_query (array of str): gene list.

        species (str): Select species. Either "mouse" or "human"

    Returns:
        pandas.dataframe: GO analysis results as dataframe. Empty when the
        enrichment study returns no records.

    Raises:
        ValueError: if species is neither "mouse" nor "human".
    """
    # Fail early with a clear message instead of an unbound-variable error
    # further down.
    if species not in ('human', 'mouse'):
        raise ValueError(
            "species must be 'mouse' or 'human', got {!r}".format(species))

    GOIs = gene_query

    # prepare files
    # check files
    _check_data_and_download_if_necessary(go_folder)
    obodag = GODag(os.path.join(go_folder, "go-basic.obo"))

    # go analysis: pick associations, population and symbol->Entrez table
    # for the requested species
    if species == 'human':
        from goatools.test_data.genes_NCBI_9606_ProteinCoding import GENEID2NT as geneid2nt
        geneid2gos = read_ncbi_gene2go(os.path.join(go_folder, "gene2go.txt"),
                                       taxids=[9606])
        print("{N:,} annotated genes".format(N=len(geneid2gos)))
        Xtable = pd.read_csv(os.path.join(go_folder, 'hg19_xref.txt'),
                             sep='\t')
        Xtable.index = Xtable['Approved Symbol']
    else:
        from goatools.test_data.genes_NCBI_10090_ProteinCoding import GENEID2NT as geneid2nt
        geneid2gos = read_ncbi_gene2go(os.path.join(go_folder, "gene2go.txt"),
                                       taxids=[10090])
        print("{N:,} annotated genes".format(N=len(geneid2gos)))
        Xtable = pd.read_csv(os.path.join(go_folder, 'biomart_xref.mm10.txt'),
                             sep='\t')
        Xtable = Xtable[['Associated Gene Name', 'EntrezGene ID']].dropna()
        Xtable.index = Xtable['Associated Gene Name']

    GOIs_entrez = [
        int(x)
        for x in np.unique(Xtable.loc[GOIs].dropna()['EntrezGene ID'])
    ]

    print("processing " + str(len(GOIs)) + " genes ...")
    goeaobj = GOEnrichmentStudy(
        geneid2nt.keys(),  # protein-coding genes of the selected species
        geneid2gos,  # geneid/GO associations
        obodag,  # Ontologies
        propagate_counts=False,
        alpha=0.05,  # default significance cut-off
        methods=['fdr_bh'])  # default multipletest correction method
    goea_results = goeaobj.run_study(GOIs_entrez)

    # Check for an empty result before reading the field names off the
    # first record.
    if not goea_results:
        print("Found No GO with significant p-value")
        return pd.DataFrame([])

    go_default_output = goea_results[0].get_prtflds_default()
    df_GO = pd.DataFrame(
        [i.get_field_values(go_default_output) for i in goea_results])
    df_GO.columns = go_default_output
    df_GO["genes"] = df_GO.study_items.apply(
        lambda x: _ids2symbols(x, species))
    return df_GO
genename_2_id[s[2]] = s[1] id_2_genename = {genename_2_id[i]: i for i in genename_2_id} # Only looking at "biological process" GO terms geneid2gos_yeast = read_gaf('../accessory_files/gene_association.sgd', namespace='BP') ids = [i for i in geneid2gos_yeast.keys()] background_set = [genename_2_id[i] for i in genename_2_id] goeaobj = GOEnrichmentStudy( background_set, # List of all genes in analysis geneid2gos_yeast, # geneid/GO associations obodag, # Ontologies propagate_counts=False, alpha=0.05, # default significance cut-off methods=['fdr_bh']) # defult multipletest correction method goea_results_all = goeaobj.run_study(multi_hit_sgdids, keep_if=lambda x: x.p_uncorrected < 0.05) go_results = sorted(goea_results_all, key=lambda r: r.p_fdr_bh) cols = [ 'GO ID', 'GO term', 'pval_uncorrected', 'pval_benjamini_hochberg', 'num_hits', 'num_in_group', 'hits' ] big_mat = [] for res in go_results: big_mat.append([ res.GO, res.name, res.p_uncorrected, res.p_fdr_bh, res.ratio_in_study[0], res.ratio_in_pop[0], ';'.join([id_2_genename[i] for i in res.study_items]) ]) pd.DataFrame(big_mat,
out = open( 'coord_{}_A-{}_B-{}.tsv'.format(strategy, species[key[0]]['vulgar'], species[key[1]]['vulgar']), 'w') out.write('\n'.join([ '{}\t{}\t{}'.format(c, b * reso, b * reso + reso) for c, b in pos_list ]) + '\n') out.close() out = open( 'genes_{}_A-{}_B-{}.tsv'.format(strategy, species[key[0]]['vulgar'], species[key[1]]['vulgar']), 'w') out.write('\n'.join(gene_list) + '\n') out.close() # run GSE test print(' - Repressed ({:5,d}): '.format(len(gene_list))) results_all = goeaobj.run_study( [reverse[k] for k in gene_list if k in reverse], log=None) results_sig = [r for r in results_all if r.p_fdr_bh < 0.05] GO_results[spe]['Repressed'] = results_all if results_sig: for r, s in sorted([(r.get_pvalue(), r.name) for r in results_sig]): print(' -> {:8.3g} {}'.format(r, s)) plot_results("sign_{}_pv05_repressed.png".format(spe.replace(' ', ' ')), results_sig, log=open('/dev/null', 'w')) print('.' * 50) pos_list = [(c, b) for c, b in results[key] if test2(key, c, b) and np.isfinite(results[key][c, b][which])] gene_list = reduce(lambda x, y: x + y, [genes.get((c, b), []) for c, b in pos_list])
from goatools.obo_parser import GODag obodag = GODag("go-basic.obo") from goatools.go_enrichment import GOEnrichmentStudy goeaobj = GOEnrichmentStudy( uniprot_df.Entry, ns2assoc, # geneid/GO associations obodag, # Ontologies propagate_counts=False, alpha=0.001, # default significance cut-off methods=['fdr_bh']) # default multipletest correction method gos = [] for e in set(uniprot_df[uniprot_df['n'] >= 2]['Encoding']): goea_results = goeaobj.run_study( list(uniprot_df[uniprot_df['Encoding'] == e].Entry)) for r in goea_results: if (r.enrichment == 'e'): id = r.goterm.id name = r.name cat = r.goterm.namespace members = r.study_items p = r.p_fdr_bh gos.append([id, name, cat, e, members, p]) godf = pd.DataFrame( gos, columns=['id', 'name', 'category', 'encoding', 'members', 'p']) godf['unique?'] = ~godf['name'].duplicated(keep=False) godf['member_count'] = godf.members.apply(lambda x: len(x)) godf['representation'] = godf.apply(lambda row: row.member_count / int( np.unique(uniprot_df[uniprot_df.Encoding == row.encoding]['n'])),
elif snakemake.wildcards.state_type == 'Enhancer': min_dist = 5000 max_dist = 50000 else: sys.exit(-1) with open(snakemake.input.clusters) as f: for line in f: cols = line.strip().split() cluster = chr(int(cols[3]) + 65) if int(cols[-1]) <= max_dist and int(cols[-1]) >= min_dist: genes[cluster].add(cols[7]) background.add(cols[7]) obodag = GODag("go-basic.obo") id2go = read_associations("sym2go.txt") goeaobj = GOEnrichmentStudy(background, id2go, obodag, propagate_counts=False, alpha=0.05, methods=['fdr_bh']) outfile = open(snakemake.output.txt, 'w') for cluster, geneids in sorted(genes.items()): outfile.write("Cluster {}\n".format(cluster)) goea_results_all = goeaobj.run_study(geneids) for fdr, name, enrichment in sorted([(r.p_fdr_bh, r.name, r.enrichment) for r in goea_results_all if r.p_fdr_bh < 0.2]): outfile.write("\t{}\t{}\t{}\n".format(name, fdr, enrichment)) outfile.write("\n") #GOEnrichmentStudy.print_summary(goea_results_sig)
# Restrict the annotations and associations to the genes in C_int_UP.
dictionary = {gene: funcs[gene] for gene in funcs if gene in C_int_UP}
GO_IDs = {gene: assoc[gene] for gene in assoc if gene in C_int_UP}
study = dictionary.keys()

g = GO_en(pop,
          assoc,
          go,
          propagate_counts=True,
          alpha=0.05,
          methods=["bonferroni", "sidak", "holm", "fdr"])
g_res = g.run_study(study)

# Select GO terms whose Bonferroni-corrected / FDR p-value is at most 0.01,
# kept as (GO id, p-value) pairs.
s_bonferroni = [(rec.goterm.id, rec.p_bonferroni) for rec in g_res
                if rec.p_bonferroni <= 0.01]
s_fdr = [(rec.goterm.id, rec.p_fdr) for rec in g_res if rec.p_fdr <= 0.01]

enriched_GO_ID_bon = [go_id for go_id, _ in s_bonferroni]
enriched_GO_ID_fdr = [go_id for go_id, _ in s_fdr]

#Only select genes with GO terms that are enriched
from goatools.obo_parser import GODag

obodag = GODag("go-basic.obo")

from goatools.go_enrichment import GOEnrichmentStudy

goeaobj = GOEnrichmentStudy(
    df.Entry,  # population: every entry of the table
    ns2assoc,  # geneid/GO associations
    obodag,  # Ontologies
    propagate_counts=False,
    alpha=0.001,  # significance cut-off
    methods=['fdr_bh'])  # multipletest correction method

# One enrichment run per encoding whose 'n' value is at least 2; keep the
# terms that are enriched ('e') with corrected p-value below 0.001.
gos = []
for e in set(df[df['n'] >= 2]['Encoding']):
    goea_results = goeaobj.run_study(list(df[df['Encoding'] == e].Entry))
    for r in goea_results:
        if r.p_fdr_bh < 0.001 and r.enrichment == 'e':
            gos.append(
                [r.goterm.id, r.name, r.goterm.namespace, e, r.study_items])

godf = pd.DataFrame(
    gos, columns=['id', 'name', 'category', 'encoding', 'members'])
# A term is unique when its name appears for exactly one encoding.
godf['unique?'] = ~godf['name'].duplicated(keep=False)
# Number of member genes first, then divided by the 'n' value of the
# term's encoding (int() requires that value to be unique per encoding).
godf['representation'] = godf.members.apply(len)
godf['representation'] = godf.apply(
    lambda row: row.representation / int(
        np.unique(df[df.Encoding == row.encoding]['n'])),
    axis=1)

#what % of encodings have at least one significant GO?