def get_go_terms_descendants(biomart_fpath, gene2go_fpath, gene_count_fpath, obo_fpath, ev_codes=None):
    """Build one GOterm per GO ID listed in a gene-count file.

    For every GO ID read from ``gene_count_fpath`` a ``GOterm`` is created, its
    descendants are added through a ``GoSearch`` helper, and the Ensembl IDs of
    all human genes annotated to the term or to any of its descendants are
    collected in ``term.genes``.

    :param biomart_fpath: path passed to ``map_entrez_to_ensembl``
    :param gene2go_fpath: path to the NCBI gene2go association file
    :param gene_count_fpath: tab-separated file whose first column is a GO ID;
        its first two lines are skipped
    :param obo_fpath: path to the GO OBO file handed to ``GoSearch``
    :param ev_codes: optional evidence codes; when truthy they are passed to
        ``read_ncbi_gene2go`` as ``evidence_set``
    :return: list of GOterm objects, in file order
    """
    entrez_to_ensembl = map_entrez_to_ensembl(biomart_fpath)

    # taxids=[9606] means select only human.
    gene2go_kwargs = {'taxids': [9606], 'go2geneids': True}
    if ev_codes:
        # Only forward evidence_set when the caller actually supplied codes.
        gene2go_kwargs['evidence_set'] = ev_codes
    go_to_entrez_ids_human = read_ncbi_gene2go(gene2go_fpath, **gene2go_kwargs)
    print("{N} GO terms associated with human NCBI Entrez GeneIDs".format(N=len(go_to_entrez_ids_human)))

    srchhelp = GoSearch(obo_fpath, go2items=go_to_entrez_ids_human)

    # Get the GO terms
    skip_lines = 2
    go_terms = []
    # The context manager guarantees the file is closed, even on error.
    with open(gene_count_fpath) as gene_cnt_file:
        for line_no, line in enumerate(gene_cnt_file):
            if line_no < skip_lines:
                continue
            go_id = line.split('\t')[0]
            term = GOterm(go_id)
            term.add_descendants(srchhelp)
            for term_id in [go_id] + term.descendants_ids:
                for ent_id in go_to_entrez_ids_human[term_id]:
                    entrez_key = str(ent_id)
                    # Entrez IDs without an Ensembl mapping are skipped.
                    if entrez_key in entrez_to_ensembl:
                        term.genes.add(entrez_to_ensembl[entrez_key])
            go_terms.append(term)
    return go_terms
def fetch_go_hierarcy(go_folder, ev_exclude):
    """Make sure the GO files exist locally, then load human gene/GO associations.

    :param go_folder: folder checked for the (possibly gzipped) association file
        and used as source/target of the decompression step
    :param ev_exclude: forwarded to ``read_ncbi_gene2go`` as ``ev_exclude``
    :return: tuple ``(go2geneids, geneids2go)`` for taxid 9606

    NOTE(review): the existence checks and the gunzip step use ``go_folder``
    while the download target and the file finally read live in
    ``constants.GO_DIR`` - presumably both are meant to be the same directory;
    confirm with the callers.
    """
    go_dir = constants.GO_DIR
    association_url = constants.GO_ASSOCIATION_GENE2GEO_URL
    association_name = constants.GO_ASSOCIATION_FILE_NAME

    # Ontology (OBO) file: fetch it only when it is missing.
    obo_file_location = os.path.join(go_dir, constants.GO_FILE_NAME)
    if not os.path.exists(obo_file_location):
        wget.download(constants.GO_OBO_URL, obo_file_location)

    print("Downloading gene-GO associations")
    association_file_location = os.path.join(go_dir, association_name)

    unpacked_in_folder = os.path.join(go_folder, association_name)
    if not os.path.exists(unpacked_in_folder):
        packed_in_folder = os.path.join(go_folder, association_name + ".gz")
        if not os.path.exists(packed_in_folder):
            wget.download(association_url,
                          os.path.join(go_dir, os.path.split(association_url)[1]))
        # Decompress the downloaded archive next to it.
        archive_path = os.path.join(go_folder, os.path.basename(association_url))
        with gzip.open(archive_path, 'rb') as f_in:
            with open(unpacked_in_folder, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

    print("Loading gene-GO associations")
    go2geneids = read_ncbi_gene2go(association_file_location,
                                   taxids=[9606],
                                   go2geneids=True,
                                   ev_exclude=ev_exclude)
    geneids2go = read_ncbi_gene2go(association_file_location,
                                   taxids=[9606],
                                   ev_exclude=ev_exclude)
    return (go2geneids, geneids2go)
def __init__(self, taxid, fin_gene2go, fin_gobasic):
    """Download (if needed) and load the gene2go associations and the GO DAG.

    ``fin_gene2go`` and ``fin_gobasic`` are file names resolved relative to
    ``REPO``; associations are read for the single taxon ``taxid``.
    """
    gene2go_path = os.path.join(REPO, fin_gene2go)
    dnld_ncbi_gene_file(gene2go_path, loading_bar=None)
    self.gene2go = read_ncbi_gene2go(gene2go_path, [taxid])
    self.godag = get_godag(os.path.join(REPO, fin_gobasic), loading_bar=None)
def __init__(self, tax_id=9606, logger=None, force_update=False, go_dir=DEFAULT_GO_DIR, bg_genes=None):
    """Prepare the GO directory, then fetch and parse the OBO and association files.

    :param tax_id: NCBI taxonomy id whose associations are loaded
    :param logger: logger to use; a console logger named after the class is
        created when omitted
    :param force_update: forwarded to ``check_and_get_obo`` / ``check_and_get_gaf``
    :param go_dir: directory holding the GO files; created when missing
    :param bg_genes: optional background genes, registered via ``set_bg_genes``
    """
    # gene_converter can be used to enable automatic gene conversion
    self.gene_converter = None
    self.logger = logger or log.get_console_logger(self.__class__.__name__)
    self.tax_id = tax_id

    if not os.path.isdir(go_dir):
        # Logger.warn is a deprecated alias; warning() is the supported spelling.
        self.logger.warning("Creating master GO directory at %s.", go_dir)
        os.makedirs(go_dir)
    else:
        self.logger.info("Using existing GO directory at %s.", go_dir)
    self.base_dir = go_dir

    # get filenames and parse both GAF and OBO
    self.obo_fn = self.check_and_get_obo(force_update=force_update)
    self.gaf_fn = self.check_and_get_gaf(force_update=force_update)
    self.obo = obo_parser.GODag(self.obo_fn)
    self.gaf = associations.read_ncbi_gene2go(self.gaf_fn, taxids=[self.tax_id])
    # NOTE(review): the message says "human" although tax_id is configurable.
    self.logger.info("{N:,} annotated human genes".format(N=len(self.gaf)))

    self.bg_genes = bg_genes
    if self.bg_genes is not None:
        self.set_bg_genes(bg_genes)
def __init__(self, dir, params):
    """Load everything the experiment needs.

    Sets up logging to ``<dir>/experiment.log`` and loads the disease
    associations, the PPI network, the averaged ``ci_weight`` of the stored
    models and a GO enrichment study over the network's node names.

    Args:
        dir (string) directory of the experiment to be run
        params (dict) reads ``associations_path``, ``disease_subset``,
            ``ppi_network`` and ``model_path``
    """
    super().__init__(dir, params)

    set_logger(os.path.join(self.dir, 'experiment.log'), level=logging.INFO, console=True)

    logging.info("Loading disease associations...")
    self.diseases_dict = load_diseases(self.params["associations_path"],
                                       self.params["disease_subset"],
                                       exclude_splits=['none'])

    logging.info("Loading network...")
    self.network = Network(self.params["ppi_network"])
    self.degrees = np.array(list(dict(self.network.nx.degree()).values()))

    logging.info("Loading weights...")
    # NOTE: pickle.load executes arbitrary code from the file - only point
    # model_path at trusted model archives.
    with open(os.path.join(params["model_path"], "models", "models.tar"), "rb") as f:
        split_to_model = pickle.load(f)

    # Average the per-split weights into a single vector.
    self.ci_weights = np.mean([model['ci_weight'][0, 0].numpy()
                               for model in split_to_model.values()], axis=0)
    self.ci_weights_norm = self.ci_weights / np.sqrt(self.degrees)

    logging.info("Loading enrichment study...")
    geneid2go = read_ncbi_gene2go("data/go/gene2go.txt", taxids=[9606])
    obodag = GODag("data/go/go-basic.obo")
    self.go_study = GOEnrichmentStudy(self.network.get_names(),
                                      geneid2go,
                                      obodag,
                                      propagate_counts=True,
                                      alpha=0.05,
                                      methods=['fdr_bh'])
def check_group_enrichment(tested_gene_file_name, total_gene_file_name):
    """Run a GO enrichment of the tested genes against the total gene list.

    Missing GO files are downloaded into ``constants.GO_DIR`` first. Terms with
    ``p_fdr_bh <= 0.05`` are kept and written as one row through
    ``print_to_excel``.
    """
    total_gene_list = load_gene_list(total_gene_file_name)
    tested_gene = load_gene_list(tested_gene_file_name)

    go_dir = constants.GO_DIR
    obo_path = os.path.join(go_dir, constants.GO_FILE_NAME)
    if not os.path.exists(obo_path):
        download(constants.GO_OBO_URL, go_dir)
    obo_dag = GODag(obo_path)

    assoc_path = os.path.join(go_dir, constants.GO_ASSOCIATION_FILE_NAME)
    if not os.path.exists(assoc_path):
        download(constants.GO_ASSOCIATION_GENE2GEO_URL, go_dir)
        # Unpack the downloaded archive into the plain association file.
        archive_path = os.path.join(go_dir, os.path.basename(constants.GO_ASSOCIATION_GENE2GEO_URL))
        with gzip.open(archive_path, 'rb') as f_in:
            with open(assoc_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
    assoc = read_ncbi_gene2go(assoc_path, no_top=True)

    population = [int(cur) for cur in ensembl2entrez_convertor(total_gene_list)]
    g = GOEnrichmentStudy(population, assoc, obo_dag, methods=["bonferroni", "fdr_bh"])
    study = [int(cur) for cur in ensembl2entrez_convertor(tested_gene)]
    g_res = g.run_study(study)

    GO_results = []
    for cur in g_res:
        if cur.p_fdr_bh <= 0.05:
            GO_results.append((cur.NS, cur.GO, cur.goterm.name, cur.p_uncorrected, cur.p_fdr_bh))

    if len(GO_results) > 0:
        go_ns, go_terms, go_names, uncorrectd_pvals, FDRs = zip(*GO_results)
    else:
        go_ns, go_terms, go_names, uncorrectd_pvals, FDRs = [], [], [], [], []

    sep = "\r\n"
    row = (sep.join(e2g_convertor(tested_gene)),
           sep.join(go_ns),
           sep.join(go_terms),
           sep.join(go_names),
           sep.join(map(str, uncorrectd_pvals)),
           sep.join(map(str, FDRs)))
    output_rows = [row]
    print_to_excel(output_rows, tested_gene_file_name, total_gene_file_name)
def _get_id2gos(file_assc, taxids, log):
    """Return the taxid2asscs mapping filled while reading the associations.

    The association file ``REPO/file_assc`` is downloaded if needed and read
    for ``taxids``; the number of items found is written to ``log``.
    """
    assc_path = os.path.join(REPO, file_assc)
    dnld_ncbi_gene_file(assc_path, loading_bar=None)

    # Three-level nested mapping whose leaves are sets; filled by the reader.
    taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    id2gos = read_ncbi_gene2go(assc_path, taxids, taxid2asscs=taxid2asscs)

    num_items = len(id2gos)
    log.write(" {N} items found in gene2go from NCBI's ftp server\n".format(N=num_items))
    return taxid2asscs
def fetch_go_hierarcy():
    """Download the GO files when missing and load human gene/GO associations.

    :return: tuple ``(go2geneids, geneids2go)`` read for taxid 9606
    """
    obo_file_location = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    if not os.path.exists(obo_file_location):
        wget.download(constants.GO_OBO_URL, obo_file_location)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Downloading gene-GO associations")
    association_file_location = os.path.join(constants.GO_DIR, constants.GO_ASSOCIATION_FILE_NAME)
    if not os.path.exists(association_file_location):
        # NOTE(review): the download is stored under the association file name
        # as-is; if the URL points at a gzip archive it is not unpacked here.
        wget.download(constants.GO_ASSOCIATION_GENE2GEO_URL, association_file_location)

    print("Loading gene-GO associations")
    go2geneids = read_ncbi_gene2go(association_file_location, taxids=[9606], go2geneids=True)
    geneids2go = read_ncbi_gene2go(association_file_location, taxids=[9606])

    return (go2geneids, geneids2go)
def test_anno_read():
    """Test reading an NCBI gene2go annotation file."""
    fin_anno = os.path.join(REPO, 'gene2go')
    _dnld_anno(fin_anno)

    # --- Gene2GoReader: default versus all taxids ---
    print('\nTEST STORING ONLY ONE SPECIES')
    reader = Gene2GoReader(fin_anno)
    assert len(reader.taxid2asscs) == 1
    reader.prt_summary_anno2ev()

    print('\nTEST STORING ALL SPECIES')
    reader = Gene2GoReader(fin_anno, taxids=True)
    num_taxids = len(reader.taxid2asscs)
    assert num_taxids > 1, '**EXPECTED MORE: len(taxid2asscs) == {N}'.format(N=num_taxids)
    reader.prt_summary_anno2ev()

    # --- old versus new reader: gene -> GO for human ---
    print('\nTEST GETTING ASSOCIATIONS FOR ONE SPECIES')
    print("\nTEST read_ncbi_gene2go_old: [9606]")
    old_g2go_hsa = read_ncbi_gene2go_old(fin_anno, [9606])
    assert old_g2go_hsa == read_ncbi_gene2go(fin_anno, [9606])
    print("\nTEST read_ncbi_gene2go_old: 9606")
    assert old_g2go_hsa == read_ncbi_gene2go(fin_anno, 9606)
    print("\nTEST read_ncbi_gene2go_old: None")
    assert old_g2go_hsa == read_ncbi_gene2go(fin_anno, None)

    # --- old versus new reader: GO -> genes ---
    print('\nTEST GETTING REVERSE ASSOCIATIONS: GO2GENES')
    print("\nTEST read_ncbi_gene2go_old: 9606 go2geneids=True")
    old_go2gs_hsa = read_ncbi_gene2go_old(fin_anno, [9606], go2geneids=True)
    new_go2gs_hsa = read_ncbi_gene2go(fin_anno, 9606, go2geneids=True)
    print('OLD:', next(iter(old_go2gs_hsa.items())))
    print('NEW:', next(iter(new_go2gs_hsa.items())))
    assert old_go2gs_hsa == new_go2gs_hsa

    # --- old versus new reader: restricted to a set of evidence codes ---
    print('\nTEST RETURNING ASSOCIATIONS FOR SELECTED EVIDENCE CODES')
    evcodes = {'ISO', 'IKR'}
    print("\nTEST read_ncbi_gene2go_old: 9606 evcodes=True")
    old_gene2gos_evc = read_ncbi_gene2go_old(fin_anno, [9606], evidence_set=evcodes)
    new_gene2gos_evc = read_ncbi_gene2go(fin_anno, 9606, evidence_set=evcodes)
    print('OLD:', next(iter(old_gene2gos_evc.items())))
    print('NEW:', next(iter(new_gene2gos_evc.items())))
    assert old_gene2gos_evc == new_gene2gos_evc
def fetch_go_hierarcy():
    """Load the GO DAG and human associations and dump gene/term links to a file.

    For every GO ID with human gene associations that is present in the OBO
    file, three kinds of tab-separated rows are written to a time-stamped file
    in ``constants.OUTPUT_GLOBAL_DIR``: ``genes`` (gene, term), ``is a``
    (child, term) and one row per reverse relationship (type, source, term).

    :return: tuple ``(genes, isa, relship)`` holding the same links as lists
        of tuples
    """
    obo_file_location = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    if not os.path.exists(obo_file_location):
        wget.download(constants.GO_OBO_URL, obo_file_location)

    # 'relationship' is requested so entries expose relationship_rev below.
    go = obo_parser.GODag(obo_file_location, optional_attrs=['relationship'])

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Downloading gene-GO associations")
    association_file_location = os.path.join(constants.GO_DIR, constants.GO_ASSOCIATION_FILE_NAME)
    if not os.path.exists(association_file_location):
        wget.download(constants.GO_ASSOCIATION_GENE2GEO_URL, association_file_location)

    print("Loading gene-GO associations")
    go2geneids_human = read_ncbi_gene2go(association_file_location, taxids=[9606], go2geneids=True)

    print("Writing out GO child-parent links")
    if not os.path.exists(constants.OUTPUT_GLOBAL_DIR):
        os.makedirs(constants.OUTPUT_GLOBAL_DIR)

    out_fname = "go_output_{}_{}.txt".format(constants.CANCER_TYPE, time.time())
    genes = []
    isa = []
    relship = []

    with open(os.path.join(constants.OUTPUT_GLOBAL_DIR, out_fname), 'w') as o:
        # Iterate over a snapshot of the keys: the lookup by entry.id below
        # could add a key if the mapping creates missing entries on access.
        for goid in list(go2geneids_human):
            if goid not in go:  # replaces the Python 2-only dict.has_key
                print("GO obo file does not contain {}".format(goid))
                continue
            entry = go[goid]
            for gene in go2geneids_human[entry.id]:
                genes.append((str(gene), entry.id))
                o.write("{}\t{}\t{}\n".format("genes", *genes[-1]))
            for c in entry.children:
                isa.append((c.id, entry.id))
                o.write("{}\t{}\t{}\n".format("is a", *isa[-1]))
            for rtype, rs in entry.relationship_rev.items():
                for r in rs:
                    relship.append((rtype, r.id, entry.id))
                    o.write("{}\t{}\t{}\n".format(rtype, *relship[-1]))

    return (genes, isa, relship)
def load_ontologies_and_associations(self):
    """Download (if needed) and load the GO DAG and human gene/GO associations.

    :return: tuple ``(obodag, geneid2gos_human)``
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("---LOADING ONTOLOGIES AND ASSOCIATIONS---")

    # Check if files exist and download if not
    obo_fname = download_go_basic_obo()
    gene2go = download_ncbi_associations()

    # Load ontologies and associations; read the file that was actually
    # downloaded instead of a hard-coded "gene2go" name.
    obodag = GODag(obo_fname)
    geneid2gos_human = read_ncbi_gene2go(gene2go, taxids=[9606])

    print("{N:,} annotated human genes".format(N=len(geneid2gos_human)))

    return obodag, geneid2gos_human
def test_all():
    """Run numerous tests for various reports."""
    # NOTE(review): these two results are not used below; the loads are kept
    # because they may serve as a smoke check of the real GO files - confirm.
    obo_dag = GODag(os.path.join(constants.GO_DIR, constants.GO_FILE_NAME))
    assoc = read_ncbi_gene2go(os.path.join(constants.GO_DIR, constants.GO_ASSOCIATION_FILE_NAME), no_top=True)

    # Write the full hierarchy of the bundled mini ontology to stdout.
    dag_fin = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data/mini_obo.obo")
    godag = GODag(dag_fin)
    gosubdag = GoSubDag(godag.keys(), godag)

    out = sys.stdout
    write_hier_all(gosubdag, out)
def get_ensembl_ids(go_process_id, biomart_fpath):
    """Return the Ensembl IDs of the human genes annotated to one GO term.

    :param go_process_id: GO ID to look up in the GO -> Entrez mapping
    :param biomart_fpath: path passed to ``map_entrez_to_ensembl``
    :return: list of Ensembl IDs; Entrez IDs without a mapping are skipped
    """
    entrez_to_ensembl = map_entrez_to_ensembl(biomart_fpath)

    gene2go = download_ncbi_associations()

    # taxids=[9606] means select only human.
    # TODO: ask Marinka if we should use EXP code for evidence!!
    #       (i.e. pass evidence_set='EXP' to read_ncbi_gene2go)
    go_to_entrez_ids_human = read_ncbi_gene2go(gene2go, taxids=[9606], go2geneids=True)

    # Use the argument; the previous code read a global GO_PROCESS_ID and
    # ignored the parameter entirely.
    entrez_ids = go_to_entrez_ids_human[go_process_id]

    ensembl_ids = []
    for ent_id in entrez_ids:
        entrez_key = str(ent_id)
        if entrez_key in entrez_to_ensembl:
            ensembl_ids.append(entrez_to_ensembl[entrez_key])

    print("{N} GO terms associated with human NCBI Entrez GeneIDs".format(N=len(go_to_entrez_ids_human)))

    return ensembl_ids
def main(args):
    """Compute one similarity value per path and save them as text.

    Reads the paths, the GO DAG and the human gene2go associations named in
    ``args``, optionally restricts the DAG to one namespace (``cc``, ``mf`` or
    ``bp``), maps ``wrap`` over the paths and stores the results with
    ``np.savetxt``.

    Raises:
        ValueError: if ``args.namespace`` is set to anything but cc, mf or bp.
    """
    log_handlers = [logging.FileHandler("../logs/report.log"), logging.StreamHandler()]
    logging.basicConfig(level=logging.INFO,
                        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
                        handlers=log_handlers)
    logging.info(args)

    paths = utils.read_paths(args.paths_file)
    go = obo_parser.GODag(args.obo_file)
    gene2go = read_ncbi_gene2go(args.gene2go_file, taxids=[9606])
    # Term counts are computed on the complete DAG, before namespace filtering.
    termcounts = TermCounts(go, gene2go)

    if args.namespace is not None:
        if args.namespace == 'cc':
            wanted_namespace = 'cellular_component'
        elif args.namespace == 'mf':
            wanted_namespace = 'molecular_function'
        elif args.namespace == 'bp':
            wanted_namespace = 'biological_process'
        else:
            raise ValueError('namespace can be only cc, mf or bp')
        go = {
            go_term: values
            for go_term, values in go.items()
            if values.namespace == wanted_namespace
        }

    wrapped = [[path, go, gene2go, termcounts] for path in paths]

    if args.n_cores > 1:
        sims = list(p_map(wrap, wrapped))
    else:
        sims = list(map(wrap, tqdm(wrapped)))

    utils.create_dir_if_not_exist(dirname(args.out_sims_file))
    np.savetxt(args.out_sims_file, sims)
def __init__(self, dir, params):
    """
    Constructor

    Loads the disease associations, the network, a GO enrichment study and
    the per-method prediction tables, and restores previously saved outputs
    from ``<dir>/outputs.pkl`` when that file exists.

    Args:
        dir (string) directory of the experiment to be run
        params (dict) experiment parameters
    """
    super().__init__(dir, params)

    # Set the logger
    log_path = os.path.join(self.dir, 'experiment.log')
    set_logger(log_path, level=logging.INFO, console=True)

    # Log title
    for banner_line in ("Disease Protein Prediction",
                        "Sabri Eyuboglu -- SNAP Group",
                        "======================================"):
        logging.info(banner_line)

    logging.info("Loading Disease Associations...")
    associations_path = self.params["associations_path"]
    disease_subset = self.params["disease_subset"]
    self.diseases_dict = load_diseases(associations_path,
                                       disease_subset,
                                       exclude_splits=['none'])

    logging.info("Loading Network...")
    self.network = Network(self.params["ppi_network"])

    logging.info("Loading enrichment study...")
    obodag = GODag(self.params["go_path"])
    geneid2go = read_ncbi_gene2go(self.params["gene_to_go_path"], taxids=[9606])
    self.enrichment_study = GOEnrichmentStudy(self.network.get_names(),
                                              geneid2go,
                                              obodag,
                                              log=None,
                                              **self.params["enrichment_params"])

    logging.info("Loading predictions...")
    method_to_preds = {}
    for name, preds in self.params["method_to_preds"].items():
        csv_path = os.path.join(preds, "predictions.csv")
        method_to_preds[name] = pd.read_csv(csv_path, index_col=0)
    self.method_to_preds = method_to_preds

    # Restore cached outputs if a previous run left them behind.
    outputs_path = os.path.join(self.dir, "outputs.pkl")
    if not os.path.exists(outputs_path):
        self.outputs = {}
    else:
        logging.info("Loading outputs...")
        with open(outputs_path, 'rb') as f:
            self.outputs = pickle.load(f)
def get_ensembl_ids(go_process_id, biomart_fpath, ev_codes=None):
    """Return the Ensembl IDs of the human genes annotated to one GO term.

    :param go_process_id: GO ID to look up in the GO -> Entrez mapping
    :param biomart_fpath: path passed to ``map_entrez_to_ensembl``
    :param ev_codes: optional evidence codes; when truthy they are passed to
        ``read_ncbi_gene2go`` as ``evidence_set`` (previously this argument
        was accepted but ignored)
    :return: list of Ensembl IDs; Entrez IDs without a mapping are skipped
    """
    entrez_to_ensembl = map_entrez_to_ensembl(biomart_fpath)

    # If file doesn't exist, then replace this line with
    # gene2go = download_ncbi_associations()
    gene2go = 'data/gene2go.txt'

    # taxids=[9606] means select only human.
    gene2go_kwargs = {'taxids': [9606], 'go2geneids': True}
    if ev_codes:
        gene2go_kwargs['evidence_set'] = ev_codes
    go_to_entrez_ids_human = read_ncbi_gene2go(gene2go, **gene2go_kwargs)
    print("{N} GO terms associated with human NCBI Entrez GeneIDs".format(N=len(go_to_entrez_ids_human)))

    entrez_ids = go_to_entrez_ids_human[go_process_id]
    # One pre-formatted string keeps the output identical to the former
    # Python 2 print statement while also running on Python 3.
    print("# of Entrez IDs associated with  {}  =  {}".format(go_process_id, len(entrez_ids)))

    ensembl_ids = []
    for ent_id in entrez_ids:
        entrez_key = str(ent_id)
        if entrez_key in entrez_to_ensembl:
            ensembl_ids.append(entrez_to_ensembl[entrez_key])
    print("# of Ensembl IDs associated with  {}  =  {}".format(go_process_id, len(ensembl_ids)))

    return ensembl_ids
def test_i96():
    """Test to re-produce issue#96: Passes currently."""
    # Trying to duplicate: ValueError("All values in table must be nonnegative.

    # Study and population gene sets
    print('CWD', os.getcwd())
    study_ids = _get_geneids()
    population_ids = GENEID2NT.keys()

    # Associations and ontology
    print(os.getcwd())
    gene2go_path = os.path.join(REPO, 'gene2go')
    dnld_ncbi_gene_file(gene2go_path, loading_bar=None)
    gene2go = read_ncbi_gene2go(gene2go_path, [9606])
    godag = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None)

    # Run GOEA Gene Ontology Enrichment Analysis
    goeaobj = GOEnrichmentStudy(population_ids, gene2go, godag, methods=['fdr_bh'])
    results_goeas = goeaobj.run_study(study_ids)
def get_go_terms(biomart_fpath, gene2go_fpath, gene_count_fpath, top=1):
    """Return the first ``top`` GO terms of a gene-count file with their genes.

    :param biomart_fpath: path passed to ``map_entrez_to_ensembl``
    :param gene2go_fpath: path to the NCBI gene2go association file
    :param gene_count_fpath: Path to file containing number of genes for each
        GO term contained in the supplementary file; tab-separated, first
        column is the GO ID, first line is skipped
    :param top: number of data lines to read after the skipped first line
    :return: list of ``(GO_id, ensembl_ids)`` tuples in file order
    """
    entrez_to_ensembl = map_entrez_to_ensembl(biomart_fpath)

    # taxids=[9606] means select only human.
    go_to_entrez_ids_human = read_ncbi_gene2go(gene2go_fpath, taxids=[9606], go2geneids=True)
    print("{N} GO terms associated with human NCBI Entrez GeneIDs".format(N=len(go_to_entrez_ids_human)))

    # Get the |top| GO terms with the most gene annotations
    skip_lines = 1
    top_go_ids = []
    # The context manager guarantees the file is closed, even on error.
    with open(gene_count_fpath) as gene_cnt_file:
        for line_no, line in enumerate(gene_cnt_file):
            if line_no < skip_lines:
                continue
            if line_no > top:
                break
            go_id = line.split('\t')[0]
            ensembl_ids = []
            for ent_id in go_to_entrez_ids_human[go_id]:
                entrez_key = str(ent_id)
                # Entrez IDs without an Ensembl mapping are skipped.
                if entrez_key in entrez_to_ensembl:
                    ensembl_ids.append(entrez_to_ensembl[entrez_key])
            top_go_ids.append((go_id, ensembl_ids))
    return top_go_ids
def load_go_annotations(proteins, levels=None, obodag_path="data/go/go-basic.obo", entrez_to_go_path="data/go/gene2go.txt"):
    """
    Map GO term names to the proteins annotated below them.

    For each protein, every term returned by ``get_all_parents()`` of its
    annotated GO terms is considered; the term's name is kept when ``levels``
    is None or the term's level is listed in ``levels``.

    args:
        @proteins (iterable) proteins to get annotations for
        @levels (list(int)) the levels of the ontology
        @obodag_path (str) path obo file
        @entrez_to_go_path (str) path to mapping from entrez ids to go doids
    return:
        @term_to_proteins (dict) map from term name to the set of proteins
    """
    obodag = GODag(obodag_path)
    entrez_to_go = read_ncbi_gene2go(entrez_to_go_path, taxids=[9606])

    term_to_proteins = defaultdict(set)
    for protein in proteins:
        # Collect the protein's term names first, then register the protein.
        names = set()
        for doid in entrez_to_go[protein]:
            for parent in obodag[doid].get_all_parents():
                parent_term = obodag[parent]
                if levels is None or parent_term.level in levels:
                    names.add(parent_term.name)
        for name in names:
            term_to_proteins[name].add(protein)

    return term_to_proteins
get_sps, destnodes_sample, n_jobs=args.N_cores), []) logging.info('Num of all paths: {}'.format(len(all_paths))) fc_paths = [] for i in trange(len(all_paths)): fullpath = all_paths[i] if len(fullpath) > 2: path = all_paths[i][1:-1] if np.all([node in fcnodes for node in path]): fc_paths.append(fullpath) logging.info('Num of FC paths: {}'.format(len(fc_paths))) go = obo_parser.GODag(args.obo_file) gene2go = read_ncbi_gene2go(args.gene2go_file, taxids=[9606]) termcounts = TermCounts(go, gene2go) def get_sim(genes_pair): # sim_measure = lin_sim i, j = genes_pair[0], genes_pair[1] i_go = [goterm for goterm in gene2go[i] if goterm in go] j_go = [goterm for goterm in gene2go[j] if goterm in go] sims = [] for i_go_term in i_go: def wrap(j_go_term): return resnik_sim(i_go_term, j_go_term, go, termcounts) simlist = [sim for sim in map(wrap, j_go) if sim is not None] if len(simlist):
def find_clusters_and_gene_enrichment(tested_gene_list_file_name, total_gene_list_file_name, gene_expression_file_name, phenotype_file_name, gene_filter_file_name=None, tested_gene_list_path=None, total_gene_list_path=None, gene_expression_path=None, phenotype_path=None, gene_filter_file_path=None, var_th_index=None, start_k=2, end_k=6, calc_go=True, enrichment_list_file_names=None, meta_groups=None, filter_expression=None, cluster_algorithm=None):
    """Cluster genes by expression and report GO / gene-list enrichment per cluster.

    Steps visible in the body:
      1. load the expression profile of the tested genes and split off headers;
      2. optionally restrict the patients via ``filter_expression``;
      3. keep the genes whose variance is at least the ``var_th_index``-th
         largest variance (all genes when ``var_th_index`` is None);
      4. when ``calc_go`` is set, build a GOEnrichmentStudy over the total
         gene list and print the terms with ``p_fdr_bh <= 0.05``;
      5. for ``cluster_algorithm == "kmeans"``: cluster for every k in
         ``start_k..end_k`` and append one output row per cluster;
      6. for ``cluster_algorithm == "hierarchical"``: draw a seaborn clustermap
         and per-label heatmaps;
      7. write ``output_rows`` through ``print_to_excel``.

    Returns None early when only one column remains after patient filtering;
    otherwise nothing is returned explicitly.

    ``total_gene_list_path`` and ``phenotype_path`` are accepted but not used
    in this body.

    NOTE(review): this block was reconstructed from a whitespace-collapsed
    source; the nesting of the statements after the k-means loops is a best
    guess - confirm against the original file.
    """
    # fetch gene expression by gene_id, divided by tumor type
    # (the three lists below are not used further in this function)
    gene_sets = []
    expression_sets = []
    averaged_expression_sets = []
    tested_gene_expression = load_gene_expression_profile_by_genes(
        tested_gene_list_file_name, gene_expression_file_name,
        gene_filter_file_name, tested_gene_list_path, gene_expression_path,
        gene_filter_file_path)
    tested_gene_expression_headers_rows, tested_gene_expression_headers_columns, tested_gene_expression = separate_headers(
        tested_gene_expression)

    # Patient filter: flatten the per-label patient id groups into one list.
    if filter_expression is not None:
        filtered_patients = [
            y for x in divided_patient_ids_by_label(phenotype_file_name,
                                                    groups=filter_expression)
            for y in x
        ]
        print "number of filtered patients from phenotypes: {}".format(
            len(filtered_patients))
    else:
        print "no filter applied"
        filtered_patients = tested_gene_expression_headers_columns

    tested_gene_expression, tested_gene_expression_headers_columns = filter_genes_dataset_by_patients(
        filtered_patients, tested_gene_expression_headers_columns,
        tested_gene_expression)
    if np.shape(tested_gene_expression)[1] == 1:
        print "no expressions were found after filtering by labels {}. skipping...".format(
            filter_expression)
        return None

    total_gene_list = load_gene_list(total_gene_list_file_name)
    tested_gene_list = load_gene_list(tested_gene_list_file_name)
    # Per-gene (row) variance, sorted in descending order.
    row_var = np.var(tested_gene_expression, axis=1)
    row_var_sorted = np.sort(row_var)[::-1]

    labels_assignment_patients = None
    if meta_groups is not None:
        print "clustering patients by groups"
        labels_assignment_patients = labels_assignments(
            meta_groups, phenotype_file_name,
            tested_gene_expression_headers_columns)

    enrichment_lists = []
    if enrichment_list_file_names is not None:
        for cur in enrichment_list_file_names:
            enrichment_lists.append(load_gene_list(cur))

    # Default threshold index selects the smallest variance, i.e. keeps all rows.
    if var_th_index is None:
        var_th_index = len(row_var_sorted) - 1
    row_var_th = row_var_sorted[var_th_index]
    # Rows whose variance is strictly below the threshold are dropped.
    row_var_masked_indices = np.where(row_var_th > row_var)[0]
    gene_expression_top_var = np.delete(tested_gene_expression,
                                        row_var_masked_indices,
                                        axis=0)
    gene_expression_top_var_header_rows = np.delete(
        tested_gene_expression_headers_rows, row_var_masked_indices, axis=0)
    gene_expression_top_var_header_columns = tested_gene_expression_headers_columns

    clfs_results = {}
    output_rows = []
    if calc_go:
        if not os.path.exists(
                os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)):
            wget.download(
                constants.GO_OBO_URL,
                os.path.join(constants.GO_DIR, constants.GO_FILE_NAME))
        # if not os.path.exists(os.path.join(constants.TCGA_DATA_DIR, 'goa_human.gaf')):
        #     wget.download(go_obo_url, os.path.join(constants.TCGA_DATA_DIR, 'goa_human.gaf'))
        obo_dag = GODag(os.path.join(constants.GO_DIR, constants.GO_FILE_NAME))
        assoc = read_ncbi_gene2go(os.path.join(
            constants.GO_DIR, constants.GO_ASSOCIATION_FILE_NAME),
                                  no_top=True)
        # Population = the total gene list, converted to integer Entrez ids.
        g = GOEnrichmentStudy(
            [int(cur) for cur in ensembl2entrez_convertor(total_gene_list)],
            assoc,
            obo_dag,
            methods=["bonferroni", "fdr_bh"])
        g_res = g.run_study([
            int(cur) for cur in ensembl2entrez_convertor(
                gene_expression_top_var_header_rows)
        ])
        GO_results = [(cur.NS, cur.GO, cur.goterm.name, cur.p_uncorrected,
                       cur.p_fdr_bh) for cur in g_res
                      if cur.p_fdr_bh <= 0.05]
        print GO_results

    if cluster_algorithm == "kmeans":

        for n_clusters in range(start_k, end_k + 1):
            clfs_results[n_clusters] = []
            centres, km_clf, dist = kmeanssample(X=gene_expression_top_var,
                                                 k=n_clusters,
                                                 metric="euclidean")
            for i in range(n_clusters):

                # Rank the clusters by the average expression of their rows
                # and relabel them according to that rank.
                ranks = []
                for j in range(n_clusters):
                    ranks.append(
                        np.average(
                            np.delete(gene_expression_top_var,
                                      np.where(km_clf != j)[0],
                                      axis=0)))
                ranks = rankdata(ranks)
                cluster_labels = np.array(km_clf)
                for j in range(n_clusters):
                    cluster_labels[np.where(km_clf == ranks[j] - 1)] = j
                labels_assignment = [cluster_labels + 1]

                # Indices of the rows that do NOT belong to cluster i.
                cluster_indices = np.where(km_clf != i)[0]
                # NOTE(review): both deletions below operate on the header
                # rows, so gene_expression_cluster holds gene ids rather than
                # expression values - presumably unintended; it is only used
                # for its length.
                gene_expression_cluster = np.delete(
                    gene_expression_top_var_header_rows,
                    cluster_indices,
                    axis=0)
                gene_headers_row_cluster = np.delete(
                    gene_expression_top_var_header_rows,
                    cluster_indices,
                    axis=0)
                clfs_results[n_clusters].append(
                    (gene_headers_row_cluster, gene_headers_row_cluster))
                desc = "k={} clustering cluster {} has {} genes".format(
                    n_clusters, i, len(gene_expression_cluster))
                gene_list = ",".join(gene_headers_row_cluster)
                url = check_enrichment(gene_list)

                go_terms = []
                uncorrectd_pvals = []
                FDRs = []
                go_names = []
                go_ns = []
                if calc_go:
                    g_res = g.run_study([
                        int(cur) for cur in ensembl2entrez_convertor(
                            gene_headers_row_cluster)
                    ])
                    GO_results = [(cur.NS, cur.GO, cur.goterm.name,
                                   cur.p_uncorrected, cur.p_fdr_bh)
                                  for cur in g_res if cur.p_fdr_bh <= 0.05]
                    if len(GO_results) > 0:
                        # zip() yields tuples here, replacing the lists above.
                        go_ns, go_terms, go_names, uncorrectd_pvals, FDRs = zip(
                            *GO_results)

                if len(enrichment_lists) != 0:
                    # NOTE(review): if the zip() above ran, these names are
                    # tuples and .append would raise AttributeError.
                    for j, cur in enumerate(enrichment_lists):
                        go_terms.append(
                            enrichment_list_file_names[j].split(".")[0])
                        uncorrectd_pvals.append(
                            calc_HG_test(
                                [x.split(".")[0] for x in tested_gene_list],
                                [x.split(".")[0] for x in cur], [
                                    x.split(".")[0]
                                    for x in gene_headers_row_cluster
                                ]))
                        FDRs.append(".")
                        go_names.append(".")
                        go_ns.append(".")

                output_rows.append((desc, "\r\n".join([
                    x.split(".")[0] for x in gene_headers_row_cluster
                ]), url, "\r\n".join(go_ns), "\r\n".join(go_terms),
                                    "\r\n".join(go_names),
                                    "\r\n".join(map(str, uncorrectd_pvals)),
                                    "\r\n".join(map(str, FDRs))))

        # Uses cluster_labels / labels_assignment from the last loop iteration.
        gene_sorted_heatmap = np.rot90(np.flip(
            gene_expression_top_var[cluster_labels.argsort(), :], 1),
                                       k=-1,
                                       axes=(1, 0))
        find_clusters(end_k,
                      gene_sorted_heatmap,
                      gene_expression_top_var_header_columns,
                      start_k,
                      e2g_convertor(gene_expression_top_var_header_rows),
                      tested_gene_list_file_name,
                      labels_assignment=labels_assignment_patients)

        plot_heatmap(gene_expression_top_var,
                     gene_expression_top_var_header_columns,
                     labels_assignment,
                     gene_expression_top_var_header_rows,
                     tested_gene_list_file_name,
                     n_clusters=None,
                     label_index=None,
                     phenotype_heatmap=None)

    gene_sorted_heatmap = np.rot90(np.flip(gene_expression_top_var, 1),
                                   k=-1,
                                   axes=(1, 0))
    if cluster_algorithm == "hierarchical":
        df = pd.DataFrame(data=gene_sorted_heatmap,
                          index=gene_expression_top_var_header_columns,
                          columns=gene_expression_top_var_header_rows)
        # correlations = df.corr()
        # correlations_array = np.asarray(df.corr())
        #
        # row_linkage = hierarchy.linkage(
        #     distance.pdist(correlations_array), method='average')
        #
        # col_linkage = hierarchy.linkage(
        #     distance.pdist(correlations_array.T), method='average')

        # enrichment_gene_list = load_gene_list("uvm_mito_part.txt")
        # NOTE(review): labels_assignment_patients is None unless meta_groups
        # was given, in which case the indexing below would fail.
        dct = dict(zip(np.unique(labels_assignment_patients[0]), "rbg"))
        row_colors = map(dct.get, labels_assignment_patients[0])
        dct = {1: 'b', 2: 'r'}
        gene_expression_top_var_header_rows_trimmed = [
            x.split(".")[0] for x in gene_expression_top_var_header_rows
        ]
        # col_colors = map(dct.get, [2 if x in enrichment_gene_list else 1 for x in gene_expression_top_var_header_rows_trimmed])
        # Rebinds g (the enrichment study when calc_go is set) to the clustermap.
        g = sns.clustermap(df,
                           row_colors=row_colors,
                           metric="euclidean",
                           robust=True,
                           method="single")
        # den_patients = scipy.cluster.hierarchy.dendrogram(g.dendrogram_row.linkage,
        #                                          labels=df.index,
        #                                          color_threshold=0.60)
        den_genes = scipy.cluster.hierarchy.dendrogram(
            g.dendrogram_col.linkage, labels=df.columns, color_threshold=0.7)
        clusters = get_cluster_classes(den_genes)

        g.savefig(
            os.path.join(constants.BASE_PROFILE, "output",
                         "hierarchical_cluster_{}.png".format(time.time())))

        for cur_labels_assignment_patient in labels_assignment_patients:
            plot_heatmap(gene_sorted_heatmap,
                         gene_expression_top_var_header_rows,
                         [cur_labels_assignment_patient],
                         gene_expression_top_var_header_columns,
                         tested_gene_list_file_name,
                         n_clusters=None,
                         label_index=None,
                         phenotype_heatmap=None)

    print_to_excel(
        output_rows=output_rows,
        gene_list_file_name=tested_gene_list_file_name.split(".")[0],
        gene_expression_file_name=gene_expression_file_name.split(".")[0],
        var_th_index=var_th_index)
def check_group_enrichment_goatools(tested_gene_file_name, total_gene_file_name, th=1):
    """Run a GO enrichment with goatools and return a report of the hits.

    :param tested_gene_file_name: gene-list file name (str) or an
        already-loaded gene list for the study set
    :param total_gene_file_name: gene-list file name (str) or an
        already-loaded gene list for the population
    :param th: keep only terms whose uncorrected p-value is <= ``th``
    :return: list of dicts keyed by HG_GO_ROOT, HG_GO_ID, HG_GO_NAME,
        HG_VALUE (population count), HG_PVAL and HG_QVAL (always 1, since no
        multiple-test correction is run), sorted by HG_PVAL; an empty list
        when either input is empty
    """
    if len(tested_gene_file_name) == 0 or len(total_gene_file_name) == 0:
        return []

    # A string is treated as a file name; anything else as a ready gene list.
    if isinstance(total_gene_file_name, str):
        total_gene_list = load_gene_list(total_gene_file_name)
    else:
        total_gene_list = total_gene_file_name

    if isinstance(tested_gene_file_name, str):
        tested_gene_list = load_gene_list(tested_gene_file_name)
    else:
        tested_gene_list = tested_gene_file_name

    obo_path = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    if not os.path.exists(obo_path):
        download(constants.GO_OBO_URL, constants.GO_DIR)

    obo_dag = GODag(obo_path)

    assoc_path = os.path.join(constants.GO_DIR, constants.GO_ASSOCIATION_FILE_NAME)
    if not os.path.exists(assoc_path):
        download(constants.GO_ASSOCIATION_GENE2GEO_URL, constants.GO_DIR)
        # Unpack the downloaded archive into the plain association file.
        archive_path = os.path.join(constants.GO_DIR,
                                    os.path.basename(constants.GO_ASSOCIATION_GENE2GEO_URL))
        with gzip.open(archive_path, 'rb') as f_in:
            with open(assoc_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

    assoc = read_ncbi_gene2go(assoc_path, no_top=True)

    sw = Stopwatch()
    sw.start()
    # methods=[] : no multiple-test correction ("bonferroni", "fdr_bh" disabled)
    g = GOEnrichmentStudy(
        [int(cur) for cur in ensembl2entrez_convertor(total_gene_list)],
        assoc,
        obo_dag,
        methods=[],
        log=None)
    g_res = g.run_study(
        [int(cur) for cur in ensembl2entrez_convertor(tested_gene_list)])
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(sw.stop("done GO analysis in "))

    GO_results = [(cur.NS, cur.GO, cur.goterm.name, cur.pop_count,
                   cur.p_uncorrected) for cur in g_res
                  if cur.p_uncorrected <= th]

    hg_report = [{
        HG_GO_ROOT: cur[0],
        HG_GO_ID: cur[1],
        HG_GO_NAME: cur[2],
        HG_VALUE: cur[3],
        HG_PVAL: cur[4],
        HG_QVAL: 1
    } for cur in GO_results]
    hg_report.sort(key=lambda x: x[HG_PVAL])

    return hg_report
get_ipython().system( ' wget http://www.geneontology.org/ontology/subsets/goslim_generic.obo') obo_fname = download_go_basic_obo() from goatools.base import download_ncbi_associations gene2go = download_ncbi_associations() if goset == 'goslim': obodag = GODag("goslim_generic.obo") else: obodag = GODag("go-basic.obo") geneid2gos = read_ncbi_gene2go("gene2go", taxids=[9606]) levels = [r.level for go, r in obodag.items()] [(i, levels.count(i)) for i in range(1, 12)] bad_go = [] for go, r in obodag.iteritems(): if r.level > cutlvl: bad_go.append(go) bad_go = set(bad_go) len(bad_go) for go, r in obodag.items(): nps = set() for p in r._parents:
def goea(gene_ids, gene_symbols, trajectory, cluster, out_dir):
    """Run a GO enrichment analysis for one gene cluster and write the results.

    The ontology is read from ``goea/go-basic.obo`` and the human (taxid 9606)
    associations from ``goea/gene2go``. The background population is the key
    set of ``goea.genes_NCBI_9606_ProteinCoding.GENEID2NT``.

    Three files are written into ``out_dir``, all prefixed with
    ``trajectory[-8:] + 'cluster ' + str(cluster)``:

    * ``...genes.txt``: one study gene id per line,
    * ``...goea_symbols.xlsx``: terms with ``p_fdr_bh < 0.05``, genes shown by
      symbol,
    * ``...goea_geneids.xlsx``: the same terms, genes shown by id.

    Args:
        gene_ids: Accepted for interface compatibility; not used.
        gene_symbols: Gene symbols of the cluster. Symbols are matched against
            field 5 of each ``GENEID2NT`` record.
        trajectory: String whose last eight characters prefix the output files.
        cluster: Cluster identifier, stringified into the output file names.
        out_dir: Output directory; created (including parents) if missing.
    """
    if not os.path.exists(out_dir):
        # makedirs also copes with a nested out_dir whose parents are missing.
        os.makedirs(out_dir)

    from goatools.obo_parser import GODag
    from goatools.associations import read_ncbi_gene2go
    from goatools.go_enrichment import GOEnrichmentStudy
    from goea.genes_NCBI_9606_ProteinCoding import GENEID2NT as GeneID2nt_human

    obodag = GODag("goea/go-basic.obo")
    geneid2gos_human = read_ncbi_gene2go("goea/gene2go", taxids=[9606])

    goeaobj = GOEnrichmentStudy(
        GeneID2nt_human.keys(),  # background: human protein-coding gene ids
        geneid2gos_human,  # geneid/GO associations
        obodag,  # ontologies
        propagate_counts=False,
        alpha=0.05,  # significance cut-off
        methods=['fdr_bh'])  # multiple-test correction method

    # Index the background once by symbol instead of rescanning every gene
    # for every requested symbol.
    ids_by_symbol = {}
    for entrez_id, record in GeneID2nt_human.items():
        ids_by_symbol.setdefault(record[5], []).append(entrez_id)

    # Filled in gene_symbols order so the output order matches the input.
    geneid2symbol = {}
    for gene_symbol in gene_symbols:
        for entrez_id in ids_by_symbol.get(gene_symbol, ()):
            geneid2symbol[int(entrez_id)] = gene_symbol

    print(np.array(list(geneid2symbol.keys())))

    geneids_study = list(geneid2symbol.keys())
    prefix = out_dir + '/' + trajectory[-8:] + 'cluster ' + str(cluster)

    with open(prefix + 'genes.txt', 'w') as f:
        f.writelines("%s\n" % gene for gene in geneids_study)

    goea_results_all = goeaobj.run_study(geneids_study)
    # 'p_fdr_bh' is the p-value after the 'fdr_bh' correction requested above.
    goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < 0.05]

    goeaobj.wr_xlsx(prefix + 'goea_symbols.xlsx',
                    goea_results_sig,
                    itemid2name=geneid2symbol)
    goeaobj.wr_xlsx(prefix + 'goea_geneids.xlsx', goea_results_sig)
def get_GO(gene_query, species='mouse'):
    """
    Get Gene Ontologies (GOs).

    Args:
        gene_query (array of str): gene list.

        species (str): Select species. Either "mouse" or "human"

    Returns:
        pandas.dataframe: GO analysis results as dataframe. The dataframe is
        empty when the enrichment study returns no records.

    Raises:
        ValueError: If ``species`` is neither "mouse" nor "human".
    """
    if species not in ('human', 'mouse'):
        raise ValueError(
            "species must be 'mouse' or 'human', got {!r}".format(species))

    GOIs = gene_query

    # Make sure the ontology / association / cross-reference files exist.
    _check_data_and_download_if_necessary(go_folder)

    obodag = GODag(os.path.join(go_folder, "go-basic.obo"))

    if species == 'human':
        geneid2gos = read_ncbi_gene2go(os.path.join(go_folder, "gene2go.txt"),
                                       taxids=[9606])
        print("{N:,} annotated genes".format(N=len(geneid2gos)))

        # The background was previously only imported in the mouse branch,
        # which made the human branch fail with a NameError.
        # NOTE(review): assumes goatools ships the human counterpart of the
        # mouse module used below - confirm for the installed version.
        from goatools.test_data.genes_NCBI_9606_ProteinCoding import GENEID2NT as background_genes

        Xtable = pd.read_csv(os.path.join(go_folder, 'hg19_xref.txt'),
                             sep='\t')
        Xtable.index = Xtable['Approved Symbol']
    else:
        geneid2gos = read_ncbi_gene2go(os.path.join(go_folder, "gene2go.txt"),
                                       taxids=[10090])
        print("{N:,} annotated genes".format(N=len(geneid2gos)))

        from goatools.test_data.genes_NCBI_10090_ProteinCoding import GENEID2NT as background_genes

        Xtable = pd.read_csv(os.path.join(go_folder, 'biomart_xref.mm10.txt'),
                             sep='\t')
        Xtable = Xtable[['Associated Gene Name', 'EntrezGene ID']].dropna()
        Xtable.index = Xtable['Associated Gene Name']

    # Select by membership rather than .loc[GOIs]: symbols missing from the
    # cross-reference table are skipped instead of raising KeyError.
    matched = Xtable[Xtable.index.isin(GOIs)].dropna()
    GOIs_entrez = [int(x) for x in np.unique(matched['EntrezGene ID'])]

    print("processing " + str(len(GOIs)) + " genes ...")

    goeaobj = GOEnrichmentStudy(
        background_genes.keys(),  # background: protein-coding gene ids
        geneid2gos,  # geneid/GO associations
        obodag,  # ontologies
        propagate_counts=False,
        alpha=0.05,  # significance cut-off
        methods=['fdr_bh'])  # multiple-test correction method
    goea_results = goeaobj.run_study(GOIs_entrez)

    # Guard before indexing goea_results[0]; an empty result used to raise
    # IndexError before the message below could be printed.
    if not goea_results:
        print("Found No GO with significant p-value")
        return pd.DataFrame([])

    go_default_output = goea_results[0].get_prtflds_default()
    df_GO = pd.DataFrame(
        [rec.get_field_values(go_default_output) for rec in goea_results])
    df_GO.columns = go_default_output
    df_GO["genes"] = df_GO.study_items.apply(
        lambda x: _ids2symbols(x, species))

    return df_GO
def check_group_enrichment(tested_gene_file_name,
                           total_gene_file_name,
                           go_folder,
                           th=1):
    """Run a GO enrichment test using ontology files kept in ``go_folder``.

    Args:
        tested_gene_file_name: Study set. A ``str`` is treated as a path and
            loaded with ``load_gene_list``; anything else is used as the
            already-loaded gene collection.
        total_gene_file_name: Background set, same convention as above.
        go_folder: Directory holding (or receiving) the GO ontology file and
            the gene->GO association file.
        th: Accepted for interface compatibility; no threshold is applied to
            the results.

    Returns:
        A list of dicts keyed by ``HG_GO_ROOT``, ``HG_GO_ID``, ``HG_GO_NAME``,
        ``HG_VALUE`` and ``HG_PVAL``, sorted by ascending ``HG_PVAL``. An
        empty list is returned when either input is empty.
    """
    # Module-level cache: the association table is parsed once and reused.
    global assoc

    if len(tested_gene_file_name) == 0 or len(total_gene_file_name) == 0:
        return []

    if isinstance(total_gene_file_name, str):
        total_gene_list = load_gene_list(total_gene_file_name)
    else:
        total_gene_list = total_gene_file_name

    if isinstance(tested_gene_file_name, str):
        tested_gene_list = load_gene_list(tested_gene_file_name)
    else:
        tested_gene_list = tested_gene_file_name

    # Download into go_folder, the directory that is checked and read below.
    # Downloading into constants.GO_DIR only worked when both were the same.
    obo_path = os.path.join(go_folder, constants.GO_FILE_NAME)
    if not os.path.exists(obo_path):
        download(constants.GO_OBO_URL, go_folder)
    obo_dag = GODag(obo_path)

    assoc_path = os.path.join(go_folder, constants.GO_ASSOCIATION_FILE_NAME)
    if not os.path.exists(assoc_path):
        if not os.path.exists(assoc_path + ".gz"):
            download(constants.GO_ASSOCIATION_GENE2GEO_URL, go_folder)
        archive_path = os.path.join(
            go_folder,
            os.path.basename(constants.GO_ASSOCIATION_GENE2GEO_URL))
        with gzip.open(archive_path, 'rb') as f_in:
            with open(assoc_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

    if assoc is None:
        assoc = read_ncbi_gene2go(assoc_path, no_top=True)

    g = GOEnrichmentStudy(
        [int(cur) for cur in ensembl2entrez_convertor(total_gene_list)],
        assoc,
        obo_dag,
        log=None)
    g_res = g.run_study(
        [int(cur) for cur in ensembl2entrez_convertor(tested_gene_list)])

    hg_report = [{
        HG_GO_ROOT: cur.NS,
        HG_GO_ID: cur.GO,
        HG_GO_NAME: cur.goterm.name,
        HG_VALUE: cur.pop_count,
        HG_PVAL: cur.p_uncorrected
    } for cur in g_res]
    hg_report.sort(key=lambda x: x[HG_PVAL])
    return hg_report
def parse_gene2go_info(self, taxids):
    """Read the gene2go file at ``self.gene2go_path`` for the given taxids.

    Returns the mapping produced by ``read_ncbi_gene2go`` unchanged.
    """
    return read_ncbi_gene2go(self.gene2go_path, taxids=taxids)
def __init__(self,
             go_path="data/go/go-basic.obo",
             gene2go_path="data/go/gene2go"):
    """Load the GO DAG and the gene->GO associations.

    ``go_path`` is parsed with ``GODag`` into ``self.obo``; ``gene2go_path``
    is read with ``read_ncbi_gene2go(..., go2geneids=False)`` into
    ``self.gene_to_go``.
    """
    self.obo = GODag(go_path)
    gene_associations = read_ncbi_gene2go(gene2go_path, go2geneids=False)
    self.gene_to_go = gene_associations
# NOTE(review): the statements down to `return res` are the tail of a function
# whose `def` line (and any enclosing loop) is outside this chunk. The nesting
# shown here is a guess - confirm against the full file.
    # Query the `mg` client for this chunk of genes, asking for a DataFrame.
    query_res = mg.querymany(genes_chunk,
                             scopes='entrezgene',
                             fields='entrezgene,symbol',
                             species='human',
                             entrezonly=True,
                             as_dataframe=True,
                             df_index=False,
                             verbose=False)
    # Drop rows flagged as not found, when that column is present.
    if 'notfound' in query_res.columns:
        query_res = query_res[query_res.notfound != True]  # ignore PEP8 warnings.
    query_result_list.append(query_res)
    # Combine the collected results and map entrezgene -> symbol.
    df_res = pd.concat(query_result_list)
    res = dict(zip(df_res.entrezgene, df_res.symbol))
    return res


# Module-level state built at import time.
# NOTE(review): where the `with` block ends is not recoverable from the
# collapsed line - confirm which of these assignments belong inside it.
with HidePrints():
    _go_dag = obo_parser.GODag(go_obo_path)
    _gaf = read_gaf(gaf_path, prt=None)
    _termcounts = TermCounts(_go_dag, _gaf)
    _gene2go = read_ncbi_gene2go(gene2go_path)
    _gene2symbol = _init_gene2symbol_dict()
    # Reverse lookup; a symbol shared by several genes keeps the last one seen.
    _symbol2gene = {symbol: gene for gene, symbol in _gene2symbol.items()}


def get_genes():
    # Keys of the module-level gene->GO mapping, as a list.
    return list(_gene2go.keys())


def get_symbols():
    # Values of the module-level gene->symbol mapping, as a list.
    return list(_gene2symbol.values())


def get_gene2go():
    # The module-level gene->GO mapping itself (not a copy).
    return _gene2go
# Script: download the GO ontology and NCBI associations, then build a
# GOEnrichmentStudy object for mouse (taxid 10090).
# NOTE(review): the original opened with the comment "Data will be stored in
# this variable" directly above the imports; the variable it referred to is
# not visible in this chunk.
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import goatools
from goatools.base import download_go_basic_obo
from goatools.base import download_ncbi_associations
from goatools.obo_parser import GODag
from goatools.associations import read_ncbi_gene2go
from goatools.test_data.genes_NCBI_10090_ProteinCoding import GeneID2nt as GeneID2nt_mus
from goatools.go_enrichment import GOEnrichmentStudy

# Fetch the input files and load them.
obo_fname = download_go_basic_obo()
gene2go = download_ncbi_associations()
obodag = GODag("go-basic.obo")
geneid2gos_mouse = read_ncbi_gene2go("gene2go", taxids=[10090])
geneid2symbol = {}
print("{N:,} annotated mouse genes".format(N=len(geneid2gos_mouse)))
# A dict key view has no .head(); show the first five keys instead.
print(list(GeneID2nt_mus.keys())[:5])

goeaobj = GOEnrichmentStudy(
    GeneID2nt_mus.keys(),  # background: mouse protein-coding gene ids
    geneid2gos_mouse,  # geneid/GO associations
    obodag,  # ontologies
    propagate_counts=False,
    alpha=0.05,  # significance cut-off
    methods=['fdr_bh'])  # multiple-test correction method
def invert_dol_nonunique(d):
    """Invert a dict of iterables.

    Every value ``v`` found in ``d[k]`` becomes a key of the result, mapped to
    the list of all ``k`` whose collection contains it. A key is appended once
    per occurrence, so repeats within ``d[k]`` produce repeated entries.
    """
    newdict = {}
    for k in d:
        for v in d[k]:
            newdict.setdefault(v, []).append(k)
    return newdict


# Location of the MIMOSCA checkout under the user's home directory.
PATH_TO_MIMOSCA = os.path.expanduser(
    os.path.join("~", "Desktop", "programs", "MIMOSCA-master"))
# Both prefixes resolve to the same "common_files/data" directory and end with
# a path separator, so file names can be appended by string concatenation.
path2db = os.path.expanduser(
    os.path.join(PATH_TO_MIMOSCA, "common_files", "data")) + os.path.sep
path2Xref = os.path.expanduser(
    os.path.join(PATH_TO_MIMOSCA, "common_files", "data")) + os.path.sep
# Human (taxid 9606) associations, then inverted with invert_dol_nonunique.
# NOTE(review): presumably gene id -> GO ids, making go2idx GO id -> gene ids;
# verify against read_ncbi_gene2go.
geneid2gos = read_ncbi_gene2go(path2db + "gene2go", taxids=[9606])
go2idx = invert_dol_nonunique(geneid2gos)
Xtable = pd.read_csv(os.path.join(path2Xref, 'hg19_xref.txt'), sep='\t')
# 'EntrezGene ID' (as int) -> 'Approved Symbol'; rows with a NaN id are skipped.
idx2gene = {
    int(idx): symbol
    for idx, symbol in zip(Xtable['EntrezGene ID'], Xtable['Approved Symbol'])
    if not math.isnan(idx)
}


def go2genes(go, sep=';'):
    # Look up the entries recorded for `go`; an unknown term yields ''.
    indices = go2idx.get(go)
    if indices is None:
        return ''
    # The else branch continues beyond the visible chunk.
    else: