def get_terms_for_go_regex(regex, taxid=9606, add_children=False):
    """Return the GO terms selected by a case-insensitive regex search.

    The search is delegated to ``GoSearch.get_matching_gos``.  Terms that
    also match ``(<regex>).independent`` are removed from the result, so
    that e.g. "cell cycle-independent" hits are not reported for a
    "cell cycle" search.

    Parameters
    ----------
    regex : str
        Pattern body; it is wrapped in a capture group before compiling.
    taxid : optional
        Taxonomy identifier, normalised with ``_tidy_taxid``.
    add_children : bool, optional
        When true, the result is passed through
        ``GoSearch.add_children_gos`` before being returned.

    Returns
    -------
    list
        The selected GO terms, in no particular order.
    """
    taxid = _tidy_taxid(taxid)

    # All progress output of the helpers is discarded: both the explicit
    # ``prt``/``log`` streams and anything printed to stdout.
    with open(os.devnull, 'w') as null, redirect_stdout(null):
        # Called for their side effects only; the return values are unused.
        download_and_move_go_basic_obo(prt=null)
        download_ncbi_associations(prt=null)

        reader = Gene2GoReader("geneinfo_cache/gene2go", taxids=[taxid], prt=null)
        searcher = GoSearch(
            "geneinfo_cache/go-basic.obo",
            go2items=reader.get_id2gos(namespace='*', go2geneids=True, prt=null),
            log=null,
        )

        pattern_wanted = re.compile(r'({})'.format(regex), flags=re.IGNORECASE)
        pattern_unwanted = re.compile(r'({}).independent'.format(regex), flags=re.IGNORECASE)

        matched = searcher.get_matching_gos(pattern_wanted, prt=null)
        # The exclusion search is restricted to the terms already matched.
        rejected = searcher.get_matching_gos(pattern_unwanted, gos=matched)
        selected = matched.difference(rejected)

        if add_children:
            selected = searcher.add_children_gos(selected)

    return list(selected)
def test_gosearch(log=sys.stdout):
    """Test GoSearch class with per-taxid annotations and with no annotations.

    Builds a ``GoSearch`` object for each taxid's GO2GeneIDs associations and
    one more with an empty annotation dict, and checks that each of them
    loaded an ontology DAG of more than 40000 entries.

    Parameters
    ----------
    log : file-like, optional
        Stream handed to ``GoSearch`` for its log output.
    """
    taxids = [9606, 10090]

    # Download ontologies and annotations, if necessary
    fin_go_obo = os.path.join(REPO, "go-basic.obo")
    download_go_basic_obo(fin_go_obo, loading_bar=None)

    # Because get_assoc_ncbi_taxids returns id2gos, we will opt to
    # use the (optional) multi-level dictionary to separate associations by
    # taxid. taxid2asscs contains both GO2GeneIDs and GeneID2GOs.
    taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    get_assoc_ncbi_taxids(taxids, taxid2asscs=taxid2asscs, loading_bar=None)

    # Initialize GO-search helper object with obo and annotations(go2items)
    for taxid in taxids:
        obj = GoSearch(fin_go_obo, go2items=taxid2asscs[taxid]['GO2GeneIDs'], log=log)
        assert len(obj.obo_dag) > 40000

    # No annotations: bind the new object so the assertion checks *it*,
    # not the last object left over from the loop above.
    obj_noanno = GoSearch(fin_go_obo, dict(), log=log)
    assert len(obj_noanno.obo_dag) > 40000
def get_genes_cell_cycle(taxid=9606, log=sys.stdout):
    """Return the gene IDs annotated to 'cell cycle' GO terms for one taxid.

    GO terms matching 'cell cycle' are collected with ``GoSearch``, terms
    matching 'cell cycle.independent' are dropped again, children of the
    remaining terms are added, and the items associated with the final GO
    set are returned.  The search report is written to
    ``cell_cycle_gos_<taxid>.log`` in the current directory.

    Parameters
    ----------
    taxid : int, optional
        NCBI taxonomy id whose associations are searched.
    log : file-like or None, optional
        Receives a two-line summary; pass None to suppress it.

    Returns
    -------
    The result of ``GoSearch.get_items`` for the final GO set
    (Entrez GeneIDs).
    """
    # Fetch the ontology only when it is not already in the working directory.
    obo_path = "go-basic.obo"
    if not os.path.exists(obo_path):
        wget.download("http://geneontology.org/ontology/go-basic.obo")

    # Multi-level dict keyed by taxid; after loading it holds both the
    # GO2GeneIDs and the GeneID2GOs associations.
    assc_by_taxid = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    get_assoc_ncbi_taxids([taxid], taxid2asscs=assc_by_taxid)

    searcher = GoSearch(obo_path, go2items=assc_by_taxid[taxid]['GO2GeneIDs'])

    pat_cell_cycle = re.compile(r'cell cycle', flags=re.IGNORECASE)
    report_name = "cell_cycle_gos_{TAXID}.log".format(TAXID=taxid)

    with open(report_name, "w") as prt:
        # Every GO term mentioning 'cell cycle'.
        matched = searcher.get_matching_gos(pat_cell_cycle, prt=prt)

        # Some hits (e.g. GO:0005764, lysosome) only match because of the
        # phrase 'cell cycle-independent'; search the hits for that phrase
        # and discard them.
        pat_independent = re.compile(r'cell cycle.independent', flags=re.IGNORECASE)
        rejected = searcher.get_matching_gos(pat_independent, gos=matched, prt=prt)
        kept = matched.difference(rejected)

        # Extend the kept terms with their children.
        with_children = searcher.add_children_gos(kept)

        if log is not None:
            log.write(' taxid {TAXID:>5}\n'.format(TAXID=taxid))
            log.write(' FOUND {N:>5} GOs: {F}\n'.format(
                N=len(with_children), F=report_name))

    return searcher.get_items(with_children)
def get_genes_cell_cycle(taxid=9606, log=sys.stdout):
    """Collect the genes associated with cell-cycle GO terms for a taxid.

    Steps: make sure ``go-basic.obo`` is present, load the NCBI associations
    for ``taxid``, search GO terms for 'cell cycle', remove the terms that
    match 'cell cycle.independent', add the children of what is left, and
    look up the items annotated to those GO terms.  The search output goes to
    the file ``cell_cycle_gos_<taxid>.log``; a short summary goes to ``log``
    unless it is None.

    Returns whatever ``GoSearch.get_items`` yields for the final GO set
    (Entrez GeneIDs).
    """
    go_obo_file = "go-basic.obo"
    ontology_present = os.path.exists(go_obo_file)
    if not ontology_present:
        wget.download("http://geneontology.org/ontology/go-basic.obo")

    # get_assoc_ncbi_taxids fills this nested mapping per taxid with both
    # the GO2GeneIDs and the GeneID2GOs associations.
    associations = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    get_assoc_ncbi_taxids([taxid], taxid2asscs=associations)
    go2geneids = associations[taxid]['GO2GeneIDs']
    go_search = GoSearch(go_obo_file, go2items=go2geneids)

    include_re = re.compile(r'cell cycle', flags=re.IGNORECASE)
    # Matches 'cell cycle-independent', which makes unrelated terms such as
    # GO:0005764 (lysosome) show up in a plain 'cell cycle' search.
    exclude_re = re.compile(r'cell cycle.independent', flags=re.IGNORECASE)

    log_path = "cell_cycle_gos_{TAXID}.log".format(TAXID=taxid)
    with open(log_path, "w") as prt:
        candidates = go_search.get_matching_gos(include_re, prt=prt)
        false_hits = go_search.get_matching_gos(exclude_re, gos=candidates, prt=prt)
        cell_cycle_gos = go_search.add_children_gos(candidates.difference(false_hits))
        if log is not None:
            log.write(' taxid {TAXID:>5}\n'.format(TAXID=taxid))
            log.write(' FOUND {N:>5} GOs: {F}\n'.format(N=len(cell_cycle_gos), F=log_path))

    gene_ids = go_search.get_items(cell_cycle_gos)
    return gene_ids
def get_genes_for_go_terms(terms, taxid=9606):
    """Return a table of the genes annotated with the given GO terms.

    Gene ids annotated to ``terms`` are looked up with ``GoSearch`` and then
    summarised through ``Entrez.esummary`` (db="gene") in batches.

    Parameters
    ----------
    terms : str or list/tuple/set of str
        One GO term id or a collection of them.
    taxid : int, optional
        NCBI taxonomy id used to select the gene2go associations.

    Returns
    -------
    pandas.DataFrame
        Columns ``symbol``, ``name``, ``chrom``, ``start``, ``end``; sorted by
        ``start`` with a fresh index.  Genes without usable chromosome
        coordinates get ``pandas.NA`` in the last three columns and a warning
        on stderr.
    """
    # Normalise `terms` to a list.  A single string is one term; tuples and
    # sets are collections of terms (wrapping them whole would make the
    # lookup treat the entire collection as one term id).
    if isinstance(terms, str):
        terms = [terms]
    elif isinstance(terms, (tuple, set, frozenset)):
        terms = list(terms)
    elif not isinstance(terms, list):
        terms = [terms]

    # Discard the helpers' progress output (explicit streams and stdout).
    with open(os.devnull, 'w') as null, redirect_stdout(null):
        # Called for their side effects; the return values are not needed.
        download_and_move_go_basic_obo(prt=null)
        download_ncbi_associations(prt=null)

        objanno = Gene2GoReader("geneinfo_cache/gene2go", taxids=[taxid], prt=null)
        go2geneids = objanno.get_id2gos(namespace='*', go2geneids=True, prt=null)
        srchhelp = GoSearch("geneinfo_cache/go-basic.obo", go2items=go2geneids, log=null)
        geneids = srchhelp.get_items(terms)

        ncbi_tsv = f'geneinfo_cache/{taxid}_protein_genes.txt'
        if not os.path.exists(ncbi_tsv):
            fetch_background_genes(taxid)

        output_py = f'geneinfo_cache/{taxid}_protein_genes.py'
        ncbi_tsv_to_py(ncbi_tsv, output_py, prt=null)
        # The generated module is still imported as before, but nothing from
        # it is used in this function.
        importlib.import_module(output_py.replace('.py', '').replace('/', '.'))

    fetch_ids = [str(geneid) for geneid in geneids]
    records = []
    batch_size = 2000
    for i in range(0, len(fetch_ids), batch_size):
        to_fetch = fetch_ids[i:i + batch_size]
        handle = Entrez.esummary(db="gene", id=",".join(to_fetch), retmax=batch_size)
        try:
            entry = Entrez.read(handle)
        finally:
            # Release the connection even when parsing fails.
            handle.close()

        for doc in entry['DocumentSummarySet']['DocumentSummary']:
            try:
                chrom_pos = (doc['Chromosome'],
                             doc['GenomicInfo'][0]['ChrStart'],
                             doc['GenomicInfo'][0]['ChrStop'])
            except (KeyError, IndexError, TypeError):
                # Missing key or empty GenomicInfo: keep the gene, flag the gap.
                print(
                    f"WARNING: missing chromosome coordinates for {doc['Name']} are listed as pandas.NA",
                    file=sys.stderr)
                chrom_pos = (pd.NA, pd.NA, pd.NA)
            records.append((doc['Name'], doc['Description'], *chrom_pos))

    df = pd.DataFrame.from_records(
        records, columns=['symbol', 'name', 'chrom', 'start', 'end'])
    # NOTE(review): if Entrez returns the coordinates as strings, this sort
    # is lexicographic rather than numeric - confirm before relying on order.
    return df.sort_values(by='start').reset_index(drop=True)
def get_go_ids(go_ids, species='Homo sapiens'):
    '''
    Fetch all gene symbols associated with a list of gene ontology term IDs.

    The given GO terms are extended with their children
    (``GoSearch.add_children_gos``), the gene ids annotated to any of those
    terms are collected, and each gene id is translated to its symbol using
    the ``GENEID2NT`` table of the matching ``goatools.test_data`` module.
    Gene ids absent from that table are skipped.

    Parameters
    ----------
    go_ids : str or iterable of str
        One GO term id or several.
    species : str, optional
        Key into ``TAXA`` selecting the taxonomy id.

    Returns
    -------
    list of str
        Gene symbols, in no particular order.

    Raises
    ------
    AssertionError
        If ``species`` is not a key of ``TAXA``.
    '''
    assert species in TAXA, 'unknown species: {!r}'.format(species)

    # `go_ids` is iterated twice below, so materialise it once.
    if isinstance(go_ids, str):
        go_ids = [go_ids]
    else:
        go_ids = list(go_ids)

    # Called for their side effects; the return values are not needed.
    download_go_basic_obo('db/go/go-basic.obo')
    download_ncbi_associations('db/go/gene2go')

    taxid = TAXA[species]
    fin_symbols = 'genes_NCBI_{TAXID}_All.py'.format(TAXID=taxid)
    # Strip the '.py' suffix to obtain the importable module name.
    module_name = 'goatools.test_data.' + fin_symbols[:-3]
    geneid2nt = importlib.import_module(module_name).GENEID2NT

    # Build GO id -> [gene ids] from the associations of this taxid.
    annotations = Gene2GoReader(
        'db/go/gene2go',
        taxids=[taxid],
    )
    go2items = defaultdict(list)
    for assoc in annotations.taxid2asscs[taxid]:
        go2items[assoc.GO_ID].append(assoc.DB_ID)

    srchhelp = GoSearch('db/go/go-basic.obo', go2items=go2items)

    # Add children GOs
    gos_all = srchhelp.add_children_gos(go_ids)

    # Gene ids annotated to the requested terms or to their children.
    gene_ids = set()
    for go_items in (go_ids, gos_all):
        gene_ids.update(srchhelp.get_items(go_items))

    symbols = []
    for geneid in gene_ids:
        nt = geneid2nt.get(geneid)
        if nt is not None:
            symbols.append(nt.Symbol)
    return symbols