Example #1
0
def get_terms_for_go_regex(regex, taxid=9606, add_children=False):
    """Return the GO terms that ``GoSearch`` reports as matching ``regex``.

    The pattern is compiled case-insensitively. Terms that also match
    ``(<regex>).independent`` are dropped from the result.

    Parameters
    ----------
    regex : str
        Regular-expression source inserted verbatim into the search patterns.
    taxid : optional
        Taxonomy identifier; passed through ``_tidy_taxid`` before use.
    add_children : bool, optional
        When true, the result of ``GoSearch.add_children_gos`` on the kept
        terms is returned instead of the kept terms themselves.

    Returns
    -------
    list
        The selected GO terms, in no particular order (built from a set).
    """
    taxid = _tidy_taxid(taxid)

    # Everything printed while loading/searching is sent to os.devnull.
    with open(os.devnull, 'w') as sink, redirect_stdout(sink):

        # Return values are not needed: the files are read from the fixed
        # cache paths below.
        download_and_move_go_basic_obo(prt=sink)
        download_ncbi_associations(prt=sink)

        reader = Gene2GoReader("geneinfo_cache/gene2go", taxids=[taxid], prt=sink)
        go_to_genes = reader.get_id2gos(namespace='*', go2geneids=True, prt=sink)
        searcher = GoSearch("geneinfo_cache/go-basic.obo",
                            go2items=go_to_genes,
                            log=sink)

        wanted = re.compile(r'({})'.format(regex), flags=re.IGNORECASE)
        unwanted = re.compile(r'({}).independent'.format(regex),
                              flags=re.IGNORECASE)

        matched = searcher.get_matching_gos(wanted, prt=sink)
        # Second pass only looks at the terms found by the first pass.
        rejected = searcher.get_matching_gos(unwanted, gos=matched)
        kept = matched.difference(rejected)

        return list(searcher.add_children_gos(kept) if add_children else kept)
Example #2
0
def test_gosearch(log=sys.stdout):
    """Test GoSearch class with per-taxid annotations and with no annotations.

    Parameters
    ----------
    log : file-like, optional
        Stream handed to ``GoSearch`` as its ``log`` argument.

    Raises
    ------
    AssertionError
        If any constructed ``GoSearch`` exposes an ``obo_dag`` with 40000 or
        fewer entries.
    """
    taxids = [9606, 10090]
    # Download ontologies and annotations, if necessary
    fin_go_obo = os.path.join(REPO, "go-basic.obo")
    download_go_basic_obo(fin_go_obo, loading_bar=None)
    # Because get_assoc_ncbi_taxids returns id2gos, we will opt to
    # use the (optional) multi-level dictionary separate associations by taxid
    # taxid2asscs contains both GO2GeneIDs and GeneID2GOs.
    taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    get_assoc_ncbi_taxids(taxids, taxid2asscs=taxid2asscs, loading_bar=None)

    # Initialize GO-search helper object with obo and annotations(go2items)
    for taxid in taxids:
        obj = GoSearch(fin_go_obo, go2items=taxid2asscs[taxid]['GO2GeneIDs'], log=log)
        assert len(obj.obo_dag) > 40000

    # No-annotation case: bind the new object so the assertion inspects it
    # rather than the last object created in the loop above.
    obj_noanno = GoSearch(fin_go_obo, dict(), log=log)
    assert len(obj_noanno.obo_dag) > 40000
def get_genes_cell_cycle(taxid=9606, log=sys.stdout):
    """Return the items annotated to 'cell cycle' GO terms for one taxid.

    Steps:
      1. Download ``go-basic.obo`` into the working directory if it is absent.
      2. Load the NCBI associations for ``taxid``.
      3. Search GO terms for 'cell cycle', discard those that match
         'cell cycle.independent', and add the children of what remains.
      4. Look up the items (Entrez GeneIDs, per the upstream comments)
         associated with the resulting GO terms.

    The search output is written to ``cell_cycle_gos_<taxid>.log``.

    Parameters
    ----------
    taxid : int, optional
        Taxonomy identifier whose associations are searched.
    log : file-like or None, optional
        Receives a two-line summary; pass ``None`` to suppress it.

    Returns
    -------
    The value of ``GoSearch.get_items`` for the expanded GO set.
    """
    obo_path = "go-basic.obo"
    if not os.path.exists(obo_path):
        wget.download("http://geneontology.org/ontology/go-basic.obo")

    # Nested mapping handed to get_assoc_ncbi_taxids; afterwards it is read as
    # assoc_by_taxid[taxid]['GO2GeneIDs'].
    assoc_by_taxid = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    get_assoc_ncbi_taxids([taxid], taxid2asscs=assoc_by_taxid)

    searcher = GoSearch(obo_path, go2items=assoc_by_taxid[taxid]['GO2GeneIDs'])

    pat_cell_cycle = re.compile(r'cell cycle', flags=re.IGNORECASE)
    # 'cell cycle-independent' hits (e.g. GO:0005764, lysosome, per the
    # original author's review) are false positives and get filtered out.
    pat_independent = re.compile(r'cell cycle.independent', flags=re.IGNORECASE)

    fout_allgos = "cell_cycle_gos_{TAXID}.log".format(TAXID=taxid)
    with open(fout_allgos, "w") as report:
        hits = searcher.get_matching_gos(pat_cell_cycle, prt=report)
        false_hits = searcher.get_matching_gos(pat_independent, gos=hits, prt=report)
        expanded = searcher.add_children_gos(hits.difference(false_hits))
        if log is not None:
            log.write('    taxid {TAXID:>5}\n'.format(TAXID=taxid))
            log.write('    FOUND {N:>5} GOs:   {F}\n'.format(
                N=len(expanded), F=fout_allgos))

    return searcher.get_items(expanded)
Example #4
0
def get_genes_cell_cycle(taxid=9606, log=sys.stdout):
    """Find the items associated with 'cell cycle' GO terms for ``taxid``.

    ``go-basic.obo`` is downloaded to the working directory when missing, the
    NCBI associations for ``taxid`` are loaded, and GO terms are searched for
    'cell cycle'. Terms matching 'cell cycle.independent' are removed, the
    children of the remaining terms are added, and the associated items are
    returned. The upstream comments describe these items as Entrez GeneIDs.

    The search output goes to ``cell_cycle_gos_<taxid>.log``. If ``log`` is
    not ``None`` it receives a short two-line summary.
    """
    go_obo = "go-basic.obo"
    obo_missing = not os.path.exists(go_obo)
    if obo_missing:
        wget.download("http://geneontology.org/ontology/go-basic.obo")

    # Multi-level dictionary passed to get_assoc_ncbi_taxids; the
    # 'GO2GeneIDs' sub-dictionary for this taxid feeds the search helper.
    asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    get_assoc_ncbi_taxids([taxid], taxid2asscs=asscs)
    go2geneids = asscs[taxid]['GO2GeneIDs']

    helper = GoSearch(go_obo, go2items=go2geneids)

    include_re = re.compile(r'cell cycle', flags=re.IGNORECASE)
    # Terms that only mention 'cell cycle-independent' are unwanted matches.
    exclude_re = re.compile(r'cell cycle.independent', flags=re.IGNORECASE)

    fout_allgos = "cell_cycle_gos_{TAXID}.log".format(TAXID=taxid)
    with open(fout_allgos, "w") as out:
        candidates = helper.get_matching_gos(include_re, prt=out)
        unwanted = helper.get_matching_gos(exclude_re, gos=candidates, prt=out)
        selected = candidates.difference(unwanted)
        with_children = helper.add_children_gos(selected)
        if log is not None:
            summary_taxid = '    taxid {TAXID:>5}\n'.format(TAXID=taxid)
            log.write(summary_taxid)
            summary_found = '    FOUND {N:>5} GOs:   {F}\n'.format(
                N=len(with_children), F=fout_allgos)
            log.write(summary_found)

    return helper.get_items(with_children)
Example #5
0
def get_genes_for_go_terms(terms, taxid=9606):
    """Return a table of the genes associated with the given GO terms.

    Gene IDs are looked up through ``GoSearch.get_items`` and then described
    via NCBI Entrez ``esummary`` (database ``gene``), 2000 IDs per request.

    Parameters
    ----------
    terms : str or list/tuple/set of str
        One GO term or a collection of GO terms. Any value that is not a
        list, tuple, set or frozenset is treated as a single term.
    taxid : int, optional
        Taxonomy identifier used to select associations and cache files.

    Returns
    -------
    pandas.DataFrame
        Columns ``symbol``, ``name``, ``chrom``, ``start``, ``end``, sorted by
        ``start`` with a fresh index. Genes whose coordinates cannot be read
        from the Entrez summary get ``pandas.NA`` in the last three columns
        and trigger a warning on stderr.
    """
    # Accept any common collection of terms; everything else is one term.
    if isinstance(terms, (list, tuple, set, frozenset)):
        terms = list(terms)
    else:
        terms = [terms]

    with open(os.devnull, 'w') as null, redirect_stdout(null):

        # Return values are not needed: files are read from fixed cache paths.
        download_and_move_go_basic_obo(prt=null)
        download_ncbi_associations(prt=null)
        objanno = Gene2GoReader("geneinfo_cache/gene2go",
                                taxids=[taxid],
                                prt=null)
        go2geneids = objanno.get_id2gos(namespace='*',
                                        go2geneids=True,
                                        prt=null)
        srchhelp = GoSearch("geneinfo_cache/go-basic.obo",
                            go2items=go2geneids,
                            log=null)

        geneids = srchhelp.get_items(terms)

        ncbi_tsv = f'geneinfo_cache/{taxid}_protein_genes.txt'
        if not os.path.exists(ncbi_tsv):
            fetch_background_genes(taxid)

        output_py = f'geneinfo_cache/{taxid}_protein_genes.py'
        ncbi_tsv_to_py(ncbi_tsv, output_py, prt=null)

    # The generated module is still imported as before, but nothing from it is
    # used below. NOTE(review): confirm whether this import is still required.
    importlib.import_module(output_py.replace('.py', '').replace('/', '.'))

    fetch_ids = [str(geneid) for geneid in geneids]
    records = []
    batch_size = 2000
    for i in range(0, len(fetch_ids), batch_size):
        to_fetch = fetch_ids[i:i + batch_size]
        handle = Entrez.esummary(db="gene",
                                 id=",".join(to_fetch),
                                 retmax=batch_size)
        try:
            entry = Entrez.read(handle)
        finally:
            # Release the connection even when parsing fails.
            handle.close()

        for doc in entry['DocumentSummarySet']['DocumentSummary']:
            try:
                genomic = doc['GenomicInfo'][0]
                chrom_pos = (doc['Chromosome'],
                             genomic['ChrStart'],
                             genomic['ChrStop'])
            except (KeyError, IndexError, TypeError):
                # Only "coordinates not present" style failures are tolerated;
                # interrupts and unrelated errors propagate.
                print(
                    f"WARNING: missing chromosome coordinates for {doc['Name']} are listed as pandas.NA",
                    file=sys.stderr)
                chrom_pos = (pd.NA, pd.NA, pd.NA)
            records.append((doc['Name'], doc['Description'], *chrom_pos))

    df = pd.DataFrame.from_records(
        records, columns=['symbol', 'name', 'chrom', 'start', 'end'])

    # NOTE(review): 'start' holds the values exactly as Entrez returned them;
    # if those are strings this ordering is lexicographic - confirm.
    return df.sort_values(by='start').reset_index(drop=True)
Example #6
0
def get_go_ids(go_ids, species='Homo sapiens'):
    '''
    Fetch all gene symbols associated with a list of gene ontology term IDs.

    The GO terms are expanded with ``GoSearch.add_children_gos`` before the
    associated gene IDs are collected. Gene IDs without an entry in the
    ``GENEID2NT`` table of ``goatools.test_data.genes_NCBI_<taxid>_All`` are
    skipped.

    Parameters
    ----------
    go_ids : str or list of str
        One GO term ID or a list of GO term IDs.
    species : str, optional
        Species name; must be a key of ``TAXA``.

    Returns
    -------
    list of str
        Gene symbols, in no particular order.

    Raises
    ------
    AssertionError
        If ``species`` is not a key of ``TAXA``.
    '''
    assert species in TAXA, 'unknown species: {!r}'.format(species)

    if isinstance(go_ids, str):
        go_ids = [go_ids]

    obo_path = 'db/go/go-basic.obo'
    gene2go_path = 'db/go/gene2go'

    # Return values are not needed: the same paths are reused below.
    download_go_basic_obo(obo_path)
    download_ncbi_associations(gene2go_path)

    taxid = TAXA[species]

    module_name = 'goatools.test_data.genes_NCBI_{TAXID}_All'.format(TAXID=taxid)
    geneid2nt = importlib.import_module(module_name).GENEID2NT

    reader = Gene2GoReader(
        gene2go_path,
        taxids=[taxid],
    )

    # Group the association records into GO ID -> list of DB IDs.
    go2items = defaultdict(list)
    for assc in reader.taxid2asscs[taxid]:
        go2items[assc.GO_ID].append(assc.DB_ID)

    srchhelp = GoSearch(obo_path, go2items=go2items)

    # Add children GOs
    gos_all = srchhelp.add_children_gos(go_ids)

    # Collect gene IDs for both the requested terms and the expanded set.
    gene_ids = set()
    for go_items in (go_ids, gos_all):
        gene_ids.update(srchhelp.get_items(go_items))

    return [
        nt.Symbol
        for nt in (geneid2nt.get(geneid) for geneid in gene_ids)
        if nt is not None
    ]