Esempio n. 1
0
def get_species_genes(species):
    """
    Build GO associations and gene id/symbol lookup tables for a species.

    'human'/'homo_sapiens' loads taxid 9606 with GeneID2nt_hum,
    'mouse'/'mus_musculus' loads taxid 10090 with GeneID2nt_mus, and any
    other value loads both taxids and merges both tables (mouse first, so a
    human entry wins when the same Symbol occurs in both).

    Returns:
        tuple: (ns2assoc, ids_to_symbols, symbols_to_ids, genes_list) where
        ns2assoc comes from Gene2GoReader.get_ns2assc(), symbols_to_ids maps
        each table value's ``Symbol`` to its key, ids_to_symbols is the
        inverse of symbols_to_ids, and genes_list holds the table keys (a
        keys view for a single species, a set for the merged case).
    """
    if species in ('human', 'homo_sapiens'):
        taxids, tables = [9606], (GeneID2nt_hum,)
        genes_list = GeneID2nt_hum.keys()
    elif species in ('mouse', 'mus_musculus'):
        taxids, tables = [10090], (GeneID2nt_mus,)
        genes_list = GeneID2nt_mus.keys()
    else:
        # TODO: overlapping gene names
        taxids, tables = [10090, 9606], (GeneID2nt_mus, GeneID2nt_hum)
        genes_list = set(GeneID2nt_mus.keys()).union(GeneID2nt_hum.keys())

    objanno = Gene2GoReader(gene2go_path, taxids=taxids)

    # Later tables overwrite earlier ones when a Symbol repeats.
    symbols_to_ids = {}
    for table in tables:
        for gene_id, record in table.items():
            symbols_to_ids[record.Symbol] = gene_id

    ns2assoc = objanno.get_ns2assc()
    ids_to_symbols = {
        gene_id: symbol
        for symbol, gene_id in symbols_to_ids.items()
    }
    return ns2assoc, ids_to_symbols, symbols_to_ids, genes_list
Esempio n. 2
0
def test_i147_all_taxids():
    """Check that per-taxid associations agree across differently loaded readers."""
    # Fetch the ontology (go-basic.obo) and NCBI's gene2go, if necessary
    download_go_basic_obo()
    gene2go_file = download_ncbi_associations()

    # Parse the ontology once and share it between the readers
    dag = GODag("go-basic.obo")

    # Three readers over the same file: every taxid, 10090 only, 10090 + 9606
    reader_all = Gene2GoReader(gene2go_file, godag=dag, taxids=True)
    reader_mmu = Gene2GoReader(gene2go_file, godag=dag, taxids=[10090])
    reader_mmuhsa = Gene2GoReader(gene2go_file,
                                  godag=dag,
                                  taxids=[10090, 9606])

    # Namespace-to-association dicts requested per taxid from each reader
    # pylint: disable=bad-whitespace
    mmu_from_all = _run_get_ns2assc(10090, reader_all)
    mmu_from_mmu = _run_get_ns2assc(10090, reader_mmu)
    all_from_mmuhsa = _run_get_ns2assc(True, reader_mmuhsa)
    mmu_from_mmuhsa = _run_get_ns2assc(10090, reader_mmuhsa)

    # Taxid 10090 must look the same no matter how much else was loaded
    for namespace in ('BP', 'MF', 'CC'):
        assert mmu_from_mmu[namespace] == mmu_from_all[namespace]
        assert mmu_from_mmu[namespace] == mmu_from_mmuhsa[namespace]
    _chk_mmuhsa_all(reader_mmuhsa, reader_all, all_from_mmuhsa)
Esempio n. 3
0
    def __init__(
        self,
        work_dir: str = '.',
        clean_work_dir: bool = False,
        organism: str = 'human',
        study_parameters: Union[Dict[str, Union[int, float, str, List, Dict]],
                                None] = None
    ) -> None:
        """Create a GOEngine that can be used for performing analysis using GOATOOLS.

        Args:
            work_dir (str, optional): Path to an existing directory that the
                GO ontology and the NCBI gene2go files are downloaded to.
                Defaults to the current working directory.
            clean_work_dir (bool, optional): Stored as ``self._clean_work_dir``;
                intended to control whether data written to the work directory
                is removed at class termination. Defaults to False.
            organism (str, optional): Either 'human' or 'mouse'. Defaults to
                'human'.
            study_parameters (dict, optional): Keyword arguments forwarded to
                GOEnrichmentStudyNS. When None (the default) this is
                {'propagate_counts': False, 'alpha': 0.05, 'methods': ['fdr_bh']}.

        Raises:
            ValueError: If ``work_dir`` does not exist or ``organism`` is
                neither 'human' nor 'mouse'.
        """
        # Build the defaults per call instead of sharing one dict object
        # between every instance through the signature.
        if study_parameters is None:
            study_parameters = {
                'propagate_counts': False,
                'alpha': 0.05,
                'methods': ['fdr_bh']
            }
        print("Creating a GO Engine ...")
        if not os.path.exists(work_dir):
            raise ValueError(
                f"The provided work path: {work_dir} does not exist!!!")
        self.work_dir = work_dir
        if organism not in ('human', 'mouse'):
            raise ValueError(
                f"The provided organism: {organism} is not support, current engine mainly work with human and moues only"
            )
        print(f"\t --> Downloading data ...")
        obo_fname = download_go_basic_obo(
            os.path.join(work_dir, 'go-basic.obo'))
        gene2go_fname = download_ncbi_associations(
            os.path.join(work_dir, 'gene2go'))
        ## parse the GO term
        print(
            f"\t --> parsing the data and intializing the base GOEA object...")
        obo_dag = GODag(obo_fname)
        # Taxid passed to the gene2go reader: 9606 for human, 10090 for mouse.
        taxid = 9606 if organism == 'human' else 10090
        # NOTE(review): the background population is gene2iden_human for BOTH
        # organisms, exactly as in the original code. For 'mouse' this looks
        # like a copy-paste slip -- confirm whether a mouse gene table should
        # be used here.
        self._goea_obj = GOEnrichmentStudyNS(
            gene2iden_human.keys(),
            Gene2GoReader(gene2go_fname, taxids=[taxid]).get_ns2assc(),
            obo_dag, **study_parameters)
        self._clean_work_dir = clean_work_dir
        self._gene_ids = None
Esempio n. 4
0
def test_anno_read():
    """Compare Gene2GoReader results with the legacy read_ncbi_gene2go_old reader."""
    anno_path = os.path.join(REPO, 'gene2go')
    _dnld_anno(anno_path)

    # Without a taxids argument the reader is expected to hold one taxid
    print('\nTEST STORING ONLY ONE SPECIES')
    reader = Gene2GoReader(anno_path)
    assert len(reader.taxid2asscs) == 1
    reader.prt_summary_anno2ev()

    # taxids=True is expected to hold more than one taxid
    print('\nTEST STORING ALL SPECIES')
    reader = Gene2GoReader(anno_path, taxids=True)
    assert len(reader.taxid2asscs) > 1, \
        '**EXPECTED MORE: len(taxid2asscs) == {N}'.format(N=len(reader.taxid2asscs))
    reader.prt_summary_anno2ev()

    # Gene-to-GO associations for taxid 9606, requested as a list and as an int
    print('\nTEST GETTING ASSOCIATIONS FOR ONE SPECIES')
    print("\nTEST read_ncbi_gene2go_old: [9606]")
    legacy_g2go = read_ncbi_gene2go_old(anno_path, [9606])
    current_g2go = reader.get_id2gos_nss(taxids=[9606])
    assert legacy_g2go == current_g2go, \
        'OLD({O}) != NEW({N})'.format(O=len(legacy_g2go), N=len(current_g2go))
    print("\nTEST read_ncbi_gene2go_old: 9606")
    assert legacy_g2go == reader.get_id2gos_nss(taxid=9606)

    # Reverse direction: GO-to-genes
    print('\nTEST GETTING REVERSE ASSOCIATIONS: GO2GENES')
    print("\nTEST read_ncbi_gene2go_old: 9606 go2geneids=True")
    legacy_go2genes = read_ncbi_gene2go_old(anno_path, [9606], go2geneids=True)
    current_go2genes = reader.get_id2gos_nss(taxid=9606, go2geneids=True)
    print('OLD:', next(iter(legacy_go2genes.items())))
    print('NEW:', next(iter(current_go2genes.items())))
    assert legacy_go2genes == current_go2genes, \
        'OLD({O}) != NEW({N})'.format(O=len(legacy_go2genes), N=len(current_go2genes))

    # Restrict the associations to a chosen set of evidence codes
    print('\nTEST RETURNING ASSOCIATIONS FOR SELECTED EVIDENCE CODES')
    wanted_codes = {'ISO', 'IKR'}
    print("\nTEST read_ncbi_gene2go_old: 9606 evcodes=True")
    legacy_by_ev = read_ncbi_gene2go_old(anno_path, taxids=[9606], ev_include=wanted_codes)
    current_by_ev = reader.get_id2gos_nss(taxid=9606, ev_include=wanted_codes)
    print('OLD:', next(iter(legacy_by_ev.items())))
    print('NEW:', next(iter(current_by_ev.items())))
    assert legacy_by_ev == current_by_ev
Esempio n. 5
0
def get_objanno(fin_anno, anno_type=None, **kws):
    """Return an annotation reader for a GAF, GPAD, Entrez gene2go, or text file.

    The annotation type is resolved with get_anno_desc; only the keyword
    arguments listed in the chosen reader's ``exp_kws`` are forwarded to it.
    Raises RuntimeError when the resolved type matches no known reader.
    """
    # kws get_objanno: taxids hdr_only prt allow_missing_symbol
    anno_type = get_anno_desc(fin_anno, anno_type)
    readers = {
        'gene2go': Gene2GoReader,  # kws: taxid taxids
        'gaf': GafReader,
        'gpad': GpadReader,
        'id2gos': IdToGosReader,
    }
    reader_cls = readers.get(anno_type)
    if reader_cls is None:
        raise RuntimeError('UNEXPECTED ANNOTATION FILE FORMAT: {F} {D}'.format(
            F=fin_anno, D=anno_type))
    accepted = {
        key: kws[key]
        for key in reader_cls.exp_kws.intersection(kws.keys())
    }
    return reader_cls(fin_anno, **accepted)
Esempio n. 6
0
def get_gene_id_mapping(organism=9606, force=False):
	"""Map the string form of each annotated gene id to the id itself.

	Args:
		organism: Taxid passed to Gene2GoReader (default 9606).
		force: When truthy, an existing local 'gene2go' file is deleted
			before the download call (presumably so it is fetched again --
			TODO confirm the downloader's default target is 'gene2go').

	Returns:
		dict: ``{str(gene_id): gene_id}`` for every key returned by
		``Gene2GoReader(...).get_id2gos_nss()``.
	"""
	# Logical `and`, not bitwise `&`: `&` gives the wrong answer for truthy
	# non-bool flags (2 & True == 0) and raises TypeError for e.g. strings.
	if force and os.path.isfile('gene2go'):
		os.remove('gene2go')

	gene2go = Gene2GoReader(goatools.base.download_ncbi_associations(), taxids=[organism]).get_id2gos_nss()

	return {str(gene): gene for gene in gene2go}
Esempio n. 7
0
def get_terms_for_go_regex(regex, taxid=9606, add_children=False):
    """Return a list of GO ids matching `regex`, case-insensitively.

    Ids that also match ``(<regex>).independent`` are removed from the
    result. With `add_children` the remaining set is expanded through
    GoSearch.add_children_gos. Standard output is redirected to os.devnull
    while the work is done.
    """
    taxid = _tidy_taxid(taxid)

    with open(os.devnull, 'w') as sink, redirect_stdout(sink):
        # Return values of the two download calls are not used below.
        download_and_move_go_basic_obo(prt=sink)
        download_ncbi_associations(prt=sink)

        reader = Gene2GoReader("geneinfo_cache/gene2go",
                               taxids=[taxid],
                               prt=sink)
        go_to_genes = reader.get_id2gos(namespace='*',
                                        go2geneids=True,
                                        prt=sink)
        searcher = GoSearch("geneinfo_cache/go-basic.obo",
                            go2items=go_to_genes,
                            log=sink)

        wanted = re.compile(r'({})'.format(regex), flags=re.IGNORECASE)
        unwanted = re.compile(r'({}).independent'.format(regex),
                              flags=re.IGNORECASE)

        matched = searcher.get_matching_gos(wanted, prt=sink)
        rejected = searcher.get_matching_gos(unwanted, gos=matched)
        selected = matched.difference(rejected)
        if add_children:
            selected = searcher.add_children_gos(selected)

        return list(selected)
Esempio n. 8
0
def test_i147_all_taxids():
    """Load gene2go associations for all taxids and print per-namespace gene counts."""
    # Fetch the ontology (go-basic.obo) and NCBI's gene2go, if necessary
    download_go_basic_obo()
    gene2go_file = download_ncbi_associations()

    # Parse the ontology, then read the associations with taxids=True
    dag = GODag("go-basic.obo")
    reader = Gene2GoReader(gene2go_file, godag=dag, taxids=True)

    # Keys are namespaces (BP, MF, CC); each value maps a gene id to its
    # set of GO ids
    by_namespace = reader.get_ns2assc()

    for namespace, gene_to_gos in by_namespace.items():
        count = len(gene_to_gos)
        print("{NS} {N:,} annotated mouse genes".format(NS=namespace, N=count))
Esempio n. 9
0
    def __GO_enrich__(self):
        """Build a GOEnrichmentStudyNS for taxid 9606 and store it as ``self.goeaobj``."""
        obo_path = "go-basic.obo"
        # Only call the ontology downloader when the file is not on disk
        if not os.path.exists(obo_path):
            download_go_basic_obo()
        ontology = GODag(obo_path)

        # Namespace (BP/MF/CC) -> {gene id: set of GO ids}, read from gene2go
        gene2go_path = download_ncbi_associations()
        associations = Gene2GoReader(gene2go_path, taxids=[9606]).get_ns2assc()

        # Background population is the key set of GeneID2nt_hum; the study
        # uses alpha 0.05 with 'fdr_bh' correction and no count propagation
        study_options = {
            'propagate_counts': False,
            'alpha': 0.05,
            'methods': ['fdr_bh'],
        }
        self.goeaobj = GOEnrichmentStudyNS(GeneID2nt_hum.keys(), associations,
                                           ontology, **study_options)
Esempio n. 10
0
def read_ncbi_gene2go(fin_gene2go, taxids=None, namespace='BP', **kws):
    """Deprecated wrapper around Gene2GoReader; prints a notice naming the caller.

    With exactly one taxid loaded and no 'taxid2asscs' keyword, returns
    ``get_id2gos(namespace, ...)`` for that taxid. Otherwise returns the
    merged result of ``get_id2gos_all`` and, when the caller supplied
    'taxid2asscs', fills that structure as well.
    """
    print(
        'DEPRECATED read_ncbi_gene2go: USE Gene2GoReader FROM goatools.anno.genetogo_reader'
    )
    # The frame lookup must stay in this function body so f_back is the caller
    # pylint: disable=protected-access
    caller_code = sys._getframe().f_back.f_code
    print('DEPRECATED read_ncbi_gene2go CALLED FROM: {PY} BY {FNC}'.format(
        PY=caller_code.co_filename, FNC=caller_code.co_name))
    reader = Gene2GoReader(fin_gene2go, taxids=taxids)
    caller_gave_store = 'taxid2asscs' in kws
    # By default, return id2gos. go2geneids=True returns the reverse mapping.
    if not caller_gave_store and len(reader.taxid2asscs) == 1:
        forwarded = {
            key: val
            for key, val in kws.items() if key in AnnoOptions.keys_exp
        }
        forwarded['taxid'] = next(iter(reader.taxid2asscs))
        return reader.get_id2gos(namespace, **forwarded)
    # Detailed associations split by taxid
    by_taxid = reader.get_taxid2asscs(taxids, **kws)
    if caller_gave_store:
        reader.fill_taxid2asscs(kws['taxid2asscs'], by_taxid)
    return reader.get_id2gos_all(by_taxid)
Esempio n. 11
0
def prep_goea(taxid=9606,
              prop_counts=True,
              alpha=0.05,
              method='fdr_bh',
              ref_list=None):
    """Prepare a GOEnrichmentStudyNS object and a symbol-to-GeneID lookup.

    Args:
        taxid: Taxid passed to Gene2GoReader (default 9606).
        prop_counts: Forwarded as ``propagate_counts``.
        alpha: Forwarded as ``alpha``.
        method: Single correction method, forwarded as ``methods=[method]``.
        ref_list: Background gene ids; when None, the 'GeneID' column of
            '../data/df_human_geneinfo.csv' is used.

    Returns:
        tuple: (goeaobj, symbol2id) where symbol2id maps upper-cased
        'Symbol' values of the CSV to their 'GeneID'.
    """
    # Ontology file: the downloader's return value is not used below
    from goatools.base import download_go_basic_obo
    download_go_basic_obo()

    # gene2go association file
    from goatools.base import download_ncbi_associations
    gene2go_path = download_ncbi_associations()

    # Parse the ontology
    from goatools.obo_parser import GODag
    ontology = GODag("go-basic.obo")

    # Associations for the requested taxid, split by namespace
    from goatools.anno.genetogo_reader import Gene2GoReader
    associations = Gene2GoReader(gene2go_path, taxids=[taxid]).get_ns2assc()
    for namespace, gene_to_gos in associations.items():
        print("{NS} {N:,} annotated human genes".format(NS=namespace,
                                                        N=len(gene_to_gos)))

    from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS
    gene_info = pd.read_csv('../data/df_human_geneinfo.csv', index_col=0)

    # Fall back to every GeneID in the CSV when no reference list is given
    if ref_list is None:
        ref_list = gene_info['GeneID'].to_list()

    study = GOEnrichmentStudyNS(ref_list,
                                associations,
                                ontology,
                                propagate_counts=prop_counts,
                                alpha=alpha,
                                methods=[method])

    # Upper-cased symbol -> GeneID translation table
    upper_symbols = gene_info['Symbol'].str.upper()
    symbol2id = dict(zip(upper_symbols, gene_info['GeneID']))

    return study, symbol2id
Esempio n. 12
0
def get_gene_pathway_mapping(organism=9606, annotations=None, force=False):
	"""List (gene, GO id) pairs for every annotation of a taxid.

	Args:
		organism: Taxid passed to Gene2GoReader (default 9606).
		annotations: Optional container of GO ids; when given, only pairs
			whose GO id is ``in annotations`` are kept.
		force: When truthy, an existing local 'gene2go' file is deleted
			before the download call (presumably so it is fetched again --
			TODO confirm the downloader's default target is 'gene2go').

	Returns:
		list: ``(gene, go)`` tuples in the iteration order of
		``Gene2GoReader(...).get_id2gos_nss()``.
	"""
	# Logical `and`, not bitwise `&`: `&` gives the wrong answer for truthy
	# non-bool flags (2 & True == 0) and raises TypeError for e.g. strings.
	if force and os.path.isfile('gene2go'):
		os.remove('gene2go')

	gene2go = Gene2GoReader(goatools.base.download_ncbi_associations(), taxids=[organism]).get_id2gos_nss()

	return [
		(gene, go)
		for gene, gos in gene2go.items()
		for go in gos
		if annotations is None or go in annotations
	]
def load_files(obo_fname, fin_gene2go):
    """Load the ontology and taxid-3702 associations, then build a GOEA object.

    Returns:
        tuple: (goeaobj, obodag, ns2assoc).
    """
    # The background-gene module is imported from the current directory,
    # so "." is put on the module search path first.
    import sys

    sys.path.insert(1, ".")
    from genes_ncbi_3702_proteincoding import GENEID2NT as background_genes

    # Ontology
    ontology = GODag(obo_fname)

    # Associations from NCBI's gene2go for taxid 3702, keyed by namespace
    # (BP, MF, CC); each value maps a gene id to its set of GO ids
    associations = Gene2GoReader(fin_gene2go, taxids=[3702]).get_ns2assc()

    for namespace, gene_to_gos in associations.items():
        count = len(gene_to_gos)
        print("{NS} {N:,} annotated Arabidopsis genes".format(NS=namespace,
                                                              N=count))

    # Background population is the key set of the imported gene table; the
    # study uses alpha 0.05 with "fdr_bh" correction and no count propagation
    study = GOEnrichmentStudyNS(
        background_genes.keys(),
        associations,
        ontology,
        propagate_counts=False,
        alpha=0.05,
        methods=["fdr_bh"],
    )

    return study, ontology, associations
Esempio n. 14
0
def read_ncbi_gene2go(fin_gene2go, taxids=None, **kws):
    """Read NCBI's gene2go and return its data for the user-specified taxids.

    With exactly one taxid loaded and no 'taxid2asscs' keyword, returns the
    annotation dict of that taxid. Otherwise returns the merged result of
    ``get_id2gos_all`` and, when the caller supplied 'taxid2asscs', fills
    that structure as well.
    """
    reader = Gene2GoReader(fin_gene2go, taxids)
    options = AnnoOptions(**kws)
    caller_gave_store = 'taxid2asscs' in kws
    # By default, return id2gos. go2geneids=True returns the reverse mapping.
    if not caller_gave_store and len(reader.taxid2asscs) == 1:
        only_taxid = next(iter(reader.taxid2asscs))
        return reader.get_annotations_dct(only_taxid, options)
    # Detailed associations split by taxid
    by_taxid = reader.get_annotations_taxid2dct(options)
    if caller_gave_store:
        reader.fill_taxid2asscs(kws['taxid2asscs'], by_taxid)
    return reader.get_id2gos_all(by_taxid)
Esempio n. 15
0
def read_ncbi_gene2go(fin_gene2go, taxids=None, namespace='BP', **kws):
    """Read NCBI's gene2go and return its data for the user-specified taxids.

    With exactly one taxid loaded and no 'taxid2asscs' keyword, returns
    ``get_id2gos(namespace, ...)`` for that taxid, forwarding only the
    keywords listed in AnnoOptions.keys_exp. Otherwise returns the merged
    result of ``get_id2gos_all`` and, when the caller supplied
    'taxid2asscs', fills that structure as well.
    """
    reader = Gene2GoReader(fin_gene2go, taxids=taxids)
    single_taxid_request = ('taxid2asscs' not in kws
                            and len(reader.taxid2asscs) == 1)
    # By default, return id2gos. go2geneids=True returns the reverse mapping.
    if single_taxid_request:
        forwarded = {}
        for key, val in kws.items():
            if key in AnnoOptions.keys_exp:
                forwarded[key] = val
        forwarded['taxid'] = next(iter(reader.taxid2asscs))
        return reader.get_id2gos(namespace, **forwarded)
    # Detailed associations split by taxid
    by_taxid = reader.get_taxid2asscs(taxids, **kws)
    if 'taxid2asscs' in kws:
        reader.fill_taxid2asscs(kws['taxid2asscs'], by_taxid)
    return reader.get_id2gos_all(by_taxid)
Esempio n. 16
0
from goatools.godag_plot import plot_gos, plot_results, plot_goid2goobj
import collections as cx

# Obtain the GO ontology file; the returned value is kept in obo_fname
# (the GODag below is opened from the literal "go-basic.obo" instead).
obo_fname = download_go_basic_obo()

# Obtain NCBI's gene2go association file; its path feeds the reader below.
fin_gene2go = download_ncbi_associations()

# Parse the ontology file.
obodag = GODag("go-basic.obo")

# Entries of a local text file, read as strings and converted to a list.
CDK1_gene_list = list(np.loadtxt('CDK1_top_effectors.txt', dtype=str))

#def load_data(directory):
#F_adjusted=np

# Read NCBI's gene2go. Store annotations in a list of namedtuples (9606 is the tax ID for humans)
objanno = Gene2GoReader(fin_gene2go, taxids=[9606])

# Get namespace2association where:
#    namespace is:
#        BP: biological_process
#        MF: molecular_function
#        CC: cellular_component
#    association is a dict:
#        key: NCBI GeneID
#        value: A set of GO IDs associated with that gene
ns2assoc = objanno.get_ns2assc()

# Report how many genes carry annotations in each namespace.
for nspc, id2gos in ns2assoc.items():
    print("{NS} {N:,} annotated human  genes".format(NS=nspc, N=len(id2gos)))

print()
# Obtain NCBI's gene2go association file; its path feeds the reader below.
file_gene2go = download_ncbi_associations()

###      2. LOAD ONTOLOGIES, ASSOCIATIONS AND BACKGROUND GENE SET

### 2a. Load Ontologies

from goatools.obo_parser import GODag
obodag = GODag("go-basic.obo")

### 2b. Load Associations

# NOTE(review): Python only accepts `from __future__` imports at the top of a
# module (after the docstring/comments). Placed here, after other statements,
# this line is a SyntaxError when the text is run as one file -- it looks like
# a notebook cell pasted mid-script; confirm and move it to the file top.
from __future__ import print_function
from goatools.anno.genetogo_reader import Gene2GoReader

# Read NCBI's gene2go. Store annotations in a list of namedtuples
# (taxid 10090; the message printed below calls these mouse genes)
objanno = Gene2GoReader(file_gene2go, taxids=[10090])

# Get associations for each branch of the GO DAG (BP, MF, CC)
ns2assoc = objanno.get_ns2assc()
# Report how many genes carry annotations in each namespace.
for nspc, id2gos in ns2assoc.items():
    print("{NS} {N:,} annotated mouse genes".format(NS=nspc, N=len(id2gos)))

from goatools.cli.ncbi_gene_results_to_python import NCBIgeneToPythonCli

from genes_ncbi_10090_proteincoding import GENEID2NT as GeneID2nt_mus

### followed "https://github.com/tanghaibao/goatools/blob/1e93d26e4c93cb17786ab5fe736f90dc4f79421a/notebooks/backround_genes_ncbi.ipynb"
### to download a set of background population genes from NCBI.

from genes_ncbi_10090_proteincoding import GENEID2NT as GeneID2nt_mus
Esempio n. 18
0
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from goatools.base import download_ncbi_associations
from goatools.anno.genetogo_reader import Gene2GoReader
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
import utils

import os

# Path of NCBI's gene2go association file, as returned by the downloader.
gene2go = download_ncbi_associations()

# Reader restricted to taxid 9606.
# NOTE(review): go2geneids=True is passed to the constructor here; confirm
# the reader actually accepts this keyword at construction time.
objanno = Gene2GoReader(gene2go, taxids=[9606], go2geneids=True)

go2geneIDs = objanno.get_goid2dbids(
    objanno.associations
)  # this is a dict. Keys are GO IDs, values are gene_IDs of the genes that are associated to that GO term

# Reverse direction of the table above, built from the same associations.
geneID2GO = objanno.get_dbid2goids(objanno.associations)

# GO ID -> GO term text, taken from every association record.
goID2goTerm = {item.GO_ID: item.GO_term for item in objanno.associations}

genes_in_GO = list(geneID2GO.keys())  # these are entrez_ids


def distance_df(emb_df, metric='euclidean'):
    """Creates a distance matrix for a given embedding DataFrame.
Esempio n. 19
0
def get_genes_for_go_terms(terms, taxid=9606):
    """Return a DataFrame describing the genes annotated with the given GO terms.

    Args:
        terms: A single GO term or a list of GO terms; anything that is not
            a list is wrapped into a one-element list.
        taxid: Taxid passed to Gene2GoReader (default 9606).

    Returns:
        pandas.DataFrame: Columns 'symbol', 'name', 'chrom', 'start', 'end',
        one row per Entrez gene summary, sorted by 'start' with a fresh index.
        Rows whose coordinates cannot be read hold pandas.NA in the last
        three columns, and a warning is written to stderr for each.
    """
    # isinstance, so that list subclasses are not wrapped a second time
    if not isinstance(terms, list):
        terms = [terms]

    with open(os.devnull, 'w') as null, redirect_stdout(null):

        # Return values of the two download calls are not used below.
        download_and_move_go_basic_obo(prt=null)
        download_ncbi_associations(prt=null)
        objanno = Gene2GoReader("geneinfo_cache/gene2go",
                                taxids=[taxid],
                                prt=null)
        go2geneids = objanno.get_id2gos(namespace='*',
                                        go2geneids=True,
                                        prt=null)
        srchhelp = GoSearch("geneinfo_cache/go-basic.obo",
                            go2items=go2geneids,
                            log=null)

        geneids = srchhelp.get_items(terms)

        ncbi_tsv = f'geneinfo_cache/{taxid}_protein_genes.txt'
        if not os.path.exists(ncbi_tsv):
            fetch_background_genes(taxid)

        output_py = f'geneinfo_cache/{taxid}_protein_genes.py'
        ncbi_tsv_to_py(ncbi_tsv, output_py, prt=null)

    # NOTE(review): the generated module is imported and GENEID2NT is read,
    # but neither is used below; kept so import errors still surface here.
    protein_genes = importlib.import_module(
        output_py.replace('.py', '').replace('/', '.'))
    GENEID2NT = protein_genes.GENEID2NT

    fetch_ids = list(map(str, geneids))
    records = []
    # Gene summaries are requested from Entrez in batches of this many ids
    batch_size = 2000
    for i in range(0, len(fetch_ids), batch_size):
        to_fetch = fetch_ids[i:i + batch_size]
        handle = Entrez.esummary(db="gene",
                                 id=",".join(to_fetch),
                                 retmax=batch_size)
        try:
            entry = Entrez.read(handle)
        finally:
            # Release the connection even when parsing fails
            handle.close()
        docsums = entry['DocumentSummarySet']['DocumentSummary']
        for doc in docsums:
            try:
                chrom_pos = (doc['Chromosome'],
                             doc['GenomicInfo'][0]['ChrStart'],
                             doc['GenomicInfo'][0]['ChrStop'])
            except (LookupError, TypeError):
                # Only missing/empty coordinate fields are tolerated; a bare
                # `except:` would also swallow KeyboardInterrupt/SystemExit.
                print(
                    f"WARNING: missing chromosome coordinates for {doc['Name']} are listed as pandas.NA",
                    file=sys.stderr)
                chrom_pos = (pd.NA, pd.NA, pd.NA)
            records.append((doc['Name'], doc['Description'], *chrom_pos))

    df = pd.DataFrame.from_records(
        records, columns=['symbol', 'name', 'chrom', 'start', 'end'])

    # NOTE(review): 'start' holds whatever type Entrez returned; if these are
    # strings the sort is lexicographic, not numeric -- confirm.
    return df.sort_values(by='start').reset_index(drop=True)
Esempio n. 20
0
def plot_go_enrichment(coef_df, auc_vals, pheno_dict, args, mode='abs'):
    """Run GO enrichment per row of `coef_df` and save a heatmap of the results.

    For every row whose index label is not a RandomType, a foreground gene
    set is picked from the row's coefficients according to `mode`, a GO
    enrichment study is run against a background made of all usable genes,
    and the log10 FDR-corrected p-values of the significantly enriched terms
    are collected. The collected values are drawn as a heatmap and written
    as an SVG file under `plot_dir`.

    Args:
        coef_df: DataFrame whose column labels are gene symbols; only columns
            found among the `Symbol` fields of GENEID2NT are used. Row labels
            are presumably mutation-type objects (they are tested against
            RandomType and must offer `subtype_iter()`) -- TODO confirm.
        auc_vals: Not used in this function.
        pheno_dict: Not used in this function.
        args: Object providing `go_dir`, `expr_source`, `cohort`, `gene` and
            `classif` attributes.
        mode: One of 'abs', 'high', 'low' or 'bayes'; selects how foreground
            genes are chosen and which colour the palette starts from.

    Returns:
        None. When no enriched term is found a message is printed and the
        function returns before anything is plotted.

    Raises:
        ValueError: If `mode` is not recognized. The check sits inside the
            row loop, so it only fires once a non-RandomType row is reached.
    """
    # Ontology: download into args.go_dir and parse
    obo_fl = os.path.join(args.go_dir, "go-basic.obo")
    download_go_basic_obo(obo_fl)
    obodag = GODag(obo_fl)

    # Associations for taxid 9606, split by namespace
    assoc_fl = os.path.join(args.go_dir, "gene2go")
    download_ncbi_associations(assoc_fl)
    objanno = Gene2GoReader(assoc_fl, taxids=[9606])
    ns2assoc = objanno.get_ns2assc()

    # Symbol -> NCBI id; the background is every coef_df column with a known id
    ncbi_map = {info.Symbol: ncbi_id for ncbi_id, info in GENEID2NT.items()}
    use_genes = set(coef_df.columns) & set(ncbi_map)
    bgrd_ids = [ncbi_map[gn] for gn in use_genes]

    goeaobj = GOEnrichmentStudyNS(bgrd_ids,
                                  ns2assoc,
                                  obodag,
                                  propagate_counts=False,
                                  alpha=0.05,
                                  methods=['fdr_bh'])

    plot_dict = dict()
    use_gos = set()  # NOTE(review): never read below
    # Keep only the columns that belong to the background gene set
    coef_mat = coef_df.loc[:, [gene in use_genes for gene in coef_df.columns]]

    # Columns sharing a first-level label are collapsed. In 'bayes' mode the
    # per-label mean and standard deviation are kept separately and coef_mat
    # itself stays ungrouped; in every other mode coef_mat becomes the mean.
    if mode == 'bayes':
        coef_means = coef_mat.groupby(level=0, axis=1).mean()
        coef_stds = coef_mat.groupby(level=0, axis=1).std()
    else:
        coef_mat = coef_mat.groupby(level=0, axis=1).mean()

    for mtype, coefs in coef_mat.iterrows():
        if not isinstance(mtype, RandomType):
            # use_clr is the `start` value handed to the palette further down;
            # it keeps the value set by the last processed row
            if mode == 'abs':
                # genes whose absolute coefficient is above the 95% quantile
                fgrd_ctf = coefs.abs().quantile(0.95)
                fgrd_genes = coefs.index[coefs.abs() > fgrd_ctf]
                use_clr = 3.17

            elif mode == 'high':
                # genes whose coefficient is above the 95% quantile
                fgrd_ctf = coefs.quantile(0.95)
                fgrd_genes = coefs.index[coefs > fgrd_ctf]
                use_clr = 2.03
            elif mode == 'low':
                # genes whose coefficient is below the 5% quantile
                fgrd_ctf = coefs.quantile(0.05)
                fgrd_genes = coefs.index[coefs < fgrd_ctf]
                use_clr = 1.03

            elif mode == 'bayes':
                # genes whose absolute mean exceeds their standard deviation;
                # `coefs` is not used in this branch
                gene_scrs = coef_means.loc[mtype].abs() - coef_stds.loc[mtype]
                fgrd_genes = gene_scrs.index[gene_scrs > 0]
                use_clr = 3.17

            else:
                raise ValueError(
                    "Unrecognized `mode` argument <{}>!".format(mode))

            fgrd_ids = [ncbi_map[gn] for gn in fgrd_genes]
            goea_out = goeaobj.run_study(fgrd_ids, prt=None)

            # term name -> log10(FDR p-value) for results flagged 'e' with
            # p_fdr_bh below 0.05 ('e' presumably means enriched -- confirm)
            plot_dict[mtype] = {
                rs.name: np.log10(rs.p_fdr_bh)
                for rs in goea_out
                if rs.enrichment == 'e' and rs.p_fdr_bh < 0.05
            }

    # Rows are GO term names, columns are the processed row labels
    plot_df = pd.DataFrame(plot_dict, columns=plot_dict.keys())
    if plot_df.shape[0] == 0:
        print("Could not find any enriched GO terms across {} "
              "subgroupings!".format(plot_df.shape[1]))
        return None

    # Figure size grows with the number of terms and of row labels
    fig, ax = plt.subplots(figsize=(4.7 + plot_df.shape[0] / 2.3,
                                    2 + plot_df.shape[1] / 5.3))

    # With more than two terms, reorder them by the leaf order of a centroid
    # linkage over cityblock distances (missing values counted as 0.0);
    # either way the frame is transposed so that terms become columns
    if plot_df.shape[0] > 2:
        plot_df = plot_df.iloc[dendrogram(linkage(distance.pdist(
            plot_df.fillna(0.0), metric='cityblock'),
                                                  method='centroid'),
                                          no_plot=True)['leaves']].transpose()
    else:
        plot_df = plot_df.transpose()

    xlabs = [rs_nm for rs_nm in plot_df.columns]
    ylabs = [
        get_fancy_label(tuple(mtype.subtype_iter())[0][1])
        for mtype in plot_df.index
    ]

    pval_cmap = sns.cubehelix_palette(start=use_clr,
                                      rot=0,
                                      dark=0,
                                      light=1,
                                      reverse=True,
                                      as_cmap=True)

    # Colour scale covers log10 p-values from -5 to 0
    sns.heatmap(plot_df,
                cmap=pval_cmap,
                vmin=-5,
                vmax=0,
                linewidths=0.23,
                linecolor='0.73',
                xticklabels=xlabs,
                yticklabels=ylabs)

    ax.set_xticklabels(xlabs, size=15, ha='right', rotation=31)
    ax.set_yticklabels(ylabs, size=9, ha='right', rotation=0)
    ax.set_xlim((plot_df.shape[1] / -83, plot_df.shape[1] * 1.009))
    ax.set_ylim((plot_df.shape[0] * 1.009, plot_df.shape[0] / -83))

    # Output: <plot_dir>/<expr_source>__<cohort>/<gene>_go-<mode>-enrichment_<classif>.svg
    plt.savefig(os.path.join(
        plot_dir, '__'.join([args.expr_source, args.cohort]),
        "{}_go-{}-enrichment_{}.svg".format(args.gene, mode, args.classif)),
                bbox_inches='tight',
                format='svg')

    plt.close()
Esempio n. 21
0
def go_enrichment(gene_list,
                  taxid=9606,
                  background_chrom=None,
                  background_genes=None,
                  terms=None,
                  list_study_genes=False,
                  alpha=0.05):
    """
    Run a GO enrichment study (goatools) for a list of genes.

    Parameters
    ----------
    gene_list : iterable of int or str (or pandas Series)
        Study genes. If every element is an ``int`` the list is used as is;
        otherwise it is passed through ``_cached_symbol2ncbi`` (presumably
        mapping gene symbols to NCBI gene IDs -- confirm against that helper).
    taxid : optional
        Taxon identifier, normalised with ``_tidy_taxid``. Default 9606.
    background_chrom : optional
        If given, the background is restricted to rows of the background
        table whose ``chromosome`` column equals this value.
    background_genes : iterable of int or str, optional
        If given and non-empty, the background is restricted to rows whose
        ``GeneID`` is in this collection. Decimal strings are converted to
        ints; any other non-int input goes through ``_cached_symbol2ncbi``.
    terms : iterable of str, optional
        If given, the GO DAG is replaced by a sub-ontology written by
        ``WrSubObo.wrobo`` for these terms.
    list_study_genes : bool, optional
        If True, a ``study_genes`` column is added, filled with
        ``_cached_ncbi2symbol`` applied to the sorted study items of each
        record.
    alpha : float, optional
        Significance cut-off. Passed to the enrichment study and used to
        filter the returned rows on ``p_fdr_bh``.

    Returns
    -------
    pandas.DataFrame
        One row per enrichment record with ``p_fdr_bh < alpha``, sorted by
        ``p_fdr_bh`` then ``ratio``. Columns: ``namespace``, ``term_id``,
        ``e/p``, ``pval_uncorr``, ``p_fdr_bh``, ``ratio``, ``bg_ratio``,
        ``obj`` (the record itself) and optionally ``study_genes``.
    """
    # Imported locally so the block does not rely on a module-level import.
    import hashlib

    def _digest(text):
        # Built-in hash() is salted per process for str/bytes, which made the
        # cache file names change (and pile up) on every run.
        return hashlib.sha256(text.encode()).hexdigest()[:16]

    if isinstance(gene_list, pd.Series):
        gene_list = gene_list.tolist()
    if isinstance(terms, pd.Series):
        terms = terms.tolist()
    if background_genes is not None:
        # Materialise once: a Series/array has no usable truth value and a
        # generator could only be consumed a single time.
        if isinstance(background_genes, pd.Series):
            background_genes = background_genes.tolist()
        else:
            background_genes = list(background_genes)

    _assert_entrez_email()

    gene_list = list(gene_list)

    taxid = _tidy_taxid(taxid)

    ncbi_tsv = f'geneinfo_cache/{taxid}_protein_genes.txt'
    if not os.path.exists(ncbi_tsv):
        fetch_background_genes(taxid)

    # Everything below prints progress information; send it to devnull.
    with open(os.devnull, 'w') as null, redirect_stdout(null):

        obo_fname = download_and_move_go_basic_obo(prt=null)

        file_gene2go = download_ncbi_associations(prt=null)

        obodag = GODag("geneinfo_cache/go-basic.obo",
                       optional_attrs=['relationship', 'def'],
                       prt=null)

        # read NCBI's gene2go. Store annotations in a list of namedtuples
        objanno = Gene2GoReader(file_gene2go, taxids=[taxid])

        # get associations for each branch of the GO DAG (BP, MF, CC)
        ns2assoc = objanno.get_ns2assc()

        # limit go dag to a sub graph built from the specified terms
        if terms is not None:
            terms_key = _digest(''.join(sorted(terms)))
            sub_obo_name = f'geneinfo_cache/{terms_key}.obo'
            wrsobo = WrSubObo(obo_fname,
                              optional_attrs=['relationship', 'def'])
            wrsobo.wrobo(sub_obo_name, terms)
            obodag = GODag(sub_obo_name,
                           optional_attrs=['relationship', 'def'],
                           prt=null)

        # background gene table for this taxon (fetched above if missing)
        background_genes_file = ncbi_tsv

        # restrict the background to a custom subset of genes
        if background_genes:
            if not all(type(x) is int for x in background_genes):
                if all(str(x).isdecimal() for x in background_genes):
                    # IDs given as digit strings (or numpy ints): convert to
                    # int so that isin() below can match the GeneID column.
                    background_genes = [int(x) for x in background_genes]
                else:
                    background_genes = _cached_symbol2ncbi(background_genes,
                                                           taxid=taxid)
            df = pd.read_csv(background_genes_file, sep='\t')
            no_suffix = os.path.splitext(background_genes_file)[0]
            subset_key = _digest(''.join(map(str, sorted(background_genes))))
            background_genes_file = f'{no_suffix}_{subset_key}.txt'
            df.loc[df.GeneID.isin(background_genes)].to_csv(
                background_genes_file, sep='\t', index=False)

        # restrict the background to a single chromosome
        if background_chrom is not None:
            df = pd.read_csv(background_genes_file, sep='\t')
            background_genes_file = f'{os.path.splitext(background_genes_file)[0]}_{background_chrom}.txt'
            df.loc[df.chromosome == background_chrom].to_csv(
                background_genes_file, sep='\t', index=False)

        # Convert the (possibly filtered) table to a Python module and import
        # it; reload() picks up the new content when the module was already
        # imported by an earlier call.
        output_py = f'geneinfo_cache/{taxid}_background.py'
        ncbi_tsv_to_py(background_genes_file, output_py, prt=null)

        background_module_name = output_py.replace('.py', '').replace('/', '.')
        background_module = importlib.import_module(background_module_name)
        importlib.reload(background_module)
        GeneID2nt = background_module.GENEID2NT

        if not all(type(x) is int for x in gene_list):
            gene_list = _cached_symbol2ncbi(gene_list, taxid=taxid)

        goeaobj = GOEnrichmentStudyNS(
            GeneID2nt,  # background (population) genes for this taxon
            ns2assoc,  # geneid/GO associations
            obodag,  # Ontologies
            propagate_counts=False,
            alpha=alpha,  # significance cut-off requested by the caller
            methods=['fdr_bh'],
            pvalcalc='fisher_scipy_stats')

        goea_results_all = goeaobj.run_study(gene_list)

        rows = []
        columns = [
            'namespace', 'term_id', 'e/p', 'pval_uncorr', 'p_fdr_bh', 'ratio',
            'bg_ratio', 'obj'
        ]
        if list_study_genes:
            columns.append('study_genes')
        for ntd in goea_results_all:

            # HACK: swap the record's class for the project's subclass.
            ntd.__class__ = My_GOEnrichemntRecord

            row = [
                ntd.NS, ntd.GO, ntd.enrichment, ntd.p_uncorrected,
                ntd.p_fdr_bh, ntd.ratio_in_study[0] / ntd.ratio_in_study[1],
                ntd.ratio_in_pop[0] / ntd.ratio_in_pop[1], ntd
            ]

            if list_study_genes:
                row.append(_cached_ncbi2symbol(sorted(ntd.study_items)))
            rows.append(row)

        df = (pd.DataFrame.from_records(rows, columns=columns).sort_values(
            by=['p_fdr_bh', 'ratio']).reset_index(drop=True))
        return df.loc[df.p_fdr_bh < alpha]
Esempio n. 22
0
def get_go_ids(go_ids, species='Homo sapiens'):
    '''
    Fetch all gene symbols associated with a list of gene ontology term IDs.

    The given terms are expanded with ``GoSearch.add_children_gos`` before
    the associated gene IDs are collected, and gene IDs that are missing
    from the species' ``GENEID2NT`` table are dropped.

    Parameters
    ----------
    go_ids : str or list of str
        One GO term ID or a list of them.
    species : str, optional
        Must be a key of ``TAXA``; selects the taxid used for both the
        associations and the gene table.

    Returns
    -------
    list of str
        Gene symbols, in no particular order (they come from a set).

    Raises
    ------
    AssertionError
        If ``species`` is not a key of ``TAXA``.
    '''
    assert species in TAXA, 'unknown species: {!r}'.format(species)

    if isinstance(go_ids, str):
        go_ids = [go_ids]

    # Both helpers download the file if necessary; the returned paths are
    # reused below instead of repeating the literals.
    obo_fname = download_go_basic_obo('db/go/go-basic.obo')
    gene2go = download_ncbi_associations('db/go/gene2go')

    taxid = TAXA[species]

    # Gene table shipped as a module: goatools.test_data.genes_NCBI_<taxid>_All
    module_name = 'goatools.test_data.genes_NCBI_{TAXID}_All'.format(
        TAXID=taxid)
    GeneID2nt = importlib.import_module(module_name).GENEID2NT

    go2geneids = Gene2GoReader(
        gene2go,
        taxids=[taxid],
    )

    # Invert the associations: GO ID -> list of gene IDs
    go2items = defaultdict(list)
    for assc in go2geneids.taxid2asscs[taxid]:
        go2items[assc.GO_ID].append(assc.DB_ID)

    srchhelp = GoSearch(obo_fname, go2items=go2items)

    # Add children GOs
    gos_all = srchhelp.add_children_gos(go_ids)

    # Collect gene IDs for the requested GOs and for the expanded set
    gene_ids = set()
    for go_items in (go_ids, gos_all):
        gene_ids.update(srchhelp.get_items(go_items))

    # Keep only genes present in the gene table
    return [
        GeneID2nt[geneid].Symbol for geneid in gene_ids
        if GeneID2nt.get(geneid) is not None
    ]
Esempio n. 23
0
def pullGOenrichment(inputFile, project):
    """
    Run a human GO enrichment study on the genes listed in a CSV file.

    Parameters
    ----------
    inputFile : str
        Path to a CSV file whose first column is an integer gene ID and whose
        second column is the gene symbol. Blank rows and rows with an empty
        first column are skipped.
    project : str
        Suffix inserted into the output file names.

    Side effects
    ------------
    Prints annotation and result counts, and writes the results with
    ``p_fdr_bh < 0.05`` to ``Data/go_enrichment<project>.csv`` (via
    ``wr_xlsx``) and ``Data/go_enrichment<project>.txt`` (via ``wr_txt``).
    """
    import collections as cx

    GeneID2nt_hum = genes_NCBI_9606_ProteinCoding.GENEID2NT

    # Download (if necessary) and load the ontology and the associations
    obo_fname = download_go_basic_obo()
    fin_gene2go = download_ncbi_associations()
    obodag = GODag(obo_fname)

    # Read NCBI's gene2go. Store annotations in a list of namedtuples
    objanno = Gene2GoReader(fin_gene2go, taxids=[9606])

    # Get namespace2association where:
    #    namespace is:
    #        BP: biological_process
    #        MF: molecular_function
    #        CC: cellular_component
    #    association is a dict:
    #        key: NCBI GeneID
    #        value: A set of GO IDs associated with that gene
    ns2assoc = objanno.get_ns2assc()

    for nspc, id2gos in ns2assoc.items():
        print("{NS} {N:,} annotated human genes".format(NS=nspc,
                                                        N=len(id2gos)))

    print(len(GeneID2nt_hum))

    goeaobj = GOEnrichmentStudyNS(
        GeneID2nt_hum.keys(),  # List of human protein-coding genes
        ns2assoc,  # geneid/GO associations
        obodag,  # Ontologies
        propagate_counts=False,
        alpha=0.05,  # default significance cut-off
        methods=['fdr_bh'])  # default multipletest correction method

    # Study genes: gene ID -> symbol. newline='' is what the csv module
    # expects for files handed to csv.reader.
    geneid2symbol = {}
    with open(inputFile, 'r', newline='') as infile:
        for line in csv.reader(infile):
            # csv.reader yields [] for blank lines; skip those and rows
            # without a gene ID instead of failing on the index.
            if not line or not line[0]:
                continue
            geneid2symbol[int(line[0])] = line[1]

    geneids_study = geneid2symbol.keys()
    goea_results_all = goeaobj.run_study(geneids_study)
    goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < 0.05]

    ctr = cx.Counter([r.NS for r in goea_results_sig])
    print('Significant results[{TOTAL}] = {BP} BP + {MF} MF + {CC} CC'.format(
        TOTAL=len(goea_results_sig),
        BP=ctr['BP'],  # biological_process
        MF=ctr['MF'],  # molecular_function
        CC=ctr['CC']))  # cellular_component

    # NOTE(review): wr_xlsx is pointed at a '.csv' path; the file name is
    # kept as is because downstream code may rely on it -- confirm the
    # intended format.
    goeaobj.wr_xlsx("Data/go_enrichment" + project + ".csv", goea_results_sig)
    goeaobj.wr_txt("Data/go_enrichment" + project + ".txt", goea_results_sig)