def get_species_genes(species):
    """Load GO associations and gene-ID/symbol lookup tables for a species.

    ``'human'``/``'homo_sapiens'`` selects taxid 9606 and
    ``'mouse'``/``'mus_musculus'`` selects taxid 10090.  Any other value
    loads both taxids and merges the two gene tables.

    Returns a 4-tuple ``(ns2assoc, ids_to_symbols, symbols_to_ids,
    genes_list)``.  ``genes_list`` is a dict keys view for a single species
    and a ``set`` for the merged case.
    """
    if species in ('human', 'homo_sapiens'):
        taxids = [9606]
        gene_tables = [GeneID2nt_hum]
    elif species in ('mouse', 'mus_musculus'):
        taxids = [10090]
        gene_tables = [GeneID2nt_mus]
    else:
        # TODO: overlapping gene names
        taxids = [10090, 9606]
        gene_tables = [GeneID2nt_mus, GeneID2nt_hum]

    objanno = Gene2GoReader(gene2go_path, taxids=taxids)

    # Later tables win when a symbol occurs in more than one table
    # (human overrides mouse in the merged case).
    symbols_to_ids = {}
    for table in gene_tables:
        for gene_id, record in table.items():
            symbols_to_ids[record.Symbol] = gene_id

    if len(gene_tables) == 1:
        genes_list = gene_tables[0].keys()
    else:
        genes_list = set(GeneID2nt_mus.keys()).union(GeneID2nt_hum.keys())

    ns2assoc = objanno.get_ns2assc()
    ids_to_symbols = {gene_id: symbol
                      for symbol, gene_id in symbols_to_ids.items()}
    return ns2assoc, ids_to_symbols, symbols_to_ids, genes_list
def test_i147_all_taxids():
    """Work with all taxids using Gene2GoReader"""
    mouse = 10090
    human = 9606

    # Fetch the ontology (go-basic.obo) and NCBI gene2go, if necessary
    download_go_basic_obo()
    fin_gene2go = download_ncbi_associations()

    godag = GODag("go-basic.obo")

    # Readers holding all species, mouse only, and mouse + human
    objanno_all = Gene2GoReader(fin_gene2go, godag=godag, taxids=True)
    objanno_mmu = Gene2GoReader(fin_gene2go, godag=godag, taxids=[mouse])
    objanno_mmuhsa = Gene2GoReader(fin_gene2go, godag=godag,
                                   taxids=[mouse, human])

    # Mouse associations pulled from each reader, plus everything from the
    # two-species reader
    from_all = _run_get_ns2assc(mouse, objanno_all)
    from_mmu = _run_get_ns2assc(mouse, objanno_mmu)
    mmuhsa_everything = _run_get_ns2assc(True, objanno_mmuhsa)
    from_mmuhsa = _run_get_ns2assc(mouse, objanno_mmuhsa)

    # Mouse associations must not depend on which other species were loaded
    for namespace in ('BP', 'MF', 'CC'):
        assert from_mmu[namespace] == from_all[namespace]
        assert from_mmu[namespace] == from_mmuhsa[namespace]
    _chk_mmuhsa_all(objanno_mmuhsa, objanno_all, mmuhsa_everything)
def __init__(
        self,
        work_dir: str = '.',
        clean_work_dir: bool = False,
        organism: str = 'human',
        study_parameters: Union[Dict[str, Union[int, float, str, List, Dict]],
                                None] = None,
) -> None:
    """A GOEngine that can be used for performing analysis using GOATOOLS

    Args:
        work_dir (str, optional): Path to an existing directory that the GO
            ontology and the NCBI gene2go file are downloaded to. Defaults
            to the current working directory.
        clean_work_dir (bool, optional): Stored on the instance as
            ``_clean_work_dir``; nothing in this method acts on it.
            Defaults to False.
        organism (str, optional): Either 'human' or 'mouse'. Defaults to
            'human'.
        study_parameters (Dict[str,Union[int,float,str,List,Dict]], optional):
            Keyword arguments forwarded to GOEnrichmentStudyNS. When None
            (the default), {'propagate_counts': False, 'alpha': 0.05,
            'methods': ['fdr_bh']} is used.

    Raises:
        ValueError: If ``work_dir`` does not exist or ``organism`` is
            neither 'human' nor 'mouse'.
    """
    print("Creating a GO Engine ...")
    if not os.path.exists(work_dir):
        raise ValueError(
            f"The provided work path: {work_dir} does not exist!!!")
    self.work_dir = work_dir
    if organism != 'human' and organism != 'mouse':
        raise ValueError(
            f"The provided organism: {organism} is not support, current engine mainly work with human and moues only"
        )
    # Build the defaults per call so no dict is shared between instances.
    if study_parameters is None:
        study_parameters = {
            'propagate_counts': False,
            'alpha': 0.05,
            'methods': ['fdr_bh'],
        }
    print(f"\t --> Downloading data ...")
    obo_fname = download_go_basic_obo(
        os.path.join(work_dir, 'go-basic.obo'))
    gene2go_fname = download_ncbi_associations(
        os.path.join(work_dir, 'gene2go'))
    ## parse the GO term
    print(
        f"\t --> parsing the data and intializing the base GOEA object...")
    obo_dag = GODag(obo_fname)
    # NCBI taxonomy ids: 9606 for human, 10090 for mouse.
    taxid = 9606 if organism == 'human' else 10090
    # NOTE(review): the background population is gene2iden_human for both
    # organisms, exactly as before -- confirm whether a mouse gene table
    # should be used when organism == 'mouse'.
    self._goea_obj = GOEnrichmentStudyNS(
        gene2iden_human.keys(),
        Gene2GoReader(gene2go_fname, taxids=[taxid]).get_ns2assc(),
        obo_dag,
        **study_parameters)
    self._clean_work_dir = clean_work_dir
    self._gene_ids = None
def test_anno_read():
    """Check Gene2GoReader results against the legacy gene2go reader."""
    fin_anno = os.path.join(REPO, 'gene2go')
    _dnld_anno(fin_anno)

    print('\nTEST STORING ONLY ONE SPECIES')
    reader = Gene2GoReader(fin_anno)
    assert len(reader.taxid2asscs) == 1
    reader.prt_summary_anno2ev()

    print('\nTEST STORING ALL SPECIES')
    reader = Gene2GoReader(fin_anno, taxids=True)
    assert len(reader.taxid2asscs) > 1, '**EXPECTED MORE: len(taxid2asscs) == {N}'.format(
        N=len(reader.taxid2asscs))
    reader.prt_summary_anno2ev()

    print('\nTEST GETTING ASSOCIATIONS FOR ONE SPECIES')
    print("\nTEST read_ncbi_gene2go_old: [9606]")
    legacy_id2gos = read_ncbi_gene2go_old(fin_anno, [9606])
    current_id2gos = reader.get_id2gos_nss(taxids=[9606])
    assert legacy_id2gos == current_id2gos, \
        'OLD({O}) != NEW({N})'.format(O=len(legacy_id2gos), N=len(current_id2gos))
    print("\nTEST read_ncbi_gene2go_old: 9606")
    assert legacy_id2gos == reader.get_id2gos_nss(taxid=9606)

    print('\nTEST GETTING REVERSE ASSOCIATIONS: GO2GENES')
    print("\nTEST read_ncbi_gene2go_old: 9606 go2geneids=True")
    legacy_go2ids = read_ncbi_gene2go_old(fin_anno, [9606], go2geneids=True)
    current_go2ids = reader.get_id2gos_nss(taxid=9606, go2geneids=True)
    print('OLD:', next(iter(legacy_go2ids.items())))
    print('NEW:', next(iter(current_go2ids.items())))
    assert legacy_go2ids == current_go2ids, \
        'OLD({O}) != NEW({N})'.format(O=len(legacy_go2ids), N=len(current_go2ids))

    print('\nTEST RETURNING ASSOCIATIONS FOR SELECTED EVIDENCE CODES')
    evidence_codes = {'ISO', 'IKR'}
    print("\nTEST read_ncbi_gene2go_old: 9606 evcodes=True")
    legacy_filtered = read_ncbi_gene2go_old(fin_anno, taxids=[9606],
                                            ev_include=evidence_codes)
    current_filtered = reader.get_id2gos_nss(taxid=9606,
                                             ev_include=evidence_codes)
    print('OLD:', next(iter(legacy_filtered.items())))
    print('NEW:', next(iter(current_filtered.items())))
    assert legacy_filtered == current_filtered
def get_objanno(fin_anno, anno_type=None, **kws):
    """Read annotations in GAF, GPAD, Entrez gene2go, or text format.

    The annotation type is resolved with ``get_anno_desc``; the matching
    reader class is then built with only those keyword arguments listed in
    its ``exp_kws``.  Raises ``RuntimeError`` for an unresolved or unknown
    annotation type.
    """
    # kws get_objanno: taxids hdr_only prt allow_missing_symbol
    anno_type = get_anno_desc(fin_anno, anno_type)
    reader_by_type = {
        'gene2go': Gene2GoReader,
        'gaf': GafReader,
        'gpad': GpadReader,
        'id2gos': IdToGosReader,
    }
    reader_cls = reader_by_type.get(anno_type)
    if reader_cls is None:
        raise RuntimeError('UNEXPECTED ANNOTATION FILE FORMAT: {F} {D}'.format(
            F=fin_anno, D=anno_type))
    # Forward only the keywords this reader declares it understands.
    accepted = {key: kws[key]
                for key in reader_cls.exp_kws.intersection(kws.keys())}
    return reader_cls(fin_anno, **accepted)
def get_gene_id_mapping(organism=9606, force=False):
    """Map the string form of each annotated gene ID to the ID itself.

    Args:
        organism: NCBI taxonomy id passed to ``Gene2GoReader`` (default
            9606).
        force: When truthy, an existing local ``gene2go`` file is deleted
            before ``download_ncbi_associations`` is called (presumably so
            a fresh copy is fetched -- confirm against goatools).

    Returns:
        dict: ``{str(gene_id): gene_id}`` for every key returned by
        ``get_id2gos_nss()``.
    """
    # Logical `and`, not bitwise `&`: `&` gives the wrong answer for truthy
    # non-bool values (e.g. 2 & True == 0) and raises for strings.
    if force and os.path.isfile('gene2go'):
        os.remove('gene2go')
    gene2go = Gene2GoReader(goatools.base.download_ncbi_associations(),
                            taxids=[organism]).get_id2gos_nss()
    return {str(gene): gene for gene in gene2go}
def get_terms_for_go_regex(regex, taxid=9606, add_children=False):
    """Return the GO ids matched by ``regex`` (case-insensitive).

    GO ids that also match ``(<regex>).independent`` are removed from the
    result.  With ``add_children=True`` the result of
    ``add_children_gos`` is returned instead.  All stdout produced while
    loading is redirected to ``os.devnull``.
    """
    taxid = _tidy_taxid(taxid)
    with open(os.devnull, 'w') as null, redirect_stdout(null):
        # Make sure the ontology and the gene2go association file exist
        obo_fname = download_and_move_go_basic_obo(prt=null)
        gene2go = download_ncbi_associations(prt=null)

        reader = Gene2GoReader("geneinfo_cache/gene2go", taxids=[taxid],
                               prt=null)
        go2geneids = reader.get_id2gos(namespace='*', go2geneids=True,
                                       prt=null)
        searcher = GoSearch("geneinfo_cache/go-basic.obo",
                            go2items=go2geneids, log=null)

        wanted = re.compile(r'({})'.format(regex), flags=re.IGNORECASE)
        unwanted = re.compile(r'({}).independent'.format(regex),
                              flags=re.IGNORECASE)

        matched = searcher.get_matching_gos(wanted, prt=null)
        excluded = searcher.get_matching_gos(unwanted, gos=matched)
        selected = matched.difference(excluded)
        if not add_children:
            return list(selected)
        return list(searcher.add_children_gos(selected))
def test_i147_all_taxids():
    """Work with all taxids using Gene2GoReader"""
    # Fetch go-basic.obo and NCBI's gene2go, if necessary
    download_go_basic_obo()
    associations_path = download_ncbi_associations()

    ontology = GODag("go-basic.obo")

    # taxids=True: keep the annotations of every species in gene2go
    reader = Gene2GoReader(associations_path, godag=ontology, taxids=True)

    # ns2assoc maps a namespace (BP, MF, CC) to {NCBI GeneID: set of GO IDs}
    for namespace, gene2gos in reader.get_ns2assc().items():
        print("{NS} {N:,} annotated mouse genes".format(NS=namespace,
                                                        N=len(gene2gos)))
def __GO_enrich__(self):
    """Build the human GO enrichment study object and keep it on ``self``.

    Downloads ``go-basic.obo`` when it is not present in the current
    directory, loads the human (taxid 9606) gene2go associations, and
    stores a ``GOEnrichmentStudyNS`` in ``self.goeaobj``.
    """
    if not os.path.exists("go-basic.obo"):
        download_go_basic_obo()
    ontology = GODag("go-basic.obo")

    # ns2assoc: namespace (BP/MF/CC) -> {NCBI GeneID: set of GO IDs}
    associations_path = download_ncbi_associations()
    ns2assoc = Gene2GoReader(associations_path, taxids=[9606]).get_ns2assc()

    study_options = {
        'propagate_counts': False,
        'alpha': 0.05,          # default significance cut-off
        'methods': ['fdr_bh'],  # default multiple-test correction method
    }
    self.goeaobj = GOEnrichmentStudyNS(
        GeneID2nt_hum.keys(),  # background population of human genes
        ns2assoc,
        ontology,
        **study_options)
def read_ncbi_gene2go(fin_gene2go, taxids=None, namespace='BP', **kws):
    """Read NCBI's gene2go. Return gene2go data for user-specified taxids.

    Deprecated wrapper: prints a deprecation notice naming the caller,
    then delegates to ``Gene2GoReader``.
    """
    print(
        'DEPRECATED read_ncbi_gene2go: USE Gene2GoReader FROM goatools.anno.genetogo_reader'
    )
    # Identify the calling code object; this must stay in this function's
    # own frame so that f_back is the caller.
    # pylint: disable=protected-access
    caller = sys._getframe().f_back.f_code
    print('DEPRECATED read_ncbi_gene2go CALLED FROM: {PY} BY {FNC}'.format(
        PY=caller.co_filename, FNC=caller.co_name))

    reader = Gene2GoReader(fin_gene2go, taxids=taxids)
    wants_taxid2asscs = 'taxid2asscs' in kws

    # Simple case: one species loaded and no detailed structure requested.
    # Pass go2geneids=True in kws to get GO -> gene ids instead of id2gos.
    if not wants_taxid2asscs and len(reader.taxid2asscs) == 1:
        options = {key: val for key, val in kws.items()
                   if key in AnnoOptions.keys_exp}
        options['taxid'] = next(iter(reader.taxid2asscs))
        return reader.get_id2gos(namespace, **options)

    # Detailed associations split by taxid, holding both ID2GOs and GO2IDs
    per_taxid = reader.get_taxid2asscs(taxids, **kws)
    if wants_taxid2asscs:
        reader.fill_taxid2asscs(kws['taxid2asscs'], per_taxid)
    return reader.get_id2gos_all(per_taxid)
def prep_goea(taxid=9606,
              prop_counts=True,
              alpha=0.05,
              method='fdr_bh',
              ref_list=None):
    """Download/load everything needed for GOEA and build the study object.

    Returns ``(goeaobj, symbol2id)`` where ``symbol2id`` maps upper-cased
    gene symbols to GeneIDs read from ``../data/df_human_geneinfo.csv``.
    When ``ref_list`` is None the ``GeneID`` column of that file is used as
    the reference population.
    """
    from goatools.base import download_go_basic_obo
    from goatools.base import download_ncbi_associations
    from goatools.obo_parser import GODag
    from goatools.anno.genetogo_reader import Gene2GoReader

    # Ontology and associations are fetched first, then loaded
    download_go_basic_obo()
    associations_path = download_ncbi_associations()
    ontology = GODag("go-basic.obo")

    # 9606 (the default) is the taxonomy ID for Homo sapiens
    ns2assoc = Gene2GoReader(associations_path, taxids=[taxid]).get_ns2assc()
    for namespace, gene2gos in ns2assoc.items():
        print("{NS} {N:,} annotated human genes".format(NS=namespace,
                                                        N=len(gene2gos)))

    from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS
    gene_table = pd.read_csv('../data/df_human_geneinfo.csv', index_col=0)

    # Without an explicit reference list, fall back to every gene in the table
    if ref_list is None:
        ref_list = gene_table['GeneID'].to_list()

    goeaobj = GOEnrichmentStudyNS(ref_list,
                                  ns2assoc,
                                  ontology,
                                  propagate_counts=prop_counts,
                                  alpha=alpha,
                                  methods=[method])

    # Symbol -> GeneID translation used to look up over-expressed genes
    upper_symbols = gene_table['Symbol'].str.upper()
    symbol2id = dict(zip(upper_symbols, gene_table['GeneID']))
    return goeaobj, symbol2id
def get_gene_pathway_mapping(organism=9606, annotations=None, force=False):
    """Return ``(gene_id, go_id)`` pairs for one organism.

    Args:
        organism: NCBI taxonomy id passed to ``Gene2GoReader`` (default
            9606).
        annotations: Optional container of GO ids; when given, only pairs
            whose GO id is ``in annotations`` are kept.  ``None`` keeps
            every pair.
        force: When truthy, an existing local ``gene2go`` file is deleted
            before ``download_ncbi_associations`` is called (presumably so
            a fresh copy is fetched -- confirm against goatools).

    Returns:
        list[tuple]: One ``(gene, go)`` tuple per retained association.
    """
    # Logical `and`, not bitwise `&`: `&` gives the wrong answer for truthy
    # non-bool values (e.g. 2 & True == 0) and raises for strings.
    if force and os.path.isfile('gene2go'):
        os.remove('gene2go')
    gene2go = Gene2GoReader(goatools.base.download_ncbi_associations(),
                            taxids=[organism]).get_id2gos_nss()
    return [
        (gene, go)
        for gene, gos in gene2go.items()
        for go in gos
        if annotations is None or go in annotations
    ]
def load_files(obo_fname, fin_gene2go):
    """Load the ontology, the Arabidopsis associations and the background
    gene set, then initialise a GOEA object.

    Returns ``(goeaobj, obodag, ns2assoc)``.
    """
    # The background-gene module is expected in the current directory (the
    # original comment says generate_background() creates it), so make the
    # current directory importable before importing it.
    import sys
    sys.path.insert(1, ".")
    from genes_ncbi_3702_proteincoding import GENEID2NT as GeneID2nt_ara

    obodag = GODag(obo_fname)

    # NCBI gene2go restricted to Arabidopsis thaliana (taxid 3702).
    # ns2assoc: namespace (BP/MF/CC) -> {NCBI GeneID: set of GO IDs}
    ns2assoc = Gene2GoReader(fin_gene2go, taxids=[3702]).get_ns2assc()
    for namespace, gene2gos in ns2assoc.items():
        print("{NS} {N:,} annotated Arabidopsis genes".format(
            NS=namespace, N=len(gene2gos)))

    study_options = {
        "propagate_counts": False,
        "alpha": 0.05,          # default significance cut-off
        "methods": ["fdr_bh"],  # default multiple-test correction method
    }
    goeaobj = GOEnrichmentStudyNS(
        GeneID2nt_ara.keys(),  # filtered Arabidopsis protein-coding genes
        ns2assoc,
        obodag,
        **study_options)
    return goeaobj, obodag, ns2assoc
def read_ncbi_gene2go(fin_gene2go, taxids=None, **kws):
    """Read NCBI's gene2go. Return gene2go data for user-specified taxids."""
    reader = Gene2GoReader(fin_gene2go, taxids)
    options = AnnoOptions(**kws)
    wants_taxid2asscs = 'taxid2asscs' in kws

    # Simple case: one species loaded and no detailed structure requested.
    # Pass go2geneids=True in kws to get GO -> gene ids instead of id2gos.
    if not wants_taxid2asscs and len(reader.taxid2asscs) == 1:
        only_taxid = next(iter(reader.taxid2asscs))
        return reader.get_annotations_dct(only_taxid, options)

    # Detailed associations split by taxid, holding both ID2GOs and GO2IDs
    per_taxid = reader.get_annotations_taxid2dct(options)
    if wants_taxid2asscs:
        reader.fill_taxid2asscs(kws['taxid2asscs'], per_taxid)
    return reader.get_id2gos_all(per_taxid)
def read_ncbi_gene2go(fin_gene2go, taxids=None, namespace='BP', **kws):
    """Read NCBI's gene2go. Return gene2go data for user-specified taxids."""
    reader = Gene2GoReader(fin_gene2go, taxids=taxids)
    wants_taxid2asscs = 'taxid2asscs' in kws

    # Simple case: one species loaded and no detailed structure requested.
    # Pass go2geneids=True in kws to get GO -> gene ids instead of id2gos.
    if not wants_taxid2asscs and len(reader.taxid2asscs) == 1:
        options = {key: val for key, val in kws.items()
                   if key in AnnoOptions.keys_exp}
        options['taxid'] = next(iter(reader.taxid2asscs))
        return reader.get_id2gos(namespace, **options)

    # Detailed associations split by taxid, holding both ID2GOs and GO2IDs
    per_taxid = reader.get_taxid2asscs(taxids, **kws)
    if wants_taxid2asscs:
        reader.fill_taxid2asscs(kws['taxid2asscs'], per_taxid)
    return reader.get_id2gos_all(per_taxid)
from goatools.godag_plot import plot_gos, plot_results, plot_goid2goobj
import collections as cx

# Fetch the GO ontology and NCBI's gene2go association file, then load the
# ontology into a DAG.
obo_fname = download_go_basic_obo()
fin_gene2go = download_ncbi_associations()
obodag = GODag("go-basic.obo")

# Gene identifiers of interest, read as strings from a local text file.
CDK1_gene_list = list(np.loadtxt('CDK1_top_effectors.txt', dtype=str))
#def load_data(directory):
#F_adjusted=np

# Read NCBI's gene2go. Store annotations in a list of namedtuples (9606 is the tax ID for humans)
objanno = Gene2GoReader(fin_gene2go, taxids=[9606])

# Get namespace2association where:
#    namespace is:
#        BP: biological_process
#        MF: molecular_function
#        CC: cellular_component
#    association is a dict:
#        key: NCBI GeneID
#        value: A set of GO IDs associated with that gene
ns2assoc = objanno.get_ns2assc()

# Report how many genes carry annotations in each namespace.
for nspc, id2gos in ns2assoc.items():
    print("{NS} {N:,} annotated human genes".format(NS=nspc, N=len(id2gos)))
print()
file_gene2go = download_ncbi_associations() ### 2. LOAD ONTOLOGIES, ASSOCIATIONS AND BACKGROUND GENE SET ### 2a. Load Ontologies from goatools.obo_parser import GODag obodag = GODag("go-basic.obo") ### 2b. Load Associations from __future__ import print_function from goatools.anno.genetogo_reader import Gene2GoReader # Read NCBI's gene2go. Store annotations in a list of namedtuples objanno = Gene2GoReader(file_gene2go, taxids=[10090]) # Get associations for each branch of the GO DAG (BP, MF, CC) ns2assoc = objanno.get_ns2assc() for nspc, id2gos in ns2assoc.items(): print("{NS} {N:,} annotated mouse genes".format(NS=nspc, N=len(id2gos))) from goatools.cli.ncbi_gene_results_to_python import NCBIgeneToPythonCli from genes_ncbi_10090_proteincoding import GENEID2NT as GeneID2nt_mus ### followed "https://github.com/tanghaibao/goatools/blob/1e93d26e4c93cb17786ab5fe736f90dc4f79421a/notebooks/backround_genes_ncbi.ipynb" ### to download a set of background population genes from NCBI. from genes_ncbi_10090_proteincoding import GENEID2NT as GeneID2nt_mus
from sklearn.metrics import f1_score from sklearn.metrics import roc_auc_score from sklearn.model_selection import StratifiedKFold from sklearn.linear_model import LogisticRegression from goatools.base import download_ncbi_associations from goatools.anno.genetogo_reader import Gene2GoReader from sklearn.pipeline import make_pipeline from sklearn.preprocessing import StandardScaler from sklearn import preprocessing import utils import os gene2go = download_ncbi_associations() objanno = Gene2GoReader(gene2go, taxids=[9606], go2geneids=True) go2geneIDs = objanno.get_goid2dbids( objanno.associations ) # this is a dict. Keys are GO IDs, values are gene_IDs of the genes that are associated to that GO term geneID2GO = objanno.get_dbid2goids(objanno.associations) goID2goTerm = {item.GO_ID: item.GO_term for item in objanno.associations} genes_in_GO = list(geneID2GO.keys()) # these are entrez_ids def distance_df(emb_df, metric='euclidean'): """Creates a distance matrix for a given embedding DataFrame.
def get_genes_for_go_terms(terms, taxid=9606):
    """Return a DataFrame describing the genes annotated with GO ``terms``.

    Args:
        terms: A GO id or a list of GO ids.  Anything that is not a list is
            wrapped in a one-element list.
        taxid: NCBI taxonomy id used to select gene2go annotations
            (default 9606).

    Returns:
        pandas.DataFrame: Columns ``symbol``, ``name``, ``chrom``,
        ``start``, ``end``; one row per Entrez summary, sorted by ``start``
        with a fresh index.  Genes without usable coordinates get
        ``pandas.NA`` in the three position columns and a warning on
        stderr.
    """
    if not isinstance(terms, list):
        terms = [terms]

    # goatools is chatty; send everything it prints to /dev/null.
    with open(os.devnull, 'w') as null, redirect_stdout(null):
        obo_fname = download_and_move_go_basic_obo(prt=null)
        gene2go = download_ncbi_associations(prt=null)
        objanno = Gene2GoReader("geneinfo_cache/gene2go", taxids=[taxid],
                                prt=null)
        go2geneids = objanno.get_id2gos(namespace='*', go2geneids=True,
                                        prt=null)
        srchhelp = GoSearch("geneinfo_cache/go-basic.obo",
                            go2items=go2geneids, log=null)
        geneids = srchhelp.get_items(terms)

        ncbi_tsv = f'geneinfo_cache/{taxid}_protein_genes.txt'
        if not os.path.exists(ncbi_tsv):
            fetch_background_genes(taxid)
        output_py = f'geneinfo_cache/{taxid}_protein_genes.py'
        ncbi_tsv_to_py(ncbi_tsv, output_py, prt=null)

    # NOTE(review): GENEID2NT is loaded but not used below; kept so that a
    # broken generated module still fails here as it did before.
    protein_genes = importlib.import_module(
        output_py.replace('.py', '').replace('/', '.'))
    GENEID2NT = protein_genes.GENEID2NT

    fetch_ids = list(map(str, geneids))
    records = []
    batch_size = 2000
    for i in range(0, len(fetch_ids), batch_size):
        to_fetch = fetch_ids[i:i + batch_size]
        handle = Entrez.esummary(db="gene", id=",".join(to_fetch),
                                 retmax=batch_size)
        try:
            entry = Entrez.read(handle)
        finally:
            # Release the connection even when parsing fails.
            handle.close()
        docsums = entry['DocumentSummarySet']['DocumentSummary']
        for doc in docsums:
            try:
                chrom_pos = (doc['Chromosome'],
                             doc['GenomicInfo'][0]['ChrStart'],
                             doc['GenomicInfo'][0]['ChrStop'])
            except Exception:
                # Still best-effort, but no longer swallows
                # KeyboardInterrupt/SystemExit like a bare `except:`.
                print(
                    f"WARNING: missing chromosome coordinates for {doc['Name']} are listed as pandas.NA",
                    file=sys.stderr)
                chrom_pos = (pd.NA, pd.NA, pd.NA)
            records.append((doc['Name'], doc['Description'], *chrom_pos))

    df = pd.DataFrame.from_records(
        records, columns=['symbol', 'name', 'chrom', 'start', 'end'])
    # NOTE(review): 'start' holds whatever Entrez returned; if those are
    # strings the sort is lexicographic, not numeric -- confirm.
    return df.sort_values(by='start').reset_index(drop=True)
def plot_go_enrichment(coef_df, auc_vals, pheno_dict, args, mode='abs'):
    """Plot a heatmap of GO terms enriched among high-scoring genes.

    For every row of the coefficient matrix whose index entry is not a
    ``RandomType``, a foreground gene set is chosen according to ``mode``
    and tested for GO enrichment against the background of all genes that
    appear both in ``coef_df.columns`` and in ``GENEID2NT``.  The log10 of
    the FDR-corrected p-values of significantly enriched terms is drawn as
    a heatmap and saved as an SVG under ``plot_dir``.

    Args:
        coef_df: DataFrame of coefficients; columns are matched against
            gene symbols and grouped on column level 0.
        auc_vals: Not used in this function body.
        pheno_dict: Not used in this function body.
        args: Object providing ``go_dir``, ``expr_source``, ``cohort``,
            ``gene`` and ``classif``.
        mode: One of ``'abs'``, ``'high'``, ``'low'`` or ``'bayes'``.

    Returns:
        None.  Returns early (after printing a message) when no enriched
        GO term was found.

    Raises:
        ValueError: If ``mode`` is not one of the recognised values.
    """
    # Download (if needed) and load the ontology and the human associations
    # into args.go_dir.
    obo_fl = os.path.join(args.go_dir, "go-basic.obo")
    download_go_basic_obo(obo_fl)
    obodag = GODag(obo_fl)

    assoc_fl = os.path.join(args.go_dir, "gene2go")
    download_ncbi_associations(assoc_fl)
    objanno = Gene2GoReader(assoc_fl, taxids=[9606])
    ns2assoc = objanno.get_ns2assc()

    # Background: genes present both in the coefficients and in GENEID2NT,
    # translated from symbol to NCBI id.
    ncbi_map = {info.Symbol: ncbi_id for ncbi_id, info in GENEID2NT.items()}
    use_genes = set(coef_df.columns) & set(ncbi_map)
    bgrd_ids = [ncbi_map[gn] for gn in use_genes]

    goeaobj = GOEnrichmentStudyNS(bgrd_ids, ns2assoc, obodag,
                                  propagate_counts=False, alpha=0.05,
                                  methods=['fdr_bh'])

    plot_dict = dict()
    # NOTE(review): use_gos is never read in this function.
    use_gos = set()
    coef_mat = coef_df.loc[:, [gene in use_genes
                               for gene in coef_df.columns]]

    # 'bayes' keeps the ungrouped matrix for iteration and uses per-gene
    # mean and std; every other mode collapses to the per-gene mean.
    if mode == 'bayes':
        coef_means = coef_mat.groupby(level=0, axis=1).mean()
        coef_stds = coef_mat.groupby(level=0, axis=1).std()
    else:
        coef_mat = coef_mat.groupby(level=0, axis=1).mean()

    for mtype, coefs in coef_mat.iterrows():
        if not isinstance(mtype, RandomType):
            # Pick the foreground genes; use_clr is later passed as the
            # `start` of the cubehelix palette.
            if mode == 'abs':
                fgrd_ctf = coefs.abs().quantile(0.95)
                fgrd_genes = coefs.index[coefs.abs() > fgrd_ctf]
                use_clr = 3.17
            elif mode == 'high':
                fgrd_ctf = coefs.quantile(0.95)
                fgrd_genes = coefs.index[coefs > fgrd_ctf]
                use_clr = 2.03
            elif mode == 'low':
                fgrd_ctf = coefs.quantile(0.05)
                fgrd_genes = coefs.index[coefs < fgrd_ctf]
                use_clr = 1.03
            elif mode == 'bayes':
                gene_scrs = coef_means.loc[mtype].abs() - coef_stds.loc[mtype]
                fgrd_genes = gene_scrs.index[gene_scrs > 0]
                use_clr = 3.17
            else:
                raise ValueError(
                    "Unrecognized `mode` argument <{}>!".format(mode))

            fgrd_ids = [ncbi_map[gn] for gn in fgrd_genes]
            goea_out = goeaobj.run_study(fgrd_ids, prt=None)

            # Keep only significantly enriched ('e') terms.
            plot_dict[mtype] = {
                rs.name: np.log10(rs.p_fdr_bh)
                for rs in goea_out
                if rs.enrichment == 'e' and rs.p_fdr_bh < 0.05
            }

    plot_df = pd.DataFrame(plot_dict, columns=plot_dict.keys())
    if plot_df.shape[0] == 0:
        print("Could not find any enriched GO terms across {} "
              "subgroupings!".format(plot_df.shape[1]))
        return None

    fig, ax = plt.subplots(figsize=(4.7 + plot_df.shape[0] / 2.3,
                                    2 + plot_df.shape[1] / 5.3))

    # With more than two GO terms, order them by hierarchical clustering
    # before transposing.
    if plot_df.shape[0] > 2:
        plot_df = plot_df.iloc[dendrogram(linkage(distance.pdist(
            plot_df.fillna(0.0), metric='cityblock'), method='centroid'),
            no_plot=True)['leaves']].transpose()
    else:
        plot_df = plot_df.transpose()

    xlabs = [rs_nm for rs_nm in plot_df.columns]
    ylabs = [
        get_fancy_label(tuple(mtype.subtype_iter())[0][1])
        for mtype in plot_df.index
    ]

    # NOTE(review): use_clr is only assigned inside the loop above; it is
    # bound here only if at least one non-RandomType row was processed.
    pval_cmap = sns.cubehelix_palette(start=use_clr, rot=0, dark=0, light=1,
                                      reverse=True, as_cmap=True)
    sns.heatmap(plot_df, cmap=pval_cmap, vmin=-5, vmax=0,
                linewidths=0.23, linecolor='0.73',
                xticklabels=xlabs, yticklabels=ylabs)

    ax.set_xticklabels(xlabs, size=15, ha='right', rotation=31)
    ax.set_yticklabels(ylabs, size=9, ha='right', rotation=0)
    ax.set_xlim((plot_df.shape[1] / -83, plot_df.shape[1] * 1.009))
    ax.set_ylim((plot_df.shape[0] * 1.009, plot_df.shape[0] / -83))

    # plot_dir is a module-level name defined outside this function.
    plt.savefig(os.path.join(
        plot_dir, '__'.join([args.expr_source, args.cohort]),
        "{}_go-{}-enrichment_{}.svg".format(args.gene, mode, args.classif)),
        bbox_inches='tight', format='svg')
    plt.close()
def go_enrichment(gene_list,
                  taxid=9606,
                  background_chrom=None,
                  background_genes=None,
                  terms=None,
                  list_study_genes=False,
                  alpha=0.05):
    """Run a GO enrichment analysis for ``gene_list``.

    Args:
        gene_list: Study genes as NCBI gene ids (ints) or anything accepted
            by ``_cached_symbol2ncbi``; a pandas Series is accepted.
        taxid: Taxonomy identifier, normalised with ``_tidy_taxid``.
        background_chrom: If given, restrict the background to rows whose
            ``chromosome`` column equals this value.
        background_genes: Optional custom background.  Ints are used as
            gene ids, all-numeric strings are converted to ints, anything
            else is translated with ``_cached_symbol2ncbi``.
        terms: Optional GO ids; when given the GO DAG is limited to the
            sub-ontology written by ``WrSubObo`` for these terms.
        list_study_genes: Add a ``study_genes`` column with the symbols of
            the study genes behind each term.
        alpha: Only rows with ``p_fdr_bh < alpha`` are returned.

    Returns:
        pandas.DataFrame: One row per retained GO term, sorted by
        ``p_fdr_bh`` then ``ratio``.
    """
    if isinstance(gene_list, pd.Series):
        gene_list = gene_list.tolist()
    if isinstance(terms, pd.Series):
        terms = terms.tolist()

    _assert_entrez_email()

    gene_list = list(gene_list)
    taxid = _tidy_taxid(taxid)

    ncbi_tsv = f'geneinfo_cache/{taxid}_protein_genes.txt'
    if not os.path.exists(ncbi_tsv):
        fetch_background_genes(taxid)

    # goatools is chatty; send everything it prints to /dev/null.
    with open(os.devnull, 'w') as null, redirect_stdout(null):

        obo_fname = download_and_move_go_basic_obo(prt=null)
        file_gene2go = download_ncbi_associations(prt=null)

        obodag = GODag("geneinfo_cache/go-basic.obo",
                       optional_attrs=['relationship', 'def'], prt=null)

        # read NCBI's gene2go. Store annotations in a list of namedtuples
        objanno = Gene2GoReader(file_gene2go, taxids=[taxid])

        # get associations for each branch of the GO DAG (BP, MF, CC)
        ns2assoc = objanno.get_ns2assc()

        # limit go dag to a sub graph including only specified terms and
        # their children
        if terms is not None:
            # NOTE(review): built-in hash() of bytes/str is salted per
            # process, so this file name differs between runs.
            sub_obo_name = 'geneinfo_cache/' + str(
                hash(''.join(sorted(terms)).encode())) + '.obo'
            wrsobo = WrSubObo(obo_fname,
                              optional_attrs=['relationship', 'def'])
            wrsobo.wrobo(sub_obo_name, terms)
            obodag = GODag(sub_obo_name,
                           optional_attrs=['relationship', 'def'], prt=null)

        # load background gene set of all genes
        background_genes_file = f'geneinfo_cache/{taxid}_protein_genes.txt'
        if not os.path.exists(background_genes_file):
            fetch_background_genes(taxid)

        # load any custom subset
        if background_genes:
            if not all(isinstance(x, int) for x in background_genes):
                if all(str(x).isnumeric() for x in background_genes):
                    # Numeric ids given as strings (or numpy ints) must be
                    # ints to match the integer GeneID column; converting
                    # them with str() left the filter below empty.
                    background_genes = [int(x) for x in background_genes]
                else:
                    background_genes = _cached_symbol2ncbi(
                        background_genes, taxid=taxid)
            df = pd.read_csv(background_genes_file, sep='\t')
            no_suffix = os.path.splitext(background_genes_file)[0]
            background_genes_file = f'{no_suffix}_{hash("".join(map(str, sorted(background_genes))))}.txt'
            df.loc[df.GeneID.isin(background_genes)].to_csv(
                background_genes_file, sep='\t', index=False)

        # limit background gene set
        if background_chrom is not None:
            df = pd.read_csv(background_genes_file, sep='\t')
            background_genes_file = f'{os.path.splitext(background_genes_file)[0]}_{background_chrom}.txt'
            df.loc[df.chromosome == background_chrom].to_csv(
                background_genes_file, sep='\t', index=False)

        # Turn the background table into an importable module and (re)load
        # it so that a changed background is picked up.
        output_py = f'geneinfo_cache/{taxid}_background.py'
        ncbi_tsv_to_py(background_genes_file, output_py, prt=null)

        background_module_name = output_py.replace('.py', '').replace('/', '.')
        background_module = importlib.import_module(background_module_name)
        importlib.reload(background_module)
        GeneID2nt = background_module.GENEID2NT

        if not all(isinstance(x, int) for x in gene_list):
            gene_list = _cached_symbol2ncbi(gene_list, taxid=taxid)

        goeaobj = GOEnrichmentStudyNS(
            GeneID2nt,  # background protein-coding genes for this taxid
            ns2assoc,  # geneid/GO associations
            obodag,  # Ontologies
            propagate_counts=False,
            alpha=0.05,  # default significance cut-off
            methods=['fdr_bh'],
            pvalcalc='fisher_scipy_stats')

        goea_results_all = goeaobj.run_study(gene_list)

        rows = []
        columns = [
            'namespace', 'term_id', 'e/p', 'pval_uncorr', 'p_fdr_bh',
            'ratio', 'bg_ratio', 'obj'
        ]
        if list_study_genes:
            columns.append('study_genes')
        for ntd in goea_results_all:
            ntd.__class__ = My_GOEnrichemntRecord  # Hack. Changes __class__ of all instances...
            row = [
                ntd.NS, ntd.GO, ntd.enrichment, ntd.p_uncorrected,
                ntd.p_fdr_bh,
                ntd.ratio_in_study[0] / ntd.ratio_in_study[1],
                ntd.ratio_in_pop[0] / ntd.ratio_in_pop[1], ntd
            ]
            if list_study_genes:
                row.append(_cached_ncbi2symbol(sorted(ntd.study_items)))
            rows.append(row)

        df = (pd.DataFrame.from_records(rows, columns=columns).sort_values(
            by=['p_fdr_bh', 'ratio']).reset_index(drop=True))
        return df.loc[df.p_fdr_bh < alpha]
def get_go_ids(go_ids, species='H**o sapiens'):
    '''
    Collect the gene symbols annotated with the given GO terms or with any
    of their child terms.

    Parameters
    ----------
    go_ids : str or list of str
        A single GO ID or several of them.
    species : str, optional
        Key into ``TAXA`` selecting the taxonomy ID.

    Returns
    -------
    list of str
    '''
    # NOTE(review): the default species string looks garbled; it is kept
    # byte-for-byte -- confirm it matches a key of TAXA.
    assert species in TAXA
    if isinstance(go_ids, str):
        go_ids = [go_ids]

    obo_fname = download_go_basic_obo('db/go/go-basic.obo')
    gene2go = download_ncbi_associations('db/go/gene2go')

    taxid = TAXA[species]
    fin_symbols = 'genes_NCBI_{TAXID}_All.py'.format(TAXID=taxid)
    symbols_module = importlib.import_module(
        ''.join(['goatools.test_data.', fin_symbols[:-3]]))
    GeneID2nt = symbols_module.GENEID2NT

    reader = Gene2GoReader(
        'db/go/gene2go',
        taxids=[taxid],
    )
    # GO ID -> list of the DB_IDs annotated with it
    go2items = defaultdict(list)
    for assoc in reader.taxid2asscs[taxid]:
        go2items[assoc.GO_ID].append(assoc.DB_ID)

    srchhelp = GoSearch('db/go/go-basic.obo', go2items=go2items)
    with open('go.log', 'w') as log:
        # Extend the requested GO IDs with their children
        gos_all = srchhelp.add_children_gos(go_ids)
        # Gene IDs attached to the requested terms and to the extended set
        gene_ids = set()
        gene_ids.update(srchhelp.get_items(go_ids))
        gene_ids.update(srchhelp.get_items(gos_all))

    # Translate gene IDs to symbols, dropping IDs missing from the table
    records = (GeneID2nt.get(geneid, None) for geneid in gene_ids)
    return [nt.Symbol for nt in records if nt is not None]
def pullGOenrichment(inputFile, project):
    """Run a human GO enrichment study for the genes listed in a CSV file.

    Args:
        inputFile: Path to a CSV file whose first column is an NCBI gene id
            and whose second column is the gene symbol.  Blank lines and
            rows with an empty first column are skipped.
        project: Suffix appended to the output file names.

    Writes the significant results (``p_fdr_bh < 0.05``) to
    ``Data/go_enrichment<project>.csv`` (via ``wr_xlsx``) and
    ``Data/go_enrichment<project>.txt``.
    """
    from collections import Counter

    GeneID2nt_hum = genes_NCBI_9606_ProteinCoding.GENEID2NT
    obo_fname = download_go_basic_obo()
    fin_gene2go = download_ncbi_associations()
    obodag = GODag("go-basic.obo")

    # Read NCBI's gene2go. Store annotations in a list of namedtuples
    objanno = Gene2GoReader(fin_gene2go, taxids=[9606])

    # Get namespace2association where:
    #    namespace is:
    #        BP: biological_process
    #        MF: molecular_function
    #        CC: cellular_component
    #    association is a dict:
    #        key: NCBI GeneID
    #        value: A set of GO IDs associated with that gene
    ns2assoc = objanno.get_ns2assc()
    for nspc, id2gos in ns2assoc.items():
        print("{NS} {N:,} annotated human genes".format(NS=nspc,
                                                        N=len(id2gos)))
    print(len(GeneID2nt_hum))

    goeaobj = GOEnrichmentStudyNS(
        GeneID2nt_hum.keys(),  # List of human protein-coding genes
        ns2assoc,  # geneid/GO associations
        obodag,  # Ontologies
        propagate_counts=False,
        alpha=0.05,  # default significance cut-off
        methods=['fdr_bh'])  # default multiple-test correction method

    geneid2symbol = {}
    # newline='' is what the csv module expects for files it reads; the
    # with-statement closes the file, so no explicit close() is needed.
    with open(inputFile, 'r', newline='') as infile:
        for line in csv.reader(infile):
            if not line:
                # csv.reader yields [] for blank lines; indexing it used to
                # raise IndexError.
                continue
            geneid = line[0]
            symbol = line[1]
            if geneid:
                geneid2symbol[int(geneid)] = symbol

    geneids_study = geneid2symbol.keys()
    goea_results_all = goeaobj.run_study(geneids_study)
    goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < 0.05]

    ctr = Counter(r.NS for r in goea_results_sig)
    print('Significant results[{TOTAL}] = {BP} BP + {MF} MF + {CC} CC'.format(
        TOTAL=len(goea_results_sig),
        BP=ctr['BP'],  # biological_process
        MF=ctr['MF'],  # molecular_function
        CC=ctr['CC']))  # cellular_component

    # NOTE(review): wr_xlsx is given a ".csv" file name; kept unchanged
    # because other code may rely on this path -- confirm the format.
    goeaobj.wr_xlsx("Data/go_enrichment" + project + ".csv", goea_results_sig)
    goeaobj.wr_txt("Data/go_enrichment" + project + ".txt", goea_results_sig)