def __init__(self):
    """Read the human GAF associations and record gene-count extremes per GO ID."""
    self.prt = sys.stdout
    gaf_path = os.path.join(REPO, "goa_human.gaf")
    # Mapping returned by dnld_assc (gene -> GO IDs, going by the attribute name)
    self.gene2gos_orig = dnld_assc(gaf_path, go2obj=None, prt=self.prt)
    # Inverse mapping (GO ID -> genes)
    self.go2genes_orig = get_b2aset(self.gene2gos_orig)
    gene_counts = list(map(len, self.go2genes_orig.values()))
    self.min_genes = min(gene_counts)
    self.max_genes = max(gene_counts)
    # Inverting the inverse must reproduce the mapping that was read
    assert self.gene2gos_orig == get_b2aset(self.go2genes_orig)
def _run_get_id2gos(annoobjs):
    """Test get_id2gos on every annotation object in annoobjs."""
    # Template only; it is formatted lazily, when the comparison below fails
    msg_mismatch = 'INC ALL({A}) != EXC IEA({I}): {DIF}'
    for idx, obj in enumerate(annoobjs):
        header = '\n{I}) get_id2gos {DESC} {NSs} {N:,} annotations'.format(
            I=idx, DESC=obj.get_desc(), NSs=obj.namespaces, N=len(obj.associations))
        print(header)

        print('Load all evidence codes')
        # If all namespaces are loaded, returns BP, else returns loaded NS
        id2gos = obj.get_id2gos()
        assert id2gos, 'NO ANNOTATIONS FOUND'

        print('Load all evidence codes, except IEA')
        inc_good = obj.get_id2gos(ev_include=INC_GOOD)
        exc_iea = obj.get_id2gos(ev_exclude={'IEA'})
        assert exc_iea, 'NO NON-IEA ANNOTATIONS FOUND'
        # Including INC_GOOD must select the same annotations as excluding IEA
        assert inc_good == exc_iea, \
            msg_mismatch.format(A=len(inc_good), I=len(exc_iea), DIF='')

        num_ids = len(id2gos)
        anno_desc = obj.get_desc()
        num_gos = len(get_b2aset(id2gos))
        print('>>>>> {I} >>>>> get_id2gos {N:6,} go2ids[{B:6,}] {ANNO}'.format(
            I=idx, N=num_ids, ANNO=anno_desc, B=num_gos))

        # Spot-check one value: it must look like a GO ID
        first_gos = next(iter(id2gos.values()))
        first_goid = next(iter(first_gos))
        assert first_goid[:3] == "GO:"

        # Expected ID count when 'data/association' is read without a GO DAG
        is_assc_file = obj.filename[-16:] == 'data/association'
        if is_assc_file and obj.godag is None:
            assert num_ids == 34284
def _prune_assc(self, assc_geneid2gos, max_genecnt, godag, prt=sys.stdout):
    """Remove GO IDs which are associated with large numbers of genes.

    The pruning itself is delegated to get_assc_pruned. This method also writes
    one description line per removed GO ID and a percentile summary of the
    genes-per-GO counts to ``prt``, then returns the pruned association.
    """
    pruned_assc, removed_goids = get_assc_pruned(
        assc_geneid2gos, None, max_genecnt, prt=prt)
    # Report on the association passed in by the caller, not on the pruned result
    go2genes_arg = get_b2aset(assc_geneid2gos)
    go2desc = self.get_go2desc(removed_goids, godag, go2genes_arg)
    for description in go2desc.values():
        prt.write(" {DESC}\n".format(DESC=description))
    genes_per_go = list(map(len, go2genes_arg.values()))
    prt_percentiles(
        "Number of genes associated with GO IDs.", genes_per_go, "{:6.0f}", prt)
    return pruned_assc
def _get_id2gos(file_id2gos, godag, name2go):
    """Get annotations: read file_id2gos if it exists, else create and write them."""
    if os.path.exists(file_id2gos):
        reader = IdToGosReader(file_id2gos, godag=godag)
        return reader.get_id2gos('CC')

    # Number of made-up genes to attach to the GO ID of each named term
    name_qty = (
        ('A', 10), ('B', 10), ('C', 10), ('D', 10),
        ('E', 10), ('F', 10), ('G', 10), ('H', 10),
        ('I', 30), ('L', 30), ('M', 20), ('N', 30),
    )
    id2num = {name2go[name]: qty for name, qty in name_qty}

    go2genes = cx.defaultdict(set)
    first_gene = 0
    for goid, qty in id2num.items():
        # Genes are consecutive integers, so no gene is shared between GO IDs
        go2genes[goid].update(range(first_gene, first_gene + qty))
        first_gene += qty

    id2gos = get_b2aset(go2genes)
    IdToGosReader.wr_id2gos(file_id2gos, id2gos)
    return id2gos
def __init__(self, objsim):
    """Collect the lookups needed from one simulation object (objsim)."""
    self.go2genes = get_b2aset(objsim.goeasim_assc)
    sim_params = objsim.pobj.params
    sim_results = objsim.goeasim_res
    self.gos_bg = sim_params['goids_study_bg']
    # Every GO ID that has a result record
    self.gos_sig_all = {res.GO for res in sim_results}
    self.go2obj = sim_params['gosubdag'].go2obj
    self.go2res = {res.GO: res for res in sim_results}
    # Attribute name built from the configured method, e.g. "p_<method>"
    self.attr_pval = "p_{METHOD}".format(METHOD=objsim.pobj.objbase.method)
    self.get_go2desc = objsim.pobj.objassc.get_go2desc
def _adj_for_assc(self):
    """Print only GO IDs from associations and their ancestors.

    When associations are present, fill in the 'item_marks' and 'include_only'
    keyword settings unless the caller already supplied them.
    """
    if not self.gene2gos:
        return
    go2genes = get_b2aset(self.gene2gos)
    gos_assoc = set(go2genes.keys())
    if 'item_marks' not in self.kws:
        # Mark every GO ID found in the associations with '>'
        self.kws['item_marks'] = dict.fromkeys(gos_assoc, '>')
    if 'include_only' not in self.kws:
        gosubdag = GoSubDag(
            gos_assoc, self.gosubdag.go2obj, self.gosubdag.relationships)
        self.kws['include_only'] = gosubdag.go2obj
def _adj_for_assc(self):
    """Print only GO IDs from associations and their ancestors.

    Does nothing without associations. Otherwise sets defaults for the
    'item_marks' and 'include_only' keyword settings if they are not present.
    """
    if not self.gene2gos:
        return
    gos_assoc = set(get_b2aset(self.gene2gos).keys())
    missing_marks = 'item_marks' not in self.kws
    if missing_marks:
        # '>' marks each GO ID taken from the associations
        self.kws['item_marks'] = dict.fromkeys(gos_assoc, '>')
    missing_include = 'include_only' not in self.kws
    if missing_include:
        sub = GoSubDag(gos_assoc, self.gosubdag.go2obj, self.gosubdag.relationships)
        self.kws['include_only'] = sub.go2obj
def set_targeted(self, goids_tgtd):
    """Set targeted GO IDs: Significant, but not tracked.

    Stores the GO IDs, reports how many genes they cover, and builds separate
    RandAssc objects from the two associations returned by _split_assc.
    """
    self.goids_tgtd = goids_tgtd
    # self.go2genes contains only GO IDs related to the population genes
    tgtd_go2genes = {}
    for goid in goids_tgtd:
        tgtd_go2genes[goid] = self.go2genes[goid]
    tgtd_genes = get_b2aset(tgtd_go2genes)
    print("TARGETED GENES({Gs})".format(Gs=len(tgtd_genes)))
    assc_pruned, assc_tgtd = self._split_assc(goids_tgtd)
    # TBD rm
    print("AASSSSCC LENS PRUNED({}) TGTD({})".format(len(assc_pruned), len(assc_tgtd)))
    self.objassc_pruned = RandAssc(assc_pruned)
    self.objassc_tgtd = RandAssc(assc_tgtd)
def __init__(self, params, godag):
    """Prepare the gene-to-GO associations used by the simulations.

    Reads the association named by params['association_file'], updates it with
    update_association when params['propagate_counts'] is true, passes it
    through _possibly_prune_assc, and keeps only genes that are also listed in
    params['genes_population'].
    """
    assc_file = params['association_file']
    genes_requested = params['genes_population']
    self.assc_hdr = get_gaf_hdr(assc_file)
    # Original author's note: obsolete GO IDs are removed from the association if needed
    assc = self._init_assc(assc_file, genes_requested, godag)

    # Propagate counts BEFORE pruning (original author's note: propagation
    # creates highly associated GO IDs)
    do_propagate = params.get('propagate_counts', False)
    sys.stdout.write("PROPAGATE_COUNTS({VAL})\n".format(VAL=do_propagate))
    if do_propagate:
        # update_association's return value is unused, so it presumably updates
        # in place; copying the GO sets keeps the sets from _init_assc untouched
        assc = {gene: set(gos) for gene, gos in assc.items()}
        update_association(assc, godag)

    # Prune BEFORE getting the population gene list so all pop genes have associations
    assc = self._possibly_prune_assc(assc, 1000, godag, params)

    # For sim analysis: use population genes found in the association
    self.pop_genes = set(genes_requested) & set(assc.keys())
    # Speed sims: use the association subset actually in the population
    assc_pop = {gene: gos for gene, gos in assc.items() if gene in self.pop_genes}
    self.pop_gos = self._init_assc_pop(assc_pop)
    self.go2genes = get_b2aset(assc_pop)
    # Holds assc as well as providing shuffled version
    self.objassc_all = RandAssc(assc_pop)

    # Set by local set_targeted() when RunParams is initialized
    self.goids_tgtd = None  # Artifact GO IDs found to be also truly significant
    self.objassc_pruned = None
    self.objassc_tgtd = None
def _get_id2gos(file_id2gos, godag, name2go):
    """Get annotations: build and write them from NAME2NUM unless the file exists."""
    if not os.path.exists(file_id2gos):
        go2genes = cx.defaultdict(set)
        next_gene = 0
        for name, qty in NAME2NUM.items():
            goid = name2go[name]
            # Hand out qty fresh integer gene IDs to this GO ID
            for _ in range(qty):
                go2genes[goid].add(next_gene)
                next_gene += 1
        id2gos = get_b2aset(go2genes)
        IdToGosReader.wr_id2gos(file_id2gos, id2gos)
        return id2gos
    reader = IdToGosReader(file_id2gos, godag=godag)
    return reader.get_id2gos('CC')
def describe_assc(org, fin_assc, go2obj, obj, prt):
    """Report statistics for a single association.

    Passes two rows to obj.prt_data: the number of GO IDs per gene and the
    number of genes per GO ID, each labelled with the organism name.
    """
    # Example of the reported table:
    #
    #   Assc.       | # Assc| range      | 25th | median | 75th | mean | stddev
    #   ------------|-------|------------|------|--------|------|------|-------
    #   hsa GO/gene | 19394 | 1 to 212   |    5 |      9 |   17 |   13 |     14
    #   hsa gene/GO | 17277 | 1 to 8,897 |    1 |      3 |    8 |   15 |    120
    #
    #   mus GO/gene | 19870 | 1 to 261   |    5 |     10 |   18 |   14 |     15
    #   mus gene/GO | 17491 | 1 to 7,009 |    1 |      3 |    8 |   16 |    129
    #
    #   dme GO/gene | 12551 | 1 to 137   |    2 |      4 |    8 |    6 |      7
    #   dme gene/GO |  7878 | 1 to 1,675 |    1 |      3 |    7 |   10 |     41
    gene2gos = dnld_assc(fin_assc, go2obj, prt=None)  # Associations
    go2genes = get_b2aset(gene2gos)
    gos_per_gene = list(map(len, gene2gos.values()))
    genes_per_go = list(map(len, go2genes.values()))
    rows = (
        ("{ORG} GO/gene", gos_per_gene),
        ("{ORG} gene/GO", genes_per_go),
    )
    for label, counts in rows:
        obj.prt_data(label.format(ORG=org), counts, prt)
def describe_assc(org, fin_assc, go2obj, obj, prt):
    """Report statistics for a single association.

    Fails with an AssertionError if the association or its inverse is empty;
    otherwise passes GO-per-gene and gene-per-GO counts to obj.prt_data.
    """
    # Example of the reported table:
    #
    #   Assc.       | # Assc| range      | 25th | median | 75th | mean | stddev
    #   ------------|-------|------------|------|--------|------|------|-------
    #   hsa GO/gene | 19394 | 1 to 212   |    5 |      9 |   17 |   13 |     14
    #   hsa gene/GO | 17277 | 1 to 8,897 |    1 |      3 |    8 |   15 |    120
    #
    #   mus GO/gene | 19870 | 1 to 261   |    5 |     10 |   18 |   14 |     15
    #   mus gene/GO | 17491 | 1 to 7,009 |    1 |      3 |    8 |   16 |    129
    #
    #   dme GO/gene | 12551 | 1 to 137   |    2 |      4 |    8 |    6 |      7
    #   dme gene/GO |  7878 | 1 to 1,675 |    1 |      3 |    7 |   10 |     41
    gene2gos = dnld_assc(fin_assc, go2obj, prt=None)  # Associations
    go2genes = get_b2aset(gene2gos)
    assert gene2gos
    assert go2genes
    label_gos = "{ORG} GO/gene".format(ORG=org)
    label_genes = "{ORG} gene/GO".format(ORG=org)
    num_gos_each_gene = list(map(len, gene2gos.values()))
    num_genes_each_go = list(map(len, go2genes.values()))
    obj.prt_data(label_gos, num_gos_each_gene, prt)
    obj.prt_data(label_genes, num_genes_each_go, prt)
def get_go2chrs(sec2gos, sec2chr):
    """Dict: given a GO return a set of letters representing its section membership(s)."""
    go2sections = get_b2aset(sec2gos)
    return {
        goid: {sec2chr[section] for section in sections}
        for goid, sections in go2sections.items()
    }
def test_anno_read():
    """Test all annotation formats.

    Creates annotation objects for gene2go, gaf, gpad and id2gos inputs, with and
    without a namespace restriction and GO DAG, then runs the same checks on
    each: evidence-code summary, Extension field printing, get_id2gos, and
    _tst_ns2 (skipped for gpad/id2gos objects that have no GO DAG).
    """
    godag = get_godag(os.path.join(REPO, 'go-basic.obo'))
    # pylint: disable=superfluous-parens
    print(
        '- DOWNLOAD AND LOAD -----------------------------------------------')
    annoobjs = [
        # gene2go
        _get_objanno('gene2go', taxid=10090),
        _get_objanno('gene2go', taxid=10090, namespaces={'BP'}),
        _get_objanno('gene2go', taxid=10090, namespaces={'MF'}),
        _get_objanno('gene2go', taxid=10090, namespaces={'CC'}),
        # gaf
        _get_objanno('goa_human.gaf'),
        _get_objanno('goa_human.gaf', namespaces={'BP'}),
        _get_objanno('goa_human.gaf', namespaces={'MF'}),
        _get_objanno('goa_human.gaf', namespaces={'CC'}),
        # gpad
        _get_objanno('goa_human.gpad', godag=godag),
        _get_objanno('goa_human.gpad', godag=godag, namespaces={'BP'}),
        _get_objanno('goa_human.gpad', godag=godag, namespaces={'MF'}),
        _get_objanno('goa_human.gpad', godag=godag, namespaces={'CC'}),
        _get_objanno('goa_human.gpad'),
        _get_objanno('goa_human.gpad', namespaces={'BP'}),
        _get_objanno('goa_human.gpad', namespaces={'MF'}),
        _get_objanno('goa_human.gpad', namespaces={'CC'}),
        # id2gos
        _get_objanno('data/association', 'id2gos'),
        _get_objanno('data/association', 'id2gos', namespaces={'BP'}),
        _get_objanno('data/association', 'id2gos', namespaces={'MF'}),
        _get_objanno('data/association', 'id2gos', namespaces={'CC'}),
        _get_objanno('data/association', 'id2gos', godag=godag),
        _get_objanno('data/association', 'id2gos', godag=godag, namespaces={'BP'}),
        _get_objanno('data/association', 'id2gos', godag=godag, namespaces={'MF'}),
        _get_objanno('data/association', 'id2gos', godag=godag, namespaces={'CC'}),
    ]

    print(
        '- prt_summary_anno2ev ---------------------------------------------')
    for idx, obj in enumerate(annoobjs):
        print('>>>>> {I} >>>>> prt_summary_anno2ev {ANNO}'.format(
            I=idx, ANNO=obj.get_desc()))
        obj.prt_summary_anno2ev()
        obj.chk_associations()

    print(
        '- print extension -------------------------------------------------')
    for idx, obj in enumerate(annoobjs):
        print('>>>>> {I} >>>>> print Extension {ANNO}'.format(
            I=idx, ANNO=obj.get_desc()))
        # Only printed for the gaf and gpad objects
        if obj.name in {'gaf', 'gpad'}:
            _prt_fld(obj, 'Extension', 10)

    print(
        '- get_id2gos ------------------------------------------------------')
    for idx, obj in enumerate(annoobjs):
        # Use the loaded namespace if one was requested, otherwise BP
        nspc = 'BP' if not obj.namespaces else next(iter(obj.namespaces))
        id2gos = obj.get_id2gos(nspc)
        # Query each filtered view once. The failure message then reports the
        # sizes of exactly the two mappings that were compared (the previous
        # message re-ran the query with a misspelled 'ev_incclude' keyword).
        id2gos_inc = obj.get_id2gos(ev_include=INC_GOOD)
        id2gos_exc = obj.get_id2gos(ev_exclude={'IEA'})
        assert id2gos_inc == id2gos_exc, \
            'INC ALL({A}) != EXC IEA({I}): {DIF}'.format(
                A=len(id2gos_inc),
                I=len(id2gos_exc),
                DIF='')
        num_ids = len(id2gos)
        print('>>>>> {I} >>>>> get_id2gos {N:6,} go2ids[{B:6,}] {ANNO}'.format(
            I=idx, N=num_ids, ANNO=obj.get_desc(), B=len(get_b2aset(id2gos))))
        # Spot-check one value: it must look like a GO ID
        assert next(iter(next(iter(id2gos.values()))))[:3] == "GO:"
        # Expected ID count when 'data/association' is read without a GO DAG
        if obj.filename[-16:] == 'data/association' and obj.godag is None:
            assert num_ids == 34284

    print(
        '- get_ns2... ------------------------------------------------------')
    for idx, obj in enumerate(annoobjs):
        if obj.name in {'gpad', 'id2gos'} and obj.godag is None:
            print('{IDX}) SKIPPING(No NS): {C}:get_ns2ntsanno {ANNO}'.format(
                IDX=idx, C=obj.__class__.__name__, ANNO=obj.get_desc()))
            continue
        _tst_ns2(obj, idx)
def test_anno_read():
    """Test all annotation formats.

    Creates annotation objects for gene2go, gaf, gpad and id2gos inputs, with and
    without a namespace restriction and GO DAG, then runs the same checks on
    each: evidence-code summary, Extension field printing, get_id2gos, and
    _tst_ns2 (skipped for gpad/id2gos objects that have no GO DAG).
    """
    godag = get_godag(os.path.join(REPO, 'go-basic.obo'))
    # pylint: disable=superfluous-parens
    print('- DOWNLOAD AND LOAD -----------------------------------------------')
    annoobjs = [
        # gene2go
        _get_objanno('gene2go', taxid=10090),
        _get_objanno('gene2go', taxid=10090, namespaces={'BP'}),
        _get_objanno('gene2go', taxid=10090, namespaces={'MF'}),
        _get_objanno('gene2go', taxid=10090, namespaces={'CC'}),
        # gaf
        _get_objanno('goa_human.gaf'),
        _get_objanno('goa_human.gaf', namespaces={'BP'}),
        _get_objanno('goa_human.gaf', namespaces={'MF'}),
        _get_objanno('goa_human.gaf', namespaces={'CC'}),
        # gpad
        _get_objanno('goa_human.gpad', godag=godag),
        _get_objanno('goa_human.gpad', godag=godag, namespaces={'BP'}),
        _get_objanno('goa_human.gpad', godag=godag, namespaces={'MF'}),
        _get_objanno('goa_human.gpad', godag=godag, namespaces={'CC'}),
        _get_objanno('goa_human.gpad'),
        _get_objanno('goa_human.gpad', namespaces={'BP'}),
        _get_objanno('goa_human.gpad', namespaces={'MF'}),
        _get_objanno('goa_human.gpad', namespaces={'CC'}),
        # id2gos
        _get_objanno('data/association', 'id2gos'),
        _get_objanno('data/association', 'id2gos', namespaces={'BP'}),
        _get_objanno('data/association', 'id2gos', namespaces={'MF'}),
        _get_objanno('data/association', 'id2gos', namespaces={'CC'}),
        _get_objanno('data/association', 'id2gos', godag=godag),
        _get_objanno('data/association', 'id2gos', godag=godag, namespaces={'BP'}),
        _get_objanno('data/association', 'id2gos', godag=godag, namespaces={'MF'}),
        _get_objanno('data/association', 'id2gos', godag=godag, namespaces={'CC'}),
    ]

    print('- prt_summary_anno2ev ---------------------------------------------')
    for idx, obj in enumerate(annoobjs):
        print('>>>>> {I} >>>>> prt_summary_anno2ev {ANNO}'.format(I=idx, ANNO=obj.get_desc()))
        obj.prt_summary_anno2ev()
        obj.chk_associations()

    print('- print extension -------------------------------------------------')
    for idx, obj in enumerate(annoobjs):
        print('>>>>> {I} >>>>> print Extension {ANNO}'.format(I=idx, ANNO=obj.get_desc()))
        # Only printed for the gaf and gpad objects
        if obj.name in {'gaf', 'gpad'}:
            _prt_fld(obj, 'Extension', 10)

    print('- get_id2gos ------------------------------------------------------')
    for idx, obj in enumerate(annoobjs):
        # Use the loaded namespace if one was requested, otherwise BP
        nspc = 'BP' if not obj.namespaces else next(iter(obj.namespaces))
        id2gos = obj.get_id2gos(nspc)
        # Query each filtered view once. The failure message then reports the
        # sizes of exactly the two mappings that were compared (the previous
        # message re-ran the query with a misspelled 'ev_incclude' keyword).
        id2gos_inc = obj.get_id2gos(ev_include=INC_GOOD)
        id2gos_exc = obj.get_id2gos(ev_exclude={'IEA'})
        assert id2gos_inc == id2gos_exc, \
            'INC ALL({A}) != EXC IEA({I}): {DIF}'.format(
                A=len(id2gos_inc),
                I=len(id2gos_exc),
                DIF='')
        num_ids = len(id2gos)
        print('>>>>> {I} >>>>> get_id2gos {N:6,} go2ids[{B:6,}] {ANNO}'.format(
            I=idx, N=num_ids, ANNO=obj.get_desc(), B=len(get_b2aset(id2gos))))
        # Spot-check one value: it must look like a GO ID
        assert next(iter(next(iter(id2gos.values()))))[:3] == "GO:"
        # Expected ID count when 'data/association' is read without a GO DAG
        if obj.filename[-16:] == 'data/association' and obj.godag is None:
            assert num_ids == 34284

    print('- get_ns2... ------------------------------------------------------')
    for idx, obj in enumerate(annoobjs):
        if obj.name in {'gpad', 'id2gos'} and obj.godag is None:
            print('{IDX}) SKIPPING(No NS): {C}:get_ns2ntsanno {ANNO}'.format(
                IDX=idx, C=obj.__class__.__name__, ANNO=obj.get_desc()))
            continue
        _tst_ns2(obj, idx)
def get_go2chrs(sec2gos, sec2chr):
    """Dict: given a GO return a set of letters representing its section membership(s)."""
    # Invert section -> GO IDs, then translate each section into its letter
    inverted = get_b2aset(sec2gos)
    return dict(
        (goid, {sec2chr[sec] for sec in secs})
        for goid, secs in inverted.items()
    )