Example #1
0
 def __init__(self):
     """Load the human GAF associations and record gene-count extremes.

     The associations for ``goa_human.gaf`` (located under REPO) are obtained
     through ``dnld_assc`` and stored as ``gene2gos_orig``.  ``get_b2aset``
     turns them into ``go2genes_orig``, and the smallest and largest number
     of genes attached to any single GO ID are kept in ``min_genes`` and
     ``max_genes``.
     """
     self.prt = sys.stdout
     gaf_path = os.path.join(REPO, "goa_human.gaf")
     self.gene2gos_orig = dnld_assc(gaf_path, go2obj=None, prt=self.prt)
     self.go2genes_orig = get_b2aset(self.gene2gos_orig)
     genes_per_go = list(map(len, self.go2genes_orig.values()))
     self.min_genes, self.max_genes = min(genes_per_go), max(genes_per_go)
     # Sanity check: converting GO->genes back must reproduce the loaded data
     roundtrip = get_b2aset(self.go2genes_orig)
     assert self.gene2gos_orig == roundtrip
def _run_get_id2gos(annoobjs):
    """Exercise ``get_id2gos`` on every annotation object in *annoobjs*.

    For each object this checks that:
      * the unfiltered call returns something,
      * including INC_GOOD evidence codes equals excluding 'IEA',
      * the first GO ID found starts with "GO:",
      * the 'data/association' file read without a GODag has 34284 IDs.
    """
    banner = '\n{I}) get_id2gos {DESC} {NSs} {N:,} annotations'
    summary = '>>>>> {I} >>>>> get_id2gos {N:6,} go2ids[{B:6,}] {ANNO}'
    for idx, obj in enumerate(annoobjs):
        print(banner.format(
            I=idx,
            DESC=obj.get_desc(),
            NSs=obj.namespaces,
            N=len(obj.associations)))

        # If all namespaces are loaded, returns BP, else returns loaded NS
        print('Load all evidence codes')
        id2gos = obj.get_id2gos()
        assert id2gos, 'NO ANNOTATIONS FOUND'

        print('Load all evidence codes, except IEA')
        id2gos_inc = obj.get_id2gos(ev_include=INC_GOOD)
        id2gos_exc = obj.get_id2gos(ev_exclude={'IEA'})
        assert id2gos_exc, 'NO NON-IEA ANNOTATIONS FOUND'
        assert id2gos_inc == id2gos_exc, \
            'INC ALL({A}) != EXC IEA({I}): {DIF}'.format(
                A=len(id2gos_inc),
                I=len(id2gos_exc),
                DIF='')

        num_ids = len(id2gos)
        print(summary.format(
            I=idx,
            N=num_ids,
            ANNO=obj.get_desc(),
            B=len(get_b2aset(id2gos))))

        # Spot-check one GO ID from one arbitrary entry
        first_gos = next(iter(id2gos.values()))
        first_go = next(iter(first_gos))
        assert first_go[:3] == "GO:"

        reads_assoc_file = obj.filename[-16:] == 'data/association'
        if reads_assoc_file and obj.godag is None:
            assert num_ids == 34284
Example #3
0
    def _prune_assc(self, assc_geneid2gos, max_genecnt, godag, prt=sys.stdout):
        """Remove GO IDs which are associated with large numbers of genes.

        The pruning itself is delegated to ``get_assc_pruned``; this method
        additionally writes a report to *prt*: one indented description line
        per removed GO ID, followed by percentiles of the number of genes
        per GO ID in the *unpruned* association.

        Returns the pruned association produced by ``get_assc_pruned``.
        """
        # The second positional argument is passed as None -- presumably the
        # lower gene-count bound; confirm against get_assc_pruned's signature.
        pruned_assc, removed_goids = get_assc_pruned(
            assc_geneid2gos, None, max_genecnt, prt=prt)
        go2genes = get_b2aset(assc_geneid2gos)

        removed_descs = self.get_go2desc(removed_goids, godag, go2genes)
        for text in removed_descs.values():
            prt.write("    {DESC}\n".format(DESC=text))

        genes_per_go = [len(genes) for genes in go2genes.values()]
        prt_percentiles(
            "Number of genes associated with GO IDs.",
            genes_per_go,
            "{:6.0f}",
            prt)

        return pruned_assc
Example #4
0
def _get_id2gos(file_id2gos, godag, name2go):
    """Return annotations, creating the annotation file on first use.

    If *file_id2gos* already exists, it is read with ``IdToGosReader`` and
    the 'CC' result of ``get_id2gos`` is returned.  Otherwise synthetic
    annotations are generated: each listed term receives a fixed number of
    consecutively numbered integer gene IDs.  The generated mapping is
    written to *file_id2gos* and returned.
    """
    if os.path.exists(file_id2gos):
        reader = IdToGosReader(file_id2gos, godag=godag)
        return reader.get_id2gos('CC')

    # (term name, number of synthetic genes to attach)
    name_qty = (
        ('A', 10), ('B', 10), ('C', 10), ('D', 10),
        ('E', 10), ('F', 10), ('G', 10), ('H', 10),
        ('I', 30), ('L', 30), ('M', 20), ('N', 30),
    )
    id2num = {name2go[name]: qty for name, qty in name_qty}

    go2genes = cx.defaultdict(set)
    first_gene = 0
    for goid, qty in id2num.items():
        # Gene numbers are never reused across GO IDs
        last_gene = first_gene + qty
        go2genes[goid].update(range(first_gene, last_gene))
        first_gene = last_gene

    id2gos = get_b2aset(go2genes)
    IdToGosReader.wr_id2gos(file_id2gos, id2gos)
    return id2gos
Example #5
0
 def __init__(self, objsim):
     """Collect the lookup tables needed from a simulation object.

     Everything is read from *objsim*: its association (``goeasim_assc``),
     its results (``goeasim_res``) and the parameters/helpers hanging off
     ``objsim.pobj``.
     """
     self.go2genes = get_b2aset(objsim.goeasim_assc)
     pobj = objsim.pobj
     results = objsim.goeasim_res
     self.gos_bg = pobj.params['goids_study_bg']
     # Every GO ID that appears in the results
     self.gos_sig_all = {res.GO for res in results}
     self.go2obj = pobj.params['gosubdag'].go2obj
     self.go2res = {res.GO: res for res in results}
     # Name of the result attribute built from the method name, e.g. p_<method>
     self.attr_pval = "p_{METHOD}".format(METHOD=pobj.objbase.method)
     self.get_go2desc = pobj.objassc.get_go2desc
Example #6
0
 def _adj_for_assc(self):
     """Print only GO IDs from associations and their ancestors.

     Does nothing when ``self.gene2gos`` is empty.  Otherwise fills in the
     'item_marks' and 'include_only' keyword settings, but only those the
     caller has not already supplied in ``self.kws``.
     """
     if not self.gene2gos:
         return
     gos_assoc = set(get_b2aset(self.gene2gos).keys())
     if 'item_marks' not in self.kws:
         # Every GO ID in the association is marked with '>'
         self.kws['item_marks'] = dict.fromkeys(gos_assoc, '>')
     if 'include_only' not in self.kws:
         assoc_subdag = GoSubDag(
             gos_assoc,
             self.gosubdag.go2obj,
             self.gosubdag.relationships)
         self.kws['include_only'] = assoc_subdag.go2obj
Example #7
0
 def _adj_for_assc(self):
     """Print only GO IDs from associations and their ancestors.

     With a non-empty ``self.gene2gos``, default values are supplied for the
     'item_marks' and 'include_only' entries of ``self.kws``; entries that
     are already present are left untouched.
     """
     if not self.gene2gos:
         return
     annotated_gos = set(get_b2aset(self.gene2gos).keys())
     needs_marks = 'item_marks' not in self.kws
     if needs_marks:
         marks = {}
         for goid in annotated_gos:
             marks[goid] = '>'
         self.kws['item_marks'] = marks
     needs_filter = 'include_only' not in self.kws
     if needs_filter:
         go2obj_all = self.gosubdag.go2obj
         relationships = self.gosubdag.relationships
         subdag = GoSubDag(annotated_gos, go2obj_all, relationships)
         self.kws['include_only'] = subdag.go2obj
Example #8
0
 def set_targeted(self, goids_tgtd):
     """Set targeted GO IDs: Significant, but not tracked.

     Stores *goids_tgtd*, prints two diagnostic lines, and builds a
     ``RandAssc`` for each of the two associations returned by
     ``self._split_assc``.
     """
     self.goids_tgtd = goids_tgtd
     # self.go2genes contains only GO IDs related to the population genes
     tgtd_go2genes = {}
     for goid in goids_tgtd:
         tgtd_go2genes[goid] = self.go2genes[goid]
     tgtd_genes = get_b2aset(tgtd_go2genes)
     print("TARGETED GENES({Gs})".format(Gs=len(tgtd_genes)))
     pruned, targeted = self._split_assc(goids_tgtd)
     # TBD rm: temporary diagnostic output
     lens_msg = "AASSSSCC LENS PRUNED({}) TGTD({})".format(
         len(pruned), len(targeted))
     print(lens_msg)
     self.objassc_pruned = RandAssc(pruned)
     self.objassc_tgtd = RandAssc(targeted)
Example #9
0
 def __init__(self, params, godag):
     """Prepare the association data held by this object.

     Steps, in order:
       1. Read the association header and the gene-to-GO associations named
          by ``params['association_file']``.
       2. When ``params['propagate_counts']`` is truthy, run
          ``update_association`` on a copy of the associations.
       3. Hand the associations to ``self._possibly_prune_assc``.
       4. Keep only the genes that are both in
          ``params['genes_population']`` and in the resulting associations.
       5. Derive the population GO IDs, the GO-to-genes mapping, and a
          ``RandAssc`` over that subset.
     """
     assc_file = params['association_file']
     pop_genes_in = params['genes_population']
     self.assc_hdr = get_gaf_hdr(assc_file)

     # Remove obsolete GO IDs from association if needed
     assc = self._init_assc(assc_file, pop_genes_in, godag)

     # Propagate counts BEFORE pruning: propagation itself creates highly
     # associated GO IDs.
     propagate = params.get('propagate_counts', False)
     sys.stdout.write("PROPAGATE_COUNTS({VAL})\n".format(VAL=propagate))
     if propagate:
         # Fresh dict with fresh sets, so update_association works on a copy
         assc = {gene: set(gos) for gene, gos in assc.items()}
         update_association(assc, godag)

     # Prune BEFORE computing the population gene list so that every
     # population gene kept below still has associations.
     assc = self._possibly_prune_assc(assc, 1000, godag, params)

     # For sim analysis: use population genes found in the association
     self.pop_genes = set(pop_genes_in) & set(assc.keys())

     # Speed sims: use the association subset actually in the population
     assc_pop = {}
     for gene, gos in assc.items():
         if gene in self.pop_genes:
             assc_pop[gene] = gos

     self.pop_gos = self._init_assc_pop(assc_pop)
     self.go2genes = get_b2aset(assc_pop)
     # Holds assc as well as providing shuffled version
     self.objassc_all = RandAssc(assc_pop)

     # Set by local set_targeted() when RunParams is initialized
     self.goids_tgtd = None  # Artifact GO IDs found to be also truly significant
     self.objassc_pruned = None
     self.objassc_tgtd = None
def _get_id2gos(file_id2gos, godag, name2go):
    """Return annotations, creating the annotation file on first use.

    An existing *file_id2gos* is read with ``IdToGosReader`` and its 'CC'
    annotations are returned.  Otherwise each name in NAME2NUM is translated
    to a GO ID through *name2go* and given that many consecutively numbered
    integer gene IDs; the resulting mapping is written to *file_id2gos* and
    returned.
    """
    if os.path.exists(file_id2gos):
        reader = IdToGosReader(file_id2gos, godag=godag)
        return reader.get_id2gos('CC')

    go2genes = cx.defaultdict(set)
    next_gene = 0
    for name, qty in NAME2NUM.items():
        goid = name2go[name]
        gene_ids = range(next_gene, next_gene + qty)
        for gene_id in gene_ids:
            go2genes[goid].add(gene_id)
        # Advance by the number of IDs actually handed out
        next_gene += len(gene_ids)

    id2gos = get_b2aset(go2genes)
    IdToGosReader.wr_id2gos(file_id2gos, id2gos)
    return id2gos
Example #11
0
def describe_assc(org, fin_assc, go2obj, obj, prt):
    """Report statistics for a single association.

    Two rows are emitted through ``obj.prt_data``: the number of GO IDs per
    gene and the number of genes per GO ID, each labelled with *org*.
    """
    # Sample of the resulting table:
    #
    # Assc.       | # Assc| range      | 25th | median | 75th | mean | stddev
    # ------------|-------|------------|------|--------|------|------|-------
    # hsa GO/gene | 19394 | 1 to   212 |    5 |      9 |   17 |   13 |     14
    # hsa gene/GO | 17277 | 1 to 8,897 |    1 |      3 |    8 |   15 |    120
    #
    # mus GO/gene | 19870 | 1 to   261 |    5 |     10 |   18 |   14 |     15
    # mus gene/GO | 17491 | 1 to 7,009 |    1 |      3 |    8 |   16 |    129
    #
    # dme GO/gene | 12551 | 1 to   137 |    2 |      4 |    8 |    6 |      7
    # dme gene/GO |  7878 | 1 to 1,675 |    1 |      3 |    7 |   10 |     41
    gene2gos = dnld_assc(fin_assc, go2obj, prt=None)
    go2genes = get_b2aset(gene2gos)
    labeled_counts = [
        ("{ORG} GO/gene", [len(gos) for gos in gene2gos.values()]),
        ("{ORG} gene/GO", [len(genes) for genes in go2genes.values()]),
    ]
    for template, counts in labeled_counts:
        obj.prt_data(template.format(ORG=org), counts, prt)
Example #12
0
def describe_assc(org, fin_assc, go2obj, obj, prt):
    """Report statistics for a single association.

    Asserts that both the gene-to-GOs association and the derived
    GO-to-genes mapping are non-empty, then passes the per-gene GO counts
    and the per-GO gene counts to ``obj.prt_data``, each labelled with *org*.
    """
    # Sample of the resulting table:
    #
    # Assc.       | # Assc| range      | 25th | median | 75th | mean | stddev
    # ------------|-------|------------|------|--------|------|------|-------
    # hsa GO/gene | 19394 | 1 to   212 |    5 |      9 |   17 |   13 |     14
    # hsa gene/GO | 17277 | 1 to 8,897 |    1 |      3 |    8 |   15 |    120
    #
    # mus GO/gene | 19870 | 1 to   261 |    5 |     10 |   18 |   14 |     15
    # mus gene/GO | 17491 | 1 to 7,009 |    1 |      3 |    8 |   16 |    129
    #
    # dme GO/gene | 12551 | 1 to   137 |    2 |      4 |    8 |    6 |      7
    # dme gene/GO |  7878 | 1 to 1,675 |    1 |      3 |    7 |   10 |     41
    gene2gos = dnld_assc(fin_assc, go2obj, prt=None)
    go2genes = get_b2aset(gene2gos)
    # Neither direction of the mapping may be empty
    for mapping in (gene2gos, go2genes):
        assert mapping
    gos_per_gene = list(map(len, gene2gos.values()))
    genes_per_go = list(map(len, go2genes.values()))
    label_gos = "{ORG} GO/gene".format(ORG=org)
    obj.prt_data(label_gos, gos_per_gene, prt)
    label_genes = "{ORG} gene/GO".format(ORG=org)
    obj.prt_data(label_genes, genes_per_go, prt)
Example #13
0
 def get_go2chrs(sec2gos, sec2chr):
     """Dict: given a GO, return the set of letters for its section membership(s).

     *sec2gos* is converted with ``get_b2aset`` so it can be walked per GO ID;
     every section of a GO ID is then translated to its letter via *sec2chr*.
     """
     return {
         goid: {sec2chr[section] for section in sections}
         for goid, sections in get_b2aset(sec2gos).items()
     }
Example #14
0
def test_anno_read():
    """Test all annotation formats.

    Loads the GODag from ``go-basic.obo`` under REPO, builds annotation
    objects for the gene2go, gaf, gpad and id2gos formats (with and without
    namespace restrictions and a GODag), and then runs four passes over them:
    evidence-code summary plus ``chk_associations``, printing of the
    'Extension' field (gaf/gpad only), ``get_id2gos`` checks, and the
    namespace-keyed checks in ``_tst_ns2``.
    """
    godag = get_godag(os.path.join(REPO, 'go-basic.obo'))

    # pylint: disable=superfluous-parens
    print(
        '- DOWNLOAD AND LOAD -----------------------------------------------')
    annoobjs = [
        # gene2go
        _get_objanno('gene2go', taxid=10090),
        _get_objanno('gene2go', taxid=10090, namespaces={'BP'}),
        _get_objanno('gene2go', taxid=10090, namespaces={'MF'}),
        _get_objanno('gene2go', taxid=10090, namespaces={'CC'}),
        # gaf
        _get_objanno('goa_human.gaf'),
        _get_objanno('goa_human.gaf', namespaces={'BP'}),
        _get_objanno('goa_human.gaf', namespaces={'MF'}),
        _get_objanno('goa_human.gaf', namespaces={'CC'}),
        # gpad
        _get_objanno('goa_human.gpad', godag=godag),
        _get_objanno('goa_human.gpad', godag=godag, namespaces={'BP'}),
        _get_objanno('goa_human.gpad', godag=godag, namespaces={'MF'}),
        _get_objanno('goa_human.gpad', godag=godag, namespaces={'CC'}),
        _get_objanno('goa_human.gpad'),
        _get_objanno('goa_human.gpad', namespaces={'BP'}),
        _get_objanno('goa_human.gpad', namespaces={'MF'}),
        _get_objanno('goa_human.gpad', namespaces={'CC'}),
        # id2gos
        _get_objanno('data/association', 'id2gos'),
        _get_objanno('data/association', 'id2gos', namespaces={'BP'}),
        _get_objanno('data/association', 'id2gos', namespaces={'MF'}),
        _get_objanno('data/association', 'id2gos', namespaces={'CC'}),
        _get_objanno('data/association', 'id2gos', godag=godag),
        _get_objanno('data/association',
                     'id2gos',
                     godag=godag,
                     namespaces={'BP'}),
        _get_objanno('data/association',
                     'id2gos',
                     godag=godag,
                     namespaces={'MF'}),
        _get_objanno('data/association',
                     'id2gos',
                     godag=godag,
                     namespaces={'CC'}),
    ]

    print(
        '- prt_summary_anno2ev ---------------------------------------------')
    for idx, obj in enumerate(annoobjs):
        print('>>>>> {I} >>>>> prt_summary_anno2ev {ANNO}'.format(
            I=idx, ANNO=obj.get_desc()))
        obj.prt_summary_anno2ev()
        obj.chk_associations()

    print(
        '- print extension -------------------------------------------------')
    for idx, obj in enumerate(annoobjs):
        print('>>>>> {I} >>>>> print Extension {ANNO}'.format(
            I=idx, ANNO=obj.get_desc()))
        if obj.name in {'gaf', 'gpad'}:
            _prt_fld(obj, 'Extension', 10)

    print(
        '- get_id2gos ------------------------------------------------------')
    for idx, obj in enumerate(annoobjs):
        # Use BP when no namespace restriction was given, else a loaded one
        nspc = 'BP' if not obj.namespaces else next(iter(obj.namespaces))
        id2gos = obj.get_id2gos(nspc)
        # Fetch each filtered view exactly once so that the failure message
        # reports the sizes of the very mappings that were compared.
        id2gos_inc = obj.get_id2gos(ev_include=INC_GOOD)
        id2gos_exc = obj.get_id2gos(ev_exclude={'IEA'})
        assert id2gos_inc == id2gos_exc, \
            'INC ALL({A}) != EXC IEA({I}): {DIF}'.format(
                A=len(id2gos_inc),
                I=len(id2gos_exc),
                DIF='')
        num_ids = len(id2gos)
        print('>>>>> {I} >>>>> get_id2gos {N:6,} go2ids[{B:6,}] {ANNO}'.format(
            I=idx, N=num_ids, ANNO=obj.get_desc(), B=len(get_b2aset(id2gos))))
        # Spot-check one GO ID from one arbitrary entry
        assert next(iter(next(iter(id2gos.values()))))[:3] == "GO:"
        if obj.filename[-16:] == 'data/association' and obj.godag is None:
            assert num_ids == 34284

    print(
        '- get_ns2... ------------------------------------------------------')
    for idx, obj in enumerate(annoobjs):
        # gpad and id2gos objects read without a GODag are skipped here
        if obj.name in {'gpad', 'id2gos'} and obj.godag is None:
            print('{IDX}) SKIPPING(No NS): {C}:get_ns2ntsanno {ANNO}'.format(
                IDX=idx, C=obj.__class__.__name__, ANNO=obj.get_desc()))
            continue
        _tst_ns2(obj, idx)
Example #15
0
def test_anno_read():
    """Test all annotation formats.

    Loads the GODag from ``go-basic.obo`` under REPO, builds annotation
    objects for the gene2go, gaf, gpad and id2gos formats (with and without
    namespace restrictions and a GODag), and then runs four passes over them:
    evidence-code summary plus ``chk_associations``, printing of the
    'Extension' field (gaf/gpad only), ``get_id2gos`` checks, and the
    namespace-keyed checks in ``_tst_ns2``.
    """
    godag = get_godag(os.path.join(REPO, 'go-basic.obo'))

    # pylint: disable=superfluous-parens
    print('- DOWNLOAD AND LOAD -----------------------------------------------')
    annoobjs = [
        # gene2go
        _get_objanno('gene2go', taxid=10090),
        _get_objanno('gene2go', taxid=10090, namespaces={'BP'}),
        _get_objanno('gene2go', taxid=10090, namespaces={'MF'}),
        _get_objanno('gene2go', taxid=10090, namespaces={'CC'}),
        # gaf
        _get_objanno('goa_human.gaf'),
        _get_objanno('goa_human.gaf', namespaces={'BP'}),
        _get_objanno('goa_human.gaf', namespaces={'MF'}),
        _get_objanno('goa_human.gaf', namespaces={'CC'}),
        # gpad
        _get_objanno('goa_human.gpad', godag=godag),
        _get_objanno('goa_human.gpad', godag=godag, namespaces={'BP'}),
        _get_objanno('goa_human.gpad', godag=godag, namespaces={'MF'}),
        _get_objanno('goa_human.gpad', godag=godag, namespaces={'CC'}),
        _get_objanno('goa_human.gpad'),
        _get_objanno('goa_human.gpad', namespaces={'BP'}),
        _get_objanno('goa_human.gpad', namespaces={'MF'}),
        _get_objanno('goa_human.gpad', namespaces={'CC'}),
        # id2gos
        _get_objanno('data/association', 'id2gos'),
        _get_objanno('data/association', 'id2gos', namespaces={'BP'}),
        _get_objanno('data/association', 'id2gos', namespaces={'MF'}),
        _get_objanno('data/association', 'id2gos', namespaces={'CC'}),
        _get_objanno('data/association', 'id2gos', godag=godag),
        _get_objanno('data/association', 'id2gos', godag=godag, namespaces={'BP'}),
        _get_objanno('data/association', 'id2gos', godag=godag, namespaces={'MF'}),
        _get_objanno('data/association', 'id2gos', godag=godag, namespaces={'CC'}),
    ]

    print('- prt_summary_anno2ev ---------------------------------------------')
    for idx, obj in enumerate(annoobjs):
        print('>>>>> {I} >>>>> prt_summary_anno2ev {ANNO}'.format(I=idx, ANNO=obj.get_desc()))
        obj.prt_summary_anno2ev()
        obj.chk_associations()

    print('- print extension -------------------------------------------------')
    for idx, obj in enumerate(annoobjs):
        print('>>>>> {I} >>>>> print Extension {ANNO}'.format(I=idx, ANNO=obj.get_desc()))
        if obj.name in {'gaf', 'gpad'}:
            _prt_fld(obj, 'Extension', 10)

    print('- get_id2gos ------------------------------------------------------')
    for idx, obj in enumerate(annoobjs):
        # Use BP when no namespace restriction was given, else a loaded one
        nspc = 'BP' if not obj.namespaces else next(iter(obj.namespaces))
        id2gos = obj.get_id2gos(nspc)
        # Fetch each filtered view exactly once so that the failure message
        # reports the sizes of the very mappings that were compared.
        id2gos_inc = obj.get_id2gos(ev_include=INC_GOOD)
        id2gos_exc = obj.get_id2gos(ev_exclude={'IEA'})
        assert id2gos_inc == id2gos_exc, \
            'INC ALL({A}) != EXC IEA({I}): {DIF}'.format(
                A=len(id2gos_inc),
                I=len(id2gos_exc),
                DIF='')
        num_ids = len(id2gos)
        print('>>>>> {I} >>>>> get_id2gos {N:6,} go2ids[{B:6,}] {ANNO}'.format(
            I=idx, N=num_ids, ANNO=obj.get_desc(), B=len(get_b2aset(id2gos))))
        # Spot-check one GO ID from one arbitrary entry
        assert next(iter(next(iter(id2gos.values()))))[:3] == "GO:"
        if obj.filename[-16:] == 'data/association' and obj.godag is None:
            assert num_ids == 34284

    print('- get_ns2... ------------------------------------------------------')
    for idx, obj in enumerate(annoobjs):
        # gpad and id2gos objects read without a GODag are skipped here
        if obj.name in {'gpad', 'id2gos'} and obj.godag is None:
            print('{IDX}) SKIPPING(No NS): {C}:get_ns2ntsanno {ANNO}'.format(
                IDX=idx, C=obj.__class__.__name__, ANNO=obj.get_desc()))
            continue
        _tst_ns2(obj, idx)
 def get_go2chrs(sec2gos, sec2chr):
     """Dict: given a GO, return the set of letters for its section membership(s).

     The result of ``get_b2aset(sec2gos)`` is walked per GO ID, and each of
     that GO ID's sections is looked up in *sec2chr*.
     """
     go2sections = get_b2aset(sec2gos)
     go2chrs = {}
     for goid in go2sections:
         letters = set()
         for section in go2sections[goid]:
             letters.add(sec2chr[section])
         go2chrs[goid] = letters
     return go2chrs