Ejemplo n.º 1
0
def test_evcode_picker():
    """Verify the evidence codes picked for include/exclude arguments given as codes or groups."""
    evobj = EvidenceCodes()
    # pylint: disable=superfluous-parens
    # No arguments: 'ND' must be absent and more than 15 codes must be returned
    codes = evobj.get_evcodes()
    print('ALL POSITIVE CODES: {C}'.format(C=' '.join(sorted(codes))))
    assert 'ND' not in codes and len(codes) > 15, codes
    # Include a single group
    codes = evobj.get_evcodes({'Experimental'})
    assert codes == {'EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP'}, codes
    # Include a single group, exclude one of its codes
    codes = evobj.get_evcodes({'Experimental'}, {'IEP'})
    assert codes == {'EXP', 'IDA', 'IPI', 'IMP', 'IGI'}, codes
    # Include two groups, exclude two codes
    codes = evobj.get_evcodes({'Experimental', 'Similarity'}, {'IEP', 'IMR'})
    expected = {
        'EXP', 'IDA', 'IPI', 'IMP', 'IGI',
        'ISS', 'ISO', 'ISA', 'ISM', 'IGC', 'IBA', 'IBD', 'IKR', 'IRD',
    }
    assert codes == expected, codes
    # Exclude only: expect every key of code2nt except 'IEA' and 'ND'
    codes = evobj.get_evcodes(None, {'IEA'})
    expected = set(evobj.code2nt) - {'IEA', 'ND'}
    assert codes == expected, codes.symmetric_difference(expected)
    # Exercise the two report printers
    evobj.prt_details()
    evobj.prt_summary_code()
    print("**TEST PASSED")
Ejemplo n.º 2
0
 def __init__(self, filename=None, hdr_only=False, prt=sys.stdout):
     """Store the file name and load the associations when a file name is given."""
     self.filename = filename
     self.evobj = EvidenceCodes()
     # Header starts out empty; it is assigned before read_gaf is called
     self.hdr = None
     # No file name: start with an empty association list
     if filename is None:
         self.associations = []
     else:
         self.associations = self.read_gaf(filename, hdr_only, prt)
Ejemplo n.º 3
0
 def __init__(self, filename=None, hdr_only=False, prt=sys.stdout, **kws):
     """Keep the recognized keyword options, then load associations if a file name is given."""
     # kws: allow_missing_symbol
     # Only keywords listed in exp_kwdct are retained
     recognized = self.exp_kwdct
     self.kws = dict((key, val) for key, val in kws.items() if key in recognized)
     self.filename = filename
     self.evobj = EvidenceCodes()
     # Header starts out empty; it is assigned before read_gaf is called
     self.hdr = None
     # No file name: start with an empty association list
     if filename is None:
         self.associations = []
     else:
         self.associations = self.read_gaf(filename, hdr_only, prt)
Ejemplo n.º 4
0
 def _prt_evidence_codes(args):
     """Print evidence code help and exit(0) if args holds --ev_help or --ev_help_short."""
     # Neither help flag present: nothing to print, return to the caller
     if {'--ev_help', '--ev_help_short'}.isdisjoint(args):
         return
     intro = (
         '\nEVIDENCE CODE HELP: --ev_exc --ev_inc',
         'Use any of these group names, ',
         'like Experimental or Similarity or Experimental,Similarity,',
         'or evidence codes, like IEA or ISS,ISO,ISA in --ev_exc or --ev_inc:',
     )
     for txt in intro:
         print(txt)
     evobj = EvidenceCodes()
     # Both flags may be given; the detailed report is printed first
     if '--ev_help' in args:
         print('')
         evobj.prt_details()
     if '--ev_help_short' in args:
         print('')
         evobj.prt_summary_code()
     sys.exit(0)
Ejemplo n.º 5
0
 def __init__(self, name, filename=None, **kws):
     """Record the reader settings, then load annotations with _init_associations."""
     # kws: allow_missing_symbol
     self.name = name  # name is one of valid_formats
     self.filename = filename
     # Optional settings: each is None when its keyword is not supplied
     self.godag, self.namespaces = kws.get('godag'), kws.get('namespaces')
     self.evobj = EvidenceCodes()
     # Read annotation file, store namedtuples:
     #     Gene2GoReader(filename=None, taxids=None):
     #     GafReader(filename=None, hdr_only=False, prt=sys.stdout, allow_missing_symbol=False):
     #     GpadReader(filename=None, hdr_only=False):
     self.hdr = self.datobj = None
     # pylint: disable=no-member
     self.associations = self._init_associations(filename, **kws)
     # assert self.associations, 'NO ANNOTATIONS FOUND: {ANNO}'.format(ANNO=filename)
     # namespaces must be either None or a set
     nss = self.namespaces
     assert nss is None or isinstance(nss, set)
Ejemplo n.º 6
0
 def __init__(self, filename=None, **kws):
     """Store the file name, then load annotations with _init_associations."""
     # kws: allow_missing_symbol
     self.evobj = EvidenceCodes()
     self.filename = filename
     # Header and data-object slots are cleared before the annotations are loaded
     self.hdr = self.datobj = None
     # Read annotation file, store namedtuples:
     #     Gene2GoReader(filename=None, taxids=None):
     #     GafReader(filename=None, hdr_only=False, prt=sys.stdout, allow_missing_symbol=False):
     #     GpadReader(filename=None, hdr_only=False):
     loaded = self._init_associations(filename, **kws)
     self.associations = loaded
Ejemplo n.º 7
0
 def __init__(self, name, filename=None, **kws):
     """Save the reader name, file name and options; load annotations via _init_associations."""
     # kws: allow_missing_symbol
     self.filename = filename
     self.name = name
     # godag and namespaces default to None when the keyword is absent
     self.namespaces = kws.get('namespaces')
     self.godag = kws.get('godag')
     self.evobj = EvidenceCodes()
     # Read annotation file, store namedtuples:
     #     Gene2GoReader(filename=None, taxids=None):
     #     GafReader(filename=None, hdr_only=False, prt=sys.stdout, allow_missing_symbol=False):
     #     GpadReader(filename=None, hdr_only=False):
     self.datobj = None
     self.hdr = None
     # pylint: disable=no-member
     self.associations = self._init_associations(filename, **kws)
     # assert self.associations, 'NO ANNOTATIONS FOUND: {ANNO}'.format(ANNO=filename)
     # namespaces is expected to be None or a set
     loaded_nss = self.namespaces
     assert loaded_nss is None or isinstance(loaded_nss, set)
Ejemplo n.º 8
0
 def _prt_evidence_codes(args):
     """Print help for evidence codes, then exit(0), when a --ev_help* flag is in args."""
     help_flags = {'--ev_help', '--ev_help_short'}
     # Return without printing when no help flag was requested
     if help_flags.isdisjoint(args):
         return
     print('\nEVIDENCE CODE HELP: --ev_exc --ev_inc')
     print('Use any of these group names, ')
     print('like Experimental or Similarity or Experimental,Similarity,')
     print('or evidence codes, like IEA or ISS,ISO,ISA in --ev_exc or --ev_inc:')
     codes = EvidenceCodes()
     # Long report first, short report second; either or both may be printed
     if '--ev_help' in args:
         print('')
         codes.prt_details()
     if '--ev_help_short' in args:
         print('')
         codes.prt_summary_code()
     sys.exit(0)
Ejemplo n.º 9
0
def test_evcode_picker():
    """Check get_evcodes results for include/exclude arguments naming codes and groups."""
    picker = EvidenceCodes()
    # pylint: disable=superfluous-parens
    # Default call: 'ND' is not returned and there are more than 15 codes
    got = picker.get_evcodes()
    print('ALL POSITIVE CODES: {C}'.format(C=' '.join(sorted(got))))
    assert 'ND' not in got and len(got) > 15, got
    # One included group
    got = picker.get_evcodes({'Experimental'})
    assert got == {'EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP'}, got
    # One included group with one excluded code
    got = picker.get_evcodes({'Experimental'}, {'IEP'})
    assert got == {'EXP', 'IDA', 'IPI', 'IMP', 'IGI'}, got
    # Two included groups with two excluded codes
    got = picker.get_evcodes({'Experimental', 'Similarity'}, {'IEP', 'IMR'})
    want = set(['EXP', 'IDA', 'IPI', 'IMP', 'IGI'])
    want.update(['ISS', 'ISO', 'ISA', 'ISM', 'IGC', 'IBA', 'IBD', 'IKR', 'IRD'])
    assert got == want, got
    # Exclusion only: all keys of code2nt minus 'IEA' and 'ND'
    got = picker.get_evcodes(None, {'IEA'})
    want = set(picker.code2nt).difference({'IEA', 'ND'})
    assert got == want, got.symmetric_difference(want)
    # Run both report printers
    picker.prt_details()
    picker.prt_summary_code()
    print("**TEST PASSED")
Ejemplo n.º 10
0
class GafReader(object):
    """Reads a Gene Annotation File (GAF). Returns a Python object."""

    # Keyword arguments that are kept and forwarded to GafData; others are dropped
    exp_kwdct = set(['allow_missing_symbol'])

    def __init__(self, filename=None, hdr_only=False, prt=sys.stdout, **kws):
        """Store settings and read the GAF file if a file name is given.

        filename: GAF file to read; if None, associations is an empty list.
        hdr_only: passed to read_gaf; if True, reading stops after the header.
        prt:      stream for the read summary (None suppresses it).
        kws:      allow_missing_symbol
        """
        self.kws = {k: v for k, v in kws.items() if k in self.exp_kwdct}
        self.filename = filename
        self.evobj = EvidenceCodes()
        # Initialize associations and header information
        self.hdr = None
        self.associations = self.read_gaf(filename, hdr_only,
                                          prt) if filename is not None else []

    def read_gaf(self, fin_gaf, hdr_only, prt):
        """Read GAF file. Store annotation data in a list of namedtuples.

        Leading lines starting with '!' are collected as the header and stored
        in self.hdr. Data lines for which GafData.get_ntgaf returns None are
        recorded as ignored. On any exception, details are written to stderr
        and the process exits with status 1.

        Returns an empty list when hdr_only is True; otherwise returns
        self.evobj.sort_nts(nts, 'Evidence_Code').
        """
        nts = []
        ver = None
        hdrobj = GafHdr()
        datobj = None
        lnum = line = -1
        ignored = []
        try:
            with open(fin_gaf) as ifstrm:
                for lnum, line in enumerate(ifstrm, 1):
                    # Read header
                    if datobj is None:
                        if line[0] == '!':
                            if ver is None and line[1:13] == 'gaf-version:':
                                ver = line[13:].strip()
                            hdrobj.chkaddhdr(line)
                        else:
                            # First data line: the header is complete
                            self.hdr = hdrobj.get_hdr()
                            if hdr_only:
                                return nts
                            datobj = GafData(ver, **self.kws)
                    # Read data
                    if datobj is not None and line[0] != '!':
                        ntgaf = datobj.get_ntgaf(line)
                        if ntgaf is not None:
                            nts.append(ntgaf)
                        else:
                            ignored.append((lnum, line))
            # A file made up solely of '!' lines never reaches the branch above
            # that stores the header, so store it here.
            if datobj is None and lnum > 0:
                self.hdr = hdrobj.get_hdr()
                if hdr_only:
                    return nts
        except Exception as inst:
            import traceback
            traceback.print_exc()
            sys.stderr.write(
                "\n  **FATAL in read_gaf: {MSG}\n\n".format(MSG=str(inst)))
            sys.stderr.write("**FATAL: {FIN}[{LNUM}]:\n{L}".format(FIN=fin_gaf,
                                                                   L=line,
                                                                   LNUM=lnum))
            if datobj is not None:
                datobj.prt_line_detail(prt, line)
            sys.exit(1)
        # GAF file has been read
        self._prt_read_summary(prt, fin_gaf, nts, datobj, ignored)
        return self.evobj.sort_nts(nts, 'Evidence_Code')

    def _prt_read_summary(self, prt, fin_gaf, nts, datobj, ignored):
        """Print a summary about the GAF file that was read.

        Ignored lines are always written to a log file, even when prt is None;
        prt only controls whether the READ/IGNORED counts are printed.
        """
        fout_log = self._prt_ignored_lines(ignored, datobj,
                                           fin_gaf) if ignored else None
        if prt is not None:
            prt.write("  READ    {N:9,} associations: {FIN}\n".format(
                N=len(nts), FIN=fin_gaf))
            if ignored:
                prt.write("  IGNORED {N:9,} associations: {FIN}\n".format(
                    N=len(ignored), FIN=fout_log))

    def _prt_ignored_lines(self, ignored, datobj, fin_gaf):
        """Print ignored lines to a log file named '<fin_gaf>.log'. Return the log file name."""
        fout_log = "{}.log".format(fin_gaf)
        with open(fout_log, 'w') as prt:
            for lnum, line in ignored:
                self.prt_ignore_line(prt, fin_gaf, line, lnum)
                datobj.prt_line_detail(prt, line)
                prt.write("\n")
        return fout_log

    def prt_summary_anno2ev(self, prt=sys.stdout):
        """Print annotation/evidence code summary.

        Counts each association under its evidence code, or under
        'NOT <code>' when 'NOT' is in its Qualifier.
        """
        ctr = cx.Counter()
        for ntgaf in self.associations:
            evidence_code = ntgaf.Evidence_Code
            # The two cases are exhaustive, so no third branch is needed
            if 'NOT' in ntgaf.Qualifier:
                ctr["NOT {EV}".format(EV=evidence_code)] += 1
            else:
                ctr[evidence_code] += 1
        self.evobj.prt_ev_cnts(ctr, prt)

    @staticmethod
    def prt_ignore_line(prt, fin_gaf, line, lnum):
        """Print a message saying that we are ignoring an association line."""
        # Only the base name of the GAF file is shown in the message
        prt.write(
            "**WARNING: BADLY FORMATTED LINE. IGNORED {FIN}[{LNUM}]:\n{L}\n".
            format(FIN=os.path.basename(fin_gaf), L=line, LNUM=lnum))
Ejemplo n.º 11
0
class AnnoReaderBase(object):
    """Reads a Gene Association File. Returns a Python object."""
    # pylint: disable=broad-except,line-too-long,too-many-instance-attributes

    # Timestamp taken once, when the class body runs; default start time for hms()
    tic = timeit.default_timer()

    # Expected values for a Qualifier
    exp_qualifiers = set([
        # Seen in both GAF and gene2go
        'not',
        'contributes_to',
        'colocalizes_with',
    ])

    valid_formats = {'gpad', 'gaf', 'gene2go', 'id2gos'}

    exp_nss = set(['BP', 'MF', 'CC'])

    def __init__(self, name, filename=None, **kws):
        """Store reader settings and load annotations through _init_associations.

        name:     one of valid_formats
        filename: annotation file handed to _init_associations
        kws:      'godag' and 'namespaces' are read here; all keywords
                  (e.g. allow_missing_symbol) are passed to _init_associations
        """
        self.name = name  # name is one of valid_formats
        self.filename = filename
        self.godag = kws.get('godag')
        self.namespaces = kws.get('namespaces')
        self.evobj = EvidenceCodes()
        # Read annotation file, store namedtuples:
        #     Gene2GoReader(filename=None, taxids=None):
        #     GafReader(filename=None, hdr_only=False, prt=sys.stdout, allow_missing_symbol=False):
        #     GpadReader(filename=None, hdr_only=False):
        self.hdr = None
        self.datobj = None
        # _init_associations is not defined in this class (hence the pylint disable)
        # pylint: disable=no-member
        self.associations = self._init_associations(filename, **kws)
        # assert self.associations, 'NO ANNOTATIONS FOUND: {ANNO}'.format(ANNO=filename)
        assert self.namespaces is None or isinstance(self.namespaces, set)

    def get_desc(self):
        """Get description: name, comma-joined namespaces (if any), and 'godag' if one is loaded"""
        return '{NAME} {NSs} {GODAG}'.format(
            NAME=self.name,
            NSs='' if self.namespaces is None else ','.join(self.namespaces),
            GODAG='' if self.godag is None else 'godag')

    # pylint: disable=unused-argument
    def get_associations(self, taxid=None):
        """Get associations"""
        # taxid is for NCBI's gene2gos
        return self.associations

    def prt_summary_anno2ev(self, prt=sys.stdout):
        """Print annotation/evidence code summary."""
        self.evobj.prt_summary_anno2ev(self.associations, prt)

    def get_name(self):
        """Return type of annotation"""
        return self.name

    # pylint: disable=no-self-use
    def get_taxid(self):
        """Return taxid, if one was provided, otherwise return -1"""
        return -1

    # Arg, taxid, is used by NCBI's annotations, but not by gpad, gaf, etc.
    def get_ns2assc(self, taxid=None, **kws):
        """Return given associations into 3 (BP, MF, CC) dicts, id2gos"""
        return {
            ns: self._get_id2gos(nts, **kws)
            for ns, nts in self.get_ns2ntsanno().items()
        }

    # pylint: disable=unused-argument
    # Arg, taxid, is used by NCBI's annotations, but not by gpad, gaf, etc.
    def get_ns2ntsanno(self, taxid=None):
        """Split list of annotations into 3 lists: BP, MF, CC"""
        return self._get_ns2ntsanno(self.associations)

    # Used by gpad, gaf, etc., but not used by NCBI's annotation reader
    def _get_ns2ntsanno(self, annotations):
        """Split list of annotations into 3 lists: BP, MF, CC

        Only namespaces in exp_nss that actually occur in the annotations
        appear as keys in the returned dict.
        """
        if self.name in {'gpad', 'id2gos'}:
            assert self.godag is not None, "{T}: LOAD godag TO USE {C}::ns2ntsanno".format(
                C=self.__class__.__name__, T=self.name)
        ns2nts = cx.defaultdict(list)
        for nta in annotations:
            ns2nts[nta.NS].append(nta)
        return {ns: ns2nts[ns] for ns in self.exp_nss.intersection(ns2nts)}

    def get_id2gos_nss(self, **kws):
        """Return all associations in a dict, id2gos, regardless of namespace"""
        return self._get_id2gos(self.associations, **kws)

    def get_id2gos(self, namespace=None, prt=sys.stdout, **kws):
        """Return associations from specified namespace in a dict, id2gos

        If the annotation namedtuples have no NS field, the namespace
        argument is ignored and all associations are used.
        """
        # pylint: disable=superfluous-parens
        if self.has_ns():  # Anno namedtuple has NS field
            nspc, assoc = self._get_1ns_assn(namespace)
            id2gos = self._get_id2gos(assoc, **kws)
            if prt:
                prt.write(
                    '{N} IDs in loaded association branch, {NS}\n'.format(
                        N=len(id2gos), NS=nspc))
            return id2gos
        if prt and namespace is not None:
            print(
                '**ERROR {CLS}(..., godag=None).get_id2gos: GODAG is None. IGNORING namespace({NS})\n'
                .format(NS=namespace, CLS=type(self).__name__))
        id2gos = self._get_id2gos(self.associations, **kws)
        if prt:
            prt.write('{N} IDs in all associations\n'.format(N=len(id2gos)))
        return id2gos

    def _get_1ns_assn(self, namespace_usr):
        """Get one namespace, given a user-provided namespace or a default

        Returns a 2-tuple: (namespace label, list of associations).
        """
        # If all namespaces were loaded
        if self.namespaces is None:
            # Return user-specified namespace, if provided. Otherwise BP
            nspc = 'BP' if namespace_usr is None else namespace_usr
            # Return one namespace
            if nspc in set(NAMESPACE2NS.values()):
                return nspc, [nt for nt in self.associations if nt.NS == nspc]
            # Return all namespaces
            return nspc, self.associations
        # If one namespace was loaded, use that regardless of what user specifies
        if len(self.namespaces) == 1:
            nspc = next(iter(self.namespaces))
            if namespace_usr is not None and nspc != namespace_usr:
                print('**WARNING: IGNORING {ns}; ONLY {NS} WAS LOADED'.format(
                    ns=namespace_usr, NS=nspc))
            return nspc, self.associations
        if namespace_usr is None:
            print('**ERROR get_id2gos: GODAG NOT LOADED. USING: {NSs}'.format(
                NSs=' '.join(sorted(self.namespaces))))
        return namespace_usr, self.associations

    def has_ns(self):
        """Return True if namespace field, NS exists on annotation namedtuples"""
        # Only the first association is inspected, so at least one must exist
        assert self.associations, 'NO ASSOCIATIONS IN file({}): {}'.format(
            self.filename, self.associations)
        return hasattr(next(iter(self.associations)), 'NS')

    def _get_id2gos(self,
                    ntannos_usr,
                    propagate_counts=False,
                    relationships=None,
                    prt=sys.stdout,
                    **kws):
        """Return given ntannos_usr in a dict, id2gos

        Returns DB_ID -> GO IDs when options.b_geneid2gos is set,
        otherwise the inverted mapping GO ID -> DB_IDs.
        """
        options = AnnoOptions(self.evobj, **kws)
        # Default reduction is to remove. For all options, see goatools/anno/opts.py:
        #   * Evidence_Code == ND -> No biological data No biological Data available
        #   * Qualifiers contain NOT
        ntannos_m = self.reduce_annotations(ntannos_usr, options)
        dbid2goids = self.get_dbid2goids(ntannos_m, propagate_counts,
                                         relationships, prt)
        if options.b_geneid2gos:
            return dbid2goids
        # if not a2bs:
        #     raise RuntimeError('**ERROR: NO ASSOCATIONS FOUND: {FILE}'.format(FILE=self.filename))
        return self._get_goid2dbids(dbid2goids)

    @staticmethod
    def _get_goid2dbids(dbid2goids):
        """Return dict of GO ID keys and a set of gene products as values"""
        goid2dbids = cx.defaultdict(set)
        for dbid, goids in dbid2goids.items():
            for goid in goids:
                goid2dbids[goid].add(dbid)
        return dict(goid2dbids)

    def _get_namespaces(self, nts):
        """Get the set of namespaces seen in the namedtuples."""
        return {nt.NS for nt in nts} if self.has_ns() else set()

    # Qualifier (column 4)
    # Flags that modify the interpretation of an annotation one (or more) of NOT, contributes_to, colocalizes_with
    # This field is not mandatory;
    #     * cardinality 0, 1, >1;
    #     * for cardinality >1 use a pipe to separate entries (e.g. NOT|contributes_to)
    def prt_qualifiers(self, prt=sys.stdout):
        """Print Qualifiers: 1,462 colocalizes_with; 1,454 contributes_to; 1,157 not"""
        # 13 not colocalizes_with   (TBD: CHK - Seen in gene2go, but not gafs)
        #  4 not contributes_to     (TBD: CHK - Seen in gene2go, but not gafs)
        self._prt_qualifiers(self.associations, prt)

    @staticmethod
    def _prt_qualifiers(associations, prt=sys.stdout):
        """Print Qualifiers found in the annotations.
           QUALIFIERS:
                1,462 colocalizes_with
                1,454 contributes_to
                1,157 not
                   13 not colocalizes_with   (TBD: CHK - Seen in gene2go, but not gafs)
                    4 not contributes_to     (TBD: CHK - Seen in gene2go, but not gafs)
        """
        prt.write('QUALIFIERS:\n')
        # Count every qualifier of every annotation; print most frequent first
        for fld, cnt in cx.Counter(q for nt in associations
                                   for q in nt.Qualifier).most_common():
            prt.write('    {N:6,} {FLD}\n'.format(N=cnt, FLD=fld))

    def reduce_annotations(self, annotations, options):
        """Reduce annotations to ones used to identify enrichment (normally exclude ND and NOT)."""
        getfnc_qual_ev = options.getfnc_qual_ev()
        return [
            nt for nt in annotations
            if getfnc_qual_ev(nt.Qualifier, nt.Evidence_Code)
        ]

    @staticmethod
    def update_association(assc_goidsets, go2ancestors, prt=sys.stdout):
        """Update the GO sets in assc_gene2gos to include all GO ancestors

        Each set in assc_goidsets is modified in place. prt is unused.
        """
        goids_avail = set(go2ancestors)
        # assc_gos is assc_gene2gos.values()
        for assc_goids_cur in assc_goidsets:
            parents = set()
            for goid in assc_goids_cur.intersection(goids_avail):
                parents.update(go2ancestors[goid])
            assc_goids_cur.update(parents)

    def _get_go2ancestors(self,
                          goids_assoc_usr,
                          relationships,
                          prt=sys.stdout):
        """Return go2ancestors (set of parent GO IDs) for all GO ID keys in go2obj."""
        assert self.godag is not None
        _godag = self.godag
        # Get GO IDs in annotations that are in GO DAG
        goids_avail = set(_godag)
        self._rpt_goids_notfound(goids_assoc_usr, goids_avail)
        goids_assoc_cur = goids_assoc_usr.intersection(goids_avail)
        # Get GO Term for each current GO ID in the annotations
        _go2obj_assc = {go: _godag[go] for go in goids_assoc_cur}
        go2ancestors = get_go2parents_go2obj(_go2obj_assc, relationships, prt)
        if prt:
            prt.write('{N} GO IDs -> {M} go2ancestors\n'.format(
                N=len(goids_avail), M=len(go2ancestors)))
        return go2ancestors

    @staticmethod
    def _rpt_goids_notfound(goids_assoc_all, goids_avail):
        """Report the number of GO IDs in the association, but not in the GODAG"""
        goids_missing = goids_assoc_all.difference(goids_avail)
        if goids_missing:
            # NOTE(review): the IDs printed are in the association but missing
            # from goids_avail, so "NOT FOUND IN ASSOCIATION" reads backwards;
            # the wording is kept as-is here.
            # Sorted so that the report is the same from run to run.
            print("{N} GO IDs NOT FOUND IN ASSOCIATION: {GOs}".format(
                N=len(goids_missing), GOs=" ".join(sorted(goids_missing))))

    def get_dbid2goids(self,
                       ntannos,
                       propagate_counts=False,
                       relationships=None,
                       prt=sys.stdout):
        """Return gene2go data for user-specified taxids."""
        if propagate_counts:
            return self._get_dbid2goids_p1(ntannos, relationships, prt)
        return self._get_dbid2goids_p0(ntannos)

    @staticmethod
    def _get_dbid2goids_p0(associations):
        """Return gene2goids with annotations as-is (propagate_counts == False)"""
        id2gos = cx.defaultdict(set)
        for ntd in associations:
            id2gos[ntd.DB_ID].add(ntd.GO_ID)
        return dict(id2gos)

    def _get_dbid2goids_p1(self, ntannos, relationships=None, prt=sys.stdout):
        """Return gene2goids with propagate_counts == True

        Each DB_ID maps to its annotated GO IDs plus the ancestors found for
        them. A GO ID without an entry in go2ancestors (e.g. one reported as
        missing by _rpt_goids_notfound) is kept, with no ancestors added.
        """
        id2gos = cx.defaultdict(set)
        goids_annos = set(nt.GO_ID for nt in ntannos)
        go2ancestors = self._get_go2ancestors(goids_annos, relationships, prt)
        # https://github.com/geneontology/go-annotation/issues/3523
        exclude = {'GO:2000325', 'GO:2000327'}
        for ntd in ntannos:
            goid = ntd.GO_ID
            # https://github.com/geneontology/go-annotation/issues/3523
            if goid not in exclude:
                goids = id2gos[ntd.DB_ID]
                goids.add(goid)
                # .get(): go2ancestors only has keys for GO IDs found in the
                # GODAG, so indexing directly raised KeyError for the others
                goids.update(go2ancestors.get(goid, ()))
            else:
                print('**WARNING: OBSOLETE GO ID({GO})'.format(GO=goid))
        return dict(id2gos)

    @staticmethod
    def get_goid2dbids(associations):
        """Return a dict of GO ID keys, each with the set of DB_IDs annotated to it."""
        go2ids = cx.defaultdict(set)
        for ntd in associations:
            go2ids[ntd.GO_ID].add(ntd.DB_ID)
        return dict(go2ids)

    def hms(self, msg, tic=None, prt=sys.stdout):
        """Print elapsed time and message. Return the current timer value."""
        if tic is None:
            tic = self.tic
        now = timeit.default_timer()
        hms = str(datetime.timedelta(seconds=(now - tic)))
        prt.write('{HMS}: {MSG}\n'.format(HMS=hms, MSG=msg))
        return now

    def chk_associations(self, fout_err=None):
        """Check that associations are in expected format."""
        # Intentionally a no-op in this class
        # pylint: disable=unnecessary-pass
        pass

    def nts_ev_nd(self):
        """Get annotations where Evidence_code == 'ND' (No biological data)"""
        return [nt for nt in self.associations if nt.Evidence_Code == 'ND']

    def nts_qual_not(self):
        """Get annotations having Qualifiers containing NOT"""
        return [nt for nt in self.associations if self._has_not_qual(nt)]

    def chk_qualifiers(self):
        """Check format of qualifier"""
        if self.name == 'id2gos':
            return
        for ntd in self.associations:
            # print(ntd)
            qual = ntd.Qualifier
            # NOTE(review): the check requires a set although the message says LIST
            assert isinstance(
                qual, set), '{NAME}: QUALIFIER MUST BE A LIST: {NT}'.format(
                    NAME=self.name, NT=ntd)
            assert qual != set(['']), ntd
            assert qual != set(['-']), ntd
            assert 'always' not in qual, 'SPEC SAID IT WOULD BE THERE'

    def chk_godag(self):
        """Check that a GODag was loaded; raise RuntimeError otherwise"""
        if not self.godag:
            raise RuntimeError(
                '{CLS} MUST INCLUDE GODag: {CLS}(file.anno, godag=godag)'.
                format(CLS=self.__class__.__name__))

    @staticmethod
    def _has_not_qual(ntd):
        """Return True if the qualifiers contain a 'NOT'"""
        # Substring test on each qualifier, in either lower or upper case
        return any('not' in qual or 'NOT' in qual for qual in ntd.Qualifier)

    def prt_counts(self, prt=sys.stdout):
        """Print the number of annotations stored."""
        num_annos = len(self.associations)
        # 792,891 annotations for 3 taxids stored: 10090 7227 9606
        prt.write('{A:8,} annotations\n'.format(A=num_annos))
Ejemplo n.º 12
0
class GafReader(object):
    """Reads a Gene Annotation File (GAF). Returns a Python object."""

    # Keyword arguments that are kept and forwarded to GafData; others are dropped
    exp_kwdct = set(['allow_missing_symbol'])

    def __init__(self, filename=None, hdr_only=False, prt=sys.stdout, **kws):
        """Store settings and read the GAF file if a file name is given.

        filename: GAF file to read; if None, associations is an empty list.
        hdr_only: if True, reading stops once the header has been read.
        prt:      stream for the read summary (a falsy value suppresses it).
        kws:      allow_missing_symbol
        """
        self.kws = {k: v for k, v in kws.items() if k in self.exp_kwdct}
        self.filename = filename
        self.evobj = EvidenceCodes()
        # Initialize associations and header information
        self.hdr = None
        self.datobj = None
        self.associations = self._init_assn(
            filename, hdr_only, prt) if filename is not None else []

    def read_gaf(self, **kws):
        """Read Gene Association File (GAF). Return data.

        Builds a dict from the already-loaded self.associations.

        Keyword arguments:
            go2geneids:   if True, return GO ID -> gene IDs instead of
                          gene ID -> GO IDs
            evidence_set: if given, keep only these evidence codes
            keep_ND:      keep annotations with evidence code 'ND'
            keep_NOT:     keep annotations having 'NOT' in the Qualifier
            taxid2asscs:  optional nested mapping that is filled in place; it
                          must support taxid2asscs[taxid]['ID2GOs'][geneid].add
                          and taxid2asscs[taxid]['GO2IDs'][go_id].add
        """
        # Simple associations
        id2gos = cx.defaultdict(set)
        # keyword arguments for choosing which GO IDs to keep
        # Optional detailed associations split by taxid and having both ID2GOs & GO2IDs
        taxid2asscs = kws.get('taxid2asscs', None)
        b_geneid2gos = not kws.get('go2geneids', False)
        evs = kws.get('evidence_set', None)
        eval_nd = self._get_nd(kws.get('keep_ND', False))
        eval_not = self._get_not(kws.get('keep_NOT', False))
        # Optionally specify a subset of GOs based on their evidence.
        # By default, return id2gos. User can cause go2geneids to be returned by:
        #   >>> read_ncbi_gene2go(..., go2geneids=True
        for ntgaf in self.associations:
            if eval_nd(ntgaf) and eval_not(ntgaf):
                if evs is None or ntgaf.Evidence_Code in evs:
                    geneid = ntgaf.DB_ID
                    go_id = ntgaf.GO_ID
                    if b_geneid2gos:
                        id2gos[geneid].add(go_id)
                    else:
                        id2gos[go_id].add(geneid)
                    if taxid2asscs is not None:
                        # Only the first taxon is used; annotations with an
                        # empty Taxon are not added to taxid2asscs
                        if ntgaf.Taxon:
                            taxid = ntgaf.Taxon[0]
                            taxid2asscs[taxid]['ID2GOs'][geneid].add(go_id)
                            taxid2asscs[taxid]['GO2IDs'][go_id].add(geneid)
        return id2gos  # return simple associations

    @staticmethod
    def _get_nd(keep_nd):
        """Return a predicate: always True if keep_nd, else True unless Evidence_Code is 'ND'."""
        if keep_nd:
            return lambda nt: True
        return lambda nt: nt.Evidence_Code != 'ND'

    @staticmethod
    def _get_not(keep_not):
        """Return a predicate: always True if keep_not, else True unless 'NOT' is in Qualifier."""
        if keep_not:
            return lambda nt: True
        return lambda nt: 'NOT' not in nt.Qualifier

    def _init_assn(self, fin_gaf, hdr_only, prt):
        """Read GAF file. Store annotation data in a list of namedtuples."""
        nts = self._read_gaf_nts(fin_gaf, hdr_only)
        # GAF file has been read
        if prt:
            prt.write("  READ    {N:9,} associations: {FIN}\n".format(
                N=len(nts), FIN=fin_gaf))
        # If there are illegal GAF lines ...
        if self.datobj:
            if self.datobj.ignored or self.datobj.illegal_lines:
                self.datobj.prt_error_summary(fin_gaf)
        return self.evobj.sort_nts(nts, 'Evidence_Code')

    def _read_gaf_nts(self, fin_gaf, hdr_only):
        """Read GAF file. Store annotation data in a list of namedtuples.

        Leading lines starting with '!' are collected as the header and stored
        in self.hdr. Data lines for which GafData.get_ntgaf returns None are
        appended to datobj.ignored. On any exception, details are written to
        stderr and the process exits with status 1.
        """
        nts = []
        ver = None
        hdrobj = GafHdr()
        datobj = None
        lnum = line = -1
        try:
            with open(fin_gaf) as ifstrm:
                for lnum, line in enumerate(ifstrm, 1):
                    # Read header
                    if datobj is None:
                        if line[0] == '!':
                            if ver is None and line[1:13] == 'gaf-version:':
                                ver = line[13:].strip()
                            hdrobj.chkaddhdr(line)
                        else:
                            # First data line: the header is complete
                            self.hdr = hdrobj.get_hdr()
                            if hdr_only:
                                return nts
                            datobj = GafData(ver, **self.kws)
                    # Read data
                    if datobj is not None and line[0] != '!':
                        ntgaf = datobj.get_ntgaf(line, lnum)
                        if ntgaf is not None:
                            nts.append(ntgaf)
                        else:
                            datobj.ignored.append((lnum, line))
            # A file made up solely of '!' lines never reaches the branch above
            # that stores the header, so store it here.
            if datobj is None and lnum > 0:
                self.hdr = hdrobj.get_hdr()
        except Exception as inst:
            import traceback
            traceback.print_exc()
            sys.stderr.write("\n  **FATAL: {MSG}\n\n".format(MSG=str(inst)))
            sys.stderr.write("**FATAL: {FIN}[{LNUM}]:\n{L}".format(FIN=fin_gaf,
                                                                   L=line,
                                                                   LNUM=lnum))
            if datobj is not None:
                datobj.prt_line_detail(sys.stdout, line)
            sys.exit(1)
        self.datobj = datobj
        return nts

    def prt_summary_anno2ev(self, prt=sys.stdout):
        """Print annotation/evidence code summary.

        Counts each association under its evidence code, or under
        'NOT <code>' when 'NOT' is in its Qualifier.
        """
        ctr = cx.Counter()
        for ntgaf in self.associations:
            evidence_code = ntgaf.Evidence_Code
            # The two cases are exhaustive, so no third branch is needed
            if 'NOT' in ntgaf.Qualifier:
                ctr["NOT {EV}".format(EV=evidence_code)] += 1
            else:
                ctr[evidence_code] += 1
        self.evobj.prt_ev_cnts(ctr, prt)
Ejemplo n.º 13
0
 def __init__(self, filename=None, log=sys.stdout):
     """Keep the file name and log stream; read associations if a file name is given."""
     self.filename = filename
     self.log = log
     self.evobj = EvidenceCodes()
     # No file name: start with an empty association list
     if filename is None:
         self.associations = []
     else:
         self.associations = self.read_gaf(filename)
Ejemplo n.º 14
0
class AnnoReaderBase(object):
    """Base class for annotation readers: holds annotations, builds id2gos dicts.

    ``__init__`` calls ``self._init_associations(filename, **kws)``, which is
    not defined in this class and must be supplied by the subclass. The value
    it returns is stored as ``self.associations``.
    """
    # pylint: disable=broad-except,line-too-long,too-many-instance-attributes

    # Default start time for hms(); taken once, when the class body executes
    tic = timeit.default_timer()

    # Expected values for a Qualifier
    exp_qualifiers = {
        # Seen in both GAF and gene2go
        'not',
        'contributes_to',
        'colocalizes_with',
    }

    # pylint: disable=too-many-instance-attributes
    def __init__(self, name, filename=None, **kws):
        """Store the reader settings and load the annotations.

        name: label for the annotation type (returned by get_name)
        filename: annotation file, forwarded to _init_associations
        kws: 'godag' and 'namespaces' are read here; every keyword is also
             forwarded to _init_associations (e.g. allow_missing_symbol)
        """
        self.name = name
        self.filename = filename
        self.godag = kws.get('godag')
        self.namespaces = kws.get('namespaces')
        self.evobj = EvidenceCodes()
        # Read annotation file, store namedtuples:
        #     Gene2GoReader(filename=None, taxids=None):
        #     GafReader(filename=None, hdr_only=False, prt=sys.stdout, allow_missing_symbol=False):
        #     GpadReader(filename=None, hdr_only=False):
        self.hdr = None
        self.datobj = None
        # pylint: disable=no-member
        self.associations = self._init_associations(filename, **kws)
        # assert self.associations, 'NO ANNOTATIONS FOUND: {ANNO}'.format(ANNO=filename)
        assert self.namespaces is None or isinstance(self.namespaces, set)

    def get_desc(self):
        """Get description: name, comma-joined namespaces (if any), 'godag' (if loaded)"""
        return '{NAME} {NSs} {GODAG}'.format(
            NAME=self.name,
            NSs='' if self.namespaces is None else ','.join(self.namespaces),
            GODAG='' if self.godag is None else 'godag')

    # pylint: disable=unused-argument
    def get_associations(self, taxid=None):
        """Get associations"""
        # taxid is for NCBI's gene2gos
        return self.associations

    def prt_summary_anno2ev(self, prt=sys.stdout):
        """Print annotation/evidence code summary."""
        self.evobj.prt_summary_anno2ev(self.associations, prt)

    def get_name(self):
        """Return type of annotation"""
        return self.name

    # pylint: disable=no-self-use
    def get_taxid(self):
        """Return taxid, if one was provided, otherwise return -1"""
        return -1

    def get_ns2assc(self, **kws):
        """Return a dict keyed by namespace (BP, MF, CC), each holding an id2gos dict"""
        ns2nts = self.get_ns2ntsanno(kws.get('taxid'))
        return {ns: self._get_id2gos(nts, **kws) for ns, nts in ns2nts.items()}

    # pylint: disable=unused-argument
    def get_ns2ntsanno(self, taxid=None):
        """Split list of annotations into 3 lists: BP, MF, CC"""
        return self._get_ns2ntsanno(self.associations)

    def _get_ns2ntsanno(self, annotations):
        """Split list of annotations into 3 lists: BP, MF, CC"""
        if self.name in {'gpad', 'id2gos'}:
            assert self.godag is not None, "{T}: LOAD godag TO USE {C}::ns2ntsanno".format(
                C=self.__class__.__name__, T=self.name)
        ns2nts = cx.defaultdict(list)
        for nta in annotations:
            ns2nts[nta.NS].append(nta)
        # Fixed BP, MF, CC key order; namespaces not seen in the annotations are left out
        return {ns: ns2nts[ns] for ns in ('BP', 'MF', 'CC') if ns in ns2nts}

    def get_id2gos_nss(self, **kws):
        """Return all associations in a dict, id2gos, regardless of namespace"""
        return self._get_id2gos(self.associations, **kws)

    def get_id2gos(self, namespace='BP', **kws):
        """Return associations from specified namespace in a dict, id2gos

        If the annotations carry no NS field, the namespace argument cannot
        be honored: an error line is printed and all annotations are used.
        """
        # pylint: disable=superfluous-parens
        if self.associations and not self.has_ns():
            print('**ERROR: GODAG NOT LOADED. IGNORING namespace({NS})'.format(
                NS=namespace))
            assoc = self.associations
        else:
            # With no annotations at all, this simply yields an empty list
            assoc = [nt for nt in self.associations if nt.NS == namespace]
        id2gos = self._get_id2gos(assoc, **kws)
        print('{N} IDs in association branch, {NS}'.format(N=len(id2gos),
                                                           NS=namespace))
        return id2gos

    def has_ns(self):
        """Return True if namespace field, NS exists on annotation namedtuples

        Only the first annotation is inspected. Returns False when there
        are no annotations.
        """
        # The None default keeps an empty collection from raising StopIteration
        first = next(iter(self.associations), None)
        return hasattr(first, 'NS')

    def _get_id2gos(self, associations, **kws):
        """Return given associations in a dict, id2gos"""
        options = AnnoOptions(self.evobj, **kws)
        # Default reduction is to remove. For all options, see goatools/anno/opts.py:
        #   * Evidence_Code == ND -> No biological data No biological Data available
        #   * Qualifiers contain NOT
        assc = self.reduce_annotations(associations, options)
        if options.b_geneid2gos:
            return self.get_dbid2goids(assc)
        return self.get_goid2dbids(assc)

    def _get_namespaces(self, nts):
        """Get the set of namespaces seen in the namedtuples."""
        return {nt.NS for nt in nts} if self.has_ns() else set()

    # Qualifier (column 4)
    # Flags that modify the interpretation of an annotation one (or more) of NOT, contributes_to, colocalizes_with
    # This field is not mandatory;
    #     * cardinality 0, 1, >1;
    #     * for cardinality >1 use a pipe to separate entries (e.g. NOT|contributes_to)
    def prt_qualifiers(self, prt=sys.stdout):
        """Print Qualifiers: 1,462 colocalizes_with; 1,454 contributes_to; 1,157 not"""
        # 13 not colocalizes_with   (TBD: CHK - Seen in gene2go, but not gafs)
        #  4 not contributes_to     (TBD: CHK - Seen in gene2go, but not gafs)
        self._prt_qualifiers(self.associations, prt)

    @staticmethod
    def _prt_qualifiers(associations, prt=sys.stdout):
        """Print Qualifiers found in the annotations.
           QUALIFIERS:
                1,462 colocalizes_with
                1,454 contributes_to
                1,157 not
                   13 not colocalizes_with   (TBD: CHK - Seen in gene2go, but not gafs)
                    4 not contributes_to     (TBD: CHK - Seen in gene2go, but not gafs)
        """
        prt.write('QUALIFIERS:\n')
        # Every element of every annotation's Qualifier is tallied separately
        qual2cnt = cx.Counter(q for nt in associations for q in nt.Qualifier)
        for fld, cnt in qual2cnt.most_common():
            prt.write('    {N:6,} {FLD}\n'.format(N=cnt, FLD=fld))

    def reduce_annotations(self, annotations, options):
        """Reduce annotations to ones used to identify enrichment (normally exclude ND and NOT)."""
        getfnc_qual_ev = options.getfnc_qual_ev()
        return [
            nt for nt in annotations
            if getfnc_qual_ev(nt.Qualifier, nt.Evidence_Code)
        ]

    @staticmethod
    def get_dbid2goids(associations):
        """Return a dict mapping each DB_ID to the set of GO_IDs annotated to it."""
        id2gos = cx.defaultdict(set)
        for ntd in associations:
            id2gos[ntd.DB_ID].add(ntd.GO_ID)
        return dict(id2gos)

    @staticmethod
    def get_goid2dbids(associations):
        """Return a dict mapping each GO_ID to the set of DB_IDs annotated with it."""
        go2ids = cx.defaultdict(set)
        for ntd in associations:
            go2ids[ntd.GO_ID].add(ntd.DB_ID)
        return dict(go2ids)

    def hms(self, msg, tic=None, prt=sys.stdout):
        """Print elapsed time and message; return the current timer value.

        tic: start time; defaults to the class-level tic
        """
        if tic is None:
            tic = self.tic
        now = timeit.default_timer()
        hms = str(datetime.timedelta(seconds=(now - tic)))
        prt.write('{HMS}: {MSG}\n'.format(HMS=hms, MSG=msg))
        return now

    def chk_associations(self, fout_err=None):
        """Check that associations are in expected format. No-op in this base class."""

    def nts_ev_nd(self):
        """Get annotations where Evidence_code == 'ND' (No biological data)"""
        return [nt for nt in self.associations if nt.Evidence_Code == 'ND']

    def nts_qual_not(self):
        """Get annotations having Qualifiers containing NOT"""
        return [nt for nt in self.associations if self._has_not_qual(nt)]

    def chk_qualifiers(self):
        """Check format of qualifier: must be a set, and not {''} or {'-'}"""
        if self.name == 'id2gos':
            return
        for ntd in self.associations:
            qual = ntd.Qualifier
            assert isinstance(
                qual, set), '{NAME}: QUALIFIER MUST BE A SET: {NT}'.format(
                    NAME=self.name, NT=ntd)
            assert qual != set(['']), ntd
            assert qual != set(['-']), ntd
            assert 'always' not in qual, 'SPEC SAID IT WOULD BE THERE'

    @staticmethod
    def _has_not_qual(ntd):
        """Return True if any qualifier string contains 'not' or 'NOT'"""
        return any('not' in qual or 'NOT' in qual for qual in ntd.Qualifier)
Ejemplo n.º 15
0
class AnnoReaderBase(object):
    """Base class for annotation readers: holds annotations, builds id2gos dicts.

    ``__init__`` calls ``self._init_associations(filename, **kws)``, which is
    not defined in this class and must be supplied by the subclass. The value
    it returns is stored as ``self.associations``.
    """
    # pylint: disable=broad-except,line-too-long,too-many-instance-attributes

    # Default start time for hms(); taken once, when the class body executes
    tic = timeit.default_timer()

    # Expected values for a Qualifier
    exp_qualifiers = {
        # Seen in both GAF and gene2go
        'not', 'contributes_to', 'colocalizes_with',
    }

    # pylint: disable=too-many-instance-attributes
    def __init__(self, name, filename=None, **kws):
        """Store the reader settings and load the annotations.

        name: label for the annotation type (returned by get_name)
        filename: annotation file, forwarded to _init_associations
        kws: 'godag' and 'namespaces' are read here; every keyword is also
             forwarded to _init_associations (e.g. allow_missing_symbol)
        """
        self.name = name
        self.filename = filename
        self.godag = kws.get('godag')
        self.namespaces = kws.get('namespaces')
        self.evobj = EvidenceCodes()
        # Read annotation file, store namedtuples:
        #     Gene2GoReader(filename=None, taxids=None):
        #     GafReader(filename=None, hdr_only=False, prt=sys.stdout, allow_missing_symbol=False):
        #     GpadReader(filename=None, hdr_only=False):
        self.hdr = None
        self.datobj = None
        # pylint: disable=no-member
        self.associations = self._init_associations(filename, **kws)
        # assert self.associations, 'NO ANNOTATIONS FOUND: {ANNO}'.format(ANNO=filename)
        assert self.namespaces is None or isinstance(self.namespaces, set)

    def get_desc(self):
        """Get description: name, comma-joined namespaces (if any), 'godag' (if loaded)"""
        return '{NAME} {NSs} {GODAG}'.format(
            NAME=self.name,
            NSs='' if self.namespaces is None else ','.join(self.namespaces),
            GODAG='' if self.godag is None else 'godag')

    # pylint: disable=unused-argument
    def get_associations(self, taxid=None):
        """Get associations"""
        # taxid is for NCBI's gene2gos
        return self.associations

    def prt_summary_anno2ev(self, prt=sys.stdout):
        """Print annotation/evidence code summary."""
        self.evobj.prt_summary_anno2ev(self.associations, prt)

    def get_name(self):
        """Return type of annotation"""
        return self.name

    # pylint: disable=no-self-use
    def get_taxid(self):
        """Return taxid, if one was provided, otherwise return -1"""
        return -1

    def get_ns2assc(self, **kws):
        """Return a dict keyed by namespace (BP, MF, CC), each holding an id2gos dict"""
        ns2nts = self.get_ns2ntsanno(kws.get('taxid'))
        return {ns: self._get_id2gos(nts, **kws) for ns, nts in ns2nts.items()}

    # pylint: disable=unused-argument
    def get_ns2ntsanno(self, taxid=None):
        """Split list of annotations into 3 lists: BP, MF, CC"""
        return self._get_ns2ntsanno(self.associations)

    def _get_ns2ntsanno(self, annotations):
        """Split list of annotations into 3 lists: BP, MF, CC"""
        if self.name in {'gpad', 'id2gos'}:
            assert self.godag is not None, "{T}: LOAD godag TO USE {C}::ns2ntsanno".format(
                C=self.__class__.__name__, T=self.name)
        ns2nts = cx.defaultdict(list)
        for nta in annotations:
            ns2nts[nta.NS].append(nta)
        # Fixed BP, MF, CC key order; namespaces not seen in the annotations are left out
        return {ns: ns2nts[ns] for ns in ('BP', 'MF', 'CC') if ns in ns2nts}

    def get_id2gos_nss(self, **kws):
        """Return all associations in a dict, id2gos, regardless of namespace"""
        return self._get_id2gos(self.associations, **kws)

    def get_id2gos(self, namespace='BP', **kws):
        """Return associations from specified namespace in a dict, id2gos

        If the annotations carry no NS field, the namespace argument cannot
        be honored: an error line is printed and all annotations are used.
        """
        # pylint: disable=superfluous-parens
        if self.associations and not self.has_ns():
            print('**ERROR: GODAG NOT LOADED. IGNORING namespace({NS})'.format(NS=namespace))
            assoc = self.associations
        else:
            # With no annotations at all, this simply yields an empty list
            assoc = [nt for nt in self.associations if nt.NS == namespace]
        id2gos = self._get_id2gos(assoc, **kws)
        print('{N} IDs in association branch, {NS}'.format(N=len(id2gos), NS=namespace))
        return id2gos

    def has_ns(self):
        """Return True if namespace field, NS exists on annotation namedtuples

        Only the first annotation is inspected. Returns False when there
        are no annotations.
        """
        # The None default keeps an empty collection from raising StopIteration
        first = next(iter(self.associations), None)
        return hasattr(first, 'NS')

    def _get_id2gos(self, associations, **kws):
        """Return given associations in a dict, id2gos"""
        options = AnnoOptions(self.evobj, **kws)
        # Default reduction is to remove. For all options, see goatools/anno/opts.py:
        #   * Evidence_Code == ND -> No biological data No biological Data available
        #   * Qualifiers contain NOT
        assc = self.reduce_annotations(associations, options)
        if options.b_geneid2gos:
            return self.get_dbid2goids(assc)
        return self.get_goid2dbids(assc)

    def _get_namespaces(self, nts):
        """Get the set of namespaces seen in the namedtuples."""
        return {nt.NS for nt in nts} if self.has_ns() else set()

    # Qualifier (column 4)
    # Flags that modify the interpretation of an annotation one (or more) of NOT, contributes_to, colocalizes_with
    # This field is not mandatory;
    #     * cardinality 0, 1, >1;
    #     * for cardinality >1 use a pipe to separate entries (e.g. NOT|contributes_to)
    def prt_qualifiers(self, prt=sys.stdout):
        """Print Qualifiers: 1,462 colocalizes_with; 1,454 contributes_to; 1,157 not"""
        # 13 not colocalizes_with   (TBD: CHK - Seen in gene2go, but not gafs)
        #  4 not contributes_to     (TBD: CHK - Seen in gene2go, but not gafs)
        self._prt_qualifiers(self.associations, prt)

    @staticmethod
    def _prt_qualifiers(associations, prt=sys.stdout):
        """Print Qualifiers found in the annotations.
           QUALIFIERS:
                1,462 colocalizes_with
                1,454 contributes_to
                1,157 not
                   13 not colocalizes_with   (TBD: CHK - Seen in gene2go, but not gafs)
                    4 not contributes_to     (TBD: CHK - Seen in gene2go, but not gafs)
        """
        prt.write('QUALIFIERS:\n')
        # Every element of every annotation's Qualifier is tallied separately
        qual2cnt = cx.Counter(q for nt in associations for q in nt.Qualifier)
        for fld, cnt in qual2cnt.most_common():
            prt.write('    {N:6,} {FLD}\n'.format(N=cnt, FLD=fld))

    def reduce_annotations(self, annotations, options):
        """Reduce annotations to ones used to identify enrichment (normally exclude ND and NOT)."""
        getfnc_qual_ev = options.getfnc_qual_ev()
        return [nt for nt in annotations if getfnc_qual_ev(nt.Qualifier, nt.Evidence_Code)]

    @staticmethod
    def get_dbid2goids(associations):
        """Return a dict mapping each DB_ID to the set of GO_IDs annotated to it."""
        id2gos = cx.defaultdict(set)
        for ntd in associations:
            id2gos[ntd.DB_ID].add(ntd.GO_ID)
        return dict(id2gos)

    @staticmethod
    def get_goid2dbids(associations):
        """Return a dict mapping each GO_ID to the set of DB_IDs annotated with it."""
        go2ids = cx.defaultdict(set)
        for ntd in associations:
            go2ids[ntd.GO_ID].add(ntd.DB_ID)
        return dict(go2ids)

    def hms(self, msg, tic=None, prt=sys.stdout):
        """Print elapsed time and message; return the current timer value.

        tic: start time; defaults to the class-level tic
        """
        if tic is None:
            tic = self.tic
        now = timeit.default_timer()
        hms = str(datetime.timedelta(seconds=(now-tic)))
        prt.write('{HMS}: {MSG}\n'.format(HMS=hms, MSG=msg))
        return now

    def chk_associations(self, fout_err=None):
        """Check that associations are in expected format. No-op in this base class."""

    def nts_ev_nd(self):
        """Get annotations where Evidence_code == 'ND' (No biological data)"""
        return [nt for nt in self.associations if nt.Evidence_Code == 'ND']

    def nts_qual_not(self):
        """Get annotations having Qualifiers containing NOT"""
        return [nt for nt in self.associations if self._has_not_qual(nt)]

    def chk_qualifiers(self):
        """Check format of qualifier: must be a set, and not {''} or {'-'}"""
        if self.name == 'id2gos':
            return
        for ntd in self.associations:
            qual = ntd.Qualifier
            assert isinstance(qual, set), '{NAME}: QUALIFIER MUST BE A SET: {NT}'.format(
                NAME=self.name, NT=ntd)
            assert qual != set(['']), ntd
            assert qual != set(['-']), ntd
            assert 'always' not in qual, 'SPEC SAID IT WOULD BE THERE'

    @staticmethod
    def _has_not_qual(ntd):
        """Return True if any qualifier string contains 'not' or 'NOT'"""
        return any('not' in qual or 'NOT' in qual for qual in ntd.Qualifier)
Ejemplo n.º 16
0
 def __init__(self, filename=None, log=sys.stdout):
     """Keep the file name and log stream; read associations if a file was named.

     Without a filename, self.associations is an empty list; with one, it is
     the result of self.read_gaf(filename).
     """
     self.filename = filename
     self.log = log
     self.evobj = EvidenceCodes()
     if filename is not None:
         self.associations = self.read_gaf(filename)
     else:
         self.associations = []
def test_ev():
    """Check that every evidence code from _get_evidencecodes('gene2go') is in EvidenceCodes.code2nt."""
    codes_seen = _get_evidencecodes('gene2go')
    # Any code absent from code2nt is reported in the assertion message
    unknown = codes_seen.difference(EvidenceCodes().code2nt)
    assert not unknown, 'MISSING({EV})'.format(EV=unknown)
Ejemplo n.º 18
0
class GafReader(object):
    """Reads a Gene Annotation File (GAF). Returns a Python object."""

    #                       Col Req?     Cardinality    Example
    #                       --- -------- -------------- -----------------
    gafhdr = [
        'DB',             #  0 required 1              UniProtKB
        'DB_ID',          #  1 required 1              P12345
        'DB_Symbol',      #  2 required 1              PHO3
        'Qualifier',      #  3 optional 0 or greater   NOT
        'GO_ID',          #  4 required 1              GO:0003993
        'DB_Reference',   #  5 required 1 or greater   PMID:2676709
        'Evidence_Code',  #  6 required 1              IMP
        'With_From',      #  7 optional 0 or greater   GO:0000346
        'Aspect',         #  8 required 1              F
        'DB_Name',        #  9 optional 0 or 1         Toll-like receptor 4
        'DB_Synonym',     # 10 optional 0 or greater   hToll|Tollbooth
        'DB_Type',        # 11 required 1              protein
        'Taxon',          # 12 required 1 or 2         taxon:9606
        'Date',           # 13 required 1              20090118
        'Assigned_By',    # 14 required 1              SGD
    ]

    #                              Col Required Cardinality  Example
    #                              --- -------- ------------ -------------------
    gafhdr2 = [
        'Annotation_Extension',  # 15 optional 0 or greater part_of(CL:0000576)
        'Gene_Product_Form_ID',  # 16 optional 0 or 1       UniProtKB:P12345-2
    ]

    # Column names per GAF version, keyed by the value after '!gaf-version:'
    gaf_columns = {
        "2.1": gafhdr + gafhdr2,  # !gaf-version: 2.1
        "2.0": gafhdr + gafhdr2,  # !gaf-version: 2.0
        "1.0": gafhdr,            # !gaf-version: 1.0
    }

    # Expected numbers of columns for various versions
    gaf_numcol = {"2.1": 17, "2.0": 17, "1.0": 15}

    # Expected values for a Qualifier
    exp_qualifiers = set(
        ['NOT', 'contributes_to', 'Contributes_to', 'colocalizes_with'])

    def __init__(self, filename=None, hdr_only=False, prt=sys.stdout):
        """Read the GAF file, if one is given.

        filename: GAF file; when None, associations start as an empty list
        hdr_only: forwarded to read_gaf, which then stops at the first data line
        prt: stream for the read summary line; None suppresses it
        """
        self.filename = filename
        self.evobj = EvidenceCodes()
        # Initialize associations and header information
        self.hdr = None
        self.associations = self.read_gaf(filename, hdr_only,
                                          prt) if filename is not None else []

    def prt_summary_anno2ev(self, prt=sys.stdout):
        """Print annotation/evidence code summary.

        Annotations whose Qualifier contains 'NOT' are counted under a
        separate "NOT <code>" key; the counts go to self.evobj.prt_ev_cnts.
        """
        ctr = cx.Counter()
        for ntgaf in self.associations:
            evidence_code = ntgaf.Evidence_Code
            # 'NOT' is either present or absent; no third case exists
            if 'NOT' in ntgaf.Qualifier:
                ctr["NOT {EV}".format(EV=evidence_code)] += 1
            else:
                ctr[evidence_code] += 1
        self.evobj.prt_ev_cnts(ctr, prt)

    def _get_ntgaf(self, ntgafobj, flds, ver):
        """Convert fields from string to preferred format for GAF ver 2.1 and 2.0.

        ntgafobj: namedtuple class whose fields match the columns for ver
        flds: the tab-split column strings of one data line
        ver: GAF version string; a leading '2' adds columns 15 and 16
        Returns an ntgafobj instance.
        """
        # Cardinality: _rd_fld_vals returns a list for True and a set for False
        is_set = False
        is_list = True
        qualifiers = self._rd_fld_vals("Qualifier", flds[3], is_set)
        db_reference = self._rd_fld_vals("DB_Reference", flds[5], is_set, 1)
        with_from = self._rd_fld_vals("With_From", flds[7], is_set)
        db_name = self._rd_fld_vals("DB_Name", flds[9], is_set, 0, 1)
        db_synonym = self._rd_fld_vals("DB_Synonym", flds[10], is_set)
        taxons = self._rd_fld_vals("Taxon", flds[12], is_list, 1, 2)
        self._chk_qty_eq_1(flds, [0, 1, 2, 4, 6, 8, 11, 13, 14])
        # Additional Formatting
        taxons = self._do_taxons(taxons)
        self._chk_qualifier(qualifiers)
        # Create list of values, one per column and in column order
        gafvals = [
            flds[0],       # 0  DB
            flds[1],       # 1  DB_ID
            flds[2],       # 2  DB_Symbol
            qualifiers,    # 3  Qualifier
            flds[4],       # 4  GO_ID
            db_reference,  # 5  DB_Reference
            flds[6],       # 6  Evidence_Code
            with_from,     # 7  With_From
            flds[8],       # 8  Aspect
            db_name,       # 9  DB_Name
            db_synonym,    # 10 DB_Synonym
            flds[11],      # 11 DB_Type
            taxons,        # 12 Taxon (parsed from flds[12])
            flds[13],      # 13 Date
            flds[14],      # 14 Assigned_By
        ]
        # Version 2.x has these additional fields not found in v1.0
        if ver[0] == '2':
            gafvals += [
                self._rd_fld_vals("Annotation_Extension", flds[15], is_set),
                self._rd_fld_vals("Gene_Product_Form_ID", flds[16], is_set)
            ]
        return ntgafobj._make(gafvals)

    def _rd_fld_vals(self,
                     name,
                     val,
                     set_list_ft=True,
                     qty_min=0,
                     qty_max=None):
        """Further split a GAF value within a single field.

        name: field name, used only in assertion messages
        val: raw column string; entries are separated by '|'
        set_list_ft: True returns a list, False returns a set
        qty_min: minimum number of entries
        qty_max: maximum number of entries; None means no upper limit
        """
        if not val and qty_min == 0:
            return [] if set_list_ft else set()
        vals = val.split('|')  # Use a pipe to separate entries
        num_vals = len(vals)
        assert num_vals >= qty_min, \
            "FIELD({F}): MIN QUANTITY({Q}) WASN'T MET: {V} in {GAF}".format(
                F=name, Q=qty_min, V=vals, GAF=self.filename)
        if qty_max is not None:
            assert num_vals <= qty_max, \
                "FIELD({F}): MAX QUANTITY({Q}) EXCEEDED: {V} in {GAF}".format(
                    F=name, Q=qty_max, V=vals, GAF=self.filename)
        return vals if set_list_ft else set(vals)

    def read_gaf(self, fin_gaf, hdr_only, prt):
        """Read GAF file. HTTP address okay. GZIPPED/BZIPPED file okay.

        fin_gaf: file name, opened with nopen
        hdr_only: if True, return an empty list at the first data line
        prt: stream for the read summary line; None suppresses it
        Returns the annotations as sorted by self.evobj.sort_nts on
        'Evidence_Code'. self.hdr is set from the leading '!' lines.
        """
        ga_lst = []
        ver = None
        ntgafobj = None
        exp_numcol = None
        hdrobj = GafHdr()
        ifstrm = nopen(fin_gaf)
        for line in ifstrm:
            # Read header
            if ntgafobj is None:
                if line[0] == '!':
                    if line[1:13] == 'gaf-version:':
                        ver = line[13:].strip()
                    hdrobj.chkaddhdr(line)
                    continue
                # First data line: the header is complete
                self.hdr = hdrobj.get_hdr()
                if hdr_only:
                    return ga_lst
                ntgafobj = cx.namedtuple("ntgafobj",
                                         " ".join(self.gaf_columns[ver]))
                exp_numcol = self.gaf_numcol[ver]
            # Read data
            flds = self._split_line(line, exp_numcol)
            ga_lst.append(self._get_ntgaf(ntgafobj, flds, ver))
        if ntgafobj is None:
            # No data line was seen, so the header collected so far
            # has not been stored yet
            self.hdr = hdrobj.get_hdr()
        # GAF file has been read
        if prt is not None:
            readmsg = "  READ {N:,} associations: {FIN}\n"
            prt.write(readmsg.format(N=len(ga_lst), FIN=fin_gaf))
        return self.evobj.sort_nts(ga_lst, 'Evidence_Code')

    @staticmethod
    def _split_line(line, exp_numcol):
        """Split line into field values; the number of fields must equal exp_numcol."""
        flds = line.rstrip('\r\n').split('\t')
        assert len(flds) == exp_numcol, "UNEXPECTED NUMBER OF COLUMNS"
        return flds

    def _chk_qualifier(self, qualifiers):
        """Check that qualifiers are expected values."""
        # http://geneontology.org/page/go-annotation-conventions#qual
        for qual in qualifiers:
            assert qual in self.exp_qualifiers, "UNEXPECTED QUALIFIER({Q}) IN {GAF}".format(
                Q=qual, GAF=self.filename)

    @staticmethod
    def _chk_qty_eq_1(flds, col_lst):
        """Check that the required single-value fields at these indices are not empty."""
        for col in col_lst:
            assert flds[
                col], "UNEXPECTED REQUIRED VALUE({V}) AT INDEX({R})".format(
                    V=flds[col], R=col)

    @staticmethod
    def _do_taxons(taxons):
        """Convert 'taxon:NNNN' strings to ints; one or two taxons are allowed."""
        taxons = [int(v[6:]) for v in taxons]  # strip "taxon:"
        assert len(taxons) in (1, 2)
        return taxons
Ejemplo n.º 19
0
class GafReader(object):
    """Reads a Gene Annotation File (GAF). Returns a Python object.

    Every data line is converted to a namedtuple whose field names are the
    column names listed in ``gaf_columns`` for the version announced by the
    file's ``!gaf-version:`` header line. The resulting records are stored in
    ``self.associations``.
    """

    # Columns present in every supported GAF version
    #                      Col Req?     Cardinality    Example
    #                      --- -------- -------------- -----------------
    gafhdr = [
        "DB",             #  0 required 1              UniProtKB
        "DB_ID",          #  1 required 1              P12345
        "DB_Symbol",      #  2 required 1              PHO3
        "Qualifier",      #  3 optional 0 or greater   NOT
        "GO_ID",          #  4 required 1              GO:0003993
        "DB_Reference",   #  5 required 1 or greater   PMID:2676709
        "Evidence_Code",  #  6 required 1              IMP
        "With_From",      #  7 optional 0 or greater   GO:0000346
        "Aspect",         #  8 required 1              F
        "DB_Name",        #  9 optional 0 or 1         Toll-like receptor 4
        "DB_Synonym",     # 10 optional 0 or greater   hToll|Tollbooth
        "DB_Type",        # 11 required 1              protein
        "Taxon",          # 12 required 1 or 2         taxon:9606
        "Date",           # 13 required 1              20090118
        "Assigned_By",    # 14 required 1              SGD
    ]

    # Columns added by GAF 2.x
    #                             Col Required Cardinality  Example
    #                             --- -------- ------------ -------------------
    gafhdr2 = [
        "Annotation_Extension",  # 15 optional 0 or greater part_of(CL:0000576)
        "Gene_Product_Form_ID",  # 16 optional 0 or 1       UniProtKB:P12345-2
    ]

    # Column names keyed by the value found in the "!gaf-version:" header
    gaf_columns = {
        "2.1": gafhdr + gafhdr2,  # !gaf-version: 2.1
        "2.0": gafhdr + gafhdr2,  # !gaf-version: 2.0
        "1.0": gafhdr,            # !gaf-version: 1.0
    }

    # Expected numbers of columns for various versions
    gaf_numcol = {"2.1": 17, "2.0": 17, "1.0": 15}

    # Expected values for a Qualifier
    exp_qualifiers = {"NOT", "contributes_to", "colocalizes_with"}

    def __init__(self, filename=None, log=sys.stdout):
        """Read the associations in filename, if one is given.

        filename: GAF location handed to read_gaf; None leaves the
                  association list empty.
        log:      stream that receives the one-line read summary;
                  None suppresses the summary.
        """
        self.filename = filename
        self.log = log
        self.evobj = EvidenceCodes()
        self.associations = self.read_gaf(filename) if filename is not None else []

    def prt_summary_anno2ev(self, prt=sys.stdout):
        """Print annotation/evidence code summary.

        Associations are counted per evidence code; those carrying a "NOT"
        qualifier are counted separately under the key "NOT <code>".
        """
        ctr = cx.Counter()
        for ntgaf in self.associations:
            evidence_code = ntgaf.Evidence_Code
            if "NOT" in ntgaf.Qualifier:
                ctr["NOT {EV}".format(EV=evidence_code)] += 1
            else:
                ctr[evidence_code] += 1
        self.evobj.prt_ev_cnts(ctr, prt)

    def _get_ntgaf(self, ntgafobj, flds, ver):
        """Convert fields from string to preferred format for GAF ver 2.1 and 2.0.

        ntgafobj: namedtuple class built from the column names of this version.
        flds:     list of raw string fields from one data line.
        ver:      GAF version string, e.g. "2.1".

        Returns an ntgafobj instance. Multi-valued columns become sets (Taxon
        becomes a list of ints); single-valued columns stay strings.
        Raises AssertionError when a cardinality or qualifier check fails.
        """
        # Cardinality
        # Flag values for _rd_fld_vals: False -> return a set, True -> return a list
        is_set = False
        is_list = True
        qualifiers = self._rd_fld_vals("Qualifier", flds[3], is_set)
        db_reference = self._rd_fld_vals("DB_Reference", flds[5], is_set, 1)
        with_from = self._rd_fld_vals("With_From", flds[7], is_set)
        db_name = self._rd_fld_vals("DB_Name", flds[9], is_set, 0, 1)
        db_synonym = self._rd_fld_vals("DB_Synonym", flds[10], is_set)
        taxons = self._rd_fld_vals("Taxon", flds[12], is_list, 1, 2)
        self._chk_qty_eq_1(flds, [0, 1, 2, 4, 6, 8, 11, 13, 14])
        # Additional Formatting
        taxons = self._do_taxons(taxons)
        self._chk_qualifier(qualifiers)
        # Create list of values, in column order
        gafvals = [
            flds[0],       # 0  DB
            flds[1],       # 1  DB_ID
            flds[2],       # 2  DB_Symbol
            qualifiers,    # 3  Qualifier
            flds[4],       # 4  GO_ID
            db_reference,  # 5  DB_Reference
            flds[6],       # 6  Evidence_Code
            with_from,     # 7  With_From
            flds[8],       # 8  Aspect
            db_name,       # 9  DB_Name
            db_synonym,    # 10 DB_Synonym
            flds[11],      # 11 DB_Type
            taxons,        # 12 Taxon
            flds[13],      # 13 Date (index 12 is the raw Taxon string)
            flds[14],      # 14 Assigned_By
        ]
        # Version 2.x has these additional fields not found in v1.0
        if ver.startswith("2"):
            gafvals += [
                self._rd_fld_vals("Annotation_Extension", flds[15], is_set),
                self._rd_fld_vals("Gene_Product_Form_ID", flds[16], is_set),
            ]
        return ntgafobj._make(gafvals)

    @staticmethod
    def _rd_fld_vals(name, val, set_list_ft=True, qty_min=0, qty_max=None):
        """Further split a GAF value within a single field.

        name:        column name, used only in assertion messages.
        val:         raw field text; entries are separated by "|".
        set_list_ft: True returns a list, False returns a set.
        qty_min:     minimum number of entries required.
        qty_max:     maximum number of entries allowed, or None for no limit.

        Raises AssertionError when the number of entries is out of range.
        """
        # An empty field holds zero entries; "".split("|") would report one
        # (the empty string) and let an empty required field slip through.
        vals = val.split("|") if val else []  # Use a pipe to separate entries
        num_vals = len(vals)
        assert num_vals >= qty_min, "FLD({F}): MIN QUANTITY({Q}) NOT MET: {V}".format(F=name, Q=qty_min, V=vals)
        if qty_max is not None:
            assert num_vals <= qty_max, "FLD({F}): MAX QUANTITY({Q}) EXCEEDED: {V}".format(F=name, Q=qty_max, V=vals)
        return vals if set_list_ft else set(vals)

    def read_gaf(self, fin_gaf):
        """Read GAF file. HTTP address okay. GZIPPED/BZIPPED file okay.

        Lines are ignored until a "!gaf-version:" header has been seen; after
        that every line not starting with "!" is parsed as a data line.
        Returns the result of self.evobj.sort_nts for the parsed records and
        "Evidence_Code". Raises KeyError for a version missing from
        gaf_columns and AssertionError for a malformed data line.
        """
        ga_lst = []
        ver = None
        ntgafobj = None
        exp_numcol = None
        ifstrm = nopen(fin_gaf)
        try:
            for line in ifstrm:
                if ntgafobj is not None and not line.startswith("!"):
                    flds = self._split_line(line, exp_numcol)
                    ntgaf = self._get_ntgaf(ntgafobj, flds, ver)
                    ga_lst.append(ntgaf)
                elif ntgafobj is None and line.startswith("!gaf-version:"):
                    ver = line[13:].strip()  # text after "!gaf-version:"
                    ntgafobj = cx.namedtuple("ntgafobj", " ".join(self.gaf_columns[ver]))
                    exp_numcol = self.gaf_numcol[ver]
        finally:
            # Release the input handle even when parsing fails.
            # NOTE(review): nopen's return type is not visible here, so close
            # only when the object offers close(), and never close stdin.
            close = getattr(ifstrm, "close", None)
            if close is not None and ifstrm is not sys.stdin:
                close()
        if self.log is not None:
            self.log.write("  READ {N:,} associations: {FIN}\n".format(N=len(ga_lst), FIN=fin_gaf))
        return self.evobj.sort_nts(ga_lst, "Evidence_Code")

    @staticmethod
    def _split_line(line, exp_numcol):
        """Split line into field values.

        Trailing CR/LF characters are removed first. Raises AssertionError
        when the number of tab-separated fields differs from exp_numcol.
        """
        flds = line.rstrip("\r\n").split("\t")
        assert len(flds) == exp_numcol, "UNEXPECTED NUMBER OF COLUMNS"
        return flds

    def _chk_qualifier(self, qualifiers):
        """Check that qualifiers are expected values."""
        # http://geneontology.org/page/go-annotation-conventions#qual
        for qual in qualifiers:
            assert qual in self.exp_qualifiers, "UNEXPECTED QUALIFIER({Q})".format(Q=qual)

    @staticmethod
    def _chk_qty_eq_1(flds, col_lst):
        """Check that the "required 1" fields at these column indices are non-empty."""
        for col in col_lst:
            assert flds[col], "UNEXPECTED REQUIRED VALUE({V}) AT INDEX({R})".format(V=flds[col], R=col)

    @staticmethod
    def _do_taxons(taxons):
        """Convert "taxon:NNNN" strings to ints; exactly one or two are allowed."""
        taxons = [int(v[6:]) for v in taxons]  # strip "taxon:"
        num_taxons = len(taxons)
        assert num_taxons == 1 or num_taxons == 2
        return taxons