Beispiel #1
0
class GafReader(object):
    """Read a Gene Annotation File (GAF) into a list of namedtuples.

    When a filename is given, the file is parsed at construction time: the
    header ends up in ``self.hdr`` and the annotation records in
    ``self.associations``.
    """

    # Only these keyword arguments are kept and forwarded to GafData.
    exp_kwdct = {'allow_missing_symbol'}

    def __init__(self, filename=None, hdr_only=False, prt=sys.stdout, **kws):
        """Store the settings and, if a filename is given, read the file.

        Args:
            filename: Path of the GAF file; None creates an empty reader.
            hdr_only: If True, stop at the first non-header line.
            prt: Stream that receives the read summary; None suppresses it.
            **kws: Only keys listed in ``exp_kwdct`` are kept.
        """
        # kws: allow_missing_symbol
        self.kws = {k: v for k, v in kws.items() if k in self.exp_kwdct}
        self.filename = filename
        self.evobj = EvidenceCodes()
        # Initialize associations and header information
        self.hdr = None
        self.associations = self.read_gaf(filename, hdr_only,
                                          prt) if filename is not None else []

    def read_gaf(self, fin_gaf, hdr_only, prt):
        """Read a GAF file and return its annotations as namedtuples.

        Leading lines starting with '!' form the header; the first other line
        switches the reader to data mode. Data lines for which
        ``GafData.get_ntgaf`` returns None are collected as "ignored" and
        written to a log file by ``_prt_read_summary``.

        Args:
            fin_gaf: Path of the GAF file.
            hdr_only: If True, return an empty list once the header is read.
            prt: Stream for the summary (may be None).

        Returns:
            The result of ``self.evobj.sort_nts(nts, 'Evidence_Code')``, or an
            empty list when ``hdr_only`` stops the read at the first data line.

        Any exception raised while reading is reported on stderr and the
        process exits with status 1.
        """
        nts = []
        ver = None
        hdrobj = GafHdr()
        datobj = None
        # Placeholders so the error report below works even if open() fails.
        lnum = line = -1
        ignored = []
        try:
            with open(fin_gaf) as ifstrm:
                for lnum, line in enumerate(ifstrm, 1):
                    is_comment = line[0] == '!'
                    # Read header
                    if datobj is None:
                        if is_comment:
                            # Only the first version line determines the version
                            if ver is None and line[1:13] == 'gaf-version:':
                                ver = line[13:].strip()
                            hdrobj.chkaddhdr(line)
                            continue
                        # First non-header line: the header is complete
                        self.hdr = hdrobj.get_hdr()
                        if hdr_only:
                            return nts
                        datobj = GafData(ver, **self.kws)
                    # Comment lines inside the data section are skipped
                    if is_comment:
                        continue
                    # Read data
                    ntgaf = datobj.get_ntgaf(line)
                    if ntgaf is not None:
                        nts.append(ntgaf)
                    else:
                        ignored.append((lnum, line))
                if datobj is None:
                    # The file had no data lines at all, so the branch above
                    # never ran; store the header that was collected anyway.
                    self.hdr = hdrobj.get_hdr()
        except Exception as inst:
            import traceback
            traceback.print_exc()
            sys.stderr.write(
                "\n  **FATAL in read_gaf: {MSG}\n\n".format(MSG=str(inst)))
            sys.stderr.write("**FATAL: {FIN}[{LNUM}]:\n{L}".format(FIN=fin_gaf,
                                                                   L=line,
                                                                   LNUM=lnum))
            if datobj is not None:
                datobj.prt_line_detail(prt, line)
            sys.exit(1)
        # GAF file has been read
        self._prt_read_summary(prt, fin_gaf, nts, datobj, ignored)
        return self.evobj.sort_nts(nts, 'Evidence_Code')

    def _prt_read_summary(self, prt, fin_gaf, nts, datobj, ignored):
        """Report how many associations were read and how many were ignored.

        The log file of ignored lines is written whenever there are ignored
        lines, even if ``prt`` is None; ``prt`` only controls the summary text.
        """
        fout_log = self._prt_ignored_lines(ignored, datobj,
                                           fin_gaf) if ignored else None
        if prt is None:
            return
        prt.write("  READ    {N:9,} associations: {FIN}\n".format(
            N=len(nts), FIN=fin_gaf))
        if ignored:
            prt.write("  IGNORED {N:9,} associations: {FIN}\n".format(
                N=len(ignored), FIN=fout_log))

    def _prt_ignored_lines(self, ignored, datobj, fin_gaf):
        """Write the ignored lines to ``<fin_gaf>.log``; return that filename."""
        fout_log = "{}.log".format(fin_gaf)
        with open(fout_log, 'w') as prt:
            for lnum, line in ignored:
                self.prt_ignore_line(prt, fin_gaf, line, lnum)
                datobj.prt_line_detail(prt, line)
                prt.write("\n")
        return fout_log

    def prt_summary_anno2ev(self, prt=sys.stdout):
        """Print annotation/evidence code summary.

        Counts the associations per evidence code; associations whose
        Qualifier contains 'NOT' are counted under "NOT <code>". The counts
        are handed to ``self.evobj.prt_ev_cnts`` for printing.
        """
        ctr = cx.Counter()
        for ntgaf in self.associations:
            # A qualifier either contains 'NOT' or it does not; there is no
            # third case to guard against.
            if 'NOT' in ntgaf.Qualifier:
                ctr["NOT {EV}".format(EV=ntgaf.Evidence_Code)] += 1
            else:
                ctr[ntgaf.Evidence_Code] += 1
        self.evobj.prt_ev_cnts(ctr, prt)

    @staticmethod
    def prt_ignore_line(prt, fin_gaf, line, lnum):
        """Print a message saying that we are ignoring an association line."""
        prt.write(
            "**WARNING: BADLY FORMATTED LINE. IGNORED {FIN}[{LNUM}]:\n{L}\n".
            format(FIN=os.path.basename(fin_gaf), L=line, LNUM=lnum))
Beispiel #2
0
class GafReader(object):
    """Read a Gene Annotation File (GAF) and build associations from it.

    When a filename is given, the file is parsed at construction time: the
    header ends up in ``self.hdr``, the GafData helper in ``self.datobj`` and
    the annotation records in ``self.associations``.
    """

    # Only these keyword arguments are kept and forwarded to GafData.
    exp_kwdct = {'allow_missing_symbol'}

    def __init__(self, filename=None, hdr_only=False, prt=sys.stdout, **kws):
        """Store the settings and, if a filename is given, read the file.

        Args:
            filename: Path of the GAF file; None creates an empty reader.
            hdr_only: If True, stop at the first non-header line.
            prt: Stream that receives the "READ" summary line; a falsy value
                suppresses it.
            **kws: Only keys listed in ``exp_kwdct`` are kept.
        """
        # kws: allow_missing_symbol
        self.kws = {k: v for k, v in kws.items() if k in self.exp_kwdct}
        self.filename = filename
        self.evobj = EvidenceCodes()
        # Initialize associations and header information
        self.hdr = None
        self.datobj = None
        self.associations = self._init_assn(
            filename, hdr_only, prt) if filename is not None else []

    def read_gaf(self, **kws):
        """Build ID-to-GO (or GO-to-ID) associations from the loaded records.

        Keyword Args:
            go2geneids: If True, the result maps GO ID -> set of DB_IDs
                instead of the default DB_ID -> set of GO IDs.
            evidence_set: Collection of evidence codes to keep; None keeps all.
            keep_ND: If True, keep annotations whose evidence code is 'ND'.
            keep_NOT: If True, keep annotations whose Qualifier has 'NOT'.
            taxid2asscs: Optional mapping that is filled in place, keyed by
                the first Taxon value, with both 'ID2GOs' and 'GO2IDs'
                entries. It is indexed without creating missing keys here,
                so the caller must pass a structure that supplies them
                (presumably nested defaultdicts -- confirm against callers).

        Returns:
            A ``defaultdict(set)`` holding the simple associations.
        """
        # Simple associations
        id2gos = cx.defaultdict(set)
        # Optional detailed associations split by taxid
        taxid2asscs = kws.get('taxid2asscs', None)
        b_geneid2gos = not kws.get('go2geneids', False)
        evs = kws.get('evidence_set', None)
        eval_nd = self._get_nd(kws.get('keep_ND', False))
        eval_not = self._get_not(kws.get('keep_NOT', False))
        # By default, return id2gos. The caller can have go2geneids
        # returned instead by passing:
        #   >>> read_gaf(go2geneids=True)
        for ntgaf in self.associations:
            if not (eval_nd(ntgaf) and eval_not(ntgaf)):
                continue
            if evs is not None and ntgaf.Evidence_Code not in evs:
                continue
            geneid = ntgaf.DB_ID
            go_id = ntgaf.GO_ID
            if b_geneid2gos:
                id2gos[geneid].add(go_id)
            else:
                id2gos[go_id].add(geneid)
            # Records without a Taxon value are left out of taxid2asscs
            if taxid2asscs is not None and ntgaf.Taxon:
                taxid = ntgaf.Taxon[0]
                taxid2asscs[taxid]['ID2GOs'][geneid].add(go_id)
                taxid2asscs[taxid]['GO2IDs'][go_id].add(geneid)
        return id2gos  # return simple associations

    @staticmethod
    def _get_nd(keep_nd):
        """Return a predicate that accepts all records, or all but 'ND' ones."""
        if keep_nd:
            return lambda nt: True
        return lambda nt: nt.Evidence_Code != 'ND'

    @staticmethod
    def _get_not(keep_not):
        """Return a predicate that accepts all records, or all but 'NOT' ones."""
        if keep_not:
            return lambda nt: True
        return lambda nt: 'NOT' not in nt.Qualifier

    def _init_assn(self, fin_gaf, hdr_only, prt):
        """Read the GAF file, report on it and return the sorted records.

        Returns the result of ``self.evobj.sort_nts(nts, 'Evidence_Code')``.
        """
        nts = self._read_gaf_nts(fin_gaf, hdr_only)
        # GAF file has been read
        if prt:
            prt.write("  READ    {N:9,} associations: {FIN}\n".format(
                N=len(nts), FIN=fin_gaf))
        # If there are illegal GAF lines ...
        if self.datobj:
            if self.datobj.ignored or self.datobj.illegal_lines:
                self.datobj.prt_error_summary(fin_gaf)
        return self.evobj.sort_nts(nts, 'Evidence_Code')

    def _read_gaf_nts(self, fin_gaf, hdr_only):
        """Read a GAF file and return its annotations as namedtuples.

        Leading lines starting with '!' form the header; the first other line
        switches the reader to data mode. Data lines for which
        ``GafData.get_ntgaf`` returns None are appended to ``datobj.ignored``.
        Any exception raised while reading is reported on stderr and the
        process exits with status 1.
        """
        nts = []
        ver = None
        hdrobj = GafHdr()
        datobj = None
        # Placeholders so the error report below works even if open() fails.
        lnum = line = -1
        try:
            with open(fin_gaf) as ifstrm:
                for lnum, line in enumerate(ifstrm, 1):
                    is_comment = line[0] == '!'
                    # Read header
                    if datobj is None:
                        if is_comment:
                            # Only the first version line determines the version
                            if ver is None and line[1:13] == 'gaf-version:':
                                ver = line[13:].strip()
                            hdrobj.chkaddhdr(line)
                            continue
                        # First non-header line: the header is complete
                        self.hdr = hdrobj.get_hdr()
                        if hdr_only:
                            return nts
                        datobj = GafData(ver, **self.kws)
                    # Comment lines inside the data section are skipped
                    if is_comment:
                        continue
                    # Read data
                    ntgaf = datobj.get_ntgaf(line, lnum)
                    if ntgaf is not None:
                        nts.append(ntgaf)
                    else:
                        datobj.ignored.append((lnum, line))
                if datobj is None:
                    # The file had no data lines at all, so the branch above
                    # never ran; store the header that was collected anyway.
                    self.hdr = hdrobj.get_hdr()
        except Exception as inst:
            import traceback
            traceback.print_exc()
            sys.stderr.write("\n  **FATAL: {MSG}\n\n".format(MSG=str(inst)))
            sys.stderr.write("**FATAL: {FIN}[{LNUM}]:\n{L}".format(FIN=fin_gaf,
                                                                   L=line,
                                                                   LNUM=lnum))
            if datobj is not None:
                datobj.prt_line_detail(sys.stdout, line)
            sys.exit(1)
        self.datobj = datobj
        return nts

    def prt_summary_anno2ev(self, prt=sys.stdout):
        """Print annotation/evidence code summary.

        Counts the associations per evidence code; associations whose
        Qualifier contains 'NOT' are counted under "NOT <code>". The counts
        are handed to ``self.evobj.prt_ev_cnts`` for printing.
        """
        ctr = cx.Counter()
        for ntgaf in self.associations:
            # A qualifier either contains 'NOT' or it does not; there is no
            # third case to guard against.
            if 'NOT' in ntgaf.Qualifier:
                ctr["NOT {EV}".format(EV=ntgaf.Evidence_Code)] += 1
            else:
                ctr[ntgaf.Evidence_Code] += 1
        self.evobj.prt_ev_cnts(ctr, prt)
Beispiel #3
0
class GafReader(object):
    """Read a Gene Annotation File (GAF) into a list of namedtuples.

    When a filename is given, the file is parsed at construction time: the
    header ends up in ``self.hdr`` and the annotation records in
    ``self.associations``.
    """

    # Columns shared by all GAF versions.
    #                    Col Req?     Cardinality    Example
    #                    --- -------- -------------- -----------------
    gafhdr = [
        'DB',             #  0 required 1              UniProtKB
        'DB_ID',          #  1 required 1              P12345
        'DB_Symbol',      #  2 required 1              PHO3
        'Qualifier',      #  3 optional 0 or greater   NOT
        'GO_ID',          #  4 required 1              GO:0003993
        'DB_Reference',   #  5 required 1 or greater   PMID:2676709
        'Evidence_Code',  #  6 required 1              IMP
        'With_From',      #  7 optional 0 or greater   GO:0000346
        'Aspect',         #  8 required 1              F
        'DB_Name',        #  9 optional 0 or 1         Toll-like receptor 4
        'DB_Synonym',     # 10 optional 0 or greater   hToll|Tollbooth
        'DB_Type',        # 11 required 1              protein
        'Taxon',          # 12 required 1 or 2         taxon:9606
        'Date',           # 13 required 1              20090118
        'Assigned_By',    # 14 required 1              SGD
    ]

    # Extra columns that only exist in GAF 2.x.
    #                           Col Required Cardinality  Example
    #                           --- -------- ------------ -------------------
    gafhdr2 = [
        'Annotation_Extension',  # 15 optional 0 or greater part_of(CL:0000576)
        'Gene_Product_Form_ID',  # 16 optional 0 or 1       UniProtKB:P12345-2
    ]

    # Column names keyed by the value of the "!gaf-version:" header line
    gaf_columns = {
        "2.1": gafhdr + gafhdr2,  # !gaf-version: 2.1
        "2.0": gafhdr + gafhdr2,  # !gaf-version: 2.0
        "1.0": gafhdr,            # !gaf-version: 1.0
    }

    # Expected numbers of columns for various versions
    gaf_numcol = {"2.1": 17, "2.0": 17, "1.0": 15}

    # Expected values for a Qualifier
    exp_qualifiers = {
        'NOT', 'contributes_to', 'Contributes_to', 'colocalizes_with'}

    def __init__(self, filename=None, hdr_only=False, prt=sys.stdout):
        """Store the settings and, if a filename is given, read the file.

        Args:
            filename: GAF source handed to ``nopen``; None creates an empty
                reader.
            hdr_only: If True, stop at the first non-header line.
            prt: Stream that receives the "READ" summary; None suppresses it.
        """
        self.filename = filename
        self.evobj = EvidenceCodes()
        # Initialize associations and header information
        self.hdr = None
        self.associations = self.read_gaf(filename, hdr_only,
                                          prt) if filename is not None else []

    def prt_summary_anno2ev(self, prt=sys.stdout):
        """Print annotation/evidence code summary.

        Counts the associations per evidence code; associations whose
        Qualifier contains 'NOT' are counted under "NOT <code>". The counts
        are handed to ``self.evobj.prt_ev_cnts`` for printing.
        """
        ctr = cx.Counter()
        for ntgaf in self.associations:
            # A qualifier either contains 'NOT' or it does not; there is no
            # third case to guard against.
            if 'NOT' in ntgaf.Qualifier:
                ctr["NOT {EV}".format(EV=ntgaf.Evidence_Code)] += 1
            else:
                ctr[ntgaf.Evidence_Code] += 1
        self.evobj.prt_ev_cnts(ctr, prt)

    def _get_ntgaf(self, ntgafobj, flds, ver):
        """Build one annotation namedtuple from the split fields of a line.

        Multi-valued columns are split on '|': Taxon becomes a list of ints,
        the other multi-valued columns become sets. Single-valued columns are
        stored as the raw strings.

        Args:
            ntgafobj: namedtuple class whose fields match the GAF version.
            flds: Column values of one data line.
            ver: GAF version string; a leading '2' adds columns 15 and 16.

        Raises:
            AssertionError: If a cardinality, required-value or qualifier
                check fails.
        """
        # _rd_fld_vals returns a list when its third argument is True and a
        # set when it is False.
        as_set = False
        as_list = True
        qualifiers = self._rd_fld_vals("Qualifier", flds[3], as_set)
        db_reference = self._rd_fld_vals("DB_Reference", flds[5], as_set, 1)
        with_from = self._rd_fld_vals("With_From", flds[7], as_set)
        db_name = self._rd_fld_vals("DB_Name", flds[9], as_set, 0, 1)
        db_synonym = self._rd_fld_vals("DB_Synonym", flds[10], as_set)
        taxons = self._rd_fld_vals("Taxon", flds[12], as_list, 1, 2)
        self._chk_qty_eq_1(flds, [0, 1, 2, 4, 6, 8, 11, 13, 14])
        # Additional Formatting
        taxons = self._do_taxons(taxons)
        self._chk_qualifier(qualifiers)
        # Create list of values, one per column and in column order
        gafvals = [
            flds[0],       # 0  DB
            flds[1],       # 1  DB_ID
            flds[2],       # 2  DB_Symbol
            qualifiers,    # 3  Qualifier
            flds[4],       # 4  GO_ID
            db_reference,  # 5  DB_Reference
            flds[6],       # 6  Evidence_Code
            with_from,     # 7  With_From
            flds[8],       # 8  Aspect
            db_name,       # 9  DB_Name
            db_synonym,    # 10 DB_Synonym
            flds[11],      # 11 DB_Type
            taxons,        # 12 Taxon
            flds[13],      # 13 Date
            flds[14],      # 14 Assigned_By
        ]
        # Version 2.x has these additional fields not found in v1.0
        if ver[0] == '2':
            gafvals += [
                self._rd_fld_vals("Annotation_Extension", flds[15], as_set),
                self._rd_fld_vals("Gene_Product_Form_ID", flds[16], as_set),
            ]
        return ntgafobj._make(gafvals)

    def _rd_fld_vals(self,
                     name,
                     val,
                     set_list_ft=True,
                     qty_min=0,
                     qty_max=None):
        """Split a single GAF field on '|' and check the number of entries.

        Args:
            name: Column name, used only in error messages.
            val: Raw field text.
            set_list_ft: True returns a list, False returns a set.
            qty_min: Minimum number of entries.
            qty_max: Maximum number of entries; None means no upper limit.

        Returns:
            The entries as a list or set; an empty one if ``val`` is empty
            and ``qty_min`` is 0.

        Raises:
            AssertionError: If the number of entries is out of range.
        """
        if not val and qty_min == 0:
            return [] if set_list_ft else set()
        vals = val.split('|')  # Use a pipe to separate entries
        num_vals = len(vals)
        assert num_vals >= qty_min, \
            "FIELD({F}): MIN QUANTITY({Q}) WASN'T MET: {V} in {GAF}".format(
                F=name, Q=qty_min, V=vals, GAF=self.filename)
        if qty_max is not None:
            assert num_vals <= qty_max, \
                "FIELD({F}): MAX QUANTITY({Q}) EXCEEDED: {V} in {GAF}".format(
                    F=name, Q=qty_max, V=vals, GAF=self.filename)
        return vals if set_list_ft else set(vals)

    def read_gaf(self, fin_gaf, hdr_only, prt):
        """Read GAF file. HTTP address okay. GZIPPED/BZIPPED file okay.

        Leading lines starting with '!' form the header; the first other line
        fixes the column layout from the "!gaf-version:" value seen so far.
        A file without a version line therefore fails with a KeyError at that
        point.

        Args:
            fin_gaf: GAF source handed to ``nopen``.
            hdr_only: If True, return an empty list once the header is read.
            prt: Stream for the "READ" summary; None suppresses it.

        Returns:
            The result of ``self.evobj.sort_nts(ga_lst, 'Evidence_Code')``, or
            an empty list when ``hdr_only`` stops the read at the first data
            line.
        """
        ga_lst = []
        ver = None
        ntgafobj = None
        exp_numcol = None
        hdrobj = GafHdr()
        ifstrm = nopen(fin_gaf)
        try:
            for line in ifstrm:
                is_comment = line[0] == '!'
                # Read header
                if ntgafobj is None:
                    if is_comment:
                        if line[1:13] == 'gaf-version:':
                            ver = line[13:].strip()
                        hdrobj.chkaddhdr(line)
                        continue
                    # First non-header line: the header is complete
                    self.hdr = hdrobj.get_hdr()
                    if hdr_only:
                        return ga_lst
                    ntgafobj = cx.namedtuple("ntgafobj",
                                             " ".join(self.gaf_columns[ver]))
                    exp_numcol = self.gaf_numcol[ver]
                # '!' marks a comment line in GAF, so one that appears after
                # the data has started is skipped rather than parsed as data.
                if is_comment:
                    continue
                # Read data
                flds = self._split_line(line, exp_numcol)
                ga_lst.append(self._get_ntgaf(ntgafobj, flds, ver))
            if ntgafobj is None:
                # No data line was seen, so the header was never stored above.
                self.hdr = hdrobj.get_hdr()
        finally:
            # Release the stream on every exit path, including the hdr_only
            # return. NOTE(review): nopen is defined elsewhere; the guards
            # assume it may hand back the caller's own object or sys.stdin,
            # neither of which is ours to close -- confirm.
            if ifstrm is not fin_gaf and ifstrm is not sys.stdin:
                close = getattr(ifstrm, 'close', None)
                if close is not None:
                    close()
        # GAF file has been read
        if prt is not None:
            readmsg = "  READ {N:,} associations: {FIN}\n"
            prt.write(readmsg.format(N=len(ga_lst), FIN=fin_gaf))
        return self.evobj.sort_nts(ga_lst, 'Evidence_Code')

    @staticmethod
    def _split_line(line, exp_numcol):
        """Split a tab-separated line into fields and check the field count.

        Raises:
            AssertionError: If the line does not have ``exp_numcol`` fields.
        """
        # Strip only the line ending so empty trailing columns are preserved
        flds = line.rstrip('\r\n').split('\t')
        assert len(flds) == exp_numcol, "UNEXPECTED NUMBER OF COLUMNS"
        return flds

    def _chk_qualifier(self, qualifiers):
        """Check that qualifiers are expected values."""
        # http://geneontology.org/page/go-annotation-conventions#qual
        for qual in qualifiers:
            assert qual in self.exp_qualifiers, "UNEXPECTED QUALIFIER({Q}) IN {GAF}".format(
                Q=qual, GAF=self.filename)

    @staticmethod
    def _chk_qty_eq_1(flds, col_lst):
        """Check that each of the given required columns is non-empty."""
        for col in col_lst:
            assert flds[
                col], "UNEXPECTED REQUIRED VALUE({V}) AT INDEX({R})".format(
                    V=flds[col], R=col)

    @staticmethod
    def _do_taxons(taxons):
        """Convert "taxon:<id>" strings to ints; expects one or two of them."""
        taxons = [int(v[6:]) for v in taxons]  # strip "taxon:"
        num_taxons = len(taxons)
        assert num_taxons == 1 or num_taxons == 2
        return taxons
Beispiel #4
0
class GafReader(object):
    """Read a Gene Annotation File (GAF) into a list of namedtuples.

    When a filename is given, the file is parsed at construction time and the
    annotation records are stored in ``self.associations``.
    """

    # Columns shared by all GAF versions.
    #                    Col Req?     Cardinality    Example
    #                    --- -------- -------------- -----------------
    gafhdr = [
        "DB",             #  0 required 1              UniProtKB
        "DB_ID",          #  1 required 1              P12345
        "DB_Symbol",      #  2 required 1              PHO3
        "Qualifier",      #  3 optional 0 or greater   NOT
        "GO_ID",          #  4 required 1              GO:0003993
        "DB_Reference",   #  5 required 1 or greater   PMID:2676709
        "Evidence_Code",  #  6 required 1              IMP
        "With_From",      #  7 optional 0 or greater   GO:0000346
        "Aspect",         #  8 required 1              F
        "DB_Name",        #  9 optional 0 or 1         Toll-like receptor 4
        "DB_Synonym",     # 10 optional 0 or greater   hToll|Tollbooth
        "DB_Type",        # 11 required 1              protein
        "Taxon",          # 12 required 1 or 2         taxon:9606
        "Date",           # 13 required 1              20090118
        "Assigned_By",    # 14 required 1              SGD
    ]

    # Extra columns that only exist in GAF 2.x.
    #                           Col Required Cardinality  Example
    #                           --- -------- ------------ -------------------
    gafhdr2 = [
        "Annotation_Extension",  # 15 optional 0 or greater part_of(CL:0000576)
        "Gene_Product_Form_ID",  # 16 optional 0 or 1       UniProtKB:P12345-2
    ]

    # Column names keyed by the value of the "!gaf-version:" header line
    gaf_columns = {
        "2.1": gafhdr + gafhdr2,  # !gaf-version: 2.1
        "2.0": gafhdr + gafhdr2,  # !gaf-version: 2.0
        "1.0": gafhdr,            # !gaf-version: 1.0
    }

    # Expected numbers of columns for various versions
    gaf_numcol = {"2.1": 17, "2.0": 17, "1.0": 15}

    # Expected values for a Qualifier
    exp_qualifiers = {"NOT", "contributes_to", "colocalizes_with"}

    def __init__(self, filename=None, log=sys.stdout):
        """Store the settings and, if a filename is given, read the file.

        Args:
            filename: GAF source handed to ``nopen``; None creates an empty
                reader.
            log: Stream that receives the "READ" summary; None suppresses it.
        """
        self.filename = filename
        self.log = log
        self.evobj = EvidenceCodes()
        self.associations = self.read_gaf(filename) if filename is not None else []

    def prt_summary_anno2ev(self, prt=sys.stdout):
        """Print annotation/evidence code summary.

        Counts the associations per evidence code; associations whose
        Qualifier contains "NOT" are counted under "NOT <code>". The counts
        are handed to ``self.evobj.prt_ev_cnts`` for printing.
        """
        ctr = cx.Counter()
        for ntgaf in self.associations:
            # A qualifier either contains "NOT" or it does not; there is no
            # third case to guard against.
            if "NOT" in ntgaf.Qualifier:
                ctr["NOT {EV}".format(EV=ntgaf.Evidence_Code)] += 1
            else:
                ctr[ntgaf.Evidence_Code] += 1
        self.evobj.prt_ev_cnts(ctr, prt)

    def _get_ntgaf(self, ntgafobj, flds, ver):
        """Build one annotation namedtuple from the split fields of a line.

        Multi-valued columns are split on "|": Taxon becomes a list of ints,
        the other multi-valued columns become sets. Single-valued columns are
        stored as the raw strings.

        Args:
            ntgafobj: namedtuple class whose fields match the GAF version.
            flds: Column values of one data line.
            ver: GAF version string; a leading "2" adds columns 15 and 16.

        Raises:
            AssertionError: If a cardinality, required-value or qualifier
                check fails.
        """
        # _rd_fld_vals returns a list when its third argument is True and a
        # set when it is False.
        as_set = False
        as_list = True
        qualifiers = self._rd_fld_vals("Qualifier", flds[3], as_set)
        db_reference = self._rd_fld_vals("DB_Reference", flds[5], as_set, 1)
        with_from = self._rd_fld_vals("With_From", flds[7], as_set)
        db_name = self._rd_fld_vals("DB_Name", flds[9], as_set, 0, 1)
        db_synonym = self._rd_fld_vals("DB_Synonym", flds[10], as_set)
        taxons = self._rd_fld_vals("Taxon", flds[12], as_list, 1, 2)
        self._chk_qty_eq_1(flds, [0, 1, 2, 4, 6, 8, 11, 13, 14])
        # Additional Formatting
        taxons = self._do_taxons(taxons)
        self._chk_qualifier(qualifiers)
        # Create list of values, one per column and in column order
        gafvals = [
            flds[0],       # 0  DB
            flds[1],       # 1  DB_ID
            flds[2],       # 2  DB_Symbol
            qualifiers,    # 3  Qualifier
            flds[4],       # 4  GO_ID
            db_reference,  # 5  DB_Reference
            flds[6],       # 6  Evidence_Code
            with_from,     # 7  With_From
            flds[8],       # 8  Aspect
            db_name,       # 9  DB_Name
            db_synonym,    # 10 DB_Synonym
            flds[11],      # 11 DB_Type
            taxons,        # 12 Taxon
            flds[13],      # 13 Date
            flds[14],      # 14 Assigned_By
        ]
        # Version 2.x has these additional fields not found in v1.0
        if ver[0] == "2":
            gafvals += [
                self._rd_fld_vals("Annotation_Extension", flds[15], as_set),
                self._rd_fld_vals("Gene_Product_Form_ID", flds[16], as_set),
            ]
        return ntgafobj._make(gafvals)

    @staticmethod
    def _rd_fld_vals(name, val, set_list_ft=True, qty_min=0, qty_max=None):
        """Split a single GAF field on "|" and check the number of entries.

        Args:
            name: Column name, used only in error messages.
            val: Raw field text.
            set_list_ft: True returns a list, False returns a set.
            qty_min: Minimum number of entries.
            qty_max: Maximum number of entries; None means no upper limit.

        Returns:
            The entries as a list or set; an empty one if ``val`` is empty
            and ``qty_min`` is 0.

        Raises:
            AssertionError: If the number of entries is out of range.
        """
        if not val and qty_min == 0:
            return [] if set_list_ft else set()
        vals = val.split("|")  # Use a pipe to separate entries
        num_vals = len(vals)
        assert num_vals >= qty_min, "FLD({F}): MIN QUANTITY({Q}) NOT MET: {V}".format(F=name, Q=qty_min, V=vals)
        if qty_max is not None:
            assert num_vals <= qty_max, "FLD({F}): MAX QUANTITY({Q}) EXCEEDED: {V}".format(F=name, Q=qty_max, V=vals)
        return vals if set_list_ft else set(vals)

    def read_gaf(self, fin_gaf):
        """Read GAF file. HTTP address okay. GZIPPED/BZIPPED file okay.

        The column layout is fixed by the first "!gaf-version:" line. Lines
        that come before it are skipped, as are all other lines starting
        with "!".

        Args:
            fin_gaf: GAF source handed to ``nopen``.

        Returns:
            The result of ``self.evobj.sort_nts(ga_lst, "Evidence_Code")``.
        """
        ga_lst = []
        ver = None
        ntgafobj = None
        exp_numcol = None
        ifstrm = nopen(fin_gaf)
        try:
            for line in ifstrm:
                if ntgafobj is not None and not line.startswith("!"):
                    flds = self._split_line(line, exp_numcol)
                    ga_lst.append(self._get_ntgaf(ntgafobj, flds, ver))
                elif ntgafobj is None and line.startswith("!gaf-version:"):
                    ver = line[13:].strip()
                    ntgafobj = cx.namedtuple("ntgafobj", " ".join(self.gaf_columns[ver]))
                    exp_numcol = self.gaf_numcol[ver]
        finally:
            # Release the stream even when parsing fails. NOTE(review): nopen
            # is defined elsewhere; the guards assume it may hand back the
            # caller's own object or sys.stdin, neither of which is ours to
            # close -- confirm.
            if ifstrm is not fin_gaf and ifstrm is not sys.stdin:
                close = getattr(ifstrm, "close", None)
                if close is not None:
                    close()
        # The constructor accepts log=None; do not crash on it
        if self.log is not None:
            self.log.write("  READ {N:,} associations: {FIN}\n".format(N=len(ga_lst), FIN=fin_gaf))
        return self.evobj.sort_nts(ga_lst, "Evidence_Code")

    @staticmethod
    def _split_line(line, exp_numcol):
        """Split a tab-separated line into fields and check the field count.

        Raises:
            AssertionError: If the line does not have ``exp_numcol`` fields.
        """
        # Strip only the line ending so empty trailing columns are preserved
        flds = line.rstrip("\r\n").split("\t")
        assert len(flds) == exp_numcol, "UNEXPECTED NUMBER OF COLUMNS"
        return flds

    def _chk_qualifier(self, qualifiers):
        """Check that qualifiers are expected values."""
        # http://geneontology.org/page/go-annotation-conventions#qual
        for qual in qualifiers:
            assert qual in self.exp_qualifiers, "UNEXPECTED QUALIFIER({Q})".format(Q=qual)

    @staticmethod
    def _chk_qty_eq_1(flds, col_lst):
        """Check that each of the given required columns is non-empty."""
        for col in col_lst:
            assert flds[col], "UNEXPECTED REQUIRED VALUE({V}) AT INDEX({R})".format(V=flds[col], R=col)

    @staticmethod
    def _do_taxons(taxons):
        """Convert "taxon:<id>" strings to ints; expects one or two of them."""
        taxons = [int(v[6:]) for v in taxons]  # strip "taxon:"
        num_taxons = len(taxons)
        assert num_taxons == 1 or num_taxons == 2
        return taxons