Esempio n. 1
0
    def from_line(cls, line: str) -> "TRNAScanRecord":
        """ Build a record from a single whitespace-delimited line.

        The line is right-stripped, split with `MULTISPACE_REGEX`, and the
        resulting values are paired positionally with `cls.column_order`.
        A line may have one column fewer than `cls.column_order`; in that
        case the last column name gets no value (presumably `note`, which
        then comes through as `None`).

        Keyword arguments:
        line -- The line to parse.

        Returns:
        A new instance of `cls`.

        Raises:
        LineParseError -- If the number of columns is neither
            `len(cls.column_order)` nor one less than that. The individual
            field parsers presumably raise on malformed values as well.
        """

        def _int_list(raw: str, field_name: str) -> List[int]:
            # Comma separated integers, tolerant of padding around each item.
            return [
                parse_int(piece.strip(), field_name)
                for piece in raw.split(",")
            ]

        values = MULTISPACE_REGEX.split(line.rstrip())
        n_values = len(values)
        n_expected = len(cls.column_order)

        if n_values not in (n_expected, n_expected - 1):
            raise LineParseError(
                "Line had the wrong number of columns. "
                f"Expected {n_expected} or "
                f"{n_expected - 1} but got {n_values}.")

        # zip stops at the shorter sequence, so a missing trailing column
        # simply leaves its key out of the mapping.
        record: Dict[str, str] = {}
        for column, value in zip(cls.column_order, values):
            record[column.strip()] = value.strip()

        start = parse_int(record["start"], "start")
        end = parse_int(record["end"], "end")
        num = parse_int(record["num"], "num")
        infernal_score = parse_float(record["infernal_score"],
                                     "infernal_score")

        # A "0" in both intron columns is treated as "no introns".
        has_no_introns = (record["intron_starts"] == "0"
                          and record["intron_ends"] == "0")

        intron_starts: List[int] = []
        intron_ends: List[int] = []
        if not has_no_introns:
            intron_starts = _int_list(record["intron_starts"],
                                      "intron_starts")
            intron_ends = _int_list(record["intron_ends"], "intron_ends")

        seqid = parse_string_not_empty(record["seqid"], "seqid")
        trna_type = parse_string_not_empty(record["trna_type"], "trna_type")
        anticodon = parse_string_not_empty(record["anticodon"], "anticodon")

        return cls(
            seqid=seqid,
            start=start,
            end=end,
            trna_type=trna_type,
            anticodon=anticodon,
            num=num,
            intron_starts=intron_starts,
            intron_ends=intron_ends,
            infernal_score=infernal_score,
            note=record.get("note"),
        )
Esempio n. 2
0
    def from_line(cls, line: str) -> "Coords":
        """ Parse one tab-separated line into a `Coords` record.

        The first 13 columns are read, in order, as: reference_start,
        reference_end, query_start, query_end, reference_alnlen,
        query_alnlen, pid, reference_len, query_len, reference_cov,
        query_cov, reference, query. Any further columns are ignored.

        Start coordinates are decremented by one so that ranges become
        0-based and end-exclusive (this assumes the input is 1-based and
        inclusive). When the query start is greater than the query end the
        match is recorded on the minus strand and the two query coordinates
        are exchanged so that start <= end.

        Keyword arguments:
        line -- The line to parse.

        Returns:
        A new instance of `cls`.

        Raises:
        LineParseError -- If the line is empty, has fewer than 13 columns,
            or has a reference start greater than its reference end. The
            individual field parsers presumably raise on malformed values.
        """

        stripped = line.strip()
        if stripped == "":
            raise LineParseError("The line was empty")

        sline = stripped.split("\t")

        if len(sline) < 13:
            raise LineParseError("The line had the wrong number of columns. "
                                 f"Expected at least 13 but got {len(sline)}.")

        rstart = parse_int(sline[0], "reference_start")
        rend = parse_int(sline[1], "reference_end")

        # This shouldn't ever happen AFAIK, but it is a property of the
        # input data, so report it as a parse error rather than using
        # `assert` (which is stripped when python runs with -O).
        if rstart > rend:
            raise LineParseError(
                f"The reference start ({rstart}) is greater than the "
                f"reference end ({rend}). Offending line: {line!r}")
        rstart -= 1

        qstart = parse_int(sline[2], "query_start")
        qend = parse_int(sline[3], "query_end")

        if qstart > qend:
            # Reverse strand match: the smaller coordinate becomes the
            # (decremented) start and the larger one becomes the end.
            strand = Strand.MINUS
            qstart, qend = qend - 1, qstart
        else:
            strand = Strand.PLUS
            qstart -= 1

        # This isn't as type safe as I'd like
        return cls(
            rstart,
            rend,
            qstart,
            qend,
            strand,
            parse_int(sline[4], "reference_alnlen"),
            parse_int(sline[5], "query_alnlen"),
            parse_float(sline[6], "pid"),
            parse_int(sline[7], "reference_len"),
            parse_int(sline[8], "query_len"),
            parse_float(sline[9], "reference_cov"),
            parse_float(sline[10], "query_cov"),
            parse_string_not_empty(sline[11], "reference"),
            parse_string_not_empty(sline[12], "query"),
        )
Esempio n. 3
0
    def from_line(cls, line: str) -> "DomTbl":
        """ Parse one whitespace-delimited line into a `DomTbl` record.

        The line is split with `MULTISPACE_REGEX` at most 22 times, so it
        yields up to 23 fields and the last one (the description) keeps any
        internal whitespace. The description column is optional: a line
        with only the 22 leading columns, or with a description of "-" or
        "", produces a record whose description is `None`.

        Keyword arguments:
        line -- The line to parse.

        Returns:
        A new instance of `cls`.

        Raises:
        LineParseError -- If the line is empty or does not have 22 or 23
            columns. The individual field parsers presumably raise on
            malformed values as well.
        """

        stripped = line.strip()
        if stripped == "":
            raise LineParseError("The line was empty.")

        # The description is allowed to contain spaces, so splitting stops
        # after the 22 leading columns and leaves the rest in one piece.
        sline = MULTISPACE_REGEX.split(stripped, maxsplit=22)
        if len(sline) not in (22, 23):
            # maxsplit rules out more than 23 fields, so in practice this
            # only fires for lines that are too short.
            raise LineParseError("The line had the wrong number of columns. "
                                 f"Expected 22 or 23 but got {len(sline)}")

        # Only index the description column when it is actually there;
        # a 22 column line has no element 22.
        if len(sline) == 22 or sline[22] in ("-", ""):
            description: Optional[str] = None
        else:
            description = sline[22]

        return cls(parse_string_not_empty(sline[0], "target_name"),
                   parse_string_not_empty(sline[1], "target_acc"),
                   parse_int(sline[2], "target_len"),
                   parse_string_not_empty(sline[3], "query_name"),
                   parse_string_not_empty(sline[4], "query_acc"),
                   parse_int(sline[5], "query_len"),
                   parse_float(sline[6], "full_evalue"),
                   parse_float(sline[7], "full_score"),
                   parse_float(sline[8], "full_bias"),
                   parse_int(sline[9], "match_num"),
                   parse_int(sline[10], "nmatches"),
                   parse_float(sline[11], "domain_c_evalue"),
                   parse_float(sline[12], "domain_i_evalue"),
                   parse_float(sline[13], "domain_score"),
                   parse_float(sline[14], "domain_bias"),
                   parse_int(sline[15], "hmm_from"),
                   parse_int(sline[16], "hmm_to"),
                   parse_int(sline[17], "ali_from"),
                   parse_int(sline[18], "ali_to"),
                   parse_int(sline[19], "env_from"),
                   parse_int(sline[20], "env_to"),
                   parse_float(sline[21], "acc"), description)
Esempio n. 4
0
    def from_line(cls, line: str) -> 'PAF':
        """ Parse one tab-separated line into a `PAF` record.

        The leading fields are matched positionally against the names
        returned by `cls.columns()`. Any fields after those are passed
        through, unparsed, as a list of strings in the final constructor
        argument.

        Keyword arguments:
        line -- The line to parse.

        Returns:
        A new instance of `cls`.

        Raises:
        LineParseError -- If the line has fewer fields than
            `cls.columns()` has names. The individual field parsers
            presumably raise on malformed values as well.
        """

        fields = line.strip().split("\t")
        column_names = cls.columns()
        n_columns = len(column_names)

        if len(fields) < n_columns:
            raise LineParseError(
                "The line had the wrong number of columns. "
                f"Expected at least {n_columns} but got {len(fields)}."
            )

        named = dict(zip(column_names, fields))
        # Whatever follows the fixed columns is kept as raw strings.
        extra = fields[n_columns:]

        query = parse_string_not_empty(named["query"], "query")
        qlen = parse_int(named["qlen"], "qlen")
        qstart = parse_int(named["qstart"], "qstart")
        qend = parse_int(named["qend"], "qend")
        strand = is_one_of(named["strand"], ["+", "-"], "strand")
        target = parse_string_not_empty(named["target"], "target")
        tlen = parse_int(named["tlen"], "tlen")
        tstart = parse_int(named["tstart"], "tstart")
        tend = parse_int(named["tend"], "tend")
        nmatch = parse_int(named["nmatch"], "nmatch")
        alilen = parse_int(named["alilen"], "alilen")
        mq = parse_int(named["mq"], "mq")

        return cls(
            query,
            qlen,
            qstart,
            qend,
            strand,
            target,
            tlen,
            tstart,
            tend,
            nmatch,
            alilen,
            mq,
            extra,
        )
Esempio n. 5
0
    def parse(
        cls,
        string: str,
        attr: Type[AttrT] = cast(Type[AttrT], GFF3Attributes),
        strip_quote: bool = False,
        unescape: bool = False,
    ) -> "GFFRecord[AttrT]":
        """ Parse a gff line string as a `GFFRecord`.

        Keyword arguments:
        string -- The gff line to parse.
        attr -- The class used to parse the attributes column. Its `parse`
            method is called with the raw attributes string and the
            `strip_quote` and `unescape` options. Defaults to
            `GFF3Attributes`.
        strip_quote -- Strip quotes from attributes values. The specification
            says that they should not be stripped, so we don't by default.
        unescape -- Unescape reserved characters in the attributes to their
            original values. I.E. some commas, semicolons, newlines etc.

        Returns:
        A `GFFRecord`. The start coordinate is converted so that the
        interval is 0-based and end-exclusive, and start and end are
        reordered if the line gives them in reverse.

        Raises:
        ValueError -- If the line is missing more than one column. The
            individual field parsers presumably raise on malformed values.
        """

        sline = string.strip().split("\t")
        sline_len = len(sline)
        columns_len = len(cls.columns)

        # Stripping the line removes a trailing tab, so a record with an
        # empty attributes column ends up exactly one field short.
        missing_attributes = sline_len == columns_len - 1

        if missing_attributes:
            logger.warning("Line has too few columns. "
                           "Probably it is missing the attributes")
        elif sline_len < columns_len:
            raise ValueError(
                ("Line has too few columns. "
                 f"Expected: {columns_len}, Encountered: {sline_len}"))
        elif sline_len > columns_len:
            # Surplus fields are dropped by the zip below.
            logger.warning(
                "Line has too many columns. Expected: %s, Encountered: %s",
                columns_len, sline_len)

        fields: Dict[str, str] = dict(zip(cls.columns, sline))
        if missing_attributes:
            fields["attributes"] = ""

        # 0-based indexing exclusive
        start = parse_int(fields["start"], "start") - 1
        end = parse_int(fields["end"], "end")

        if start > end:
            # The coordinates were given in reverse. `start` has already
            # been shifted down by one, so a plain swap would be off by one
            # at both ends: the original `end` is the lower bound and still
            # needs shifting, and `start + 1` recovers the original upper
            # bound, which is the exclusive end.
            start, end = end - 1, start + 1

        score = parse_or_none(fields["score"], "score", ".", parse_float)

        strand = Strand.parse(
            is_one_of(fields["strand"], ["-", "+", ".", "?"], "strand"))

        phase = Phase.parse(
            is_one_of(fields["phase"], ["0", "1", "2", "."], "phase"))

        attributes = cast(
            AttrT,
            attr.parse(
                fields["attributes"],
                strip_quote=strip_quote,
                unescape=unescape,
            ))

        return cls(parse_string_not_empty(fields["seqid"], "seqid"),
                   parse_string_not_empty(fields["source"], "source"),
                   parse_string_not_empty(fields["type"], "type"), start, end,
                   score, strand, phase, attributes)