Ejemplo n.º 1
0
def match_to_anticodon(
    match: TRNAScanRecord,
    ss: TRNAScanSS,
    source: str,
    type: str = "anticodon",
    parents: Sequence[GFF3Record] = ()
) -> GFF3Record:
    """Build the GFF3 record for the anticodon of a tRNAscan match.

    Args:
        match: The tRNAscan result; supplies the seqid, the score
            (``infernal_score``) and the number used in the record ID.
        ss: The secondary-structure entry; supplies the anticodon
            coordinates.
        source: Value for the GFF3 source column.
        type: Value for the GFF3 type column; also embedded in the ID.
        parents: Records to attach as parents. Only parents that carry a
            non-None attribute ID are listed in the ``Parent`` attribute,
            but all of them are handed to the new record. The default is
            an immutable empty tuple so no list object is ever shared
            between calls.

    Returns:
        The newly created anticodon record.
    """
    # fix_strand returns the coordinates together with a strand.
    # NOTE(review): presumably it orders start/end and derives the strand
    # from their original order -- confirm against its definition.
    start, end, strand = fix_strand(ss.anticodon_start, ss.anticodon_end)

    parent_ids = [
        p.attributes.id
        for p in parents
        if p.attributes is not None and p.attributes.id is not None
    ]

    return GFF3Record(
        seqid=match.seqid,
        source=source,
        type=type,
        start=start,
        end=end,
        score=match.infernal_score,
        strand=strand,
        phase=Phase.NOT_CDS,
        attributes=GFF3Attributes(
            id=f"{match.seqid}.{type}{match.num}",
            parent=parent_ids,
        ),
        # Pass a fresh list so the record never aliases the caller's
        # sequence (or a shared default).
        parents=list(parents),
    )
Ejemplo n.º 2
0
def deal_with_kids(
    children: Iterator[GFF3Record],
    type_: str,
    length: int,
    aligned: DefaultDict[str, int],
) -> Tuple[int, DefaultDict[str, int]]:
    """Accumulate feature length and per-source hint support over children.

    Args:
        children: Records to scan.
        type_: Only children of exactly this type are counted.
        length: Starting value of the running length total.
        aligned: Running per-source totals; updated in place.

    Returns:
        The updated length and the same ``aligned`` mapping.

    For each counted child, the ``hint`` custom attribute (if present) is
    split on commas (``%2C`` is treated as a comma), every piece is parsed
    with ``parse_hint``, and for each hint source the hint with the largest
    ``naln`` (the later one on ties) contributes its ``naln`` to ``aligned``.
    """
    for record in children:
        # Every visited child ends up with an attributes object, including
        # the ones that are skipped just below.
        if record.attributes is None:
            record.attributes = GFF3Attributes()

        if record.type != type_:
            continue

        length += record.length()

        raw_hints = record.attributes.custom.get("hint", None)
        if raw_hints is None:
            continue

        top_by_source: Dict[str, Hint] = {}
        for piece in raw_hints.replace("%2C", ",").split(","):
            candidate = parse_hint(piece)
            key = candidate.source

            # Keep the stored hint only when it is strictly better.
            if (key not in top_by_source
                    or not (top_by_source[key].naln > candidate.naln)):
                top_by_source[key] = candidate

        for src, best in top_by_source.items():
            aligned[src] += best.naln

    return length, aligned
Ejemplo n.º 3
0
def match_to_introns(
    match: TRNAScanRecord,
    source: str,
    type: str = "tRNA_intron",
    parents: Sequence[GFF3Record] = (),
) -> List[GFF3Record]:
    """Build one GFF3 record per intron of a tRNAscan match.

    Args:
        match: The tRNAscan result; ``intron_starts`` and ``intron_ends``
            are paired up positionally to give the intron coordinates.
        source: Value for the GFF3 source column.
        type: Value for the GFF3 type column; also embedded in the ID.
        parents: Records to attach as parents of every intron. Only
            parents with a non-None attribute ID are listed in the
            ``Parent`` attribute. The default is an immutable empty tuple
            so no list object is shared between calls.

    Returns:
        The intron records, in the order of the match's intron lists.
        The list is empty when the match has no introns.
    """
    parent_ids = [
        p.attributes.id
        for p in parents
        if p.attributes is not None and p.attributes.id is not None
    ]

    introns = []
    for istart, iend in zip(match.intron_starts, match.intron_ends):
        # NOTE(review): fix_strand presumably orders the coordinates and
        # derives the strand -- confirm against its definition.
        start, end, strand = fix_strand(istart, iend)
        introns.append(GFF3Record(
            seqid=match.seqid,
            source=source,
            type=type,
            start=start,
            end=end,
            score=match.infernal_score,
            strand=strand,
            phase=Phase.NOT_CDS,
            attributes=GFF3Attributes(
                # The ID has no per-intron index, so all introns of one
                # match share the same ID.
                id=f"{match.seqid}.{type}{match.num}",
                # Fresh list per record so attributes are not aliased.
                parent=list(parent_ids),
            ),
            # Fresh list per record: the records must not share one
            # parents list with each other or with the caller.
            parents=list(parents),
        ))
    return introns
Ejemplo n.º 4
0
def transform_child(
    feature: GFF3Record,
    group_name: str,
    gff_to_hints: Dict[str, str],
    type_to_trim: Dict[str, int],
    type_to_priority: Dict[str, int],
    source: str,
    priority: int,
) -> Optional[GFF3Record]:
    """Convert a regular feature into a hint record.

    Works on a shallow copy, so the record passed in is not modified.

    Args:
        feature: The record to convert.
        group_name: Stored as the ``group`` custom attribute.
        gff_to_hints: Maps GFF types to hint types.
        type_to_trim: Per hint type, the amount passed to ``trim_ends``
            (0 when the type has no entry).
        type_to_priority: Per hint type, a boost added to ``priority``.
        source: Stored as the ``source`` custom attribute.
        priority: Base priority of the hint.

    Returns:
        The converted copy, or None when no hint type could be found for
        the feature's type.

    Raises:
        KeyError: If the hint type has no entry in ``type_to_priority``.
    """
    feature = copy(feature)

    # Types known to gff_to_hints are used directly; anything else is
    # first translated through the module-level GFF_TYPE_MAP.
    mapped_type = (
        feature.type
        if feature.type in gff_to_hints
        else GFF_TYPE_MAP.get(feature.type, None)
    )

    # NOTE(review): applicative is defined elsewhere; presumably it only
    # applies the function when mapped_type is not None -- confirm.
    hint_type: Optional[str] = applicative(
        lambda t: gff_to_hints.get(t, None),
        mapped_type,
    )
    if hint_type is None:
        return None

    feature.type = hint_type
    feature.trim_ends(type_to_trim.get(feature.type, 0))

    boosted_priority = priority + type_to_priority[feature.type]

    # The copy's attributes are replaced wholesale by the hint attributes.
    feature.attributes = GFF3Attributes(custom={
        "source": source,
        "group": group_name,
        "priority": str(boosted_priority),
    })
    return feature
Ejemplo n.º 5
0
 def as_gffrecord(self,
                  source="MUMmer",
                  type="nucleotide_match") -> GFF3Record:
     """Represent this alignment as a GFF3 record on the reference.

     The record is placed on ``self.ref`` from ``self.rstart`` to
     ``self.rend`` and scored with ``self.qcov``. The query name and
     query coordinates go into the ``Target`` attribute; the remaining
     alignment values are stored, as strings, in custom attributes.

     Args:
         source: Value for the GFF3 source column.
         type: Value for the GFF3 type column.

     Returns:
         The new record.
     """
     target = Target(self.query, self.qstart, self.qend)

     # Custom attribute values are all stringified.
     custom = {
         "pid": str(self.pid),
         "contig_id": str(self.query),
         "contig_coverage": str(self.qcov),
         "contig_length": str(self.qlen),
         "contig_alignment_length": str(self.qalnlen),
         "scaffold_alignment_length": str(self.ralnlen),
     }

     return GFF3Record(
         self.ref,
         source,
         type,
         self.rstart,
         self.rend,
         score=self.qcov,
         strand=self.strand,
         attributes=GFF3Attributes(target=target, custom=custom),
     )
Ejemplo n.º 6
0
def main():
    """Flag poorly supported features in a GFF and split the file.

    For every record of type ``args.group_level``:

    * a record with an ``antifam_match`` custom attribute is marked
      ``is_unreliable`` and ``should_exclude``;
    * hint coverage is computed over its children of type ``args.type``;
      a record is "not supported" when no source exceeds
      ``args.min_cov`` or every such source is in ``args.exclude``;
    * a not-supported record is marked ``is_unreliable`` and, unless
      ``is_novel_locus`` reports it as novel, ``should_exclude``;
    * if ``args.stats`` is set, one JSON line per record is written to it.

    Finally the GFF is split with ``split_gffs``; the kept part goes to
    ``args.outfile`` and, if ``args.filtered`` is given, the dropped part
    goes there.
    """
    args = cli(sys.argv[0], sys.argv[1:])
    gff = GFF.parse(args.infile)

    itree = gff_to_itree(gff.select_type(args.group_level))

    for feature in gff.select_type(args.group_level):
        if feature.attributes is None:
            feature.attributes = GFF3Attributes()

        has_antifam_hit = "antifam_match" in feature.attributes.custom
        if has_antifam_hit:
            feature.attributes.custom["is_unreliable"] = "true"
            feature.attributes.custom["should_exclude"] = "true"

        length, aligned = deal_with_kids(
            gff.traverse_children([feature]),
            args.type,
            0,
            defaultdict(int),
        )

        coverages = find_coverages(aligned, length)
        well_covered = [
            src for src, cov in coverages.items() if cov > args.min_cov
        ]
        unsupported = (
            not well_covered
            or all(src in args.exclude for src in well_covered)
        )

        novel = is_novel_locus(feature, itree, args.threshold)

        if unsupported:
            feature.attributes.custom["is_unreliable"] = "true"
            if not novel:
                feature.attributes.custom["should_exclude"] = "true"

        if args.stats:
            # The coverage mapping itself is reused as the stats row, so
            # the per-source coverages come first in the JSON object.
            stats_row = coverages
            extra_fields = (
                ("id", feature.attributes.id),
                ("length", length),
                ("is_supported", not unsupported),
                ("is_novel_locus", novel),
                ("antifam_match", has_antifam_hit),
                ("excluded", has_antifam_hit or (unsupported and not novel)),
            )
            for key, value in extra_fields:
                stats_row[key] = value

            print(json.dumps(stats_row), file=args.stats)

    kept, dropped = split_gffs(gff, args.group_level)

    write_gff(kept, args.outfile)

    if args.filtered is not None:
        write_gff(dropped, args.filtered)
Ejemplo n.º 7
0
def match_to_trna(
    match: TRNAScanRecord,
    ss: TRNAScanSS,
    source: str,
    type_map: Mapping[str, str] = TYPE_MAP,
    parents: Sequence[GFF3Record] = ()
) -> GFF3Record:
    """Build the GFF3 tRNA record for a tRNAscan match.

    Args:
        match: The tRNAscan result; supplies coordinates, score, note,
            anticodon, tRNA type and the number used in the record ID.
        ss: The secondary-structure entry; its ``ss`` value is stored in
            the ``secondary_structure`` custom attribute.
        source: Value for the GFF3 source column.
        type_map: Maps the lower-cased ``match.trna_type`` to a GFF3
            type; types without an entry become ``"tRNA"``.
        parents: Records to attach as parents. Only parents with a
            non-None attribute ID are listed in the ``Parent`` attribute.
            The default is an immutable empty tuple so no list object is
            shared between calls.

    Returns:
        The newly created tRNA record.
    """
    # NOTE(review): fix_strand presumably orders the coordinates and
    # derives the strand -- confirm against its definition.
    start, end, strand = fix_strand(match.start, match.end)

    parent_ids = [
        p.attributes.id
        for p in parents
        if p.attributes is not None and p.attributes.id is not None
    ]

    # A missing or empty note yields no Note attribute values at all.
    notes: List[str] = (
        [] if match.note is None or match.note == "" else [match.note]
    )

    return GFF3Record(
        seqid=match.seqid,
        source=source,
        type=type_map.get(match.trna_type.lower(), "tRNA"),
        start=start,
        end=end,
        score=match.infernal_score,
        strand=strand,
        phase=Phase.NOT_CDS,
        attributes=GFF3Attributes(
            id=f"{match.seqid}.tRNA{match.num}",
            parent=parent_ids,
            note=notes,
            custom={
                "secondary_structure": ss.ss,
                "anticodon": match.anticodon,
                "amino_acid": match.trna_type,
            },
        ),
        # Pass a fresh list so the record never aliases the caller's
        # sequence (or a shared default).
        parents=list(parents),
    )
Ejemplo n.º 8
0
    def infer_from_children(
        cls,
        children: Sequence["GFF3Record"],
        id: Optional[str] = None,
        seqid: Optional[str] = None,
        source: str = ".",
        type: str = ".",
        strand: Optional[Strand] = None,
        score: Optional[float] = None,
        phase: Phase = Phase.NOT_CDS,
    ) -> "GFF3Record":
        """Create a parent record that spans the given child records.

        The new record starts at the smallest child start and ends at the
        largest child end, and receives ``children`` as its children.

        Args:
            children: The records to wrap; must not be empty.
            id: ID for the new record; inferred from the children via
                ``_infer_id_from_children`` when None.
            seqid: Sequence ID; inferred from the children via
                ``_infer_seqid_from_children`` when None.
            source: Value for the source column.
            type: Value for the type column.
            strand: Strand; inferred with ``Strand.infer_from_many`` from
                the children's strands when None.
            score: Value for the score column.
            phase: Value for the phase column.

        Returns:
            The new parent record.

        Raises:
            ValueError: If ``children`` is empty.
        """
        if len(children) < 1:
            raise ValueError("Cannot get the parent of an empty set.")

        # Fill in whatever the caller left unspecified, in a fixed order.
        if seqid is None:
            seqid = cls._infer_seqid_from_children(children)

        if id is None:
            id = cls._infer_id_from_children(children)

        if strand is None:
            strand = Strand.infer_from_many(
                [child.strand for child in children]
            )

        span_start = min(child.start for child in children)
        span_end = max(child.end for child in children)

        # The record is built with the GFFRecord constructor; the cast
        # only informs the type checker.
        return cast(
            GFF3Record,
            GFFRecord(
                seqid,
                source,
                type,
                span_start,
                span_end,
                score,
                strand,
                phase,
                GFF3Attributes(id=id),
                children=children,
            ),
        )
Ejemplo n.º 9
0
def rnammer2gff(args: argparse.Namespace) -> None:
    """Convert RNAmmer-style tab-separated output to GFF3 gene/rRNA pairs.

    Reads lines from ``args.infile``. Comment lines (starting with ``#``)
    and blank lines are skipped. For every remaining line, the ninth
    column is taken as the rRNA type and mapped through
    ``TYPE_MAP[args.kingdom]`` to the new feature type, the source column
    is replaced by ``args.source`` and the ninth column is reset to
    ``"."`` before the line is parsed as a GFF3 record. Each rRNA record
    gets a deep-copied ``rRNA_gene`` record as its parent.

    All records are then numbered and printed to ``args.outfile``.

    Raises:
        IndexError: If a data line has fewer than nine columns.
        KeyError: If the kingdom or rRNA type is not in ``TYPE_MAP``.
    """
    records: List[GFF3Record] = []

    for line in args.infile:
        stripped = line.strip()

        # A blank line (e.g. trailing newline at the end of the file)
        # holds no record; without this check it fails on column lookup.
        if not stripped or line.startswith("#"):
            continue

        sline = stripped.split("\t")
        rrna_type = sline[8]
        new_type = TYPE_MAP[args.kingdom][rrna_type.lower()]
        sline[1] = args.source
        sline[2] = new_type
        sline[8] = "."

        rna_record = cast(GFF3Record, GFF3Record.parse("\t".join(sline)))

        # The gene is a copy of the rRNA with a different type; the copy
        # is taken before linking so it does not include the relation.
        gene_record = deepcopy(rna_record)
        gene_record.type = "rRNA_gene"
        gene_record.add_child(rna_record)

        records.append(gene_record)
        records.append(rna_record)

    num = 0
    for record in GFF(records).traverse_children(sort=True):
        if record.attributes is None:
            attr = GFF3Attributes()
            record.attributes = attr
        else:
            attr = record.attributes

        if record.type == "rRNA_gene":
            num += 1
            attr.id = f"rRNA_gene{num}"
        else:
            # Reuses the number of the most recently seen gene.
            # NOTE(review): assumes the traversal yields each gene right
            # before its rRNA child -- confirm against traverse_children.
            attr.id = f"rRNA{num}"
            attr.parent = [
                p.attributes.id for p in record.parents
                if (p.attributes is not None and p.attributes.id is not None)
            ]

        print(record, file=args.outfile)
Ejemplo n.º 10
0
def get_non_canon_stop_codon(
    seqid: str,
    start: int,
    end: int,
    strand: Strand,
    codon: str,
    parent_id: Optional[str],
) -> GFF3Record:
    """Create a ``stop_codon`` record describing a non-canonical stop.

    Args:
        seqid: Sequence the codon lies on.
        start: Start coordinate of the codon.
        end: End coordinate of the codon.
        strand: Strand of the codon.
        codon: The codon sequence; stored as the ``codon`` custom
            attribute.
        parent_id: If not None, stored as the ``cds_parent`` custom
            attribute (it is not written to the ``Parent`` attribute).

    Returns:
        A record with source ``gffpal``, no score, ontology term
        ``SO:0000319`` and an explanatory note.
    """
    custom = {"codon": codon}
    if parent_id is not None:
        custom["cds_parent"] = parent_id

    attributes = GFF3Attributes(
        ontology_term=["SO:0000319"],
        note=["Non-canonical stop codon"],
        custom=custom,
    )

    return GFF3Record(
        seqid=seqid,
        source="gffpal",
        type="stop_codon",
        start=start,
        end=end,
        score=None,
        strand=strand,
        phase=Phase.NOT_CDS,
        attributes=attributes,
    )
Ejemplo n.º 11
0
 def add_attributes_if_none(
     self,
     id: Optional[str] = None,
     name: Optional[str] = None,
     alias: Optional[Sequence[str]] = None,
     parent: Optional[Sequence[str]] = None,
     target: Optional[Target] = None,
     gap: Optional[Gap] = None,
     derives_from: Optional[Sequence[str]] = None,
     note: Optional[Sequence[str]] = None,
     dbxref: Optional[Sequence[str]] = None,
     ontology_term: Optional[Sequence[str]] = None,
     is_circular: Optional[bool] = None,
     custom: Optional[Mapping[str, str]] = None,
 ) -> None:
     """Give the record an attributes object if it does not have one.

     When ``self.attributes`` is already set, nothing happens and all
     arguments are ignored. Otherwise a new ``GFF3Attributes`` is built
     from the arguments, passed positionally in the order of this
     method's signature, and stored on the record.
     """
     if self.attributes is not None:
         return

     self.attributes = GFF3Attributes(
         id,
         name,
         alias,
         parent,
         target,
         gap,
         derives_from,
         note,
         dbxref,
         ontology_term,
         is_circular,
         custom,
     )
Ejemplo n.º 12
0
def match_to_gene(
    match: TRNAScanRecord,
    source: str,
    type: str
) -> GFF3Record:
    """Build the gene-level GFF3 record for a tRNAscan match.

    Args:
        match: The tRNAscan result; supplies the seqid, coordinates,
            score (``infernal_score``) and the number used in the ID.
        source: Value for the GFF3 source column.
        type: Value for the GFF3 type column; also embedded in the ID.

    Returns:
        The new record, with only an ID in its attributes.
    """
    # NOTE(review): fix_strand presumably orders the coordinates and
    # derives the strand -- confirm against its definition.
    start, end, strand = fix_strand(match.start, match.end)

    gene_attributes = GFF3Attributes(
        id=f"{match.seqid}.{type}{match.num}",
    )

    return GFF3Record(
        seqid=match.seqid,
        source=source,
        type=type,
        start=start,
        end=end,
        score=match.infernal_score,
        strand=strand,
        phase=Phase.NOT_CDS,
        attributes=gene_attributes,
    )
Ejemplo n.º 13
0
def deal_with_block(block: List[str], gene_num: int) -> List[GFF3Record]:
    """Turn one block of exonerate GFF lines into gene, mRNA and CDS records.

    The lines are parsed with GTF-style attributes and grouped by type.
    The block must contain exactly one ``gene`` and one ``similarity``
    line; every ``exon`` line becomes a CDS record. An mRNA record that
    spans the CDS records is inferred and attached to the gene.

    Args:
        block: The raw lines belonging to one exonerate hit.
        gene_num: Number used to build the IDs ``gene<N>``, ``mRNA<N>``
            and ``CDS<N>``.

    Returns:
        The gene, then the mRNA, then the CDS records.

    Raises:
        KeyError: If the block has no ``gene``, ``similarity`` or
            ``exon`` line, or an expected custom attribute is missing.
        AssertionError: If there is more than one ``gene`` or
            ``similarity`` line.
    """
    by_type: Dict[str, List[GFFRecord[GTFAttributes]]] = {}
    for raw_line in block:
        rec = GFFRecord.parse(raw_line, attr=GTFAttributes)
        by_type.setdefault(rec.type, []).append(rec)

    assert len(by_type["gene"]) == 1
    assert len(by_type["similarity"]) == 1
    gene_src = by_type["gene"][0]
    similarity_src = by_type["similarity"][0]

    # Collect the custom attributes for the gene from both source lines.
    custom: Dict[str, str] = {}
    if similarity_src.attributes is not None:
        custom["query"] = similarity_src.attributes.custom["Query"]

    if gene_src.attributes is not None:
        custom["identity"] = gene_src.attributes.custom["identity"]
        custom["similarity"] = gene_src.attributes.custom["similarity"]

    gene = GFF3Record(
        gene_src.seqid,
        "exonerate",
        type="gene",
        start=gene_src.start,
        end=gene_src.end,
        score=gene_src.score,
        strand=gene_src.strand,
        phase=gene_src.phase,
        attributes=GFF3Attributes(
            id=f"gene{gene_num}",
            custom=custom,
        ),
    )

    # Each exon line is re-emitted as a CDS; all of them share one ID.
    cdss = []
    for exon in by_type["exon"]:
        exon_custom = (
            None if exon.attributes is None else exon.attributes.custom
        )
        cdss.append(
            GFF3Record(
                exon.seqid,
                "exonerate",
                "CDS",
                exon.start,
                exon.end,
                exon.score,
                exon.strand,
                exon.phase,
                attributes=GFF3Attributes(
                    id=f"CDS{gene_num}",
                    parent=[f"mRNA{gene_num}"],
                    custom=exon_custom,
                ),
            )
        )

    if gene.attributes is not None:
        for cds in cdss:
            # This is safe because we added attributes.
            assert cds.attributes is not None
            cds.attributes.custom["query"] = gene.attributes.custom["query"]

    mrna = GFF3Record.infer_from_children(
        cdss,
        id=f"mRNA{gene_num}",
        seqid=gene.seqid,
        source="exonerate",
        type="mRNA",
        strand=gene.strand,
        score=gene.score,
    )

    mrna.add_parent(gene)

    if gene.attributes is not None:
        # This is safe because infer_from_children adds an ID to attributes.
        assert mrna.attributes is not None
        if gene.attributes.id is not None:
            mrna.attributes.parent = [gene.attributes.id]
        mrna.attributes.custom["query"] = gene.attributes.custom["query"]

    return [gene, mrna, *cdss]