Ejemplo n.º 1
0
def transform_child(
    feature: GFF3Record,
    group_name: str,
    gff_to_hints: Dict[str, str],
    type_to_trim: Dict[str, int],
    type_to_priority: Dict[str, int],
    source: str,
    priority: int,
) -> Optional[GFF3Record]:
    """Build a hint record from a copy of a regular feature.

    The feature type is used as-is when it is a key of `gff_to_hints`;
    otherwise it is first translated through `GFF_TYPE_MAP`.

    Args:
        feature: Record to convert; a copy made with `copy()` is modified.
        group_name: Stored as the "group" custom attribute.
        gff_to_hints: Maps feature types to hint types.
        type_to_trim: Amount passed to `trim_ends` per hint type
            (0 when the hint type has no entry).
        type_to_priority: Priority boost per hint type.
        source: Stored as the "source" custom attribute.
        priority: Base priority; the boost is added to it.

    Returns:
        The converted copy, or None when no hint type could be found.

    Raises:
        KeyError: If the hint type has no entry in `type_to_priority`.
    """

    hint = copy(feature)
    mapped_type = (
        hint.type
        if hint.type in gff_to_hints
        else GFF_TYPE_MAP.get(hint.type, None)
    )

    # NOTE(review): `applicative` presumably skips the lookup when
    # `mapped_type` is None - confirm against its definition.
    hint_type: Optional[str] = applicative(
        lambda t: gff_to_hints.get(t, None),
        mapped_type,
    )
    if hint_type is None:
        return None

    hint.type = hint_type
    hint.trim_ends(type_to_trim.get(hint.type, 0))

    # Deliberately a strict lookup: unlike the trim table there is no default.
    boosted_priority = priority + type_to_priority[hint.type]

    # The previous attributes are replaced entirely by the hint attributes.
    hint.attributes = GFF3Attributes(custom={
        "source": source,
        "group": group_name,
        "priority": str(boosted_priority),
    })
    return hint
Ejemplo n.º 2
0
def match_to_anticodon(
    match: TRNAScanRecord,
    ss: TRNAScanSS,
    source: str,
    type: str = "anticodon",
    parents: Sequence[GFF3Record] = (),
) -> GFF3Record:
    """Build the GFF3 record for the anticodon of a tRNAscan match.

    Args:
        match: Match supplying the seqid, score and the number used in the ID.
        ss: Secondary-structure entry supplying the anticodon coordinates.
        source: Value for the GFF3 source column.
        type: Value for the GFF3 type column; also embedded in the ID.
        parents: Parent records. The IDs of those that have one are written
            to the Parent attribute. The default is an immutable empty
            tuple so that no list object is shared between calls.

    Returns:
        The new anticodon record.
    """
    # fix_strand supplies the (start, end, strand) triple stored on the record.
    start, end, strand = fix_strand(ss.anticodon_start, ss.anticodon_end)

    # Parents without attributes or without an ID cannot be referenced.
    parent_ids = [
        p.attributes.id
        for p in parents
        if p.attributes is not None and p.attributes.id is not None
    ]

    return GFF3Record(
        seqid=match.seqid,
        source=source,
        type=type,
        start=start,
        end=end,
        score=match.infernal_score,
        strand=strand,
        phase=Phase.NOT_CDS,
        attributes=GFF3Attributes(
            id=f"{match.seqid}.{type}{match.num}",
            parent=parent_ids,
        ),
        parents=parents,
    )
Ejemplo n.º 3
0
def match_to_introns(
    match: TRNAScanRecord,
    source: str,
    type: str = "tRNA_intron",
    parents: Sequence[GFF3Record] = (),
) -> List[GFF3Record]:
    """Build one GFF3 record per intron of a tRNAscan match.

    Args:
        match: Match supplying the seqid, score, number and the paired
            `intron_starts` / `intron_ends` coordinates.
        source: Value for the GFF3 source column.
        type: Value for the GFF3 type column; also embedded in the ID.
        parents: Parent records. The IDs of those that have one are written
            to the Parent attribute. The default is an immutable empty
            tuple so that no list object is shared between calls.

    Returns:
        The intron records, in the order of `match.intron_starts`; an empty
        list when the match has no introns.
    """
    # Parents without attributes or without an ID cannot be referenced.
    parent_ids = [
        p.attributes.id
        for p in parents
        if p.attributes is not None and p.attributes.id is not None
    ]

    introns: List[GFF3Record] = []
    for istart, iend in zip(match.intron_starts, match.intron_ends):
        start, end, strand = fix_strand(istart, iend)
        introns.append(GFF3Record(
            seqid=match.seqid,
            source=source,
            type=type,
            start=start,
            end=end,
            score=match.infernal_score,
            strand=strand,
            phase=Phase.NOT_CDS,
            attributes=GFF3Attributes(
                # NOTE(review): every intron of one match gets the same ID;
                # confirm that this is intended.
                id=f"{match.seqid}.{type}{match.num}",
                parent=parent_ids,
            ),
            parents=parents,
        ))
    return introns
Ejemplo n.º 4
0
 def as_gffrecord(self,
                  source="MUMmer",
                  type="nucleotide_match") -> GFF3Record:
     """Render this alignment as a GFF3 record.

     The record uses `self.ref` as seqid and `self.rstart` / `self.rend`
     as coordinates. The query interval goes into the Target attribute and
     the alignment statistics are stored, as strings, in custom attributes.

     Args:
         source: Value for the GFF3 source column.
         type: Value for the GFF3 type column.

     Returns:
         The new record, scored with `self.qcov`.
     """
     target = Target(self.query, self.qstart, self.qend)
     custom = {
         "pid": str(self.pid),
         "contig_id": str(self.query),
         "contig_coverage": str(self.qcov),
         "contig_length": str(self.qlen),
         "contig_alignment_length": str(self.qalnlen),
         "scaffold_alignment_length": str(self.ralnlen),
     }
     attributes = GFF3Attributes(target=target, custom=custom)

     return GFF3Record(
         self.ref,
         source,
         type,
         self.rstart,
         self.rend,
         score=self.qcov,
         strand=self.strand,
         attributes=attributes,
     )
Ejemplo n.º 5
0
def add_parents(args: argparse.Namespace) -> None:
    """Give every parentless mRNA a new gene parent and print the result.

    The GFF is parsed from `args.infile`. mRNA features that have no parent,
    but do have an ID, get a gene inferred from them with the ID
    "gene.<mRNA id>". All features are then printed to `args.outfile` after
    a "##gff-version 3" header line.
    """

    gff = GFF.parse(args.infile)
    gff.infer_missing_parents()

    for mrna in gff.select_type("mRNA"):
        if len(mrna.parents) > 0:
            continue

        # Without an ID there is nothing to derive the gene ID from.
        attrs = mrna.attributes
        if attrs is None or attrs.id is None:
            continue

        gene = GFF3Record.infer_from_children(
            [mrna],
            id=f"gene.{attrs.id}",
            type="gene",
        )
        mrna.add_parent(gene)
        gff.add_record(gene)

    print("##gff-version 3", file=args.outfile)
    for feature in gff.traverse_children(sort=True):
        print(feature, file=args.outfile)
Ejemplo n.º 6
0
def all_children_excluded(parent: GFF3Record) -> bool:
    """Tell whether every traversed child is flagged "should_exclude".

    Records equal to `parent` and records without attributes are ignored.
    Returns False as soon as a remaining record lacks the "should_exclude"
    custom attribute, and True otherwise (including when nothing remains).
    """
    for child in parent.traverse_children():
        # The parent itself and attribute-less records never veto.
        if child == parent or child.attributes is None:
            continue

        # One unflagged descendant is enough to stop early.
        if "should_exclude" not in child.attributes.custom:
            return False

    return True
Ejemplo n.º 7
0
def remaining_to_gffrecord(
    seqid: str,
    interval: Interval,
    index: int,
    ndigits: int,
    source: str,
) -> GFF3Record:
    """Create an unstranded "contig" record spanning `interval`.

    The record ID is "contig" followed by `index`, right-aligned and
    zero-padded to a width of `ndigits`.
    """
    return GFF3Record(
        seqid=seqid,
        source=source,
        type="contig",
        start=interval.begin,
        end=interval.end,
        score=None,
        strand=Strand.UNSTRANDED,
        phase=Phase.NOT_CDS,
        attributes=GFF3Attributes(id=f"contig{index:0>{ndigits}}"),
    )
Ejemplo n.º 8
0
def match_to_trna(
    match: TRNAScanRecord,
    ss: TRNAScanSS,
    source: str,
    type_map: Mapping[str, str] = TYPE_MAP,
    parents: Sequence[GFF3Record] = (),
) -> GFF3Record:
    """Build the GFF3 tRNA record for a tRNAscan match.

    Args:
        match: Match supplying the coordinates, score, note, anticodon and
            tRNA type.
        ss: Secondary-structure entry; `ss.ss` is stored as the
            "secondary_structure" custom attribute.
        source: Value for the GFF3 source column.
        type_map: Maps the lower-cased tRNA type to the GFF3 type column;
            types without an entry fall back to "tRNA".
        parents: Parent records. The IDs of those that have one are written
            to the Parent attribute. The default is an immutable empty
            tuple so that no list object is shared between calls.

    Returns:
        The new tRNA record.
    """
    # fix_strand supplies the (start, end, strand) triple stored on the record.
    start, end, strand = fix_strand(match.start, match.end)

    # Parents without attributes or without an ID cannot be referenced.
    parent_ids = [
        p.attributes.id
        for p in parents
        if p.attributes is not None and p.attributes.id is not None
    ]

    # A missing or empty note yields no Note attribute values at all.
    notes: List[str] = [match.note] if match.note else []

    return GFF3Record(
        seqid=match.seqid,
        source=source,
        type=type_map.get(match.trna_type.lower(), "tRNA"),
        start=start,
        end=end,
        score=match.infernal_score,
        strand=strand,
        phase=Phase.NOT_CDS,
        attributes=GFF3Attributes(
            id=f"{match.seqid}.tRNA{match.num}",
            parent=parent_ids,
            note=notes,
            custom={
                "secondary_structure": ss.ss,
                "anticodon": match.anticodon,
                "amino_acid": match.trna_type,
            },
        ),
        parents=parents,
    )
Ejemplo n.º 9
0
def rnammer2gff(args: argparse.Namespace) -> None:
    """Convert tab-separated rRNA predictions into gene/rRNA GFF3 records.

    Each non-comment line of `args.infile` becomes an rRNA record plus a
    copy of it typed "rRNA_gene" that is set as its parent. The records are
    then numbered during a sorted traversal and printed to `args.outfile`.
    """
    records: List[GFF3Record] = []

    for line in args.infile:
        if line.startswith("#"):
            continue

        fields = line.strip().split("\t")

        # The ninth column holds the rRNA type on input. It is translated
        # into the type column and then blanked before parsing.
        rrna_type = fields[8]
        new_type = TYPE_MAP[args.kingdom][rrna_type.lower()]
        fields[1] = args.source
        fields[2] = new_type
        fields[8] = "."

        rrna = cast(GFF3Record, GFF3Record.parse("\t".join(fields)))
        gene = deepcopy(rrna)
        gene.type = "rRNA_gene"
        gene.add_child(rrna)

        records.extend((gene, rrna))

    gene_count = 0
    for record in GFF(records).traverse_children(sort=True):
        attr = record.attributes
        if attr is None:
            attr = GFF3Attributes()
            record.attributes = attr

        if record.type == "rRNA_gene":
            gene_count += 1
            attr.id = f"rRNA_gene{gene_count}"
        else:
            # Non-gene records reuse the number of the last gene seen.
            # NOTE(review): presumably the traversal yields each gene
            # before its children - confirm.
            attr.id = f"rRNA{gene_count}"
            attr.parent = [
                p.attributes.id for p in record.parents
                if (p.attributes is not None and p.attributes.id is not None)
            ]

        print(record, file=args.outfile)
Ejemplo n.º 10
0
def get_non_canon_stop_codon(
    seqid: str,
    start: int,
    end: int,
    strand: Strand,
    codon: str,
    parent_id: Optional[str],
) -> GFF3Record:
    """Create a "stop_codon" record annotated as non-canonical.

    The codon is stored in the "codon" custom attribute. When `parent_id`
    is given it is stored in the "cds_parent" custom attribute.
    """
    custom = {"codon": codon}
    if parent_id is not None:
        custom["cds_parent"] = parent_id

    attributes = GFF3Attributes(
        ontology_term=["SO:0000319"],
        note=["Non-canonical stop codon"],
        custom=custom,
    )

    return GFF3Record(
        seqid=seqid,
        source="gffpal",
        type="stop_codon",
        start=start,
        end=end,
        score=None,
        strand=strand,
        phase=Phase.NOT_CDS,
        attributes=attributes,
    )
Ejemplo n.º 11
0
def add_antifam(args: argparse.Namespace) -> None:
    """Annotate GFF3 records with their AntiFam domain-table matches.

    Matches from `args.antifam` are grouped by target name. For each record
    of `args.infile`, the attribute named by `args.field` is looked up among
    those target names. On a hit, the record gains one "AntiFam:<accession>"
    Dbxref per match and an "antifam_match" custom attribute listing the
    matches, comma-separated. Every record is printed to `args.outfile`;
    comment and blank lines are copied through unchanged.
    """

    antifam_records: Dict[str, List[DomTbl]] = defaultdict(list)
    for hit in DomTbl.from_file(args.antifam):
        antifam_records[hit.target_name].append(hit)

    for line in args.infile:
        stripped = line.strip()
        if stripped == "" or stripped.startswith("#"):
            args.outfile.write(line)
            continue

        record = GFF3Record.parse(stripped)
        attributes = record.attributes

        if attributes is not None:
            field = attributes.get(args.field, None)

            # Membership test first: indexing the defaultdict directly
            # would insert an empty entry for every unmatched record.
            if field is not None and str(field) in antifam_records:
                hits = antifam_records[str(field)]

                attributes.dbxref.extend(
                    [f"AntiFam:{hit.query_acc}" for hit in hits]
                )
                attributes.custom["antifam_match"] = ",".join([
                    f"{hit.query_acc} {hit.full_evalue} "
                    f"{hit.full_score} {hit.domain_score} "
                    f"{hit.hmm_from} {hit.hmm_to} "
                    f"{hit.ali_from} {hit.ali_to}"
                    for hit in hits
                ])

        # Annotated or not, every record is written exactly once.
        print(record, file=args.outfile)
Ejemplo n.º 12
0
def match_to_gene(
    match: TRNAScanRecord,
    source: str,
    type: str
) -> GFF3Record:
    """Build the gene-level GFF3 record for a tRNAscan match.

    The record spans the match coordinates after they have been passed
    through `fix_strand`. Its ID is "<seqid>.<type><num>".
    """
    start, end, strand = fix_strand(match.start, match.end)
    gene_attributes = GFF3Attributes(
        id=f"{match.seqid}.{type}{match.num}",
    )

    return GFF3Record(
        seqid=match.seqid,
        source=source,
        type=type,
        start=start,
        end=end,
        score=match.infernal_score,
        strand=strand,
        phase=Phase.NOT_CDS,
        attributes=gene_attributes,
    )
Ejemplo n.º 13
0
def gap_to_gffrecord(seqid: str, interval: Interval) -> GFF3Record:
    """Create an unstranded, unscored "gap" record spanning `interval`."""
    return GFF3Record(
        seqid=seqid,
        source="gffpal",
        type="gap",
        start=interval.begin,
        end=interval.end,
        score=None,
        strand=Strand.UNSTRANDED,
        phase=Phase.NOT_CDS,
        attributes=GFF3Attributes(),
    )
Ejemplo n.º 14
0
def deal_with_block(block: List[str], gene_num: int) -> List[GFF3Record]:
    """Convert one block of GTF lines into gene, mRNA and CDS records.

    The block must contain exactly one "gene" line and one "similarity"
    line. Each "exon" line becomes a CDS, and an mRNA is inferred from the
    CDSs and attached to the gene. The query name from the similarity line,
    when available, is copied onto the gene, the mRNA and every CDS.

    Args:
        block: GTF lines belonging to a single gene.
        gene_num: Number used to build the "gene<N>", "mRNA<N>" and
            "CDS<N>" IDs.

    Returns:
        The gene, then the mRNA, then the CDS records.

    Raises:
        KeyError: If the block has no "gene", "similarity" or "exon" line,
            or an expected custom attribute ("Query", "identity",
            "similarity") is missing from a line that has attributes.
        AssertionError: If there is more than one "gene" or "similarity"
            line.
    """

    # Group the parsed lines by feature type.
    parsed: Dict[str, List[GFFRecord[GTFAttributes]]] = dict()
    for line in block:
        rec = GFFRecord.parse(line, attr=GTFAttributes)
        parsed.setdefault(rec.type, []).append(rec)

    assert len(parsed["gene"]) == 1
    assert len(parsed["similarity"]) == 1
    gene_parsed = parsed["gene"][0]
    similarity_parsed = parsed["similarity"][0]

    custom: Dict[str, str] = dict()
    if similarity_parsed.attributes is not None:
        custom["query"] = similarity_parsed.attributes.custom["Query"]

    if gene_parsed.attributes is not None:
        custom["identity"] = gene_parsed.attributes.custom["identity"]
        custom["similarity"] = gene_parsed.attributes.custom["similarity"]

    # "query" is only known when the similarity line carried attributes, so
    # it must not be read unconditionally further down.
    query = custom.get("query")

    gene = GFF3Record(
        gene_parsed.seqid,
        "exonerate",
        type="gene",
        start=gene_parsed.start,
        end=gene_parsed.end,
        score=gene_parsed.score,
        strand=gene_parsed.strand,
        phase=gene_parsed.phase,
        attributes=GFF3Attributes(
            id=f"gene{gene_num}",
            custom=custom,
        )
    )

    # All CDS records of one gene share a single ID and the same mRNA parent.
    cdss = [
        GFF3Record(
            e.seqid,
            "exonerate",
            "CDS",
            e.start,
            e.end,
            e.score,
            e.strand,
            e.phase,
            attributes=GFF3Attributes(
                id=f"CDS{gene_num}",
                parent=[f"mRNA{gene_num}"],
                custom=(e.attributes.custom
                        if e.attributes is not None
                        else None)
            )
        )
        for e
        in parsed["exon"]
    ]

    if query is not None:
        for c in cdss:
            # This is safe because we added attributes.
            assert c.attributes is not None
            c.attributes.custom["query"] = query

    mrna = GFF3Record.infer_from_children(
        cdss,
        id=f"mRNA{gene_num}",
        seqid=gene.seqid,
        source="exonerate",
        type="mRNA",
        strand=gene.strand,
        score=gene.score,
    )

    mrna.add_parent(gene)

    if gene.attributes is not None:
        # This is safe because infer_from_children adds an ID to attributes.
        assert mrna.attributes is not None
        if gene.attributes.id is not None:
            mrna.attributes.parent = [gene.attributes.id]
        if query is not None:
            mrna.attributes.custom["query"] = query

    out = [gene, mrna]
    out.extend(cdss)
    return out