def transform_child(
    feature: GFF3Record,
    group_name: str,
    gff_to_hints: Dict[str, str],
    type_to_trim: Dict[str, int],
    type_to_priority: Dict[str, int],
    source: str,
    priority: int,
) -> Optional[GFF3Record]:
    """Build a hint record from a copy of an ordinary feature.

    The input feature is never modified; all changes are made to a copy.

    Args:
        feature: The record to convert.
        group_name: Value stored under the "group" attribute of the hint.
        gff_to_hints: Maps feature types to hint types.
        type_to_trim: Per-hint-type amount passed to ``trim_ends``;
            hint types missing from the mapping are trimmed by 0.
        type_to_priority: Per-hint-type amount added to ``priority``.
            The hint type must be present (a missing key raises KeyError).
        source: Value stored under the "source" attribute of the hint.
        priority: Base priority before the per-type boost is added.

    Returns:
        The converted copy, or None when no hint type could be found for
        the feature's type.
    """
    hint = copy(feature)

    # Types that are already keys of gff_to_hints are looked up directly;
    # anything else is first translated through GFF_TYPE_MAP.
    if hint.type in gff_to_hints:
        mapped_type = hint.type
    else:
        mapped_type = GFF_TYPE_MAP.get(hint.type, None)

    # NOTE(review): `applicative` is defined elsewhere; presumably it only
    # applies the lookup when mapped_type is not None -- confirm.
    hint_type: Optional[str] = applicative(
        lambda t: gff_to_hints.get(t, None),
        mapped_type,
    )
    if hint_type is None:
        return None

    hint.type = hint_type
    hint.trim_ends(type_to_trim.get(hint.type, 0))

    boosted_priority = priority + type_to_priority[hint.type]
    hint.attributes = GFF3Attributes(
        custom=dict(
            source=source,
            group=group_name,
            priority=str(boosted_priority),
        )
    )
    return hint
def match_to_anticodon(
    match: TRNAScanRecord,
    ss: TRNAScanSS,
    source: str,
    type: str = "anticodon",
    parents: Optional[Sequence[GFF3Record]] = None,
) -> GFF3Record:
    """Create the anticodon feature for a tRNAscan match.

    Args:
        match: The match; supplies seqid, score and the number used in
            the record ID.
        ss: Secondary-structure record; supplies the anticodon
            coordinates.
        source: Value for the GFF source column.
        type: Value for the GFF type column; also embedded in the ID.
        parents: Records to register as parents. Omitted or None means
            no parents.

    Returns:
        A record whose Parent attribute lists the IDs of every parent
        that has attributes with a non-None ID.
    """
    # A literal `[]` default would be a single list shared by every call
    # and handed to each GFF3Record; build a fresh one per call instead.
    if parents is None:
        parents = []

    start, end, strand = fix_strand(ss.anticodon_start, ss.anticodon_end)

    # Parents lacking attributes or an ID cannot be referenced by ID.
    parent_ids = [
        p.attributes.id
        for p in parents
        if (p.attributes is not None and p.attributes.id is not None)
    ]

    return GFF3Record(
        seqid=match.seqid,
        source=source,
        type=type,
        start=start,
        end=end,
        score=match.infernal_score,
        strand=strand,
        phase=Phase.NOT_CDS,
        attributes=GFF3Attributes(
            id=f"{match.seqid}.{type}{match.num}",
            parent=parent_ids,
        ),
        parents=parents,
    )
def match_to_introns(
    match: TRNAScanRecord,
    source: str,
    type: str = "tRNA_intron",
    parents: Optional[Sequence[GFF3Record]] = None,
) -> List[GFF3Record]:
    """Create one intron feature per intron of a tRNAscan match.

    Args:
        match: The match; its ``intron_starts`` and ``intron_ends`` are
            paired up positionally.
        source: Value for the GFF source column.
        type: Value for the GFF type column; also embedded in the ID.
        parents: Records to register as parents of every intron. Omitted
            or None means no parents.

    Returns:
        The intron records, in the order the coordinates are listed.
        All introns of one match share the same ID, because the ID is
        built only from the seqid, the type and the match number.
    """
    # A literal `[]` default would be a single list shared by every call
    # and handed to each GFF3Record; build a fresh one per call instead.
    if parents is None:
        parents = []

    # Parents lacking attributes or an ID cannot be referenced by ID.
    parent_ids = [
        p.attributes.id
        for p in parents
        if (p.attributes is not None and p.attributes.id is not None)
    ]

    introns = []
    for istart, iend in zip(match.intron_starts, match.intron_ends):
        start, end, strand = fix_strand(istart, iend)
        introns.append(
            GFF3Record(
                seqid=match.seqid,
                source=source,
                type=type,
                start=start,
                end=end,
                score=match.infernal_score,
                strand=strand,
                phase=Phase.NOT_CDS,
                attributes=GFF3Attributes(
                    id=f"{match.seqid}.{type}{match.num}",
                    parent=parent_ids,
                ),
                parents=parents,
            )
        )
    return introns
def as_gffrecord(self, source="MUMmer", type="nucleotide_match") -> GFF3Record:
    """Represent this alignment as a GFF3 record on the reference.

    The record spans ``rstart``..``rend`` on ``ref`` and is scored with
    ``qcov``. The query interval is stored as the Target attribute and
    the remaining statistics are stored as string-valued custom
    attributes.

    Args:
        source: Value for the GFF source column.
        type: Value for the GFF type column.
    """
    target = Target(self.query, self.qstart, self.qend)

    # Custom attribute values are all stored as strings.
    custom = {
        "pid": str(self.pid),
        "contig_id": str(self.query),
        "contig_coverage": str(self.qcov),
        "contig_length": str(self.qlen),
        "contig_alignment_length": str(self.qalnlen),
        "scaffold_alignment_length": str(self.ralnlen),
    }
    attributes = GFF3Attributes(target=target, custom=custom)

    return GFF3Record(
        self.ref,
        source,
        type,
        self.rstart,
        self.rend,
        score=self.qcov,
        strand=self.strand,
        attributes=attributes,
    )
def add_parents(args: argparse.Namespace) -> None:
    """Give every parentless mRNA with an ID an inferred gene parent.

    Reads a GFF from ``args.infile``, creates a "gene" record with the ID
    ``gene.<mRNA id>`` for each mRNA that has no parents, and prints the
    resulting features to ``args.outfile`` after a GFF3 version header.
    mRNAs without attributes or without an ID are left untouched.
    """
    gff = GFF.parse(args.infile)
    gff.infer_missing_parents()

    for mrna in gff.select_type("mRNA"):
        attrs = mrna.attributes
        needs_gene = (
            len(mrna.parents) == 0
            and attrs is not None
            and attrs.id is not None
        )
        if not needs_gene:
            continue

        gene = GFF3Record.infer_from_children(
            [mrna],
            id=f"gene.{attrs.id}",
            type="gene",
        )
        mrna.add_parent(gene)
        gff.add_record(gene)

    print("##gff-version 3", file=args.outfile)
    for feature in gff.traverse_children(sort=True):
        print(feature, file=args.outfile)
def all_children_excluded(parent: GFF3Record) -> bool:
    """Report whether no traversed child still counts as included.

    A child counts as included when it has attributes and its custom
    attributes lack the "should_exclude" key. Children without
    attributes never count as included, and any traversed record equal
    to ``parent`` itself is ignored.

    Returns:
        False as soon as one included child is found, otherwise True
        (including when there are no children at all).
    """
    # any() stops at the first included child, so this still exits early.
    return not any(
        node.attributes is not None
        and "should_exclude" not in node.attributes.custom
        for node in parent.traverse_children()
        if not (node == parent)
    )
def remaining_to_gffrecord(
    seqid: str,
    interval: Interval,
    index: int,
    ndigits: int,
    source: str,
) -> GFF3Record:
    """Describe an interval as an unstranded "contig" record.

    Args:
        seqid: Sequence the interval lies on.
        interval: Supplies the record's begin and end coordinates.
        index: Contig number used in the record ID.
        ndigits: Minimum width of the number in the ID; shorter numbers
            are left-padded with zeros.
        source: Value for the GFF source column.
    """
    contig_id = f"contig{index:0>{ndigits}}"
    return GFF3Record(
        seqid=seqid,
        source=source,
        type="contig",
        start=interval.begin,
        end=interval.end,
        score=None,
        strand=Strand.UNSTRANDED,
        phase=Phase.NOT_CDS,
        attributes=GFF3Attributes(id=contig_id),
    )
def match_to_trna(
    match: TRNAScanRecord,
    ss: TRNAScanSS,
    source: str,
    type_map: Mapping[str, str] = TYPE_MAP,
    parents: Optional[Sequence[GFF3Record]] = None,
) -> GFF3Record:
    """Create the tRNA feature for a tRNAscan match.

    Args:
        match: The match; supplies coordinates, score, note, anticodon
            and tRNA type.
        ss: Secondary-structure record; its ``ss`` value is stored as a
            custom attribute.
        source: Value for the GFF source column.
        type_map: Maps the lower-cased tRNA type to a GFF type; types
            not in the mapping fall back to "tRNA".
        parents: Records to register as parents. Omitted or None means
            no parents.

    Returns:
        A record whose Parent attribute lists the IDs of every parent
        that has attributes with a non-None ID.
    """
    # A literal `[]` default would be a single list shared by every call
    # and handed to each GFF3Record; build a fresh one per call instead.
    if parents is None:
        parents = []

    start, end, strand = fix_strand(match.start, match.end)

    # Parents lacking attributes or an ID cannot be referenced by ID.
    parent_ids = [
        p.attributes.id
        for p in parents
        if (p.attributes is not None and p.attributes.id is not None)
    ]

    # A missing or empty note yields no Note attribute values at all.
    notes: List[str]
    if match.note is None or match.note == "":
        notes = []
    else:
        notes = [match.note]

    return GFF3Record(
        seqid=match.seqid,
        source=source,
        type=type_map.get(match.trna_type.lower(), "tRNA"),
        start=start,
        end=end,
        score=match.infernal_score,
        strand=strand,
        phase=Phase.NOT_CDS,
        attributes=GFF3Attributes(
            id=f"{match.seqid}.tRNA{match.num}",
            parent=parent_ids,
            note=notes,
            custom={
                "secondary_structure": ss.ss,
                "anticodon": match.anticodon,
                "amino_acid": match.trna_type,
            },
        ),
        parents=parents,
    )
def rnammer2gff(args: argparse.Namespace) -> None:
    """Convert tab-separated RNAmmer output into gene/rRNA GFF3 records.

    Reads lines from ``args.infile``. For every data line the source
    column is replaced with ``args.source``, the type column with the
    entry of ``TYPE_MAP[args.kingdom]`` for the lower-cased value of the
    ninth column, and the ninth column itself with ".". Each line yields
    an "rRNA_gene" record with the rRNA record as its child. IDs and
    Parent attributes are assigned while the records are printed to
    ``args.outfile``.

    Lines starting with "#" and blank lines are skipped.

    Raises:
        KeyError: If the kingdom or rRNA type is not in TYPE_MAP.
        IndexError: If a data line has fewer than nine columns.
    """
    records: List[GFF3Record] = []

    for line in args.infile:
        if line.startswith("#"):
            continue

        stripped = line.strip()
        if stripped == "":
            # A blank line would split to [""] and fail on sline[8].
            continue

        sline = stripped.split("\t")
        rrna_type = sline[8]
        new_type = TYPE_MAP[args.kingdom][rrna_type.lower()]

        sline[1] = args.source
        sline[2] = new_type
        sline[8] = "."

        rna_record = cast(GFF3Record, GFF3Record.parse("\t".join(sline)))

        # The gene is copied before the child link is added, so the copy
        # does not include that relationship.
        gene_record = deepcopy(rna_record)
        gene_record.type = "rRNA_gene"
        gene_record.add_child(rna_record)

        records.append(gene_record)
        records.append(rna_record)

    num = 0
    for record in GFF(records).traverse_children(sort=True):
        if record.attributes is None:
            attr = GFF3Attributes()
            record.attributes = attr
        else:
            attr = record.attributes

        # The counter only advances on genes; other records reuse the
        # current number.
        # NOTE(review): this assumes the traversal yields each gene
        # before its rRNA child -- confirm against traverse_children.
        if record.type == "rRNA_gene":
            num += 1
            attr.id = f"rRNA_gene{num}"
        else:
            attr.id = f"rRNA{num}"

        attr.parent = [
            p.attributes.id
            for p in record.parents
            if (p.attributes is not None and p.attributes.id is not None)
        ]

        print(record, file=args.outfile)
def get_non_canon_stop_codon(
    seqid: str,
    start: int,
    end: int,
    strand: Strand,
    codon: str,
    parent_id: Optional[str],
) -> GFF3Record:
    """Create a "stop_codon" record flagged as non-canonical.

    Args:
        seqid: Sequence the codon lies on.
        start: Start coordinate of the record.
        end: End coordinate of the record.
        strand: Strand of the record.
        codon: Codon sequence, stored as the "codon" custom attribute.
        parent_id: When not None, stored as the "cds_parent" custom
            attribute; otherwise that attribute is omitted.
    """
    if parent_id is None:
        custom = {"codon": codon}
    else:
        custom = {"codon": codon, "cds_parent": parent_id}

    attributes = GFF3Attributes(
        ontology_term=["SO:0000319"],
        note=["Non-canonical stop codon"],
        custom=custom,
    )
    return GFF3Record(
        seqid=seqid,
        source="gffpal",
        type="stop_codon",
        start=start,
        end=end,
        score=None,
        strand=strand,
        phase=Phase.NOT_CDS,
        attributes=attributes,
    )
def add_antifam(args: argparse.Namespace) -> None:
    """Annotate GFF records with their AntiFam matches.

    Matches are read from ``args.antifam`` and grouped by target name.
    Each record from ``args.infile`` whose ``args.field`` attribute
    (as a string) names a target with matches gains one
    ``AntiFam:<accession>`` Dbxref per match and an "antifam_match"
    custom attribute: a comma-separated list of space-separated match
    fields. Every record is printed to ``args.outfile``, annotated or
    not; comment and blank lines are written through unchanged.
    """
    antifam_records: Dict[str, List[DomTbl]] = defaultdict(list)
    for hit in DomTbl.from_file(args.antifam):
        antifam_records[hit.target_name].append(hit)

    for line in args.infile:
        stripped = line.strip()
        if stripped == "" or stripped.startswith("#"):
            args.outfile.write(line)
            continue

        record = GFF3Record.parse(stripped)
        attrs = record.attributes

        if attrs is not None:
            key = attrs.get(args.field, None)
            # Membership is tested first so the defaultdict never grows
            # an empty entry for an unmatched key.
            if key is not None and str(key) in antifam_records:
                hits = antifam_records[str(key)]
                dbxrefs = [f"AntiFam:{h.query_acc}" for h in hits]
                matches = [
                    f"{h.query_acc} {h.full_evalue} "
                    f"{h.full_score} {h.domain_score} "
                    f"{h.hmm_from} {h.hmm_to} "
                    f"{h.ali_from} {h.ali_to}"
                    for h in hits
                ]
                attrs.dbxref.extend(dbxrefs)
                attrs.custom["antifam_match"] = ",".join(matches)

        print(record, file=args.outfile)
def match_to_gene(
    match: TRNAScanRecord,
    source: str,
    type: str
) -> GFF3Record:
    """Create the gene-level feature for a tRNAscan match.

    Args:
        match: The match; supplies seqid, coordinates, score and the
            number used in the record ID.
        source: Value for the GFF source column.
        type: Value for the GFF type column; also embedded in the ID.
    """
    start, end, strand = fix_strand(match.start, match.end)
    attributes = GFF3Attributes(id=f"{match.seqid}.{type}{match.num}")

    return GFF3Record(
        seqid=match.seqid,
        source=source,
        type=type,
        start=start,
        end=end,
        score=match.infernal_score,
        strand=strand,
        phase=Phase.NOT_CDS,
        attributes=attributes,
    )
def gap_to_gffrecord(seqid: str, interval: Interval) -> GFF3Record:
    """Describe an interval as an unstranded "gap" record.

    The record uses "gffpal" as its source, has no score and carries
    empty attributes.

    Args:
        seqid: Sequence the interval lies on.
        interval: Supplies the record's begin and end coordinates.
    """
    return GFF3Record(
        seqid=seqid,
        source="gffpal",
        type="gap",
        start=interval.begin,
        end=interval.end,
        score=None,
        strand=Strand.UNSTRANDED,
        phase=Phase.NOT_CDS,
        attributes=GFF3Attributes(),
    )
def deal_with_block(block: List[str], gene_num: int) -> List[GFF3Record]:
    """Turn one block of GTF-style lines into gene, mRNA and CDS records.

    The block must contain exactly one "gene" line and exactly one
    "similarity" line; its "exon" lines become the CDS records.

    Args:
        block: The lines belonging to a single gene model.
        gene_num: Number used to build the gene, mRNA and CDS IDs.

    Returns:
        ``[gene, mrna, *cdss]``. All CDS records share one ID and list
        the mRNA as parent. The "query" custom attribute of the gene is
        copied onto the mRNA and onto every CDS.

    Raises:
        KeyError: If the block has no "gene", "similarity" or "exon"
            line, or an expected custom attribute is missing.
    """
    parsed: Dict[str, List[GFFRecord[GTFAttributes]]] = dict()
    for line in block:
        rec = GFFRecord.parse(line, attr=GTFAttributes)
        # A plain dict is kept (rather than a defaultdict) so lookups of
        # absent types below still raise KeyError.
        parsed.setdefault(rec.type, []).append(rec)

    assert len(parsed["gene"]) == 1
    assert len(parsed["similarity"]) == 1
    gene_parsed = parsed["gene"][0]
    similarity_parsed = parsed["similarity"][0]

    custom: Dict[str, str] = dict()
    if similarity_parsed.attributes is not None:
        custom["query"] = similarity_parsed.attributes.custom["Query"]

    if gene_parsed.attributes is not None:
        for key in ("identity", "similarity"):
            custom[key] = gene_parsed.attributes.custom[key]

    gene_id = f"gene{gene_num}"
    mrna_id = f"mRNA{gene_num}"
    cds_id = f"CDS{gene_num}"

    gene = GFF3Record(
        gene_parsed.seqid,
        "exonerate",
        type="gene",
        start=gene_parsed.start,
        end=gene_parsed.end,
        score=gene_parsed.score,
        strand=gene_parsed.strand,
        phase=gene_parsed.phase,
        attributes=GFF3Attributes(id=gene_id, custom=custom),
    )

    cdss = []
    for exon in parsed["exon"]:
        if exon.attributes is not None:
            exon_custom = exon.attributes.custom
        else:
            exon_custom = None

        cdss.append(
            GFF3Record(
                exon.seqid,
                "exonerate",
                "CDS",
                exon.start,
                exon.end,
                exon.score,
                exon.strand,
                exon.phase,
                attributes=GFF3Attributes(
                    id=cds_id,
                    parent=[mrna_id],
                    custom=exon_custom,
                ),
            )
        )

    if gene.attributes is not None:
        for cds in cdss:
            # This is safe because every CDS was built with attributes.
            assert cds.attributes is not None
            cds.attributes.custom["query"] = gene.attributes.custom["query"]

    mrna = GFF3Record.infer_from_children(
        cdss,
        id=mrna_id,
        seqid=gene.seqid,
        source="exonerate",
        type="mRNA",
        strand=gene.strand,
        score=gene.score,
    )
    mrna.add_parent(gene)

    if gene.attributes is not None:
        # This is safe because infer_from_children adds an ID to attributes.
        assert mrna.attributes is not None

        if gene.attributes.id is not None:
            mrna.attributes.parent = [gene.attributes.id]

        mrna.attributes.custom["query"] = gene.attributes.custom["query"]

    return [gene, mrna, *cdss]