def main():
    args = cli(sys.argv[0], sys.argv[1:])

    # Optional Rfam accession -> GO term mapping.
    if args.go is not None:
        go = parse_rfam2go(args.go)
    else:
        go = {}

    for line in args.infile:
        if line.startswith("#"):
            continue

        record = GFFRecord.parse(line)
        attrs = record.attributes

        # Skip hits flagged as overlapping a better-scoring hit ("olp" == "=")
        # when only the best hits were requested.
        if args.best and attrs.custom["olp"] == "=":
            continue

        name = record.type
        dbxrefs = ["Rfam:" + attrs.custom["mdlaccn"], "Rfam:" + name]

        if "clan" in attrs.custom:
            dbxrefs.append("RfamClan:" + attrs.custom["clan"])

        ontology_terms = go.get(attrs.custom["mdlaccn"], [])
        notes = [attrs.custom["desc"]]

        target = Target(
            attrs.custom["mdlaccn"],
            int(attrs.custom["mdlfrom"]),
            int(attrs.custom["mdlto"]),
        )

        custom = {
            "evalue": attrs.custom["evalue"],
            "model_type": attrs.custom["mdl"],
            "gc": attrs.custom["gc"],
            "bias": attrs.custom["bias"],
            "bitscore": record.score,
        }

        if attrs.custom["trunc"] == "yes":
            custom["truncated_match"] = "true"

        if attrs.custom["olp"] == "=":
            custom["overlap_with_better_score"] = "true"

        record.source = args.source
        record.type = args.type
        record.score = float(custom["evalue"])
        record.attributes = GFFAttributes(
            name=name,
            dbxref=dbxrefs,
            target=target,
            note=notes,
            ontology_term=ontology_terms,
            custom=custom,
        )

        print(record, file=args.outfile)

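# parse_rfam2go is not shown above. The sketch below is only an illustration
# of what such a parser might look like, assuming the standard rfam2go
# mapping format ("Rfam:RF00001 5S_rRNA > GO:... ; GO:0005840") and that the
# returned dict is keyed by the bare Rfam accession so that lookups against
# the cmscan "mdlaccn" field work. It is not the actual implementation, and
# the name is deliberately distinct from the real helper.
def parse_rfam2go_sketch(handle):
    go = {}
    for line in handle:
        line = line.strip()
        # "!" lines are comments in GO external2go mapping files.
        if not line or line.startswith("!"):
            continue
        left, _, right = line.partition(" > ")
        accession = left.split()[0].replace("Rfam:", "", 1)
        go_id = right.rpartition(";")[2].strip()
        go.setdefault(accession, []).append(go_id)
    return go
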
def decode_gff(infiles, outfile, map_, column):
    inhandles = join_files(infiles, header=False)

    if column == "id":
        trans_function = replace_gff_id
    elif column == "name":
        trans_function = replace_gff_name
    elif column == "seqid":
        trans_function = replace_gff_seqid
    else:
        raise ValueError("This shouldn't ever happen")

    record_chunk = list()
    for i, line in enumerate(inhandles):
        # Pass comment/directive lines through unchanged.
        if line.startswith("#"):
            record_chunk.append(line.strip())
            continue

        old_record = GFFRecord.parse(line)
        new_record = trans_function(old_record, map_)
        record_chunk.append(str(new_record))

        # Flush to disk periodically rather than holding everything in memory.
        if i % 10000 == 0:
            outfile.write("\n".join(record_chunk))
            outfile.write("\n")
            record_chunk = list()

    if len(record_chunk) > 0:
        outfile.write("\n".join(record_chunk))
        outfile.write("\n")

    return

def encode_gff(infiles, outfile, mapfile, column, id_conv):
    inhandles = join_files(infiles, header=False)
    seen = dict()

    if column == "id":
        trans_function = replace_gff_id
    elif column == "name":
        trans_function = replace_gff_name
    elif column == "seqid":
        trans_function = replace_gff_seqid
    else:
        raise ValueError("This shouldn't ever happen")

    id_chunk = list()
    record_chunk = list()
    for i, line in enumerate(inhandles):
        if line.startswith("#"):
            record_chunk.append(line.strip())
            continue

        old_record = GFFRecord.parse(line)
        new_record = trans_function(
            old_record,
            seen,
            id_chunk,
            lambda x: next(id_conv),
        )
        record_chunk.append(str(new_record))

        if i % 10000 == 0:
            if len(record_chunk) > 0:
                outfile.write("\n".join(record_chunk))
                outfile.write("\n")
                record_chunk = list()

            if len(id_chunk) > 0:
                mapfile.write("".join(f"{n}\t{o}\n" for n, o in id_chunk))
                id_chunk = list()

    if len(record_chunk) > 0:
        outfile.write("\n".join(record_chunk))
        outfile.write("\n")

    if len(id_chunk) > 0:
        mapfile.write("".join(f"{n}\t{o}\n" for n, o in id_chunk))

    return

def rnammer2gff(args: argparse.Namespace) -> None:
    records: List[GFFRecord] = []

    for line in args.infile:
        if line.startswith("#"):
            continue

        sline = line.strip().split("\t")
        rrna_type = sline[8]
        new_type = TYPE_MAP[args.kingdom][rrna_type.lower()]

        sline[1] = args.source
        sline[2] = new_type
        sline[8] = "."

        rna_record = GFFRecord.parse("\t".join(sline))
        gene_record = deepcopy(rna_record)
        gene_record.type = "rRNA_gene"
        gene_record.add_child(rna_record)

        records.append(gene_record)
        records.append(rna_record)

    num = 0
    for record in GFF(records).traverse_children(sort=True):
        if record.attributes is None:
            attr = GFFAttributes()
            record.attributes = attr
        else:
            attr = record.attributes

        if record.type == "rRNA_gene":
            num += 1
            attr.id = f"rRNA_gene{num}"
        else:
            attr.id = f"rRNA{num}"
            attr.parent = [
                p.attributes.id
                for p in record.parents
                if (p.attributes is not None and p.attributes.id is not None)
            ]

        print(record, file=args.outfile)

    return

def main():
    args = cli(sys.argv[0], sys.argv[1:])

    rows = list()
    for line in args.infile:
        if line.startswith("#"):
            continue

        record = GFFRecord.parse(line)
        record.attributes.id = record.attributes.name
        record.attributes.name = None
        record.attributes.custom = {}
        rows.append(record)

    gff = GFF(rows)
    gff.infer_missing_parents()

    counter = 1
    for mrna in gff.select_type("mRNA"):
        if len(mrna.children) < 2:
            continue

        # Wrap each prediction in a repeat_region parent feature.
        region = deepcopy(mrna)
        region.type = "repeat_region"
        region.attributes.id = f"repeat_region{counter}"
        region.attributes.ontology_term = ["SO:0000657"]

        mrna.type = "helitron"
        mrna.parents = [region]
        mrna.attributes.parent = [region.attributes.id]
        mrna.attributes.id = f"helitron{counter}"
        mrna.attributes.ontology_term = ["SO:0000544", "SO:helitron"]
        mrna.attributes.custom = {}

        flank3 = [c for c in mrna.children if c.attributes.id.endswith(".3")][0]
        flank5 = [c for c in mrna.children if c.attributes.id.endswith(".5.1")][0]

        flank3.type = "three_prime_flanking_region"
        flank3.attributes.ontology_term = [
            "SO:0001417",
            "SO:three_prime_flanking_region",
            "SO:0000364",
            "SO:transposable_element_flanking_region",
        ]
        flank3.attributes.id = None
        flank3.attributes.parent = [mrna.attributes.id]

        flank5.type = "five_prime_flanking_region"
        flank5.attributes.ontology_term = [
            "SO:0001416",
            "SO:five_prime_flanking_region",
            "SO:0000364",
            "SO:transposable_element_flanking_region",
        ]
        flank5.attributes.id = None
        flank5.attributes.parent = [mrna.attributes.id]

        mrna.source = flank5.source

        print(region, file=args.outfile)
        print(mrna, file=args.outfile)

        # Print the flanks in genomic order, which depends on the strand.
        if mrna.strand == Strand.MINUS:
            print(flank3, file=args.outfile)
            print(flank5, file=args.outfile)
        else:
            print(flank5, file=args.outfile)
            print(flank3, file=args.outfile)

        counter += 1

    return

def deal_with_block(block: List[str], gene_num: int) -> List[GFF3Record]:
    parsed: Dict[str, List[GFFRecord[GTFAttributes]]] = dict()

    for line in block:
        rec = GFFRecord.parse(line, attr=GTFAttributes)
        if rec.type in parsed:
            parsed[rec.type].append(rec)
        else:
            parsed[rec.type] = [rec]

    assert len(parsed["gene"]) == 1
    assert len(parsed["similarity"]) == 1

    gene_parsed = parsed["gene"][0]
    similarity_parsed = parsed["similarity"][0]

    custom: Dict[str, str] = dict()
    if similarity_parsed.attributes is not None:
        custom["query"] = similarity_parsed.attributes.custom["Query"]

    if gene_parsed.attributes is not None:
        custom["identity"] = gene_parsed.attributes.custom["identity"]
        custom["similarity"] = gene_parsed.attributes.custom["similarity"]

    gene = GFF3Record(
        gene_parsed.seqid,
        "exonerate",
        type="gene",
        start=gene_parsed.start,
        end=gene_parsed.end,
        score=gene_parsed.score,
        strand=gene_parsed.strand,
        phase=gene_parsed.phase,
        attributes=GFF3Attributes(
            id=f"gene{gene_num}",
            custom=custom,
        )
    )

    cdss = [
        GFF3Record(
            e.seqid,
            "exonerate",
            "CDS",
            e.start,
            e.end,
            e.score,
            e.strand,
            e.phase,
            attributes=GFF3Attributes(
                id=f"CDS{gene_num}",
                parent=[f"mRNA{gene_num}"],
                custom=(
                    e.attributes.custom
                    if e.attributes is not None
                    else None
                )
            )
        )
        for e in parsed["exon"]
    ]

    for c in cdss:
        if gene.attributes is not None:
            # This is safe because we added attributes.
            assert c.attributes is not None
            c.attributes.custom["query"] = gene.attributes.custom["query"]

    mrna = GFF3Record.infer_from_children(
        cdss,
        id=f"mRNA{gene_num}",
        seqid=gene.seqid,
        source="exonerate",
        type="mRNA",
        strand=gene.strand,
        score=gene.score,
    )
    mrna.add_parent(gene)

    if gene.attributes is not None:
        # This is safe because infer_from_children adds an ID to attributes.
        assert mrna.attributes is not None
        if gene.attributes.id is not None:
            mrna.attributes.parent = [gene.attributes.id]

        mrna.attributes.custom["query"] = gene.attributes.custom["query"]

    out = [gene, mrna]
    out.extend(cdss)
    return out
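
# deal_with_block expects the GFF lines of one exonerate alignment at a time,
# but the block-splitting driver is not shown above. The generator below is a
# minimal sketch of such a driver, assuming exonerate was run with
# --showtargetgff and that each alignment's GFF lines sit between the
# "# --- START OF GFF DUMP ---" and "# --- END OF GFF DUMP ---" comment lines.
# The function name and splitting logic are illustrative assumptions, not part
# of the script above.
def blocks_from_exonerate_sketch(handle):
    block = []
    in_block = False
    for line in handle:
        if line.startswith("# --- START OF GFF DUMP ---"):
            in_block = True
            block = []
        elif line.startswith("# --- END OF GFF DUMP ---"):
            in_block = False
            if block:
                yield block
        elif in_block and not line.startswith("#"):
            block.append(line)


# Example usage of the sketch:
# for gene_num, block in enumerate(blocks_from_exonerate_sketch(handle), 1):
#     for record in deal_with_block(block, gene_num):
#         print(record, file=outfile)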