def hints(args: argparse.Namespace) -> None:
    """Convert the features under each selected group record into hints.

    Every record of type ``args.group_level`` in ``args.infile`` is treated
    as a group. Each record yielded by ``traverse_children`` for that group
    is passed through ``transform_child``; results that are not ``None``
    are printed to ``args.outfile``.

    Raises:
        GPMissingID: if no ID could be obtained for a selected group record.
    """
    hint_types = get_hints_map(args)
    trim_sizes = get_trim_map(args)
    priorities = get_priority_map(args)

    records = GFF.parse(args.infile)

    for group in records.select_type(args.group_level):
        # `fmap` presumably passes a None `attributes` straight through,
        # which is what the None check below relies on -- TODO confirm.
        group_id = fmap(lambda attrs: attrs.id, group.attributes)

        if group_id is None:
            raise GPMissingID(
                "One of the selected records doesn't have an ID. "
                f"The offending line is {group}."
            )

        for child in records.traverse_children([group]):
            hint = transform_child(
                child,
                group_id,
                hint_types,
                trim_sizes,
                priorities,
                args.source,
                args.priority,
            )
            # transform_child signals "no hint for this feature" with None.
            if hint is None:
                continue
            print(hint, file=args.outfile)
def add_parents(args: argparse.Namespace) -> None:
    """Give every parentless mRNA with an ID a synthetic gene parent.

    The GFF in ``args.infile`` is parsed and ``infer_missing_parents`` is
    run on it. Each ``mRNA`` record that still has no parents and carries
    an ID then receives a new ``gene`` record with the ID
    ``gene.<mRNA id>``, built by ``GFF3Record.infer_from_children``.
    The whole GFF is finally written to ``args.outfile`` after a
    ``##gff-version 3`` header.
    """
    annotations = GFF.parse(args.infile)
    annotations.infer_missing_parents()

    # NOTE(review): records are added to `annotations` while the
    # `select_type` iterator is still being consumed -- confirm that the
    # container tolerates this.
    for mrna in annotations.select_type("mRNA"):
        has_parent = len(mrna.parents) > 0
        # mRNAs that already have a parent, or that cannot supply an ID to
        # derive the gene ID from, are left untouched.
        if (
            has_parent
            or mrna.attributes is None
            or mrna.attributes.id is None
        ):
            continue

        new_gene = GFF3Record.infer_from_children(
            [mrna],
            id=f"gene.{mrna.attributes.id}",
            type="gene",
        )
        mrna.add_parent(new_gene)
        annotations.add_record(new_gene)

    print("##gff-version 3", file=args.outfile)
    for record in annotations.traverse_children(sort=True):
        print(record, file=args.outfile)
def main():
    """Flag poorly supported records and split the GFF into kept/dropped.

    For each record of type ``args.group_level``:

    * a record with an ``antifam_match`` custom attribute is tagged both
      ``is_unreliable`` and ``should_exclude``;
    * a record is "not supported" when no coverage value exceeds
      ``args.min_cov``, or when every type that does exceed it is listed in
      ``args.exclude``. Such records are tagged ``is_unreliable``, and
      additionally ``should_exclude`` unless ``is_novel_locus`` reports the
      locus as novel;
    * when ``args.stats`` is set, one JSON object per record is written
      to it.

    Afterwards ``split_gffs`` partitions the GFF; the kept part is written
    to ``args.outfile`` and, if ``args.filtered`` is given, the dropped part
    is written there.
    """
    args = cli(sys.argv[0], sys.argv[1:])

    gff = GFF.parse(args.infile)
    itree = gff_to_itree(gff.select_type(args.group_level))

    for mrna in gff.select_type(args.group_level):
        # Records without attributes get an empty set so the tags below
        # always have somewhere to go.
        if mrna.attributes is None:
            mrna.attributes = GFF3Attributes()
        custom = mrna.attributes.custom

        failed_antifam = "antifam_match" in custom
        if failed_antifam:
            custom["is_unreliable"] = "true"
            custom["should_exclude"] = "true"

        length, aligned = deal_with_kids(
            gff.traverse_children([mrna]),
            args.type,
            0,
            defaultdict(int),
        )
        coverages = find_coverages(aligned, length)

        supported = [
            kind for kind, cov in coverages.items() if cov > args.min_cov
        ]
        not_supported = (
            not supported
            or all(kind in args.exclude for kind in supported)
        )
        is_novel = is_novel_locus(mrna, itree, args.threshold)

        if not_supported:
            custom["is_unreliable"] = "true"
            # Unsupported records at a novel locus are kept (but marked
            # unreliable); elsewhere they are excluded.
            if not is_novel:
                custom["should_exclude"] = "true"

        if args.stats:
            # The summary fields are added to the coverage mapping itself,
            # so the JSON line carries coverages and summary together.
            summary = {
                "id": mrna.attributes.id,
                "length": length,
                "is_supported": not not_supported,
                "is_novel_locus": is_novel,
                "antifam_match": failed_antifam,
                "excluded": (
                    failed_antifam or (not_supported and not is_novel)
                ),
            }
            for field, value in summary.items():
                coverages[field] = value
            print(json.dumps(coverages), file=args.stats)

    kept, dropped = split_gffs(gff, args.group_level)
    write_gff(kept, args.outfile)

    if args.filtered is not None:
        write_gff(dropped, args.filtered)
def ncbi(args: argparse.Namespace) -> None:
    """Annotate a GFF with sequence-ontology derived information.

    Parses ``args.infile``, calls ``break_bubbles`` on the result, loads the
    ontology named by ``args.so`` and indexes its terms by name. The GFF is
    then passed, in order, to ``add_so_as_ontologies``, ``add_ncrna_types``
    and ``add_pseudogene_types``.

    NOTE(review): nothing is written out or returned here, so the effect of
    this function is limited to whatever the three helpers do -- confirm
    whether an output step is missing.
    """
    ontology = Ontology.from_obo_library(args.so)
    gff = GFF.parse(args.infile).break_bubbles()

    # Lookup table from term name to term object.
    terms_by_name = {}
    for term in ontology.values():
        terms_by_name[term.name] = term

    add_so_as_ontologies(gff, terms_by_name)
    add_ncrna_types(gff, terms_by_name, ontology, NCRNA_TYPES)
    add_pseudogene_types(gff, terms_by_name, ontology, PSEUDOGENE_TYPES)
def expandcds(args: argparse.Namespace) -> None:
    """Adjust CDS boundaries, report codon problems and rewrite the GFF.

    For every parent of a record of type ``args.cds_type``:

    * its CDS children are sorted by ``(start, end)`` and a strand is
      determined with ``find_strand``;
    * ``bump_start`` / ``bump_end`` are applied when ``args.start`` /
      ``args.stop`` are set;
    * if a FASTA was supplied and contains the parent's ``seqid``,
      ``check_start`` and ``check_stop`` are run and any non-``None``
      result is printed to ``args.warnings``.

    Afterwards every record yielded by ``traverse_parents`` for the CDS
    records has ``expand_to_children`` called on it, and the GFF is written
    to ``args.outfile`` after a ``##gff-version 3`` header.
    """
    gff = GFF.parse(args.infile)

    # The genome is optional; without it the codon checks are skipped.
    seqs = (
        None
        if args.infasta is None
        else SeqIO.to_dict(SeqIO.parse(args.infasta, format="fasta"))
    )

    codon_table = CodonTable.unambiguous_dna_by_id[args.gencode]

    cds_parents: Set[GFF3Record] = {
        cast(GFF3Record, owner)
        for cds in gff.select_type(args.cds_type)
        for owner in cds.parents
    }

    for parent in cds_parents:
        cdss = [
            cast(GFF3Record, child)
            for child in parent.children
            if child.type == args.cds_type
        ]
        cdss.sort(key=lambda cds: (cds.start, cds.end))

        strand = find_strand(cdss, parent)

        if args.start:
            bump_start(cdss, strand)

        if args.stop:
            bump_end(cdss, strand)

        if seqs is None or parent.seqid not in seqs:
            continue

        start_codon = check_start(cdss, parent, strand, seqs, codon_table)
        if start_codon is not None:
            print(start_codon, file=args.warnings)

        stop_codon = check_stop(cdss, parent, strand, seqs, codon_table)
        if stop_codon is not None:
            print(stop_codon, file=args.warnings)

    # Materialised up front, before any ancestor is resized.
    child_cdss: Sequence[GFF3Record] = list(gff.select_type(args.cds_type))
    for ancestor in gff.traverse_parents(child_cdss):
        ancestor.expand_to_children()

    print("##gff-version 3", file=args.outfile)
    for record in gff.traverse_children(sort=True):
        print(record, file=args.outfile)
def run_extract(args) -> None:
    """Write per-block FASTA, GFF and (optionally) hints output.

    Inputs are the GFF ``args.gff``, the FASTA ``args.fasta`` and, when
    given, the hints GFF ``args.hints`` and the ID list ``args.good``.
    ``get_blocks`` yields ``(name, seq, genes, hints)`` tuples; for each one
    the sequence is written to ``args.outfasta`` and a ``##sequence-region``
    line plus the genes are written to ``args.outgff``. The same region line
    and the hints go to ``args.outhints`` when that output was requested and
    the block has a non-empty hints set.
    """
    good_ids: Optional[Set[str]] = (
        get_good_ids(args.good) if args.good is not None else None
    )

    in_gff = GFF.parse(args.gff)
    in_fasta = SeqIO.to_dict(SeqIO.parse(args.fasta, format="fasta"))
    in_hints = GFF.parse(args.hints) if args.hints is not None else None

    blocks = get_blocks(
        in_fasta,
        in_gff,
        in_hints,
        args.pad,
        args.merge,
        good_ids,
    )

    write_hints = args.outhints is not None

    print("##gff-version 3", file=args.outgff)
    if write_hints:
        print("##gff-version 3", file=args.outhints)

    for name, seq, genes, hints in blocks:
        SeqIO.write(seq, args.outfasta, format="fasta")

        # Each block is reported as its own sequence spanning 1..len(seq).
        region_line = f"##sequence-region {name} 1 {len(seq)}"
        print(region_line, file=args.outgff)
        print(genes, file=args.outgff)

        if write_hints and hints is not None and len(hints.inner) > 0:
            print(region_line, file=args.outhints)
            print(hints, file=args.outhints)
def select(args: argparse.Namespace) -> None:
    """Write the records matching a list of IDs, plus their relatives.

    ``args.ids`` is iterated line by line and each line is stripped to give
    the set of wanted IDs. Every record in ``args.infile`` whose ID is in
    that set is kept together with everything yielded by its
    ``traverse_parents`` and ``traverse_children``. The kept records are
    passed through ``prune_gff`` and written to ``args.outfile`` after the
    GFF3 version header.
    """
    gff = GFF.parse(args.infile)
    ids = {line.strip() for line in args.ids}

    to_keep: Set[GFF3Record] = set()
    for record in gff:
        if record.attributes is None or record.attributes.id not in ids:
            continue

        to_keep.update(
            cast(Iterator[GFF3Record], record.traverse_parents()))
        to_keep.update(
            cast(Iterator[GFF3Record], record.traverse_children()))

    pruned = prune_gff(to_keep)

    # GFF3 directives start with two hashes; a single '#' is only a comment
    # and would leave the output without a version pragma.
    print("##gff-version 3", file=args.outfile)
    for feature in pruned.traverse_children(sort=True):
        print(feature, file=args.outfile)
def main():
    """Relabel every record of a GFF and attach its base frequencies.

    Each record in ``args.ingff`` has its type and source overwritten with
    ``args.type`` and ``args.source``, its strand set to unstranded and its
    ID set to ``<args.type><n>``, where ``n`` counts records from 1. The
    record's sequence is extracted from ``args.infasta`` and the result of
    ``count_frequencies`` replaces the record's custom attributes. Records
    are printed to ``args.outfile`` as they are processed.
    """
    args = cli(sys.argv[0], sys.argv[1:])
    gff = GFF.parse(args.ingff)
    seqs = SeqIO.to_dict(SeqIO.parse(args.infasta, format="fasta"))

    for counter, region in enumerate(gff, start=1):
        region.type = args.type
        region.source = args.source
        region.strand = Strand.UNSTRANDED

        # A record may carry no attributes at all; create an empty set so
        # the ID and frequencies can be stored instead of failing with an
        # AttributeError on None.
        if region.attributes is None:
            region.attributes = GFF3Attributes()

        region.attributes.id = f"{args.type}{counter}"

        sf = gff_to_seqfeature(region)
        seq = sf.extract(seqs[region.seqid])

        # The frequencies replace any existing custom attributes wholesale.
        region.attributes.custom = count_frequencies(seq)

        print(region, file=args.outfile)