def uniq(args):
    """
    %prog uniq gffile cdsfasta

    Remove overlapping gene models. Similar to formats.gff.uniq(), overlapping
    'piles' are processed, one by one.

    Here, we use a different algorithm, that retains the best non-overlapping
    subset within each pile, rather than single best model. Scoring function is
    also different, rather than based on score or span, we optimize for the
    subset that show the best combined score. Score is defined by:

    score = (1 - AED) * length
    """
    # NOTE: the docstring above doubles as the usage text given to
    # OptionParser, so keep its first line in "%prog ..." form.
    p = OptionParser(uniq.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gffile, cdsfasta = args
    gff = Gff(gffile)
    sizes = Sizes(cdsfasta).mapping

    # Register one score per parent gene from its mRNA records. The length is
    # looked up by the mRNA accession in the cdsfasta sizes mapping; when a
    # gene has several mRNA records the last one read wins.
    gene_register = {}
    for feat in gff:
        if feat.type != "mRNA":
            continue
        aed = float(feat.attributes["_AED"][0])
        gene_register[feat.parent] = (1 - aed) * sizes[feat.accn]

    allgenes = import_feats(gffile)
    piles = get_piles(allgenes)

    # For every pile keep the ids of the chain returned by range_chain();
    # the chain's combined score itself is not needed here.
    bestids = set()
    for group in piles:
        ranges = [
            to_range(x, score=gene_register[x.accn], id=x.accn) for x in group
        ]
        selected_chain, _ = range_chain(ranges)
        bestids.update(x.id for x in selected_chain)

    # Everything not selected is reported in a side file (fixed name, cwd).
    removed = {x.accn for x in allgenes} - bestids
    with open("removed.ids", "w") as fw:
        print("\n".join(sorted(removed)), file=fw)

    populate_children(opts.outfile, bestids, gffile, "gene")
def uniq(args):
    """
    %prog uniq gffile cdsfasta

    Remove overlapping gene models. Similar to formats.gff.uniq(), overlapping
    'piles' are processed, one by one.

    Here, we use a different algorithm, that retains the best non-overlapping
    subset within each pile, rather than single best model. Scoring function is
    also different, rather than based on score or span, we optimize for the
    subset that show the best combined score. Score is defined by:

    score = (1 - AED) * length
    """
    # NOTE: the docstring above doubles as the usage text given to
    # OptionParser, so keep its first line in "%prog ..." form.
    p = OptionParser(uniq.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gffile, cdsfasta = args
    gff = Gff(gffile)
    sizes = Sizes(cdsfasta).mapping

    # One score per parent gene, taken from its mRNA records; the length comes
    # from the cdsfasta sizes mapping keyed by the mRNA accession.
    gene_register = {}
    for rec in gff:
        if rec.type != "mRNA":
            continue
        aed = float(rec.attributes["_AED"][0])
        gene_register[rec.parent] = (1 - aed) * sizes[rec.accn]

    allgenes = import_feats(gffile)
    piles = get_piles(allgenes)

    bestids = set()
    for group in piles:
        ranges = [
            to_range(x, score=gene_register[x.accn], id=x.accn) for x in group
        ]
        # range_chain() also returns the chain's score, which is unused here.
        selected_chain, _ = range_chain(ranges)
        bestids |= set(x.id for x in selected_chain)

    removed = set(x.accn for x in allgenes) - bestids

    # Written with fw.write() rather than a print statement/function so the
    # same line works on both Python 2 and Python 3; the trailing newline
    # matches what print would have emitted.
    with open("removed.ids", "w") as fw:
        fw.write("\n".join(sorted(removed)) + "\n")

    populate_children(opts.outfile, bestids, gffile, "gene")
def trimUTR(args):
    """
    %prog trimUTR gffile

    Remove UTRs in the annotation set.

    If reference GFF3 is provided, reinstate UTRs from reference
    transcripts after trimming.

    Note: After running trimUTR, it is advised to also run
    `python -m jcvi.formats.gff fixboundaries` on the resultant GFF3
    to adjust the boundaries of all parent 'gene' features
    """
    # NOTE: the docstring above is handed to OptionParser as usage text.
    import gffutils
    from jcvi.formats.base import SetFile

    p = OptionParser(trimUTR.__doc__)
    p.add_option(
        "--trim5",
        default=None,
        type="str",
        help="File containing gene list for 5' UTR trimming",
    )
    p.add_option(
        "--trim3",
        default=None,
        type="str",
        help="File containing gene list for 3' UTR trimming",
    )
    p.add_option(
        "--trimrange",
        default=None,
        type="str",
        help="File containing gene list for UTR trim back "
        "based on suggested (start, stop) coordinate range",
    )
    p.add_option(
        "--refgff",
        default=None,
        type="str",
        help="Reference GFF3 used as fallback to replace UTRs",
    )
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (gffile,) = args
    gff = make_index(gffile)

    # With neither --trim5 nor --trim3 given, both ends are trimmed.
    trim_both = not (opts.trim5 or opts.trim3)
    trim5 = SetFile(opts.trim5) if opts.trim5 else set()
    trim3 = SetFile(opts.trim3) if opts.trim3 else set()

    # Optional per-feature (start, stop) ranges: tab-separated "id start stop".
    trimrange = {}
    if opts.trimrange:
        trf = must_open(opts.trimrange)
        try:
            for row in trf:
                fields = row.split("\t")
                if len(fields) != 3:
                    # Explicit raise instead of assert so the check survives -O.
                    raise ValueError("Must specify (start, stop) coordinate range")
                feat_id, start, stop = fields
                trimrange[feat_id] = (int(start), int(stop))
        finally:
            trf.close()

    refgff = make_index(opts.refgff) if opts.refgff else None

    fw = must_open(opts.outfile, "w")
    try:
        for feat in gff.iter_by_parent_childs(
            featuretype="gene", order_by=("seqid", "start"), level=1
        ):
            for c in feat:
                cid, ctype = c.id, c.featuretype
                cparent = c.attributes.get("Parent", [None])[0]
                t5, t3 = False, False
                if ctype == "gene":
                    t5 = cid in trim5
                    t3 = cid in trim3
                    start, end = get_cds_minmax(gff, cid)
                    trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                    fprint(c, fw)
                elif ctype == "mRNA":
                    utr_types, extras = [], set()
                    # An mRNA inherits the trim flag of its parent gene; its own
                    # id is then recorded in the set as well.
                    if cid in trim5 or cparent in trim5:
                        t5 = True
                        trim5.add(cid)
                    if cid in trim3 or cparent in trim3:
                        t3 = True
                        trim3.add(cid)

                    refc = None
                    if refgff:
                        try:
                            refc = refgff[cid]
                            refctype = refc.featuretype
                            refptype = refgff[
                                refc.attributes["Parent"][0]
                            ].featuretype
                            if refctype == "mRNA" and refptype == "gene":
                                if cmp_children(cid, gff, refgff, cftype="CDS"):
                                    reinstate(
                                        c, refc, trim5=t5, trim3=t3, both=trim_both
                                    )
                                    if t5:
                                        utr_types.append("five_prime_UTR")
                                    if t3:
                                        utr_types.append("three_prime_UTR")
                                    # Collect the reference UTRs plus the
                                    # reference exons in the same region that
                                    # belong to this transcript.
                                    for utr_type in utr_types:
                                        for utr in refgff.children(
                                            refc, featuretype=utr_type
                                        ):
                                            extras.add(utr)
                                            for exon in refgff.region(
                                                region=utr, featuretype="exon"
                                            ):
                                                if exon.attributes["Parent"][0] == cid:
                                                    extras.add(exon)
                            else:
                                # NOTE(review): source was whitespace-flattened;
                                # this else is paired with the mRNA/gene type
                                # check -- confirm against the upstream layout.
                                refc = None
                        except gffutils.exceptions.FeatureNotFoundError:
                            # Feature absent from the reference index.
                            pass

                    start, end = get_cds_minmax(gff, cid, level=1)
                    if cid in trimrange:
                        start, end = range_minmax([trimrange[cid], (start, end)])
                    # The mRNA itself is only trimmed when no reference
                    # feature is held in refc.
                    if not refc:
                        trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                    fprint(c, fw)

                    for cc in gff.children(cid, order_by="start"):
                        _ctype = cc.featuretype
                        if _ctype not in utr_types:
                            if _ctype != "CDS":
                                if _ctype == "exon":
                                    # Skip exons overlapping an exon that will
                                    # be emitted from the reference instead.
                                    eskip = [
                                        range_overlap(to_range(cc), to_range(x))
                                        for x in extras
                                        if x.featuretype == "exon"
                                    ]
                                    if any(eskip):
                                        continue
                                trim(cc, start, end, trim5=t5, trim3=t3, both=trim_both)
                                fprint(cc, fw)
                            else:
                                # CDS features are written unchanged.
                                fprint(cc, fw)

                    for x in extras:
                        fprint(x, fw)
    finally:
        fw.close()
def trimUTR(args):
    """
    %prog trimUTR gffile

    Remove UTRs in the annotation set.

    If reference GFF3 is provided, reinstate UTRs from reference
    transcripts after trimming.

    Note: After running trimUTR, it is advised to also run
    `python -m jcvi.formats.gff fixboundaries` on the resultant GFF3
    to adjust the boundaries of all parent 'gene' features
    """
    # NOTE: the docstring above is handed to OptionParser as usage text.
    import gffutils
    from jcvi.formats.base import SetFile

    p = OptionParser(trimUTR.__doc__)
    p.add_option("--trim5", default=None, type="str",
                 help="File containing gene list for 5' UTR trimming")
    p.add_option("--trim3", default=None, type="str",
                 help="File containing gene list for 3' UTR trimming")
    # The two help fragments are joined with a space between them.
    p.add_option("--trimrange", default=None, type="str",
                 help="File containing gene list for UTR trim back "
                      "based on suggested (start, stop) coordinate range")
    p.add_option("--refgff", default=None, type="str",
                 help="Reference GFF3 used as fallback to replace UTRs")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args
    gff = make_index(gffile)

    # With neither --trim5 nor --trim3 given, both ends are trimmed.
    trim_both = not (opts.trim5 or opts.trim3)
    trim5 = SetFile(opts.trim5) if opts.trim5 else set()
    trim3 = SetFile(opts.trim3) if opts.trim3 else set()

    # Optional per-feature (start, stop) ranges: tab-separated "id start stop".
    trimrange = dict()
    if opts.trimrange:
        trf = must_open(opts.trimrange)
        try:
            for line in trf:
                cols = line.split("\t")
                if len(cols) != 3:
                    # Explicit raise instead of assert so the check survives -O.
                    raise ValueError("Must specify (start, stop) coordinate range")
                name, start, stop = cols
                trimrange[name] = (int(start), int(stop))
        finally:
            trf.close()

    refgff = make_index(opts.refgff) if opts.refgff else None

    fw = must_open(opts.outfile, "w")
    try:
        for feat in gff.iter_by_parent_childs(featuretype="gene",
                                              order_by=("seqid", "start"),
                                              level=1):
            for c in feat:
                cid, ctype = c.id, c.featuretype
                cparent = c.attributes.get("Parent", [None])[0]
                t5, t3 = False, False
                if ctype == "gene":
                    t5 = cid in trim5
                    t3 = cid in trim3
                    start, end = get_cds_minmax(gff, cid)
                    trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                    fprint(c, fw)
                elif ctype == "mRNA":
                    utr_types, extras = [], set()
                    # An mRNA inherits the trim flag of its parent gene; its own
                    # id is then recorded in the set as well.
                    if cid in trim5 or cparent in trim5:
                        t5 = True
                        trim5.add(cid)
                    if cid in trim3 or cparent in trim3:
                        t3 = True
                        trim3.add(cid)

                    refc = None
                    if refgff:
                        try:
                            refc = refgff[cid]
                            refctype = refc.featuretype
                            refparent = refgff[refc.attributes["Parent"][0]]
                            refptype = refparent.featuretype
                            if refctype == "mRNA" and refptype == "gene":
                                if cmp_children(cid, gff, refgff, cftype="CDS"):
                                    reinstate(c, refc, trim5=t5, trim3=t3,
                                              both=trim_both)
                                    if t5:
                                        utr_types.append("five_prime_UTR")
                                    if t3:
                                        utr_types.append("three_prime_UTR")
                                    # Collect the reference UTRs plus the
                                    # reference exons in the same region that
                                    # belong to this transcript.
                                    for utr_type in utr_types:
                                        for utr in refgff.children(
                                                refc, featuretype=utr_type):
                                            extras.add(utr)
                                            for exon in refgff.region(
                                                    region=utr,
                                                    featuretype="exon"):
                                                if exon.attributes["Parent"][0] == cid:
                                                    extras.add(exon)
                            else:
                                # NOTE(review): source was whitespace-flattened;
                                # this else is paired with the mRNA/gene type
                                # check -- confirm against the upstream layout.
                                refc = None
                        except gffutils.exceptions.FeatureNotFoundError:
                            # Feature absent from the reference index.
                            pass

                    start, end = get_cds_minmax(gff, cid, level=1)
                    if cid in trimrange:
                        start, end = range_minmax([trimrange[cid], (start, end)])
                    # The mRNA itself is only trimmed when no reference
                    # feature is held in refc.
                    if not refc:
                        trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                    fprint(c, fw)

                    for cc in gff.children(cid, order_by="start"):
                        _ctype = cc.featuretype
                        if _ctype not in utr_types:
                            if _ctype != "CDS":
                                if _ctype == "exon":
                                    # Skip exons overlapping an exon that will
                                    # be emitted from the reference instead.
                                    eskip = [range_overlap(to_range(cc), to_range(x))
                                             for x in extras
                                             if x.featuretype == "exon"]
                                    if any(eskip):
                                        continue
                                trim(cc, start, end, trim5=t5, trim3=t3,
                                     both=trim_both)
                                fprint(cc, fw)
                            else:
                                # CDS features are written unchanged.
                                fprint(cc, fw)

                    for x in extras:
                        fprint(x, fw)
    finally:
        fw.close()