def augustus(args):
    """
    %prog augustus augustus.gff3 > reformatted.gff3

    AUGUSTUS does generate a gff3 (--gff3=on) but need some refinement.
    """
    # NOTE: the docstring above doubles as the command-line usage text
    # (it is handed to OptionParser), so keep it user-facing.
    #
    # What this command does:
    #   * keeps only "gene", "transcript" and "CDS" features,
    #   * renames "transcript" features to "mRNA",
    #   * prefixes every ID and Parent with "<seqid>_",
    #   * de-duplicates repeated IDs by appending "-<n>" to the 2nd, 3rd, ...
    #     occurrence (n counts the earlier occurrences),
    #   * writes the rewritten features to the chosen outfile.
    #
    # NOTE(review): another function named `augustus` appears later in this
    # file; if both live in the same module the later one wins at import time.
    from jcvi.formats.gff import Gff

    p = OptionParser(augustus.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    ingff3, = args
    gff = Gff(ingff3)
    fw = must_open(opts.outfile, "w")
    # Number of times each prefixed ID has been emitted so far.
    seen = defaultdict(int)
    for g in gff:
        if g.type not in ("gene", "transcript", "CDS"):
            continue

        if g.type == "transcript":
            g.type = "mRNA"

        prefix = g.seqid + "_"
        pid = prefix + g.id
        # First occurrence keeps the plain ID; repeats get a "-<count>" suffix.
        # The membership test runs first, so the defaultdict is not populated
        # by the lookup on a first occurrence.
        newid = "{0}-{1}".format(pid, seen[pid]) if pid in seen else pid
        seen[pid] += 1
        g.attributes["ID"] = [newid]
        # NOTE(review): features without a Parent (e.g. genes) rely on
        # `g.attributes` yielding an empty iterable for a missing key --
        # confirm it behaves like a defaultdict(list).
        g.attributes["Parent"] = [(prefix + x) for x in g.attributes["Parent"]]
        g.update_attributes()
        # The file uses the print function elsewhere; `print >> fw, g` would
        # not write to `fw` under those semantics.
        print(g, file=fw)
    fw.close()
def trimUTR(args):
    """
    %prog trimUTR gffile

    Remove UTRs in the annotation set.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    #
    # Each feature is passed to `trim` with a CDS-derived (start, end):
    #   * gene  -> get_cds_minmax(index, gene_id)
    #   * mRNA  -> get_cds_minmax(index, mrna_id, level=1); the bounds are
    #              remembered so that child features can reuse them
    #   * CDS   -> left untouched
    #   * other -> trimmed to the bounds recorded for its parent mRNA
    # Features whose start ends up greater than their end are reported on
    # stderr and dropped; everything else is written to the outfile.
    p = OptionParser(trimUTR.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args
    g = make_index(gffile)
    gff = Gff(gffile)
    # mRNA id -> (start, end) computed for that mRNA.
    mRNA_register = {}
    fw = must_open(opts.outfile, "w")
    for c in gff:
        cid, ctype = c.accn, c.type
        if ctype == "gene":
            start, end = get_cds_minmax(g, cid)
            trim(c, start, end)
        elif ctype == "mRNA":
            start, end = get_cds_minmax(g, cid, level=1)
            trim(c, start, end)
            mRNA_register[cid] = (start, end)
        elif ctype != "CDS":
            # Raises KeyError if the parent was not registered as an mRNA
            # earlier in the file (same as before).
            start, end = mRNA_register[c.parent]
            trim(c, start, end)

        if c.start > c.end:
            print(
                cid,
                "destroyed [{0} > {1}]".format(c.start, c.end),
                file=sys.stderr,
            )
        else:
            print(c, file=fw)
    # The handle was previously left open; close it so buffered output is
    # flushed deterministically.
    fw.close()
def pasa(args):
    """
    %prog ${pasadb}.assemblies.fasta ${pasadb}.pasa_assemblies.gff3

    Wraps `pasa_asmbls_to_training_set.dbi`.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    #
    # Steps:
    #   1. Run pasa_asmbls_to_training_set.dbi when its two transdecoder
    #      outputs are out of date with respect to the inputs.
    #   2. Build "<fasta stem>.complete.ids" by grepping "complete" records
    #      out of the transdecoder gff3.
    #   3. Copy every feature of the genome-level transdecoder gff3 whose
    #      assembly id is in that set into "*.complete.gff3".
    from jcvi.formats.base import SetFile
    from jcvi.formats.gff import Gff

    p = OptionParser(pasa.__doc__)
    p.set_home("pasa")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, gffile = args
    transcodergff = fastafile + ".transdecoder.gff3"
    transcodergenomegff = fastafile + ".transdecoder.genome.gff3"
    if need_update((fastafile, gffile), (transcodergff, transcodergenomegff)):
        cmd = "{0}/scripts/pasa_asmbls_to_training_set.dbi".format(
            opts.pasa_home)
        cmd += " --pasa_transcripts_fasta {0} --pasa_transcripts_gff3 {1}".\
            format(fastafile, gffile)
        sh(cmd)

    completeids = fastafile.rsplit(".", 1)[0] + ".complete.ids"
    if need_update(transcodergff, completeids):
        cmd = "grep complete {0} | cut -f1 | sort -u".format(transcodergff)
        sh(cmd, outfile=completeids)

    complete = SetFile(completeids)
    # Ids of the gene features that were written out.
    seen = set()
    completegff = transcodergenomegff.rsplit(".", 1)[0] + ".complete.gff3"
    with open(completegff, "w") as fw:
        gff = Gff(transcodergenomegff)
        for g in gff:
            a = g.attributes
            # Use the parent id when there is one, otherwise the feature's
            # own id; the assembly id is the part before the first "|".
            feature_id = a["Parent"][0] if "Parent" in a else a["ID"][0]
            asmbl_id = feature_id.split("|")[0]
            if asmbl_id not in complete:
                continue
            print(g, file=fw)
            if g.type == "gene":
                seen.add(feature_id)

    logging.debug("A total of {0} complete models extracted to `{1}`.".\
        format(len(seen), completegff))
def yeasttruth(args):
    """
    %prog yeasttruth Pillars.tab *.gff

    Prepare pairs data for 14 yeasts.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    #
    # Steps:
    #   1. Read every gene feature of every GFF and map each of its "Alias"
    #      values to the gene accession. Genes from a file whose basename
    #      starts with "Saccharomyces_cerevisiae" are also recorded as pivots.
    #   2. Dump the alias table to "yeast.aliases".
    #   3. For each row of the pillars file, pair every pivot gene with every
    #      other known gene on that row; write the sorted unique pairs to the
    #      outfile.
    p = OptionParser(yeasttruth.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    pillars = args[0]
    gffiles = args[1:]
    aliases = {}
    pivot = {}
    for gffile in gffiles:
        is_pivot = op.basename(gffile).startswith("Saccharomyces_cerevisiae")
        gff = Gff(gffile)
        for g in gff:
            if g.type != "gene":
                continue
            for a in g.attributes["Alias"]:
                aliases[a] = g.accn
                if is_pivot:
                    pivot[a] = g.accn
    logging.debug("Aliases imported: {0}".format(len(aliases)))
    logging.debug("Pivot imported: {0}".format(len(pivot)))

    with open("yeast.aliases", "w") as fw:
        for k, v in sorted(aliases.items()):
            print("\t".join((k, v)), file=fw)

    pairs = set()
    # The pillars handle was previously never closed.
    with open(pillars) as fp:
        for row in fp:
            # "---" marks an empty column in the pillars table.
            atoms = [x for x in row.split() if x != "---"]
            pivots_in_row = [pivot[x] for x in atoms if x in pivot]
            genes_in_row = [aliases[x] for x in atoms if x in aliases]
            for pv in pivots_in_row:
                for gene in genes_in_row:
                    if pv == gene:
                        continue
                    # Sort so (a, b) and (b, a) collapse into one pair.
                    pairs.add(tuple(sorted((pv, gene))))

    fw = must_open(opts.outfile, "w")
    for a, b in sorted(pairs):
        print("\t".join((a, b)), file=fw)
    fw.close()
def uniq(args):
    """
    %prog uniq gffile cdsfasta

    Remove overlapping gene models. Similar to formats.gff.uniq(), overlapping
    'piles' are processed, one by one.

    Here, we use a different algorithm, that retains the best non-overlapping
    subset witin each pile, rather than single best model. Scoring function is
    also different, rather than based on score or span, we optimize for the
    subset that show the best combined score. Score is defined by:

    score = (1 - AED) * length
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(uniq.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gffile, cdsfasta = args
    features = Gff(gffile)
    cds_sizes = Sizes(cdsfasta).mapping

    # Parent gene id -> score, taken from each mRNA's "_AED" attribute and
    # the size looked up under the mRNA's accession.
    gene_register = {}
    for feat in features:
        if feat.type == "mRNA":
            aed = float(feat.attributes["_AED"][0])
            gene_register[feat.parent] = (1 - aed) * cds_sizes[feat.accn]

    allgenes = import_feats(gffile)
    piles = get_piles(allgenes)

    # Ids kept after chaining the ranges of every pile.
    bestids = set()
    for pile in piles:
        ranges = []
        for gene in pile:
            accn = gene.accn
            ranges.append(to_range(gene, score=gene_register[accn], id=accn))
        selected_chain, _ = range_chain(ranges)
        bestids.update(r.id for r in selected_chain)

    # Everything that was not selected is logged to a side file.
    removed = {gene.accn for gene in allgenes} - bestids
    with open("removed.ids", "w") as fw:
        print("\n".join(sorted(removed)), file=fw)

    populate_children(opts.outfile, bestids, gffile, "gene")
def get_cds_beds(gffile, noUTR=False):
    """
    Collect the `bedline` of the mRNA and of every CDS in `gffile`.

    Returns a tuple `(mrnabed, cdsbeds)`: `mrnabed` is the `bedline` of the
    last mRNA feature encountered (None if there is none) and `cdsbeds` is
    the list of `bedline`s of all CDS features, in file order.

    With `noUTR=True` the mRNA's start/end are overwritten with the smallest
    CDS start and the largest CDS end before returning.
    """
    from jcvi.formats.gff import Gff

    mrnabed = None
    cdsbeds = []
    for feat in Gff(gffile):
        ftype = feat.type
        if ftype == "mRNA":
            mrnabed = feat.bedline
        elif ftype == "CDS":
            cdsbeds.append(feat.bedline)

    if not noUTR:
        return mrnabed, cdsbeds

    # Shrink the mRNA interval to the span covered by its CDS features.
    mrnabed.start = min(cds.start for cds in cdsbeds)
    mrnabed.end = max(cds.end for cds in cdsbeds)
    return mrnabed, cdsbeds
def augustus(args):
    """
    %prog augustus augustus.gff3 > reformatted.gff3

    AUGUSTUS does generate a gff3 (--gff3=on) but need some refinement.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    #
    # Keeps only "gene", "transcript" and "CDS" features, renames
    # "transcript" to "mRNA", and prints each kept feature to stdout.
    #
    # NOTE(review): an earlier function in this file is also named
    # `augustus`; if both live in the same module this later definition
    # replaces it at import time -- confirm which one is intended.
    p = OptionParser(augustus.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    ingff3, = args
    gff = Gff(ingff3)
    for g in gff:
        if g.type not in ("gene", "transcript", "CDS"):
            continue

        if g.type == "transcript":
            g.type = "mRNA"

        # `print g` is a syntax error wherever print is a function, which
        # other functions in this file already assume.
        print(g)