def ls(args):
    """
    %prog ls "s3://hli-mv-data-science/htang/str/*.vcf.gz"

    List files with support for wildcards.
    """
    # NOTE: the docstring above doubles as the usage text handed to OptionParser.
    parser = OptionParser(ls.__doc__)
    parser.add_option("--keys", help="List of keys to include")
    parser.add_option(
        "--recursive",
        default=False,
        action="store_true",
        help="Recursive search",
    )
    opts, args = parser.parse_args(args)

    # Exactly one positional argument (the store pattern) is accepted.
    if len(args) != 1:
        sys.exit(not parser.print_help())

    (store,) = args
    # --keys names a file; it is loaded through SetFile only when supplied,
    # otherwise the unset option value is forwarded untouched.
    keys = SetFile(opts.keys) if opts.keys else opts.keys
    matches = glob_s3(store, keys=keys, recursive=opts.recursive)
    print("\n".join(matches))
def cut(args):
    """
    %prog cut unitigfile fragID

    Cut the unitig at a given fragment. Run `%prog trace unitigfile` first to
    see which fragment breaks the unitig.
    """
    # NOTE: the docstring above doubles as the usage text handed to OptionParser.
    from jcvi.formats.base import SetFile

    parser = OptionParser(cut.__doc__)
    parser.add_option(
        "-s",
        dest="shredafter",
        default=False,
        action="store_true",
        help="Shred fragments after the given fragID [default: %default]",
    )
    parser.add_option(
        "--notest",
        default=False,
        action="store_true",
        help="Do not test the unitigfile after edits [default: %default]",
    )
    parser.add_option(
        "--blacklist",
        help="File that contains blacklisted fragments to be popped "
        "[default: %default]",
    )
    opts, args = parser.parse_args(args)
    if len(args) != 2:
        sys.exit(not parser.print_help())

    unitigfile, fragID = args
    layout = UnitigLayout(unitigfile)

    # The blacklist is only loaded when a file was supplied.
    black = None
    if opts.blacklist:
        black = SetFile(opts.blacklist)

    # Exactly one edit is applied; -s wins over --blacklist, and a plain cut
    # is the fallback (also taken when the blacklist turned out empty).
    if opts.shredafter:
        layout.shredafter(fragID)
    elif black:
        assert fragID == "0", "Must set fragID to 0 when --blacklist is on"
        layout.pop(black)
    else:
        layout.cut(fragID)

    layout.print_to_file(inplace=True)

    if opts.notest:
        return
    test([unitigfile])
def screen(args):
    """
    %prog screen anchorfile newanchorfile --qbed=qbedfile --sbed=sbedfile [options]

    Extract subset of blocks from anchorfile. Provide several options:

    1. Option --ids: a file with IDs, 0-based, comma separated, all in one line.
    2. Option --seqids: only allow seqids in this file.
    3. Option --seqpairs: only allow seqpairs in this file, one per line,
       e.g. "Chr01,Chr05".
    4. Option --minspan: remove blocks with less span than this.
    5. Option --minsize: remove blocks with less number of anchors than this.
    """
    # NOTE: the docstring above doubles as the usage text handed to OptionParser.
    p = OptionParser(screen.__doc__)
    p.set_beds()
    p.add_option("--ids",
                 help="File with block IDs (0-based) [default: %default]")
    p.add_option("--seqids", help="File with seqids [default: %default]")
    p.add_option("--seqpairs", help="File with seqpairs [default: %default]")
    p.add_option("--nointra", action="store_true",
                 help="Remove intra-chromosomal blocks [default: %default]")
    p.add_option("--minspan", default=0, type="int",
                 help="Only blocks with span >= [default: %default]")
    p.add_option("--minsize", default=0, type="int",
                 help="Only blocks with anchors >= [default: %default]")
    p.add_option("--simple", action="store_true",
                 help="Write simple anchorfile with block ends [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    anchorfile, newanchorfile = args
    ac = AnchorFile(anchorfile)
    minspan = opts.minspan
    minsize = opts.minsize
    nointra = opts.nointra

    # Each filter stays None (disabled) unless its option was given.
    ids, seqids, seqpairs = None, None, None
    if opts.ids:
        ids = set(int(x) for x in SetFile(opts.ids, delimiter=","))
    if opts.seqids:
        seqids = SetFile(opts.seqids, delimiter=",")
    if opts.seqpairs:
        seqpairs = set()
        # Context manager so the handle is released even on a malformed row.
        with open(opts.seqpairs) as fp:
            for row in fp:
                row = row.strip()
                if not row:
                    # Tolerate blank lines instead of failing the unpack below.
                    continue
                a, b = row.split(",")
                # Store both orientations so the lookup is order-insensitive.
                seqpairs.add((a, b))
                seqpairs.add((b, a))

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorfile, p, opts)
    blocks = ac.blocks
    selected = 0
    with open(newanchorfile, "w") as fw:
        for i, block in enumerate(blocks):
            if ids and i not in ids:
                continue

            # Each block row unpacks as (query, subject, score); map the names
            # through qorder/sorder, whose values unpack as (index, bed entry).
            a, b, scores = zip(*block)
            a = [qorder[x] for x in a]
            b = [sorder[x] for x in b]
            ia, oa = zip(*a)
            ib, ob = zip(*b)
            aspan = max(ia) - min(ia) + 1
            bspan = max(ib) - min(ib) + 1
            # The seqid of the first anchor is taken as the block's seqid.
            aseqid = oa[0].seqid
            bseqid = ob[0].seqid

            if seqids and (aseqid not in seqids or bseqid not in seqids):
                continue
            if seqpairs and (aseqid, bseqid) not in seqpairs:
                continue
            if nointra and aseqid == bseqid:
                continue
            if minsize and len(block) < minsize:
                continue
            if minspan and (aspan < minspan or bspan < minspan):
                continue

            selected += 1
            # fw.write() works on both Python 2 and 3 (the old
            # `print >> fw` form fails at runtime on Python 3).
            fw.write("###\n")
            for line in block:
                fw.write("\t".join(line) + "\n")

    # The output file is closed at this point, so simple() can read it back.
    if opts.simple:
        simple([newanchorfile, "--noheader",
                "--qbed=" + qbed.filename, "--sbed=" + sbed.filename])

    logging.debug("Before: {0} blocks, After: {1} blocks".
                  format(len(blocks), selected))
def main(args):
    """
    %prog deltafile

    Plot one query. Extract the references that have major matches to this query.
    Control "major" by option --refcov.
    """
    # NOTE: the docstring above doubles as the usage text handed to OptionParser.
    p = OptionParser(main.__doc__)
    p.add_option("--refids", help="Use subset of contigs in the ref")
    p.add_option("--refcov", default=.01, type="float",
                 help="Minimum reference coverage [default: %default]")
    p.add_option("--all", default=False, action="store_true",
                 help="Plot one pdf file per ref in refidsfile [default: %default]")
    p.add_option("--color", default="similarity",
                 choices=("similarity", "direction", "none"),
                 help="Color the dots based on")
    p.add_option("--nolayout", default=False, action="store_true",
                 help="Do not rearrange contigs")
    p.set_align(pctid=0, hitlen=0)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    deltafile, = args
    # The first line of the delta file is split into the reference and query
    # FASTA paths; read it under a context manager so the handle is closed.
    with open(deltafile) as fp:
        reffasta, queryfasta = fp.readline().split()

    prefix = op.basename(deltafile).split(".")[0]
    qsizes = Sizes(queryfasta).mapping
    rsizes = Sizes(reffasta).mapping

    # Restrict to the user-supplied reference IDs, else use every reference.
    refs = SetFile(opts.refids) if opts.refids else set(rsizes.keys())
    refcov = opts.refcov

    # NOTE(review): `filter` here is expected to be the project's delta filter
    # (it is called with a CLI-style argument list), not the builtin - confirm
    # against the file's imports.
    deltafile = filter([
        deltafile,
        "--pctid={0}".format(opts.pctid),
        "--hitlen={0}".format(opts.hitlen),
    ])

    # Keyword arguments shared by both plotting modes.
    plot_kwargs = dict(prefix=prefix, color=opts.color,
                       layout=not opts.nolayout)

    if opts.all:
        # One plot per reference, renamed after the reference ID.
        for r in refs:
            pdffile = plot_some_queries([r], qsizes, rsizes, deltafile, refcov,
                                        **plot_kwargs)
            if pdffile:
                sh("mv {0} {1}.pdf".format(pdffile, r))
    else:
        plot_some_queries(refs, qsizes, rsizes, deltafile, refcov,
                          **plot_kwargs)
def get_blacklist(self, filename):
    """Load names from `filename`, convert each with atg_name(), and merge
    the converted values into self.black."""
    # Convert everything first so self.black is only touched once the whole
    # file has been converted successfully.
    converted = {atg_name(entry) for entry in SetFile(filename)}
    self.black.update(converted)
def adjgraph(args):
    """
    %prog adjgraph adjacency.txt subgraph.txt

    Construct adjacency graph for graphviz. The file may look like sample below.
    The lines with numbers are chromosomes with gene order information.

    genome 0
    chr 0
    -1 -13 -16 3 4 -6126 -5 17 -6 7 18 5357 8 -5358 5359 -9 -10 -11 5362 5360
    chr 1
    138 6133 -5387 144 -6132 -139 140 141 146 -147 6134 145 -170 -142 -143
    """
    # NOTE: the docstring above doubles as the usage text handed to OptionParser.
    import pygraphviz as pgv
    from jcvi.formats.base import SetFile

    p = OptionParser(adjgraph.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    infile, subgraph = args
    # Node names of interest, with any strand sign stripped.
    subgraph = set(x.strip("-") for x in SetFile(subgraph))

    G = pgv.AGraph(strict=False)  # allow multi-edge
    SG = pgv.AGraph(strict=False)

    # One edge color per genome; more than len(palette) "genome" lines
    # raises IndexError.
    palette = ("green", "magenta", "tomato", "peachpuff")

    genome_id = -1
    key = 0
    # Context manager so the input handle is always released.
    with open(infile) as fp:
        for row in fp:
            if row.strip() == "":
                continue

            atoms = row.split()
            tag = atoms[0]
            if tag in ("ChrNumber", "chr"):
                continue

            if tag == "genome":
                genome_id += 1
                gcolor = palette[genome_id]
                continue

            # Expand every gene into its two ends (L and R); a leading "-"
            # flips the order in which the ends are visited.
            nodeseq = []
            for atom in atoms:
                name = atom.strip("-")
                nodeL, nodeR = name + "L", name + "R"
                if atom[0] == "-":  # negative strand
                    nodeseq += [nodeR, nodeL]
                else:
                    nodeseq += [nodeL, nodeR]

            for a, b in pairwise(nodeseq):
                G.add_edge(a, b, key, color=gcolor)
                key += 1

                # Mirror the edge into the subgraph when either endpoint
                # (minus its L/R suffix) is a requested node.
                na, nb = a[:-1], b[:-1]
                if na not in subgraph and nb not in subgraph:
                    continue

                SG.add_edge(a, b, key, color=gcolor)

    G.graph_attr.update(dpi="300")

    with open("graph.dot", "w") as fw:
        G.write(fw)

    with open("subgraph.dot", "w") as fw:
        SG.write(fw)
def get_blacklist(self, filename):
    """Read names from `filename` and add the (chromosome, rank) pair that
    atg_name() yields for each one to self.black."""
    for name in SetFile(filename):
        # atg_name() must return exactly two values; the unpack enforces it.
        chrom, rank = atg_name(name)
        self.black.add((chrom, rank))
def trimUTR(args):
    """
    %prog trimUTR gffile

    Remove UTRs in the annotation set.

    If reference GFF3 is provided, reinstate UTRs from reference
    transcripts after trimming.

    Note: After running trimUTR, it is advised to also run
    `python -m jcvi.formats.gff fixboundaries` on the resultant GFF3
    to adjust the boundaries of all parent 'gene' features
    """
    # NOTE: the docstring above doubles as the usage text handed to OptionParser.
    import gffutils
    from jcvi.formats.base import SetFile

    p = OptionParser(trimUTR.__doc__)
    p.add_option(
        "--trim5",
        default=None,
        type="str",
        help="File containing gene list for 5' UTR trimming",
    )
    p.add_option(
        "--trim3",
        default=None,
        type="str",
        help="File containing gene list for 3' UTR trimming",
    )
    p.add_option(
        "--trimrange",
        default=None,
        type="str",
        # The two fragments previously joined as "trim backbased"; the
        # trailing space restores the intended wording.
        help="File containing gene list for UTR trim back "
        "based on suggested (start, stop) coordinate range",
    )
    p.add_option(
        "--refgff",
        default=None,
        type="str",
        help="Reference GFF3 used as fallback to replace UTRs",
    )
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (gffile,) = args
    gff = make_index(gffile)

    # With neither --trim5 nor --trim3 given, both ends are trimmed.
    trim_both = not (opts.trim5 or opts.trim3)
    trim5 = SetFile(opts.trim5) if opts.trim5 else set()
    trim3 = SetFile(opts.trim3) if opts.trim3 else set()

    # Optional per-feature (start, stop) overrides: ID<TAB>start<TAB>stop.
    trimrange = dict()
    if opts.trimrange:
        trf = must_open(opts.trimrange)
        try:
            for tr in trf:
                atoms = tr.split("\t")
                assert (
                    len(atoms) == 3
                ), "Must specify (start, stop) coordinate range"
                fid, start, stop = atoms
                trimrange[fid] = (int(start), int(stop))
        finally:
            # Release the handle even when a row is malformed.
            trf.close()

    refgff = make_index(opts.refgff) if opts.refgff else None

    fw = must_open(opts.outfile, "w")
    for feat in gff.iter_by_parent_childs(
        featuretype="gene", order_by=("seqid", "start"), level=1
    ):
        for c in feat:
            cid, ctype, cparent = (
                c.id,
                c.featuretype,
                c.attributes.get("Parent", [None])[0],
            )
            t5, t3 = False, False
            if ctype == "gene":
                t5 = cid in trim5
                t3 = cid in trim3
                start, end = get_cds_minmax(gff, cid)
                trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                fprint(c, fw)
            elif ctype == "mRNA":
                utr_types, extras = [], set()
                # A transcript is selected when it or its parent is listed;
                # its own ID is then recorded in the corresponding set.
                if any(x in trim5 for x in (cid, cparent)):
                    t5 = True
                    trim5.add(cid)
                if any(x in trim3 for x in (cid, cparent)):
                    t3 = True
                    trim3.add(cid)

                refc = None
                if refgff:
                    try:
                        refc = refgff[cid]
                        refctype = refc.featuretype
                        refptype = refgff[
                            refc.attributes["Parent"][0]
                        ].featuretype
                        # NOTE(review): the nesting of the `else` below was
                        # reconstructed from whitespace-collapsed source; it
                        # is attached to this `if` - confirm against upstream.
                        if refctype == "mRNA" and refptype == "gene":
                            if cmp_children(cid, gff, refgff, cftype="CDS"):
                                reinstate(
                                    c, refc, trim5=t5, trim3=t3, both=trim_both
                                )
                                if t5:
                                    utr_types.append("five_prime_UTR")
                                if t3:
                                    utr_types.append("three_prime_UTR")
                                # Collect reference UTRs, plus reference exons
                                # in the same region that belong to this
                                # transcript, to be emitted as extras.
                                for utr_type in utr_types:
                                    for utr in refgff.children(
                                        refc, featuretype=utr_type
                                    ):
                                        extras.add(utr)
                                        for exon in refgff.region(
                                            region=utr, featuretype="exon"
                                        ):
                                            if exon.attributes["Parent"][0] == cid:
                                                extras.add(exon)
                        else:
                            refc = None
                    except gffutils.exceptions.FeatureNotFoundError:
                        # Transcript (or its parent) absent from the
                        # reference: fall through with whatever was set.
                        pass

                start, end = get_cds_minmax(gff, cid, level=1)
                if cid in trimrange:
                    start, end = range_minmax([trimrange[cid], (start, end)])

                # Trim the transcript itself only when no reference feature
                # is held in refc.
                if not refc:
                    trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                fprint(c, fw)

                for cc in gff.children(cid, order_by="start"):
                    _ctype = cc.featuretype
                    # Children whose type is in utr_types are not printed
                    # here; the extras loop below prints the reference ones.
                    if _ctype not in utr_types:
                        if _ctype != "CDS":
                            if _ctype == "exon":
                                # Skip exons overlapping a reference exon
                                # that will be printed as an extra.
                                eskip = [
                                    range_overlap(to_range(cc), to_range(x))
                                    for x in extras
                                    if x.featuretype == "exon"
                                ]
                                if any(eskip):
                                    continue
                            trim(cc, start, end, trim5=t5, trim3=t3, both=trim_both)
                            fprint(cc, fw)
                        else:
                            # CDS features are written unchanged.
                            fprint(cc, fw)

                for x in extras:
                    fprint(x, fw)
    fw.close()
def maker(args):
    """
    %prog maker maker.gff3 genome.fasta

    Prepare EVM inputs by separating tracks from MAKER.
    """
    # NOTE: the docstring above doubles as the usage text handed to OptionParser.
    from jcvi.formats.base import SetFile, FileShredder

    A, T, P = "ABINITIO_PREDICTION", "TRANSCRIPT", "PROTEIN"
    # Default (evidence class, weight) for every recognised source prefix
    Registry = {
        "maker": (A, 5),
        "augustus_masked": (A, 1),
        "snap_masked": (A, 1),
        "genemark": (A, 1),
        "est2genome": (T, 5),
        "est_gff": (T, 5),
        "protein2genome": (P, 5),
        "blastx": (P, 1),
    }

    parser = OptionParser(maker.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    gffile, fastafile = args

    # Column 2 of the GFF, de-duplicated, is kept in type.ids and refreshed
    # only when need_update() says so.
    typesfile = "type.ids"
    if need_update(gffile, typesfile):
        sh("cut -f2 -s {0} | sort -u".format(gffile), outfile=typesfile)
    sources = SetFile(typesfile)

    # Group the sources by evidence class and build the weights table rows.
    tracks_by_class = defaultdict(list)
    weight_rows = []
    for source in sources:
        prefix = source.split(":")[0]
        if prefix not in Registry:
            continue

        evclass, weight = Registry[prefix]
        tracks_by_class[evclass].append(source)
        weight_rows.append("\t".join(str(x) for x in (evclass, source, weight)))

    write_file("weights.txt", "\n".join(sorted(weight_rows)), meta="weights file")

    # One output GFF per evidence class; FileShredder is handed the three
    # names before the grep commands append (>>) to them.
    evs = [x + ".gff" for x in (A, T, P)]
    FileShredder(evs)

    for evclass, tracks in tracks_by_class.items():
        for track in tracks:
            sh("grep '\t{0}' {1} | grep -v '_match\t' >> {2}.gff".format(track, gffile, evclass))

    partition(evs)
    write_file("run.sh", EVMRUN.format(*evs), meta="run script")
def segment(args):
    """
    %prog segment loss.ids bedfile

    Merge adjacent gene loss into segmental loss.

    Then based on the segmental loss, estimate amount of DNA loss in base pairs.
    Two estimates can be given:
    - conservative: just within the start and end of a single gene
    - aggressive: extend the deletion track to the next gene

    The real deletion size is within these estimates.
    """
    # NOTE: the docstring above doubles as the usage text handed to OptionParser.
    from jcvi.formats.base import SetFile

    p = OptionParser(segment.__doc__)
    p.add_option("--chain", default=1, type="int",
                 help="Allow next N genes to be chained [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    idsfile, bedfile = args
    bed = Bed(bedfile)
    order = bed.order
    ids = SetFile(idsfile)
    losses = Grouper()
    skip = opts.chain
    nbed = len(bed)
    for i, a in enumerate(bed):
        a = a.accn
        if a not in ids:
            continue
        # Register every lost gene on its own first. This used to happen
        # inside the look-ahead loop, so the last BED entry (and every entry
        # when --chain=0) was silently dropped from the summary.
        losses.join(a, a)
        # Chain with up to `skip` following genes that are lost as well.
        for j in range(i + 1, min(i + 1 + skip, nbed)):
            b = bed[j].accn
            if b in ids:
                losses.join(a, b)

    losses = list(losses)
    singletons = [x for x in losses if len(x) == 1]
    segments = [x for x in losses if len(x) > 1]
    ns, nm, nt = len(singletons), len(segments), len(losses)
    assert ns + nm == nt

    # Summary for all segments
    for x in sorted(singletons) + sorted(segments):
        print("\t".join(str(x) for x in ("|".join(sorted(x)), len(x),
                                         estimate_size(x, bed, order))))

    # sys.stderr.write() works on both Python 2 and 3
    # (`print >> sys.stderr` is Python-2-only).
    write_err = sys.stderr.write

    # Find longest segment stretch
    if segments:
        mx, maxsegment = max([(len(x), x) for x in segments])
        write_err("Longest stretch: run of {0} genes".format(mx) + "\n")
        write_err("  {0}".format("|".join(sorted(maxsegment))) + "\n")
        seg_asize = sum(estimate_size(x, bed, order) for x in segments)
        seg_bsize = sum(estimate_size(x, bed, order, conservative=False)
                        for x in segments)
    else:
        seg_asize = seg_bsize = 0

    sing_asize = sum(estimate_size(x, bed, order) for x in singletons)
    sing_bsize = sum(estimate_size(x, bed, order, conservative=False)
                     for x in singletons)
    total_asize = sing_asize + seg_asize
    total_bsize = sing_bsize + seg_bsize
    write_err("Singleton ({0}): {1} - {2} bp".
              format(ns, sing_asize, sing_bsize) + "\n")
    write_err("Segment ({0}): {1} - {2} bp".
              format(nm, seg_asize, seg_bsize) + "\n")
    write_err("Total ({0}): {1} - {2} bp".
              format(nt, total_asize, total_bsize) + "\n")
    # `//` keeps the integer result the Python 2 `/` produced
    # (assumes estimate_size() returns integers - TODO confirm).
    write_err("Average ({0}): {1} bp".
              format(nt, (total_asize + total_bsize) // 2) + "\n")
def trimUTR(args):
    """
    %prog trimUTR gffile

    Remove UTRs in the annotation set.

    If reference GFF3 is provided, reinstate UTRs from reference
    transcripts after trimming.

    Note: After running trimUTR, it is advised to also run
    `python -m jcvi.formats.gff fixboundaries` on the resultant GFF3
    to adjust the boundaries of all parent 'gene' features
    """
    # NOTE: the docstring above doubles as the usage text handed to OptionParser.
    import gffutils
    from jcvi.formats.base import SetFile

    p = OptionParser(trimUTR.__doc__)
    p.add_option("--trim5", default=None, type="str",
                 help="File containing gene list for 5' UTR trimming")
    p.add_option("--trim3", default=None, type="str",
                 help="File containing gene list for 3' UTR trimming")
    # The two help fragments previously joined as "trim backbased"; the
    # trailing space restores the intended wording.
    p.add_option("--trimrange", default=None, type="str",
                 help="File containing gene list for UTR trim back "
                      "based on suggested (start, stop) coordinate range")
    p.add_option("--refgff", default=None, type="str",
                 help="Reference GFF3 used as fallback to replace UTRs")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args
    gff = make_index(gffile)

    # With neither --trim5 nor --trim3 given, both ends are trimmed.
    trim_both = not (opts.trim5 or opts.trim3)
    trim5 = SetFile(opts.trim5) if opts.trim5 else set()
    trim3 = SetFile(opts.trim3) if opts.trim3 else set()

    # Optional per-feature (start, stop) overrides: ID<TAB>start<TAB>stop.
    trimrange = dict()
    if opts.trimrange:
        trf = must_open(opts.trimrange)
        try:
            for tr in trf:
                atoms = tr.split("\t")
                assert len(atoms) == 3, \
                    "Must specify (start, stop) coordinate range"
                fid, start, stop = atoms
                trimrange[fid] = (int(start), int(stop))
        finally:
            # Release the handle even when a row is malformed.
            trf.close()

    refgff = make_index(opts.refgff) if opts.refgff else None

    fw = must_open(opts.outfile, "w")
    for feat in gff.iter_by_parent_childs(featuretype="gene",
                                          order_by=("seqid", "start"),
                                          level=1):
        for c in feat:
            cid, ctype = c.id, c.featuretype
            cparent = c.attributes.get('Parent', [None])[0]
            t5, t3 = False, False
            if ctype == "gene":
                t5 = cid in trim5
                t3 = cid in trim3
                start, end = get_cds_minmax(gff, cid)
                trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                fprint(c, fw)
            elif ctype == "mRNA":
                utr_types, extras = [], set()
                # A transcript is selected when it or its parent is listed;
                # its own ID is then recorded in the corresponding set.
                if any(x in trim5 for x in (cid, cparent)):
                    t5 = True
                    trim5.add(cid)
                if any(x in trim3 for x in (cid, cparent)):
                    t3 = True
                    trim3.add(cid)

                refc = None
                if refgff:
                    try:
                        refc = refgff[cid]
                        refctype = refc.featuretype
                        refptype = refgff[refc.attributes['Parent'][0]].featuretype
                        # NOTE(review): the nesting of the `else` below was
                        # reconstructed from whitespace-collapsed source; it
                        # is attached to this `if` - confirm against upstream.
                        if refctype == "mRNA" and refptype == "gene":
                            if cmp_children(cid, gff, refgff, cftype="CDS"):
                                reinstate(c, refc, trim5=t5, trim3=t3,
                                          both=trim_both)
                                if t5:
                                    utr_types.append('five_prime_UTR')
                                if t3:
                                    utr_types.append('three_prime_UTR')
                                # Collect reference UTRs, plus reference exons
                                # in the same region that belong to this
                                # transcript, to be emitted as extras.
                                for utr_type in utr_types:
                                    for utr in refgff.children(refc, featuretype=utr_type):
                                        extras.add(utr)
                                        for exon in refgff.region(region=utr, featuretype="exon"):
                                            if exon.attributes['Parent'][0] == cid:
                                                extras.add(exon)
                        else:
                            refc = None
                    except gffutils.exceptions.FeatureNotFoundError:
                        # Transcript (or its parent) absent from the
                        # reference: fall through with whatever was set.
                        pass

                start, end = get_cds_minmax(gff, cid, level=1)
                if cid in trimrange:
                    start, end = range_minmax([trimrange[cid], (start, end)])

                # Trim the transcript itself only when no reference feature
                # is held in refc.
                if not refc:
                    trim(c, start, end, trim5=t5, trim3=t3, both=trim_both)
                fprint(c, fw)

                for cc in gff.children(cid, order_by=("start")):
                    _ctype = cc.featuretype
                    # Children whose type is in utr_types are not printed
                    # here; the extras loop below prints the reference ones.
                    if _ctype not in utr_types:
                        if _ctype != "CDS":
                            if _ctype == "exon":
                                # Skip exons overlapping a reference exon
                                # that will be printed as an extra.
                                eskip = [range_overlap(to_range(cc), to_range(x))
                                         for x in extras if x.featuretype == 'exon']
                                if any(eskip):
                                    continue
                            trim(cc, start, end, trim5=t5, trim3=t3, both=trim_both)
                            fprint(cc, fw)
                        else:
                            # CDS features are written unchanged.
                            fprint(cc, fw)

                for x in extras:
                    fprint(x, fw)
    fw.close()