def dedup(args): """ %prog dedup scaffolds.fasta Remove redundant contigs with CD-HIT. This is run prior to assembly.sspace.embed(). """ from jcvi.formats.fasta import gaps from jcvi.apps.cdhit import deduplicate, ids p = OptionParser(dedup.__doc__) p.set_align(pctid=GoodPct) p.set_mingap(default=10) opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) scaffolds, = args mingap = opts.mingap splitfile, oagpfile, cagpfile = gaps( [scaffolds, "--split", "--mingap={0}".format(mingap)]) dd = splitfile + ".cdhit" clstrfile = dd + ".clstr" idsfile = dd + ".ids" if need_update(splitfile, clstrfile): deduplicate([splitfile, "--pctid={0}".format(opts.pctid)]) if need_update(clstrfile, idsfile): ids([clstrfile]) agp = AGP(cagpfile) reps = set(x.split()[-1] for x in open(idsfile)) pf = scaffolds.rsplit(".", 1)[0] dedupagp = pf + ".dedup.agp" fw = open(dedupagp, "w") ndropped = ndroppedbases = 0 for a in agp: if not a.is_gap and a.component_id not in reps: span = a.component_span logging.debug("Drop component {0} ({1})".\ format(a.component_id, span)) ndropped += 1 ndroppedbases += span continue print >> fw, a fw.close() logging.debug("Dropped components: {0}, Dropped bases: {1}".\ format(ndropped, ndroppedbases)) logging.debug("Deduplicated file written to `{0}`.".format(dedupagp)) tidyagp = tidy([dedupagp, splitfile]) dedupfasta = pf + ".dedup.fasta" build([tidyagp, dd, dedupfasta]) return dedupfasta
def dedup(args): """ %prog dedup scaffolds.fasta Remove redundant contigs with CD-HIT. This is run prior to assembly.sspace.embed(). """ from jcvi.formats.fasta import gaps from jcvi.apps.cdhit import deduplicate, ids p = OptionParser(dedup.__doc__) p.set_align(pctid=GoodPct) p.set_mingap(default=10) opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) scaffolds, = args mingap = opts.mingap splitfile, oagpfile, cagpfile = gaps([scaffolds, "--split", "--mingap={0}".format(mingap)]) dd = splitfile + ".cdhit" clstrfile = dd + ".clstr" idsfile = dd + ".ids" if need_update(splitfile, clstrfile): deduplicate([splitfile, "--pctid={0}".format(opts.pctid)]) if need_update(clstrfile, idsfile): ids([clstrfile]) agp = AGP(cagpfile) reps = set(x.split()[-1] for x in open(idsfile)) pf = scaffolds.rsplit(".", 1)[0] dedupagp = pf + ".dedup.agp" fw = open(dedupagp, "w") ndropped = ndroppedbases = 0 for a in agp: if not a.is_gap and a.component_id not in reps: span = a.component_span logging.debug("Drop component {0} ({1})".\ format(a.component_id, span)) ndropped += 1 ndroppedbases += span continue print >> fw, a fw.close() logging.debug("Dropped components: {0}, Dropped bases: {1}".\ format(ndropped, ndroppedbases)) logging.debug("Deduplicated file written to `{0}`.".format(dedupagp)) tidyagp = tidy([dedupagp, splitfile]) dedupfasta = pf + ".dedup.fasta" build([tidyagp, dd, dedupfasta]) return dedupfasta
def fasta(args): """ %prog fasta bedfile scf.fasta pseudomolecules.fasta Use OM bed to scaffold and create pseudomolecules. bedfile can be generated by running jcvi.assembly.opticalmap bed --blockonly """ from jcvi.formats.sizes import Sizes from jcvi.formats.agp import OO, build p = OptionParser(fasta.__doc__) opts, args = p.parse_args(args) if len(args) != 3: sys.exit(not p.print_help()) bedfile, scffasta, pmolfasta = args pf = bedfile.rsplit(".", 1)[0] bed = Bed(bedfile) selected = select_bed(bed) oo = OO() seen = set() sizes = Sizes(scffasta).mapping agpfile = pf + ".agp" agp = open(agpfile, "w") for b in selected: scf = range_parse(b.accn).seqid chr = b.seqid cs = (chr, scf) if cs not in seen: oo.add(chr, scf, sizes[scf], b.strand) seen.add(cs) else: logging.debug("Seen {0}, ignored.".format(cs)) oo.write_AGP(agp, gaptype="contig") agp.close() build([agpfile, scffasta, pmolfasta])
def anneal(args): """ %prog anneal agpfile contigs.fasta Merge adjacent overlapping contigs and make new AGP file. By default it will also anneal lines like these together (unless --nozipshreds): scaffold4 1 1608 1 W ca-bacs.5638.frag11.22000-23608 1 1608 - scaffold4 1609 1771 2 N 163 scaffold yes paired-ends scaffold4 1772 3771 3 W ca-bacs.5638.frag10.20000-22000 1 2000 - These are most likely shreds, which we look for based on names. """ p = OptionParser(anneal.__doc__) p.set_align(pctid=GoodPct, hitlen=GoodOverlap) p.add_option("--hang", default=GoodOverhang, type="int", help="Maximum overhang length [default: %default]") p.set_outdir(outdir="outdir") p.set_cpus() opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) agpfile, contigs = args outdir = opts.outdir if not op.exists(outdir): mkdir(outdir) cmd = "faSplit byname {0} {1}/".format(contigs, outdir) sh(cmd) cutoff = Cutoff(opts.pctid, opts.hitlen, opts.hang) logging.debug(str(cutoff)) agp = AGP(agpfile) blastfile = agpfile.replace(".agp", ".blast") if not op.exists(blastfile): populate_blastfile(blastfile, agp, outdir, opts) assert op.exists(blastfile) logging.debug("File `{0}` found. Start loading.".format(blastfile)) blast = BlastSlow(blastfile).to_dict() annealedagp = "annealed.agp" annealedfasta = "annealed.fasta" newagp = deepcopy(agp) clrstore = {} for a, b, qreverse in agp.iter_paired_components(): aid = a.component_id bid = b.component_id pair = (aid, bid) if pair in blast: bl = blast[pair] else: oopts = get_overlap_opts(aid, bid, qreverse, outdir, opts) o = overlap(oopts) if not o: continue bl = o.blastline o = Overlap(bl, a.component_span, b.component_span, cutoff, qreverse=qreverse) if aid not in clrstore: clrstore[aid] = CLR.from_agpline(a) if bid not in clrstore: clrstore[bid] = CLR.from_agpline(b) aclr, bclr = clrstore[aid], clrstore[bid] o.print_graphic() if o.anneal(aclr, bclr): newagp.delete_between(aid, bid, verbose=True) if o.otype == 2: # b ~ a o = o.swapped o.print_graphic() if o.anneal(bclr, aclr): newagp.switch_between(bid, aid, verbose=True) newagp.delete_between(bid, aid, verbose=True) logging.debug("A total of {0} components with modified CLR.".\ format(len(clrstore))) for cid, c in clrstore.items(): if c.is_valid: continue print >> sys.stderr, "Remove {0}".format(c) newagp.convert_to_gap(cid, verbose=True) # Update all ranges that has modified clr for a in newagp: if a.is_gap: continue aid = a.component_id if aid in clrstore: c = clrstore[aid] a.component_beg = c.start a.component_end = c.end newagp.print_to_file(annealedagp) tidyagp = tidy([annealedagp, contigs]) build([tidyagp, contigs, annealedfasta]) return annealedfasta
def scaffold(args): """ %prog scaffold ctgfasta agpfile Build scaffolds based on ordering in the AGP file. """ from jcvi.formats.agp import bed, order_to_agp, build from jcvi.formats.bed import Bed p = OptionParser(scaffold.__doc__) p.add_option("--prefix", default=False, action="store_true", help="Keep IDs with same prefix together [default: %default]") opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) ctgfasta, agpfile = args sizes = Sizes(ctgfasta).mapping pf = ctgfasta.rsplit(".", 1)[0] phasefile = pf + ".phases" fwphase = open(phasefile, "w") newagpfile = pf + ".new.agp" fwagp = open(newagpfile, "w") scaffoldbuckets = defaultdict(list) bedfile = bed([agpfile, "--nogaps", "--outfile=tmp"]) bb = Bed(bedfile) for s, partialorder in bb.sub_beds(): name = partialorder[0].accn bname = name.rsplit("_", 1)[0] if opts.prefix else s scaffoldbuckets[bname].append([(b.accn, b.strand) for b in partialorder]) # Now the buckets contain a mixture of singletons and partially resolved # scaffolds. Print the scaffolds first then remaining singletons. for bname, scaffolds in sorted(scaffoldbuckets.items()): ctgorder = [] singletons = set() for scaf in sorted(scaffolds): for node, orientation in scaf: ctgorder.append((node, orientation)) if len(scaf) == 1: singletons.add(node) nscaffolds = len(scaffolds) nsingletons = len(singletons) if nsingletons == 1 and nscaffolds == 0: phase = 3 elif nsingletons == 0 and nscaffolds == 1: phase = 2 else: phase = 1 msg = "{0}: Scaffolds={1} Singletons={2} Phase={3}".\ format(bname, nscaffolds, nsingletons, phase) print >> sys.stderr, msg print >> fwphase, "\t".join((bname, str(phase))) order_to_agp(bname, ctgorder, sizes, fwagp) fwagp.close() os.remove(bedfile) fastafile = "final.fasta" build([newagpfile, ctgfasta, fastafile]) tidy([fastafile])