def agp(args):
    """
    %prog agp main_results/ contigs.fasta

    Generate AGP file based on LACHESIS output.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so its wording is part of the program's output.
    parser = OptionParser(agp.__doc__)
    parser.set_outfile()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    odir, contigsfasta = args
    fwagp = must_open(opts.outfile, 'w')
    ordering_paths = natsorted(iglob(odir, "*.ordering"))
    sizes = Sizes(contigsfasta).mapping
    all_contigs = set(sizes.keys())

    # One AGP object per *.ordering file, named after the text before the
    # first '.' of the file's basename.
    anchored = set()
    for ordering_path in ordering_paths:
        ordering = ContigOrdering(ordering_path)
        anchored.update(entry.contig_name for entry in ordering)
        object_name = op.basename(ordering_path).split('.')[0]
        ordering.write_agp(object_name, sizes, fwagp)

    # Contigs present in the FASTA but in no ordering file are written as
    # single-component objects with orientation "?".
    singletons = all_contigs - anchored
    n_anchored, n_singletons = len(anchored), len(singletons)
    logging.debug('Anchored: {}, Singletons: {}'.format(n_anchored,
                                                        n_singletons))

    for name in natsorted(singletons):
        order_to_agp(name, [(name, "?")], sizes, fwagp)
def agp(args):
    """
    %prog agp main_results/ contigs.fasta

    Generate AGP file based on LACHESIS output.
    """
    # NOTE: the docstring above doubles as the OptionParser usage text.
    parser = OptionParser(agp.__doc__)
    parser.set_outfile()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    odir, contigsfasta = args
    fwagp = must_open(opts.outfile, 'w')
    orderingfiles = natsorted(iglob(odir, "*.ordering"))
    sizes = Sizes(contigsfasta).mapping
    known = set(sizes.keys())

    placed = set()
    for path in orderingfiles:
        # The AGP object is named after the basename up to its first '.'.
        ordering = ContigOrdering(path)
        placed |= {item.contig_name for item in ordering}
        ordering.write_agp(op.basename(path).split('.')[0], sizes, fwagp)

    # Whatever the ordering files did not mention becomes a singleton object.
    leftover = known - placed
    message = 'Anchored: {}, Singletons: {}'.format(len(placed),
                                                    len(leftover))
    logging.debug(message)

    for contig in natsorted(leftover):
        order_to_agp(contig, [(contig, "?")], sizes, fwagp)
def write_unplaced_agp(agpfile, scaffolds, unplaced_agp):
    """
    Write single-component AGP records for sequences missing from an AGP.

    Every sequence name found in `scaffolds` that does not occur as a
    `component_id` in `agpfile` is passed to order_to_agp() as an object made
    of just itself, with orientation "?".

    Args:
        agpfile: AGP file whose components count as already placed.
        scaffolds: file given to Sizes() to obtain the name -> size mapping.
        unplaced_agp: output destination, opened through must_open().
    """
    import sys

    placed = set(x.component_id for x in AGP(agpfile))
    sizes = Sizes(scaffolds).mapping

    fwagp = must_open(unplaced_agp, "w")
    try:
        for name in sorted(sizes.keys()):
            if name in placed:
                continue
            order_to_agp(name, [(name, "?")], sizes, fwagp)
    finally:
        # Close the handle so the records are flushed before we report the
        # file as written. NOTE(review): must_open() presumably can hand back
        # a standard stream for special names -- those are left open.
        if fwagp is not sys.stdout and fwagp is not sys.stderr:
            fwagp.close()

    logging.debug("Write unplaced AGP to `{0}`.".format(unplaced_agp))
def main(args):
    """
    %prog deltafile

    Plot one query. Extract the references that have major matches to this
    query. Control "major" by option --refcov.
    """
    # NOTE: the docstring above is the usage text given to OptionParser.
    p = OptionParser(main.__doc__)
    p.add_option("--refids", help="Use subset of contigs in the ref")
    p.add_option("--refcov", default=.01, type="float",
                 help="Minimum reference coverage [default: %default]")
    p.add_option("--all", default=False, action="store_true",
                 help="Plot one pdf file per ref in refidsfile "
                      "[default: %default]")
    p.add_option("--color", default="similarity",
                 choices=("similarity", "direction", "none"),
                 help="Color the dots based on")
    p.add_option("--nolayout", default=False, action="store_true",
                 help="Do not rearrange contigs")
    p.set_align(pctid=0, hitlen=0)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    deltafile, = args
    # The first line of the delta file holds the reference and query FASTA
    # paths; read it inside a context manager so the handle is not leaked.
    with open(deltafile) as fp:
        reffasta, queryfasta = fp.readline().split()

    color = opts.color
    layout = not opts.nolayout
    prefix = op.basename(deltafile).split(".")[0]
    qsizes = Sizes(queryfasta).mapping
    rsizes = Sizes(reffasta).mapping

    # Without --refids every reference sequence is a candidate.
    refs = SetFile(opts.refids) if opts.refids else set(rsizes.keys())
    refcov = opts.refcov
    pctid = opts.pctid
    hitlen = opts.hitlen

    # NOTE(review): `filter` is called with an argv-style list, so it must be
    # a module-level helper shadowing the builtin; its return value replaces
    # the delta file used for plotting.
    deltafile = filter([deltafile, "--pctid={0}".format(pctid),
                        "--hitlen={0}".format(hitlen)])

    if opts.all:
        # One plot per reference, renamed to <ref>.pdf when a plot was made.
        for r in refs:
            pdffile = plot_some_queries([r], qsizes, rsizes, deltafile,
                                        refcov, prefix=prefix, color=color,
                                        layout=layout)
            if pdffile:
                sh("mv {0} {1}.pdf".format(pdffile, r))
    else:
        plot_some_queries(refs, qsizes, rsizes, deltafile, refcov,
                          prefix=prefix, color=color, layout=layout)
def covfilter(args):
    """
    %prog covfilter blastfile fastafile

    Fastafile is used to get the sizes of the queries. Two filters can be
    applied, the id% and cov%.
    """
    # NOTE: the docstring above is the usage text given to OptionParser.
    p = OptionParser(covfilter.__doc__)
    p.add_option("--pctid", dest="pctid", default=90, type="int",
                 help="Percentage identity cutoff [default: %default]")
    p.add_option("--pctcov", dest="pctcov", default=50, type="int",
                 help="Percentage coverage cutoff [default: %default]")
    p.add_option("--ids", dest="ids", default=None,
                 help="Print out the ids that satisfy [default: %default]")
    p.add_option("--list", dest="list", default=False, action="store_true",
                 help="List the id% and cov% per gene [default: %default]")

    set_outfile(p, outfile=None)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    from jcvi.algorithms.supermap import supermap

    blastfile, fastafile = args
    sizes = Sizes(fastafile).mapping

    # All statistics are computed on the query supermap, built on demand.
    querysupermap = blastfile + ".query.supermap"
    if not op.exists(querysupermap):
        supermap(blastfile, filter="query")

    blastfile = querysupermap
    assert op.exists(blastfile)

    covered = 0
    mismatches = 0
    gaps = 0
    alignlen = 0
    queries = set()
    valid = set()
    blast = BlastSlow(querysupermap)
    for query, blines in blast.iter_hits():
        blines = list(blines)
        queries.add(query)

        # per gene report
        this_covered = 0
        this_alignlen = 0
        this_mismatches = 0
        this_gaps = 0

        for b in blines:
            # Inclusive span of the hit on the query. The +1 belongs outside
            # abs(); inside it, a forward hit came out two bases short.
            this_covered += abs(b.qstart - b.qstop) + 1
            this_alignlen += b.hitlen
            this_mismatches += b.nmismatch
            this_gaps += b.ngaps

        this_identity = 100. - (this_mismatches + this_gaps) * 100. \
            / this_alignlen
        this_coverage = this_covered * 100. / sizes[query]

        if opts.list:
            sys.stdout.write("{0}\t{1:.1f}\t{2:.1f}".format(
                query, this_identity, this_coverage) + "\n")

        if this_identity >= opts.pctid and this_coverage >= opts.pctcov:
            valid.add(query)

        covered += this_covered
        mismatches += this_mismatches
        gaps += this_gaps
        alignlen += this_alignlen

    mapped_count = len(queries)
    valid_count = len(valid)
    cutoff_message = "(id={0.pctid}% cov={0.pctcov}%)".format(opts)

    # Summary goes to stderr; .write() is used so the block parses on both
    # Python 2 and Python 3.
    report = sys.stderr.write
    report("Identity: {0} mismatches, {1} gaps, {2} alignlen".
           format(mismatches, gaps, alignlen) + "\n")
    total = len(sizes.keys())
    report("Total mapped: {0} ({1:.1f}% of {2})".
           format(mapped_count, mapped_count * 100. / total, total) + "\n")
    report("Total valid {0}: {1} ({2:.1f}% of {3})".
           format(cutoff_message, valid_count,
                  valid_count * 100. / total, total) + "\n")
    report("Average id = {0:.2f}%".
           format(100 - (mismatches + gaps) * 100. / alignlen) + "\n")

    queries_combined = sum(sizes[x] for x in queries)
    report("Coverage: {0} covered, {1} total".
           format(covered, queries_combined) + "\n")
    report("Average coverage = {0:.2f}%".
           format(covered * 100. / queries_combined) + "\n")

    if opts.ids:
        filename = opts.ids
        fw = must_open(filename, "w")
        for query_id in valid:
            fw.write(str(query_id) + "\n")
        logging.debug("Queries beyond cutoffs {0} written to `{1}`.".
                      format(cutoff_message, filename))

    outfile = opts.outfile
    if not outfile:
        return

    # Copy through only the supermap lines whose query passed both cutoffs.
    fw = must_open(outfile, "w")
    blast = Blast(blastfile)
    for b in blast.iter_line():
        if b.query in valid:
            fw.write(str(b) + "\n")
def draw_chromosomes(
        root,
        bedfile,
        sizes,
        iopts,
        mergedist,
        winsize,
        imagemap,
        mappingfile=None,
        gauge=False,
        legend=True,
        empty=False,
        title=None,
):
    """
    Paint the features of a BED file onto a row of chromosome shapes.

    Args:
        root: drawing target; only its `text()` and `add_patch()` methods are
            used here (presumably a matplotlib Axes -- confirm with caller).
        bedfile: path handed to Bed(). The name up to its last "." is the
            prefix for the image map file and the `<map id=...>` attribute.
        sizes: when truthy, passed to Sizes() to obtain chromosome lengths;
            otherwise each length is the largest feature end on that seqid.
        iopts: object whose `w`, `h` and `dpi` attributes are forwarded to
            write_ImageMapLine(); only read when `imagemap` is truthy.
        mergedist: a feature of the same class as the previous one that
            starts less than `mergedist` past the previous end is drawn
            starting at that previous end.
        winsize: number of bases covered by each image map region.
        imagemap: when truthy, write an HTML image map to `<prefix>.map`.
        mappingfile: optional tab-delimited file mapping feature names to
            classes; with three or more columns, columns 1 and 2 are read as
            class -> preset color.
        gauge: when truthy, draw a Gauge next to the chromosomes.
        legend: when truthy, draw one colored box and label per class.
        empty: when truthy, its value is the label of an extra unfilled box.
        title: when truthy, text drawn (through markup()) at (0.5, 0.95).
    """
    bed = Bed(bedfile)
    prefix = bedfile.rsplit(".", 1)[0]
    if imagemap:
        imgmapfile = prefix + ".map"
        mapfh = open(imgmapfile, "w")
        print('<map id="' + prefix + '">', file=mapfh)

    if mappingfile:
        mappings = DictFile(mappingfile, delimiter="\t")
        classes = sorted(set(mappings.values()))
        preset_colors = (DictFile(
            mappingfile, keypos=1, valuepos=2, delimiter="\t")
                         if DictFile.num_columns(mappingfile) >= 3 else {})
    else:
        # No mapping file: every distinct feature name is its own class.
        classes = sorted(set(x.accn for x in bed))
        mappings = dict((x, x) for x in classes)
        preset_colors = {}

    logging.debug("A total of {} classes found: {}".format(
        len(classes), ",".join(classes)))

    # Assign colors to classes
    ncolors = max(3, min(len(classes), 12))
    palette = set1_n if ncolors <= 8 else set3_n
    colorset = palette(number=ncolors)
    colorset = sample_N(colorset, len(classes))
    class_colors = dict(zip(classes, colorset))
    # Preset colors are applied last, so they override the palette picks.
    class_colors.update(preset_colors)
    logging.debug("Assigned colors: {}".format(class_colors))

    chr_lens = {}
    centromeres = {}
    if sizes:
        chr_lens = Sizes(sizes).sizes_mapping
    else:
        # NOTE(review): groupby only merges adjacent items, so this assumes
        # `bed` iterates grouped by seqid -- confirm.
        for b, blines in groupby(bed, key=(lambda x: x.seqid)):
            blines = list(blines)
            maxlen = max(x.end for x in blines)
            chr_lens[b] = maxlen

    # Record centromere positions, then rewrite each feature name to its
    # class; names absent from `mappings` become "-".
    for b in bed:
        accn = b.accn
        if accn == "centromere":
            centromeres[b.seqid] = b.start
        if accn in mappings:
            b.accn = mappings[accn]
        else:
            b.accn = "-"

    chr_number = len(chr_lens)
    if centromeres:
        # Either no chromosome or every chromosome must have a centromere.
        assert chr_number == len(
            centromeres), "chr_number = {}, centromeres = {}".format(
                chr_number, centromeres)

    r = 0.7  # width and height of the whole chromosome set
    xstart, ystart = 0.15, 0.85
    xinterval = r / chr_number
    xwidth = xinterval * 0.5  # chromosome width
    max_chr_len = max(chr_lens.values())
    ratio = r / max_chr_len  # canvas / base

    # first the chromosomes
    for a, (chr, clen) in enumerate(sorted(chr_lens.items())):
        # Here xx is the horizontal center of the chromosome.
        xx = xstart + a * xinterval + 0.5 * xwidth
        root.text(xx, ystart + 0.01, str(get_number(chr)), ha="center")
        if centromeres:
            yy = ystart - centromeres[chr] * ratio
            ChromosomeWithCentromere(root,
                                     xx,
                                     ystart,
                                     yy,
                                     ystart - clen * ratio,
                                     width=xwidth)
        else:
            Chromosome(root, xx, ystart, ystart - clen * ratio, width=xwidth)

    chr_idxs = dict((a, i) for i, a in enumerate(sorted(chr_lens.keys())))

    alpha = 1
    # color the regions
    for chr in sorted(chr_lens.keys()):
        # NOTE(review): `segment_size` is never read in this function.
        segment_size, excess = 0, 0
        bac_list = []
        prev_end, prev_klass = 0, None
        for b in bed.sub_bed(chr):
            # NOTE(review): `clen` is assigned but not used in this loop.
            clen = chr_lens[chr]
            idx = chr_idxs[chr]
            klass = b.accn
            if klass == "centromere":
                continue
            start = b.start
            end = b.end
            # Close small gaps between consecutive features of one class.
            if start < prev_end + mergedist and klass == prev_klass:
                start = prev_end
            # Here xx is the left edge of the chromosome.
            xx = xstart + idx * xinterval
            yystart = ystart - end * ratio
            yyend = ystart - start * ratio
            root.add_patch(
                Rectangle(
                    (xx, yystart),
                    xwidth,
                    yyend - yystart,
                    fc=class_colors.get(klass, "lightslategray"),
                    lw=0,
                    alpha=alpha,
                ))
            prev_end, prev_klass = b.end, klass

            if imagemap:
                # The bare string below is a no-op expression statement kept
                # verbatim from the original; it explains the two counters.
                """ `segment` : size of current BAC being investigated + `excess` `excess` : left-over bases from the previous BAC, as a result of iterating over `winsize` regions of `segment` """
                if excess == 0:
                    segment_start = start
                segment = (end - start + 1) + excess
                # Emit one map line per full window; the remainder is carried
                # over to the next feature through `excess`.
                while True:
                    if segment < winsize:
                        bac_list.append(b.accn)
                        excess = segment
                        break
                    segment_end = segment_start + winsize - 1
                    # Map line y values are (1 - ystart) + position * ratio.
                    tlx, tly, brx, bry = (
                        xx,
                        (1 - ystart) + segment_start * ratio,
                        xx + xwidth,
                        (1 - ystart) + segment_end * ratio,
                    )
                    print(
                        "\t" + write_ImageMapLine(
                            tlx,
                            tly,
                            brx,
                            bry,
                            iopts.w,
                            iopts.h,
                            iopts.dpi,
                            chr + ":" + ",".join(bac_list),
                            segment_start,
                            segment_end,
                        ),
                        file=mapfh,
                    )

                    segment_start += winsize
                    segment -= winsize
                    bac_list = []

        # NOTE(review): the flattened source does not show whether this block
        # sits after the feature loop (assumed here) or inside it -- confirm.
        # It flushes the last partial window, reusing `b`, `end`, `xx` and
        # `segment_start` left over from the final iteration.
        if imagemap and excess > 0:
            bac_list.append(b.accn)
            segment_end = end
            tlx, tly, brx, bry = (
                xx,
                (1 - ystart) + segment_start * ratio,
                xx + xwidth,
                (1 - ystart) + segment_end * ratio,
            )
            print(
                "\t" + write_ImageMapLine(
                    tlx,
                    tly,
                    brx,
                    bry,
                    iopts.w,
                    iopts.h,
                    iopts.dpi,
                    chr + ":" + ",".join(bac_list),
                    segment_start,
                    segment_end,
                ),
                file=mapfh,
            )

    if imagemap:
        print("</map>", file=mapfh)
        mapfh.close()
        logging.debug("Image map written to `{0}`".format(mapfh.name))

    if gauge:
        xstart, ystart = 0.9, 0.85
        Gauge(root, xstart, ystart - r, ystart, max_chr_len)

    # Centromeres never get a legend entry.
    if "centromere" in class_colors:
        del class_colors["centromere"]

    # class legends, four in a row
    if legend:
        xstart = 0.1
        xinterval = 0.8 / len(class_colors)
        xwidth = 0.04
        yy = 0.08
        for klass, cc in sorted(class_colors.items()):
            if klass == "-":
                continue
            root.add_patch(
                Rectangle((xstart, yy), xwidth, xwidth, fc=cc, lw=0,
                          alpha=alpha))
            root.text(xstart + xwidth + 0.01, yy, latex(klass), fontsize=10)
            xstart += xinterval

    # NOTE(review): assumed to be at function level; it reads `xstart`, `yy`
    # and `xwidth`, which get their legend values only when `legend` is
    # truthy -- confirm the intended nesting.
    if empty:
        root.add_patch(
            Rectangle((xstart, yy), xwidth, xwidth, fill=False, lw=1))
        root.text(xstart + xwidth + 0.01, yy, empty, fontsize=10)

    if title:
        root.text(0.5, 0.95, markup(title), ha="center", va="center")
def scaffold(args):
    """
    %prog scaffold ctgfasta linksfile

    Use the linksfile to build scaffolds. The linksfile can be
    generated by calling assembly.bundle.link() or assembly.bundle.bundle().
    Use --prefix to place the sequences with same prefix together. The final
    product is an AGP file.
    """
    # NOTE: the docstring above is the usage text given to OptionParser.
    from jcvi.algorithms.graph import nx
    from jcvi.formats.agp import order_to_agp

    p = OptionParser(scaffold.__doc__)
    p.add_option("--prefix", default=False, action="store_true",
                 help="Keep IDs with same prefix together [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, linksfile = args
    sizes = Sizes(ctgfasta).mapping
    logfile = "scaffold.log"
    pf = ctgfasta.rsplit(".", 1)[0]
    agpfile = pf + ".agp"

    def get_bname(sname, prefix=False):
        # Bucket name: text before the last "_" with --prefix, else "chr0".
        return sname.rsplit("_", 1)[0] if prefix else "chr0"

    # Every link becomes an edge; connected components are then solved
    # independently. Link distances are floored at 50.
    g = nx.MultiGraph()  # use this to get connected components
    with open(linksfile) as fp:
        for row in fp:
            c = LinkLine(row)
            distance = max(c.distance, 50)
            g.add_edge(c.aseqid, c.bseqid,
                       orientation=c.orientation, distance=distance)

    scaffoldbuckets = defaultdict(list)
    with open(logfile, "w") as fwlog:
        for h in nx.connected_component_subgraphs(g):
            partialorder = solve_component(h, sizes, fwlog)
            # A scaffold is filed under the bucket of its first contig.
            name = partialorder[0][0]
            bname = get_bname(name, prefix=opts.prefix)
            scaffoldbuckets[bname].append(partialorder)

    ctgbuckets = defaultdict(set)
    for name in sorted(sizes.keys()):
        bname = get_bname(name, prefix=opts.prefix)
        ctgbuckets[bname].add(name)

    # Now the buckets contain a mixture of singletons and partially resolved
    # scaffolds. Print the scaffolds first then remaining singletons.
    scafname = "{0}.scf_{1:04d}"
    with open(agpfile, "w") as fwagp:
        for bname, ctgs in sorted(ctgbuckets.items()):
            scaffolds = scaffoldbuckets[bname]
            scaffolded = set()
            for scafID, scaf in enumerate(scaffolds):
                ctgorder = [(node, orientation)
                            for node, _start, _end, orientation in scaf]
                scaffolded.update(node for node, _ in ctgorder)
                order_to_agp(scafname.format(bname, scafID),
                             ctgorder, sizes, fwagp)

            singletons = sorted(ctgs - scaffolded)
            nscaffolds = len(scaffolds)
            nsingletons = len(singletons)

            msg = "{0}: Scaffolds={1} Singletons={2}".\
                format(bname, nscaffolds, nsingletons)
            # .write() instead of a print statement: parses on Python 2 and 3.
            sys.stderr.write(msg + "\n")

            for singleton in singletons:
                order_to_agp(singleton, [(singleton, "+")], sizes, fwagp)

    logging.debug("AGP file written to `{0}`.".format(agpfile))
def scaffold(args):
    """
    %prog scaffold ctgfasta agpfile

    Build scaffolds based on ordering in the AGP file.
    """
    # NOTE: the docstring above is the usage text given to OptionParser.
    from jcvi.formats.agp import AGP, bed, order_to_agp, build
    from jcvi.formats.bed import Bed

    p = OptionParser(scaffold.__doc__)
    p.add_option("--prefix", default=False, action="store_true",
                 help="Keep IDs with same prefix together [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, agpfile = args
    sizes = Sizes(ctgfasta).mapping

    pf = ctgfasta.rsplit(".", 1)[0]
    phasefile = pf + ".phases"
    newagpfile = pf + ".new.agp"

    # Convert the AGP (gaps dropped) to BED and regroup its components into
    # buckets: by ID prefix with --prefix, otherwise by AGP object.
    scaffoldbuckets = defaultdict(list)
    bedfile = bed([agpfile, "--nogaps", "--outfile=tmp"])
    bb = Bed(bedfile)
    for s, partialorder in bb.sub_beds():
        name = partialorder[0].accn
        bname = name.rsplit("_", 1)[0] if opts.prefix else s
        scaffoldbuckets[bname].append(
            [(b.accn, b.strand) for b in partialorder])

    # Now the buckets contain a mixture of singletons and partially resolved
    # scaffolds. Print the scaffolds first then remaining singletons.
    # Both outputs are closed on leaving the `with`, i.e. before build()
    # reads the new AGP.
    with open(phasefile, "w") as fwphase, open(newagpfile, "w") as fwagp:
        for bname, scaffolds in sorted(scaffoldbuckets.items()):
            ctgorder = []
            singletons = set()
            for scaf in sorted(scaffolds):
                ctgorder.extend(scaf)
                if len(scaf) == 1:
                    singletons.add(scaf[0][0])

            # Count only multi-contig entries as scaffolds. Counting every
            # entry (singletons included) made the phase 3 test below
            # impossible to satisfy, since a bucket is never empty.
            nscaffolds = sum(1 for scaf in scaffolds if len(scaf) > 1)
            nsingletons = len(singletons)

            if nsingletons == 1 and nscaffolds == 0:
                phase = 3
            elif nsingletons == 0 and nscaffolds == 1:
                phase = 2
            else:
                phase = 1

            msg = "{0}: Scaffolds={1} Singletons={2} Phase={3}".\
                format(bname, nscaffolds, nsingletons, phase)
            # .write() instead of print statements: parses on Python 2 and 3.
            sys.stderr.write(msg + "\n")
            fwphase.write("\t".join((bname, str(phase))) + "\n")

            order_to_agp(bname, ctgorder, sizes, fwagp)

    os.remove(bedfile)

    fastafile = "final.fasta"
    build([newagpfile, ctgfasta, fastafile])
    tidy([fastafile])