def gaps(args):
    """
    %prog gaps OM.bed fastafile

    Create patches around OM gaps.
    """
    # The docstring above doubles as the command-line usage text.
    from jcvi.formats.bed import uniq

    parser = OptionParser(gaps.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    ombed, fastafile = args
    features = Bed(uniq([ombed]))

    # Walk consecutive features and compare the gap measured on the BED
    # coordinates with the gap measured on the range encoded in the name.
    for left, right in pairwise(features):
        om_left = (left.seqid, left.start, left.end, "+")
        om_right = (right.seqid, right.start, right.end, "+")

        ch_left = range_parse(left.accn)
        ch_right = range_parse(right.accn)
        ch_left = (ch_left.seqid, ch_left.start, ch_left.end, "+")
        ch_right = (ch_right.seqid, ch_right.start, ch_right.end, "+")

        om_dist, _ = range_distance(om_left, om_right, distmode="ee")
        ch_dist, _ = range_distance(ch_left, ch_right, distmode="ee")

        # Only report the pair when at least one of the two gaps is positive.
        if om_dist > 0 or ch_dist > 0:
            print(left)
            print(right)
            print(om_dist, ch_dist)
def distance(args):
    """
    %prog distance bedfile

    Calculate distance between bed features. The output file is a list of
    distances, which can be used to plot histogram, etc.
    """
    from jcvi.utils.iter import pairwise

    p = OptionParser(distance.__doc__)
    p.add_option("--distmode", default="ss", choices=("ss", "ee"),
                 help="Distance mode between paired reads. ss is outer distance, "
                      "ee is inner distance [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    sortedbedfile = sort([bedfile])
    valid = total = 0

    # Context manager so the handle is released even if a row fails to parse.
    with open(sortedbedfile) as fp:
        # Parse every row exactly once; pairwise() then yields consecutive
        # features (each row used to be parsed twice, once as `a`, once as `b`).
        features = (BedLine(row) for row in fp)
        for a, b in pairwise(features):
            ar = (a.seqid, a.start, a.end, "+")
            br = (b.seqid, b.start, b.end, "+")
            dist, oo = range_distance(ar, br, distmode=opts.distmode)
            total += 1
            # Non-positive distances are counted in `total` but not printed.
            if dist > 0:
                # Single-argument print() behaves the same on Python 2 and 3.
                print(dist)
                valid += 1

    logging.debug("Total valid (> 0) distances: {0}.".format(
        percentage(valid, total)))
def fromblast(args):
    """
    %prog fromblast blastfile subject.fasta

    Generate path from BLAST file. If multiple subjects map to the same query,
    an edge is constructed between them (with the link provided by the query).

    The BLAST file MUST be filtered, chained, supermapped.
    """
    # The docstring above is handed to OptionParser as the usage text.
    from jcvi.formats.blast import sort
    from jcvi.utils.range import range_distance

    parser = OptionParser(fromblast.__doc__)
    parser.add_option(
        "--clique",
        default=False,
        action="store_true",
        help="Populate clique instead of linear path",
    )
    parser.add_option(
        "--maxdist",
        default=100000,
        type="int",
        help="Create edge within certain distance",
    )
    parser.set_verbose(help="Print verbose reports to stdout")
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    blastfile, subjectfasta = args
    use_clique, max_gap = opts.clique, opts.maxdist

    sort([blastfile, "--query"])
    hits = BlastSlow(blastfile, sorted=True)
    graph = BiGraph()

    def tag(hit):
        # ">" for hits in "+" orientation, "<" for anything else.
        return ">" if hit.orientation == "+" else "<"

    for _, group in groupby(hits, key=lambda x: x.query):
        group = list(group)
        # Either link every pair of hits on the query, or only neighbours.
        if use_clique:
            pairs = combinations(group, 2)
        else:
            pairs = pairwise(group)

        for a, b in pairs:
            # Two hits to the same subject carry no linking information.
            if a.subject == b.subject:
                continue

            gap, _ = range_distance(
                (a.query, a.qstart, a.qstop, "+"),
                (b.query, b.qstart, b.qstop, "+"),
                distmode="ee",
            )
            if gap <= max_gap:
                graph.add_edge(a.subject, b.subject, tag(a), tag(b))

    graph_to_agp(graph, blastfile, subjectfasta, verbose=opts.verbose)
def fromblast(args):
    """
    %prog fromblast blastfile subject.fasta

    Generate path from BLAST file. If multiple subjects map to the same query,
    an edge is constructed between them (with the link provided by the query).

    The BLAST file MUST be filtered, chained, supermapped.
    """
    # NOTE: the docstring is used verbatim as the option parser usage string.
    from jcvi.formats.blast import sort
    from jcvi.utils.range import range_distance

    optparser = OptionParser(fromblast.__doc__)
    optparser.add_option(
        "--clique", default=False, action="store_true",
        help="Populate clique instead of linear path [default: %default]")
    optparser.add_option(
        "--maxdist", default=100000, type="int",
        help="Create edge within certain distance [default: %default]")
    optparser.add_option(
        "--verbose", default=False, action="store_true",
        help="Print verbose reports to stdout [default: %default]")
    opts, args = optparser.parse_args(args)

    if len(args) != 2:
        sys.exit(not optparser.print_help())

    blastfile, subjectfasta = args
    clique = opts.clique
    maxdist = opts.maxdist

    sort([blastfile, "--query"])
    records = BlastSlow(blastfile, sorted=True)
    bigraph = BiGraph()

    for _, same_query in groupby(records, key=lambda x: x.query):
        same_query = list(same_query)
        # Clique mode pairs every two hits; otherwise only adjacent hits.
        if clique:
            candidates = combinations(same_query, 2)
        else:
            candidates = pairwise(same_query)

        for first, second in candidates:
            sub_a, sub_b = first.subject, second.subject
            if sub_a == sub_b:
                continue

            span_a = (first.query, first.qstart, first.qstop, "+")
            span_b = (second.query, second.qstart, second.qstop, "+")
            separation, _ = range_distance(span_a, span_b, distmode="ee")
            if separation > maxdist:
                continue

            # Edge tags encode the hit orientation on each subject.
            tag_a = "<" if first.orientation != "+" else ">"
            tag_b = "<" if second.orientation != "+" else ">"
            bigraph.add_edge(BiEdge(sub_a, sub_b, tag_a, tag_b))

    graph_to_agp(bigraph, blastfile, subjectfasta, verbose=opts.verbose)
def get_distance(a, b, xaxis=True):
    """
    Return the absolute "ee"-mode distance between two blast HSPs.

    With `xaxis` true the query coordinates (qstart/qstop) are compared,
    otherwise the subject coordinates (sstart/sstop). Both ranges are placed
    on the same dummy chromosome "0".
    """
    if xaxis:
        a_lo, a_hi = a.qstart, a.qstop
        b_lo, b_hi = b.qstart, b.qstop
    else:
        a_lo, a_hi = a.sstart, a.sstop
        b_lo, b_hi = b.sstart, b.sstop

    # "0" is the dummy chromosome shared by both ranges
    first = ("0", a_lo, a_hi, a.orientation)
    second = ("0", b_lo, b_hi, b.orientation)

    gap, _ = range_distance(first, second, distmode="ee")
    return abs(gap)
def gaps(args):
    """
    %prog gaps OM.bed fastafile

    Create patches around OM gaps.
    """
    from jcvi.formats.bed import uniq
    from jcvi.utils.iter import pairwise

    p = OptionParser(gaps.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    # `fastafile` is part of the documented usage but is not read here.
    ombed, fastafile = args
    ombed = uniq([ombed])
    bed = Bed(ombed)

    for a, b in pairwise(bed):
        # Ranges taken from the BED coordinates of the two features.
        om_a = (a.seqid, a.start, a.end, "+")
        om_b = (b.seqid, b.start, b.end, "+")
        # Ranges parsed out of the name column of the two features.
        ch_a = range_parse(a.accn)
        ch_b = range_parse(b.accn)
        ch_a = (ch_a.seqid, ch_a.start, ch_a.end, "+")
        ch_b = (ch_b.seqid, ch_b.start, ch_b.end, "+")

        om_dist, _ = range_distance(om_a, om_b, distmode="ee")
        ch_dist, _ = range_distance(ch_a, ch_b, distmode="ee")

        # Skip pairs where neither coordinate system shows a positive gap.
        if om_dist <= 0 and ch_dist <= 0:
            continue

        # print() with a single pre-formatted argument emits the same text on
        # Python 2 and Python 3 (the old print statements were py2-only).
        print(a)
        print(b)
        print("{0} {1}".format(om_dist, ch_dist))
def estimate_size(accns, bed, order, conservative=True):
    """
    Estimate the bp length for the deletion tracks, indicated by the gene accns.

    The estimate depends on the level of conservativeness:

    - conservative: distance in "ss" mode between the first and last listed
      gene (by index in `bed`).
    - not conservative: the track is extended by one gene on each side, where
      such a neighbour exists, and the distance is taken in "ee" mode.

    `order` maps an accession to an (index, feature) pair; `bed` is indexed by
    that index. Raises AssertionError when the two boundary features sit on
    different seqids or when range_distance reports -1.
    """
    ii, _ = zip(*[order[x] for x in accns])
    mini, maxi = min(ii), max(ii)

    minb = bed[mini]
    maxb = bed[maxi]
    if not conservative:  # extend one gene
        # Guard the lower edge: index -1 would silently wrap around to the
        # LAST feature of the bed instead of failing.
        if mini > 0:
            minb = bed[mini - 1]
        # Guard the upper edge: keep the last listed gene when there is no
        # following feature, instead of raising IndexError.
        try:
            maxb = bed[maxi + 1]
        except IndexError:
            pass

    assert minb.seqid == maxb.seqid
    distmode = "ss" if conservative else "ee"

    ra = (minb.seqid, minb.start, minb.end, "+")
    rb = (maxb.seqid, maxb.start, maxb.end, "+")

    dist, orientation = range_distance(ra, rb, distmode=distmode)
    assert dist != -1
    return dist
def fastpairs(args):
    """
    %prog fastpairs castabfile

    Assuming paired reads are adjacent in the castabfile. Print pair distance
    and orientations.
    """
    from jcvi.utils.range import range_distance
    from jcvi.assembly.base import orientationlabels

    p = OptionParser(fastpairs.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        # `not p.print_help()` makes the exit status non-zero on a usage
        # error; passing print_help()'s None straight through exited with 0.
        sys.exit(not p.print_help())

    castabfile, = args
    # Context manager so the handle is closed on every exit path.
    with open(castabfile) as fp:
        arow = fp.readline()
        while arow:
            # Rows are consumed two at a time: a read and its mate.
            brow = fp.readline()
            a, b = CasTabLine(arow), CasTabLine(brow)
            asubject, astart, astop = a.refnum, a.refstart, a.refstop
            bsubject, bstart, bstop = b.refnum, b.refstart, b.refstop
            # A start of -1 on either read means the pair is skipped.
            if -1 not in (astart, bstart):
                aquery, bquery = a.readname, b.readname
                astrand, bstrand = a.strand, b.strand
                dist, orientation = range_distance(
                    (asubject, astart, astop, astrand),
                    (bsubject, bstart, bstop, bstrand),
                )
                orientation = orientationlabels[orientation]
                if dist != -1:
                    # Single-argument print() works on Python 2 and 3 alike.
                    print("\t".join(
                        str(x) for x in (aquery, bquery, dist, orientation)))
            arow = fp.readline()
def fastpairs(args):
    """
    %prog fastpairs castabfile

    Assuming paired reads are adjacent in the castabfile. Print pair distance
    and orientations.
    """
    from jcvi.utils.range import range_distance
    from jcvi.assembly.base import orientationlabels

    p = OptionParser(fastpairs.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        # Exit non-zero on bad usage: print_help() returns None and
        # sys.exit(None) would have reported success.
        sys.exit(not p.print_help())

    castabfile, = args
    # The file is closed automatically, including when a row fails to parse.
    with open(castabfile) as fp:
        for arow in fp:
            # The mate is the very next row; "" at EOF mirrors readline().
            brow = next(fp, "")
            a, b = CasTabLine(arow), CasTabLine(brow)
            asubject, astart, astop = a.refnum, a.refstart, a.refstop
            bsubject, bstart, bstop = b.refnum, b.refstart, b.refstop
            # Pairs where either read has a start of -1 are skipped.
            if -1 in (astart, bstart):
                continue

            aquery, bquery = a.readname, b.readname
            astrand, bstrand = a.strand, b.strand
            dist, orientation = range_distance(
                (asubject, astart, astop, astrand),
                (bsubject, bstart, bstop, bstrand),
            )
            orientation = orientationlabels[orientation]
            if dist != -1:
                # One pre-joined argument keeps print() py2/py3 portable.
                print("\t".join(
                    str(x) for x in (aquery, bquery, dist, orientation)))
def pastegenes(args):
    """
    %prog pastegenes coverage.list old.genes.bed new.genes.bed old.assembly

    Paste in zero or low coverage genes.  For a set of neighboring genes
    missing, add the whole cassette as unplaced scaffolds. For singletons the
    program will try to make a patch.
    """
    from jcvi.formats.base import DictFile
    from jcvi.utils.cbook import gene_name

    p = OptionParser(pastegenes.__doc__)
    p.add_option("--cutoff", default=90, type="int",
                 help="Coverage cutoff to call gene missing [default: %default]")
    p.add_option("--flank", default=2000, type="int",
                 help="Get the seq of size on two ends [default: %default]")
    p.add_option("--maxsize", default=50000, type="int",
                 help="Maximum size of patchers to be replaced [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    # `newbed` is part of the documented usage but is not read here.
    coveragefile, oldbed, newbed, oldassembly = args
    cutoff = opts.cutoff
    flank = opts.flank
    maxsize = opts.maxsize

    coverage = DictFile(coveragefile, valuepos=2, cast=float)

    obed = Bed(oldbed)
    order = obed.order
    bed = [x for x in obed if x.accn in coverage]
    key = lambda x: coverage[x.accn] >= cutoff

    extrabed = "extra.bed"
    extendbed = "extend.bed"
    pastebed = "paste.bed"

    def emit(handle, item):
        # Portable stand-in for the py2-only `print >> handle, item`.
        handle.write(str(item) + "\n")

    singletons, large, large_genes = 0, 0, 0
    # All four outputs are closed when the block exits; previously the
    # paste.bed and extend.bed.ids handles were never closed.
    with open(extrabed, "w") as fw, open(extendbed, "w") as fwe, \
            open(pastebed, "w") as fwp, \
            open(extendbed + ".ids", "w") as fw_ids:
        for seqid, chrbed in groupby(bed, key=lambda x: x.seqid):
            chrbed = list(chrbed)
            # Runs of consecutive genes that are all below the cutoff.
            for good, beds in groupby(chrbed, key=key):
                if good:
                    continue

                beds = list(beds)
                blocksize = len(set(gene_name(x.accn) for x in beds))
                if blocksize == 1:
                    singletons += 1
                    accn = beds[0].accn
                    gi, gb = order[accn]
                    # NOTE(review): gi - 1 / gi + 1 are not bounds-checked;
                    # the first feature would wrap to the last one and the
                    # last feature would raise IndexError. Confirm whether
                    # that can occur with real input.
                    leftb = obed[gi - 1]
                    rightb = obed[gi + 1]
                    leftr = leftb.range
                    rightr = rightb.range
                    cur = gb.range
                    distance_to_left, oo = range_distance(leftr, cur)
                    distance_to_right, oo = range_distance(cur, rightr)
                    span, oo = range_distance(leftr, rightr)

                    if 0 < distance_to_left <= distance_to_right:
                        label = "LEFT"
                    else:
                        label = "RIGHT"

                    # Only neighbours within maxsize produce a paste record.
                    if 0 < span <= maxsize:
                        emit(fwp, "\t".join(str(x) for x in
                             (seqid, leftb.start, rightb.end, gb.accn)))

                    emit(fwe, leftb)
                    emit(fwe, gb)
                    emit(fwe, rightb)
                    emit(fwe, "L:{0} R:{1} [{2}]".format(
                        distance_to_left, distance_to_right, label))
                    emit(fw_ids, gb.accn)
                    continue

                large += 1
                large_genes += blocksize

                ranges = [(x.start, x.end) for x in beds]
                rmin, rmax = range_minmax(ranges)
                rmin -= flank
                rmax += flank

                name = "-".join((beds[0].accn, beds[-1].accn))
                emit(fw, "\t".join(
                    str(x) for x in (seqid, rmin - 1, rmax, name)))

    # extra.bed is closed (and flushed) before it is merged and extracted.
    extrabed = mergeBed(extrabed, d=flank, nms=True)
    fastaFromBed(extrabed, oldassembly, name=True)
    summary([extrabed])

    logging.debug("Singleton blocks : {0}".format(singletons))
    logging.debug("Large blocks : {0} ({1} genes)".format(large, large_genes))
def test_range_distance(a, b, distmode, expected):
    """Check that range_distance(a, b, distmode) equals `expected`."""
    from jcvi.utils.range import range_distance

    observed = range_distance(a, b, distmode)
    assert observed == expected
def report_pairs(data, cutoff=0, mateorientation=None,
                 pairsfile=None, insertsfile=None, rclip=1, ascii=False,
                 bins=20, distmode="ss", mpcutoff=1000):
    """
    This subroutine is used by the pairs function in blast.py and cas.py.
    Reports number of fragments and pairs as well as linked pairs.

    `data` is sorted in place by pair name (read name minus its last `rclip`
    characters). Groups of exactly two records are pairs; everything else is
    counted as fragments. Pairs with a non-negative distance, optionally
    restricted to `mateorientation`, are "linked" when their distance is
    within `cutoff` (inferred via analyze_dists when `cutoff` <= 0).

    Linked pairs go to `pairsfile` when given. `insertsfile` is a scratch file
    for the histogram and is removed afterwards. The summary is written to
    stderr. Returns the SummaryStats object built from the linked distances.
    """
    def report(message):
        # Portable replacement for the py2-only `print >> sys.stderr`.
        sys.stderr.write(message + "\n")

    allowed_mateorientations = ("++", "--", "+-", "-+")

    if mateorientation:
        assert mateorientation in allowed_mateorientations

    num_fragments, num_pairs = 0, 0

    all_dist = []
    linked_dist = []
    # +- (forward-backward) is `innie`, -+ (backward-forward) is `outie`
    orientations = defaultdict(int)

    # clip how many chars from end of the read name to get pair name
    key = (lambda x: x.accn[:-rclip]) if rclip else (lambda x: x.accn)
    data.sort(key=key)

    for pe, lines in groupby(data, key=key):
        lines = list(lines)
        if len(lines) != 2:
            num_fragments += len(lines)
            continue

        num_pairs += 1
        a, b = lines

        dist, orientation = range_distance(
            (a.seqid, a.start, a.end, a.strand),
            (b.seqid, b.start, b.end, b.strand),
            distmode=distmode)

        if dist >= 0:
            all_dist.append((dist, orientation, a.accn, b.accn))

    # select only pairs with certain orientations - e.g. innies, outies, etc.
    if mateorientation:
        all_dist = [x for x in all_dist if x[1] == mateorientation]

    # try to infer cutoff as twice the median until convergence
    if cutoff <= 0:
        dists = np.array([x[0] for x in all_dist], dtype="int")
        p0 = analyze_dists(dists, cutoff=mpcutoff)
        cutoff = int(2 * p0)  # initial estimate
        # float() forces true division so ceil() really rounds up to the next
        # multiple of `bins`; with integer division it was a no-op.
        cutoff = int(math.ceil(cutoff / float(bins))) * bins
        logging.debug("Insert size cutoff set to {0}, ".format(cutoff) +
                      "use '--cutoff' to override")

    pairsfw = open(pairsfile, "w") if pairsfile else None
    try:
        for dist, orientation, aquery, bquery in all_dist:
            if dist > cutoff:
                continue
            # With a large cutoff, distances below mpcutoff are dropped too.
            if cutoff > 2 * mpcutoff and dist < mpcutoff:
                continue

            linked_dist.append(dist)
            if pairsfw:
                pairsfw.write(
                    "{0}\t{1}\t{2}\n".format(aquery, bquery, dist))
            orientations[orientation] += 1
    finally:
        # The pairs file used to be left open.
        if pairsfw:
            pairsfw.close()

    report("{0} fragments, {1} pairs ({2} total)".format(
        num_fragments, num_pairs, num_fragments + num_pairs * 2))

    s = SummaryStats(linked_dist, dtype="int")
    num_links = s.size

    meandist, stdev = s.mean, s.sd
    p0, p1, p2 = s.median, s.p1, s.p2

    # Avoid ZeroDivisionError when the input contained no complete pair.
    linked_pct = num_links * 100. / num_pairs if num_pairs else 0.
    report("%d pairs (%.1f%%) are linked (cutoff=%d)" %
           (num_links, linked_pct, cutoff))
    report("mean distance between mates: {0} +/- {1}".format(meandist, stdev))
    report("median distance between mates: {0}".format(p0))
    report("95% distance range: {0} - {1}".format(p1, p2))
    report("\nOrientations:")

    orientation_summary = []
    for orientation, count in sorted(orientations.items()):
        o = "{0}:{1}".format(orientation,
                             percentage(count, num_links, mode=1))
        orientation_summary.append(o.split()[0])
        report(o)

    if insertsfile:
        from jcvi.graphics.histogram import histogram

        with open(insertsfile, "w") as insertsfw:
            insertsfw.write("\n".join(str(x) for x in linked_dist) + "\n")

        prefix = insertsfile.rsplit(".", 1)[0]
        # Shorten long prefixes for the plot title. The old `prefix > 10`
        # compared a str with an int: always true on Python 2, TypeError on
        # Python 3.
        if len(prefix) > 10:
            prefix = prefix.split("-")[0]
        osummary = " ".join(orientation_summary)
        title = "{0} ({1}; median:{2} bp)".format(prefix, osummary, p0)
        histogram(insertsfile, vmin=0, vmax=cutoff, bins=bins,
                  xlabel="Insertsize", title=title, ascii=ascii)
        if op.exists(insertsfile):
            os.remove(insertsfile)

    return s
def fromblast(args):
    """
    %prog fromblast blastfile subject.fasta

    Generate path from BLAST file. If multiple subjects map to the same query,
    an edge is constructed between them (with the link provided by the query).

    The BLAST file MUST be filtered, chained, supermapped.
    """
    from jcvi.formats.blast import sort
    from jcvi.utils.range import range_distance

    p = OptionParser(fromblast.__doc__)
    p.add_option("--clique", default=False, action="store_true",
                 help="Populate clique instead of linear path [default: %default]")
    p.add_option("--maxdist", default=100000, type="int",
                 help="Create edge within certain distance [default: %default]")
    p.add_option("--verbose", default=False, action="store_true",
                 help="Print verbose reports to stdout [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, subjectfasta = args
    clique = opts.clique
    maxdist = opts.maxdist
    sort([blastfile, "--query"])
    blast = BlastSlow(blastfile, sorted=True)
    g = BiGraph()
    for query, blines in groupby(blast, key=lambda x: x.query):
        blines = list(blines)
        # Clique mode links every pair of hits; otherwise only neighbours.
        iterator = combinations(blines, 2) if clique else pairwise(blines)
        for a, b in iterator:
            asub, bsub = a.subject, b.subject
            if asub == bsub:
                continue

            arange = (a.query, a.qstart, a.qstop, "+")
            brange = (b.query, b.qstart, b.qstop, "+")
            dist, oo = range_distance(arange, brange, distmode="ee")
            if dist > maxdist:
                continue

            atag = ">" if a.orientation == "+" else "<"
            btag = ">" if b.orientation == "+" else "<"
            g.add_edge(BiEdge(asub, bsub, atag, btag))

    g.write("graph.txt")
    #g.draw("graph.pdf")

    logging.debug(str(g))
    paths = []
    for path in g.iter_paths():
        m, oo = g.path(path)
        if len(oo) == 1:  # Singleton path
            continue
        paths.append(oo)
        if opts.verbose:
            # Single-argument print() is valid on Python 2 and Python 3.
            print(m)
            print(oo)

    npaths = len(paths)
    ntigs = sum(len(x) for x in paths)
    logging.debug("Graph decomposed to {0} paths with {1} components.".format(
        npaths, ntigs))

    agpfile = blastfile + ".agp"
    sizes = Sizes(subjectfasta)
    scaffolded = set()
    nsingletons = 0
    # Context manager so the AGP file is closed even if writing fails.
    with open(agpfile, "w") as fwagp:
        for i, oo in enumerate(paths):
            ctgorder = [(str(ctg), ("+" if strand else "-"))
                        for ctg, strand in oo]
            scaffolded |= set(ctg for ctg, strand in ctgorder)
            # Renamed from `object`, which shadowed the builtin.
            object_id = "pmol_{0:04d}".format(i)
            order_to_agp(object_id, ctgorder, sizes.mapping, fwagp)

        # Get the singletons as well
        for ctg, size in sizes.iter_sizes():
            if ctg in scaffolded:
                continue

            order_to_agp(ctg, [(ctg, "+")], sizes.mapping, fwagp)
            nsingletons += 1

    logging.debug("Written {0} unscaffolded singletons.".format(nsingletons))
    logging.debug("AGP file written to `{0}`.".format(agpfile))
def fromblast(args):
    """
    %prog fromblast blastfile subject.fasta

    Generate path from BLAST file. If multiple subjects map to the same query,
    an edge is constructed between them (with the link provided by the query).

    The BLAST file MUST be filtered, chained, supermapped.
    """
    from jcvi.formats.blast import sort
    from jcvi.utils.range import range_distance

    p = OptionParser(fromblast.__doc__)
    p.add_option(
        "--clique", default=False, action="store_true",
        help="Populate clique instead of linear path [default: %default]")
    p.add_option(
        "--maxdist", default=100000, type="int",
        help="Create edge within certain distance [default: %default]")
    p.add_option(
        "--verbose", default=False, action="store_true",
        help="Print verbose reports to stdout [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    blastfile, subjectfasta = args
    clique = opts.clique
    maxdist = opts.maxdist
    sort([blastfile, "--query"])
    blast = BlastSlow(blastfile, sorted=True)
    g = BiGraph()
    for query, blines in groupby(blast, key=lambda x: x.query):
        blines = list(blines)
        # All pairs in clique mode, adjacent pairs otherwise.
        iterator = combinations(blines, 2) if clique else pairwise(blines)
        for a, b in iterator:
            asub, bsub = a.subject, b.subject
            # Hits on the same subject do not link anything.
            if asub == bsub:
                continue

            arange = (a.query, a.qstart, a.qstop, "+")
            brange = (b.query, b.qstart, b.qstop, "+")
            dist, oo = range_distance(arange, brange, distmode="ee")
            if dist > maxdist:
                continue

            atag = ">" if a.orientation == "+" else "<"
            btag = ">" if b.orientation == "+" else "<"
            g.add_edge(BiEdge(asub, bsub, atag, btag))

    g.write("graph.txt")
    #g.draw("graph.pdf")

    logging.debug(str(g))
    paths = []
    for path in g.iter_paths():
        m, oo = g.path(path)
        if len(oo) == 1:  # Singleton path
            continue
        paths.append(oo)
        if opts.verbose:
            # The py2 print statements are replaced by the portable call form.
            print(m)
            print(oo)

    npaths = len(paths)
    ntigs = sum(len(x) for x in paths)
    logging.debug("Graph decomposed to {0} paths with {1} components.".format(
        npaths, ntigs))

    agpfile = blastfile + ".agp"
    sizes = Sizes(subjectfasta)
    scaffolded = set()
    nsingletons = 0
    # `with` guarantees the AGP handle is released on every exit path.
    with open(agpfile, "w") as fwagp:
        for i, oo in enumerate(paths):
            ctgorder = [(str(ctg), ("+" if strand else "-"))
                        for ctg, strand in oo]
            scaffolded.update(ctg for ctg, strand in ctgorder)
            # `agp_object` avoids shadowing the builtin `object`.
            agp_object = "pmol_{0:04d}".format(i)
            order_to_agp(agp_object, ctgorder, sizes.mapping, fwagp)

        # Get the singletons as well
        for ctg, size in sizes.iter_sizes():
            if ctg in scaffolded:
                continue

            order_to_agp(ctg, [(ctg, "+")], sizes.mapping, fwagp)
            nsingletons += 1

    logging.debug("Written {0} unscaffolded singletons.".format(nsingletons))
    logging.debug("AGP file written to `{0}`.".format(agpfile))
def pastegenes(args):
    """
    %prog pastegenes coverage.list old.genes.bed new.genes.bed old.assembly

    Paste in zero or low coverage genes.  For a set of neighboring genes
    missing, add the whole cassette as unplaced scaffolds. For singletons the
    program will try to make a patch.
    """
    from jcvi.formats.base import DictFile
    from jcvi.utils.cbook import gene_name

    p = OptionParser(pastegenes.__doc__)
    p.add_option(
        "--cutoff",
        default=90,
        type="int",
        help="Coverage cutoff to call gene missing",
    )
    p.add_option(
        "--flank",
        default=2000,
        type="int",
        help="Get the seq of size on two ends",
    )
    p.add_option(
        "--maxsize",
        default=50000,
        type="int",
        help="Maximum size of patchers to be replaced",
    )
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    # `newbed` is part of the documented usage but is not read here.
    coveragefile, oldbed, newbed, oldassembly = args
    cutoff = opts.cutoff
    flank = opts.flank
    maxsize = opts.maxsize

    coverage = DictFile(coveragefile, valuepos=2, cast=float)

    obed = Bed(oldbed)
    order = obed.order
    bed = [x for x in obed if x.accn in coverage]
    key = lambda x: coverage[x.accn] >= cutoff

    extrabed = "extra.bed"
    extendbed = "extend.bed"
    pastebed = "paste.bed"

    singletons, large, large_genes = 0, 0, 0
    # One `with` owns all four outputs; paste.bed and extend.bed.ids were
    # previously never closed.
    with open(extrabed, "w") as fw, open(extendbed, "w") as fwe, open(
        pastebed, "w"
    ) as fwp, open(extendbed + ".ids", "w") as fw_ids:
        for seqid, chrbed in groupby(bed, key=lambda x: x.seqid):
            chrbed = list(chrbed)
            # Runs of consecutive genes whose coverage is below the cutoff.
            for good, beds in groupby(chrbed, key=key):
                if good:
                    continue

                beds = list(beds)
                blocksize = len({gene_name(x.accn) for x in beds})
                if blocksize == 1:
                    singletons += 1
                    accn = beds[0].accn
                    gi, gb = order[accn]
                    # NOTE(review): neighbours are fetched without bounds
                    # checks; gi == 0 wraps to the last feature and the last
                    # gi raises IndexError. Confirm against real input.
                    leftb = obed[gi - 1]
                    rightb = obed[gi + 1]
                    leftr = leftb.range
                    rightr = rightb.range
                    cur = gb.range
                    distance_to_left, oo = range_distance(leftr, cur)
                    distance_to_right, oo = range_distance(cur, rightr)
                    span, oo = range_distance(leftr, rightr)

                    if 0 < distance_to_left <= distance_to_right:
                        label = "LEFT"
                    else:
                        label = "RIGHT"

                    # A paste record is only written for short enough spans.
                    if 0 < span <= maxsize:
                        print(
                            "\t".join(
                                str(x)
                                for x in (seqid, leftb.start, rightb.end, gb.accn)
                            ),
                            file=fwp,
                        )

                    print(leftb, file=fwe)
                    print(gb, file=fwe)
                    print(rightb, file=fwe)
                    print(
                        "L:{0} R:{1} [{2}]".format(
                            distance_to_left, distance_to_right, label
                        ),
                        file=fwe,
                    )
                    print(gb.accn, file=fw_ids)
                    continue

                large += 1
                large_genes += blocksize

                ranges = [(x.start, x.end) for x in beds]
                rmin, rmax = range_minmax(ranges)
                rmin -= flank
                rmax += flank

                name = "-".join((beds[0].accn, beds[-1].accn))
                print(
                    "\t".join(str(x) for x in (seqid, rmin - 1, rmax, name)),
                    file=fw,
                )

    # extra.bed is closed (and flushed) before it is merged and extracted.
    extrabed = mergeBed(extrabed, d=flank, nms=True)
    fastaFromBed(extrabed, oldassembly, name=True)
    summary([extrabed])

    logging.debug("Singleton blocks : {0}".format(singletons))
    logging.debug("Large blocks : {0} ({1} genes)".format(large, large_genes))
def report_pairs(data, cutoff=0, mateorientation=None,
                 pairsfile=None, insertsfile=None, rclip=1, ascii=False,
                 bins=20, distmode="ss"):
    """
    This subroutine is used by the pairs function in blast.py and cas.py.
    Reports number of fragments and pairs as well as linked pairs.

    `data` is sorted in place by pair name (read name minus its last `rclip`
    characters). Groups of exactly two records are pairs; everything else is
    counted as fragments. Pairs with a non-negative distance, optionally
    restricted to `mateorientation`, are "linked" when their distance does not
    exceed `cutoff` (twice the median, rounded up to a multiple of `bins`,
    when `cutoff` <= 0).

    Linked pairs go to `pairsfile` when given. `insertsfile` is a scratch file
    for the histogram and is removed afterwards. The summary is written to
    stderr. Returns (meandist, stdev, p0, p1, p2): integer mean, standard
    deviation and median, followed by the values at the 2.5% and 97.5%
    positions of the sorted linked distances.
    """
    from jcvi.utils.cbook import percentage

    def report(message):
        # Portable replacement for the py2-only `print >>sys.stderr`.
        sys.stderr.write(message + "\n")

    allowed_mateorientations = ("++", "--", "+-", "-+")

    if mateorientation:
        assert mateorientation in allowed_mateorientations

    num_fragments, num_pairs = 0, 0

    all_dist = []
    linked_dist = []
    # +- (forward-backward) is `innie`, -+ (backward-forward) is `outie`
    orientations = defaultdict(int)

    # clip how many chars from end of the read name to get pair name
    key = (lambda x: x.accn[:-rclip]) if rclip else (lambda x: x.accn)
    data.sort(key=key)

    for pe, lines in groupby(data, key=key):
        lines = list(lines)
        if len(lines) != 2:
            num_fragments += len(lines)
            continue

        num_pairs += 1
        a, b = lines

        dist, orientation = range_distance(
            (a.seqid, a.start, a.end, a.strand),
            (b.seqid, b.start, b.end, b.strand),
            distmode=distmode)

        if dist >= 0:
            all_dist.append((dist, orientation, a.accn, b.accn))

    # select only pairs with certain orientations - e.g. innies, outies, etc.
    if mateorientation:
        all_dist = [x for x in all_dist if x[1] == mateorientation]

    # try to infer cutoff as twice the median until convergence
    if cutoff <= 0:
        dists = np.array([x[0] for x in all_dist], dtype="int")
        p0 = np.median(dists)
        cutoff = int(2 * p0)  # initial estimate
        # float() forces true division so ceil() really rounds up to the next
        # multiple of `bins`; with integer division it was a no-op.
        cutoff = int(math.ceil(cutoff / float(bins))) * bins
        logging.debug("Insert size cutoff set to {0}, ".format(cutoff) +
                      "use '--cutoff' to override")

    pairsfw = open(pairsfile, "w") if pairsfile else None
    try:
        for dist, orientation, aquery, bquery in all_dist:
            if dist > cutoff:
                continue

            linked_dist.append(dist)
            if pairsfw:
                pairsfw.write(
                    "{0}\t{1}\t{2}\n".format(aquery, bquery, dist))
            orientations[orientation] += 1
    finally:
        # The pairs file used to be left open.
        if pairsfw:
            pairsfw.close()

    report("%d fragments, %d pairs" % (num_fragments, num_pairs))
    num_links = len(linked_dist)

    # Sorted so the percentile positions below can be read by index.
    linked_dist = np.sort(np.array(linked_dist, dtype="int"))

    meandist = np.mean(linked_dist)
    stdev = np.std(linked_dist)

    p0 = np.median(linked_dist)
    p1 = linked_dist[int(num_links * .025)]
    p2 = linked_dist[int(num_links * .975)]

    meandist, stdev = int(meandist), int(stdev)
    p0 = int(p0)

    # Avoid ZeroDivisionError when the input contained no complete pair.
    linked_pct = num_links * 100. / num_pairs if num_pairs else 0.
    report("%d pairs (%.1f%%) are linked (cutoff=%d)" %
           (num_links, linked_pct, cutoff))
    report("mean distance between mates: {0} +/- {1}".format(meandist, stdev))
    report("median distance between mates: {0}".format(p0))
    report("95% distance range: {0} - {1}".format(p1, p2))
    report("\nOrientations:")

    orientation_summary = []
    for orientation, count in sorted(orientations.items()):
        o = "{0}:{1}".format(orientation,
                             percentage(count, num_links, denominator=False))
        orientation_summary.append(o.split()[0])
        report(o)

    if insertsfile:
        from jcvi.graphics.histogram import histogram

        with open(insertsfile, "w") as insertsfw:
            insertsfw.write("\n".join(str(x) for x in linked_dist) + "\n")

        prefix = insertsfile.rsplit(".", 1)[0]
        osummary = " ".join(orientation_summary)
        title = "{0} ({1}; median dist:{2})".format(prefix, osummary, p0)
        histogram(insertsfile, vmin=0, vmax=cutoff, bins=bins,
                  xlabel="Insertsize", title=title, ascii=ascii)
        if op.exists(insertsfile):
            os.remove(insertsfile)

    return meandist, stdev, p0, p1, p2