def n50(args):
    """
    %prog n50 filename

    Given a file with a list of numbers denoting contig lengths, calculate
    N50. Input file can be either FASTA or a list of sizes.
    """
    from jcvi.graphics.histogram import loghistogram

    p = OptionParser(n50.__doc__)
    p.add_option(
        "--print0",
        default=False,
        action="store_true",
        help="Print size and L50 to stdout",
    )
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    ctgsizes = []

    # Guess file format
    probe = open(args[0]).readline()[0]
    isFasta = probe == ">"
    if isFasta:
        for filename in args:
            f = Fasta(filename)
            ctgsizes += list(b for a, b in f.itersizes())
    else:
        for row in must_open(args):
            try:
                ctgsize = int(float(row.split()[-1]))
            except ValueError:
                continue
            ctgsizes.append(ctgsize)

    a50, l50, nn50 = calculate_A50(ctgsizes)
    sumsize = sum(ctgsizes)
    minsize = min(ctgsizes)
    maxsize = max(ctgsizes)
    n = len(ctgsizes)
    print(", ".join(args), file=sys.stderr)

    # `header` is a module-level tuple of summary field names
    summary = (sumsize, l50, nn50, minsize, maxsize, n)
    print(
        " ".join("{0}={1}".format(a, b) for a, b in zip(header, summary)),
        file=sys.stderr,
    )
    loghistogram(ctgsizes)

    if opts.print0:
        print("\t".join(str(x) for x in (",".join(args), sumsize, l50)))

    return zip(header, summary)
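# `calculate_A50` is defined elsewhere in this module. For reference, a
# minimal sketch of the contract the callers here rely on: a cumulative-size
# curve plus the L50/N50 values. NOTE: this is an illustrative assumption,
# not the actual implementation; jcvi appears to use `l50` for the length and
# `n50` for the count, the reverse of the more common naming convention.
def _calculate_A50_sketch(ctgsizes, cutoff=0, percent=50):
    import numpy as np

    ctgsizes = np.sort(np.asarray(ctgsizes, dtype=int))[::-1]  # largest first
    ctgsizes = ctgsizes[ctgsizes >= cutoff]
    a50 = np.cumsum(ctgsizes)
    # first contig index at which `percent`% of the total size is reached
    idx = int(np.searchsorted(a50, a50[-1] * percent / 100.0))
    return a50, int(ctgsizes[idx]), idx + 1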
def A50(args):
    """
    %prog A50 contigs_A.fasta contigs_B.fasta ...

    Plots A50 graphics; see blog post (http://blog.malde.org/index.php/a50/)
    """
    p = OptionParser(A50.__doc__)
    p.add_option(
        "--overwrite",
        default=False,
        action="store_true",
        help="overwrite .rplot file if exists [default: %default]",
    )
    p.add_option(
        "--cutoff",
        default=0,
        type="int",
        dest="cutoff",
        help="use contigs above certain size [default: %default]",
    )
    p.add_option(
        "--stepsize",
        default=10,
        type="int",
        dest="stepsize",
        help="stepsize for the distribution [default: %default]",
    )
    opts, args = p.parse_args(args)

    if not args:
        sys.exit(not p.print_help())

    import numpy as np
    from jcvi.utils.table import loadtable

    stepsize = opts.stepsize  # use stepsize to speed up drawing
    rplot = "A50.rplot"
    if not op.exists(rplot) or opts.overwrite:
        fw = open(rplot, "w")
        header = "\t".join(("index", "cumsize", "fasta"))
        statsheader = ("Fasta", "L50", "N50", "Min", "Max", "Average", "Sum", "Counts")
        statsrows = []
        print(header, file=fw)
        for fastafile in args:
            f = Fasta(fastafile, index=False)
            ctgsizes = [length for k, length in f.itersizes()]
            ctgsizes = np.array(ctgsizes)
            a50, l50, n50 = calculate_A50(ctgsizes, cutoff=opts.cutoff)
            cmin, cmax, cmean = min(ctgsizes), max(ctgsizes), np.mean(ctgsizes)
            csum, counts = np.sum(ctgsizes), len(ctgsizes)
            cmean = int(round(cmean))
            statsrows.append((fastafile, l50, n50, cmin, cmax, cmean, csum, counts))
            logging.debug("`{0}` ctgsizes: {1}".format(fastafile, ctgsizes))

            tag = "{0} (L50={1})".format(op.basename(fastafile).rsplit(".", 1)[0], l50)
            logging.debug(tag)
            for i, s in zip(range(0, len(a50), stepsize), a50[::stepsize]):
                print("\t".join((str(i), str(s / 1000000.0), tag)), file=fw)
        fw.close()

        table = loadtable(statsheader, statsrows)
        print(table, file=sys.stderr)

    generate_plot(rplot)
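# `generate_plot` renders A50.rplot through an R template elsewhere in jcvi.
# For a quick look without R, a rough matplotlib equivalent that reads the
# `index`/`cumsize`/`fasta` columns written above (the function name and the
# pandas/matplotlib choice are assumptions, not part of jcvi):
def _plot_a50_sketch(rplot="A50.rplot", outfile="A50.png"):
    import matplotlib.pyplot as plt
    import pandas as pd

    df = pd.read_csv(rplot, sep="\t")
    for tag, sub in df.groupby("fasta"):
        plt.plot(sub["index"], sub["cumsize"], label=tag)
    plt.xlabel("Contigs (largest first)")
    plt.ylabel("Cumulative size (Mb)")
    plt.legend()
    plt.savefig(outfile)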
def minimap(args):
    """
    %prog minimap ref.fasta query.fasta

    Wrap minimap2 aligner, mapping query against ref. When query and ref are
    the same, we are in "self-scan" mode (e.g. useful for finding internal
    duplications resulting from mis-assemblies).
    """
    from jcvi.apps.grid import MakeManager
    from jcvi.formats.fasta import Fasta

    p = OptionParser(minimap.__doc__)
    p.add_option(
        "--chunks",
        type="int",
        default=2000000,
        help="Split ref.fasta into chunks of size in self-scan mode",
    )
    p.set_outdir(outdir="outdir")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, query = args
    chunks = opts.chunks
    outdir = opts.outdir
    if ref != query:
        raise NotImplementedError

    # "self-scan" mode
    # build faidx (otherwise, parallel make may complain)
    sh("samtools faidx {}".format(ref))

    f = Fasta(ref)
    mkdir(outdir)
    mm = MakeManager()
    for name, size in f.itersizes():
        start = 0
        for end in range(chunks, size + chunks, chunks):
            end = min(end, size)  # include the trailing partial chunk
            fafile = op.join(outdir, "{}_{}_{}.fa".format(name, start + 1, end))
            cmd = "samtools faidx {} {}:{}-{} -o {}".format(
                ref, name, start + 1, end, fafile
            )
            mm.add(ref, fafile, cmd)

            paffile = fafile.rsplit(".", 1)[0] + ".paf"
            cmd = "minimap2 -P {} {} > {}".format(fafile, fafile, paffile)
            mm.add(fafile, paffile, cmd)

            epsfile = fafile.rsplit(".", 1)[0] + ".eps"
            cmd = "minidot {} > {}".format(paffile, epsfile)
            mm.add(paffile, epsfile, cmd)
            start += chunks

    mm.write()
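# The chunking loop above is equivalent to the small helper below, which can
# be sanity-checked in isolation (helper name is illustrative only):
def _chunk_ranges(size, chunks):
    """Yield 1-based inclusive (start, end) windows covering a sequence."""
    start = 0
    while start < size:
        end = min(start + chunks, size)
        yield start + 1, end
        start = end

# e.g. a 5 Mb contig in 2 Mb chunks, with the trailing partial chunk kept:
assert list(_chunk_ranges(5000000, 2000000)) == [
    (1, 2000000),
    (2000001, 4000000),
    (4000001, 5000000),
]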
def main(blast_file, cds_file, bed_file, N=3):
    # get the sizes for the CDS first
    f = Fasta(cds_file)
    sizes = dict(f.itersizes())

    # retrieve the locations
    bed = Bed(bed_file).order

    # filter the blast file
    g = Grouper()
    fp = open(blast_file)
    for row in fp:
        b = BlastLine(row)
        query_len = sizes[b.query]
        subject_len = sizes[b.subject]
        if b.hitlen < min(query_len, subject_len) / 2:
            continue

        query, subject = gene_name(b.query), gene_name(b.subject)
        qi, q = bed[query]
        si, s = bed[subject]

        if q.seqid == s.seqid and abs(qi - si) <= N:
            g.join(query, subject)

    # dump the grouper
    ngenes, nfamilies = 0, 0
    families = []
    for group in sorted(g):
        if len(group) >= 2:
            print(",".join(sorted(group)))
            ngenes += len(group)
            nfamilies += 1
            families.append(sorted(group))

    longest_family = max(families, key=lambda x: len(x))

    # generate reports
    print("Proximal paralogues (dist=%d):" % N, file=sys.stderr)
    print("Total %d genes in %d families" % (ngenes, nfamilies), file=sys.stderr)
    print(
        "Longest families (%d): %s" % (len(longest_family), ",".join(longest_family)),
        file=sys.stderr,
    )
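# `Grouper` (jcvi.utils.grouper) acts as a disjoint-set over gene names. A
# minimal sketch of the interface relied on here -- `join`, `joined`, and
# iteration over the resulting groups -- assuming standard union-find
# semantics (not jcvi's actual implementation):
class _GrouperSketch:
    def __init__(self):
        self.mapping = {}  # item -> the set it currently belongs to

    def join(self, a, b):
        sa = self.mapping.setdefault(a, {a})
        sb = self.mapping.setdefault(b, {b})
        if sa is not sb:
            if len(sa) < len(sb):  # merge the smaller set into the larger
                sa, sb = sb, sa
            sa |= sb
            for item in sb:
                self.mapping[item] = sa

    def joined(self, a, b):
        return self.mapping.get(a) is self.mapping.get(b) is not None

    def __iter__(self):
        seen = set()
        for s in self.mapping.values():
            if id(s) not in seen:
                seen.add(id(s))
                yield sorted(s)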
def clr(args):
    """
    %prog clr blastfile fastafiles

    Calculate the vector clear range file based on BLAST hits against the
    vector sequences.
    """
    p = OptionParser(clr.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    blastfile = args[0]
    fastafiles = args[1:]

    sizes = {}
    for fa in fastafiles:
        f = Fasta(fa)
        sizes.update(f.itersizes())

    b = Blast(blastfile)
    for query, hits in b.iter_hits():
        qsize = sizes[query]
        vectors = list((x.qstart, x.qstop) for x in hits)
        vmin, vmax = range_minmax(vectors)

        left_size = vmin - 1
        right_size = qsize - vmax

        if left_size > right_size:
            clr_start, clr_end = 0, vmin
        else:
            clr_start, clr_end = vmax, qsize

        print("\t".join(str(x) for x in (query, clr_start, clr_end)))
        del sizes[query]

    # queries without vector hits are clear over their full length
    for q, size in sorted(sizes.items()):
        print("\t".join(str(x) for x in (q, 0, size)))
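# `range_minmax` comes from jcvi.utils.range; the behavior relied on above is
# just the overall span of a list of intervals. A sketch (assuming inclusive
# (start, stop) tuples; not the actual implementation):
def _range_minmax_sketch(ranges):
    starts, stops = zip(*ranges)
    return min(starts), max(stops)

assert _range_minmax_sketch([(30, 45), (40, 50), (10, 25)]) == (10, 50)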
def tandem_main(blast_file, cds_file, bed_file, N=3, P=50, is_self=True,
                evalue=.01, strip_name=".", ofile=sys.stderr, genefam=False):

    if genefam:
        N = int(1e5)  # effectively no distance constraint

    # get the sizes for the CDS first
    f = Fasta(cds_file)
    sizes = dict(f.itersizes())

    # retrieve the locations
    bed = Bed(bed_file)
    order = bed.order

    if is_self:
        # filter the blast file
        g = Grouper()
        fp = open(blast_file)
        for row in fp:
            b = BlastLine(row)
            query_len = sizes[b.query]
            subject_len = sizes[b.subject]
            if b.hitlen < min(query_len, subject_len) * P / 100.:
                continue

            query = gene_name(b.query, strip_name)
            subject = gene_name(b.subject, strip_name)
            qi, q = order[query]
            si, s = order[subject]

            if abs(qi - si) <= N and b.evalue <= evalue:
                if genefam:
                    g.join(query, subject)
                elif q.seqid == s.seqid:
                    g.join(query, subject)

    else:
        homologs = Grouper()
        fp = open(blast_file)
        for row in fp:
            b = BlastLine(row)
            query_len = sizes[b.query]
            subject_len = sizes[b.subject]
            if b.hitlen < min(query_len, subject_len) * P / 100.:
                continue
            if b.evalue > evalue:
                continue

            query = gene_name(b.query, strip_name)
            subject = gene_name(b.subject, strip_name)
            homologs.join(query, subject)

        if genefam:
            g = homologs
        else:
            g = Grouper()
            for i, atom in enumerate(bed):
                for x in range(1, N + 1):
                    if all([i - x >= 0,
                            bed[i - x].seqid == atom.seqid,
                            homologs.joined(bed[i - x].accn, atom.accn)]):
                        leni = sizes[bed[i].accn]
                        lenx = sizes[bed[i - x].accn]
                        if abs(leni - lenx) > max(leni, lenx) * (1 - P / 100.):
                            continue
                        g.join(bed[i - x].accn, atom.accn)

    # dump the grouper
    fw = must_open(ofile, "w")
    ngenes, nfamilies = 0, 0
    families = []
    for group in sorted(g):
        if len(group) >= 2:
            print(",".join(sorted(group)), file=fw)
            ngenes += len(group)
            nfamilies += 1
            families.append(sorted(group))

    longest_family = max(families, key=lambda x: len(x))

    # generate reports
    print("Proximal paralogues (dist=%d):" % N, file=sys.stderr)
    print("Total %d genes in %d families" % (ngenes, nfamilies), file=sys.stderr)
    print("Longest families (%d): %s" % (len(longest_family),
                                         ",".join(longest_family)), file=sys.stderr)

    return families
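# `gene_name` strips a trailing isoform/version suffix so that BLAST IDs
# match BED accessions. A sketch of the assumed behavior (jcvi's actual
# implementation may handle more separator conventions):
def _gene_name_sketch(st, sep="."):
    return st.rsplit(sep, 1)[0] if sep in st else st

assert _gene_name_sketch("AT1G01010.1") == "AT1G01010"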
def ace(args):
    """
    %prog ace bamfile fastafile

    Convert BAM format to ACE format. This often allows the remapping to be
    assessed as a de novo assembly format. The BAM file needs to be indexed.
    Also creates a .mates file to be used in amos/bambus, and a .astat file
    to mark whether the contig is unique or repetitive based on A-statistics
    in Celera assembler.
    """
    p = OptionParser(ace.__doc__)
    p.add_option(
        "--splitdir",
        dest="splitdir",
        default="outRoot",
        help="split the ace per contig to dir",
    )
    p.add_option(
        "--unpaired",
        dest="unpaired",
        default=False,
        help="remove read pairs on the same contig",
    )
    p.add_option(
        "--minreadno",
        dest="minreadno",
        default=3,
        type="int",
        help="minimum read numbers per contig",
    )
    p.add_option(
        "--minctgsize",
        dest="minctgsize",
        default=100,
        type="int",
        help="minimum contig size per contig",
    )
    p.add_option(
        "--astat",
        default=False,
        action="store_true",
        help="create .astat to list repetitiveness",
    )
    p.add_option(
        "--readids",
        default=False,
        action="store_true",
        help="create file of mapped and unmapped ids",
    )

    from pysam import Samfile

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, fastafile = args
    astat = opts.astat
    readids = opts.readids

    f = Fasta(fastafile)
    prefix = bamfile.split(".")[0]
    acefile = prefix + ".ace"
    readsfile = prefix + ".reads"
    astatfile = prefix + ".astat"

    logging.debug("Load {0}".format(bamfile))
    s = Samfile(bamfile, "rb")

    ncontigs = s.nreferences
    genomesize = sum(x for a, x in f.itersizes())
    logging.debug("Total {0} contigs with size {1} base".format(ncontigs, genomesize))
    qual = "20"  # default qual

    totalreads = sum(s.count(x) for x in s.references)
    logging.debug("Total {0} reads mapped".format(totalreads))

    fw = open(acefile, "w")
    if astat:
        astatfw = open(astatfile, "w")
    if readids:
        readsfw = open(readsfile, "w")

    print("AS {0} {1}".format(ncontigs, totalreads), file=fw)
    print(file=fw)

    for i, contig in enumerate(s.references):
        cseq = f[contig]
        nbases = len(cseq)

        mapped_reads = [x for x in s.fetch(contig) if not x.is_unmapped]
        nreads = len(mapped_reads)

        nsegments = 0
        print(
            "CO {0} {1} {2} {3} U".format(contig, nbases, nreads, nsegments),
            file=fw,
        )
        print(fill(str(cseq.seq)), file=fw)
        print(file=fw)

        if astat:
            # use a separate name to avoid clobbering the `astat` flag
            astat_val = Astat(nbases, nreads, genomesize, totalreads)
            print("{0}\t{1:.1f}".format(contig, astat_val), file=astatfw)

        text = fill([qual] * nbases, delimiter=" ", width=30)
        print("BQ\n{0}".format(text), file=fw)
        print(file=fw)

        rnames = []
        for a in mapped_reads:
            readname = a.qname
            rname = readname

            if readids:
                print(readname, file=readsfw)
            rnames.append(rname)

            strand = "C" if a.is_reverse else "U"
            paddedstart = a.pos + 1  # 0-based to 1-based
            af = "AF {0} {1} {2}".format(rname, strand, paddedstart)
            print(af, file=fw)

        print(file=fw)

        for a, rname in zip(mapped_reads, rnames):
            aseq, npadded = cigar_to_seq(a)
            if aseq is None:
                continue

            ninfos = 0
            ntags = 0
            alen = len(aseq)
            rd = "RD {0} {1} {2} {3}\n{4}".format(
                rname, alen, ninfos, ntags, fill(aseq)
            )
            qs = "QA 1 {0} 1 {0}".format(alen)

            print(rd, file=fw)
            print(file=fw)
            print(qs, file=fw)
            print(file=fw)
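# `cigar_to_seq` (defined elsewhere in this module) must return the padded
# read sequence that ACE expects, plus the number of pads inserted. A rough
# sketch of that contract using pysam's cigartuples; the pad character and
# the treatment of clips here are assumptions, not the actual implementation:
def _cigar_to_seq_sketch(a, gap="*"):
    if not a.cigartuples or a.query_sequence is None:
        return None, 0
    seq, pieces, npadded, qpos = a.query_sequence, [], 0, 0
    for op_, length in a.cigartuples:
        if op_ in (0, 1, 4, 7, 8):  # M, I, S, =, X consume the query
            pieces.append(seq[qpos:qpos + length])
            qpos += length
        elif op_ in (2, 3):  # D, N consume the reference: pad the read
            pieces.append(gap * length)
            npadded += length
        # H (5) and P (6) consume neither sequence here
    return "".join(pieces), npadded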