def filter(args):
    """
    %prog filter consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(filter.__doc__)
    p.add_option("--minsize", default=10, type="int",
                 help="Minimum cluster size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    min_cluster_size = opts.minsize
    fasta = Fasta(fastafile, lazy=True)
    outfw = must_open(opts.outfile, "w")
    for description, record in fasta.iterdescriptions_ordered():
        # Singleton reads never form a cluster worth keeping
        if description.startswith("singleton"):
            continue
        # Header looks like: "consensus_for_cluster_0 with 63 sequences"
        cluster_name, keyword, cluster_size, tail = description.split()
        assert keyword == "with"
        if int(cluster_size) < min_cluster_size:
            continue
        SeqIO.write(record, outfw, "fasta")
def flip(args):
    """
    %prog flip fastafile

    Go through each FASTA record, check against Genbank file and determines
    whether or not to flip the sequence. This is useful before updates of the
    sequences to make sure the same orientation is used.
    """
    p = OptionParser(flip.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    flipped = fastafile.rsplit(".", 1)[0] + ".flipped.fasta"
    outfw = open(flipped, "w")
    fasta = Fasta(fastafile, lazy=True)
    for seqid, record in fasta.iteritems_ordered():
        # Dump the single record to a scratch file so `overlap` can align it
        scratch = "a.fasta"
        tmpfw = open(scratch, "w")
        SeqIO.write([record], tmpfw, "fasta")
        tmpfw.close()

        result = overlap([scratch, seqid])
        if result.orientation == '-':
            # Genbank copy is on the other strand; flip before writing
            record.seq = record.seq.reverse_complement()
        SeqIO.write([record], outfw, "fasta")
        os.remove(scratch)
def dump(args):
    """
    %prog dump fastafile

    Convert FASTA sequences to list of K-mers.
    """
    p = OptionParser(dump.__doc__)
    p.add_option("-K", default=23, type="int",
                 help="K-mer size [default: %default]")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    K = opts.K
    fw = must_open(opts.outfile, "w")
    f = Fasta(fastafile, lazy=True)
    for name, rec in f.iteritems_ordered():
        kmers = list(make_kmers(rec.seq, K))
        # print() function, consistent with the Python 3 blocks in this file
        # (the old `print >> fw` statement is a syntax error under Python 3)
        print("\n".join(kmers), file=fw)
    fw.close()
def fpkm(args):
    """
    %prog fpkm fastafile *.bam

    Calculate FPKM values from BAM file.
    """
    p = OptionParser(fpkm.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    fastafile = args[0]
    bamfiles = args[1:]
    # Create a DUMMY gff file for cuffdiff
    gffile = fastafile.rsplit(".", 1)[0] + ".gff"
    if need_update(fastafile, gffile):
        fw = open(gffile, "w")
        f = Fasta(fastafile, lazy=True)
        for key, size in f.itersizes_ordered():
            # One whole-sequence "transcript" feature per FASTA entry.
            # print() function replaces the py2-only `print >> fw` statement.
            print("\t".join(str(x) for x in
                            (key, "dummy", "transcript",
                             1, size, ".", ".", ".", "ID=" + key)), file=fw)
        fw.close()
        logging.debug("Dummy GFF created: {0}".format(gffile))

    cmd = "cuffdiff {0} {1}".format(gffile, " ".join(bamfiles))
    sh(cmd)
def score(args):
    """
    %prog score blastfile query.fasta A.ids

    Add up the scores for each query seq. Go through the lines and for each
    query sequence, add up the scores when subject is in each pile by A.ids.
    """
    from jcvi.formats.base import SetFile
    from jcvi.formats.fasta import Fasta

    p = OptionParser(score.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    blastfile, fastafile, idsfile = args
    ids = SetFile(idsfile)

    blast = Blast(blastfile)
    scores = defaultdict(int)
    for b in blast:
        query = b.query
        subject = b.subject
        if subject not in ids:
            continue
        scores[query] += b.score

    logging.debug("A total of {0} ids loaded.".format(len(ids)))

    f = Fasta(fastafile)
    for s in f.iterkeys_ordered():
        sc = scores.get(s, 0)
        # print() function replaces the py2-only print statement, matching the
        # Python 3 blocks elsewhere in this file
        print("\t".join((s, str(sc))))
def count(args):
    """
    %prog count cdhit.consensus.fasta

    Scan the headers for the consensus clusters and count the number of reads.
    """
    from jcvi.formats.fasta import Fasta
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(count.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    f = Fasta(fastafile, lazy=True)
    sizes = []
    for desc, rec in f.iterdescriptions_ordered():
        if desc.startswith("singleton"):
            sizes.append(1)
            continue
        # Header looks like: consensus_for_cluster_0 with 63 sequences
        name, w, size, seqs = desc.split()
        assert w == "with"
        sizes.append(int(size))

    s = SummaryStats(sizes)
    # print() function replaces py2-only `print >> sys.stderr`
    print(s, file=sys.stderr)
    stem_leaf_plot(s.data, 0, 100, 20, title="Cluster size")
def digest(args):
    """
    %prog digest fastafile NspI,BfuCI

    Digest fasta sequences to map restriction site positions.
    """
    p = OptionParser(digest.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, enzymes = args
    wanted = enzymes.split(",")
    # Keep only the requested enzymes, in AllEnzymes order
    batch = [e for e in AllEnzymes if str(e) in wanted]
    fasta = Fasta(fastafile, lazy=True)
    fw = must_open(opts.outfile, "w")

    header = ["Contig", "Length"] + [str(e) for e in batch]
    print("\t".join(header), file=fw)
    for seqid, record in fasta.iteritems_ordered():
        row = [seqid, len(record)]
        for enzyme in batch:
            hits = enzyme.search(record.seq)
            # "na" when the enzyme never cuts this contig
            row.append("|".join(str(x) for x in hits) if hits else "na")
        print("\t".join(str(x) for x in row), file=fw)
def count(args):
    """
    %prog count cdhit.consensus.fasta

    Scan the headers for the consensus clusters and count the number of reads.
    """
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(count.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    f = Fasta(fastafile, lazy=True)
    sizes = []
    for desc, rec in f.iterdescriptions_ordered():
        if desc.startswith("singleton"):
            sizes.append(1)
            continue
        # Header looks like: consensus_for_cluster_0 with 63 sequences
        name, w, size, seqs = desc.split()
        assert w == "with"
        sizes.append(int(size))

    s = SummaryStats(sizes)
    # print() function replaces py2-only `print >> sys.stderr`
    print(s, file=sys.stderr)
    stem_leaf_plot(s.data, 0, 100, 20, title="Cluster size")
def fragment(args):
    """
    %prog fragment fastafile enzyme

    Cut the fastafile using the specified enzyme, and grab upstream and
    downstream nucleotide sequence along with the cut site. In this case, the
    sequences extracted are:

                |- PstI
    ============|===========
           (-------)

    Sometimes we need to limit the size of the restriction fragments, for
    example the GBS protocol does not allow fragments larger than 800bp.

           |-PstI        |- PstI              |- PstI
    ~~~====|=============|==========~~~~~~~===|============
           (---)     (---)

    In this case, the second fragment is longer than 800bp, therefore the two
    ends are NOT extracted, as in the first fragment.
    """
    p = OptionParser(fragment.__doc__)
    p.add_option(
        "--flank",
        default=150,
        type="int",
        help="Extract flanking bases of the cut sites",
    )
    p.add_option(
        "--full",
        default=False,
        action="store_true",
        help="The full extraction mode",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, enzyme = args
    flank = opts.flank
    assert flank > 0

    # Two extraction strategies: whole fragments vs. just the fragment ends
    full_mode = opts.full
    extract = extract_full if full_mode else extract_ends
    tag = "full" if full_mode else "ends"

    assert enzyme in set(str(x) for x in AllEnzymes)
    fragfastafile = fastafile.split(".")[0] + ".{0}.flank{1}.{2}.fasta".format(
        enzyme, flank, tag
    )
    enzyme = [x for x in AllEnzymes if str(x) == enzyme][0]

    fasta = Fasta(fastafile, lazy=True)
    fw = open(fragfastafile, "w")
    for seqid, record in fasta.iteritems_ordered():
        analysis = Analysis([enzyme], record.seq)
        cut_sites = analysis.full()[enzyme]
        extract(record, cut_sites, flank, fw)

    logging.debug("Fragments written to `{0}`.".format(fragfastafile))
def main(arg):
    # Collect all sequences; the first is assumed to contain the others as
    # substrings ("introns"), which are spliced out before translation.
    f = Fasta(arg)
    s = [str(x.seq) for k, x in f.iteritems_ordered()]
    m = s[0]
    for z in s[1:]:
        m = m.replace(z, "")
    # print() function replaces the py2-only print statement
    print(Seq(m).translate().strip("*"))
def main(arg):
    # Build an overlap graph: G[source] = (source, target, overlap_len),
    # then walk it from the unique start node to emit the superstring.
    f = Fasta(arg)
    G = {}
    iG = set()  # nodes with an incoming edge
    for a in f.keys():
        for b in f.keys():
            if a == b:
                continue
            ov = get_overlap(a, b, f)
            if not ov:
                continue
            # Unpack into fresh names: the original rebinding of `a`/`b`
            # here clobbered the loop variables mid-iteration.
            oa, ob, olen = ov
            G[oa] = (oa, ob, olen)
            iG.add(ob)

    # linearize graph: the start node is the only one with no incoming edge
    start = set(f.keys()) - iG
    assert len(start) == 1
    z = list(start)[0]
    seq = str(f[z].seq)
    while z in G:
        a, b, i = G[z]
        # Trim the overlap off the growing sequence, then append the next read
        seq = seq[:-i] + str(f[b].seq)
        z = b
    print(seq)
def main(arg):
    # Print every ordered pair of distinct sequences that overlap.
    f = Fasta(arg)
    for a in f.keys():
        for b in f.keys():
            if a == b:
                continue
            if check_overlap(a, b, f):
                # print() function replaces the py2-only print statement
                print(a, b)
def n50(args):
    """
    %prog n50 filename

    Given a file with a list of numbers denoting contig lengths, calculate N50.
    Input file can be both FASTA or a list of sizes.
    """
    from jcvi.graphics.histogram import loghistogram

    p = OptionParser(n50.__doc__)
    p.add_option(
        "--print0",
        default=False,
        action="store_true",
        help="Print size and L50 to stdout",
    )
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    ctgsizes = []

    # Guess file format: FASTA starts with '>'. Use a context manager so the
    # probe handle is closed (was leaked); [:1] avoids IndexError on an empty
    # file and still compares correctly for non-empty ones.
    with open(args[0]) as fp:
        probe = fp.readline()[:1]
    isFasta = probe == ">"
    if isFasta:
        for filename in args:
            f = Fasta(filename)
            ctgsizes += list(b for a, b in f.itersizes())
    else:
        for row in must_open(args):
            try:
                # float() first so entries like "1e5" or "123.0" are accepted
                ctgsize = int(float(row.split()[-1]))
            except ValueError:
                continue
            ctgsizes.append(ctgsize)

    a50, l50, nn50 = calculate_A50(ctgsizes)
    sumsize = sum(ctgsizes)
    minsize = min(ctgsizes)
    maxsize = max(ctgsizes)
    n = len(ctgsizes)
    print(", ".join(args), file=sys.stderr)

    summary = (sumsize, l50, nn50, minsize, maxsize, n)
    print(
        " ".join("{0}={1}".format(a, b) for a, b in zip(header, summary)),
        file=sys.stderr,
    )
    loghistogram(ctgsizes)

    if opts.print0:
        print("\t".join(str(x) for x in (",".join(args), sumsize, l50)))

    return zip(header, summary)
def main(filename):
    # Report the record with the highest GC percentage.
    f = Fasta(filename)
    gc_store = []
    for key, rec in f.iteritems():
        # Count both cases; express as a percentage of sequence length
        gc = sum(rec.seq.count(x) for x in 'GCgc') * 100. / len(rec.seq)
        gc_store.append((gc, key))
    # max() on (gc, key) tuples picks the highest GC value
    gc, key = max(gc_store)
    # print() function replaces the py2-only print statements
    print(key)
    print(gc)
def frombed(args):
    """
    %prog frombed bedfile contigfasta readfasta

    Convert read placement to contig format. This is useful before running BAMBUS.
    """
    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.bed import Bed
    from jcvi.utils.cbook import fill

    p = OptionParser(frombed.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    bedfile, contigfasta, readfasta = args
    prefix = bedfile.rsplit(".", 1)[0]
    contigfile = prefix + ".contig"
    idsfile = prefix + ".ids"

    contigfasta = Fasta(contigfasta)
    readfasta = Fasta(readfasta)

    bed = Bed(bedfile)
    checksum = "00000000 checksum."
    fw_ids = open(idsfile, "w")
    fw = open(contigfile, "w")

    for ctg, reads in bed.sub_beds():
        ctgseq = contigfasta[ctg]
        ctgline = "##{0} {1} {2} bases, {3}".format(
            ctg, len(reads), len(ctgseq), checksum)

        # print() function replaces the py2-only `print >>` statements
        print(ctg, file=fw_ids)
        print(ctgline, file=fw)
        print(fill(ctgseq.seq), file=fw)

        for b in reads:
            read = b.accn
            strand = b.strand
            readseq = readfasta[read]
            rc = " [RC]" if strand == "-" else ""
            readlen = len(readseq)
            rstart, rend = 1, readlen
            # Reverse-strand reads report their coordinates flipped
            if strand == "-":
                rstart, rend = rend, rstart
            readrange = "{{{0} {1}}}".format(rstart, rend)
            conrange = "<{0} {1}>".format(b.start, b.end)
            readline = "#{0}(0){1} {2} bases, {3} {4} {5}".format(
                read, rc, readlen, checksum, readrange, conrange)
            print(readline, file=fw)
            print(fill(readseq.seq), file=fw)

    logging.debug("Mapped contigs written to `{0}`.".format(contigfile))
    logging.debug("Contig IDs written to `{0}`.".format(idsfile))
def A50(args):
    """
    %prog A50 contigs_A.fasta contigs_B.fasta ...

    Plots A50 graphics, see blog post (http://blog.malde.org/index.php/a50/)
    """
    p = OptionParser(A50.__doc__)
    p.add_option("--overwrite", default=False, action="store_true",
                 help="overwrite .rplot file if exists [default: %default]")
    p.add_option("--cutoff", default=0, type="int", dest="cutoff",
                 help="use contigs above certain size [default: %default]")
    p.add_option("--stepsize", default=10, type="int", dest="stepsize",
                 help="stepsize for the distribution [default: %default]")
    opts, args = p.parse_args(args)

    if not args:
        sys.exit(p.print_help())

    import numpy as np
    from jcvi.utils.table import loadtable

    stepsize = opts.stepsize  # use stepsize to speed up drawing
    rplot = "A50.rplot"
    if not op.exists(rplot) or opts.overwrite:
        fw = open(rplot, "w")
        header = "\t".join(("index", "cumsize", "fasta"))
        statsheader = ("Fasta", "L50", "N50", "Min", "Max", "Average", "Sum",
                       "Counts")
        statsrows = []
        # print() function replaces py2-only `print >>` statements throughout
        print(header, file=fw)
        for fastafile in args:
            f = Fasta(fastafile, index=False)
            ctgsizes = [length for k, length in f.itersizes()]
            ctgsizes = np.array(ctgsizes)

            a50, l50, n50 = calculate_A50(ctgsizes, cutoff=opts.cutoff)
            cmin, cmax, cmean = min(ctgsizes), max(ctgsizes), np.mean(ctgsizes)
            csum, counts = np.sum(ctgsizes), len(ctgsizes)
            cmean = int(round(cmean))
            statsrows.append((fastafile, l50, n50, cmin, cmax, cmean, csum,
                              counts))

            logging.debug("`{0}` ctgsizes: {1}".format(fastafile, ctgsizes))

            tag = "{0} (L50={1})".format(
                op.basename(fastafile).rsplit(".", 1)[0], l50)
            logging.debug(tag)

            # range() replaces py2-only xrange(); subsample by stepsize
            for i, s in zip(range(0, len(a50), stepsize), a50[::stepsize]):
                print("\t".join((str(i), str(s / 1000000.), tag)), file=fw)

        fw.close()

        table = loadtable(statsheader, statsrows)
        print(table, file=sys.stderr)

    generate_plot(rplot)
def prepare(args):
    """
    %prog prepare pairsfile cdsfile [pepfile] -o paired.cds.fasta

    Pick sequences from cdsfile to form pairs, ready to be calculated. The
    pairsfile can be generated from formats.blast.cscore(). The first two
    columns contain the pair.
    """
    from jcvi.formats.fasta import Fasta

    p = OptionParser(prepare.__doc__)
    p.set_outfile()

    opts, args = p.parse_args(args)
    outfile = opts.outfile
    if len(args) == 2:
        pairsfile, cdsfile = args
        pepfile = None
    elif len(args) == 3:
        pairsfile, cdsfile, pepfile = args
    else:
        sys.exit(not p.print_help())

    cds = Fasta(cdsfile)
    fp = open(pairsfile)
    fw = must_open(outfile, "w")
    if pepfile:
        # Peptide output needs a concrete filename to derive ".pep" from
        assert outfile != "stdout", "Please specify outfile name."
        pep = Fasta(pepfile)
        fw2 = must_open(outfile + ".pep", "w")

    for row in fp:
        if row[0] == '#':
            continue
        a, b = row.split()[:2]
        if a == b:
            logging.debug("Self pairs found: {0} - {1}. Ignored".format(a, b))
            continue

        # Fall back to the first isoform when the exact ID is absent
        if a not in cds:
            a = find_first_isoform(a, cds)
            assert a, a
        if b not in cds:
            b = find_first_isoform(b, cds)
            assert b, b

        SeqIO.write((cds[a], cds[b]), fw, "fasta")
        if pepfile:
            SeqIO.write((pep[a], pep[b]), fw2, "fasta")

    fw.close()
    if pepfile:
        fw2.close()
def summary(args):
    """
    %prog summary gffile fastafile

    Print summary stats, including:
    - Gene/Exon/Intron
    - Number
    - Average size (bp)
    - Median size (bp)
    - Total length (Mb)
    - % of genome
    - % GC
    """
    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gff_file, ref = args
    s = Fasta(ref)
    g = make_index(gff_file)
    geneseqs, exonseqs, intronseqs = [], [], []  # Calc % GC
    for f in g.features_of_type("gene"):
        fid = f.id
        fseq = s.sequence({'chr': f.chrom, 'start': f.start, 'stop': f.stop})
        geneseqs.append(fseq)
        # Deduplicate exons shared between isoforms before measuring
        exons = set((c.chrom, c.start, c.stop) for c in g.children(fid, 2)
                    if c.featuretype == "exon")
        exons = list(exons)
        for chrom, start, stop in exons:
            fseq = s.sequence({'chr': chrom, 'start': start, 'stop': stop})
            exonseqs.append(fseq)
        # Introns are the gaps between the deduplicated exon intervals
        introns = range_interleave(exons)
        for chrom, start, stop in introns:
            fseq = s.sequence({'chr': chrom, 'start': start, 'stop': stop})
            intronseqs.append(fseq)

    r = {}  # Report
    for t, tseqs in zip(("Gene", "Exon", "Intron"),
                        (geneseqs, exonseqs, intronseqs)):
        tsizes = [len(x) for x in tseqs]
        tsummary = SummaryStats(tsizes, dtype="int")
        r[t, "Number"] = tsummary.size
        r[t, "Average size (bp)"] = tsummary.mean
        r[t, "Median size (bp)"] = tsummary.median
        r[t, "Total length (Mb)"] = human_size(tsummary.sum, precision=0,
                                               target="Mb")
        r[t, "% of genome"] = percentage(tsummary.sum, s.totalsize,
                                         precision=0, mode=-1)
        r[t, "% GC"] = gc(tseqs)

    # print() function replaces py2-only `print >> sys.stderr`, matching the
    # identical Python 3 version of this function elsewhere in the file
    print(tabulate(r), file=sys.stderr)
def minimap(args):
    """
    %prog minimap ref.fasta query.fasta

    Wrap minimap2 aligner using query against sequences. When query and ref
    is the same, we are in "self-scan" mode (e.g. useful for finding internal
    duplications resulted from mis-assemblies).
    """
    from jcvi.apps.grid import MakeManager
    from jcvi.formats.fasta import Fasta

    p = OptionParser(minimap.__doc__)
    p.add_option(
        "--chunks",
        type="int",
        default=2000000,
        help="Split ref.fasta into chunks of size in self-scan mode",
    )
    p.set_outdir(outdir="outdir")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, query = args
    chunks = opts.chunks
    outdir = opts.outdir
    if ref != query:
        raise NotImplementedError

    # "self-scan" mode
    # build faidx (otherwise, parallel make may complain)
    sh("samtools faidx {}".format(ref))

    f = Fasta(ref)
    mkdir(outdir)
    mm = MakeManager()
    for name, size in f.itersizes():
        # Walk the sequence in `chunks`-sized windows. The previous
        # range(chunks, size, chunks) loop dropped the final window (and any
        # partial tail), so the end of every sequence was never scanned.
        start = 0
        while start < size:
            end = min(start + chunks, size)
            fafile = op.join(outdir, "{}_{}_{}.fa".format(name, start + 1, end))
            cmd = "samtools faidx {} {}:{}-{} -o {}".format(
                ref, name, start + 1, end, fafile)
            mm.add(ref, fafile, cmd)

            paffile = fafile.rsplit(".", 1)[0] + ".paf"
            cmd = "minimap2 -P {} {} > {}".format(fafile, fafile, paffile)
            mm.add(fafile, paffile, cmd)

            epsfile = fafile.rsplit(".", 1)[0] + ".eps"
            cmd = "minidot {} > {}".format(paffile, epsfile)
            mm.add(paffile, epsfile, cmd)
            start = end

    mm.write()
def main(arg):
    # Report every reverse palindrome of length 4..12 in the first record,
    # as (1-based position, length) pairs.
    f = Fasta(arg)
    # next() builtin replaces the py2-only .next() method call
    key, rec = next(f.iteritems())
    s = rec.seq
    for i in range(len(s)):
        for l in range(4, 13):
            if i + l > len(s):
                continue
            ns = s[i:i + l]
            # Palindrome: the substring equals its own reverse complement
            if str(ns) == str(ns.reverse_complement()):
                print(i + 1, l)
def get_GC3(cdsfile):
    """Return a dict mapping each CDS name to its GC content at the third
    codon position (GC3)."""
    from jcvi.formats.fasta import Fasta

    fasta = Fasta(cdsfile, lazy=True)
    GC3 = {}
    for name, record in fasta.iteritems_ordered():
        # Every third base starting from index 2 is a codon wobble position
        wobble = record.seq[2::3].upper()
        gc = sum(1 for base in wobble if base in "GC")
        GC3[name] = gc * 1. / len(wobble)

    return GC3
def filter(args):
    """
    %prog filter *.consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(filter.__doc__)
    p.add_option("--minsize", default=2, type="int",
                 help="Minimum cluster size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastafiles = args
    minsize = opts.minsize
    totalreads = totalassembled = 0
    fw = must_open(opts.outfile, "w")
    for i, fastafile in enumerate(fastafiles):
        f = Fasta(fastafile, lazy=True)
        pf = "s{0:03d}".format(i)
        nreads = nsingletons = nclusters = 0
        for desc, rec in f.iterdescriptions_ordered():
            nclusters += 1
            if desc.startswith("singleton"):
                nsingletons += 1
                nreads += 1
                continue
            # Header looks like: consensus_for_cluster_0 with 63 sequences
            name, w, size, seqs = desc.split()
            assert w == "with"
            size = int(size)
            nreads += size
            if size < minsize:
                continue
            rec.description = rec.description.split(None, 1)[-1]
            rec.id = pf + "_" + rec.id
            SeqIO.write(rec, fw, "fasta")
        logging.debug("Scanned {0} clusters with {1} reads ..".format(
            nclusters, nreads))
        cclusters, creads = nclusters - nsingletons, nreads - nsingletons
        # Guard against all-singleton inputs: cclusters == 0 previously raised
        # ZeroDivisionError when computing the average
        avg = creads / cclusters if cclusters else 0
        logging.debug(
            "Saved {0} clusters (min={1}) with {2} reads (avg:{3}) [{4}]".
            format(cclusters, minsize, creads, avg, pf))
        totalreads += nreads
        totalassembled += nreads - nsingletons
    logging.debug("Total assembled: {0}".format(
        percentage(totalassembled, totalreads)))
def main(arg):
    # Count every overlapping 4-mer in the first record and print counts in
    # lexicographic order over ACGT.
    f = Fasta(arg)
    # next() builtin replaces the py2-only .next() method call
    key, rec = next(f.iteritems())
    s = rec.seq
    store = defaultdict(int)
    for i in range(len(s) - 3):
        kmer = s[i:i + 4]
        assert len(kmer) == 4
        store[kmer] += 1

    counts = [store.get("".join(x), 0) for x in product("ACGT", repeat=4)]
    print(" ".join(str(x) for x in counts))
def n50(args):
    """
    %prog n50 filename

    Given a file with a list of numbers denoting contig lengths, calculate N50.
    Input file can be both FASTA or a list of sizes.
    """
    from jcvi.graphics.histogram import loghistogram

    p = OptionParser(n50.__doc__)
    p.add_option(
        "--print0",
        default=False,
        action="store_true",
        help="Print size and L50 to stdout [default: %default]"
    )
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    ctgsizes = []

    # Guess file format: FASTA starts with '>'. Context manager closes the
    # probe handle (was leaked); [:1] avoids IndexError on an empty file.
    with open(args[0]) as fp:
        probe = fp.readline()[:1]
    isFasta = probe == ">"
    if isFasta:
        for filename in args:
            f = Fasta(filename)
            ctgsizes += list(b for a, b in f.itersizes())
    else:
        for row in must_open(args):
            try:
                # float() first, matching the sibling n50 implementation, so
                # entries like "1e5" or "123.0" are accepted
                ctgsize = int(float(row.split()[-1]))
            except ValueError:
                continue
            ctgsizes.append(ctgsize)

    a50, l50, nn50 = calculate_A50(ctgsizes)
    sumsize = sum(ctgsizes)
    minsize = min(ctgsizes)
    maxsize = max(ctgsizes)
    n = len(ctgsizes)
    # print() function replaces py2-only print statements
    print(", ".join(args), file=sys.stderr)

    summary = (sumsize, l50, nn50, minsize, maxsize, n)
    print(" ".join("{0}={1}".format(a, b)
                   for a, b in zip(header, summary)), file=sys.stderr)
    loghistogram(ctgsizes)

    if opts.print0:
        print("\t".join(str(x) for x in (",".join(args), sumsize, l50)))

    return zip(header, summary)
def fragment(args):
    """
    %prog fragment fastafile enzyme

    Cut the fastafile using the specified enzyme, and grab upstream and
    downstream nucleotide sequence along with the cut site. In this case, the
    sequences extracted are:

                |- PstI
    ============|===========
           (-------)

    Sometimes we need to limit the size of the restriction fragments, for
    example the GBS protocol does not allow fragments larger than 800bp.

           |-PstI        |- PstI              |- PstI
    ~~~====|=============|==========~~~~~~~===|============
           (---)     (---)

    In this case, the second fragment is longer than 800bp, therefore the two
    ends are NOT extracted, as in the first fragment.
    """
    p = OptionParser(fragment.__doc__)
    p.add_option("--flank", default=150, type="int",
                 help="Extract flanking bases of the cut sites [default: %default]")
    p.add_option("--full", default=False, action="store_true",
                 help="The full extraction mode [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, enzyme = args
    flank = opts.flank
    assert flank > 0

    # Choose between whole-fragment and fragment-ends extraction
    if opts.full:
        extract, tag = extract_full, "full"
    else:
        extract, tag = extract_ends, "ends"

    assert enzyme in set(str(x) for x in AllEnzymes)
    fragfastafile = fastafile.split(".")[0] + \
        ".{0}.flank{1}.{2}.fasta".format(enzyme, flank, tag)
    enzyme = [x for x in AllEnzymes if str(x) == enzyme][0]

    fasta = Fasta(fastafile, lazy=True)
    fw = open(fragfastafile, "w")
    for seqid, record in fasta.iteritems_ordered():
        analysis = Analysis([enzyme], record.seq)
        cut_sites = analysis.full()[enzyme]
        extract(record, cut_sites, flank, fw)

    logging.debug("Fragments written to `{0}`.".format(fragfastafile))
def merge(args):
    """
    %prog merge gffiles

    Merge several gff files into one. When only one file is given, it is assumed
    to be a file with a list of gff files.
    """
    p = OptionParser(merge.__doc__)
    set_outfile(p)
    opts, args = p.parse_args(args)

    nargs = len(args)
    if nargs < 1:
        sys.exit(not p.print_help())

    if nargs == 1:
        # Single argument is a list file, one gff path per line
        listfile, = args
        fp = open(listfile)
        gffiles = [x.strip() for x in fp]
    else:
        gffiles = args

    outfile = opts.outfile

    deflines = set()
    fw = must_open(outfile, "w")
    fastarecs = {}
    for gffile in gffiles:
        fp = open(gffile)
        for row in fp:
            row = row.rstrip()
            if row[0] == '#':
                if row == FastaTag:
                    break
                # Emit each comment/defline only once across all files
                if row in deflines:
                    continue
                else:
                    deflines.add(row)

            # print() function replaces py2-only `print >> fw`
            print(row, file=fw)

        f = Fasta(gffile, lazy=True)
        for key, rec in f.iteritems_ordered():
            # Direct membership test instead of `key in fastarecs.keys()`
            if key in fastarecs:
                continue
            fastarecs[key] = rec

    print(FastaTag, file=fw)
    SeqIO.write(fastarecs.values(), fw, "fasta")
def summary(args):
    """
    %prog summary gffile fastafile

    Print summary stats, including:
    - Gene/Exon/Intron
    - Number
    - Average size (bp)
    - Median size (bp)
    - Total length (Mb)
    - % of genome
    - % GC
    """
    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gff_file, ref = args
    # Reference genome (for sequence extraction) and GFF feature index
    s = Fasta(ref)
    g = make_index(gff_file)
    geneseqs, exonseqs, intronseqs = [], [], []  # Calc % GC
    for f in g.features_of_type("gene"):
        fid = f.id
        fseq = s.sequence({'chr': f.chrom, 'start': f.start, 'stop': f.stop})
        geneseqs.append(fseq)
        # Deduplicate exon intervals shared between isoforms (level-2 children)
        exons = set((c.chrom, c.start, c.stop) for c in g.children(fid, 2) \
                if c.featuretype == "exon")
        exons = list(exons)
        for chrom, start, stop in exons:
            fseq = s.sequence({'chr': chrom, 'start': start, 'stop': stop})
            exonseqs.append(fseq)
        # Introns are the gaps interleaved between the exon intervals
        introns = range_interleave(exons)
        for chrom, start, stop in introns:
            fseq = s.sequence({'chr': chrom, 'start': start, 'stop': stop})
            intronseqs.append(fseq)

    r = {}  # Report, keyed by (feature type, metric name)
    for t, tseqs in zip(("Gene", "Exon", "Intron"), (geneseqs, exonseqs, intronseqs)):
        tsizes = [len(x) for x in tseqs]
        tsummary = SummaryStats(tsizes, dtype="int")
        r[t, "Number"] = tsummary.size
        r[t, "Average size (bp)"] = tsummary.mean
        r[t, "Median size (bp)"] = tsummary.median
        r[t, "Total length (Mb)"] = human_size(tsummary.sum, precision=0, target="Mb")
        r[t, "% of genome"] = percentage(tsummary.sum, s.totalsize, precision=0, mode=-1)
        r[t, "% GC"] = gc(tseqs)

    print(tabulate(r), file=sys.stderr)
def filter(args):
    """
    %prog filter *.consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(filter.__doc__)
    p.add_option("--minsize", default=2, type="int",
                 help="Minimum cluster size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastafiles = args
    minsize = opts.minsize
    totalreads = totalassembled = 0
    fw = must_open(opts.outfile, "w")
    for i, fastafile in enumerate(fastafiles):
        f = Fasta(fastafile, lazy=True)
        pf = "s{0:03d}".format(i)
        nreads = nsingletons = nclusters = 0
        for desc, rec in f.iterdescriptions_ordered():
            nclusters += 1
            if desc.startswith("singleton"):
                nsingletons += 1
                nreads += 1
                continue
            # Header looks like: consensus_for_cluster_0 with 63 sequences
            name, w, size, seqs = desc.split()
            assert w == "with"
            size = int(size)
            nreads += size
            if size < minsize:
                continue
            rec.description = rec.description.split(None, 1)[-1]
            rec.id = pf + "_" + rec.id
            SeqIO.write(rec, fw, "fasta")
        logging.debug("Scanned {0} clusters with {1} reads ..".\
                        format(nclusters, nreads))
        cclusters, creads = nclusters - nsingletons, nreads - nsingletons
        # Guard against all-singleton inputs: cclusters == 0 previously raised
        # ZeroDivisionError when computing the average
        avg = creads / cclusters if cclusters else 0
        logging.debug("Saved {0} clusters (min={1}) with {2} reads (avg:{3}) [{4}]".\
                        format(cclusters, minsize, creads, avg, pf))
        totalreads += nreads
        totalassembled += nreads - nsingletons
    logging.debug("Total assembled: {0}".\
                    format(percentage(totalassembled, totalreads)))
def count(args):
    """
    %prog count fastafile jf.db

    Run dump - jellyfish - bin - bincount in serial.
    """
    from bitarray import bitarray

    p = OptionParser(count.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, jfdb = args
    K = get_K(jfdb)
    cmd = "jellyfish query {0} -C | cut -d' ' -f 2".format(jfdb)
    t = must_open("tmp", "w")
    # NOTE(review): print() below writes str to proc.stdin — this assumes the
    # Popen in scope yields a text-mode pipe; verify against its definition.
    proc = Popen(cmd, stdin=PIPE, stdout=t)
    t.flush()

    f = Fasta(fastafile, lazy=True)
    for name, rec in f.iteritems_ordered():
        kmers = list(make_kmers(rec.seq, K))
        print("\n".join(kmers), file=proc.stdin)
    proc.stdin.close()
    logging.debug(cmd)
    proc.wait()

    a = bitarray()
    binfile = ".".join((fastafile, jfdb, "bin"))
    # bitarray.tofile() writes raw bytes, so the file must be opened in
    # binary mode; text mode ("w") raises TypeError under Python 3
    fw = open(binfile, "wb")
    t.seek(0)
    for row in t:
        c = row.strip()
        a.append(int(c))
    a.tofile(fw)
    logging.debug("Serialize {0} bits to `{1}`.".format(len(a), binfile))
    fw.close()
    sh("rm {0}".format(t.name))

    logging.debug(
        "Shared K-mers (K={0}) between `{1}` and `{2}` written to `{3}`.".format(
            K, fastafile, jfdb, binfile
        )
    )
    cntfile = ".".join((fastafile, jfdb, "cnt"))
    bincount([fastafile, binfile, "-o", cntfile, "-K {0}".format(K)])
    logging.debug("Shared K-mer counts written to `{0}`.".format(cntfile))
def needle(args):
    """
    %prog needle nw.pairs a.pep.fasta b.pep.fasta

    Take protein pairs and needle them
    Automatically writes output file `nw.scores`
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(needle.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    manager = mp.Manager()
    results = manager.list()
    needle_pool = mp.Pool(processes=mp.cpu_count())

    pairsfile, apep, bpep = args
    afasta, bfasta = Fasta(apep), Fasta(bpep)
    fp = must_open(pairsfile)
    for i, row in enumerate(fp):
        a, b = row.split()
        a, b = afasta[a], bfasta[b]
        fa, fb = must_open("{0}_{1}_a.fasta".format(pairsfile, i), "w"), must_open(
            "{0}_{1}_b.fasta".format(pairsfile, i), "w")
        SeqIO.write([a], fa, "fasta")
        SeqIO.write([b], fb, "fasta")
        fa.close()
        fb.close()

        needlefile = "{0}_{1}_ab.needle".format(pairsfile, i)
        needle_pool.apply_async(
            _needle, (fa.name, fb.name, needlefile, a.id, b.id, results))

    needle_pool.close()
    needle_pool.join()

    fp.close()

    # rsplit with maxsplit=1 so a dotted basename like `nw.v2.pairs` keeps
    # its full prefix; rsplit(".") without maxsplit truncated it to the part
    # before the FIRST dot. This matches the convention used elsewhere here.
    scoresfile = "{0}.scores".format(pairsfile.rsplit(".", 1)[0])
    fw = must_open(scoresfile, "w")
    for result in results:
        print(result, file=fw)
    fw.close()
def main(arg):
    # Find every candidate ORF peptide (M ... up to the first stop) in all six
    # reading frames of the first record, deduplicated.
    f = Fasta(arg)
    # next() builtin replaces the py2-only .next() method call
    key, s = next(f.iteritems())
    s = s.seq
    res = set()
    for z in (s, s.reverse_complement()):
        for frame in range(3):
            p = z[frame:].translate()
            for i in range(len(p)):
                if p[i] != 'M':
                    continue
                for j in range(i + 1, len(p)):
                    if p[j] == '*':
                        # Peptide from start codon up to (excluding) the stop
                        res.add(p[i:j])
                        break
    print("\n".join(str(x) for x in res))
def prepare(args):
    """
    %prog prepare pairsfile cdsfile > paired.cds.fasta

    Pick sequences from cdsfile to form pairs, ready to be calculated. The
    pairsfile can be generated from formats.blast.cscore(). The first two
    columns contain the pair.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(prepare.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    pairsfile, cdsfile = args
    cds = Fasta(cdsfile)
    out = sys.stdout
    with open(pairsfile) as pairs:
        for line in pairs:
            # Only the first two columns name the pair
            first, second = line.split()[:2]
            SeqIO.write((cds[first], cds[second]), out, "fasta")
def freebayes(args):
    """
    %prog freebayes prefix ref.fa *.bam

    Call SNPs using freebayes.
    """
    p = OptionParser(freebayes.__doc__)
    p.add_option("--mindepth", default=3, type="int",
                 help="Minimum depth [default: %default]")
    p.add_option("--minqual", default=20, type="int",
                 help="Minimum quality [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    prefix, ref = args[0:2]
    bams = args[2:]
    # bamaddrg merges the BAMs with read groups; output pipes into freebayes
    cmd = "bamaddrg -R {0}"
    cmd += " " + " ".join("-b {0}".format(x) for x in bams)
    fmd = "freebayes --stdin -C {0} -f {1}".format(opts.mindepth, ref)
    seqids = list(Fasta(ref).iterkeys_ordered())
    # Emit one pipeline per reference sequence, each writing its own VCF.
    # print() function replaces the py2-only print statement.
    for s in seqids:
        outfile = prefix + ".{0}.vcf".format(s)
        print(cmd.format(s), "|", fmd + " -r {0} -v {1}".format(s, outfile))
def wgsim(args):
    """
    %prog wgsim fastafile

    Run dwgsim on fastafile.
    """
    p = OptionParser(wgsim.__doc__)
    p.add_option(
        "--erate",
        default=0.01,
        type="float",
        help="Base error rate of the read",
    )
    p.add_option(
        "--noerrors",
        default=False,
        action="store_true",
        help="Simulate reads with no errors",
    )
    p.add_option(
        "--genomesize",
        type="int",
        help="Genome size in Mb [default: estimate from data]",
    )
    add_sim_options(p)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile,) = args
    pf = op.basename(fastafile).split(".")[0]

    # Genome size: from the option (Mb) when given, else measured from data
    genomesize = opts.genomesize
    size = genomesize * 1000000 if genomesize else Fasta(fastafile).totalsize
    depth = opts.depth
    readlen = opts.readlen
    # Paired-end: each pair contributes 2 * readlen bases of coverage
    readnum = int(math.ceil(size * depth / (2 * readlen)))

    distance = opts.distance
    stdev = distance / 10

    outpf = opts.outfile or "{0}.{1}bp.{2}x".format(pf, distance, depth)

    logging.debug("Total genome size: {0} bp".format(size))
    logging.debug("Target depth: {0}x".format(depth))
    logging.debug("Number of read pairs (2x{0}): {1}".format(readlen, readnum))

    if opts.noerrors:
        opts.erate = 0

    cmd = "dwgsim -e {0} -E {0}".format(opts.erate)
    if opts.noerrors:
        # Also disable mutations, indels and random reads
        cmd += " -r 0 -R 0 -X 0 -y 0"
    cmd += " -d {0} -s {1}".format(distance, stdev)
    cmd += " -N {0} -1 {1} -2 {1}".format(readnum, readlen)
    cmd += " {0} {1}".format(fastafile, outpf)
    sh(cmd)
def build_all(self, componentfasta, targetfasta, newagp=None):
    """Assemble every object in this AGP from `componentfasta`, writing the
    built sequences to `targetfasta` (and optionally an updated AGP)."""
    components = Fasta(componentfasta, index=False)
    fw = open(targetfasta, "w")
    # AGP lines are grouped by object; each group builds one target sequence
    for object_id, group in groupby(self, key=lambda x: x.object):
        self.build_one(object_id, list(group), components, fw, newagp=newagp)
def count(args):
    """
    %prog count cdhit.consensus.fasta

    Scan the headers for the consensus clusters and count the number of reads.
    """
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(count.__doc__)
    p.add_option("--csv", help="Write depth per contig to file")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile, ) = args
    csv = open(opts.csv, "w") if opts.csv else None

    fasta = Fasta(fastafile, lazy=True)
    cluster_sizes = []
    for header, record in fasta.iterdescriptions_ordered():
        if header.startswith("singleton"):
            cluster_sizes.append(1)
            continue

        if "with" in header:
            # cd-hit style: "consensus_for_cluster_0 with 63 sequences"
            cid, keyword, nseqs, tail = header.split()
            if csv:
                print("\t".join(str(x) for x in (cid, nseqs, len(record))),
                      file=csv)
            assert keyword == "with"
            cluster_sizes.append(int(nseqs))
        else:
            # usearch style: "MRD85:00603:02472;size=167;"
            cid, sizefield, tail = header.split(";")
            cluster_sizes.append(int(sizefield.replace("size=", "")))

    if csv:
        csv.close()
        logging.debug("File written to `%s`.", opts.csv)

    stats = SummaryStats(cluster_sizes)
    print(stats, file=sys.stderr)
    stem_leaf_plot(stats.data, 0, 100, 20, title="Cluster size")
def count(args):
    """
    %prog count fastafile jf.db

    Run dump - jellyfish - bin - bincount in serial.

    Streams every K-mer of `fastafile` through `jellyfish query`, serializes
    the presence/absence of each K-mer into a bit file, then runs bincount
    to produce per-sequence shared K-mer counts.
    """
    from bitarray import bitarray

    p = OptionParser(count.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, jfdb = args
    K = get_K(jfdb)
    # Query counts from the jellyfish db; keep only the count column.
    cmd = "jellyfish query {0} -C | cut -d' ' -f 2".format(jfdb)
    t = must_open("tmp", "w")
    proc = Popen(cmd, stdin=PIPE, stdout=t)
    t.flush()

    f = Fasta(fastafile, lazy=True)
    for name, rec in f.iteritems_ordered():
        kmers = list(make_kmers(rec.seq, K))
        # NOTE(review): assumes proc.stdin is a text-mode pipe (project Popen
        # wrapper); encode to bytes here if it turns out to be binary.
        print("\n".join(kmers), file=proc.stdin)
    proc.stdin.close()
    logging.debug(cmd)
    proc.wait()

    a = bitarray()
    binfile = ".".join((fastafile, jfdb, "bin"))
    # bitarray.tofile() writes raw bytes, so the file must be binary mode.
    fw = open(binfile, "wb")
    t.seek(0)
    for row in t:
        c = row.strip()
        a.append(int(c))
    a.tofile(fw)
    logging.debug("Serialize {0} bits to `{1}`.".format(len(a), binfile))
    fw.close()
    sh("rm {0}".format(t.name))

    logging.debug(
        "Shared K-mers (K={0}) between `{1}` and `{2}` written to `{3}`.".format(
            K, fastafile, jfdb, binfile
        )
    )
    cntfile = ".".join((fastafile, jfdb, "cnt"))
    bincount([fastafile, binfile, "-o", cntfile, "-K {0}".format(K)])
    logging.debug("Shared K-mer counts written to `{0}`.".format(cntfile))
def count(args):
    """
    %prog count cdhit.consensus.fasta

    Scan the headers for the consensus clusters and count the number of reads.
    """
    from jcvi.graphics.histogram import stem_leaf_plot
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(count.__doc__)
    p.add_option("--csv", help="Write depth per contig to file")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    csvfw = open(opts.csv, "w") if opts.csv else None

    cluster_sizes = []
    fasta = Fasta(fastafile, lazy=True)
    for header, rec in fasta.iterdescriptions_ordered():
        if header.startswith("singleton"):
            cluster_sizes.append(1)
            continue

        if "with" in header:
            # Header form: consensus_for_cluster_0 with 63 sequences
            name, w, nreads, seqs = header.split()
            if csvfw:
                print("\t".join(str(x) for x in (name, nreads, len(rec))), file=csvfw)
            assert w == "with"
            cluster_sizes.append(int(nreads))
        else:
            # Header form: MRD85:00603:02472;size=167;
            name, nreads, tail = header.split(";")
            cluster_sizes.append(int(nreads.replace("size=", "")))

    if csvfw:
        csvfw.close()
        logging.debug("File written to `{0}`".format(opts.csv))

    stats = SummaryStats(cluster_sizes)
    print(stats, file=sys.stderr)
    stem_leaf_plot(stats.data, 0, 100, 20, title="Cluster size")
def __init__(self, filename, select=None): assert op.exists(filename), "File `{0}` not found".format(filename) # filename can be both .sizes file or FASTA formatted file sizesname = filename if not filename.endswith(".sizes"): sizesname = filename + ".sizes" filename = get_abs_path(filename) if need_update(filename, sizesname): cmd = "faSize" if which(cmd): cmd += " -detailed {0}".format(filename) sh(cmd, outfile=sizesname) else: from jcvi.formats.fasta import Fasta f = Fasta(filename) fw = open(sizesname, "w") for k, size in f.itersizes_ordered(): print("\t".join((k, str(size))), file=fw) fw.close() filename = sizesname assert filename.endswith(".sizes") super(Sizes, self).__init__(filename) self.fp = open(filename) self.filename = filename # get sizes for individual contigs, both in list and dict # this is to preserve the input order in the sizes file sizes = list(self.iter_sizes()) if select: assert select > 0 sizes = [x for x in sizes if x[1] >= select] self.sizes_mapping = dict(sizes) # get cumulative sizes, both in list and dict ctgs, sizes = zip(*sizes) self.sizes = sizes cumsizes = np.cumsum([0] + list(sizes)) self.ctgs = ctgs self.cumsizes = cumsizes self.cumsizes_mapping = dict(zip(ctgs, cumsizes))
def __init__(self, filename, select=None):
    """Load per-contig sizes, computing a `.sizes` cache from FASTA if needed.

    filename: either a precomputed `.sizes` file (tab-separated: name, size)
        or a FASTA file, for which a sibling `.sizes` file is generated.
    select: if given, keep only contigs whose size is >= select.
    """
    assert op.exists(filename), "File `{0}` not found".format(filename)

    # filename can be both .sizes file or FASTA formatted file
    sizesname = filename

    if not filename.endswith(".sizes"):
        sizesname = filename + ".sizes"
        filename = get_abs_path(filename)
        if need_update(filename, sizesname):
            cmd = "faSize"
            if which(cmd):
                # Prefer the UCSC faSize binary when available.
                cmd += " -detailed {0}".format(filename)
                sh(cmd, outfile=sizesname)
            else:
                # Fall back to a pure-Python pass over the FASTA.
                from jcvi.formats.fasta import Fasta

                f = Fasta(filename)
                fw = open(sizesname, "w")
                for k, size in f.itersizes_ordered():
                    # py3 print function, consistent with the rest of the file
                    print("\t".join((k, str(size))), file=fw)
                fw.close()
        filename = sizesname

    assert filename.endswith(".sizes")

    super(Sizes, self).__init__(filename)
    self.fp = open(filename)
    self.filename = filename

    # get sizes for individual contigs, both in list and dict
    # this is to preserve the input order in the sizes file
    sizes = list(self.iter_sizes())
    if select:
        assert select > 0
        sizes = [x for x in sizes if x[1] >= select]
    self.sizes_mapping = dict(sizes)

    # get cumulative sizes, both in list and dict
    ctgs, sizes = zip(*sizes)
    self.sizes = sizes
    # cumsizes[i] is the total length of all contigs preceding ctgs[i]
    cumsizes = np.cumsum([0] + list(sizes))
    self.ctgs = ctgs
    self.cumsizes = cumsizes
    self.cumsizes_mapping = dict(zip(ctgs, cumsizes))
def n50(args):
    """
    %prog n50 filename

    Given a file with a list of numbers denoting contig lengths, calculate N50.
    Input file can be both FASTA or a list of sizes.
    """
    p = OptionParser(n50.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    ctgsizes = []

    # Guess file format by peeking at the first byte: FASTA headers start
    # with '>'.  Use a context manager so the probe handle is not leaked.
    with open(args[0]) as fp:
        probe = fp.readline()[0]
    isFasta = probe == ">"
    if isFasta:
        for filename in args:
            f = Fasta(filename)
            ctgsizes += list(b for a, b in f.itersizes())
    else:
        for row in must_open(args):
            try:
                ctgsize = int(row.split()[-1])
            except ValueError:
                # Skip header or malformed rows.
                continue
            ctgsizes.append(ctgsize)

    a50, l50, nn50 = calculate_A50(ctgsizes)
    sumsize = sum(ctgsizes)
    minsize = min(ctgsizes)
    maxsize = max(ctgsizes)
    n = len(ctgsizes)
    print(", ".join(args), file=sys.stderr)

    summary = (sumsize, l50, nn50, minsize, maxsize, n)
    print(
        " ".join("{0}={1}".format(a, b) for a, b in zip(header, summary)),
        file=sys.stderr,
    )
    loghistogram(ctgsizes, summary=False)

    return zip(header, summary)
def main(arg):
    """Report pairs of near-duplicate sequences in FASTA file `arg`.

    For every sequence, print `s->t` for each other sequence t (seen at
    least twice, counting both strands) within hamming distance 1 of s.
    """
    # First pass: tally every sequence and its reverse complement.
    f = Fasta(arg)
    store = defaultdict(int)
    for key, rec in f.iteritems():
        store[str(rec.seq)] += 1
        store[str(rec.seq.reverse_complement())] += 1

    # Second pass: report close matches against the tally.
    f = Fasta(arg)
    for key, rec in f.iteritems():
        s = str(rec.seq)
        for t, v in store.items():
            if v < 2:
                continue
            if s == t:
                continue
            if hamming(s, t) > 1:
                continue
            # py3 print function, consistent with the rest of the file
            print("{0}->{1}".format(s, t))
def main(arg): f = Fasta(arg) k, s = f.iteritems().next() s = str(s.seq) P = [0] * len(s) i = 1 j = 0 while i < len(s): if s[i] == s[j]: P[i] = j + 1 i += 1 j += 1 elif j != 0: j = P[j - 1] else: P[i] = 0 i += 1 print " ".join(str(x) for x in P)
def main(blast_file, cds_file, bed_file, N=3):
    """Group genes into proximal-paralogue families.

    Two genes join a family when their BLAST hit covers at least half of
    the shorter CDS and they sit within N positions on the same seqid.
    Prints one comma-joined family per line, plus a summary to stderr.
    """
    # get the sizes for the CDS first
    f = Fasta(cds_file)
    sizes = dict(f.itersizes())

    # retrieve the locations
    bed = Bed(bed_file).order

    # filter the blast file
    g = Grouper()
    fp = open(blast_file)
    for row in fp:
        b = BlastLine(row)
        query_len = sizes[b.query]
        subject_len = sizes[b.subject]
        # Require the HSP to cover at least half of the shorter sequence.
        if b.hitlen < min(query_len, subject_len) / 2:
            continue

        query, subject = gene_name(b.query), gene_name(b.subject)
        qi, q = bed[query]
        si, s = bed[subject]
        # Same chromosome and within N gene positions => proximal pair.
        if q.seqid == s.seqid and abs(qi - si) <= N:
            g.join(query, subject)

    # dump the grouper
    ngenes, nfamilies = 0, 0
    families = []
    for group in sorted(g):
        if len(group) >= 2:
            print(",".join(sorted(group)))
            ngenes += len(group)
            nfamilies += 1
            families.append(sorted(group))

    longest_family = max(families, key=lambda x: len(x))

    # generate reports
    print("Proximal paralogues (dist=%d):" % N, file=sys.stderr)
    print("Total %d genes in %d families" % (ngenes, nfamilies), file=sys.stderr)
    print(
        "Longest families (%d): %s" % (len(longest_family), ",".join(longest_family)),
        file=sys.stderr,
    )
def main(blast_file, cds_file, bed_file, N=3):
    """Group genes into proximal-paralogue families.

    Two genes join a family when their BLAST hit covers at least half of
    the shorter CDS and they sit within N positions on the same seqid.
    Prints one comma-joined family per line, plus a summary to stderr.
    """
    # get the sizes for the CDS first
    f = Fasta(cds_file)
    sizes = dict(f.itersizes())

    # retrieve the locations
    bed = Bed(bed_file).order

    # filter the blast file
    g = Grouper()
    fp = open(blast_file)
    for row in fp:
        b = BlastLine(row)
        query_len = sizes[b.query]
        subject_len = sizes[b.subject]
        # Require the HSP to cover at least half of the shorter sequence.
        if b.hitlen < min(query_len, subject_len) / 2:
            continue

        query, subject = gene_name(b.query), gene_name(b.subject)
        qi, q = bed[query]
        si, s = bed[subject]
        # Same chromosome and within N gene positions => proximal pair.
        if q.seqid == s.seqid and abs(qi - si) <= N:
            g.join(query, subject)

    # dump the grouper
    ngenes, nfamilies = 0, 0
    families = []
    for group in sorted(g):
        if len(group) >= 2:
            print(",".join(sorted(group)))
            ngenes += len(group)
            nfamilies += 1
            families.append(sorted(group))

    longest_family = max(families, key=lambda x: len(x))

    # generate reports
    print("Proximal paralogues (dist=%d):" % N, file=sys.stderr)
    print("Total %d genes in %d families" % (ngenes, nfamilies), file=sys.stderr)
    print(
        "Longest families (%d): %s" % (len(longest_family), ",".join(longest_family)),
        file=sys.stderr,
    )
def summary(args):
    """
    %prog summary fastafile

    Report the number of bases and sequences masked.
    """
    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile, ) = args
    f = Fasta(fastafile, index=False)

    cutoff = 50  # percent masked above which a sequence counts as half-masked
    halfmaskedseqs = set()
    allmasked = allbases = 0
    for key, seq in f.iteritems():
        seqlen = len(seq)
        # Anything outside the plain uppercase alphabet counts as masked.
        masked = sum(1 for base in seq if base not in "AGCT")
        if masked * 100.0 / seqlen > cutoff:
            halfmaskedseqs.add(key)
        allmasked += masked
        allbases += seqlen

    seqnum, maskedseqnum = len(f), len(halfmaskedseqs)
    print(
        "Total masked bases: {0}".format(percentage(allmasked, allbases)),
        file=sys.stderr,
    )
    print(
        "Total masked sequences (contain > {0}% masked): {1}".format(
            cutoff, percentage(maskedseqnum, seqnum)),
        file=sys.stderr,
    )
def needle(args):
    """
    %prog needle pairs a.pep.fasta b.pep.fasta

    Take protein pairs and needle them.
    """
    from Bio.Emboss.Applications import NeedleCommandline
    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.base import FileShredder

    p = OptionParser(needle.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    pairsfile, apep, bpep = args
    afasta = Fasta(apep)
    bfasta = Fasta(bpep)

    fp = open(pairsfile)
    for row in fp:
        # Write the two members of this pair to scratch FASTA files.
        fa = open(pairsfile + "_a.fasta", "w")
        fb = open(pairsfile + "_b.fasta", "w")
        a, b = row.split()
        a = afasta[a]
        b = bfasta[b]
        SeqIO.write([a], fa, "fasta")
        SeqIO.write([b], fb, "fasta")
        fa.close()
        fb.close()

        needlefile = pairsfile + "_ab.needle"
        needle_cline = NeedleCommandline(
            asequence=fa.name,
            bsequence=fb.name,
            gapopen=10,
            gapextend=0.5,
            outfile=needlefile,
        )
        stdout, stderr = needle_cline()
        # py3 print function, consistent with the rest of the file
        print(stdout + stderr, file=sys.stderr)

        # align = AlignIO.read(needlefile, "emboss")
        nh = NeedleHeader(needlefile)
        print("\t".join((a.id, b.id, nh.identity, nh.score)))
        # Remove the scratch files before the next pair.
        FileShredder([fa.name, fb.name, needlefile])
def wgsim(args):
    """
    %prog wgsim fastafile

    Run dwgsim on fastafile.
    """
    p = OptionParser(wgsim.__doc__)
    p.add_option("--erate", default=.02, type="float",
                 help="Base error rate of the read [default: %default]")
    p.add_option("--distance", default=500, type="int",
                 help="Outer distance between the two ends [default: %default]")
    p.add_option("--genomesize", type="int",
                 help="Genome size in Mb [default: estimate from data]")
    p.add_option("--readlen", default=100, type="int",
                 help="Length of the read [default: %default]")
    p.add_option("--noerrors", default=False, action="store_true",
                 help="Simulate reads with no errors [default: %default]")
    p.set_depth(depth=10)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    pf = fastafile.split(".")[0]

    genomesize = opts.genomesize
    size = genomesize * 1000000 if genomesize else Fasta(fastafile).totalsize
    depth = opts.depth
    readlen = opts.readlen
    # dwgsim -N requires an integer; under py3 true division yields a float,
    # so round up explicitly (ceil guarantees the target depth is reached).
    readnum = int(math.ceil(size * depth / (2 * readlen)))
    distance = opts.distance
    stdev = distance / 5
    outpf = "{0}.{1}bp.{2}x".format(pf, distance, depth)
    distance -= 2 * readlen  # Outer distance => Inner distance
    assert distance >= 0, "Outer distance must be >= 2 * readlen"

    logging.debug("Total genome size: {0} bp".format(size))
    logging.debug("Target depth: {0}x".format(depth))
    logging.debug("Number of read pairs (2x{0}): {1}".format(readlen, readnum))

    if opts.noerrors:
        opts.erate = 0

    cmd = "dwgsim -e {0} -E {0}".format(opts.erate)
    if opts.noerrors:
        cmd += " -r 0 -R 0 -X 0 -y 0"
    cmd += " -d {0} -s {1}".format(distance, stdev)
    cmd += " -N {0} -1 {1} -2 {1}".format(readnum, readlen)
    cmd += " {0} {1}".format(fastafile, outpf)
    sh(cmd)
def clr(args):
    """
    %prog blastfile fastafiles

    Calculate the vector clear range file based BLAST to the vectors.
    """
    p = OptionParser(clr.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    blastfile = args[0]
    fastafiles = args[1:]

    sizes = {}
    for fa in fastafiles:
        f = Fasta(fa)
        sizes.update(f.itersizes())

    b = Blast(blastfile)
    for query, hits in b.iter_hits():
        qsize = sizes[query]
        vectors = list((x.qstart, x.qstop) for x in hits)
        vmin, vmax = range_minmax(vectors)

        # Keep the larger vector-free side of the read as the clear range.
        left_size = vmin - 1
        right_size = qsize - vmax

        if left_size > right_size:
            clr_start, clr_end = 0, vmin
        else:
            clr_start, clr_end = vmax, qsize

        # py3 print function, consistent with the rest of the file
        print("\t".join(str(x) for x in (query, clr_start, clr_end)))
        del sizes[query]

    # Reads with no vector hits at all are clear over their full length.
    for q, size in sorted(sizes.items()):
        print("\t".join(str(x) for x in (q, 0, size)))
def overlapbatch(args):
    """
    %prog overlapbatch ctgfasta poolfasta

    Fish out the sequences in `poolfasta` that overlap with `ctgfasta`.
    Mix and combine using `minimus2`.
    """
    # Bug fix: was OptionParser(overlap.__doc__), which showed the wrong
    # usage message for this command.
    p = OptionParser(overlapbatch.__doc__)
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, poolfasta = args
    f = Fasta(ctgfasta)
    for k, rec in f.iteritems_ordered():
        # Write each contig to its own FASTA, then overlap it with the pool.
        fastafile = k + ".fasta"
        fw = open(fastafile, "w")
        SeqIO.write([rec], fw, "fasta")
        fw.close()

        overlap([fastafile, poolfasta])
def summary(args):
    """
    %prog summary fastafile

    Report the number of bases and sequences masked.
    """
    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    f = Fasta(fastafile, index=False)

    halfmaskedseqs = set()
    allmasked = 0
    allbases = 0
    cutoff = 50  # percent masked above which a sequence counts as half-masked
    for key, seq in f.iteritems():
        # Anything outside the plain uppercase alphabet counts as masked.
        masked = 0
        for base in seq:
            if base not in "AGCT":
                masked += 1
        seqlen = len(seq)
        if masked * 100. / seqlen > cutoff:
            halfmaskedseqs.add(key)
        allmasked += masked
        allbases += seqlen

    seqnum = len(f)
    maskedseqnum = len(halfmaskedseqs)

    # py3 print function, consistent with the rest of the file
    print(
        "Total masked bases: {0}".format(percentage(allmasked, allbases)),
        file=sys.stderr,
    )
    print(
        "Total masked sequences (contain > {0}% masked): {1}".format(
            cutoff, percentage(maskedseqnum, seqnum)
        ),
        file=sys.stderr,
    )