def flip(args):
    """
    %prog flip fastafile

    Go through each FASTA record, check against Genbank file and determines
    whether or not to flip the sequence. This is useful before updates of the
    sequences to make sure the same orientation is used.
    """
    p = OptionParser(flip.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    outfastafile = fastafile.rsplit(".", 1)[0] + ".flipped.fasta"
    fo = open(outfastafile, "w")
    f = Fasta(fastafile, lazy=True)
    for name, rec in f.iteritems_ordered():
        # NOTE(review): fixed scratch filename means concurrent runs in the
        # same directory would clobber each other -- confirm single-process use.
        tmpfasta = "a.fasta"
        fw = open(tmpfasta, "w")
        SeqIO.write([rec], fw, "fasta")
        fw.close()

        # overlap() reports the orientation of this record against Genbank;
        # reverse-complement when the hit is on the minus strand.
        o = overlap([tmpfasta, name])
        if o.orientation == '-':
            rec.seq = rec.seq.reverse_complement()
        SeqIO.write([rec], fo, "fasta")
        os.remove(tmpfasta)

    # Fix: close the output handle so the final records are flushed to disk.
    fo.close()
def main(arg):
    """
    Load a FASTA file, delete every occurrence of the subsequent sequences
    from the first record's sequence, then print the translation of the
    remainder with any stop-codon asterisks stripped from the ends.
    """
    f = Fasta(arg)
    s = [str(x.seq) for k, x in f.iteritems_ordered()]
    m = s[0]
    for z in s[1:]:
        m = m.replace(z, "")
    # Fix: use the print() function (file mixes py2/py3 print styles), and
    # coerce the translated Seq to str before strip() -- modern Biopython Seq
    # objects do not implement strip().
    print(str(Seq(m).translate()).strip("*"))
def dump(args):
    """
    %prog dump fastafile

    Convert FASTA sequences to list of K-mers.
    """
    p = OptionParser(dump.__doc__)
    p.add_option("-K", default=23, type="int",
                 help="K-mer size [default: %default]")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    K = opts.K
    fw = must_open(opts.outfile, "w")
    f = Fasta(fastafile, lazy=True)
    for name, rec in f.iteritems_ordered():
        kmers = list(make_kmers(rec.seq, K))
        # Fix: print() function form, consistent with the other commands in
        # this file (the `print >> fw` statement is Python-2-only).
        print("\n".join(kmers), file=fw)
    fw.close()
def fragment(args):
    """
    %prog fragment fastafile enzyme

    Cut the fastafile using the specified enzyme, and grab upstream and
    downstream nucleotide sequence along with the cut site. In this case, the
    sequences extracted are:

                |- PstI
    ============|===========
           (-------)

    Sometimes we need to limit the size of the restriction fragments, for
    example the GBS protocol does not allow fragments larger than 800bp.

           |-PstI        |- PstI              |- PstI
    ~~~====|=============|==========~~~~~~~===|============
           (---)     (---)

    In this case, the second fragment is longer than 800bp, therefore the two
    ends are NOT extracted, as in the first fragment.
    """
    p = OptionParser(fragment.__doc__)
    p.add_option(
        "--flank",
        default=150,
        type="int",
        help="Extract flanking bases of the cut sites",
    )
    p.add_option(
        "--full",
        default=False,
        action="store_true",
        help="The full extraction mode",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, enzyme = args
    flank = opts.flank
    assert flank > 0
    extract = extract_full if opts.full else extract_ends
    tag = "full" if opts.full else "ends"

    assert enzyme in set(str(x) for x in AllEnzymes)
    fragfastafile = fastafile.split(".")[0] + ".{0}.flank{1}.{2}.fasta".format(
        enzyme, flank, tag
    )
    enzyme = [x for x in AllEnzymes if str(x) == enzyme][0]

    f = Fasta(fastafile, lazy=True)
    fw = open(fragfastafile, "w")
    for name, rec in f.iteritems_ordered():
        # Biopython Restriction analysis locates all cut sites for this enzyme.
        a = Analysis([enzyme], rec.seq)
        sites = a.full()[enzyme]
        extract(rec, sites, flank, fw)

    # Fix: close the output handle; previously it was left open, risking
    # unflushed (truncated) output.
    fw.close()
    logging.debug("Fragments written to `{0}`.".format(fragfastafile))
def digest(args):
    """
    %prog digest fastafile NspI,BfuCI

    Digest fasta sequences to map restriction site positions.
    """
    p = OptionParser(digest.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, enzymes = args
    # Resolve the requested enzyme names against Biopython's catalog.
    wanted = enzymes.split(",")
    chosen = [e for e in AllEnzymes if str(e) in wanted]

    fasta = Fasta(fastafile, lazy=True)
    out = must_open(opts.outfile, "w")

    header = ["Contig", "Length"] + [str(e) for e in chosen]
    print("\t".join(header), file=out)

    # One row per contig: length, then "|"-joined cut positions per enzyme
    # ("na" when the enzyme does not cut this sequence).
    for seqid, rec in fasta.iteritems_ordered():
        fields = [seqid, len(rec)]
        for enz in chosen:
            hits = enz.search(rec.seq)
            fields.append("|".join(str(h) for h in hits) if hits else "na")
        print("\t".join(str(x) for x in fields), file=out)
def get_GC3(cdsfile):
    """
    Compute the GC content at third codon positions (GC3) for every record
    in `cdsfile`.

    Returns a dict mapping record name -> fraction of G/C bases among the
    bases at 0-based positions 2, 5, 8, ... of the sequence.
    """
    from jcvi.formats.fasta import Fasta

    f = Fasta(cdsfile, lazy=True)
    GC3 = {}
    for name, rec in f.iteritems_ordered():
        # Third codon positions, uppercased for case-insensitive counting.
        positions = rec.seq[2::3].upper()
        gc_counts = sum(1 for x in positions if x in "GC")
        # Fix: guard sequences shorter than 3 bases, which previously raised
        # ZeroDivisionError; report 0.0 GC3 for them.
        n = len(positions)
        GC3[name] = gc_counts * 1. / n if n else 0.
    return GC3
def fragment(args):
    """
    %prog fragment fastafile enzyme

    Cut the fastafile using the specified enzyme, and grab upstream and
    downstream nucleotide sequence along with the cut site. In this case, the
    sequences extracted are:

                |- PstI
    ============|===========
           (-------)

    Sometimes we need to limit the size of the restriction fragments, for
    example the GBS protocol does not allow fragments larger than 800bp.

           |-PstI        |- PstI              |- PstI
    ~~~====|=============|==========~~~~~~~===|============
           (---)     (---)

    In this case, the second fragment is longer than 800bp, therefore the two
    ends are NOT extracted, as in the first fragment.
    """
    p = OptionParser(fragment.__doc__)
    p.add_option("--flank", default=150, type="int",
                 help="Extract flanking bases of the cut sites [default: %default]")
    p.add_option("--full", default=False, action="store_true",
                 help="The full extraction mode [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, enzyme = args
    flank = opts.flank
    assert flank > 0
    extract = extract_full if opts.full else extract_ends
    tag = "full" if opts.full else "ends"

    assert enzyme in set(str(x) for x in AllEnzymes)
    fragfastafile = fastafile.split(".")[0] + \
        ".{0}.flank{1}.{2}.fasta".format(enzyme, flank, tag)
    enzyme = [x for x in AllEnzymes if str(x) == enzyme][0]

    f = Fasta(fastafile, lazy=True)
    fw = open(fragfastafile, "w")
    for name, rec in f.iteritems_ordered():
        # Biopython Restriction analysis locates all cut sites for this enzyme.
        a = Analysis([enzyme], rec.seq)
        sites = a.full()[enzyme]
        extract(rec, sites, flank, fw)

    # Fix: close the output handle; previously it was left open, risking
    # unflushed (truncated) output.
    fw.close()
    logging.debug("Fragments written to `{0}`.".format(fragfastafile))
def merge(args):
    """
    %prog merge gffiles

    Merge several gff files into one. When only one file is given, it is assumed
    to be a file with a list of gff files.
    """
    p = OptionParser(merge.__doc__)
    set_outfile(p)
    opts, args = p.parse_args(args)

    nargs = len(args)
    if nargs < 1:
        sys.exit(not p.print_help())

    if nargs == 1:
        listfile, = args
        fp = open(listfile)
        gffiles = [x.strip() for x in fp]
        # Fix: close the list file once read.
        fp.close()
    else:
        gffiles = args

    outfile = opts.outfile

    deflines = set()  # header/comment lines already emitted (de-duplicated)
    fw = must_open(outfile, "w")
    fastarecs = {}
    for gffile in gffiles:
        fp = open(gffile)
        for row in fp:
            row = row.rstrip()
            # Fix: startswith() does not IndexError on blank lines,
            # unlike the previous row[0] == '#' test.
            if row.startswith('#'):
                # Stop at the embedded FASTA section marker; sequences are
                # collected separately below.
                if row == FastaTag:
                    break
                if row in deflines:
                    continue
                deflines.add(row)

            # Fix: print() function form, consistent with the rest of the file.
            print(row, file=fw)
        fp.close()

        f = Fasta(gffile, lazy=True)
        for key, rec in f.iteritems_ordered():
            # Fix: test membership on the dict directly (no .keys() scan).
            if key in fastarecs:
                continue
            fastarecs[key] = rec

    print(FastaTag, file=fw)
    SeqIO.write(fastarecs.values(), fw, "fasta")
def count(args):
    """
    %prog count fastafile jf.db

    Run dump - jellyfish - bin - bincount in serial.
    """
    from bitarray import bitarray

    p = OptionParser(count.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, jfdb = args
    K = get_K(jfdb)
    cmd = "jellyfish query {0} -C | cut -d' ' -f 2".format(jfdb)
    t = must_open("tmp", "w")
    proc = Popen(cmd, stdin=PIPE, stdout=t)
    t.flush()

    f = Fasta(fastafile, lazy=True)
    for name, rec in f.iteritems_ordered():
        kmers = list(make_kmers(rec.seq, K))
        # NOTE(review): proc.stdin is a byte stream unless the Popen wrapper
        # enables text mode -- confirm text writes are accepted here.
        print("\n".join(kmers), file=proc.stdin)
    proc.stdin.close()
    logging.debug(cmd)
    proc.wait()

    a = bitarray()
    binfile = ".".join((fastafile, jfdb, "bin"))
    # Fix: bitarray.tofile() writes raw bytes, so the file must be opened in
    # binary mode; text mode ("w") raises TypeError on Python 3.
    fw = open(binfile, "wb")
    t.seek(0)
    # Each jellyfish count becomes one presence/absence bit.
    for row in t:
        c = row.strip()
        a.append(int(c))
    a.tofile(fw)
    logging.debug("Serialize {0} bits to `{1}`.".format(len(a), binfile))
    fw.close()
    sh("rm {0}".format(t.name))

    logging.debug(
        "Shared K-mers (K={0}) between `{1}` and `{2}` written to `{3}`.".format(
            K, fastafile, jfdb, binfile
        )
    )
    cntfile = ".".join((fastafile, jfdb, "cnt"))
    bincount([fastafile, binfile, "-o", cntfile, "-K {0}".format(K)])
    logging.debug("Shared K-mer counts written to `{0}`.".format(cntfile))
def count(args):
    """
    %prog count fastafile jf.db

    Run dump - jellyfish - bin - bincount in serial.
    """
    from bitarray import bitarray

    p = OptionParser(count.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, jfdb = args
    K = get_K(jfdb)
    cmd = "jellyfish query {0} -C | cut -d' ' -f 2".format(jfdb)
    t = must_open("tmp", "w")
    proc = Popen(cmd, stdin=PIPE, stdout=t)
    t.flush()

    f = Fasta(fastafile, lazy=True)
    for name, rec in f.iteritems_ordered():
        kmers = list(make_kmers(rec.seq, K))
        # Fix: print() function form (the `print >> proc.stdin` statement is
        # Python-2-only), consistent with the rest of the file.
        print("\n".join(kmers), file=proc.stdin)
    proc.stdin.close()
    logging.debug(cmd)
    proc.wait()

    a = bitarray()
    binfile = ".".join((fastafile, jfdb, "bin"))
    # Fix: bitarray.tofile() writes raw bytes, so the file must be opened in
    # binary mode; text mode ("w") raises TypeError on Python 3.
    fw = open(binfile, "wb")
    t.seek(0)
    # Each jellyfish count becomes one presence/absence bit.
    for row in t:
        c = row.strip()
        a.append(int(c))
    a.tofile(fw)
    logging.debug("Serialize {0} bits to `{1}`.".format(len(a), binfile))
    fw.close()
    sh("rm {0}".format(t.name))

    logging.debug("Shared K-mers (K={0}) between `{1}` and `{2}` written to `{3}`.".\
        format(K, fastafile, jfdb, binfile))
    cntfile = ".".join((fastafile, jfdb, "cnt"))
    bincount([fastafile, binfile, "-o", cntfile, "-K {0}".format(K)])
    logging.debug("Shared K-mer counts written to `{0}`.".format(cntfile))
def overlapbatch(args):
    """
    %prog overlapbatch ctgfasta poolfasta

    Fish out the sequences in `poolfasta` that overlap with `ctgfasta`.
    Mix and combine using `minimus2`.
    """
    # Fix: use this function's own docstring (was overlap.__doc__), so that
    # --help prints the correct usage line for `overlapbatch`.
    p = OptionParser(overlapbatch.__doc__)
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, poolfasta = args
    f = Fasta(ctgfasta)
    # Run overlap() once per contig, each written to its own FASTA file.
    for k, rec in f.iteritems_ordered():
        fastafile = k + ".fasta"
        fw = open(fastafile, "w")
        SeqIO.write([rec], fw, "fasta")
        fw.close()

        overlap([fastafile, poolfasta])
def shred(args):
    """
    %prog shred fastafile

    Similar to the method of `shredContig` in runCA script. The contigs are
    shredded into pseudo-reads with certain length and depth.
    """
    p = OptionParser(shred.__doc__)
    p.set_depth(depth=2)
    p.add_option("--readlen", default=1000, type="int",
                 help="Desired length of the reads [default: %default]")
    p.add_option("--minctglen", default=0, type="int",
                 help="Ignore contig sequence less than [default: %default]")
    p.add_option("--shift", default=50, type="int",
                 help="Overlap between reads must be at least [default: %default]")
    p.add_option(
        "--fasta",
        default=False,
        action="store_true",
        help="Output shredded reads as FASTA sequences [default: %default]",
    )
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    libID = fastafile.split(".")[0]
    depth = opts.depth
    readlen = opts.readlen
    shift = opts.shift

    outfile = libID + ".depth{0}".format(depth)
    if opts.fasta:
        outfile += ".fasta"
    else:
        outfile += ".frg"
    f = Fasta(fastafile, lazy=True)

    fw = must_open(outfile, "w", checkexists=True)
    if not opts.fasta:
        # Fix: print() function form (was Python-2-only `print >> fw`).
        print(headerTemplate.format(libID=libID), file=fw)

    """
    Taken from runCA:

                    |*********|
                    |###################|
    |--------------------------------------------------|
     ---------------1---------------
               ---------------2---------------
                         ---------------3---------------
    *** - center_increments
    ### - center_range_width
    """
    for ctgID, (name, rec) in enumerate(f.iteritems_ordered()):
        seq = rec.seq
        seqlen = len(seq)
        if seqlen < opts.minctglen:
            continue

        shredlen = min(seqlen - shift, readlen)
        # Fix: floor division -- under Python 3 the old `/` yields a float,
        # which breaks range(numreads) below.
        numreads = max(seqlen * depth // shredlen, 1)
        center_range_width = seqlen - shredlen

        ranges = []
        if depth == 1:
            if seqlen < readlen:
                ranges.append((0, seqlen))
            else:
                # Fix: range() replaces the Python-2-only xrange().
                for begin in range(0, seqlen, readlen - shift):
                    end = min(seqlen, begin + readlen)
                    ranges.append((begin, end))
        else:
            if numreads == 1:
                ranges.append((0, shredlen))
            else:
                prev_begin = -1
                # Evenly space read starts across the shreddable region.
                center_increments = center_range_width * 1.0 / (numreads - 1)
                for i in range(numreads):
                    begin = center_increments * i
                    end = begin + shredlen
                    begin, end = int(begin), int(end)

                    # Skip degenerate duplicates caused by rounding.
                    if begin == prev_begin:
                        continue

                    ranges.append((begin, end))
                    prev_begin = begin

        for shredID, (begin, end) in enumerate(ranges):
            shredded_seq = seq[begin:end]
            fragID = "{0}.{1}.frag{2}.{3}-{4}".format(libID, ctgID, shredID,
                                                      begin, end)
            emitFragment(fw, fragID, libID, shredded_seq, fasta=opts.fasta)

    fw.close()
    logging.debug("Shredded reads are written to `{0}`.".format(outfile))
    return outfile
def overlap(args):
    """
    %prog overlap ctgfasta poolfasta

    Fish out the sequences in `poolfasta` that overlap with `ctgfasta`.
    Mix and combine using `minimus2`.
    """
    p = OptionParser(overlap.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, poolfasta = args
    prefix = ctgfasta.split(".")[0]
    # Single-record input only; the record ID is reused to rename the
    # improved output sequences at the end.
    rid = list(Fasta(ctgfasta).iterkeys())
    assert len(rid) == 1, "Use overlapbatch() to improve multi-FASTA file"
    rid = rid[0]

    # Split the contig at assembly gaps before alignment.
    splitctgfasta = ctgfasta.rsplit(".", 1)[0] + ".split.fasta"
    ctgfasta = run_gapsplit(infile=ctgfasta, outfile=splitctgfasta)

    # Run BLAST
    blastfile = ctgfasta + ".blast"
    run_megablast(infile=ctgfasta, outfile=blastfile, db=poolfasta)

    # Extract contigs and merge using minimus2
    closuredir = prefix + ".closure"
    closure = False
    if need_update(blastfile, closuredir):
        mkdir(closuredir, overwrite=True)
        closure = True

    if closure:
        # Pull the IDs of pool sequences hit by BLAST (column 2, de-duped).
        idsfile = op.join(closuredir, prefix + ".ids")
        cmd = "cut -f2 {0} | sort -u".format(blastfile)
        sh(cmd, outfile=idsfile)

        idsfastafile = op.join(closuredir, prefix + ".ids.fasta")
        cmd = "faSomeRecords {0} {1} {2}".format(poolfasta, idsfile, idsfastafile)
        sh(cmd)

        # This step is a hack to weight the bases from original sequences more
        # than the pulled sequences, by literally adding another copy to be used
        # in consensus calls.
        redundantfastafile = op.join(closuredir, prefix + ".redundant.fasta")
        format([ctgfasta, redundantfastafile, "--prefix=RED."])

        mergedfastafile = op.join(closuredir, prefix + ".merged.fasta")
        cmd = "cat {0} {1} {2}".format(ctgfasta, redundantfastafile, idsfastafile)
        sh(cmd, outfile=mergedfastafile)

        afgfile = op.join(closuredir, prefix + ".afg")
        cmd = "toAmos -s {0} -o {1}".format(mergedfastafile, afgfile)
        sh(cmd)

        # minimus2 writes its outputs relative to the working directory, so
        # chdir into the closure dir for the duration of the run.
        cwd = os.getcwd()
        os.chdir(closuredir)
        cmd = "minimus2 {0} -D REFCOUNT=0".format(prefix)
        cmd += " -D OVERLAP=100 -D MINID=98"
        sh(cmd)
        os.chdir(cwd)

    # Analyze output, make sure that:
    # + Get the singletons of the original set back
    # + Drop any contig that is comprised entirely of pulled set
    originalIDs = set(Fasta(ctgfasta).iterkeys())
    minimuscontig = op.join(closuredir, prefix + ".contig")
    c = ContigFile(minimuscontig)
    excludecontigs = set()
    for rec in c.iter_records():
        reads = set(x.id for x in rec.reads)
        # A merged contig containing no original read came entirely from the
        # pool -- drop it.
        if reads.isdisjoint(originalIDs):
            excludecontigs.add(rec.id)

    logging.debug("Exclude contigs: {0}".\
            format(", ".join(sorted(excludecontigs))))

    # Trailing "_" marks the file as intermediate; stripped after renaming.
    finalfasta = prefix + ".improved.fasta_"
    fw = open(finalfasta, "w")
    minimusfasta = op.join(closuredir, prefix + ".fasta")
    f = Fasta(minimusfasta)
    for id, rec in f.iteritems_ordered():
        if id in excludecontigs:
            continue
        SeqIO.write([rec], fw, "fasta")

    # Recover original sequences that minimus2 left as singletons.
    singletonfile = op.join(closuredir, prefix + ".singletons")
    singletons = set(x.strip() for x in open(singletonfile))
    leftovers = singletons & originalIDs

    logging.debug("Pull leftover singletons: {0}".\
            format(", ".join(sorted(leftovers))))

    f = Fasta(ctgfasta)
    for id, rec in f.iteritems_ordered():
        if id not in leftovers:
            continue
        SeqIO.write([rec], fw, "fasta")

    fw.close()

    # Renumber the records sequentially, prefixed with the original record ID.
    fastafile = finalfasta
    finalfasta = fastafile.rstrip("_")
    format([
        fastafile, finalfasta, "--sequential", "--pad0=3",
        "--prefix={0}_".format(rid)
    ])

    logging.debug("Improved FASTA written to `{0}`.".format(finalfasta))

    # Report N50 before/after for comparison.
    n50([ctgfasta])
    n50([finalfasta])
    errlog = "error.log"
    # Clean up intermediates.
    for f in (fastafile, blastfile, errlog):
        if op.exists(f):
            os.remove(f)
def expand(args):
    """
    %prog expand bes.fasta reads.fastq

    Expand sequences using short reads. Useful, for example for getting BAC-end
    sequences. The template to use, in `bes.fasta` may just contain the junction
    sequences, then align the reads to get the 'flanks' for such sequences.
    """
    import math

    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.fastq import readlen, first, fasta
    from jcvi.formats.blast import Blast
    from jcvi.formats.base import FileShredder
    from jcvi.apps.bowtie import align, get_samfile
    from jcvi.apps.align import blast

    p = OptionParser(expand.__doc__)
    p.set_depth(depth=200)
    p.set_firstN()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bes, reads = args
    size = Fasta(bes).totalsize
    rl = readlen([reads])
    # Enough reads to cover the templates plus one read-length of flank on
    # each side, at the requested depth; rounded up to the nearest thousand.
    expected_size = size + 2 * rl
    nreads = expected_size * opts.depth / rl
    nreads = int(math.ceil(nreads / 1000.)) * 1000

    # Attract reads
    samfile, logfile = align([bes, reads, "--reorder", "--mapped",
           "--firstN={0}".format(opts.firstN)])

    samfile, mapped, _ = get_samfile(reads, bes, bowtie=True, mapped=True)
    logging.debug("Extract first {0} reads from `{1}`.".format(nreads, mapped))

    pf = mapped.split(".")[0]
    pf = pf.split("-")[0]
    bespf = bes.split(".")[0]
    reads = pf + ".expand.fastq"
    first([str(nreads), mapped, "-o", reads])

    # Perform mini-assembly
    fastafile = reads.rsplit(".", 1)[0] + ".fasta"
    qualfile = ""
    if need_update(reads, fastafile):
        fastafile, qualfile = fasta([reads])

    # Newbler (454) assembly of the attracted reads.
    contigs = op.join(pf, "454LargeContigs.fna")
    if need_update(fastafile, contigs):
        cmd = "runAssembly -o {0} -cpu 8 {1}".format(pf, fastafile)
        sh(cmd)
    assert op.exists(contigs)

    # Annotate contigs
    blastfile = blast([bes, contigs])
    mapping = {}
    for query, b in Blast(blastfile).iter_best_hit():
        mapping[query] = b

    f = Fasta(contigs, lazy=True)
    annotatedfasta = ".".join((pf, bespf, "fasta"))
    fw = open(annotatedfasta, "w")
    keys = list(Fasta(bes).iterkeys_ordered())  # keep an ordered list
    recs = []
    for key, v in f.iteritems_ordered():
        vid = v.id
        # Keep only contigs that hit one of the templates.
        if vid not in mapping:
            continue
        b = mapping[vid]
        subject = b.subject
        # Orient the contig to match its template.
        rec = v.reverse_complement() if b.orientation == '-' else v
        rec.id = rid = "_".join((pf, vid, subject))
        rec.description = ""
        # Index by template order so output mirrors the bes.fasta order.
        recs.append((keys.index(subject), rid, rec))

    recs = [x[-1] for x in sorted(recs)]
    SeqIO.write(recs, fw, "fasta")
    fw.close()

    # Remove all intermediates.
    FileShredder([samfile, logfile, mapped, reads, fastafile, qualfile,
                  blastfile, pf])
    logging.debug("Annotated seqs (n={0}) written to `{1}`.".\
            format(len(recs), annotatedfasta))

    return annotatedfasta
def overlap(args):
    """
    %prog overlap ctgfasta poolfasta

    Fish out the sequences in `poolfasta` that overlap with `ctgfasta`.
    Mix and combine using `minimus2`.
    """
    p = OptionParser(overlap.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, poolfasta = args
    prefix = ctgfasta.split(".")[0]
    # Single-record input only; the record ID is reused to rename the
    # improved output sequences at the end.
    rid = list(Fasta(ctgfasta).iterkeys())
    assert len(rid) == 1, "Use overlapbatch() to improve multi-FASTA file"
    rid = rid[0]

    # Split the contig at assembly gaps before alignment.
    splitctgfasta = ctgfasta.rsplit(".", 1)[0] + ".split.fasta"
    ctgfasta = run_gapsplit(infile=ctgfasta, outfile=splitctgfasta)

    # Run BLAST
    blastfile = ctgfasta + ".blast"
    run_megablast(infile=ctgfasta, outfile=blastfile, db=poolfasta)

    # Extract contigs and merge using minimus2
    closuredir = prefix + ".closure"
    closure = False
    if need_update(blastfile, closuredir):
        mkdir(closuredir, overwrite=True)
        closure = True

    if closure:
        # Pull the IDs of pool sequences hit by BLAST (column 2, de-duped).
        idsfile = op.join(closuredir, prefix + ".ids")
        cmd = "cut -f2 {0} | sort -u".format(blastfile)
        sh(cmd, outfile=idsfile)

        idsfastafile = op.join(closuredir, prefix + ".ids.fasta")
        cmd = "faSomeRecords {0} {1} {2}".format(poolfasta, idsfile, idsfastafile)
        sh(cmd)

        # This step is a hack to weight the bases from original sequences more
        # than the pulled sequences, by literally adding another copy to be used
        # in consensus calls.
        redundantfastafile = op.join(closuredir, prefix + ".redundant.fasta")
        format([ctgfasta, redundantfastafile, "--prefix=RED."])

        mergedfastafile = op.join(closuredir, prefix + ".merged.fasta")
        cmd = "cat {0} {1} {2}".format(ctgfasta, redundantfastafile, idsfastafile)
        sh(cmd, outfile=mergedfastafile)

        afgfile = op.join(closuredir, prefix + ".afg")
        cmd = "toAmos -s {0} -o {1}".format(mergedfastafile, afgfile)
        sh(cmd)

        # minimus2 writes its outputs relative to the working directory, so
        # chdir into the closure dir for the duration of the run.
        cwd = os.getcwd()
        os.chdir(closuredir)
        cmd = "minimus2 {0} -D REFCOUNT=0".format(prefix)
        cmd += " -D OVERLAP=100 -D MINID=98"
        sh(cmd)
        os.chdir(cwd)

    # Analyze output, make sure that:
    # + Get the singletons of the original set back
    # + Drop any contig that is comprised entirely of pulled set
    originalIDs = set(Fasta(ctgfasta).iterkeys())
    minimuscontig = op.join(closuredir, prefix + ".contig")
    c = ContigFile(minimuscontig)
    excludecontigs = set()
    for rec in c.iter_records():
        reads = set(x.id for x in rec.reads)
        # A merged contig containing no original read came entirely from the
        # pool -- drop it.
        if reads.isdisjoint(originalIDs):
            excludecontigs.add(rec.id)

    logging.debug("Exclude contigs: {0}".\
            format(", ".join(sorted(excludecontigs))))

    # Trailing "_" marks the file as intermediate; stripped after renaming.
    finalfasta = prefix + ".improved.fasta_"
    fw = open(finalfasta, "w")
    minimusfasta = op.join(closuredir, prefix + ".fasta")
    f = Fasta(minimusfasta)
    for id, rec in f.iteritems_ordered():
        if id in excludecontigs:
            continue
        SeqIO.write([rec], fw, "fasta")

    # Recover original sequences that minimus2 left as singletons.
    singletonfile = op.join(closuredir, prefix + ".singletons")
    singletons = set(x.strip() for x in open(singletonfile))
    leftovers = singletons & originalIDs

    logging.debug("Pull leftover singletons: {0}".\
            format(", ".join(sorted(leftovers))))

    f = Fasta(ctgfasta)
    for id, rec in f.iteritems_ordered():
        if id not in leftovers:
            continue
        SeqIO.write([rec], fw, "fasta")

    fw.close()

    # Renumber the records sequentially, prefixed with the original record ID.
    fastafile = finalfasta
    finalfasta = fastafile.rstrip("_")
    format([fastafile, finalfasta, "--sequential", "--pad0=3",
            "--prefix={0}_".format(rid)])

    logging.debug("Improved FASTA written to `{0}`.".format(finalfasta))

    # Report N50 before/after for comparison.
    n50([ctgfasta])
    n50([finalfasta])
    errlog = "error.log"
    # Clean up intermediates.
    for f in (fastafile, blastfile, errlog):
        if op.exists(f):
            os.remove(f)
def longest(args):
    """
    %prog longest pasa.fasta output.subclusters.out

    Find the longest PASA assembly and label it as full-length. Also removes
    transcripts shorter than half the length of the longest, or shorter than
    200bp. The assemblies for the same locus is found in
    `output.subclusters.out`. In particular the lines that look like:

    sub-cluster: asmbl_25 asmbl_26 asmbl_27
    """
    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.sizes import Sizes

    p = OptionParser(longest.__doc__)
    p.add_option("--prefix", default="pasa",
                 help="Replace asmbl_ with prefix [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, subclusters = args
    prefix = fastafile.rsplit(".", 1)[0]
    idsfile = prefix + ".fl.ids"
    sizes = Sizes(fastafile).mapping

    name_convert = lambda x: x.replace("asmbl", opts.prefix)

    keep = set()  # IDs that survive the per-locus length cutoff
    nrecs = 0
    idsfw = open(idsfile, "w")
    for line in open(subclusters):
        if not line.startswith("sub-cluster:"):
            continue
        members = line.split()[1:]
        # The longest member of the locus is labeled full-length.
        champion = max(members, key=lambda m: sizes[m])
        print(name_convert(champion), file=idsfw)
        nrecs += 1
        # Retain members at least half the champion's length and >= 200bp.
        cutoff = max(sizes[champion] / 2, 200)
        keep.update(m for m in members if sizes[m] >= cutoff)
    idsfw.close()
    logging.debug("{0} fl-cDNA records written to `{1}`.".format(nrecs, idsfile))

    newfastafile = prefix + ".clean.fasta"
    outfw = open(newfastafile, "w")
    nrecs = 0
    for name, rec in Fasta(fastafile, lazy=True).iteritems_ordered():
        if name not in keep:
            continue

        rec.id = name_convert(name)
        rec.description = ""
        SeqIO.write([rec], outfw, "fasta")
        nrecs += 1
    outfw.close()
    logging.debug("{0} valid records written to `{1}`.".format(nrecs, newfastafile))
def shred(args):
    """
    %prog shred fastafile

    Similar to the method of `shredContig` in runCA script. The contigs are
    shredded into pseudo-reads with certain length and depth.
    """
    p = OptionParser(shred.__doc__)
    p.set_depth(depth=2)
    p.add_option("--readlen", default=1000, type="int",
                 help="Desired length of the reads [default: %default]")
    p.add_option("--minctglen", default=0, type="int",
                 help="Ignore contig sequence less than [default: %default]")
    p.add_option(
        "--shift", default=50, type="int",
        help="Overlap between reads must be at least [default: %default]")
    p.add_option(
        "--fasta", default=False, action="store_true",
        help="Output shredded reads as FASTA sequences [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    libID = fastafile.split(".")[0]
    depth = opts.depth
    readlen = opts.readlen
    shift = opts.shift

    outfile = libID + ".depth{0}".format(depth)
    if opts.fasta:
        outfile += ".fasta"
    else:
        outfile += ".frg"
    f = Fasta(fastafile, lazy=True)

    fw = must_open(outfile, "w", checkexists=True)
    if not opts.fasta:
        # Fix: print() function form (was Python-2-only `print >> fw`).
        print(headerTemplate.format(libID=libID), file=fw)

    """
    Taken from runCA:

                    |*********|
                    |###################|
    |--------------------------------------------------|
     ---------------1---------------
               ---------------2---------------
                         ---------------3---------------
    *** - center_increments
    ### - center_range_width
    """
    for ctgID, (name, rec) in enumerate(f.iteritems_ordered()):
        seq = rec.seq
        seqlen = len(seq)
        if seqlen < opts.minctglen:
            continue

        shredlen = min(seqlen - shift, readlen)
        # Fix: floor division -- under Python 3 the old `/` yields a float,
        # which breaks range(numreads) below.
        numreads = max(seqlen * depth // shredlen, 1)
        center_range_width = seqlen - shredlen

        ranges = []
        if depth == 1:
            if seqlen < readlen:
                ranges.append((0, seqlen))
            else:
                # Fix: range() replaces the Python-2-only xrange().
                for begin in range(0, seqlen, readlen - shift):
                    end = min(seqlen, begin + readlen)
                    ranges.append((begin, end))
        else:
            if numreads == 1:
                ranges.append((0, shredlen))
            else:
                prev_begin = -1
                # Evenly space read starts across the shreddable region.
                center_increments = center_range_width * 1. / (numreads - 1)
                for i in range(numreads):
                    begin = center_increments * i
                    end = begin + shredlen
                    begin, end = int(begin), int(end)

                    # Skip degenerate duplicates caused by rounding.
                    if begin == prev_begin:
                        continue

                    ranges.append((begin, end))
                    prev_begin = begin

        for shredID, (begin, end) in enumerate(ranges):
            shredded_seq = seq[begin:end]
            fragID = "{0}.{1}.frag{2}.{3}-{4}".format(libID, ctgID, shredID,
                                                      begin, end)
            emitFragment(fw, fragID, libID, shredded_seq, fasta=opts.fasta)

    fw.close()
    logging.debug("Shredded reads are written to `{0}`.".format(outfile))
    return outfile