def screen(args): """ %prog screen scaffolds.fasta library.fasta Screen sequences against FASTA library. Sequences that have 95% id and 50% cov will be removed by default. """ from jcvi.apps.align import blast from jcvi.formats.blast import covfilter p = OptionParser(screen.__doc__) p.set_align(pctid=95, pctcov=50) p.add_option("--best", default=1, type="int", help="Get the best N hit [default: %default]") opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) scaffolds, library = args pctidflag = "--pctid={0}".format(opts.pctid) blastfile = blast( [library, scaffolds, pctidflag, "--best={0}".format(opts.best)]) idsfile = blastfile.rsplit(".", 1)[0] + ".ids" covfilter([ blastfile, scaffolds, "--union", "--ids=" + idsfile, pctidflag, "--pctcov={0}".format(opts.pctcov) ]) pf = scaffolds.rsplit(".", 1)[0] nf = pf + ".screen.fasta" cmd = "faSomeRecords {0} -exclude {1} {2}".format(scaffolds, idsfile, nf) sh(cmd) logging.debug("Screened FASTA written to `{0}`.".format(nf)) return nf
def screen(args): """ %prog screen scaffolds.fasta library.fasta Screen sequences against FASTA library. Sequences that have 95% id and 50% cov will be removed by default. """ from jcvi.apps.align import blast from jcvi.formats.blast import covfilter p = OptionParser(screen.__doc__) p.set_align(pctid=95, pctcov=50) p.add_option("--best", default=1, type="int", help="Get the best N hit [default: %default]") opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) scaffolds, library = args pctidflag = "--pctid={0}".format(opts.pctid) blastfile = blast([library, scaffolds, pctidflag, "--best={0}".format(opts.best)]) idsfile = blastfile.rsplit(".", 1)[0] + ".ids" covfilter([blastfile, scaffolds, "--ids=" + idsfile, pctidflag, "--pctcov={0}".format(opts.pctcov)]) pf = scaffolds.rsplit(".", 1)[0] nf = pf + ".screen.fasta" cmd = "faSomeRecords {0} -exclude {1} {2}".format(scaffolds, idsfile, nf) sh(cmd) logging.debug("Screened FASTA written to `{0}`.".format(nf)) return nf
def install(args): """ %prog install patchers.bed patchers.fasta backbone.fasta alt.fasta Install patches into backbone, using sequences from alternative assembly. The patches sequences are generated via jcvi.assembly.patch.fill(). The output is a bedfile that can be converted to AGP using jcvi.formats.agp.frombed(). """ from jcvi.apps.align import blast from jcvi.formats.fasta import SeqIO p = OptionParser(install.__doc__) p.set_rclip(rclip=1) p.add_option( "--maxsize", default=300000, type="int", help="Maximum size of patchers to be replaced", ) p.add_option("--prefix", help="Prefix of the new object") p.add_option( "--strict", default=False, action="store_true", help="Only update if replacement has no gaps", ) opts, args = p.parse_args(args) if len(args) != 4: sys.exit(not p.print_help()) pbed, pfasta, bbfasta, altfasta = args maxsize = opts.maxsize # Max DNA size to replace gap rclip = opts.rclip blastfile = blast([altfasta, pfasta, "--wordsize=100", "--pctid=99"]) order = Bed(pbed).order beforebed, afterbed = blast_to_twobeds( blastfile, order, rclip=rclip, maxsize=maxsize ) beforefasta = fastaFromBed(beforebed, bbfasta, name=True, stranded=True) afterfasta = fastaFromBed(afterbed, altfasta, name=True, stranded=True) # Exclude the replacements that contain more Ns than before ah = SeqIO.parse(beforefasta, "fasta") bh = SeqIO.parse(afterfasta, "fasta") count_Ns = lambda x: x.seq.count("n") + x.seq.count("N") exclude = set() for arec, brec in zip(ah, bh): an = count_Ns(arec) bn = count_Ns(brec) if opts.strict: if bn == 0: continue elif bn < an: continue id = arec.id exclude.add(id) logging.debug( "Ignore {0} updates because of decreasing quality.".format(len(exclude)) ) abed = Bed(beforebed, sorted=False) bbed = Bed(afterbed, sorted=False) abed = [x for x in abed if x.accn not in exclude] bbed = [x for x in bbed if x.accn not in exclude] abedfile = "before.filtered.bed" bbedfile = "after.filtered.bed" afbed = Bed() afbed.extend(abed) bfbed = Bed() bfbed.extend(bbed) afbed.print_to_file(abedfile) bfbed.print_to_file(bbedfile) shuffle_twobeds(afbed, bfbed, bbfasta, prefix=opts.prefix)
def install(args): """ %prog install patchers.bed patchers.fasta backbone.fasta alt.fasta Install patches into backbone, using sequences from alternative assembly. The patches sequences are generated via jcvi.assembly.patch.fill(). The output is a bedfile that can be converted to AGP using jcvi.formats.agp.frombed(). """ from jcvi.apps.align import blast from jcvi.formats.fasta import SeqIO p = OptionParser(install.__doc__) p.set_rclip(rclip=1) p.add_option("--maxsize", default=300000, type="int", help="Maximum size of patchers to be replaced [default: %default]") p.add_option("--prefix", help="Prefix of the new object [default: %default]") p.add_option("--strict", default=False, action="store_true", help="Only update if replacement has no gaps [default: %default]") opts, args = p.parse_args(args) if len(args) != 4: sys.exit(not p.print_help()) pbed, pfasta, bbfasta, altfasta = args maxsize = opts.maxsize # Max DNA size to replace gap rclip = opts.rclip blastfile = blast([altfasta, pfasta,"--wordsize=100", "--pctid=99"]) order = Bed(pbed).order beforebed, afterbed = blast_to_twobeds(blastfile, order, rclip=rclip, maxsize=maxsize) beforefasta = fastaFromBed(beforebed, bbfasta, name=True, stranded=True) afterfasta = fastaFromBed(afterbed, altfasta, name=True, stranded=True) # Exclude the replacements that contain more Ns than before ah = SeqIO.parse(beforefasta, "fasta") bh = SeqIO.parse(afterfasta, "fasta") count_Ns = lambda x: x.seq.count('n') + x.seq.count('N') exclude = set() for arec, brec in zip(ah, bh): an = count_Ns(arec) bn = count_Ns(brec) if opts.strict: if bn == 0: continue elif bn < an: continue id = arec.id exclude.add(id) logging.debug("Ignore {0} updates because of decreasing quality."\ .format(len(exclude))) abed = Bed(beforebed, sorted=False) bbed = Bed(afterbed, sorted=False) abed = [x for x in abed if x.accn not in exclude] bbed = [x for x in bbed if x.accn not in exclude] abedfile = "before.filtered.bed" bbedfile = "after.filtered.bed" afbed = Bed() afbed.extend(abed) bfbed = Bed() bfbed.extend(bbed) afbed.print_to_file(abedfile) bfbed.print_to_file(bbedfile) shuffle_twobeds(afbed, bfbed, bbfasta, prefix=opts.prefix)
def expand(args): """ %prog expand bes.fasta reads.fastq Expand sequences using short reads. Useful, for example for getting BAC-end sequences. The template to use, in `bes.fasta` may just contain the junction sequences, then align the reads to get the 'flanks' for such sequences. """ import math from jcvi.formats.fasta import Fasta, SeqIO from jcvi.formats.fastq import readlen, first, fasta from jcvi.formats.blast import Blast from jcvi.formats.base import FileShredder from jcvi.apps.bowtie import align, get_samfile from jcvi.apps.align import blast p = OptionParser(expand.__doc__) p.set_depth(depth=200) p.set_firstN() opts, args = p.parse_args(args) if len(args) != 2: sys.exit(not p.print_help()) bes, reads = args size = Fasta(bes).totalsize rl = readlen([reads]) expected_size = size + 2 * rl nreads = expected_size * opts.depth / rl nreads = int(math.ceil(nreads / 1000.)) * 1000 # Attract reads samfile, logfile = align([bes, reads, "--reorder", "--mapped", "--firstN={0}".format(opts.firstN)]) samfile, mapped, _ = get_samfile(reads, bes, bowtie=True, mapped=True) logging.debug("Extract first {0} reads from `{1}`.".format(nreads, mapped)) pf = mapped.split(".")[0] pf = pf.split("-")[0] bespf = bes.split(".")[0] reads = pf + ".expand.fastq" first([str(nreads), mapped, "-o", reads]) # Perform mini-assembly fastafile = reads.rsplit(".", 1)[0] + ".fasta" qualfile = "" if need_update(reads, fastafile): fastafile, qualfile = fasta([reads]) contigs = op.join(pf, "454LargeContigs.fna") if need_update(fastafile, contigs): cmd = "runAssembly -o {0} -cpu 8 {1}".format(pf, fastafile) sh(cmd) assert op.exists(contigs) # Annotate contigs blastfile = blast([bes, contigs]) mapping = {} for query, b in Blast(blastfile).iter_best_hit(): mapping[query] = b f = Fasta(contigs, lazy=True) annotatedfasta = ".".join((pf, bespf, "fasta")) fw = open(annotatedfasta, "w") keys = list(Fasta(bes).iterkeys_ordered()) # keep an ordered list recs = [] for key, v in f.iteritems_ordered(): vid = v.id if vid not in mapping: continue b = mapping[vid] subject = b.subject rec = v.reverse_complement() if b.orientation == '-' else v rec.id = rid = "_".join((pf, vid, subject)) rec.description = "" recs.append((keys.index(subject), rid, rec)) recs = [x[-1] for x in sorted(recs)] SeqIO.write(recs, fw, "fasta") fw.close() FileShredder([samfile, logfile, mapped, reads, fastafile, qualfile, blastfile, pf]) logging.debug("Annotated seqs (n={0}) written to `{1}`.".\ format(len(recs), annotatedfasta)) return annotatedfasta
def sizes(args): """ %prog sizes gaps.bed a.fasta b.fasta Take the flanks of gaps within a.fasta, map them onto b.fasta. Compile the results to the gap size estimates in b. The output is detailed below: Columns are: 1. A scaffold 2. Start position 3. End position 4. Gap identifier 5. Gap size in A (= End - Start) 6. Gap size in B (based on BLAST, see below) For each gap, I extracted the left and right sequence (mostly 2Kb, but can be shorter if it runs into another gap) flanking the gap. The flanker names look like gap.00003L and gap.00003R means the left and right flanker of this particular gap, respectively. The BLAST output is used to calculate the gap size. For each flanker sequence, I took the best hit, and calculate the inner distance between the L match range and R range. The two flankers must map with at least 98% identity, and in the same orientation. NOTE the sixth column in the list file is not always a valid number. Other values are: - na: both flankers are missing in B - Singleton: one flanker is missing - Different chr: flankers map to different scaffolds - Strand +|-: flankers map in different orientations - Negative value: the R flanker map before L flanker """ from jcvi.formats.base import DictFile from jcvi.apps.align import blast p = OptionParser(sizes.__doc__) opts, args = p.parse_args(args) if len(args) != 3: sys.exit(not p.print_help()) gapsbed, afasta, bfasta = args pf = gapsbed.rsplit(".", 1)[0] extbed = pf + ".ext.bed" extfasta = pf + ".ext.fasta" if need_update(gapsbed, extfasta): extbed, extfasta = flanks([gapsbed, afasta]) q = op.basename(extfasta).split(".")[0] r = op.basename(bfasta).split(".")[0] blastfile = "{0}.{1}.blast".format(q, r) if need_update([extfasta, bfasta], blastfile): blastfile = blast([bfasta, extfasta, "--wordsize=50", "--pctid=98"]) labelsfile = blast_to_twobeds(blastfile) labels = DictFile(labelsfile, delimiter='\t') bed = Bed(gapsbed) for b in bed: b.score = b.span accn = b.accn print "\t".join((str(x) for x in (b.seqid, b.start - 1, b.end, accn, b.score, labels.get(accn, "na"))))