def uclust(args):
    """
    %prog uclust fastafile

    Use `usearch` to remove duplicate reads.
    """
    p = OptionParser(uclust.__doc__)
    p.set_align(pctid=98)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    identity = opts.pctid / 100.
    pf, sf = fastafile.rsplit(".", 1)
    sortedfastafile = pf + ".sorted.fasta"
    if need_update(fastafile, sortedfastafile):
        cmd = "usearch -sortbylength {0} -fastaout {1}".\
                format(fastafile, sortedfastafile)
        sh(cmd)

    pf = fastafile + ".P{0}.uclust".format(opts.pctid)
    clstrfile = pf + ".clstr"
    centroidsfastafile = pf + ".centroids.fasta"
    if need_update(sortedfastafile, centroidsfastafile):
        cmd = "usearch -cluster_smallmem {0}".format(sortedfastafile)
        cmd += " -id {0}".format(identity)
        cmd += " -uc {0} -centroids {1}".format(clstrfile, centroidsfastafile)
        sh(cmd)
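# Nearly every routine in this module gates its shell command on
# need_update(inputs, outputs) from jcvi.apps.base, so pipelines can be
# re-run and only redo stale steps. Below is a minimal sketch of the
# timestamp logic these calls assume, a hypothetical reimplementation for
# illustration only, not the library's actual helper:

def _need_update_sketch(inputs, outputs):
    """Return True if any output is missing or older than the newest input."""
    import os.path as op

    if isinstance(inputs, str):
        inputs = [inputs]
    if isinstance(outputs, str):
        outputs = [outputs]
    if not all(op.exists(x) for x in outputs):
        return True
    newest_input = max(op.getmtime(x) for x in inputs)
    oldest_output = min(op.getmtime(x) for x in outputs)
    return newest_input > oldest_output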
def count(args):
    """
    %prog count bamfile gtf

    Count the number of reads mapped using `htseq-count`.
    """
    p = OptionParser(count.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, gtf = args
    pf = bamfile.split(".")[0]
    countfile = pf + ".count"
    nsorted = pf + "_nsorted"
    nsortedbam, nsortedsam = nsorted + ".bam", nsorted + ".sam"
    if need_update(bamfile, nsortedsam):
        cmd = "samtools sort -n {0} {1}".format(bamfile, nsorted)
        sh(cmd)
        cmd = "samtools view -h {0}".format(nsortedbam)
        sh(cmd, outfile=nsortedsam)

    if need_update(nsortedsam, countfile):
        cmd = "htseq-count --stranded=no --minaqual=10"
        cmd += " {0} {1}".format(nsortedsam, gtf)
        sh(cmd, outfile=countfile)
def snp(args):
    """
    %prog snp input.gsnap

    Run SNP calling on GSNAP output after apps.gsnap.align().
    """
    p = OptionParser(snp.__doc__)
    p.set_home("eddyyeh")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gsnapfile, = args
    EYHOME = opts.eddyyeh_home
    pf = gsnapfile.rsplit(".", 1)[0]
    nativefile = pf + ".native"
    if need_update(gsnapfile, nativefile):
        cmd = op.join(EYHOME, "convert2native.pl")
        cmd += " --gsnap {0} -o {1}".format(gsnapfile, nativefile)
        cmd += " -proc {0}".format(opts.cpus)
        sh(cmd)

    snpfile = pf + ".snp"
    if need_update(nativefile, snpfile):
        cmd = op.join(EYHOME, "SNPs/SNP_Discovery-short.pl")
        cmd += " --native {0} -o {1}".format(nativefile, snpfile)
        cmd += " -a 2 -ac 0.3 -c 0.8"
        sh(cmd)
def correct_frag(datadir, tag, origfastb, nthreads,
                 dedup=False, haploidify=False, suffix=False):
    filt = datadir + "/{0}_filt".format(tag)
    filtfastb = filt + ".fastb"
    run_RemoveDodgyReads(infile=origfastb, outfile=filtfastb,
                         removeDuplicates=dedup, rc=False, nthreads=nthreads)

    filtpairs = filt + ".pairs"
    edit = datadir + "/{0}_edit".format(tag)
    editpairs = edit + ".pairs"
    if need_update(filtpairs, editpairs):
        cmd = "ln -sf {0} {1}.pairs".format(op.basename(filtpairs), edit)
        sh(cmd)

    editfastb = edit + ".fastb"
    if need_update(filtfastb, editfastb):
        cmd = "FindErrors HEAD_IN={0} HEAD_OUT={1}".format(filt, edit)
        cmd += " PLOIDY_FILE=data/ploidy"
        cmd += nthreads
        sh(cmd)

    corr = datadir + "/{0}_corr".format(tag)
    corrfastb = corr + ".fastb"
    if need_update(editfastb, corrfastb):
        cmd = "CleanCorrectedReads DELETE=True"
        cmd += " HEAD_IN={0} HEAD_OUT={1}".format(edit, corr)
        cmd += " PLOIDY_FILE={0}/ploidy".format(datadir)
        if haploidify:
            cmd += " HAPLOIDIFY=True"
        cmd += nthreads
        sh(cmd)

    export_fastq(datadir, corrfastb, suffix=suffix)
def index(args):
    """
    %prog index bedfile

    Compress the sorted bedfile and index it using `tabix`.
    """
    p = OptionParser(index.__doc__)
    p.add_option("--query", help="Chromosome location [default: %default]")
    set_outfile(p)

    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    gzfile = bedfile + ".gz"
    if need_update(bedfile, gzfile):
        bedfile = sort([bedfile])
        cmd = "bgzip -c {0}".format(bedfile)
        sh(cmd, outfile=gzfile)

    tbifile = gzfile + ".tbi"
    if need_update(gzfile, tbifile):
        cmd = "tabix -p bed {0}".format(gzfile)
        sh(cmd)

    query = opts.query
    if not query:
        return

    cmd = "tabix {0} {1}".format(gzfile, query)
    sh(cmd, outfile=opts.outfile)
def count(args):
    """
    %prog count bamfile gtf

    Count the number of reads mapped using `htseq-count`.
    """
    p = OptionParser(count.__doc__)
    p.add_option("--type", default="exon",
                 help="Only count feature type")
    p.set_cpus(cpus=8)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, gtf = args
    cpus = opts.cpus
    pf = bamfile.split(".")[0]
    countfile = pf + ".count"
    if not need_update(bamfile, countfile):
        return

    nsorted = pf + "_nsorted"
    nsortedbam, nsortedsam = nsorted + ".bam", nsorted + ".sam"
    if need_update(bamfile, nsortedsam):
        cmd = "samtools sort -@ {0} -n {1} {2}".format(cpus, bamfile, nsorted)
        sh(cmd)
        cmd = "samtools view -@ {0} -h {1}".format(cpus, nsortedbam)
        sh(cmd, outfile=nsortedsam)

    if need_update(nsortedsam, countfile):
        cmd = "htseq-count --stranded=no --minaqual=10"
        cmd += " -t {0}".format(opts.type)
        cmd += " {0} {1}".format(nsortedsam, gtf)
        sh(cmd, outfile=countfile)
def jellyfish(args):
    """
    %prog jellyfish [*.fastq|*.fasta]

    Run jellyfish to dump histogram to be used in kmer.histogram().
    """
    from jcvi.apps.base import getfilesize
    from jcvi.utils.cbook import human_size

    p = OptionParser(jellyfish.__doc__)
    p.add_option("-K", default=23, type="int",
                 help="K-mer size [default: %default]")
    p.add_option("--coverage", default=40, type="int",
                 help="Expected sequence coverage [default: %default]")
    p.add_option("--prefix", default="jf",
                 help="Database prefix [default: %default]")
    p.add_option("--nohist", default=False, action="store_true",
                 help="Do not print histogram [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastqfiles = args
    K = opts.K
    coverage = opts.coverage

    totalfilesize = sum(getfilesize(x) for x in fastqfiles)
    fq = fastqfiles[0]
    pf = opts.prefix
    gzip = fq.endswith(".gz")

    hashsize = totalfilesize / coverage
    logging.debug("Total file size: {0}, hashsize (-s): {1}".\
                    format(human_size(totalfilesize,
                           a_kilobyte_is_1024_bytes=True), hashsize))

    jfpf = "{0}-K{1}".format(pf, K)
    jfdb = jfpf

    # Keep fastqfiles as a list for need_update(); the joined string is
    # only used on the command line
    cmd = "jellyfish count -t {0} -C -o {1}".format(opts.cpus, jfpf)
    cmd += " -s {0} -m {1}".format(hashsize, K)
    if gzip:
        cmd = "gzip -dc {0} | ".format(" ".join(fastqfiles)) + cmd + " /dev/fd/0"
    else:
        cmd += " " + " ".join(fastqfiles)

    if need_update(fastqfiles, jfdb):
        sh(cmd)

    if opts.nohist:
        return

    jfhisto = jfpf + ".histogram"
    cmd = "jellyfish histo -t {0} {1} -o {2}".format(opts.cpus, jfdb, jfhisto)

    if need_update(jfdb, jfhisto):
        sh(cmd)
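# Worked example of the hash-size heuristic in jellyfish() above: 20 GB of
# fastq at --coverage=40 gives -s 20e9/40 = 5e8, i.e. ~500M hash entries,
# on the assumption that total bases divided by coverage approximates the
# genome size, and hence the number of distinct k-mers to store.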
def index(args):
    """
    %prog index samfile/bamfile

    If SAM file, convert to BAM, sort and then index, using SAMTOOLS
    """
    p = OptionParser(index.__doc__)
    p.add_option("--fasta", dest="fasta", default=None,
                 help="add @SQ header to the BAM file [default: %default]")
    p.add_option("--unique", default=False, action="store_true",
                 help="only retain uniquely mapped reads [default: %default]")
    p.set_cpus(cpus=8)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(p.print_help())

    samfile, = args
    cpus = opts.cpus
    fastafile = opts.fasta
    if fastafile:
        assert op.exists(fastafile)

    bamfile = samfile.replace(".sam", ".bam")
    if fastafile:
        faifile = fastafile + ".fai"
        if need_update(fastafile, faifile):
            sh("samtools faidx {0}".format(fastafile))
        cmd = "samtools view -bt {0} {1} -F 4 -o {2}".\
                format(faifile, samfile, bamfile)
    else:
        cmd = "samtools view -bS {0} -F 4 -o {1}".\
                format(samfile, bamfile)

    cmd += " -@ {0}".format(cpus)
    if opts.unique:
        cmd += " -q 1"

    if samfile.endswith(".sam") and need_update(samfile, bamfile):
        sh(cmd)

    # Already sorted?
    if bamfile.endswith(".sorted.bam"):
        sortedbamfile = bamfile
    else:
        prefix = bamfile.replace(".bam", "")
        sortedbamfile = prefix + ".sorted.bam"

    if need_update(bamfile, sortedbamfile):
        cmd = "samtools sort {0} {1}.sorted".format(bamfile, prefix)
        cmd += " -@ {0}".format(cpus)
        sh(cmd)

    baifile = sortedbamfile + ".bai"
    if need_update(sortedbamfile, baifile):
        sh("samtools index {0}".format(sortedbamfile))

    return sortedbamfile
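# Hypothetical invocation of index() above, assuming aln.sam and ref.fasta
# exist on disk; each step is skipped when its output is already current:
#
#   sortedbam = index(["aln.sam", "--fasta=ref.fasta", "--unique"])
#   # -> aln.bam, then aln.sorted.bam and aln.sorted.bam.bai; returns
#   #    "aln.sorted.bam"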
def dedup(args):
    """
    %prog dedup scaffolds.fasta

    Remove redundant contigs with CD-HIT. This is run prior to
    assembly.sspace.embed().
    """
    from jcvi.formats.fasta import gaps
    from jcvi.apps.cdhit import deduplicate, ids

    p = OptionParser(dedup.__doc__)
    p.set_align(pctid=GoodPct)
    p.set_mingap(default=10)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    scaffolds, = args
    mingap = opts.mingap
    splitfile, oagpfile, cagpfile = gaps([scaffolds, "--split",
                                          "--mingap={0}".format(mingap)])

    dd = splitfile + ".cdhit"
    clstrfile = dd + ".clstr"
    idsfile = dd + ".ids"
    if need_update(splitfile, clstrfile):
        deduplicate([splitfile, "--pctid={0}".format(opts.pctid)])
    if need_update(clstrfile, idsfile):
        ids([clstrfile])

    agp = AGP(cagpfile)
    reps = set(x.split()[-1] for x in open(idsfile))
    pf = scaffolds.rsplit(".", 1)[0]
    dedupagp = pf + ".dedup.agp"
    fw = open(dedupagp, "w")

    ndropped = ndroppedbases = 0
    for a in agp:
        if not a.is_gap and a.component_id not in reps:
            span = a.component_span
            logging.debug("Drop component {0} ({1})".\
                            format(a.component_id, span))
            ndropped += 1
            ndroppedbases += span
            continue
        print >> fw, a
    fw.close()

    logging.debug("Dropped components: {0}, Dropped bases: {1}".\
                    format(ndropped, ndroppedbases))
    logging.debug("Deduplicated file written to `{0}`.".format(dedupagp))

    tidyagp = tidy([dedupagp, splitfile])
    dedupfasta = pf + ".dedup.fasta"
    build([tidyagp, dd, dedupfasta])

    return dedupfasta
def frompsl(args):
    """
    %prog frompsl old.new.psl old.fasta new.fasta

    Generate chain file from psl file. The pipeline is described in:
    <http://genomewiki.ucsc.edu/index.php/Minimal_Steps_For_LiftOver>
    """
    from jcvi.formats.sizes import Sizes

    p = OptionParser(frompsl.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    pslfile, oldfasta, newfasta = args
    pf = oldfasta.split(".")[0]

    # Use liftUp to change the coordinate system. Requires .lft files
    # This step is skipped as the output psl is the same as input?

    # Chain together alignments using axtChain
    chainfile = pf + ".chain"
    twobitfiles = []
    for fastafile in (oldfasta, newfasta):
        tbfile = faToTwoBit(fastafile)
        twobitfiles.append(tbfile)
    oldtwobit, newtwobit = twobitfiles

    if need_update(pslfile, chainfile):
        cmd = "axtChain -linearGap=medium -psl {0}".format(pslfile)
        cmd += " {0} {1} {2}".format(oldtwobit, newtwobit, chainfile)
        sh(cmd)

    # Sort chain files
    sortedchain = chainfile.rsplit(".", 1)[0] + ".sorted.chain"
    if need_update(chainfile, sortedchain):
        cmd = "chainSort {0} {1}".format(chainfile, sortedchain)
        sh(cmd)

    # Make alignment nets from chains
    netfile = pf + ".net"
    oldsizes = Sizes(oldfasta).filename
    newsizes = Sizes(newfasta).filename
    if need_update((sortedchain, oldsizes, newsizes), netfile):
        cmd = "chainNet {0} {1} {2}".format(sortedchain, oldsizes, newsizes)
        cmd += " {0} /dev/null".format(netfile)
        sh(cmd)

    # Create liftOver chain file
    liftoverfile = pf + ".liftover.chain"
    if need_update((netfile, sortedchain), liftoverfile):
        cmd = "netChainSubset {0} {1} {2}".\
                format(netfile, sortedchain, liftoverfile)
        sh(cmd)
def deduplicate(args):
    """
    %prog deduplicate fastafile

    Wraps `cd-hit-est` to remove duplicate sequences.
    """
    p = OptionParser(deduplicate.__doc__)
    p.set_align(pctid=96, pctcov=0)
    p.add_option("--fast", default=False, action="store_true",
                 help="Place sequence in the first cluster")
    p.add_option("--consensus", default=False, action="store_true",
                 help="Compute consensus sequences")
    p.add_option("--reads", default=False, action="store_true",
                 help="Use `cd-hit-454` to deduplicate [default: %default]")
    p.add_option("--samestrand", default=False, action="store_true",
                 help="Enforce same strand alignment")
    p.set_home("cdhit")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    identity = opts.pctid / 100.
    fastafile, qualfile = fasta([fastafile, "--seqtk"])

    ocmd = "cd-hit-454" if opts.reads else "cd-hit-est"
    cmd = op.join(opts.cdhit_home, ocmd)
    cmd += " -c {0}".format(identity)
    if ocmd == "cd-hit-est":
        cmd += " -d 0"  # include complete defline
        if opts.samestrand:
            cmd += " -r 0"
    if not opts.fast:
        cmd += " -g 1"
    if opts.pctcov != 0:
        cmd += " -aL {0} -aS {0}".format(opts.pctcov / 100.)

    dd = fastafile + ".P{0}.cdhit".format(opts.pctid)
    clstr = dd + ".clstr"

    cmd += " -M 0 -T {0} -i {1} -o {2}".format(opts.cpus, fastafile, dd)
    if need_update(fastafile, (dd, clstr)):
        sh(cmd)

    if opts.consensus:
        cons = dd + ".consensus"
        cmd = op.join(opts.cdhit_home, "cdhit-cluster-consensus")
        cmd += " clustfile={0} fastafile={1} output={2} maxlen=1".\
                format(clstr, fastafile, cons)
        if need_update((clstr, fastafile), cons):
            sh(cmd)

    return dd
def pasa(args):
    """
    %prog ${pasadb}.assemblies.fasta ${pasadb}.pasa_assemblies.gff3

    Wraps `pasa_asmbls_to_training_set.dbi`.
    """
    from jcvi.formats.base import SetFile
    from jcvi.formats.gff import Gff

    p = OptionParser(pasa.__doc__)
    p.set_home("pasa")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, gffile = args
    transcodergff = fastafile + ".transdecoder.gff3"
    transcodergenomegff = fastafile + ".transdecoder.genome.gff3"
    if need_update((fastafile, gffile), (transcodergff, transcodergenomegff)):
        cmd = "{0}/scripts/pasa_asmbls_to_training_set.dbi".format(opts.pasa_home)
        cmd += " --pasa_transcripts_fasta {0} --pasa_transcripts_gff3 {1}".\
                format(fastafile, gffile)
        sh(cmd)

    completeids = fastafile.rsplit(".", 1)[0] + ".complete.ids"
    if need_update(transcodergff, completeids):
        cmd = "grep complete {0} | cut -f1 | sort -u".format(transcodergff)
        sh(cmd, outfile=completeids)

    complete = SetFile(completeids)
    seen = set()
    completegff = transcodergenomegff.rsplit(".", 1)[0] + ".complete.gff3"
    fw = open(completegff, "w")
    gff = Gff(transcodergenomegff)
    for g in gff:
        a = g.attributes
        if "Parent" in a:
            id = a["Parent"][0]
        else:
            id = a["ID"][0]
        asmbl_id = id.split("|")[0]
        if asmbl_id not in complete:
            continue
        print >> fw, g
        if g.type == "gene":
            seen.add(id)
    fw.close()

    logging.debug("A total of {0} complete models extracted to `{1}`.".\
                    format(len(seen), completegff))
def data(args):
    """
    %prog data data.bin samples.ids STR.ids meta.tsv

    Make data.tsv based on meta.tsv.
    """
    p = OptionParser(data.__doc__)
    p.add_option("--notsv", default=False, action="store_true",
                 help="Do not write data.tsv")
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    databin, sampleids, strids, metafile = args
    final_columns, percentiles = read_meta(metafile)
    df, m, samples, loci = read_binfile(databin, sampleids, strids)

    # Clean the data
    m %= 1000  # Get the larger of the two alleles
    m[m == 999] = -1  # Missing data

    final = set(final_columns)
    remove = []
    for i, locus in enumerate(loci):
        if locus not in final:
            remove.append(locus)
            continue

    pf = "STRs_{}_SEARCH".format(timestamp())
    filteredstrids = "{}.STR.ids".format(pf)
    fw = open(filteredstrids, "w")
    print >> fw, "\n".join(final_columns)
    fw.close()
    logging.debug("Dropped {} columns; Retained {} columns (`{}`)".\
                    format(len(remove), len(final_columns), filteredstrids))

    # Remove low-quality columns!
    df.drop(remove, inplace=True, axis=1)
    df.columns = final_columns

    filtered_bin = "{}.data.bin".format(pf)
    if need_update(databin, filtered_bin):
        m = df.as_matrix()
        m.tofile(filtered_bin)
        logging.debug("Filtered binary matrix written to `{}`".format(filtered_bin))

    # Write data output
    filtered_tsv = "{}.data.tsv".format(pf)
    if not opts.notsv and need_update(databin, filtered_tsv):
        df.to_csv(filtered_tsv, sep="\t", index_label="SampleKey")
def cluster(args):
    """
    %prog cluster prefix fastqfiles

    Use `vsearch` to remove duplicate reads. This routine is heavily influenced
    by PyRAD: <https://github.com/dereneaton/pyrad>.
    """
    p = OptionParser(cluster.__doc__)
    add_consensus_options(p)
    p.set_align(pctid=95)
    p.set_outdir()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    prefix = args[0]
    fastqfiles = args[1:]
    cpus = opts.cpus
    pctid = opts.pctid
    mindepth = opts.mindepth
    minlength = opts.minlength
    fastafile, qualfile = fasta(fastqfiles + ["--seqtk",
                                "--outdir={0}".format(opts.outdir),
                                "--outfile={0}".format(prefix + ".fasta")])

    prefix = op.join(opts.outdir, prefix)
    pf = prefix + ".P{0}".format(pctid)
    derepfile = prefix + ".derep"
    if need_update(fastafile, derepfile):
        derep(fastafile, derepfile, minlength, cpus)

    userfile = pf + ".u"
    notmatchedfile = pf + ".notmatched"
    if need_update(derepfile, userfile):
        cluster_smallmem(derepfile, userfile, notmatchedfile,
                         minlength, pctid, cpus)

    clustfile = pf + ".clust"
    if need_update((derepfile, userfile, notmatchedfile), clustfile):
        makeclust(derepfile, userfile, notmatchedfile, clustfile,
                  mindepth=mindepth)

    clustSfile = pf + ".clustS"
    if need_update(clustfile, clustSfile):
        parallel_musclewrap(clustfile, cpus)

    statsfile = pf + ".stats"
    if need_update(clustSfile, statsfile):
        makestats(clustSfile, statsfile, mindepth=mindepth)
def mcluster(args):
    """
    %prog mcluster *.consensus

    Cluster across samples using consensus sequences.
    """
    p = OptionParser(mcluster.__doc__)
    add_consensus_options(p)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    consensusfiles = args
    minlength = opts.minlength
    cpus = opts.cpus
    pf = opts.prefix
    pctid = find_pctid(consensusfiles)

    pf += ".P{0}".format(pctid)
    consensusfile = pf + ".consensus.fasta"
    if need_update(consensusfiles, consensusfile):
        fw_cons = must_open(consensusfile, "w")
        totalseqs = 0
        for cf in consensusfiles:
            nseqs = 0
            s = op.basename(cf).split(".")[0]
            for name, seq in parse_fasta(cf):
                name = '.'.join((s, name))
                print >> fw_cons, ">{0}\n{1}".format(name, seq)
                nseqs += 1
            logging.debug("Read `{0}`: {1} seqs".format(cf, nseqs))
            totalseqs += nseqs
        logging.debug("Total: {0} seqs".format(totalseqs))
        fw_cons.close()

    userfile = pf + ".u"
    notmatchedfile = pf + ".notmatched"
    if need_update(consensusfile, userfile):
        cluster_smallmem(consensusfile, userfile, notmatchedfile,
                         minlength, pctid, cpus)

    clustfile = pf + ".clust"
    if need_update((consensusfile, userfile, notmatchedfile), clustfile):
        makeclust(consensusfile, userfile, notmatchedfile, clustfile)

    clustSfile = pf + ".clustS"
    if need_update(clustfile, clustSfile):
        parallel_musclewrap(clustfile, cpus, minsamp=opts.minsamp)
def bam(args):
    """
    %prog bam input.gsnap ref.fasta

    Convert GSNAP output to BAM.
    """
    from jcvi.formats.sizes import Sizes
    from jcvi.formats.sam import index

    p = OptionParser(bam.__doc__)
    p.set_home("eddyyeh")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gsnapfile, fastafile = args
    EYHOME = opts.eddyyeh_home
    pf = gsnapfile.rsplit(".", 1)[0]
    uniqsam = pf + ".unique.sam"
    if need_update((gsnapfile, fastafile), uniqsam):
        cmd = op.join(EYHOME, "gsnap2gff3.pl")
        sizesfile = Sizes(fastafile).filename
        cmd += " --format sam -i {0} -o {1}".format(gsnapfile, uniqsam)
        cmd += " -u -l {0} -p {1}".format(sizesfile, opts.cpus)
        sh(cmd)

    index([uniqsam])
def geneinfo(bed, order, genomeidx, ploidy):
    bedfile = bed.filename
    p = bedfile.split(".")[0]
    idx = genomeidx[p]
    pd = ploidy[p]

    infofile = p + ".info"
    if not need_update(bedfile, infofile):
        return infofile

    fwinfo = open(infofile, "w")
    for s in bed:
        chr = "".join(x for x in s.seqid if x in string.digits)
        try:
            chr = int(chr)
        except ValueError:
            chr = "0"
        print >> fwinfo, "\t".join(str(x) for x in \
                    (s.accn, chr, s.start, s.end, s.strand, idx, pd))
    fwinfo.close()

    logging.debug("Update info file `{0}`.".format(infofile))

    return infofile
def pasa(args):
    """
    %prog pasa pasa_db fastafile

    Run EVM in TIGR-only mode.
    """
    p = OptionParser(pasa.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    pasa_db, fastafile = args

    termexons = "pasa.terminal_exons.gff3"
    if need_update(fastafile, termexons):
        cmd = "$ANNOT_DEVEL/PASA2/scripts/pasa_asmbls_to_training_set.dbi"
        cmd += ' -M "{0}:mysql.tigr.org" -p "access:access"'.format(pasa_db)
        cmd += ' -g {0}'.format(fastafile)
        sh(cmd)

        cmd = "$EVM/PasaUtils/retrieve_terminal_CDS_exons.pl"
        cmd += " trainingSetCandidates.fasta trainingSetCandidates.gff"
        sh(cmd, outfile=termexons)

    return termexons
def parse_ctgs(bestedges, frgtoctg):
    cache = "frgtoctg.cache"
    if need_update(frgtoctg, cache):
        reads_to_ctgs = {}
        frgtodeg = frgtoctg.replace(".frgctg", ".frgdeg")
        iidtouid = frgtoctg.replace(".posmap.frgctg", ".iidtouid")
        fp = open(iidtouid)
        frgstore = {}
        for row in fp:
            tag, iid, uid = row.split()
            if tag == "FRG":
                frgstore[uid] = int(iid)

        for pf, f in zip(("ctg", "deg"), (frgtoctg, frgtodeg)):
            fp = open(f)
            logging.debug("Parse posmap file `{0}`".format(f))
            for row in fp:
                frg, ctg = row.split()[:2]
                frg = frgstore[frg]
                reads_to_ctgs[frg] = pf + ctg
            logging.debug("Loaded mapping: {0}".format(len(reads_to_ctgs)))

        fw = open(cache, "w")
        cPickle.dump(reads_to_ctgs, fw)
        fw.close()
        logging.debug("Contig mapping written to `{0}`".format(cache))

    reads_to_ctgs = cPickle.load(open(cache))
    logging.debug("Contig mapping loaded from `{0}`".format(cache))
    return reads_to_ctgs
def uniq(args):
    """
    %prog uniq bedfile > newbedfile

    When features overlap, retain only the one with the higher score.
    """
    p = OptionParser(uniq.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    uniqbedfile = bedfile.split(".")[0] + ".uniq.bed"
    bed = Bed(bedfile)

    if not need_update(bedfile, uniqbedfile):
        return uniqbedfile

    ranges = [Range(x.seqid, x.start, x.end, float(x.score), i) \
                for i, x in enumerate(bed)]
    selected, score = range_chain(ranges)
    selected = [bed[x.id] for x in selected]

    newbed = Bed()
    newbed.extend(selected)
    newbed.print_to_file(uniqbedfile, sorted=True)
    logging.debug("Imported: {0}, Exported: {1}".format(len(bed), len(newbed)))

    return uniqbedfile
def sampe(args, opts):
    """
    %prog sampe database.fasta read1.fq read2.fq

    Wrapper for `bwa sampe`. Output will be read1.sam.
    """
    dbfile, read1file, read2file = args
    safile = check_index(dbfile)
    sai1file = check_aln(dbfile, read1file, cpus=opts.cpus)
    sai2file = check_aln(dbfile, read2file, cpus=opts.cpus)

    samfile, _, unmapped = get_samfile(read1file, dbfile,
                                       bam=opts.bam, unmapped=opts.unmapped)
    if not need_update((safile, sai1file, sai2file), samfile):
        logging.error("`{0}` exists. `bwa sampe` already run.".format(samfile))
        return "", samfile

    cmd = "bwa sampe " + " ".join((dbfile, sai1file, sai2file, \
                                   read1file, read2file))
    cmd += " " + opts.extra
    if opts.cutoff:
        cmd += " -a {0}".format(opts.cutoff)
    if opts.uniq:
        cmd += " -n 1"

    return cmd, samfile
def pairs(args):
    """
    See __doc__ for set_options_pairs().
    """
    from jcvi.formats.blast import report_pairs, set_options_pairs

    p = set_options_pairs()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args

    basename = bedfile.split(".")[0]
    insertsfile = ".".join((basename, "inserts"))
    sortedbedfile = op.basename(bedfile).rsplit(".", 1)[0] + ".sorted.bed"
    if need_update(bedfile, sortedbedfile):
        bedfile = sort([bedfile, "--accn"])
    else:
        bedfile = sortedbedfile

    fp = open(bedfile)
    data = [BedLine(row) for i, row in enumerate(fp) if i < opts.nrows]

    ascii = not opts.pdf
    return bedfile, report_pairs(data, opts.cutoff, opts.mateorientation,
                                 pairsfile=opts.pairsfile,
                                 insertsfile=insertsfile,
                                 rclip=opts.rclip, ascii=ascii,
                                 bins=opts.bins, distmode=opts.distmode)
def first(args):
    """
    %prog first N fastqfile(s)

    Get first N reads from file.
    """
    from jcvi.apps.base import need_update

    p = OptionParser(first.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    N = int(args[0])
    nlines = N * 4
    fastqfiles = args[1:]
    fastqfile = fastqfiles[0]
    outfile = opts.outfile
    if not need_update(fastqfiles, outfile):
        logging.debug("File `{0}` exists. Will not overwrite.".format(outfile))
        return

    gz = fastqfile.endswith(".gz")
    for fastqfile in fastqfiles:
        if gz:
            cmd = "zcat {0} | head -n {1}".format(fastqfile, nlines)
        else:
            cmd = "head -n {0} {1}".format(nlines, fastqfile)

        sh(cmd, outfile=opts.outfile, append=True)
def mergeBed(bedfile, d=0, nms=False, s=False, scores=None):
    sort([bedfile, "-i"])
    cmd = "mergeBed -i {0}".format(bedfile)
    if d:
        cmd += " -d {0}".format(d)
    if nms:
        nargs = len(open(bedfile).readline().split())
        if nargs <= 3:
            logging.debug("Only {0} columns detected... set nms=False"\
                            .format(nargs))
        else:
            cmd += " -c 4 -o collapse"
    if s:
        cmd += " -s"
    if scores:
        valid_opts = ("sum", "min", "max", "mean", "median",
                      "mode", "antimode", "collapse")
        if scores not in valid_opts:
            scores = "mean"
        cmd += " -scores {0}".format(scores)

    mergebedfile = op.basename(bedfile).rsplit(".", 1)[0] + ".merge.bed"

    if need_update(bedfile, mergebedfile):
        sh(cmd, outfile=mergebedfile)
    return mergebedfile
def alignextend(args):
    """
    %prog alignextend ref.fasta read.1.fastq read.2.fastq

    Wrapper around AMOS alignextend.
    """
    p = OptionParser(alignextend.__doc__)
    p.add_option("--nosuffix", default=False, action="store_true",
                 help="Do not add /1/2 suffix to the read [default: %default]")
    p.set_home("amos")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ref, r1, r2 = args
    pf = op.basename(r1).split(".")[0]
    cmd = op.join(opts.amos_home, "src/Experimental/alignextend.pl")
    if not opts.nosuffix:
        cmd += " -suffix"
    bwa_idx = "{0}.ref.fa.sa".format(pf)
    if not need_update(ref, bwa_idx):
        cmd += " -noindex"
    cmd += " -threads {0}".format(opts.cpus)
    offset = guessoffset([r1])
    if offset == 64:
        cmd += " -I"
    cmd += " ".join(("", pf, ref, r1, r2))
    sh(cmd)
def dust(args):
    """
    %prog dust assembly.fasta

    Remove low-complexity contigs within assembly.
    """
    p = OptionParser(dust.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    dustfastafile = fastafile.rsplit(".", 1)[0] + ".dust.fasta"
    if need_update(fastafile, dustfastafile):
        cmd = "dustmasker -in {0}".format(fastafile)
        cmd += " -out {0} -outfmt fasta".format(dustfastafile)
        sh(cmd)

    for name, seq in parse_fasta(dustfastafile):
        nlow = sum(1 for x in seq if x in "acgtN")
        pctlow = nlow * 100. / len(seq)
        if pctlow < 98:
            continue
        #print "{0}\t{1:.1f}".format(name, pctlow)
        print name
def fpkm(args):
    """
    %prog fpkm fastafile *.bam

    Calculate FPKM values from BAM file.
    """
    p = OptionParser(fpkm.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    fastafile = args[0]
    bamfiles = args[1:]

    # Create a DUMMY gff file for cuffdiff
    gffile = fastafile.rsplit(".", 1)[0] + ".gff"
    if need_update(fastafile, gffile):
        fw = open(gffile, "w")
        f = Fasta(fastafile, lazy=True)
        for key, size in f.itersizes_ordered():
            print >> fw, "\t".join(str(x) for x in (key, "dummy", "transcript",\
                1, size, ".", ".", ".", "ID=" + key))
        fw.close()
        logging.debug("Dummy GFF created: {0}".format(gffile))

    cmd = "cuffdiff {0} {1}".format(gffile, " ".join(bamfiles))
    sh(cmd)
def correct_pairs(p, pf, tag):
    """
    Take one pair of reads and correct to generate *.corr.fastq.
    """
    from jcvi.assembly.preprocess import correct as cr

    logging.debug("Work on {0} ({1})".format(pf, ','.join(p)))
    itag = tag[0]
    cm = ".".join((pf, itag))
    targets = (cm + ".1.corr.fastq", cm + ".2.corr.fastq", \
               pf + ".PE-0.corr.fastq")
    if not need_update(p, targets):
        logging.debug("Corrected reads found: {0}. Skipped.".format(targets))
        return

    slink(p, pf, tag)

    cwd = os.getcwd()
    os.chdir(pf)
    cr(sorted(glob("*.fastq") + glob("*.fastq.gz")) + ["--nofragsdedup"])
    sh("mv {0}.1.corr.fastq ../{1}".format(itag, targets[0]))
    sh("mv {0}.2.corr.fastq ../{1}".format(itag, targets[1]))
    sh("mv frag_reads_corr.corr.fastq ../{0}".format(targets[2]))
    logging.debug("Correction finished: {0}".format(targets))
    os.chdir(cwd)
def prepare(args):
    """
    %prog prepare jira.txt

    Parse JIRA report and prepare input. Look for all FASTQ files in the
    report and get the prefix. Assign fastq to a folder and a new file name
    indicating the library type (e.g. PE-500, MP-5000, etc.).

    Note that JIRA report can also be a list of FASTQ files.
    """
    p = OptionParser(prepare.__doc__)
    p.add_option("--first", default=0, type="int",
                 help="Use only first N reads [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    jfile, = args
    metafile = jfile + ".meta"

    if need_update(jfile, metafile):
        fp = open(jfile)
        fastqfiles = [x.strip() for x in fp if ".fastq" in x]
        metas = [Meta(x) for x in fastqfiles]

        fw = open(metafile, "w")
        print >> fw, "\n".join(str(x) for x in metas)
        print >> sys.stderr, "Now modify `{0}`, and restart this script.".format(metafile)
        print >> sys.stderr, "Each line is : genome library fastqfile"
        fw.close()
        return

    mf = MetaFile(metafile)
    for m in mf:
        m.make_link(firstN=opts.first)
def rmdup(args):
    """
    %prog rmdup *.bam > rmdup.cmds

    Remove PCR duplicates from BAM files, generate a list of commands.
    """
    p = OptionParser(rmdup.__doc__)
    p.add_option("-S", default=False, action="store_true",
                 help="Treat PE reads as SE in rmdup")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    bams = args
    cmd = "samtools rmdup"
    if opts.S:
        cmd += " -S"
    for b in bams:
        if "rmdup" in b:
            continue
        rb = b.rsplit(".", 1)[0] + ".rmdup.bam"
        if not need_update(b, rb):
            continue
        print " ".join((cmd, b, rb))
def split(self, N, force=False):
    """
    Split the records into N chunks. There are three modes:
    - batch: split sequentially into chunks of ~records/N each
    - cycle: deal each record into the split files in turn
    - optimal: balance the chunks by total record length (LPT)
    Use `cycle` or `optimal` if the record lengths are unevenly distributed.
    """
    mode = self.mode
    assert mode in ("batch", "cycle", "optimal")
    logging.debug("set split mode=%s" % mode)

    self.names = self.__class__.get_names(self.filename, N)
    if self.outputdir:
        self.names = [op.join(self.outputdir, x) for x in self.names]

    if not need_update(self.filename, self.names) and not force:
        logging.error("file %s already exists, skip file splitting" %
                      self.names[0])
        return

    filehandles = [open(x, "w") for x in self.names]

    if mode == "batch":
        for batch, fw in zip(self._batch_iterator(N), filehandles):
            count = self.write(fw, batch)
            logging.debug("write %d records to %s" % (count, fw.name))

    elif mode == "cycle":
        handle = self._open(self.filename)
        for record, fw in zip(handle, cycle(filehandles)):
            count = self.write(fw, [record])

    elif mode == "optimal":
        """
        This mode is based on the Longest Processing Time (LPT) algorithm:
        a simple, often-used heuristic that sorts the jobs by processing
        time and assigns each to the machine with the earliest end time so
        far. It achieves an upper bound of 4/3 - 1/(3m) OPT.

        Citation: <http://en.wikipedia.org/wiki/Multiprocessor_scheduling>
        """
        endtime = [0] * N
        handle = self._open(self.filename)
        for record in handle:
            mt, mi = min((x, i) for (i, x) in enumerate(endtime))
            fw = filehandles[mi]
            count = self.write(fw, [record])
            endtime[mi] += len(record)

    for fw in filehandles:
        fw.close()
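# The "optimal" mode above applies the LPT idea greedily while streaming,
# without first sorting records by length. For reference, a minimal
# standalone sketch of classic LPT (sort jobs by decreasing size, then
# assign each to the least-loaded bin); a toy illustration, not the
# streaming variant used by split():

def lpt_partition(sizes, N):
    """Partition job sizes into N bins, balancing the total size per bin."""
    bins = [[] for _ in range(N)]
    loads = [0] * N
    for size in sorted(sizes, reverse=True):
        i = loads.index(min(loads))  # the bin with the earliest end time
        bins[i].append(size)
        loads[i] += size
    return bins

# e.g. lpt_partition([7, 5, 4, 3, 1], 2) -> [[7, 3], [5, 4, 1]]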
def __init__(self, bedfile, sizesfile):
    from jcvi.apps.command import BDPATH
    from jcvi.formats.bed import sort

    sortedbedfile = bedfile.rsplit(".", 1)[0] + ".sorted.bed"
    if need_update(bedfile, sortedbedfile):
        sort([bedfile])

    bedfile = sortedbedfile
    coveragefile = bedfile + ".coverage"
    if need_update(bedfile, coveragefile):
        cmd = BDPATH("genomeCoverageBed")
        cmd += " -bg -i {0} -g {1}".format(bedfile, sizesfile)
        sh(cmd, outfile=coveragefile)

    self.sizes = Sizes(sizesfile).mapping

    filename = coveragefile
    assert filename.endswith(".coverage")
    super(Coverage, self).__init__(filename)
def check_index(dbfile):
    dbfile = get_abs_path(dbfile)
    safile = dbfile + ".1.bt2"
    if need_update(dbfile, safile):
        cmd = "bowtie2-build {0} {0}".format(dbfile)
        sh(cmd)
    else:
        logging.error(
            "`{0}` exists. `bowtie2-build` already run.".format(safile))

    return dbfile
def validate(args):
    """
    %prog validate outdir genome.fasta

    Validate current folder after MAKER run and check for failures. Failed
    batch will be written to a directory for additional work.
    """
    p = OptionParser(validate.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    outdir, genome = args
    counter = Counter()

    fsnames, suffix = get_fsnames(outdir)
    dsfile = "{0}{1}/{0}.maker.output/{0}_master_datastore_index.log"
    dslogs = [dsfile.format(x, suffix) for x in fsnames]
    all_failed = []
    for f, d in zip(fsnames, dslogs):
        dslog = DatastoreIndexFile(d)
        counter.update(dslog.scaffold_status.values())
        all_failed.extend([(f, x) for x in dslog.failed])

    cmd = 'tail maker.*.out | grep -c "now finished"'
    n = int(popen(cmd).read())
    assert len(fsnames) == n
    print("ALL jobs have been finished", file=sys.stderr)

    nfailed = len(all_failed)
    if nfailed == 0:
        print("ALL scaffolds are completed with no errors", file=sys.stderr)
        return

    print("Scaffold status:", file=sys.stderr)
    print(counter, file=sys.stderr)
    failed = "FAILED"
    fw = open(failed, "w")
    print("\n".join(["\t".join((f, x)) for f, x in all_failed]), file=fw)
    fw.close()

    nlines = sum(1 for _ in open(failed))
    assert nlines == nfailed
    print("FAILED !! {0} instances.".format(nfailed), file=sys.stderr)

    # Rebuild the failed batch
    failed_ids = failed + ".ids"
    failed_fasta = failed + ".fasta"
    cmd = "cut -f2 {0}".format(failed)
    sh(cmd, outfile=failed_ids)
    if need_update((genome, failed_ids), failed_fasta):
        cmd = "faSomeRecords {} {} {}".format(genome, failed_ids, failed_fasta)
        sh(cmd)
def intersectBed(bedfile1, bedfile2):
    cmd = "intersectBed"
    cmd += " -a {0} -b {1}".format(bedfile1, bedfile2)

    suffix = ".intersect.bed"
    intersectbedfile = ".".join((op.basename(bedfile1).split(".")[0],
                                 op.basename(bedfile2).split(".")[0])) + suffix

    if need_update([bedfile1, bedfile2], intersectbedfile):
        sh(cmd, outfile=intersectbedfile)
    return intersectbedfile
def get_bed_file(gff_file, stype, key):
    from jcvi.formats.gff import bed

    opr = stype.replace(",", "") + ".bed"
    bed_opts = ["--type=" + stype, "--key=" + key]
    bed_file = ".".join((gff_file.split(".")[0], opr))

    if need_update(gff_file, bed_file):
        bed([gff_file, "--outfile={0}".format(bed_file)] + bed_opts)

    return bed_file
def mergeBed(bedfile, d=0, nms=False):
    cmd = "mergeBed -i {0}".format(bedfile)
    if d:
        cmd += " -d {0}".format(d)
    if nms:
        cmd += " -nms"

    mergebedfile = op.basename(bedfile).rsplit(".", 1)[0] + ".merge.bed"
    if need_update(bedfile, mergebedfile):
        sh(cmd, outfile=mergebedfile)
    return mergebedfile
def merylhistogram(merylfile):
    """
    Run meryl to dump histogram to be used in kmer.histogram(). The merylfile
    is the file ending in .mcidx or .mcdat.
    """
    pf, sf = op.splitext(merylfile)
    outfile = pf + ".histogram"
    if need_update(merylfile, outfile):
        cmd = "meryl -Dh -s {0}".format(pf)
        sh(cmd, outfile=outfile)

    return outfile
def coverage(args):
    """
    %prog coverage fastafile bamfile

    Calculate coverage for BAM file. BAM file will be sorted unless with
    --nosort.
    """
    p = OptionParser(coverage.__doc__)
    p.add_option(
        "--format",
        default="bigwig",
        choices=("bedgraph", "bigwig", "coverage"),
        help="Output format",
    )
    p.add_option("--nosort", default=False, action="store_true",
                 help="Do not sort BAM")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, bamfile = args
    format = opts.format
    if opts.nosort:
        logging.debug("BAM sorting skipped")
    else:
        bamfile = index([bamfile, "--fasta={0}".format(fastafile)])

    pf = bamfile.rsplit(".", 2)[0]
    sizesfile = Sizes(fastafile).filename
    cmd = "genomeCoverageBed -ibam {0} -g {1}".format(bamfile, sizesfile)
    if format in ("bedgraph", "bigwig"):
        cmd += " -bg"
        bedgraphfile = pf + ".bedgraph"
        sh(cmd, outfile=bedgraphfile)

        if format == "bedgraph":
            return bedgraphfile

        bigwigfile = pf + ".bigwig"
        cmd = "bedGraphToBigWig {0} {1} {2}".format(
            bedgraphfile, sizesfile, bigwigfile)
        sh(cmd)
        return bigwigfile

    coveragefile = pf + ".coverage"
    if need_update(fastafile, coveragefile):
        sh(cmd, outfile=coveragefile)

    gcf = GenomeCoverageFile(coveragefile)
    fw = must_open(opts.outfile, "w")
    for seqid, cov in gcf.iter_coverage_seqid():
        print("\t".join((seqid, "{0:.1f}".format(cov))), file=fw)
    fw.close()
def __init__(self, filename, index=False):
    super(Maf, self).__init__(filename)

    indexfile = filename + ".idx"
    if index:
        if need_update(filename, indexfile):
            self.build_index(filename, indexfile)

        self.index = maf.Index(filename, indexfile)

    fp = open(filename)
    self.reader = maf.Reader(fp)
def fill(args):
    """
    %prog fill frag_reads_corr.fastb

    Run FillFragments on `frag_reads_corr.fastb`.
    """
    p = OptionParser(fill.__doc__)
    p.add_option("--stretch", default=3, type="int",
                 help="MAX_STRETCH to pass to FillFragments [default: %default]")
    p.add_option("--cpus", default=32, type="int",
                 help="Number of threads to run [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastb, = args
    assert fastb == "frag_reads_corr.fastb"

    pcfile = "frag_reads_corr.k28.pc.info"
    nthreads = " NUM_THREADS={0}".format(opts.cpus)
    maxstretch = " MAX_STRETCH={0}".format(opts.stretch)
    if need_update(fastb, pcfile):
        cmd = "PathReads READS_IN=frag_reads_corr"
        cmd += nthreads
        sh(cmd)

    filledfastb = "filled_reads.fastb"
    if need_update(pcfile, filledfastb):
        cmd = "FillFragments PAIRS_OUT=frag_reads_corr_cpd"
        cmd += " PRECORRECT_LIBSTATS=True"
        cmd += maxstretch
        cmd += nthreads
        sh(cmd)

    filledfasta = "filled_reads.fasta"
    if need_update(filledfastb, filledfasta):
        cmd = "Fastb2Fasta IN=filled_reads.fastb OUT=filled_reads.fasta"
        sh(cmd)
def correct_frag(datadir, tag, origfastb, nthreads,
                 dedup=False, haploidify=False):
    filt = datadir + "/{0}_filt".format(tag)
    filtfastb = filt + ".fastb"
    run_RemoveDodgyReads(infile=origfastb, outfile=filtfastb,
                         removeDuplicates=dedup, rc=False, nthreads=nthreads)

    filtpairs = filt + ".pairs"
    edit = datadir + "/{0}_edit".format(tag)
    editpairs = edit + ".pairs"
    if need_update(filtpairs, editpairs):
        cmd = "ln -sf {0} {1}.pairs".format(op.basename(filtpairs), edit)
        sh(cmd)

    editfastb = edit + ".fastb"
    if need_update(filtfastb, editfastb):
        cmd = "FindErrors HEAD_IN={0} HEAD_OUT={1}".format(filt, edit)
        cmd += " PLOIDY_FILE=data/ploidy"
        cmd += nthreads
        sh(cmd)

    corr = datadir + "/{0}_corr".format(tag)
    corrfastb = corr + ".fastb"
    if need_update(editfastb, corrfastb):
        cmd = "CleanCorrectedReads DELETE=True"
        cmd += " HEAD_IN={0} HEAD_OUT={1}".format(edit, corr)
        cmd += " PLOIDY_FILE={0}/ploidy".format(datadir)
        if haploidify:
            cmd += " HAPLOIDIFY=True"
        cmd += nthreads
        sh(cmd)

    export_fastq(datadir, corrfastb)
def fasta(args):
    """
    %prog fasta fastqfiles

    Convert fastq to fasta and qual file.
    """
    p = OptionParser(fasta.__doc__)
    p.add_option("--seqtk", default=False, action="store_true",
                 help="Use seqtk to convert")
    p.set_outdir()
    p.set_outfile(outfile=None)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastqfiles = args
    outdir = opts.outdir
    if outdir and outdir != ".":
        mkdir(outdir)

    fastqfile = fastqfiles[0]
    pf = op.basename(fastqfile)
    gzinput = pf.endswith(".gz")
    if gzinput:
        pf = pf.rsplit(".", 1)[0]

    pf, sf = pf.rsplit(".", 1)
    if sf not in ("fq", "fastq"):
        logging.debug("Assumed FASTA: suffix not `fq` or `fastq`")
        return fastqfile, None

    fastafile, qualfile = pf + ".fasta", pf + ".qual"
    outfile = opts.outfile or fastafile
    outfile = op.join(outdir, outfile)
    if opts.seqtk:
        if need_update(fastqfiles, outfile):
            for i, fastqfile in enumerate(fastqfiles):
                cmd = "seqtk seq -A {0} -L 30 -l 70".format(fastqfile)
                # First one creates file, following ones append to it
                sh(cmd, outfile=outfile, append=i)
        else:
            logging.debug("Outfile `{0}` already exists.".format(outfile))
        return outfile, None

    for fastqfile in fastqfiles:
        SeqIO.convert(fastqfile, "fastq", fastafile, "fasta")
        SeqIO.convert(fastqfile, "fastq", qualfile, "qual")

    return fastafile, qualfile
def fastaFromBed(bedfile, fastafile, name=False, stranded=False):
    outfile = op.basename(bedfile).rsplit(".", 1)[0] + ".fasta"
    cmd = "fastaFromBed -fi {0} -bed {1} -fo {2}".\
            format(fastafile, bedfile, outfile)
    if name:
        cmd += " -name"
    if stranded:
        cmd += " -s"

    if need_update([bedfile, fastafile], outfile):
        sh(cmd, outfile=outfile)

    return outfile
def make_index(gff_file):
    """
    Make a sqlite database for fast retrieval of features.
    """
    import GFFutils

    db_file = gff_file + ".db"
    if need_update(gff_file, db_file):
        if op.exists(db_file):
            os.remove(db_file)
        GFFutils.create_gffdb(gff_file, db_file)

    return GFFutils.GFFDB(db_file)
def star(args):
    """
    %prog star folder reference

    Run star on a folder with reads.
    """
    p = OptionParser(star.__doc__)
    p.add_option("--single", default=False, action="store_true",
                 help="Single end mapping")
    p.set_fastq_names()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    folder, reference = args
    cpus = opts.cpus
    mm = MakeManager()

    num = 1 if opts.single else 2
    gd = "GenomeDir"
    mkdir(gd)
    STAR = "STAR --runThreadN {0} --genomeDir {1}".format(cpus, gd)

    # Step 0: build genome index
    genomeidx = op.join(gd, "Genome")
    if need_update(reference, genomeidx):
        cmd = STAR + " --runMode genomeGenerate"
        cmd += " --genomeFastaFiles {0}".format(reference)
        mm.add(reference, genomeidx, cmd)

    # Step 1: align
    for p, prefix in iter_project(folder, opts.names, num):
        pf = "{0}_star".format(prefix)
        bamfile = pf + "Aligned.sortedByCoord.out.bam"
        cmd = STAR + " --readFilesIn {0}".format(" ".join(p))
        if p[0].endswith(".gz"):
            cmd += " --readFilesCommand zcat"
        cmd += " --outSAMtype BAM SortedByCoordinate"
        cmd += " --outFileNamePrefix {0}".format(pf)
        cmd += " --twopassMode Basic"
        # Compatibility for cufflinks
        cmd += " --outSAMstrandField intronMotif"
        cmd += " --outFilterIntronMotifs RemoveNoncanonical"
        mm.add(p, bamfile, cmd)

    mm.write()
def check_txt(casfile):
    """
    Check to see if the casfile is already converted to txtfile with txt().
    """
    if casfile.endswith(".cas"):
        castabfile = casfile.replace(".cas", ".txt")
        if need_update(casfile, castabfile):
            castabfile = txt([casfile])
        else:
            logging.debug("File `{0}` found.".format(castabfile))
    else:
        castabfile = casfile

    return castabfile
def __init__(self, bedfile, sizesfile):
    bedfile = sort([bedfile])
    coveragefile = bedfile + ".coverage"
    if need_update(bedfile, coveragefile):
        cmd = "genomeCoverageBed"
        cmd += " -bg -i {0} -g {1}".format(bedfile, sizesfile)
        sh(cmd, outfile=coveragefile)

    self.sizes = Sizes(sizesfile).mapping

    filename = coveragefile
    assert filename.endswith(".coverage")
    super(Coverage, self).__init__(filename)
def split_fastafile(fastafile, maxreadlen=32000):
    pf = fastafile.split(".")[0]
    smallfastafile = pf + "-small.fasta"
    bigfastafile = pf + "-big.fasta"
    shredfastafile = pf + "-big.depth1.fasta"

    if need_update(fastafile, (smallfastafile, shredfastafile)):
        filter([fastafile, str(maxreadlen), "--less", "-o", smallfastafile])
        filter([fastafile, str(maxreadlen), "-o", bigfastafile])
        shred(["--depth=1", "--shift={0}".format(maxreadlen / 100), \
               "--readlen={0}".format(maxreadlen), \
               "--fasta", bigfastafile])

    return smallfastafile, shredfastafile
def check_index(dbfile, supercat=False, go=True):
    if supercat:
        updated = False
        pf = dbfile.rsplit(".", 1)[0]
        supercatfile = pf + ".supercat"
        coordsfile = supercatfile + ".coords"
        if go and need_update(dbfile, supercatfile):
            cmd = "tGBS-Generate_Pseudo_Genome.pl"
            cmd += " -f {0} -o {1}".format(dbfile, supercatfile)
            sh(cmd)
            # Rename .coords file since gmap_build will overwrite it
            coordsbak = backup(coordsfile)
            updated = True
        dbfile = supercatfile + ".fasta"

    #dbfile = get_abs_path(dbfile)
    dbdir, filename = op.split(dbfile)
    if not dbdir:
        dbdir = "."
    dbname = filename.rsplit(".", 1)[0]
    safile = op.join(dbdir, "{0}/{0}.genomecomp".format(dbname))
    if dbname == filename:
        dbname = filename + ".db"

    if not go:
        return dbdir, dbname

    if need_update(dbfile, safile):
        cmd = "gmap_build -D {0} -d {1} {2}".format(dbdir, dbname, filename)
        sh(cmd)
    else:
        logging.error("`{0}` exists. `gmap_build` already run.".format(safile))

    if go and supercat and updated:
        sh("mv {0} {1}".format(coordsbak, coordsfile))

    return dbdir, dbname
def check_aln(dbfile, readfile, cpus=32):
    from jcvi.formats.fastq import guessoffset

    saifile = readfile.rsplit(".", 1)[0] + ".sai"
    if need_update((dbfile, readfile), saifile):
        offset = guessoffset([readfile])
        cmd = "bwa aln " + " ".join((dbfile, readfile))
        cmd += " -t {0}".format(cpus)
        if offset == 64:
            cmd += " -I"
        sh(cmd, outfile=saifile)
    else:
        logging.error("`{0}` exists. `bwa aln` already run.".format(saifile))

    return saifile
def run_compile(arg):
    filename, filtered, cleanup, store = arg
    csvfile = filename + ".csv"
    try:
        if filename.startswith("s3://"):
            if check_exists_s3(csvfile):
                logging.debug("{} exists. Skipped.".format(csvfile))
            else:
                write_csv_ev(filename, filtered, cleanup, store=store)
                logging.debug("{} written and uploaded.".format(csvfile))
        else:
            if need_update(filename, csvfile):
                write_csv_ev(filename, filtered, cleanup, store=None)
    except Exception as e:
        logging.debug("Thread failed! Error: {}".format(e))
def run_filter(arg):
    vcffile, lhome, store = arg
    filteredvcf = vcffile.replace(".vcf", ".filtered.vcf")
    try:
        if vcffile.startswith("s3://"):
            if check_exists_s3(filteredvcf):
                logging.debug("{} exists. Skipped.".format(filteredvcf))
            else:
                write_filtered(vcffile, lhome, store=store)
                logging.debug("{} written and uploaded.".format(filteredvcf))
        else:
            if need_update(vcffile, filteredvcf):
                write_filtered(vcffile, lhome, store=None)
    except Exception as e:
        logging.debug("Thread failed! Error: {}".format(e))
def check_index(dbfile):
    dbfile = get_abs_path(dbfile)
    dbdir, filename = op.split(dbfile)
    if not dbdir:
        dbdir = "."
    dbname = filename.rsplit(".", 1)[0]
    safile = op.join(dbdir, "{0}/{0}.salcpchilddc".format(dbname))
    if dbname == filename:
        dbname = filename + ".db"

    if need_update(dbfile, safile):
        cmd = "gmap_build -D {0} -d {1} {2}".format(dbdir, dbname, filename)
        sh(cmd)
    else:
        logging.error("`{0}` exists. `gmap_build` already run.".format(safile))

    return dbdir, dbname
def split_fastafile(fastafile, maxreadlen=32000):
    from jcvi.formats.fasta import filter

    pf = fastafile.split(".")[0]
    smallfastafile = pf + "-small.fasta"
    bigfastafile = pf + "-big.fasta"
    shredfastafile = pf + "-big.depth1.fasta"

    maxreadlen = str(maxreadlen)
    if need_update(fastafile, (smallfastafile, shredfastafile)):
        filter([fastafile, maxreadlen, "--less", "-o", smallfastafile])
        filter([fastafile, maxreadlen, "-o", bigfastafile])
        shred(["--depth=1", "--readlen={0}".format(maxreadlen), \
               "--fasta", bigfastafile])

    return smallfastafile, shredfastafile
def build(args):
    """
    %prog build input.bed scaffolds.fasta

    Build associated genome FASTA file and CHAIN file that can be used to lift
    old coordinates to new coordinates. The CHAIN file will be used to lift the
    original marker positions to new positions in the reconstructed genome. The
    new positions of the markers will be reported in *.lifted.bed.
    """
    p = OptionParser(build.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    inputbed, scaffolds = args
    pf = inputbed.rsplit(".", 1)[0]
    mapbed = pf + ".bed"
    chr_agp = pf + ".chr.agp"
    chr_fasta = pf + ".chr.fasta"
    if need_update((chr_agp, scaffolds), chr_fasta):
        agp_build([chr_agp, scaffolds, chr_fasta])

    unplaced_agp = pf + ".unplaced.agp"
    if need_update((chr_agp, scaffolds), unplaced_agp):
        write_unplaced_agp(chr_agp, scaffolds, unplaced_agp)

    unplaced_fasta = pf + ".unplaced.fasta"
    if need_update((unplaced_agp, scaffolds), unplaced_fasta):
        agp_build([unplaced_agp, scaffolds, unplaced_fasta])

    combined_agp = pf + ".agp"
    if need_update((chr_agp, unplaced_agp), combined_agp):
        FileMerger((chr_agp, unplaced_agp), combined_agp).merge()

    combined_fasta = pf + ".fasta"
    if need_update((chr_fasta, unplaced_fasta), combined_fasta):
        FileMerger((chr_fasta, unplaced_fasta), combined_fasta).merge()

    chainfile = pf + ".chain"
    if need_update((combined_agp, scaffolds, combined_fasta), chainfile):
        fromagp([combined_agp, scaffolds, combined_fasta])

    liftedbed = mapbed.rsplit(".", 1)[0] + ".lifted.bed"
    if need_update((mapbed, chainfile), liftedbed):
        cmd = "liftOver -minMatch=1 {0} {1} {2} unmapped".\
                format(mapbed, chainfile, liftedbed)
        sh(cmd)

    sort([liftedbed, "-i"])  # Sort bed in place
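# Dataflow of build() above, with pf derived from input.bed (names follow
# the code; agp_build, write_unplaced_agp, FileMerger and fromagp are
# helpers defined elsewhere in the package):
#
#   input.bed + scaffolds.fasta
#     -> pf.chr.agp/.fasta and pf.unplaced.agp/.fasta
#     -> merged pf.agp and pf.fasta
#     -> pf.chain via fromagp, after which liftOver rewrites pf.bed
#        coordinates into pf.lifted.bed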
def alignextend(args):
    """
    %prog alignextend ref.fasta read.1.fastq read.2.fastq

    Wrapper around AMOS alignextend.
    """
    choices = "prepare,align,filter,rmdup,genreads".split(",")
    p = OptionParser(alignextend.__doc__)
    p.add_option("--nosuffix", default=False, action="store_true",
                 help="Do not add /1/2 suffix to the read [default: %default]")
    p.add_option("--rc", default=False, action="store_true",
                 help="Reverse complement the reads before alignment")
    p.add_option("--len", default=100, type="int",
                 help="Extend to this length")
    p.add_option("--stage", default="prepare", choices=choices,
                 help="Start from certain stage")
    p.add_option("--dup", default=10, type="int",
                 help="Filter duplicates with coordinates within this distance")
    p.add_option("--maxdiff", default=1, type="int",
                 help="Maximum number of differences")
    p.set_home("amos")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ref, r1, r2 = args
    pf = op.basename(r1).split(".")[0]
    cmd = op.join(opts.amos_home, "src/Experimental/alignextend.pl")
    if not opts.nosuffix:
        cmd += " -suffix"
    bwa_idx = "{0}.ref.fa.sa".format(pf)
    if not need_update(ref, bwa_idx):
        cmd += " -noindex"
    cmd += " -threads {0}".format(opts.cpus)
    offset = guessoffset([r1])
    if offset == 64:
        cmd += " -I"
    if opts.rc:
        cmd += " -rc"
    cmd += " -allow -len {0} -dup {1}".format(opts.len, opts.dup)
    cmd += " -min {0} -max {1}".format(2 * opts.len, 20 * opts.len)
    cmd += " -maxdiff {0}".format(opts.maxdiff)
    cmd += " -stage {0}".format(opts.stage)
    cmd += " ".join(("", pf, ref, r1, r2))
    sh(cmd)
def merge(self, checkexists=False):
    outfile = self.outfile
    if checkexists and not need_update(self.filelist, outfile):
        logging.debug("File `{0}` exists. Merge skipped.".format(outfile))
        return

    files = " ".join(self.filelist)
    ingz, outgz = self.ingz, self.outgz
    if ingz and outgz:  # can merge gz files directly
        cmd = "cat {0} > {1}".format(files, outfile)
        sh(cmd)
    else:
        cmd = "zcat" if self.ingz else "cat"
        cmd += " " + files
        sh(cmd, outfile=outfile)

    return outfile
def gmap(args):
    """
    %prog gmap database.fasta fastafile

    Wrapper for `gmap`.
    """
    p = OptionParser(gmap.__doc__)
    p.add_option("--cross", default=False, action="store_true",
                 help="Cross-species alignment")
    p.add_option(
        "--npaths",
        default=0,
        type="int",
        help="Maximum number of paths to show."
             " If set to 0, prints two paths if chimera"
             " detected, else one.",
    )
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    dbfile, fastafile = args
    assert op.exists(dbfile) and op.exists(fastafile)
    prefix = get_prefix(fastafile, dbfile)
    logfile = prefix + ".log"
    gmapfile = prefix + ".gmap.gff3"

    if not need_update((dbfile, fastafile), gmapfile):
        logging.error("`{0}` exists. `gmap` already run.".format(gmapfile))
    else:
        dbdir, dbname = check_index(dbfile)
        cmd = "gmap -D {0} -d {1}".format(dbdir, dbname)
        cmd += " -f 2 --intronlength=100000"  # Output format 2
        cmd += " -t {0}".format(opts.cpus)
        cmd += " --npaths {0}".format(opts.npaths)
        if opts.cross:
            cmd += " --cross-species"
        cmd += " " + fastafile

        sh(cmd, outfile=gmapfile, errfile=logfile)

    return gmapfile, logfile
def __init__(self, filename, select=None):
    assert op.exists(filename), "File `{0}` not found".format(filename)

    # filename can be both .sizes file or FASTA formatted file
    sizesname = filename

    if not filename.endswith(".sizes"):
        sizesname = filename + ".sizes"
        filename = get_abs_path(filename)
        if need_update(filename, sizesname):
            cmd = "faSize"
            if which(cmd):
                cmd += " -detailed {0}".format(filename)
                sh(cmd, outfile=sizesname)
            else:
                from jcvi.formats.fasta import Fasta

                f = Fasta(filename)
                fw = open(sizesname, "w")
                for k, size in f.itersizes_ordered():
                    print("\t".join((k, str(size))), file=fw)
                fw.close()

        filename = sizesname

    assert filename.endswith(".sizes")

    super(Sizes, self).__init__(filename)
    self.fp = open(filename)
    self.filename = filename

    # get sizes for individual contigs, both in list and dict
    # this is to preserve the input order in the sizes file
    sizes = list(self.iter_sizes())
    if select:
        assert select > 0
        sizes = [x for x in sizes if x[1] >= select]
    self.sizes_mapping = dict(sizes)

    # get cumulative sizes, both in list and dict
    ctgs, sizes = zip(*sizes)
    self.sizes = sizes
    cumsizes = np.cumsum([0] + list(sizes))
    self.ctgs = ctgs
    self.cumsizes = cumsizes
    self.cumsizes_mapping = dict(zip(ctgs, cumsizes))
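# Hypothetical usage of Sizes, assuming genome.fasta exists on disk:
#
#   sz = Sizes("genome.fasta")            # writes genome.fasta.sizes if stale
#   total = sz.cumsizes[-1]               # total length over all contigs
#   offset = sz.cumsizes_mapping["chr1"]  # start offset of chr1 in the
#                                         # concatenated coordinate system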