def pasa(args):
    """
    %prog pasa pasa_db fastafile

    Run EVM in TIGR-only mode.
    """
    p = OptionParser(pasa.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    pasa_db, fastafile = args

    termexons = "pasa.terminal_exons.gff3"
    if not need_update(fastafile, termexons):
        # Output is already newer than the FASTA input; nothing to run.
        return termexons

    # First command: run the PASA training-set script against the database.
    training_cmd = " ".join((
        "$ANNOT_DEVEL/PASA2/scripts/pasa_asmbls_to_training_set.dbi",
        '-M "{0}:mysql.tigr.org" -p "access:access"'.format(pasa_db),
        "-g {0}".format(fastafile),
    ))
    sh(training_cmd)

    # Second command: its stdout is captured as the terminal exons GFF3.
    exons_cmd = " ".join((
        "$EVM/PasaUtils/retrieve_terminal_CDS_exons.pl",
        "trainingSetCandidates.fasta",
        "trainingSetCandidates.gff",
    ))
    sh(exons_cmd, outfile=termexons)

    return termexons
def query(args):
    """
    %prog query frgscf.sorted scfID:start-end

    Query certain region to get frg placement, using random access. Build index
    if not present.
    """
    p = OptionParser(query.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        # print_help() returns None, so `not ...` makes a usage error exit
        # with a non-zero status, like the other commands in this file.
        sys.exit(not p.print_help())

    frgscffile, region = args
    gzfile = frgscffile + ".gz"
    tbifile = gzfile + ".tbi"
    outfile = region + ".posmap"

    if not (op.exists(gzfile) and op.exists(tbifile)):
        # index() parses an argv-style list; a bare string would be consumed
        # character by character (or rejected) by the option parser.
        index([frgscffile])

    assert op.exists(gzfile) and op.exists(tbifile)

    # Random-access lookup of the region; tabix output goes to <region>.posmap
    cmd = "tabix {0} {1}".format(gzfile, region)
    sh(cmd, outfile=outfile)
def bwasw(args):
    """
    %prog bwasw database.fasta long_read.fastq

    Wrapper for `bwa bwasw`. Output will be long_read.sam.
    """
    p = OptionParser(bwasw.__doc__)
    set_params(p)
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        # Exit non-zero on a usage error (print_help() returns None).
        sys.exit(not p.print_help())

    extra = opts.extra
    grid = opts.grid

    dbfile, readfile = args
    # Both helpers are still invoked as before; their return values were
    # never used by this function, so they are no longer bound to names.
    check_index(dbfile, grid=grid)
    check_aln(dbfile, readfile, grid=grid)

    samfile = readfile.rsplit(".", 1)[0] + ".sam"
    if op.exists(samfile):
        logging.error("`{0}` exists. `bwa bwasw` already run.".format(samfile))
        return

    cmd = "bwa bwasw -t 32 {0} {1} ".format(dbfile, readfile)
    cmd += "{0}".format(extra)
    sh(cmd, grid=grid, outfile=samfile)
def test(args):
    """
    %prog test unitig{partID}.{unitigID}

    For example, `%prog test unitig5.530` will test the modified `unitig530`
    """
    p = OptionParser(test.__doc__)
    p.add_option("--verbose", default=False, action="store_true",
                 help="Turn on verbose debugging [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        # Exit non-zero on a usage error (print_help() returns None).
        sys.exit(not p.print_help())

    prefix = get_prefix()
    s, = args
    partID, unitigID = get_ID(s)

    cmd = CAPATH("utgcns")
    cmd += " -g ../{0}.gkpStore -t ../{0}.tigStore 1".format(prefix)
    cmd += " {0} -T {1}".format(partID, s)
    if opts.verbose:
        cmd += " -V -V"

    # stderr of the run is captured in <unitig>.log
    cmd += " -V -v 2> {0}.log".format(s)
    sh(cmd)

    # Show log
    cmd = "tail {0}.log".format(s)
    sh(cmd)
def run_megablast(infile=None, outfile=None, db=None, wordsize=None,
                  pctid=98, hitlen=100, best=None, evalue=0.01,
                  task="megablast", cpus=16):
    """
    Run `blastn` of `infile` against `db`, writing tabular hits to `outfile`.

    The database is formatted first via run_formatdb(). When both `pctid` and
    `hitlen` are truthy, the hits are passed through run_blast_filter() and
    the filtered file replaces `outfile`.
    """
    assert db, "Need to specify database fasta file."

    db = get_abs_path(db)
    # Use the `.00.nin` index name when that file exists, else plain `.nin`
    split_index = db + ".00.nin"
    nin = split_index if op.exists(split_index) else (db + ".nin")
    run_formatdb(infile=db, outfile=nin)

    pieces = [
        "blastn",
        " -query {0} -db {1} -out {2}".format(infile, db, outfile),
        " -evalue {0} -outfmt 6 -num_threads {1}".format(evalue, cpus),
        " -task {0}".format(task),
    ]
    if wordsize:
        pieces.append(" -word_size {0}".format(wordsize))
    if pctid:
        pieces.append(" -perc_identity {0}".format(pctid))
    if best:
        pieces.append(" -max_target_seqs {0}".format(best))
    sh("".join(pieces))

    if not (pctid and hitlen):
        return

    # Filter into a temporary sibling file, then move it over the raw output
    filtered = outfile + ".P{0}L{1}".format(pctid, hitlen)
    run_blast_filter(infile=outfile, outfile=filtered,
                     pctid=pctid, hitlen=hitlen)
    shutil.move(filtered, outfile)
def index(args):
    """
    %prog index frgscf.sorted

    Compress frgscffile.sorted and index it using `tabix`.
    """
    p = OptionParser(index.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        # Exit non-zero on a usage error (print_help() returns None).
        sys.exit(not p.print_help())

    frgscffile, = args
    gzfile = frgscffile + ".gz"

    # Compress only when the .gz is missing
    if not op.exists(gzfile):
        sh("bgzip -c {0}".format(frgscffile), outfile=gzfile)

    tbifile = gzfile + ".tbi"

    # Sequence, begin, end in 2, 3, 4-th column, respectively
    if not op.exists(tbifile):
        sh("tabix -s 2 -b 3 -e 4 {0}".format(gzfile))
def bam(args):
    """
    %prog bam input.gsnap ref.fasta

    Convert GSNAP output to BAM.
    """
    from jcvi.formats.sizes import Sizes
    from jcvi.formats.sam import index

    # The docstring doubles as the usage text, so it must name this command
    # (`bam`), not `snp`.
    p = OptionParser(bam.__doc__)
    p.set_home("eddyyeh")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gsnapfile, fastafile = args
    EYHOME = opts.eddyyeh_home
    pf = gsnapfile.rsplit(".", 1)[0]
    uniqsam = pf + ".unique.sam"

    # Rebuild the SAM only when either input is newer than it
    if need_update((gsnapfile, fastafile), uniqsam):
        cmd = op.join(EYHOME, "gsnap2gff3.pl")
        sizesfile = Sizes(fastafile).filename
        cmd += " --format sam -i {0} -o {1}".format(gsnapfile, uniqsam)
        cmd += " -u -l {0} -p {1}".format(sizesfile, opts.cpus)
        sh(cmd)

    index([uniqsam])
def snp(args):
    """
    %prog snp input.gsnap

    Run SNP calling on GSNAP output after apps.gsnap.align().
    """
    p = OptionParser(snp.__doc__)
    p.set_home("eddyyeh")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gsnapfile, = args
    home = opts.eddyyeh_home
    stem = gsnapfile.rsplit(".", 1)[0]

    # Stage 1: GSNAP -> native format
    nativefile = stem + ".native"
    if need_update(gsnapfile, nativefile):
        converter = op.join(home, "convert2native.pl")
        sh("{0} --gsnap {1} -o {2} -proc {3}".format(
            converter, gsnapfile, nativefile, opts.cpus))

    # Stage 2: native format -> SNP calls
    snpfile = stem + ".snp"
    if need_update(nativefile, snpfile):
        caller = op.join(home, "SNPs/SNP_Discovery-short.pl")
        sh("{0} --native {1} -o {2} -a 2 -ac 0.3 -c 0.8".format(
            caller, nativefile, snpfile))
def blat(args):
    """
    %prog blat old.fasta new.fasta

    Generate psl file using blat.
    """
    p = OptionParser(blat.__doc__)
    p.add_option("--minscore", default=100, type="int",
                 help="Matches minus mismatches gap penalty [default: %default]")
    p.add_option("--minid", default=98, type="int",
                 help="Minimum sequence identity [default: %default]")
    p.set_cpus()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    oldfasta, newfasta = args

    # Both inputs are converted to 2bit; only the old one is used as database
    twobits = [faToTwoBit(fastafile) for fastafile in args]
    oldtwobit = twobits[0]

    # Prefer the threaded `pblat` when it is on the PATH
    if which("pblat"):
        program = "pblat -threads={0}".format(opts.cpus)
    else:
        program = "blat"

    # Output is named <new>.<old>.psl after the first dot-separated name part
    newname = op.basename(newfasta).split('.')[0]
    oldname = op.basename(oldfasta).split('.')[0]
    pslfile = "{0}.{1}.psl".format(newname, oldname)

    cmd = "{0} {1} {2}".format(program, oldtwobit, newfasta)
    cmd += " -tileSize=12 -minScore={0} -minIdentity={1} {2}".format(
        opts.minscore, opts.minid, pslfile)
    sh(cmd)
def uclust(args):
    """
    %prog uclust fastafile

    Use `usearch` to remove duplicate reads.
    """
    p = OptionParser(uclust.__doc__)
    p.set_align(pctid=98)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    identity = opts.pctid / 100.

    # Step 1: sort by length (requires a dotted file name, as before)
    stem, ext = fastafile.rsplit(".", 1)
    sortedfastafile = stem + ".sorted.fasta"
    if need_update(fastafile, sortedfastafile):
        sh("usearch -sortbylength {0} -fastaout {1}".format(
            fastafile, sortedfastafile))

    # Step 2: cluster the sorted reads; outputs are keyed on the identity
    outprefix = fastafile + ".P{0}.uclust".format(opts.pctid)
    clstrfile = outprefix + ".clstr"
    centroidsfastafile = outprefix + ".centroids.fasta"
    if need_update(sortedfastafile, centroidsfastafile):
        cluster_cmd = " ".join((
            "usearch -cluster_smallmem {0}".format(sortedfastafile),
            "-id {0}".format(identity),
            "-uc {0} -centroids {1}".format(clstrfile, centroidsfastafile),
        ))
        sh(cluster_cmd)
def sort(args):
    """
    %prog sort bedfile

    Sort bed file to have ascending order of seqid, then start. It uses the
    `sort` command.
    """
    p = OptionParser(sort.__doc__)
    p.add_option("-i", "--inplace", dest="inplace",
                 default=False, action="store_true",
                 help="Sort bed file in place [default: %default]")
    p.add_option("--accn", default=False, action="store_true",
                 help="Sort based on the accessions [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args

    # In-place sorting overwrites the input; otherwise write <name>.sorted.bed
    # into the current directory.
    if opts.inplace:
        sortedbed = bedfile
    else:
        sortedbed = op.basename(bedfile).rsplit(".", 1)[0] + ".sorted.bed"

    # Key order: accession first with --accn, else seqid/start/accession
    if opts.accn:
        sortopt = "-k4,4 -k1,1 -k2,2n"
    else:
        sortopt = "-k1,1 -k2,2n -k4,4"

    sh("sort {0} {1} -o {2}".format(sortopt, bedfile, sortedbed))

    return sortedbed
def plot_some_queries(refs, qsizes, rsizes, deltafile, refcov,
                      prefix="out", color="similarity", layout=True):
    """
    Run `mummerplot` on `deltafile`, restricted to `refs` and the queries
    that hit them with reference coverage of at least `refcov`.

    Returns the path of the resulting PDF (`<prefix>.pdf`), or None when
    either the selected queries or `refs` is empty.
    """
    Qfile, Rfile = "Qfile", "Rfile"

    # Keep queries whose alignment passes the coverage cutoff on a wanted ref
    queries = set(c.query for c in Coords(deltafile)
                  if not c.refcov < refcov and c.ref in refs)

    if not queries or not refs:
        logging.debug("Empty - {0} vs. {1}".format(queries, refs))
        return None

    writeXfile(queries, qsizes, Qfile)
    writeXfile(refs, rsizes, Rfile)

    plot_args = [
        "mummerplot {0}".format(deltafile),
        "-Rfile {0} -Qfile {1}".format(Rfile, Qfile),
        "--postscript -p {0}".format(prefix),
    ]
    if layout:
        plot_args.append("--layout")
    if color == "similarity":
        plot_args.append("--color")
    elif color == "none":
        plot_args.append("--nocolor")
    sh(" ".join(plot_args))

    pdffile = prefix + ".pdf"
    sh("ps2pdf {0}.ps {0}.pdf".format(prefix))

    return pdffile
def txt(args):
    """
    %prog txt casfile

    convert binary CAS file to tabular output using CLC assembly_table
    """
    p = OptionParser(txt.__doc__)
    p.add_option("-m", dest="multi", default=False, action="store_true",
                 help="report multi-matches [default: %default]")
    set_grid(p)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        # Exit non-zero on a usage error (print_help() returns None).
        sys.exit(not p.print_help())

    grid = opts.grid

    casfile, = args
    txtfile = casfile.replace(".cas", ".txt")
    assert op.exists(casfile)

    cmd = "assembly_table -n -s -p "
    if opts.multi:
        cmd += "-m "
    cmd += casfile
    # The table is captured from stdout into the .txt file
    sh(cmd, grid=grid, outfile=txtfile)

    return txtfile
def split(args):
    """
    %prog split casfile 1 10

    split the binary casfile by using CLCbio `sub_assembly` program, the two
    numbers are starting and ending index for the `reference`; useful to split
    one big assembly per contig
    """
    p = OptionParser(split.__doc__)
    set_grid(p)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        # Exit non-zero on a usage error (print_help() returns None).
        sys.exit(not p.print_help())

    casfile, start, end = args
    start = int(start)
    end = int(end)

    split_cmd = "sub_assembly -a {casfile} -o sa.{i}.cas -s {i} " + \
        "-e sa.{i}.pairs.fasta -f sa.{i}.fragments.fasta -g sa.{i}.ref.fasta"

    # One sub_assembly job per reference index; `end` is inclusive
    for i in range(start, end + 1):
        cmd = split_cmd.format(casfile=casfile, i=i)
        sh(cmd, grid=opts.grid)
def run_FastbAndQualb2Fastq(infile=None, outfile=None, rc=False):
    """
    Run `FastbQualbToFastq` using the extension-less basename of `infile` as
    both HEAD_IN and HEAD_OUT. `outfile` is accepted but not used here.
    Set `rc` to add FLIP=True.
    """
    head = op.basename(infile).rsplit(".", 1)[0]
    tokens = [
        "FastbQualbToFastq HEAD_IN={0} HEAD_OUT={0}".format(head),
        "PAIRED=False PHRED_OFFSET=33",
    ]
    if rc:
        tokens.append("FLIP=True")
    sh(" ".join(tokens))
def cufflinks(args):
    """
    %prog cufflinks folder reference

    Run cufflinks on a folder containing tophat results.
    """
    p = OptionParser(cufflinks.__doc__)
    p.add_option("--gtf", help="Reference annotation [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    folder, reference = args
    # NOTE: the working directory is changed and not restored afterwards
    os.chdir(folder)

    for bamfile in glob("*tophat/accepted_hits.bam"):
        outdir = op.join(op.dirname(bamfile), "cufflinks")
        if op.exists(outdir):
            logging.debug("Directory {0} found. Skipping.".format(outdir))
            continue

        tokens = [
            "cufflinks",
            "-o {0}".format(outdir),
            "-p {0}".format(opts.cpus),
        ]
        if opts.gtf:
            tokens.append("-g {0}".format(opts.gtf))
        tokens += [
            "--frag-bias-correct {0}".format(reference),
            "--multi-read-correct",
            "{0}".format(bamfile),
        ]
        sh(" ".join(tokens))
def fake_quals(fa):
    """
    Make sure a `.qual` companion exists for FASTA file `fa`, generating it
    with `fakeQuals.py` when missing. Returns the pair (fa, qualfile).
    """
    faq = fa.rsplit(".", 1)[0] + ".qual"
    if not op.exists(faq):
        sh("fakeQuals.py {0} {1}".format(fa, faq))
    else:
        logging.debug("Qual file `{0}` found.".format(faq))
    return fa, faq
def fpkm(args):
    """
    %prog fpkm fastafile *.bam

    Calculate FPKM values from BAM file.
    """
    p = OptionParser(fpkm.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    fastafile = args[0]
    bamfiles = args[1:]

    # Create a DUMMY gff file for cuffdiff
    gffile = fastafile.rsplit(".", 1)[0] + ".gff"
    if need_update(fastafile, gffile):
        f = Fasta(fastafile, lazy=True)
        # `with` guarantees the handle is closed even if iteration fails, and
        # write() works on both Python 2 and 3 (unlike `print >> fw`).
        with open(gffile, "w") as fw:
            for key, size in f.itersizes_ordered():
                # One record per sequence, spanning its full length
                fields = (key, "dummy", "transcript",
                          1, size, ".", ".", ".", "ID=" + key)
                fw.write("\t".join(str(x) for x in fields) + "\n")
        logging.debug("Dummy GFF created: {0}".format(gffile))

    cmd = "cuffdiff {0} {1}".format(gffile, " ".join(bamfiles))
    sh(cmd)
def push_to_s3(s3_store, obj_name):
    """
    Upload `obj_name` under `s3_store` with the `aws s3` CLI (`sync` for a
    directory, `cp` otherwise) and return the destination address as
    produced by s3ify().
    """
    if op.isdir(obj_name):
        action = "sync"
    else:
        action = "cp"
    s3address = s3ify("{0}/{1}".format(s3_store, obj_name))
    sh("aws s3 {0} {1} {2} --sse".format(action, obj_name, s3address))
    return s3address
def genemark(args):
    """
    %prog genemark species fastafile

    Train GENEMARK model given fastafile. GENEMARK self-trains so no trainig
    model gff file is needed.
    """
    p = OptionParser(genemark.__doc__)
    p.set_home("gmes")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    species, fastafile = args
    mhome = opts.gmes_home
    gmdir = "genemark"
    mkdir(gmdir)

    cwd = os.getcwd()
    os.chdir(gmdir)
    try:
        # Link the input into the work directory
        sh("ln -sf ../{0}".format(fastafile))

        license_key = op.expanduser("~/.gm_key")
        assert op.exists(license_key), \
            "License key ({0}) not found!".format(license_key)

        sh("{0}/gm_es.pl {1}".format(mhome, fastafile))
    finally:
        # Always return to the starting directory, even when the licence
        # check (or a command) raises.
        os.chdir(cwd)

    logging.debug("GENEMARK matrix written to `{0}/mod/{1}.mod`".format(gmdir, species))
def main(args):
    """
    %prog deltafile refidsfile query.fasta ref.fasta

    Plot one query. Extract the references that have major matches to this
    query. Control "major" by option --refcov.
    """
    p = OptionParser(main.__doc__)
    p.add_option("--refcov", default=.01, type="float",
                 help="Minimum reference coverage [default: %default]")
    p.add_option("--all", default=False, action="store_true",
                 help="Plot one pdf file per ref in refidsfile [default: %default]")
    p.set_align(pctid=96, hitlen=500)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    deltafile, refidsfile, queryfasta, reffasta = args
    qsizes = Sizes(queryfasta).mapping
    rsizes = Sizes(reffasta).mapping
    refs = SetFile(refidsfile)
    refcov = opts.refcov

    # NOTE(review): `filter` is called with an argv-style list, so it is
    # presumably a project-level delta filter rather than the builtin - confirm.
    filter_args = [
        deltafile,
        "--pctid={0}".format(opts.pctid),
        "--hitlen={0}".format(opts.hitlen),
    ]
    deltafile = filter(filter_args)

    if not opts.all:
        # Single plot covering every reference
        plot_some_queries(refs, qsizes, rsizes, deltafile, refcov)
        return

    # One plot per reference, renamed after the reference id
    for r in refs:
        pdffile = plot_some_queries([r], qsizes, rsizes, deltafile, refcov)
        if pdffile:
            sh("mv {0} {1}.pdf".format(pdffile, r))
def first(args):
    """
    %prog first N fastqfile(s)

    Get first N reads from file.
    """
    from jcvi.apps.base import need_update

    p = OptionParser(first.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    N = int(args[0])
    # Four lines per FASTQ record
    nlines = N * 4
    fastqfiles = args[1:]
    outfile = opts.outfile
    if not need_update(fastqfiles, outfile):
        logging.debug("File `{0}` exists. Will not overwrite.".format(outfile))
        return

    for fastqfile in fastqfiles:
        # Decide per file whether to decompress, so mixed plain and gzipped
        # inputs are each read correctly (previously only the first file
        # was inspected).
        if fastqfile.endswith(".gz"):
            cmd = "zcat {0} | head -n {1}".format(fastqfile, nlines)
        else:
            cmd = "head -n {0} {1}".format(nlines, fastqfile)

        # Each file's head is appended to the same output
        sh(cmd, outfile=outfile, append=True)
def trim(args):
    """
    %prog trim fastqfile

    Wraps `fastx_trimmer` to trim from begin or end of reads.
    """
    p = OptionParser(trim.__doc__)
    p.add_option("-f", dest="first", default=0, type="int",
                 help="First base to keep. Default is 1.")
    p.add_option("-l", dest="last", default=0, type="int",
                 help="Last base to keep. Default is entire read.")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    name = op.basename(fastqfile)

    # Output lands in the current directory; gzipped input keeps its .gz
    if fastqfile.endswith(".gz"):
        fq = name.rsplit(".", 2)[0] + ".ntrimmed.fastq.gz"
    else:
        fq = name.rsplit(".", 1)[0] + ".ntrimmed.fastq"

    # A zero value means the corresponding flag is left out
    pieces = ["fastx_trimmer -Q33 "]
    if opts.first:
        pieces.append("-f {0} ".format(opts.first))
    if opts.last:
        pieces.append("-l {0} ".format(opts.last))

    sh("".join(pieces), infile=fastqfile, outfile=fq)
def pull_from_s3(s3_store, file_name=None):
    """
    Download `s3_store` with `aws s3 cp` unless the local file already
    exists, and return the absolute local path. `file_name` defaults to the
    last `/`-separated component of `s3_store`.
    """
    if not file_name:
        file_name = s3_store.split("/")[-1]

    if op.exists(file_name):
        # Already present locally; skip the download
        return op.abspath(file_name)

    sh("aws s3 cp {0} {1} --sse".format(s3ify(s3_store), file_name))
    return op.abspath(file_name)
def mergeBed(bedfile, d=0, nms=False, s=False, scores=None):
    """
    Sort `bedfile` in place, then run `mergeBed` on it.

    d: value for `-d` (omitted when falsy).
    nms: collapse the 4th column (`-c 4 -o collapse`); skipped when the file
        has three or fewer columns.
    s: add `-s`.
    scores: operation for `-scores`; anything not in the supported list
        falls back to "mean".

    Returns the merged file name, `<basename>.merge.bed` in the current
    directory; the command only runs when that file is out of date.
    """
    sort([bedfile, "-i"])
    cmd = "mergeBed -i {0}".format(bedfile)
    if d:
        cmd += " -d {0}".format(d)
    if nms:
        # Peek at the first line to count columns; close the handle promptly
        with open(bedfile) as fp:
            nargs = len(fp.readline().split())
        if nargs <= 3:
            logging.debug("Only {0} columns detected... set nms=True"\
                            .format(nargs))
        else:
            cmd += " -c 4 -o collapse"
    if s:
        cmd += " -s"
    if scores:
        valid_opts = ("sum", "min", "max", "mean", "median",
                      "mode", "antimode", "collapse")
        if scores not in valid_opts:
            scores = "mean"
        cmd += " -scores {0}".format(scores)

    mergebedfile = op.basename(bedfile).rsplit(".", 1)[0] + ".merge.bed"

    if need_update(bedfile, mergebedfile):
        sh(cmd, outfile=mergebedfile)
    return mergebedfile
def alignextend(args):
    """
    %prog alignextend ref.fasta read.1.fastq read.2.fastq

    Wrapper around AMOS alignextend.
    """
    p = OptionParser(alignextend.__doc__)
    p.add_option("--nosuffix", default=False, action="store_true",
                 help="Do not add /1/2 suffix to the read [default: %default]")
    p.set_home("amos")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ref, r1, r2 = args
    pf = op.basename(r1).split(".")[0]

    tokens = [op.join(opts.amos_home, "src/Experimental/alignextend.pl")]
    if not opts.nosuffix:
        tokens.append("-suffix")

    # Skip indexing when the index file is already up to date
    bwa_idx = "{0}.ref.fa.sa".format(pf)
    if not need_update(ref, bwa_idx):
        tokens.append("-noindex")

    tokens.append("-threads {0}".format(opts.cpus))

    # Add -I when the guessed quality offset is 64
    if guessoffset([r1]) == 64:
        tokens.append("-I")

    tokens.extend((pf, ref, r1, r2))
    sh(" ".join(tokens))
def deduplicate(args):
    """
    %prog deduplicate fastafile

    Wraps `cd-hit-454` to remove duplicate reads.
    """
    p = OptionParser(deduplicate.__doc__)
    p.add_option("--identity", default=.98, type="float",
                 help="Sequence identity threshold [default: %default]")
    p.add_option("--cpus", default=0, type="int",
                 help="Number of CPUs to use, 0=unlimited [default: %default]")
    set_grid(p)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args

    from jcvi.apps.command import CDPATH

    # Output is written next to the input as <fastafile>.cdhit
    program = CDPATH("cd-hit-454")
    cmd = "{0} -c {1} -M 0 -T {2} -i {3} -o {3}.cdhit".format(
        program, opts.identity, opts.cpus, fastafile)
    sh(cmd, grid=opts.grid)
def dust(args):
    """
    %prog dust assembly.fasta

    Remove low-complexity contigs within assembly.
    """
    p = OptionParser(dust.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    dustfastafile = fastafile.rsplit(".", 1)[0] + ".dust.fasta"
    if need_update(fastafile, dustfastafile):
        cmd = "dustmasker -in {0}".format(fastafile)
        # Only one argument is passed to format(), so the field must be {0};
        # the previous {1} raised IndexError before dustmasker could run.
        cmd += " -out {0} -outfmt fasta".format(dustfastafile)
        sh(cmd)

    for name, seq in parse_fasta(dustfastafile):
        if not seq:
            # Empty record: nothing to measure (and avoids dividing by zero)
            continue
        # Count lowercase bases and N's (lowercase presumably marks the
        # regions masked by dustmasker - confirm)
        nlow = sum(1 for x in seq if x in "acgtN")
        pctlow = nlow * 100. / len(seq)
        if pctlow < 98:
            continue
        # Report contigs that are at least 98% low-complexity; print(name)
        # behaves the same on Python 2 and 3 for a single argument.
        print(name)
def consensus(args):
    """
    %prog consensus fastafile bamfile

    Convert bam alignments to consensus FASTQ/FASTA.
    """
    p = OptionParser(consensus.__doc__)
    p.add_option("--fasta", default=False, action="store_true",
                 help="Generate consensus FASTA sequences [default: %default]")
    p.add_option("--mask", default=0, type="int",
                 help="Mask bases with quality lower than")
    opts, args = p.parse_args(args)

    # Exactly two positionals are unpacked below, so reject any other count
    # with the usage message instead of failing at the unpacking.
    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, bamfile = args
    fasta = opts.fasta
    suffix = "fasta" if fasta else "fastq"
    pf = bamfile.rsplit(".", 1)[0]
    cnsfile = pf + ".cns.{0}".format(suffix)
    vcfgzfile = pf + ".vcf.gz"
    vcf([fastafile, bamfile, "-o", vcfgzfile])

    # Start the pipeline with `=`; the previous `+=` referenced `cmd` before
    # assignment and always raised UnboundLocalError.
    cmd = "zcat {0} | vcfutils.pl vcf2fq".format(vcfgzfile)
    if fasta:
        # Convert FASTQ to FASTA, applying the --mask quality threshold
        cmd += " | seqtk seq -q {0} -A -".format(opts.mask)

    sh(cmd, outfile=cnsfile)
def deduplicate(args):
    """
    %prog deduplicate fastafile

    Wraps `cd-hit-est` to remove duplicate sequences.
    """
    p = OptionParser(deduplicate.__doc__)
    p.set_align(pctid=98)
    p.add_option("--reads", default=False, action="store_true",
                 help="Use `cd-hit-454` to deduplicate [default: %default]")
    # Help text fixed: it previously read "[%default: %default]", which
    # rendered as "[False: False]".
    p.add_option("--samestrand", default=False, action="store_true",
                 help="Enforce same strand alignment [default: %default]")
    p.set_home("cdhit")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    identity = opts.pctid / 100.

    cmd = "cd-hit-454" if opts.reads else "cd-hit-est"
    cmd = op.join(opts.cdhit_home, cmd)
    cmd += " -c {0}".format(identity)
    cmd += " -d 0"  # include complete defline
    if opts.samestrand:
        cmd += " -r 0"
    cmd += " -M 0 -T {0} -i {1} -o {1}.cdhit".format(opts.cpus, fastafile)
    sh(cmd)

    # Return the path of the deduplicated output
    dd = fastafile + ".cdhit"
    return dd
def align(args):
    """
    %prog align reference fastqfiles

    Use `clc_ref_assemble` to map the read files to a reference. Use a non-zero
    -s option to turn on paired end mode.
    """
    p = OptionParser(align.__doc__)
    p.add_option("-o", dest="outfile", default=None,
                 help="Output prefix.cas file [default: %default]")
    p.add_option("-s", dest="size", default=0, type="int",
                 help="Use paired end mapping with insert [default: %default]")
    p.add_option("--short", default=False, action="store_true",
                 help="Use `clc_ref_assemble_short` as the mapper [default: %default]")
    p.add_option("--orientations", default="fb",
                 help="The reads have the orientations [default: %default]")
    p.add_option("--fraction", default=0.5,
                 help="Fraction of the read that must match [default: %default]")
    p.add_option("--similarity", default=0.95,
                 help="Similarity of the matching region [default: %default]")
    p.set_params()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    write_file("license.properties", CLCLICENSE, skipcheck=True)

    ref = args[0]
    assert op.exists(ref)
    fastqfiles = args[1:]
    size = opts.size
    orientations = opts.orientations
    assert orientations in ("fb", "bf", "ff", "bb")

    cmd = "clc_ref_assemble_short" if opts.short else "clc_ref_assemble_long"
    # Default output name: <reads>.<reference>.cas, from the first name parts
    readprefix = op.basename(fastqfiles[0]).split(".", 1)[0]
    refprefix = op.basename(ref).split(".", 1)[0]
    outfile = opts.outfile or "{0}.{1}".format(readprefix, refprefix)
    if not outfile.endswith(".cas"):
        outfile += ".cas"

    cmd += " --cpus {0}".format(opts.cpus)
    cmd += " -d {0} -o {1} -q ".format(ref, outfile)
    fastqs = " ".join(fastqfiles)

    if size == 0:
        cmd += fastqs
    else:
        assert len(fastqfiles) == 2
        # Floor division keeps the insert bounds integral on Python 3 too
        # (`/` would emit values such as 375.0 into the command line).
        stddev = size // 4
        lb, ub = size - stddev, size + stddev
        cmd += " -p {0} ss {1} {2} -i {3} ".format(orientations, lb, ub, fastqs)

    if opts.extra:
        cmd += " " + opts.extra

    if not opts.short:
        cmd += " -l {0} -s {1}".format(opts.fraction, opts.similarity)

    sh(cmd)
    # Second element is always None; the pair shape is kept for callers
    return outfile, None
def rm_s3(store):
    """Run `aws s3 rm` on `store`."""
    sh("aws s3 rm {0}".format(store))
def aws_configure(profile, key, value):
    """Set `profile.<profile>.<key>` to `value` via `aws configure set`."""
    setting = "profile.{0}.{1}".format(profile, key)
    sh("aws configure set {0} {1}".format(setting, value))
def last(args):
    """
    %prog database.fasta query.fasta

    Run LAST by calling LASTDB and LASTAL. LAST program available:
    <http://last.cbrc.jp>

    Works with LAST-719.
    """
    p = OptionParser(last.__doc__)
    p.add_option("--path", help="Specify LAST path")
    p.add_option("--mask", default=False, action="store_true",
                 help="Invoke -c in lastdb")
    p.add_option("--format", default="BlastTab",
                 choices=("TAB", "MAF", "BlastTab", "BlastTab+"),
                 help="Output format")
    p.add_option("--minlen", default=0, type="int",
                 help="Filter alignments by how many bases match")
    p.add_option("--minid", default=0, type="int",
                 help="Minimum sequence identity")
    p.set_cpus()
    p.set_params()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    subject, query = args
    path = opts.path
    cpus = opts.cpus
    # Prefix the binaries with --path when given, else rely on the PATH
    getpath = lambda x: op.join(path, x) if path else x
    lastdb_bin = getpath("lastdb")
    lastal_bin = getpath("lastal")

    subjectdb = subject.rsplit(".", 1)[0]
    run_lastdb(infile=subject, outfile=subjectdb + ".prj", mask=opts.mask,
               lastdb_bin=lastdb_bin)

    u = 2 if opts.mask else 0
    cmd = "{0} -u {1}".format(lastal_bin, u)
    cmd += " -P {0} -i3G".format(cpus)
    cmd += " -f {0}".format(opts.format)
    cmd += " {0} {1}".format(subjectdb, query)

    minlen = opts.minlen
    minid = opts.minid
    extra = opts.extra
    assert minid != 100, "Perfect match not yet supported"
    # Floor division keeps the derived penalty an integer on Python 3 as
    # well; `/` would format as e.g. `-q49.0` in the command line.
    mm = minid // (100 - minid)

    if minlen:
        extra += " -e{0}".format(minlen)
    if minid:
        extra += " -r1 -q{0} -a{0} -b{0}".format(mm)
    if extra:
        cmd += " " + extra.strip()

    lastfile = get_outfile(subject, query, suffix="last")
    sh(cmd, outfile=lastfile)
def run_lastdb(infile=None, outfile=None, mask=False, lastdb_bin="lastdb"):
    """
    Run `lastdb` on `infile`. The database name is `outfile` with its last
    extension removed; `mask` adds the `-c` flag.
    """
    dbname = outfile.rsplit(".", 1)[0]
    flag = "-c " if mask else ""
    sh(" ".join((lastdb_bin, flag + dbname, infile)))
def calc(args):
    """
    %prog calc [prot.fasta] cds.fasta > out.ks

    Protein file is optional. If only one file is given, it is assumed to
    be CDS sequences with correct frame (frame 0). Results will be written to
    stdout. Both protein file and nucleotide file are assumed to be Fasta format,
    with adjacent records as the pairs to compare.

    Author: Haibao Tang <*****@*****.**>, Brad Chapman, Jingping Li
    Calculate synonymous mutation rates for gene pairs

    This does the following:
        1. Fetches a protein pair.
        2. Aligns the protein pair with clustalw (default) or muscle.
        3. Convert the output to Fasta format.
        4. Use this alignment info to align gene sequences using PAL2NAL
        5. Run PAML yn00 to calculate synonymous mutation rates.
    """
    from jcvi.formats.fasta import translate

    p = OptionParser(calc.__doc__)
    p.add_option("--longest", action="store_true",
                 help="Get longest ORF, only works if no pep file, "
                      "e.g. ESTs [default: %default]")
    p.add_option("--msa", default="clustalw", choices=("clustalw", "muscle"),
                 help="software used to align the proteins [default: %default]")
    p.add_option("--workdir", default=os.getcwd(), help="Work directory")
    p.set_outfile()

    opts, args = p.parse_args(args)

    nargs = len(args)
    if nargs == 1:
        protein_file, dna_file = None, args[0]
    elif nargs == 2:
        protein_file, dna_file = args
    else:
        print("Incorrect arguments", file=sys.stderr)
        sys.exit(not p.print_help())

    # Header row first; `fields` is a module-level name
    output_h = must_open(opts.outfile, "w")
    print(fields, file=output_h)
    work_dir = op.join(opts.workdir, "syn_analysis")
    mkdir(work_dir)

    if not protein_file:
        # No peptides supplied: translate the CDS file
        protein_file = dna_file + ".pep"
        translate_args = [dna_file, "--outfile=" + protein_file]
        if opts.longest:
            translate_args.append("--longest")
        dna_file, protein_file = translate(translate_args)

    prot_iterator = SeqIO.parse(open(protein_file), "fasta")
    dna_iterator = SeqIO.parse(open(dna_file), "fasta")

    # Passing each iterator twice to zip() consumes adjacent records in pairs
    quads = zip(prot_iterator, prot_iterator, dna_iterator, dna_iterator)
    for p_rec_1, p_rec_2, n_rec_1, n_rec_2 in quads:

        print("--------", p_rec_1.name, p_rec_2.name, file=sys.stderr)
        proteins = (p_rec_1, p_rec_2)
        if opts.msa == "clustalw":
            align_fasta = clustal_align_protein(proteins, work_dir)
        elif opts.msa == "muscle":
            align_fasta = muscle_align_protein(proteins, work_dir)

        mrtrans_fasta = run_mrtrans(align_fasta, (n_rec_1, n_rec_2), work_dir)
        if not mrtrans_fasta:
            continue

        ds_subs_yn, dn_subs_yn, ds_subs_ng, dn_subs_ng = \
            find_synonymous(mrtrans_fasta, work_dir)
        if ds_subs_yn is None:
            continue

        # One comma-separated row per gene pair, flushed immediately
        pair_name = "%s;%s" % (p_rec_1.name, p_rec_2.name)
        row = (pair_name, ds_subs_yn, dn_subs_yn, ds_subs_ng, dn_subs_ng)
        output_h.write("%s\n" % (",".join(str(x) for x in row)))
        output_h.flush()

    # Clean-up
    sh("rm -rf 2YN.t 2YN.dN 2YN.dS rst rub rst1 syn_analysis")
def sample(args):
    """
    %prog sample bedfile sizesfile

    Sample bed file and remove high-coverage regions.

    When option --targetsize is used, this program uses a differnent mode. It
    first calculates the current total bases from all ranges and then compare to
    targetsize, if more, then sample down as close to targetsize as possible.
    """
    import random
    from jcvi.assembly.coverage import Coverage

    p = OptionParser(sample.__doc__)
    p.add_option("--max", default=10, type="int",
                 help="Max depth allowed [default: %default]")
    p.add_option("--targetsize", type="int",
                 help="Sample bed file to get target base number [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, sizesfile = args
    pf = bedfile.rsplit(".", 1)[0]

    targetsize = opts.targetsize
    if targetsize:
        bed = Bed(bedfile)
        samplebed = pf + ".sample.bed"
        nfeats = len(bed)
        nbases = bed.sum(unique=False)
        targetfeats = int(round(nfeats * targetsize / nbases))
        # random.sample() raises ValueError when asked for more items than
        # exist, so a target at or above the current total keeps everything.
        targetfeats = min(targetfeats, nfeats)
        sub_bed = random.sample(bed, targetfeats)
        # write() works on Python 2 and 3; `with` closes the file reliably
        with open(samplebed, "w") as fw:
            for b in sub_bed:
                fw.write(str(b) + "\n")

        logging.debug("File written to `{0}`.".format(samplebed))
        return

    c = Coverage(bedfile, sizesfile)
    coveragefile = c.filename
    samplecoveragefile = pf + ".sample.coverage"
    # Keep only the coverage rows whose depth does not exceed --max
    with open(coveragefile) as fp:
        with open(samplecoveragefile, "w") as fw:
            for row in fp:
                seqid, start, end, cov = row.split()
                cov = int(cov)
                if cov <= opts.max:
                    fw.write(row)

    # Retain the bed features that intersect the kept coverage rows
    samplebedfile = pf + ".sample.bed"
    cmd = "intersectBed -a {0} -b {1} -wa -u".format(bedfile, samplecoveragefile)
    sh(cmd, outfile=samplebedfile)
    logging.debug("Sampled bedfile written to `{0}`.".format(samplebedfile))
def overlap(args):
    """
    %prog overlap ctgfasta poolfasta

    Fish out the sequences in `poolfasta` that overlap with `ctgfasta`.
    Mix and combine using `minimus2`.
    """
    p = OptionParser(overlap.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ctgfasta, poolfasta = args
    # Everything before the first dot names the work directory and outputs
    prefix = ctgfasta.split(".")[0]
    rid = list(Fasta(ctgfasta).iterkeys())
    assert len(rid) == 1, "Use overlapbatch() to improve multi-FASTA file"

    # The single record id is reused as the prefix of the final sequence names
    rid = rid[0]
    splitctgfasta = ctgfasta.rsplit(".", 1)[0] + ".split.fasta"
    # From here on `ctgfasta` refers to whatever run_gapsplit() returns
    ctgfasta = run_gapsplit(infile=ctgfasta, outfile=splitctgfasta)

    # Run BLAST
    blastfile = ctgfasta + ".blast"
    run_megablast(infile=ctgfasta, outfile=blastfile, db=poolfasta)

    # Extract contigs and merge using minimus2
    closuredir = prefix + ".closure"
    closure = False
    if need_update(blastfile, closuredir):
        mkdir(closuredir, overwrite=True)
        closure = True

    # NOTE(review): the extent of this conditional was reconstructed from a
    # whitespace-collapsed source; it is assumed to cover every step up to
    # and including the minimus2 run - confirm against the original file.
    if closure:
        # Unique subject ids (BLAST column 2) that hit the split contigs
        idsfile = op.join(closuredir, prefix + ".ids")
        cmd = "cut -f2 {0} | sort -u".format(blastfile)
        sh(cmd, outfile=idsfile)

        idsfastafile = op.join(closuredir, prefix + ".ids.fasta")
        cmd = "faSomeRecords {0} {1} {2}".format(poolfasta, idsfile, idsfastafile)
        sh(cmd)

        # This step is a hack to weight the bases from original sequences more
        # than the pulled sequences, by literally adding another copy to be used
        # in consensus calls.
        redundantfastafile = op.join(closuredir, prefix + ".redundant.fasta")
        # NOTE(review): `format` is called with an argv-style list, so it is
        # presumably a project-level FASTA formatter, not the builtin - confirm.
        format([ctgfasta, redundantfastafile, "--prefix=RED."])

        mergedfastafile = op.join(closuredir, prefix + ".merged.fasta")
        cmd = "cat {0} {1} {2}".format(ctgfasta, redundantfastafile, idsfastafile)
        sh(cmd, outfile=mergedfastafile)

        afgfile = op.join(closuredir, prefix + ".afg")
        cmd = "toAmos -s {0} -o {1}".format(mergedfastafile, afgfile)
        sh(cmd)

        # minimus2 is run from inside the closure directory
        cwd = os.getcwd()
        os.chdir(closuredir)
        cmd = "minimus2 {0} -D REFCOUNT=0".format(prefix)
        cmd += " -D OVERLAP=100 -D MINID=98"
        sh(cmd)
        os.chdir(cwd)

    # Analyze output, make sure that:
    # + Get the singletons of the original set back
    # + Drop any contig that is comprised entirely of pulled set
    originalIDs = set(Fasta(ctgfasta).iterkeys())
    minimuscontig = op.join(closuredir, prefix + ".contig")
    c = ContigFile(minimuscontig)
    excludecontigs = set()
    for rec in c.iter_records():
        reads = set(x.id for x in rec.reads)
        # No read from the original set: the contig is made of pulled reads only
        if reads.isdisjoint(originalIDs):
            excludecontigs.add(rec.id)

    logging.debug("Exclude contigs: {0}".format(", ".join(
        sorted(excludecontigs))))

    # Written under a temporary name (trailing underscore); renamed below
    finalfasta = prefix + ".improved.fasta_"
    fw = open(finalfasta, "w")
    minimusfasta = op.join(closuredir, prefix + ".fasta")
    f = Fasta(minimusfasta)
    for id, rec in f.iteritems_ordered():
        if id in excludecontigs:
            continue
        SeqIO.write([rec], fw, "fasta")

    # Add back the original sequences listed in the singletons file
    singletonfile = op.join(closuredir, prefix + ".singletons")
    singletons = set(x.strip() for x in open(singletonfile))
    leftovers = singletons & originalIDs

    logging.debug("Pull leftover singletons: {0}".format(", ".join(
        sorted(leftovers))))

    f = Fasta(ctgfasta)
    for id, rec in f.iteritems_ordered():
        if id not in leftovers:
            continue
        SeqIO.write([rec], fw, "fasta")

    fw.close()

    # Rename records sequentially as <rid>_NNN into the final file name
    fastafile = finalfasta
    finalfasta = fastafile.rstrip("_")
    format([
        fastafile, finalfasta, "--sequential", "--pad0=3",
        "--prefix={0}_".format(rid)
    ])

    logging.debug("Improved FASTA written to `{0}`.".format(finalfasta))

    # n50() is called on the input and on the improved FASTA, in that order
    n50([ctgfasta])
    n50([finalfasta])

    # Remove the temporary FASTA, the BLAST output and any error.log
    errlog = "error.log"
    for f in (fastafile, blastfile, errlog):
        if op.exists(f):
            os.remove(f)
def faToTwoBit(fastafile):
    """
    Convert `fastafile` to `.2bit` with the `faToTwoBit` command when the
    2bit file is out of date; return the 2bit file name.
    """
    twobitfile = fastafile.rsplit(".", 1)[0] + ".2bit"
    if need_update(fastafile, twobitfile):
        sh("faToTwoBit {0} {1}".format(fastafile, twobitfile))
    return twobitfile
def simulate(args):
    """
    %prog simulate run_dir 1 300

    Simulate BAMs with varying inserts with dwgsim. The above command will
    simulate between 1 to 300 CAGs in the HD region, in a directory called
    `run_dir`.
    """
    p = OptionParser(simulate.__doc__)
    p.add_option("--ref", default="/Users/htang/projects/ref/hg38.upper.fa",
                 help="Reference genome sequence")
    add_simulate_options(p)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    rundir = args[0]
    startunits = int(args[1])
    endunits = int(args[2])

    basecwd = os.getcwd()
    mkdir(rundir)
    os.chdir(rundir)
    # Directory to return to after each per-unit subdirectory
    cwd = os.getcwd()

    # Huntington region
    pad_left, pad_right = 1000, 10000
    seqid, start, end = 'chr4', 3074877, 3074933
    fasta = Fasta(opts.ref)
    seq_left = fasta[seqid][start - pad_left:start - 1]
    seq_right = fasta[seqid][end:end + pad_right]
    motif = 'CAG'

    # Reference slice: the region plus both flanks
    reffastafile = "ref.fasta"
    refseq = str(fasta[seqid][start - pad_left:end + pad_right])
    make_fasta(refseq, reffastafile, id=seqid.upper())

    # Write fake sequence
    for units in range(startunits, endunits + 1):
        pf = str(units)
        mkdir(pf)
        os.chdir(pf)

        # Left flank + `units` copies of the motif + right flank
        fakeseq = str(seq_left) + motif * units + str(seq_right)
        fastafile = pf + ".fasta"
        make_fasta(fakeseq, fastafile, id=seqid.upper())

        # Simulate reads on it
        wgsim_args = [
            fastafile,
            "--depth={}".format(opts.depth),
            "--readlen={}".format(opts.readlen),
            "--distance={}".format(opts.distance),
            "--outfile={}".format(pf),
        ]
        wgsim(wgsim_args)

        read1 = pf + ".bwa.read1.fastq"
        read2 = pf + ".bwa.read2.fastq"
        samfile, _ = align(["../{}".format(reffastafile), read1, read2])
        indexed_samfile = index([samfile])

        # Move the BAM and its index up one level, then drop the scratch dir
        sh("mv {} ../{}.bam".format(indexed_samfile, pf))
        sh("mv {}.bai ../{}.bam.bai".format(indexed_samfile, pf))

        os.chdir(cwd)
        shutil.rmtree(pf)

    os.chdir(basecwd)
def draw(args):
    """
    %prog draw --input newicktrees [options]

    Draw phylogenetic trees into single or combined plots.
    Input trees should be one of the following:
    1.  single Newick format tree file
    2.  a dir containing *ONLY* the tree files to be drawn

    Newick format:
    http://evolution.genetics.washington.edu/phylip/newicktree.html

    This function wraps on jcvi.graphics.tree
    This function is better used for trees generated by jcvi.apps.phylo (rooted
    if possible). For drawing general Newick trees from external sources invoke
    jcvi.graphics.tree directly, which also gives more drawing options.
    """
    # NOTE: the docstring above doubles as the command-line usage text
    # (it is handed to OptionParser below).
    trunc_name_options = ['headn', 'oheadn', 'tailn', 'otailn']
    p = OptionParser(draw.__doc__)
    p.add_option("--input", help="path to single input tree file or a dir "\
                 "containing ONLY the input tree files")
    p.add_option("--combine", type="string", default="1x1", \
                 help="combine multiple trees into one plot in nrowxncol")
    p.add_option("--trunc_name", default=None, help="Options are: {0}. " \
                 "truncate first n chars, retains only first n chars, " \
                 "truncate last n chars, retain only last chars. " \
                 "n=1~99. [default: %default]".format(trunc_name_options))
    p.add_option("--SH", default=None,
                 help="path to a file containing SH test p-values in format:" \
                 "tree_file_name<tab>p-values " \
                 "This file can be generated with jcvi.apps.phylo build [default: %default]")
    p.add_option("--scutoff", default=50, type="int",
                 help="cutoff for displaying node support, 0-100 [default: %default]")
    p.add_option("--barcode", default=None,
                 help="path to seq/taxon name barcode mapping file: " \
                 "barcode<tab>new_name " \
                 "This option is downstream of `--trunc_name` [default: %default]")
    p.add_option("--leafcolorfile", default=None,
                 help="path to a mapping file containing font colors " \
                 "for the OTUs: leafname<tab>color [default: %default]")
    p.set_outdir()
    opts, args, iopts = p.set_image_options(figsize="8x6")
    input = opts.input
    outdir = opts.outdir
    # "--combine" is "<nrow>x<ncol>"; the two halves are int()-ed at the call
    # to _draw_trees below.
    combine = opts.combine.split("x")
    trunc_name = opts.trunc_name
    SH = opts.SH

    mkdir(outdir)
    if not input:
        sys.exit(not p.print_help())
    elif op.isfile(input):
        # Single file: one tree name, taken from the file's basename
        trees_file = input
        treenames = [op.basename(input)]
    elif op.isdir(input):
        # Directory: concatenate every file (sorted by name) into one
        # temporary file; tree names are the individual file names.
        # NOTE(review): uses shell append (`>>`), so a leftover alltrees.dnd
        # in outdir would be appended to rather than replaced.
        trees_file = op.join(outdir, "alltrees.dnd")
        treenames = []
        for f in sorted(os.listdir(input)):
            sh("cat {0}/{1} >> {2}".format(input, f, trees_file), log=False)
            treenames.append(f)
    else:
        sys.exit(not p.print_help())

    # Split the (possibly multi-tree, possibly multi-line) Newick text into
    # one string per tree. A tree ends at ";"; the i-th completed tree is
    # stored under treenames[i].
    trees = OrderedDict()
    tree = ""
    i = 0
    for row in LineFile(trees_file, comment="#", load=True).lines:
        # Stop once every expected tree name has been assigned
        if i == len(treenames):
            break
        if not len(row):
            continue

        if ";" in row:
            # sanity check
            if row.index(";") != len(row)-1:
                # The first ";" is not the last character: split the row and
                # put the terminator back on every piece except the last one.
                ts = row.split(";")
                for ii in xrange(len(ts)-1):
                    ts[ii] += ";"
            else:
                ts = [row]
            for t in ts:
                if ";" in t:
                    # Piece completes the tree accumulated so far
                    tree += t
                    if tree:
                        trees[treenames[i]] = tree
                        tree = ""
                        i+=1
                else:
                    # Trailing piece without terminator: start of next tree
                    tree += t
        else:
            # Continuation line of a tree spread over several rows
            tree += row

    logging.debug("A total of {0} trees imported.".format(len(trees)))
    # NOTE(review): this rm runs in the single-file case too, where
    # alltrees.dnd was never created by this function.
    sh("rm {0}".format(op.join(outdir, "alltrees.dnd")))

    _draw_trees(trees, nrow=int(combine[0]), ncol=int(combine[1]), rmargin=.3,\
         iopts=iopts, outdir=outdir, shfile=SH, trunc_name=trunc_name, \
         scutoff=opts.scutoff, barcodefile = opts.barcode,
         leafcolorfile=opts.leafcolorfile)
def parallel(args):
    """
    %prog parallel genome.fasta N

    Partition the genome into parts and run separately. This is useful if MAKER
    is to be run on the grid.
    """
    from jcvi.formats.base import split

    p = OptionParser(parallel.__doc__)
    p.set_home("maker")
    p.set_tmpdir(tmpdir="tmp")
    p.set_grid_opts(array=True)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    genome, NN = args
    threaded = opts.threaded or 1
    tmpdir = opts.tmpdir

    mkdir(tmpdir)
    tmpdir = get_abs_path(tmpdir)

    N = int(NN)
    # Message now states the bounds that are actually enforced
    assert 1 <= N < 1000, "Required: 1 <= N < 1000!"

    outdir = "outdir"
    fs = split([genome, outdir, NN])

    # Shared MAKER control file; its "genome" tag is re-pointed per partition
    c = CTLFile("maker_opts.ctl")
    c.update_abs_path()
    if threaded > 1:
        c.update_tag("cpus", threaded)

    cwd = os.getcwd()
    dirs = []
    for name in fs.names:
        fn = get_abs_path(name)
        bn = op.basename(name)
        dirs.append(bn)
        c.update_tag("genome", fn)
        mkdir(bn)
        sh("cp *.ctl {0}".format(bn))

        # Write the per-partition control file from inside its directory,
        # and always come back even if the write fails
        os.chdir(bn)
        try:
            c.write_file("maker_opts.ctl")
        finally:
            os.chdir(cwd)

    # One partition directory name per line; consumed by the array script
    jobs = "jobs"
    with open(jobs, "w") as fw:
        print("\n".join(dirs), file=fw)

    # Submit to grid
    ncmds = len(dirs)
    runfile = "array.sh"
    cmd = op.join(opts.maker_home, "bin/maker")
    if tmpdir:
        cmd += " -TMP {0}".format(tmpdir)

    engine = get_grid_engine()
    if engine == "SGE":
        contents = arraysh.format(jobs, cmd)
    else:
        contents = arraysh_ua.format(N, threaded, jobs, cmd)
    write_file(runfile, contents)

    # For PBS only the array script is written; no qsub wrapper
    if engine == "PBS":
        return

    # qsub script
    # Raw string: the backslash before `$` is intentional and must reach the
    # generated script verbatim ("\$" is not a valid Python escape sequence).
    outfile = r"maker.\$TASK_ID.out"
    p = GridProcess(runfile, outfile=outfile, errfile=outfile,
                    arr=ncmds, grid_opts=opts)
    qsubfile = "qsub.sh"
    qsub = p.build()
    write_file(qsubfile, qsub)
def refine(args):
    """
    %prog refine breakpoints.bed gaps.bed

    Find gaps within or near breakpoint region.

    For breakpoint regions with no gaps, there are two options:
    - Break in the middle of the region
    - Break at the closest gap (--closest)
    """
    p = OptionParser(refine.__doc__)
    p.add_option(
        "--closest",
        default=False,
        action="store_true",
        help="In case of no gaps, use closest",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    breakpointsbed, gapsbed = args

    # Column count of the breakpoints file, taken from its first line; used
    # below to recover the breakpoint columns from the intersectBed output.
    with open(breakpointsbed) as fp:
        ncols = len(next(fp).split())
    logging.debug("File %s contains %d columns.", breakpointsbed, ncols)
    cmd = "intersectBed -wao -a {0} -b {1}".format(breakpointsbed, gapsbed)

    pf = "{0}.{1}".format(breakpointsbed.split(".")[0], gapsbed.split(".")[0])
    ingapsbed = pf + ".bed"
    sh(cmd, outfile=ingapsbed)

    with open(ingapsbed) as fp:
        data = [x.split() for x in fp]

    nogapsbed = pf + ".nogaps.bed"
    largestgapsbed = pf + ".largestgaps.bed"
    with open(nogapsbed, "w") as nogapsfw, \
            open(largestgapsbed, "w") as largestgapsfw:
        # Consecutive rows sharing the same breakpoint columns form one group
        for b, gaps in groupby(data, key=lambda x: x[:ncols]):
            gaps = list(gaps)
            gap = gaps[0]
            # A single row whose last column (overlap size) is "0" means the
            # breakpoint region overlaps no gap
            if len(gaps) == 1 and gap[-1] == "0":
                assert gap[-3] == "."
                print("\t".join(b), file=nogapsfw)
                continue

            # Otherwise keep the row with the largest overlap
            gaps = [(int(x[-1]), x) for x in gaps]
            maxgap = max(gaps)[1]
            print("\t".join(maxgap), file=largestgapsfw)

    beds = [largestgapsbed]
    toclean = [nogapsbed, largestgapsbed]

    if opts.closest:
        # Gap-free regions: substitute the nearest gap
        closestgapsbed = pf + ".closestgaps.bed"
        cmd = "closestBed -a {0} -b {1} -d".format(nogapsbed, gapsbed)
        sh(cmd, outfile=closestgapsbed)
        beds += [closestgapsbed]
        toclean += [closestgapsbed]
    else:
        # Gap-free regions: collapse each to its midpoint
        pointbed = pf + ".point.bed"
        pbed = Bed()
        bed = Bed(nogapsbed)
        for b in bed:
            pos = (b.start + b.end) // 2
            b.start, b.end = pos, pos
            pbed.append(b)
        pbed.print_to_file(pointbed)
        beds += [pointbed]
        toclean += [pointbed]

    refinedbed = pf + ".refined.bed"
    FileMerger(beds, outfile=refinedbed).merge()

    # Clean-up
    FileShredder(toclean)

    return refinedbed
def __init__(self, filelist, verbose=True):
    """
    Remove every entry of `filelist` that is non-empty and exists on disk,
    using one `rm -rf` shell call. `verbose` is forwarded to `sh` as `log`.
    """
    existing = []
    for path in filelist:
        if path and op.exists(path):
            existing.append(path)
    targets = " ".join(existing)
    sh("rm -rf {0}".format(targets), log=verbose)
def run_formatdb(infile=None, outfile=None, dbtype="nucl"):
    """
    Run `makeblastdb` on `infile` with the given `dbtype`.

    `outfile` is not referenced in the body.
    """
    options = " -dbtype {0} -in {1}".format(dbtype, infile)
    sh("makeblastdb" + options)
def build(args):
    """
    %prog build [prot.fasta] cds.fasta [options] --outdir=outdir

    This function wraps on the following steps:
    1. msa using ClustalW2 or MUSCLE(default)
    2. (optional) alignment editing using Gblocks
    3. build NJ tree using PHYLIP in EMBOSS package
       seq names should be unique by first 10 chars (restriction of PHYLIP)
    4. build ML tree using RAxML(default) or PHYML, use keywords raxml or phyml,
       *WARNING* maybe slow with large dataset

    If an outgroup file is provided, the result tree will be rooted on the
    outgroup according to order in the file, i.e. the name in row1 will be
    tried first. If not found, row2 will be used, etc.
    Tail truncated names can be provided so long as it is unique among the seqs.
    If not uniq, the first occurrence will be used. For example, if you have
    two moss sequences in your input, then the tree will be rooted on the
    first moss sequence encountered by the program, unless they are monophylic,
    in which case the root will be their common ancestor.

    --stree and --smap are required if --treefix is set.

    Trees can be edited again using an editor such as Dendroscope. This
    is the recommended way to get highly customized trees.

    Newick format trees will be deposited into outdir (. by default).
    """
    # NOTE: the docstring above doubles as the command-line usage text
    # (it is handed to OptionParser below).
    from jcvi.formats.fasta import translate

    p = OptionParser(build.__doc__)
    p.add_option("--longest", action="store_true",
                 help="Get longest ORF, only works if no pep file, "\
                      "e.g. ESTs [default: %default]")
    p.add_option("--nogblocks", action="store_true",
                 help="don't use Gblocks to edit alignment [default: %default]")
    p.add_option("--synonymous", action="store_true",
                 help="extract synonymous sites of the alignment [default: %default]")
    p.add_option("--fourfold", action="store_true",
                 help="extract fourfold degenerate sites of the alignment [default: %default]")
    p.add_option("--msa", default="muscle", choices=("clustalw", "muscle"),
                 help="software used to align the proteins [default: %default]")
    p.add_option("--noneighbor", action="store_true",
                 help="don't build NJ tree [default: %default]")
    p.add_option("--ml", default=None, choices=("raxml", "phyml"),
                 help="software used to build ML tree [default: %default]")
    p.add_option("--outgroup",
                 help="path to file containing outgroup orders [default: %default]")
    p.add_option("--SH", help="path to reference Newick tree [default: %default]")
    p.add_option("--shout", default="SH_out.txt", \
                 help="SH output file name [default: %default]")
    p.add_option("--treefix", action="store_true",
                 help="use TreeFix to rearrange ML tree [default: %default]")
    p.add_option("--stree", help="path to species Newick tree [default: %default]")
    p.add_option("--smap", help="path to smap file: " \
                    "gene_name_pattern<tab>species_name [default: %default]")
    p.set_outdir()

    opts, args = p.parse_args(args)
    # The two "--no*" switches are inverted into positive flags
    gblocks = not opts.nogblocks
    synonymous = opts.synonymous
    fourfold = opts.fourfold
    neighbor = not opts.noneighbor
    outgroup = opts.outgroup
    outdir = opts.outdir

    # One positional argument: CDS only. Two: protein file, then CDS file.
    if len(args) == 1:
        protein_file, dna_file = None, args[0]
    elif len(args) == 2:
        protein_file, dna_file = args
    else:
        print("Incorrect arguments", file=sys.stderr)
        sys.exit(not p.print_help())

    if opts.treefix:
        stree = opts.stree
        smap = opts.smap
        assert stree and smap, "TreeFix requires stree and smap files."
        # --treefix overrides any --ml choice with raxml
        opts.ml = "raxml"

    treedir = op.join(outdir, "tree")
    mkdir(treedir)

    # No protein file supplied: translate the CDS file to obtain one
    if not protein_file:
        protein_file = dna_file + ".pep"
        translate_args = [dna_file, "--outfile=" + protein_file]
        if opts.longest:
            translate_args += ["--longest"]
        dna_file, protein_file = translate(translate_args)

    work_dir = op.join(outdir, "alignment")
    mkdir(work_dir)
    p_recs = list(SeqIO.parse(open(protein_file), "fasta"))
    if opts.msa == "clustalw":
        align_fasta = clustal_align_protein(p_recs, work_dir)
    elif opts.msa == "muscle":
        align_fasta = muscle_align_protein(p_recs, work_dir)

    n_recs = list(SeqIO.parse(open(dna_file), "fasta"))
    mrtrans_fasta = run_mrtrans(align_fasta, n_recs, work_dir, outfmt="fasta")

    # A falsy result from run_mrtrans aborts the whole build (returns None)
    if not mrtrans_fasta:
        logging.debug("pal2nal aborted. " \
            "Cannot reliably build tree for {0}".format(dna_file))
        return

    codon_aln_fasta = mrtrans_fasta
    if gblocks:
        # Fall back to the unedited alignment when run_gblocks yields nothing
        gb_fasta = run_gblocks(mrtrans_fasta)
        codon_aln_fasta = gb_fasta if gb_fasta else codon_aln_fasta

    else:
        # Site extraction is only applied when Gblocks is disabled; if both
        # flags are set, the fourfold result replaces the synonymous one.
        if synonymous:
            codon_aln_fasta = subalignment(mrtrans_fasta, "synonymous")

        if fourfold:
            codon_aln_fasta = subalignment(mrtrans_fasta, "fourfold")

    # Neither NJ nor ML requested: the alignment itself is the result
    if not neighbor and not opts.ml:
        return codon_aln_fasta

    alignment = AlignIO.read(codon_aln_fasta, "fasta")
    if len(alignment) <= 3:
        raise ValueError("Too few seqs to build tree.")

    mkdir(op.join(treedir, "work"))
    if neighbor:
        out_file = op.join(treedir, op.basename(dna_file).rsplit(".", 1)[0] + \
                ".NJ.unrooted.dnd")
        # NOTE(review): bare except hides the failure reason; on failure
        # `outfile` and `phy_file` are left unbound, so the SH step below
        # (and the final return, if no ML tree is built) would raise NameError.
        try:
            outfile, phy_file = build_nj_phylip(alignment, \
                outfile=out_file, outgroup=outgroup, work_dir=treedir)
        except:
            print("NJ tree cannot be built for {0}".format(dna_file))

        if opts.SH:
            reftree = opts.SH
            querytree = outfile
            SH_raxml(reftree, querytree, phy_file, shout=opts.shout)

    if opts.ml:
        out_file = op.join(treedir, op.basename(dna_file).rsplit(".", 1)[0] + \
                ".ML.unrooted.dnd")

        # NOTE(review): same bare-except pattern as the NJ step; after a
        # failure `outfile` still refers to the NJ tree (if one was built).
        if opts.ml == "phyml":
            try:
                outfile, phy_file = build_ml_phyml\
                    (alignment, outfile=out_file, work_dir=treedir)
            except:
                print("ML tree cannot be built for {0}".format(dna_file))

        elif opts.ml == "raxml":
            try:
                outfile, phy_file = build_ml_raxml\
                    (alignment, outfile=out_file, work_dir=treedir)
            except:
                print("ML tree cannot be built for {0}".format(dna_file))

        if outgroup:
            # Rooted output drops ".unrooted" from the name; the unrooted
            # file is removed only when smart_reroot returns the new name.
            new_out_file = out_file.replace(".unrooted", "")
            t = smart_reroot(treefile=out_file, outgroupfile=outgroup, \
                outfile=new_out_file)
            if t == new_out_file:
                sh("rm %s" % out_file)
                outfile = new_out_file

        if opts.SH:
            reftree = opts.SH
            querytree = outfile
            SH_raxml(reftree, querytree, phy_file, shout=opts.shout)

        if opts.treefix:
            # Stage the tree and a matching ".fasta" alignment side by side
            # in a fresh treefix directory before calling run_treefix
            treefix_dir = op.join(treedir, "treefix")
            assert mkdir(treefix_dir, overwrite=True)

            sh("cp {0} {1}/".format(outfile, treefix_dir))
            input = op.join(treefix_dir, op.basename(outfile))
            aln_file = input.rsplit(".", 1)[0] + ".fasta"
            SeqIO.write(alignment, aln_file, "fasta")

            outfile = run_treefix(input=input, stree_file=stree, smap_file=smap, \
                a_ext=".fasta", o_ext=".dnd", n_ext = ".treefix.dnd")

    return outfile
def run(self, cpus=1):
    """Invoke `make` on `self.makefile` with `cpus` parallel jobs (-j)."""
    jobs_flag = "-j {0}".format(cpus)
    makefile_flag = "-f {0}".format(self.makefile)
    sh("make {0} {1}".format(jobs_flag, makefile_flag))
def deletion(args):
    """
    %prog deletion [mac.mic.bam|mac.mic.bed] mic.gaps.bed

    Find IES based on mapping MAC reads to MIC genome.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    # Each stage below writes an intermediate BED file named after the
    # prefix `pf` and is skipped when need_update() reports it up to date.
    p = OptionParser(deletion.__doc__)
    p.add_option("--mindepth", default=3, type="int",
                 help="Minimum depth to call a deletion")
    p.add_option("--minspan", default=30, type="int",
                 help="Minimum span to call a deletion")
    p.add_option("--split", default=False, action="store_true",
                 help="Break at cigar N into separate parts")
    p.set_tmpdir()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, gapsbedfile = args
    # BAM input is first converted to a 4-column BED
    if bedfile.endswith(".bam"):
        bamfile = bedfile
        bedfile = bamfile.replace(".sorted.", ".").replace(".bam", ".bed")
        if need_update(bamfile, bedfile):
            cmd = "bamToBed -i {0}".format(bamfile)
            if opts.split:
                cmd += " -split"
            cmd += " | cut -f1-4"
            sh(cmd, outfile=bedfile)

    sort_tmpdir = "--tmpdir={0}".format(opts.tmpdir)
    if bedfile.endswith(".sorted.bed"):
        # Already sorted: strip both ".sorted" and ".bed" to get the prefix
        pf = bedfile.rsplit(".", 2)[0]
        sortedbedfile = bedfile
    else:
        pf = bedfile.rsplit(".", 1)[0]
        sortedbedfile = pf + ".sorted.bed"
        if need_update(bedfile, sortedbedfile):
            sort([bedfile, "-u", "--accn", sort_tmpdir])

    # Find reads that contain multiple matches
    ibedfile = pf + ".d.bed"
    if need_update(sortedbedfile, ibedfile):
        bed = Bed(sortedbedfile, sorted=False)
        fw = open(ibedfile, "w")
        logging.debug("Write deletions to `{0}`.".format(ibedfile))
        # Group consecutive records by read name; range_interleave() is
        # given the read's aligned ranges and its results are the
        # candidate deletions
        for accn, bb in groupby(bed, key=lambda x: x.accn):
            bb = list(bb)
            branges = [(x.seqid, x.start, x.end) for x in bb]
            iranges = range_interleave(branges)
            for seqid, start, end in iranges:
                if end - start + 1 < opts.minspan:
                    continue
                print >> fw, "\t".join(str(x) for x in \
                            (seqid, start - 1, end, accn + '-d'))
        fw.close()

    # Uniqify the insertions and count occurrences
    countbedfile = pf + ".uniq.bed"
    if need_update(ibedfile, countbedfile):
        bed = Bed(ibedfile)
        fw = open(countbedfile, "w")
        logging.debug("Write counts to `{0}`.".format(countbedfile))
        registry = Counter((x.seqid, x.start, x.end) for x in bed)
        ies_id = 1
        for (seqid, start, end), count in registry.items():
            # Name encodes a running id and the supporting read count
            ies_name = "{0:05d}-r{1}".format(ies_id, count)
            if count < opts.mindepth:
                continue
            print >> fw, "\t".join(str(x) for x in \
                            (seqid, start - 1, end, ies_name))
            ies_id += 1
        fw.close()
        sort([countbedfile, "-i", sort_tmpdir])

    # Remove deletions that contain some read depth
    depthbedfile = pf + ".depth.bed"
    if need_update((sortedbedfile, countbedfile), depthbedfile):
        depth([sortedbedfile, countbedfile, "--outfile={0}".format(depthbedfile)])

    validbedfile = pf + ".valid.bed"
    if need_update(depthbedfile, validbedfile):
        fw = open(validbedfile, "w")
        logging.debug("Filter valid deletions to `{0}`.".format(validbedfile))
        bed = Bed(depthbedfile)
        all_scores = [float(b.score) for b in bed]
        # Only the upper bound is applied; the lower bound is just logged
        lb, ub = outlier_cutoff(all_scores)
        logging.debug("Bounds for depths: LB={0:.2f} (ignored) UB={1:.2f}".format(lb, ub))
        for b in bed:
            if float(b.score) > ub:
                continue
            print >> fw, b
        fw.close()

    # Remove deletions that contain sequencing gaps on its flanks
    selectedbedfile = pf + ".selected.bed"
    if need_update(validbedfile, selectedbedfile):
        flanksbedfile = pf + ".flanks.bed"
        fw = open(flanksbedfile, "w")
        bed = Bed(validbedfile)
        flank = 100
        logging.debug("Write deletion flanks to `{0}`.".format(flanksbedfile))
        for b in bed:
            # Emit the first and last `flank` bases of each deletion, each
            # clipped so it never extends beyond the deletion itself
            start, end = b.start, b.end
            b.start, b.end = start, min(start + flank - 1, end)
            print >> fw, b
            b.start, b.end = max(start, end - flank + 1), end
            print >> fw, b
        fw.close()

        # Names of deletions whose flanks touch a gap ...
        intersectidsfile = pf + ".intersect.ids"
        cmd = "intersectBed -a {0} -b {1}".format(flanksbedfile, gapsbedfile)
        cmd += " | cut -f4 | sort -u"
        sh(cmd, outfile=intersectidsfile)
        # ... are passed to some() with "-v"
        some([validbedfile, intersectidsfile, "-v",
                "--outfile={0}".format(selectedbedfile)])

    # Find best-scoring non-overlapping set
    iesbedfile = pf + ".ies.bed"
    if need_update(selectedbedfile, iesbedfile):
        bed = Bed(selectedbedfile)
        fw = open(iesbedfile, "w")
        logging.debug("Write IES to `{0}`.".format(iesbedfile))
        # Score of each range is the integer after the last "r" in its name
        # (the read count written in the counting stage above)
        branges = [Range(x.seqid, x.start, x.end, int(x.accn.rsplit("r")[-1]), i) \
                        for i, x in enumerate(bed)]
        iranges, iscore = range_chain(branges)
        logging.debug("Best chain score: {0} ({1} IES)".\
                        format(iscore, len(iranges)))
        ies_id = 1
        for seqid, start, end, score, id in iranges:
            ies_name = "IES-{0:05d}-r{1}".format(ies_id, score)
            span = end - start + 1
            print >> fw, "\t".join(str(x) for x in \
                            (seqid, start - 1, end, ies_name, span))
            ies_id += 1
        fw.close()
def snap(args):
    """
    %prog snap species gffile fastafile

    Train SNAP model given gffile and fastafile. Whole procedure taken from:
    <http://gmod.org/wiki/MAKER_Tutorial_2012>
    """
    p = OptionParser(snap.__doc__)
    p.set_home("maker")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    species, gffile, fastafile = args
    mhome = opts.maker_home

    # All work happens inside ./snap; inputs are addressed as ../<file>
    snapdir = "snap"
    mkdir(snapdir)
    startdir = os.getcwd()
    os.chdir(snapdir)

    # Step 1: GFF3 annotation + "##FASTA" marker + sequence in one file
    newgffile = "training.gff3"
    combine_cmds = (
        "cat ../{0} > {1}".format(gffile, newgffile),
        'echo "##FASTA" >> {0}'.format(newgffile),
        "cat ../{0} >> {1}".format(fastafile, newgffile),
    )
    logging.debug("Construct GFF file combined with sequence ...")
    for cmd in combine_cmds:
        sh(cmd)

    # Step 2: maker2zff -> fathom (categorize, export) -> forge -> assembler
    snap_exe = "{0}/exe/snap".format(mhome)
    model_cmds = (
        "{0}/src/bin/maker2zff training.gff3".format(mhome),
        "{0}/fathom -categorize 1000 genome.ann genome.dna".format(snap_exe),
        "{0}/fathom -export 1000 -plus uni.ann uni.dna".format(snap_exe),
        "{0}/forge export.ann export.dna".format(snap_exe),
        "{0}/hmm-assembler.pl {1} . > {1}.hmm".format(snap_exe, species),
    )
    logging.debug("Make models ...")
    for cmd in model_cmds:
        sh(cmd)

    os.chdir(startdir)
    logging.debug("SNAP matrix written to `{0}/{1}.hmm`".format(
        snapdir, species))
if tag == "all": sh("qdel -u {0}".format(username)) return valid_jobids = set() method = opts.method or guess_method(tag) if method == "jobid": jobids = tag.split(",") valid_jobids |= set(jobids) elif method == "pattern": qsxmlcmd = 'qstat -u "{0}" -j "{1}" -nenv -njd -xml'.\ format(username, tag) try: qsxml = check_output(shlex.split(qsxmlcmd)).strip() except CalledProcessError, e: qsxml = None logging.debug('No jobs matching the pattern "{0}"'.format(tag)) if qsxml is not None: for job in ET.fromstring(qsxml).findall("djob_info"): for elem in job.findall("element"): jobid = elem.find("JB_job_number").text valid_jobids.add(jobid) if valid_jobids: sh("qdel {0}".format(",".join(valid_jobids))) if __name__ == '__main__': main()
def do_cleanup(minibam, realignbam):
    """Remove every file whose name starts with `minibam` or `realignbam`."""
    patterns = ["{}*".format(prefix) for prefix in (minibam, realignbam)]
    sh("rm -f " + " ".join(patterns))
def augustus(args):
    """
    %prog augustus species gffile fastafile

    Train AUGUSTUS model given gffile and fastafile. Whole procedure taken from:
    <http://www.molecularevolution.org/molevolfiles/exercises/augustus/training.html>
    """
    p = OptionParser(augustus.__doc__)
    p.add_option("--autotrain", default=False, action="store_true",
                 help="Run autoAugTrain.pl to iteratively train AUGUSTUS")
    p.set_home("augustus")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    species, gffile, fastafile = args
    mhome = opts.augustus_home
    augdir = "augustus"

    cwd = os.getcwd()
    mkdir(augdir)
    os.chdir(augdir)

    # The species directory inside the AUGUSTUS config tree; it is copied
    # into ./augustus/ at the very end.
    target = "{0}/config/species/{1}".format(mhome, species)

    try:
        if op.exists(target):
            logging.debug("Removing existing target `{0}`".format(target))
            sh("rm -rf {0}".format(target))

        config_path = "{0}/config".format(mhome)
        sh("{0}/scripts/new_species.pl --species={1} --AUGUSTUS_CONFIG_PATH={2}".
           format(mhome, species, config_path))
        sh("{0}/scripts/gff2gbSmallDNA.pl ../{1} ../{2} 1000 raw.gb".
           format(mhome, gffile, fastafile))
        # etraining's stderr is captured to train.err ...
        sh("{0}/bin/etraining --species={1} raw.gb 2> train.err".
           format(mhome, species))
        # ... and the sequence names it mentions are extracted as bad genes.
        # Raw string: `\S` belongs to the perl regex and is not a valid
        # Python escape sequence.
        sh(r"cat train.err | perl -pe 's/.*in sequence (\S+): .*/$1/' > badgenes.lst")
        sh("{0}/scripts/filterGenes.pl badgenes.lst raw.gb > training.gb".
           format(mhome))
        sh("grep -c LOCUS raw.gb training.gb")

        # autoAugTrain failed to execute, disable for now
        if opts.autotrain:
            sh("rm -rf {0}".format(target))
            sh("{0}/scripts/autoAugTrain.pl --trainingset=training.gb --species={1}".
               format(mhome, species))
    finally:
        # Always return to the starting directory, even if a step failed
        os.chdir(cwd)

    sh("cp -r {0} augustus/".format(target))
def mito(args):
    """
    %prog mito chrM.fa input.bam

    Identify mitochondrial deletions.
    """
    p = OptionParser(mito.__doc__)
    p.set_aws_opts(store="hli-mv-data-science/htang/mito-deletions")
    p.add_option("--realignonly", default=False, action="store_true",
                 help="Realign only")
    p.add_option("--svonly", default=False, action="store_true",
                 help="Run Realign => SV calls only")
    p.add_option("--support", default=1, type="int",
                 help="Minimum number of supporting reads")
    p.set_home("speedseq", default="/mnt/software/speedseq/bin")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    chrMfa, bamfile = args
    store = opts.output_path
    cleanup = not opts.nocleanup

    if not op.exists(chrMfa):
        logging.debug("File `{}` missing. Exiting.".format(chrMfa))
        return

    # A FASTA index (.fai) is produced by `samtools faidx`; `samtools index`
    # only handles alignment files and would not create it.
    chrMfai = chrMfa + ".fai"
    if not op.exists(chrMfai):
        cmd = "samtools faidx {}".format(chrMfa)
        sh(cmd)

    # Second argument is either one BAM or a text file listing one per line
    if not bamfile.endswith(".bam"):
        with open(bamfile) as fp:
            # Blank lines are skipped so they are not treated as BAM paths
            bamfiles = [x.strip() for x in fp if x.strip()]
    else:
        bamfiles = [bamfile]

    if store:
        # Samples that already have a .depth file in the store are skipped;
        # samples are matched on the basename up to the first "."
        computed = set(
            op.basename(x).split('.')[0]
            for x in ls_s3(store) if x.endswith(".depth")
        )
        remaining_samples = [
            x for x in bamfiles
            if op.basename(x).split(".")[0] not in computed
        ]

        logging.debug("Already computed on `{}`: {}".format(
            store, len(bamfiles) - len(remaining_samples)))
        bamfiles = remaining_samples

    logging.debug("Total samples: {}".format(len(bamfiles)))

    for bamfile in bamfiles:
        run_mito(chrMfa, bamfile, opts,
                 realignonly=opts.realignonly,
                 svonly=opts.svonly,
                 store=store, cleanup=cleanup)
def jellyfish(args):
    """
    %prog jellyfish [*.fastq|*.fasta]

    Run jellyfish to dump histogram to be used in kmer.histogram().
    """
    from jcvi.apps.base import getfilesize
    from jcvi.utils.cbook import human_size

    p = OptionParser(jellyfish.__doc__)
    p.add_option("-K", default=23, type="int",
                 help="K-mer size [default: %default]")
    p.add_option("--coverage", default=40, type="int",
                 help="Expected sequence coverage [default: %default]")
    p.add_option("--prefix", default="jf",
                 help="Database prefix [default: %default]")
    p.add_option("--nohist", default=False, action="store_true",
                 help="Do not print histogram [default: %default]")
    p.set_home("jellyfish")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastqfiles = args
    K = opts.K
    coverage = opts.coverage

    totalfilesize = sum(getfilesize(x) for x in fastqfiles)
    fq = fastqfiles[0]
    pf = opts.prefix
    # Compression is decided from the first input file only
    gzip = fq.endswith(".gz")

    # Hash size heuristic: total input bytes over expected coverage.
    # Integer division keeps `-s` a whole number under true division too;
    # the floor of 1 guards against a zero size for tiny inputs.
    hashsize = max(1, int(totalfilesize // coverage))
    logging.debug("Total file size: {0}, hashsize (-s): {1}".
                  format(human_size(totalfilesize,
                                    a_kilobyte_is_1024_bytes=True), hashsize))

    jfpf = "{0}-K{1}".format(pf, K)
    jfdb = jfpf
    fastqfiles = " ".join(fastqfiles)

    jfcmd = op.join(opts.jellyfish_home, "jellyfish")
    cmd = jfcmd
    cmd += " count -t {0} -C -o {1}".format(opts.cpus, jfpf)
    cmd += " -s {0} -m {1}".format(hashsize, K)
    if gzip:
        # Stream the decompressed reads through stdin
        cmd = "gzip -dc {0} | ".format(fastqfiles) + cmd + " /dev/fd/0"
    else:
        cmd += " " + fastqfiles

    # NOTE(review): `fastqfiles` is the space-joined string at this point, so
    # with several inputs need_update() is handed a name that is not a real
    # path -- confirm whether the original file list was intended here.
    if need_update(fastqfiles, jfdb):
        sh(cmd)

    if opts.nohist:
        return

    jfhisto = jfpf + ".histogram"
    cmd = jfcmd + " histo -t 64 {0} -o {1}".format(jfdb, jfhisto)

    if need_update(jfdb, jfhisto):
        sh(cmd)
def main(args):
    """
    %prog deltafile

    Plot one query. Extract the references that have major matches to this
    query. Control "major" by option --refcov.
    """
    p = OptionParser(main.__doc__)
    p.add_option("--refids", help="Use subset of contigs in the ref")
    p.add_option(
        "--refcov",
        default=0.01,
        type="float",
        help="Minimum reference coverage",
    )
    p.add_option(
        "--all",
        default=False,
        action="store_true",
        help="Plot one pdf file per ref in refidsfile",
    )
    p.add_option(
        "--color",
        default="similarity",
        choices=("similarity", "direction", "none"),
        help="Color the dots based on",
    )
    p.add_option(
        "--nolayout",
        default=False,
        action="store_true",
        help="Do not rearrange contigs",
    )
    p.set_align(pctid=0, hitlen=0)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (deltafile,) = args
    # The first line of the delta file holds the reference and query FASTA
    # paths; read it and close the handle right away.
    with open(deltafile) as fp:
        reffasta, queryfasta = fp.readline().split()
    color = opts.color
    layout = not opts.nolayout
    prefix = op.basename(deltafile).split(".")[0]
    qsizes = Sizes(queryfasta).mapping
    rsizes = Sizes(reffasta).mapping

    # Restrict to the requested reference ids, or use every reference
    refs = SetFile(opts.refids) if opts.refids else set(rsizes.keys())
    refcov = opts.refcov
    pctid = opts.pctid
    hitlen = opts.hitlen
    # From here on `deltafile` is whatever filter() returns
    deltafile = filter(
        [deltafile, "--pctid={0}".format(pctid), "--hitlen={0}".format(hitlen)]
    )

    if opts.all:
        # One plot per reference, renamed to <ref>.pdf when one is produced
        for r in refs:
            pdffile = plot_some_queries(
                [r],
                qsizes,
                rsizes,
                deltafile,
                refcov,
                prefix=prefix,
                color=color,
                layout=layout,
            )
            if pdffile:
                sh("mv {0} {1}.pdf".format(pdffile, r))
    else:
        plot_some_queries(
            refs,
            qsizes,
            rsizes,
            deltafile,
            refcov,
            prefix=prefix,
            color=color,
            layout=layout,
        )
def run_mito(chrMfa, bamfile, opts, realignonly=False, svonly=False,
             store=None, cleanup=False):
    """
    Process one BAM: extract the chrM mini-BAM, realign with speedseq, then
    compute depth, SV calls and piledriver output.

    Each step is skipped when its output is already up to date. When `store`
    is set, the depth, VCF and piledriver files are pushed there.
    `realignonly` stops after realignment; `svonly` stops after SV calling.
    `cleanup` removes files prefixed by the mini-BAM / realigned BAM names
    at the point where the function finishes (not on the realignonly exit).
    """
    from jcvi.formats.sam import get_minibam

    region = "chrM"
    region_suffix = ".{}.bam".format(region)
    minibam = op.basename(bamfile).replace(".bam", region_suffix)
    if op.exists(minibam):
        logging.debug("{} found. Skipped.".format(minibam))
    else:
        get_minibam(bamfile, region)

    speedseq_bin = op.join(opts.speedseq_home, "speedseq")

    # All downstream outputs share the "<minibam stem>.realign" prefix
    realign = minibam.rsplit(".", 1)[0] + ".realign"
    realignbam = realign + ".bam"
    # Options shared by `speedseq realign` and `speedseq sv`
    margs = " -v -t {} -o {}".format(opts.cpus, realign)

    if not need_update(minibam, realignbam):
        logging.debug("{} found. Skipped.".format(realignbam))
    else:
        sh("{} realign{} {} {}".format(speedseq_bin, margs, chrMfa, minibam))

    if realignonly:
        return

    # Depth profile
    depthfile = realign + ".depth"
    if need_update(realignbam, depthfile):
        coverage_args = [chrMfa, realignbam, "--nosort", "--format=coverage"]
        coverage_args.append("--outfile={}".format(depthfile))
        coverage(coverage_args)

    if store:
        push_to_s3(store, depthfile)

    # Structural variant calls
    vcffile = realign + ".sv.vcf.gz"
    if not need_update(realignbam, vcffile):
        logging.debug("{} found. Skipped.".format(vcffile))
    else:
        discordants = realign + ".discordants.bam"
        splitters = realign + ".splitters.bam"
        sh("{} sv{} -R {} -m {} -B {} -D {} -S {}".format(
            speedseq_bin, margs, chrMfa, opts.support,
            realignbam, discordants, splitters))

    if store:
        push_to_s3(store, vcffile)

    if svonly:
        if cleanup:
            do_cleanup(minibam, realignbam)
        return

    # Pileup summary
    piledriver = realign + ".piledriver"
    if need_update(realignbam, piledriver):
        pile_cmd = "bamtools piledriver -fasta {} -in {}".format(
            chrMfa, realignbam)
        sh(pile_cmd, outfile=piledriver)

    if store:
        push_to_s3(store, piledriver)

    if cleanup:
        do_cleanup(minibam, realignbam)
def info(args):
    """
    %prog info casfile <fastafile>

    Wraps around `assembly_info` and get the following block.

    General info:
    Read info:
    Coverage info:

    In particular, the read info will be reorganized so that it shows the
    percentage of unmapped, mapped, unique and multi-hit reads.

    When --coverage is used, the program expects a second fastafile to replace
    the contig IDs with real ones.

    RPKM = 10^9 x C / NL, which is really just simply C/N

    C = the number of mappable reads that felt onto the gene's exons
    N = total number of mappable reads in the experiment
    L = the sum of the exons in base pairs.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    # NOTE(review): this block was recovered from a whitespace-collapsed
    # source; the nesting of the `srow.startswith(...)` checks relative to
    # `if inreadblock:` should be confirmed against the canonical file.
    from jcvi.utils.cbook import percentage

    p = OptionParser(info.__doc__)
    p.add_option("--coverage", default=False, action="store_true",
            help="Generate coverage output, replacing IDs [default: %default]")
    set_grid(p)

    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    casfile = args[0]
    pf = casfile.rsplit(".", 1)[0]

    if opts.coverage:
        assert len(args) == 2, "You need a fastafile when using --coverage"
        coveragefile = pf + ".coverage"
        # NOTE(review): `fw` is never explicitly closed in this function
        fw = open(coveragefile, "w")

    # The assembly_info report is cached as <prefix>.info
    infofile = pf + ".info"
    cmd = "assembly_info {0}".format(casfile)
    if not op.exists(infofile):
        sh(cmd, outfile=infofile, grid=opts.grid)

    inreadblock = False
    incontigblock = False

    fp = open(infofile)
    row = fp.readline()
    while row:
        if row.startswith("Read info:"):
            inreadblock = True
        elif row.startswith("Contig info:"):
            incontigblock = True

        # Following looks like a hack, but to keep compatible between
        # CLC 3.20 and CLC 4.0 beta
        if inreadblock:
            # Take the first token after the last letter "s" on the line as
            # the count; lines without an "s" default to "0"
            atoms = row.split('s')
            last = atoms[-1].split()[0] if len(atoms) > 1 else "0"

        srow = row.strip()

        if srow.startswith("Reads"):
            reads = int(last)
        if srow.startswith("Unmapped") or srow.startswith("Unassembled"):
            unmapped = int(last)
        if srow.startswith("Mapped") or srow.startswith("Assembled"):
            mapped = int(last)
        if srow.startswith("Multi"):
            multihits = int(last)

        if row.startswith("Coverage info:"):
            # Print the Read info: block
            print "Read info:"
            assert mapped + unmapped == reads

            unique = mapped - multihits
            print
            print "Total reads: {0}".format(reads)
            print "Unmapped reads: {0}".format(
                    percentage(unmapped, reads, False))
            print "Mapped reads: {0}".format(
                    percentage(mapped, reads, False))
            print "Unique reads: {0}".format(
                    percentage(unique, reads, False))
            print "Multi hit reads: {0}".\
                    format(percentage(multihits, reads, False))
            print
            inreadblock = False

        if incontigblock and opts.coverage:

            fastafile = args[1]
            s = Sizes(fastafile)
            # This inner loop consumes the rest of the file, so the outer
            # loop ends right after it
            while row:
                atoms = row.split()
                if len(atoms) == 4 and atoms[0][0] != "C":  # Contig
                    # Contig Sites Reads Coverage
                    contig, sites, reads, coverage = atoms
                    # Contig numbers in the report are 1-based indices into
                    # the fasta file's records
                    contig = int(contig) - 1
                    size = s.sizes[contig]
                    contig = s.ctgs[contig]
                    assert size == int(sites)

                    # See formula above
                    rpkm = 1e9 * int(reads) / (size * mapped)
                    print >> fw, "\t".join(
                            (contig, sites, reads, "{0:.1f}".format(rpkm)))

                row = fp.readline()

        row = fp.readline()
def anneal(args):
    """
    %prog anneal agpfile contigs.fasta

    Merge adjacent overlapping contigs and make new AGP file.

    By default it will also anneal lines like these together (unless --nozipshreds):
    scaffold4 1 1608 1 W ca-bacs.5638.frag11.22000-23608 1 1608 -
    scaffold4 1609 1771 2 N 163 scaffold yes paired-ends
    scaffold4 1772 3771 3 W ca-bacs.5638.frag10.20000-22000 1 2000 -

    These are most likely shreds, which we look for based on names.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(anneal.__doc__)
    p.set_align(pctid=GoodPct, hitlen=GoodOverlap)
    p.add_option("--hang", default=GoodOverhang, type="int",
                 help="Maximum overhang length")
    p.set_outdir(outdir="outdir")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    agpfile, contigs = args
    outdir = opts.outdir
    # First run only: split the contigs FASTA into per-sequence files
    if not op.exists(outdir):
        mkdir(outdir)
        cmd = "faSplit byname {0} {1}/".format(contigs, outdir)
        sh(cmd)

    cutoff = Cutoff(opts.pctid, opts.hitlen, opts.hang)
    logging.debug(str(cutoff))

    agp = AGP(agpfile)
    # Pairwise overlaps are cached in <agpfile stem>.blast
    blastfile = agpfile.replace(".agp", ".blast")
    if not op.exists(blastfile):
        populate_blastfile(blastfile, agp, outdir, opts)

    assert op.exists(blastfile)
    logging.debug("File `{0}` found. Start loading.".format(blastfile))
    blast = BlastSlow(blastfile).to_dict()

    annealedagp = "annealed.agp"
    annealedfasta = "annealed.fasta"

    # Edits go to a copy so iteration over `agp` is not disturbed
    newagp = deepcopy(agp)
    # component_id -> CLR object, created on first encounter and shared
    # across every pair the component takes part in
    clrstore = {}
    for a, b, qreverse in agp.iter_paired_components():
        aid = a.component_id
        bid = b.component_id

        pair = (aid, bid)
        if pair in blast:
            bl = blast[pair]
        else:
            # Not in the cache: compute the overlap now; skip the pair when
            # overlap() returns nothing
            oopts = get_overlap_opts(aid, bid, qreverse, outdir, opts)
            o = overlap(oopts)
            if not o:
                continue
            bl = o.blastline

        o = Overlap(bl, a.component_span, b.component_span,
                        cutoff, qreverse=qreverse)

        if aid not in clrstore:
            clrstore[aid] = CLR.from_agpline(a)
        if bid not in clrstore:
            clrstore[bid] = CLR.from_agpline(b)

        aclr, bclr = clrstore[aid], clrstore[bid]

        o.print_graphic()
        if o.anneal(aclr, bclr):
            newagp.delete_between(aid, bid, verbose=True)

        if o.otype == 2:  # b ~ a
            # Retry with the roles of the two components swapped
            o = o.swapped
            o.print_graphic()
            if o.anneal(bclr, aclr):
                newagp.switch_between(bid, aid, verbose=True)
                newagp.delete_between(bid, aid, verbose=True)

    logging.debug("A total of {0} components with modified CLR.".\
                    format(len(clrstore)))

    # Components whose CLR is no longer valid are converted to gaps
    for cid, c in clrstore.items():
        if c.is_valid:
            continue
        print("Remove {0}".format(c), file=sys.stderr)
        newagp.convert_to_gap(cid, verbose=True)

    # Update all ranges that has modified clr
    for a in newagp:
        if a.is_gap:
            continue
        aid = a.component_id
        if aid in clrstore:
            c = clrstore[aid]
            a.component_beg = c.start
            a.component_end = c.end

    newagp.print_to_file(annealedagp)
    tidyagp = tidy([annealedagp, contigs])

    build([tidyagp, contigs, annealedfasta])
    return annealedfasta
def eagle(args):
    """
    %prog eagle fastafile

    Run the EAGLE simulator against `fastafile`, then convert the resulting
    BAM into FASTQ files that are moved up into the starting directory.
    """
    # Local imports: `re` is only needed for the RunInfo rewrite below.
    import re
    from jcvi.formats.sam import fastq

    p = OptionParser(eagle.__doc__)
    p.add_option("--share", default="/usr/local/share/EAGLE/",
                 help="Default EAGLE share path")
    add_sim_options(p)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    share = opts.share
    depth = opts.depth
    readlen = opts.readlen
    distance = opts.distance
    pf = op.basename(fastafile).split(".")[0]

    # Since EAGLE does not natively support read length other than 100bp and
    # 250bp - for an arbitrary read length we need to generate a bunch of
    # support files

    # First file is the Runinfo
    runinfo_readlen = \
        "RunInfo_PairedReads2x{}Cycles1x1Tiles.xml".format(readlen)
    if not op.exists(runinfo_readlen):
        runinfo = op.join(
            share, "RunInfo/RunInfo_PairedReads2x251Cycles1x1Tiles.xml")
        with open(runinfo) as fp:
            runinfo_xml = fp.read()
        # Substitute all three cycle counts in a single pass. Chained
        # str.replace() calls would rewrite an already-substituted value,
        # e.g. a read length of 252 would be hit again by the "252" rule.
        cycles = {
            "251": str(readlen),
            "252": str(readlen + 1),
            "502": str(2 * readlen),
        }
        runinfo_xml = re.sub("251|252|502",
                             lambda m: cycles[m.group(0)], runinfo_xml)
        with open(runinfo_readlen, "w") as fw:
            fw.write(runinfo_xml.strip() + "\n")

    # Generate quality profiles
    quality_file1 = "QualityTable.read1.length{}.qval".format(readlen)
    quality_file2 = "QualityTable.read2.length{}.qval".format(readlen)
    if not (op.exists(quality_file1) and op.exists(quality_file2)):
        for i, qq in enumerate([quality_file1, quality_file2]):
            default_table = op.join(
                share,
                "QualityTables/DefaultQualityTable.read{}.length101.qval"
                .format(i + 1))
            cmd = "/usr/local/libexec/EAGLE/scaleQualityTable.pl"
            cmd += " --input {}".format(default_table)
            cmd += " --cycles {}".format(readlen)
            cmd += " --output {}".format(qq)
            sh(cmd, silent=True)

    # Since distance is different from the default distribution which is
    # centered around 319, we shift our peak to the new peak
    template_lengths = op.join(
        share, "TemplateLengthTables/DefaultTemplateLengthTable.tsv")
    template_distance = "TemplateLengthTable{}.tsv".format(distance)
    shift = distance - 319
    if not op.exists(template_distance):
        with open(template_lengths) as fp, \
                open(template_distance, "w") as fw:
            for row in fp:
                size, counts = row.split()
                size = int(size)
                counts = int(counts)
                size += shift
                # Templates shorter than a read are dropped from the table.
                if size < readlen:
                    continue
                fw.write("{}\t{}\n".format(size, counts))

    # All done, let's simulate!
    gc_table = op.join(share, "GcCoverageFitTables/Homo_sapiens.example1.tsv")
    cmd = "configureEAGLE.pl"
    cmd += " --reference-genome {}".format(fastafile)
    cmd += " --coverage-depth {}".format(depth)
    cmd += " --gc-coverage-fit-table {}".format(gc_table)
    cmd += " --run-info {}".format(runinfo_readlen)
    cmd += " --quality-table {}".format(quality_file1)
    cmd += " --quality-table {}".format(quality_file2)
    cmd += " --template-length-table {}".format(template_distance)
    cmd += " --random-seed {}".format(random.randint(1, 65535))
    sh(cmd, silent=True)

    # Retrieve results
    outpf = opts.outfile or "{0}.{1}bp.{2}x".format(pf, distance, depth)
    outpf += ".bwa"
    cwd = os.getcwd()
    eagle_dir = "EAGLE"
    os.chdir(eagle_dir)
    try:
        sh("make bam", silent=True)

        # Convert BAM to FASTQ
        a, b = fastq(["eagle.bam", outpf])
        sh("mv {} {} ../".format(a, b))
    finally:
        # Always return to the starting directory, even when a step fails.
        os.chdir(cwd)

    # Clean-up (only reached on success, so a failed run can be inspected)
    shutil.rmtree(eagle_dir)
def assemble(args):
    """
    Run `cap3` on a single multi FASTA file containing reads or a folder containing several
    multi FASTA files. Allows for tweaking of `cap3` parameters max_gap_len, ovl_pct_id, etc.
    """
    p = OptionParser(assemble.__doc__)
    g1 = OptionGroup(
        p, "Input file options (required)",
        "Note: Please choose from and provide values for one of the following parameters")
    g1.add_option("--input_file", default=None,
                  help="input file of reads [default: %default]")
    g1.add_option(
        "--input_folder", default=None,
        help="input folder containing multi FASTA files of reads [default: %default]")
    g1.add_option(
        "--input_file_list", default=None,
        help="list file containing paths to multi FASTA files of reads [default: %default]")
    p.add_option_group(g1)

    g2 = OptionGroup(p, "Optional parameters",
                     "Note: If not specified, `cap3` defaults will be used")
    g2.add_option("-f", "--max_gap_len", default=20, type="int",
                  help="maximum gap length in any overlap [default: %default]\n" +
                       "Same as cap3 `-f` parameter.")
    g2.add_option("-p", "--ovl_pct_id", default=90, type="int",
                  help="overlap percent identity cutoff [default: %default]\n" +
                       "Same as cap3 `-p` parameter.")
    g2.add_option("-s", "--ovl_sim_score", default=900, type="int",
                  help="overlap similarity score cutoff [default: %default]\n" +
                       "Same as cap3 `-s` parameter.")
    g2.add_option("-x", "--prefix", dest="prefix", default="cap3",
                  help="prefix string for output file name [default: %default]")
    p.add_option_group(g2)

    p.set_params()

    opts, args = p.parse_args(args)

    # Compare the values directly: a truthiness guard would let 0 slip past
    # every lower bound. Invalid input exits with a non-zero status.
    if opts.max_gap_len <= 1:
        logging.error("--max_gap_len should be > 1")
        sys.exit(1)
    if opts.ovl_pct_id <= 65:
        logging.error("--ovl_pct_id should be > 65")
        sys.exit(1)
    if opts.ovl_sim_score <= 250:
        logging.error("--ovl_sim_score should be > 250")
        sys.exit(1)

    # Input sources are tried in order: list file, folder, single file.
    file_list = []
    if opts.input_file_list:
        if not op.isfile(opts.input_file_list):
            logging.error("Input file list {0} does not exist".format(
                opts.input_file_list))
            sys.exit(1)
        with open(opts.input_file_list, 'r') as f:
            # Surrounding whitespace is stripped and blank lines are skipped
            # so they are not reported as missing files.
            file_list = [line.strip() for line in f if line.strip()]
    elif opts.input_folder:
        if not op.isdir(opts.input_folder):
            logging.error("Input folder {0} does not exist".format(
                opts.input_folder))
            sys.exit(1)
        folder = opts.input_folder.rstrip('/')
        file_list = [folder + "/" + name
                     for name in os.listdir(opts.input_folder)
                     if name.lower().endswith(('.fa', '.fasta'))]
    elif opts.input_file:
        file_list = [opts.input_file]
    else:
        logging.error("Please specify one of the options for input files")
        sys.exit(not p.print_help())

    if not file_list:
        logging.warning(
            "List of files to process is empty. Please check your input!")
        sys.exit()

    for fastafile in file_list:
        # A missing file is reported and skipped; the remaining files still run.
        if not op.isfile(fastafile):
            logging.warning("Input file {0} does not exist".format(fastafile))
            continue

        cmd = "cap3 {0} -f {1} -p {2} -s {3} -x {4}".format(
            fastafile, opts.max_gap_len, opts.ovl_pct_id,
            opts.ovl_sim_score, opts.prefix)
        if opts.extra:
            cmd += " {0}".format(opts.extra)
        # The log file is written next to the input file.
        logfile = "{0}.{1}.log".format(fastafile, opts.prefix)
        sh(cmd, outfile=logfile)