def clean_fasta(args):
    """Link the cleaned genome FASTA and (re)build its interval files.

    Runs inside the directory returned by ``make_genomedir(args.species)``.
    Existing outputs are left alone unless ``args.overwrite`` is set.
    """
    os.chdir(make_genomedir(args.species))

    # Remove index files left over from earlier runs.
    for stale in ("raw.fix.fas.index", "11_genome.fas.index"):
        if op.isfile(stale):
            os.remove(stale)

    # A symlink is always dropped so that it can be re-created below.
    if op.islink("10_genome.fna"):
        os.unlink("10_genome.fna")

    keep_existing = not args.overwrite
    if keep_existing and op.isfile("10_genome.fna"):
        logging.debug("10_genome.fna already exits: skipped")
    elif op.isfile("08_seq_map/renamed.fna"):
        sh("ln -sf 08_seq_map/renamed.fna 10_genome.fna")
        if op.isfile("08_seq_map/renamed.sizes"):
            sh("ln -sf 08_seq_map/renamed.sizes 10_genome.sizes")
    else:
        logging.error("08_seq_map/renamed.fna not there")
        sys.exit(1)

    if not op.isdir("15_intervals"):
        mkdir("15_intervals")

    # (target file, message when skipped, command that builds the target)
    interval_steps = (
        ("15_intervals/01.chrom.bed",
         "01.chrom.bed already exits - skipped",
         "fasta size --bed 10_genome.fna > 15_intervals/01.chrom.bed"),
        ("15_intervals/01.chrom.sizes",
         "01.chrom.sizes already exits - skipped",
         "faSize -detailed 10_genome.fna > 15_intervals/01.chrom.sizes"),
        ("15_intervals/11.gap.bed",
         "11.gap.bed already exits - skipped",
         "fasta gaps 10_genome.fna > 15_intervals/11.gap.bed"),
    )
    for target, skip_message, command in interval_steps:
        if op.isfile(target) and not args.overwrite:
            logging.debug(skip_message)
        else:
            sh(command)
def fpkm(args):
    """
    %prog fpkm fastafile *.bam

    Calculate FPKM values from BAM file.
    """
    # args: command-line tokens; the first positional is the FASTA file and
    # the remaining ones are BAM files handed to cuffdiff.
    p = OptionParser(fpkm.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    fastafile = args[0]
    bamfiles = args[1:]

    # Create a DUMMY gff file for cuffdiff
    gffile = fastafile.rsplit(".", 1)[0] + ".gff"
    if need_update(fastafile, gffile):
        f = Fasta(fastafile, lazy=True)
        # `with` closes the handle even if iterating the FASTA fails, and
        # `write` works on both Python 2 and 3 (unlike `print >> fw`).
        with open(gffile, "w") as fw:
            for key, size in f.itersizes_ordered():
                fields = (key, "dummy", "transcript", 1, size,
                          ".", ".", ".", "ID=" + key)
                fw.write("\t".join(str(x) for x in fields) + "\n")
        logging.debug("Dummy GFF created: {0}".format(gffile))

    cmd = "cuffdiff {0} {1}".format(gffile, " ".join(bamfiles))
    sh(cmd)
def fastq(args):
    """
    %prog fastq bamfile prefix

    Convert BAM files to paired FASTQ files.
    """
    p = OptionParser(fastq.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, pf = args
    read1, read2 = pf + ".read1.fastq", pf + ".read2.fastq"
    orphans = pf + ".se.fastq"

    # Collate mates next to each other, then split them into the output files.
    collate = "samtools collate -uOn 128 {} tmp-prefix".format(bamfile)
    split = " | samtools fastq -s {} -1 {} -2 {} -".format(orphans, read1, read2)
    sh(collate + split)

    # Drop the singleton file when nothing was written to it.
    if not os.stat(orphans).st_size:
        os.remove(orphans)
    return read1, read2
def run_megablast(infile=None, outfile=None, db=None, wordsize=None,
                  pctid=98, hitlen=100, best=None, evalue=0.01,
                  task="megablast", cpus=16):
    """Run ``blastn`` of `infile` against `db`, then filter the hits in place.

    The tabular output is written to `outfile`; when both `pctid` and `hitlen`
    are set the output is passed through ``run_blast_filter`` and replaced by
    the filtered result.
    """
    assert db, "Need to specify database fasta file."

    db = get_abs_path(db)
    # Prefer the `.00.nin` index name when that file exists.
    volume_index = db + ".00.nin"
    nin = volume_index if op.exists(volume_index) else (db + ".nin")
    run_formatdb(infile=db, outfile=nin)

    parts = [
        "blastn",
        " -query {0} -db {1} -out {2}".format(infile, db, outfile),
        " -evalue {0} -outfmt 6 -num_threads {1}".format(evalue, cpus),
        " -task {0}".format(task),
    ]
    if wordsize:
        parts.append(" -word_size {0}".format(wordsize))
    if pctid:
        parts.append(" -perc_identity {0}".format(pctid))
    if best:
        parts.append(" -max_target_seqs {0}".format(best))
    sh("".join(parts))

    if pctid and hitlen:
        filtered = outfile + ".P{0}L{1}".format(pctid, hitlen)
        run_blast_filter(infile=outfile, outfile=filtered,
                         pctid=pctid, hitlen=hitlen)
        shutil.move(filtered, outfile)
def consensus(args):
    """
    %prog consensus fastafile bamfile

    Convert bam alignments to consensus FASTQ/FASTA.
    """
    # Options are registered on the local parser `p` and read from `opts`;
    # after parse_args() `args` is the list of positional arguments.
    p = OptionParser(consensus.__doc__)
    p.add_option("--fasta", default=False, action="store_true",
                 help="Generate consensus FASTA sequences [default: %default]")
    p.add_option("--mask", default=0, type="int",
                 help="Mask bases with quality lower than")
    opts, args = p.parse_args(args)

    # Exactly two positionals are unpacked below.
    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, bamfile = args
    fasta = opts.fasta
    suffix = "fasta" if fasta else "fastq"
    pf = bamfile.rsplit(".", 1)[0]
    cnsfile = pf + ".cns.{0}".format(suffix)
    vcfgzfile = pf + ".vcf.gz"
    vcf([fastafile, bamfile, "-o", vcfgzfile])

    # Start the pipeline here (the command must be assigned before `+=`).
    cmd = "zcat {0} | vcfutils.pl vcf2fq".format(vcfgzfile)
    if fasta:
        cmd += " | seqtk seq -q {0} -A -".format(opts.mask)

    sh(cmd, outfile=cnsfile)
def push_to_s3(s3_store, obj_name):
    """Upload `obj_name` below `s3_store` and return the address used."""
    # Directories are uploaded with `sync`, anything else with `cp`.
    action = "sync" if op.isdir(obj_name) else "cp"
    s3address = s3ify("{0}/{1}".format(s3_store, obj_name))
    sh("aws s3 {0} {1} {2} --sse".format(action, obj_name, s3address))
    return s3address
def trim(args):
    """
    %prog trim fastqfile

    Wraps `fastx_trimmer` to trim from begin or end of reads.
    """
    # Options are registered on the local parser `p` and read from `opts`;
    # after parse_args() `args` is the list of positional arguments.
    p = OptionParser(trim.__doc__)
    p.add_option("-f", dest="first", default=0, type="int",
                 help="First base to keep. Default is 1.")
    p.add_option("-l", dest="last", default=0, type="int",
                 help="Last base to keep. Default is entire read.")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    obfastqfile = op.basename(fastqfile)
    # Output goes to the current directory; a gzipped input keeps `.gz`.
    fq = obfastqfile.rsplit(".", 1)[0] + ".ntrimmed.fastq"
    if fastqfile.endswith(".gz"):
        fq = obfastqfile.rsplit(".", 2)[0] + ".ntrimmed.fastq.gz"

    cmd = "fastx_trimmer -Q33 "
    # A value of 0 means "not requested", so the flag is omitted.
    if opts.first:
        cmd += "-f {0.first} ".format(opts)
    if opts.last:
        cmd += "-l {0.last} ".format(opts)

    sh(cmd, infile=fastqfile, outfile=fq)
def consensus(args):
    """
    %prog consensus fastafile bamfile

    Convert bam alignments to consensus FASTQ/FASTA.
    """
    # Options are registered on the local parser `p` and read from `opts`;
    # after parse_args() `args` is the list of positional arguments.
    p = OptionParser(consensus.__doc__)
    p.add_option("--fasta", default=False, action="store_true",
                 help="Generate consensus FASTA sequences [default: %default]")
    p.add_option("--mask", default=0, type="int",
                 help="Mask bases with quality lower than")
    opts, args = p.parse_args(args)

    # Exactly two positionals are unpacked below.
    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, bamfile = args
    fasta = opts.fasta
    suffix = "fasta" if fasta else "fastq"
    pf = bamfile.rsplit(".", 1)[0]
    cnsfile = pf + ".cns.{0}".format(suffix)
    vcfgzfile = pf + ".vcf.gz"
    vcf([fastafile, bamfile, "-o", vcfgzfile])

    # Start the pipeline here (the command must be assigned before `+=`).
    cmd = "zcat {0} | vcfutils.pl vcf2fq".format(vcfgzfile)
    if fasta:
        cmd += " | seqtk seq -q {0} -A -".format(opts.mask)

    sh(cmd, outfile=cnsfile)
def merge(args):
    """
    %prog merge ref.fasta query.fasta *.delta

    Merge delta files into a single delta.
    """
    p = OptionParser(merge.__doc__)
    p.set_outfile(outfile="merged_results.delta")
    opts, args = p.parse_args(args)

    if len(args) < 3:
        sys.exit(not p.print_help())

    ref, query = args[:2]
    deltafiles = args[2:]
    # The output name is a parsed option; `args` is the positional list here.
    outfile = opts.outfile

    ref = get_abs_path(ref)
    query = get_abs_path(query)
    # Write the two header lines once; `write` works on Python 2 and 3.
    fw = must_open(outfile, "w")
    try:
        fw.write(" ".join((ref, query)) + "\n")
        fw.write("NUCMER\n")
    finally:
        fw.close()

    # Append every input file minus its own two header lines.
    for d in deltafiles:
        cmd = "awk 'NR > 2 {{print $0}}' {0}".format(d)
        sh(cmd, outfile=outfile, append=True)
def blat(args):
    """
    %prog blat old.fasta new.fasta

    Generate psl file using blat.
    """
    p = OptionParser(blat.__doc__)
    p.add_option("--minscore", default=100, type="int",
                 help="Matches minus mismatches gap penalty [default: %default]")
    p.add_option("--minid", default=98, type="int",
                 help="Minimum sequence identity [default: %default]")
    p.set_cpus()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    oldfasta, newfasta = args
    # Both inputs are converted; only the old genome's 2bit is used below.
    oldtwobit, newtwobit = [faToTwoBit(fastafile) for fastafile in args]

    # Use the threaded `pblat` when it is available.
    if which("pblat"):
        cmd = "pblat -threads={0}".format(opts.cpus)
    else:
        cmd = "blat"
    cmd += " {0} {1}".format(oldtwobit, newfasta)
    cmd += " -tileSize=12 -minScore={0} -minIdentity={1} ".format(
        opts.minscore, opts.minid)

    # Name the psl file after the two inputs: <new>.<old>.psl
    new_stem = op.basename(newfasta).split('.')[0]
    old_stem = op.basename(oldfasta).split('.')[0]
    pslfile = "{0}.{1}.psl".format(new_stem, old_stem)
    cmd += pslfile
    sh(cmd)
def get_minibam_bed(bamfile, bedfile, minibam=None):
    """ samtools view -L could do the work, but it is NOT random access. Here we
    are processing multiple regions sequentially. See also:

    https://www.biostars.org/p/49306/

    Returns the name of the sorted and indexed BAM holding the reads of all
    regions in `bedfile`; it is `minibam` when given, otherwise a name derived
    from `bamfile` and the BED file's prefix.
    """
    pf = op.basename(bedfile).split(".")[0]
    minibamfile = minibam or op.basename(bamfile).replace(
        ".bam", ".{}.bam".format(pf))
    # Derive the SAM name from the resolved BAM name: `minibam` itself is
    # None whenever the caller relies on the default.
    minisamfile = minibamfile.replace(".bam", ".sam")
    baifile = minibamfile + ".bai"
    if op.exists(baifile):
        sh("rm {}".format(baifile))

    # Header first, then the reads of each region are appended.
    cmd = "samtools view -H {} > {}".format(bamfile, minisamfile)
    sh(cmd)

    cmd = "cat {}".format(bedfile)
    cmd += " | perl -lane 'print \"$F[0]:$F[1]-$F[2]\"'"
    # Raw strings keep the backslashes for the shell without relying on
    # Python's handling of unknown escape sequences.
    cmd += r" | xargs -n1 -t -I \{\}"
    cmd += " samtools view {}".format(bamfile)
    cmd += r" \{\} >> " + minisamfile
    sh(cmd)

    cmd = "samtools view {} -b".format(minisamfile)
    cmd += " | samtools sort -"
    cmd += " -o {0}".format(minibamfile)

    sh(cmd)

    sh("samtools index {0}".format(minibamfile))
    return minibamfile
def first(args):
    """
    %prog first N fastqfile(s)

    Get first N reads from file.
    """
    from maize.apps.base import need_update

    p = OptionParser(first.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    N = int(args[0])
    nlines = N * 4  # the code takes four lines per read
    fastqfiles = args[1:]
    fastqfile = fastqfiles[0]
    # The output name is a parsed option; `args` is the positional list here.
    outfile = opts.outfile
    if not need_update(fastqfiles, outfile):
        logging.debug("File `{0}` exists. Will not overwrite.".format(outfile))
        return

    # Compression is decided from the first file and applied to all of them.
    gz = fastqfile.endswith(".gz")
    for fastqfile in fastqfiles:
        if gz:
            cmd = "zcat {0} | head -n {1}".format(fastqfile, nlines)
        else:
            cmd = "head -n {0} {1}".format(nlines, fastqfile)

        sh(cmd, outfile=outfile, append=True)
def get_minibam_bed(bamfile, bedfile, minibam=None):
    """ samtools view -L could do the work, but it is NOT random access. Here we
    are processing multiple regions sequentially. See also:

    https://www.biostars.org/p/49306/

    Returns the name of the sorted and indexed BAM holding the reads of all
    regions in `bedfile`; it is `minibam` when given, otherwise a name derived
    from `bamfile` and the BED file's prefix.
    """
    pf = op.basename(bedfile).split(".")[0]
    minibamfile = minibam or op.basename(bamfile).replace(
        ".bam", ".{}.bam".format(pf))
    # Derive the SAM name from the resolved BAM name: `minibam` itself is
    # None whenever the caller relies on the default.
    minisamfile = minibamfile.replace(".bam", ".sam")
    baifile = minibamfile + ".bai"
    if op.exists(baifile):
        sh("rm {}".format(baifile))

    # Header first, then the reads of each region are appended.
    cmd = "samtools view -H {} > {}".format(bamfile, minisamfile)
    sh(cmd)

    cmd = "cat {}".format(bedfile)
    cmd += " | perl -lane 'print \"$F[0]:$F[1]-$F[2]\"'"
    # Raw strings keep the backslashes for the shell without relying on
    # Python's handling of unknown escape sequences.
    cmd += r" | xargs -n1 -t -I \{\}"
    cmd += " samtools view {}".format(bamfile)
    cmd += r" \{\} >> " + minisamfile
    sh(cmd)

    cmd = "samtools view {} -b".format(minisamfile)
    cmd += " | samtools sort -"
    cmd += " -o {0}".format(minibamfile)

    sh(cmd)

    sh("samtools index {0}".format(minibamfile))
    return minibamfile
def sync_from_s3(s3_store, target_dir=None):
    """Sync `s3_store` into `target_dir` and return the directory used.

    When `target_dir` is None the last path component of the store is used.
    """
    remote = s3ify(s3_store.rstrip("/"))
    local = op.basename(remote) if target_dir is None else target_dir
    sh("aws s3 sync {}/ {}/".format(remote, local))
    return local
def build_bwa(args):
    """Build the bwa index under ``21_dbs/bwa`` of the species' genome dir.

    Skipped when ``db.bwt`` already exists, unless ``args.overwrite`` is set.
    """
    dirg, fg = get_genomedir(args.species)
    dirw = op.join(dirg, "21_dbs/bwa")
    if not op.isdir(dirw):
        mkdir(dirw)
    os.chdir(dirw)

    must_build = args.overwrite or not op.isfile("db.bwt")
    if must_build:
        sh("bwa index -a bwtsw -p %s/db %s" % (dirw, fg))
    else:
        logging.debug("db.bwt already exists - skipped")
def filter(args):
    """
    %prog filter <deltafile|coordsfile>

    Produce a new delta/coords file and filter based on id% or cov%.
    Use `delta-filter` for .delta file.
    """
    # Options are registered on the local parser `p` and read from `opts`;
    # after parse_args() `args` is the list of positional arguments.
    p = OptionParser(filter.__doc__)
    p.set_align(pctid=0, hitlen=0)
    p.add_option("--overlap", default=False, action="store_true",
                 help="Print overlap status (e.g. terminal, contained)")

    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    pctid = opts.pctid
    hitlen = opts.hitlen

    filename, = args
    # Nothing to filter on: hand back the input unchanged.
    if pctid == 0 and hitlen == 0:
        return filename

    pf, suffix = filename.rsplit(".", 1)
    outfile = "".join((pf, ".P{0}L{1}.".format(int(pctid), int(hitlen)), suffix))
    if not need_update(filename, outfile):
        return outfile

    if suffix == "delta":
        cmd = "delta-filter -i {0} -l {1} {2}".format(pctid, hitlen, filename)
        sh(cmd, outfile=outfile)
        return outfile

    fw = must_open(outfile, "w")
    try:
        with open(filename) as fp:
            for row in fp:
                try:
                    c = CoordsLine(row)
                except AssertionError:
                    # Rows that CoordsLine rejects are skipped.
                    continue

                if c.identity < pctid:
                    continue
                if c.len2 < hitlen:
                    continue
                if opts.overlap and not c.overlap:
                    continue

                outrow = row.rstrip()
                if opts.overlap:
                    ov = Overlap_types[c.overlap]
                    outrow += "\t" + ov
                # `write` works on Python 2 and 3 (unlike `print >> fw`).
                fw.write(outrow + "\n")
    finally:
        # Flush and release the output even if a row fails mid-way.
        fw.close()

    return outfile
def build_blat(args):
    """Build the blat 2bit database and ooc file under ``21_dbs/blat``.

    Skipped when ``db.2bit`` already exists, unless ``args.overwrite`` is set.
    """
    dirg, fg = get_genomedir(args.species)
    dirw = op.join(dirg, "21_dbs/blat")
    if not op.isdir(dirw):
        mkdir(dirw)
    os.chdir(dirw)

    already_built = op.isfile('db.2bit') if not args.overwrite else False
    if already_built:
        logging.debug("db.2bit already exists - skipped")
    else:
        sh("faToTwoBit %s db.2bit" % fg)
        sh("blat db.2bit tmp.fas tmp.out -makeOoc=db.2bit.tile11.ooc")

    # Clean up the scratch output of the blat run, if any.
    if op.isfile("tmp.out"):
        os.remove("tmp.out")
def fasta(args):
    """
    %prog fasta fastqfiles

    Convert fastq to fasta and qual file.
    """
    # Options are registered on the local parser `p` and read from `opts`;
    # after parse_args() `args` is the list of positional arguments.
    p = OptionParser(fasta.__doc__)
    p.add_option("--seqtk", default=False, action="store_true",
                 help="Use seqtk to convert")
    p.set_outdir()
    p.set_outfile(outfile=None)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastqfiles = args
    outdir = opts.outdir
    if outdir and outdir != ".":
        mkdir(outdir)

    # Output names are derived from the first input file.
    fastqfile = fastqfiles[0]
    pf = op.basename(fastqfile)
    gzinput = pf.endswith(".gz")
    if gzinput:
        pf = pf.rsplit(".", 1)[0]

    pf, sf = pf.rsplit(".", 1)
    if sf not in ("fq", "fastq"):
        logging.debug("Assumed FASTA: suffix not `fq` or `fastq`")
        return fastqfile, None

    fastafile, qualfile = pf + ".fasta", pf + ".qual"
    outfile = opts.outfile or fastafile
    outfile = op.join(outdir, outfile)
    if opts.seqtk:
        if need_update(fastqfiles, outfile):
            for i, fastqfile in enumerate(fastqfiles):
                cmd = "seqtk seq -A {0} -L 30 -l 70".format(fastqfile)
                # First one creates file, following ones append to it
                sh(cmd, outfile=outfile, append=i)
        else:
            logging.debug("Outfile `{0}` already exists.".format(outfile))
        return outfile, None

    for fastqfile in fastqfiles:
        SeqIO.convert(fastqfile, "fastq", fastafile, "fasta")
        SeqIO.convert(fastqfile, "fastq", qualfile, "qual")

    return fastafile, qualfile
def run_blat(infile=None, outfile=None, db="UniVec_Core", pctid=95, hitlen=50,
             cpus=16, overwrite=True):
    """Align `infile` to `db` with (p)blat and filter the hits.

    The filtered hits land in ``<outfile>.P<pctid>L<hitlen>``; with
    `overwrite` they replace `outfile` itself.
    """
    # Use the threaded `pblat` when it is available.
    if which("pblat"):
        aligner = "pblat -threads={0}".format(cpus)
    else:
        aligner = "blat"
    sh(aligner + ' {0} {1} -out=blast8 {2}'.format(db, infile, outfile))

    filtered = outfile + ".P{0}L{1}".format(pctid, hitlen)
    run_blast_filter(infile=outfile, outfile=filtered,
                     pctid=pctid, hitlen=hitlen)
    if overwrite:
        shutil.move(filtered, outfile)
def pull_from_s3(s3_store, file_name=None, overwrite=True):
    """Copy `s3_store` to `file_name` unless that path already exists.

    Returns the absolute local path. A store ending in "/" is copied
    recursively. `file_name` defaults to the last component of the store.
    """
    is_dir = s3_store.endswith("/")
    if is_dir:
        s3_store = s3_store.rstrip("/")

    if not file_name:
        file_name = s3_store.split("/")[-1]

    # An existing local copy is returned as-is.
    if op.exists(file_name):
        return op.abspath(file_name)

    s3_store = s3ify(s3_store)
    if overwrite or (not op.exists(file_name)):
        pieces = ["aws s3 cp {0} {1} --sse".format(s3_store, file_name)]
        if is_dir:
            pieces.append(" --recursive")
        sh("".join(pieces))
    return op.abspath(file_name)
def build_gatk(args):
    """Create the GATK reference (fasta, dict, fai) under ``21_dbs/gatk``.

    Skipped when ``db.dict`` already exists, unless ``args.overwrite`` is set.
    """
    dirg, fg = get_genomedir(args.species)
    dirw = op.join(dirg, "21_dbs/gatk")
    if not op.isdir(dirw):
        mkdir(dirw)
    os.chdir(dirw)

    if op.isfile("db.dict") and not args.overwrite:
        logging.debug("db.dict already exists - skipped")
        return

    # Remove leftovers before rebuilding.
    for leftover in ("db.fasta", "db.dict"):
        if op.exists(leftover):
            sh("rm " + leftover)

    for command in ("cp ../../10_genome.fna db.fasta",
                    "gatk CreateSequenceDictionary -R db.fasta",
                    "samtools faidx db.fasta"):
        sh(command)
def chainstat(args):
    """Log size statistics for the chain file ``args.fi``.

    The chain is converted to ``tmp.bed``; the total size is reported first,
    then the merged (non-redundant) size of columns 1-3 and of columns 5-7.
    """
    sh("chain 2bed %s > tmp.bed" % args.fi)

    logging.debug("total size")
    sh("bed size tmp.bed")

    merged_size = "cut -f%s tmp.bed | sortBed -i stdin | mergeBed -i stdin | bed size -"
    for label, columns in (("tgt noredundant size", "1-3"),
                           ("qry noredundant size", "5-7")):
        logging.debug(label)
        sh(merged_size % columns)
def index(args):
    """
    %prog index samfile/bamfile

    If SAM file, convert to BAM, sort and then index, using SAMTOOLS
    """
    # Options are registered on the local parser `p` and read from `opts`;
    # after parse_args() `args` is the list of positional arguments.
    p = OptionParser(index.__doc__)
    p.add_option("--fasta", dest="fasta", default=None,
                 help="add @SQ header to the BAM file [default: %default]")
    p.add_option("--unique", default=False, action="store_true",
                 help="only retain uniquely mapped reads [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        # `not ...` makes the usage error exit with a non-zero status.
        sys.exit(not p.print_help())

    samfile, = args
    cpus = opts.cpus
    fastafile = opts.fasta
    if fastafile:
        assert op.exists(fastafile), \
            "File `{0}` not found".format(fastafile)

    bamfile = samfile.replace(".sam", ".bam")
    if fastafile:
        faifile = fastafile + ".fai"
        if need_update(fastafile, faifile):
            sh("samtools faidx {0}".format(fastafile))
        cmd = "samtools view -bt {0} {1} -o {2}".format(
            faifile, samfile, bamfile)
    else:
        cmd = "samtools view -bS {0} -o {1}".format(samfile, bamfile)

    cmd += " -@ {0}".format(cpus)
    if opts.unique:
        cmd += " -q 1"

    # The conversion only runs for SAM input whose BAM is out of date.
    if samfile.endswith(".sam") and need_update(samfile, bamfile):
        sh(cmd)

    # Already sorted?
    if bamfile.endswith(".sorted.bam"):
        sortedbamfile = bamfile
    else:
        prefix = bamfile.replace(".bam", "")
        sortedbamfile = prefix + ".sorted.bam"

    if need_update(bamfile, sortedbamfile):
        cmd = "samtools sort {0} -o {1}".format(bamfile, sortedbamfile)
        cmd += " -@ {0}".format(cpus)
        sh(cmd)

    baifile = sortedbamfile + ".bai"
    if need_update(sortedbamfile, baifile):
        sh("samtools index {0}".format(sortedbamfile))

    return sortedbamfile
def build_star(args):
    """Generate the STAR genome index under ``21_dbs/star``.

    Skipped when the ``SA`` file already exists, unless ``args.overwrite`` is
    set. Exits with status 1 when the GTF annotation is missing.
    ``args.p`` is the thread count passed to ``--runThreadN``.
    """
    dirg, fg = get_genomedir(args.species)
    dirw = op.join(dirg, "21_dbs/star")
    if not op.isdir(dirw):
        mkdir(dirw)
    os.chdir(dirw)

    # Relative to the index directory entered above.
    f_gtf = "../../50_annotation/10.gtf"
    if op.isfile("SA") and not args.overwrite:
        logging.debug("SA already exists - skipped")
    elif not op.isfile(f_gtf):
        logging.error("no gtf file: %s" % f_gtf)
        # Non-zero status so that callers and shells see the failure.
        sys.exit(1)
    else:
        sh("STAR --runThreadN %d --runMode genomeGenerate --genomeDir %s "
           "--genomeFastaFiles %s --sjdbGTFfile %s" %
           (args.p, ".", fg, f_gtf))
def run_vecscreen(infile=None, outfile=None, db="UniVec_Core",
                  pctid=None, hitlen=None):
    """
    BLASTN parameters reference:
    http://www.ncbi.nlm.nih.gov/VecScreen/VecScreen_docs.html
    """
    # `pctid` and `hitlen` are accepted but not used by this function.
    db = get_abs_path(db)
    run_formatdb(infile=db, outfile=db + ".nin")

    command = "".join((
        "blastn",
        " -task blastn",
        " -query {0} -db {1} -out {2}".format(infile, db, outfile),
        " -penalty -5 -gapopen 4 -gapextend 4 -dust yes -soft_masking true",
        " -searchsp 1750000000000 -evalue 0.01 -outfmt 6 -num_threads 8",
    ))
    sh(command)
def build_star(args):
    """Generate the STAR genome index under ``21_dbs/star``.

    Skipped when the ``SA`` file already exists, unless ``args.overwrite`` is
    set. Exits with status 1 when the GTF annotation is missing.
    ``args.p`` is the thread count passed to ``--runThreadN``.
    """
    dirg, fg = get_genomedir(args.species)
    dirw = op.join(dirg, "21_dbs/star")
    if not op.isdir(dirw):
        mkdir(dirw)
    os.chdir(dirw)

    # Relative to the index directory entered above.
    f_gtf = "../../50_annotation/10.gtf"
    if op.isfile("SA") and not args.overwrite:
        logging.debug("SA already exists - skipped")
    elif not op.isfile(f_gtf):
        logging.error("no gtf file: %s" % f_gtf)
        # Non-zero status so that callers and shells see the failure.
        sys.exit(1)
    else:
        sh("STAR --runThreadN %d --runMode genomeGenerate --genomeDir %s "
           "--genomeFastaFiles %s --sjdbGTFfile %s" %
           (args.p, ".", fg, f_gtf))
def bed(args):
    """
    %prog bed bedfile bamfiles

    Convert bam files to bed.
    """
    p = OptionParser(bed.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    # First positional is the output; every BAM is appended to it in turn.
    bedfile, bamfiles = args[0], args[1:]
    for bamfile in bamfiles:
        sh("bamToBed -i {0}".format(bamfile), outfile=bedfile, append=True)
def __init__(self, filename, select=None):
    """Load contig sizes from a ``.sizes`` file or from a FASTA file.

    For a FASTA input a ``<filename>.sizes`` file is created (or refreshed)
    first, with ``faSize`` when available and the Fasta parser otherwise.
    With `select`, only contigs at least that long are kept.
    """
    assert op.exists(filename), "File `{0}` not found".format(filename)

    # filename can be both .sizes file or FASTA formatted file
    sizesname = filename

    if not filename.endswith(".sizes"):
        sizesname = filename + ".sizes"
        filename = get_abs_path(filename)
        if need_update(filename, sizesname):
            cmd = "faSize"
            if which(cmd):
                cmd += " -detailed {0}".format(filename)
                sh(cmd, outfile=sizesname)
            else:
                # NOTE(review): other blocks import from `maize.*`; confirm
                # that `jcvi` is the intended package here.
                from jcvi.formats.fasta import Fasta
                f = Fasta(filename)
                # `with` closes the handle even if the iteration fails.
                with open(sizesname, "w") as fw:
                    for k, size in f.itersizes_ordered():
                        fw.write("\t".join((k, str(size))) + "\n")

        filename = sizesname

    assert filename.endswith(".sizes")

    super(Sizes, self).__init__(filename)
    self.fp = open(filename)
    self.filename = filename

    # get sizes for individual contigs, both in list and dict
    # this is to preserve the input order in the sizes file
    sizes = list(self.iter_sizes())
    if select:
        assert select > 0
        sizes = [x for x in sizes if x[1] >= select]
    self.sizes_mapping = dict(sizes)

    # get cumulative sizes, both in list and dict
    # An empty file (or a `select` that removes everything) yields empty
    # tuples instead of failing on the unpacking of `zip()`.
    ctgs, sizes = zip(*sizes) if sizes else ((), ())
    self.sizes = sizes
    cumsizes = np.cumsum([0] + list(sizes))
    self.ctgs = ctgs
    self.cumsizes = cumsizes
    self.cumsizes_mapping = dict(zip(ctgs, cumsizes))
def sort(args):
    """
    %prog sort <blastfile|coordsfile>

    Sort lines so that same query grouped together with scores descending. The
    sort is 'in-place'.
    """
    # Options are registered on the local parser `p` and read from `opts`.
    p = OptionParser(sort.__doc__)
    p.add_option("--query", default=False, action="store_true",
                 help="Sort by query position [default: %default]")
    p.add_option("--ref", default=False, action="store_true",
                 help="Sort by reference position [default: %default]")
    p.add_option("--refscore", default=False, action="store_true",
                 help="Sort by reference name, then score descending [default: %default]")
    p.add_option("--coords", default=False, action="store_true",
                 help="File is .coords generated by NUCMER [default: %default]")
    p.set_tmpdir()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    blastfile, = args

    key = None
    if opts.coords:
        if opts.query:
            key = "-k13,13 -k3,3n"
        elif opts.ref:
            key = "-k12,12 -k1,1n"
    else:
        if opts.query:
            key = "-k1,1 -k7,7n"
        elif opts.ref:
            key = "-k2,2 -k9,9n"
        elif opts.refscore:
            key = "-k2,2 -k12,12gr"
        else:
            key = "-k1,1 -k12,12gr"

    if key is None:
        # No sort key is defined for .coords files without --query/--ref;
        # report that instead of failing on an unbound variable.
        logging.error("--coords requires either --query or --ref")
        sys.exit(1)

    cmd = "sort"
    if opts.tmpdir:
        cmd += " -T {0}".format(opts.tmpdir)
    # Input and output are the same file: the sort is in place.
    cmd += " {0} {1} -o {1}".format(key, blastfile)
    sh(cmd)
def merge(self, checkexists=False):
    """Concatenate ``self.filelist`` into ``self.outfile``.

    Returns the output name, or None when `checkexists` is set and the
    output is already up to date.
    """
    outfile = self.outfile
    if checkexists and not need_update(self.filelist, outfile):
        logging.debug("File `{0}` exists. Merge skipped.".format(outfile))
        return

    joined = " ".join(self.filelist)
    if self.ingz and self.outgz:
        # can merge gz files directly
        sh("cat {0} > {1}".format(joined, outfile))
    else:
        reader = "zcat" if self.ingz else "cat"
        sh(reader + " " + joined, outfile=outfile)

    return outfile
def get_cookies(name="*****@*****.**", cookies="cookies"):
    """Sign on to JGI and store the session cookies in the file `cookies`.

    Returns the cookie file name. The login is skipped when the file exists
    and ``last_updated(cookies)`` is below 3600.
    """
    from getpass import getpass

    # Check if cookies is still good
    if op.exists(cookies) and last_updated(cookies) < 3600:
        return cookies

    # `raw_input` only exists on Python 2; fall back to `input` on Python 3.
    try:
        prompt = raw_input
    except NameError:
        prompt = input

    username = prompt("Phytozome Login [{0}]: ".format(name))
    if username.strip() == '':
        username = name

    pw = getpass("Phytozome Password: ")
    # NOTE(review): the password is interpolated into a shell command line;
    # consider passing it to curl through a safer channel.
    cmd = "curl https://signon.jgi.doe.gov/signon/create --data-ascii"
    cmd += " login={0}\\&password={1} -b {2} -c {2}".format(username, pw, cookies)
    # Output and logging are suppressed so the credentials are not echoed.
    sh(cmd, outfile="/dev/null", errfile="/dev/null", log=False)

    return cookies
def vcf(args):
    """
    %prog vcf fastafile bamfiles > out.vcf.gz

    Call SNPs on bam files.
    """
    from maize.apps.grid import Jobs

    valid_callers = ("mpileup", "freebayes")
    # Options are registered on the local parser `p` and read from `opts`;
    # after parse_args() `args` is the list of positional arguments.
    p = OptionParser(vcf.__doc__)
    p.set_outfile(outfile="out.vcf.gz")
    p.add_option("--nosort", default=False, action="store_true",
                 help="Do not sort the BAM files")
    p.add_option("--caller", default="mpileup", choices=valid_callers,
                 help="Use variant caller [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    fastafile = args[0]
    bamfiles = args[1:]
    caller = opts.caller

    unsorted = [x for x in bamfiles if ".sorted." not in x]
    if opts.nosort:
        bamfiles = unsorted
    else:
        # Sort/index the unsorted inputs, then refer to every input by its
        # `.sorted.bam` name.
        jargs = [[[x, "--unique"]] for x in unsorted]
        jobs = Jobs(index, args=jargs)
        jobs.run()
        bamfiles = [x.replace(".sorted.bam", ".bam") for x in bamfiles]
        bamfiles = [x.replace(".bam", ".sorted.bam") for x in bamfiles]

    if caller == "mpileup":
        cmd = "samtools mpileup -E -uf"
        cmd += " {0} {1}".format(fastafile, " ".join(bamfiles))
        cmd += " | bcftools call -vmO v"
    elif caller == "freebayes":
        cmd = "freebayes -f"
        cmd += " {0} {1}".format(fastafile, " ".join(bamfiles))
    sh(cmd, outfile=opts.outfile)
def fromdelta(args):
    """
    %prog fromdelta deltafile

    Convert deltafile to coordsfile.
    """
    p = OptionParser(fromdelta.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    deltafile = args[0]
    # Same name as the input, with the last extension swapped for `.coords`.
    stem = deltafile.rsplit(".", 1)[0]
    coordsfile = stem + ".coords"
    sh("show-coords -rclH {0}".format(deltafile), outfile=coordsfile)

    return coordsfile
def update_conda(args):
    """Update conda environments and export each one to ``$snk/envs``.

    ``args.opt == 2`` selects the short list of environments; any other value
    selects the full list.
    """
    all_envs = ["base", "snk", "work", "blast", "hisat2", "bismark", "alfred",
                "egglib", "multiqc", "primer3", "python2", "wasp", "test"]
    core_envs = ["base", "snk", "work"]
    envs = core_envs if args.opt == 2 else all_envs

    print("will update %d environments: %s" % (len(envs), ' '.join(envs)))
    for env in envs:
        print("updating %s" % env)
        # "y" is fed on stdin to answer conda's prompt.
        proc = Popen(["conda update -n %s --all" % env], stdin=PIPE, shell=True)
        outs, errs = proc.communicate(input=b'y\n')
        proc.terminate()
        sh("conda env export -n %s --no-builds > $snk/envs/%s.yml" % (env, env))
def noclip(args):
    """
    %prog noclip bamfile

    Remove clipped reads from BAM.
    """
    p = OptionParser(noclip.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bamfile = args[0]
    noclipbam = bamfile.replace(".bam", ".noclip.bam")

    # Keep only lines whose 6th column contains neither H nor S.
    pipeline = [
        "samtools view -h {} | awk -F '\t' '($6 !~ /H|S/)'".format(bamfile),
        " | samtools view -@ 4 -b -o {}".format(noclipbam),
    ]
    sh("".join(pipeline))

    sh("samtools index {}".format(noclipbam))
def frompsl(args):
    """
    %prog frompsl old.new.psl old.fasta new.fasta

    Generate chain file from psl file. The pipeline is describe in:
    <http://genomewiki.ucsc.edu/index.php/Minimal_Steps_For_LiftOver>
    """
    from maize.formats.sizes import Sizes

    p = OptionParser(frompsl.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    pslfile, oldfasta, newfasta = args
    pf = oldfasta.split(".")[0]

    # Every output name is built from the prefix of the old FASTA.
    chainfile = pf + ".chain"
    sortedchain = chainfile.rsplit(".", 1)[0] + ".sorted.chain"
    netfile = pf + ".net"
    liftoverfile = pf + ".liftover.chain"

    # Chain together alignments from using axtChain
    oldtwobit, newtwobit = [faToTwoBit(x) for x in (oldfasta, newfasta)]
    if need_update(pslfile, chainfile):
        axtchain = "axtChain -linearGap=medium -psl {0}".format(pslfile)
        axtchain += " {0} {1} {2}".format(oldtwobit, newtwobit, chainfile)
        sh(axtchain)

    # Sort chain files
    if need_update(chainfile, sortedchain):
        sh("chainSort {0} {1}".format(chainfile, sortedchain))

    # Make alignment nets from chains
    oldsizes = Sizes(oldfasta).filename
    newsizes = Sizes(newfasta).filename
    if need_update((sortedchain, oldsizes, newsizes), netfile):
        chainnet = "chainNet {0} {1} {2}".format(sortedchain, oldsizes, newsizes)
        chainnet += " {0} /dev/null".format(netfile)
        sh(chainnet)

    # Create liftOver chain file
    if need_update((netfile, sortedchain), liftoverfile):
        sh("netChainSubset {0} {1} {2}".format(
            netfile, sortedchain, liftoverfile))
def fromsra(args):
    """
    %prog fromsra srafile

    Convert sra file to fastq using the sratoolkit `fastq-dump`
    """
    # Options are registered on the local parser `p` and read from `opts`;
    # after parse_args() `args` is the list of positional arguments.
    p = OptionParser(fromsra.__doc__)
    p.add_option("--paired", default=False, action="store_true",
                 help="Specify if library layout is paired-end "
                      "[default: %default]")
    p.add_option("--compress", default=None, choices=["gzip", "bzip2"],
                 help="Compress output fastq files [default: %default]")
    p.set_outdir()
    p.set_grid()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    srafile, = args
    paired = opts.paired
    compress = opts.compress
    outdir = opts.outdir

    script_path = which("fastq-dump")
    if not script_path:
        logging.error("Cannot find `fastq-dump` in the PATH")
        # Non-zero status so that callers and shells see the failure.
        sys.exit(1)

    cmd = [script_path]
    if compress:
        cmd.append("--{0}".format(compress))
    if paired:
        cmd.append("--split-files")
    if outdir:
        cmd.append("--outdir {0}".format(outdir))
    cmd.append(srafile)

    outcmd = " ".join(cmd)
    sh(outcmd, grid=opts.grid)
def convert(args):
    """
    %prog convert in.fastq

    illumina fastq quality encoding uses offset 64, and sanger uses 33. This
    script creates a new file with the correct encoding. Output gzipped file if
    input is also gzipped.
    """
    p = OptionParser(convert.__doc__)
    p.set_phred()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    infastq, = args
    # The offset is a parsed option (`args` is the positional list here);
    # it is guessed from the file when not given.
    phred = opts.phred or str(guessoffset([infastq]))
    ophred = {"64": "33", "33": "64"}[phred]

    gz = infastq.endswith(".gz")
    outfastq = infastq.rsplit(".", 1)[0] if gz else infastq
    pf, sf = outfastq.rsplit(".", 1)
    outfastq = "{0}.q{1}.{2}".format(pf, ophred, sf)
    if gz:
        outfastq += ".gz"

    fin = "illumina" if phred == "64" else "sanger"
    fout = "sanger" if phred == "64" else "illumina"

    seqret = "seqret"
    if gz:
        # Gzipped input is decompressed and piped in through stdin.
        cmd = "zcat {0} | ".format(infastq)
        cmd += seqret + " fastq-{0}::stdin fastq-{1}::stdout".format(fin, fout)
    else:
        cmd = seqret + " fastq-{0}::{1} fastq-{2}::stdout".format(
            fin, infastq, fout)

    sh(cmd, outfile=outfastq)

    return outfastq
def count(args):
    """
    %prog count bamfile gtf

    Count the number of reads mapped using `htseq-count`.
    """
    # Options are registered on the local parser `p` and read from `opts`;
    # after parse_args() `args` is the list of positional arguments.
    p = OptionParser(count.__doc__)
    p.add_option("--type", default="exon",
                 help="Only count feature type")
    p.set_cpus(cpus=8)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, gtf = args
    cpus = opts.cpus
    pf = bamfile.split(".")[0]
    countfile = pf + ".count"
    if not need_update(bamfile, countfile):
        return

    # htseq-count is fed a name-sorted SAM produced from the BAM.
    nsorted = pf + "_nsorted"
    nsortedbam, nsortedsam = nsorted + ".bam", nsorted + ".sam"
    if need_update(bamfile, nsortedsam):
        cmd = "samtools sort -@ {0} -n {1} {2}".format(cpus, bamfile, nsorted)
        sh(cmd)
        cmd = "samtools view -@ {0} -h {1}".format(cpus, nsortedbam)
        sh(cmd, outfile=nsortedsam)

    if need_update(nsortedsam, countfile):
        cmd = "htseq-count --stranded=no --minaqual=10"
        cmd += " -t {0}".format(opts.type)
        cmd += " {0} {1}".format(nsortedsam, gtf)
        sh(cmd, outfile=countfile)
def pairs(args):
    """
    See __doc__ for OptionParser.set_pairs().
    """
    import maize.formats.bed

    p = OptionParser(pairs.__doc__)
    p.set_pairs()

    # Parse into `targs` so that the original token list stays available.
    opts, targs = p.parse_args(args)

    if len(targs) != 1:
        sys.exit(not p.print_help())

    samfile = targs[0]
    bedfile = samfile.rsplit(".", 1)[0] + ".bed"
    if need_update(samfile, bedfile):
        sh("bamToBed -i {0}".format(samfile), outfile=bedfile)

    # Swap the input for its BED version in place, then delegate.
    position = args.index(samfile)
    args[position] = bedfile

    return maize.formats.bed.pairs(args)
def last(args, dbtype=None):
    """
    %prog database.fasta query.fasta

    Run LAST by calling LASTAL. LAST program available:
    <http://last.cbrc.jp>

    Works with LAST-719.
    """
    # `args` is a namespace (query, db, path, thread, mask, format, minlen,
    # minid, params). Only `lastal` is run here, so `dbtype` is accepted for
    # interface compatibility but has no effect.
    query, db = args.query, args.db
    path = args.path
    nthread = args.thread

    # Use the binary from `path` when one is given, else rely on PATH.
    lastal_bin = op.join(path, "lastal") if path else "lastal"

    u = 2 if args.mask else 0
    cmd = "{0} -u {1}".format(lastal_bin, u)
    cmd += " -P {0} -i3G".format(nthread)
    cmd += " -f {0}".format(args.format)
    cmd += " {0} {1}".format(db, query)

    minlen = args.minlen
    minid = args.minid
    extra = args.params or ""  # tolerate a missing/None params value
    assert minid != 100, "Perfect match not yet supported"

    if minlen:
        extra += " -e{0}".format(minlen)
    if minid:
        # Floor division keeps the value an integer on Python 3 as well
        # (`/` would give e.g. `49.0` for minid=98).
        mm = minid // (100 - minid)
        extra += " -r1 -q{0} -a{0} -b{0}".format(mm)
    if extra:
        cmd += " " + extra.strip()

    sh(cmd)
def coverage(args):
    """
    %prog coverage fastafile bamfile

    Calculate coverage for BAM file. BAM file will be sorted unless with
    --nosort.
    """
    # Options are registered on the local parser `p` and read from `opts`;
    # after parse_args() `args` is the list of positional arguments.
    p = OptionParser(coverage.__doc__)
    p.add_option("--format", default="bigwig",
                 choices=("bedgraph", "bigwig", "coverage"),
                 help="Output format")
    p.add_option("--nosort", default=False, action="store_true",
                 help="Do not sort BAM")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, bamfile = args
    fmt = opts.format  # local name chosen so the builtin `format` stays usable
    if opts.nosort:
        logging.debug("BAM sorting skipped")
    else:
        bamfile = index([bamfile, "--fasta={0}".format(fastafile)])

    pf = bamfile.rsplit(".", 2)[0]
    sizesfile = Sizes(fastafile).filename
    cmd = "genomeCoverageBed -ibam {0} -g {1}".format(bamfile, sizesfile)
    if fmt in ("bedgraph", "bigwig"):
        cmd += " -bg"
        bedgraphfile = pf + ".bedgraph"
        sh(cmd, outfile=bedgraphfile)

        if fmt == "bedgraph":
            return bedgraphfile

        bigwigfile = pf + ".bigwig"
        cmd = "bedGraphToBigWig {0} {1} {2}".format(
            bedgraphfile, sizesfile, bigwigfile)
        sh(cmd)
        return bigwigfile

    coveragefile = pf + ".coverage"
    if need_update(fastafile, coveragefile):
        sh(cmd, outfile=coveragefile)

    gcf = GenomeCoverageFile(coveragefile)
    fw = must_open(opts.outfile, "w")
    try:
        for seqid, cov in gcf.iter_coverage_seqid():
            # `write` works on Python 2 and 3 (unlike `print >> fw`).
            fw.write("\t".join((seqid, "{0:.1f}".format(cov))) + "\n")
    finally:
        fw.close()