def blat(args):
    """
    %prog blat old.fasta new.fasta

    Generate psl file using blat.
    """
    # NOTE: the docstring above is also the usage text handed to OptionParser.
    parser = OptionParser(blat.__doc__)
    parser.add_option(
        "--minscore", default=100, type="int",
        help="Matches minus mismatches gap penalty [default: %default]")
    parser.add_option(
        "--minid", default=98, type="int",
        help="Minimum sequence identity [default: %default]")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    # Exactly two positional arguments are required: old and new FASTA.
    if len(args) != 2:
        sys.exit(not parser.print_help())

    oldfasta, newfasta = args

    # Both inputs go through faToTwoBit, in argument order; only the result
    # for the old FASTA is used on the command line below.
    oldtwobit, _newtwobit = [faToTwoBit(fastafile) for fastafile in args]

    # Use the threaded pblat when `which` finds it, plain blat otherwise.
    if which("pblat"):
        program = "pblat -threads={0}".format(opts.cpus)
    else:
        program = "blat"

    # Output name is <new>.<old>.psl, each part being the basename up to its
    # first dot.
    new_tag, old_tag = [op.basename(x).split('.')[0]
                        for x in (newfasta, oldfasta)]
    pslfile = "{0}.{1}.psl".format(new_tag, old_tag)

    cmd = program + " {0} {1}".format(oldtwobit, newfasta)
    cmd += " -tileSize=12 -minScore={0} -minIdentity={1} ".format(
        opts.minscore, opts.minid)
    cmd += pslfile
    sh(cmd)
def check_cfg_fqtrim(c, njob=1, noutdir=3):
    """Validate and normalize the configuration object `c` for fqtrim.

    `c` is read and updated through both attribute access (`c.outdir`) and
    item access (`c[key]`).

    Steps, in order:
      * split `c.outdir` on commas into `c.outdirs` and require `noutdir`
        entries;
      * call `mkdir` for every working/temp/output directory that is not
        already a directory;
      * require `c.ilist`, `c.adapter` and `c.trimmomatic` to be files;
      * replace `c['fastqc']` and `c['parallel']` with the result of
        `which`, requiring that result to be non-None;
      * convert `c.paired` with `str2bool`;
      * split the comma-separated PBS settings and require `njob` entries
        in each list.

    Returns the same object `c`, with `c.njob` set.
    Raises AssertionError when any of the checks fails.
    """
    c.outdirs = c.outdir.split(",")
    assert len(c.outdirs) == noutdir, "not %s outdirs: %s" % (noutdir, c.outdir)

    # Directories are handled one at a time so that each check sees the
    # effect of the previous mkdir call.
    for subdir in [c.dirw, c.temp_dir] + c.outdirs:
        if op.isdir(subdir):
            continue
        mkdir(subdir)

    for fn in (c.ilist, c.adapter, c.trimmomatic):
        assert op.isfile(fn), "cannot read %s" % fn

    for key in ('fastqc', 'parallel'):
        resolved = which(c[key])
        assert resolved is not None, "not executable: %s" % c[key]
        c[key] = resolved

    c.paired = str2bool(c.paired)

    c.pbs_walltimes = c.pbs_walltime.split(",")
    c.pbs_ppns = c.pbs_ppn.split(",")
    c.pbs_queues = c.pbs_queue.split(",")
    per_job_settings = (c.pbs_queues, c.pbs_walltimes, c.pbs_ppns)
    assert all(len(values) == njob for values in per_job_settings), \
        "not %d jobs: %s" % (njob, c.pbs_queue)
    c.njob = njob

    return c
def check_cfg_mapping(c, noutdir=2, njob=3):
    """Validate and normalize the configuration object `c` for mapping.

    `c` is read and updated through both attribute access (`c.outdir`) and
    item access (`c[key]`).

    Args:
        c: configuration object; modified in place.
        noutdir: required number of comma-separated entries in `c.outdir`
            (defaults to the previously hard-coded 2).
        njob: required number of comma-separated entries in each PBS
            setting (defaults to the previously hard-coded 3).

    Steps, in order:
      * split `c.outdir` into `c.outdirs` and check its length;
      * call `mkdir` for every working/temp/output directory that is not
        already a directory;
      * require `c.ilist`, `c.genome` and `c.gff` to be files;
      * replace each helper tool entry with the result of `which`,
        requiring that result to be non-None;
      * convert `c.paired` with `str2bool`;
      * resolve the executable of the selected mapper the same way;
      * split the PBS settings and check their lengths.

    Returns:
        The same object `c`, with `c.njob` set.

    Raises:
        AssertionError: when any of the checks fails.
        SystemExit: with status 1 when `c.mapper` is not one of
            'bwa', 'hisat2' or 'bowtie2'.
    """
    c.outdirs = c.outdir.split(",")
    assert len(c.outdirs) == noutdir, "not %s outdirs: %s" % (noutdir, c.outdir)

    for subdir in [c.dirw, c.temp_dir] + c.outdirs:
        if not op.isdir(subdir):
            mkdir(subdir)

    for fn in [c.ilist, c.genome, c.gff]:
        assert op.isfile(fn), "cannot read %s" % fn

    for key in 'samtools parallel sambamba bcftools bedtools'.split():
        fp = which(c[key])
        assert fp is not None, "not executable: %s" % c[key]
        c[key] = fp

    c.paired = str2bool(c.paired)

    if c.mapper not in ('bwa', 'hisat2', 'bowtie2'):
        logging.error("unsupported mapper: %s" % c.mapper)
        sys.exit(1)
    # The mapper's executable is configured under the attribute named after
    # the mapper. Keep the configured value until the check has passed so the
    # error message names it (it used to print "None").
    mapper_cmd = getattr(c, c.mapper)
    mapper_path = which(mapper_cmd)
    assert mapper_path is not None, "not executable: %s" % mapper_cmd
    setattr(c, c.mapper, mapper_path)

    c.pbs_walltimes = c.pbs_walltime.split(",")
    c.pbs_ppns = c.pbs_ppn.split(",")
    c.pbs_queues = c.pbs_queue.split(",")
    assert njob == len(c.pbs_queues) == len(c.pbs_walltimes) == len(c.pbs_ppns), \
        "not %d jobs: %s" % (njob, c.pbs_queue)
    c.njob = njob

    return c
def run_blat(infile=None, outfile=None, db="UniVec_Core", pctid=95, hitlen=50,
             cpus=16, overwrite=True):
    """Run blat (or pblat) of `infile` against `db`, then filter the hits.

    The command writes blast8-formatted output to `outfile`. The result is
    then passed to `run_blast_filter` with `pctid` and `hitlen`, producing
    `<outfile>.P<pctid>L<hitlen>`. When `overwrite` is true, the filtered
    file is moved over `outfile`.

    `cpus` is only used when `which` finds `pblat`.
    """
    if which("pblat"):
        program = "pblat -threads={0}".format(cpus)
    else:
        program = "blat"
    sh(program + ' {0} {1} -out=blast8 {2}'.format(db, infile, outfile))

    filtered = outfile + ".P{0}L{1}".format(pctid, hitlen)
    run_blast_filter(infile=outfile, outfile=filtered,
                     pctid=pctid, hitlen=hitlen)

    if not overwrite:
        return
    # Replace the raw output with the filtered one.
    shutil.move(filtered, outfile)
def check_cfg_mapping(c, noutdir=2, njob=3):
    """Validate and normalize the configuration object `c` for mapping.

    `c` is read and updated through both attribute access (`c.outdir`) and
    item access (`c[key]`).

    Args:
        c: configuration object; modified in place.
        noutdir: required number of comma-separated entries in `c.outdir`
            (defaults to the previously hard-coded 2).
        njob: required number of comma-separated entries in each PBS
            setting (defaults to the previously hard-coded 3).

    Steps, in order:
      * split `c.outdir` into `c.outdirs` and check its length;
      * call `mkdir` for every working/temp/output directory that is not
        already a directory;
      * require `c.ilist`, `c.genome` and `c.gff` to be files;
      * replace each helper tool entry with the result of `which`,
        requiring that result to be non-None;
      * convert `c.paired` with `str2bool`;
      * resolve the executable of the selected mapper the same way;
      * split the PBS settings and check their lengths.

    Returns:
        The same object `c`, with `c.njob` set.

    Raises:
        AssertionError: when any of the checks fails.
        SystemExit: with status 1 when `c.mapper` is not one of
            'bwa', 'hisat2' or 'bowtie2'.
    """
    c.outdirs = c.outdir.split(",")
    assert len(c.outdirs) == noutdir, "not %s outdirs: %s" % (noutdir, c.outdir)

    for subdir in [c.dirw, c.temp_dir] + c.outdirs:
        if not op.isdir(subdir):
            mkdir(subdir)

    for fn in [c.ilist, c.genome, c.gff]:
        assert op.isfile(fn), "cannot read %s" % fn

    for key in 'samtools parallel sambamba bcftools bedtools'.split():
        fp = which(c[key])
        assert fp is not None, "not executable: %s" % c[key]
        c[key] = fp

    c.paired = str2bool(c.paired)

    if c.mapper not in ('bwa', 'hisat2', 'bowtie2'):
        logging.error("unsupported mapper: %s" % c.mapper)
        sys.exit(1)
    # The mapper's executable is configured under the attribute named after
    # the mapper. Keep the configured value until the check has passed so the
    # error message names it (it used to print "None").
    mapper_cmd = getattr(c, c.mapper)
    mapper_path = which(mapper_cmd)
    assert mapper_path is not None, "not executable: %s" % mapper_cmd
    setattr(c, c.mapper, mapper_path)

    c.pbs_walltimes = c.pbs_walltime.split(",")
    c.pbs_ppns = c.pbs_ppn.split(",")
    c.pbs_queues = c.pbs_queue.split(",")
    assert njob == len(c.pbs_queues) == len(c.pbs_walltimes) == len(c.pbs_ppns), \
        "not %d jobs: %s" % (njob, c.pbs_queue)
    c.njob = njob

    return c
def __init__(self, filename, select=None):
    """Load contig sizes from a `.sizes` file or from a FASTA file.

    Args:
        filename: path to an existing file. When it does not end with
            `.sizes`, a sibling `<filename>.sizes` file is (re)generated
            if `need_update` says so, using `faSize -detailed` when
            `which` finds it and the `Fasta` reader otherwise.
        select: optional positive threshold; when given, only entries
            whose size is >= `select` are kept.

    Sets `fp`, `filename`, `sizes_mapping`, `ctgs`, `sizes`, `cumsizes`
    and `cumsizes_mapping` on the instance.

    Raises:
        AssertionError: when `filename` does not exist, or when `select`
            is truthy but not positive.
    """
    assert op.exists(filename), "File `{0}` not found".format(filename)

    # filename can be both .sizes file or FASTA formatted file
    sizesname = filename

    if not filename.endswith(".sizes"):
        sizesname = filename + ".sizes"
        filename = get_abs_path(filename)
        if need_update(filename, sizesname):
            cmd = "faSize"
            if which(cmd):
                cmd += " -detailed {0}".format(filename)
                sh(cmd, outfile=sizesname)
            else:
                from jcvi.formats.fasta import Fasta

                f = Fasta(filename)
                # Context manager so the handle is closed even if reading
                # the FASTA fails part-way through.
                with open(sizesname, "w") as fw:
                    for k, size in f.itersizes_ordered():
                        fw.write("\t".join((k, str(size))) + "\n")

        filename = sizesname

    assert filename.endswith(".sizes")

    super(Sizes, self).__init__(filename)
    # Kept open on purpose: the handle is stored on the instance.
    self.fp = open(filename)
    self.filename = filename

    # get sizes for individual contigs, both in list and dict
    # this is to preserve the input order in the sizes file
    sizes = list(self.iter_sizes())
    if select:
        assert select > 0
        sizes = [x for x in sizes if x[1] >= select]
    self.sizes_mapping = dict(sizes)

    # get cumulative sizes, both in list and dict
    if sizes:
        ctgs, sizes = zip(*sizes)
    else:
        # zip(*[]) yields nothing to unpack; an empty file (or a `select`
        # that filters everything out) gives empty tuples instead of a
        # ValueError.
        ctgs, sizes = (), ()
    self.sizes = sizes
    cumsizes = np.cumsum([0] + list(sizes))
    self.ctgs = ctgs
    self.cumsizes = cumsizes
    self.cumsizes_mapping = dict(zip(ctgs, cumsizes))
def fromsra(args):
    """
    %prog fromsra srafile

    Convert sra file to fastq using the sratoolkit `fastq-dump`
    """
    # NOTE: the docstring above is also the usage text handed to OptionParser.
    p = OptionParser(fromsra.__doc__)
    # Options are registered on `p`; the previous code used an undefined
    # `sp1` object, which raised NameError on every call.
    p.add_option("--paired", default=False, action="store_true",
                 help="Specify if library layout is paired-end " + \
                      "[default: %default]")
    p.add_option("--compress", default=None, choices=["gzip", "bzip2"],
                 help="Compress output fastq files [default: %default]")
    p.set_outdir()
    p.set_grid()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    srafile, = args
    # Parsed option values live on `opts`; `args` is the positional list.
    paired = opts.paired
    compress = opts.compress
    outdir = opts.outdir

    script_path = which("fastq-dump")
    if not script_path:
        logging.error("Cannot find `fastq-dump` in the PATH")
        # Non-zero status so callers can detect the failure.
        sys.exit(1)

    cmd = [script_path]
    if compress:
        cmd.append("--{0}".format(compress))
    if paired:
        cmd.append("--split-files")
    if outdir:
        cmd.append("--outdir {0}".format(outdir))
    cmd.append(srafile)

    outcmd = " ".join(cmd)
    sh(outcmd, grid=opts.grid)
def check_cfg_index(c, noutdir=1, njob=3):
    """Validate and normalize the configuration object `c` for indexing.

    `c` is read and updated through both attribute access (`c.outdir`) and
    item access (`c[key]`).

    Steps, in order:
      * split `c.outdir` on commas into `c.outdirs` and require `noutdir`
        entries;
      * call `mkdir` for every working/temp/output directory that is not
        already a directory;
      * require `c.ilist`, `c.vcf`, `c.genome` and `c.gff` to be files;
      * replace `c['bcftools']` with the result of `which`, requiring that
        result to be non-None;
      * split the comma-separated PBS settings (walltime, ppn, queue, mem)
        and require `njob` entries in each list.

    Returns the same object `c`, with `c.njob` set.
    Raises AssertionError when any of the checks fails.
    """
    c.outdirs = c.outdir.split(",")
    assert len(c.outdirs) == noutdir, "not %s outdirs: %s" % (noutdir, c.outdir)

    # Directories are handled one at a time so that each check sees the
    # effect of the previous mkdir call.
    for subdir in [c.dirw, c.temp_dir] + c.outdirs:
        if op.isdir(subdir):
            continue
        mkdir(subdir)

    for fn in (c.ilist, c.vcf, c.genome, c.gff):
        assert op.isfile(fn), "cannot read %s" % fn

    for key in ['bcftools']:
        resolved = which(c[key])
        assert resolved is not None, "not executable: %s" % c[key]
        c[key] = resolved

    c.pbs_walltimes = c.pbs_walltime.split(",")
    c.pbs_ppns = c.pbs_ppn.split(",")
    c.pbs_queues = c.pbs_queue.split(",")
    c.pbs_mems = c.pbs_mem.split(",")
    per_job_settings = (c.pbs_queues, c.pbs_walltimes, c.pbs_ppns, c.pbs_mems)
    assert all(len(values) == njob for values in per_job_settings), \
        "not %d jobs: %s" % (njob, c.pbs_queue)
    c.njob = njob

    return c
def check_cfg_mapping(c, noutdir=4, njob=2):
    """Validate and normalize the configuration object `c` for mapping.

    `c` is read and updated through both attribute access (`c.outdir`) and
    item access (`c[key]`).

    Args:
        c: configuration object; modified in place.
        noutdir: required number of comma-separated entries in `c.outdir`.
        njob: required number of comma-separated entries in each PBS
            setting.

    Steps, in order:
      * split `c.outdir` into `c.outdirs` and check its length;
      * call `mkdir` for every working/temp/output directory that is not
        already a directory;
      * require `c.ilist`, `c.vcf` and `c.gene_bed` to be files;
      * replace each helper tool entry with the result of `which`,
        requiring that result to be non-None;
      * convert `c.paired` with `str2bool` and check `c.stranded`;
      * resolve the executable of the selected mapper the same way;
      * read `c.ilist` as a tab-separated table into `c.t`, adding a
        'genome' column filled with 'B73c' when none is present;
      * for every genome named in that column, check the mapper index and
        the gff under `c.genomedir` and record them in `c.genomes`;
      * split the PBS settings and check their lengths.

    Returns:
        The same object `c`, with `c.t`, `c.genomes` and `c.njob` set.

    Raises:
        AssertionError: when any of the checks fails.
        SystemExit: with status 1 when `c.mapper` is not one of
            'tophat2', 'hisat2' or 'star'.
    """
    # mapper -> (index prefix template under the genome directory,
    #            template of the file whose presence proves the index exists)
    index_layout = {
        'tophat2': ("%s/21.bowtie2/db", "%s.4.bt2"),
        'hisat2': ("%s/21.hisat2/db", "%s.8.ht2"),
        'star': ("%s/21.star", "%s/SA"),
    }

    c.outdirs = c.outdir.split(",")
    assert len(c.outdirs) == noutdir, "not %s outdirs: %s" % (noutdir, c.outdir)

    for subdir in [c.dirw, c.temp_dir] + c.outdirs:
        if not op.isdir(subdir):
            mkdir(subdir)

    for fn in [c.ilist, c.vcf, c.gene_bed]:
        assert op.isfile(fn), "cannot read %s" % fn

    for key in 'samtools parallel sambamba htseq bcftools bedtools'.split():
        fp = which(c[key])
        assert fp is not None, "not executable: %s" % c[key]
        c[key] = fp

    c.paired = str2bool(c.paired)
    assert c.stranded in ['yes', 'no', 'reverse'], \
        "unknown stranded option: %s" % c.stranded

    if c.mapper not in index_layout:
        logging.error("unsupported mapper: %s" % c.mapper)
        sys.exit(1)
    # The mapper's executable is configured under the attribute named after
    # the mapper. Keep the configured value until the check has passed so the
    # error message names it (it used to print "None").
    mapper_cmd = getattr(c, c.mapper)
    mapper_path = which(mapper_cmd)
    assert mapper_path is not None, "not executable: %s" % mapper_cmd
    setattr(c, c.mapper, mapper_path)

    assert op.isdir(c.genomedir), "cannot access %s" % c.genomedir

    t = Table.read(c.ilist, format='ascii.tab')
    # Test the column names rather than the first row: `'genome' in t[0]`
    # needs at least one row and (assuming an astropy Table -- confirm)
    # compares against that row's values, not its column names.
    if 'genome' not in t.colnames:
        genomeb = 'B73c'
        logging.debug("no 'genome' column detected: use %s" % genomeb)
        t.add_column(Column([genomeb] * len(t)), name='genome')
    c.t = t

    # Each cell may list several genomes separated by commas.
    genomes = set()
    for entry in t['genome']:
        genomes.update(entry.split(","))
    genomes = sorted(genomes)
    logging.debug("checking %d genomes" % len(genomes))

    dbpre_fmt, index_fmt = index_layout[c.mapper]
    c.genomes = dict()
    for gt in genomes:
        dirg = "%s/%s" % (c.genomedir, gt)

        dbpre = dbpre_fmt % dirg
        assert op.isfile(index_fmt % dbpre), \
            "no %s db-index: %s" % (c.mapper, dbpre)

        gff = "%s/51.gff" % dirg
        # Genome name first, then the path (the arguments used to be swapped).
        assert op.isfile(gff), "no gff for %s: %s" % (gt, gff)

        c.genomes[gt] = {'db': dbpre, 'gff': gff}

    c.pbs_walltimes = c.pbs_walltime.split(",")
    c.pbs_ppns = c.pbs_ppn.split(",")
    c.pbs_queues = c.pbs_queue.split(",")
    assert njob == len(c.pbs_queues) == len(c.pbs_walltimes) == len(c.pbs_ppns), \
        "not %d jobs: %s" % (njob, c.pbs_queue)
    c.njob = njob

    return c