def clean_fasta(args):
    dirw = make_genomedir(args.species)
    os.chdir(dirw)
    for fname in ["raw.fix.fas.index", "11_genome.fas.index"]:
        if op.isfile(fname):
            os.remove(fname)
    if op.islink("10_genome.fna"):
        os.unlink("10_genome.fna")
    if op.isfile("10_genome.fna") and not args.overwrite:
        logging.debug("10_genome.fna already exists - skipped")
    elif op.isfile("08_seq_map/renamed.fna"):
        sh("ln -sf 08_seq_map/renamed.fna 10_genome.fna")
        if op.isfile("08_seq_map/renamed.sizes"):
            sh("ln -sf 08_seq_map/renamed.sizes 10_genome.sizes")
    else:
        logging.error("08_seq_map/renamed.fna not there")
        sys.exit(1)
    if not op.isdir("15_intervals"):
        mkdir("15_intervals")
    if op.isfile("15_intervals/01.chrom.bed") and not args.overwrite:
        logging.debug("01.chrom.bed already exists - skipped")
    else:
        sh("fasta size --bed 10_genome.fna > 15_intervals/01.chrom.bed")
    if op.isfile("15_intervals/01.chrom.sizes") and not args.overwrite:
        logging.debug("01.chrom.sizes already exists - skipped")
    else:
        sh("faSize -detailed 10_genome.fna > 15_intervals/01.chrom.sizes")
    if op.isfile("15_intervals/11.gap.bed") and not args.overwrite:
        logging.debug("11.gap.bed already exists - skipped")
    else:
        sh("fasta gaps 10_genome.fna > 15_intervals/11.gap.bed")
def check_cfg_fqtrim(c, njob=1, noutdir=3):
    c.outdirs = c.outdir.split(",")
    assert len(c.outdirs) == noutdir, "not %s outdirs: %s" % (noutdir, c.outdir)
    for subdir in [c.dirw, c.temp_dir] + c.outdirs:
        if not op.isdir(subdir):
            mkdir(subdir)
    for fn in [c.ilist, c.adapter, c.trimmomatic]:
        assert op.isfile(fn), "cannot read %s" % fn
    for key in ['fastqc', 'parallel']:
        fp = which(c[key])
        assert fp is not None, "not executable: %s" % c[key]
        c[key] = fp
    c.paired = str2bool(c.paired)
    c.pbs_walltimes = c.pbs_walltime.split(",")
    c.pbs_ppns = c.pbs_ppn.split(",")
    c.pbs_queues = c.pbs_queue.split(",")
    assert njob == len(c.pbs_queues) == len(c.pbs_walltimes) == len(c.pbs_ppns), \
        "not %d jobs: %s" % (njob, c.pbs_queue)
    c.njob = njob
    return c
def sra(args):
    """
    %prog sra [term|term.ids]

    Given an SRA run ID, fetch the corresponding .sra file from the sra-instant
    FTP. The term can also be a file containing a list of SRR ids, one per line.

    Once downloaded, the SRA file is processed through `fastq-dump` to produce
    FASTQ formatted sequence files, which are gzipped by default.
    """
    p = OptionParser(sra.__doc__)
    p.add_argument("--nogzip", dest="nogzip",
                   default=False, action="store_true",
                   help="Do not gzip the FASTQ generated by fastq-dump")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    term, = args
    if op.isfile(term):
        terms = [x.strip() for x in open(term)]
    else:
        terms = [term]

    for term in terms:
        srafile = download_srr_term(term)
        pf = srafile.split(".")[0]
        mkdir(pf)
        _opts = [srafile, "--paired", "--outdir={0}".format(pf)]
        if not opts.nogzip:
            _opts.append("--compress=gzip")
        fromsra(_opts)
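# Usage sketch (hypothetical accessions): pass either a single run id or a
# file listing one SRR id per line; FASTQ output lands under a per-run
# directory and is gzipped unless --nogzip is given.
#
#   sra(["SRR1234567"])
#   sra(["srr_ids.txt", "--nogzip"])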
def index(cfg, args):
    c = AttrDict(cfg['index'])
    c = check_cfg_index(c)
    if args.check:
        return 0
    os.chdir(c.dirw)

    jcmds = [
        ["cd %s" % c.dirw],
        ["cd %s" % c.dirw],
        ["cd %s" % c.dirw],
    ]
    bcfgs = [
        [dict(opt='bash')],
        [dict(opt='parallel', thread=c.pbs_ppns[1])],
        [dict(opt='bash')],
    ]
    assert c.njob == len(bcfgs) == len(jcmds), "not %d jobs" % c.njob

    jobs = []
    for i in range(c.njob):
        prefix = "%s.%d" % (c.job_prefix, i + 1)
        jcfg = {
            'queue': c.pbs_queues[i],
            'ppn': c.pbs_ppns[i],
            'walltime': c.pbs_walltimes[i],
            'mem': c.pbs_mems[i],
            'email': c.pbs_email,
        }
        job = PbsJob.from_cfg(jcfg=jcfg, jcmds=jcmds[i], bcfgs=bcfgs[i],
                              prefix=prefix, njob=len(bcfgs[i]),
                              bash=c.bash, parallel=c.parallel)
        jobs.append(job)

    t = Table.read(c.ilist, format='ascii.tab')
    nrow = len(t)
    gts = [t['genotype'][x] for x in range(nrow) if t['type'][x] == 'Inbred']
    gts = set(gts)
    logging.debug("creating pseudo-refs for %d genomes" % len(gts))
    print(" ".join(gts))
    for gt in gts:
        diro = "%s/%s" % (c.outdirs[0], gt)
        mkdir(diro)
        jobs[0].subjobs[0].add_cmd("%s consensus -f %s %s -s %s -c %s/25.chain -o %s/11_genome.fas" %
                                   (c.bcftools, c.genome, c.vcf, gt, diro, diro))
        #jobs[1].subjobs[0].add_cmd("genome fasta %s" % diro)
        #jobs[0].subjobs[0].add_cmd("genome blat %s" % diro)
        #jobs[0].subjobs[0].add_cmd("genome bwa %s" % diro)
        jobs[1].subjobs[0].add_cmd("genome bowtie %s" % diro)
        jobs[2].subjobs[0].add_cmd("genome hisat %s" % diro)

    for job in jobs:
        job.write()
    fj = "%s.sh" % c.job_prefix
    create_job_chain([job.fname for job in jobs], fj)
    logging.debug("job chain with %s jobs was created: %s" % (c.njob, fj))
def make_genomedir(species):
    dirw = species
    if species.isalnum():
        dirw = op.join("/home/springer/zhoux379/data/genome", species)
        logging.debug("converting species to directory: %s" % dirw)
    if not op.isdir(dirw):
        logging.debug("creating directory: %s" % dirw)
        mkdir(dirw)
    return dirw
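# Usage sketch (hypothetical species label): a purely alphanumeric name is
# mapped into the shared genome tree, while anything path-like (containing
# '/' or '.') is used verbatim and created if missing.
#
#   make_genomedir("Zmays")       # -> /home/springer/zhoux379/data/genome/Zmays
#   make_genomedir("./my_genome") # used as-is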
def build_bwa(args):
    dirg, fg = get_genomedir(args.species)
    dirw = op.join(dirg, "21_dbs/bwa")
    if not op.isdir(dirw):
        mkdir(dirw)
    os.chdir(dirw)
    if op.isfile("db.bwt") and not args.overwrite:
        logging.debug("db.bwt already exists - skipped")
    else:
        sh("bwa index -a bwtsw -p %s/db %s" % (dirw, fg))
def build_blat(args):
    dirg, fg = get_genomedir(args.species)
    dirw = op.join(dirg, "21_dbs/blat")
    if not op.isdir(dirw):
        mkdir(dirw)
    os.chdir(dirw)
    if not args.overwrite and op.isfile('db.2bit'):
        logging.debug("db.2bit already exists - skipped")
    else:
        sh("faToTwoBit %s db.2bit" % fg)
        # tmp.fas acts as a throwaway query: this blat run only serves to
        # emit the 11-mer over-occurrence (.ooc) file; the alignment output
        # is discarded below
        sh("blat db.2bit tmp.fas tmp.out -makeOoc=db.2bit.tile11.ooc")
        if op.isfile("tmp.out"):
            os.remove("tmp.out")
def fasta(args):
    """
    %prog fasta fastqfiles

    Convert fastq to fasta and qual file.
    """
    p = OptionParser(fasta.__doc__)
    p.add_argument("--seqtk", default=False, action="store_true",
                   help="Use seqtk to convert")
    p.set_outdir()
    p.set_outfile(outfile=None)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastqfiles = args
    outdir = opts.outdir
    if outdir and outdir != ".":
        mkdir(outdir)

    fastqfile = fastqfiles[0]
    pf = op.basename(fastqfile)
    gzinput = pf.endswith(".gz")
    if gzinput:
        pf = pf.rsplit(".", 1)[0]

    pf, sf = pf.rsplit(".", 1)
    if sf not in ("fq", "fastq"):
        logging.debug("Assumed FASTA: suffix not `fq` or `fastq`")
        return fastqfile, None

    fastafile, qualfile = pf + ".fasta", pf + ".qual"
    outfile = opts.outfile or fastafile
    outfile = op.join(outdir, outfile)

    if opts.seqtk:
        if need_update(fastqfiles, outfile):
            for i, fastqfile in enumerate(fastqfiles):
                cmd = "seqtk seq -A {0} -L 30 -l 70".format(fastqfile)
                # First one creates the file, following ones append to it
                sh(cmd, outfile=outfile, append=i)
        else:
            logging.debug("Outfile `{0}` already exists.".format(outfile))
        return outfile, None

    for fastqfile in fastqfiles:
        SeqIO.convert(fastqfile, "fastq", fastafile, "fasta")
        SeqIO.convert(fastqfile, "fastq", qualfile, "qual")

    return fastafile, qualfile
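# Usage sketch (hypothetical file names and flags; --outdir assumes the option
# registered by p.set_outdir()): Biopython conversion by default, or seqtk
# (dropping reads < 30 bp, wrapping at 70 columns) when --seqtk is given.
#
#   fasta(["reads.fastq.gz"])                        # -> reads.fasta + reads.qual
#   fasta(["r1.fq", "r2.fq", "--seqtk", "--outdir", "fa"])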
def build_bowtie(args):
    dirg, fg = get_genomedir(args.species)
    dirw = op.join(dirg, "21_dbs/bowtie2")
    if not op.isdir(dirw):
        mkdir(dirw)
    os.chdir(dirw)
    if op.isfile("db.rev.1.bt2") and not args.overwrite:
        logging.debug("db.*.bt2 already exists - skipped")
    else:
        sh("rm -rf *")
        sh("ln -sf %s db.fa" % fg)
        # need to "module load bowtie2"
        sh("bowtie2-build db.fa db")
def build_gatk(args):
    dirg, fg = get_genomedir(args.species)
    dirw = op.join(dirg, "21_dbs/gatk")
    if not op.isdir(dirw):
        mkdir(dirw)
    os.chdir(dirw)
    if op.isfile("db.dict") and not args.overwrite:
        logging.debug("db.dict already exists - skipped")
    else:
        if op.exists("db.fasta"):
            sh("rm db.fasta")
        if op.exists("db.dict"):
            sh("rm db.dict")
        sh("cp ../../10_genome.fna db.fasta")
        sh("gatk CreateSequenceDictionary -R db.fasta")
        sh("samtools faidx db.fasta")
def build_hisat(args):
    dirg, fg = get_genomedir(args.species)
    dirw = op.join(dirg, "21_dbs/hisat2")
    if not op.isdir(dirw):
        mkdir(dirw)
    os.chdir(dirw)
    f_gtf = "../../50_annotation/10.gtf"
    if op.isfile("db.1.ht2") and not args.overwrite:
        logging.debug("db.1.ht2 already exists - skipped")
    elif not op.isfile(f_gtf):
        logging.error("no gtf file: %s" % f_gtf)
        sys.exit(1)
    else:
        sh("hisat2_extract_exons.py %s > db.exon" % f_gtf)
        sh("hisat2_extract_splice_sites.py %s > db.ss" % f_gtf)
        sh("hisat2-build -p %d --ss db.ss --exon db.exon %s db" % (args.p, fg))
def build_star(args):
    dirg, fg = get_genomedir(args.species)
    dirw = op.join(dirg, "21_dbs/star")
    if not op.isdir(dirw):
        mkdir(dirw)
    os.chdir(dirw)
    f_gtf = "../../50_annotation/10.gtf"
    if op.isfile("SA") and not args.overwrite:
        logging.debug("SA already exists - skipped")
    elif not op.isfile(f_gtf):
        logging.error("no gtf file: %s" % f_gtf)
        sys.exit(1)
    else:
        sh("STAR --runThreadN %d --runMode genomeGenerate --genomeDir %s "
           "--genomeFastaFiles %s --sjdbGTFfile %s" % (args.p, ".", fg, f_gtf))
def __init__(self, filename, outputdir=None, format="fasta", mode="cycle"):
    self.filename = filename
    self.outputdir = outputdir
    self.mode = mode

    format = format or self._guess_format(filename)
    logging.debug("format is %s" % format)

    if format in ("fasta", "fastq"):
        self.klass = "seqio"
    elif format == "clust":
        self.klass = "clust"
    else:
        self.klass = "txt"

    self.format = format
    mkdir(outputdir)
def merge_dirs(args):
    diris, diro = args.diri, args.diro
    mkdir(diro, overwrite=True)
    for diri in diris:
        for fn in os.listdir(diri):
            fi = op.join(diri, fn)
            fo = op.join(diro, fn)
            if not op.isfile(fi):
                continue
            if op.isfile(fo):
                if not cmp(fi, fo):
                    if args.replace:
                        copy(fi, fo)
                    else:
                        print("%s/%s diff from %s/%s - skipped" % (diri, fn, diro, fn))
            else:
                copy(fi, fo)
def merge(args):
    """
    %prog merge merged_bams bams1_dir bams2_dir ...

    Merge BAM files. Treat the bams with the same prefix as a set.
    Output the commands first.
    """
    from maize.apps.grid import MakeManager

    p = OptionParser(merge.__doc__)
    p.set_sep(sep="_", help="Separator to group per prefix")
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    merged_bams = args[0]
    bamdirs = args[1:]
    mkdir(merged_bams)

    bams = []
    for x in bamdirs:
        bams += glob(op.join(x, "*.bam"))
    bams = [x for x in bams if "nsorted" not in x]
    logging.debug("Found a total of {0} BAM files.".format(len(bams)))

    sep = opts.sep
    key = lambda x: op.basename(x).split(sep)[0]
    bams.sort(key=key)
    mm = MakeManager()
    for prefix, files in groupby(bams, key=key):
        files = sorted(list(files))
        nfiles = len(files)
        source = " ".join(files)
        target = op.join(merged_bams, op.basename(files[0]))
        if nfiles == 1:
            source = get_abs_path(source)
            cmd = "ln -s {0} {1}".format(source, target)
            mm.add("", target, cmd)
        else:
            cmd = "samtools merge -@ 8 {0} {1}".format(target, source)
            mm.add(files, target, cmd, remove=True)
    mm.write()
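# Usage sketch (hypothetical directories): BAMs sharing a prefix before the
# separator are merged into one file; singletons are symlinked instead. The
# commands are emitted via MakeManager rather than executed directly, so the
# call is followed by running the generated makefile.
#
#   merge(["21_merged", "11_bams_run1", "12_bams_run2"])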
def prepare(cfg):
    cfg = cfg['prepare']
    dirw, qry, tgt = cfg['dirw'], cfg['qry'], cfg['tgt']
    qry_fas, tgt_fas = cfg['qry_fas'], cfg['tgt_fas']
    tmpdir = cfg['temp_dir']
    npieces = int(cfg['npieces'])

    if not op.isdir(dirw):
        logging.debug("making directory: %s" % dirw)
        mkdir(dirw)
    os.chdir(dirw)
    subdirs = ['01_tgt_genome', '02_qry_genome']
    for subdir in subdirs:
        if not op.isdir(subdir):
            logging.debug("making directory: %s" % subdir)
            mkdir(subdir)

    if op.isfile(tgt_fas):
        fo = "raw.fa"
        if tgt_fas.endswith(".fa.gz") or tgt_fas.endswith(".fas.gz"):
            fo = "raw.fa.gz"
        sh("ln -sf %s 01_tgt_genome/%s" % (tgt_fas, fo))
    else:
        logging.error("%s does not exist" % tgt_fas)
    if op.isfile(qry_fas):
        fo = "raw.fa"
        if qry_fas.endswith(".fa.gz") or qry_fas.endswith(".fas.gz"):
            fo = "raw.fa.gz"
        sh("ln -sf %s 02_qry_genome/%s" % (qry_fas, fo))
    else:
        logging.error("%s does not exist" % qry_fas)

    sh("genome fasta %s/%s" % (dirw, "01_tgt_genome"))
    sh("genome blat %s/%s" % (dirw, "01_tgt_genome"))
    sh("genome fasta %s/%s" % (dirw, "02_qry_genome"))
    sh("genome blat %s/%s" % (dirw, "02_qry_genome"))

    sh("bed filter --minsize 1000 02_qry_genome/16.gap.bed > 04.qry.gap.bed")
    sh("subtractBed -nonamecheck -a 02_qry_genome/15.bed -b 04.qry.gap.bed | "
       "bed filter -min 100 - | "
       "bed makewindow -w 100000 -s 95000 - > 05.qry.clean.bed")
    sh("bed size 05.qry.clean.bed")
    sh("fasta extract 02_qry_genome/11_genome.fas 05.qry.clean.bed > 06.qry.fas")
    # NOTE: diro1 is assumed to be defined elsewhere (e.g. an 'outdir1'
    # config entry, as in run_blat below); it is undefined in this scope
    sh("fasta split --N %d %s %s" % (npieces, '06.qry.fas', diro1))
def tsvs(args):
    """
    %prog tsvs excelfile

    Convert all worksheets in EXCEL to tsv files.
    """
    excelfile = args.excel
    odir = args.outdir
    sep = args.sep

    xl = pd.ExcelFile(excelfile)
    sheets = xl.sheet_names
    print("will convert %d sheets under %s" % (len(sheets), odir))
    mkdir(odir)

    suf = '.tsv' if sep == '\t' else '.csv'
    for sheet in sheets:
        fo = "%s/%s%s" % (odir, sheet, suf)
        print(" writing %s" % fo)
        df = pd.read_excel(excelfile, sheet_name=sheet, header=0)
        df.to_csv(fo, sep=sep, header=True, index=False)
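# Usage sketch (hypothetical argparse namespace): each worksheet becomes one
# .tsv (or .csv when sep is ','), named after the sheet, under args.outdir.
#
#   from argparse import Namespace
#   tsvs(Namespace(excel="samples.xlsx", outdir="sheets", sep="\t"))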
def check_cfg_mapping(c):
    c.outdirs = c.outdir.split(",")
    assert len(c.outdirs) == 2, "not 2 outdirs: %s" % c.outdir
    for subdir in [c.dirw, c.temp_dir] + c.outdirs:
        if not op.isdir(subdir):
            mkdir(subdir)
    for fn in [c.ilist, c.genome, c.gff]:
        assert op.isfile(fn), "cannot read %s" % fn
    for key in 'samtools parallel sambamba bcftools bedtools'.split():
        fp = which(c[key])
        assert fp is not None, "not executable: %s" % c[key]
        c[key] = fp
    c.paired = str2bool(c.paired)

    if c.mapper == 'bwa':
        c.bwa = which(c.bwa)
        assert c.bwa is not None, "not executable: %s" % c.bwa
    elif c.mapper == 'hisat2':
        c.hisat2 = which(c.hisat2)
        assert c.hisat2 is not None, "not executable: %s" % c.hisat2
    elif c.mapper == 'bowtie2':
        c.bowtie2 = which(c.bowtie2)
        assert c.bowtie2 is not None, "not executable: %s" % c.bowtie2
    else:
        logging.error("unsupported mapper: %s" % c.mapper)
        sys.exit(1)

    njob = 3
    c.pbs_walltimes = c.pbs_walltime.split(",")
    c.pbs_ppns = c.pbs_ppn.split(",")
    c.pbs_queues = c.pbs_queue.split(",")
    assert njob == len(c.pbs_queues) == len(c.pbs_walltimes) == len(c.pbs_ppns), \
        "not %d jobs: %s" % (njob, c.pbs_queue)
    c.njob = njob
    return c
def split_old(args):
    fi, dirw = op.realpath(args.fi), op.realpath(args.outdir)
    n = args.N
    if not op.exists(dirw):
        mkdir(dirw)
    else:
        sh("rm -rf %s/*" % dirw)

    cdir = os.path.dirname(os.path.realpath(__file__))
    cwd = os.getcwd()
    os.chdir(dirw)

    sh("ln -sf %s part.fas" % fi)
    sh("pyfasta split -n %d part.fas" % n)
    sh("rm part.fas part.fas.*")

    digit = ndigit(n)
    sizes = []
    for i in range(0, n):
        fmt = "part.%%0%dd.fas" % digit
        fp = fmt % i
        sizes.append(os.stat(fp).st_size)
    sizes.sort()
    print("size range: %s - %s" % (prettysize(sizes[0]), prettysize(sizes[n-1])))
def check_cfg_index(c, noutdir=1, njob=3):
    c.outdirs = c.outdir.split(",")
    assert len(c.outdirs) == noutdir, "not %s outdirs: %s" % (noutdir, c.outdir)
    for subdir in [c.dirw, c.temp_dir] + c.outdirs:
        if not op.isdir(subdir):
            mkdir(subdir)
    for fn in [c.ilist, c.vcf, c.genome, c.gff]:
        assert op.isfile(fn), "cannot read %s" % fn
    for key in 'bcftools'.split():
        fp = which(c[key])
        assert fp is not None, "not executable: %s" % c[key]
        c[key] = fp
    c.pbs_walltimes = c.pbs_walltime.split(",")
    c.pbs_ppns = c.pbs_ppn.split(",")
    c.pbs_queues = c.pbs_queue.split(",")
    c.pbs_mems = c.pbs_mem.split(",")
    assert njob == len(c.pbs_queues) == len(c.pbs_walltimes) == len(c.pbs_ppns) == len(c.pbs_mems), \
        "not %d jobs: %s" % (njob, c.pbs_queue)
    c.njob = njob
    return c
parser = argparse.ArgumentParser(description='rename fastq files')
parser.add_argument('diri', help='FROM directory path')
parser.add_argument('diro', help='TO directory path')
parser.add_argument('list', help='read list (sid Readfile1[ Readfile2])')
args = parser.parse_args()

#dirw = '/home/springer/zhoux379/projects/3rnaseq/cache'
#diri = '/scratch.global/tenders/3RNA_0418/fastq_gz_files'
#diro = '10.fastq'
diri, diro, ilist = args.diri, args.diro, args.list
if not op.isfile(ilist):
    logging.error("cannot read %s" % ilist)
    sys.exit()
if not op.isdir(diro):
    mkdir(diro)

t = Table.read(ilist, format='ascii.tab')
paired = False
if 'Readfile2' in t.colnames:
    paired = True
tag = "paired" if paired else "single"
logging.debug("proceed as %s-end reads" % tag)

for i in range(len(t)):
    if paired:
        sid, fq1, fq2 = t['sid'][i], t['Readfile1'][i], t['Readfile2'][i]
        fq1, fq2 = op.join(diri, fq1), op.join(diri, fq2)
        assert op.isfile(fq1), "cannot access fq1: %s" % fq1
        assert op.isfile(fq2), "cannot access fq2: %s" % fq2
        cmd = "cp %s %s/%s_1.fq.gz" % (fq1, diro, sid)
def run_blat(cfg):
    cfg = cfg['blat']
    dirw, jobpre, diro1, diro2 = \
        cfg['dirw'], cfg['job_prefix'], cfg['outdir1'], cfg['outdir2']
    qry, tgt = cfg['qry'], cfg['tgt']
    parallel = cfg['parallel']
    npieces, npieces2 = int(cfg['npieces']), int(cfg['npieces2'])
    pbs_template, pbs_queue, pbs_walltime, pbs_ppn, pbs_email = \
        cfg['pbs_template'], cfg['pbs_queue'], cfg['pbs_walltime'], \
        cfg['pbs_ppn'], cfg['pbs_email']

    if not op.isdir(dirw):
        mkdir(dirw)
    os.chdir(dirw)
    for subdir in [diro1, diro2]:
        if not op.isdir(subdir):
            mkdir(subdir)

    dirt = "01_tgt_genome"
    dirq = "02_qry_genome"
    tgt_fas = "%s/11_genome.fas" % dirt
    qry_fas = "%s/11_genome.fas" % dirq
    tgt_2bit = "%s/21.blat/db.2bit" % dirt
    qry_2bit = "%s/21.blat/db.2bit" % dirq
    tgt_ooc = "%s/21.blat/db.2bit.tile11.ooc" % dirt
    tgt_size = "%s/15.sizes" % dirt
    qry_size = "%s/15.sizes" % dirq
    tgt_size_bed = "%s/15.bed" % dirt
    qry_size_bed = "%s/15.bed" % dirq
    tgt_gap = "%s/16.gap.bed" % dirt
    qry_gap = "%s/16.gap.bed" % dirq
    for fn in [tgt_fas, qry_fas, tgt_2bit, qry_2bit, tgt_ooc,
               tgt_size, qry_size, tgt_size_bed, qry_size_bed,
               tgt_gap, qry_gap]:
        if not op.isfile(fn):
            logging.error("%s is not there" % fn)
            sys.exit()

    pbs_queues = pbs_queue.split(",")
    pbs_ppns = pbs_ppn.split(",")
    pbs_walltimes = pbs_walltime.split(",")
    njob = len(pbs_queues)
    assert len(pbs_walltimes) == njob, "not %d jobs" % njob
    assert len(pbs_ppns) == njob, "not %d jobs" % njob

    fbs = ["%sb.%d.sh" % (jobpre, i+1) for i in range(njob)]
    fjs = ["%sj.%d.pbs" % (jobpre, i+1) for i in range(njob)]
    bcmds, jcmds = [], []

    #1 blat
    cmds = []
    bcmds.append(cmds)
    prepre = "part.%%0%dd" % ndigit(npieces-1)
    jcmds.append([
        "let i=${PBS_ARRAYID}",
        "cd %s" % dirw,
        "printf -v pre %s \"$i\"" % prepre,
        "pblat %s %s/${pre}.fas -threads=%s -ooc=%s %s/${pre}.psl" %
            (tgt_2bit, diro1, pbs_ppns[0], tgt_ooc, diro2)
    ])

    #2 process blat
    bcmds.append([
        "pslCat -nohead 12.blat/part.*.psl > 12.psl",
        "psl qcoord 12.psl %s > 13.psl" % qry_size,
        "pslCheck -querySizes=%s -targetSizes=%s -pass=14.check.psl 13.psl" %
            (qry_size, tgt_size),
        "axtChain -linearGap=medium -psl 14.check.psl %s %s 21.chain" %
            (tgt_2bit, qry_2bit),
        "chainPreNet 21.chain %s %s 23.chain" % (tgt_size, qry_size),
        "chain 2bed --qry 23.chain | sortBed -i stdin | mergeBed -i stdin > 23.bed",
        "subtractBed -a %s -b %s -nonamecheck | "
        "subtractBed -a stdin -b 23.bed -nonamecheck | "
        "bed filter --minsize 50 - > 24.nov.bed" % (qry_size_bed, qry_gap),
        "seqret.pl -d %s -b 24.nov.bed -o 24.nov.fas" % qry_fas,
        "rm 23.chain 23.bed",
        "fasta split --N %d %s %s" % (npieces2, '24.nov.fas', '25.nov'),
    ])
    jcmds.append([
        "cd %s" % dirw,
        "bash %s" % fbs[1],
    ])

    #3 blat nov
    cmds = []
    bcmds.append(cmds)
    prepre = "part.%%0%dd" % ndigit(npieces2-1)
    jcmds.append([
        "let i=${PBS_ARRAYID}",
        "cd %s" % dirw,
        "printf -v pre %s \"$i\"" % prepre,
        "pblat %s %s/${pre}.fas -threads=%s -ooc=%s %s/${pre}.psl" %
            (tgt_2bit, '25.nov', pbs_ppns[2], tgt_ooc, '25.nov')
    ])

    #4 process blat
    bcmds.append([
        "pslCat -nohead 25.nov/part.*.psl > 25.nov.psl",
        "psl qcoord 25.nov.psl %s > 26.psl" % qry_size,
        "pslCheck -querySizes=%s -targetSizes=%s -pass=27.check.psl 26.psl" %
            (qry_size, tgt_size),
        "pslCat 14.check.psl 27.check.psl > 31.1.psl",
        "pslSwap 31.1.psl 41.1.psl",
        "rm 25.nov.psl 26.psl",
        "axtChain -linearGap=medium -psl 31.1.psl %s %s 31.2.chain" %
            (tgt_2bit, qry_2bit),
        "chainPreNet 31.2.chain %s %s 31.3.chain" % (tgt_size, qry_size),
        "chainSwap 31.3.chain 31.3.q.chain",
        "chainNet 31.3.chain %s %s 31.5.net 31.5.q.net" % (tgt_size, qry_size),
        "netChainSubset 31.5.net 31.3.chain stdout | chainSort stdin 31.5.chain",
        "netChainSubset 31.5.q.net 31.3.q.chain stdout | chainSort stdin 31.5.q.chain",
        "chainNet 31.5.q.chain %s %s /dev/null 31.8.net" % (qry_size, tgt_size),
        "netChainSubset 31.8.net 31.3.chain 31.8.chain",
        "axtChain -linearGap=medium -psl 41.1.psl %s %s 41.2.chain" %
            (qry_2bit, tgt_2bit),
        "chainPreNet 41.2.chain %s %s 41.3.chain" % (qry_size, tgt_size),
        "chainSwap 41.3.chain 41.3.q.chain",
        "chainNet 41.3.chain %s %s 41.5.net 41.5.q.net" % (qry_size, tgt_size),
        "netChainSubset 41.5.net 41.3.chain stdout | chainSort stdin 41.5.chain",
        "netChainSubset 41.5.q.net 41.3.q.chain stdout | chainSort stdin 41.5.q.chain",
        "chainNet 41.5.q.chain %s %s /dev/null 41.8.net" % (tgt_size, qry_size),
        "netChainSubset 41.8.net 41.3.chain 41.8.chain",
    ])
    jcmds.append([
        "cd %s" % dirw,
        "bash %s" % fbs[3],
    ])

    #5 process vnt
    bcmds.append([
        "snp2vcf.pl -i snp -o snp.vcf -s %s" % qry,
    ])
    jcmds.append([
        "cd %s/31.9" % dirw,
        "bash %s" % fbs[4],
    ])

    assert len(bcmds) == njob, "not %d jobs" % njob
    assert len(jcmds) == njob, "not %d jobs" % njob
    for i in range(njob):
        fb, fj = fbs[i], fjs[i]
        if op.isfile(fb):
            os.remove(fb)
        if len(bcmds[i]) > 0:
            fhb = open(fb, "w")
            fhb.write("\n".join(bcmds[i]) + "\n")
            fhb.close()
        pbsjob = PbsJob(queue=pbs_queues[i], ppn=pbs_ppns[i],
                        walltime=pbs_walltimes[i], email=pbs_email,
                        cmds="\n".join(jcmds[i]))
        pbsjob.write(fj)

    logging.debug("%s job scripts were created" % njob)
    eprint("qsub -t 0-%d %s" % (npieces-1, fjs[0]))
    eprint("qsub -W depend=afterok: %s" % fjs[1])
    eprint("qsub -t 0-%d %s" % (npieces2-1, fjs[2]))
    eprint("qsub -W depend=afterok: %s" % fjs[3])
    eprint("qsub -W depend=afterok: %s" % fjs[4])
def check_cfg_mapping(c, noutdir=4, njob=2):
    c.outdirs = c.outdir.split(",")
    assert len(c.outdirs) == noutdir, "not %s outdirs: %s" % (noutdir, c.outdir)
    for subdir in [c.dirw, c.temp_dir] + c.outdirs:
        if not op.isdir(subdir):
            mkdir(subdir)
    for fn in [c.ilist, c.vcf, c.gene_bed]:
        assert op.isfile(fn), "cannot read %s" % fn
    for key in 'samtools parallel sambamba htseq bcftools bedtools'.split():
        fp = which(c[key])
        assert fp is not None, "not executable: %s" % c[key]
        c[key] = fp
    c.paired = str2bool(c.paired)
    assert c.stranded in ['yes', 'no', 'reverse'], "unknown stranded option: %s" % c.stranded

    if c.mapper == 'tophat2':
        c.tophat2 = which(c.tophat2)
        assert c.tophat2 is not None, "not executable: %s" % c.tophat2
    elif c.mapper == 'hisat2':
        c.hisat2 = which(c.hisat2)
        assert c.hisat2 is not None, "not executable: %s" % c.hisat2
    elif c.mapper == 'star':
        c.star = which(c.star)
        assert c.star is not None, "not executable: %s" % c.star
    else:
        logging.error("unsupported mapper: %s" % c.mapper)
        sys.exit(1)

    assert op.isdir(c.genomedir), "cannot access %s" % c.genomedir
    t = Table.read(c.ilist, format='ascii.tab')
    if 'genome' not in t[0]:
        genomeb = 'B73c'
        logging.debug("no 'genome' column detected: use %s" % genomeb)
        t.add_column(Column([genomeb] * len(t)), name='genome')
    c.t = t

    genomes = set()
    for i in range(len(t)):
        gts = t['genome'][i].split(",")
        for gt in gts:
            genomes.add(gt)
    genomes = sorted(list(genomes))
    logging.debug("checking %d genomes" % len(genomes))
    c.genomes = dict()
    for gt in genomes:
        c.genomes[gt] = dict()
        dirg = "%s/%s" % (c.genomedir, gt)
        dbpre = ''
        if c.mapper == 'tophat2':
            dbpre = "%s/21.bowtie2/db" % dirg
            assert op.isfile("%s.4.bt2" % dbpre), "no %s db-index: %s" % (c.mapper, dbpre)
        elif c.mapper == 'hisat2':
            dbpre = "%s/21.hisat2/db" % dirg
            assert op.isfile("%s.8.ht2" % dbpre), "no %s db-index: %s" % (c.mapper, dbpre)
        elif c.mapper == 'star':
            dbpre = "%s/21.star" % dirg
            assert op.isfile("%s/SA" % dbpre), "no %s db-index: %s" % (c.mapper, dbpre)
        c.genomes[gt]['db'] = dbpre
        gff = "%s/51.gff" % dirg
        assert op.isfile(gff), "no gff for %s: %s" % (gt, gff)
        c.genomes[gt]['gff'] = gff

    c.pbs_walltimes = c.pbs_walltime.split(",")
    c.pbs_ppns = c.pbs_ppn.split(",")
    c.pbs_queues = c.pbs_queue.split(",")
    assert njob == len(c.pbs_queues) == len(c.pbs_walltimes) == len(c.pbs_ppns), \
        "not %d jobs: %s" % (njob, c.pbs_queue)
    c.njob = njob
    return c
def entrez(args):
    """
    %prog entrez <filename|term>

    `filename` contains a list of terms to search. Or just one term. If the
    results are small in size, e.g. "--format=acc", use "--batchsize=100" to
    speed up the download.
    """
    p = OptionParser(entrez.__doc__)

    allowed_databases = {
        "fasta": ["genome", "nuccore", "nucgss", "protein", "nucest"],
        "asn.1": ["genome", "nuccore", "nucgss", "protein", "gene"],
        "xml":   ["genome", "nuccore", "nucgss", "nucest", "gene"],
        "gb":    ["genome", "nuccore", "nucgss"],
        "est":   ["nucest"],
        "gss":   ["nucgss"],
        "acc":   ["nuccore"],
    }

    valid_formats = tuple(allowed_databases.keys())
    valid_databases = ("genome", "nuccore", "nucest", "nucgss", "protein", "gene")

    p.add_argument("--noversion", dest="noversion",
                   default=False, action="store_true",
                   help="Remove trailing accession versions")
    p.add_argument("--format", default="fasta", choices=valid_formats,
                   help="download format [default: %default]")
    p.add_argument("--database", default="nuccore", choices=valid_databases,
                   help="search database [default: %default]")
    p.add_argument("--retmax", default=1000000, type=int,
                   help="how many results to return [default: %default]")
    p.add_argument("--skipcheck", default=False, action="store_true",
                   help="turn off prompt to check file existence [default: %default]")
    p.add_argument("--batchsize", default=500, type=int,
                   help="download the results in batch for speed-up [default: %default]")
    p.set_outdir(outdir=None)
    p.add_argument("--outprefix", default="out",
                   help="output file name prefix [default: %default]")
    p.set_email()

    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(p.print_help())

    filename, = args
    if op.exists(filename):
        pf = filename.rsplit(".", 1)[0]
        list_of_terms = [row.strip() for row in open(filename)]
        if opts.noversion:
            list_of_terms = [x.rsplit(".", 1)[0] for x in list_of_terms]
    else:
        pf = filename
        # the filename is the search term
        list_of_terms = [filename.strip()]

    fmt = opts.format
    database = opts.database
    batchsize = opts.batchsize

    assert database in allowed_databases[fmt], \
        "For output format '{0}', allowed databases are: {1}".format(
            fmt, allowed_databases[fmt])
    assert batchsize >= 1, "batchsize must be >= 1"

    if " " in pf:
        pf = opts.outprefix

    outfile = "{0}.{1}".format(pf, fmt)
    outdir = opts.outdir
    if outdir:
        mkdir(outdir)

    # If noprompt, will not check file existence
    if not outdir:
        fw = must_open(outfile, "w", checkexists=True, skipcheck=opts.skipcheck)
        if fw is None:
            return

    seen = set()
    totalsize = 0
    for id, size, term, handle in batch_entrez(list_of_terms,
                                               retmax=opts.retmax, rettype=fmt,
                                               db=database, batchsize=batchsize,
                                               email=opts.email):
        if outdir:
            outfile = op.join(outdir, "{0}.{1}".format(term, fmt))
            fw = must_open(outfile, "w", checkexists=True, skipcheck=opts.skipcheck)
            if fw is None:
                continue

        rec = handle.read()
        if id in seen:
            logging.error("Duplicate key ({0}) found".format(rec))
            continue

        totalsize += size
        print(rec, file=fw)
        print(file=fw)
        seen.add(id)

    if seen:
        print("A total of {0} {1} records downloaded.".format(totalsize, fmt.upper()),
              file=sys.stderr)

    return outfile
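# Usage sketch (hypothetical accession list; --outdir assumes the option
# registered by p.set_outdir()): fetch GenBank records for a file of ids,
# writing one file per term under the output directory. NCBI Entrez requires
# an email address, supplied here via p.set_email().
#
#   entrez(["ids.txt", "--format=gb", "--database=nuccore", "--outdir=gb"])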