def Subsampling(args):
    """
    %prog Subsampling SMs_file vcf_or_vcf.gz

    Subsampling vcf file using bcftools. The samples order will also
    change following the order in SMs_file.

    Writes <prefix>.subsm.slurm, a slurm job that runs `bcftools view -S`.
    """
    p = OptionParser(Subsampling.__doc__)
    p.set_slurm_opts(jn=True)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    SMsfile, vcffile, = args
    # output prefix: basename without the .vcf(.gz) suffix
    prefix = vcffile.split('/')[-1].split('.vcf')[0]
    new_f = prefix + '.subsm.vcf'
    cmd = "bcftools view -S %s %s > %s\n" % (SMsfile, vcffile, new_f)
    print(cmd)
    jobfile = '%s.subsm.slurm' % prefix
    header = Slurm_header % (opts.time, opts.memory, opts.prefix,
                             opts.prefix, opts.prefix)
    header += 'module load bcftools\n'
    header += cmd
    # fix: original never closed the job file handle
    with open(jobfile, 'w') as f:
        f.write(header)
    print('slurm file %s.subsm.slurm has been created, you can sbatch your job file.' % prefix)
def CombineRep(args):
    """
    %prog CombinRep dir

    combine all fg.gz files for same sample
    """
    p = OptionParser(CombineRep.__doc__)
    p.set_slurm_opts(array=False)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    mydir, = args
    fqs = [i for i in os.listdir(mydir) if i.endswith('fq.gz')]
    # sort and group by sample name: the leading token before '.', '_' and 'R'
    fqs = sorted(
        fqs, key=lambda x: int(x.split('.')[0].split('_')[0].split('R')[0]))
    SMs = [x.split('.')[0].split('_')[0].split('R')[0] for x in fqs]
    mydf = pd.DataFrame(dict(zip(['SM', 'FNs'], [SMs, fqs])))
    # per sample: replicate count and space-joined file names
    mygrpdf = mydf.groupby('SM').agg(['count', lambda x: ' '.join(x)])
    with open('combine_fqs.sh', 'w') as f:
        for sm in mygrpdf.index:
            n, fns = mygrpdf.loc[sm, :]
            f.write('cat %s > %s.cbd.fq.gz\n' % (fns, sm))
    cmd1 = 'chmod +x combine_fqs.sh\n'
    cmd2 = './combine_fqs.sh\n'
    header = Slurm_header % (opts.time, opts.memory, opts.prefix,
                             opts.prefix, opts.prefix)
    header += cmd1
    header += cmd2
    # fix: original "'CombineFQs.slurm' % prefix" raised TypeError (the string
    # has no format specifier) and `prefix` was never defined in this function
    with open('CombineFQs.slurm', 'w') as f:
        f.write(header)
    print('slurm file CombineFQs.slurm has been created, you can sbatch your job file.')
def cMLM(args):
    """
    %prog cMLM pheno(with header, tab delimited) geno_prefix(GM and GD prefix) PCA Kinship

    Run automated GAPIT compressed mixed linear model
    """
    p = OptionParser(cMLM.__doc__)
    p.set_slurm_opts(array=False)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    pheno, geno_prefix, PCA, Kinship = args
    # output prefix: phenotype file name without its extension
    mem = '.'.join(pheno.split('.')[0:-1])
    # write the R script that drives GAPIT
    with open('%s.cMLM.R' % mem, 'w') as rscript:
        rscript.write(Gapit_header % (pheno, geno_prefix, geno_prefix,
                                      PCA, Kinship, mem))
    # write the slurm job wrapping the R script
    tmpl = Slurm_header + 'module load R/3.3\n'
    content = tmpl % (opts.time, opts.memory, opts.prefix,
                      opts.prefix, opts.prefix)
    content += 'R CMD BATCH %s.cMLM.R\n' % mem
    with open('%s.cMLM.slurm' % mem, 'w') as job:
        job.write(content)
    print('R script %s.cMLM.R and slurm file %s.cMLM.slurm has been created, you can sbatch your job file.' % (mem, mem))
def plot(args):
    """
    %prog plot gwas_out result_prefix

    plt MVP results using MVP.Report function.
    https://github.com/XiaoleiLiuBio/MVP
    """
    p = OptionParser(plot.__doc__)
    p.set_slurm_opts(jn=True)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    gwasfn, op, = args  # op: output prefix
    # fix: original formatted MVP_Run_header with undefined `pheno`; the GWAS
    # result file argument is gwasfn. The dead, unused inline R snippet
    # (`cmds`) was removed.
    with open('%s.plot.R' % op, 'w') as f1:
        f1.write(MVP_Run_header % (gwasfn, op, op, op, op))
    header = Slurm_header % (opts.time, opts.memory, opts.prefix,
                             opts.prefix, opts.prefix)
    header += 'module load R\n'
    header += 'R CMD BATCH %s.mlm.farmcpu.R\n' % opts.prefix
    with open('%s.mlm.farmcpu.slurm' % opts.prefix, 'w') as f2:
        f2.write(header)
    print('%s.mlm.farmcpu.R and %s.mlm.farmcpu.slurm have been created.' % (opts.prefix, opts.prefix))
def IndexBam(args):
    """
    %prog IndexBam dir

    create the index for bam files
    """
    p = OptionParser(IndexBam.__doc__)
    p.set_slurm_opts(array=False)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    mydir, = args
    bams = [fn for fn in os.listdir(mydir) if fn.endswith('sorted.bam')]
    print('Total %s sorted.bam files' % len(bams))
    # one slurm job per sorted bam, named after the sample (token before '.')
    for bam in bams:
        sample = bam.split('.')[0]
        content = Slurm_header % (opts.time, opts.memory, sample,
                                  sample, sample)
        content += 'module load samtools/0.1\n'
        content += 'samtools index %s\n' % bam
        with open('%s.idx.slurm' % sample, 'w') as job:
            job.write(content)
    print('slurm files *.idx.slurm has been created, you can sbatch your job file.')
def genPCA(args):
    """
    %prog genPCA hmp N

    Generate first N PCs using tassel
    """
    p = OptionParser(genPCA.__doc__)
    p.set_slurm_opts(jn=True)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    hmp, N, = args
    out_prefix = hmp.replace('.hmp', '')
    tassel_cmd = ('run_pipeline.pl -Xms28g -Xmx29g -fork1 -h %s '
                  '-PrincipalComponentsPlugin -ncomponents %s '
                  '-covariance true -endPlugin -export %s_%sPCA -runfork1\n'
                  % (hmp, N, out_prefix, N))
    # slurm header plus the modules tassel needs
    tmpl = Slurm_header + 'ml java/1.8\n' + 'ml tassel/5.2\n'
    content = tmpl % (opts.time, opts.memory, opts.prefix,
                      opts.prefix, opts.prefix)
    content += tassel_cmd
    with open('%s.PCA%s.slurm' % (out_prefix, N), 'w') as job:
        job.write(content)
    print('slurm file %s.PCA%s.slurm has been created, you can sbatch your job file.' % (out_prefix, N))
def IndePvalue(args):
    """
    %prog IndePvalue plink_bed_prefix output

    calculate the number of independent SNPs (Me) and the bonferroni pvalue
    """
    p = OptionParser(IndePvalue.__doc__)
    p.set_slurm_opts(jn=True)
    p.add_option(
        '--cutoff',
        default='0.05',
        choices=('0.01', '0.05'),
        help='choose the pvalue cutoff for the calculation of bonferroni pvalue'
    )
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    bed, output = args
    # fix: convert the option to int BEFORE dividing — slurm options arrive as
    # strings (cf. the string defaults used by add_option in this file), so
    # `opts.memory / 1000` raised TypeError; integer-divide MB -> GB minus 2
    # to leave headroom for the JVM
    mem = int(opts.memory) // 1000 - 2
    cmd = 'java -Xmx%sg -jar %s --noweb --effect-number --plink-binary %s --genome --out %s' % (
        mem, GEC, bed, output)
    h = Slurm_header
    h += 'module load java/1.8\n'
    header = h % (opts.time, opts.memory, opts.prefix, opts.prefix,
                  opts.prefix)
    header += cmd
    with open('%s.Me_SNP.slurm' % output, 'w') as f:
        f.write(header)
    print('slurm file %s.Me_SNP.slurm has been created, you can sbatch your job file.' % output)
def impute(args):
    """
    %prog impute vcf

    impute missing data in vcf using beagle or linkimpute
    """
    p = OptionParser(impute.__doc__)
    p.set_slurm_opts(jn=True)
    p.add_option('--software',
                 default='linkimpute',
                 choices=('linkimpute', 'beagle'),
                 help='specify the imputation software')
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    vcffile, = args
    prefix = '.'.join(vcffile.split('.')[0:-1])
    new_f = prefix + '.impt.vcf'
    # pick tool-specific command and java module
    if opts.software == 'linkimpute':
        cmd = 'java -Xss100m -Xmx18G -jar %s -v %s %s \n' % (lkipt, vcffile, new_f)
        module_line = 'module load java/1.7 \n'
    else:
        cmd = 'java -Xss16G -Xmx18G -jar %s gt=%s out=%s.beagle \n' % (begle, vcffile, prefix)
        module_line = 'module load java/1.8 \n'
    # note: memory is pinned to 20000 regardless of opts.memory (as before)
    content = Slurm_header % (opts.time, 20000, opts.prefix, opts.prefix,
                              opts.prefix)
    content += module_line
    content += cmd
    with open('%s.%s.slurm' % (prefix, opts.software), 'w') as job:
        job.write(content)
    print('slurm file %s.%s.slurm has been created! ' % (prefix, opts.software))
def vcf2hmp(args):
    """
    %prog vcf2hmp vcf

    convert vcf generated from beagle to hmp format using tassel
    """
    p = OptionParser(vcf2hmp.__doc__)
    p.set_slurm_opts(jn=True)
    p.add_option('--version',
                 default='2',
                 choices=('1', '2'),
                 help='specify the hmp type. 1: hyploid. 2: diploid')
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    vcffile, = args
    prefix = '.'.join(vcffile.split('.')[0:-1])
    # diploid vs haploid export type
    if opts.version == '2':
        export_type = 'HapmapDiploid'
    else:
        export_type = 'Hapmap'
    cmd = '%s -Xms512m -Xmx10G -fork1 -vcf %s -export -exportType %s\n' % (
        tassel, vcffile, export_type)
    content = Slurm_header % (opts.time, opts.memory, opts.prefix,
                              opts.prefix, opts.prefix)
    content += 'module load java/1.8\n'
    content += cmd
    with open('%s.vcf2hmp.slurm' % prefix, 'w') as job:
        job.write(content)
    print('slurm file %s.vcf2hmp.slurm has been created, you can submit your job file.' % prefix)
def EstimateLD(args):
    """
    %prog dir_in dir_out

    run LD decay using tassel
    """
    p = OptionParser(EstimateLD.__doc__)
    p.set_slurm_opts(jn=True)
    p.add_option('--pattern', default='*vcf', help='pattern of vcf files')
    p.add_option('--window_size', default='1000',
                 help='specify how many SNPs in the sliding window')
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    dir_in, dir_out = args
    out_path = Path(dir_out)
    if not out_path.exists():
        out_path.mkdir()
    for vcf in Path(dir_in).glob(opts.pattern):
        job_prefix = vcf.name.replace('.vcf', '')
        ld_fn = '%s.ld' % job_prefix
        tassel_cmd = ('run_pipeline.pl -Xms512m -Xmx14g -fork1 -vcf %s -ld '
                      '-ldWinSize %s -ldType SlidingWindow -td_tab %s/%s\n'
                      % (vcf, opts.window_size, out_path, ld_fn))
        # note: memory is pinned to 15000 regardless of opts.memory (as before)
        content = Slurm_header % (opts.time, 15000, job_prefix,
                                  job_prefix, job_prefix)
        content += 'ml java/1.8\n'
        content += 'ml tassel/5.2\n'
        content += tassel_cmd
        with open('%s.estLD.slurm' % job_prefix, 'w') as job:
            job.write(content)
        print('slurm file %s.estLD.slurm has been created, you can submit your job file.' % job_prefix)
def SummarizeLD(args):
    """
    %prog dir_in dir_out

    summarize LD decay in log scale
    """
    # fix: parser was built from EstimateLD.__doc__, so --help printed the
    # wrong usage; use this function's own docstring
    p = OptionParser(SummarizeLD.__doc__)
    p.set_slurm_opts(jn=True)
    p.add_option('--pattern', default='*.ld.txt',
                 help='pattern of ld.txt files')
    p.add_option('--max_dist', default='1,000,000',
                 help='the maximum ld distance')
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    dir_in, dir_out = args
    dir_out = Path(dir_out)
    if not dir_out.exists():
        dir_out.mkdir()
    # the downstream script takes the distance as a power of ten, encoded as
    # the count of zeros in the human-readable --max_dist
    num0 = opts.max_dist.count('0')
    for fn in Path(dir_in).glob(opts.pattern):
        prefix = '.'.join(fn.name.split('.')[0:-1])
        out_fn = '%s.sum.csv' % prefix
        cmd = 'python -m schnablelab.SNPcalling.base SummarizeLD %s %s %s/%s\n' % (
            fn, num0, dir_out, out_fn)
        header = Slurm_header % (opts.time, opts.memory, prefix, prefix,
                                 prefix)
        header += cmd
        with open('%s.sumLD.slurm' % prefix, 'w') as f:
            f.write(header)
        print('slurm file %s.sumLD.slurm has been created, you can submit your job file.' % prefix)
def only_MAF(args):
    """
    %prog in_dir out_dir

    filter MAF
    """
    p = OptionParser(only_MAF.__doc__)
    p.set_slurm_opts(jn=True)
    p.add_option('--pattern', default='*.vcf',
                 help='file pattern for vcf files in dir_in')
    p.add_option('--maf', default='0.01', help='maf cutoff')
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    in_dir, out_dir, = args
    out_path = Path(out_dir)
    if not out_path.exists():
        # fix: the %s placeholder was never filled in the original
        sys.exit('%s does not exist...' % out_dir)
    dir_path = Path(in_dir)
    vcfs = dir_path.glob(opts.pattern)
    for vcffile in vcfs:
        prefix = '.'.join(vcffile.name.split('.')[0:-1])
        cmd = "python -m schnablelab.SNPcalling.base MAF %s %s\n" % (vcffile, opts.maf)
        with open('%s.maf.slurm' % prefix, 'w') as f:
            header = Slurm_header % (opts.time, opts.memory, prefix,
                                     prefix, prefix)
            header += 'ml bcftools\n'
            header += cmd
            f.write(header)
        print('slurm file %s.maf.slurm has been created, you can sbatch your job file.' % prefix)
def pdf2png(args):
    """
    %prog pdf2png dir_in dir_out

    Run imagemagick to convert pdf to png
    """
    p = OptionParser(pdf2png.__doc__)
    p.set_slurm_opts(jn=True)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    in_dir, out_dir, = args
    out_path = Path(out_dir)
    if not out_path.exists():
        # fix: the %s placeholder was never filled in the original
        sys.exit('%s does not exist...' % out_dir)
    dir_path = Path(in_dir)
    pdfs = dir_path.glob('*.pdf')
    for pdf in pdfs:
        print(pdf)
        prf = pdf.name.replace('.pdf', '')
        png = pdf.name.replace('.pdf', '.png')
        # note: time/memory are fixed at 100/15000 for these quick jobs
        header = Slurm_header % (100, 15000, prf, prf, prf)
        header += 'ml imagemagick\n'
        cmd = 'convert -density 300 {} -resize 25% {}/{}\n'.format(
            pdf, out_path, png)
        header += cmd
        with open('pdf2png.%s.slurm' % prf, 'w') as f:
            f.write(header)
def NUM_ALT(args):
    """
    %prog NUM_ALT vcf_or_vcf.gz

    only retain SNPs with only one ALT
    """
    p = OptionParser(NUM_ALT.__doc__)
    p.set_slurm_opts(jn=True)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    vcffile, = args
    prefix = vcffile.split('.')[0]
    new_f = prefix + '.alt1.vcf'
    cmd = "bcftools view -i 'N_ALT=1' %s > %s" % (vcffile, new_f)
    jobfile = '%s.alt1.slurm' % prefix
    header = Slurm_header % (opts.time, opts.memory, opts.prefix,
                             opts.prefix, opts.prefix)
    # fix: 'bacftools' typo made `module load` fail on the cluster
    header += 'module load bcftools\n'
    header += cmd
    # fix: original never closed the job file handle
    with open(jobfile, 'w') as f:
        f.write(header)
    print('slurm file %s.alt1.slurm has been created, you can sbatch your job file.' % prefix)
def ped2bed(args):
    """
    %prog ped_prefix

    Convert plink ped to binary bed format using Plink
    """
    p = OptionParser(ped2bed.__doc__)
    p.set_slurm_opts(jn=True)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    ped_prefix, = args
    plink_cmd = 'plink --noweb --file %s --make-bed --out %s\n' % (
        ped_prefix, ped_prefix)
    print('run cmd on local:\n%s' % plink_cmd)
    content = Slurm_header % (opts.time, opts.memory, opts.prefix,
                              opts.prefix, opts.prefix)
    content += 'ml plink\n'
    content += plink_cmd
    with open('%s.ped2bed.slurm' % ped_prefix, 'w') as job:
        job.write(content)
    print('Job file has been created. You can submit: sbatch -p jclarke %s.ped2bed.slurm' % ped_prefix)
def GLM(args):
    """
    %prog GLM GenoPrefix Pheno Outdir

    RUN automated GEMMA General Linear Model
    """
    p = OptionParser(GLM.__doc__)
    p.set_slurm_opts(jn=True)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    GenoPrefix, Pheno, Outdir = args
    # gemma expects the mean-genotype matrix and the SNP annotation file
    meanG = GenoPrefix + '.mean'
    annoG = GenoPrefix + '.annotation'
    outprefix = Pheno.split('.')[0]
    gemma_cmd = '%s -g %s -p %s -a %s -lm 4 -outdir %s -o %s' % (
        gemma, meanG, Pheno, annoG, Outdir, outprefix)
    print('The command running on the local node:\n%s' % gemma_cmd)
    content = Slurm_header % (opts.time, opts.memory, opts.prefix,
                              opts.prefix, opts.prefix)
    content += gemma_cmd
    with open('%s.glm.slurm' % outprefix, 'w') as job:
        job.write(content)
    print('slurm file %s.glm.slurm has been created, you can sbatch your job file.' % outprefix)
def SortHmp(args):
    """
    %prog SortHmp hmp

    Sort hmp in wired TASSEL way...
    """
    p = OptionParser(SortHmp.__doc__)
    p.set_slurm_opts(jn=True)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    hmp, = args
    prefix = hmp.replace('.hmp', '')
    out_prefix = hmp.replace('.hmp', '') + '.sorted'
    sort_cmd = ('run_pipeline.pl -Xms16g -Xmx18g -SortGenotypeFilePlugin '
                '-inputFile %s -outputFile %s -fileType Hapmap\n'
                % (hmp, out_prefix))
    # TASSEL appends '.hmp.txt'; rename back to the '.hmp' convention
    rename_cmd = 'mv %s %s' % (out_prefix + '.hmp.txt', out_prefix + '.hmp')
    tmpl = Slurm_header + 'module load java/1.8\n' + 'module load tassel/5.2\n'
    content = tmpl % (opts.time, opts.memory, opts.prefix,
                      opts.prefix, opts.prefix)
    content += sort_cmd
    content += rename_cmd
    with open('%s.Sort.slurm' % prefix, 'w') as job:
        job.write(content)
    print('slurm file %s.Sort.slurm has been created, you can sbatch your job file.' % prefix)
def MLM(args):
    """
    %prog MLM GenoPrefix('*.mean' and '*.annotation') Pheno Outdir

    RUN automated GEMMA Mixed Linear Model
    """
    p = OptionParser(MLM.__doc__)
    p.add_option('--kinship', default=False,
                 help='specify the relatedness matrix file name')
    p.add_option('--pca', default=False,
                 help='specify the principle components file name')
    p.set_slurm_opts(jn=True)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    GenoPrefix, Pheno, Outdir = args
    meanG = GenoPrefix + '.mean'
    annoG = GenoPrefix + '.annotation'
    # output prefix: phenotype basename without extension
    outprefix = '.'.join(Pheno.split('/')[-1].split('.')[0:-1])
    gemma_cmd = '%s -g %s -p %s -a %s -lmm 4 -outdir %s -o %s' % (
        gemma, meanG, Pheno, annoG, Outdir, outprefix)
    # optional kinship / PCA covariates
    if opts.kinship:
        gemma_cmd += ' -k %s' % opts.kinship
    if opts.pca:
        gemma_cmd += ' -c %s' % opts.pca
    print('The command running on the local node:\n%s' % gemma_cmd)
    content = Slurm_header % (opts.time, opts.memory, opts.prefix,
                              opts.prefix, opts.prefix)
    content += gemma_cmd
    with open('%s.mlm.slurm' % outprefix, 'w') as job:
        job.write(content)
    print('slurm file %s.mlm.slurm has been created, you can sbatch your job file.' % outprefix)
def hmp2vcf(args):
    """
    %prog hmp2vcf hmp

    convert hmp to vcf format using tassel
    """
    p = OptionParser(hmp2vcf.__doc__)
    p.set_slurm_opts(jn=True)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    hmpfile, = args
    prefix = '.'.join(hmpfile.split('.')[0:-1])
    tassel_cmd = ('run_pipeline.pl -Xms512m -Xmx10G -fork1 -h %s '
                  '-export -exportType VCF\n' % (hmpfile))
    print(tassel_cmd)
    content = Slurm_header % (opts.time, opts.memory, opts.prefix,
                              opts.prefix, opts.prefix)
    content += 'ml tassel/5.2\n'
    content += tassel_cmd
    with open('%s.hmp2vcf.slurm' % prefix, 'w') as job:
        job.write(content)
    print('slurm file %s.hmp2vcf.slurm has been created, you can sbatch your job file.' % prefix)
def farmcpu(args):
    """
    %prog farmcpu pheno(with header, tab delimited) geno_prefix(GM(chr must be nums) and GD prefix) PCA

    Run automated FarmCPU
    """
    p = OptionParser(farmcpu.__doc__)
    p.set_slurm_opts(jn=True)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    pheno, geno_prefix, PCA = args
    # output prefix: phenotype basename without extension
    mem = '.'.join(pheno.split('/')[-1].split('.')[0:-1])
    # write the R script driving FarmCPU
    with open('%s.FarmCPU.R' % mem, 'w') as rscript:
        rscript.write(FarmCPU_header % (pheno, geno_prefix, geno_prefix,
                                        PCA, mem))
    # write the slurm job wrapping the R script
    tmpl = Slurm_header + 'module load R/3.3\n'
    content = tmpl % (opts.time, opts.memory, opts.prefix,
                      opts.prefix, opts.prefix)
    content += 'R CMD BATCH %s.FarmCPU.R' % mem
    with open('%s.FarmCPU.slurm' % mem, 'w') as job:
        job.write(content)
    print('R script %s.FarmCPU.R and slurm file %s.FarmCPU.slurm has been created, you can sbatch your job file.' % (mem, mem))
def keras_cnn(args):
    """
    %prog train_dir val_dir num_category model_name_prefix

    Run vgg model
    """
    p = OptionParser(keras_cnn.__doc__)
    p.add_option('--epoch', default=500, help='number of epoches')
    p.add_option('--lr_n', default=1, type='int',
                 help='train model with differnt learning rates. if n=1: set lr to 0.001. if n>1: try differnt lr from 1e-2 to 1e-5 n times')
    p.set_slurm_opts(gpu=True)
    opts, args = p.parse_args(args)
    if len(args) != 4:
        sys.exit(not p.print_help())
    train_dir, val_dir, numC, mnp = args  # mnp:model name prefix
    # fns yields paired learning rates and model names for each run
    out_fns = fns(mnp, n=opts.lr_n)
    for idx in range(int(opts.lr_n)):
        model_name = out_fns.model_name[idx]
        train_cmd = 'python -m schnablelab.CNN.keras_vgg %s %s %s %s %s %s' % (
            train_dir, val_dir, numC, out_fns.lrs[idx], opts.epoch, model_name)
        content = Slurm_gpu_header % (opts.time, opts.memory, model_name,
                                      model_name, model_name)
        content += 'module load anaconda\nsource activate MCY\n'
        content += train_cmd
        with open('%s.slurm' % model_name, 'w') as job:
            job.write(content)
        print('slurm file %s.slurm has been created, you can sbatch your job file.' % model_name)
def CallHeightBatch(args):
    """
    %prog imagePattern("CM*.polish.png")

    generate height call jobs for all polished image files
    """
    p = OptionParser(CallHeightBatch.__doc__)
    p.set_slurm_opts(array=False)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    pattern, = args
    for png in glob(pattern):
        # job name: image basename without the '.polish.png' suffix
        out_prefix = png.split('/')[-1].split('.polish.png')[0]
        jobname = out_prefix + '.Height'
        call_cmd = 'python -m schnablelab.CNN.CallHeight CallHeight %s %s\n' % (
            png, out_prefix)
        content = Slurm_header % (opts.time, opts.memory, jobname,
                                  jobname, jobname)
        content += "ml anaconda\nsource activate %s\n" % opts.env
        content += call_cmd
        with open('%s.CallHeight.slurm' % out_prefix, 'w') as job:
            job.write(content)
        print('%s.CallHeight.slurm call height job file generated!' % jobname)
def Sam2Bam(args):
    """
    %prog Sam2Bam dir

    Convert sam to bam format
    """
    p = OptionParser(Sam2Bam.__doc__)
    p.set_slurm_opts(array=False)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    mydir, = args
    sams = [fn for fn in os.listdir(mydir) if fn.endswith('sam')]
    print('Total %s sam files' % len(sams))
    # one conversion job per sam file, named after the sample
    for sam in sams:
        sample = sam.split('.')[0]
        bam = '%s.bam' % sample
        content = Slurm_header % (opts.time, opts.memory, sample,
                                  sample, sample)
        content += 'module load samtools/0.1\n'
        content += 'samtools view -bS %s > %s\n' % (sam, bam)
        with open('%s.sam2bam.slurm' % sample, 'w') as job:
            job.write(content)
    print('slurm files *.sam2bam.slurm has been created, you can sbatch your job file.')
def RunMACS2(args):
    """
    %prog species(bd, si, sb) out_prefix BAMs(separated by comma)

    call peaks using all bam files
    """
    p = OptionParser(RunMACS2.__doc__)
    p.set_slurm_opts(array=False)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    species, out_prefix, bams, = args
    all_bams = ' '.join([i for i in bams.split(',')])
    print('BAMS: %s' % all_bams)
    # effective genome sizes per species (bd/si/sb)
    g_dict = {'bd': '2e8', 'si': '3e8', 'sb': '6e8'}
    cmd = 'macs2 callpeak -t %s -n %s --outdir %s -f BAM -q 0.01 -g %s -B --nomodel --shift 37 --extsize 73\n' % (
        all_bams, out_prefix, out_prefix, g_dict[species])
    header = Slurm_header % (opts.time, opts.memory, out_prefix,
                             out_prefix, out_prefix)
    header += 'module load macs2\n'
    header += cmd
    jobfile = '%s.macs2.slurm' % out_prefix
    with open(jobfile, 'w') as f:
        f.write(header)
    # fix: the %s placeholder was never filled in the original message
    print('slurm files %s.macs2.slurm has been created, you can sbatch your job file.' % out_prefix)
def SNPsCall(args):
    """
    %prog SNPsCall ref info

    create the index for bam files
    """
    p = OptionParser(SNPsCall.__doc__)
    p.set_slurm_opts(array=False)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    ref, info, = args
    # NOTE(review): lists the CURRENT directory, not a dir argument — confirm
    # this command is meant to run inside the bam directory
    allfiles = [i for i in os.listdir('.') if i.endswith('sorted.bam')]
    print('Total %s sorted.bam files' % len(allfiles))
    with open('bamfiles.fb.list', 'w') as f1:
        for i in allfiles:
            f1.write(i + '\n')
    # fix: original never closed the info file handle
    with open(info) as f2:
        chrlist = [i.rstrip() for i in f2]
    # one freebayes job per region listed in the info file (chr:start-end)
    for seq in chrlist:
        region = '_'.join(seq.split(':'))
        cmd = '/work/schnablelab/cmiao/SorghumGWAS/scripts/freebayes/bin/freebayes -r %s -f %s -C 1 -L bamfiles.fb.list > %s\n' % (
            seq, ref, region + '.vcf')
        header = Slurm_header % (opts.time, opts.memory, seq, seq, seq)
        header += cmd
        with open('%s.fb.slurm' % region, 'w') as f:
            f.write(header)
    print('slurm files *.fb.slurm has been created, you can sbatch your job file.')
def only_ALT(args):
    """
    %prog in_dir out_dir

    filter number of ALT using bcftools
    """
    p = OptionParser(only_ALT.__doc__)
    p.set_slurm_opts(jn=True)
    p.add_option('--pattern', default='*.vcf',
                 help='file pattern for vcf files in dir_in')
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    in_dir, out_dir, = args
    out_path = Path(out_dir)
    if not out_path.exists():
        # fix: fill the %s placeholder (was printed literally)
        sys.exit('%s does not exist...' % out_dir)
    dir_path = Path(in_dir)
    vcfs = dir_path.glob(opts.pattern)
    for vcffile in vcfs:
        # fix: original referenced undefined `vcf` here (NameError on first
        # iteration); the loop variable is vcffile
        prefix = '.'.join(vcffile.name.split('.')[0:-1])
        new_f = prefix + '.alt1.vcf'
        cmd = "bcftools view -i 'N_ALT=1' %s > %s" % (vcffile, new_f)
        with open('%s.alt1.slurm' % prefix, 'w') as f:
            header = Slurm_header % (opts.time, opts.memory, prefix,
                                     prefix, prefix)
            # fix: 'bacftools' typo made `ml` fail on the cluster
            header += 'ml bcftools\n'
            header += cmd
            f.write(header)
        print('slurm file %s.alt1.slurm has been created, you can sbatch your job file.' % prefix)
def Trim(args):
    """
    %prog Trim dir

    quality control on raw fq.gz using Trimmomatric
    """
    p = OptionParser(Trim.__doc__)
    p.set_slurm_opts(array=False)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    mydir, = args
    allfiles = [i for i in os.listdir(mydir) if i.endswith('.fq.gz')]
    print('Total %s fastq.gz files' % len(allfiles))
    for i in allfiles:
        sm = i.split('.')[0]
        cmd1 = 'java -jar $TM_HOME/trimmomatic.jar SE %s %s CROP:185 SLIDINGWINDOW:4:15 MINLEN:30' % (
            i, sm + '.trimed.fq\n')
        cmd2 = 'gzip %s' % (sm + '.trimed.fq\n')
        # fix: original formatted the header with undefined `SM` (the loop
        # variable is lowercase `sm`), raising NameError on the first file
        header = Slurm_header % (opts.time, opts.memory, sm, sm, sm)
        header += cmd1
        header += cmd2
        jobfile = '%s.trimc.slurm' % sm
        with open(jobfile, 'w') as f:
            f.write(header)
    print('slurm files *.trimed.slurm has been created, you can sbatch your job file.')
def hmp2ped(args):
    """
    %prog hmp

    Convert hmp to plink ped format using Tassel
    """
    p = OptionParser(hmp2ped.__doc__)
    p.set_slurm_opts(jn=True)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    hmp, = args
    prefix = '.'.join(hmp.split('.')[0:-1])
    tassel_cmd = ('run_pipeline.pl -Xms512m -Xmx38G -fork1 -h %s '
                  '-export -exportType Plink\n' % hmp)
    content = Slurm_header % (opts.time, opts.memory, opts.prefix,
                              opts.prefix, opts.prefix)
    content += 'ml java/1.8\n'
    content += 'ml tassel/5.2\n'
    content += tassel_cmd
    with open('%s.hmp2ped.slurm' % prefix, 'w') as job:
        job.write(content)
    print('Job file has been created. You can submit: sbatch -p jclarke %s.hmp2ped.slurm' % prefix)
def PredictSlurmGPU(args):
    """
    %prog model_name npyPattern("CM*.npy") job_n

    generate prediction GPU jobs for all npy files
    """
    p = OptionParser(PredictSlurmGPU.__doc__)
    p.set_slurm_opts(jn=True)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    mn, npy_pattern, jobn, = args
    # require an explicit job-name prefix so jobs are distinguishable
    if opts.prefix == 'myjob':
        print('specify job name prefix!')
        sys.exit()
    npys = glob(npy_pattern)
    print(len(npys))
    # split the npy list into jobn chunks, each labelled 'start-end'
    grps = cutlist(npys, int(jobn))
    for gn, grp in grps:
        st, ed = gn.split('-')
        ed = int(ed) + 1  # make the end bound exclusive-style
        gn = '%s-%s' % (st, ed)
        predict_cmd = "python -m schnablelab.CNN.Predict Predict %s '%s' %s\n" % (
            mn, npy_pattern, gn)
        opt = '%s.%s' % (opts.prefix, gn)
        content = Slurm_gpu_header % (opts.time, opts.memory, opt, opt, opt)
        content += "ml anaconda\nsource activate MCY\n"
        content += predict_cmd
        with open('%s.gpu.slurm' % opt, 'w') as f:
            f.write(content)
        print('%s.gpu.slurm prediction GPU job file generated!' % opt)
def Preprocess(args):
    """
    %prog Preprocess dir

    1, Only keep variants: number of ALT==1, quality score >=10, MAF>=0.01, missing rate>0.3, type is snp.
    2, split msnp to snps.

    only applicable on the unimputed vcf files.
    """
    p = OptionParser(Preprocess.__doc__)
    p.set_slurm_opts(jn=True)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    mydir, = args
    # NOTE(review): lists the CURRENT directory although a dir argument is
    # taken — confirm the intended working directory before changing this
    allfiles = [i for i in os.listdir('.') if i.endswith('.vcf')]
    print('Total %s .vcf files' % len(allfiles))
    for i in allfiles:
        SM = i.split('.')[0]
        # fix: the second pipe stage was missing its subcommand — splitting
        # multiallelic records into biallelic snps is `bcftools norm -m -snps`
        cmd = "bcftools view -i 'N_ALT==1 && QUAL>=10 && MAF>=0.01 && NS/N_SAMPLES > 0.3' -v 'snps' %s | bcftools norm -m -snps > %s.prprcss.vcf" % (
            i, SM)
        jobfile = '%s.PreprocessVCF.slurm' % SM
        with open(jobfile, 'w') as f:
            header = Slurm_header % (opts.time, opts.memory, SM, SM, SM)
            header += 'module load bcftools\n'
            header += cmd
            f.write(header)
        print('slurm file %s.PreprocessVCF.slurm has been created, now you can sbatch your job files.' % SM)