def start_vcffilter_gatk(vcfs, genome, fa, Q, rmsk, ab, prune, queue, dir, partition, logger):
    '''Run the GATK vcf-filter helper script once per input vcf and wait.

    One genobox_vcffilter_gatk_h.py command line is assembled for every
    entry of vcfs; the commands are handed to Moab and the function then
    blocks in Semaphore.wait() on the resulting job ids.

    Arguments:
        vcfs       iterable of vcf paths, one command is built per entry
        genome     forwarded as --genome
        fa         forwarded as --fa
        Q          forwarded as --Q (formatted with %f)
        rmsk       forwarded as --rmsk only when truthy
        ab         forwarded as --ab only when it differs from 0.5
        prune      forwarded as --prune only when it differs from 0
        queue      queue handed to Moab and Semaphore
        dir        accepted but not used by this function
        partition  partition handed to Moab
        logger     handed to Moab as logfile

    Returns None. The directories 'genotyping' and 'tmp' are created in the
    current working directory when they do not exist.
    '''
    import os

    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore

    for workdir in ('genotyping', 'tmp'):
        if not os.path.exists(workdir):
            os.makedirs(workdir)

    # set queueing
    paths = genobox_modules.setSystem()
    home = os.getcwd()
    job_resources = 'nodes=1:ppn=2,mem=7gb,walltime=172800'

    # one command line per vcf; optional flags are only added when they
    # differ from the values tested below
    script = paths['genobox_home'] + 'genobox_vcffilter_gatk_h.py'
    vcffilter_calls = []
    for vcf in vcfs:
        pieces = [script, ' --vcf %s --fa %s --genome %s --Q %f' % (vcf, fa, genome, Q)]
        if rmsk:
            pieces.append(' --rmsk %s' % rmsk)
        if ab != 0.5:
            pieces.append(' --ab %f' % ab)
        if prune != 0:
            pieces.append(' --prune %i' % prune)
        vcffilter_calls.append(''.join(pieces))

    # submit jobs
    print("Submitting jobs")
    vcffilter_moab = Moab(
        vcffilter_calls,
        logfile=logger,
        runname='run_genobox_vcffilter_gatk',
        queue=queue,
        cpu=job_resources,
        partition=partition,
    )

    # release jobs (no explicit release() call is made here)
    print("Releasing jobs")

    # semaphore
    # NOTE(review): 20 and 2 * 86400 look like a poll interval and a timeout
    # in seconds - confirm against the Semaphore class.
    print("Waiting for jobs to finish ...")
    waiter = Semaphore(vcffilter_moab.ids, home, 'vcffilter_gatk', queue, 20, 2 * 86400)
    waiter.wait()
    print("--------------------------------------")
def start_dbsnp(vcf, ex, dbsnp, o, queue, partition, logger):
    '''Annotate a vcf.gz file with dbSNP through a submitted helper job.

    The helper script genobox_dbsnp_h.py is called with --vcf, --ex, --dbsnp
    and --o. According to the original description it exchanges chromosome
    names to the dbSNP version, sorts the vcf and annotates it; that work
    happens in the helper script, not in this function.

    Arguments:
        vcf        input vcf.gz path
        ex         forwarded as --ex
        dbsnp      dbSNP file; when falsy or the string 'None' nothing is
                   submitted and vcf is returned unchanged
        o          output path, forwarded as --o
        queue      queue handed to Moab and Semaphore
        partition  partition handed to Moab
        logger     handed to Moab as logfile

    Returns:
        vcf when annotation is skipped, otherwise o.
    '''
    # Decide whether there is anything to do before importing the cluster
    # modules or touching the file system, so the skip path has no
    # dependencies and no side effects beyond the two messages.
    if not dbsnp or dbsnp == 'None':
        print("No dbsnp file given - skipping")
        print("--------------------------------------")
        return vcf

    import os

    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore

    if not os.path.exists('genotyping'):
        os.makedirs('genotyping')

    # set queueing
    paths = genobox_modules.setSystem()
    home = os.getcwd()
    job_resources = 'nodes=1:ppn=1,mem=2gb,walltime=172800'

    # create command
    cmd = paths['genobox_home'] + 'genobox_dbsnp_h.py'
    arg = ' --vcf %s --ex %s --dbsnp %s --o %s' % (vcf, ex, dbsnp, o)
    dbsnp_calls = [cmd + arg]

    # submit jobs
    print("Submitting jobs")
    dbsnp_moab = Moab(
        dbsnp_calls,
        logfile=logger,
        runname='run_genobox_dbsnp',
        queue=queue,
        cpu=job_resources,
        partition=partition,
    )

    # release jobs (no explicit release() call is made here)
    print("Releasing jobs")

    # semaphore
    # NOTE(review): 20 and 2 * 86400 look like a poll interval and a timeout
    # in seconds - confirm against the Semaphore class.
    print("Waiting for jobs to finish ...")
    s = Semaphore(dbsnp_moab.ids, home, 'dbsnp', queue, 20, 2 * 86400)
    s.wait()
    print("--------------------------------------")
    return o
def start_trim(args, logger):
    '''Submit read-trimming jobs for the inputs named in args and wait.

    Called from genobox.py. Trimming commands are produced by single_trim()
    and paired_trim(); single-end commands are submitted when args.se is
    truthy, paired-end commands when both args.pe1 and args.pe2 are truthy.

    Arguments:
        args    options object; this function reads args.partition,
                args.se, args.pe1, args.pe2 and args.queue and passes the
                whole object on to single_trim() and paired_trim()
        logger  handed to Moab as logfile

    Returns:
        (se_files, pe1_files, pe2_files) as produced by single_trim() and
        paired_trim().

    The directory 'trimmed' is created when missing and 'Done' is written
    to stderr once the wait has finished.
    '''
    import os
    import sys

    import genobox_modules
    from genobox_classes import Moab, Semaphore

    # set queueing; the return value of setSystem() is not needed here, the
    # call is kept because it is defined elsewhere and may have side effects
    genobox_modules.setSystem()
    home = os.getcwd()
    if args.partition == 'uv':
        trim_resources = 'procs=2,mem=512mb,walltime=172800,flags=sharedmem'
    else:
        trim_resources = 'nodes=1:ppn=2,mem=512mb,walltime=172800'

    # create path
    if not os.path.exists('trimmed'):
        os.makedirs('trimmed')

    # create calls
    single_calls, se_files = single_trim(args)
    paired_calls, pe1_files, pe2_files = paired_trim(args)

    # submit jobs, remembering every batch that was actually submitted
    print("Submitting jobs")
    submitted = []
    if args.se:
        submitted.append(Moab(
            single_calls,
            logfile=logger,
            runname='run_genobox_trimse',
            queue=args.queue,
            cpu=trim_resources,
            partition=args.partition,
        ))
    if args.pe1 and args.pe2:
        submitted.append(Moab(
            paired_calls,
            logfile=logger,
            runname='run_genobox_trimpe',
            queue=args.queue,
            cpu=trim_resources,
            partition=args.partition,
        ))

    # release jobs (no explicit release() call is made here)
    print("Releasing jobs")

    # wait for jobs to finish
    print("Waiting for jobs to finish ...")
    semaphore_ids = []
    for batch in submitted:
        semaphore_ids = semaphore_ids + batch.ids
    waiter = Semaphore(semaphore_ids, home, 'read_trimming', args.queue, 60, 86400)
    waiter.wait()
    print("--------------------------------------")
    sys.stderr.write('Done\n')

    # return trimmed files
    return (se_files, pe1_files, pe2_files)
def start_vcffilter(bcf, genome, caller, Q, ex, rmsk, ab, prune, o, queue, dir, partition, logger):
    '''Submit the variant vcf-filter job, release it and wait for it.

    Genome file must be given, one tab-separated line per chromosome:
        chrom, chrom_len, chrom_short_name, haploid/diploid, low_depth,
        high_depth

    Filtering steps (carried out by genobox_vcffilter_h.py):
        vcfutils.pl varFilter
        annotated repeats using rmsk
        heterozygote variants on haploid chromosomes
        allelic balance
        pruning of variants within N nt of each other

    Arguments:
        bcf, genome, caller, Q, rmsk, ab, prune
                   forwarded to the helper script as the options of the
                   same name (Q and ab with %f, prune with %i)
        ex         accepted but not used by this function
        o          output path; also the return value
        queue      queue handed to Moab and Semaphore
        dir        when truthy and not the string 'None', the helper writes
                   to '<dirname(o)>/<dir>.<basename(o)>' instead of o
        partition  partition handed to Moab
        logger     handed to Moab as logfile

    Returns:
        o (also when the helper was pointed at the dir-prefixed file name).
    '''
    import os

    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore

    for workdir in ('genotyping', 'genotyping/tmp'):
        if not os.path.exists(workdir):
            os.makedirs(workdir)

    # set queueing
    paths = genobox_modules.setSystem()
    home = os.getcwd()
    job_resources = 'nodes=1:ppn=1,mem=5gb,walltime=172800'

    # NOTE(review): the command list is only built for caller == 'samtools';
    # any other caller reaches the submission below without vcffilter_calls
    # being assigned - confirm whether other callers are expected here.
    if caller == 'samtools':
        # create command
        script = paths['genobox_home'] + 'genobox_vcffilter_h.py'
        if dir and dir != 'None':
            out_dir, out_name = os.path.split(o)
            outfile = '%s/%s.%s' % (out_dir, dir, out_name)
        else:
            outfile = o
        options = ' --bcf %s --genome %s --caller %s --Q %f --rmsk %s --ab %f --prune %i --o %s' % (
            bcf, genome, caller, Q, rmsk, ab, prune, outfile)
        vcffilter_calls = [script + options]

    # submit jobs
    print("Submitting jobs")
    vcffilter_moab = Moab(
        vcffilter_calls,
        logfile=logger,
        runname='run_genobox_vcffilter',
        queue=queue,
        cpu=job_resources,
        partition=partition,
    )

    # release jobs
    print("Releasing jobs")
    vcffilter_moab.release()

    # semaphore
    # NOTE(review): 20 and 2 * 86400 look like a poll interval and a timeout
    # in seconds - confirm against the Semaphore class.
    print("Waiting for jobs to finish ...")
    waiter = Semaphore(vcffilter_moab.ids, home, 'vcffilter', queue, 20, 2 * 86400)
    waiter.wait()
    print("--------------------------------------")

    # return filename of final vcf
    return o
def start_vcffilter(bcf, genome, caller, Q, ex, rmsk, ab, prune, o, queue, dir, partition, logger):
    '''Submit the variant vcf-filter job and wait for it to finish.

    Genome file must be given, one tab-separated line per chromosome:
        chrom, chrom_len, chrom_short_name, haploid/diploid, low_depth,
        high_depth

    Filtering steps (carried out by genobox_vcffilter_h.py):
        vcfutils.pl varFilter
        annotated repeats using rmsk
        heterozygote variants on haploid chromosomes
        allelic balance
        pruning of variants within N nt of each other

    Arguments:
        bcf, genome, caller, Q, rmsk, ab, prune
                   forwarded to the helper script as the options of the
                   same name (Q and ab with %f, prune with %i)
        ex         accepted but not used by this function
        o          output path; also the return value
        queue      queue handed to Moab and Semaphore
        dir        when truthy and not the string 'None', the helper writes
                   to '<dirname(o)>/<dir>.<basename(o)>' instead of o
        partition  partition handed to Moab
        logger     handed to Moab as logfile

    Returns:
        o (also when the helper was pointed at the dir-prefixed file name).
    '''
    import os

    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore

    for workdir in ('genotyping', 'genotyping/tmp'):
        if not os.path.exists(workdir):
            os.makedirs(workdir)

    # set queueing
    paths = genobox_modules.setSystem()
    home = os.getcwd()
    job_resources = 'nodes=1:ppn=1,mem=5gb,walltime=172800'

    # NOTE(review): the command list is only built for caller == 'samtools';
    # any other caller reaches the submission below without vcffilter_calls
    # being assigned - confirm whether other callers are expected here.
    if caller == 'samtools':
        # create command
        script = paths['genobox_home'] + 'genobox_vcffilter_h.py'
        if dir and dir != 'None':
            out_dir, out_name = os.path.split(o)
            outfile = '%s/%s.%s' % (out_dir, dir, out_name)
        else:
            outfile = o
        options = ' --bcf %s --genome %s --caller %s --Q %f --rmsk %s --ab %f --prune %i --o %s' % (
            bcf, genome, caller, Q, rmsk, ab, prune, outfile)
        vcffilter_calls = [script + options]

    # submit jobs
    print("Submitting jobs")
    vcffilter_moab = Moab(
        vcffilter_calls,
        logfile=logger,
        runname='run_genobox_vcffilter',
        queue=queue,
        cpu=job_resources,
        partition=partition,
    )

    # release jobs (the explicit release() call is disabled in this version)
    print("Releasing jobs")

    # semaphore
    # NOTE(review): 20 and 2 * 86400 look like a poll interval and a timeout
    # in seconds - confirm against the Semaphore class.
    print("Waiting for jobs to finish ...")
    waiter = Semaphore(vcffilter_moab.ids, home, 'vcffilter', queue, 20, 2 * 86400)
    waiter.wait()
    print("--------------------------------------")

    # return filename of final vcf
    return o
def start_bamprocess(library_file, bams, mapq, libs, tmpdir, queue, final_bam, realignment, known, fa, sample, partition, logger):
    '''Submit the bam processing chain for the input files and wait for it.

    Stages, each submitted as its own Moab batch depending on the previous
    one: filter+sort per bam, merge per library, rmdup per library, merge
    into final_bam and, when realignment is truthy, realignment.

    Arguments:
        library_file  a Library instance, a library file path, or a falsy
                      value / the string 'None'. A path is loaded with
                      Library(path).read(); in the last case the library is
                      created by genobox_modules.initialize_library() from
                      sample, mapq, libs and bams
        bams, mapq, libs, sample
                      only used when the library has to be initialized
        tmpdir        handed to merge_bam() and rmdup()
        queue         queue handed to Moab and Semaphore
        final_bam     name of the final bam; also the return value
        realignment   when truthy a realignment batch is appended
        known, fa     handed to realign_bam()
        partition     partition handed to Moab
        logger        handed to Moab as logfile

    Returns:
        final_bam
    '''
    import os

    import genobox_modules
    from genobox_classes import Moab, Semaphore, Library

    # set queueing; the return value of setSystem() is not needed here, the
    # call is kept because it is defined elsewhere and may have side effects
    genobox_modules.setSystem()
    home = os.getcwd()
    res_2gb = 'nodes=1:ppn=1,mem=2gb,walltime=345600'
    res_5gb = 'nodes=1:ppn=1,mem=5gb,walltime=345600'
    res_6gb = 'nodes=1:ppn=1,mem=6gb,walltime=345600'
    res_2cpu_7gb = 'nodes=1:ppn=2,mem=7gb,walltime=345600'

    # create library instance
    if not library_file or library_file == 'None':
        library = genobox_modules.initialize_library(
            libfile=library_file, sample=sample, mapq=mapq, libs=libs, bams=bams)
    elif isinstance(library_file, Library):
        library = library_file
    else:
        library = Library(library_file)
        library.read()
    bam2lib, lib2bam = library.getBamLibs()

    ## CREATE CALLS ##

    # filter bam and sort
    filter_sort_calls, _filter_sort_files = bam_filter_sort(lib2bam, bam2lib, 1500000000)

    # merge to libs
    merge_lib_calls, librarys = merge_bam(
        lib2bam.keys(), lib2bam.values(),
        add_suffix=True, final_suffix='.flt.sort.bam', tmpdir=tmpdir)

    # rmdup on libs
    rmdup_calls, rmdup_files = rmdup(librarys, tmpdir)

    # merge to final file (same call with and without realignment)
    merge_final_call, _merged_file = merge_bam([final_bam], [rmdup_files], add_suffix=False)

    # optional: realignment
    if realignment:
        realign_calls, _realigned_file = realign_bam(final_bam, final_bam, fa, known)

    ## SUBMIT JOBS ##

    print("Submitting jobs")
    filtersort_moab = Moab(
        filter_sort_calls,
        logfile=logger,
        runname='run_genobox_filtersort',
        queue=queue,
        cpu=res_2cpu_7gb,
        partition=partition,
    )
    # one dependency count per library: the number of bams in that library
    bams_per_library = [len(lib_bams) for lib_bams in lib2bam.values()]
    mergelib_moab = Moab(
        merge_lib_calls,
        logfile=logger,
        runname='run_genobox_lib_merge',
        queue=queue,
        cpu=res_5gb,
        depend=True,
        depend_type='complex',
        depend_val=bams_per_library,
        depend_ids=filtersort_moab.ids,
        partition=partition,
    )
    # NB: If memory should be changed, also change java memory spec in rmdup function
    rmdup_moab = Moab(
        rmdup_calls,
        logfile=logger,
        runname='run_genobox_rmdup',
        queue=queue,
        cpu=res_6gb,
        depend=True,
        depend_type='one2one',
        depend_val=[1],
        depend_ids=mergelib_moab.ids,
        partition=partition,
    )
    mergefinal_moab = Moab(
        merge_final_call,
        logfile=logger,
        runname='run_genobox_final_merge',
        queue=queue,
        cpu=res_2gb,
        depend=True,
        depend_type='conc',
        depend_val=[len(rmdup_moab.ids)],
        depend_ids=rmdup_moab.ids,
        partition=partition,
    )
    last_stage = mergefinal_moab
    if realignment:
        # realignment calls needs to be written together in a shell-file or
        # dependent on each other
        realign_moab = Moab(
            realign_calls,
            logfile=logger,
            runname='run_genobox_realignment',
            queue=queue,
            cpu=res_5gb,
            depend=True,
            depend_type='one2one',
            depend_val=[1],
            depend_ids=mergefinal_moab.ids,
            partition=partition,
        )
        last_stage = realign_moab

    # release jobs (the explicit release() calls are disabled)
    print("Releasing jobs")

    # semaphore: only the last stage of the chain is waited for
    print("Waiting for jobs to finish ...")
    waiter = Semaphore(last_stage.ids, home, 'bam_processing', queue, 20, 345600)
    waiter.wait()
    print("--------------------------------------")

    # return final bamfile
    return final_bam
def start_bcf2ref(bcf, genome_file, Q, ex, dbsnp, rmsk, indels, o, queue, dir, partition, logger):
    '''Submit one bcf2ref job per chromosome, release them and wait.

    Extracts high confidence same-as-reference bases from the bcf through
    genobox_bcf2ref_h.py. The options passed on let the helper
        exchange ids
        annotate using dbsnp
        filter rmsk
        filter ambiguous indel positions

    Arguments:
        bcf          forwarded as --bcf
        genome_file  read with get_genome(); of each entry the fields 0, 2,
                     4 and 5 are forwarded as --chr_id, --chr, --d and --D
        Q            forwarded as --Q (formatted with %f)
        ex, dbsnp, rmsk, indels
                     forwarded as the options of the same name
        o            output path; used as is only for a single-entry genome
                     without dir, otherwise the file name is prefixed
        queue        queue handed to Moab and Semaphore
        dir          optional file name prefix, ignored when falsy or the
                     string 'None'
        partition    partition handed to Moab
        logger       handed to Moab as logfile

    Returns None. The directory 'genotyping' is created when missing.
    '''
    import os

    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore

    if not os.path.exists('genotyping'):
        os.makedirs('genotyping')

    # set queueing
    paths = genobox_modules.setSystem()
    home = os.getcwd()
    job_resources = 'nodes=1:ppn=1,mem=5gb,walltime=172800'

    # read genome file
    genome = get_genome(genome_file)

    # create commands
    script = paths['genobox_home'] + 'genobox_bcf2ref_h.py'
    use_prefix = bool(dir and dir != 'None')
    bcf2ref_calls = []
    for chrom in genome:
        # set outfile name: prefix with dir and/or the chromosome short name
        per_chromosome = len(genome) != 1
        if not (use_prefix or per_chromosome):
            outfile = o
        else:
            out_dir, out_name = os.path.split(o)
            if use_prefix and per_chromosome:
                outfile = '%s/%s.%s.%s' % (out_dir, dir, chrom[2], out_name)
            elif use_prefix:
                outfile = '%s/%s.%s' % (out_dir, dir, out_name)
            else:
                outfile = '%s/%s.%s' % (out_dir, chrom[2], out_name)

        options = ' --bcf %s --chr_id \"%s\" --chr %s --d %s --D %s --Q %f --ex %s --dbsnp %s --rmsk %s --indels %s --o %s' % (
            bcf, chrom[0], chrom[2], chrom[4], chrom[5], Q, ex, dbsnp, rmsk, indels, outfile)
        bcf2ref_calls.append(script + options)

    # submit jobs
    print("Submitting jobs")
    bcf2ref_moab = Moab(
        bcf2ref_calls,
        logfile=logger,
        runname='run_genobox_bcf2ref',
        queue=queue,
        cpu=job_resources,
        partition=partition,
    )

    # release jobs
    print("Releasing jobs")
    bcf2ref_moab.release()

    # semaphore
    # NOTE(review): 20 and 2 * 86400 look like a poll interval and a timeout
    # in seconds - confirm against the Semaphore class.
    print("Waiting for jobs to finish ...")
    waiter = Semaphore(bcf2ref_moab.ids, home, 'bcf2ref', queue, 20, 2 * 86400)
    waiter.wait()
    print("--------------------------------------")
def start_genotyping(bam, chr, fa, prior, pp, queue, o, sample, partition, logger):
    '''Submit the samtools genotyping chain for a bam file, release and wait.

    Four dependent Moab batches are submitted: bam indexing, mpileup,
    combining the per-call bcf files into o, and indexing o. The command
    lists come from bam_index(), mpileup(), bcf_combine() and bcf_index().

    Arguments:
        bam        input bam, handed to bam_index() and mpileup()
        chr, fa, prior, pp
                   handed to mpileup()
        queue      queue handed to Moab and Semaphore
        o          output bcf; also the return value
        sample     handed to consensus()
        partition  partition handed to Moab
        logger     handed to Moab as logfile

    Returns:
        o

    The directory 'genotyping' is created when missing and the intermediate
    bcf files reported by mpileup() are removed after the wait.
    '''
    import os

    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore

    if not os.path.exists('genotyping'):
        os.makedirs('genotyping')

    # set queueing; the return value of setSystem() is not needed here, the
    # call is kept because it is defined elsewhere and may have side effects
    genobox_modules.setSystem()
    home = os.getcwd()
    res_single = 'nodes=1:ppn=1,mem=2gb,walltime=172800'
    res_double = 'nodes=1:ppn=2,mem=2gb,walltime=172800'

    # create calls
    bamindex_calls = bam_index(bam)
    mpileup_calls, bcffiles = mpileup(bam, chr, fa, prior, pp)
    bcfcombine_calls = bcf_combine(bcffiles, o)
    bcfindex_calls = bcf_index(o)
    # consensus calls are still generated, but their submission is disabled
    consensus(o, sample)

    # submit jobs
    print("Submitting jobs")
    n_mpileup = len(mpileup_calls)
    bamindex_moab = Moab(
        bamindex_calls,
        logfile=logger,
        runname='run_genobox_bamindex',
        queue=queue,
        cpu=res_single,
        partition=partition,
    )
    mpileup_moab = Moab(
        mpileup_calls,
        logfile=logger,
        runname='run_genobox_mpileup',
        queue=queue,
        cpu=res_double,
        depend=True,
        depend_type='expand',
        depend_val=[n_mpileup],
        depend_ids=bamindex_moab.ids,
        partition=partition,
    )
    bcfcombine_moab = Moab(
        bcfcombine_calls,
        logfile=logger,
        runname='run_genobox_bcfcombine',
        queue=queue,
        cpu=res_single,
        depend=True,
        depend_type='conc',
        depend_val=[n_mpileup],
        depend_ids=mpileup_moab.ids,
        partition=partition,
    )
    bcfindex_moab = Moab(
        bcfindex_calls,
        logfile=logger,
        runname='run_genobox_bcfindex',
        queue=queue,
        cpu=res_single,
        depend=True,
        depend_type='one2one',
        depend_val=[1],
        depend_ids=bcfcombine_moab.ids,
        partition=partition,
    )

    # release jobs
    print("Releasing jobs")
    for batch in (bamindex_moab, mpileup_moab, bcfcombine_moab, bcfindex_moab):
        batch.release()

    # semaphore (consensus is currently not waited for)
    print("Waiting for jobs to finish ...")
    waiter = Semaphore(bcfindex_moab.ids, home, 'genotyping', queue, 20, 2 * 86400)
    waiter.wait()
    print("--------------------------------------")

    # remove temporary files
    genobox_modules.rm_files(bcffiles)

    # return output bcf
    return o
def start_bcf2ref(bcf, genome_file, Q, ex, dbsnp, rmsk, indels, o, queue, dir, partition, logger):
    '''Submit one bcf2ref job per chromosome and wait for all of them.

    Extracts high confidence same-as-reference bases from the bcf through
    genobox_bcf2ref_h.py. The options passed on let the helper
        exchange ids
        annotate using dbsnp
        filter rmsk
        filter ambiguous indel positions

    Arguments:
        bcf          forwarded as --bcf
        genome_file  read with get_genome(); of each entry the fields 0, 2,
                     4 and 5 are forwarded as --chr_id, --chr, --d and --D
        Q            forwarded as --Q (formatted with %f)
        ex, dbsnp, rmsk, indels
                     forwarded as the options of the same name
        o            output path; used as is only for a single-entry genome
                     without dir, otherwise the file name is prefixed
        queue        queue handed to Moab and Semaphore
        dir          optional file name prefix, ignored when falsy or the
                     string 'None'
        partition    partition handed to Moab
        logger       handed to Moab as logfile

    Returns None. The directory 'genotyping' is created when missing.
    '''
    import os

    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore

    if not os.path.exists('genotyping'):
        os.makedirs('genotyping')

    # set queueing
    paths = genobox_modules.setSystem()
    home = os.getcwd()
    job_resources = 'nodes=1:ppn=1,mem=5gb,walltime=172800'

    # read genome file
    genome = get_genome(genome_file)

    # create commands
    script = paths['genobox_home'] + 'genobox_bcf2ref_h.py'
    use_prefix = bool(dir and dir != 'None')
    bcf2ref_calls = []
    for chrom in genome:
        # set outfile name: prefix with dir and/or the chromosome short name
        per_chromosome = len(genome) != 1
        if not (use_prefix or per_chromosome):
            outfile = o
        else:
            out_dir, out_name = os.path.split(o)
            if use_prefix and per_chromosome:
                outfile = '%s/%s.%s.%s' % (out_dir, dir, chrom[2], out_name)
            elif use_prefix:
                outfile = '%s/%s.%s' % (out_dir, dir, out_name)
            else:
                outfile = '%s/%s.%s' % (out_dir, chrom[2], out_name)

        options = ' --bcf %s --chr_id \"%s\" --chr %s --d %s --D %s --Q %f --ex %s --dbsnp %s --rmsk %s --indels %s --o %s' % (
            bcf, chrom[0], chrom[2], chrom[4], chrom[5], Q, ex, dbsnp, rmsk, indels, outfile)
        bcf2ref_calls.append(script + options)

    # submit jobs
    print("Submitting jobs")
    bcf2ref_moab = Moab(
        bcf2ref_calls,
        logfile=logger,
        runname='run_genobox_bcf2ref',
        queue=queue,
        cpu=job_resources,
        partition=partition,
    )

    # release jobs (the explicit release() call is disabled in this version)
    print("Releasing jobs")

    # semaphore
    # NOTE(review): 20 and 2 * 86400 look like a poll interval and a timeout
    # in seconds - confirm against the Semaphore class.
    print("Waiting for jobs to finish ...")
    waiter = Semaphore(bcf2ref_moab.ids, home, 'bcf2ref', queue, 20, 2 * 86400)
    waiter.wait()
    print("--------------------------------------")
def start_assembly(args, logger):
    '''Submit the velvet assembly chain described by args, release and wait.

    Chain of Moab batches, each depending on the one before it:
        [trimming]      only when args.trim is truthy
        [interleaving]  only when interleave() produced at least one call
        velveth -> velvetg
        [velvetparse -> velvetaccept]  only when more than one k-mer size
        velvetclean

    Arguments:
        args    options object. Read here: n, m, ksizes, trim, sample,
                queue, shortPaired, shortPaired2, longPaired. args.ksizes
                is replaced by set_kmersizes(args) when it equals ['auto'],
                and the three read attributes are overwritten with the
                second element returned by interleave()
        logger  handed to Moab as logfile

    Returns None.
    '''
    import os

    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore

    def wait_for_all(parent):
        # Keyword arguments that make a batch wait for every job of parent;
        # no dependency arguments at all when there is no parent batch.
        if parent is None:
            return {}
        return dict(depend=True, depend_type='all', depend_val=[1], depend_ids=parent.ids)

    def one_to_one(parent):
        # Keyword arguments for a job-by-job dependency on parent.
        return dict(depend=True, depend_type='one2one', depend_val=[1], depend_ids=parent.ids)

    # set queueing; the return value of setSystem() is not needed here, the
    # call is kept because it is defined elsewhere and may have side effects
    genobox_modules.setSystem()
    home = os.getcwd()
    res_velvet = 'nodes=1:ppn=%i,mem=%s,walltime=172800' % (args.n, args.m)
    res_small = 'nodes=1:ppn=1,mem=512mb,walltime=172800'
    res_double = 'nodes=1:ppn=2,mem=2gb,walltime=172800'

    # set kmersizes (if auto)
    if args.ksizes == ['auto']:
        args.ksizes = set_kmersizes(args)

    # trimming calls
    if args.trim:
        illuminatrim_calls = illumina_trim(args, int(args.ksizes[0]), 15, 20, 15, False)

    # checking if files needs to be interleaved; interleave() is called
    # twice per read type, once for the call and once for the new file list
    interleave_dict = {}
    for read_type in ('shortPaired', 'shortPaired2', 'longPaired'):
        interleave_dict[read_type] = interleave(getattr(args, read_type), args.sample)[0]
        setattr(args, read_type, interleave(getattr(args, read_type), args.sample)[1])

    # interleave calls
    interleave_calls = [call for _read_type, call in interleave_dict.items() if call]

    # velvet calls
    velveth_calls = create_velveth_calls(args)
    velvetg_calls = create_velvetg_calls(args)

    # velvet parse calls
    velvetparse_calls = get_best_assembly(args)
    velvetaccept_calls = accept_assembly(args)
    velvetclean_calls = clean()

    # set environment variable:
    env_var = 'OMP_NUM_THREADS=%i' % int(args.n - 1)

    # submit and release jobs
    print("Submitting jobs")

    # Optional stages first; 'upstream' is the batch velveth has to wait for
    upstream = None
    if args.trim:
        illuminatrim_moab = Moab(
            illuminatrim_calls,
            logfile=logger,
            runname='run_genobox_trim',
            queue=args.queue,
            cpu=res_double,
        )
        upstream = illuminatrim_moab
    if interleave_calls:
        interleave_moab = Moab(
            interleave_calls,
            logfile=logger,
            runname='run_genobox_interleave',
            queue=args.queue,
            cpu=res_double,
            **wait_for_all(upstream)
        )
        upstream = interleave_moab

    velveth_moab = Moab(
        velveth_calls,
        logfile=logger,
        runname='run_genobox_velveth',
        queue=args.queue,
        cpu=res_velvet,
        env=env_var,
        **wait_for_all(upstream)
    )
    velvetg_moab = Moab(
        velvetg_calls,
        logfile=logger,
        runname='run_genobox_velvetg',
        queue=args.queue,
        cpu=res_velvet,
        **one_to_one(velveth_moab)
    )

    # submit job for velvetparse if more than one ksize was chosen
    several_ksizes = len(args.ksizes) > 1
    clean_parent = velvetg_moab
    if several_ksizes:
        velvetparse_moab = Moab(
            velvetparse_calls,
            logfile=logger,
            runname='run_genobox_velvetparse',
            queue=args.queue,
            cpu=res_small,
            depend=True,
            depend_type='conc',
            depend_val=[len(velvetg_calls)],
            depend_ids=velvetg_moab.ids,
        )
        velvetaccept_moab = Moab(
            velvetaccept_calls,
            logfile=logger,
            runname='run_genobox_velvetaccept',
            queue=args.queue,
            cpu=res_small,
            **one_to_one(velvetparse_moab)
        )
        clean_parent = velvetaccept_moab
    velvetclean_moab = Moab(
        velvetclean_calls,
        logfile=logger,
        runname='run_genobox_velvetclean',
        queue=args.queue,
        cpu=res_small,
        **one_to_one(clean_parent)
    )

    # release jobs
    print("Releasing jobs")
    if args.trim and len(illuminatrim_calls) > 0:
        illuminatrim_moab.release()
    if len(interleave_calls) > 0:
        interleave_moab.release()
    velveth_moab.release()
    velvetg_moab.release()
    if several_ksizes:
        velvetparse_moab.release()
        velvetaccept_moab.release()
    velvetclean_moab.release()

    # semaphore: only the final clean-up batch is waited for
    print("Waiting for jobs to finish ...")
    waiter = Semaphore(velvetclean_moab.ids, home, 'velvet', args.queue, 20, 2 * 86400)
    waiter.wait()
    print("--------------------------------------")
def start_bamstats(args, bam, partition, logger, wait=True):
    '''Submit the bam statistics jobs and optionally wait for them.

    With args.mapdamage truthy only the mapdamage batch is submitted.
    Otherwise four batches are submitted: flagstat, genome coverage, a
    coverage plot (depending on the coverage batch) and average depth.
    The tools named by the original author are samtools flagstat, bedtools
    genomeCoverageBed and a python avgdepth script.

    Arguments:
        args       options object; args.mapdamage, args.fa and args.queue
                   are read here
        bam        bam file handed to every call-building helper
        partition  partition handed to Moab
        logger     handed to Moab as logfile
        wait       when truthy (default) block until the submitted jobs are
                   done, otherwise return right after submission

    Returns None. The directory 'stats' is created when missing.
    '''
    import os

    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore

    if not os.path.exists('stats'):
        os.makedirs('stats')

    # set queueing; the return value of setSystem() is not needed here, the
    # call is kept because it is defined elsewhere and may have side effects
    genobox_modules.setSystem()
    home = os.getcwd()
    res_small = 'nodes=1:ppn=1,mem=512mb,walltime=172800'
    res_medium = 'nodes=1:ppn=1,mem=2gb,walltime=172800'
    res_large = 'nodes=1:ppn=1,mem=7gb,walltime=172800'

    damage_only = args.mapdamage

    # create calls
    if damage_only:
        # NOTE(review): the helper is spelled 'mapdamapge' here; it is
        # defined outside this function - confirm the name exists.
        mapdamage_calls = mapdamapge(bam, args.fa)
    else:
        flagstat_calls = sam_flagstat(bam)
        coverage_calls = bed_genomeCov(bam)
        plotcoverage_calls = plot_coverage(bam)
        avgdepth_calls = python_avgdepth(bam)
        # saturation calls are still generated, but never submitted
        get_saturation(bam)

    # submit jobs
    print("Submitting jobs")
    if damage_only:
        mapdamage_moab = Moab(
            mapdamage_calls,
            logfile=logger,
            runname='run_genobox_mapdamage',
            queue=args.queue,
            cpu=res_small,
            partition=partition,
        )
    else:
        flagstat_moab = Moab(
            flagstat_calls,
            logfile=logger,
            runname='run_genobox_flagstat',
            queue=args.queue,
            cpu=res_medium,
            partition=partition,
        )
        coverage_moab = Moab(
            coverage_calls,
            logfile=logger,
            runname='run_genobox_coverage',
            queue=args.queue,
            cpu=res_medium,
            partition=partition,
        )
        plotcoverage_moab = Moab(
            plotcoverage_calls,
            logfile=logger,
            runname='run_genobox_plotcoverage',
            queue=args.queue,
            cpu=res_small,
            depend=True,
            depend_type='one2one',
            depend_val=[1],
            depend_ids=coverage_moab.ids,
            partition=partition,
        )
        avgdepth_moab = Moab(
            avgdepth_calls,
            logfile=logger,
            runname='run_genobox_avgdepth',
            queue=args.queue,
            cpu=res_large,
            partition=partition,
        )

    # release jobs (no explicit release() call is made here)
    print("Releasing jobs")

    if not wait:
        print("Jobs running, continuing")
        print("--------------------------------------")
        return

    # wait for jobs to finish
    print("Waiting for jobs to finish ...")
    if damage_only:
        semaphore_ids = mapdamage_moab.ids
    else:
        semaphore_ids = (flagstat_moab.ids + coverage_moab.ids
                         + plotcoverage_moab.ids + avgdepth_moab.ids)
    waiter = Semaphore(semaphore_ids, home, 'bam_stats', args.queue, 20, 86400)
    waiter.wait()
    print("--------------------------------------")
def bwa_se_align(fastqs, fa, fqtypes, qtrim, N, alignpath, bwa6, library, threads, queue, add_aln, partition, logger):
    '''Submit bwa aln + samse jobs for single-end fastq files.

    For every fastq one 'bwa aln' command (writing <alignpath><name>.sai)
    and one 'bwa samse | samtools view -Sb' command (writing
    <alignpath><name>.bam) are built. The samse batch is submitted with a
    one-to-one dependency on the aln batch. The function does not wait for
    the jobs.

    Arguments:
        fastqs     list of fastq paths
        fa         reference/index argument given to bwa
        fqtypes    quality type per fastq, same order as fastqs: 'Illumina'
                   (adds -I to bwa aln) or 'Sanger'
        qtrim      value for bwa aln -q
        N          value for bwa samse -n
        alignpath  prefix prepended to the output file names
        bwa6       when truthy use paths['bwa_6_2_home'], else paths['bwa_home']
        library    object whose getRG('Data') result is indexed by fastq
                   path to get the read group fields
        threads    value for bwa aln -t; also selects the job resources
        queue      queue handed to Moab
        add_aln    extra text appended after 'bwa aln ' when truthy
        partition  partition handed to Moab; 'uv' and 'uv2' select the
                   shared-memory resource string when threads != 1
        logger     handed to Moab as logfile

    Returns:
        (ids of the samse batch, dict mapping fastq path -> bam path)

    Raises:
        ValueError  for a fastq of type 'Solexa' or of any other
                    unrecognised type. Before, an unknown type silently
                    reused the command arguments of the previous file (or
                    failed on an unbound variable for the first file).
    '''
    import os

    import genobox_modules
    from genobox_classes import Moab

    paths = genobox_modules.setSystem()

    # setting cpus
    samse_resources = 'nodes=1:ppn=1,mem=7gb,walltime=345600'
    if threads == 1:
        align_resources = samse_resources
    elif partition == 'uv' or partition == 'uv2':
        align_resources = 'procs=%s,mem=5gb,walltime=345600,flags=sharedmem' % threads
    elif threads > 8:
        align_resources = 'nodes=1:ppn=%s,mem=7gb,walltime=345600' % threads
    else:
        align_resources = 'nodes=1:ppn=%s,mem=5gb,walltime=345600' % threads

    # get readgroups
    RG = library.getRG('Data')

    if bwa6:
        bwa_home = paths['bwa_6_2_home']
    else:
        bwa_home = paths['bwa_home']

    aln_cmd = bwa_home + 'bwa aln '
    if add_aln:
        aln_cmd = aln_cmd + add_aln

    bwa_align = []
    bwa_samse = []
    bamfiles_dict = dict()
    for i, fq in enumerate(fastqs):
        name = os.path.split(fq)[1]
        saifile = alignpath + name + '.sai'
        bamfile = alignpath + name + '.bam'

        # align
        if fqtypes[i] == 'Illumina':
            arg = ' -I -t %i -q %i %s %s > %s' % (threads, qtrim, fa, fq, saifile)
        elif fqtypes[i] == 'Sanger':
            arg = ' -t %i -q %i %s %s > %s' % (threads, qtrim, fa, fq, saifile)
        elif fqtypes[i] == 'Solexa':
            raise ValueError(
                'File %s is in Solexa format, convert to Sanger first\n' % fq)
        else:
            # never fall through with the previous file's arguments
            raise ValueError(
                'File %s has unsupported fastq type %s, must be Illumina or Sanger\n' % (fq, fqtypes[i]))
        bwa_align.append(aln_cmd + arg)

        # samse; the read group fields are joined with a literal backslash-t
        call = '%sbwa samse -n %i -r \"%s\" %s %s %s | %ssamtools view -Sb - > %s' % (
            bwa_home, N, '\\t'.join(RG[fq]), fa, saifile, fq,
            paths['samtools_home'], bamfile)
        bwa_samse.append(call)
        bamfiles_dict[fq] = bamfile

    # submit jobs
    # create moab instance for the align_calls and dispatch to queue
    bwa_align_moab = Moab(
        bwa_align,
        logfile=logger,
        runname='run_genobox_bwaalign',
        queue=queue,
        cpu=align_resources,
        partition=partition,
    )
    bwa_samse_moab = Moab(
        bwa_samse,
        logfile=logger,
        runname='run_genobox_bwasamse',
        queue=queue,
        cpu=samse_resources,
        depend=True,
        depend_type='one2one',
        depend_val=[1],
        depend_ids=bwa_align_moab.ids,
        partition=partition,
    )

    # release jobs (no explicit release() call is made here)
    print("Releasing jobs")

    return (bwa_samse_moab.ids, bamfiles_dict)
def bwasw_iontorrent(fastqs, fa, fqtypes, alignpath, bwa6, library, threads, queue, partition, logger):
    '''Submit BWA-SW alignment jobs for fastq files (Iontorrent data).

    For every fastq one 'bwa bwasw | samtools view -Sb' command writing
    <alignpath><name>.bam is built and the batch is handed to Moab. The
    function does not wait for the jobs.

    Arguments:
        fastqs     list of fastq paths
        fa         reference/index argument given to bwa
        fqtypes    quality type per fastq, same order as fastqs; only
                   'Sanger' is accepted
        alignpath  prefix prepended to the output file names
        bwa6       when truthy use paths['bwa_6_2_home'], else paths['bwa_home']
        library    accepted but not used by this function
        threads    value for bwasw -t; also selects the job resources
        queue      queue handed to Moab
        partition  partition handed to Moab; 'uv' and 'uv2' select the
                   shared-memory resource string when threads != 1
        logger     handed to Moab as logfile

    Returns:
        (ids of the alignment batch, dict mapping fastq path -> bam path)

    Raises:
        ValueError  for a fastq of type 'Illumina' or of any other type
                    than 'Sanger'. Before, an unknown type silently reused
                    the command arguments of the previous file (or failed
                    on an unbound variable for the first file).
    '''
    import os

    import genobox_modules
    from genobox_classes import Moab

    paths = genobox_modules.setSystem()

    # setting cpus
    if threads == 1:
        align_resources = 'nodes=1:ppn=1,mem=7gb,walltime=345600'
    elif partition == 'uv' or partition == 'uv2':
        align_resources = 'procs=%s,mem=5gb,walltime=345600,flags=sharedmem' % threads
    elif threads > 8:
        align_resources = 'nodes=1:ppn=%s,mem=7gb,walltime=345600' % threads
    else:
        align_resources = 'nodes=1:ppn=%s,mem=5gb,walltime=345600' % threads

    # align
    if bwa6:
        cmd = paths['bwa_6_2_home'] + 'bwa '
    else:
        cmd = paths['bwa_home'] + 'bwa '

    bwa_align = []
    bamfiles_dict = dict()
    for i, fq in enumerate(fastqs):
        name = os.path.split(fq)[1]
        bamfile = alignpath + name + '.bam'

        if fqtypes[i] == 'Illumina':
            raise ValueError(
                'BWA-SW should not align reads with Illumina Qualities')
        elif fqtypes[i] != 'Sanger':
            # never fall through with the previous file's arguments
            raise ValueError(
                'File %s has unsupported fastq type %s, BWA-SW alignment requires Sanger' % (fq, fqtypes[i]))

        arg = ' bwasw -t %i %s %s | %ssamtools view -Sb - > %s' % (
            threads, fa, fq, paths['samtools_home'], bamfile)
        bwa_align.append(cmd + arg)
        bamfiles_dict[fq] = bamfile

    # submit jobs
    # create moab instance for the align_calls and dispatch to queue
    bwa_align_moab = Moab(
        bwa_align,
        logfile=logger,
        runname='run_genobox_bwaalign',
        queue=queue,
        cpu=align_resources,
        partition=partition,
    )

    # release jobs (no explicit release() call is made here)
    print("Releasing jobs")

    return (bwa_align_moab.ids, bamfiles_dict)
def bwa_pe_align(pe1, pe2, fa, fqtypes_pe1, fqtypes_pe2, qtrim, N, alignpath, bwa6, a, library, threads, queue, add_aln, partition, logger):
    '''Submit bwa aln + sampe jobs for paired-end fastq files.

    For every pair two 'bwa aln' commands (one per mate, each writing a
    .sai file) and one 'bwa sampe | samtools view -Sb' command are built.
    The bam is named after the first mate: <alignpath><pe1 name>.bam. The
    sampe batch is submitted depending on both aln jobs of its pair. The
    function does not wait for the jobs.

    Arguments:
        pe1, pe2     lists of fastq paths, mates at the same index
        fa           reference/index argument given to bwa
        fqtypes_pe1, fqtypes_pe2
                     quality type per fastq: 'Sanger' or 'Illumina' (adds
                     -I); both mates of a pair must have the same type
        qtrim        value for bwa aln -q
        N            value for bwa sampe -n
        alignpath    prefix prepended to the output file names
        bwa6         when truthy use paths['bwa_6_2_home'], else
                     paths['bwa_home']
        a            value for bwa sampe -a
        library      object whose getRG('Data') result is indexed by the
                     pe1 fastq path to get the read group fields
        threads      value for bwa aln -t; also selects the job resources
        queue        queue handed to Moab
        add_aln      extra text appended to the 'bwa aln' command when truthy
        partition    partition handed to Moab; 'uv' and 'uv2' select the
                     shared-memory resource string when threads != 1
        logger       handed to Moab as logfile

    Returns:
        (ids of the sampe batch, dict mapping every pe1 and pe2 fastq path
        to the bam path of its pair)

    Raises:
        ValueError  when the mates of a pair have different types or the
                    type is neither 'Sanger' nor 'Illumina'.
    '''
    import os

    import genobox_modules
    from genobox_classes import Moab

    paths = genobox_modules.setSystem()

    # setting cpus
    sampe_resources = 'nodes=1:ppn=1,mem=7gb,walltime=345600'
    if threads == 1:
        align_resources = sampe_resources
    elif partition == 'uv' or partition == 'uv2':
        # NOTE(review): walltime is 172800 here but 345600 in every other
        # resource string of this function - confirm this is intended.
        align_resources = 'procs=%s,mem=5gb,walltime=172800,flags=sharedmem' % threads
    elif threads > 8:
        align_resources = 'nodes=1:ppn=%s,mem=7gb,walltime=345600' % threads
    else:
        align_resources = 'nodes=1:ppn=%s,mem=5gb,walltime=345600' % threads

    # get readgroups
    RG = library.getRG('Data')

    # align and sampe
    if bwa6:
        bwa_home = paths['bwa_6_2_home']
    else:
        bwa_home = paths['bwa_home']
    cmd = bwa_home + 'bwa '

    bwa_align1_calls = []
    bwa_align2_calls = []
    bwa_sampe_calls = []
    bamfiles_dict = dict()
    for i, fq in enumerate(pe1):
        # set input fastq format
        if fqtypes_pe1[i] != fqtypes_pe2[i]:
            raise ValueError('Fastq formats are not the same for %s and %s' % (pe1[i], pe2[i]))
        elif fqtypes_pe1[i] == 'Sanger':
            # trailing space so that add_aln is not glued onto 'aln'
            # (the Illumina variant below already ends with a space)
            bwa_cmd = '%s aln ' % cmd
        elif fqtypes_pe1[i] == 'Illumina':
            bwa_cmd = '%s aln -I ' % cmd
        else:
            raise ValueError('fqtype must be Sanger or Illumina')

        if add_aln:
            bwa_cmd = bwa_cmd + add_aln

        # set filenames
        name1 = os.path.split(pe1[i])[1]
        name2 = os.path.split(pe2[i])[1]
        saifile1 = alignpath + name1 + '.sai'
        saifile2 = alignpath + name2 + '.sai'
        bamfile = alignpath + name1 + '.bam'
        bamfiles_dict[pe1[i]] = bamfile
        bamfiles_dict[pe2[i]] = bamfile

        # generate calls
        bwa_align1_calls.append(
            '%s -t %s -q %i %s -f %s %s ' % (bwa_cmd, threads, qtrim, fa, saifile1, pe1[i]))
        bwa_align2_calls.append(
            '%s -t %s -q %i %s -f %s %s ' % (bwa_cmd, threads, qtrim, fa, saifile2, pe2[i]))
        # the read group fields are joined with a literal backslash-t
        bwa_sampe_calls.append(
            '%sbwa sampe -n %i -a %i -r \"%s\" %s %s %s %s %s | %ssamtools view -Sb - > %s' % (
                bwa_home, N, a, '\\t'.join(RG[fq]), fa, saifile1, saifile2,
                pe1[i], pe2[i], paths['samtools_home'], bamfile))

    # submit jobs
    # create moab instance for the align_calls and dispatch to queue
    bwa_align1_moab = Moab(
        bwa_align1_calls,
        logfile=logger,
        runname='run_genobox_bwaalign1',
        queue=queue,
        cpu=align_resources,
        partition=partition,
    )
    bwa_align2_moab = Moab(
        bwa_align2_calls,
        logfile=logger,
        runname='run_genobox_bwaalign2',
        queue=queue,
        cpu=align_resources,
        partition=partition,
    )

    # set jobids in the correct way: mate1, mate2, mate1, mate2, ... so that
    # every sampe job depends on the two consecutive ids of its own pair
    bwa_alignids = []
    for id1, id2 in zip(bwa_align1_moab.ids, bwa_align2_moab.ids):
        bwa_alignids.append(id1)
        bwa_alignids.append(id2)

    # submit sampe
    bwa_sampe_moab = Moab(
        bwa_sampe_calls,
        logfile=logger,
        runname='run_genobox_bwasampe',
        queue=queue,
        cpu=sampe_resources,
        depend=True,
        depend_type='conc',
        depend_val=[2],
        depend_ids=bwa_alignids,
        partition=partition,
    )

    # release jobs (no explicit release() call is made here)
    print("Releasing jobs")

    return (bwa_sampe_moab.ids, bamfiles_dict)
def start_genotyping(bam, chr, fa, prior, pp, queue, o, sample, partition, logger):
    '''Submit the samtools genotyping chain for a bam file and wait for it.

    Four dependent Moab batches are submitted: bam indexing, mpileup,
    combining the per-call bcf files into o, and indexing o. The command
    lists come from bam_index(), mpileup(), bcf_combine() and bcf_index().

    Arguments:
        bam        input bam, handed to bam_index() and mpileup()
        chr, fa, prior, pp
                   handed to mpileup()
        queue      queue handed to Moab and Semaphore
        o          output bcf; also the return value
        sample     handed to consensus()
        partition  partition handed to Moab
        logger     handed to Moab as logfile

    Returns:
        o

    The directory 'genotyping' is created when missing and the intermediate
    bcf files reported by mpileup() are removed after the wait.
    '''
    import os

    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore

    if not os.path.exists('genotyping'):
        os.makedirs('genotyping')

    # set queueing; the return value of setSystem() is not needed here, the
    # call is kept because it is defined elsewhere and may have side effects
    genobox_modules.setSystem()
    home = os.getcwd()
    res_single = 'nodes=1:ppn=1,mem=2gb,walltime=172800'
    res_double = 'nodes=1:ppn=2,mem=2gb,walltime=172800'

    # create calls
    bamindex_calls = bam_index(bam)
    mpileup_calls, bcffiles = mpileup(bam, chr, fa, prior, pp)
    bcfcombine_calls = bcf_combine(bcffiles, o)
    bcfindex_calls = bcf_index(o)
    # consensus calls are still generated, but their submission is disabled
    consensus(o, sample)

    # submit jobs
    print("Submitting jobs")
    n_mpileup = len(mpileup_calls)
    bamindex_moab = Moab(
        bamindex_calls,
        logfile=logger,
        runname='run_genobox_bamindex',
        queue=queue,
        cpu=res_single,
        partition=partition,
    )
    mpileup_moab = Moab(
        mpileup_calls,
        logfile=logger,
        runname='run_genobox_mpileup',
        queue=queue,
        cpu=res_double,
        depend=True,
        depend_type='expand',
        depend_val=[n_mpileup],
        depend_ids=bamindex_moab.ids,
        partition=partition,
    )
    bcfcombine_moab = Moab(
        bcfcombine_calls,
        logfile=logger,
        runname='run_genobox_bcfcombine',
        queue=queue,
        cpu=res_single,
        depend=True,
        depend_type='conc',
        depend_val=[n_mpileup],
        depend_ids=mpileup_moab.ids,
        partition=partition,
    )
    bcfindex_moab = Moab(
        bcfindex_calls,
        logfile=logger,
        runname='run_genobox_bcfindex',
        queue=queue,
        cpu=res_single,
        depend=True,
        depend_type='one2one',
        depend_val=[1],
        depend_ids=bcfcombine_moab.ids,
        partition=partition,
    )

    # release jobs (the explicit release() calls are disabled in this version)
    print("Releasing jobs")

    # semaphore (consensus is currently not waited for)
    print("Waiting for jobs to finish ...")
    waiter = Semaphore(bcfindex_moab.ids, home, 'genotyping', queue, 20, 2 * 86400)
    waiter.wait()
    print("--------------------------------------")

    # remove temporary files
    genobox_modules.rm_files(bcffiles)

    # return output bcf
    return o