def start_vcffilter_gatk(vcfs, genome, fa, Q, rmsk, ab, prune, queue, dir, partition, logger):
   '''Start variant vcf-filter using gatk'''

   import genobox_modules
   from genobox_classes import Moab
   from genobox_classes import Semaphore
   import subprocess
   import os

   if not os.path.exists('genotyping'):
      os.makedirs('genotyping')
   if not os.path.exists('tmp'):
      os.makedirs('tmp')

   # set queueing
   paths = genobox_modules.setSystem()
   home = os.getcwd()
   cpuA = 'nodes=1:ppn=1,mem=512mb,walltime=172800'
   cpuC = 'nodes=1:ppn=1,mem=2gb,walltime=172800'
   cpuE = 'nodes=1:ppn=1,mem=5gb,walltime=172800'
   cpuF = 'nodes=1:ppn=2,mem=7gb,walltime=172800'
   cpuB = 'nodes=1:ppn=16,mem=10gb,walltime=172800'

   vcffilter_calls = []
   cmd = paths['genobox_home'] + 'genobox_vcffilter_gatk_h.py'

   # for each chromosome
   for v in vcfs:
      arg = ' --vcf %s --fa %s --genome %s --Q %f' % (v, fa, genome, Q)
      if rmsk:
         arg = arg + ' --rmsk %s' % rmsk
      if ab != 0.5:
         arg = arg + ' --ab %f' % ab
      if prune != 0:
         arg = arg + ' --prune %i' % prune
      vcffilter_calls.append(cmd + arg)

   # submit jobs
   print "Submitting jobs"
   vcffilter_moab = Moab(vcffilter_calls, logfile=logger, runname='run_genobox_vcffilter_gatk', queue=queue, cpu=cpuF, partition=partition)

   # release jobs
   # print "Releasing jobs"
   #vcffilter_moab.release()

   # semaphore
   print "Waiting for jobs to finish ..."
   s = Semaphore(vcffilter_moab.ids, home, 'vcffilter_gatk', queue, 20, 2 * 86400)
   s.wait()
   print "--------------------------------------"

def start_genotyping(bam, chr, fa, prior, pp, queue, o, sample, partition, logger):
   '''Starts genotyping of the input bam file using samtools'''

   import subprocess
   import genobox_modules
   from genobox_classes import Moab
   from genobox_classes import Semaphore
   import os

   if not os.path.exists('genotyping'):
      os.makedirs('genotyping')

   # set queueing
   paths = genobox_modules.setSystem()
   home = os.getcwd()
   cpuA = 'nodes=1:ppn=1,mem=512mb,walltime=172800'
   cpuC = 'nodes=1:ppn=1,mem=2gb,walltime=172800'
   cpuE = 'nodes=1:ppn=1,mem=5gb,walltime=172800'
   cpuF = 'nodes=1:ppn=2,mem=2gb,walltime=172800'
   cpuB = 'nodes=1:ppn=16,mem=10gb,walltime=172800'

   # create calls
   bamindex_calls = bam_index(bam)
   (mpileup_calls, bcffiles) = mpileup(bam, chr, fa, prior, pp)
   bcfcombine_calls = bcf_combine(bcffiles, o)
   bcfindex_calls = bcf_index(o)
   consensus_calls = consensus(o, sample)

   # submit jobs
   # print "Submitting jobs"
   bamindex_moab = Moab(bamindex_calls, logfile=logger, runname='run_genobox_bamindex', queue=queue, cpu=cpuC, partition=partition)
   mpileup_moab = Moab(mpileup_calls, logfile=logger, runname='run_genobox_mpileup', queue=queue, cpu=cpuF, depend=True, depend_type='expand', depend_val=[len(mpileup_calls)], depend_ids=bamindex_moab.ids, partition=partition)
   bcfcombine_moab = Moab(bcfcombine_calls, logfile=logger, runname='run_genobox_bcfcombine', queue=queue, cpu=cpuC, depend=True, depend_type='conc', depend_val=[len(mpileup_calls)], depend_ids=mpileup_moab.ids, partition=partition)
   bcfindex_moab = Moab(bcfindex_calls, logfile=logger, runname='run_genobox_bcfindex', queue=queue, cpu=cpuC, depend=True, depend_type='one2one', depend_val=[1], depend_ids=bcfcombine_moab.ids, partition=partition)
   #consensus_moab = Moab(consensus_calls, logfile=logger, runname='run_genobox_consensus', queue=queue, cpu=cpuA, depend=True, depend_type='one2one', depend_val=[1], depend_ids=bcfcombine_moab.ids, partition=partition)

   # release jobs
   # print "Releasing jobs"
   #bamindex_moab.release()
   #mpileup_moab.release()
   #bcfcombine_moab.release()
   #bcfindex_moab.release()
   #consensus_moab.release()

   # semaphore (consensus is currently not waited for)
   print "Waiting for jobs to finish ..."
   s = Semaphore(bcfindex_moab.ids, home, 'genotyping', queue, 20, 2 * 86400)
   s.wait()
   print "--------------------------------------"

   # remove temporary files
   genobox_modules.rm_files(bcffiles)

   # return output bcf
   return o

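
# The bam_index/mpileup/bcf_combine/bcf_index/consensus helpers used by start_genotyping are
# defined elsewhere in this module. As an illustration only, a minimal sketch of the kind of
# per-chromosome command mpileup() is expected to produce (samtools 0.1.x "mpileup | bcftools view"
# syntax; the exact mapping of 'prior' and 'pp' to bcftools flags is an assumption, not taken
# from this file):
def _sketch_mpileup_calls(bam, chrs, fa, prior, pp):
   '''Illustrative sketch only: one samtools mpileup | bcftools view call per chromosome'''
   import os
   calls = []
   bcffiles = []
   for c in chrs:
      bcf = 'genotyping/%s.%s.bcf' % (os.path.split(bam)[1], c)
      # -u: uncompressed bcf, -g: genotype likelihoods, -f: faidx-indexed reference, -r: region
      call = 'samtools mpileup -ugf %s -r %s %s | bcftools view -bcg -P %s -p %f - > %s' % (fa, c, bam, prior, pp, bcf)
      calls.append(call)
      bcffiles.append(bcf)
   return (calls, bcffiles)
# example: _sketch_mpileup_calls('alignment/sample.bam', ['chr1', 'chr2'], 'ref.fa', 'full', 0.0001)
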
def start_dbsnp(vcf, ex, dbsnp, o, queue, partition, logger):
   '''Annotate vcf.gz file with dbSNP, exchanging chromosome names to the dbSNP version
      and sorting the vcf to match the input dbSNP
   '''

   import genobox_modules
   from genobox_classes import Moab
   from genobox_classes import Semaphore
   import subprocess
   import os

   if not dbsnp or dbsnp == 'None':
      print "No dbsnp file given - skipping"
      print "--------------------------------------"
      return vcf

   if not os.path.exists('genotyping'):
      os.makedirs('genotyping')

   # set queueing
   paths = genobox_modules.setSystem()
   home = os.getcwd()
   cpuA = 'nodes=1:ppn=1,mem=512mb,walltime=172800'
   cpuC = 'nodes=1:ppn=1,mem=2gb,walltime=172800'
   cpuE = 'nodes=1:ppn=1,mem=5gb,walltime=172800'
   cpuF = 'nodes=1:ppn=2,mem=2gb,walltime=172800'
   cpuB = 'nodes=1:ppn=16,mem=10gb,walltime=172800'

   # create command
   cmd = paths['genobox_home'] + 'genobox_dbsnp_h.py'
   arg = ' --vcf %s --ex %s --dbsnp %s --o %s' % (vcf, ex, dbsnp, o)
   dbsnp_calls = [cmd + arg]

   # submit jobs
   print "Submitting jobs"
   dbsnp_moab = Moab(dbsnp_calls, logfile=logger, runname='run_genobox_dbsnp', queue=queue, cpu=cpuC, partition=partition)

   # release jobs
   # print "Releasing jobs"
   #dbsnp_moab.release()

   # semaphore
   print "Waiting for jobs to finish ..."
   s = Semaphore(dbsnp_moab.ids, home, 'dbsnp', queue, 20, 2 * 86400)
   s.wait()
   print "--------------------------------------"

   return o

def start_genotyping_gatk(bam, genome, fa, dbsnp, call_conf, call_emit, output_mode, queue, sample, partition, logger):
   '''Starts genotyping of the input bam file using GATK (UnifiedGenotyper)'''

   import subprocess
   import genobox_modules
   from genobox_classes import Moab
   from genobox_classes import Semaphore
   import os

   if not os.path.exists('genotyping'):
      os.makedirs('genotyping')

   # set queueing
   paths = genobox_modules.setSystem()
   home = os.getcwd()
   cpuA = 'nodes=1:ppn=1,mem=512mb,walltime=172800'
   cpuC = 'nodes=1:ppn=1,mem=2gb,walltime=172800'
   cpuE = 'nodes=1:ppn=1,mem=3gb,walltime=172800'
   cpuF = 'nodes=1:ppn=2,mem=2gb,walltime=172800'
   cpuB = 'nodes=1:ppn=16,mem=10gb,walltime=172800'

   # create calls
   bamindex_calls = bam_index(bam)
   (gatk_calls, vcffiles) = unified_genotyper(bam, genome, fa, dbsnp, call_conf, call_emit, output_mode)

   # submit jobs
   # print "Submitting jobs"
   bamindex_moab = Moab(bamindex_calls, logfile=logger, runname='run_genobox_bamindex', queue=queue, cpu=cpuC, partition=partition)
   gatk_moab = Moab(gatk_calls, logfile=logger, runname='run_genobox_genotyping_gatk', queue=queue, cpu=cpuE, depend=True, depend_type='expand', depend_val=[len(gatk_calls)], depend_ids=bamindex_moab.ids, partition=partition)

   # release jobs
   # print "Releasing jobs"
   #bamindex_moab.release()
   #gatk_moab.release()

   # semaphore (consensus is currently not waited for)
   print "Waiting for jobs to finish ..."
   s = Semaphore(gatk_moab.ids, home, 'genotyping', queue, 20, 2 * 86400)
   s.wait()
   print "--------------------------------------"

   # remove temporary files
   #genobox_modules.rm_files(bcffiles)

   # return output variant files
   return vcffiles

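
# unified_genotyper() is defined elsewhere in this module. As an illustration only, a minimal
# sketch of the kind of per-chromosome GATK UnifiedGenotyper command it is expected to produce.
# The GATK 1.x-style flag names, the java memory setting and the per-chromosome splitting are
# assumptions, not taken from this file:
def _sketch_unified_genotyper_calls(bam, chrs, fa, dbsnp, call_conf, call_emit, output_mode):
   '''Illustrative sketch only: one GATK UnifiedGenotyper call per chromosome'''
   import os
   calls = []
   vcffiles = []
   for c in chrs:
      vcf = 'genotyping/%s.%s.vcf' % (os.path.split(bam)[1], c)
      call = 'java -Xmx3g -jar GenomeAnalysisTK.jar -T UnifiedGenotyper -R %s -I %s -L %s -stand_call_conf %f -stand_emit_conf %f -out_mode %s -o %s' % (fa, bam, c, call_conf, call_emit, output_mode, vcf)
      if dbsnp and dbsnp != 'None':
         call = call + ' --dbsnp %s' % dbsnp
      calls.append(call)
      vcffiles.append(vcf)
   return (calls, vcffiles)
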
def start_vcffilter(bcf, genome, caller, Q, ex, rmsk, ab, prune, o, queue, dir, partition, logger):
   '''Start variant vcf-filter

   Genome file must be given, format is a line for each chromosome:
   chrom\tchrom_len\tchrom_short_name\thaploid/diploid\tlow_depth\thigh_depth

   Filtering steps:
      vcfutils.pl varFilter
      annotated repeats using rmsk
      heterozygote variants on haploid chromosomes
      allelic balance
      pruning of variants within N nt of each other
   '''

   import genobox_modules
   from genobox_classes import Moab
   from genobox_classes import Semaphore
   import subprocess
   import os

   if not os.path.exists('genotyping'):
      os.makedirs('genotyping')
   if not os.path.exists('genotyping/tmp'):
      os.makedirs('genotyping/tmp')

   # set queueing
   paths = genobox_modules.setSystem()
   home = os.getcwd()
   cpuA = 'nodes=1:ppn=1,mem=512mb,walltime=172800'
   cpuC = 'nodes=1:ppn=1,mem=2gb,walltime=172800'
   cpuE = 'nodes=1:ppn=1,mem=5gb,walltime=172800'
   cpuF = 'nodes=1:ppn=2,mem=2gb,walltime=172800'
   cpuB = 'nodes=1:ppn=16,mem=10gb,walltime=172800'

   if caller == 'samtools':
      # create command
      cmd = paths['genobox_home'] + 'genobox_vcffilter_h.py'
      if dir and dir != 'None':
         outfile = '%s/%s.%s' % (os.path.split(o)[0], dir, os.path.split(o)[1])
      else:
         outfile = o
      arg = ' --bcf %s --genome %s --caller %s --Q %f --rmsk %s --ab %f --prune %i --o %s' % (bcf, genome, caller, Q, rmsk, ab, prune, outfile)
      vcffilter_calls = [cmd + arg]

   # submit jobs
   print "Submitting jobs"
   vcffilter_moab = Moab(vcffilter_calls, logfile=logger, runname='run_genobox_vcffilter', queue=queue, cpu=cpuE, partition=partition)

   # release jobs
   # print "Releasing jobs"
   vcffilter_moab.release()

   # semaphore
   print "Waiting for jobs to finish ..."
   s = Semaphore(vcffilter_moab.ids, home, 'vcffilter', queue, 20, 2 * 86400)
   s.wait()
   print "--------------------------------------"

   # return filename of final vcf
   return o

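
# The genome file format documented in the start_vcffilter docstring above (one tab-separated
# line per chromosome: chrom, chrom_len, chrom_short_name, haploid/diploid, low_depth, high_depth)
# is what get_genome() indexes into in start_bcf2ref below (chr[0], chr[2], chr[4], chr[5]).
# get_genome() itself is defined elsewhere in this module; the sketch below is only an
# illustration consistent with the documented column layout. Example line (values are made up):
#
#    NC_000913	4639675	NC_000913	haploid	10	100
#
def _sketch_get_genome(genome_file):
   '''Illustrative sketch only: parse the genome file into a list of per-chromosome field lists'''
   genome = []
   fh = open(genome_file, 'r')
   for line in fh:
      line = line.rstrip()
      if not line or line.startswith('#'):
         continue
      genome.append(line.split('\t'))
   fh.close()
   return genome
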
def start_alignment(args, logger):
   '''Start alignment of fastq files using BWA'''

   import genobox_modules
   from genobox_classes import Semaphore, Library
   import subprocess
   import os
   import random
   import string

   paths = genobox_modules.setSystem()
   home = os.getcwd()
   semaphore_ids = []
   bamfiles = dict()

   if not os.path.exists('alignment'):
      os.makedirs('alignment')

   # initialize library file from given arguments
   # (if args.mapq is defined then it is called from abgv, else it is called from alignment)
   if hasattr(args, 'mapq'):
      library = genobox_modules.initialize_library(args.libfile, args.se, args.pe1, args.pe2, args.sample, args.mapq, args.libs, args.pl)
   else:
      library = genobox_modules.initialize_library(args.libfile, args.se, args.pe1, args.pe2, args.sample, [30], args.libs, args.pl)

   # check for fa
   check_fa(args.fa, args.bwa6)

   # check if trimming was performed (abgv only) and set correct files
   #(se_files, pe1_files, pe2_files) = check_trim(args)

   # start single end alignments
   if args.se:
      # get platform info
      (PL, PL2data) = library.getPL('Data')
      print "Submitting single end alignments"
      for key, value in PL2data.items():
         if key == 'ILLUMINA' or key == 'HELICOS':
            fqtypes_se = []
            # filter to only contain single end files
            toalign = []
            for v in value:
               if v in args.se:
                  toalign.append(v)
            for fq in toalign:
               if args.quals:
                  fqtypes_se.append(args.quals)
               else:
                  fqtypes_se.append(check_formats_fq(fq, args.gz, args.bwa6))
            # submit
            (se_align_ids, bamfiles_se) = bwa_se_align(toalign, args.fa, fqtypes_se, args.qtrim, args.N, 'alignment/', args.bwa6, library, args.n, args.queue, args.add_aln, args.partition, logger)
            semaphore_ids.extend(se_align_ids)
            bamfiles.update(bamfiles_se)
         elif key == 'PACBIO':
            toalign = []
            for v in value:
               if v in args.se:
                  toalign.append(v)
            fqtypes_se = []
            for fq in toalign:
               if args.quals:
                  fqtypes_se.append(args.quals)
               else:
                  fqtypes_se.append(check_formats_fq(fq, args.gz, args.bwa6))
            # submit
            (se_align_ids, bamfiles_se) = bwasw_pacbio(toalign, args.fa, fqtypes_se, 'alignment/', args.bwa6, library, args.n, args.queue, args.partition, logger)
            semaphore_ids.extend(se_align_ids)
            bamfiles.update(bamfiles_se)
         elif key == 'IONTORRENT' or key == '454':
            toalign = []
            for v in value:
               if v in args.se:
                  toalign.append(v)
            fqtypes_se = []
            for fq in toalign:
               if args.quals:
                  fqtypes_se.append(args.quals)
               else:
                  fqtypes_se.append(check_formats_fq(fq, args.gz, args.bwa6))
            # submit
            (se_align_ids, bamfiles_se) = bwasw_iontorrent(toalign, args.fa, fqtypes_se, 'alignment/', args.bwa6, library, args.n, args.queue, args.partition, logger)
            semaphore_ids.extend(se_align_ids)
            bamfiles.update(bamfiles_se)

   # start paired end alignments
   if args.pe1:
      if len(args.pe1) != len(args.pe2):
         raise ValueError('Same number of files must be given to --pe1 and --pe2')

      # set fqtypes
      fqtypes_pe1 = []
      fqtypes_pe2 = []
      for fq in args.pe1:
         if args.quals:
            fqtypes_pe1.append(args.quals)
         else:
            fqtypes_pe1.append(check_formats_fq(fq, args.gz, args.bwa6))
      for fq in args.pe2:
         if args.quals:
            fqtypes_pe2.append(args.quals)
         else:
            fqtypes_pe2.append(check_formats_fq(fq, args.gz, args.bwa6))

      # submit
      print "Submitting paired end alignments"
      (pe_align_ids, bamfiles_pe) = bwa_pe_align(args.pe1, args.pe2, args.fa, fqtypes_pe1, fqtypes_pe2, args.qtrim, args.N, 'alignment/', args.bwa6, args.a, library, args.n, args.queue, args.add_aln, args.partition, logger)
      semaphore_ids.extend(pe_align_ids)
      bamfiles.update(bamfiles_pe)

   # update library
   library.update_with_tag('Data', 'BAM', bamfiles, True)

   # wait for jobs to finish
   print "Waiting for jobs to finish ..."
   s = Semaphore(semaphore_ids, home, 'bwa_alignment', args.queue, 60, 345600)
   s.wait()
   print "--------------------------------------"

   # return bamfiles
   return (bamfiles, library)

def start_bamprocess(library_file, bams, mapq, libs, tmpdir, queue, final_bam, realignment, known, fa, sample, partition, logger):
   '''Starts bam processing of input files'''

   import subprocess
   import genobox_modules
   from genobox_classes import Moab, Semaphore, Library
   import os

   # set queueing
   paths = genobox_modules.setSystem()
   home = os.getcwd()
   cpuA = 'nodes=1:ppn=1,mem=512mb,walltime=345600'
   cpuC = 'nodes=1:ppn=1,mem=2gb,walltime=345600'
   cpuE = 'nodes=1:ppn=1,mem=5gb,walltime=345600'
   cpuF = 'nodes=1:ppn=2,mem=2gb,walltime=345600'
   cpuB = 'nodes=1:ppn=16,mem=10gb,walltime=345600'
   cpuH = 'nodes=1:ppn=2,mem=7gb,walltime=345600'

   # create library instance
   if library_file and library_file != 'None':
      if isinstance(library_file, Library):
         library = library_file
      else:
         library = Library(library_file)
         library.read()
   else:
      library = genobox_modules.initialize_library(libfile=library_file, sample=sample, mapq=mapq, libs=libs, bams=bams)

   (bam2lib, lib2bam) = library.getBamLibs()

   ## CREATE CALLS ##

   # filter bam and sort
   (filter_sort_calls, filter_sort_files) = bam_filter_sort(lib2bam, bam2lib, 1500000000)

   # merge to libs
   (merge_lib_calls, librarys) = merge_bam(lib2bam.keys(), lib2bam.values(), add_suffix=True, final_suffix='.flt.sort.bam', tmpdir=tmpdir)

   # rmdup on libs
   (rmdup_calls, rmdup_files) = rmdup(librarys, tmpdir)

   # optional: realignment
   if realignment:
      (merge_final_call, sample_file) = merge_bam([final_bam], [rmdup_files], add_suffix=False)
      (realign_calls, final_file) = realign_bam(final_bam, final_bam, fa, known)
   else:
      # merge to final file
      (merge_final_call, final_file) = merge_bam([final_bam], [rmdup_files], add_suffix=False)

   ## SUBMIT JOBS ##
   print "Submitting jobs"
   filtersort_moab = Moab(filter_sort_calls, logfile=logger, runname='run_genobox_filtersort', queue=queue, cpu=cpuH, partition=partition)
   mergelib_moab = Moab(merge_lib_calls, logfile=logger, runname='run_genobox_lib_merge', queue=queue, cpu=cpuE, depend=True, depend_type='complex', depend_val=map(len, lib2bam.values()), depend_ids=filtersort_moab.ids, partition=partition)
   rmdup_moab = Moab(rmdup_calls, logfile=logger, runname='run_genobox_rmdup', queue=queue, cpu=cpuE, depend=True, depend_type='one2one', depend_val=[1], depend_ids=mergelib_moab.ids, partition=partition)
   # NB: If memory should be changed, also change java memory spec in rmdup function
   mergefinal_moab = Moab(merge_final_call, logfile=logger, runname='run_genobox_final_merge', queue=queue, cpu=cpuC, depend=True, depend_type='conc', depend_val=[len(rmdup_moab.ids)], depend_ids=rmdup_moab.ids, partition=partition)
   if realignment:
      realign_moab = Moab(realign_calls, logfile=logger, runname='run_genobox_realignment', queue=queue, cpu=cpuE, depend=True, depend_type='one2one', depend_val=[1], depend_ids=mergefinal_moab.ids, partition=partition)
      # realignment calls need to be written together in a shell-file or be dependent on each other

   # release jobs
   # print "Releasing jobs"
   #filtersort_moab.release()
   #mergelib_moab.release()
   #rmdup_moab.release()
   #mergefinal_moab.release()
   #if realignment: realign_moab.release()

   # semaphore
   print "Waiting for jobs to finish ..."
   if realignment:
      s = Semaphore(realign_moab.ids, home, 'bam_processing', queue, 20, 2 * 86400)
   else:
      s = Semaphore(mergefinal_moab.ids, home, 'bam_processing', queue, 20, 2 * 86400)
   s.wait()
   print "--------------------------------------"

   # return final bamfile
   return final_bam

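
# Library.getBamLibs() is defined in genobox_classes. Illustration only of why the library-merge
# step above uses depend_type='complex' with depend_val=map(len, lib2bam.values()): each merge
# job must wait for exactly the filter/sort jobs of the BAMs that belong to its library. The
# exact tuple stored per bam in bam2lib (here (mapq, lib)) is an assumption:
def _sketch_complex_depend_val():
   '''Illustrative sketch only: derive the per-merge-job dependency counts'''
   lib2bam = {'libA': ['s1.bam', 's2.bam'],   # two filter/sort jobs feed the libA merge
              'libB': ['s3.bam']}             # one filter/sort job feeds the libB merge
   bam2lib = {'s1.bam': ('30', 'libA'),
              's2.bam': ('30', 'libA'),
              's3.bam': ('30', 'libB')}
   # one merge job per library, each waiting for len(bams) filter/sort job ids
   depend_val = map(len, lib2bam.values())    # e.g. [2, 1]
   return depend_val
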
def start_bcf2ref(bcf, genome_file, Q, ex, dbsnp, rmsk, indels, o, queue, dir, partition, logger):
   '''Extract high confidence same-as-reference bases from bcf, options are to:
      exchange ids
      annotate using dbsnp
      filter rmsk
      filter ambiguous indel positions
   '''

   import genobox_modules
   from genobox_classes import Moab
   from genobox_classes import Semaphore
   import subprocess
   import os

   if not os.path.exists('genotyping'):
      os.makedirs('genotyping')

   # set queueing
   paths = genobox_modules.setSystem()
   home = os.getcwd()
   cpuA = 'nodes=1:ppn=1,mem=512mb,walltime=172800'
   cpuC = 'nodes=1:ppn=1,mem=2gb,walltime=172800'
   cpuE = 'nodes=1:ppn=1,mem=5gb,walltime=172800'
   cpuF = 'nodes=1:ppn=2,mem=2gb,walltime=172800'
   cpuB = 'nodes=1:ppn=16,mem=10gb,walltime=172800'

   # read genome file
   genome = get_genome(genome_file)

   # create commands
   bcf2ref_calls = []
   cmd = paths['genobox_home'] + 'genobox_bcf2ref_h.py'
   for chr in genome:
      # set outfile name
      if len(genome) == 1:
         if dir and dir != 'None':
            outfile = '%s/%s.%s' % (os.path.split(o)[0], dir, os.path.split(o)[1])
         else:
            outfile = o
      else:
         if dir and dir != 'None':
            outfile = '%s/%s.%s.%s' % (os.path.split(o)[0], dir, chr[2], os.path.split(o)[1])
         else:
            outfile = '%s/%s.%s' % (os.path.split(o)[0], chr[2], os.path.split(o)[1])
      arg = ' --bcf %s --chr_id \"%s\" --chr %s --d %s --D %s --Q %f --ex %s --dbsnp %s --rmsk %s --indels %s --o %s' % (bcf, chr[0], chr[2], chr[4], chr[5], Q, ex, dbsnp, rmsk, indels, outfile)
      bcf2ref_calls.append(cmd + arg)

   # submit jobs
   print "Submitting jobs"
   bcf2ref_moab = Moab(bcf2ref_calls, logfile=logger, runname='run_genobox_bcf2ref', queue=queue, cpu=cpuE, partition=partition)

   # release jobs
   print "Releasing jobs"
   #bcf2ref_moab.release()

   # semaphore
   print "Waiting for jobs to finish ..."
   s = Semaphore(bcf2ref_moab.ids, home, 'bcf2ref', queue, 20, 2 * 86400)
   s.wait()
   print "--------------------------------------"

def start_assembly(args, logger):
   '''Start assembly'''

   import genobox_modules
   from genobox_classes import Moab
   from genobox_classes import Semaphore
   import os

   # set queueing
   paths = genobox_modules.setSystem()
   home = os.getcwd()
   cpuV = 'nodes=1:ppn=%i,mem=%s,walltime=172800' % (args.n, args.m)
   cpuA = 'nodes=1:ppn=1,mem=512mb,walltime=172800'
   cpuC = 'nodes=1:ppn=1,mem=2gb,walltime=172800'
   cpuE = 'nodes=1:ppn=1,mem=5gb,walltime=172800'
   cpuF = 'nodes=1:ppn=2,mem=2gb,walltime=172800'
   cpuB = 'nodes=1:ppn=16,mem=10gb,walltime=172800'

   # set kmersizes (if auto)
   if args.ksizes == ['auto']:
      args.ksizes = set_kmersizes(args)

   # trimming calls
   if args.trim:
      illuminatrim_calls = illumina_trim(args, int(args.ksizes[0]), 15, 20, 15, False)

   # check if files need to be interleaved (interleave() returns the interleave call and the resulting file)
   interleave_dict = {}
   (interleave_dict['shortPaired'], args.shortPaired) = interleave(args.shortPaired, args.sample)
   (interleave_dict['shortPaired2'], args.shortPaired2) = interleave(args.shortPaired2, args.sample)
   (interleave_dict['longPaired'], args.longPaired) = interleave(args.longPaired, args.sample)

   # interleave calls
   interleave_calls = []
   for key, value in interleave_dict.items():
      if value:
         interleave_calls.append(value)

   # velvet calls
   velveth_calls = create_velveth_calls(args)
   velvetg_calls = create_velvetg_calls(args)

   # velvet parse calls
   velvetparse_calls = get_best_assembly(args)
   velvetaccept_calls = accept_assembly(args)
   velvetclean_calls = clean()

   # set environment variable:
   env_var = 'OMP_NUM_THREADS=%i' % int(args.n - 1)

   # submit and release jobs
   print "Submitting jobs"

   # if trimming is needed
   if args.trim:
      illuminatrim_moab = Moab(illuminatrim_calls, logfile=logger, runname='run_genobox_trim', queue=args.queue, cpu=cpuF)
      # if no interleaving is needed
      if len(interleave_calls) == 0:
         velveth_moab = Moab(velveth_calls, logfile=logger, runname='run_genobox_velveth', queue=args.queue, cpu=cpuV, depend=True, depend_type='all', depend_val=[1], depend_ids=illuminatrim_moab.ids, env=env_var)
         velvetg_moab = Moab(velvetg_calls, logfile=logger, runname='run_genobox_velvetg', queue=args.queue, cpu=cpuV, depend=True, depend_type='one2one', depend_val=[1], depend_ids=velveth_moab.ids)
      # if interleaving is needed
      else:
         interleave_moab = Moab(interleave_calls, logfile=logger, runname='run_genobox_interleave', queue=args.queue, cpu=cpuF, depend=True, depend_type='all', depend_val=[1], depend_ids=illuminatrim_moab.ids)
         velveth_moab = Moab(velveth_calls, logfile=logger, runname='run_genobox_velveth', queue=args.queue, cpu=cpuV, depend=True, depend_type='all', depend_val=[1], depend_ids=interleave_moab.ids, env=env_var)
         velvetg_moab = Moab(velvetg_calls, logfile=logger, runname='run_genobox_velvetg', queue=args.queue, cpu=cpuV, depend=True, depend_type='one2one', depend_val=[1], depend_ids=velveth_moab.ids)
   # if no trimming
   else:
      # if no interleaving is needed
      if len(interleave_calls) == 0:
         velveth_moab = Moab(velveth_calls, logfile=logger, runname='run_genobox_velveth', queue=args.queue, cpu=cpuV, env=env_var)
         velvetg_moab = Moab(velvetg_calls, logfile=logger, runname='run_genobox_velvetg', queue=args.queue, cpu=cpuV, depend=True, depend_type='one2one', depend_val=[1], depend_ids=velveth_moab.ids)
      # if interleaving is needed
      else:
         interleave_moab = Moab(interleave_calls, logfile=logger, runname='run_genobox_interleave', queue=args.queue, cpu=cpuF)
         velveth_moab = Moab(velveth_calls, logfile=logger, runname='run_genobox_velveth', queue=args.queue, cpu=cpuV, depend=True, depend_type='all', depend_val=[1], depend_ids=interleave_moab.ids, env=env_var)
         velvetg_moab = Moab(velvetg_calls, logfile=logger, runname='run_genobox_velvetg', queue=args.queue, cpu=cpuV, depend=True, depend_type='one2one', depend_val=[1], depend_ids=velveth_moab.ids)

   # submit job for velvetparse if more than one ksize was chosen
   if len(args.ksizes) > 1:
      velvetparse_moab = Moab(velvetparse_calls, logfile=logger, runname='run_genobox_velvetparse', queue=args.queue, cpu=cpuA, depend=True, depend_type='conc', depend_val=[len(velvetg_calls)], depend_ids=velvetg_moab.ids)
      velvetaccept_moab = Moab(velvetaccept_calls, logfile=logger, runname='run_genobox_velvetaccept', queue=args.queue, cpu=cpuA, depend=True, depend_type='one2one', depend_val=[1], depend_ids=velvetparse_moab.ids)
      velvetclean_moab = Moab(velvetclean_calls, logfile=logger, runname='run_genobox_velvetclean', queue=args.queue, cpu=cpuA, depend=True, depend_type='one2one', depend_val=[1], depend_ids=velvetaccept_moab.ids)
   else:
      velvetclean_moab = Moab(velvetclean_calls, logfile=logger, runname='run_genobox_velvetclean', queue=args.queue, cpu=cpuA, depend=True, depend_type='one2one', depend_val=[1], depend_ids=velvetg_moab.ids)

   # release jobs
   print "Releasing jobs"
   if args.trim and len(illuminatrim_calls) > 0:
      illuminatrim_moab.release()
   if len(interleave_calls) > 0:
      interleave_moab.release()
   velveth_moab.release()
   velvetg_moab.release()
   if len(args.ksizes) > 1:
      velvetparse_moab.release()
      velvetaccept_moab.release()
   velvetclean_moab.release()

   # semaphore (consensus is currently not waited for)
   print "Waiting for jobs to finish ..."
   s = Semaphore(velvetclean_moab.ids, home, 'velvet', args.queue, 20, 2 * 86400)
   s.wait()
   print "--------------------------------------"

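
# interleave() above builds a shell call plus the name of the interleaved file that velvet needs
# for its paired read categories (mates interleaved in a single file); the helper itself is
# defined elsewhere in this module. As an illustration of the operation only, not the module's
# actual helper:
def _sketch_interleave_fastq(fq1, fq2, out):
   '''Illustrative sketch only: interleave two mate fastq files, one 4-line record at a time'''
   fh1 = open(fq1, 'r')
   fh2 = open(fq2, 'r')
   fho = open(out, 'w')
   while True:
      rec1 = [fh1.readline() for _ in range(4)]
      rec2 = [fh2.readline() for _ in range(4)]
      if not rec1[0] or not rec2[0]:
         break
      fho.writelines(rec1)
      fho.writelines(rec2)
   fh1.close()
   fh2.close()
   fho.close()
   return out
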
def start_bamstats(args, bam, partition, logger, wait=True):
   '''Starts calculation of bam statistics'''

   # samtools flagstat
   # bedtools genomeCoverageBed
   # python avgdepth

   import subprocess
   import genobox_modules
   from genobox_classes import Moab
   from genobox_classes import Semaphore
   import os

   if not os.path.exists('stats'):
      os.makedirs('stats')

   # set queueing
   paths = genobox_modules.setSystem()
   home = os.getcwd()
   cpuA = 'nodes=1:ppn=1,mem=512mb,walltime=172800'
   cpuC = 'nodes=1:ppn=1,mem=2gb,walltime=172800'
   cpuE = 'nodes=1:ppn=1,mem=7gb,walltime=172800'
   cpuF = 'nodes=1:ppn=2,mem=2gb,walltime=172800'
   cpuB = 'nodes=1:ppn=16,mem=10gb,walltime=172800'
   cpuUV = 'procs=1,mem=%i,walltime=172800,flags=sharedmem'

   # create calls
   if args.mapdamage:
      mapdamage_calls = mapdamapge(bam, args.fa)
   else:
      flagstat_calls = sam_flagstat(bam)
      coverage_calls = bed_genomeCov(bam)
      plotcoverage_calls = plot_coverage(bam)
      avgdepth_calls = python_avgdepth(bam)
      saturation_calls = get_saturation(bam)

   # submit jobs
   print "Submitting jobs"
   if args.mapdamage:
      mapdamage_moab = Moab(mapdamage_calls, logfile=logger, runname='run_genobox_mapdamage', queue=args.queue, cpu=cpuA, partition=partition)
   else:
      flagstat_moab = Moab(flagstat_calls, logfile=logger, runname='run_genobox_flagstat', queue=args.queue, cpu=cpuC, partition=partition)
      coverage_moab = Moab(coverage_calls, logfile=logger, runname='run_genobox_coverage', queue=args.queue, cpu=cpuC, partition=partition)
      plotcoverage_moab = Moab(plotcoverage_calls, logfile=logger, runname='run_genobox_plotcoverage', queue=args.queue, cpu=cpuA, depend=True, depend_type='one2one', depend_val=[1], depend_ids=coverage_moab.ids, partition=partition)
      avgdepth_moab = Moab(avgdepth_calls, logfile=logger, runname='run_genobox_avgdepth', queue=args.queue, cpu=cpuE, partition=partition)
      #saturation_moab = Moab(saturation_calls, logfile=logger, runname='run_genobox_saturation', queue=args.queue, cpu=cpuE, partition=partition)

   # release jobs
   print "Releasing jobs"

   # wait for jobs to finish
   if wait:
      print "Waiting for jobs to finish ..."
      if args.mapdamage:
         semaphore_ids = mapdamage_moab.ids
      else:
         semaphore_ids = flagstat_moab.ids + coverage_moab.ids + plotcoverage_moab.ids + avgdepth_moab.ids
      s = Semaphore(semaphore_ids, home, 'bam_stats', args.queue, 20, 86400)
      s.wait()
      print "--------------------------------------"
   else:
      print "Jobs running, continuing"
      print "--------------------------------------"

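
# bed_genomeCov() and python_avgdepth() are defined elsewhere in this module. As an illustration
# of the kind of computation behind the average-depth statistic only: read per-position output
# from "genomeCoverageBed -ibam <bam> -d" (chrom<TAB>position<TAB>depth) and report mean depth
# and the fraction of covered positions. The file names and any thresholds used by the real
# helpers are not taken from this file:
def _sketch_avgdepth(coverage_file):
   '''Illustrative sketch only: mean depth and breadth of coverage from per-position depth'''
   total = 0
   positions = 0
   covered = 0
   fh = open(coverage_file, 'r')
   for line in fh:
      depth = int(line.rstrip().split('\t')[2])
      total = total + depth
      positions = positions + 1
      if depth > 0:
         covered = covered + 1
   fh.close()
   if positions == 0:
      return (0.0, 0.0)
   return (float(total) / positions, float(covered) / positions)
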
def start_trim(args, logger):
   '''Start trimming from genobox.py'''

   import genobox_modules
   from genobox_classes import Moab, Semaphore
   import subprocess
   import os
   import sys

   # set queueing
   paths = genobox_modules.setSystem()
   home = os.getcwd()
   if args.partition == 'uv':
      cpuA = 'procs=2,mem=512mb,walltime=172800,flags=sharedmem'
      cpuC = 'procs=1,mem=2gb,walltime=172800,flags=sharedmem'
      cpuE = 'procs=1,mem=5gb,walltime=172800,flags=sharedmem'
      cpuB = 'procs=16,mem=10gb,walltime=172800,flags=sharedmem'
      cpuF = 'procs=2,mem=%s,walltime=172800,flags=sharedmem' % args.m
   else:
      cpuA = 'nodes=1:ppn=2,mem=512mb,walltime=172800'
      cpuC = 'nodes=1:ppn=1,mem=2gb,walltime=172800'
      cpuE = 'nodes=1:ppn=1,mem=5gb,walltime=172800'
      cpuB = 'nodes=1:ppn=16,mem=10gb,walltime=172800'
      cpuF = 'nodes=1:ppn=2,mem=%s,walltime=172800' % args.m

   # create path
   if not os.path.exists('trimmed'):
      os.makedirs('trimmed')

   # create calls
   (single_calls, se_files) = single_trim(args)
   (paired_calls, pe1_files, pe2_files) = paired_trim(args)

   # submit jobs
   print "Submitting jobs"
   if args.se:
      single_moab = Moab(single_calls, logfile=logger, runname='run_genobox_trimse', queue=args.queue, cpu=cpuA, partition=args.partition)
   if args.pe1 and args.pe2:
      paired_moab = Moab(paired_calls, logfile=logger, runname='run_genobox_trimpe', queue=args.queue, cpu=cpuA, partition=args.partition)

   # release jobs
   print "Releasing jobs"
   #if args.se:
   #   single_moab.release()
   #if args.pe1 and args.pe2:
   #   paired_moab.release()

   # wait for jobs to finish
   print "Waiting for jobs to finish ..."
   semaphore_ids = []
   if args.se:
      semaphore_ids = semaphore_ids + single_moab.ids
   if args.pe1 and args.pe2:
      semaphore_ids = semaphore_ids + paired_moab.ids
   s = Semaphore(semaphore_ids, home, 'read_trimming', args.queue, 60, 86400)
   s.wait()
   print "--------------------------------------"
   sys.stderr.write('Done\n')

   # return trimmed files
   return (se_files, pe1_files, pe2_files)

def start_bamprocess(library_file, bams, mapq, libs, tmpdir, queue, final_bam, realignment, known, fa, sample, partition, logger):
   '''Starts bam processing of input files'''

   import subprocess
   import genobox_modules
   from genobox_classes import Moab, Semaphore, Library
   import os

   # set queueing
   paths = genobox_modules.setSystem()
   home = os.getcwd()
   cpuA = 'nodes=1:ppn=1,mem=512mb,walltime=345600'
   cpuC = 'nodes=1:ppn=1,mem=2gb,walltime=345600'
   cpuE = 'nodes=1:ppn=1,mem=5gb,walltime=345600'
   cpuF = 'nodes=1:ppn=2,mem=2gb,walltime=345600'
   cpuB = 'nodes=1:ppn=16,mem=10gb,walltime=345600'
   cpuG = 'nodes=1:ppn=1,mem=6gb,walltime=345600'
   cpuH = 'nodes=1:ppn=2,mem=7gb,walltime=345600'

   # create library instance
   if library_file and library_file != 'None':
      if isinstance(library_file, Library):
         library = library_file
      else:
         library = Library(library_file)
         library.read()
   else:
      library = genobox_modules.initialize_library(libfile=library_file, sample=sample, mapq=mapq, libs=libs, bams=bams)

   (bam2lib, lib2bam) = library.getBamLibs()

   ## CREATE CALLS ##

   # filter bam and sort
   (filter_sort_calls, filter_sort_files) = bam_filter_sort(lib2bam, bam2lib, 1500000000)

   # merge to libs
   (merge_lib_calls, librarys) = merge_bam(lib2bam.keys(), lib2bam.values(), add_suffix=True, final_suffix='.flt.sort.bam', tmpdir=tmpdir)

   # rmdup on libs
   (rmdup_calls, rmdup_files) = rmdup(librarys, tmpdir)

   # optional: realignment
   if realignment:
      (merge_final_call, sample_file) = merge_bam([final_bam], [rmdup_files], add_suffix=False)
      (realign_calls, final_file) = realign_bam(final_bam, final_bam, fa, known)
   else:
      # merge to final file
      (merge_final_call, final_file) = merge_bam([final_bam], [rmdup_files], add_suffix=False)

   ## SUBMIT JOBS ##
   print "Submitting jobs"
   filtersort_moab = Moab(filter_sort_calls, logfile=logger, runname='run_genobox_filtersort', queue=queue, cpu=cpuH, partition=partition)
   mergelib_moab = Moab(merge_lib_calls, logfile=logger, runname='run_genobox_lib_merge', queue=queue, cpu=cpuE, depend=True, depend_type='complex', depend_val=map(len, lib2bam.values()), depend_ids=filtersort_moab.ids, partition=partition)
   rmdup_moab = Moab(rmdup_calls, logfile=logger, runname='run_genobox_rmdup', queue=queue, cpu=cpuG, depend=True, depend_type='one2one', depend_val=[1], depend_ids=mergelib_moab.ids, partition=partition)
   # NB: If memory should be changed, also change java memory spec in rmdup function
   mergefinal_moab = Moab(merge_final_call, logfile=logger, runname='run_genobox_final_merge', queue=queue, cpu=cpuC, depend=True, depend_type='conc', depend_val=[len(rmdup_moab.ids)], depend_ids=rmdup_moab.ids, partition=partition)
   if realignment:
      realign_moab = Moab(realign_calls, logfile=logger, runname='run_genobox_realignment', queue=queue, cpu=cpuE, depend=True, depend_type='one2one', depend_val=[1], depend_ids=mergefinal_moab.ids, partition=partition)
      # realignment calls need to be written together in a shell-file or be dependent on each other

   # release jobs
   # print "Releasing jobs"
   #filtersort_moab.release()
   #mergelib_moab.release()
   #rmdup_moab.release()
   #mergefinal_moab.release()
   #if realignment: realign_moab.release()

   # semaphore
   print "Waiting for jobs to finish ..."
   if realignment:
      s = Semaphore(realign_moab.ids, home, 'bam_processing', queue, 20, 345600)
   else:
      s = Semaphore(mergefinal_moab.ids, home, 'bam_processing', queue, 20, 345600)
   s.wait()
   print "--------------------------------------"

   # return final bamfile
   return final_bam

def start_alignment(args, logger):
   '''Start alignment of fastq files using BWA'''

   import genobox_modules
   from genobox_classes import Semaphore, Library
   import subprocess
   import os
   import random
   import string

   paths = genobox_modules.setSystem()
   home = os.getcwd()
   semaphore_ids = []
   bamfiles = dict()

   if not os.path.exists('alignment'):
      os.makedirs('alignment')

   # initialize library file from given arguments
   # (if args.mapq is defined then it is called from abgv, else it is called from alignment)
   if hasattr(args, 'mapq'):
      library = genobox_modules.initialize_library(args.libfile, args.se, args.pe1, args.pe2, args.sample, args.mapq, args.libs, args.pl)
   else:
      library = genobox_modules.initialize_library(args.libfile, args.se, args.pe1, args.pe2, args.sample, [30], args.libs, args.pl)

   # check for fa
   check_fa(args.fa, args.bwa6)

   # check if trimming was performed (abgv only) and set correct files
   #(se_files, pe1_files, pe2_files) = check_trim(args)

   # start single end alignments
   if args.se:
      # get platform info
      (PL, PL2data) = library.getPL('Data')
      print "Submitting single end alignments"
      for key, value in PL2data.items():
         if key == 'ILLUMINA' or key == 'HELICOS':
            fqtypes_se = []
            # filter to only contain single end files
            toalign = []
            for v in value:
               if v in args.se:
                  toalign.append(v)
            for fq in toalign:
               fqtypes_se.append(check_formats_fq(fq, args.gz, args.bwa6))
            # submit
            (se_align_ids, bamfiles_se) = bwa_se_align(toalign, args.fa, fqtypes_se, args.qtrim, args.N, 'alignment/', args.bwa6, library, args.n, args.queue, args.add_aln, args.partition, logger)
            semaphore_ids.extend(se_align_ids)
            bamfiles.update(bamfiles_se)
         elif key == 'PACBIO':
            toalign = []
            for v in value:
               if v in args.se:
                  toalign.append(v)
            fqtypes_se = []
            for fq in toalign:
               fqtypes_se.append(check_formats_fq(fq, args.gz, args.bwa6))
            (se_align_ids, bamfiles_se) = bwasw_pacbio(toalign, args.fa, fqtypes_se, 'alignment/', args.bwa6, library, args.n, args.queue, args.partition, logger)
            semaphore_ids.extend(se_align_ids)
            bamfiles.update(bamfiles_se)
         elif key == 'IONTORRENT':
            toalign = []
            for v in value:
               if v in args.se:
                  toalign.append(v)
            fqtypes_se = []
            for fq in toalign:
               fqtypes_se.append(check_formats_fq(fq, args.gz, args.bwa6))
            (se_align_ids, bamfiles_se) = bwasw_iontorrent(toalign, args.fa, fqtypes_se, 'alignment/', args.bwa6, library, args.n, args.queue, args.partition, logger)
            semaphore_ids.extend(se_align_ids)
            bamfiles.update(bamfiles_se)

   # start paired end alignments
   if args.pe1:
      if len(args.pe1) != len(args.pe2):
         raise ValueError('Same number of files must be given to --pe1 and --pe2')

      # set fqtypes
      fqtypes_pe1 = []
      fqtypes_pe2 = []
      for fq in args.pe1:
         fqtypes_pe1.append(check_formats_fq(fq, args.gz, args.bwa6))
      for fq in args.pe2:
         fqtypes_pe2.append(check_formats_fq(fq, args.gz, args.bwa6))

      print "Submitting paired end alignments"
      (pe_align_ids, bamfiles_pe) = bwa_pe_align(args.pe1, args.pe2, args.fa, fqtypes_pe1, fqtypes_pe2, args.qtrim, args.N, 'alignment/', args.bwa6, args.a, library, args.n, args.queue, args.add_aln, args.partition, logger)
      semaphore_ids.extend(pe_align_ids)
      bamfiles.update(bamfiles_pe)

   # update library
   library.update_with_tag('Data', 'BAM', bamfiles, True)

   # wait for jobs to finish
   print "Waiting for jobs to finish ..."
   s = Semaphore(semaphore_ids, home, 'bwa_alignment', args.queue, 60, 172800)
   s.wait()
   print "--------------------------------------"

   # return bamfiles
   return (bamfiles, library)
