def start_ab(args, logger):
    '''Run alignment followed by bam processing for one sample.

    args   -- parsed command line options; the attributes read here are
              outbam, libfile, se, pe1, pe2, sample, mapq, libs, pl, tmpdir,
              queue, realignment, known, fa and partition
    logger -- handed on unchanged to start_alignment and start_bamprocess

    Nothing is returned; progress is reported on stdout.
    '''
    import os
    import subprocess

    separator = "--------------------------------------"
    requested_bam = args.outbam

    # Library built from the command line arguments. start_alignment()
    # returns its own library, which replaces this one further down.
    library = genobox_modules.initialize_library(
        args.libfile, args.se, args.pe1, args.pe2,
        args.sample, args.mapq, args.libs, args.pl)

    # print(...) with one argument prints the same text as the Python 2
    # print statement.
    if args.sample:
        print(separator)
        print("Processing sample: %s" % args.sample)
        print(separator)

    print("Starting alignment")
    bamfiles, library = start_alignment(args, logger)

    print("Starting bam processing")
    distinct_bams = genobox_modules.unique(bamfiles.values())
    final_bam = start_bamprocess(
        library, distinct_bams, args.mapq, args.libs, args.tmpdir,
        args.queue, requested_bam, args.realignment, args.known, args.fa,
        args.sample, args.partition, logger)

    # remove the files left behind by the queuing system
    genobox_modules.rm_files(['run_genobox_*', 'semaphores.*'])

    print("Done")
    print(separator)
def start_alignment(args, logger):
    '''Submit BWA alignments for the given fastq files and wait for them.

    Single end files (args.se) are grouped by the platform the library
    reports for them and dispatched accordingly:
        ILLUMINA, HELICOS -> bwa_se_align
        PACBIO            -> bwasw_pacbio
        IONTORRENT        -> bwasw_iontorrent
    Files registered under any other platform are not submitted.
    Paired end files (args.pe1 / args.pe2) go to bwa_pe_align.

    Returns the tuple (bamfiles, library): the dict collected from the
    aligner calls and the library after it was updated with that dict
    under the 'BAM' tag.

    Raises ValueError when args.pe1 and args.pe2 differ in length.
    '''
    import os
    import random
    import string
    import subprocess
    import genobox_modules
    from genobox_classes import Semaphore, Library

    paths = genobox_modules.setSystem()
    home = os.getcwd()
    semaphore_ids = []
    bamfiles = dict()

    if not os.path.exists('alignment'):
        os.makedirs('alignment')

    # args.mapq only exists when called from abgv; a plain alignment run
    # falls back to [30].
    mapq = args.mapq if hasattr(args, 'mapq') else [30]
    library = genobox_modules.initialize_library(
        args.libfile, args.se, args.pe1, args.pe2,
        args.sample, mapq, args.libs, args.pl)

    check_fa(args.fa, args.bwa6)

    # (the trimming check used by abgv is currently disabled)

    # single end alignments
    if args.se:
        (PL, PL2data) = library.getPL('Data')
        # print(...) with one argument prints the same text as the
        # Python 2 print statement.
        print("Submitting single end alignments")
        for platform, datafiles in PL2data.items():
            if platform not in ('ILLUMINA', 'HELICOS', 'PACBIO', 'IONTORRENT'):
                # unknown platform: nothing is checked or submitted
                continue

            # keep only the files that were given as single end input
            se_fastqs = [fq for fq in datafiles if fq in args.se]
            se_types = [check_formats_fq(fq, args.gz, args.bwa6)
                        for fq in se_fastqs]

            if platform == 'PACBIO':
                (job_ids, new_bams) = bwasw_pacbio(
                    se_fastqs, args.fa, se_types, 'alignment/', args.bwa6,
                    library, args.n, args.queue, args.partition, logger)
            elif platform == 'IONTORRENT':
                (job_ids, new_bams) = bwasw_iontorrent(
                    se_fastqs, args.fa, se_types, 'alignment/', args.bwa6,
                    library, args.n, args.queue, args.partition, logger)
            else:
                (job_ids, new_bams) = bwa_se_align(
                    se_fastqs, args.fa, se_types, args.qtrim, args.N,
                    'alignment/', args.bwa6, library, args.n, args.queue,
                    args.add_aln, args.partition, logger)

            semaphore_ids.extend(job_ids)
            bamfiles.update(new_bams)

    # paired end alignments
    if args.pe1:
        if len(args.pe1) != len(args.pe2):
            raise ValueError('Same number of files must be given to --pe1 and --pe2')

        fqtypes_pe1 = [check_formats_fq(fq, args.gz, args.bwa6) for fq in args.pe1]
        fqtypes_pe2 = [check_formats_fq(fq, args.gz, args.bwa6) for fq in args.pe2]

        print("Submitting paired end alignments")
        (pe_align_ids, bamfiles_pe) = bwa_pe_align(
            args.pe1, args.pe2, args.fa, fqtypes_pe1, fqtypes_pe2,
            args.qtrim, args.N, 'alignment/', args.bwa6, args.a, library,
            args.n, args.queue, args.add_aln, args.partition, logger)
        semaphore_ids.extend(pe_align_ids)
        bamfiles.update(bamfiles_pe)

    # record the produced bam files in the library
    library.update_with_tag('Data', 'BAM', bamfiles, True)

    # block until every submitted job is done
    # NOTE(review): 60 and 172800 are presumably the polling interval and
    # the maximum wait in seconds - confirm against Semaphore.
    print("Waiting for jobs to finish ...")
    s = Semaphore(semaphore_ids, home, 'bwa_alignment', args.queue, 60, 172800)
    s.wait()
    print("--------------------------------------")

    return (bamfiles, library)
def start_abgv(args, logger):
    '''Run the complete pipeline for one sample.

    Steps, in order: alignment, bam processing, bam stats, genotyping and
    variant filtering. With args.caller == 'samtools' the filtered calls
    are additionally annotated with dbsnp and bcf2ref is run; with
    args.caller == 'gatk' the gatk genotyping and filtering functions are
    used instead. Any other caller value skips genotyping altogether.

    args   -- parsed command line options (attributes are read directly)
    logger -- handed on unchanged to every pipeline step

    Nothing is returned; progress is reported on stdout.
    '''
    import os
    import subprocess

    separator = "--------------------------------------"

    # check genome file
    genobox_modules.check_genome(args.genome)

    final_bam = 'alignment/%s.flt.sort.rmdup.bam' % args.sample
    final_bcf = 'genotyping/%s.all.bcf' % args.sample

    # Library built from the command line arguments. start_alignment()
    # returns its own library, which replaces this one further down.
    library = genobox_modules.initialize_library(
        args.libfile, args.se, args.pe1, args.pe2,
        args.sample, args.mapq, args.libs, args.pl)

    # print(...) with one argument prints the same text as the Python 2
    # print statement.
    if args.sample:
        print(separator)
        print("Processing sample: %s" % args.sample)
        print(separator)

    # (the trimming step is currently disabled)

    print("Starting alignment")
    (bamfiles, library) = start_alignment(args, logger)

    print("Starting bam processing")
    final_bam = start_bamprocess(
        library, genobox_modules.unique(bamfiles.values()), args.mapq,
        args.libs, args.tmpdir, args.queue, final_bam, args.realignment,
        args.known, args.fa, args.sample, args.partition, logger)

    print("Starting bam stats")
    start_bamstats(args, final_bam, args.partition, logger, wait=False)

    print("Starting genotyping")
    if args.caller == 'samtools':
        final_bcf = start_genotyping(
            final_bam, args.genome, args.fa, args.prior, args.pp,
            args.queue, final_bcf, args.sample, args.partition, logger)

        print("Starting vcffiltering")
        final_vcf = start_vcffilter(
            final_bcf, args.genome, args.caller, args.Q, args.ex, args.rmsk,
            args.ab, args.prune, args.ovar, args.queue, args.sample,
            args.partition, logger)

        print("Start dbsnp")
        start_dbsnp(final_vcf, args.ex, args.dbsnp, args.ovar, args.queue,
                    args.partition, logger)

        print("Start bcf2ref")
        start_bcf2ref(
            final_bcf, args.genome, args.Q, args.ex, args.dbsnp, args.rmsk,
            'genotyping/indels_for_filtering.vcf', args.oref, args.queue,
            args.sample, args.partition, logger)

    elif args.caller == 'gatk':
        print("Start genotyping (gatk)")
        # call_emit is read straight from args, like every other option
        vcffiles = start_genotyping_gatk(
            final_bam, args.genome, args.fa, args.dbsnp, args.call_conf,
            args.call_emit, args.output_mode, args.queue, args.sample,
            args.partition, logger)

        print("Start vcffiltering (gatk)")
        # the logger is this function's own parameter, not an attribute
        # of args
        start_vcffilter_gatk(
            vcffiles, args.genome, args.fa, args.Q, args.rmsk, args.ab,
            args.prune, args.queue, args.sample, args.partition, logger)

    # remove the files left behind by the queuing system
    genobox_modules.rm_files(['run_genobox_*', 'semaphores.*'])

    print("Done")
    print(separator)
    # NOTE(review): final_bcf is 'genotyping/<sample>.all.bcf', so the path
    # in the next message looks out of date - confirm before changing it.
    print("Raw genotyping is written in genotyping/all.bcf")
    print("High confidence variants: %s" % args.ovar)
    print("High confidence reference: %s" % args.oref)
    print(separator)
def start_abgv(args, logger):
    """Run alignment, bam processing, bam stats, genotyping and filtering.

    The genotyping part depends on args.caller:
      "samtools" -- start_genotyping, start_vcffilter, start_dbsnp and
                    start_bcf2ref are run in that order
      "gatk"     -- start_genotyping_gatk followed by start_vcffilter_gatk
    Any other value leaves the genotyping part out.

    args   -- parsed command line options (attributes are read directly)
    logger -- handed on unchanged to every pipeline step

    Nothing is returned; progress is reported on stdout.
    """
    import os
    import subprocess

    rule = "--------------------------------------"

    # check genome file
    genobox_modules.check_genome(args.genome)

    final_bam = "alignment/%s.flt.sort.rmdup.bam" % args.sample
    final_bcf = "genotyping/%s.all.bcf" % args.sample

    # Library built from the command line arguments. start_alignment()
    # returns its own library, which replaces this one further down.
    library = genobox_modules.initialize_library(
        args.libfile, args.se, args.pe1, args.pe2, args.sample, args.mapq, args.libs, args.pl
    )

    # print(...) with one argument prints the same text as the Python 2
    # print statement.
    if args.sample:
        print(rule)
        print("Processing sample: %s" % args.sample)
        print(rule)

    # (the trimming step is currently disabled)

    print("Starting alignment")
    (bamfiles, library) = start_alignment(args, logger)

    print("Starting bam processing")
    final_bam = start_bamprocess(
        library,
        genobox_modules.unique(bamfiles.values()),
        args.mapq,
        args.libs,
        args.tmpdir,
        args.queue,
        final_bam,
        args.realignment,
        args.known,
        args.fa,
        args.sample,
        args.partition,
        logger,
    )

    print("Starting bam stats")
    start_bamstats(args, final_bam, args.partition, logger, wait=False)

    print("Starting genotyping")
    if args.caller == "samtools":
        final_bcf = start_genotyping(
            final_bam,
            args.genome,
            args.fa,
            args.prior,
            args.pp,
            args.queue,
            final_bcf,
            args.sample,
            args.partition,
            logger,
        )

        print("Starting vcffiltering")
        final_vcf = start_vcffilter(
            final_bcf,
            args.genome,
            args.caller,
            args.Q,
            args.ex,
            args.rmsk,
            args.ab,
            args.prune,
            args.ovar,
            args.queue,
            args.sample,
            args.partition,
            logger,
        )

        print("Start dbsnp")
        start_dbsnp(final_vcf, args.ex, args.dbsnp, args.ovar, args.queue, args.partition, logger)

        print("Start bcf2ref")
        start_bcf2ref(
            final_bcf,
            args.genome,
            args.Q,
            args.ex,
            args.dbsnp,
            args.rmsk,
            "genotyping/indels_for_filtering.vcf",
            args.oref,
            args.queue,
            args.sample,
            args.partition,
            logger,
        )

    elif args.caller == "gatk":
        print("Start genotyping (gatk)")
        # call_emit is read straight from args, like every other option
        vcffiles = start_genotyping_gatk(
            final_bam,
            args.genome,
            args.fa,
            args.dbsnp,
            args.call_conf,
            args.call_emit,
            args.output_mode,
            args.queue,
            args.sample,
            args.partition,
            logger,
        )

        print("Start vcffiltering (gatk)")
        # the logger is this function's own parameter, not an attribute
        # of args
        start_vcffilter_gatk(
            vcffiles,
            args.genome,
            args.fa,
            args.Q,
            args.rmsk,
            args.ab,
            args.prune,
            args.queue,
            args.sample,
            args.partition,
            logger,
        )

    # remove the files left behind by the queuing system
    genobox_modules.rm_files(["run_genobox_*", "semaphores.*"])

    print("Done")
    print(rule)
    # NOTE(review): final_bcf is "genotyping/<sample>.all.bcf", so the path
    # in the next message looks out of date - confirm before changing it.
    print("Raw genotyping is written in genotyping/all.bcf")
    print("High confidence variants: %s" % args.ovar)
    print("High confidence reference: %s" % args.oref)
    print(rule)
def start_bamprocess(library_file, bams, mapq, libs, tmpdir, queue, final_bam, realignment, known, fa, sample, partition, logger):
    '''Submit the bam processing jobs for a sample and wait for them.

    The job chain is: filter and sort every bam, merge the bams of each
    library, remove duplicates per library, merge everything into
    final_bam and, when realignment is set, realign final_bam. Each stage
    is submitted as a Moab job set depending on the previous stage.

    library_file -- a Library instance or something Library() accepts;
                    when empty or the string 'None' a library is
                    initialized from sample, mapq, libs and bams instead
    final_bam    -- name of the merged output bam

    Returns final_bam as it was passed in.
    '''
    import os
    import subprocess
    import genobox_modules
    from genobox_classes import Moab, Semaphore, Library

    # set queueing
    paths = genobox_modules.setSystem()
    home = os.getcwd()

    # resource requests; only some of them are used below
    cpuA = 'nodes=1:ppn=1,mem=512mb,walltime=345600'
    cpuC = 'nodes=1:ppn=1,mem=2gb,walltime=345600'
    cpuE = 'nodes=1:ppn=1,mem=5gb,walltime=345600'
    cpuF = 'nodes=1:ppn=2,mem=2gb,walltime=345600'
    cpuB = 'nodes=1:ppn=16,mem=10gb,walltime=345600'
    cpuG = 'nodes=1:ppn=1,mem=6gb,walltime=345600'
    cpuH = 'nodes=1:ppn=2,mem=7gb,walltime=345600'

    # create library instance
    if not library_file or library_file == 'None':
        library = genobox_modules.initialize_library(
            libfile=library_file, sample=sample, mapq=mapq, libs=libs, bams=bams)
    elif isinstance(library_file, Library):
        library = library_file
    else:
        # NOTE(review): read() is assumed to apply only to a Library that
        # was just constructed here - confirm.
        library = Library(library_file)
        library.read()
    (bam2lib, lib2bam) = library.getBamLibs()

    ## CREATE CALLS ##

    # filter bam and sort
    (filter_sort_calls, filter_sort_files) = bam_filter_sort(lib2bam, bam2lib, 1500000000)

    # merge to libs
    (merge_lib_calls, librarys) = merge_bam(
        lib2bam.keys(), lib2bam.values(), add_suffix=True,
        final_suffix='.flt.sort.bam', tmpdir=tmpdir)

    # rmdup on libs
    (rmdup_calls, rmdup_files) = rmdup(librarys, tmpdir)

    # merge to final file; the realignment step is optional and works on
    # the merged file
    (merge_final_call, merged_file) = merge_bam([final_bam], [rmdup_files], add_suffix=False)
    if realignment:
        (realign_calls, final_file) = realign_bam(final_bam, final_bam, fa, known)

    ## SUBMIT JOBS ##

    # print(...) with one argument prints the same text as the Python 2
    # print statement.
    print("Submitting jobs")

    # keyword arguments shared by every job set
    shared = dict(logfile=logger, queue=queue, partition=partition)

    filtersort_moab = Moab(
        filter_sort_calls, runname='run_genobox_filtersort', cpu=cpuH, **shared)

    # one dependency count per library: the number of bams it contains
    bams_per_lib = [len(lib_bams) for lib_bams in lib2bam.values()]
    mergelib_moab = Moab(
        merge_lib_calls, runname='run_genobox_lib_merge', cpu=cpuE,
        depend=True, depend_type='complex', depend_val=bams_per_lib,
        depend_ids=filtersort_moab.ids, **shared)

    # NB: If memory should be changed, also change java memory spec in rmdup function
    rmdup_moab = Moab(
        rmdup_calls, runname='run_genobox_rmdup', cpu=cpuG,
        depend=True, depend_type='one2one', depend_val=[1],
        depend_ids=mergelib_moab.ids, **shared)

    mergefinal_moab = Moab(
        merge_final_call, runname='run_genobox_final_merge', cpu=cpuC,
        depend=True, depend_type='conc', depend_val=[len(rmdup_moab.ids)],
        depend_ids=rmdup_moab.ids, **shared)

    if realignment:
        # realignment calls needs to be written together in a shell-file
        # or dependent on each other
        realign_moab = Moab(
            realign_calls, runname='run_genobox_realignment', cpu=cpuE,
            depend=True, depend_type='one2one', depend_val=[1],
            depend_ids=mergefinal_moab.ids, **shared)

    # release jobs (the explicit release() calls are disabled)
    print("Releasing jobs")

    # semaphore: wait on the last stage of the chain
    # NOTE(review): 20 and 345600 are presumably the polling interval and
    # the maximum wait in seconds - confirm against Semaphore.
    print("Waiting for jobs to finish ...")
    last_stage_ids = realign_moab.ids if realignment else mergefinal_moab.ids
    s = Semaphore(last_stage_ids, home, 'bam_processing', queue, 20, 345600)
    s.wait()
    print("--------------------------------------")

    # return final bamfile
    return final_bam
def start_bamprocess(library_file, bams, mapq, libs, tmpdir, queue, final_bam, realignment, known, fa, sample, partition, logger):
    '''Queue the bam processing chain for a sample and block until done.

    Stages, each a Moab job set depending on the one before it:
      1. filter and sort every input bam
      2. merge the bams belonging to the same library
      3. remove duplicates per library
      4. merge all libraries into final_bam
      5. (only when realignment is set) realign final_bam

    library_file -- a Library instance or something Library() accepts;
                    when empty or the string 'None' a library is
                    initialized from sample, mapq, libs and bams instead
    final_bam    -- name of the merged output bam

    Returns final_bam as it was passed in.
    '''
    import os
    import subprocess
    import genobox_modules
    from genobox_classes import Moab, Semaphore, Library

    # set queueing
    paths = genobox_modules.setSystem()
    home = os.getcwd()

    # resource requests; only some of them are used below
    cpuA = 'nodes=1:ppn=1,mem=512mb,walltime=345600'
    cpuC = 'nodes=1:ppn=1,mem=2gb,walltime=345600'
    cpuE = 'nodes=1:ppn=1,mem=5gb,walltime=345600'
    cpuF = 'nodes=1:ppn=2,mem=2gb,walltime=345600'
    cpuB = 'nodes=1:ppn=16,mem=10gb,walltime=345600'
    cpuH = 'nodes=1:ppn=2,mem=7gb,walltime=345600'

    # create library instance
    use_given_library = bool(library_file) and library_file != 'None'
    if use_given_library and isinstance(library_file, Library):
        library = library_file
    elif use_given_library:
        # NOTE(review): read() is assumed to apply only to a Library that
        # was just constructed here - confirm.
        library = Library(library_file)
        library.read()
    else:
        library = genobox_modules.initialize_library(
            libfile=library_file, sample=sample, mapq=mapq, libs=libs, bams=bams)
    (bam2lib, lib2bam) = library.getBamLibs()

    ## CREATE CALLS ##

    # filter bam and sort
    (filter_sort_calls, filter_sort_files) = bam_filter_sort(lib2bam, bam2lib, 1500000000)

    # merge to libs
    (merge_lib_calls, librarys) = merge_bam(
        lib2bam.keys(), lib2bam.values(), add_suffix=True,
        final_suffix='.flt.sort.bam', tmpdir=tmpdir)

    # rmdup on libs
    (rmdup_calls, rmdup_files) = rmdup(librarys, tmpdir)

    # merge to final file; the realignment step is optional and works on
    # the merged file
    (merge_final_call, merged_file) = merge_bam([final_bam], [rmdup_files], add_suffix=False)
    if realignment:
        (realign_calls, final_file) = realign_bam(final_bam, final_bam, fa, known)

    ## SUBMIT JOBS ##

    # print(...) with one argument prints the same text as the Python 2
    # print statement.
    print("Submitting jobs")

    def submit(calls, runname, cpu, **dependency):
        # every job set shares the log file, queue and partition
        return Moab(calls, logfile=logger, runname=runname, queue=queue,
                    cpu=cpu, partition=partition, **dependency)

    filtersort_moab = submit(filter_sort_calls, 'run_genobox_filtersort', cpuH)

    # one dependency count per library: the number of bams it contains
    bams_per_lib = [len(lib_bams) for lib_bams in lib2bam.values()]
    mergelib_moab = submit(
        merge_lib_calls, 'run_genobox_lib_merge', cpuE,
        depend=True, depend_type='complex', depend_val=bams_per_lib,
        depend_ids=filtersort_moab.ids)

    # NB: If memory should be changed, also change java memory spec in rmdup function
    rmdup_moab = submit(
        rmdup_calls, 'run_genobox_rmdup', cpuE,
        depend=True, depend_type='one2one', depend_val=[1],
        depend_ids=mergelib_moab.ids)

    mergefinal_moab = submit(
        merge_final_call, 'run_genobox_final_merge', cpuC,
        depend=True, depend_type='conc', depend_val=[len(rmdup_moab.ids)],
        depend_ids=rmdup_moab.ids)

    if realignment:
        # realignment calls needs to be written together in a shell-file
        # or dependent on each other
        realign_moab = submit(
            realign_calls, 'run_genobox_realignment', cpuE,
            depend=True, depend_type='one2one', depend_val=[1],
            depend_ids=mergefinal_moab.ids)

    # release jobs (the explicit release() calls are disabled)
    print("Releasing jobs")

    # semaphore: wait on the last stage of the chain
    # NOTE(review): 20 and 2*86400 are presumably the polling interval and
    # the maximum wait in seconds - confirm against Semaphore.
    print("Waiting for jobs to finish ...")
    if realignment:
        last_stage_ids = realign_moab.ids
    else:
        last_stage_ids = mergefinal_moab.ids
    s = Semaphore(last_stage_ids, home, 'bam_processing', queue, 20, 2*86400)
    s.wait()
    print("--------------------------------------")

    # return final bamfile
    return final_bam
def start_alignment(args, logger):
    '''Submit BWA alignments for the given fastq files and wait for them.

    Single end files (args.se) are grouped by the platform the library
    reports for them and dispatched accordingly:
        ILLUMINA, HELICOS -> bwa_se_align
        PACBIO            -> bwasw_pacbio
        IONTORRENT, 454   -> bwasw_iontorrent
    Files registered under any other platform are not submitted.
    Paired end files (args.pe1 / args.pe2) go to bwa_pe_align.

    The quality format of each fastq is args.quals when that is set and
    otherwise whatever check_formats_fq reports for the file.

    Returns the tuple (bamfiles, library): the dict collected from the
    aligner calls and the library after it was updated with that dict
    under the 'BAM' tag.

    Raises ValueError when args.pe1 and args.pe2 differ in length.
    '''
    import os
    import random
    import string
    import subprocess
    import genobox_modules
    from genobox_classes import Semaphore, Library

    def quality_formats(fastqs):
        # an explicit args.quals overrides detection for every file
        formats = []
        for fastq in fastqs:
            formats.append(args.quals or check_formats_fq(fastq, args.gz, args.bwa6))
        return formats

    paths = genobox_modules.setSystem()
    home = os.getcwd()
    semaphore_ids = []
    bamfiles = dict()

    if not os.path.exists('alignment'):
        os.makedirs('alignment')

    # args.mapq only exists when called from abgv; a plain alignment run
    # falls back to [30].
    mapq = args.mapq if hasattr(args, 'mapq') else [30]
    library = genobox_modules.initialize_library(
        args.libfile, args.se, args.pe1, args.pe2,
        args.sample, mapq, args.libs, args.pl)

    check_fa(args.fa, args.bwa6)

    # (the trimming check used by abgv is currently disabled)

    # single end alignments
    if args.se:
        (PL, PL2data) = library.getPL('Data')
        # print(...) with one argument prints the same text as the
        # Python 2 print statement.
        print("Submitting single end alignments")
        for platform, datafiles in PL2data.items():
            short_read = platform in ('ILLUMINA', 'HELICOS')
            pacbio = platform == 'PACBIO'
            iontorrent_like = platform in ('IONTORRENT', '454')
            if not (short_read or pacbio or iontorrent_like):
                # unknown platform: nothing is checked or submitted
                continue

            # keep only the files that were given as single end input
            se_fastqs = [fq for fq in datafiles if fq in args.se]
            se_types = quality_formats(se_fastqs)

            # submit
            if short_read:
                (job_ids, new_bams) = bwa_se_align(
                    se_fastqs, args.fa, se_types, args.qtrim, args.N,
                    'alignment/', args.bwa6, library, args.n, args.queue,
                    args.add_aln, args.partition, logger)
            elif pacbio:
                (job_ids, new_bams) = bwasw_pacbio(
                    se_fastqs, args.fa, se_types, 'alignment/', args.bwa6,
                    library, args.n, args.queue, args.partition, logger)
            else:
                (job_ids, new_bams) = bwasw_iontorrent(
                    se_fastqs, args.fa, se_types, 'alignment/', args.bwa6,
                    library, args.n, args.queue, args.partition, logger)

            semaphore_ids.extend(job_ids)
            bamfiles.update(new_bams)

    # paired end alignments
    if args.pe1:
        if len(args.pe1) != len(args.pe2):
            raise ValueError(
                'Same number of files must be given to --pe1 and --pe2')

        fqtypes_pe1 = quality_formats(args.pe1)
        fqtypes_pe2 = quality_formats(args.pe2)

        # submit
        print("Submitting paired end alignments")
        (pe_align_ids, bamfiles_pe) = bwa_pe_align(
            args.pe1, args.pe2, args.fa, fqtypes_pe1, fqtypes_pe2,
            args.qtrim, args.N, 'alignment/', args.bwa6, args.a, library,
            args.n, args.queue, args.add_aln, args.partition, logger)
        semaphore_ids.extend(pe_align_ids)
        bamfiles.update(bamfiles_pe)

    # record the produced bam files in the library
    library.update_with_tag('Data', 'BAM', bamfiles, True)

    # block until every submitted job is done
    # NOTE(review): 60 and 345600 are presumably the polling interval and
    # the maximum wait in seconds - confirm against Semaphore.
    print("Waiting for jobs to finish ...")
    s = Semaphore(semaphore_ids, home, 'bwa_alignment', args.queue, 60, 345600)
    s.wait()
    print("--------------------------------------")

    return (bamfiles, library)