def realign_bam(in_bam, out_bam, fa, known=None):
    '''Build the GATK indel-realignment job for a bam file.

    Three commands (samtools index, RealignerTargetCreator, IndelRealigner)
    are joined, blank-line separated, into one multi-line call so that they
    are submitted together as a single shell script.

    in_bam  -- bam to realign; its intervals file is written to
               in_bam + '.intervals'
    out_bam -- name the realigned file name is derived from
    fa      -- reference fasta given to GATK with -R
    known   -- optional file passed as -known to RealignerTargetCreator

    Returns (calls, realigned_bam): calls is a one-element list holding the
    script text, realigned_bam is out_bam with '.bam' replaced by
    '.realign.bam'.
    '''
    import genobox_modules
    paths = genobox_modules.setSystem()

    gatk_jar = paths['GATK_home'] + 'GenomeAnalysisTK.jar'
    java_call = paths['java_home'] + 'java -Djava.io.tmpdir=/panvol1/simon/tmp/ -XX:ParallelGCThreads=8 -Xms4500m -Xmx4500m -jar %s ' % gatk_jar
    realigned = out_bam.replace('.bam', '.realign.bam')
    intervals = in_bam + '.intervals'

    # index bam; the trailing pipe makes the call get written as a shell-file
    # so all commands are submitted at the same time (fix dependencies)
    index_step = paths['samtools_home'] + 'samtools ' + 'index %s | cat - ' % in_bam

    # realigner target creator
    if known:
        target_arg = '-I %s -R %s -T RealignerTargetCreator -known %s -o %s' % (in_bam, fa, known, intervals)
    else:
        target_arg = '-I %s -R %s -T RealignerTargetCreator -o %s' % (in_bam, fa, intervals)

    # realignment step
    realign_arg = '-I %s -T IndelRealigner -R %s -targetIntervals %s -o %s' % (in_bam, fa, intervals, realigned)

    script = '\n\n'.join([index_step, java_call + target_arg, java_call + realign_arg])
    return ([script], realigned)
def wait(self):
    '''Block until the jobs listed in self.semaphore_ids have finished.

    A small "echo done" job is submitted through xqsub with a dependency on
    all semaphore job ids; its output goes to a uniquely named file under
    semaphores/. That file is then polled every self.check_interval seconds
    for at most self.max_time seconds.

    Reads self.file_prefix, self.semaphore_ids, self.home, self.queue,
    self.max_time and self.check_interval.

    Raises SystemExit if the semaphore file has not appeared in time, and
    subprocess.CalledProcessError if the xqsub submission itself fails.
    '''
    from time import sleep
    import string
    import random
    import os
    import genobox_modules
    import subprocess
    paths = genobox_modules.setSystem()

    # add directory and set semaphore filename
    if not os.path.exists('semaphores/'):
        os.makedirs('semaphores/')
    alphabet = string.ascii_uppercase + string.digits
    rand = ''.join(random.choice(alphabet) for _ in range(10))
    semaphore_file = 'semaphores/' + self.file_prefix + '.' + rand
    semaphore_file_err = 'log/' + self.file_prefix + '.' + rand + '.err'

    # submit job
    depends = ':'.join(self.semaphore_ids)
    xqsub = '%sxqsub -d %s -l ncpus=1,mem=10mb,walltime=180,depend=%s -O %s -q %s -N semaphores -E %s -r y -t echo done' % (paths['pyscripts_home'], self.home, depends, semaphore_file, self.queue, semaphore_file_err)
    subprocess.check_output(xqsub, shell=True)

    # check for file to appear
    remaining = self.max_time
    while remaining > 0:
        if os.path.isfile(semaphore_file):
            return
        remaining -= self.check_interval
        sleep(self.check_interval)

    # the file may have been written during the final sleep, so look once
    # more before declaring a timeout
    if not os.path.isfile(semaphore_file):
        raise SystemExit('%s did not finish in %is' % (semaphore_file, self.max_time))
def create_velvetg_calls(args):
    '''Build the velvetg command line(s) for the requested k-mer sizes.

    With one value in args.ksizes a single call on args.outpath is made.
    With two or three values (start, stop[, step]; step is 2 when omitted)
    one call is made per k in range(start, stop, step) on "<outpath>_<k>".
    Any other number of values yields an empty list.

    Every call carries -min_contig_lgth and, when set, -cov_cut, -exp_cov
    (left out when it is the string "None"), -ins_length and the free-form
    options in args.add_velvetg.
    '''
    import genobox_modules
    paths = genobox_modules.setSystem()

    # one base command per assembly directory
    n_ksizes = len(args.ksizes)
    if n_ksizes == 1:
        targets = ['%svelvetg %s' % (paths['velvet_home'], args.outpath)]
    elif 2 <= n_ksizes <= 3:
        step = 2 if n_ksizes == 2 else args.ksizes[2]
        kmers = range(int(args.ksizes[0]), int(args.ksizes[1]), int(step))
        targets = ['%svelvetg %s_%s' % (paths['velvet_home'], args.outpath, k) for k in kmers]
    else:
        targets = []

    # append the shared options: min_contig_lgth, cov_cut, exp_cov,
    # ins_length, add_velvetg
    velvetg_calls = []
    for target in targets:
        options = [' -min_contig_lgth %i' % args.min_contig_lgth]
        if args.cov_cut:
            options.append(' -cov_cut %f' % args.cov_cut)
        if args.exp_cov != "None":
            options.append(' -exp_cov %s' % args.exp_cov)
        if args.ins_length:
            options.append(' -ins_length %i' % args.ins_length)
        if args.add_velvetg:
            options.append(' %s' % args.add_velvetg)
        velvetg_calls.append(target + ''.join(options))
    return velvetg_calls
def vcf_filter_prune(vcf, prune, vcfgz_out):
    '''Prune variants within N nt of each other and bgzip the result.

    vcf       -- input vcf file
    prune     -- distance handed to genobox_snppruning.R; 0 disables pruning
                 and the input is only bgzipped
    vcfgz_out -- bgzipped output file

    Every shell command is logged and then run; a failing command raises
    subprocess.CalledProcessError.
    '''
    paths = genobox_modules.setSystem()

    def run(shell_cmd):
        # log the command line, then run it through the shell
        logger.info(shell_cmd)
        subprocess.check_call(shell_cmd, shell=True)

    if prune == 0:
        run('%sbgzip -c %s > %s' % (paths['bin_home'], vcf, vcfgz_out))
        return

    # create header
    run('head -n 1000 %s | grep "#" > %s' % (vcf, 'genotyping/header.vcf'))

    # prune with the R script, writing the body to a temporary file
    tmp_file = vcf + '.tmp'
    prune_script = paths['genobox_home'] + 'genobox_snppruning.R'
    run(paths['R_home'] + 'R-2.12' + ' --vanilla %i %s %s < %s' % (prune, vcf, tmp_file, prune_script))

    # add header
    run('cat genotyping/header.vcf %s | %sbgzip -c > %s' % (tmp_file, paths['bin_home'], vcfgz_out))

    # rm tmp_files
    run('rm %s genotyping/header.vcf' % tmp_file)
def mpileup(bam, chr_file, fa, prior, pp):
    '''Create genobox_mpileup.py calls for SNP calling on a bam-file.

    With a chr_file one call is made per entry returned by
    get_genome(chr_file): element 0 of the entry is passed as --chr and
    element 2 is used in the output file name. Without a chr_file a single
    call for the whole bam is made.

    Returns (calls, outfiles), two parallel lists of command lines and the
    bcf files they write under genotyping/.
    '''
    import genobox_modules
    import os
    paths = genobox_modules.setSystem()
    script = paths['genobox_home'] + 'genobox_mpileup.py'

    if not chr_file:
        # no chromosome file: one call covering the complete bam
        outfile = 'genotyping/tmp.' + os.path.split(bam)[1] + '.all.bcf'
        options = ' --bam %s --fa %s --prior %s --pp %f --o %s' % (bam, fa, prior, pp, outfile)
        return ([script + options], [outfile])

    # chromosome file is given: one call per chromosome
    calls = []
    outfiles = []
    for entry in get_genome(chr_file):
        outfile = 'genotyping/tmp.' + entry[2] + '.all.bcf'
        outfiles.append(outfile)
        options = ' --bam %s --chr \"%s\" --fa %s --prior %s --pp %f --o %s' % (bam, entry[0], fa, prior, pp, outfile)
        calls.append(script + options)
    return (calls, outfiles)
def vcf_filter_rmsk(vcfgz, rmsk, vcfgz_out):
    '''Remove variants called inside annotated repeats.

    vcfgz     -- gzipped input vcf
    rmsk      -- file given to intersectBed as -b; when empty or the string
                 'None' the input is simply copied to vcfgz_out
    vcfgz_out -- bgzipped output vcf

    The vcf header is saved to a randomly named temporary file under
    genotyping/, put back in front of the filtered records, and removed
    afterwards.
    '''
    import random
    import string
    import genobox_modules
    paths = genobox_modules.setSystem()

    if not rmsk or rmsk == 'None':
        copy_call = 'cp %s %s' % (vcfgz, vcfgz_out)
        logger.info(copy_call)
        subprocess.check_call(copy_call, shell=True)
        return

    # create header
    alphabet = string.ascii_uppercase + string.digits
    rand = ''.join(random.choice(alphabet) for _ in range(10))
    header = 'genotyping/tmp' + rand + '.header.vcf'
    header_call = '/usr/bin/gunzip -c %s | head -n 1000 | grep "#" > %s' % (vcfgz, header)
    logger.info(header_call)
    subprocess.check_call(header_call, shell=True)

    # perform rmsk filtering: gunzip | intersectBed -v | cat header - | bgzip
    stages = [
        '/usr/bin/gunzip -c %s' % vcfgz,
        paths['bedtools_home'] + 'intersectBed' + ' -v -a stdin -b %s' % rmsk,
        'cat %s -' % header,
        paths['bin_home'] + 'bgzip -c > %s' % vcfgz_out,
    ]
    call = ' | '.join(stages)
    logger.info(call)
    subprocess.check_call(call, shell=True)

    # rm tmp header file
    subprocess.check_call('rm %s' % header, shell=True)
def vcf_filter_rmsk(vcfgz, rmsk, vcfgz_out):
    """Filter out variants that fall inside annotated repeats.

    When rmsk is empty or the string "None" the gzipped input is copied to
    vcfgz_out unchanged. Otherwise the records are streamed through
    intersectBed -v against rmsk, the original header (saved in a randomly
    named temporary file under genotyping/) is put back in front, and the
    result is bgzipped to vcfgz_out.
    """
    import random
    import string
    import genobox_modules
    paths = genobox_modules.setSystem()

    def run(shell_cmd, log=True):
        # run through the shell, optionally logging the command first
        if log:
            logger.info(shell_cmd)
        subprocess.check_call(shell_cmd, shell=True)

    filtering = bool(rmsk) and rmsk != "None"
    if filtering:
        # create header
        chars = string.ascii_uppercase + string.digits
        suffix = "".join(random.choice(chars) for _ in range(10))
        header = "genotyping/tmp" + suffix + ".header.vcf"
        run('/usr/bin/gunzip -c %s | head -n 1000 | grep "#" > %s' % (vcfgz, header))

        # perform rmsk filtering
        gunzip_call = "/usr/bin/gunzip -c %s" % vcfgz
        bgzip_call = paths["bin_home"] + "bgzip -c > %s" % vcfgz_out
        bed_call = paths["bedtools_home"] + "intersectBed" + " -v -a stdin -b %s | cat %s - | %s" % (rmsk, header, bgzip_call)
        run("%s | %s" % (gunzip_call, bed_call))

        # rm tmp header file (not logged)
        run("rm %s" % header, log=False)
    else:
        run("cp %s %s" % (vcfgz, vcfgz_out))
def unified_genotyper(bam, genome, fa, dbsnp, call_conf, call_emit, output_mode):
    '''Create GATK UnifiedGenotyper calls, one per entry of the genome file.

    For every entry returned by get_genome(genome), element 2 is used as
    the -L interval and in the output and log file names. GATK writes to
    stdout; INFO/WARN lines are dropped by a perl filter and the rest is
    gzipped to genotyping/<bam name>.<entry>.raw.vcf.gz.

    dbsnp is added as --dbsnp when given.

    Returns (calls, outfiles), two parallel lists.
    '''
    import genobox_modules
    import os
    paths = genobox_modules.setSystem()

    java_cmd = 'java -Djava.io.tmpdir=/panvol1/simon/tmp/ -XX:ParallelGCThreads=8 -Xms3000m -Xmx3000m -jar '
    cmd = java_cmd + paths['GATK_home'] + 'GenomeAnalysisTK.jar'

    # bam file name without directory and without '.bam'
    sample = os.path.split(bam)[1].replace('.bam', '')
    log_filter = " | perl -ne 'if ($_ =~ m/^INFO/ or $_ =~ m/^WARN/) {} else {print $_}' | gzip -c - > %s"

    calls = []
    outfiles = []
    for entry in get_genome(genome):
        region = entry[2]
        outfile = 'genotyping/%s.%s.raw.vcf.gz' % (sample, region)
        logfile = 'log/run_unified_genotyper.%s.%s.log' % (sample, region)
        outfiles.append(outfile)

        arg = ' -T UnifiedGenotyper -R %s -I %s -o /dev/stdout -log %s -stand_call_conf %f -stand_emit_conf %f -L %s -baq CALCULATE_AS_NECESSARY --num_threads 1 -glm BOTH --output_mode %s ' % (fa, bam, logfile, call_conf, call_emit, region, output_mode)
        if dbsnp:
            arg += '--dbsnp %s ' % dbsnp
        arg += log_filter % outfile
        calls.append(cmd + arg)
    return (calls, outfiles)
def vcf_filter_prune(vcf, prune, vcfgz_out):
    '''Prune variants within N nt of each other.

    With prune == 0 the vcf is only bgzipped to vcfgz_out. Otherwise the
    header is saved to genotyping/header.vcf, genobox_snppruning.R writes
    the pruned records to vcf + '.tmp', header and records are bgzipped
    together into vcfgz_out, and both temporary files are removed.
    '''
    paths = genobox_modules.setSystem()

    if prune == 0:
        bgzip_call = '%sbgzip -c %s > %s' % (paths['bin_home'], vcf, vcfgz_out)
        logger.info(bgzip_call)
        subprocess.check_call(bgzip_call, shell=True)
        return

    # create header
    save_header = 'head -n 1000 %s | grep "#" > %s' % (vcf, 'genotyping/header.vcf')
    logger.info(save_header)
    subprocess.check_call(save_header, shell=True)

    # run the pruning script
    pruned = vcf + '.tmp'
    r_script = paths['genobox_home'] + 'genobox_snppruning.R'
    r_binary = paths['R_home'] + 'R-2.12'
    run_prune = r_binary + ' --vanilla %i %s %s < %s' % (prune, vcf, pruned, r_script)
    logger.info(run_prune)
    subprocess.check_call(run_prune, shell=True)

    # add header
    add_header = 'cat genotyping/header.vcf %s | %sbgzip -c > %s' % (pruned, paths['bin_home'], vcfgz_out)
    logger.info(add_header)
    subprocess.check_call(add_header, shell=True)

    # rm tmp_files
    cleanup = 'rm %s genotyping/header.vcf' % pruned
    logger.info(cleanup)
    subprocess.check_call(cleanup, shell=True)
def bcf2varfilter(bcf, genome, Q, vcf_prefix):
    '''Run a bcf through vcfutils varFilter and write one vcf per entry.

    genome is iterated directly; for each entry element 0 is the region
    given to bcftools view, element 2 goes into the output file name and
    elements 4 and 5 are passed to varFilter as -d and -D. Records whose
    6th column is not greater than Q are dropped by a perl filter.

    The pipelines are run right away. Returns the list of vcf files written
    (vcf_prefix + element 2 + '.vcf').
    '''
    paths = genobox_modules.setSystem()
    bcftools_view = paths['samtools_svn_home'] + 'bcftools view'
    vcfutils = paths['samtools_svn_home'] + 'vcfutils.pl'

    vcf_files = []
    for entry in genome:
        vcf = vcf_prefix + entry[2] + '.vcf'
        vcf_files.append(vcf)

        view_call = bcftools_view + ' %s \"%s\"' % (bcf, entry[0])
        filter_call = vcfutils + ' varFilter -d%s -D%s' % (entry[4], entry[5])
        qual_call = """ perl -ane 'if ($_ =~ m/^#/) { print $_ } else { if ($F[5] > %f) { print $_ }}' > %s""" % (Q, vcf)

        pipeline = ' | '.join([view_call, filter_call, qual_call])
        logger.info(pipeline)
        subprocess.check_call(pipeline, shell=True)
    return vcf_files
def bcf2varfilter(bcf, genome, Q, vcf_prefix):
    '''Filter a bcf with vcfutils varFilter, writing one vcf per genome entry.

    For each entry of genome: bcftools view extracts the region named by
    element 0, varFilter is run with -d/-D taken from elements 4 and 5, and
    a perl filter keeps header lines plus records whose 6th column is
    greater than Q. Output goes to vcf_prefix + element 2 + '.vcf'.

    Each pipeline is logged and executed immediately. Returns the list of
    vcf file names.
    '''
    paths = genobox_modules.setSystem()
    samtools_dir = paths['samtools_svn_home']
    bcf_cmd = samtools_dir + 'bcftools view'
    vcfutils_cmd = samtools_dir + 'vcfutils.pl'

    def pipeline_for(region, min_depth, max_depth, vcf):
        # assemble the three-stage shell pipeline for one region
        bcf_call = bcf_cmd + ' %s \"%s\"' % (bcf, region)
        vcfutils_call = vcfutils_cmd + ' varFilter -d%s -D%s' % (min_depth, max_depth)
        qualf_call = """ perl -ane 'if ($_ =~ m/^#/) { print $_ } else { if ($F[5] > %f) { print $_ }}' > %s""" % (Q, vcf)
        return '%s | %s | %s' % (bcf_call, vcfutils_call, qualf_call)

    vcf_files = []
    for record in genome:
        out_vcf = vcf_prefix + record[2] + '.vcf'
        vcf_files.append(out_vcf)
        call = pipeline_for(record[0], record[4], record[5], out_vcf)
        logger.info(call)
        subprocess.check_call(call, shell=True)
    return vcf_files
def paired_trim(args):
    '''Create genobox_trim_pe.py calls for paired-end read files.

    args.pe1 and args.pe2 are parallel lists of read files; one call is
    made per pair. Output names are 'trimmed/<input file name>.trim.fq',
    with '.gz' appended when args.gz is set.

    Returns (calls, outfiles_pe1, outfiles_pe2).
    Raises ValueError when pe1 and pe2 differ in length.
    '''
    import os
    import genobox_modules
    paths = genobox_modules.setSystem()

    if len(args.pe1) != len(args.pe2):
        raise ValueError('same number of files must be given to --pe1 and --pe2')

    cmd = '%sgenobox_trim_pe.py' % (paths['genobox_home'])
    suffix = '.trim.fq.gz' if args.gz else '.trim.fq'

    calls = []
    outfiles_pe1 = []
    outfiles_pe2 = []
    for read1, read2 in zip(args.pe1, args.pe2):
        outfiles_pe1.append('trimmed/' + os.path.split(read1)[1] + suffix)
        outfiles_pe2.append('trimmed/' + os.path.split(read2)[1] + suffix)

        arg = ' --i %s %s --min_length %i --min_baseq %i --min_avgq %i --adaptors %s --min_adaptor_match %i' % (
            read1, read2, args.min_length, args.min_baseq, args.min_avgq,
            ' '.join(args.adaptors), args.min_adaptor_match)
        if args.keep_n:
            arg += ' --keep_n'
        if args.gz:
            arg += ' --gz'
        calls.append(cmd + arg)
    return (calls, outfiles_pe1, outfiles_pe2)
def mpileup(bam, chr_file, fa, prior, pp):
    '''Build the genobox_mpileup.py calls that do SNP calling on a bam-file.

    chr_file -- when given, get_genome(chr_file) is read and one call is
                made per entry (element 0 -> --chr, element 2 -> output
                name); when empty a single call covers the whole bam.

    Returns (calls, outfiles): the command lines and, in the same order,
    the bcf files they write under genotyping/.
    '''
    import genobox_modules
    import os
    paths = genobox_modules.setSystem()
    cmd = paths['genobox_home'] + 'genobox_mpileup.py'

    # collect (region, outfile) jobs first; region None means "whole bam"
    if chr_file:
        jobs = [(c[0], 'genotyping/tmp.' + c[2] + '.all.bcf') for c in get_genome(chr_file)]
    else:
        jobs = [(None, 'genotyping/tmp.' + os.path.split(bam)[1] + '.all.bcf')]

    calls = []
    outfiles = []
    for region, outfile in jobs:
        outfiles.append(outfile)
        if chr_file:
            arg = ' --bam %s --chr \"%s\" --fa %s --prior %s --pp %f --o %s' % (bam, region, fa, prior, pp, outfile)
        else:
            arg = ' --bam %s --fa %s --prior %s --pp %f --o %s' % (bam, fa, prior, pp, outfile)
        calls.append(cmd + arg)
    return (calls, outfiles)
def get_best_assembly(args):
    '''Return the R call that identifies the best assembly among the k-mers.

    genobox_denovo_velvet_parse.R is fed through R-2.12 --vanilla. With one
    value in args.ksizes its arguments are the outpath and that k. With two
    or three values (start, stop[, step]; step is 2 when omitted) it gets a
    "<outpath>_<k>/stats.txt <k>" pair for every k in
    range(start, stop, step).

    Returns a one-element list holding the command line.
    Raises ValueError when args.ksizes does not hold one to three values
    (this previously surfaced as an UnboundLocalError).
    '''
    import genobox_modules
    paths = genobox_modules.setSystem()

    n_ksizes = len(args.ksizes)
    if not 1 <= n_ksizes <= 3:
        raise ValueError('ksizes must hold 1 to 3 values (k, or start stop [step]), got %i' % n_ksizes)

    cmd = '%sR-2.12 --vanilla ' % paths['R_home']

    # set argument
    if n_ksizes == 1:
        arg = ' %s %s' % (args.outpath, args.ksizes[0])
    else:
        step = 2 if n_ksizes == 2 else args.ksizes[2]
        kmers = range(int(args.ksizes[0]), int(args.ksizes[1]), int(step))
        arg = ' '.join('%s_%s/stats.txt %s' % (args.outpath, k, k) for k in kmers)

    return [cmd + arg + ' < %sgenobox_denovo_velvet_parse.R' % (paths['genobox_home'])]
def clean():
    '''Return the call that cleans the sample directory.

    The result is a one-element list with the path of
    genobox_denovo_velvet_clean.py under the configured genobox_home.
    '''
    import genobox_modules
    genobox_home = genobox_modules.setSystem()['genobox_home']
    return [genobox_home + 'genobox_denovo_velvet_clean.py']
def accept_assembly(args):
    '''Return the call that parses the best assembly and removes the others.

    The result is a one-element list running genobox_denovo_velvet_accept.py
    on args.outpath.
    '''
    import genobox_modules
    genobox_home = genobox_modules.setSystem()['genobox_home']
    script = genobox_home + 'genobox_denovo_velvet_accept.py'
    return [script + ' %s' % args.outpath]
def picardFilterSort(i, q, o):
    '''Filter a bam on mapping quality and coordinate-sort it with picard.

    i -- input bam read by ViewSam (aligned reads only)
    q -- minimum value of the 5th SAM column a read must have to be kept
    o -- output bam written by SortSam

    The three stages are run as one shell pipeline. The exit status is not
    checked (subprocess.call), so a failing pipeline does not raise.
    '''
    paths = genobox_modules.setSystem()
    java_home = paths['java_home']
    picard_home = paths['picard_home']

    # stage 1: dump the aligned reads as SAM text
    view = '%sjava -XX:ParallelGCThreads=8 -XX:+UseParallelGC -XX:-UsePerfData -Xms1500m -Xmx1500m -jar %s/ViewSam.jar INPUT=%s ALIGNMENT_STATUS=Aligned VALIDATION_STRINGENCY=LENIENT' % (java_home, picard_home, i)
    # stage 2: keep header lines and reads passing the quality cut-off
    quality_filter = "perl -ane 'if ($_ =~ m/^@/) {print $_;} else {if ($F[4] >= %i) { print $_ }}'" % q
    # stage 3: sort by coordinate and write the bam
    sort = '%sjava -XX:ParallelGCThreads=8 -XX:+UseParallelGC -XX:-UsePerfData -Xms4500m -Xmx4500m -jar %s/SortSam.jar INPUT=/dev/stdin OUTPUT=%s SORT_ORDER=coordinate VALIDATION_STRINGENCY=LENIENT TMP_DIR=/panvol1/simon/tmp MAX_RECORDS_IN_RAM=1000000' % (java_home, picard_home, o)

    subprocess.call(' | '.join([view, quality_filter, sort]), shell=True)
def vcf_tabix(vcf_gz):
    """Index a vcf.gz file with tabix (-p vcf -f).

    The command is logged and then run; a non-zero exit status raises
    subprocess.CalledProcessError.
    """
    bin_home = genobox_modules.setSystem()["bin_home"]
    index_call = "%stabix -p vcf -f %s" % (bin_home, vcf_gz)
    logger.info(index_call)
    subprocess.check_call(index_call, shell=True)
def vcf_tabix(vcf_gz):
    '''Run tabix on a vcf.gz, overwriting an existing index (-f).

    The command line is written to the logger before it is executed
    through the shell; failure raises subprocess.CalledProcessError.
    '''
    paths = genobox_modules.setSystem()
    tabix = paths['bin_home'] + 'tabix'
    command = ' '.join([tabix, '-p', 'vcf', '-f', '%s' % (vcf_gz)])
    logger.info(command)
    subprocess.check_call(command, shell=True)
def merge(reads, format, interleaved):
    '''Return the velvet shuffleSequences call that interleaves two read files.

    reads       -- sequence holding the two input files (reads[0], reads[1])
    format      -- read format string; must contain 'fastq' or 'fasta'
                   (when both occur, 'fasta' wins, as before)
    interleaved -- output file for the interleaved reads

    Returns the command line as a single string.
    Raises ValueError when format names neither fastq nor fasta (this
    previously surfaced as an UnboundLocalError).
    '''
    import genobox_modules
    paths = genobox_modules.setSystem()

    # shuffle <file1> <file2> <out>
    script = None
    if 'fastq' in format:
        script = 'shuffleSequences_fastq.pl'
    if 'fasta' in format:
        script = 'shuffleSequences_fasta.pl'
    if script is None:
        raise ValueError('cannot merge reads of format %r: expected fastq or fasta' % (format,))

    return '%s%s %s %s %s' % (paths['velvet_home'], script, reads[0], reads[1], interleaved)
def start_genotyping(bam, chr, fa, prior, pp, queue, o, sample, partition, logger):
    '''Start samtools-based genotyping of a bam file and wait for it.

    Four dependent job groups are submitted through Moab: bam indexing, the
    mpileup calls, concatenation of the resulting bcfs into o, and indexing
    of o. The function blocks on a Semaphore for the bcf index job, removes
    the temporary bcfs and returns o.

    The consensus calls are still built but not submitted.
    '''
    import os
    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore

    if not os.path.exists('genotyping'):
        os.makedirs('genotyping')

    # set queueing
    genobox_modules.setSystem()
    home = os.getcwd()
    single_core = 'nodes=1:ppn=1,mem=2gb,walltime=172800'
    dual_core = 'nodes=1:ppn=2,mem=2gb,walltime=172800'

    # create calls
    bamindex_calls = bam_index(bam)
    (mpileup_calls, bcffiles) = mpileup(bam, chr, fa, prior, pp)
    bcfcombine_calls = bcf_combine(bcffiles, o)
    bcfindex_calls = bcf_index(o)
    consensus(o, sample)  # result currently unused: consensus is not submitted
    n_mpileup = len(mpileup_calls)

    # submit jobs
    bamindex_moab = Moab(bamindex_calls, logfile=logger,
                         runname='run_genobox_bamindex', queue=queue,
                         cpu=single_core, partition=partition)
    mpileup_moab = Moab(mpileup_calls, logfile=logger,
                        runname='run_genobox_mpileup', queue=queue,
                        cpu=dual_core, depend=True, depend_type='expand',
                        depend_val=[n_mpileup], depend_ids=bamindex_moab.ids,
                        partition=partition)
    bcfcombine_moab = Moab(bcfcombine_calls, logfile=logger,
                           runname='run_genobox_bcfcombine', queue=queue,
                           cpu=single_core, depend=True, depend_type='conc',
                           depend_val=[n_mpileup], depend_ids=mpileup_moab.ids,
                           partition=partition)
    bcfindex_moab = Moab(bcfindex_calls, logfile=logger,
                         runname='run_genobox_bcfindex', queue=queue,
                         cpu=single_core, depend=True, depend_type='one2one',
                         depend_val=[1], depend_ids=bcfcombine_moab.ids,
                         partition=partition)

    # semaphore (consensus is currently not waited for)
    print("Waiting for jobs to finish ...")
    s = Semaphore(bcfindex_moab.ids, home, 'genotyping', queue, 20, 2*86400)
    s.wait()
    print("--------------------------------------")

    # remove temporary files
    genobox_modules.rm_files(bcffiles)

    # return output bcf
    return o
def start_vcffilter_gatk(vcfs, genome, fa, Q, rmsk, ab, prune, queue, dir, partition, logger):
    '''Start the gatk variant vcf-filter, one job per vcf, and wait for them.

    Each vcf gets a genobox_vcffilter_gatk_h.py call with --fa, --genome and
    --Q; --rmsk, --ab and --prune are added only when rmsk is set, ab
    differs from 0.5 and prune differs from 0. The jobs are submitted
    through Moab and the function blocks on a Semaphore until they are done.

    Returns None.
    '''
    import os
    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore

    for needed_dir in ('genotyping', 'tmp'):
        if not os.path.exists(needed_dir):
            os.makedirs(needed_dir)

    # set queueing
    paths = genobox_modules.setSystem()
    home = os.getcwd()
    cpu = 'nodes=1:ppn=2,mem=7gb,walltime=172800'

    # one call for each chromosome vcf
    script = paths['genobox_home'] + 'genobox_vcffilter_gatk_h.py'
    vcffilter_calls = []
    for vcf in vcfs:
        options = [' --vcf %s --fa %s --genome %s --Q %f' % (vcf, fa, genome, Q)]
        if rmsk:
            options.append(' --rmsk %s' % rmsk)
        if ab != 0.5:
            options.append(' --ab %f' % ab)
        if prune != 0:
            options.append(' --prune %i' % prune)
        vcffilter_calls.append(script + ''.join(options))

    # submit jobs
    print("Submitting jobs")
    vcffilter_moab = Moab(vcffilter_calls, logfile=logger,
                          runname='run_genobox_vcffilter_gatk', queue=queue,
                          cpu=cpu, partition=partition)

    # semaphore
    print("Waiting for jobs to finish ...")
    s = Semaphore(vcffilter_moab.ids, home, 'vcffilter_gatk', queue, 20, 2 * 86400)
    s.wait()
    print("--------------------------------------")
def bcf_index(bcf):
    '''Return the bcftools call that indexes a bcf file.

    The result is a one-element list holding the command line.
    '''
    import genobox_modules
    bcftools = genobox_modules.setSystem()['samtools_home'] + 'bcftools'
    return [bcftools + ' index %s' % (bcf)]
def get_saturation(bam):
    '''Create the calls for the bam saturation calculation and its plot.

    Returns a two-element list: first the genobox_bamsaturation.py call on
    the bam, then the R call that runs genobox_bamsaturation_plot.R with
    stats/stats_Map.txt (twice) and stats/<bam file name>.saturation as
    arguments.
    '''
    import os
    import genobox_modules
    paths = genobox_modules.setSystem()
    genobox_home = paths['genobox_home']

    plot_name = os.path.split(bam)[1] + '.saturation'
    calculate = genobox_home + 'genobox_bamsaturation.py --bams %s --subsample --sample stats --blocks 20 | cat -' % (bam)
    plot = 'R-2.12 --vanilla stats/stats_Map.txt stats/stats_Map.txt stats/%s < %sgenobox_bamsaturation_plot.R' % (plot_name, genobox_home)
    return [calculate, plot]
def start_dbsnp(vcf, ex, dbsnp, o, queue, partition, logger):
    '''Annotate a vcf.gz file with dbSNP through a queued job.

    When dbsnp is empty or the string 'None' nothing is submitted and the
    input vcf is returned. Otherwise a genobox_dbsnp_h.py job (--vcf, --ex,
    --dbsnp, --o) is submitted through Moab, the function waits for it on a
    Semaphore and returns o.
    '''
    import os
    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore

    separator = "--------------------------------------"

    if not dbsnp or dbsnp == 'None':
        print("No dbsnp file given - skipping")
        print(separator)
        return vcf

    if not os.path.exists('genotyping'):
        os.makedirs('genotyping')

    # set queueing
    paths = genobox_modules.setSystem()
    home = os.getcwd()
    cpu = 'nodes=1:ppn=1,mem=2gb,walltime=172800'

    # create command
    script = paths['genobox_home'] + 'genobox_dbsnp_h.py'
    dbsnp_calls = [script + ' --vcf %s --ex %s --dbsnp %s --o %s' % (vcf, ex, dbsnp, o)]

    # submit jobs
    print("Submitting jobs")
    dbsnp_moab = Moab(dbsnp_calls, logfile=logger, runname='run_genobox_dbsnp',
                      queue=queue, cpu=cpu, partition=partition)

    # semaphore
    print("Waiting for jobs to finish ...")
    s = Semaphore(dbsnp_moab.ids, home, 'dbsnp', queue, 20, 2 * 86400)
    s.wait()
    print(separator)
    return o
def bcf_combine(bcfs, outfile):
    '''Return the bcftools call that concatenates bcfs into a single bcf.

    bcfs    -- list of bcf files, concatenated in the given order
    outfile -- file the combined bcf is redirected to

    The result is a one-element list holding the command line.
    '''
    import genobox_modules
    bcftools = genobox_modules.setSystem()['samtools_home'] + 'bcftools'
    inputs = ' '.join(bcfs)
    return [bcftools + ' cat %s > %s' % (inputs, outfile)]
def bcf_index(bcf):
    '''Create the call that indexes a bcf file with bcftools.

    Returns a list with that single command line.
    '''
    import genobox_modules
    paths = genobox_modules.setSystem()
    index_call = '%sbcftools index %s' % (paths['samtools_home'], bcf)
    return [index_call]
def bcf_combine(bcfs, outfile):
    '''Create the call that concatenates several bcfs to a single bcf.

    The files in bcfs are given to "bcftools cat" in list order and the
    output is redirected to outfile. Returns a list with that single
    command line.
    '''
    import genobox_modules
    paths = genobox_modules.setSystem()
    cat_call = '%sbcftools cat %s > %s' % (paths['samtools_home'], ' '.join(bcfs), outfile)
    return [cat_call]
def bed_genomeCov(bam):
    '''Return the bedtools genomecov call for a bam file.

    The coverage is redirected to stats/<bam file name>.coverage. The
    result is a one-element list holding the command line.
    '''
    import os
    import genobox_modules
    paths = genobox_modules.setSystem()

    # bam file name without its directory (input is abspath(bam))
    bam_name = os.path.basename(bam)
    bedtools = paths['bedtools_bin'] + 'bedtools'
    return [bedtools + ' genomecov -ibam %s > stats/%s.coverage' % (bam, bam_name)]
def sam_flagstat(bam):
    '''Return the samtools flagstat call for a bam file.

    The report is redirected to stats/<bam file name>.flagstat. The result
    is a one-element list holding the command line.
    '''
    import os
    import genobox_modules
    paths = genobox_modules.setSystem()

    # bam file name without its directory (input is abspath(bam))
    bam_name = os.path.basename(bam)
    samtools = paths['samtools_home'] + 'samtools'
    return [samtools + ' flagstat %s > stats/%s.flagstat' % (bam, bam_name)]
def vcf_bgzip_tabix(vcf):
    '''Compress a vcf with bgzip and index the result with tabix.

    bgzip -f replaces vcf by vcf + '.gz'; tabix -p vcf -f then indexes that
    file. Both commands are logged before they run and raise
    subprocess.CalledProcessError on failure.
    '''
    bin_home = genobox_modules.setSystem()['bin_home']

    compress = bin_home + 'bgzip -f %s' % vcf
    logger.info(compress)
    subprocess.check_call(compress, shell=True)

    index = bin_home + 'tabix -p vcf -f %s.gz' % (vcf)
    logger.info(index)
    subprocess.check_call(index, shell=True)
def plot_coverage(bam):
    '''Return the R call that plots coverage from genomeCoverageBed output.

    genobox_plotcov.R is run with stats/<bam file name>.coverage, the bam
    file name and stats/<bam file name>.coverage.pdf as arguments. The
    result is a one-element list holding the command line.
    '''
    import os
    import genobox_modules
    paths = genobox_modules.setSystem()

    # bam file name without its directory (input is abspath(bam))
    bam_name = os.path.basename(bam)
    coverage = 'stats/%s.coverage' % bam_name
    r_args = '%s %s %s.pdf' % (coverage, bam_name, coverage)
    return ['R-2.12 --vanilla ' + r_args + ' < %sgenobox_plotcov.R' % paths['genobox_home']]
def bwasw_iontorrent(fastqs, fa, fqtypes, alignpath, bwa6, library, threads, queue, partition, logger):
    '''Start alignment of Iontorrent fastq files using BWA-SW.

    fastqs    -- fastq files to align
    fa        -- reference given to bwa bwasw
    fqtypes   -- quality type per fastq; only 'Sanger' is accepted
    alignpath -- prefix for the output bam files
                 (alignpath + fastq file name + '.bam')
    bwa6      -- use the bwa under paths['bwa_6_2_home'] instead of
                 paths['bwa_home']
    library   -- not used by this function
    threads   -- threads per alignment; also selects the resource request
    queue, partition, logger -- handed to Moab

    Returns (job ids of the submitted alignments, dict fastq -> bam file).

    Raises ValueError for 'Illumina' qualities and for any other quality
    type that is not 'Sanger'. The latter used to fall through and either
    fail with an UnboundLocalError or resubmit the previous file's command.
    '''
    import os
    import genobox_modules
    from genobox_classes import Moab
    paths = genobox_modules.setSystem()

    # setting cpus
    if threads == 1:
        cpu = 'nodes=1:ppn=1,mem=7gb,walltime=172800'
    elif partition == 'uv' or partition == 'uv2':
        cpu = 'procs=%s,mem=5gb,walltime=172800,flags=sharedmem' % threads
    elif threads > 8:
        cpu = 'nodes=1:ppn=%s,mem=7gb,walltime=172800' % threads
    else:
        cpu = 'nodes=1:ppn=%s,mem=5gb,walltime=172800' % threads

    # align
    if bwa6:
        cmd = paths['bwa_6_2_home'] + 'bwa '
    else:
        cmd = paths['bwa_home'] + 'bwa '

    bwa_align = []
    bamfiles_dict = dict()
    for i, fq in enumerate(fastqs):
        bamfile = alignpath + os.path.split(fq)[1] + '.bam'
        bamfiles_dict[fq] = bamfile

        if fqtypes[i] == 'Illumina':
            raise ValueError('BWA-SW should not align reads with Illumina Qualities')
        if fqtypes[i] != 'Sanger':
            # refuse unknown types instead of reusing a stale command line
            raise ValueError('BWA-SW cannot align %s: unsupported quality type %r' % (fq, fqtypes[i]))

        arg = ' bwasw -t %i %s %s | %ssamtools view -Sb - > %s' % (threads, fa, fq, paths['samtools_home'], bamfile)
        bwa_align.append(cmd + arg)

    # submit jobs
    # create moab instance for the align_calls and dispatch to queue
    bwa_align_moab = Moab(bwa_align, logfile=logger, runname='run_genobox_bwaalign',
                          queue=queue, cpu=cpu, partition=partition)

    # release jobs
    print("Releasing jobs")
    #bwa_align_moab.release()

    return (bwa_align_moab.ids, bamfiles_dict)
def python_avgdepth(bam):
    '''Return the call that computes the average depth of a bam file.

    The script run is genobox_bam2avgdepth1.py; its output is redirected to
    stats/<bam file name>.avgdepth. The result is a one-element list
    holding the command line.
    '''
    import os
    import genobox_modules
    paths = genobox_modules.setSystem()

    # bam file name without its directory (input is abspath(bam))
    bam_name = os.path.basename(bam)
    script = paths['genobox_home'] + 'genobox_bam2avgdepth1.py'
    return [script + ' %s > stats/%s.avgdepth' % (bam, bam_name)]
def start_vcffilter_gatk(vcfs, genome, fa, Q, rmsk, ab, prune, queue, dir, partition, logger):
    '''Submit the gatk variant vcf-filter for every vcf and wait for the jobs.

    A genobox_vcffilter_gatk_h.py call is built per vcf (--vcf, --fa,
    --genome, --Q, plus --rmsk / --ab / --prune when rmsk is set, ab is not
    0.5 and prune is not 0), submitted through Moab, and waited for with a
    Semaphore. The directories genotyping/ and tmp/ are created if missing.

    Nothing is returned.
    '''
    import os
    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore

    if not os.path.exists('genotyping'):
        os.makedirs('genotyping')
    if not os.path.exists('tmp'):
        os.makedirs('tmp')

    # set queueing
    paths = genobox_modules.setSystem()
    home = os.getcwd()
    job_resources = 'nodes=1:ppn=2,mem=7gb,walltime=172800'
    cmd = paths['genobox_home'] + 'genobox_vcffilter_gatk_h.py'

    def call_for(vcf):
        # command line filtering one (per-chromosome) vcf
        arg = ' --vcf %s --fa %s --genome %s --Q %f' % (vcf, fa, genome, Q)
        if rmsk:
            arg += ' --rmsk %s' % rmsk
        if ab != 0.5:
            arg += ' --ab %f' % ab
        if prune != 0:
            arg += ' --prune %i' % prune
        return cmd + arg

    vcffilter_calls = [call_for(v) for v in vcfs]

    # submit jobs
    print("Submitting jobs")
    vcffilter_moab = Moab(vcffilter_calls, logfile=logger,
                          runname='run_genobox_vcffilter_gatk', queue=queue,
                          cpu=job_resources, partition=partition)

    # semaphore
    print("Waiting for jobs to finish ...")
    s = Semaphore(vcffilter_moab.ids, home, 'vcffilter_gatk', queue, 20, 2*86400)
    s.wait()
    print("--------------------------------------")
def mapdamage(bam, fa):
    '''Return the calls that run mapdamage on a bam and file the result.

    The first call runs mapdamage-0.3.6.pl map on the bam against fa; the
    second is an mv into stats/mapdamage_<bam file name>. The result is a
    two-element list.
    '''
    import os
    import genobox_modules
    paths = genobox_modules.setSystem()

    # bam file name without its directory (input is abspath(bam))
    target_name = 'mapdamage_' + os.path.basename(bam)
    run_call = paths['mapdamage_home'] + 'mapdamage-0.3.6.pl map -i %s -r %s -c' % (bam, fa)

    # create call to move results file to stats dir
    # NOTE(review): the source of this mv is the input bam itself, not a
    # mapdamage output - confirm this is intended.
    move_call = 'mv %s stats/%s' % (bam, target_name)
    return [run_call, move_call]
def start_dbsnp(vcf, ex, dbsnp, o, queue, partition, logger):
    '''Annotate a vcf.gz file with dbSNP by submitting genobox_dbsnp_h.py.

    vcf, ex, dbsnp and o are passed to the script as --vcf, --ex, --dbsnp
    and --o. With no dbsnp file (empty or the string 'None') the step is
    skipped and vcf is returned; otherwise the job is submitted through
    Moab, waited for with a Semaphore, and o is returned.
    '''
    import os
    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore

    have_dbsnp = bool(dbsnp) and dbsnp != 'None'
    if not have_dbsnp:
        print("No dbsnp file given - skipping")
        print("--------------------------------------")
        return vcf

    if not os.path.exists('genotyping'):
        os.makedirs('genotyping')

    # set queueing
    paths = genobox_modules.setSystem()
    home = os.getcwd()
    job_resources = 'nodes=1:ppn=1,mem=2gb,walltime=172800'

    # create command
    cmd = paths['genobox_home'] + 'genobox_dbsnp_h.py'
    arg = ' --vcf %s --ex %s --dbsnp %s --o %s' % (vcf, ex, dbsnp, o)

    # submit jobs
    print("Submitting jobs")
    dbsnp_moab = Moab([cmd + arg], logfile=logger, runname='run_genobox_dbsnp',
                      queue=queue, cpu=job_resources, partition=partition)

    # semaphore
    print("Waiting for jobs to finish ...")
    s = Semaphore(dbsnp_moab.ids, home, 'dbsnp', queue, 20, 2*86400)
    s.wait()
    print("--------------------------------------")
    return o
def consensus(bcf, sample):
    '''Return the call that creates a consensus fastq from a bcf-file.

    The fastq is written to genotyping/<sample>.cns.fq, or to
    genotyping/cns.fq when sample is the string 'None'. The result is a
    one-element list holding the bcftools view | vcfutils.pl vcf2fq
    pipeline.
    '''
    import genobox_modules
    paths = genobox_modules.setSystem()

    consensus_fq = 'genotyping/cns.fq' if sample == 'None' else 'genotyping/%s.cns.fq' % sample
    samtools_home = paths['samtools_home']
    view = '%sbcftools view %s' % (samtools_home, bcf)
    to_fastq = '%svcfutils.pl vcf2fq > %s' % (samtools_home, consensus_fq)
    return [view + ' | ' + to_fastq]
def bam_index(bam):
    '''Return the call that indexes a bam-file.

    When bam + '.bai' already exists no index is built and a dummy
    'sleep 0.01' call is returned instead, so the result always holds
    exactly one command line.
    '''
    import genobox_modules
    import os.path
    paths = genobox_modules.setSystem()

    # skip index creation if it already exists
    if os.path.isfile(bam+'.bai'):
        return ['sleep 0.01']
    samtools = paths['samtools_home'] + 'samtools'
    return [samtools + ' index %s' % (bam)]
def consensus(bcf, sample):
    '''Create the call that turns a bcf-file into a consensus fastq.

    bcf    -- input bcf, read with bcftools view
    sample -- sample name used in the output file name; the string 'None'
              selects the plain name genotyping/cns.fq

    Returns a list with the single pipeline command line.
    '''
    import genobox_modules
    paths = genobox_modules.setSystem()

    if sample != 'None':
        fastq_name = 'genotyping/%s.cns.fq' % sample
    else:
        fastq_name = 'genotyping/cns.fq'

    pipeline = '%sbcftools view %s | %svcfutils.pl vcf2fq > %s' % (
        paths['samtools_home'], bcf, paths['samtools_home'], fastq_name)
    return [pipeline]
def vcf_annotate_dbsnp(vcfgz, dbsnp, vcf_out_gz):
    """Annotate a vcf.gz with dbsnp ids.

    With a dbsnp file the input is gunzipped, piped through fill-rsIDs -r
    and bgzipped to vcf_out_gz. When dbsnp is empty or the string "None"
    the input is copied to vcf_out_gz unchanged.

    The command is logged and then run; failure raises
    subprocess.CalledProcessError.
    """
    paths = genobox_modules.setSystem()

    if not dbsnp or dbsnp == "None":
        command = "cp %s %s" % (vcfgz, vcf_out_gz)
    else:
        bin_home = paths["bin_home"]
        stages = [
            "/usr/bin/gunzip -c %s" % vcfgz,
            bin_home + "fill-rsIDs -r %s" % dbsnp,
            "%sbgzip -c > %s" % (bin_home, vcf_out_gz),
        ]
        command = " | ".join(stages)

    logger.info(command)
    subprocess.check_call(command, shell=True)
def vcf_annotate_dbsnp(vcfgz, dbsnp, vcf_out_gz):
    '''Annotate vcf.gz with dbsnp, or copy it when no dbsnp file is given.

    vcfgz      -- gzipped input vcf
    dbsnp      -- file handed to fill-rsIDs -r; empty or the string 'None'
                  means "no annotation"
    vcf_out_gz -- bgzipped (or copied) output file
    '''
    paths = genobox_modules.setSystem()
    annotate = bool(dbsnp) and dbsnp != 'None'

    if annotate:
        gunzip_call = '/usr/bin/gunzip -c %s' % vcfgz
        fill_call = '%sfill-rsIDs -r %s | %sbgzip -c > %s' % (paths['bin_home'], dbsnp, paths['bin_home'], vcf_out_gz)
        shell_cmd = gunzip_call + ' | ' + fill_call
    else:
        shell_cmd = 'cp %s %s' % (vcfgz, vcf_out_gz)

    logger.info(shell_cmd)
    subprocess.check_call(shell_cmd, shell=True)
def bam_index(bam):
    '''Create the call that indexes a bam-file with samtools.

    If an index (bam + '.bai') is already present the call is replaced by a
    no-op 'sleep 0.01'. Returns a list with the single command line.
    '''
    import genobox_modules
    import os.path
    paths = genobox_modules.setSystem()

    index_exists = os.path.isfile(bam + '.bai')
    if not index_exists:
        call = '%ssamtools index %s' % (paths['samtools_home'], bam)
    else:
        # skip index creation if it already exists
        call = 'sleep 0.01'
    return [call]
def write_indels_for_filtering(vcf, ex):
    '''Extract header and indel records into genotyping/indels_for_filtering.vcf.

    These are positions that should not be high confidence because they are
    deletions (not removed in the vcf-file).

    vcf -- gzipped vcf, read with gzip -dc
    ex  -- file handed to genobox_exchangeids.py --b; when empty or the
           string 'None' the records are written without that extra step
    '''
    import genobox_modules
    import subprocess
    paths = genobox_modules.setSystem()

    # extracting header and indels using perl oneliner
    extract = "gzip -dc %s | perl -ne 'if ($_ =~ m/^#/) { print $_ } else { if ($_ =~ INDEL) { print $_ }}'" % (vcf)
    if not ex or ex == 'None':
        exchange = ''
    else:
        exchange = ' | ' + '%sgenobox_exchangeids.py --b %s' % (paths['genobox_home'], ex)
    call = extract + exchange + ' > genotyping/indels_for_filtering.vcf '

    logger.info(call)
    subprocess.check_call(call, shell=True)
def extract_unmapped_reads(bamfiles):
    '''Generate calls to extract unmapped reads from bamfiles.

    bamfiles -- dict mapping an identifier to a bam file

    Each bam gets a "samtools view -h -b -f 4" call writing to
    bam + '.unmapped.bam'. Returns (calls, unmapped) where unmapped maps
    the same identifiers to the new file names.
    '''
    import genobox_modules
    paths = genobox_modules.setSystem()
    view_unmapped = '%ssamtools view -h -b -f 4' % (paths['samtools_home'])

    calls = []
    unmapped = {}
    for key, bam in bamfiles.items():
        target = bam + '.unmapped.bam'
        unmapped[key] = target
        calls.append(view_unmapped + ' %s > %s' % (bam, target))
    return (calls, unmapped)
def write_indels_for_filtering(var_vcf, ex, indel_vcf):
    '''Create the indels_for_filtering file.

    var_vcf   -- vcf whose non-header lines containing "INDEL" are extracted
    ex        -- file given to genobox_exchangeids.py as --b
    indel_vcf -- output file written by genobox_exchangeids.py (--o)

    The indel records are prefixed with header.vcf from the working
    directory and staged in tmp_file_indels. Both commands are logged
    before they run and raise subprocess.CalledProcessError on failure.
    '''
    import genobox_modules
    import subprocess
    paths = genobox_modules.setSystem()

    # collect the indel records behind the header
    grep_call = 'grep -v \"#\" %s | grep "INDEL" | cat header.vcf - > tmp_file_indels' % (var_vcf)
    logger.info(grep_call)
    subprocess.check_call(grep_call, shell=True)

    # exchange ids; the call is built from ex_cmd/ex_arg (the previous
    # "cmd + arg" referenced names that do not exist in this function)
    ex_cmd = paths['genobox_home'] + 'genobox_exchangeids.py'
    ex_arg = ' --a tmp_file_indels --x 0 --b %s --o %s' % (ex, indel_vcf)
    ex_call = ex_cmd + ex_arg
    logger.info(ex_call)
    subprocess.check_call(ex_call, shell=True)
def merge_bam(libs, lib_infiles, add_suffix=False, final_suffix='', tmpdir='/panvol1/simon/tmp/'):
    '''Create the calls that merge bam files into one bam per library.

    libs         -- library names; 'alignment/' is prefixed to the output
                    name unless the name already starts with it
    lib_infiles  -- per library (same order as libs) the list of input bams
    add_suffix   -- append '.flt.sort.bam' to every input name (for inputs
                    given as original file names, before filter+sort)
    final_suffix -- appended to every output name
    tmpdir       -- TMP_DIR handed to picard MergeSamFiles

    A library with a single input is copied with cp; otherwise picard
    MergeSamFiles is used. Returns (calls, outfiles), two parallel lists.
    '''
    import genobox_modules
    paths = genobox_modules.setSystem()

    java_call = paths['java_home'] + 'java -XX:ParallelGCThreads=8 -XX:+UseParallelGC -XX:-UsePerfData -Xms4500m -Xmx4500m -jar '
    picard_cmd = paths['picard_home'] + 'MergeSamFiles.jar'

    calls = []
    outfiles = []
    for idx, lib in enumerate(libs):
        # set input files
        if add_suffix:
            inputs = [infile + '.flt.sort.bam' for infile in lib_infiles[idx]]
        else:
            inputs = lib_infiles[idx]

        # set output file, adding alignment/ if it is not already there
        prefix = '' if lib.startswith('alignment/') else 'alignment/'
        out_bam = prefix + lib + final_suffix
        outfiles.append(out_bam)

        if len(inputs) == 1:
            calls.append('cp %s %s' % (' '.join(inputs), out_bam))
            continue

        arg = ' INPUT=%s OUTPUT=%s TMP_DIR=%s ASSUME_SORTED=true VALIDATION_STRINGENCY=LENIENT' % (
            ' INPUT='.join(inputs), out_bam, tmpdir)
        calls.append(java_call + picard_cmd + arg)
    return (calls, outfiles)
def check_fa(fa, bwa6):
    '''Check that the bwa index of a fasta file exists and create it if not.

    fa   -- fasta file; index files are expected at fa + suffix
    bwa6 -- when true, the shorter suffix list is checked and the bwa under
            paths['bwa_6_2_home'] is used; otherwise the longer list
            (including .rbwt, .rpac, .rsa) and paths['bwa_home']

    At the first missing index file "bwa index -a is" is run; if that fails
    "bwa index -a bwtsw" is tried. Nothing is run when all files exist.

    Raises TypeError when both indexing attempts fail.
    '''
    import genobox_modules
    import subprocess
    import os
    import sys
    paths = genobox_modules.setSystem()

    # check if index of fa exists
    if bwa6:
        index_suffixes = ['.amb', '.ann', '.bwt', '.pac', '.sa']
    else:
        index_suffixes = ['.amb', '.ann', '.bwt', '.pac', '.rbwt', '.rpac', '.rsa', '.sa']

    missing = [fa + suf for suf in index_suffixes if not os.path.exists(fa + suf)]
    if not missing:
        return

    # report the index file that is actually missing, not the fasta itself
    sys.stderr.write('%s not found, creating bwa index\n' % missing[0])
    bwa = paths['bwa_6_2_home' if bwa6 else 'bwa_home'] + 'bwa'

    # only a failing bwa run triggers the fallback; interrupts and other
    # unexpected errors are no longer swallowed by a bare except
    try:
        subprocess.check_call(bwa + ' index -a is %s' % fa, shell=True)
        return
    except subprocess.CalledProcessError:
        sys.stderr.write('bwa index -a is failed, trying bwa index -a bwtsw\n')

    try:
        subprocess.check_call(bwa + ' index -a bwtsw %s' % fa, shell=True)
    except subprocess.CalledProcessError:
        raise TypeError('bwa index could not be created from %s' % fa)