Beispiel #1
0
def realign_bam(in_bam, out_bam, fa, known=None):
   '''Build the shell job that indexes a bam and realigns it with GATK.

   The three steps (samtools index, RealignerTargetCreator, IndelRealigner)
   are glued into ONE multi-line command string so they are submitted
   together and run in sequence.

   in_bam:  bam file to index and realign
   out_bam: name from which the realigned file name is derived
            ('.bam' is replaced by '.realign.bam')
   fa:      reference fasta handed to GATK via -R
   known:   optional file passed as -known to RealignerTargetCreator

   Returns (calls, realigned) where calls is a one-element list holding the
   combined command and realigned is the output bam name.
   '''
   
   import genobox_modules
   paths = genobox_modules.setSystem()
   
   gatk_jar = paths['GATK_home'] + 'GenomeAnalysisTK.jar'
   java_prefix = paths['java_home'] + ('java -Djava.io.tmpdir=/panvol1/simon/tmp/ -XX:ParallelGCThreads=8 -Xms4500m -Xmx4500m -jar %s ' % gatk_jar)
   
   realigned = out_bam.replace('.bam', '.realign.bam')
   
   # step 1: index the bam; the trailing pipe makes the job get written as a
   # shell file so all steps are submitted at the same time
   index_step = paths['samtools_home'] + 'samtools ' + ('index %s | cat - ' % in_bam)
   
   intervals = in_bam + '.intervals'
   
   # step 2: realigner target creator (optionally seeded with known sites)
   target_opts = ['-I %s' % in_bam, '-R %s' % fa, '-T RealignerTargetCreator']
   if known:
      target_opts.append('-known %s' % known)
   target_opts.append('-o %s' % intervals)
   target_step = java_prefix + ' '.join(target_opts)
   
   # step 3: the realignment itself
   realign_step = java_prefix + ('-I %s -T IndelRealigner -R %s -targetIntervals %s -o %s' % (in_bam, fa, intervals, realigned))
   
   job = '\n\n'.join([index_step, target_step, realign_step])
   return ([job], realigned)
Beispiel #2
0
 def wait(self):
    '''Block until the jobs in self.semaphore_ids have finished.

    A tiny job that depends on self.semaphore_ids is submitted through xqsub
    with a file under semaphores/ given as its -O argument. The method then
    polls for that file every self.check_interval seconds, for at most
    self.max_time seconds.

    Returns None once the file exists.
    Raises SystemExit if the file has not appeared within self.max_time.
    Raises subprocess.CalledProcessError if the xqsub submission fails.
    '''
    
    from time import sleep
    import string
    import random
    import os
    import genobox_modules
    import subprocess
    
    paths = genobox_modules.setSystem()
    
    # add directory and set semaphore filename
    if not os.path.exists('semaphores/'):
       os.makedirs('semaphores/')
    
    # random suffix so several semaphores with the same prefix do not collide
    rand = ''.join(random.choice(string.ascii_uppercase + string.digits) for x in range(10))
    semaphore_file = 'semaphores/' + self.file_prefix + '.' + rand
    semaphore_file_err = 'log/' + self.file_prefix + '.' + rand + '.err'
    
    # submit the dummy job that depends on the jobs being waited for
    depends = ':'.join(self.semaphore_ids)
    xqsub = '%sxqsub -d %s -l ncpus=1,mem=10mb,walltime=180,depend=%s -O %s -q %s -N semaphores -E %s -r y -t echo done' % (paths['pyscripts_home'], self.home, depends, semaphore_file, self.queue, semaphore_file_err)
    subprocess.check_output(xqsub, shell=True)
    
    # poll for the file to appear
    remaining = self.max_time
    while remaining > 0:
       if os.path.isfile(semaphore_file):
          return
       remaining -= self.check_interval
       sleep(self.check_interval)
    
    # the file may have shown up during the final sleep
    if os.path.isfile(semaphore_file):
       return
    
    # the original formatted this message with an empty tuple, which raised
    # TypeError instead of the intended SystemExit
    raise SystemExit('%s did not finish in %is' % (self.file_prefix, self.max_time))
Beispiel #3
0
def create_velvetg_calls(args):
   '''Build one velvetg command line per assembly directory.

   args.ksizes selects the directories:
     1 value  -> the single directory args.outpath
     2 values -> args.outpath_<k> for k in range(start, stop, 2)
     3 values -> args.outpath_<k> for k in range(start, stop, step)
   Any other number of values yields no commands.

   Every command gets -min_contig_lgth and, when set, -cov_cut, -exp_cov
   (skipped when it is the string "None"), -ins_length and the free-form
   args.add_velvetg.

   Returns the list of command strings.
   '''
   
   import genobox_modules   
   paths = genobox_modules.setSystem()
   
   # work out which assembly directories to run velvetg on
   n_values = len(args.ksizes)
   if n_values == 1:
      targets = ['%s' % args.outpath]
   elif 2 <= n_values <= 3:
      step = 2 if n_values == 2 else args.ksizes[2]
      k_range = range(int(args.ksizes[0]), int(args.ksizes[1]), int(step))
      targets = ['%s_%s' % (args.outpath, k) for k in k_range]
   else:
      targets = []
   
   cmds = ['%svelvetg %s' % (paths['velvet_home'], target) for target in targets]
   if not cmds:
      return []
   
   # the option string is the same for every command, so build it once
   options = ' -min_contig_lgth %i' % args.min_contig_lgth
   if args.cov_cut:
      options += ' -cov_cut %f' % args.cov_cut
   if args.exp_cov != "None":
      options += ' -exp_cov %s' % args.exp_cov
   if args.ins_length:
      options += ' -ins_length %i' % args.ins_length
   if args.add_velvetg:
      options += ' %s' % args.add_velvetg
   
   return [cmd + options for cmd in cmds]
Beispiel #4
0
def vcf_filter_prune(vcf, prune, vcfgz_out):
    '''Prune variants within `prune` nt of each other and bgzip the result.

    With prune == 0 the vcf is only compressed to vcfgz_out. Otherwise the
    '#' lines found in the first 1000 lines of the vcf are saved, the pruning
    R script is run with vcf + '.tmp' as its third argument, and the saved
    header plus that tmp file are bgzip-compressed into vcfgz_out. Both
    scratch files are removed afterwards.

    Every shell command is logged before it runs; a failing command raises
    subprocess.CalledProcessError.
    '''

    def run(shell_cmd):
        # log first so the command is visible even if it fails
        logger.info(shell_cmd)
        subprocess.check_call(shell_cmd, shell=True)

    paths = genobox_modules.setSystem()

    if prune == 0:
        run('%sbgzip -c %s > %s' % (paths['bin_home'], vcf, vcfgz_out))
        return

    # NOTE(review): the header scratch file has a fixed name, so two calls
    # running in the same working directory would share it
    header = 'genotyping/header.vcf'

    # keep the header lines
    run('head -n 1000 %s | grep "#" > %s' % (vcf, header))

    # prune
    pruned = vcf + '.tmp'
    r_script = paths['genobox_home'] + 'genobox_snppruning.R'
    r_binary = paths['R_home'] + 'R-2.12'
    run(r_binary + ' --vanilla %i %s %s < %s' % (prune, vcf, pruned, r_script))

    # put the header back and compress
    run('cat %s %s | %sbgzip -c > %s' % (header, pruned, paths['bin_home'],
                                        vcfgz_out))

    # clean up scratch files
    run('rm %s %s' % (pruned, header))
Beispiel #5
0
def mpileup(bam, chr_file, fa, prior, pp):
    '''Build genobox_mpileup.py command lines for SNP calling on a bam.

    With a chromosome file one command is produced per entry returned by
    get_genome(chr_file); without one a single command covering the whole
    bam is produced.

    Returns (calls, outfiles): the command strings and, in the same order,
    the bcf file each of them writes under genotyping/.
    '''

    import genobox_modules
    import os

    paths = genobox_modules.setSystem()
    script = paths['genobox_home'] + 'genobox_mpileup.py'

    # no chromosome file: a single call named after the bam
    if not chr_file:
        bcf = 'genotyping/tmp.' + os.path.split(bam)[1] + '.all.bcf'
        options = ' --bam %s --fa %s --prior %s --pp %f --o %s' % (
            bam, fa, prior, pp, bcf)
        return ([script + options], [bcf])

    # one call per chromosome entry
    calls = []
    outfiles = []
    for entry in get_genome(chr_file):
        bcf = 'genotyping/tmp.' + entry[2] + '.all.bcf'
        outfiles.append(bcf)
        options = ' --bam %s --chr "%s" --fa %s --prior %s --pp %f --o %s' % (
            bam, entry[0], fa, prior, pp, bcf)
        calls.append(script + options)
    return (calls, outfiles)
Beispiel #6
0
def vcf_filter_rmsk(vcfgz, rmsk, vcfgz_out):
   '''Drop variants that overlap the intervals in rmsk.

   When rmsk is empty or the string 'None' the input is simply copied to
   vcfgz_out. Otherwise the '#' lines from the first 1000 lines of the
   unzipped vcf are saved to a randomly named scratch file, the vcf is
   streamed through intersectBed -v against rmsk, and header plus surviving
   records are bgzip-compressed into vcfgz_out.

   A failing shell command raises subprocess.CalledProcessError.
   '''
   
   import random
   import string
   import genobox_modules
   
   paths = genobox_modules.setSystem()
   
   # nothing to filter against: plain copy
   if not rmsk or rmsk == 'None':
      copy_call = 'cp %s %s' % (vcfgz, vcfgz_out)
      logger.info(copy_call)
      subprocess.check_call(copy_call, shell=True)
      return
   
   # save the header under a random name
   alphabet = string.ascii_uppercase + string.digits
   tag = ''.join(random.choice(alphabet) for _ in range(10))
   header = 'genotyping/tmp' + tag + '.header.vcf'
   header_call = '/usr/bin/gunzip -c %s | head -n 1000 | grep "#" > %s' % (vcfgz, header)
   logger.info(header_call)
   subprocess.check_call(header_call, shell=True)
   
   # gunzip | intersectBed -v | re-attach header | bgzip
   unzip_stage = '/usr/bin/gunzip -c %s' % vcfgz
   bgzip_stage = paths['bin_home'] + 'bgzip -c > %s' % vcfgz_out
   bed_stage = paths['bedtools_home'] + 'intersectBed' + ' -v -a stdin -b %s' % rmsk
   header_stage = 'cat %s -' % header
   pipeline = ' | '.join([unzip_stage, bed_stage, header_stage, bgzip_stage])
   logger.info(pipeline)
   subprocess.check_call(pipeline, shell=True)
   
   # remove the scratch header (this command is not logged)
   subprocess.check_call('rm %s' % header, shell=True)
Beispiel #7
0
def vcf_filter_rmsk(vcfgz, rmsk, vcfgz_out):
    """Remove variants that fall inside the intervals listed in rmsk.

    If rmsk is empty or the string "None", vcfgz is copied to vcfgz_out
    unchanged. Otherwise the vcf is unzipped, passed through
    intersectBed -v against rmsk, given back its header lines (the '#'
    lines among the first 1000) and bgzip-compressed into vcfgz_out.

    A failing shell command raises subprocess.CalledProcessError.
    """

    import random
    import string
    import genobox_modules

    paths = genobox_modules.setSystem()

    if not rmsk or rmsk == "None":
        # no repeat annotation given: plain copy
        copy_call = "cp %s %s" % (vcfgz, vcfgz_out)
        logger.info(copy_call)
        subprocess.check_call(copy_call, shell=True)
        return

    # random scratch name for the saved header
    chars = string.ascii_uppercase + string.digits
    suffix = "".join(random.choice(chars) for _ in range(10))
    header = "genotyping/tmp" + suffix + ".header.vcf"

    save_header = '/usr/bin/gunzip -c %s | head -n 1000 | grep "#" > %s' % (vcfgz, header)
    logger.info(save_header)
    subprocess.check_call(save_header, shell=True)

    # assemble the filtering pipeline stage by stage
    compress = paths["bin_home"] + "bgzip -c > %s" % vcfgz_out
    stages = [
        "/usr/bin/gunzip -c %s" % vcfgz,
        paths["bedtools_home"] + "intersectBed" + " -v -a stdin -b %s" % rmsk,
        "cat %s -" % header,
        compress,
    ]
    filter_call = " | ".join(stages)
    logger.info(filter_call)
    subprocess.check_call(filter_call, shell=True)

    # the scratch header is no longer needed (removal is not logged)
    subprocess.check_call("rm %s" % header, shell=True)
def unified_genotyper(bam, genome, fa, dbsnp,  call_conf, call_emit, output_mode):
   '''Build GATK UnifiedGenotyper command lines, one per genome entry.

   For every entry returned by get_genome(genome) a command is created that
   genotypes region entry[2] of the bam, strips lines starting with INFO or
   WARN from GATK's stdout and gzips the rest to
   genotyping/<bam name>.<entry[2]>.raw.vcf.gz. --dbsnp is added when dbsnp
   is set.

   Returns (calls, outfiles): the commands and the vcf.gz each one writes.
   '''
   
   import genobox_modules
   import os
   
   paths = genobox_modules.setSystem()
   gatk_jar = paths['GATK_home'] + 'GenomeAnalysisTK.jar'
   launcher = 'java -Djava.io.tmpdir=/panvol1/simon/tmp/ -XX:ParallelGCThreads=8 -Xms3000m -Xmx3000m -jar ' + gatk_jar
   
   # sample stem shared by all output and log file names
   stem = os.path.split(bam)[1].replace('.bam', '')
   
   calls = []
   outfiles = []
   for entry in get_genome(genome):
      region = entry[2]
      vcf_out = 'genotyping/%s.%s.raw.vcf.gz' % (stem, region)
      log_out = 'log/run_unified_genotyper.%s.%s.log' % (stem, region)
      outfiles.append(vcf_out)
      
      options = ' -T UnifiedGenotyper -R %s -I %s -o /dev/stdout -log %s -stand_call_conf %f -stand_emit_conf %f -L %s -baq CALCULATE_AS_NECESSARY --num_threads 1 -glm BOTH --output_mode %s ' % (fa, bam, log_out, call_conf, call_emit, region, output_mode)
      if dbsnp:
         options += '--dbsnp %s ' % dbsnp
      
      # GATK writes to stdout; drop its INFO/WARN chatter before compressing
      cleanup = " | perl -ne 'if ($_ =~ m/^INFO/ or $_ =~ m/^WARN/) {} else {print $_}' | gzip -c - > %s" % vcf_out
      calls.append(launcher + options + cleanup)
   
   return (calls, outfiles)
Beispiel #9
0
def vcf_filter_prune(vcf, prune, vcfgz_out):
   '''Prune variants within `prune` nt of each other, then bgzip.

   prune == 0: the vcf is only bgzip-compressed into vcfgz_out.
   otherwise:  the '#' lines among the first 1000 lines are saved, the
               pruning R script is run with vcf + '.tmp' as third argument,
               header and tmp file are concatenated and bgzip-compressed
               into vcfgz_out, and the scratch files are removed.

   Each shell command is logged and then executed; a non-zero exit status
   raises subprocess.CalledProcessError.
   '''
   
   paths = genobox_modules.setSystem()
   
   def log_and_run(shell_cmd):
      logger.info(shell_cmd)
      subprocess.check_call(shell_cmd, shell=True)
   
   if prune == 0:
      log_and_run('%sbgzip -c %s > %s' % (paths['bin_home'], vcf, vcfgz_out))
      return
   
   # NOTE(review): fixed scratch name - concurrent calls in one working
   # directory would overwrite each other's header file
   header_vcf = 'genotyping/header.vcf'
   log_and_run('head -n 1000 %s | grep "#" > %s' % (vcf, header_vcf))
   
   # run the pruning script
   scratch = vcf + '.tmp'
   script = paths['genobox_home'] + 'genobox_snppruning.R'
   r_exe = paths['R_home'] + 'R-2.12'
   log_and_run(r_exe + ' --vanilla %i %s %s < %s' % (prune, vcf, scratch, script))
   
   # header + pruned records -> compressed output
   log_and_run('cat %s %s | %sbgzip -c > %s' % (header_vcf, scratch, paths['bin_home'], vcfgz_out))
   
   # remove scratch files
   log_and_run('rm %s %s' % (scratch, header_vcf))
Beispiel #10
0
def bcf2varfilter(bcf, genome, Q, vcf_prefix):
    '''Run the bcf through vcfutils varFilter and a quality cut, per entry.

    For each entry of genome the region entry[0] is extracted with
    `bcftools view`, filtered with `vcfutils.pl varFilter -d entry[4]
    -D entry[5]`, reduced to header lines plus records whose 6th column is
    greater than Q, and written to vcf_prefix + entry[2] + '.vcf'.

    The pipelines are executed immediately; a failure raises
    subprocess.CalledProcessError.

    Returns the list of vcf files written.
    '''

    paths = genobox_modules.setSystem()
    view_cmd = paths['samtools_svn_home'] + 'bcftools view'
    filter_cmd = paths['samtools_svn_home'] + 'vcfutils.pl'

    written = []
    for entry in genome:
        depth_lo = entry[4]
        depth_hi = entry[5]
        out_vcf = vcf_prefix + entry[2] + '.vcf'
        written.append(out_vcf)

        stages = [
            view_cmd + ' %s "%s"' % (bcf, entry[0]),
            filter_cmd + ' varFilter -d%s -D%s' % (depth_lo, depth_hi),
            # keep header lines and records passing the quality threshold
            """ perl -ane 'if ($_ =~ m/^#/) { print $_ } else { if ($F[5] > %f) { print $_ }}' > %s""" % (Q, out_vcf),
        ]
        pipeline = ' | '.join(stages)
        logger.info(pipeline)
        subprocess.check_call(pipeline, shell=True)
    return written
Beispiel #11
0
def bcf2varfilter(bcf, genome, Q, vcf_prefix):
   '''Filter a bcf per genome entry and write one vcf for each.

   Pipeline per entry: `bcftools view <bcf> "<entry[0]>"` piped into
   `vcfutils.pl varFilter -d<entry[4]> -D<entry[5]>` piped into a perl
   one-liner that keeps '#' lines and records whose 6th column exceeds Q.
   Output goes to vcf_prefix + entry[2] + '.vcf'.

   Commands run right away; a failure raises subprocess.CalledProcessError.

   Returns the list of vcf file names.
   '''
   
   paths = genobox_modules.setSystem()
   bcftools_view = paths['samtools_svn_home'] + 'bcftools view'
   vcfutils = paths['samtools_svn_home'] + 'vcfutils.pl'
   
   vcf_files = []
   for entry in genome:
      lower, upper = entry[4], entry[5]
      target = vcf_prefix + entry[2] + '.vcf'
      vcf_files.append(target)
      
      extract = bcftools_view + ' %s "%s"' % (bcf, entry[0])
      depth_filter = vcfutils + ' varFilter -d%s -D%s' % (lower, upper)
      qual_filter = """ perl -ane 'if ($_ =~ m/^#/) { print $_ } else { if ($F[5] > %f) { print $_ }}' > %s""" % (Q, target)
      
      full_call = ' | '.join((extract, depth_filter, qual_filter))
      logger.info(full_call)
      subprocess.check_call(full_call, shell=True)
   return vcf_files
Beispiel #12
0
def paired_trim(args):
    '''Build genobox_trim_pe.py command lines for paired-end fastq files.

    args.pe1 and args.pe2 are walked in lockstep; each pair gets one command
    whose outputs are trimmed/<file name>.trim.fq (or .trim.fq.gz when
    args.gz is set).

    Returns (calls, outfiles_pe1, outfiles_pe2).
    Raises ValueError when pe1 and pe2 differ in length.
    '''

    import os
    import genobox_modules
    paths = genobox_modules.setSystem()

    if len(args.pe1) != len(args.pe2):
        raise ValueError(
            'same number of files must be given to --pe1 and --pe2')

    trimmer = '%sgenobox_trim_pe.py' % (paths['genobox_home'])
    extension = '.trim.fq.gz' if args.gz else '.trim.fq'

    calls = []
    outfiles_pe1 = []
    outfiles_pe2 = []
    for mate1, mate2 in zip(args.pe1, args.pe2):
        outfiles_pe1.append('trimmed/' + os.path.split(mate1)[1] + extension)
        outfiles_pe2.append('trimmed/' + os.path.split(mate2)[1] + extension)

        options = ' --i %s %s --min_length %i --min_baseq %i --min_avgq %i --adaptors %s --min_adaptor_match %i' % (
            mate1, mate2, args.min_length, args.min_baseq, args.min_avgq,
            ' '.join(args.adaptors), args.min_adaptor_match)
        # optional switches
        if args.keep_n:
            options += ' --keep_n'
        if args.gz:
            options += ' --gz'
        calls.append(trimmer + options)
    return (calls, outfiles_pe1, outfiles_pe2)
Beispiel #13
0
def mpileup(bam, chr_file, fa, prior, pp):
   '''Create genobox_mpileup.py calls for SNP calling on a bam file.

   chr_file given: one call per entry of get_genome(chr_file), restricted
                   with --chr "<entry[0]>" and writing
                   genotyping/tmp.<entry[2]>.all.bcf
   otherwise:      one call writing genotyping/tmp.<bam name>.all.bcf

   Returns (calls, outfiles) in matching order.
   '''
   
   import genobox_modules
   import os
   
   paths = genobox_modules.setSystem()
   caller = paths['genobox_home'] + 'genobox_mpileup.py'
   
   if chr_file:
      # (call, bcf) pairs, one per chromosome entry
      jobs = []
      for entry in get_genome(chr_file):
         bcf_out = 'genotyping/tmp.' + entry[2] + '.all.bcf'
         opts = ' --bam %s --chr "%s" --fa %s --prior %s --pp %f --o %s' % (bam, entry[0], fa, prior, pp, bcf_out)
         jobs.append((caller + opts, bcf_out))
   else:
      bcf_out = 'genotyping/tmp.' + os.path.split(bam)[1] + '.all.bcf'
      opts = ' --bam %s --fa %s --prior %s --pp %f --o %s' % (bam, fa, prior, pp, bcf_out)
      jobs = [(caller + opts, bcf_out)]
   
   calls = [call for call, _ in jobs]
   outfiles = [bcf for _, bcf in jobs]
   return (calls, outfiles)
Beispiel #14
0
def get_best_assembly(args):
   '''Build the R call that picks the best assembly among several k-mers.

   args.ksizes is interpreted as:
     1 value  -> a single assembly in args.outpath
     2 values -> k in range(start, stop, 2)
     3 values -> k in range(start, stop, step)
   For the multi-k forms every assembly contributes
   '<outpath>_<k>/stats.txt <k>' to the argument list of
   genobox_denovo_velvet_parse.R.

   Returns a one-element list holding the command string.
   Raises ValueError if args.ksizes holds no values or more than three
   (previously this surfaced as an UnboundLocalError on `arg` / `step`).
   '''
   
   # validate before touching the system configuration
   n_values = len(args.ksizes)
   if n_values < 1 or n_values > 3:
      raise ValueError('ksizes must hold 1 (k), 2 (start stop) or 3 (start stop step) values, got %i' % n_values)
   
   import genobox_modules
   paths = genobox_modules.setSystem()
   
   cmd = '%sR-2.12 --vanilla ' % paths['R_home']
   
   if n_values == 1:
      # NOTE(review): unlike the multi-k form this passes the output
      # directory itself, not '<dir>/stats.txt' - confirm against the R script
      arg = ' %s %s' % (args.outpath, args.ksizes[0])
   else:
      step = 2 if n_values == 2 else args.ksizes[2]
      k_values = range(int(args.ksizes[0]), int(args.ksizes[1]), int(step))
      arg = ' '.join('%s_%s/stats.txt %s' % (args.outpath, k, k) for k in k_values)
   
   return [cmd + arg + ' < %sgenobox_denovo_velvet_parse.R' % (paths['genobox_home'])]
Beispiel #15
0
def clean():
   '''Return the call (as a one-element list) that cleans the sample directory.'''
   
   import genobox_modules
   
   # the clean-up script lives in the genobox home directory
   genobox_home = genobox_modules.setSystem()['genobox_home']
   return ['%sgenobox_denovo_velvet_clean.py' % (genobox_home)]
Beispiel #16
0
def accept_assembly(args):
   '''Return the call (as a one-element list) that runs
   genobox_denovo_velvet_accept.py on args.outpath.'''
   
   import genobox_modules
   
   genobox_home = genobox_modules.setSystem()['genobox_home']
   accept_call = '%sgenobox_denovo_velvet_accept.py %s' % (genobox_home, args.outpath)
   return [accept_call]
Beispiel #17
0
def picardFilterSort(i, q, o):
    '''Filter a bam on mapping quality and coordinate-sort it with picard.

    i: input bam, q: minimum value of the 5th SAM column to keep, o: output.

    The pipeline is ViewSam (aligned reads only) | perl quality filter |
    SortSam. It is run with subprocess.call, so the exit status is NOT
    checked and nothing is returned.
    '''

    paths = genobox_modules.setSystem()
    java = paths['java_home']
    picard = paths['picard_home']

    # dump aligned reads as SAM text
    view_stage = '%sjava -XX:ParallelGCThreads=8 -XX:+UseParallelGC -XX:-UsePerfData -Xms1500m -Xmx1500m -jar %s/ViewSam.jar INPUT=%s ALIGNMENT_STATUS=Aligned VALIDATION_STRINGENCY=LENIENT' % (java, picard, i)
    # keep header lines and reads at or above the quality threshold
    filter_stage = "perl -ane 'if ($_ =~ m/^@/) {print $_;} else {if ($F[4] >= %i) { print $_ }}'" % q
    # sort by coordinate and write the output
    sort_stage = '%sjava -XX:ParallelGCThreads=8 -XX:+UseParallelGC -XX:-UsePerfData -Xms4500m -Xmx4500m -jar %s/SortSam.jar INPUT=/dev/stdin OUTPUT=%s SORT_ORDER=coordinate VALIDATION_STRINGENCY=LENIENT TMP_DIR=/panvol1/simon/tmp MAX_RECORDS_IN_RAM=1000000' % (java, picard, o)

    pipeline = ' | '.join([view_stage, filter_stage, sort_stage])
    subprocess.call(pipeline, shell=True)
Beispiel #18
0
def vcf_tabix(vcf_gz):
    """Index a vcf.gz with tabix (-p vcf -f); the command is logged first.

    Raises subprocess.CalledProcessError if tabix fails.
    """

    tabix = genobox_modules.setSystem()["bin_home"] + "tabix"
    index_call = tabix + " -p vcf -f %s" % (vcf_gz)
    logger.info(index_call)
    subprocess.check_call(index_call, shell=True)
Beispiel #19
0
def vcf_tabix(vcf_gz):
   '''Log and run `tabix -p vcf -f` on a vcf.gz file.

   Raises subprocess.CalledProcessError if tabix fails.
   '''
   
   paths = genobox_modules.setSystem()
   
   options = 'tabix -p vcf -f %s' % (vcf_gz)
   command = paths['bin_home'] + options
   logger.info(command)
   subprocess.check_call(command, shell=True)
Beispiel #20
0
 def merge(reads, format, interleaved):
    '''Build the velvet shuffleSequences call that interleaves two read files.

    reads:       sequence whose first two items are the mate files
    format:      read format name; must contain 'fastq' or 'fasta'
                 ('fasta' wins if both occur, as before)
    interleaved: name of the interleaved output file

    Returns the command string.
    Raises ValueError for a format containing neither 'fastq' nor 'fasta'
    (previously this surfaced as an UnboundLocalError on `cmd`).
    '''
    
    # pick the script first so a bad format fails before any setup
    if 'fasta' in format:
       script = 'shuffleSequences_fasta.pl'
    elif 'fastq' in format:
       script = 'shuffleSequences_fastq.pl'
    else:
       raise ValueError('unsupported read format %r: expected a name containing "fastq" or "fasta"' % (format,))
    
    import genobox_modules
    paths = genobox_modules.setSystem()
    
    # shuffle <file1> <file2> <out>
    return '%s%s %s %s %s' % (paths['velvet_home'], script, reads[0], reads[1], interleaved)
Beispiel #21
0
def start_genotyping(bam, chr, fa, prior, pp, queue, o, sample, partition, logger):
   '''Submit the samtools genotyping job chain for a bam and wait for it.

   Chain: bam index -> mpileup calls (expanded after the index job) ->
   bcf combine into `o` (after all mpileup jobs) -> bcf index. The function
   blocks on a Semaphore for the bcf index job, removes the intermediate
   bcf files and returns `o`.

   The consensus calls are still generated but are not submitted.
   '''
   
   import genobox_modules
   from genobox_classes import Moab
   from genobox_classes import Semaphore
   import os
   
   if not os.path.exists('genotyping'):
      os.makedirs('genotyping')
   
   # set queueing
   genobox_modules.setSystem()
   home = os.getcwd()
   one_core_2gb = 'nodes=1:ppn=1,mem=2gb,walltime=172800'
   two_cores_2gb = 'nodes=1:ppn=2,mem=2gb,walltime=172800'
   
   # create calls
   bamindex_calls = bam_index(bam)
   mpileup_calls, bcffiles = mpileup(bam, chr, fa, prior, pp)
   bcfcombine_calls = bcf_combine(bcffiles, o)
   bcfindex_calls = bcf_index(o)
   consensus(o, sample)   # result unused: the consensus job is disabled
   
   # keyword arguments common to every submission
   shared = dict(logfile=logger, queue=queue, partition=partition)
   
   print("Submitting jobs")
   bamindex_moab = Moab(bamindex_calls, runname='run_genobox_bamindex', cpu=one_core_2gb, **shared)
   mpileup_moab = Moab(mpileup_calls, runname='run_genobox_mpileup', cpu=two_cores_2gb,
                       depend=True, depend_type='expand', depend_val=[len(mpileup_calls)],
                       depend_ids=bamindex_moab.ids, **shared)
   bcfcombine_moab = Moab(bcfcombine_calls, runname='run_genobox_bcfcombine', cpu=one_core_2gb,
                          depend=True, depend_type='conc', depend_val=[len(mpileup_calls)],
                          depend_ids=mpileup_moab.ids, **shared)
   bcfindex_moab = Moab(bcfindex_calls, runname='run_genobox_bcfindex', cpu=one_core_2gb,
                        depend=True, depend_type='one2one', depend_val=[1],
                        depend_ids=bcfcombine_moab.ids, **shared)
   
   # no explicit release calls are made here
   print("Releasing jobs")
   
   # semaphore on the last job of the chain
   print("Waiting for jobs to finish ...")
   Semaphore(bcfindex_moab.ids, home, 'genotyping', queue, 20, 2*86400).wait()
   print("--------------------------------------")
   
   # remove temporary per-region bcf files
   genobox_modules.rm_files(bcffiles)
   
   return o
Beispiel #22
0
def start_vcffilter_gatk(vcfs, genome, fa, Q, rmsk, ab, prune, queue, dir,
                         partition, logger):
    '''Submit one genobox_vcffilter_gatk_h.py job per vcf and wait for them.

    --rmsk, --ab and --prune are only added when rmsk is set, ab differs
    from 0.5 and prune differs from 0. The function creates the
    'genotyping' and 'tmp' directories if needed, submits the jobs through
    Moab and blocks on a Semaphore until they are done. Nothing is returned.
    '''

    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore
    import os

    for workdir in ('genotyping', 'tmp'):
        if not os.path.exists(workdir):
            os.makedirs(workdir)

    # set queueing
    paths = genobox_modules.setSystem()
    home = os.getcwd()
    two_cores_7gb = 'nodes=1:ppn=2,mem=7gb,walltime=172800'

    script = paths['genobox_home'] + 'genobox_vcffilter_gatk_h.py'

    def filter_call(vcf):
        # mandatory options first, optional filters appended when active
        parts = [' --vcf %s --fa %s --genome %s --Q %f' % (vcf, fa, genome, Q)]
        if rmsk:
            parts.append(' --rmsk %s' % rmsk)
        if ab != 0.5:
            parts.append(' --ab %f' % ab)
        if prune != 0:
            parts.append(' --prune %i' % prune)
        return script + ''.join(parts)

    vcffilter_calls = [filter_call(v) for v in vcfs]

    # submit jobs
    print("Submitting jobs")
    vcffilter_moab = Moab(vcffilter_calls, logfile=logger,
                          runname='run_genobox_vcffilter_gatk', queue=queue,
                          cpu=two_cores_7gb, partition=partition)

    # no explicit release call is made here
    print("Releasing jobs")

    # semaphore
    print("Waiting for jobs to finish ...")
    waiter = Semaphore(vcffilter_moab.ids, home, 'vcffilter_gatk', queue, 20,
                       2 * 86400)
    waiter.wait()
    print("--------------------------------------")
Beispiel #23
0
def bcf_index(bcf):
    '''Return the `bcftools index` call for a bcf as a one-element list.'''

    import genobox_modules

    bcftools = genobox_modules.setSystem()['samtools_home'] + 'bcftools'
    return [bcftools + ' index %s' % (bcf)]
Beispiel #24
0
def get_saturation(bam):
   '''Build the two calls for the saturation analysis of a bam.

   Returns [subsample_call, plot_call]: genobox_bamsaturation.py on the bam,
   then the R plotting script writing stats/<bam name>.saturation.
   '''
   
   import os
   import genobox_modules
   paths = genobox_modules.setSystem()
   
   plot_name = os.path.split(bam)[1] + '.saturation'
   
   subsample_opts = 'genobox_bamsaturation.py --bams %s --subsample --sample stats --blocks 20 | cat -' % (bam)
   subsample_call = paths['genobox_home'] + subsample_opts
   
   # NOTE(review): R-2.12 is called without a path prefix here
   plot_call = 'R-2.12 --vanilla stats/stats_Map.txt stats/stats_Map.txt stats/%s < %sgenobox_bamsaturation_plot.R' % (plot_name, paths['genobox_home'])
   
   return [subsample_call, plot_call]
Beispiel #25
0
def start_dbsnp(vcf, ex, dbsnp, o, queue, partition, logger):
    '''Submit the dbSNP annotation job for a vcf.gz and wait for it.

    When dbsnp is empty or the string 'None' nothing is submitted and the
    input vcf is returned. Otherwise genobox_dbsnp_h.py is submitted with
    --vcf, --ex, --dbsnp and --o, the function blocks on a Semaphore until
    the job is done, and `o` is returned.
    '''

    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore
    import os

    nothing_to_do = not dbsnp or dbsnp == 'None'
    if nothing_to_do:
        print("No dbsnp file given - skipping")
        print("--------------------------------------")
        return vcf

    if not os.path.exists('genotyping'):
        os.makedirs('genotyping')

    # set queueing
    paths = genobox_modules.setSystem()
    home = os.getcwd()
    one_core_2gb = 'nodes=1:ppn=1,mem=2gb,walltime=172800'

    # create command
    script = paths['genobox_home'] + 'genobox_dbsnp_h.py'
    options = ' --vcf %s --ex %s --dbsnp %s --o %s' % (vcf, ex, dbsnp, o)

    # submit jobs
    print("Submitting jobs")
    dbsnp_moab = Moab([script + options], logfile=logger,
                      runname='run_genobox_dbsnp', queue=queue,
                      cpu=one_core_2gb, partition=partition)

    # no explicit release call is made here
    print("Releasing jobs")

    # semaphore
    print("Waiting for jobs to finish ...")
    Semaphore(dbsnp_moab.ids, home, 'dbsnp', queue, 20, 2 * 86400).wait()
    print("--------------------------------------")

    return o
Beispiel #26
0
def bcf_combine(bcfs, outfile):
    '''Return the `bcftools cat` call (one-element list) that concatenates
    bcfs into outfile.'''

    import genobox_modules

    bcftools = genobox_modules.setSystem()['samtools_home'] + 'bcftools'
    inputs = ' '.join(bcfs)
    return [bcftools + ' cat %s > %s' % (inputs, outfile)]
Beispiel #27
0
def bcf_index(bcf):
   '''Build the call that indexes a bcf file; returned as a one-element list.'''
   
   import genobox_modules
   paths = genobox_modules.setSystem()
   
   index_call = paths['samtools_home'] + 'bcftools' + ' index %s' % (bcf)
   return [index_call]
Beispiel #28
0
def bcf_combine(bcfs, outfile):
   '''Build the call that concatenates several bcfs into a single bcf.

   Returns a one-element list with the `bcftools cat` command.
   '''
   
   import genobox_modules
   paths = genobox_modules.setSystem()
   
   cat_options = ' cat %s > %s' % (' '.join(bcfs), outfile)
   combine_call = paths['samtools_home'] + 'bcftools' + cat_options
   return [combine_call]
Beispiel #29
0
def bed_genomeCov(bam):
   '''Build the `bedtools genomecov` call for a bam.

   Output is redirected to stats/<bam file name>.coverage. Returns a
   one-element list with the command.
   '''
   
   import os
   import genobox_modules
   paths = genobox_modules.setSystem()
   
   # file name without directories (the input is an absolute path)
   bam_name = os.path.split(bam)[1]
   
   coverage_opts = 'bedtools genomecov -ibam %s > stats/%s.coverage' % (bam, bam_name)
   return [paths['bedtools_bin'] + coverage_opts]
Beispiel #30
0
def sam_flagstat(bam):
   '''Build the `samtools flagstat` call for a bam.

   Output is redirected to stats/<bam file name>.flagstat. Returns a
   one-element list with the command.
   '''
   
   import os
   import genobox_modules
   paths = genobox_modules.setSystem()
   
   # file name without directories (the input is an absolute path)
   bam_name = os.path.split(bam)[1]
   
   flagstat_opts = 'samtools flagstat %s > stats/%s.flagstat' % (bam, bam_name)
   return [paths['samtools_home'] + flagstat_opts]
Beispiel #31
0
def vcf_bgzip_tabix(vcf):
    '''Compress a vcf with bgzip -f, then index <vcf>.gz with tabix.

    Both commands are logged before they run; a failure raises
    subprocess.CalledProcessError.
    '''

    paths = genobox_modules.setSystem()

    # compress first, then index the resulting .gz
    for template in ('bgzip -f %s', 'tabix -p vcf -f %s.gz'):
        command = paths['bin_home'] + template % vcf
        logger.info(command)
        subprocess.check_call(command, shell=True)
Beispiel #32
0
def plot_coverage(bam):
   '''Build the R call that plots coverage from the genomeCoverageBed output.

   Reads stats/<bam name>.coverage and writes stats/<bam name>.coverage.pdf.
   Returns a one-element list with the command.
   '''
   
   import os
   import genobox_modules
   paths = genobox_modules.setSystem()
   
   # file name without directories (the input is an absolute path)
   bam_name = os.path.split(bam)[1]
   
   # NOTE(review): R-2.12 is called without a path prefix here
   fields = (bam_name, bam_name, bam_name, paths['genobox_home'])
   plot_call = 'R-2.12 --vanilla stats/%s.coverage %s stats/%s.coverage.pdf < %sgenobox_plotcov.R' % fields
   return [plot_call]
Beispiel #33
0
def bwasw_iontorrent(fastqs, fa, fqtypes, alignpath, bwa6, library, threads, queue, partition, logger):
   '''Submit BWA-SW alignments (Iontorrent data), one job per fastq file.

   fastqs:    fastq files to align
   fa:        reference handed to `bwa bwasw`
   fqtypes:   quality type per fastq; only 'Sanger' is accepted
   alignpath: prefix for the output bam names (<alignpath><fastq name>.bam)
   bwa6:      use the bwa found under paths['bwa_6_2_home'] instead of
              paths['bwa_home']
   library:   not used by this function
   threads:   threads per alignment; also selects the resource request
   queue, partition, logger: passed on to Moab

   Returns (job ids, dict mapping each fastq to its bam file).
   Raises ValueError for an 'Illumina' or otherwise unsupported quality type.
   Previously an unknown type either failed with UnboundLocalError or
   silently re-submitted the previous file's command.
   '''
   
   # reject unsupported quality encodings before anything is set up or submitted
   fastqs = list(fastqs)
   for i, fq in enumerate(fastqs):
      if fqtypes[i] == 'Illumina':
         raise ValueError('BWA-SW should not align reads with Illumina Qualities')
      if fqtypes[i] != 'Sanger':
         raise ValueError('unsupported fastq quality type %r for %s' % (fqtypes[i], fq))
   
   import genobox_modules
   from genobox_classes import Moab
   import os
   paths = genobox_modules.setSystem()
   
   # resource request depends on thread count and partition
   if threads == 1:
      cpu = 'nodes=1:ppn=1,mem=7gb,walltime=172800'
   elif partition == 'uv' or partition == 'uv2':
      cpu = 'procs=%s,mem=5gb,walltime=172800,flags=sharedmem' % threads
   elif threads > 8:
      cpu = 'nodes=1:ppn=%s,mem=7gb,walltime=172800' % threads
   else:
      cpu = 'nodes=1:ppn=%s,mem=5gb,walltime=172800' % threads
   
   # pick the bwa binary
   if bwa6:
      cmd = paths['bwa_6_2_home'] + 'bwa '
   else:
      cmd = paths['bwa_home'] + 'bwa '
   
   # one alignment call per fastq, piped straight into a bam
   bwa_align = []
   bamfiles_dict = dict()
   for fq in fastqs:
      bamfile = alignpath + os.path.split(fq)[1] + '.bam'
      bamfiles_dict[fq] = bamfile
      arg = ' bwasw -t %i %s %s |  %ssamtools view -Sb - > %s' % (threads, fa, fq, paths['samtools_home'], bamfile)
      bwa_align.append(cmd+arg)
   
   # create moab instance for the align calls and dispatch to queue
   bwa_align_moab = Moab(bwa_align, logfile=logger, runname='run_genobox_bwaalign', queue=queue, cpu=cpu, partition=partition)
   
   # no explicit release call is made here
   print("Releasing jobs")
   
   return (bwa_align_moab.ids, bamfiles_dict)
Beispiel #34
0
def python_avgdepth(bam):
   '''Build the call that runs genobox_bam2avgdepth1.py on a bam.

   Output is redirected to stats/<bam file name>.avgdepth. Returns a
   one-element list with the command.
   '''
   
   import os
   import genobox_modules
   paths = genobox_modules.setSystem()
   
   # file name without directories (the input is an absolute path)
   bam_name = os.path.split(bam)[1]
   
   depth_opts = 'genobox_bam2avgdepth1.py %s > stats/%s.avgdepth' % (bam, bam_name)
   return [paths['genobox_home'] + depth_opts]
Beispiel #35
0
def vcf_bgzip_tabix(vcf):
   '''bgzip a vcf in place (-f) and tabix-index the resulting <vcf>.gz.

   Each command is logged, then executed; a failure raises
   subprocess.CalledProcessError.
   '''
   
   paths = genobox_modules.setSystem()
   
   def log_and_run(shell_cmd):
      logger.info(shell_cmd)
      subprocess.check_call(shell_cmd, shell=True)
   
   log_and_run(paths['bin_home'] + 'bgzip -f %s' % vcf)
   log_and_run(paths['bin_home'] + 'tabix -p vcf -f %s.gz' % (vcf))
Beispiel #36
0
def start_vcffilter_gatk(vcfs, genome, fa, Q, rmsk, ab, prune, queue, dir, partition, logger):
   '''Start the gatk variant vcf-filter: one job per vcf, then wait.

   Each job runs genobox_vcffilter_gatk_h.py with --vcf, --fa, --genome and
   --Q, plus --rmsk / --ab / --prune when rmsk is set, ab != 0.5 and
   prune != 0. The 'genotyping' and 'tmp' directories are created if
   missing. The function blocks until the jobs are done and returns nothing.
   '''
   
   import genobox_modules
   from genobox_classes import Moab
   from genobox_classes import Semaphore
   import os
   
   if not os.path.exists('genotyping'):
      os.makedirs('genotyping')
   if not os.path.exists('tmp'):
      os.makedirs('tmp')
   
   # set queueing
   paths = genobox_modules.setSystem()
   home = os.getcwd()
   resources = 'nodes=1:ppn=2,mem=7gb,walltime=172800'
   
   filter_script = paths['genobox_home'] + 'genobox_vcffilter_gatk_h.py'
   
   # one call per vcf
   vcffilter_calls = []
   for vcf in vcfs:
      options = [' --vcf %s --fa %s --genome %s --Q %f' % (vcf, fa, genome, Q)]
      if rmsk:
         options.append(' --rmsk %s' % rmsk)
      if ab != 0.5:
         options.append(' --ab %f' % ab)
      if prune != 0:
         options.append(' --prune %i' % prune)
      vcffilter_calls.append(filter_script + ''.join(options))
   
   # submit jobs
   print("Submitting jobs")
   vcffilter_moab = Moab(vcffilter_calls, logfile=logger, runname='run_genobox_vcffilter_gatk', queue=queue, cpu=resources, partition=partition)
   
   # no explicit release call is made here
   print("Releasing jobs")
   
   # semaphore
   print("Waiting for jobs to finish ...")
   Semaphore(vcffilter_moab.ids, home, 'vcffilter_gatk', queue, 20, 2*86400).wait()
   print("--------------------------------------")
   
   # NOTE(review): no file name is returned although the original trailing
   # comment announced one
   
Beispiel #37
0
def mapdamage(bam, fa):
   '''Build the shell calls for running mapdamage on a bam file.

   Returns a list with two command strings: the mapdamage-0.3.6.pl "map"
   invocation for bam against the reference fa, followed by an "mv" into
   the stats/ directory.
   '''

   import os
   import genobox_modules

   paths = genobox_modules.setSystem()

   # name used inside stats/: the bam file name without its directory,
   # prefixed with "mapdamage_"
   _, bam_name = os.path.split(bam)
   stats_name = 'mapdamage_' + bam_name

   run_call = '%smapdamage-0.3.6.pl map -i %s -r %s -c' % (paths['mapdamage_home'], bam, fa)

   # NOTE(review): the source of this mv is the input bam itself, not an
   # output of mapdamage - confirm this is the intended "results file"
   move_call = 'mv %s stats/%s' % (bam, stats_name)

   return [run_call, move_call]
Beispiel #38
0
def start_dbsnp(vcf, ex, dbsnp, o, queue, partition, logger):
   '''Submit the dbSNP annotation job for a vcf.gz file and wait for it.

   A single genobox_dbsnp_h.py command (--vcf, --ex, --dbsnp, --o) is
   submitted through Moab (logger is passed as its logfile) and a Semaphore
   waits on the job id.  The helper script is expected to exchange
   chromosome names to the dbSNP version and sort the inputs (taken from
   the original description; not visible in this function).

   Returns o after the job has finished.  When dbsnp is empty or the string
   'None', nothing is submitted and vcf is returned unchanged.
   '''

   import genobox_modules
   from genobox_classes import Moab
   from genobox_classes import Semaphore
   import subprocess
   import os

   # nothing to annotate with: hand the input straight back
   no_dbsnp = (not dbsnp) or dbsnp == 'None'
   if no_dbsnp:
      print("No dbsnp file given - skipping")
      print("--------------------------------------")
      return vcf

   if not os.path.exists('genotyping'):
      os.makedirs('genotyping')

   # queueing setup; only profile 'C' is used below, the others are kept
   # as ready-made alternatives
   paths = genobox_modules.setSystem()
   home = os.getcwd()
   resources = {
      'A': 'nodes=1:ppn=1,mem=512mb,walltime=172800',
      'C': 'nodes=1:ppn=1,mem=2gb,walltime=172800',
      'E': 'nodes=1:ppn=1,mem=5gb,walltime=172800',
      'F': 'nodes=1:ppn=2,mem=2gb,walltime=172800',
      'B': 'nodes=1:ppn=16,mem=10gb,walltime=172800',
   }

   # the single annotation command
   script = paths['genobox_home'] + 'genobox_dbsnp_h.py'
   dbsnp_calls = [script + ' --vcf %s --ex %s --dbsnp %s --o %s' % (vcf, ex, dbsnp, o)]

   # submit jobs
   print("Submitting jobs")
   dbsnp_moab = Moab(dbsnp_calls, logfile=logger, runname='run_genobox_dbsnp', queue=queue, cpu=resources['C'], partition=partition)

   # the explicit dbsnp_moab.release() call is disabled here; only the
   # message is printed
   print("Releasing jobs")

   # block until the submitted job is done
   print("Waiting for jobs to finish ...")
   waiter = Semaphore(dbsnp_moab.ids, home, 'dbsnp', queue, 20, 2*86400)
   waiter.wait()
   print("--------------------------------------")

   return o
Beispiel #39
0
def consensus(bcf, sample):
   '''Build the call that writes a consensus fastq from a bcf file.

   The output goes to genotyping/<sample>.cns.fq, or genotyping/cns.fq when
   sample is the string 'None'.  Returns a one-element list holding the
   "bcftools view | vcfutils.pl vcf2fq" pipeline.
   '''

   import genobox_modules

   samtools_dir = genobox_modules.setSystem()['samtools_home']

   # file name inside genotyping/ depends on whether a sample name is given
   fq_name = 'cns.fq' if sample == 'None' else '%s.cns.fq' % sample
   consensus_fq = 'genotyping/' + fq_name

   pipeline = '%sbcftools view %s | %svcfutils.pl vcf2fq > %s' % (samtools_dir, bcf, samtools_dir, consensus_fq)
   return [pipeline]
Beispiel #40
0
def bam_index(bam):
   '''Build the call that indexes a bam file.

   Returns a one-element list: the "samtools index" command, or a short
   "sleep" placeholder when <bam>.bai already exists.
   '''

   import genobox_modules
   import os.path

   paths = genobox_modules.setSystem()

   # index already there: emit a no-op call instead of re-indexing
   if os.path.isfile(bam + '.bai'):
      return ['sleep 0.01']

   samtools = paths['samtools_home'] + 'samtools'
   return [samtools + ' index %s' % (bam)]
Beispiel #41
0
def consensus(bcf, sample):
    '''Return the shell call that turns a bcf file into a consensus fastq.

    The fastq is written to genotyping/cns.fq when sample is the string
    'None', otherwise to genotyping/<sample>.cns.fq.  The result is a list
    containing the single pipeline command.
    '''

    import genobox_modules
    paths = genobox_modules.setSystem()

    # default target, overridden when a sample name is supplied
    consensus_fq = 'genotyping/cns.fq'
    if sample != 'None':
        consensus_fq = 'genotyping/%s.cns.fq' % sample

    samtools_home = paths['samtools_home']
    return [
        '%sbcftools view %s | %svcfutils.pl vcf2fq > %s' % (
            samtools_home, bcf, samtools_home, consensus_fq)
    ]
Beispiel #42
0
def vcf_annotate_dbsnp(vcfgz, dbsnp, vcf_out_gz):
    """Write vcf_out_gz as vcfgz annotated with dbsnp rsIDs.

    With a dbsnp file, vcfgz is piped through gunzip, fill-rsIDs and bgzip
    into vcf_out_gz.  When dbsnp is empty or the string "None", vcfgz is
    simply copied to vcf_out_gz.  The command is logged and run through the
    shell; a non-zero exit status raises subprocess.CalledProcessError.
    """

    paths = genobox_modules.setSystem()

    if not dbsnp or dbsnp == "None":
        # nothing to annotate with: plain copy
        shell_call = "cp %s %s" % (vcfgz, vcf_out_gz)
    else:
        bin_home = paths["bin_home"]
        decompress = "/usr/bin/gunzip -c %s" % vcfgz
        annotate = bin_home + "fill-rsIDs -r %s | %sbgzip -c > %s" % (dbsnp, bin_home, vcf_out_gz)
        shell_call = "%s | %s" % (decompress, annotate)

    logger.info(shell_call)
    subprocess.check_call(shell_call, shell=True)
Beispiel #43
0
def vcf_annotate_dbsnp(vcfgz, dbsnp, vcf_out_gz):
   '''Annotate a vcf.gz file with dbsnp and write the result to vcf_out_gz.

   If dbsnp is empty or the string 'None' the input is copied to vcf_out_gz
   unchanged; otherwise it is streamed through gunzip, fill-rsIDs and bgzip.
   The command is logged before being executed through the shell.
   '''

   paths = genobox_modules.setSystem()

   have_dbsnp = dbsnp and dbsnp != 'None'
   if have_dbsnp:
      # gunzip -> fill-rsIDs -> bgzip, all in one shell pipeline
      stages = [
         '/usr/bin/gunzip -c %s' % vcfgz,
         paths['bin_home'] + 'fill-rsIDs -r %s | %sbgzip -c > %s' % (dbsnp, paths['bin_home'], vcf_out_gz),
      ]
      command = '%s | %s' % tuple(stages)
   else:
      command = 'cp %s %s' % (vcfgz, vcf_out_gz)

   logger.info(command)
   subprocess.check_call(command, shell=True)
Beispiel #44
0
def bam_index(bam):
    '''Return the call list for indexing a bam file.

    The list holds one command: "samtools index <bam>" when <bam>.bai does
    not exist yet, otherwise a "sleep 0.01" placeholder.
    '''

    import genobox_modules
    import os.path
    paths = genobox_modules.setSystem()

    # skip index creation if it already exists
    index_exists = os.path.isfile(bam + '.bai')
    call = 'sleep 0.01' if index_exists else paths['samtools_home'] + 'samtools' + ' index %s' % (bam)
    return [call]
Beispiel #45
0
def write_indels_for_filtering(vcf, ex):
   '''Write header and INDEL lines of a gzipped vcf to genotyping/indels_for_filtering.vcf.

   The vcf is decompressed and filtered with a perl one-liner that keeps
   lines starting with "#" and lines matching INDEL.  When ex is given (not
   empty and not the string 'None') the stream is additionally piped through
   genobox_exchangeids.py --b <ex> before being written.  The command is
   logged and executed through the shell.
   '''

   import genobox_modules
   import subprocess

   paths = genobox_modules.setSystem()

   # shared front of the pipeline: decompress, keep header + indel records
   extract = '''gzip -dc %s | perl -ne 'if ($_ =~ m/^#/) { print $_ } else { if ($_ =~ INDEL) { print $_ }}' ''' % (vcf)

   if ex and ex != 'None':
      # translate ids on the fly before writing
      ex_call = '%sgenobox_exchangeids.py --b %s' % (paths['genobox_home'], ex)
      call = extract + '''| %s > genotyping/indels_for_filtering.vcf ''' % (ex_call)
   else:
      call = extract + '''> genotyping/indels_for_filtering.vcf '''

   logger.info(call)
   subprocess.check_call(call, shell=True)
Beispiel #46
0
def extract_unmapped_reads(bamfiles):
    '''Build the calls that extract unmapped reads from each bam file.

    bamfiles maps a key to a bam path.  For every entry a
    "samtools view -h -b -f 4" call writing <bam>.unmapped.bam is created.
    Returns a tuple (calls, unmapped) where unmapped maps the same keys to
    the output paths.
    '''

    import genobox_modules

    samtools_dir = genobox_modules.setSystem()['samtools_home']
    view_unmapped = '%ssamtools view -h -b -f 4' % (samtools_dir)

    calls = []
    unmapped = {}
    for key, bam in bamfiles.items():
        target = bam + '.unmapped.bam'
        unmapped[key] = target
        calls.append('%s %s > %s' % (view_unmapped, bam, target))

    return (calls, unmapped)
Beispiel #47
0
def write_indels_for_filtering(var_vcf, ex, indel_vcf):
    '''Create the indels-for-filtering vcf from a variant vcf.

    Two shell commands are logged and run in sequence:

    1. lines of var_vcf that contain no "#" and contain "INDEL" are
       concatenated after header.vcf into tmp_file_indels (both files are
       relative to the current working directory);
    2. genobox_exchangeids.py is run on tmp_file_indels with ex as --b and
       indel_vcf as --o.

    Raises subprocess.CalledProcessError if either command exits non-zero.
    '''

    import genobox_modules
    import subprocess

    paths = genobox_modules.setSystem()

    grep_call = 'grep -v \"#\" %s | grep "INDEL" | cat header.vcf - > tmp_file_indels' % (
        var_vcf)
    logger.info(grep_call)
    subprocess.check_call(grep_call, shell=True)

    ex_cmd = paths['genobox_home'] + 'genobox_exchangeids.py'
    ex_arg = ' --a tmp_file_indels --x 0 --b %s --o %s' % (ex, indel_vcf)
    # fixed: this used the undefined names cmd/arg and raised NameError
    # before the id exchange could ever be run
    ex_call = ex_cmd + ex_arg
    logger.info(ex_call)
    subprocess.check_call(ex_call, shell=True)
Beispiel #48
0
def merge_bam(libs,
              lib_infiles,
              add_suffix=False,
              final_suffix='',
              tmpdir='/panvol1/simon/tmp/'):
    '''Build the calls that merge bam files into one bam per library.

    libs and lib_infiles are parallel sequences: lib_infiles[i] holds the
    input files of libs[i].  With add_suffix set, '.flt.sort.bam' is
    appended to every input name.  The output of each library is
    <lib><final_suffix>, placed under 'alignment/' unless lib already
    starts with that prefix.

    A library with exactly one input gets a plain "cp"; otherwise a Picard
    MergeSamFiles call using tmpdir as TMP_DIR is produced.

    Returns a tuple (calls, outfiles), both ordered like libs.
    '''

    import genobox_modules
    paths = genobox_modules.setSystem()

    java_call = paths[
        'java_home'] + 'java -XX:ParallelGCThreads=8 -XX:+UseParallelGC -XX:-UsePerfData -Xms4500m -Xmx4500m -jar '
    picard_cmd = paths['picard_home'] + 'MergeSamFiles.jar'
    merge_prefix = java_call + picard_cmd

    calls = []
    outfiles = []
    for idx, lib in enumerate(libs):
        # inputs, optionally extended with the filter+sort suffix (used when
        # the original, pre-filter file names are given)
        if add_suffix:
            list_bams = [infile + '.flt.sort.bam' for infile in lib_infiles[idx]]
        else:
            list_bams = lib_infiles[idx]

        # output always lives under alignment/
        prefix = '' if lib.startswith('alignment/') else 'alignment/'
        out_bam = prefix + lib + final_suffix
        outfiles.append(out_bam)

        if len(list_bams) == 1:
            # nothing to merge, just copy the single input
            call = 'cp %s %s' % (' '.join(list_bams), out_bam)
        else:
            inputs = ' INPUT='.join(list_bams)
            call = merge_prefix + ' INPUT=%s OUTPUT=%s TMP_DIR=%s ASSUME_SORTED=true VALIDATION_STRINGENCY=LENIENT' % (
                inputs, out_bam, tmpdir)
        calls.append(call)

    return (calls, outfiles)
Beispiel #49
0
def check_fa(fa, bwa6):
    '''Check that the bwa index files of a fasta file exist; build the index if not.

    fa is the fasta path to which the index suffixes are appended.  bwa6
    selects both the list of expected suffixes and the bwa location
    (paths['bwa_6_2_home'] when true, paths['bwa_home'] otherwise).

    Indexing is first attempted with "bwa index -a is" and, if that fails,
    with "bwa index -a bwtsw".  Raises TypeError when both attempts fail.
    '''

    import genobox_modules
    import subprocess
    import os
    import sys

    paths = genobox_modules.setSystem()

    # index files expected next to the fasta; the list depends on bwa6
    if bwa6:
        index_suffixes = ['.amb', '.ann', '.bwt', '.pac', '.sa']
    else:
        index_suffixes = [
            '.amb', '.ann', '.bwt', '.pac', '.rbwt', '.rpac', '.rsa', '.sa'
        ]

    for suf in index_suffixes:
        f = fa + suf
        if os.path.exists(f):
            pass
        else:
            # first missing index file triggers a rebuild of the whole index
            sys.stderr.write('%s not found, creating bwa index\n' % fa)
            if bwa6:
                call = paths['bwa_6_2_home'] + 'bwa index -a is %s' % fa
            else:
                call = paths['bwa_home'] + 'bwa index -a is %s' % fa
            try:
                subprocess.check_call(call, shell=True)
            # NOTE(review): bare except also catches KeyboardInterrupt and
            # SystemExit, not only a failed bwa run
            except:
                sys.stderr.write(
                    'bwa index -a is failed, trying bwa index -a bwtsw\n')
                # second attempt with the bwtsw algorithm
                if bwa6:
                    call = paths['bwa_6_2_home'] + 'bwa index -a bwtsw %s' % fa
                else:
                    call = paths['bwa_home'] + 'bwa index -a bwtsw %s' % fa
                try:
                    subprocess.check_call(call, shell=True)
                except:
                    raise TypeError('bwa index could not be created from %s' %
                                    fa)
            # the index was (re)built once; remaining suffixes are not checked
            break