Beispiel #1
0
def start_vcffilter_gatk(vcfs, genome, fa, Q, rmsk, ab, prune, queue, dir,
                         partition, logger):
    '''Submit one GATK vcf-filter job per vcf file and wait for completion.

    For every entry in ``vcfs`` a ``genobox_vcffilter_gatk_h.py`` command
    line is built; all commands are handed to ``Moab`` as one batch and the
    function then blocks on a ``Semaphore`` over the resulting job ids.

    Arguments:
        vcfs      -- iterable of vcf file names, one job per entry
        genome    -- value passed as ``--genome``
        fa        -- value passed as ``--fa``
        Q         -- value passed as ``--Q`` (formatted with %f)
        rmsk      -- passed as ``--rmsk`` only when truthy
        ab        -- passed as ``--ab`` only when different from 0.5
        prune     -- passed as ``--prune`` only when different from 0
        queue     -- queue name given to Moab and Semaphore
        dir       -- accepted but not used by this function
        partition -- partition given to Moab
        logger    -- passed to Moab as ``logfile``

    Side effects: creates ``genotyping`` and ``tmp`` in the current working
    directory when they do not exist. Returns None.
    '''

    import os

    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore

    for folder in ('genotyping', 'tmp'):
        if not os.path.exists(folder):
            os.makedirs(folder)

    # set queueing (only the resource string actually used is kept)
    paths = genobox_modules.setSystem()
    home = os.getcwd()
    cpuF = 'nodes=1:ppn=2,mem=7gb,walltime=172800'

    cmd = paths['genobox_home'] + 'genobox_vcffilter_gatk_h.py'

    # one command line per vcf; optional filters only when non-default
    vcffilter_calls = []
    for v in vcfs:
        arg = ' --vcf %s --fa %s --genome %s --Q %f' % (v, fa, genome, Q)
        if rmsk:
            arg += ' --rmsk %s' % rmsk
        if ab != 0.5:
            arg += ' --ab %f' % ab
        if prune != 0:
            arg += ' --prune %i' % prune
        vcffilter_calls.append(cmd + arg)

    # submit jobs
    print("Submitting jobs")
    vcffilter_moab = Moab(vcffilter_calls,
                          logfile=logger,
                          runname='run_genobox_vcffilter_gatk',
                          queue=queue,
                          cpu=cpuF,
                          partition=partition)

    # No explicit release() call is made in this variant; the message is
    # kept so the console output stays the same.
    print("Releasing jobs")

    # block until every submitted job id is reported finished
    print("Waiting for jobs to finish ...")
    s = Semaphore(vcffilter_moab.ids, home, 'vcffilter_gatk', queue, 20,
                  2 * 86400)
    s.wait()
    print("--------------------------------------")
Beispiel #2
0
def start_dbsnp(vcf, ex, dbsnp, o, queue, partition, logger):
    '''Submit a dbSNP annotation job for a vcf file and wait for it.

    When no dbSNP file is given (``dbsnp`` is falsy or the string
    ``'None'``) nothing is submitted and the input ``vcf`` is returned
    unchanged. Otherwise a single ``genobox_dbsnp_h.py`` job is submitted
    with ``--vcf``, ``--ex``, ``--dbsnp`` and ``--o`` and the function
    blocks until it has finished.

    Arguments:
        vcf       -- input vcf file, passed as ``--vcf``
        ex        -- passed as ``--ex`` (per the original description, used
                     to exchange chromosome names to the dbSNP version)
        dbsnp     -- dbSNP file, or None / 'None' to skip annotation
        o         -- output file name, passed as ``--o``
        queue     -- queue name given to Moab and Semaphore
        partition -- partition given to Moab
        logger    -- passed to Moab as ``logfile``

    Returns ``vcf`` when annotation is skipped, otherwise ``o``.
    Side effect: creates ``genotyping`` in the current working directory
    when a job is submitted and the directory does not exist.
    '''

    # Skip first: this path needs neither the queueing modules nor the
    # filesystem, so it is handled before any import or directory creation.
    if not dbsnp or dbsnp == 'None':
        print("No dbsnp file given - skipping")
        print("--------------------------------------")
        return vcf

    import os

    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore

    if not os.path.exists('genotyping'):
        os.makedirs('genotyping')

    # set queueing (only the resource string actually used is kept)
    paths = genobox_modules.setSystem()
    home = os.getcwd()
    cpuC = 'nodes=1:ppn=1,mem=2gb,walltime=172800'

    # create command
    cmd = paths['genobox_home'] + 'genobox_dbsnp_h.py'
    arg = ' --vcf %s --ex %s --dbsnp %s --o %s' % (vcf, ex, dbsnp, o)
    dbsnp_calls = [cmd + arg]

    # submit jobs
    print("Submitting jobs")
    dbsnp_moab = Moab(dbsnp_calls,
                      logfile=logger,
                      runname='run_genobox_dbsnp',
                      queue=queue,
                      cpu=cpuC,
                      partition=partition)

    # No explicit release() call is made in this variant; the message is
    # kept so the console output stays the same.
    print("Releasing jobs")

    # block until the job is reported finished
    print("Waiting for jobs to finish ...")
    s = Semaphore(dbsnp_moab.ids, home, 'dbsnp', queue, 20, 2 * 86400)
    s.wait()
    print("--------------------------------------")

    return o
Beispiel #3
0
def start_trim(args, logger):
    '''Submit read-trimming jobs for single and/or paired reads and wait.

    Command lists come from ``single_trim(args)`` and ``paired_trim(args)``.
    Single-end jobs are submitted when ``args.se`` is truthy, paired-end
    jobs when both ``args.pe1`` and ``args.pe2`` are truthy. The function
    then blocks on a ``Semaphore`` over all submitted job ids.

    Arguments:
        args   -- object providing ``partition``, ``queue``, ``se``,
                  ``pe1`` and ``pe2`` (plus whatever the trim helpers read)
        logger -- passed to Moab as ``logfile``

    Returns the tuple ``(se_files, pe1_files, pe2_files)`` produced by the
    trim helpers.
    Side effects: creates ``trimmed`` in the current working directory when
    missing and writes ``Done`` to stderr at the end.
    '''

    import os
    import sys

    import genobox_modules
    from genobox_classes import Moab, Semaphore

    # set queueing; the return value of setSystem() is not needed here
    genobox_modules.setSystem()
    home = os.getcwd()
    # the 'uv' partition uses the shared-memory resource syntax
    if args.partition == 'uv':
        cpuA = 'procs=2,mem=512mb,walltime=172800,flags=sharedmem'
    else:
        cpuA = 'nodes=1:ppn=2,mem=512mb,walltime=172800'

    # create path
    if not os.path.exists('trimmed'):
        os.makedirs('trimmed')

    # create calls
    (single_calls, se_files) = single_trim(args)
    (paired_calls, pe1_files, pe2_files) = paired_trim(args)

    # submit jobs, remembering the ids that must be waited for
    print("Submitting jobs")
    semaphore_ids = []
    if args.se:
        single_moab = Moab(single_calls,
                           logfile=logger,
                           runname='run_genobox_trimse',
                           queue=args.queue,
                           cpu=cpuA,
                           partition=args.partition)
        semaphore_ids.extend(single_moab.ids)
    if args.pe1 and args.pe2:
        paired_moab = Moab(paired_calls,
                           logfile=logger,
                           runname='run_genobox_trimpe',
                           queue=args.queue,
                           cpu=cpuA,
                           partition=args.partition)
        semaphore_ids.extend(paired_moab.ids)

    # No explicit release() call is made in this variant; the message is
    # kept so the console output stays the same.
    print("Releasing jobs")

    # wait for jobs to finish
    print("Waiting for jobs to finish ...")
    s = Semaphore(semaphore_ids, home, 'read_trimming', args.queue, 60, 86400)
    s.wait()
    print("--------------------------------------")
    sys.stderr.write('Done\n')

    # return trimmed files
    return (se_files, pe1_files, pe2_files)
Beispiel #4
0
def start_vcffilter(bcf, genome, caller, Q, ex, rmsk, ab, prune, o, queue, dir, partition, logger):
    '''Submit the variant vcf-filter job, release it and wait for it.

    The genome file must contain one tab-separated line per chromosome:
    chrom, chrom_len, chrom_short_name, haploid/diploid, low_depth,
    high_depth.

    Filtering steps (as listed for the helper script):
        vcfutils.pl varFilter
        annotated repeats using rmsk
        heterozygote variants on haploid chromosomes
        allelic balance
        pruning of variants within N nt of each other

    Arguments:
        bcf, genome, Q, rmsk, ab, prune -- forwarded to
                     ``genobox_vcffilter_h.py`` as the options of the same
                     name
        caller    -- variant caller; only ``'samtools'`` is supported
        ex        -- accepted but not used by this function
        o         -- output file name
        queue     -- queue name given to Moab and Semaphore
        dir       -- optional prefix; when set (and not ``'None'``) the
                     helper writes to ``<dirname(o)>/<dir>.<basename(o)>``
        partition -- partition given to Moab
        logger    -- passed to Moab as ``logfile``

    Returns ``o``.
    NOTE(review): ``o`` is returned even when ``dir`` changes the file name
    handed to the helper -- confirm that callers expect this.

    Raises ValueError when ``caller`` is not ``'samtools'``; previously
    this case crashed later with a NameError on the unbound command list.
    Side effects: creates ``genotyping`` and ``genotyping/tmp`` in the
    current working directory when missing.
    '''

    # Fail fast, before creating directories or touching the queue: only
    # the samtools caller has a command template below.
    if caller != 'samtools':
        raise ValueError(
            "start_vcffilter only supports caller 'samtools', got %r" % (caller,))

    import os

    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore

    for folder in ('genotyping', 'genotyping/tmp'):
        if not os.path.exists(folder):
            os.makedirs(folder)

    # set queueing (only the resource string actually used is kept)
    paths = genobox_modules.setSystem()
    home = os.getcwd()
    cpuE = 'nodes=1:ppn=1,mem=5gb,walltime=172800'

    # create command
    cmd = paths['genobox_home'] + 'genobox_vcffilter_h.py'
    if dir and dir != 'None':
        out_dir, out_name = os.path.split(o)
        outfile = '%s/%s.%s' % (out_dir, dir, out_name)
    else:
        outfile = o
    arg = ' --bcf %s --genome %s --caller %s --Q %f --rmsk %s --ab %f --prune %i --o %s' % (
        bcf, genome, caller, Q, rmsk, ab, prune, outfile)
    vcffilter_calls = [cmd + arg]

    # submit jobs
    print("Submitting jobs")
    vcffilter_moab = Moab(vcffilter_calls,
                          logfile=logger,
                          runname='run_genobox_vcffilter',
                          queue=queue,
                          cpu=cpuE,
                          partition=partition)

    # release jobs
    print("Releasing jobs")
    vcffilter_moab.release()

    # block until the job is reported finished
    print("Waiting for jobs to finish ...")
    s = Semaphore(vcffilter_moab.ids, home, 'vcffilter', queue, 20, 2 * 86400)
    s.wait()
    print("--------------------------------------")

    # return filename of final vcf
    return o
Beispiel #5
0
def start_vcffilter(bcf, genome, caller, Q, ex, rmsk, ab, prune, o, queue, dir,
                    partition, logger):
    '''Submit the variant vcf-filter job and wait for it to finish.

    The genome file must contain one tab-separated line per chromosome:
    chrom, chrom_len, chrom_short_name, haploid/diploid, low_depth,
    high_depth.

    Filtering steps (as listed for the helper script):
        vcfutils.pl varFilter
        annotated repeats using rmsk
        heterozygote variants on haploid chromosomes
        allelic balance
        pruning of variants within N nt of each other

    Arguments:
        bcf, genome, Q, rmsk, ab, prune -- forwarded to
                     ``genobox_vcffilter_h.py`` as the options of the same
                     name
        caller    -- variant caller; only ``'samtools'`` is supported
        ex        -- accepted but not used by this function
        o         -- output file name
        queue     -- queue name given to Moab and Semaphore
        dir       -- optional prefix; when set (and not ``'None'``) the
                     helper writes to ``<dirname(o)>/<dir>.<basename(o)>``
        partition -- partition given to Moab
        logger    -- passed to Moab as ``logfile``

    Returns ``o``.
    NOTE(review): ``o`` is returned even when ``dir`` changes the file name
    handed to the helper -- confirm that callers expect this.

    Raises ValueError when ``caller`` is not ``'samtools'``; previously
    this case crashed later with a NameError on the unbound command list.
    Side effects: creates ``genotyping`` and ``genotyping/tmp`` in the
    current working directory when missing.
    '''

    # Fail fast, before creating directories or touching the queue: only
    # the samtools caller has a command template below.
    if caller != 'samtools':
        raise ValueError(
            "start_vcffilter only supports caller 'samtools', got %r" % (caller,))

    import os

    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore

    for folder in ('genotyping', 'genotyping/tmp'):
        if not os.path.exists(folder):
            os.makedirs(folder)

    # set queueing (only the resource string actually used is kept)
    paths = genobox_modules.setSystem()
    home = os.getcwd()
    cpuE = 'nodes=1:ppn=1,mem=5gb,walltime=172800'

    # create command
    cmd = paths['genobox_home'] + 'genobox_vcffilter_h.py'
    if dir and dir != 'None':
        out_dir, out_name = os.path.split(o)
        outfile = '%s/%s.%s' % (out_dir, dir, out_name)
    else:
        outfile = o
    arg = ' --bcf %s --genome %s --caller %s --Q %f --rmsk %s --ab %f --prune %i --o %s' % (
        bcf, genome, caller, Q, rmsk, ab, prune, outfile)
    vcffilter_calls = [cmd + arg]

    # submit jobs
    print("Submitting jobs")
    vcffilter_moab = Moab(vcffilter_calls,
                          logfile=logger,
                          runname='run_genobox_vcffilter',
                          queue=queue,
                          cpu=cpuE,
                          partition=partition)

    # No explicit release() call is made in this variant; the message is
    # kept so the console output stays the same.
    print("Releasing jobs")

    # block until the job is reported finished
    print("Waiting for jobs to finish ...")
    s = Semaphore(vcffilter_moab.ids, home, 'vcffilter', queue, 20, 2 * 86400)
    s.wait()
    print("--------------------------------------")

    # return filename of final vcf
    return o
Beispiel #6
0
def start_bamprocess(library_file, bams, mapq, libs, tmpdir, queue, final_bam,
                     realignment, known, fa, sample, partition, logger):
    '''Submit the bam processing chain for the input files and wait for it.

    The chain is: filter + sort each bam -> merge bams per library ->
    remove duplicates per library -> merge everything into ``final_bam``
    -> (optional) realignment. Each stage is submitted as its own Moab
    batch that depends on the ids of the previous stage.

    Arguments:
        library_file -- a ``Library`` instance, a library file name (read
                        via ``Library(...).read()``), or None / 'None', in
                        which case a library is built from ``sample``,
                        ``mapq``, ``libs`` and ``bams``
        bams, mapq, libs, sample -- only used to initialize a library when
                        no ``library_file`` is given
        tmpdir       -- passed to the per-library merge and rmdup helpers
        queue        -- queue name given to Moab and Semaphore
        final_bam    -- name of the merged output bam
        realignment  -- when truthy, a realignment stage is appended
        known, fa    -- passed to ``realign_bam`` when realigning
        partition    -- partition given to every Moab batch
        logger       -- passed to Moab as ``logfile``

    Returns ``final_bam``.
    '''

    import subprocess
    import genobox_modules
    from genobox_classes import Moab, Semaphore, Library
    import os

    # set queueing
    # NOTE: `paths`, cpuA, cpuF and cpuB are assigned but not used below.
    paths = genobox_modules.setSystem()
    home = os.getcwd()
    cpuA = 'nodes=1:ppn=1,mem=512mb,walltime=345600'
    cpuC = 'nodes=1:ppn=1,mem=2gb,walltime=345600'
    cpuE = 'nodes=1:ppn=1,mem=5gb,walltime=345600'
    cpuF = 'nodes=1:ppn=2,mem=2gb,walltime=345600'
    cpuB = 'nodes=1:ppn=16,mem=10gb,walltime=345600'
    cpuG = 'nodes=1:ppn=1,mem=6gb,walltime=345600'
    cpuH = 'nodes=1:ppn=2,mem=7gb,walltime=345600'

    # create library instance
    if library_file and library_file != 'None':
        # an already constructed Library is used as-is; anything else is
        # treated as a file name and read from disk
        if isinstance(library_file, Library):
            library = library_file
        else:
            library = Library(library_file)
            library.read()
    else:
        library = genobox_modules.initialize_library(libfile=library_file,
                                                     sample=sample,
                                                     mapq=mapq,
                                                     libs=libs,
                                                     bams=bams)

    # two-way mapping between bam files and libraries
    (bam2lib, lib2bam) = library.getBamLibs()

    ## CREATE CALLS ##

    # filter bam and sort
    # NOTE(review): 1500000000 is presumably a sort memory limit -- confirm
    # against bam_filter_sort.
    (filter_sort_calls,
     filter_sort_files) = bam_filter_sort(lib2bam, bam2lib, 1500000000)

    # merge to libs
    (merge_lib_calls, librarys) = merge_bam(lib2bam.keys(),
                                            lib2bam.values(),
                                            add_suffix=True,
                                            final_suffix='.flt.sort.bam',
                                            tmpdir=tmpdir)

    # rmdup on libs
    (rmdup_calls, rmdup_files) = rmdup(librarys, tmpdir)

    # optional: realignment
    if realignment:
        # the merged file is still written to final_bam; realign_bam is
        # then given final_bam as both of its first two arguments.
        # `sample_file` and `final_file` are not used afterwards.
        (merge_final_call, sample_file) = merge_bam([final_bam], [rmdup_files],
                                                    add_suffix=False)
        (realign_calls, final_file) = realign_bam(final_bam, final_bam, fa,
                                                  known)
    else:
        # merge to final file
        (merge_final_call, final_file) = merge_bam([final_bam], [rmdup_files],
                                                   add_suffix=False)

    ## SUBMIT JOBS ##

    print "Submitting jobs"
    # stage 1: filter + sort, no dependencies
    filtersort_moab = Moab(filter_sort_calls,
                           logfile=logger,
                           runname='run_genobox_filtersort',
                           queue=queue,
                           cpu=cpuH,
                           partition=partition)
    # stage 2: per-library merge; depend_val carries the number of bams in
    # each library so each merge can be tied to its own filter/sort jobs
    mergelib_moab = Moab(merge_lib_calls,
                         logfile=logger,
                         runname='run_genobox_lib_merge',
                         queue=queue,
                         cpu=cpuE,
                         depend=True,
                         depend_type='complex',
                         depend_val=map(len, lib2bam.values()),
                         depend_ids=filtersort_moab.ids,
                         partition=partition)
    # stage 3: duplicate removal, one job per library merge
    rmdup_moab = Moab(
        rmdup_calls,
        logfile=logger,
        runname='run_genobox_rmdup',
        queue=queue,
        cpu=cpuG,
        depend=True,
        depend_type='one2one',
        depend_val=[1],
        depend_ids=mergelib_moab.ids,
        partition=partition
    )  # NB: If memory should be changed, also change java memory spec in rmdup function
    # stage 4: final merge, depends on all rmdup jobs
    mergefinal_moab = Moab(merge_final_call,
                           logfile=logger,
                           runname='run_genobox_final_merge',
                           queue=queue,
                           cpu=cpuC,
                           depend=True,
                           depend_type='conc',
                           depend_val=[len(rmdup_moab.ids)],
                           depend_ids=rmdup_moab.ids,
                           partition=partition)
    # stage 5 (optional): realignment after the final merge
    if realignment:
        realign_moab = Moab(realign_calls,
                            logfile=logger,
                            runname='run_genobox_realignment',
                            queue=queue,
                            cpu=cpuE,
                            depend=True,
                            depend_type='one2one',
                            depend_val=[1],
                            depend_ids=mergefinal_moab.ids,
                            partition=partition)
    # realignment calls needs to be written together in a shell-file or dependent on each other #

    # release jobs #
    # The explicit release() calls are disabled in this variant; only the
    # message is printed.
    print "Releasing jobs"
    #filtersort_moab.release()
    #mergelib_moab.release()
    #rmdup_moab.release()
    #mergefinal_moab.release()
    #if realignment: realign_moab.release()

    # semaphore: wait on the last stage of the chain only
    print "Waiting for jobs to finish ..."
    if realignment:
        s = Semaphore(realign_moab.ids, home, 'bam_processing', queue, 20,
                      345600)
    else:
        s = Semaphore(mergefinal_moab.ids, home, 'bam_processing', queue, 20,
                      345600)
    s.wait()
    print "--------------------------------------"

    # return final bamfile
    return final_bam
Beispiel #7
0
def start_bcf2ref(bcf, genome_file, Q, ex, dbsnp, rmsk, indels, o, queue, dir, partition, logger):
    '''Submit per-chromosome bcf2ref jobs, release them and wait.

    Extracts high confidence same-as-reference bases from a bcf; the helper
    script options are to:
        exchange ids
        annotate using dbsnp
        filter rmsk
        filter ambiguous indel positions

    One ``genobox_bcf2ref_h.py`` job is created for every entry returned by
    ``get_genome(genome_file)``. Fields 0, 2, 4 and 5 of each entry are
    passed as ``--chr_id``, ``--chr``, ``--d`` and ``--D``.

    Output naming, with ``o`` split into directory and file name:
        one chromosome,  no dir : ``o`` unchanged
        one chromosome,  dir    : ``<o_dir>/<dir>.<o_file>``
        many chromosomes, no dir: ``<o_dir>/<chr>.<o_file>``
        many chromosomes, dir   : ``<o_dir>/<dir>.<chr>.<o_file>``
    where ``dir`` counts as unset when falsy or the string ``'None'``.

    Arguments:
        bcf, Q, ex, dbsnp, rmsk, indels -- forwarded to the helper script
                     as the options of the same name
        genome_file -- file read through ``get_genome``
        o           -- base output file name
        queue       -- queue name given to Moab and Semaphore
        dir         -- optional output prefix (see naming above)
        partition   -- partition given to Moab
        logger      -- passed to Moab as ``logfile``

    Returns None. Creates ``genotyping`` in the current working directory
    when missing.
    '''

    import os

    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore

    if not os.path.exists('genotyping'):
        os.makedirs('genotyping')

    # set queueing (only the resource string actually used is kept)
    paths = genobox_modules.setSystem()
    home = os.getcwd()
    cpuE = 'nodes=1:ppn=1,mem=5gb,walltime=172800'

    # read genome file
    genome = get_genome(genome_file)

    # create commands
    bcf2ref_calls = []
    cmd = paths['genobox_home'] + 'genobox_bcf2ref_h.py'
    use_dir = dir and dir != 'None'
    # `chrom` instead of `chr` so the builtin chr() is not shadowed
    for chrom in genome:
        # set outfile name
        if len(genome) == 1:
            if use_dir:
                out_dir, out_name = os.path.split(o)
                outfile = '%s/%s.%s' % (out_dir, dir, out_name)
            else:
                outfile = o
        else:
            out_dir, out_name = os.path.split(o)
            if use_dir:
                outfile = '%s/%s.%s.%s' % (out_dir, dir, chrom[2], out_name)
            else:
                outfile = '%s/%s.%s' % (out_dir, chrom[2], out_name)

        arg = ' --bcf %s --chr_id \"%s\" --chr %s --d %s --D %s --Q %f --ex %s --dbsnp %s --rmsk %s --indels %s --o %s' % (
            bcf, chrom[0], chrom[2], chrom[4], chrom[5], Q, ex, dbsnp, rmsk,
            indels, outfile)
        bcf2ref_calls.append(cmd + arg)

    # submit jobs
    print("Submitting jobs")
    bcf2ref_moab = Moab(bcf2ref_calls,
                        logfile=logger,
                        runname='run_genobox_bcf2ref',
                        queue=queue,
                        cpu=cpuE,
                        partition=partition)

    # release jobs
    print("Releasing jobs")
    bcf2ref_moab.release()

    # block until every submitted job id is reported finished
    print("Waiting for jobs to finish ...")
    s = Semaphore(bcf2ref_moab.ids, home, 'bcf2ref', queue, 20, 2 * 86400)
    s.wait()
    print("--------------------------------------")
Beispiel #8
0
def start_genotyping(bam, chr, fa, prior, pp, queue, o, sample, partition, logger):
    '''Submit the samtools genotyping chain for a bam file and wait for it.

    The chain is: index bam -> mpileup -> combine bcfs into ``o`` -> index
    ``o``. Each stage is its own Moab batch depending on the previous one.
    All batches are released explicitly and the function blocks on the bcf
    index jobs. Afterwards the intermediate bcf files returned by
    ``mpileup`` are removed with ``genobox_modules.rm_files``.

    Arguments:
        bam       -- input bam file
        chr, fa, prior, pp -- forwarded to ``mpileup``
        queue     -- queue name given to Moab and Semaphore
        o         -- output bcf file name
        sample    -- forwarded to ``consensus``
        partition -- partition given to every Moab batch
        logger    -- passed to Moab as ``logfile``

    Returns ``o``. Creates ``genotyping`` in the current working directory
    when missing.
    '''

    import os

    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore

    if not os.path.exists('genotyping'):
        os.makedirs('genotyping')

    # set queueing; the return value of setSystem() is not needed here and
    # only the resource strings actually used are kept
    genobox_modules.setSystem()
    home = os.getcwd()
    cpuC = 'nodes=1:ppn=1,mem=2gb,walltime=172800'
    cpuF = 'nodes=1:ppn=2,mem=2gb,walltime=172800'

    # create calls
    bamindex_calls = bam_index(bam)
    (mpileup_calls, bcffiles) = mpileup(bam, chr, fa, prior, pp)
    bcfcombine_calls = bcf_combine(bcffiles, o)
    bcfindex_calls = bcf_index(o)
    # The consensus job is currently not submitted; the helper is still
    # called as before, its result is simply not used.
    consensus(o, sample)

    # submit jobs
    print("Submitting jobs")
    bamindex_moab = Moab(bamindex_calls,
                         logfile=logger,
                         runname='run_genobox_bamindex',
                         queue=queue,
                         cpu=cpuC,
                         partition=partition)
    mpileup_moab = Moab(mpileup_calls,
                        logfile=logger,
                        runname='run_genobox_mpileup',
                        queue=queue,
                        cpu=cpuF,
                        depend=True,
                        depend_type='expand',
                        depend_val=[len(mpileup_calls)],
                        depend_ids=bamindex_moab.ids,
                        partition=partition)
    bcfcombine_moab = Moab(bcfcombine_calls,
                           logfile=logger,
                           runname='run_genobox_bcfcombine',
                           queue=queue,
                           cpu=cpuC,
                           depend=True,
                           depend_type='conc',
                           depend_val=[len(mpileup_calls)],
                           depend_ids=mpileup_moab.ids,
                           partition=partition)
    bcfindex_moab = Moab(bcfindex_calls,
                         logfile=logger,
                         runname='run_genobox_bcfindex',
                         queue=queue,
                         cpu=cpuC,
                         depend=True,
                         depend_type='one2one',
                         depend_val=[1],
                         depend_ids=bcfcombine_moab.ids,
                         partition=partition)

    # release jobs in submission order
    print("Releasing jobs")
    for batch in (bamindex_moab, mpileup_moab, bcfcombine_moab, bcfindex_moab):
        batch.release()

    # semaphore: wait on the last stage only (consensus is not submitted)
    print("Waiting for jobs to finish ...")
    s = Semaphore(bcfindex_moab.ids, home, 'genotyping', queue, 20, 2 * 86400)
    s.wait()
    print("--------------------------------------")

    # remove temporary files
    genobox_modules.rm_files(bcffiles)

    # return output bcf
    return o
Beispiel #9
0
def start_bcf2ref(bcf, genome_file, Q, ex, dbsnp, rmsk, indels, o, queue, dir,
                  partition, logger):
    '''Submit per-chromosome bcf2ref jobs and wait for them to finish.

    Extracts high confidence same-as-reference bases from a bcf; the helper
    script options are to:
        exchange ids
        annotate using dbsnp
        filter rmsk
        filter ambiguous indel positions

    One ``genobox_bcf2ref_h.py`` job is created for every entry returned by
    ``get_genome(genome_file)``. Fields 0, 2, 4 and 5 of each entry are
    passed as ``--chr_id``, ``--chr``, ``--d`` and ``--D``.

    Output naming, with ``o`` split into directory and file name:
        one chromosome,  no dir : ``o`` unchanged
        one chromosome,  dir    : ``<o_dir>/<dir>.<o_file>``
        many chromosomes, no dir: ``<o_dir>/<chr>.<o_file>``
        many chromosomes, dir   : ``<o_dir>/<dir>.<chr>.<o_file>``
    where ``dir`` counts as unset when falsy or the string ``'None'``.

    Arguments:
        bcf, Q, ex, dbsnp, rmsk, indels -- forwarded to the helper script
                     as the options of the same name
        genome_file -- file read through ``get_genome``
        o           -- base output file name
        queue       -- queue name given to Moab and Semaphore
        dir         -- optional output prefix (see naming above)
        partition   -- partition given to Moab
        logger      -- passed to Moab as ``logfile``

    Returns None. Creates ``genotyping`` in the current working directory
    when missing.
    '''

    import os

    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore

    if not os.path.exists('genotyping'):
        os.makedirs('genotyping')

    # set queueing (only the resource string actually used is kept)
    paths = genobox_modules.setSystem()
    home = os.getcwd()
    cpuE = 'nodes=1:ppn=1,mem=5gb,walltime=172800'

    # read genome file
    genome = get_genome(genome_file)

    # create commands
    bcf2ref_calls = []
    cmd = paths['genobox_home'] + 'genobox_bcf2ref_h.py'
    use_dir = dir and dir != 'None'
    # `chrom` instead of `chr` so the builtin chr() is not shadowed
    for chrom in genome:
        # set outfile name
        if len(genome) == 1:
            if use_dir:
                out_dir, out_name = os.path.split(o)
                outfile = '%s/%s.%s' % (out_dir, dir, out_name)
            else:
                outfile = o
        else:
            out_dir, out_name = os.path.split(o)
            if use_dir:
                outfile = '%s/%s.%s.%s' % (out_dir, dir, chrom[2], out_name)
            else:
                outfile = '%s/%s.%s' % (out_dir, chrom[2], out_name)

        arg = ' --bcf %s --chr_id \"%s\" --chr %s --d %s --D %s --Q %f --ex %s --dbsnp %s --rmsk %s --indels %s --o %s' % (
            bcf, chrom[0], chrom[2], chrom[4], chrom[5], Q, ex, dbsnp, rmsk,
            indels, outfile)
        bcf2ref_calls.append(cmd + arg)

    # submit jobs
    print("Submitting jobs")
    bcf2ref_moab = Moab(bcf2ref_calls,
                        logfile=logger,
                        runname='run_genobox_bcf2ref',
                        queue=queue,
                        cpu=cpuE,
                        partition=partition)

    # No explicit release() call is made in this variant; the message is
    # kept so the console output stays the same.
    print("Releasing jobs")

    # block until every submitted job id is reported finished
    print("Waiting for jobs to finish ...")
    s = Semaphore(bcf2ref_moab.ids, home, 'bcf2ref', queue, 20, 2 * 86400)
    s.wait()
    print("--------------------------------------")
Beispiel #10
0
def start_assembly(args, logger):
   '''Submit a velvet assembly job chain, release it and wait for it.

   Stages, each submitted as its own Moab batch:
      (optional) illumina trimming     when args.trim is truthy
      (optional) interleaving          when any interleave call was produced
      velveth -> velvetg               always
      velvetparse -> velvetaccept      only when more than one k-mer size
      velvetclean                      always, last stage

   Reads from args: n, m, ksizes, trim, shortPaired, shortPaired2,
   longPaired, sample, queue (plus whatever the helper functions read).
   args.ksizes, args.shortPaired, args.shortPaired2 and args.longPaired are
   overwritten in place.

   logger is passed to every Moab batch as logfile. Returns None.
   '''
   
   import genobox_modules
   from genobox_classes import Moab
   from genobox_classes import Semaphore   
   import os
   
   # set queueing
   # cpuV is sized from the requested threads (args.n) and memory (args.m).
   # NOTE: `paths`, cpuC, cpuE and cpuB are assigned but not used below.
   paths = genobox_modules.setSystem()
   home = os.getcwd()
   cpuV = 'nodes=1:ppn=%i,mem=%s,walltime=172800' % (args.n, args.m)
   cpuA = 'nodes=1:ppn=1,mem=512mb,walltime=172800'
   cpuC = 'nodes=1:ppn=1,mem=2gb,walltime=172800'
   cpuE = 'nodes=1:ppn=1,mem=5gb,walltime=172800'
   cpuF = 'nodes=1:ppn=2,mem=2gb,walltime=172800'
   cpuB = 'nodes=1:ppn=16,mem=10gb,walltime=172800'
   
   # set kmersizes (if auto)
   if args.ksizes == ['auto']:
      args.ksizes = set_kmersizes(args)   
   
   # trimming calls
   # NOTE(review): the meaning of the literals 15, 20, 15, False is defined
   # by illumina_trim, which is not visible here.
   if args.trim: illuminatrim_calls = illumina_trim(args, int(args.ksizes[0]), 15, 20, 15, False)
   
   # checking if files needs to be interleaved
   # interleave() is called twice per input: element [0] is kept as the
   # call to submit, element [1] replaces the corresponding args attribute.
   interleave_dict = {}    
   interleave_dict['shortPaired'] = interleave(args.shortPaired, args.sample)[0] ; args.shortPaired = interleave(args.shortPaired, args.sample)[1]
   interleave_dict['shortPaired2'] = interleave(args.shortPaired2, args.sample)[0] ; args.shortPaired2 = interleave(args.shortPaired2, args.sample)[1]
   interleave_dict['longPaired'] = interleave(args.longPaired, args.sample)[0] ; args.longPaired = interleave(args.longPaired, args.sample)[1]
   
   # interleave calls
   # only truthy entries are submitted
   interleave_calls = []
   for key,value in interleave_dict.items():
      if value:
         interleave_calls.append(value)
   
   # velvet calls
   velveth_calls = create_velveth_calls(args)
   velvetg_calls = create_velvetg_calls(args)
   
   # velvet parse calls
   velvetparse_calls = get_best_assembly(args)
   velvetaccept_calls = accept_assembly(args)
   velvetclean_calls = clean()
   
   # set environment variable:
   # passed to the velveth batches only
   env_var = 'OMP_NUM_THREADS=%i' % int(args.n - 1)
   
   # submit and release jobs
   # NOTE: no partition is passed to any Moab batch in this function.
   # The four branches below differ only in what velveth (and interleave)
   # depend on; the velvetg submission is the same in all of them.
   print "Submitting jobs"
   # if trimming is needed
   if args.trim:
      illuminatrim_moab = Moab(illuminatrim_calls, logfile=logger, runname='run_genobox_trim', queue=args.queue, cpu=cpuF)
      # if no interleaving is needed
      if len(interleave_calls) == 0:
         velveth_moab = Moab(velveth_calls, logfile=logger, runname='run_genobox_velveth', queue=args.queue, cpu=cpuV, depend=True, depend_type='all', depend_val=[1], depend_ids=illuminatrim_moab.ids, env=env_var)
         velvetg_moab = Moab(velvetg_calls, logfile=logger, runname='run_genobox_velvetg', queue=args.queue, cpu=cpuV, depend=True, depend_type='one2one', depend_val=[1], depend_ids=velveth_moab.ids)
      # if interleaving is needed
      else:
         interleave_moab = Moab(interleave_calls, logfile=logger, runname='run_genobox_interleave', queue=args.queue, cpu=cpuF, depend=True, depend_type='all', depend_val=[1], depend_ids=illuminatrim_moab.ids)
         velveth_moab = Moab(velveth_calls, logfile=logger, runname='run_genobox_velveth', queue=args.queue, cpu=cpuV, depend=True, depend_type='all', depend_val=[1], depend_ids=interleave_moab.ids, env=env_var)
         velvetg_moab = Moab(velvetg_calls, logfile=logger, runname='run_genobox_velvetg', queue=args.queue, cpu=cpuV, depend=True, depend_type='one2one', depend_val=[1], depend_ids=velveth_moab.ids)
   # if no trimming
   else:
      # if no interleaving is needed
      if len(interleave_calls) == 0:
         velveth_moab = Moab(velveth_calls, logfile=logger, runname='run_genobox_velveth', queue=args.queue, cpu=cpuV, env=env_var)
         velvetg_moab = Moab(velvetg_calls, logfile=logger, runname='run_genobox_velvetg', queue=args.queue, cpu=cpuV, depend=True, depend_type='one2one', depend_val=[1], depend_ids=velveth_moab.ids)
      # if interleaving is needed
      else:
         interleave_moab = Moab(interleave_calls, logfile=logger, runname='run_genobox_interleave', queue=args.queue, cpu=cpuF)
         velveth_moab = Moab(velveth_calls, logfile=logger, runname='run_genobox_velveth', queue=args.queue, cpu=cpuV, depend=True, depend_type='all', depend_val=[1], depend_ids=interleave_moab.ids, env=env_var)
         velvetg_moab = Moab(velvetg_calls, logfile=logger, runname='run_genobox_velvetg', queue=args.queue, cpu=cpuV, depend=True, depend_type='one2one', depend_val=[1], depend_ids=velveth_moab.ids)
   
   # submit job for velvetparse if more than one ksize was chosen
   # with a single k-mer size, clean-up depends directly on velvetg
   if len(args.ksizes) > 1:
      velvetparse_moab = Moab(velvetparse_calls, logfile=logger, runname='run_genobox_velvetparse', queue=args.queue, cpu=cpuA, depend=True, depend_type='conc', depend_val=[len(velvetg_calls)], depend_ids=velvetg_moab.ids)
      velvetaccept_moab = Moab(velvetaccept_calls, logfile=logger, runname='run_genobox_velvetaccept', queue=args.queue, cpu=cpuA, depend=True, depend_type='one2one', depend_val=[1], depend_ids=velvetparse_moab.ids) 
      velvetclean_moab = Moab(velvetclean_calls, logfile=logger, runname='run_genobox_velvetclean', queue=args.queue, cpu=cpuA, depend=True, depend_type='one2one', depend_val=[1], depend_ids=velvetaccept_moab.ids)
   else:
      velvetclean_moab = Moab(velvetclean_calls, logfile=logger, runname='run_genobox_velvetclean', queue=args.queue, cpu=cpuA, depend=True, depend_type='one2one', depend_val=[1], depend_ids=velvetg_moab.ids)
   
   # release jobs
   # NOTE(review): when args.trim is set but illuminatrim_calls is empty,
   # the trim batch was still created above but is not released here.
   print "Releasing jobs"
   if args.trim and len(illuminatrim_calls) > 0: illuminatrim_moab.release()
   if len(interleave_calls) > 0: interleave_moab.release()
   velveth_moab.release()
   velvetg_moab.release()
   if len(args.ksizes) > 1: 
      velvetparse_moab.release()
      velvetaccept_moab.release()
   velvetclean_moab.release()
   
   # semaphore: wait on the final clean-up batch only
   print "Waiting for jobs to finish ..."
   s = Semaphore(velvetclean_moab.ids, home, 'velvet', args.queue, 20, 2*86400)
   s.wait()
   print "--------------------------------------"
Beispiel #11
0
def start_bamstats(args, bam, partition, logger, wait=True):
    '''Submit bam statistics jobs and optionally wait for them.

    When ``args.mapdamage`` is truthy only a mapdamage batch is submitted.
    Otherwise four batches are submitted: flagstat, genome coverage,
    coverage plotting (depends on the coverage jobs) and average depth.
    The original notes name the tools as samtools flagstat, bedtools
    genomeCoverageBed and a python avgdepth script.

    Arguments:
        args      -- object providing ``mapdamage``, ``queue`` and, for the
                     mapdamage path, ``fa``
        bam       -- input bam, handed to every call-building helper
        partition -- partition given to every Moab batch
        logger    -- passed to Moab as ``logfile``
        wait      -- when true (default) block on a Semaphore over all
                     submitted job ids; otherwise return immediately

    Returns None. Creates ``stats`` in the current working directory when
    missing.
    '''

    import os

    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore

    if not os.path.exists('stats'):
        os.makedirs('stats')

    # set queueing; the return value of setSystem() is not needed here and
    # only the resource strings actually used are kept
    genobox_modules.setSystem()
    home = os.getcwd()
    cpuA = 'nodes=1:ppn=1,mem=512mb,walltime=172800'
    cpuC = 'nodes=1:ppn=1,mem=2gb,walltime=172800'
    cpuE = 'nodes=1:ppn=1,mem=7gb,walltime=172800'

    # create calls
    if args.mapdamage:
        # NOTE(review): the helper is spelled `mapdamapge`; it is defined
        # outside this block, so the name is kept -- confirm it is not a
        # typo for `mapdamage`.
        mapdamage_calls = mapdamapge(bam, args.fa)
    else:
        flagstat_calls = sam_flagstat(bam)
        coverage_calls = bed_genomeCov(bam)
        plotcoverage_calls = plot_coverage(bam)
        avgdepth_calls = python_avgdepth(bam)
        # The saturation job is currently not submitted; the helper is
        # still called as before, its result is simply not used.
        get_saturation(bam)

    # submit jobs
    print("Submitting jobs")
    if args.mapdamage:
        mapdamage_moab = Moab(mapdamage_calls,
                              logfile=logger,
                              runname='run_genobox_mapdamage',
                              queue=args.queue,
                              cpu=cpuA,
                              partition=partition)
    else:
        flagstat_moab = Moab(flagstat_calls,
                             logfile=logger,
                             runname='run_genobox_flagstat',
                             queue=args.queue,
                             cpu=cpuC,
                             partition=partition)
        coverage_moab = Moab(coverage_calls,
                             logfile=logger,
                             runname='run_genobox_coverage',
                             queue=args.queue,
                             cpu=cpuC,
                             partition=partition)
        plotcoverage_moab = Moab(plotcoverage_calls,
                                 logfile=logger,
                                 runname='run_genobox_plotcoverage',
                                 queue=args.queue,
                                 cpu=cpuA,
                                 depend=True,
                                 depend_type='one2one',
                                 depend_val=[1],
                                 depend_ids=coverage_moab.ids,
                                 partition=partition)
        avgdepth_moab = Moab(avgdepth_calls,
                             logfile=logger,
                             runname='run_genobox_avgdepth',
                             queue=args.queue,
                             cpu=cpuE,
                             partition=partition)

    # No explicit release() call is made here; the message is kept so the
    # console output stays the same.
    print("Releasing jobs")

    # wait for jobs to finish
    if wait:
        print("Waiting for jobs to finish ...")
        if args.mapdamage:
            semaphore_ids = mapdamage_moab.ids
        else:
            semaphore_ids = (flagstat_moab.ids + coverage_moab.ids +
                             plotcoverage_moab.ids + avgdepth_moab.ids)

        s = Semaphore(semaphore_ids, home, 'bam_stats', args.queue, 20, 86400)
        s.wait()
        print("--------------------------------------")
    else:
        print("Jobs running, continuing")
        print("--------------------------------------")
Beispiel #12
0
def bwa_se_align(fastqs, fa, fqtypes, qtrim, N, alignpath, bwa6, library,
                 threads, queue, add_aln, partition, logger):
    '''Submit single-end bwa alignments (aln followed by samse) to the queue.

    For every fastq file one "bwa aln" call writing <alignpath><name>.sai and
    one "bwa samse | samtools view -Sb" call writing <alignpath><name>.bam
    are created, where <name> is the basename of the fastq file. The samse
    jobs are submitted with a one2one dependency on the aln jobs.

    fastqs    : list of fastq paths
    fa        : reference argument handed to bwa aln and bwa samse
    fqtypes   : quality format per fastq, 'Illumina' (adds -I to bwa aln)
                or 'Sanger'; 'Solexa' and any other value is rejected
    qtrim     : value for bwa aln -q
    N         : value for bwa samse -n
    alignpath : prefix prepended to the output file names by plain string
                concatenation (no path separator is inserted)
    bwa6      : if true use paths['bwa_6_2_home'], else paths['bwa_home']
    library   : object whose getRG('Data') result is indexed by fastq path
                and yields the read group fields for bwa samse -r
    threads   : value for bwa aln -t, also selects the resource request
    queue, partition, logger : forwarded to Moab
    add_aln   : extra options appended verbatim after "bwa aln ", or a
                false value for none

    Returns a tuple (ids of the samse jobs, dict fastq path -> bam path).
    Raises ValueError if a fastq format is not 'Illumina' or 'Sanger'.
    '''

    # option inserted into the bwa aln call for each accepted format
    aln_format_flag = {'Illumina': ' -I', 'Sanger': ''}

    # Check all formats before anything else so bad input fails fast.
    # Previously an unknown format left the aln options unassigned, which
    # either crashed on the first file or silently reused the options of the
    # previous file.
    for i, fq in enumerate(fastqs):
        if fqtypes[i] == 'Solexa':
            raise ValueError(
                'File %s is in Solexa format, convert to Sanger first\n' % fq)
        if fqtypes[i] not in aln_format_flag:
            raise ValueError(
                'File %s has unknown fastq format %s, must be Illumina or '
                'Sanger' % (fq, fqtypes[i]))

    import genobox_modules
    from genobox_classes import Moab
    import os
    paths = genobox_modules.setSystem()

    # setting cpus: cpuA for the samse jobs, cpuB for the (threaded) aln jobs
    cpuA = 'nodes=1:ppn=1,mem=7gb,walltime=345600'
    if threads == 1:
        cpuB = cpuA
    elif partition in ('uv', 'uv2'):
        cpuB = 'procs=%s,mem=5gb,walltime=345600,flags=sharedmem' % threads
    elif threads > 8:
        cpuB = 'nodes=1:ppn=%s,mem=7gb,walltime=345600' % threads
    else:
        cpuB = 'nodes=1:ppn=%s,mem=5gb,walltime=345600' % threads

    # get readgroups
    RG = library.getRG('Data')

    # the same bwa installation is used for aln and samse
    bwa_home = paths['bwa_6_2_home'] if bwa6 else paths['bwa_home']
    aln_cmd = bwa_home + 'bwa aln '
    if add_aln:
        aln_cmd = aln_cmd + add_aln

    # create one aln and one samse call per fastq
    bwa_align = []
    bwa_samse = []
    bamfiles_dict = dict()
    for i, fq in enumerate(fastqs):
        f = os.path.split(fq)[1]
        saifile = alignpath + f + '.sai'
        bamfile = alignpath + f + '.bam'
        bamfiles_dict[fq] = bamfile

        bwa_align.append('%s%s -t %i -q %i %s %s > %s' % (
            aln_cmd, aln_format_flag[fqtypes[i]], threads, qtrim, fa, fq,
            saifile))

        # read group fields are joined with a literal backslash-t, which is
        # what ends up inside the quoted -r argument of the command line
        bwa_samse.append(
            '%sbwa samse -n %i -r "%s" %s %s %s | %ssamtools view -Sb - > %s'
            % (bwa_home, N, '\\t'.join(RG[fq]), fa, saifile, fq,
               paths['samtools_home'], bamfile))

    # submit jobs
    # create moab instance for the align_calls and dispatch to queue
    bwa_align_moab = Moab(bwa_align,
                          logfile=logger,
                          runname='run_genobox_bwaalign',
                          queue=queue,
                          cpu=cpuB,
                          partition=partition)
    bwa_samse_moab = Moab(bwa_samse,
                          logfile=logger,
                          runname='run_genobox_bwasamse',
                          queue=queue,
                          cpu=cpuA,
                          depend=True,
                          depend_type='one2one',
                          depend_val=[1],
                          depend_ids=bwa_align_moab.ids,
                          partition=partition)

    # the explicit release calls are disabled, the message is kept so the
    # printed output stays the same
    print("Releasing jobs")

    return (bwa_samse_moab.ids, bamfiles_dict)
Beispiel #13
0
def bwasw_iontorrent(fastqs, fa, fqtypes, alignpath, bwa6, library, threads,
                     queue, partition, logger):
    '''Submit BWA-SW alignments of (Iontorrent) fastq files to the queue.

    For every fastq file one "bwa bwasw | samtools view -Sb" call writing
    <alignpath><name>.bam is created, where <name> is the basename of the
    fastq file.

    fastqs    : list of fastq paths
    fa        : reference argument handed to bwa bwasw
    fqtypes   : quality format per fastq, only 'Sanger' is accepted
    alignpath : prefix prepended to the output file names by plain string
                concatenation (no path separator is inserted)
    bwa6      : if true use paths['bwa_6_2_home'], else paths['bwa_home']
    library   : not used by this function
    threads   : value for bwa bwasw -t, also selects the resource request
    queue, partition, logger : forwarded to Moab

    Returns a tuple (ids of the submitted jobs, dict fastq path -> bam path).
    Raises ValueError if a fastq format is anything else than 'Sanger'.
    '''

    # Check all formats before anything else so bad input fails fast.
    # Previously a format that was neither Illumina nor Sanger left the
    # bwasw arguments unassigned, which either crashed on the first file or
    # silently repeated the call of the previous file.
    for i, fq in enumerate(fastqs):
        if fqtypes[i] == 'Illumina':
            raise ValueError(
                'BWA-SW should not align reads with Illumina Qualities')
        if fqtypes[i] != 'Sanger':
            raise ValueError(
                'File %s has unknown fastq format %s, must be Sanger' %
                (fq, fqtypes[i]))

    import genobox_modules
    from genobox_classes import Moab
    import os
    paths = genobox_modules.setSystem()

    # setting cpus
    if threads == 1:
        cpuB = 'nodes=1:ppn=1,mem=7gb,walltime=345600'
    elif partition in ('uv', 'uv2'):
        cpuB = 'procs=%s,mem=5gb,walltime=345600,flags=sharedmem' % threads
    elif threads > 8:
        cpuB = 'nodes=1:ppn=%s,mem=7gb,walltime=345600' % threads
    else:
        cpuB = 'nodes=1:ppn=%s,mem=5gb,walltime=345600' % threads

    # align
    if bwa6:
        cmd = paths['bwa_6_2_home'] + 'bwa '
    else:
        cmd = paths['bwa_home'] + 'bwa '

    bwa_align = []
    bamfiles_dict = dict()
    for fq in fastqs:
        bamfile = alignpath + os.path.split(fq)[1] + '.bam'
        bamfiles_dict[fq] = bamfile
        arg = ' bwasw -t %i %s %s |  %ssamtools view -Sb - > %s' % (
            threads, fa, fq, paths['samtools_home'], bamfile)
        bwa_align.append(cmd + arg)

    # submit jobs
    # create moab instance for the align_calls and dispatch to queue
    bwa_align_moab = Moab(bwa_align,
                          logfile=logger,
                          runname='run_genobox_bwaalign',
                          queue=queue,
                          cpu=cpuB,
                          partition=partition)

    # the explicit release call is disabled, the message is kept so the
    # printed output stays the same
    print("Releasing jobs")

    return (bwa_align_moab.ids, bamfiles_dict)
Beispiel #14
0
def bwa_pe_align(pe1, pe2, fa, fqtypes_pe1, fqtypes_pe2, qtrim, N, alignpath,
                 bwa6, a, library, threads, queue, add_aln, partition, logger):
    '''Submit paired-end bwa alignments (aln, aln, sampe) to the queue.

    For every pair (pe1[i], pe2[i]) two "bwa aln" calls writing
    <alignpath><name>.sai and one "bwa sampe | samtools view -Sb" call are
    created. The bam file of a pair is named after the basename of the pe1
    file. Each sampe job is submitted depending on the two aln jobs of its
    pair.

    pe1, pe2    : lists of fastq paths, first and second read of each pair
    fa          : reference argument handed to bwa aln and bwa sampe
    fqtypes_pe1,
    fqtypes_pe2 : quality format per fastq, 'Sanger' or 'Illumina' (adds -I
                  to bwa aln); both files of a pair must have the same format
    qtrim       : value for bwa aln -q
    N           : value for bwa sampe -n
    alignpath   : prefix prepended to the output file names by plain string
                  concatenation (no path separator is inserted)
    bwa6        : if true use paths['bwa_6_2_home'], else paths['bwa_home']
    a           : value for bwa sampe -a
    library     : object whose getRG('Data') result is indexed by the pe1
                  path and yields the read group fields for bwa sampe -r
    threads     : value for bwa aln -t, also selects the resource request
    queue, partition, logger : forwarded to Moab
    add_aln     : extra options added to the bwa aln calls, or a false value
                  for none

    Returns a tuple (ids of the sampe jobs, dict fastq path -> bam path);
    both files of a pair map to the same bam path.
    Raises ValueError if pe1 and pe2 differ in length, if the formats of a
    pair differ or if a format is not 'Sanger' or 'Illumina'.
    '''

    # Check the input before anything else so bad input fails fast.
    if len(pe1) != len(pe2):
        raise ValueError(
            'Number of pe1 files (%i) and pe2 files (%i) must be the same' %
            (len(pe1), len(pe2)))
    for i in range(len(pe1)):
        if fqtypes_pe1[i] != fqtypes_pe2[i]:
            raise ValueError('Fastq formats are not the same for %s and %s' %
                             (pe1[i], pe2[i]))
        if fqtypes_pe1[i] not in ('Sanger', 'Illumina'):
            raise ValueError('fqtype must be Sanger or Illumina')

    import genobox_modules
    from genobox_classes import Moab
    import os
    paths = genobox_modules.setSystem()

    # setting cpus: cpuA for the sampe jobs, cpuB for the (threaded) aln jobs
    # NOTE(review): the uv request uses walltime=172800 while every other
    # request here uses 345600 - confirm whether that is intended
    cpuA = 'nodes=1:ppn=1,mem=7gb,walltime=345600'
    if threads == 1:
        cpuB = cpuA
    elif partition in ('uv', 'uv2'):
        cpuB = 'procs=%s,mem=5gb,walltime=172800,flags=sharedmem' % threads
    elif threads > 8:
        cpuB = 'nodes=1:ppn=%s,mem=7gb,walltime=345600' % threads
    else:
        cpuB = 'nodes=1:ppn=%s,mem=5gb,walltime=345600' % threads

    # get readgroups
    RG = library.getRG('Data')

    # the same bwa installation is used for aln and sampe
    bwa_home = paths['bwa_6_2_home'] if bwa6 else paths['bwa_home']
    cmd = bwa_home + 'bwa '

    bwa_align1_calls = []
    bwa_align2_calls = []
    bwa_sampe_calls = []
    bamfiles_dict = dict()

    for i, fq in enumerate(pe1):
        # set input fastq format (already validated above)
        if fqtypes_pe1[i] == 'Sanger':
            bwa_cmd = '%s aln' % cmd
        else:
            bwa_cmd = '%s aln -I ' % cmd

        # A separator is required here: the Sanger command ends in "aln", so
        # appending add_aln directly would fuse the first extra option with
        # the subcommand name.
        if add_aln:
            bwa_cmd = '%s %s' % (bwa_cmd, add_aln)

        # set filenames
        f1 = os.path.split(pe1[i])[1]
        f2 = os.path.split(pe2[i])[1]
        saifile1 = alignpath + f1 + '.sai'
        saifile2 = alignpath + f2 + '.sai'
        bamfile = alignpath + f1 + '.bam'

        bamfiles_dict[pe1[i]] = bamfile
        bamfiles_dict[pe2[i]] = bamfile

        # generate calls
        bwa_align1_calls.append('%s -t %s -q %i %s -f %s %s ' % (
            bwa_cmd, threads, qtrim, fa, saifile1, pe1[i]))
        bwa_align2_calls.append('%s -t %s -q %i %s -f %s %s ' % (
            bwa_cmd, threads, qtrim, fa, saifile2, pe2[i]))
        # read group fields are joined with a literal backslash-t, which is
        # what ends up inside the quoted -r argument of the command line
        bwa_sampe_calls.append(
            '%sbwa sampe -n %i -a %i -r "%s" %s %s %s %s %s | '
            '%ssamtools view -Sb - > %s' % (
                bwa_home, N, a, '\\t'.join(RG[fq]), fa, saifile1, saifile2,
                pe1[i], pe2[i], paths['samtools_home'], bamfile))

    # submit jobs
    # create moab instance for the align_calls and dispatch to queue
    bwa_align1_moab = Moab(bwa_align1_calls,
                           logfile=logger,
                           runname='run_genobox_bwaalign1',
                           queue=queue,
                           cpu=cpuB,
                           partition=partition)
    bwa_align2_moab = Moab(bwa_align2_calls,
                           logfile=logger,
                           runname='run_genobox_bwaalign2',
                           queue=queue,
                           cpu=cpuB,
                           partition=partition)

    # interleave the job ids (aln1 of pair 0, aln2 of pair 0, aln1 of pair
    # 1, ...) so the two ids belonging to one pair are adjacent
    bwa_alignids = []
    for id1, id2 in zip(bwa_align1_moab.ids, bwa_align2_moab.ids):
        bwa_alignids.append(id1)
        bwa_alignids.append(id2)

    # submit sampe
    bwa_sampe_moab = Moab(bwa_sampe_calls,
                          logfile=logger,
                          runname='run_genobox_bwasampe',
                          queue=queue,
                          cpu=cpuA,
                          depend=True,
                          depend_type='conc',
                          depend_val=[2],
                          depend_ids=bwa_alignids,
                          partition=partition)

    # the explicit release calls are disabled, the message is kept so the
    # printed output stays the same
    print("Releasing jobs")

    return (bwa_sampe_moab.ids, bamfiles_dict)
Beispiel #15
0
def start_genotyping(bam, chr, fa, prior, pp, queue, o, sample, partition,
                     logger):
    '''Run the samtools genotyping job chain for a bam file and wait for it.

    Creates the directory 'genotyping' if it is missing, builds the calls
    with bam_index, mpileup, bcf_combine, bcf_index and consensus, and
    submits four dependent Moab job sets: bam index -> mpileup -> bcf combine
    -> bcf index. The consensus calls are built but not submitted. The
    function blocks on a Semaphore for the bcf index jobs, then removes the
    intermediate bcf files returned by mpileup.

    bam, chr, fa, prior, pp : handed to bam_index / mpileup
    o         : handed to bcf_combine, bcf_index and consensus; returned
    sample    : handed to consensus
    queue, partition, logger : forwarded to Moab (queue also to Semaphore)

    Returns o.
    '''

    import os
    import subprocess
    import genobox_modules
    from genobox_classes import Moab, Semaphore

    if not os.path.exists('genotyping'):
        os.makedirs('genotyping')

    # set queueing (the return value of setSystem is not needed here)
    genobox_modules.setSystem()
    home = os.getcwd()
    single_core = 'nodes=1:ppn=1,mem=2gb,walltime=172800'
    dual_core = 'nodes=1:ppn=2,mem=2gb,walltime=172800'

    # create calls
    bamindex_calls = bam_index(bam)
    mpileup_calls, bcffiles = mpileup(bam, chr, fa, prior, pp)
    bcfcombine_calls = bcf_combine(bcffiles, o)
    bcfindex_calls = bcf_index(o)
    consensus(o, sample)  # result unused, the consensus jobs are not submitted

    # keyword arguments shared by every submission
    shared = dict(logfile=logger, queue=queue, partition=partition)
    n_pileups = len(mpileup_calls)

    # submit jobs, each step depending on the ids of the step before it
    print("Submitting jobs")
    index_jobs = Moab(bamindex_calls,
                      runname='run_genobox_bamindex',
                      cpu=single_core,
                      **shared)
    pileup_jobs = Moab(mpileup_calls,
                       runname='run_genobox_mpileup',
                       cpu=dual_core,
                       depend=True,
                       depend_type='expand',
                       depend_val=[n_pileups],
                       depend_ids=index_jobs.ids,
                       **shared)
    combine_jobs = Moab(bcfcombine_calls,
                        runname='run_genobox_bcfcombine',
                        cpu=single_core,
                        depend=True,
                        depend_type='conc',
                        depend_val=[n_pileups],
                        depend_ids=pileup_jobs.ids,
                        **shared)
    bcfindex_jobs = Moab(bcfindex_calls,
                         runname='run_genobox_bcfindex',
                         cpu=single_core,
                         depend=True,
                         depend_type='one2one',
                         depend_val=[1],
                         depend_ids=combine_jobs.ids,
                         **shared)

    # no explicit release is performed, only the message is printed
    print("Releasing jobs")

    # block until the last step of the chain is done
    # NOTE(review): 20 and 2 * 86400 are presumably the polling interval and
    # the maximum wait in seconds - confirm against Semaphore
    print("Waiting for jobs to finish ...")
    waiter = Semaphore(bcfindex_jobs.ids, home, 'genotyping', queue, 20,
                       2 * 86400)
    waiter.wait()
    print("--------------------------------------")

    # remove temporary files
    genobox_modules.rm_files(bcffiles)

    # return output bcf
    return o