Ejemplo n.º 1
0
def start_vcffilter_gatk(vcfs, genome, fa, Q, rmsk, ab, prune, queue, dir,
                         partition, logger):
    '''Submit one GATK vcf-filter job per vcf file and wait on them.

    A genobox_vcffilter_gatk_h.py command line is built for every entry of
    vcfs from fa, genome and Q.  Optional flags are appended only when they
    carry a non-default value: --rmsk when rmsk is truthy, --ab when ab is
    not 0.5 and --prune when prune is not 0.  The commands are handed to
    Moab, and a Semaphore built from the resulting job ids is waited on.

    The 'genotyping' and 'tmp' directories are created in the current
    working directory when missing.  dir is accepted but not used here,
    and the function returns nothing.
    '''

    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore
    import subprocess
    import os

    # make sure the working directories exist
    for workdir in ('genotyping', 'tmp'):
        if not os.path.exists(workdir):
            os.makedirs(workdir)

    # queueing setup; only the 2-core / 7gb resource request is used here
    paths = genobox_modules.setSystem()
    home = os.getcwd()
    resources = 'nodes=1:ppn=2,mem=7gb,walltime=172800'

    # one command line per input vcf (one per chromosome)
    script = paths['genobox_home'] + 'genobox_vcffilter_gatk_h.py'
    vcffilter_calls = []
    for vcf in vcfs:
        options = [' --vcf %s --fa %s --genome %s --Q %f' % (vcf, fa, genome, Q)]
        if rmsk:
            options.append(' --rmsk %s' % rmsk)
        if ab != 0.5:
            options.append(' --ab %f' % ab)
        if prune != 0:
            options.append(' --prune %i' % prune)
        vcffilter_calls.append(script + ''.join(options))

    # submit jobs
    print("Submitting jobs")
    vcffilter_moab = Moab(vcffilter_calls,
                          logfile=logger,
                          runname='run_genobox_vcffilter_gatk',
                          queue=queue,
                          cpu=resources,
                          partition=partition)

    # the explicit release() call is disabled; only the message is printed
    print("Releasing jobs")

    # wait on the submitted job ids
    print("Waiting for jobs to finish ...")
    waiter = Semaphore(vcffilter_moab.ids, home, 'vcffilter_gatk', queue, 20,
                       2 * 86400)
    waiter.wait()
    print("--------------------------------------")
Ejemplo n.º 2
0
def start_genotyping(bam, chr, fa, prior, pp, queue, o, sample, partition, logger):
    '''Run the samtools genotyping chain on a bam file and return o.

    Four dependent job groups are submitted through Moab: bam indexing,
    mpileup (one dependency fan-out on the bam index jobs), combination of
    the per-call bcf files into o, and indexing of o.  A Semaphore built
    from the bcf-index job ids is waited on, the intermediate bcf files
    reported by mpileup() are removed with genobox_modules.rm_files(), and
    o is returned.

    Consensus calls are still generated but their submission is disabled,
    so they are never run or waited for.  The 'genotyping' directory is
    created when missing.
    '''

    import subprocess
    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore
    import os

    if not os.path.exists('genotyping'):
        os.makedirs('genotyping')

    # queueing setup (the return value of setSystem() is not needed here)
    genobox_modules.setSystem()
    home = os.getcwd()
    single_core = 'nodes=1:ppn=1,mem=2gb,walltime=172800'
    dual_core = 'nodes=1:ppn=2,mem=2gb,walltime=172800'

    # build every command list before anything is submitted
    bamindex_calls = bam_index(bam)
    (mpileup_calls, bcffiles) = mpileup(bam, chr, fa, prior, pp)
    bcfcombine_calls = bcf_combine(bcffiles, o)
    bcfindex_calls = bcf_index(o)
    consensus_calls = consensus(o, sample)  # generated, submission disabled

    # keyword arguments shared by every submission
    shared = dict(logfile=logger, queue=queue, partition=partition)

    print("Submitting jobs")
    bamindex_moab = Moab(bamindex_calls,
                         runname='run_genobox_bamindex',
                         cpu=single_core,
                         **shared)
    mpileup_moab = Moab(mpileup_calls,
                        runname='run_genobox_mpileup',
                        cpu=dual_core,
                        depend=True,
                        depend_type='expand',
                        depend_val=[len(mpileup_calls)],
                        depend_ids=bamindex_moab.ids,
                        **shared)
    bcfcombine_moab = Moab(bcfcombine_calls,
                           runname='run_genobox_bcfcombine',
                           cpu=single_core,
                           depend=True,
                           depend_type='conc',
                           depend_val=[len(mpileup_calls)],
                           depend_ids=mpileup_moab.ids,
                           **shared)
    bcfindex_moab = Moab(bcfindex_calls,
                         runname='run_genobox_bcfindex',
                         cpu=single_core,
                         depend=True,
                         depend_type='one2one',
                         depend_val=[1],
                         depend_ids=bcfcombine_moab.ids,
                         **shared)

    # the explicit release() calls are disabled; only the message is printed
    print("Releasing jobs")

    # wait on the last stage of the chain (the bcf index jobs)
    print("Waiting for jobs to finish ...")
    waiter = Semaphore(bcfindex_moab.ids, home, 'genotyping', queue, 20, 2*86400)
    waiter.wait()
    print("--------------------------------------")

    # remove the intermediate bcf files
    genobox_modules.rm_files(bcffiles)

    # hand back the combined output bcf
    return o
Ejemplo n.º 3
0
def start_dbsnp(vcf, ex, dbsnp, o, queue, partition, logger):
    '''Annotate a vcf.gz file with dbSNP through a single queued job.

    According to the original description the helper script exchanges
    chromosome names to the dbSNP version, sorts the vcf and feeds it to
    dbSNP.  Here only the genobox_dbsnp_h.py command line is built from
    vcf, ex, dbsnp and o, submitted through Moab and waited on.

    Returns vcf unchanged when no dbsnp file is given (falsy or the string
    'None'); otherwise returns o after the wait.  The 'genotyping'
    directory is created when missing.
    '''

    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore
    import subprocess
    import os

    # nothing to annotate with: pass the input straight through
    if not dbsnp or dbsnp == 'None':
        print("No dbsnp file given - skipping")
        print("--------------------------------------")
        return vcf

    if not os.path.exists('genotyping'):
        os.makedirs('genotyping')

    # queueing setup; a single-core / 2gb request is all this job uses
    paths = genobox_modules.setSystem()
    home = os.getcwd()
    resources = 'nodes=1:ppn=1,mem=2gb,walltime=172800'

    # the single annotation command
    script = paths['genobox_home'] + 'genobox_dbsnp_h.py'
    options = ' --vcf %s --ex %s --dbsnp %s --o %s' % (vcf, ex, dbsnp, o)
    dbsnp_calls = [script + options]

    print("Submitting jobs")
    dbsnp_moab = Moab(dbsnp_calls,
                      logfile=logger,
                      runname='run_genobox_dbsnp',
                      queue=queue,
                      cpu=resources,
                      partition=partition)

    # the explicit release() call is disabled; only the message is printed
    print("Releasing jobs")

    print("Waiting for jobs to finish ...")
    waiter = Semaphore(dbsnp_moab.ids, home, 'dbsnp', queue, 20, 2 * 86400)
    waiter.wait()
    print("--------------------------------------")

    return o
Ejemplo n.º 4
0
def start_vcffilter_gatk(vcfs, genome, fa, Q, rmsk, ab, prune, queue, dir, partition, logger):
    '''Submit one GATK vcf-filter job per vcf file and wait on them.

    For each entry of vcfs a genobox_vcffilter_gatk_h.py command line is
    assembled from fa, genome and Q.  --rmsk is added when rmsk is truthy,
    --ab when ab is not 0.5 and --prune when prune is not 0.  All commands
    go to Moab in one submission and a Semaphore built from the job ids is
    waited on.

    Creates the 'genotyping' and 'tmp' directories when missing.  dir is
    accepted but unused.  No value is returned.
    '''

    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore
    import subprocess
    import os

    # make sure the working directories exist
    for workdir in ('genotyping', 'tmp'):
        if not os.path.exists(workdir):
            os.makedirs(workdir)

    # queueing setup; only the 2-core / 7gb resource request is used here
    paths = genobox_modules.setSystem()
    home = os.getcwd()
    resources = 'nodes=1:ppn=2,mem=7gb,walltime=172800'

    script = paths['genobox_home'] + 'genobox_vcffilter_gatk_h.py'

    def command_for(vcf):
        # mandatory options first, optional ones only when set
        pieces = [script,
                  ' --vcf %s --fa %s --genome %s --Q %f' % (vcf, fa, genome, Q)]
        if rmsk:
            pieces.append(' --rmsk %s' % rmsk)
        if ab != 0.5:
            pieces.append(' --ab %f' % ab)
        if prune != 0:
            pieces.append(' --prune %i' % prune)
        return ''.join(pieces)

    # one command per input vcf (one per chromosome)
    vcffilter_calls = [command_for(vcf) for vcf in vcfs]

    print("Submitting jobs")
    vcffilter_moab = Moab(vcffilter_calls,
                          logfile=logger,
                          runname='run_genobox_vcffilter_gatk',
                          queue=queue,
                          cpu=resources,
                          partition=partition)

    # the explicit release() call is disabled; only the message is printed
    print("Releasing jobs")

    print("Waiting for jobs to finish ...")
    waiter = Semaphore(vcffilter_moab.ids, home, 'vcffilter_gatk', queue, 20, 2*86400)
    waiter.wait()
    print("--------------------------------------")

    # NOTE: no filename is returned; callers receive None
   
Ejemplo n.º 5
0
def start_dbsnp(vcf, ex, dbsnp, o, queue, partition, logger):
    '''Annotate a vcf.gz file with dbSNP through a single queued job.

    The original description of the helper script: chromosome names are
    exchanged to the dbSNP version, the vcf is sorted and used as input to
    dbSNP.  This function builds the genobox_dbsnp_h.py command line from
    vcf, ex, dbsnp and o, submits it through Moab and waits on it.

    When dbsnp is falsy or the string 'None' the step is skipped and vcf is
    returned as is; otherwise o is returned once the wait is over.  The
    'genotyping' directory is created when missing.
    '''

    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore
    import subprocess
    import os

    # no annotation source: skip and return the input file name
    if not dbsnp or dbsnp == 'None':
        print("No dbsnp file given - skipping")
        print("--------------------------------------")
        return vcf

    if not os.path.exists('genotyping'):
        os.makedirs('genotyping')

    # queueing setup; the job runs with a single-core / 2gb request
    paths = genobox_modules.setSystem()
    home = os.getcwd()
    resources = 'nodes=1:ppn=1,mem=2gb,walltime=172800'

    # build the one command that does the annotation
    dbsnp_calls = [
        paths['genobox_home'] + 'genobox_dbsnp_h.py'
        + ' --vcf %s --ex %s --dbsnp %s --o %s' % (vcf, ex, dbsnp, o)
    ]

    print("Submitting jobs")
    dbsnp_moab = Moab(dbsnp_calls,
                      logfile=logger,
                      runname='run_genobox_dbsnp',
                      queue=queue,
                      cpu=resources,
                      partition=partition)

    # the explicit release() call is disabled; only the message is printed
    print("Releasing jobs")

    print("Waiting for jobs to finish ...")
    waiter = Semaphore(dbsnp_moab.ids, home, 'dbsnp', queue, 20, 2*86400)
    waiter.wait()
    print("--------------------------------------")

    return o
Ejemplo n.º 6
0
def start_genotyping_gatk(bam, genome, fa, dbsnp, call_conf, call_emit, output_mode, queue, sample, partition, logger):
    '''Index a bam file and run the unified_genotyper calls on it.

    Two job groups are submitted through Moab: the bam index calls from
    bam_index(), and the genotyping calls from unified_genotyper(), which
    depend on the index jobs.  A Semaphore built from the genotyping job
    ids is waited on and the list of output variant files reported by
    unified_genotyper() is returned.

    sample is accepted but not used.  The 'genotyping' directory is created
    when missing.  Temporary files are not removed by this function.
    '''

    import subprocess
    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore
    import os

    if not os.path.exists('genotyping'):
        os.makedirs('genotyping')

    # queueing setup (the return value of setSystem() is not needed here)
    genobox_modules.setSystem()
    home = os.getcwd()
    index_resources = 'nodes=1:ppn=1,mem=2gb,walltime=172800'
    gatk_resources = 'nodes=1:ppn=1,mem=3gb,walltime=172800'

    # build both command lists before submitting anything
    bamindex_calls = bam_index(bam)
    (gatk_calls, vcffiles) = unified_genotyper(bam, genome, fa, dbsnp, call_conf, call_emit, output_mode)

    # keyword arguments shared by both submissions
    shared = dict(logfile=logger, queue=queue, partition=partition)

    print("Submitting jobs")
    bamindex_moab = Moab(bamindex_calls,
                         runname='run_genobox_bamindex',
                         cpu=index_resources,
                         **shared)
    gatk_moab = Moab(gatk_calls,
                     runname='run_genobox_genotyping_gatk',
                     cpu=gatk_resources,
                     depend=True,
                     depend_type='expand',
                     depend_val=[len(gatk_calls)],
                     depend_ids=bamindex_moab.ids,
                     **shared)

    # the explicit release() calls are disabled; only the message is printed
    print("Releasing jobs")

    # wait on the genotyping jobs
    print("Waiting for jobs to finish ...")
    waiter = Semaphore(gatk_moab.ids, home, 'genotyping', queue, 20, 2*86400)
    waiter.wait()
    print("--------------------------------------")

    # output variant files, as reported by unified_genotyper()
    return vcffiles
Ejemplo n.º 7
0
def start_vcffilter(bcf, genome, caller, Q, ex, rmsk, ab, prune, o, queue, dir, partition, logger):
    '''Submit the variant vcf-filter job, release it and wait for it.

    Only caller == 'samtools' is supported.  Any other value used to reach
    the submission step without a command having been built and crashed
    there with a NameError, after the working directories had already been
    created.  Such callers are now rejected immediately with a ValueError.

    The genome file must be given, with one tab-separated line per
    chromosome:
    chrom, chrom_len, chrom_short_name, haploid/diploid, low_depth, high_depth

    Filtering steps (as listed in the original description):
    vcfutils.pl varFilter
    annotated repeats using rmsk
    heterozygote variants on haploid chromosomes
    allelic balance
    pruning of variants within N nt of each other

    Parameters:
    bcf, genome, caller, Q, rmsk, ab, prune -- forwarded to
        genobox_vcffilter_h.py as command line options (Q and ab are
        formatted with %f, prune with %i)
    ex -- accepted but not used
    o -- output file name; when dir is set (truthy and not the string
        'None') the job writes to '<dirname(o)>/<dir>.<basename(o)>'
    queue, partition, logger -- forwarded to Moab (logger as logfile)

    Returns o unchanged.
    Raises ValueError when caller is not 'samtools'.
    '''

    # fail fast, before touching the file system or the queue
    if caller != 'samtools':
        raise ValueError(
            "start_vcffilter only supports caller 'samtools', got %r" % (caller,))

    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore
    import subprocess
    import os

    for workdir in ('genotyping', 'genotyping/tmp'):
        if not os.path.exists(workdir):
            os.makedirs(workdir)

    # queueing setup; the filter job uses a single-core / 5gb request
    paths = genobox_modules.setSystem()
    home = os.getcwd()
    resources = 'nodes=1:ppn=1,mem=5gb,walltime=172800'

    # choose the file the job writes to
    if dir and dir != 'None':
        (out_dir, out_name) = os.path.split(o)
        outfile = '%s/%s.%s' % (out_dir, dir, out_name)
    else:
        outfile = o

    # create command
    cmd = paths['genobox_home'] + 'genobox_vcffilter_h.py'
    arg = ' --bcf %s --genome %s --caller %s --Q %f --rmsk %s --ab %f --prune %i --o %s' % (bcf, genome, caller, Q, rmsk, ab, prune, outfile)
    vcffilter_calls = [cmd + arg]

    # submit jobs
    print("Submitting jobs")
    vcffilter_moab = Moab(vcffilter_calls, logfile=logger, runname='run_genobox_vcffilter', queue=queue, cpu=resources, partition=partition)

    # release jobs
    print("Releasing jobs")
    vcffilter_moab.release()

    # wait on the submitted job ids
    print("Waiting for jobs to finish ...")
    s = Semaphore(vcffilter_moab.ids, home, 'vcffilter', queue, 20, 2*86400)
    s.wait()
    print("--------------------------------------")

    # NOTE(review): o is returned even when the job wrote to the
    # dir-prefixed outfile - confirm callers expect that.
    return o
Ejemplo n.º 8
0
def start_vcffilter(bcf, genome, caller, Q, ex, rmsk, ab, prune, o, queue, dir,
                    partition, logger):
    '''Submit the variant vcf-filter job and wait for it.

    Only caller == 'samtools' is supported.  Any other value used to reach
    the submission step without a command having been built and crashed
    there with a NameError, after the working directories had already been
    created.  Such callers are now rejected immediately with a ValueError.

    The genome file must be given, with one tab-separated line per
    chromosome:
    chrom, chrom_len, chrom_short_name, haploid/diploid, low_depth, high_depth

    Filtering steps (as listed in the original description):
    vcfutils.pl varFilter
    annotated repeats using rmsk
    heterozygote variants on haploid chromosomes
    allelic balance
    pruning of variants within N nt of each other

    Parameters:
    bcf, genome, caller, Q, rmsk, ab, prune -- forwarded to
        genobox_vcffilter_h.py as command line options (Q and ab are
        formatted with %f, prune with %i)
    ex -- accepted but not used
    o -- output file name; when dir is set (truthy and not the string
        'None') the job writes to '<dirname(o)>/<dir>.<basename(o)>'
    queue, partition, logger -- forwarded to Moab (logger as logfile)

    Returns o unchanged.
    Raises ValueError when caller is not 'samtools'.
    '''

    # fail fast, before touching the file system or the queue
    if caller != 'samtools':
        raise ValueError(
            "start_vcffilter only supports caller 'samtools', got %r" %
            (caller,))

    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore
    import subprocess
    import os

    for workdir in ('genotyping', 'genotyping/tmp'):
        if not os.path.exists(workdir):
            os.makedirs(workdir)

    # queueing setup; the filter job uses a single-core / 5gb request
    paths = genobox_modules.setSystem()
    home = os.getcwd()
    resources = 'nodes=1:ppn=1,mem=5gb,walltime=172800'

    # choose the file the job writes to
    if dir and dir != 'None':
        (out_dir, out_name) = os.path.split(o)
        outfile = '%s/%s.%s' % (out_dir, dir, out_name)
    else:
        outfile = o

    # create command
    cmd = paths['genobox_home'] + 'genobox_vcffilter_h.py'
    arg = ' --bcf %s --genome %s --caller %s --Q %f --rmsk %s --ab %f --prune %i --o %s' % (
        bcf, genome, caller, Q, rmsk, ab, prune, outfile)
    vcffilter_calls = [cmd + arg]

    # submit jobs
    print("Submitting jobs")
    vcffilter_moab = Moab(vcffilter_calls,
                          logfile=logger,
                          runname='run_genobox_vcffilter',
                          queue=queue,
                          cpu=resources,
                          partition=partition)

    # the explicit release() call is disabled; only the message is printed
    print("Releasing jobs")

    # wait on the submitted job ids
    print("Waiting for jobs to finish ...")
    s = Semaphore(vcffilter_moab.ids, home, 'vcffilter', queue, 20, 2 * 86400)
    s.wait()
    print("--------------------------------------")

    # NOTE(review): o is returned even when the job wrote to the
    # dir-prefixed outfile - confirm callers expect that.
    return o
Ejemplo n.º 9
0
def start_alignment(args, logger):
    '''Submit BWA alignments for the fastq files in args and wait on them.

    A library is initialised from args (with args.mapq when that attribute
    exists, otherwise with [30]) and the reference is checked with
    check_fa().  Single-end files are grouped by platform as reported by
    library.getPL('Data'): ILLUMINA/HELICOS go to bwa_se_align(), PACBIO to
    bwasw_pacbio() and IONTORRENT/454 to bwasw_iontorrent(); other
    platforms are skipped.  Paired-end files go to bwa_pe_align().

    The quality format of each fastq is args.quals when that is set,
    otherwise it is detected with check_formats_fq().  The library is
    updated with the bam files, a Semaphore built from all job ids is
    waited on, and (bamfiles, library) is returned.

    Raises ValueError when --pe1 and --pe2 differ in length.  The
    'alignment' directory is created when missing.
    '''

    import genobox_modules
    from genobox_classes import Semaphore, Library
    import subprocess
    import os
    import random
    import string

    # queueing setup (the return value of setSystem() is not needed here)
    genobox_modules.setSystem()
    home = os.getcwd()
    semaphore_ids = []
    bamfiles = dict()

    if not os.path.exists('alignment'):
        os.makedirs('alignment')

    # args.mapq is defined when called from abgv; plain alignment uses [30]
    mapq = args.mapq if hasattr(args, 'mapq') else [30]
    library = genobox_modules.initialize_library(args.libfile, args.se,
                                                 args.pe1, args.pe2,
                                                 args.sample, mapq,
                                                 args.libs, args.pl)

    # check for fa
    check_fa(args.fa, args.bwa6)

    def quality_formats(fastqs):
        # user-supplied format wins; otherwise detect it per file
        return [
            args.quals if args.quals else check_formats_fq(
                fq, args.gz, args.bwa6) for fq in fastqs
        ]

    # start single end alignments
    if args.se:

        # get platform info
        (PL, PL2data) = library.getPL('Data')

        print("Submitting single end alignments")
        for platform, fastqs in PL2data.items():
            short_reads = platform == 'ILLUMINA' or platform == 'HELICOS'
            pacbio = platform == 'PACBIO'
            flow_reads = platform == 'IONTORRENT' or platform == '454'
            if not (short_reads or pacbio or flow_reads):
                continue

            # keep only the files that were given as single end
            toalign = [fq for fq in fastqs if fq in args.se]
            fqtypes_se = quality_formats(toalign)

            # submit with the aligner that matches the platform
            if short_reads:
                (se_align_ids, bamfiles_se) = bwa_se_align(
                    toalign, args.fa, fqtypes_se, args.qtrim, args.N,
                    'alignment/', args.bwa6, library, args.n, args.queue,
                    args.add_aln, args.partition, logger)
            elif pacbio:
                (se_align_ids, bamfiles_se) = bwasw_pacbio(
                    toalign, args.fa, fqtypes_se, 'alignment/', args.bwa6,
                    library, args.n, args.queue, args.partition, logger)
            else:
                (se_align_ids, bamfiles_se) = bwasw_iontorrent(
                    toalign, args.fa, fqtypes_se, 'alignment/', args.bwa6,
                    library, args.n, args.queue, args.partition, logger)
            semaphore_ids.extend(se_align_ids)
            bamfiles.update(bamfiles_se)

    # start paired end alignments
    if args.pe1:
        if len(args.pe1) != len(args.pe2):
            raise ValueError(
                'Same number of files must be given to --pe1 and --pe2')

        # set fqtypes
        fqtypes_pe1 = quality_formats(args.pe1)
        fqtypes_pe2 = quality_formats(args.pe2)

        # submit
        print("Submitting paired end alignments")
        (pe_align_ids, bamfiles_pe) = bwa_pe_align(
            args.pe1, args.pe2, args.fa, fqtypes_pe1, fqtypes_pe2, args.qtrim,
            args.N, 'alignment/', args.bwa6, args.a, library, args.n,
            args.queue, args.add_aln, args.partition, logger)
        semaphore_ids.extend(pe_align_ids)
        bamfiles.update(bamfiles_pe)

    # record the bam files in the library
    library.update_with_tag('Data', 'BAM', bamfiles, True)

    # wait on every submitted alignment job
    print("Waiting for jobs to finish ...")

    waiter = Semaphore(semaphore_ids, home, 'bwa_alignment', args.queue, 60,
                       345600)
    waiter.wait()

    print("--------------------------------------")

    return (bamfiles, library)
Ejemplo n.º 10
0
def start_bamprocess(library_file, bams, mapq, libs, tmpdir, queue, final_bam, realignment, known, fa, sample, partition, logger):
    '''Submit the bam processing chain and return final_bam.

    library_file may be a Library instance (used as is), a file name (read
    into a new Library) or falsy / the string 'None' (a library is then
    initialised from sample, mapq, libs and bams).

    The chain submitted through Moab is: filter and sort each bam, merge
    the bams of each library, rmdup each library, merge everything into
    final_bam and, when realignment is truthy, realign final_bam.  A
    Semaphore built from the ids of the last stage is waited on, then
    final_bam is returned.
    '''

    import subprocess
    import genobox_modules
    from genobox_classes import Moab, Semaphore, Library
    import os

    # queueing setup (the return value of setSystem() is not needed here)
    genobox_modules.setSystem()
    home = os.getcwd()
    cpuC = 'nodes=1:ppn=1,mem=2gb,walltime=345600'
    cpuE = 'nodes=1:ppn=1,mem=5gb,walltime=345600'
    cpuH = 'nodes=1:ppn=2,mem=7gb,walltime=345600'

    # obtain a library instance
    if library_file and library_file != 'None':
        if isinstance(library_file, Library):
            library = library_file
        else:
            library = Library(library_file)
            library.read()
    else:
        library = genobox_modules.initialize_library(libfile=library_file, sample=sample, mapq=mapq, libs=libs, bams=bams)

    (bam2lib, lib2bam) = library.getBamLibs()

    ## CREATE CALLS ##

    # filter bam and sort
    (filter_sort_calls, filter_sort_files) = bam_filter_sort(lib2bam, bam2lib, 1500000000)

    # merge to libs
    (merge_lib_calls, librarys) = merge_bam(lib2bam.keys(), lib2bam.values(), add_suffix=True, final_suffix='.flt.sort.bam', tmpdir=tmpdir)

    # rmdup on libs
    (rmdup_calls, rmdup_files) = rmdup(librarys, tmpdir)

    # merge to the final file; the same call is needed with and without
    # realignment
    (merge_final_call, merged_file) = merge_bam([final_bam], [rmdup_files], add_suffix=False)

    # optional: realignment of the merged file
    if realignment:
        (realign_calls, realigned_file) = realign_bam(final_bam, final_bam, fa, known)

    ## SUBMIT JOBS ##

    # keyword arguments shared by every submission
    shared = dict(logfile=logger, queue=queue, partition=partition)

    print("Submitting jobs")
    filtersort_moab = Moab(filter_sort_calls,
                           runname='run_genobox_filtersort',
                           cpu=cpuH,
                           **shared)
    mergelib_moab = Moab(merge_lib_calls,
                         runname='run_genobox_lib_merge',
                         cpu=cpuE,
                         depend=True,
                         depend_type='complex',
                         depend_val=[len(lib_bams) for lib_bams in lib2bam.values()],
                         depend_ids=filtersort_moab.ids,
                         **shared)
    # NB: If memory should be changed, also change java memory spec in rmdup function
    rmdup_moab = Moab(rmdup_calls,
                      runname='run_genobox_rmdup',
                      cpu=cpuE,
                      depend=True,
                      depend_type='one2one',
                      depend_val=[1],
                      depend_ids=mergelib_moab.ids,
                      **shared)
    mergefinal_moab = Moab(merge_final_call,
                           runname='run_genobox_final_merge',
                           cpu=cpuC,
                           depend=True,
                           depend_type='conc',
                           depend_val=[len(rmdup_moab.ids)],
                           depend_ids=rmdup_moab.ids,
                           **shared)
    last_moab = mergefinal_moab
    if realignment:
        # realignment calls need to be written together in a shell-file or
        # be dependent on each other
        realign_moab = Moab(realign_calls,
                            runname='run_genobox_realignment',
                            cpu=cpuE,
                            depend=True,
                            depend_type='one2one',
                            depend_val=[1],
                            depend_ids=mergefinal_moab.ids,
                            **shared)
        last_moab = realign_moab

    # the explicit release() calls are disabled; only the message is printed
    print("Releasing jobs")

    # wait on the last stage of the chain
    print("Waiting for jobs to finish ...")
    waiter = Semaphore(last_moab.ids, home, 'bam_processing', queue, 20, 2*86400)
    waiter.wait()
    print("--------------------------------------")

    # return final bamfile
    return final_bam
Ejemplo n.º 11
0
def start_bcf2ref(bcf, genome_file, Q, ex, dbsnp, rmsk, indels, o, queue, dir, partition, logger):
    '''Extract high confidence same-as-reference bases from a bcf.

    One genobox_bcf2ref_h.py job is submitted per entry of the genome file
    (read with get_genome()).  According to the original description the
    options are to exchange ids, annotate using dbsnp, filter rmsk and
    filter ambiguous indel positions; ex, dbsnp, rmsk and indels are passed
    straight through as command line options.

    Output naming, relative to o:
    - single chromosome, no dir:    o
    - single chromosome, dir set:   <dirname(o)>/<dir>.<basename(o)>
    - several chromosomes, dir set: <dirname(o)>/<dir>.<chr[2]>.<basename(o)>
    - several chromosomes, no dir:  <dirname(o)>/<chr[2]>.<basename(o)>
    where "dir set" means truthy and not the string 'None'.

    A Semaphore built from the job ids is waited on.  Nothing is returned.
    The 'genotyping' directory is created when missing.
    '''

    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore
    import subprocess
    import os

    if not os.path.exists('genotyping'):
        os.makedirs('genotyping')

    # queueing setup; every job uses a single-core / 5gb request
    paths = genobox_modules.setSystem()
    home = os.getcwd()
    resources = 'nodes=1:ppn=1,mem=5gb,walltime=172800'

    # read genome file
    genome = get_genome(genome_file)

    def outfile_for(chrom):
        # name of the file the job for this chromosome writes to
        prefixed = dir and dir != 'None'
        single = len(genome) == 1
        if single and not prefixed:
            return o
        (out_dir, out_name) = os.path.split(o)
        if single:
            return '%s/%s.%s' % (out_dir, dir, out_name)
        if prefixed:
            return '%s/%s.%s.%s' % (out_dir, dir, chrom[2], out_name)
        return '%s/%s.%s' % (out_dir, chrom[2], out_name)

    # one command per chromosome
    script = paths['genobox_home'] + 'genobox_bcf2ref_h.py'
    bcf2ref_calls = []
    for chrom in genome:
        outfile = outfile_for(chrom)
        options = ' --bcf %s --chr_id \"%s\" --chr %s --d %s --D %s --Q %f --ex %s --dbsnp %s --rmsk %s --indels %s --o %s' % (bcf, chrom[0], chrom[2], chrom[4], chrom[5], Q, ex, dbsnp, rmsk, indels, outfile)
        bcf2ref_calls.append(script + options)

    print("Submitting jobs")
    bcf2ref_moab = Moab(bcf2ref_calls,
                        logfile=logger,
                        runname='run_genobox_bcf2ref',
                        queue=queue,
                        cpu=resources,
                        partition=partition)

    # the explicit release() call is disabled; only the message is printed
    print("Releasing jobs")

    print("Waiting for jobs to finish ...")
    waiter = Semaphore(bcf2ref_moab.ids, home, 'bcf2ref', queue, 20, 2*86400)
    waiter.wait()
    print("--------------------------------------")
Ejemplo n.º 12
0
def start_assembly(args, logger):
    '''Submit the velvet assembly pipeline, release it and wait for it.

    Stages, each submitted through Moab:
    1. optional trimming (when args.trim is truthy)
    2. optional interleaving of args.shortPaired, args.shortPaired2 and
       args.longPaired (only for the libraries interleave() returns a call
       for); depends on all trimming jobs when trimming is on
    3. velveth, depending on all jobs of the latest stage that ran before
       it (interleave, else trimming, else nothing)
    4. velvetg, one-to-one on velveth
    5. when more than one k-mer size is used: velvetparse and velvetaccept
    6. velvetclean, which is the stage that is waited on

    Changes compared to the previous version of this function:
    - interleave() is called once per library instead of twice
    - interleave calls are built in a fixed order (shortPaired,
      shortPaired2, longPaired) instead of dict iteration order
    - the four copies of the velveth/velvetg submission are merged into
      one; the arguments passed to Moab in each case are unchanged

    Side effects on args: args.ksizes is replaced by set_kmersizes(args)
    when it equals ['auto'], and the three paired-library attributes are
    replaced by the second element returned by interleave().

    Nothing is returned.
    '''

    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore
    import os

    # queueing setup (the return value of setSystem() is not needed here)
    genobox_modules.setSystem()
    home = os.getcwd()
    cpuV = 'nodes=1:ppn=%i,mem=%s,walltime=172800' % (args.n, args.m)
    cpuA = 'nodes=1:ppn=1,mem=512mb,walltime=172800'
    cpuF = 'nodes=1:ppn=2,mem=2gb,walltime=172800'

    # set kmersizes (if auto)
    if args.ksizes == ['auto']:
        args.ksizes = set_kmersizes(args)

    # trimming calls
    if args.trim:
        illuminatrim_calls = illumina_trim(args, int(args.ksizes[0]), 15, 20, 15, False)

    # interleave each paired library once: element 0 is the call (falsy
    # when no interleaving is needed), element 1 replaces the library
    # attribute on args
    interleave_calls = []
    for library_attr in ('shortPaired', 'shortPaired2', 'longPaired'):
        interleaved = interleave(getattr(args, library_attr), args.sample)
        if interleaved[0]:
            interleave_calls.append(interleaved[0])
        setattr(args, library_attr, interleaved[1])

    # velvet calls
    velveth_calls = create_velveth_calls(args)
    velvetg_calls = create_velvetg_calls(args)

    # velvet parse calls
    velvetparse_calls = get_best_assembly(args)
    velvetaccept_calls = accept_assembly(args)
    velvetclean_calls = clean()

    # set environment variable:
    env_var = 'OMP_NUM_THREADS=%i' % int(args.n - 1)

    def after_all(upstream_moab):
        # dependency keywords for "wait for all jobs of the previous
        # stage"; empty when there is no previous stage
        if upstream_moab is None:
            return {}
        return dict(depend=True, depend_type='all', depend_val=[1],
                    depend_ids=upstream_moab.ids)

    # submit jobs, chaining each optional stage onto the one before it
    print("Submitting jobs")
    upstream_moab = None
    if args.trim:
        illuminatrim_moab = Moab(illuminatrim_calls, logfile=logger, runname='run_genobox_trim', queue=args.queue, cpu=cpuF)
        upstream_moab = illuminatrim_moab
    if len(interleave_calls) > 0:
        interleave_moab = Moab(interleave_calls, logfile=logger, runname='run_genobox_interleave', queue=args.queue, cpu=cpuF, **after_all(upstream_moab))
        upstream_moab = interleave_moab
    velveth_moab = Moab(velveth_calls, logfile=logger, runname='run_genobox_velveth', queue=args.queue, cpu=cpuV, env=env_var, **after_all(upstream_moab))
    velvetg_moab = Moab(velvetg_calls, logfile=logger, runname='run_genobox_velvetg', queue=args.queue, cpu=cpuV, depend=True, depend_type='one2one', depend_val=[1], depend_ids=velveth_moab.ids)

    # submit job for velvetparse if more than one ksize was chosen
    if len(args.ksizes) > 1:
        velvetparse_moab = Moab(velvetparse_calls, logfile=logger, runname='run_genobox_velvetparse', queue=args.queue, cpu=cpuA, depend=True, depend_type='conc', depend_val=[len(velvetg_calls)], depend_ids=velvetg_moab.ids)
        velvetaccept_moab = Moab(velvetaccept_calls, logfile=logger, runname='run_genobox_velvetaccept', queue=args.queue, cpu=cpuA, depend=True, depend_type='one2one', depend_val=[1], depend_ids=velvetparse_moab.ids)
        velvetclean_moab = Moab(velvetclean_calls, logfile=logger, runname='run_genobox_velvetclean', queue=args.queue, cpu=cpuA, depend=True, depend_type='one2one', depend_val=[1], depend_ids=velvetaccept_moab.ids)
    else:
        velvetclean_moab = Moab(velvetclean_calls, logfile=logger, runname='run_genobox_velvetclean', queue=args.queue, cpu=cpuA, depend=True, depend_type='one2one', depend_val=[1], depend_ids=velvetg_moab.ids)

    # release jobs in submission order
    print("Releasing jobs")
    if args.trim and len(illuminatrim_calls) > 0:
        illuminatrim_moab.release()
    if len(interleave_calls) > 0:
        interleave_moab.release()
    velveth_moab.release()
    velvetg_moab.release()
    if len(args.ksizes) > 1:
        velvetparse_moab.release()
        velvetaccept_moab.release()
    velvetclean_moab.release()

    # wait on the final clean-up stage
    print("Waiting for jobs to finish ...")
    s = Semaphore(velvetclean_moab.ids, home, 'velvet', args.queue, 20, 2*86400)
    s.wait()
    print("--------------------------------------")
Ejemplo n.º 13
0
def start_bcf2ref(bcf, genome_file, Q, ex, dbsnp, rmsk, indels, o, queue, dir,
                  partition, logger):
    '''Extract high confidence same-as-reference bases from a bcf file.

    Builds one genobox_bcf2ref_h.py command per entry returned by
    get_genome(genome_file), submits the commands through Moab and waits on
    the submitted job ids with a Semaphore.

    bcf, Q, ex, dbsnp, rmsk and indels are forwarded unchanged as --bcf, --Q,
    --ex, --dbsnp, --rmsk and --indels. According to the original notes the
    optional steps are: exchange ids, annotate using dbsnp, filter rmsk and
    filter ambiguous indel positions.

    Output file naming (o is split into <directory> and <file name>):
      one genome entry, no dir    -> o
      one genome entry, dir       -> <directory>/<dir>.<file name>
      several entries, no dir     -> <directory>/<entry[2]>.<file name>
      several entries, dir        -> <directory>/<dir>.<entry[2]>.<file name>
    dir counts as unset when it is empty/None or the string 'None'.

    Nothing is returned.
    '''

    import genobox_modules
    from genobox_classes import Moab
    from genobox_classes import Semaphore
    import os

    if not os.path.exists('genotyping'):
        os.makedirs('genotyping')

    # set queueing
    paths = genobox_modules.setSystem()
    home = os.getcwd()
    # resource request used for every bcf2ref job
    job_resources = 'nodes=1:ppn=1,mem=5gb,walltime=172800'

    # read genome file
    genome = get_genome(genome_file)

    out_dir, out_name = os.path.split(o)
    single_entry = len(genome) == 1
    tag_with_dir = bool(dir) and dir != 'None'

    # create commands
    bcf2ref_calls = []
    cmd = paths['genobox_home'] + 'genobox_bcf2ref_h.py'
    for entry in genome:
        # set outfile name
        if tag_with_dir and single_entry:
            tagged_name = '%s.%s' % (dir, out_name)
        elif tag_with_dir:
            tagged_name = '%s.%s.%s' % (dir, entry[2], out_name)
        elif not single_entry:
            tagged_name = '%s.%s' % (entry[2], out_name)
        else:
            tagged_name = None

        if tagged_name is None:
            outfile = o
        else:
            # os.path.join (rather than '%s/%s') keeps the result relative
            # when o has no directory component; plain string formatting
            # would turn 'out.bcf' into '/<tag>.out.bcf' in the root directory.
            outfile = os.path.join(out_dir, tagged_name)

        arg = ' --bcf %s --chr_id \"%s\" --chr %s --d %s --D %s --Q %f --ex %s --dbsnp %s --rmsk %s --indels %s --o %s' % (
            bcf, entry[0], entry[2], entry[4], entry[5], Q, ex, dbsnp, rmsk,
            indels, outfile)
        bcf2ref_calls.append(cmd + arg)

    # submit jobs
    print("Submitting jobs")
    bcf2ref_moab = Moab(bcf2ref_calls,
                        logfile=logger,
                        runname='run_genobox_bcf2ref',
                        queue=queue,
                        cpu=job_resources,
                        partition=partition)

    # The message is kept for log compatibility; no explicit release() call
    # is made here (it was already disabled in the previous revision).
    print("Releasing jobs")

    # semaphore
    # NOTE(review): 20 and 2 * 86400 are presumably the polling interval and
    # the maximum waiting time in seconds -- confirm against Semaphore.
    print("Waiting for jobs to finish ...")
    s = Semaphore(bcf2ref_moab.ids, home, 'bcf2ref', queue, 20, 2 * 86400)
    s.wait()
    print("--------------------------------------")
Ejemplo n.º 14
0
def start_bamstats(args, bam, partition, logger, wait=True):
    '''Submit bam statistics jobs and, unless wait is false, wait for them.

    With args.mapdamage set only the mapdamage calls are submitted.
    Otherwise the following jobs are submitted:
      - flagstat (samtools flagstat)
      - coverage (bedtools genomeCoverageBed)
      - coverage plot, submitted as dependent on the coverage jobs
      - average depth (python avgdepth)
    Saturation calls are built as well but not submitted.

    Nothing is returned.
    '''

    import subprocess
    import genobox_modules
    from genobox_classes import Moab, Semaphore
    import os

    if not os.path.exists('stats'):
        os.makedirs('stats')

    # set queueing
    paths = genobox_modules.setSystem()
    home = os.getcwd()
    cpu = {
        'A': 'nodes=1:ppn=1,mem=512mb,walltime=172800',
        'C': 'nodes=1:ppn=1,mem=2gb,walltime=172800',
        'E': 'nodes=1:ppn=1,mem=7gb,walltime=172800',
        'F': 'nodes=1:ppn=2,mem=2gb,walltime=172800',
        'B': 'nodes=1:ppn=16,mem=10gb,walltime=172800',
        'UV': 'procs=1,mem=%i,walltime=172800,flags=sharedmem',
    }
    run_mapdamage = args.mapdamage

    # create calls
    if run_mapdamage:
        # NOTE(review): 'mapdamapge' is spelled as in the original; its
        # definition is not visible from here -- verify the helper's name.
        mapdamage_calls = mapdamapge(bam, args.fa)
    else:
        flagstat_calls = sam_flagstat(bam)
        coverage_calls = bed_genomeCov(bam)
        plotcoverage_calls = plot_coverage(bam)
        avgdepth_calls = python_avgdepth(bam)
        saturation_calls = get_saturation(bam)

    # submit jobs
    print("Submitting jobs")
    shared = dict(logfile=logger, queue=args.queue, partition=partition)
    if run_mapdamage:
        mapdamage_moab = Moab(mapdamage_calls,
                              runname='run_genobox_mapdamage',
                              cpu=cpu['A'],
                              **shared)
    else:
        flagstat_moab = Moab(flagstat_calls,
                             runname='run_genobox_flagstat',
                             cpu=cpu['C'],
                             **shared)
        coverage_moab = Moab(coverage_calls,
                             runname='run_genobox_coverage',
                             cpu=cpu['C'],
                             **shared)
        # plotting is submitted one2one dependent on the coverage jobs
        plotcoverage_moab = Moab(plotcoverage_calls,
                                 runname='run_genobox_plotcoverage',
                                 cpu=cpu['A'],
                                 depend=True,
                                 depend_type='one2one',
                                 depend_val=[1],
                                 depend_ids=coverage_moab.ids,
                                 **shared)
        avgdepth_moab = Moab(avgdepth_calls,
                             runname='run_genobox_avgdepth',
                             cpu=cpu['E'],
                             **shared)
        # saturation_calls are intentionally not submitted (disabled upstream)

    # release jobs (message only; no release() call is made here)
    print("Releasing jobs")

    if not wait:
        print("Jobs running, continuing")
        print("--------------------------------------")
        return

    # wait for jobs to finish
    print("Waiting for jobs to finish ...")
    if run_mapdamage:
        semaphore_ids = mapdamage_moab.ids
    else:
        semaphore_ids = (flagstat_moab.ids + coverage_moab.ids +
                         plotcoverage_moab.ids + avgdepth_moab.ids)

    s = Semaphore(semaphore_ids, home, 'bam_stats', args.queue, 20, 86400)
    s.wait()
    print("--------------------------------------")
Ejemplo n.º 15
0
def start_trim(args, logger):
    '''Submit read trimming jobs from genobox.py and wait for them.

    Single end calls are submitted when args.se is set, paired end calls when
    both args.pe1 and args.pe2 are set. The resource strings depend on
    whether args.partition is 'uv'.

    Returns the tuple (se_files, pe1_files, pe2_files) as produced by
    single_trim(args) and paired_trim(args).
    '''

    import genobox_modules
    from genobox_classes import Moab, Semaphore
    import subprocess
    import os
    import sys

    # set queueing
    paths = genobox_modules.setSystem()
    home = os.getcwd()
    if args.partition == 'uv':
        cpu = {
            'A': 'procs=2,mem=512mb,walltime=172800,flags=sharedmem',
            'C': 'procs=1,mem=2gb,walltime=172800,flags=sharedmem',
            'E': 'procs=1,mem=5gb,walltime=172800,flags=sharedmem',
            'B': 'procs=16,mem=10gb,walltime=172800,flags=sharedmem',
            'F': 'procs=2,mem=%s,walltime=172800,flags=sharedmem' % args.m,
        }
    else:
        cpu = {
            'A': 'nodes=1:ppn=2,mem=512mb,walltime=172800',
            'C': 'nodes=1:ppn=1,mem=2gb,walltime=172800',
            'E': 'nodes=1:ppn=1,mem=5gb,walltime=172800',
            'B': 'nodes=1:ppn=16,mem=10gb,walltime=172800',
            'F': 'nodes=1:ppn=2,mem=%s,walltime=172800' % args.m,
        }

    # create path
    if not os.path.exists('trimmed'):
        os.makedirs('trimmed')

    # create calls
    single_calls, se_files = single_trim(args)
    paired_calls, pe1_files, pe2_files = paired_trim(args)

    # submit jobs; every submitted batch is remembered for the semaphore
    print("Submitting jobs")
    shared = dict(logfile=logger,
                  queue=args.queue,
                  cpu=cpu['A'],
                  partition=args.partition)
    submitted = []
    if args.se:
        submitted.append(
            Moab(single_calls, runname='run_genobox_trimse', **shared))
    if args.pe1 and args.pe2:
        submitted.append(
            Moab(paired_calls, runname='run_genobox_trimpe', **shared))

    # release jobs (message only; no release() call is made here)
    print("Releasing jobs")

    # wait for jobs to finish
    print("Waiting for jobs to finish ...")
    semaphore_ids = sum((batch.ids for batch in submitted), [])
    s = Semaphore(semaphore_ids, home, 'read_trimming', args.queue, 60, 86400)
    s.wait()
    print("--------------------------------------")
    sys.stderr.write('Done\n')

    # return trimmed files
    return (se_files, pe1_files, pe2_files)
Ejemplo n.º 16
0
def start_bamprocess(library_file, bams, mapq, libs, tmpdir, queue, final_bam,
                     realignment, known, fa, sample, partition, logger):
    '''Submit the bam processing job chain and wait for its last stage.

    Stages, each submitted as dependent on the previous one:
      filter + sort -> merge per library -> rmdup -> merge to final_bam
      -> realignment (only when realignment is true)

    library_file may be a Library instance (used as is), a path (read into a
    new Library) or empty/'None', in which case a library is initialised from
    sample, mapq, libs and bams.

    Returns final_bam.
    '''

    import subprocess
    import genobox_modules
    from genobox_classes import Moab, Semaphore, Library
    import os

    # set queueing
    paths = genobox_modules.setSystem()
    home = os.getcwd()
    cpu = {
        'A': 'nodes=1:ppn=1,mem=512mb,walltime=345600',
        'C': 'nodes=1:ppn=1,mem=2gb,walltime=345600',
        'E': 'nodes=1:ppn=1,mem=5gb,walltime=345600',
        'F': 'nodes=1:ppn=2,mem=2gb,walltime=345600',
        'B': 'nodes=1:ppn=16,mem=10gb,walltime=345600',
        'G': 'nodes=1:ppn=1,mem=6gb,walltime=345600',
        'H': 'nodes=1:ppn=2,mem=7gb,walltime=345600',
    }

    # create library instance
    has_library = library_file and library_file != 'None'
    if not has_library:
        library = genobox_modules.initialize_library(libfile=library_file,
                                                     sample=sample,
                                                     mapq=mapq,
                                                     libs=libs,
                                                     bams=bams)
    elif isinstance(library_file, Library):
        library = library_file
    else:
        library = Library(library_file)
        library.read()

    bam2lib, lib2bam = library.getBamLibs()

    ## CREATE CALLS ##

    # filter bam and sort
    filter_sort_calls, filter_sort_files = bam_filter_sort(
        lib2bam, bam2lib, 1500000000)

    # merge to libs
    merge_lib_calls, librarys = merge_bam(lib2bam.keys(),
                                          lib2bam.values(),
                                          add_suffix=True,
                                          final_suffix='.flt.sort.bam',
                                          tmpdir=tmpdir)

    # rmdup on libs
    rmdup_calls, rmdup_files = rmdup(librarys, tmpdir)

    # merge to final file (needed with and without realignment; the returned
    # file name is not used)
    merge_final_call, _ = merge_bam([final_bam], [rmdup_files],
                                    add_suffix=False)

    # optional: realignment
    if realignment:
        realign_calls, _ = realign_bam(final_bam, final_bam, fa, known)

    ## SUBMIT JOBS ##

    print("Submitting jobs")
    shared = dict(logfile=logger, queue=queue, partition=partition)
    filtersort_moab = Moab(filter_sort_calls,
                           runname='run_genobox_filtersort',
                           cpu=cpu['H'],
                           **shared)
    # one dependency count per library: the number of bam files it holds
    mergelib_moab = Moab(merge_lib_calls,
                         runname='run_genobox_lib_merge',
                         cpu=cpu['E'],
                         depend=True,
                         depend_type='complex',
                         depend_val=[len(members)
                                     for members in lib2bam.values()],
                         depend_ids=filtersort_moab.ids,
                         **shared)
    # NB: If memory should be changed, also change java memory spec in rmdup function
    rmdup_moab = Moab(rmdup_calls,
                      runname='run_genobox_rmdup',
                      cpu=cpu['G'],
                      depend=True,
                      depend_type='one2one',
                      depend_val=[1],
                      depend_ids=mergelib_moab.ids,
                      **shared)
    mergefinal_moab = Moab(merge_final_call,
                           runname='run_genobox_final_merge',
                           cpu=cpu['C'],
                           depend=True,
                           depend_type='conc',
                           depend_val=[len(rmdup_moab.ids)],
                           depend_ids=rmdup_moab.ids,
                           **shared)
    last_stage = mergefinal_moab
    if realignment:
        # realignment calls needs to be written together in a shell-file or
        # dependent on each other
        realign_moab = Moab(realign_calls,
                            runname='run_genobox_realignment',
                            cpu=cpu['E'],
                            depend=True,
                            depend_type='one2one',
                            depend_val=[1],
                            depend_ids=mergefinal_moab.ids,
                            **shared)
        last_stage = realign_moab

    # release jobs (message only; no release() call is made here)
    print("Releasing jobs")

    # semaphore: only the last stage of the chain is waited for
    print("Waiting for jobs to finish ...")
    s = Semaphore(last_stage.ids, home, 'bam_processing', queue, 20, 345600)
    s.wait()
    print("--------------------------------------")

    # return final bamfile
    return final_bam
Ejemplo n.º 17
0
def start_alignment(args, logger):
    '''Start alignment of fastq files using BWA and wait for the jobs.

    Single end files (args.se) are grouped by the platform reported by the
    library: ILLUMINA and HELICOS go to bwa_se_align, PACBIO to bwasw_pacbio
    and IONTORRENT to bwasw_iontorrent; other platforms are skipped. Paired
    end files (args.pe1 / args.pe2) go to bwa_pe_align.

    Returns (bamfiles, library), where bamfiles is the dict collected from
    the alignment helpers and library has been updated with it.

    Raises ValueError when args.pe1 and args.pe2 differ in length.
    '''

    import genobox_modules
    from genobox_classes import Semaphore, Library
    import subprocess
    import os
    import random
    import string

    paths = genobox_modules.setSystem()
    home = os.getcwd()
    semaphore_ids = []
    bamfiles = dict()

    if not os.path.exists('alignment'):
        os.makedirs('alignment')

    # initialize library file from given arguments (if args.mapq is defined
    # then its called from abgv, else it is called from alignment)
    mapq = args.mapq if hasattr(args, 'mapq') else [30]
    library = genobox_modules.initialize_library(args.libfile, args.se,
                                                 args.pe1, args.pe2,
                                                 args.sample, mapq, args.libs,
                                                 args.pl)

    # check for fa
    check_fa(args.fa, args.bwa6)

    # start single end alignments
    if args.se:
        # get platform info
        (PL, PL2data) = library.getPL('Data')

        print("Submitting single end alignments")
        known_platforms = ('ILLUMINA', 'HELICOS', 'PACBIO', 'IONTORRENT')
        for platform_name, platform_files in PL2data.items():
            if platform_name not in known_platforms:
                continue

            # filter to only contain single end files
            toalign = [fq for fq in platform_files if fq in args.se]
            fqtypes_se = [check_formats_fq(fq, args.gz, args.bwa6)
                          for fq in toalign]

            # submit with the aligner matching the platform
            if platform_name == 'PACBIO':
                submitted = bwasw_pacbio(toalign, args.fa, fqtypes_se,
                                         'alignment/', args.bwa6, library,
                                         args.n, args.queue, args.partition,
                                         logger)
            elif platform_name == 'IONTORRENT':
                submitted = bwasw_iontorrent(toalign, args.fa, fqtypes_se,
                                             'alignment/', args.bwa6, library,
                                             args.n, args.queue,
                                             args.partition, logger)
            else:
                submitted = bwa_se_align(toalign, args.fa, fqtypes_se,
                                         args.qtrim, args.N, 'alignment/',
                                         args.bwa6, library, args.n,
                                         args.queue, args.add_aln,
                                         args.partition, logger)
            (se_align_ids, bamfiles_se) = submitted
            semaphore_ids.extend(se_align_ids)
            bamfiles.update(bamfiles_se)

    # start paired end alignments
    if args.pe1:
        if len(args.pe1) != len(args.pe2):
            raise ValueError(
                'Same number of files must be given to --pe1 and --pe2')

        # set fqtypes
        fqtypes_pe1 = [check_formats_fq(fq, args.gz, args.bwa6)
                       for fq in args.pe1]
        fqtypes_pe2 = [check_formats_fq(fq, args.gz, args.bwa6)
                       for fq in args.pe2]

        print("Submitting paired end alignments")
        (pe_align_ids, bamfiles_pe) = bwa_pe_align(
            args.pe1, args.pe2, args.fa, fqtypes_pe1, fqtypes_pe2, args.qtrim,
            args.N, 'alignment/', args.bwa6, args.a, library, args.n,
            args.queue, args.add_aln, args.partition, logger)
        semaphore_ids.extend(pe_align_ids)
        bamfiles.update(bamfiles_pe)

    # update library
    library.update_with_tag('Data', 'BAM', bamfiles, True)

    # wait for jobs to finish
    print("Waiting for jobs to finish ...")

    s = Semaphore(semaphore_ids, home, 'bwa_alignment', args.queue, 60, 172800)
    s.wait()

    print("--------------------------------------")

    # return bamfiles
    return (bamfiles, library)
Ejemplo n.º 18
0
def start_genotyping(bam, chr, fa, prior, pp, queue, o, sample, partition,
                     logger):
    '''Starts genotyping using samtools of input bam file.

    Submits a chain of dependent jobs:
      bam index -> mpileup -> bcf combine -> bcf index
    and waits for the bcf index jobs. Consensus calls are built but not
    submitted. The intermediate bcf files returned by mpileup are removed
    afterwards.

    Returns o, the output bcf.
    '''

    import subprocess
    import genobox_modules
    from genobox_classes import Moab, Semaphore
    import os

    if not os.path.exists('genotyping'):
        os.makedirs('genotyping')

    # set queueing
    paths = genobox_modules.setSystem()
    home = os.getcwd()
    cpu = {
        'A': 'nodes=1:ppn=1,mem=512mb,walltime=172800',
        'C': 'nodes=1:ppn=1,mem=2gb,walltime=172800',
        'E': 'nodes=1:ppn=1,mem=5gb,walltime=172800',
        'F': 'nodes=1:ppn=2,mem=2gb,walltime=172800',
        'B': 'nodes=1:ppn=16,mem=10gb,walltime=172800',
    }

    # create calls
    bamindex_calls = bam_index(bam)
    mpileup_calls, bcffiles = mpileup(bam, chr, fa, prior, pp)
    bcfcombine_calls = bcf_combine(bcffiles, o)
    bcfindex_calls = bcf_index(o)
    consensus_calls = consensus(o, sample)

    # submit jobs
    print("Submitting jobs")
    shared = dict(logfile=logger, queue=queue, partition=partition)
    bamindex_moab = Moab(bamindex_calls,
                         runname='run_genobox_bamindex',
                         cpu=cpu['C'],
                         **shared)
    mpileup_moab = Moab(mpileup_calls,
                        runname='run_genobox_mpileup',
                        cpu=cpu['F'],
                        depend=True,
                        depend_type='expand',
                        depend_val=[len(mpileup_calls)],
                        depend_ids=bamindex_moab.ids,
                        **shared)
    bcfcombine_moab = Moab(bcfcombine_calls,
                           runname='run_genobox_bcfcombine',
                           cpu=cpu['C'],
                           depend=True,
                           depend_type='conc',
                           depend_val=[len(mpileup_calls)],
                           depend_ids=mpileup_moab.ids,
                           **shared)
    bcfindex_moab = Moab(bcfindex_calls,
                         runname='run_genobox_bcfindex',
                         cpu=cpu['C'],
                         depend=True,
                         depend_type='one2one',
                         depend_val=[1],
                         depend_ids=bcfcombine_moab.ids,
                         **shared)
    # consensus_calls are intentionally not submitted (disabled upstream)

    # release jobs (message only; no release() call is made here)
    print("Releasing jobs")

    # semaphore (consensus is currently not waited for)
    print("Waiting for jobs to finish ...")
    s = Semaphore(bcfindex_moab.ids, home, 'genotyping', queue, 20, 2 * 86400)
    s.wait()
    print("--------------------------------------")

    # remove temporary files
    genobox_modules.rm_files(bcffiles)

    # return output bcf
    return o