Example #1
0
    def execute(self):
        """Run Picard CollectAlignmentSummaryMetrics on every input reads file.

        Picard ``KEY=value`` options are built from ``self.params`` and
        appended to the base command.  One Picard process is started per
        entry of ``self.inputs.reads``; its report is written next to the
        input as ``<input path without extension>.summary_metrics.txt`` and
        added to ``self.outputs.out`` with metadata derived from that input.
        """
        params = self.params
        options = []

        if params.Maximuminsertsize:
            # integer: paired end reads above this insert size will be
            # considered chimeric along with inter-chromosomal pairs.
            # [Default: 100000]
            options.append('MAX_INSERT_SIZE=' + str(params.Maximuminsertsize))
        if params.Adaptersequence:
            # string: may be specified 0 or more times, separated by ',' or
            # ' '.  Empty tokens (e.g. from "a, b") are skipped so Picard
            # never receives a bare 'ADAPTER_SEQUENCE='.
            for adapter in re.split(',| ', params.Adaptersequence):
                if adapter:
                    options.append('ADAPTER_SEQUENCE=' + adapter)
        if params.Metricaccumulationlevel:
            # enum: level(s) at which to accumulate metrics. Possible values:
            # {ALL_READS, SAMPLE, LIBRARY, READ_GROUP}; may be specified 0 or
            # more times, separated by ',' or ' '.
            for level in re.split(',| ', params.Metricaccumulationlevel):
                if level:
                    options.append('METRIC_ACCUMULATION_LEVEL=' + level)

        if params.Validationstringency:
            # enum: validation stringency for all BAM/SAM files read by this
            # program. SILENT can improve performance when variable-length
            # data (read, qualities, tags) do not need to be decoded.
            options.append('VALIDATION_STRINGENCY=' +
                           params.Validationstringency)

        if params.Isbisulfitesequenced:
            # boolean: whether the SAM or BAM file consists of bisulfite
            # sequenced reads. [Default: false]
            options.append('IS_BISULFITE_SEQUENCED=true')
        if params.Assumesorted == False:  # noqa: E712 - unset must not match
            # boolean: if true (default), the sort order in the header file
            # is ignored. Only an explicit False overrides the default.
            options.append('ASSUME_SORTED=false')

        if params.Compressionlevel:
            # integer: compression level for all compressed files created
            # (e.g. BAM and GELI)
            options.append('COMPRESSION_LEVEL=' + str(params.Compressionlevel))
        if params.CreateIndex:
            # boolean: whether to create a BAM index when writing a
            # coordinate-sorted BAM file
            options.append('CREATE_INDEX=true')

        # Equivalent shell command:
        #   java -Xmx2g -jar /opt/bin/picard.jar CollectAlignmentSummaryMetrics \
        #       [options] INPUT=<reads> OUTPUT=<reads stem>.summary_metrics.txt
        base_cmd = [
            'java', '-Xmx2g', '-jar', '/opt/bin/picard.jar',
            'CollectAlignmentSummaryMetrics'
        ]
        # The collected options have to be part of the command line,
        # otherwise every parameter above is silently ignored.
        base_cmd.extend(options)

        for reads_file in self.inputs.reads:
            metrics_path = (os.path.splitext(reads_file)[0] +
                            '.summary_metrics.txt')
            cmd = base_cmd + [
                'INPUT=' + reads_file,
                'OUTPUT=' + metrics_path,
            ]
            Process(*cmd).run()

            self.outputs.out.add_file(metrics_path)
            # The file just added is the last one in the output list.
            self.outputs.out[-1].meta = reads_file.make_metadata()
    def execute(self):
        """Merge all input reads files into one file with Picard MergeSamFiles.

        Options are derived from ``self.params``; every entry of
        ``self.inputs.reads`` becomes an ``INPUT=`` argument.  The merged file
        is written as ``<first input stem>sorted<first input extension>`` and
        assigned to ``self.outputs.out``.  When index creation was requested
        and ``<merged file>.bai`` exists afterwards, it is assigned to
        ``self.outputs.ind``.  Both outputs take their metadata from the
        first input.
        """
        params = self.params
        picard_args = []

        # boolean [ASSUME_SORTED]: if true, assume that the input files are in
        # the same sort order as the requested output sort order, even if
        # their headers say otherwise. [Default: false]
        if params.Assumesorted:
            picard_args.append('ASSUME_SORTED=true')
        # enum [SORT_ORDER]: desired sort order. [default: coordinate]
        if params.Sortorder:
            picard_args.append('SORT_ORDER=' + params.Sortorder)
        # boolean [MERGE_SEQUENCE_DICTIONARIES]: merge the sequence
        # dictionaries
        if params.MergeSequenceDictionary:
            picard_args.append('MERGE_SEQUENCE_DICTIONARIES=true')
        # boolean: whether to create a BAM index when writing a
        # coordinate-sorted BAM file
        if params.CreateIndex:
            picard_args.append('CREATE_INDEX=true')
        # enum: validation stringency for all BAM/SAM files read by this
        # program. SILENT can improve performance when variable-length data
        # (read, qualities, tags) do not otherwise need to be decoded.
        if params.Validationstringency:
            picard_args.append('VALIDATION_STRINGENCY=' +
                               params.Validationstringency)
        # integer: compression level for all compressed files created
        # (e.g. BAM and GELI)
        if params.Compressionlevel:
            picard_args.append('COMPRESSION_LEVEL=' +
                               str(params.Compressionlevel))

        # Equivalent shell command:
        #   java -Xmx56g -jar /opt/bin/picard.jar MergeSamFiles \
        #       USE_THREADING=true [options] INPUT=<reads>... OUTPUT=<merged>
        command = [
            'java', '-Xmx56g', '-jar', '/opt/bin/picard.jar',
            'MergeSamFiles', 'USE_THREADING=true',
        ]
        command += picard_args
        command += ['INPUT=' + reads_file for reads_file in self.inputs.reads]

        # The merged file is named after the first input.
        stem, extension = os.path.splitext(self.inputs.reads[0])
        merged_path = stem + 'sorted' + extension
        command.append('OUTPUT=' + merged_path)
        Process(*command).run()

        self.outputs.out = merged_path
        self.outputs.out.meta = self.inputs.reads[0].make_metadata()

        # Only publish the index when it was requested and is really there.
        index_path = merged_path + '.bai'
        if params.CreateIndex and os.path.exists(index_path):
            self.outputs.ind = index_path
            self.outputs.ind.meta = self.inputs.reads[0].make_metadata()
    def execute(self):
        """Run Picard CollectWgsMetrics on the single input reads file.

        Picard ``KEY=value`` options are built from ``self.params`` and
        appended to the base command.  The report is written next to the
        input as ``<input path without extension>.wgs_metrics.txt`` and
        assigned to ``self.outputs.out`` with metadata derived from the input.
        """
        params = self.params
        options = []

        # NOTE(review): the checks below are truthiness tests, so a value of
        # 0 is treated like "not set" and Picard's default applies - confirm
        # whether 0 should be passed through for the quality thresholds.
        if params.Minimunmappingquality:
            # integer [MINIMUM_MAPPING_QUALITY]: minimum mapping quality for
            # a read to contribute coverage. Default value: 20.
            options.append('MINIMUM_MAPPING_QUALITY=' +
                           str(params.Minimunmappingquality))
        if params.Minimumbasequality:
            # integer [MINIMUM_BASE_QUALITY]: minimum base quality for a base
            # to contribute coverage. Default value: 20.
            options.append('MINIMUM_BASE_QUALITY=' +
                           str(params.Minimumbasequality))
        if params.Coveragecap:
            # integer [COVERAGE_CAP]: treat bases with coverage exceeding
            # this value as if they had coverage at this value.
            # Default value: 250.
            options.append('COVERAGE_CAP=' + str(params.Coveragecap))
        if params.Stopafter:
            # float [STOP_AFTER]: for debugging purposes, stop after
            # processing this many genomic bases. Default value: -1.
            options.append('STOP_AFTER=' + str(params.Stopafter))
        if params.Validationstringency:
            # enum: validation stringency for all BAM/SAM files read by this
            # program. SILENT can improve performance when variable-length
            # data (read, qualities, tags) do not need to be decoded.
            options.append('VALIDATION_STRINGENCY=' +
                           params.Validationstringency)

        # Equivalent shell command:
        #   java -Xmx7g -jar /opt/bin/picard.jar CollectWgsMetrics \
        #       REFERENCE_SEQUENCE=... [options] INPUT=<reads> OUTPUT=<report>
        cmd = [
            'java', '-Xmx7g', '-jar', '/opt/bin/picard.jar',
            'CollectWgsMetrics',
            'REFERENCE_SEQUENCE=/opt/db/human_g1k_v37_decoy.fasta'
        ]
        # The collected options have to be part of the command line,
        # otherwise every parameter above is silently ignored.
        cmd.extend(options)

        metrics_path = (os.path.splitext(self.inputs.reads)[0] +
                        '.wgs_metrics.txt')
        cmd.extend([
            'INPUT=' + self.inputs.reads,
            'OUTPUT=' + metrics_path,
        ])
        Process(*cmd).run()

        self.outputs.out = metrics_path
        self.outputs.out.meta = self.inputs.reads.make_metadata()
    def execute(self):
        """Run GATK SelectVariants on ``self.inputs.inp``.

        Command-line options are built from ``self.params`` (GATK engine
        arguments first, then SelectVariants arguments) and from the optional
        inputs ``Gatk_key``, ``exclude_intervals``, ``exome_bed``,
        ``concordance``, ``discordance`` and ``keepIDs``.  The result is
        written to ``<input path without extension>.snp_out.vcf`` and assigned
        to ``self.outputs.out`` with metadata derived from the input.
        """
        params = self.params
        options = []

        def add_flag(enabled, flag):
            # Boolean parameter: emit the bare switch when it is set.
            if enabled:
                options.append(flag)

        def add_text(value, flag):
            # String/enum parameter: emit the flag followed by the value.
            if value:
                options.extend([flag, value])

        def add_number(value, flag):
            # Numeric parameter: emit the flag followed by the value as text,
            # so the argument list only ever contains strings.
            if value:
                options.extend([flag, str(value)])

        # ---- GATK engine arguments ------------------------------------
        # NOTE(review): DisableRandomization and NonDeterministicRandomSeed
        # both map to '-ndrs' (kept from the original); they read like
        # opposite settings - confirm the intended flag for the former.
        # boolean: eliminate randomization from nondeterministic methods
        add_flag(params.DisableRandomization, '-ndrs')
        # boolean: do not fail on base qualities that are too high
        add_flag(params.AllowPotentiallyMisencodedQuals,
                 '-allowPotentiallyMisencodedQuals')
        # enum: type of BAQ calculation to apply in the engine
        add_text(params.BAQCalculationType, '-baq')
        # float: BAQ gap open penalty (Phred scaled), default 40
        add_number(params.BAQGapOpenPenalty, '-baqGOP')
        # integer: base quality used for reads missing quality scores
        add_number(params.DefaultBaseQualities, '-DBQ')
        # boolean [-DIQ]: disable printing of base insertion and base
        # deletion tags (with -BQSR)
        add_flag(params.DisableIndelQuals, '-DIQ')
        # integer: coverage to downsample to at any given locus
        add_number(params.DownsampletoCoverage, '-dcov')
        # float: fraction [0.0-1.0] of reads to downsample to
        add_number(params.DownsampletoFraction, '-dfrac')
        # enum: type of reads downsampling to employ at a given locus;
        # the literal 'null' means "not set"
        if params.DownsamplingType and params.DownsamplingType != 'null':
            options.extend(['-dt', params.DownsamplingType])
        # boolean: print the OQ tag with the original base qualities
        add_flag(params.EmitOriginalQuals, '-EOQ')
        # boolean: fix mis-encoded base quality scores
        add_flag(params.FixMisencodedQuals, '-fixMisencodedQuals')
        # enum: interval merging rule to use for abutting intervals
        add_text(params.IntervalMerging, '-im')
        # integer: basepairs of padding around each -L interval
        add_number(params.IntervalPadding, '-ip')
        # enum: set merging approach for combining -L / -XL inputs
        add_text(params.IntervalSetRule, '-isr')
        # boolean: keep program records from the SAM header
        add_flag(params.KeepProgramRecords, '-kpr')
        # integer: stop cleanly once this runtime has been exceeded
        # (minutes unless changed by maxRuntimeUnits)
        add_number(params.MaxRuntime, '-maxRuntime')
        # enum: the TimeUnit for maxRuntime
        add_text(params.MaxRuntimeUnits, '-maxRuntimeUnits')
        # boolean: random numbers generated differ in every run
        add_flag(params.NonDeterministicRandomSeed, '-ndrs')
        # string: pedigree string for samples
        add_text(params.PedigreeString, '-pedString')
        # enum: how strict to be in validating the pedigree information
        add_text(params.PedigreeValidationType, '-pedValidationType')
        # enum: kind of GATK run report to generate (e.g. STANDARD, NO_ET)
        add_text(params.PhoneHome, '-et')
        # integer: bases with quality below this threshold are not
        # recalibrated (with -BQSR)
        add_number(params.PreserveQscoresLessThan, '-preserveQ')
        # string: filtration criteria to apply to each read individually
        add_text(params.ReadFilter, '-rf')
        # string: read groups to filter out, or a .txt file listing them
        add_text(params.ReadGroupBlackList, '-rgbl')
        # boolean: remove program records from the SAM header
        add_flag(params.RemoveProgramRecords, '-rpr')
        # string: arbitrary tag identifying this run as part of a group
        add_text(params.Tag, '-tag')
        # enum: enables unsafe operations; the literal 'null' means "not set"
        if params.Unsafe and params.Unsafe != 'null':
            options.extend(['-U', params.Unsafe])
        # boolean: use the legacy downsampling implementation.  This is a
        # switch, so only the flag is emitted (no trailing value).
        add_flag(params.UseLegacyDownsampler, '-use_legacy_downsampler')
        # boolean: use original base qualities from the OQ tag when present
        add_flag(params.UseOriginalQualities, '-OQ')
        # enum: how strict to be with validation
        add_text(params.ValidationStrictness, '-S')

        # ---- SelectVariants arguments ---------------------------------
        # boolean: allow command-line samples that are not in the VCF
        add_flag(params.AllowNonoverlappingCommandLineSamples,
                 '--ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES')
        # string: exclude genotypes from this sample
        add_text(params.ExcludeSampleName, '--exclude_sample_name')
        # boolean: don't include filtered loci in the analysis
        add_flag(params.ExcludeFiltered, '-ef')
        # boolean: don't include loci found to be non-variant after the
        # subsetting procedure
        add_flag(params.ExcludeNonVariants, '-env')
        # boolean: store the original AC, AF and AN values in INFO
        add_flag(params.KeepOriginalAc, '--keepOriginalAC')
        # integer: indel size select
        add_number(params.MaxIndelSize, '--maxIndelSize')
        # boolean: output mendelian violation sites only
        add_flag(params.MendelianViolation, '-mv')
        # float: minimum genotype QUAL score for each trio member required
        # to accept a site as a violation
        add_number(params.Mvq, '-mvq')
        # boolean: re-genotype the selected samples based on their GLs/PLs
        add_flag(params.Regenotype, '-regenotype')
        # float: fraction (0..1) of genotypes randomly set to nocall
        add_number(params.RemoveFractionGenotypes, '-fractionGenotypes')
        # enum: select only variants of a particular allelicity
        # (ALL (default), MULTIALLELIC or BIALLELIC)
        add_text(params.RestrictAllelesTo, '--restrictAllelesTo')
        # string: regular expression to select many samples
        add_text(params.SampleExpressions, '-se')
        # string: include genotypes from this sample
        add_text(params.SampleName, '-sn')
        # string: one or more criteria to use when selecting the data
        add_text(params.SelectExpressions, '-select')
        # float: fraction (0..1) of the total variants selected at random
        add_number(params.SelectRandomFraction, '-fraction')
        # enum: variant types to select (INDEL, SNP, MIXED, MNP, SYMBOLIC,
        # NO_VARIATION); several may be given separated by ',' or ' '.
        # Empty tokens (e.g. from "SNP, INDEL") are skipped.
        if params.SelectTypeToInclude:
            for variant_type in re.split(',| ', params.SelectTypeToInclude):
                if variant_type:
                    options.extend(['-selectType', str(variant_type)])

        # ---- optional file inputs -------------------------------------
        if self.inputs.Gatk_key:
            options.extend(['-K', self.inputs.Gatk_key])
        if self.inputs.exclude_intervals:
            for interval_file in self.inputs.exclude_intervals:
                options.extend(['-XL', interval_file])
        if self.inputs.exome_bed:
            for interval_file in self.inputs.exome_bed:
                options.extend(['-L', interval_file])
        if self.inputs.concordance:
            options.extend(['--concordance', self.inputs.concordance])
        if self.inputs.discordance:
            options.extend(['--discordance', self.inputs.discordance])
        if self.inputs.keepIDs:
            options.extend(['--keepIDs', self.inputs.keepIDs])

        out_file_name = os.path.splitext(self.inputs.inp)[0] + ".snp_out.vcf"

        run_cmd = [
            'java', '-Xmx2g', '-Djava.io.tmpdir=/tmp', '-jar',
            '/opt/bin/GenomeAnalysisTK.jar', '-T', 'SelectVariants'
        ]
        run_cmd.extend(options)
        run_cmd.extend([
            '-R', '/opt/db/human_g1k_v37_decoy.fasta', '--variant',
            self.inputs.inp, '-o', out_file_name
        ])
        Process(*run_cmd).run()

        self.outputs.out = out_file_name
        self.outputs.out.meta = self.inputs.inp.make_metadata()
Example #5
0
    def execute(self):
        options = []
        if (self.params.Minimumseedlength):
            #integer#[-k]Matches shorter than INT will be missed. The alignment speed is usually insensitive to this value unless it significantly deviates 20. [default: 19]
            options.extend(['-k', self.params.Minimumseedlength])
        if (self.params.Bandwidthforbandedalignment):
            #integer#[-w]Band width in the banded alignment [default: 100]
            options.extend(['-w', self.params.Bandwidthforbandedalignment])
        if (self.params.OffdiagonalXdropoff):
            #integer#[-d]Stop extension when the difference between the best and the current extension score is above |i-j|*A+INT, where i and j are the current positions of the query and reference, respectively, and A is the matching score. Z-dropoff not only avoids unnecessary extension, but also reduces poor alignments inside a long good alignment. [default: 100]
            options.extend(['-d', self.params.OffdiagonalXdropoff])
        if (self.params.TriggerreseedingforaMEMlongerthanminSeedLenFLOAT):
            #float#[-r]This is a key heuristic parameter for tuning the performance. Larger value yields fewer seeds, which leads to faster alignment speed but lower accuracy. [default: 1.5]
            options.extend([
                '-r',
                str(self.params.
                    TriggerreseedingforaMEMlongerthanminSeedLenFLOAT)
            ])
        if (self.params.SkipseedswithmorethanINToccurrences):
            #integer#[-c]Discard a MEM if it has more than INT occurence in the genome. This is an insensitive parameter. [default: 500]
            options.extend(
                ['-c', self.params.SkipseedswithmorethanINToccurrences])
        if (self.params.Dropchainfraction):
            #float#[-D]Drop chains shorter than FLOAT fraction of the longest overlapping chain.
            options.extend(['-D', str(self.params.Dropchainfraction)])
        if (self.params.Dropchainlength):
            #integer#[-W]Discard a chain if seeded bases shorter than INT.
            options.extend(['-W', self.params.Dropchainlength])
        if (self.params.Materescuerounds):
            #integer#[-m] Perform at most INT rounds of mate rescues for each read.
            options.extend(['-m', self.params.Materescuerounds])
        if (self.params.Skipmaterescue):
            #boolean#[-S] Skip mate rescue
            options.append('-S')
        if (self.params.SkippairingmaterescueperformedunlessSalsoinuse):
            #boolean#[-P] In the paired-end mode, perform SW to rescue missing hits only but do not try to find hits that fit a proper pair.
            options.append('-P')
        if (self.params.Discardexactmatches):
            #boolean#[-e] Discard full-length exact matches
            options.append('-e')
        if (self.params.Readtype and self.params.Readtype != "None"):
            #enum#[-x] Read type. Setting -x changes multiple parameters unless overridden pacbio: -k17 -W40 -c1000 -r10 -A2 -B5 -O2 -E1 -L0; pbread: -k13 -W40 -c1000 -r10 -A2 -B5 -O2 -E1 -N25 -FeaD.001
            options.extend(['-x', self.params.Readtype])
        if (self.params.Scoreforasequencematch):
            #integer#[-A] Score for a sequence match. [default: 1]
            options.extend(['-A', self.params.Scoreforasequencematch])
        if (self.params.Penaltyforamismatch):
            #integer#[-B] Penalty for a mismatch. [default: 4]
            options.extend(['-B', self.params.Penaltyforamismatch])

        if (self.params.Gapopenpenaltyfordeletions
                and self.params.Gapopenpenaltyforinsertions):
            #integer#[-O] Gap open penalty for deletions [default: 6]
            options.extend([
                '-O',
                str(self.params.Gapopenpenaltyfordeletions) + "," +
                str(self.params.Gapopenpenaltyforinsertions)
            ])
        else:
            if (self.params.Gapopenpenaltyfordeletions):
                #integer#[-O] Gap open penalty for deletions [default: 6]
                options.extend(['-O', self.params.Gapopenpenaltyfordeletions])
            if (self.params.Gapopenpenaltyforinsertions):
                #integer#[-O] Gap open penalty for insertions [default: 6]
                options.extend(['-O', self.params.Gapopenpenaltyforinsertions])

        if (self.params.Gapextensionpenaltyfordeletion
                and self.params.Gapextensionpenaltyforinsertion):
            #integer#[-O] Gap open penalty for deletions [default: 6]
            options.extend([
                '-E',
                str(self.params.Gapextensionpenaltyfordeletion) + "," +
                str(self.params.Gapextensionpenaltyforinsertion)
            ])
        else:
            if (self.params.Gapextensionpenaltyfordeletion):
                #integer#[-E] Gap extension penalty for deletion. A gap of length k costs O + k*E (i.e. -O is for opening a zero-length gap). [default: 1]
                options.extend(
                    ['-E', self.params.Gapextensionpenaltyfordeletion])
            if (self.params.Gapextensionpenaltyforinsertion):
                #integer#[-E] Gap extension penalty for insertion. A gap of length k costs O + k*E (i.e. -O is for opening a zero-length gap). [default: 1]
                options.extend(
                    ['-E', self.params.Gapextensionpenaltyforinsertion])

        if (self.params.Penaltyfor5endclipping
                and self.params.Penaltyfor3endclipping):
            #integer#[-O] Gap open penalty for deletions [default: 6]
            options.extend([
                '-L',
                str(self.params.Penaltyfor5endclipping) + "," +
                str(self.params.Penaltyfor3endclipping)
            ])
        else:
            if (self.params.Penaltyfor5endclipping):
                #integer#[-L] When performing SW extension, BWA-MEM keeps track of the best score reaching the end of query. If this score is larger than the best SW score minus the clipping penalty, clipping will not be applied. Note that in this case, the SAM AS tag reports the best SW score; clipping penalty is not deducted. [default: 5]
                options.extend(['-L', self.params.Penaltyfor5endclipping])
            if (self.params.Penaltyfor3endclipping):
                #integer#[-L] When performing SW extension, BWA-MEM keeps track of the best score reaching the end of query. If this score is larger than the best SW score minus the clipping penalty, clipping will not be applied. Note that in this case, the SAM AS tag reports the best SW score; clipping penalty is not deducted. [default: 5]
                options.extend(['-L', self.params.Penaltyfor3endclipping])

        if (self.params.Penaltyforanunpairedreadpair):
            #integer#[-U] BWA-MEM scores an unpaired read pair as scoreRead1+scoreRead2-INT and scores a paired as scoreRead1+scoreRead2-insertPenalty. It compares these two scores to determine whether we should force pairing. [default: 17]
            options.extend(['-U', self.params.Penaltyforanunpairedreadpair])
#        if(self.params.Firstqueryfileconsistsofinterleavedpairedendsequences):
#boolean#Assume the first input query file is interleaved paired-end FASTA/Q. See the command description for details.
        if (self.params.XAtag):
            #integer#[-h]If #hits < INT, output all in the XA tag
            options.extend(['-h', self.params.XAtag])
        if (self.params.Scorethreshold):
            #integer#[-T]Minimum score to output [default: 30]
            options.extend(['-T', self.params.Scorethreshold])
        if (self.params.OutputallalignmentsforSEorunpairedPE):
            #boolean#[-a]Output all found alignments for single-end or unpaired paired-end reads. These alignments will be flagged as secondary alignments.
            options.append('-a')
        if (self.params.AppendappendFASTAQcommenttoSAMoutput):
            #boolean#[-C]This option can be used to transfer read meta information (e.g. barcode) to the SAM output. Note that the FASTA/Q comment (the string after a space in the header line) must conform the SAM spec (e.g. BC:Z:CGTAC). Malformated comments lead to incorrect SAM output.
            options.append('-C')
        if (self.params.Usesoftclippingforsupplementaryalignments):
            #boolean#[-Y]Use soft clipping for supplementary alignments.
            options.append('-Y')
        if (self.params.Markshortersplithitsassecondary):
            #boolean#[-M]Mark shorter split hits as secondary (for Picard compatibility).
            options.append('-M')
        if (self.params.Completereadgroupheaderline):
            #string#[-R]Specify the read group in a format like '@RG\tID:foo\tSM:bar'. This value takes precedence over per-attribute parameters. [default: constructed from per-attribute parameters or inferred from metadata]
            options.extend(['-R', self.params.Completereadgroupheaderline])
        else:
            if 'ReadGroup' in self.inputs.reads[0].meta:
                sampleName = self.inputs.reads[0].meta.get('ReadGroup')
            elif 'Readgroup' in self.inputs.reads[0].meta:
                sampleName = self.inputs.reads[0].meta.get('Readgroup')
            elif 'readgroup' in self.inputs.reads[0].meta:
                sampleName = self.inputs.reads[0].meta.get('readgroup')
            elif 'RG' in self.inputs.reads[0].meta:
                sampleName = self.inputs.reads[0].meta.get('RG')
            elif 'rg' in self.inputs.reads[0].meta:
                sampleName = self.inputs.reads[0].meta.get('rg')
            elif 'Rg' in self.inputs.reads[0].meta:
                sampleName = self.inputs.reads[0].meta.get('Rg')
            elif 'rG' in self.inputs.reads[0].meta:
                sampleName = self.inputs.reads[0].meta.get('rG')
            else:
                sampleName = os.path.splitext(
                    os.path.basename(self.inputs.reads[0]))[0]

            if 'SampleName' in self.inputs.reads[0].meta:
                smN = self.inputs.reads[0].meta.get('SampleName')
            elif 'sampleName' in self.inputs.reads[0].meta:
                smN = self.inputs.reads[0].meta.get('sampleName')
            elif 'Samplename' in self.inputs.reads[0].meta:
                smN = self.inputs.reads[0].meta.get('Samplename')
            elif 'samplename' in self.inputs.reads[0].meta:
                smN = self.inputs.reads[0].meta.get('samplename')
            elif 'sample' in self.inputs.reads[0].meta:
                smN = self.inputs.reads[0].meta.get('sample')
            elif 'Sample' in self.inputs.reads[0].meta:
                smN = self.inputs.reads[0].meta.get('Sample')
            else:
                smN = 'DefaultSampleName'
            options.extend([
                '-R', '\"@RG\tID:' + re.split(' ', sampleName)[0] + '\tSM:' +
                re.split(' ', smN)[0] + '\tPL:ILLUMINA\"'
            ])


#        if(self.params.Outputformat):

#enum#Select format to output. Sorted BAM option will output coordinate sorted BAM.
#        if(self.params.Filteroutsecondaryalignments):
#boolean#Set to true to filter out secondary alignments. Works only with output format set to BAM or Sorted BAM
#        if(self.params.Duplication):
#enum#Remove duplicates reads from all output files. Implies: Exclude reads marked as duplicates from discordant, splitter, and/or unmapped file.
#        if(self.params.SorternumberofGBs):
#integer#If set to zero, auto-detect best algorithm, else set desired value. [default: 0]
#            options.extend([ 'default: 0', self.params.SorternumberofGBs])
#        if(self.params.SplitfileslargerthanGB):
#integer#Files larger than this value will be split, into this sized chunks for alignment.This number is considered for compressed (.gz) files. For uncompressed files a 3x larger value will be taken. This value is in GB.

#bwa mem [options] <idxbase> <in1.reads>
        run_cmd = ["/opt/bin/bwa", "mem", '-t', '12']
        run_cmd.extend(options)
        run_cmd.extend(['/opt/db/human_g1k_v37_decoy.fasta'])
        for i in range(len(self.inputs.reads)):
            run_cmd.append(self.inputs.reads[i])

        fileNamePath, fileExtension = os.path.splitext(self.inputs.reads[0])
        #run_cmd.extend(['>',fileNamePath+'.sam'])
        #
        if (self.params.Outputformat != 'BAM'
                and self.params.Outputformat != 'Sorted BAM'):
            Process(*run_cmd, stdout=(fileNamePath + '.sam')).run()

        filter1 = []

        if (self.params.Outputformat == 'BAM'
                or self.params.Outputformat == 'Sorted BAM'):
            options2 = ['|', '/opt/bin/bfr', '-b', '256M']
            options2.extend([
                '|',
                '/opt/bin/sambamba_v0.4.7',
                'view',
                '--sam-input',
                '-f',
                'bam',
                '-t',
                '2',
            ])
            if (self.params.Filteroutsecondaryalignments):
                filter1.append("not secondary_alignment")
            if (self.params.Duplication):
                filter1.append("not duplicate")
            if (self.params.Filteroutsecondaryalignments
                    or self.params.Duplication):
                options2.extend(['-F', '\"' + ' and '.join(filter1) + '\"'])

            options2.extend(['-o', fileNamePath + '.bam', '/dev/stdin'])
            run_cmd.extend(options2)
            Process('echo', '#!/bin/bash', stdout='run_haha.sh').run()
            Process('echo', *run_cmd, stdout='run_bwa.sh').run()
            Process('echo', 'wait', stdout='run_end.sh').run()
            Process('cat',
                    'run_haha.sh',
                    'run_bwa.sh',
                    'run_end.sh',
                    stdout='/l3bioinfo/run.sh').run()
            Process('chmod', '777', '/l3bioinfo/run.sh').run()
            Process('/l3bioinfo/run.sh').run()
            #Process('rm','-f',fileNamePath+'.sam' ).run()

            if (self.params.Outputformat == 'BAM'):
                self.outputs.out = fileNamePath + '.bam'
                self.outputs.out.meta = self.inputs.reads[0].make_metadata()
            else:
                Process('/opt/bin/sambamba_v0.4.7', 'sort', '-m', '50G', '-t',
                        '12', '--tmpdir=' + './temp', '-o',
                        fileNamePath + '.sorted.bam',
                        fileNamePath + '.bam').run()
                Process('rm', '-f', fileNamePath + '.bam').run()

                if (self.params.Duplication == 'Mark Duplicates'
                        or self.params.Duplication == 'Remove duplicates'):
                    temp_options = [
                        '/opt/bin/sambamba_v0.4.7', 'markdup', '-t', '12'
                    ]
                    if (self.params.Duplication == 'Remove duplicates'):
                        temp_options.append('--remove-duplicates')
                        temp_options.extend([
                            fileNamePath + '.sorted.bam', fileNamePath + '.bam'
                        ])
                    Process(*temp_options).run()
                    self.outputs.out = fileNamePath + '.bam'
                    self.outputs.out.meta = self.inputs.reads[0].make_metadata(
                    )
                else:
                    Process('mv', fileNamePath + '.sorted.bam',
                            fileNamePath + '.bam').run()
                    self.outputs.out = fileNamePath + '.bam'
                    self.outputs.out.meta = self.inputs.reads[0].make_metadata(
                    )

                if (self.params.CreateIndex):
                    Process('/opt/bin/sambamba_v0.4.7', 'index', '-t', '12',
                            fileNamePath + '.bam',
                            fileNamePath + '.bai').run()
                    self.outputs.out_bai = fileNamePath + '.bai'
        else:
            self.outputs.out = fileNamePath + '.sam'
            self.outputs.out.meta = self.inputs.reads[0].make_metadata()
    def execute(self):
        """Run the four-step GATK VQSR pipeline on the input VCF.

        A shell script (``VQSR.sh``) is assembled that chains, with ``&&``:
        SelectVariants -> VariantRecalibrator -> ApplyRecalibration ->
        SelectVariants. The script is echoed to stdout, written to disk,
        executed with ``sh``, and the five result files are registered as
        outputs whose metadata is derived from the input VCF.

        Before the script is built, ``.gz`` reference/resource inputs are
        passed through ``gunzip`` and missing ``.fai`` / ``.dict`` / ``.tbi``
        companion files are created with samtools and tabix.

        The output file prefix comes from ``params.rename`` (a metadata key of
        the input VCF, or a literal string) with spaces and the characters
        ``/\\:*?'"<>|`` removed; without ``rename`` it is the VCF base name
        minus its ``.vcf`` / ``.vcf.gz`` style extension(s).

        Raises:
            ValueError: if no ``Reference`` input was supplied (previously
                this surfaced as an ``UnboundLocalError`` on ``fa``).
        """

        def _decompressed(path):
            # Run gunzip on ".gz" inputs and return the path without that
            # extension; any other path is returned unchanged.
            stem, extension = os.path.splitext(path)
            if extension == ".gz":
                Process("gunzip", path).run()
                return stem
            return path

        in_vcf = self.inputs.In_vcf

        if self.params.rename:
            # "rename" is either a metadata key of the VCF or a literal prefix.
            if self.params.rename in in_vcf.meta:
                prefix = in_vcf.meta.get(self.params.rename)
            else:
                prefix = self.params.rename
            # str.replace returns a new string, so the result must be kept
            # (the original call discarded it and spaces were never removed).
            prefix = prefix.replace(" ", "")
            rstr = r"[\/\\\:\*\?\'\"\<\>\|]"  # '/\:*?"<>|'
            prefix = re.sub(rstr, "", prefix)
        else:
            vcf_name = os.path.basename(in_vcf)
            prefix = os.path.splitext(vcf_name)[0]
            if os.path.splitext(in_vcf)[1] == ".gz":
                # "sample.vcf.gz" -> "sample": strip the inner extension too.
                prefix = os.path.splitext(prefix)[0]

        select_type = self.params.SelectType.lower()
        out_raw_vcf = prefix + '.raw.' + select_type + '.vcf.gz'
        out_recal = prefix + '.recalibrate_' + select_type + '.recal'
        out_tranches = prefix + '.recalibrate_' + select_type + '.tranches'
        out_rscript = prefix + '.recalibrate_' + select_type + '_plots.R'
        out_vcf = prefix + '.' + select_type + '.vcf.gz'

        # Every step below needs the reference, so fail early and clearly.
        if not self.inputs.Reference:
            raise ValueError("A reference FASTA input (Reference) is required")
        fa = _decompressed(self.inputs.Reference)

        # Create the index / dictionary / tabix companions only when absent.
        if not os.path.isfile(fa + ".fai"):
            Process('/opt/bin/samtools-1.3/samtools', 'faidx', fa).run()
        fa_dict = os.path.splitext(fa)[0] + '.dict'
        if not os.path.isfile(fa_dict):
            Process('/opt/bin/samtools-1.3/samtools', 'dict', fa, '-o',
                    fa_dict).run()
        if not os.path.isfile(in_vcf + ".tbi"):
            Process('/opt/bin/htslib-1.3/tabix', in_vcf).run()

        # Common prefix of all four GATK invocations.
        gatk = "java -Xmx5G -Djava.io.tmpdir=./java_tmp -jar /opt/bin/GenomeAnalysisTK.jar -T "

        # step1 (SelectVariants): extract the requested variant type.
        content = gatk + "SelectVariants -R "
        content += fa + " -V " + in_vcf + " -selectType " + self.params.SelectType
        if self.params.cmd_SelectVariants:
            content += " " + self.params.cmd_SelectVariants
        content += " -o " + out_raw_vcf + " && \\\n"

        # step2 (VariantRecalibrator).
        content += gatk + "VariantRecalibrator -R " + fa + " -input " + out_raw_vcf + " \\\n"

        # Each resource is decompressed whenever its input is present, but
        # only added to the command line when its "-resource:" label is set.
        for name in ("hapmap", "omni", "1000G", "dbsnp", "mills"):
            resource_input = getattr(self.inputs, "In_resource_" + name)
            if not resource_input:
                continue
            resource_path = _decompressed(resource_input)
            resource_label = getattr(self.params, "Resource_" + name)
            if resource_label:
                content += "-resource:" + resource_label + " " + resource_path + " \\\n"

        if self.params.An:
            for an in self.params.An.split(","):
                content += " -an " + an

        content += " -mode " + self.params.Mode
        if self.params.Tranche:
            for tranche in self.params.Tranche.split(","):
                content += " -tranche " + tranche
        if self.params.MaxGaussians:
            content += " --maxGaussians " + str(self.params.MaxGaussians)

        content += " \\\n"
        content += "-recalFile " + out_recal
        content += " -tranchesFile " + out_tranches
        if self.params.cmd_VariantRecalibrator:
            content += " " + self.params.cmd_VariantRecalibrator
        content += " -rscriptFile " + out_rscript + " && \\\n"

        # step3 (ApplyRecalibration): writes the intermediate out_snp.vcf.gz.
        content += gatk + "ApplyRecalibration -R " + fa + " -input " + out_raw_vcf + " -mode " + self.params.Mode
        if self.params.Ts_filter_level:
            content += " --ts_filter_level " + str(self.params.Ts_filter_level)
        if self.params.cmd_ApplyRecalibration:
            content += " " + self.params.cmd_ApplyRecalibration
        content += " -recalFile " + out_recal + " -tranchesFile " + out_tranches + " -o  out_snp.vcf.gz && \\\n"

        # step4 (SelectVariants2): produce the final VCF from the intermediate.
        content += gatk + "SelectVariants -R " + fa + " -V out_snp.vcf.gz "
        content += " -o " + out_vcf + " \n "

        # Echo the script for the job log, then persist and run it.
        sys.stdout.write(content)
        with open("VQSR.sh", "w") as script:
            script.write(content)

        Process("sh", "VQSR.sh").run()

        self.outputs.Out_raw_vcf = out_raw_vcf
        self.outputs.Out_recalFile = out_recal
        self.outputs.Out_tranchesFile = out_tranches
        self.outputs.Out_rscriptFile = out_rscript
        self.outputs.Out_vcf = out_vcf

        # All outputs share one "url" metadata value built from the prefix.
        result_url = "result/" + prefix + "/result_variation/" + select_type

        self.outputs.Out_raw_vcf.meta = in_vcf.make_metadata(url=result_url)
        self.outputs.Out_recalFile.meta = in_vcf.make_metadata(url=result_url)
        self.outputs.Out_tranchesFile.meta = in_vcf.make_metadata(
            url=result_url)
        self.outputs.Out_rscriptFile.meta = in_vcf.make_metadata(
            url=result_url)
        self.outputs.Out_vcf.meta = in_vcf.make_metadata(url=result_url)
Example #7
0
    def execute(self):
        """Build and run GATK RealignerTargetCreator for the input BAMs.

        Truthy job parameters are translated into GATK switches, missing BAM
        indexes are queued as ``samtools index`` jobs and run through
        ``/opt/bin/multi_process``, and then either

        * (``DivideByIntervals`` set) one GATK command per line of the
          breakpoints BED file is appended to a job file and the whole file
          is run through ``multi_process``; each command writes its own
          ``<bam>.dedup.bam.<n>.intervals`` output tagged with ``_interval``
          metadata, or
        * a single GATK process (``-nt 8``) writes
          ``<bam>.dedup.bam.intervals``.

        ``<bam>`` is the first input BAM path without its extension.
        """
        cli_options = []

        def add_from_params(specs):
            # Append switches for every truthy parameter, in table order.
            #   'flag'    -> bare switch
            #   'raw'     -> switch followed by the value as-is
            #   'str'     -> switch followed by str(value)
            #   'nonnull' -> like 'raw', but the literal 'null' is skipped
            for attr, switch, kind in specs:
                value = getattr(self.params, attr)
                if not value:
                    continue
                if kind == 'flag':
                    cli_options.append(switch)
                elif kind == 'str':
                    cli_options.extend([switch, str(value)])
                elif kind == 'nonnull':
                    if value != 'null':
                        cli_options.extend([switch, value])
                else:
                    cli_options.extend([switch, value])

        add_from_params([
            ('DisableRandomization', '-ndrs', 'flag'),
            ('AllowPotentiallyMisencodedQuals',
             '-allowPotentiallyMisencodedQuals', 'flag'),
            # Type of BAQ calculation to apply in the engine.
            ('BAQCalculationType', '-baq', 'raw'),
            # BAQ gap open penalty (Phred scaled).
            ('BAQGapOpenPenalty', '-baqGOP', 'str'),
            # Quality used for reads that lack base quality scores.
            ('DefaultBaseQualities', '-DBQ', 'str'),
            ('DisableIndelQuals', '-DIQ', 'flag'),
            # Coverage to downsample to at any given locus.
            ('DownsampletoCoverage', '-dcov', 'raw'),
            # Fraction [0.0-1.0] of reads to downsample to.
            ('DownsampletoFraction', '-dfrac', 'str'),
            ('DownsamplingType', '-dt', 'nonnull'),
            ('EmitOriginalQuals', '-EOQ', 'flag'),
        ])

        # FixMisencodedQuals is a three-way choice: always on ('True'), or
        # decided from the first BAM's recorded quality scale ('Auto').
        fix_quals = self.params.FixMisencodedQuals
        if fix_quals == 'True':
            cli_options.append('-fixMisencodedQuals')
        elif fix_quals == 'Auto':
            if ("_quality_scale" in self.inputs.bam[0].meta
                    and self.inputs.bam[0].meta.get('_quality_scale') ==
                    'Phred+64'):
                cli_options.append('-fixMisencodedQuals')
                # Record on every input that the scale is now Phred+33.
                for bam in self.inputs.bam:
                    bam.meta['_quality_scale'] = 'Phred+33'

        add_from_params([
            # Interval merging rule for abutting intervals.
            ('IntervalMerging', '-im', 'raw'),
            # Basepairs of padding around each -L interval.
            ('IntervalPadding', '-ip', 'raw'),
            # How the various -L / -XL inputs are combined.
            ('IntervalSetRule', '-isr', 'raw'),
            ('KeepProgramRecords', '-kpr', 'flag'),
            ('MaxRuntime', '-maxRuntime', 'raw'),
            ('MaxRuntimeUnits', '-maxRuntimeUnits', 'raw'),
            ('NonDeterministicRandomSeed', '-ndrs', 'flag'),
            ('PedigreeString', '-pedString', 'raw'),
            ('PedigreeValidationType', '-pedValidationType', 'raw'),
            # Kind of GATK run report to generate.
            ('PhoneHome', '-et', 'raw'),
            ('PreserveQscoresLessThan', '-preserveQ', 'raw'),
            ('ReadFilter', '-rf', 'raw'),
            ('ReadGroupBlackList', '-rgbl', 'raw'),
            ('RemoveProgramRecords', '-rpr', 'flag'),
            ('Tag', '-tag', 'raw'),
            ('Unsafe', '-U', 'nonnull'),
            ('UseLegacyDownsampler', '-use_legacy_downsampler', 'raw'),
            ('UseOriginalQualities', '-OQ', 'flag'),
            # How strict validation should be.
            ('ValidationStrictness', '-S', 'raw'),
            # Tool-specific RealignerTargetCreator settings.
            ('Maximumintervalsize', '-maxInterval', 'raw'),
            ('Minimumreadsatlocus', '-minReads', 'raw'),
            ('Mismatchfraction', '-mismatch', 'str'),
            ('Windowsize', '-window', 'raw'),
        ])

        # Output names are derived from the first BAM, minus its extension.
        bam_stem = os.path.splitext(self.inputs.bam[0])[0]
        single_output = bam_stem + ".dedup.bam.intervals"

        if self.inputs.Gatk_key:
            cli_options.extend(['-K', self.inputs.Gatk_key])
        if self.inputs.exclude_intervals:
            for excluded in self.inputs.exclude_intervals:
                cli_options.extend(['-XL', excluded])

        # When dividing by intervals each job gets its own -L region below,
        # so the exome BED files are only passed in the single-job case.
        if self.inputs.exome_bed and not self.params.DivideByIntervals:
            for target in self.inputs.exome_bed:
                cli_options.extend(['-L', target])

        # JVM arguments after "java -Xmx..."; the heap size is chosen per mode.
        gatk_args = [
            '-Djava.io.tmpdir=/extra/tmp', '-jar',
            '/opt/bin/GenomeAnalysisTK.jar', '-T', 'RealignerTargetCreator'
        ] + cli_options

        if self.inputs.bai:
            Process('touch', *self.inputs.bai).run()

        # Queue an indexing job for every BAM without a .bai / .bam.bai next
        # to it, and add each BAM to the GATK command line.
        index_jobs = 'somefile_temp_Samtool_Index.txt'
        with open(index_jobs, 'a') as jobs:
            for bam in self.inputs.bam:
                stem = os.path.splitext(bam)[0]
                if (not os.path.exists(stem + '.bai')
                        and not os.path.exists(stem + '.bam.bai')):
                    jobs.write('samtools index ' + bam + '\n')
                gatk_args.extend(['-I', bam])
        Process('/opt/bin/multi_process', "-c", '25', '-i', index_jobs).run()

        gatk_args.extend([
            '-R', '/opt/db/human_g1k_v37_decoy.fasta', '--known',
            '/opt/db/Mills_and_1000G_gold_standard.indels.b37.sites.vcf',
            '--known', '/opt/db/1000G_phase1.indels.b37.vcf'
        ])

        if self.params.DivideByIntervals:
            target_jobs = 'somefile_temp_RealignerTargetCreator.txt'
            with open(target_jobs, 'a') as jobs:
                with open('/opt/db/human_g1k_v37_decoy.breakpoints.bed') as bed:
                    bed_lines = bed.readlines()

                for index, line in enumerate(bed_lines):
                    # BED columns (tab separated) -> "chrom:start-end".
                    columns = line.split('\t')
                    region = columns[0] + ':' + columns[1] + '-' + columns[2]
                    if region.endswith('\n'):
                        region = region[:-1]

                    chunk_output = (bam_stem + ".dedup.bam." + str(index) +
                                    ".intervals")
                    job = ['java', '-Xmx2g'] + gatk_args + [
                        '-o', chunk_output, '-L', region
                    ]
                    jobs.write(' '.join(str(arg) for arg in job))
                    jobs.write("\n")

                    self.outputs.dedup_bam_intervals.add_file(chunk_output)
                    self.outputs.dedup_bam_intervals[
                        -1].meta = self.inputs.bam[0].make_metadata(
                            _interval=region)

            Process("/opt/bin/multi_process", '-c', '30', "-i",
                    target_jobs).run()
        else:
            gatk_args.extend(['-o', single_output, '-nt', '8'])
            Process('java', '-Xmx16g', *gatk_args).run()
            self.outputs.dedup_bam_intervals.add_file(single_output)
            self.outputs.dedup_bam_intervals[-1].meta = self.inputs.bam[
                0].make_metadata()
    def execute(self):
        options = []
        if (self.params.DisableRandomization):
            #boolean#[-ndrs]Completely eliminates randomization from nondeterministic methods. To be used mostly in the testing framework where dynamic parallelism can result in differing numbers of calls to the generator.
            options.append('-ndrs')
        if (self.params.AllowPotentiallyMisencodedQuals):
            #boolean#[-allowPotentiallyMisencodedQuals] Do not fail when encountered base qualities that are too high and seemingly indicate a problem with the base quality encoding of the BAM file.
            options.append('-allowPotentiallyMisencodedQuals')
        if (self.params.BAQCalculationType):
            #enum#[-baq]Type of BAQ calculation to apply in the engine.
            options.extend(['-baq', self.params.BAQCalculationType])
        if (self.params.BAQGapOpenPenalty):
            #float#[-baqGOP]BAQ gap open penalty (Phred Scaled). Default value is 40. 30 is perhaps better for whole genome call sets
            options.extend(['-baqGOP', str(self.params.BAQGapOpenPenalty)])
        if (self.params.DefaultBaseQualities):
            #integer#If reads are missing some or all base quality scores, this value will be used for all base quality scores
            options.extend(['-DBQ', str(self.params.DefaultBaseQualities)])
        if (self.params.DisableIndelQuals):
            #boolean#[-DBQ]If 'true', disables printing of base insertion and base deletion tags (with -BQSR). Turns off printing of the base insertion and base deletion tags when using the -BQSR argument and only the base substitution qualities will be produced.
            options.append('-DIQ')
        if (self.params.DownsampletoCoverage):
            #integer#[-dcov]Coverage to downsample to at any given locus; note that downsampled reads are randomly selected from all possible reads at a locus. For non-locus-based traversals (eg., ReadWalkers), this sets the maximum number of reads at each alignment start position.
            options.extend(['-dcov', self.params.DownsampletoCoverage])
        if (self.params.DownsampletoFraction):
            #float#[-dfrac]Fraction [0.0-1.0] of reads to downsample to
            options.extend(['-dfrac', str(self.params.DownsampletoFraction)])
        if (self.params.DownsamplingType
                and self.params.DownsamplingType != 'null'):
            #enum#[-dt]Type of reads downsampling to employ at a given locus. Reads will be selected randomly to be removed from the pile based on the method described here
            options.extend(['-dt', self.params.DownsamplingType])
        if (self.params.EmitOriginalQuals):
            #boolean#[-EOQ]If true, enables printing of the OQ tag with the original base qualities (with -BQSR)
            options.append('-EOQ')
        if (self.params.FixMisencodedQuals):
            #boolean#[-fixMisencodedQuals]Fix mis-encoded base quality scores
            options.append('-fixMisencodedQuals')
        if (self.params.IntervalMerging):
            #enum#[-im]Indicates the interval merging rule we should use for abutting intervals
            options.extend(['-im', self.params.IntervalMerging])
        if (self.params.IntervalPadding):
            #integer#[-ip]Indicates how many basepairs of padding to include around each of the intervals specified with the -L/--intervals argument
            options.extend(['-ip', self.params.IntervalPadding])
        if (self.params.IntervalSetRule):
            #enum#[-isr]Indicates the set merging approach the interval parser should use to combine the various -L or -XL inputs
            options.extend(['-isr', self.params.IntervalSetRule])
        if (self.params.KeepProgramRecords):
            #boolean#[-kpr]Should we override the Walker's default and keep program records from the SAM header
            options.append('-kpr')
        if (self.params.MaxRuntime):
            #integer#[-maxRuntime]If provided, that GATK will stop execution cleanly as soon after maxRuntime has been exceeded, truncating the run but not exiting with a failure.  By default the value is interpreted in minutes, but this can be changed by maxRuntimeUnits
            options.extend(['-maxRuntime', self.params.MaxRuntime])
        if (self.params.MaxRuntimeUnits):
            #enum#[-maxRuntimeUnits] The TimeUnit for maxRuntime
            options.extend(['-maxRuntimeUnits', self.params.MaxRuntimeUnits])
        if (self.params.NonDeterministicRandomSeed):
            #boolean#[-ndrs]Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run
            options.append('-ndrs')
        if (self.params.PedigreeString):
            #string#[-pedString]Pedigree string for samples
            options.extend(['-pedString', self.params.PedigreeString])
        if (self.params.PedigreeValidationType):
            #enum#[-pedValidationType]How strict should we be in validating the pedigree information?
            options.extend(
                ['-pedValidationType', self.params.PedigreeValidationType])
        if (self.params.PhoneHome):
            #enum#[-et]What kind of GATK run report should we generate? STANDARD is the default, can be NO_ET so nothing is posted to the run repository. Please see http://gatkforums.broadinstitute.org/discussion/1250/what-is-phone-home-and-how-does-it-affect-me#latest for details.
            options.extend(['-et', self.params.PhoneHome])
        if (self.params.PreserveQscoresLessThan):
            #integer#[-preserveQ]Bases with quality scores less than this threshold won't be recalibrated (with -BQSR)
            options.extend(['-preserveQ', self.params.PreserveQscoresLessThan])
        if (self.params.ReadFilter):
            #string#[-rf]Specify filtration criteria to apply to each read individually
            options.extend(['-rf', self.params.ReadFilter])
        if (self.params.ReadGroupBlackList):
            #string#[-rgbl]Filters out read groups matching : or a .txt file containing the filter strings one per line.
            options.extend(['-rgbl', self.params.ReadGroupBlackList])
        if (self.params.RemoveProgramRecords):
            #boolean#[-rpr]Should we override the Walker's default and remove program records from the SAM header
            options.append('-rpr')
        if (self.params.Tag):
            #string#[-tag]Arbitrary tag string to identify this GATK run as part of a group of runs, for later analysis
            options.extend(['-tag', self.params.Tag])
        if (self.params.Unsafe and self.params.Unsafe != 'null'):
            #enum#[-U]If set, enables unsafe operations: nothing will be checked at runtime.  For expert users only who know what they are doing.  We do not support usage of this argument.
            options.extend(['-U', self.params.Unsafe])
        if (self.params.UseLegacyDownsampler):
            options.extend(
                ['-use_legacy_downsampler', self.params.UseLegacyDownsampler])
        #boolean#Use the legacy downsampling implementation instead of the newer, less-tested implementation
        if (self.params.UseOriginalQualities):
            #boolean#[-OQ]If set, use the original base quality scores from the OQ tag when present instead of the standard scores
            options.append('-OQ')
        if (self.params.ValidationStrictness):
            #enum#[-S]How strict should we be with validation
            options.extend(['-S', self.params.ValidationStrictness])


#        if(self.params.Groupby):
#enum#Inputs will be grouped by selected value from this category. One output will be generated for each group.
#if(self.params.Memoryperjob):
#integer#Amount of RAM memory to be used per job. Defaults to 2048MB for Single threaded jobs,and all of the available memory on the instance for multi-threaded jobs. Set to 0 for the default value
#if(self.params.Threadsperjob):
#integer#For tools which support multiprocessing, this value can be used to set the number of threads to be used. Set to 0 for auto-detect (use with caution,as auto-detect will find the optimal value in most cases)

        if (self.params.Annotation):
            #string#[-A]One or more specific annotations to apply to variant calls
            options.extend(['-A', self.params.Annotation])
        if (self.params.ComputeSlod):
            #boolean#[-slod]If provided, we will calculate the SLOD (SB annotation)
            options.append('-slod')
        if (self.params.Contamination):
            #float#[-contamination]Fraction of contamination in sequencing data (for all samples) to aggressively remove.
            options.extend(['-contamination', str(self.params.Contamination)])
        if (self.params.ExcludeAnnotation):
            #string#[-XA]One or more specific annotations to exclude
            options.extend(['-XA', self.params.ExcludeAnnotation])
        if (self.params.GenotypeLikelihoodsModel):
            #enum#[-glm]Genotype likelihoods calculation model to employ -- SNP is the default option, while INDEL is also available for calling indels and BOTH is available for calling both together
            options.extend(['-glm', self.params.GenotypeLikelihoodsModel])
        if (self.params.GenotypingMode):
            #enum#[-gt_mode]Specifies how to determine the alternate alleles to use for genotyping
            options.extend(['-gt_mode', self.params.GenotypingMode])
        if (self.params.Group):
            #string#[-G]One or more classes/groups of annotations to apply to variant calls
            options.extend(['-G', self.params.Group])
        if (self.params.Heterozygosity):
            #float#[-hets] Heterozygosity value used to compute prior likelihoods for any locus
            options.extend(['-hets', str(self.params.Heterozygosity)])
        if (self.params.IgnoreLaneInfo):
            #boolean#[-ignoreLane] Ignore lane when building error model, error model is then per-site
            options.append('-ignoreLane')
        if (self.params.IndelHeterozygosity):
            #float#[-indelHeterozygosity]Heterozygosity for indel calling
            options.extend(
                ['-indelHeterozygosity',
                 str(self.params.IndelHeterozygosity)])
        if (self.params.MaxDeletionFraction):
            #float#[-deletions]Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to  1; default:0.05]
            options.extend(
                ['-deletions',
                 str(self.params.MaxDeletionFraction)])
        if (self.params.MinBaseQualityScore):
            #integer#[-mbq]Minimum base quality required to consider a base for calling
            options.extend(['-mbq', str(self.params.MinBaseQualityScore)])
        if (self.params.MinIndelCnt):
            #integer#[-minIndelCnt]Minimum number of consensus indels required to trigger genotyping run
            options.extend(['-minIndelCnt', str(self.params.MinIndelCnt)])
        if (self.params.MinIndelFrac):
            #float#[-minIndelFrac]Minimum fraction of all reads at a locus that must contain an indel (of any allele) for that sample to contribute to the indel count for alleles
            options.extend(['-minIndelFrac', str(self.params.MinIndelFrac)])
        if (self.params.OutputMode):
            #enum#[-out_mode]Specifies which type of calls we should output
            options.extend(['-out_mode', self.params.OutputMode])
        if (self.params.PairHmmImplementation):
            #enum#[-pairHMM]The PairHMM implementation to use for -glm INDEL genotype likelihood calculations
            options.extend(['-pairHMM', self.params.PairHmmImplementation])
        if (self.params.PcrErrorRate):
            #float#The PCR error rate to be used for computing fragment-based likelihoods
            options.extend(['--pcr_error_rate', str(self.params.PcrErrorRate)])
        if (self.params.StandCallConf):
            #float#[-stand_call_conf]The minimum phred-scaled confidence threshold at which variants should be called
            options.extend(
                ['-stand_call_conf',
                 str(self.params.StandCallConf)])
        if (self.params.StandEmitConf):
            #float#[-stand_emit_conf]The minimum phred-scaled confidence threshold at which variants should be emitted (and filtered with LowQual if less than the calling threshold)
            options.extend(
                ['-stand_emit_conf',
                 str(self.params.StandEmitConf)])
        if (self.params.IndelGapContinuationPenalty):
            #integer#[-indelGCP]Indel gap continuation penalty, as Phred-scaled probability.  I.e., 30 => 10^-30/10
            options.extend(
                ['-indelGCP',
                 str(self.params.IndelGapContinuationPenalty)])
        if (self.params.IndelGapOpenPenalty):
            #integer#[-indelGOP]Indel gap open penalty, as Phred-scaled probability.  I.e., 30 => 10^-30/10
            options.extend(['-indelGOP', str(self.params.IndelGapOpenPenalty)])
        if (self.params.MaxAlternateAlleles):
            #integer#[-maxAltAlleles]Maximum number of alternate alleles to genotype
            options.extend(
                ['-maxAltAlleles',
                 str(self.params.MaxAlternateAlleles)])
        if (self.params.PNonrefModel):
            #enum#[--pnrm] Non-reference probability calculation model to employ
            options.extend(['--p_nonref_model', self.params.PNonrefModel])

        out_file_name = "all.vcf"
        fileNamePath, fileExtension = os.path.splitext(self.inputs.bam[0])
        out_file_name = fileNamePath + ".vcf"
        #out_file_name = os.path.basename(self.inputs.bam)
        #out_file_name = outFileNameVCF[::-1].replace(".bam"[::-1], ".vcf"[::-1], 1)[::-1]

        #===========================================================================
        #         # build bam list file
        #     bam_list_file = "bam.list"
        #     with open(bam_list_file, 'w') as f:
        #         for i in range(len(self.inputs.bam_list)/2):
        #             os.rename(self.inputs.bam_list[i*2], self.inputs.bam_list[i*2] + ".bam")
        #             os.rename(self.inputs.bam_list[i*2+1], self.inputs.bam_list[i*2] + ".bai")
        #             f.write("%s\n" % (self.inputs.bam_list[i*2] + ".bam"))
        #
        #===========================================================================

        if (self.inputs.Gatk_key):
            options.extend(['-K', self.inputs.Gatk_key])
        if (self.inputs.exclude_intervals):
            for x in self.inputs.exclude_intervals:
                options.extend(['-XL', x])
        if (self.inputs.exome_bed and not self.params.DivideByIntervals):
            for x in self.inputs.exome_bed:
                options.extend(['-L', x])

        if (self.inputs.Alleles):
            options.extend(['--alleles', self.inputs.Alleles])
        if (self.inputs.comp):
            options.extend(['--comp', self.inputs.comp])
        if (self.inputs.dbSNP):
            options.extend(['--dbsnp', self.inputs.dbSNP])
        if (self.inputs.BQSR):
            options.extend(['--BQSR', self.inputs.BQSR])

        if (self.inputs.bai):
            run_touch = ['touch']
            for x in self.inputs.bai:
                run_touch.append(x)
            Process(*run_touch).run()

        with open('somefile_temp_Samtool_Index.txt', 'a') as the_file:
            for i in xrange(0, len(self.inputs.bam)):
                fileNamePath2, fileExtension = os.path.splitext(
                    self.inputs.bam[i])
                if (not (os.path.exists(fileNamePath2 + '.bai')
                         or os.path.exists(fileNamePath2 + '.bam.bai'))):
                    the_file.write('samtools index ' + self.inputs.bam[i] +
                                   '\n')

        Process('/opt/bin/multi_process', "-c", '25', '-i',
                "somefile_temp_Samtool_Index.txt").run()

        if (self.params.DivideByIntervals):
            with open('somefile_temp_UnifiedGenotyper.txt', 'a') as the_file:
                for i in xrange(0, len(self.inputs.bam)):
                    fileNamePath, fileExtension = os.path.splitext(
                        self.inputs.bam[i])

                    run_cmd2 = [
                        'java', '-Xmx2g', '-Djava.io.tmpdir=/extra/tmp',
                        '-jar', '/opt/bin/GenomeAnalysisTK.jar', '-T',
                        'UnifiedGenotyper', '-nt', '4', '-nct', '1'
                    ]
                    run_cmd2.extend(options)
                    run_cmd2.extend(['--dbsnp', '/opt/db/dbsnp_137.b37.vcf'])
                    run_cmd2.extend(
                        ['-R', '/opt/db/human_g1k_v37_decoy.fasta'])
                    run_cmd2.extend([
                        '-I', self.inputs.bam[i], '-L',
                        self.inputs.bam[i].meta.get('_interval')
                    ])
                    run_cmd2.extend(['-o', '%s.vcf' % (fileNamePath, )])
                    the_file.write(' '.join(str(x) for x in run_cmd2) + '\n')

            Process('/opt/bin/multi_process', "-c", '25', '-i',
                    "somefile_temp_UnifiedGenotyper.txt").run()
            for i in xrange(0, len(self.inputs.bam)):
                fileNamePath, fileExtension = os.path.splitext(
                    self.inputs.bam[i])

                self.outputs.all_vcf.add_file('%s.vcf' % (fileNamePath, ))
                self.outputs.all_vcf[-1].meta = self.inputs.bam[
                    i].make_metadata()
                # remove _interval meta
                if '_interval' in self.inputs.bam[i].meta:
                    self.outputs.all_vcf[-1].meta.pop("_interval", None)
        else:
            #run_cmd = ['java', '-Xmx56g', '-Djava.io.tmpdir=/extra/tmp', '-jar', '/opt/bin/GenomeAnalysisTK.jar', '-T', 'UnifiedGenotyper','-nt', '32', '-nct', '1']
            run_cmd = [
                'java', '-Xmx16g', '-Djava.io.tmpdir=/extra/tmp', '-jar',
                '/opt/bin/GenomeAnalysisTK.jar', '-T', 'UnifiedGenotyper',
                '-nt', '12', '-nct', '1'
            ]
            run_cmd.extend(options)
            run_cmd.extend(['--dbsnp', '/opt/db/dbsnp_137.b37.vcf'])
            run_cmd.extend([
                '-R', '/opt/db/human_g1k_v37_decoy.fasta', '-I',
                self.inputs.bam[0], '-o', out_file_name
            ])
            #Process('samtools', 'index', self.inputs.bam[0]).run()
            Process(*run_cmd).run()
            self.outputs.all_vcf.add_file(out_file_name)
            self.outputs.all_vcf[-1].meta = self.inputs.bam[0].make_metadata()
    def execute(self):
        """Run GATK CombineVariants over ``self.inputs.vcfs``.

        Every value that is set in ``self.params`` is translated into its
        command-line switch; the optional GATK key and interval inputs and
        one ``--variant`` per input VCF are appended, and the resulting
        command is run through ``Process``.

        The merged file is named after the first input VCF with its
        extension replaced by ``.final.vcf``.  It is published as
        ``self.outputs.out`` with metadata built from the first input VCF.

        Numeric parameters are converted with ``str()`` before being placed
        on the command line, and boolean parameters are emitted as bare
        switches (no value argument).
        """
        options = []

        # ---- GATK engine-level arguments ---------------------------------
        if (self.params.DisableRandomization):
            #boolean#[-ndrs]Completely eliminates randomization from nondeterministic methods. To be used mostly in the testing framework where dynamic parallelism can result in differing numbers of calls to the generator.
            # NOTE(review): this emits the same '-ndrs' switch as
            # NonDeterministicRandomSeed further down although the two
            # descriptions are opposite -- confirm the intended switch
            # against the GATK documentation.
            options.append('-ndrs')
        if (self.params.AllowPotentiallyMisencodedQuals):
            #boolean#[-allowPotentiallyMisencodedQuals] Do not fail when encountered base qualities that are too high and seemingly indicate a problem with the base quality encoding of the BAM file.
            options.append('-allowPotentiallyMisencodedQuals')
        if (self.params.BAQCalculationType):
            #enum#[-baq]Type of BAQ calculation to apply in the engine.
            options.extend(['-baq', self.params.BAQCalculationType])
        if (self.params.BAQGapOpenPenalty):
            #float#[-baqGOP]BAQ gap open penalty (Phred Scaled). Default value is 40. 30 is perhaps better for whole genome call sets
            options.extend(['-baqGOP', str(self.params.BAQGapOpenPenalty)])
        if (self.params.DefaultBaseQualities):
            #integer#[-DBQ]If reads are missing some or all base quality scores, this value will be used for all base quality scores
            options.extend(['-DBQ', str(self.params.DefaultBaseQualities)])
        if (self.params.DisableIndelQuals):
            #boolean#[-DIQ]If 'true', disables printing of base insertion and base deletion tags (with -BQSR). Turns off printing of the base insertion and base deletion tags when using the -BQSR argument and only the base substitution qualities will be produced.
            options.append('-DIQ')
        if (self.params.DownsampletoCoverage):
            #integer#[-dcov]Coverage to downsample to at any given locus; note that downsampled reads are randomly selected from all possible reads at a locus. For non-locus-based traversals (eg., ReadWalkers), this sets the maximum number of reads at each alignment start position.
            options.extend(['-dcov', str(self.params.DownsampletoCoverage)])
        if (self.params.DownsampletoFraction):
            #float#[-dfrac]Fraction [0.0-1.0] of reads to downsample to
            options.extend(['-dfrac', str(self.params.DownsampletoFraction)])
        if (self.params.DownsamplingType
                and self.params.DownsamplingType != 'null'):
            #enum#[-dt]Type of reads downsampling to employ at a given locus. Reads will be selected randomly to be removed from the pile based on the method described here
            # The literal string 'null' is treated the same as "not set".
            options.extend(['-dt', self.params.DownsamplingType])
        if (self.params.EmitOriginalQuals):
            #boolean#[-EOQ]If true, enables printing of the OQ tag with the original base qualities (with -BQSR)
            options.append('-EOQ')
        if (self.params.FixMisencodedQuals):
            #boolean#[-fixMisencodedQuals]Fix mis-encoded base quality scores
            options.append('-fixMisencodedQuals')
        if (self.params.IntervalMerging):
            #enum#[-im]Indicates the interval merging rule we should use for abutting intervals
            options.extend(['-im', self.params.IntervalMerging])
        if (self.params.IntervalPadding):
            #integer#[-ip]Indicates how many basepairs of padding to include around each of the intervals specified with the -L/--intervals argument
            options.extend(['-ip', str(self.params.IntervalPadding)])
        if (self.params.IntervalSetRule):
            #enum#[-isr]Indicates the set merging approach the interval parser should use to combine the various -L or -XL inputs
            options.extend(['-isr', self.params.IntervalSetRule])
        if (self.params.KeepProgramRecords):
            #boolean#[-kpr]Should we override the Walker's default and keep program records from the SAM header
            options.append('-kpr')
        if (self.params.MaxRuntime):
            #integer#[-maxRuntime]If provided, that GATK will stop execution cleanly as soon after maxRuntime has been exceeded, truncating the run but not exiting with a failure.  By default the value is interpreted in minutes, but this can be changed by maxRuntimeUnits
            options.extend(['-maxRuntime', str(self.params.MaxRuntime)])
        if (self.params.MaxRuntimeUnits):
            #enum#[-maxRuntimeUnits] The TimeUnit for maxRuntime
            options.extend(['-maxRuntimeUnits', self.params.MaxRuntimeUnits])
        if (self.params.NonDeterministicRandomSeed):
            #boolean#[-ndrs]Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run
            options.append('-ndrs')
        if (self.params.PedigreeString):
            #string#[-pedString]Pedigree string for samples
            options.extend(['-pedString', self.params.PedigreeString])
        if (self.params.PedigreeValidationType):
            #enum#[-pedValidationType]How strict should we be in validating the pedigree information?
            options.extend(
                ['-pedValidationType', self.params.PedigreeValidationType])
        if (self.params.PhoneHome):
            #enum#[-et]What kind of GATK run report should we generate? STANDARD is the default, can be NO_ET so nothing is posted to the run repository. Please see http://gatkforums.broadinstitute.org/discussion/1250/what-is-phone-home-and-how-does-it-affect-me#latest for details.
            options.extend(['-et', self.params.PhoneHome])
        if (self.params.PreserveQscoresLessThan):
            #integer#[-preserveQ]Bases with quality scores less than this threshold won't be recalibrated (with -BQSR)
            options.extend(
                ['-preserveQ',
                 str(self.params.PreserveQscoresLessThan)])
        if (self.params.ReadFilter):
            #string#[-rf]Specify filtration criteria to apply to each read individually
            options.extend(['-rf', self.params.ReadFilter])
        if (self.params.ReadGroupBlackList):
            #string#[-rgbl]Filters out read groups matching : or a .txt file containing the filter strings one per line.
            options.extend(['-rgbl', self.params.ReadGroupBlackList])
        if (self.params.RemoveProgramRecords):
            #boolean#[-rpr]Should we override the Walker's default and remove program records from the SAM header
            options.append('-rpr')
        if (self.params.Tag):
            #string#[-tag]Arbitrary tag string to identify this GATK run as part of a group of runs, for later analysis
            options.extend(['-tag', self.params.Tag])
        if (self.params.Unsafe and self.params.Unsafe != 'null'):
            #enum#[-U]If set, enables unsafe operations: nothing will be checked at runtime.  For expert users only who know what they are doing.  We do not support usage of this argument.
            # The literal string 'null' is treated the same as "not set".
            options.extend(['-U', self.params.Unsafe])
        if (self.params.UseLegacyDownsampler):
            #boolean#[-use_legacy_downsampler]Use the legacy downsampling implementation instead of the newer, less-tested implementation
            # Boolean switch: emitted on its own, like the other boolean
            # parameters, rather than followed by the parameter's value.
            options.append('-use_legacy_downsampler')
        if (self.params.UseOriginalQualities):
            #boolean#[-OQ]If set, use the original base quality scores from the OQ tag when present instead of the standard scores
            options.append('-OQ')
        if (self.params.ValidationStrictness):
            #enum#[-S]How strict should we be with validation
            options.extend(['-S', self.params.ValidationStrictness])

        # ---- CombineVariants-specific arguments --------------------------
        if (self.params.AssumeIdenticalSamples):
            #boolean#[--assumeIdenticalSamples]If true, assume input VCFs have identical sample sets and disjoint calls
            options.append('--assumeIdenticalSamples')
        if (self.params.FilteredAreUncalled):
            #boolean#[--filteredAreUncalled]If true, then filtered VCFs are treated as uncalled, so that filtered set annotations don't appear in the combined VCF
            options.append('--filteredAreUncalled')
        if (self.params.Filteredrecordsmergetype):
            #enum#[--filteredrecordsmergetype]Determines how we should handle records seen at the same site in the VCF, but with different FILTER fields
            options.extend([
                '--filteredrecordsmergetype',
                self.params.Filteredrecordsmergetype
            ])
        if (self.params.Genotypemergeoption
                and self.params.Genotypemergeoption != 'null'):
            #enum#[--genotypemergeoption] Determines how we should merge genotype records for samples shared across the ROD files
            # The literal string 'null' is treated the same as "not set".
            options.extend(
                ['--genotypemergeoption', self.params.Genotypemergeoption])
        if (self.params.MergeInfoWithMaxAc):
            #boolean#[--mergeInfoWithMaxAC] If true, when VCF records overlap the info field is taken from the one with the max AC instead of only taking the fields which are identical across the overlapping records.
            options.append('--mergeInfoWithMaxAC')
        if (self.params.MinimalVcf):
            #boolean#[--minimalVCF] If true, then the output VCF will contain no INFO or genotype FORMAT fields
            options.append('--minimalVCF')
        if (self.params.MinimumN):
            #integer#[--minimumN]Combine variants and output site only if the variant is present in at least N input files.
            options.extend(['--minimumN', str(self.params.MinimumN)])
        if (self.params.PrintComplexMerges):
            #boolean#[--printComplexMerges]Print out interesting sites requiring complex compatibility merging
            options.append('--printComplexMerges')
        if (self.params.SetKey):
            #string#[--setKey]Key used in the INFO key=value tag emitted describing which set the combined VCF record came from
            options.extend(['--setKey', self.params.SetKey])
        if (self.params.SuppressCommandLineHeader):
            #boolean#[--suppressCommandLineHeader] If true, do not output the header containing the command line used
            options.append('--suppressCommandLineHeader')

        # Output name: first input VCF with its extension swapped for
        # ".final.vcf" (the directory part of the path is kept).
        out_file_name = os.path.splitext(self.inputs.vcfs[0])[0] + ".final.vcf"

        # ---- Optional file inputs ----------------------------------------
        if (self.inputs.Gatk_key):
            options.extend(['-K', self.inputs.Gatk_key])
        if (self.inputs.exclude_intervals):
            for x in self.inputs.exclude_intervals:
                options.extend(['-XL', x])
        if (self.inputs.exome_bed):
            for x in self.inputs.exome_bed:
                options.extend(['-L', x])

        # Debug trace of the inputs; the parenthesised form prints the same
        # text under both Python 2 and Python 3.
        print(self.inputs.vcfs)
        for x in self.inputs.vcfs:
            options.extend(['--variant', x])

        # ---- Assemble and run the command --------------------------------
        run_cmd = [
            'java', '-Xmx2g', '-Djava.io.tmpdir=/tmp', '-jar',
            '/opt/bin/GenomeAnalysisTK.jar', '-T', 'CombineVariants'
        ]
        run_cmd.extend(options)
        run_cmd.extend(
            ['-R', '/opt/db/human_g1k_v37_decoy.fasta', '-o', out_file_name])
        Process(*run_cmd).run()

        # Publish the merged VCF, with metadata built from the first input.
        self.outputs.out = out_file_name
        self.outputs.out.meta = self.inputs.vcfs[0].make_metadata()
Example #10
0
    def execute(self):
        options = []
        if (self.params.DisableRandomization):
            #boolean#[-ndrs]Completely eliminates randomization from nondeterministic methods. To be used mostly in the testing framework where dynamic parallelism can result in differing numbers of calls to the generator.
            options.append('-ndrs')
        if (self.params.AllowPotentiallyMisencodedQuals):
            #boolean#[-allowPotentiallyMisencodedQuals] Do not fail when encountered base qualities that are too high and seemingly indicate a problem with the base quality encoding of the BAM file.
            options.append('-allowPotentiallyMisencodedQuals')
        if (self.params.BAQCalculationType):
            #enum#[-baq]Type of BAQ calculation to apply in the engine.
            options.extend(['-baq', self.params.BAQCalculationType])
        if (self.params.BAQGapOpenPenalty):
            #float#[-baqGOP]BAQ gap open penalty (Phred Scaled). Default value is 40. 30 is perhaps better for whole genome call sets
            options.extend(['-baqGOP', str(self.params.BAQGapOpenPenalty)])
        if (self.params.DefaultBaseQualities):
            #integer#If reads are missing some or all base quality scores, this value will be used for all base quality scores
            options.extend(['-DBQ', str(self.params.DefaultBaseQualities)])
        if (self.params.DisableIndelQuals):
            #boolean#[-DBQ]If 'true', disables printing of base insertion and base deletion tags (with -BQSR). Turns off printing of the base insertion and base deletion tags when using the -BQSR argument and only the base substitution qualities will be produced.
            options.append('-DIQ')
        if (self.params.DownsampletoCoverage):
            #integer#[-dcov]Coverage to downsample to at any given locus; note that downsampled reads are randomly selected from all possible reads at a locus. For non-locus-based traversals (eg., ReadWalkers), this sets the maximum number of reads at each alignment start position.
            options.extend(['-dcov', self.params.DownsampletoCoverage])
        if (self.params.DownsampletoFraction):
            #float#[-dfrac]Fraction [0.0-1.0] of reads to downsample to
            options.extend(['-dfrac', str(self.params.DownsampletoFraction)])
        if (self.params.DownsamplingType
                and self.params.DownsamplingType != 'null'):
            #enum#[-dt]Type of reads downsampling to employ at a given locus. Reads will be selected randomly to be removed from the pile based on the method described here
            options.extend(['-dt', self.params.DownsamplingType])
        if (self.params.EmitOriginalQuals):
            #boolean#[-EOQ]If true, enables printing of the OQ tag with the original base qualities (with -BQSR)
            options.append('-EOQ')
        if (self.params.FixMisencodedQuals):
            #boolean#[-fixMisencodedQuals]Fix mis-encoded base quality scores
            options.append('-fixMisencodedQuals')
        if (self.params.IntervalMerging):
            #enum#[-im]Indicates the interval merging rule we should use for abutting intervals
            options.extend(['-im', self.params.IntervalMerging])
        if (self.params.IntervalPadding):
            #integer#[-ip]Indicates how many basepairs of padding to include around each of the intervals specified with the -L/--intervals argument
            options.extend(['-ip', self.params.IntervalPadding])
        if (self.params.IntervalSetRule):
            #enum#[-isr]Indicates the set merging approach the interval parser should use to combine the various -L or -XL inputs
            options.extend(['-isr', self.params.IntervalSetRule])
        if (self.params.KeepProgramRecords):
            #boolean#[-kpr]Should we override the Walker's default and keep program records from the SAM header
            options.append('-kpr')
        if (self.params.MaxRuntime):
            #integer#[-maxRuntime]If provided, that GATK will stop execution cleanly as soon after maxRuntime has been exceeded, truncating the run but not exiting with a failure.  By default the value is interpreted in minutes, but this can be changed by maxRuntimeUnits
            options.extend(['-maxRuntime', self.params.MaxRuntime])
        if (self.params.MaxRuntimeUnits):
            #enum#[-maxRuntimeUnits] The TimeUnit for maxRuntime
            options.extend(['-maxRuntimeUnits', self.params.MaxRuntimeUnits])
        if (self.params.NonDeterministicRandomSeed):
            #boolean#[-ndrs]Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run
            options.append('-ndrs')
        if (self.params.PedigreeString):
            #string#[-pedString]Pedigree string for samples
            options.extend(['-pedString', self.params.PedigreeString])
        if (self.params.PedigreeValidationType):
            #enum#[-pedValidationType]How strict should we be in validating the pedigree information?
            options.extend(
                ['-pedValidationType', self.params.PedigreeValidationType])
        if (self.params.PhoneHome):
            #enum#[-et]What kind of GATK run report should we generate? STANDARD is the default, can be NO_ET so nothing is posted to the run repository. Please see http://gatkforums.broadinstitute.org/discussion/1250/what-is-phone-home-and-how-does-it-affect-me#latest for details.
            options.extend(['-et', self.params.PhoneHome])
        if (self.params.PreserveQscoresLessThan):
            #integer#[-preserveQ]Bases with quality scores less than this threshold won't be recalibrated (with -BQSR)
            options.extend(['-preserveQ', self.params.PreserveQscoresLessThan])
        if (self.params.ReadFilter):
            #string#[-rf]Specify filtration criteria to apply to each read individually
            options.extend(['-rf', self.params.ReadFilter])
        if (self.params.ReadGroupBlackList):
            #string#[-rgbl]Filters out read groups matching : or a .txt file containing the filter strings one per line.
            options.extend(['-rgbl', self.params.ReadGroupBlackList])
        if (self.params.RemoveProgramRecords):
            #boolean#[-rpr]Should we override the Walker's default and remove program records from the SAM header
            options.append('-rpr')
        if (self.params.Tag):
            #string#[-tag]Arbitrary tag string to identify this GATK run as part of a group of runs, for later analysis
            options.extend(['-tag', self.params.Tag])
        if (self.params.Unsafe and self.params.Unsafe != 'null'):
            #enum#[-U]If set, enables unsafe operations: nothing will be checked at runtime.  For expert users only who know what they are doing.  We do not support usage of this argument.
            options.extend(['-U', self.params.Unsafe])
        if (self.params.UseLegacyDownsampler):
            options.extend(
                ['-use_legacy_downsampler', self.params.UseLegacyDownsampler])
        #boolean#Use the legacy downsampling implementation instead of the newer, less-tested implementation
        if (self.params.UseOriginalQualities):
            #boolean#[-OQ]If set, use the original base quality scores from the OQ tag when present instead of the standard scores
            options.append('-OQ')
        if (self.params.ValidationStrictness):
            #enum#[-S]How strict should we be with validation
            options.extend(['-S', self.params.ValidationStrictness])

        if (self.params.ClusterSize):
            #integer#[-cluster]The number of SNPs which make up a cluster
            options.extend(['-cluster', str(self.params.ClusterSize)])
        if (self.params.ClusterWindowSize):
            #integer#[-window]The window size (in bases) in which to evaluate clustered SNPs
            options.extend(['-window', str(self.params.ClusterWindowSize)])
        if (self.params.FiltersName):
            #string#[-filter] One or more expression used with INFO fields to filter
            for x in re.split(',', self.params.FiltersName):
                options.extend(['--filterName', x])
        if (self.params.Filters):
            #string#[-filter] One or more expression used with INFO fields to filter
            for x in re.split(',', self.params.Filters):
                options.extend(['-filter', x])
        if (self.params.GenotypefiltersName):
            #string#[-G_filter] One or more expression used with FORMAT (sample/genotype-level) fields to filter (see documentation guide for more info)
            for x in re.split(',', self.params.GenotypefiltersName):
                options.extend(['--genotypeFilterName', x])
        if (self.params.Genotypefilters):
            #string#[-G_filter] One or more expression used with FORMAT (sample/genotype-level) fields to filter (see documentation guide for more info)
            for x in re.split(',', self.params.Genotypefilters):
                options.extend(['-G_filter', x])
        if (self.params.InvalidatePreviousFilters):
            #boolean#[--invalidatePreviousFilters]Remove previous filters applied to the VCF
            options.append('--invalidatePreviousFilters')
        if (self.params.MaskExtension):
            #integer#[-maskExtend]How many bases beyond records from a provided 'mask' rod should variants be filtered
            options.extend(['-maskExtend', str(self.params.MaskExtension)])
        if (self.params.MaskName):
            #string#[--maskName]The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call
            options.extend(['--maskName', self.params.MaskName])
        if (self.params.MissingValuesInExpressionsShouldEvaluateAsFailing):
            #boolean#[--missingValuesInExpressionsShouldEvaluateAsFailing] When evaluating the JEXL expressions, missing values should be considered failing the expression
            options.append(
                '--missingValuesInExpressionsShouldEvaluateAsFailing')

        out_file_name = 'filtered.vcf'
        fileNamePath, fileExtension = os.path.splitext(self.inputs.vcf)
        out_file_name = fileNamePath + ".filtered.vcf"
        #===========================================================================
        # if self.inputs.exome_bed:
        #     Process('java', '-Xmx16g', '-Djava.io.tmpdir=/tmp', '-jar', '/opt/bin/GenomeAnalysisTK-3.2-2.jar', '-l', 'INFO', '-R', '/opt/db/ucsc.hg19.fasta', '-et', 'NO_ET', '-K', '/opt/db/rbluo_cs.hku.hk.key', '-T', 'VariantRecalibrator', '-nt', '12', '-input', self.inputs.inp, '-resource:hapmap,known=false,training=true,truth=true,prior=15.0', '/opt/db/hapmap_3.3.hg19.vcf', '-resource:omni,known=false,training=true,truth=false,prior=12.0', '/opt/db/1000G_omni2.5.hg19.vcf', '-resource:dbsnp,known=true,training=false,truth=false,prior=8.0', '/opt/db/dbsnp_138.hg19.vcf', '-resource:mills,VCF,known=true,training=true,truth=true,prior=12.0', '/opt/db/Mills_and_1000G_gold_standard.indels.hg19.vcf', '-resource:phase1,VCF,known=true,training=true,truth=true,prior=9.0', '/opt/db/1000G_phase1.indels.hg19.vcf', '-an', 'FS', '-an', 'QD', '-an', 'ReadPosRankSum', '-an', 'HaplotypeScore', '-an', 'MQ', '-recalFile', recal_file_name, '-tranchesFile', tranches_file_name, '--TStranche', '90.0', '--TStranche', '93.0', '--TStranche', '95.0', '--TStranche', '97.0', '--TStranche', '99.0', '--TStranche', '100.0', '-mode', 'BOTH', '-L', self.inputs.exome_bed).run()
        # else:
        #     Process('java', '-Xmx16g', '-Djava.io.tmpdir=/tmp', '-jar', '/opt/bin/GenomeAnalysisTK-3.2-2.jar', '-l', 'INFO', '-R', '/opt/db/ucsc.hg19.fasta', '-et', 'NO_ET', '-K', '/opt/db/rbluo_cs.hku.hk.key', '-T', 'VariantRecalibrator', '-nt', '12', '-input', self.inputs.inp, '-resource:hapmap,known=false,training=true,truth=true,prior=15.0', '/opt/db/hapmap_3.3.hg19.vcf', '-resource:omni,known=false,training=true,truth=false,prior=12.0', '/opt/db/1000G_omni2.5.hg19.vcf', '-resource:dbsnp,known=true,training=false,truth=false,prior=8.0', '/opt/db/dbsnp_138.hg19.vcf', '-resource:mills,VCF,known=true,training=true,truth=true,prior=12.0', '/opt/db/Mills_and_1000G_gold_standard.indels.hg19.vcf', '-resource:phase1,VCF,known=true,training=true,truth=true,prior=9.0', '/opt/db/1000G_phase1.indels.hg19.vcf', '-an', 'FS', '-an', 'QD', '-an', 'ReadPosRankSum', '-an', 'HaplotypeScore', '-an', 'MQ', '-recalFile', recal_file_name, '-tranchesFile', tranches_file_name, '--TStranche', '90.0', '--TStranche', '93.0', '--TStranche', '95.0', '--TStranche', '97.0', '--TStranche', '99.0', '--TStranche', '100.0', '-mode', 'BOTH').run()
        #===========================================================================
        if (self.inputs.Gatk_key):
            options.extend(['-K', self.inputs.Gatk_key])
        if (self.inputs.exclude_intervals):
            for x in self.inputs.exclude_intervals:
                options.extend(['-XL', x])
        if (self.inputs.exome_bed):
            for x in self.inputs.exome_bed:
                options.extend(['-L', x])
        if (self.inputs.mask):
            options.extend(['--mask', self.inputs.mask])

        run_cmd = [
            'java', '-Xmx2g', '-Djava.io.tmpdir=/tmp', '-jar',
            '/opt/bin/GenomeAnalysisTK.jar', '-T', 'VariantFiltration'
        ]
        run_cmd.extend(options)

        run_cmd.extend([
            '-R', '/opt/db/human_g1k_v37_decoy.fasta', '--variant',
            self.inputs.vcf, '-o', out_file_name
        ])
        Process(*run_cmd).run()

        self.outputs.out = out_file_name
        self.outputs.out.meta = self.inputs.vcf.make_metadata()
    def execute(self):
        """Build and run the GATK BaseRecalibrator command for the input BAMs.

        Steps, in order:
          1. Translate every populated ``self.params`` value into the
             matching GATK command-line option.
          2. ``touch`` any supplied ``.bai`` files.
          3. Queue a ``samtools index`` command for every BAM that has
             neither ``<stem>.bai`` nor ``<stem>.bam.bai`` on disk and run
             the queue through ``/opt/bin/multi_process``.
          4. Run BaseRecalibrator and publish the resulting table as
             ``self.outputs.grp``.

        Reads ``self.params.*`` and ``self.inputs.bam`` / ``bai`` /
        ``Gatk_key`` / ``exclude_intervals`` / ``exome_bed``.  Returns
        nothing; the result is reported through ``self.outputs.grp``.
        """
        # Each row is (parameter name, GATK switch, kind).  Kinds:
        #   'flag'     - boolean; only the switch is emitted
        #   'text'     - switch followed by the value as given
        #   'number'   - switch followed by str(value)
        #   'nullable' - like 'text', but the literal 'null' means "unset"
        # A parameter is skipped whenever its value is falsy.
        # NOTE(review): that also drops an explicit 0 for numeric
        # parameters - confirm this is intended.
        option_table = (
            # --- BaseRecalibrator arguments ---
            # binary tag covariate name
            ('BinaryTagName', '-bintag', 'text'),
            # covariate to be used in the recalibration
            ('Covariate', '-cov', 'text'),
            # default quality for the base deletions covariate
            ('DeletionsDefaultQuality', '-ddq', 'number'),
            # k-mer context size for base insertions and deletions
            ('IndelsContextSize', '-ics', 'number'),
            # default quality for the base insertions covariate
            ('InsertionsDefaultQuality', '-idq', 'number'),
            # minimum quality for bases in the read tail to be considered
            ('LowQualityTail', '-lqt', 'number'),
            # maximum cycle value permitted for the Cycle covariate
            ('MaximumCycleValue', '-maxCycle', 'number'),
            # k-mer context size for base mismatches
            ('MismatchesContextSize', '-mcs', 'number'),
            # default quality for the base mismatches covariate
            ('MismatchesDefaultQuality', '-mdq', 'number'),
            # number of distinct quality scores in the quantized output
            ('QuantizingLevels', '-ql', 'number'),
            # behaviour on no-calls in color space
            ('SolidNocallStrategy', '--solid_nocall_strategy', 'text'),
            # how to recalibrate solid bases where the reference was inserted
            ('SolidRecalMode', '-sMode', 'text'),
            # BQSR BAQ gap open penalty (Phred scaled)
            ('BqsrBaqGapOpenPenalty', '-bqsrBAQGOP', 'number'),
            # allow running without a dbsnp rod
            ('RunWithoutDbsnpPotentiallyRuiningQuality',
             '-run_without_dbsnp_potentially_ruining_quality', 'flag'),

            # --- GATK engine arguments ---
            # NOTE(review): this emits the same switch as
            # NonDeterministicRandomSeed below, so enabling both repeats
            # '-ndrs'; the mapping looks suspicious - confirm against the
            # GATK argument list before changing it.
            ('DisableRandomization', '-ndrs', 'flag'),
            # do not fail on base qualities that look mis-encoded
            ('AllowPotentiallyMisencodedQuals',
             '-allowPotentiallyMisencodedQuals', 'flag'),
            # type of BAQ calculation to apply in the engine
            ('BAQCalculationType', '-baq', 'text'),
            # BAQ gap open penalty (Phred scaled)
            ('BAQGapOpenPenalty', '-baqGOP', 'number'),
            # quality used for reads missing base quality scores
            ('DefaultBaseQualities', '-DBQ', 'number'),
            # NOTE(review): '--disable_indel_quals' is always passed in the
            # base command below; presumably '-DIQ' is its short form -
            # confirm whether passing both is accepted.
            ('DisableIndelQuals', '-DIQ', 'flag'),
            # coverage to downsample to at any given locus
            ('DownsampletoCoverage', '-dcov', 'number'),
            # fraction [0.0-1.0] of reads to downsample to
            ('DownsampletoFraction', '-dfrac', 'number'),
            # type of read downsampling to employ at a given locus
            ('DownsamplingType', '-dt', 'nullable'),
            # print the OQ tag with the original base qualities
            ('EmitOriginalQuals', '-EOQ', 'flag'),
            # fix mis-encoded base quality scores
            ('FixMisencodedQuals', '-fixMisencodedQuals', 'flag'),
            # interval merging rule for abutting intervals
            ('IntervalMerging', '-im', 'text'),
            # basepairs of padding around each -L interval
            ('IntervalPadding', '-ip', 'number'),
            # set merging approach for combining -L / -XL inputs
            ('IntervalSetRule', '-isr', 'text'),
            # keep program records from the SAM header
            ('KeepProgramRecords', '-kpr', 'flag'),
            # stop cleanly once this runtime has been exceeded
            ('MaxRuntime', '-maxRuntime', 'number'),
            # time unit for maxRuntime
            ('MaxRuntimeUnits', '-maxRuntimeUnits', 'text'),
            # random numbers differ in every run
            ('NonDeterministicRandomSeed', '-ndrs', 'flag'),
            # pedigree string for samples
            ('PedigreeString', '-pedString', 'text'),
            # strictness of pedigree validation
            ('PedigreeValidationType', '-pedValidationType', 'text'),
            # kind of GATK run report to generate (e.g. NO_ET)
            ('PhoneHome', '-et', 'text'),
            # bases below this quality are not recalibrated
            ('PreserveQscoresLessThan', '-preserveQ', 'number'),
            # filtration criteria applied to each read
            ('ReadFilter', '-rf', 'text'),
            # read groups to filter out
            ('ReadGroupBlackList', '-rgbl', 'text'),
            # remove program records from the SAM header
            ('RemoveProgramRecords', '-rpr', 'flag'),
            # arbitrary tag identifying this GATK run
            ('Tag', '-tag', 'text'),
            # enables unsafe operations
            ('Unsafe', '-U', 'nullable'),
            # Boolean switch: emit the switch alone, not the Python
            # boolean as a trailing argument.
            ('UseLegacyDownsampler', '-use_legacy_downsampler', 'flag'),
            # use original base qualities from the OQ tag when present
            ('UseOriginalQualities', '-OQ', 'flag'),
            # validation strictness
            ('ValidationStrictness', '-S', 'text'),
        )

        options = []
        for param_name, switch, kind in option_table:
            value = getattr(self.params, param_name)
            if not value:
                continue
            if kind == 'nullable' and value == 'null':
                continue
            if kind == 'flag':
                options.append(switch)
            elif kind == 'number':
                # The command line must consist of strings.
                options.extend([switch, str(value)])
            else:
                options.extend([switch, value])

        # The recalibration table is written next to the first input BAM.
        bam_stem = os.path.splitext(self.inputs.bam[0])[0]
        out_file_name = bam_stem + ".dedup.realn.bam.table"

        if self.inputs.bai:
            # NOTE(review): presumably refreshes the index timestamps so
            # they are not older than the BAMs - confirm.
            Process(*(['touch'] + list(self.inputs.bai))).run()

        if self.inputs.Gatk_key:
            options.extend(['-K', self.inputs.Gatk_key])
        if self.inputs.exclude_intervals:
            for interval in self.inputs.exclude_intervals:
                options.extend(['-XL', interval])

        run_cmd = [
            'java', '-Xmx16g', '-Djava.io.tmpdir=/extra/tmp', '-jar',
            '/opt/bin/GenomeAnalysisTK.jar', '-T', 'BaseRecalibrator', '-nct',
            '8', '--disable_indel_quals'
        ]
        run_cmd.extend(options)

        # Queue an indexing job for every BAM without an index on disk.
        # The job file is opened in append mode, so commands left by an
        # earlier run in the same directory are executed again.
        index_jobs_path = 'somefile_temp_Samtool_Index.txt'
        with open(index_jobs_path, 'a') as index_jobs:
            for bam_path in self.inputs.bam:
                stem = os.path.splitext(bam_path)[0]
                has_index = (os.path.exists(stem + '.bai')
                             or os.path.exists(stem + '.bam.bai'))
                if not has_index:
                    index_jobs.write('samtools index ' + bam_path + '\n')
        Process('/opt/bin/multi_process', "-c", '25', '-i',
                index_jobs_path).run()

        run_cmd.extend(
            ['-R', '/opt/db/human_g1k_v37_decoy.fasta', '-o', out_file_name])
        for bam_path in self.inputs.bam:
            run_cmd.extend(['-I', bam_path])
        run_cmd.extend(['--knownSites', '/opt/db/dbsnp_137.b37.vcf'])
        # exome_bed is optional: only restrict intervals when it is given.
        if self.inputs.exome_bed:
            for bed_path in self.inputs.exome_bed:
                run_cmd.extend(['-L', bed_path])

        Process(*run_cmd).run()

        self.outputs.grp = out_file_name
        self.outputs.grp.meta = self.inputs.bam.make_metadata()
Example #12
0
    def execute(self):
        """Run ea-utils ``fastq-mcf`` on the input reads and publish results.

        Steps, in order:
          1. Translate every populated ``self.params`` value into the
             matching ``fastq-mcf`` option.
          2. Run ``/opt/bin/fastq_detect.pl`` on the first read file; if a
             ``report.txt`` exists afterwards the outputs are tagged
             ``Phred+64``, otherwise ``Phred+33``.
          3. Register one ``<stem>.clip<ext>`` output per read file (plus a
             ``.skip`` companion when ``Saveskippedreads`` is set).
          4. Run ``fastq-mcf`` with stdout redirected to
             ``<stem>.fastq-mcf_summary.txt``, where ``<stem>`` comes from
             the last read file, and publish it as
             ``self.outputs.out_summary``.

        Reads ``self.params.*``, ``self.inputs.reads`` and
        ``self.inputs.adapter``.  Returns nothing.
        """
        # Each row is (parameter name, fastq-mcf switch, kind).  Kinds:
        #   'flag'   - boolean; only the switch is emitted
        #   'text'   - switch followed by the value as given
        #   'number' - switch followed by str(value)
        # A parameter is skipped whenever its value is falsy.
        # NOTE(review): the tool help says -k / -x are turned off by
        # setting them to 0, but a 0 is falsy here and is never passed -
        # confirm how unset parameters are represented before changing it.
        option_table = (
            # log scale for adapter minimum-length-match (2.2)
            ('Minimumlengthmatch', '-s', 'number'),
            # % occurrence threshold before adapter clipping (0.25)
            ('Adapteroccurrencethreshold', '-t', 'number'),
            # minimum clip length, overrides scaled auto (1)
            ('Mincliplength', '-m', 'number'),
            # maximum adapter difference percentage (10)
            ('Maxadapterdifference', '-p', 'number'),
            # set all default parameters to zero / do nothing
            ('Setalldefaultparameterstozerodonothing', '-0', 'flag'),
            # minimum remaining sequence length (19)
            ('Minremainingsequencelength', '-l', 'number'),
            # maximum remaining sequence length
            ('Maxremainingsequencelength', '-L', 'number'),
            # read_1 has an identical N bases (0)
            ('Removeduplicatereads', '-D', 'number'),
            # skew percentage below which the whole cycle is removed (2)
            ('sKewPercentage', '-k', 'number'),
            # 'N' percentage causing cycle removal from all reads (20)
            ('Badreadpercentagethreshold', '-x', 'number'),
            # quality threshold causing base removal (7)
            ('Qualitythreshold', '-q', 'number'),
            # window size for quality trimming (1)
            ('Trimmingwindowsize', '-w', 'number'),
            # remove >95% homopolymer reads
            ('Removehomopolymerreads', '-H', 'flag'),
            # force-enable Illumina PF filtering
            ('IlluminaPF', '-U', 'flag'),
            # don't remove N's from the fronts/ends of reads
            ('DonttrimNs', '-R', 'flag'),
            # number of reads to use for subsampling (300k)
            ('Subsampling', '-C', 'number'),
            # phred scale (auto-determined)
            ('Phredscale', '-P', 'number'),
            # just output what would be done
            ('Dontclip', '-n', 'flag'),
            # only keep clipped reads
            ('Onlykeepclippedreads', '-K', 'flag'),
            # output skipped reads
            ('Saveskippedreads', '-S', 'flag'),
            # minimum mean quality, evaluated after clipping/trimming
            ('Minimummeanqualityscore', '--qual-mean', 'number'),
            # same, applied to the second non-barcode read only
            ('Minimummeanqualityscoreappliestosecondnonbarcodereadonly',
             '--mate-qual-mean', 'number'),
            # NUM,THR: at least NUM quals > THR after clipping/trimming
            ('Qualitygreaterthanthreshold', '--qual-gt', 'text'),
            # same, applied to the second non-barcode read only
            ('Qualitygreaterthanthresholdappliestosecondnonbarcodereadonly',
             '--mate-qual-gt', 'text'),
            # maximum N calls in a read, evaluated after clipping/trimming
            ('MaximumNcallsinareadcanbea', '--max-ns', 'number'),
            # same, applied to the second non-barcode read only
            ('MaximumNcallsinareadcanbeaappliestosecondnonbarcodereadonly',
             '--mate-max-ns', 'number'),
            # homopolymer filter percentage
            ('Homopolymerfilterpercentageasnumber', '--homopolymer-pct',
             'number'),
            # complexity filter percent (95)
            ('Complexityfilterpercent', '--lowcomplex-pct', 'number'),
            # CYC,AMT: adjust cycle CYC (negative = offset from end) by AMT
            ('AdjustcycleCYCnegativeoffsetfromendbyamountAMT',
             '--cycle-adjust', 'text'),
            # SCORE,AMT: adjust score SCORE by amount AMT
            ('AdjustscoreSCOREbyamountAMT', '--phred-adjust', 'text'),
        )

        options = []
        for param_name, switch, kind in option_table:
            value = getattr(self.params, param_name)
            if not value:
                continue
            if kind == 'flag':
                options.append(switch)
            elif kind == 'number':
                # The command line must consist of strings.
                options.extend([switch, str(value)])
            else:
                options.extend([switch, value])

        # Detect the quality encoding from the first read file.  The
        # presence of report.txt after the script runs selects Phred+64.
        # NOTE(review): a report.txt left over from an earlier run would
        # also select Phred+64 - confirm the working directory is clean.
        quality_scale = "Phred+33"
        Process('perl', '/opt/bin/fastq_detect.pl', self.inputs.reads[0],
                '1000').run()
        if os.path.exists('report.txt'):
            quality_scale = "Phred+64"

        input_files = []
        last_stem = None
        for read_file in self.inputs.reads:
            stem, extension = os.path.splitext(read_file)
            clipped_name = stem + '.clip' + extension
            input_files.append(read_file)
            options.extend(['-o', clipped_name])
            self.outputs.out_fq.add_file(clipped_name)
            self.outputs.out_fq[-1].meta = read_file.make_metadata(
                _quality_scale=quality_scale)
            if self.params.Saveskippedreads:
                self.outputs.out_skip.add_file(clipped_name + '.skip')
                self.outputs.out_skip[-1].meta = read_file.make_metadata(
                    _quality_scale=quality_scale)
            # The summary file is named after the last read file.
            last_stem = stem

        run_cmd = ['/opt/bin/ea-utils/fastq-mcf']
        run_cmd.extend(options)

        # The adapter file is positional and precedes the read files.
        # Without one, '-f' plus /dev/null as the adapter file is passed.
        if self.inputs.adapter:
            run_cmd.append(self.inputs.adapter)
        else:
            run_cmd.extend(["-f", '/dev/null'])

        run_cmd.extend(input_files)

        summary_name = last_stem + '.fastq-mcf_summary.txt'
        Process(*run_cmd, stdout=summary_name).run()
        self.outputs.out_summary = summary_name
        self.outputs.out_summary.meta = self.inputs.reads[0].make_metadata()