def execute(self):
    """Run Picard CollectAlignmentSummaryMetrics once per input reads file.

    Picard ``KEY=value`` options are derived from ``self.params``; the tool
    is then run on every entry of ``self.inputs.reads``.  Each run writes
    ``<input path without extension>.summary_metrics.txt``, which is added
    to ``self.outputs.out`` with metadata made from the matching input.
    """
    params = self.params
    options = []

    # Paired-end reads above this insert size are considered chimeric,
    # along with inter-chromosomal pairs. [Picard default: 100000]
    if params.Maximuminsertsize:
        options.append('MAX_INSERT_SIZE=' + str(params.Maximuminsertsize))

    # Repeatable options: the parameter holds a comma- or space-separated
    # list.  Empty tokens (e.g. from "A, B") are dropped so that no bare
    # "KEY=" argument is emitted.
    if params.Adaptersequence:
        for token in re.split(',| ', params.Adaptersequence):
            if token:
                options.append('ADAPTER_SEQUENCE=' + token)

    # Level(s) at which to accumulate metrics:
    # ALL_READS, SAMPLE, LIBRARY or READ_GROUP.
    if params.Metricaccumulationlevel:
        for token in re.split(',| ', params.Metricaccumulationlevel):
            if token:
                options.append('METRIC_ACCUMULATION_LEVEL=' + token)

    # Validation stringency for all BAM/SAM files read by Picard.
    if params.Validationstringency:
        options.append('VALIDATION_STRINGENCY=' + params.Validationstringency)

    # Whether the input consists of bisulfite sequenced reads.
    if params.Isbisulfitesequenced:
        options.append('IS_BISULFITE_SEQUENCED=true')

    # Picard defaults to ASSUME_SORTED=true, so only an explicit False is
    # forwarded; an unset (None) parameter keeps the default.
    if params.Assumesorted == False:  # noqa: E712 - must not match None
        options.append('ASSUME_SORTED=false')

    # Compression level for all compressed files created.
    if params.Compressionlevel:
        options.append('COMPRESSION_LEVEL=' + str(params.Compressionlevel))

    # Create a BAM index when writing a coordinate-sorted BAM file.
    if params.CreateIndex:
        options.append('CREATE_INDEX=true')

    base_cmd = [
        'java', '-Xmx2g', '-jar', '/opt/bin/picard.jar',
        'CollectAlignmentSummaryMetrics'
    ]

    for reads_file in self.inputs.reads:
        metrics_path = os.path.splitext(reads_file)[0] + '.summary_metrics.txt'
        # The options were previously built but never passed to Picard;
        # they are now part of every invocation.
        cmd = base_cmd + options + [
            'INPUT=' + reads_file, 'OUTPUT=' + metrics_path
        ]
        Process(*cmd).run()
        self.outputs.out.add_file(metrics_path)
        self.outputs.out[-1].meta = reads_file.make_metadata()
def execute(self):
    """Merge all input reads files into one file with Picard MergeSamFiles.

    Every entry of ``self.inputs.reads`` is passed as an ``INPUT=``.  The
    merged file is written next to the first input as
    ``<first input without extension>sorted<extension>`` and exposed as
    ``self.outputs.out``.  When index creation is requested and an index
    file is found afterwards, it is exposed as ``self.outputs.ind``.
    """
    params = self.params
    reads = self.inputs.reads
    options = []

    # Assume the inputs are already in the requested output sort order,
    # even if their headers say otherwise. [Picard default: false]
    if params.Assumesorted:
        options.append('ASSUME_SORTED=true')
    # Desired sort order of the merged file. [Picard default: coordinate]
    if params.Sortorder:
        options.append('SORT_ORDER=' + params.Sortorder)
    # Merge the sequence dictionaries of the inputs.
    if params.MergeSequenceDictionary:
        options.append('MERGE_SEQUENCE_DICTIONARIES=true')
    # Create a BAM index when writing a coordinate-sorted BAM file.
    if params.CreateIndex:
        options.append('CREATE_INDEX=true')
    # Validation stringency for all BAM/SAM files read by Picard.
    if params.Validationstringency:
        options.append('VALIDATION_STRINGENCY=' + params.Validationstringency)
    # Compression level for all compressed files created.
    if params.Compressionlevel:
        options.append('COMPRESSION_LEVEL=' + str(params.Compressionlevel))

    # java -jar /opt/bin/picard.jar MergeSamFiles INPUT=a.bam INPUT=b.bam OUTPUT=merged.bam
    cmd_run = [
        'java', '-Xmx56g', '-jar', '/opt/bin/picard.jar', 'MergeSamFiles',
        'USE_THREADING=true'
    ]
    cmd_run.extend(options)
    cmd_run.extend('INPUT=' + reads_file for reads_file in reads)

    stem, extension = os.path.splitext(reads[0])
    merged_path = stem + 'sorted' + extension
    cmd_run.append('OUTPUT=' + merged_path)
    Process(*cmd_run).run()

    self.outputs.out = merged_path
    self.outputs.out.meta = reads[0].make_metadata()

    if params.CreateIndex:
        # Picard names the index of "x.bam" as "x.bai" (the extension is
        # replaced), so probing only "x.bam.bai" never found it.  The
        # original name is still tried first for backward compatibility.
        index_candidates = (
            merged_path + '.bai',
            os.path.splitext(merged_path)[0] + '.bai',
        )
        for index_path in index_candidates:
            if os.path.exists(index_path):
                self.outputs.ind = index_path
                self.outputs.ind.meta = reads[0].make_metadata()
                break
def execute(self):
    """Run Picard CollectWgsMetrics on the single input reads file.

    Picard ``KEY=value`` options are derived from ``self.params``.  The
    metrics are written to ``<input path without extension>.wgs_metrics.txt``
    and exposed as ``self.outputs.out`` with metadata made from the input.
    The reference sequence path is fixed to
    ``/opt/db/human_g1k_v37_decoy.fasta``.
    """
    params = self.params
    reads_file = self.inputs.reads
    options = []

    # Minimum mapping quality for a read to contribute coverage.
    # [Picard default: 20]  ("Minimun" is the parameter's declared name.)
    if params.Minimunmappingquality:
        options.append('MINIMUM_MAPPING_QUALITY=' +
                       str(params.Minimunmappingquality))
    # Minimum base quality for a base to contribute coverage.
    # [Picard default: 20]
    if params.Minimumbasequality:
        options.append('MINIMUM_BASE_QUALITY=' +
                       str(params.Minimumbasequality))
    # Treat bases with coverage exceeding this value as if they had
    # coverage at this value. [Picard default: 250]
    if params.Coveragecap:
        options.append('COVERAGE_CAP=' + str(params.Coveragecap))
    # For debugging: stop after processing this many genomic bases.
    # The parameter is declared as a float, but the value is a count of
    # bases, so it is rendered as an integer ("1000", not "1000.0").
    if params.Stopafter:
        options.append('STOP_AFTER=' + str(int(params.Stopafter)))
    # Validation stringency for all BAM/SAM files read by Picard.
    if params.Validationstringency:
        options.append('VALIDATION_STRINGENCY=' + params.Validationstringency)

    metrics_path = os.path.splitext(reads_file)[0] + '.wgs_metrics.txt'

    # The options were previously built but never passed to Picard; they
    # are now inserted between the tool name and INPUT/OUTPUT.
    cmd = [
        'java', '-Xmx7g', '-jar', '/opt/bin/picard.jar', 'CollectWgsMetrics',
        'REFERENCE_SEQUENCE=/opt/db/human_g1k_v37_decoy.fasta'
    ]
    cmd.extend(options)
    cmd.extend(['INPUT=' + reads_file, 'OUTPUT=' + metrics_path])
    Process(*cmd).run()

    self.outputs.out = metrics_path
    self.outputs.out.meta = reads_file.make_metadata()
def execute(self):
    """Run GATK SelectVariants on ``self.inputs.inp``.

    Command-line options are derived from ``self.params`` and from the
    optional inputs (GATK key, include/exclude intervals, concordance,
    discordance, keepIDs).  The result is written to
    ``<input path without extension>.snp_out.vcf`` and exposed as
    ``self.outputs.out`` with metadata made from the input.  The reference
    is fixed to ``/opt/db/human_g1k_v37_decoy.fasta``.
    """
    params = self.params
    inputs = self.inputs
    options = []

    def add_flag(flag, enabled):
        """Append a bare switch when the boolean parameter is set."""
        if enabled:
            options.append(flag)

    def add_value(flag, value):
        """Append ``flag value`` for a string/enum parameter, unchanged."""
        if value:
            options.extend([flag, value])

    def add_number(flag, value):
        """Append ``flag value`` for a numeric parameter, as text."""
        if value:
            options.extend([flag, str(value)])

    # ---- GATK engine arguments -------------------------------------------
    # NOTE(review): DisableRandomization emits the same '-ndrs' switch as
    # NonDeterministicRandomSeed below, although the two are described as
    # opposite behaviours - confirm the intended GATK flag.
    add_flag('-ndrs', params.DisableRandomization)
    # Do not fail on base qualities that look mis-encoded (too high).
    add_flag('-allowPotentiallyMisencodedQuals',
             params.AllowPotentiallyMisencodedQuals)
    # Type of BAQ calculation to apply in the engine.
    add_value('-baq', params.BAQCalculationType)
    # BAQ gap open penalty (Phred scaled).
    add_number('-baqGOP', params.BAQGapOpenPenalty)
    # Base quality to use for reads missing some or all quality scores.
    add_number('-DBQ', params.DefaultBaseQualities)
    # [-DIQ] Disable printing of base insertion/deletion tags (with -BQSR).
    add_flag('-DIQ', params.DisableIndelQuals)
    # Coverage to downsample to at any given locus.
    add_number('-dcov', params.DownsampletoCoverage)
    # Fraction [0.0-1.0] of reads to downsample to.
    add_number('-dfrac', params.DownsampletoFraction)
    # Type of reads downsampling; the literal 'null' means "not set".
    if params.DownsamplingType and params.DownsamplingType != 'null':
        options.extend(['-dt', params.DownsamplingType])
    # Print the OQ tag with the original base qualities (with -BQSR).
    add_flag('-EOQ', params.EmitOriginalQuals)
    # Fix mis-encoded base quality scores.
    add_flag('-fixMisencodedQuals', params.FixMisencodedQuals)
    # Interval merging rule for abutting intervals.
    add_value('-im', params.IntervalMerging)
    # Basepairs of padding to include around each -L interval.
    add_number('-ip', params.IntervalPadding)
    # Set merging approach used to combine the -L / -XL inputs.
    add_value('-isr', params.IntervalSetRule)
    # Keep program records from the SAM header.
    add_flag('-kpr', params.KeepProgramRecords)
    # Stop cleanly once this runtime has been exceeded (see -maxRuntimeUnits).
    add_number('-maxRuntime', params.MaxRuntime)
    # Time unit for -maxRuntime.
    add_value('-maxRuntimeUnits', params.MaxRuntimeUnits)
    # Make GATK behave non-deterministically across runs.
    add_flag('-ndrs', params.NonDeterministicRandomSeed)
    # Pedigree string for samples.
    add_value('-pedString', params.PedigreeString)
    # How strictly to validate the pedigree information.
    add_value('-pedValidationType', params.PedigreeValidationType)
    # Kind of GATK run report to generate (e.g. STANDARD, NO_ET).
    add_value('-et', params.PhoneHome)
    # Bases with quality below this threshold are not recalibrated.
    add_number('-preserveQ', params.PreserveQscoresLessThan)
    # Filtration criteria to apply to each read individually.
    add_value('-rf', params.ReadFilter)
    # Read groups to filter out (pattern or .txt file of patterns).
    add_value('-rgbl', params.ReadGroupBlackList)
    # Remove program records from the SAM header.
    add_flag('-rpr', params.RemoveProgramRecords)
    # Arbitrary tag string identifying this run as part of a group.
    add_value('-tag', params.Tag)
    # Enable unsafe operations; the literal 'null' means "not set".
    if params.Unsafe and params.Unsafe != 'null':
        options.extend(['-U', params.Unsafe])
    # Boolean switch: previously the Python bool was appended as a value
    # after the flag, which is not a valid argument for a switch.
    add_flag('-use_legacy_downsampler', params.UseLegacyDownsampler)
    # Use the original base qualities from the OQ tag when present.
    add_flag('-OQ', params.UseOriginalQualities)
    # How strict validation should be.
    add_value('-S', params.ValidationStrictness)

    # ---- SelectVariants arguments ----------------------------------------
    # Allow command-line samples that are not present in the VCF.
    add_flag('--ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES',
             params.AllowNonoverlappingCommandLineSamples)
    # Exclude genotypes from this sample.
    add_value('--exclude_sample_name', params.ExcludeSampleName)
    # Don't include filtered loci in the analysis.
    add_flag('-ef', params.ExcludeFiltered)
    # Don't include loci found to be non-variant after subsetting.
    add_flag('-env', params.ExcludeNonVariants)
    # Store original AC/AF/AN values in INFO (AC_Orig, AF_Orig, AN_Orig).
    add_flag('--keepOriginalAC', params.KeepOriginalAc)
    # Indel size to select.
    add_number('--maxIndelSize', params.MaxIndelSize)
    # Output mendelian violation sites only.
    add_flag('-mv', params.MendelianViolation)
    # Minimum genotype QUAL per trio member to accept a violation.
    add_number('-mvq', params.Mvq)
    # Re-genotype the selected samples based on their GLs (or PLs).
    add_flag('-regenotype', params.Regenotype)
    # Fraction (0-1) of genotypes to set to no-call at random.
    add_number('-fractionGenotypes', params.RemoveFractionGenotypes)
    # Select only variants of a given allelicity
    # (ALL, MULTIALLELIC or BIALLELIC).
    add_value('--restrictAllelesTo', params.RestrictAllelesTo)
    # Regular expression selecting samples from the ROD tracks.
    add_value('-se', params.SampleExpressions)
    # Include genotypes from this sample.
    add_value('-sn', params.SampleName)
    # Criteria to use when selecting the data.
    add_value('-select', params.SelectExpressions)
    # Fraction (0-1) of variants to select at random.
    add_number('-fraction', params.SelectRandomFraction)
    # Variant types to keep (INDEL, SNP, MIXED, MNP, SYMBOLIC,
    # NO_VARIATION), given as a comma- or space-separated list.  Empty
    # tokens (e.g. from "SNP, INDEL") are skipped.
    if params.SelectTypeToInclude:
        for variant_type in re.split(',| ', params.SelectTypeToInclude):
            if variant_type:
                options.extend(['-selectType', str(variant_type)])

    # ---- optional inputs -------------------------------------------------
    add_value('-K', inputs.Gatk_key)
    if inputs.exclude_intervals:
        for interval_file in inputs.exclude_intervals:
            options.extend(['-XL', interval_file])
    if inputs.exome_bed:
        for interval_file in inputs.exome_bed:
            options.extend(['-L', interval_file])
    add_value('--concordance', inputs.concordance)
    add_value('--discordance', inputs.discordance)
    add_value('--keepIDs', inputs.keepIDs)

    out_file_name = os.path.splitext(inputs.inp)[0] + ".snp_out.vcf"

    run_cmd = [
        'java', '-Xmx2g', '-Djava.io.tmpdir=/tmp', '-jar',
        '/opt/bin/GenomeAnalysisTK.jar', '-T', 'SelectVariants'
    ]
    run_cmd.extend(options)
    run_cmd.extend([
        '-R', '/opt/db/human_g1k_v37_decoy.fasta', '--variant', inputs.inp,
        '-o', out_file_name
    ])
    Process(*run_cmd).run()

    self.outputs.out = out_file_name
    self.outputs.out.meta = inputs.inp.make_metadata()
def execute(self):
    """Align reads with ``bwa mem`` and emit SAM, BAM or sorted BAM output.

    ``bwa mem`` options are derived from ``self.params``; all entries of
    ``self.inputs.reads`` are passed as query files and the reference is
    fixed to ``/opt/db/human_g1k_v37_decoy.fasta``.  Output files are named
    after the first reads file (extension replaced):

    * Output format ``BAM``: ``.bam`` produced by piping bwa through
      ``bfr`` and ``sambamba view``.
    * Output format ``Sorted BAM``: the BAM is additionally sorted with
      sambamba, optionally duplicate-marked/removed, and optionally indexed
      (``self.outputs.out_bai``).
    * Anything else: the ``.sam`` written directly by bwa.

    ``self.outputs.out`` receives metadata made from the first reads file.
    """
    params = self.params
    reads = self.inputs.reads
    options = []

    def add_flag(flag, enabled):
        """Append a bare switch when the boolean parameter is set."""
        if enabled:
            options.append(flag)

    def add_number(flag, value):
        """Append ``flag value`` for a numeric parameter, as text."""
        if value:
            options.extend([flag, str(value)])

    def add_pair(flag, first, second):
        """Append ``flag first,second``, or a single value if only one is set.

        NOTE(review): when only one of the two is set, bwa receives a
        single number after the flag; confirm against the bwa manual how a
        single value is applied to the pair.
        """
        if first and second:
            options.extend([flag, str(first) + "," + str(second)])
        elif first:
            options.extend([flag, str(first)])
        elif second:
            options.extend([flag, str(second)])

    # ---- algorithm options -----------------------------------------------
    # [-k] Minimum seed length; shorter matches are missed. [default: 19]
    add_number('-k', params.Minimumseedlength)
    # [-w] Band width in the banded alignment. [default: 100]
    add_number('-w', params.Bandwidthforbandedalignment)
    # [-d] Off-diagonal X-dropoff (Z-dropoff). [default: 100]
    add_number('-d', params.OffdiagonalXdropoff)
    # [-r] Trigger re-seeding for a MEM longer than minSeedLen*FLOAT.
    # [default: 1.5]
    add_number('-r', params.TriggerreseedingforaMEMlongerthanminSeedLenFLOAT)
    # [-c] Discard a MEM with more than INT occurrences. [default: 500]
    add_number('-c', params.SkipseedswithmorethanINToccurrences)
    # [-D] Drop chains shorter than FLOAT fraction of the longest
    # overlapping chain.
    add_number('-D', params.Dropchainfraction)
    # [-W] Discard a chain if seeded bases are shorter than INT.
    add_number('-W', params.Dropchainlength)
    # [-m] Perform at most INT rounds of mate rescue per read.
    add_number('-m', params.Materescuerounds)
    # [-S] Skip mate rescue.
    add_flag('-S', params.Skipmaterescue)
    # [-P] Paired-end mode: rescue missing hits only, skip pairing.
    add_flag('-P', params.SkippairingmaterescueperformedunlessSalsoinuse)
    # [-e] Discard full-length exact matches.
    add_flag('-e', params.Discardexactmatches)
    # [-x] Read type preset; the literal "None" means "not set".
    if params.Readtype and params.Readtype != "None":
        options.extend(['-x', params.Readtype])

    # ---- scoring options -------------------------------------------------
    # [-A] Score for a sequence match. [default: 1]
    add_number('-A', params.Scoreforasequencematch)
    # [-B] Penalty for a mismatch. [default: 4]
    add_number('-B', params.Penaltyforamismatch)
    # [-O] Gap open penalties for deletions and insertions. [default: 6]
    add_pair('-O', params.Gapopenpenaltyfordeletions,
             params.Gapopenpenaltyforinsertions)
    # [-E] Gap extension penalties for deletion and insertion; a gap of
    # length k costs O + k*E. [default: 1]
    add_pair('-E', params.Gapextensionpenaltyfordeletion,
             params.Gapextensionpenaltyforinsertion)
    # [-L] Clipping penalties for the 5' and 3' ends. [default: 5]
    add_pair('-L', params.Penaltyfor5endclipping,
             params.Penaltyfor3endclipping)
    # [-U] Penalty for an unpaired read pair. [default: 17]
    add_number('-U', params.Penaltyforanunpairedreadpair)

    # ---- input/output options --------------------------------------------
    # [-h] If #hits < INT, output all in the XA tag.
    add_number('-h', params.XAtag)
    # [-T] Minimum score to output. [default: 30]
    add_number('-T', params.Scorethreshold)
    # [-a] Output all alignments for SE or unpaired PE reads.
    add_flag('-a', params.OutputallalignmentsforSEorunpairedPE)
    # [-C] Append the FASTA/Q comment to the SAM output.
    add_flag('-C', params.AppendappendFASTAQcommenttoSAMoutput)
    # [-Y] Use soft clipping for supplementary alignments.
    add_flag('-Y', params.Usesoftclippingforsupplementaryalignments)
    # [-M] Mark shorter split hits as secondary (Picard compatibility).
    add_flag('-M', params.Markshortersplithitsassecondary)

    # [-R] Read group header line.  An explicit parameter wins; otherwise
    # the line is built from the first reads file's metadata.
    if params.Completereadgroupheaderline:
        options.extend(['-R', params.Completereadgroupheaderline])
    else:
        meta = reads[0].meta

        def first_meta_value(keys, default):
            """Return the value of the first key present in ``meta``."""
            for key in keys:
                if key in meta:
                    return meta.get(key)
            return default

        read_group_id = first_meta_value(
            ('ReadGroup', 'Readgroup', 'readgroup', 'RG', 'rg', 'Rg', 'rG'),
            os.path.splitext(os.path.basename(reads[0]))[0])
        sample = first_meta_value(
            ('SampleName', 'sampleName', 'Samplename', 'samplename',
             'sample', 'Sample'),
            'DefaultSampleName')
        # Only the text before the first space of each value is used; the
        # literal double quotes are part of the argument.
        options.extend([
            '-R', '"@RG\tID:' + read_group_id.split(' ')[0] + '\tSM:' +
            sample.split(' ')[0] + '\tPL:ILLUMINA"'
        ])

    # bwa mem [options] <idxbase> <in1.reads> [in2.reads]
    run_cmd = ["/opt/bin/bwa", "mem", '-t', '12']
    run_cmd.extend(options)
    run_cmd.append('/opt/db/human_g1k_v37_decoy.fasta')
    run_cmd.extend(reads)

    stem = os.path.splitext(reads[0])[0]
    sam_path = stem + '.sam'
    bam_path = stem + '.bam'
    sorted_path = stem + '.sorted.bam'
    sambamba = '/opt/bin/sambamba_v0.4.7'

    # NOTE(review): this SAM-producing run happens for every output format,
    # so BAM outputs align the reads a second time in the pipeline below.
    # The original guard (run only for non-BAM formats) is commented out in
    # the source; behaviour is kept as-is pending confirmation.
    Process(*run_cmd, stdout=sam_path).run()

    if params.Outputformat not in ('BAM', 'Sorted BAM'):
        self.outputs.out = sam_path
        self.outputs.out.meta = reads[0].make_metadata()
        return

    # bwa | bfr (buffer) | sambamba view -> unsorted BAM.  The '|' tokens
    # are written verbatim into a shell script via `echo`.
    pipeline = [
        '|', '/opt/bin/bfr', '-b', '256M',
        '|', sambamba, 'view', '--sam-input', '-f', 'bam', '-t', '2',
    ]
    read_filters = []
    if params.Filteroutsecondaryalignments:
        read_filters.append("not secondary_alignment")
    if params.Duplication:
        read_filters.append("not duplicate")
    if read_filters:
        pipeline.extend(['-F', '"' + ' and '.join(read_filters) + '"'])
    pipeline.extend(['-o', bam_path, '/dev/stdin'])
    run_cmd.extend(pipeline)

    Process('echo', '#!/bin/bash', stdout='run_haha.sh').run()
    Process('echo', *run_cmd, stdout='run_bwa.sh').run()
    Process('echo', 'wait', stdout='run_end.sh').run()
    Process('cat', 'run_haha.sh', 'run_bwa.sh', 'run_end.sh',
            stdout='/l3bioinfo/run.sh').run()
    Process('chmod', '777', '/l3bioinfo/run.sh').run()
    Process('/l3bioinfo/run.sh').run()

    if params.Outputformat == 'BAM':
        self.outputs.out = bam_path
        self.outputs.out.meta = reads[0].make_metadata()
        return

    # Sorted BAM: sort, drop the unsorted file, then either mark/remove
    # duplicates into the final .bam or simply rename the sorted file.
    Process(sambamba, 'sort', '-m', '50G', '-t', '12',
            '--tmpdir=' + './temp', '-o', sorted_path, bam_path).run()
    Process('rm', '-f', bam_path).run()

    if params.Duplication in ('Mark Duplicates', 'Remove duplicates'):
        markdup_cmd = [sambamba, 'markdup', '-t', '12']
        if params.Duplication == 'Remove duplicates':
            markdup_cmd.append('--remove-duplicates')
        markdup_cmd.extend([sorted_path, bam_path])
        Process(*markdup_cmd).run()
    else:
        Process('mv', sorted_path, bam_path).run()

    self.outputs.out = bam_path
    self.outputs.out.meta = reads[0].make_metadata()

    # Indexing is done only here, on the sorted BAM.
    if params.CreateIndex:
        bai_path = stem + '.bai'
        Process(sambamba, 'index', '-t', '12', bam_path, bai_path).run()
        self.outputs.out_bai = bai_path
def execute(self):
    """Run the GATK VQSR workflow on ``self.inputs.In_vcf``.

    A shell script ``VQSR.sh`` is generated with four GATK invocations
    chained by ``&&``:

    1. SelectVariants       -> ``<prefix>.raw.<type>.vcf.gz``
    2. VariantRecalibrator  -> ``.recal`` / ``.tranches`` / ``_plots.R``
    3. ApplyRecalibration   -> ``out_snp.vcf.gz`` (fixed intermediate name)
    4. SelectVariants       -> ``<prefix>.<type>.vcf.gz``

    The script is echoed to stdout, executed with ``sh`` and the five
    result files are registered on ``self.outputs`` with metadata derived
    from the input VCF.

    NOTE(review): the script is built by plain string concatenation, so
    paths are not shell-quoted and the ``cmd_*`` parameters are pasted in
    verbatim.
    """
    gatk = ("java -Xmx5G -Djava.io.tmpdir=./java_tmp "
            "-jar /opt/bin/GenomeAnalysisTK.jar -T ")

    def gunzip_if_needed(path):
        # Decompress ``path`` when it ends in ".gz" and return the path of
        # the file GATK should read.
        root, ext = os.path.splitext(path)
        if ext == ".gz":
            Process("gunzip", path).run()
            return root
        return path

    # ---- output file prefix ------------------------------------------
    if self.params.rename:
        # ``rename`` is either a metadata key of the input VCF or a literal.
        if self.params.rename in self.inputs.In_vcf.meta:
            prefix = self.inputs.In_vcf.meta.get(self.params.rename)
        else:
            prefix = self.params.rename
        # str.replace returns a new string; the result must be kept
        # (the previous code dropped it, so spaces were never removed).
        prefix = prefix.replace(" ", "")
        # Drop characters that are unsafe in file names: / \ : * ? ' " < > |
        rstr = r"[\/\\\:\*\?\'\"\<\>\|]"
        prefix = re.sub(rstr, "", prefix)
    else:
        vcf_name = os.path.basename(self.inputs.In_vcf)
        if os.path.splitext(self.inputs.In_vcf)[1] == ".gz":
            # "sample.vcf.gz" -> "sample"
            name = os.path.splitext(vcf_name)[0]
            prefix = os.path.splitext(name)[0]
        else:
            prefix = os.path.splitext(vcf_name)[0]

    select_type = self.params.SelectType.lower()
    out_raw_vcf = prefix + '.raw.' + select_type + '.vcf.gz'
    out_recal = prefix + '.recalibrate_' + select_type + '.recal'
    out_tranches = prefix + '.recalibrate_' + select_type + '.tranches'
    out_rscript = prefix + '.recalibrate_' + select_type + '_plots.R'
    out_vcf = prefix + '.' + select_type + '.vcf.gz'

    # ---- reference and input indexes ---------------------------------
    # NOTE(review): ``fa`` is only bound when a Reference input is given;
    # without one the script assembly below fails with a NameError.
    if self.inputs.Reference:
        fa = gunzip_if_needed(self.inputs.Reference)
        if not os.path.isfile(fa + ".fai"):
            Process('/opt/bin/samtools-1.3/samtools', 'faidx', fa).run()
        fa_dict = os.path.splitext(fa)[0] + '.dict'
        if not os.path.isfile(fa_dict):
            Process('/opt/bin/samtools-1.3/samtools', 'dict', fa, '-o',
                    fa_dict).run()

    if not os.path.isfile(self.inputs.In_vcf + ".tbi"):
        Process('/opt/bin/htslib-1.3/tabix', self.inputs.In_vcf).run()

    # ---- step 1: SelectVariants --------------------------------------
    content = gatk + "SelectVariants -R "
    content += (fa + " -V " + self.inputs.In_vcf + " -selectType " +
                self.params.SelectType)
    if self.params.cmd_SelectVariants:
        content += " " + self.params.cmd_SelectVariants
    content += " -o " + out_raw_vcf + " && \\\n"

    # ---- step 2: VariantRecalibrator ---------------------------------
    content += (gatk + "VariantRecalibrator -R " + fa + " -input " +
                out_raw_vcf + " \\\n")

    # (input attribute, parameter attribute holding the "-resource:" spec)
    resources = (
        ("In_resource_hapmap", "Resource_hapmap"),
        ("In_resource_omni", "Resource_omni"),
        ("In_resource_1000G", "Resource_1000G"),
        ("In_resource_dbsnp", "Resource_dbsnp"),
        ("In_resource_mills", "Resource_mills"),
    )
    for input_name, param_name in resources:
        resource_file = getattr(self.inputs, input_name)
        if not resource_file:
            continue
        # The file is decompressed even when no spec is configured for it.
        resource_path = gunzip_if_needed(resource_file)
        resource_spec = getattr(self.params, param_name)
        if resource_spec:
            content += ("-resource:" + resource_spec + " " + resource_path +
                        " \\\n")

    if self.params.An:
        for an in self.params.An.split(","):
            content += " -an " + an
    content += " -mode " + self.params.Mode
    if self.params.Tranche:
        for tranche in self.params.Tranche.split(","):
            content += " -tranche " + tranche
    if self.params.MaxGaussians:
        content += " --maxGaussians " + str(self.params.MaxGaussians)
    content += " \\\n"
    content += "-recalFile " + out_recal
    content += " -tranchesFile " + out_tranches
    if self.params.cmd_VariantRecalibrator:
        content += " " + self.params.cmd_VariantRecalibrator
    content += " -rscriptFile " + out_rscript + " && \\\n"

    # ---- step 3: ApplyRecalibration ----------------------------------
    content += (gatk + "ApplyRecalibration -R " + fa + " -input " +
                out_raw_vcf + " -mode " + self.params.Mode)
    if self.params.Ts_filter_level:
        content += " --ts_filter_level " + str(self.params.Ts_filter_level)
    if self.params.cmd_ApplyRecalibration:
        content += " " + self.params.cmd_ApplyRecalibration
    content += (" -recalFile " + out_recal + " -tranchesFile " +
                out_tranches + " -o out_snp.vcf.gz && \\\n")

    # ---- step 4: SelectVariants on the recalibrated calls ------------
    content += gatk + "SelectVariants -R " + fa + " -V out_snp.vcf.gz "
    content += " -o " + out_vcf + " \n "

    # Echo the script for the job log, then persist and run it.
    sys.stdout.write(content)
    with open("VQSR.sh", "w") as script:
        script.write(content)
    Process("sh", "VQSR.sh").run()

    # ---- register outputs --------------------------------------------
    self.outputs.Out_raw_vcf = out_raw_vcf
    self.outputs.Out_recalFile = out_recal
    self.outputs.Out_tranchesFile = out_tranches
    self.outputs.Out_rscriptFile = out_rscript
    self.outputs.Out_vcf = out_vcf

    d = "result/" + prefix + "/result_variation/" + select_type
    self.outputs.Out_raw_vcf.meta = self.inputs.In_vcf.make_metadata(url=d)
    self.outputs.Out_recalFile.meta = self.inputs.In_vcf.make_metadata(url=d)
    self.outputs.Out_tranchesFile.meta = self.inputs.In_vcf.make_metadata(
        url=d)
    self.outputs.Out_rscriptFile.meta = self.inputs.In_vcf.make_metadata(
        url=d)
    self.outputs.Out_vcf.meta = self.inputs.In_vcf.make_metadata(url=d)
def execute(self):
    """Run GATK RealignerTargetCreator on ``self.inputs.bam``.

    Translates ``self.params`` into GATK command-line options, makes sure
    every BAM has an index (missing ones are built through
    ``/opt/bin/multi_process``), then either

    * ``DivideByIntervals`` set: writes one GATK command per line of
      ``/opt/db/human_g1k_v37_decoy.breakpoints.bed`` into a job file, runs
      the job file through ``multi_process`` and registers one
      ``<bam>.dedup.bam.<n>.intervals`` output per interval, tagged with
      ``_interval`` metadata; or
    * otherwise: runs a single GATK process producing
      ``<bam>.dedup.bam.intervals``.

    All numeric parameters are converted with ``str()`` before they are put
    on the argument list, so the directly executed command only ever
    contains strings (previously several integer options were passed
    unconverted).
    """
    params = self.params
    options = []

    def add_flag(enabled, flag):
        # Append a bare switch when the parameter is truthy.
        if enabled:
            options.append(flag)

    def add_value(value, flag):
        # Append "flag value" for string/enum parameters, value unchanged.
        if value:
            options.extend([flag, value])

    def add_scalar(value, flag):
        # Append "flag value" for numeric parameters, value stringified.
        if value:
            options.extend([flag, str(value)])

    # ---- GATK engine options -----------------------------------------
    # NOTE(review): DisableRandomization and NonDeterministicRandomSeed
    # both map to "-ndrs", as in the original code - confirm this is
    # intended.
    add_flag(params.DisableRandomization, '-ndrs')
    add_flag(params.AllowPotentiallyMisencodedQuals,
             '-allowPotentiallyMisencodedQuals')
    add_value(params.BAQCalculationType, '-baq')
    add_scalar(params.BAQGapOpenPenalty, '-baqGOP')
    add_scalar(params.DefaultBaseQualities, '-DBQ')
    add_flag(params.DisableIndelQuals, '-DIQ')
    add_scalar(params.DownsampletoCoverage, '-dcov')
    add_scalar(params.DownsampletoFraction, '-dfrac')
    # The literal string 'null' means "not set" for this enum.
    if params.DownsamplingType and params.DownsamplingType != 'null':
        options.extend(['-dt', params.DownsamplingType])
    add_flag(params.EmitOriginalQuals, '-EOQ')

    # FixMisencodedQuals is a three-state string here: 'True' forces the
    # fix, 'Auto' applies it only when the first BAM is tagged Phred+64.
    if params.FixMisencodedQuals == 'True':
        options.append('-fixMisencodedQuals')
    elif params.FixMisencodedQuals == 'Auto':
        first_meta = self.inputs.bam[0].meta
        if ("_quality_scale" in first_meta and
                first_meta.get('_quality_scale') == 'Phred+64'):
            options.append('-fixMisencodedQuals')
            # Record on every input that the scale is Phred+33 from now on.
            for bam in self.inputs.bam:
                bam.meta['_quality_scale'] = 'Phred+33'

    add_value(params.IntervalMerging, '-im')
    add_scalar(params.IntervalPadding, '-ip')
    add_value(params.IntervalSetRule, '-isr')
    add_flag(params.KeepProgramRecords, '-kpr')
    add_scalar(params.MaxRuntime, '-maxRuntime')
    add_value(params.MaxRuntimeUnits, '-maxRuntimeUnits')
    add_flag(params.NonDeterministicRandomSeed, '-ndrs')
    add_value(params.PedigreeString, '-pedString')
    add_value(params.PedigreeValidationType, '-pedValidationType')
    add_value(params.PhoneHome, '-et')
    add_scalar(params.PreserveQscoresLessThan, '-preserveQ')
    add_value(params.ReadFilter, '-rf')
    add_value(params.ReadGroupBlackList, '-rgbl')
    add_flag(params.RemoveProgramRecords, '-rpr')
    add_value(params.Tag, '-tag')
    if params.Unsafe and params.Unsafe != 'null':
        options.extend(['-U', params.Unsafe])
    # NOTE(review): the parameter value itself is passed after the switch,
    # as before (now stringified) - confirm GATK expects an argument here.
    add_scalar(params.UseLegacyDownsampler, '-use_legacy_downsampler')
    add_flag(params.UseOriginalQualities, '-OQ')
    add_value(params.ValidationStrictness, '-S')

    # ---- RealignerTargetCreator options ------------------------------
    add_scalar(params.Maximumintervalsize, '-maxInterval')
    add_scalar(params.Minimumreadsatlocus, '-minReads')
    add_scalar(params.Mismatchfraction, '-mismatch')
    add_scalar(params.Windowsize, '-window')

    # Output names are derived from the first BAM.
    fileNamePath = os.path.splitext(self.inputs.bam[0])[0]
    out_file_name = fileNamePath + ".dedup.bam.intervals"

    # ---- optional file inputs ----------------------------------------
    if self.inputs.Gatk_key:
        options.extend(['-K', self.inputs.Gatk_key])
    if self.inputs.exclude_intervals:
        for interval_file in self.inputs.exclude_intervals:
            options.extend(['-XL', interval_file])
    # When splitting by interval each job gets its own -L instead.
    if self.inputs.exome_bed and not params.DivideByIntervals:
        for bed_file in self.inputs.exome_bed:
            options.extend(['-L', bed_file])

    # The "java -Xmx.." prefix is added per execution mode further down.
    run_cmd = [
        '-Djava.io.tmpdir=/extra/tmp', '-jar',
        '/opt/bin/GenomeAnalysisTK.jar', '-T', 'RealignerTargetCreator'
    ]
    run_cmd.extend(options)

    # ---- BAM indexes --------------------------------------------------
    if self.inputs.bai:
        # Touch the supplied indexes before GATK reads them.
        Process(*(['touch'] + list(self.inputs.bai))).run()

    with open('somefile_temp_Samtool_Index.txt', 'a') as index_jobs:
        for bam in self.inputs.bam:
            bam_root = os.path.splitext(bam)[0]
            has_index = (os.path.exists(bam_root + '.bai') or
                         os.path.exists(bam_root + '.bam.bai'))
            if not has_index:
                index_jobs.write('samtools index ' + bam + '\n')
            run_cmd.extend(['-I', bam])
    # Run only after the job file has been closed (flushed).
    Process('/opt/bin/multi_process', "-c", '25', '-i',
            "somefile_temp_Samtool_Index.txt").run()

    run_cmd.extend([
        '-R', '/opt/db/human_g1k_v37_decoy.fasta', '--known',
        '/opt/db/Mills_and_1000G_gold_standard.indels.b37.sites.vcf',
        '--known', '/opt/db/1000G_phase1.indels.b37.vcf'
    ])

    if params.DivideByIntervals:
        with open('/opt/db/human_g1k_v37_decoy.breakpoints.bed') as bed:
            bed_lines = bed.readlines()

        with open('somefile_temp_RealignerTargetCreator.txt',
                  'a') as gatk_jobs:
            job_index = 0
            for line in bed_lines:
                if not line.strip():
                    # Blank lines carry no interval; skipping them avoids
                    # an IndexError on the field access below.
                    continue
                fields = line.rstrip('\n').split('\t')
                # BED columns chrom/start/end -> GATK "chrom:start-end".
                interval = fields[0] + ':' + fields[1] + '-' + fields[2]
                interval_out = (fileNamePath + ".dedup.bam." +
                                str(job_index) + ".intervals")

                job_cmd = ['java', '-Xmx2g']
                job_cmd.extend(run_cmd)
                job_cmd.extend(['-o', interval_out, '-L', interval])
                gatk_jobs.write(' '.join(str(arg) for arg in job_cmd))
                gatk_jobs.write("\n")

                self.outputs.dedup_bam_intervals.add_file(interval_out)
                self.outputs.dedup_bam_intervals[
                    -1].meta = self.inputs.bam[0].make_metadata(
                        _interval=interval)
                job_index += 1
        Process("/opt/bin/multi_process", '-c', '30', "-i",
                'somefile_temp_RealignerTargetCreator.txt').run()
    else:
        run_cmd.extend(['-o', out_file_name, '-nt', '8'])
        single_cmd = ['java', '-Xmx16g']
        single_cmd.extend(run_cmd)
        Process(*single_cmd).run()
        self.outputs.dedup_bam_intervals.add_file(out_file_name)
        self.outputs.dedup_bam_intervals[-1].meta = self.inputs.bam[
            0].make_metadata()
def execute(self):
    """Run GATK UnifiedGenotyper on ``self.inputs.bam``.

    Translates ``self.params`` into GATK command-line options, makes sure
    every BAM has an index (missing ones are built through
    ``/opt/bin/multi_process``), then either

    * ``DivideByIntervals`` set: writes one GATK command per input BAM
      (restricted to that BAM's ``_interval`` metadata) into a job file,
      runs it through ``multi_process`` and registers ``<bam>.vcf`` for
      every input, with ``_interval`` removed from the output metadata; or
    * otherwise: runs a single GATK process on the first BAM only and
      registers ``<first bam>.vcf``.

    ``--dbsnp`` is passed exactly once: the ``dbSNP`` input when supplied,
    else the bundled ``/opt/db/dbsnp_137.b37.vcf`` (previously both were
    put on the command line when an input was given).  Numeric parameters
    are converted with ``str()`` so the directly executed command only
    contains strings.
    """
    params = self.params
    options = []

    def add_flag(enabled, flag):
        # Append a bare switch when the parameter is truthy.
        if enabled:
            options.append(flag)

    def add_value(value, flag):
        # Append "flag value" for string/enum parameters, value unchanged.
        if value:
            options.extend([flag, value])

    def add_scalar(value, flag):
        # Append "flag value" for numeric parameters, value stringified.
        if value:
            options.extend([flag, str(value)])

    # ---- GATK engine options -----------------------------------------
    # NOTE(review): DisableRandomization and NonDeterministicRandomSeed
    # both map to "-ndrs", as in the original code - confirm this is
    # intended.
    add_flag(params.DisableRandomization, '-ndrs')
    add_flag(params.AllowPotentiallyMisencodedQuals,
             '-allowPotentiallyMisencodedQuals')
    add_value(params.BAQCalculationType, '-baq')
    add_scalar(params.BAQGapOpenPenalty, '-baqGOP')
    add_scalar(params.DefaultBaseQualities, '-DBQ')
    add_flag(params.DisableIndelQuals, '-DIQ')
    add_scalar(params.DownsampletoCoverage, '-dcov')
    add_scalar(params.DownsampletoFraction, '-dfrac')
    # The literal string 'null' means "not set" for this enum.
    if params.DownsamplingType and params.DownsamplingType != 'null':
        options.extend(['-dt', params.DownsamplingType])
    add_flag(params.EmitOriginalQuals, '-EOQ')
    add_flag(params.FixMisencodedQuals, '-fixMisencodedQuals')
    add_value(params.IntervalMerging, '-im')
    add_scalar(params.IntervalPadding, '-ip')
    add_value(params.IntervalSetRule, '-isr')
    add_flag(params.KeepProgramRecords, '-kpr')
    add_scalar(params.MaxRuntime, '-maxRuntime')
    add_value(params.MaxRuntimeUnits, '-maxRuntimeUnits')
    add_flag(params.NonDeterministicRandomSeed, '-ndrs')
    add_value(params.PedigreeString, '-pedString')
    add_value(params.PedigreeValidationType, '-pedValidationType')
    add_value(params.PhoneHome, '-et')
    add_scalar(params.PreserveQscoresLessThan, '-preserveQ')
    add_value(params.ReadFilter, '-rf')
    add_value(params.ReadGroupBlackList, '-rgbl')
    add_flag(params.RemoveProgramRecords, '-rpr')
    add_value(params.Tag, '-tag')
    if params.Unsafe and params.Unsafe != 'null':
        options.extend(['-U', params.Unsafe])
    # NOTE(review): the parameter value itself is passed after the switch,
    # as before (now stringified) - confirm GATK expects an argument here.
    add_scalar(params.UseLegacyDownsampler, '-use_legacy_downsampler')
    add_flag(params.UseOriginalQualities, '-OQ')
    add_value(params.ValidationStrictness, '-S')

    # ---- UnifiedGenotyper options ------------------------------------
    add_value(params.Annotation, '-A')
    add_flag(params.ComputeSlod, '-slod')
    add_scalar(params.Contamination, '-contamination')
    add_value(params.ExcludeAnnotation, '-XA')
    add_value(params.GenotypeLikelihoodsModel, '-glm')
    add_value(params.GenotypingMode, '-gt_mode')
    add_value(params.Group, '-G')
    add_scalar(params.Heterozygosity, '-hets')
    add_flag(params.IgnoreLaneInfo, '-ignoreLane')
    add_scalar(params.IndelHeterozygosity, '-indelHeterozygosity')
    add_scalar(params.MaxDeletionFraction, '-deletions')
    add_scalar(params.MinBaseQualityScore, '-mbq')
    add_scalar(params.MinIndelCnt, '-minIndelCnt')
    add_scalar(params.MinIndelFrac, '-minIndelFrac')
    add_value(params.OutputMode, '-out_mode')
    add_value(params.PairHmmImplementation, '-pairHMM')
    add_scalar(params.PcrErrorRate, '--pcr_error_rate')
    add_scalar(params.StandCallConf, '-stand_call_conf')
    add_scalar(params.StandEmitConf, '-stand_emit_conf')
    add_scalar(params.IndelGapContinuationPenalty, '-indelGCP')
    add_scalar(params.IndelGapOpenPenalty, '-indelGOP')
    add_scalar(params.MaxAlternateAlleles, '-maxAltAlleles')
    add_value(params.PNonrefModel, '--p_nonref_model')

    # Single-run output name is derived from the first BAM.
    out_file_name = os.path.splitext(self.inputs.bam[0])[0] + ".vcf"

    # ---- optional file inputs ----------------------------------------
    if self.inputs.Gatk_key:
        options.extend(['-K', self.inputs.Gatk_key])
    if self.inputs.exclude_intervals:
        for interval_file in self.inputs.exclude_intervals:
            options.extend(['-XL', interval_file])
    # When splitting by interval each job gets its own -L instead.
    if self.inputs.exome_bed and not params.DivideByIntervals:
        for bed_file in self.inputs.exome_bed:
            options.extend(['-L', bed_file])
    if self.inputs.Alleles:
        options.extend(['--alleles', self.inputs.Alleles])
    if self.inputs.comp:
        options.extend(['--comp', self.inputs.comp])
    if self.inputs.dbSNP:
        options.extend(['--dbsnp', self.inputs.dbSNP])
        default_dbsnp = []  # user file wins; do not pass --dbsnp twice
    else:
        default_dbsnp = ['--dbsnp', '/opt/db/dbsnp_137.b37.vcf']
    if self.inputs.BQSR:
        options.extend(['--BQSR', self.inputs.BQSR])

    # ---- BAM indexes --------------------------------------------------
    if self.inputs.bai:
        # Touch the supplied indexes before GATK reads them.
        Process(*(['touch'] + list(self.inputs.bai))).run()

    with open('somefile_temp_Samtool_Index.txt', 'a') as index_jobs:
        for bam in self.inputs.bam:
            bam_root = os.path.splitext(bam)[0]
            has_index = (os.path.exists(bam_root + '.bai') or
                         os.path.exists(bam_root + '.bam.bai'))
            if not has_index:
                index_jobs.write('samtools index ' + bam + '\n')
    # Run only after the job file has been closed (flushed).
    Process('/opt/bin/multi_process', "-c", '25', '-i',
            "somefile_temp_Samtool_Index.txt").run()

    if params.DivideByIntervals:
        with open('somefile_temp_UnifiedGenotyper.txt', 'a') as gatk_jobs:
            for bam in self.inputs.bam:
                bam_root = os.path.splitext(bam)[0]
                job_cmd = [
                    'java', '-Xmx2g', '-Djava.io.tmpdir=/extra/tmp', '-jar',
                    '/opt/bin/GenomeAnalysisTK.jar', '-T',
                    'UnifiedGenotyper', '-nt', '4', '-nct', '1'
                ]
                job_cmd.extend(options)
                job_cmd.extend(default_dbsnp)
                job_cmd.extend(['-R', '/opt/db/human_g1k_v37_decoy.fasta'])
                job_cmd.extend(
                    ['-I', bam, '-L', bam.meta.get('_interval')])
                job_cmd.extend(['-o', '%s.vcf' % (bam_root, )])
                gatk_jobs.write(' '.join(str(arg) for arg in job_cmd) + '\n')
        Process('/opt/bin/multi_process', "-c", '25', '-i',
                "somefile_temp_UnifiedGenotyper.txt").run()

        for bam in self.inputs.bam:
            bam_root = os.path.splitext(bam)[0]
            self.outputs.all_vcf.add_file('%s.vcf' % (bam_root, ))
            self.outputs.all_vcf[-1].meta = bam.make_metadata()
            # The per-job interval tag must not leak onto the VCF.
            if '_interval' in bam.meta:
                self.outputs.all_vcf[-1].meta.pop("_interval", None)
    else:
        run_cmd = [
            'java', '-Xmx16g', '-Djava.io.tmpdir=/extra/tmp', '-jar',
            '/opt/bin/GenomeAnalysisTK.jar', '-T', 'UnifiedGenotyper', '-nt',
            '12', '-nct', '1'
        ]
        run_cmd.extend(options)
        run_cmd.extend(default_dbsnp)
        # Only the first BAM is genotyped in single-run mode.
        run_cmd.extend([
            '-R', '/opt/db/human_g1k_v37_decoy.fasta', '-I',
            self.inputs.bam[0], '-o', out_file_name
        ])
        Process(*run_cmd).run()
        self.outputs.all_vcf.add_file(out_file_name)
        self.outputs.all_vcf[-1].meta = self.inputs.bam[0].make_metadata()
def execute(self):
    """Run GATK CombineVariants over every VCF in ``self.inputs.vcfs``.

    Wrapper parameters from ``self.params`` are translated into GATK
    command-line arguments; parameters that are falsy (including a numeric
    0) are omitted.  The optional GATK key, excluded intervals and target
    intervals from ``self.inputs`` are appended, followed by one
    ``--variant`` per input VCF.  The result is written next to the first
    input as ``<name>.final.vcf`` and registered as ``self.outputs.out``
    with metadata produced by the first input's ``make_metadata()``.
    """
    # How a parameter is rendered on the command line:
    #   FLAG     - bare switch, emitted when the parameter is truthy
    #   TEXT     - switch followed by the value exactly as given
    #   NUMBER   - switch followed by str(value) so argv holds only strings
    #   NULLABLE - like TEXT, but the literal string 'null' means "unset"
    FLAG, TEXT, NUMBER, NULLABLE = 'flag', 'text', 'number', 'nullable'

    # (params attribute, GATK argument, kind) -- order is the argv order.
    spec = [
        # ---- GATK engine arguments ----
        # NOTE(review): emits the same switch as NonDeterministicRandomSeed
        # below, although the description ("eliminates randomization")
        # reads like the opposite -- confirm the intended GATK argument.
        ('DisableRandomization', '-ndrs', FLAG),
        # Do not fail on base qualities that look mis-encoded.
        ('AllowPotentiallyMisencodedQuals',
         '-allowPotentiallyMisencodedQuals', FLAG),
        # Type of BAQ calculation to apply in the engine.
        ('BAQCalculationType', '-baq', TEXT),
        # BAQ gap open penalty (Phred scaled).
        ('BAQGapOpenPenalty', '-baqGOP', NUMBER),
        # Quality used for reads missing base quality scores.
        ('DefaultBaseQualities', '-DBQ', NUMBER),
        # Disable printing of base insertion/deletion tags (with -BQSR).
        ('DisableIndelQuals', '-DIQ', FLAG),
        # Coverage to downsample to at any given locus.
        ('DownsampletoCoverage', '-dcov', NUMBER),
        # Fraction [0.0-1.0] of reads to downsample to.
        ('DownsampletoFraction', '-dfrac', NUMBER),
        # Type of read downsampling to employ at a given locus.
        ('DownsamplingType', '-dt', NULLABLE),
        # Print the OQ tag with the original base qualities (with -BQSR).
        ('EmitOriginalQuals', '-EOQ', FLAG),
        # Fix mis-encoded base quality scores.
        ('FixMisencodedQuals', '-fixMisencodedQuals', FLAG),
        # Interval merging rule for abutting intervals.
        ('IntervalMerging', '-im', TEXT),
        # Basepairs of padding around each -L interval.
        ('IntervalPadding', '-ip', NUMBER),
        # Set-merging approach for combining -L / -XL inputs.
        ('IntervalSetRule', '-isr', TEXT),
        # Keep program records from the SAM header.
        ('KeepProgramRecords', '-kpr', FLAG),
        # Stop cleanly once this runtime has been exceeded.
        ('MaxRuntime', '-maxRuntime', NUMBER),
        # Time unit for maxRuntime.
        ('MaxRuntimeUnits', '-maxRuntimeUnits', TEXT),
        # Make GATK use a different random seed on every run.
        ('NonDeterministicRandomSeed', '-ndrs', FLAG),
        # Pedigree string for samples.
        ('PedigreeString', '-pedString', TEXT),
        # How strictly to validate pedigree information.
        ('PedigreeValidationType', '-pedValidationType', TEXT),
        # Kind of GATK run report to generate (e.g. STANDARD, NO_ET).
        ('PhoneHome', '-et', TEXT),
        # Bases below this quality are not recalibrated (with -BQSR).
        ('PreserveQscoresLessThan', '-preserveQ', NUMBER),
        # Filtration criteria applied to each read individually.
        ('ReadFilter', '-rf', TEXT),
        # Read groups to filter out.
        ('ReadGroupBlackList', '-rgbl', TEXT),
        # Remove program records from the SAM header.
        ('RemoveProgramRecords', '-rpr', FLAG),
        # Arbitrary tag identifying this run as part of a group of runs.
        ('Tag', '-tag', TEXT),
        # Enable unsafe operations (expert users only).
        ('Unsafe', '-U', NULLABLE),
        # Boolean switch: emitted bare.  The parameter value itself must
        # not follow it on the command line.
        ('UseLegacyDownsampler', '-use_legacy_downsampler', FLAG),
        # Use original base qualities from the OQ tag when present.
        ('UseOriginalQualities', '-OQ', FLAG),
        # How strict validation should be.
        ('ValidationStrictness', '-S', TEXT),
        # ---- CombineVariants arguments ----
        # Assume input VCFs have identical sample sets and disjoint calls.
        ('AssumeIdenticalSamples', '--assumeIdenticalSamples', FLAG),
        # Treat filtered VCF records as uncalled.
        ('FilteredAreUncalled', '--filteredAreUncalled', FLAG),
        # Handling of same-site records with different FILTER fields.
        ('Filteredrecordsmergetype', '--filteredrecordsmergetype', TEXT),
        # How to merge genotype records for samples shared across inputs.
        ('Genotypemergeoption', '--genotypemergeoption', NULLABLE),
        # Take INFO from the overlapping record with the max AC.
        ('MergeInfoWithMaxAc', '--mergeInfoWithMaxAC', FLAG),
        # Output VCF without INFO or genotype FORMAT fields.
        ('MinimalVcf', '--minimalVCF', FLAG),
        # Output a site only if present in at least N input files.
        ('MinimumN', '--minimumN', NUMBER),
        # Print sites requiring complex compatibility merging.
        ('PrintComplexMerges', '--printComplexMerges', FLAG),
        # INFO key naming the set a combined record came from.
        ('SetKey', '--setKey', TEXT),
        # Do not write the command-line header into the output.
        ('SuppressCommandLineHeader', '--suppressCommandLineHeader', FLAG),
    ]

    options = []
    for attr, arg, kind in spec:
        value = getattr(self.params, attr)
        if not value or (kind == NULLABLE and value == 'null'):
            continue
        if kind == FLAG:
            options.append(arg)
        elif kind == NUMBER:
            options.extend([arg, str(value)])
        else:
            options.extend([arg, value])

    # The output is named after the first input VCF.
    base_path = os.path.splitext(self.inputs.vcfs[0])[0]
    out_file_name = base_path + ".final.vcf"

    if self.inputs.Gatk_key:
        options.extend(['-K', self.inputs.Gatk_key])
    if self.inputs.exclude_intervals:
        for interval in self.inputs.exclude_intervals:
            options.extend(['-XL', interval])
    if self.inputs.exome_bed:
        for interval in self.inputs.exome_bed:
            options.extend(['-L', interval])

    # Log the inputs; the call form works on both Python 2 and 3.
    print(self.inputs.vcfs)
    for vcf in self.inputs.vcfs:
        options.extend(['--variant', vcf])

    run_cmd = [
        'java', '-Xmx2g', '-Djava.io.tmpdir=/tmp', '-jar',
        '/opt/bin/GenomeAnalysisTK.jar', '-T', 'CombineVariants'
    ]
    run_cmd.extend(options)
    run_cmd.extend(
        ['-R', '/opt/db/human_g1k_v37_decoy.fasta', '-o', out_file_name])
    Process(*run_cmd).run()

    self.outputs.out = out_file_name
    self.outputs.out.meta = self.inputs.vcfs[0].make_metadata()
def execute(self):
    """Run GATK VariantFiltration on ``self.inputs.vcf``.

    Wrapper parameters from ``self.params`` are translated into GATK
    command-line arguments; parameters that are falsy (including a numeric
    0) are omitted.  The optional GATK key, excluded intervals, target
    intervals and mask from ``self.inputs`` are appended.  The result is
    written next to the input as ``<name>.filtered.vcf`` and registered as
    ``self.outputs.out`` with metadata produced by the input's
    ``make_metadata()``.
    """
    # How a parameter is rendered on the command line:
    #   FLAG     - bare switch, emitted when the parameter is truthy
    #   TEXT     - switch followed by the value exactly as given
    #   NUMBER   - switch followed by str(value) so argv holds only strings
    #   NULLABLE - like TEXT, but the literal string 'null' means "unset"
    #   SPLIT    - value is split on ',' and the switch repeated per item
    FLAG, TEXT, NUMBER, NULLABLE, SPLIT = (
        'flag', 'text', 'number', 'nullable', 'split')

    # (params attribute, GATK argument, kind) -- order is the argv order.
    spec = [
        # ---- GATK engine arguments ----
        # NOTE(review): emits the same switch as NonDeterministicRandomSeed
        # below, although the description ("eliminates randomization")
        # reads like the opposite -- confirm the intended GATK argument.
        ('DisableRandomization', '-ndrs', FLAG),
        # Do not fail on base qualities that look mis-encoded.
        ('AllowPotentiallyMisencodedQuals',
         '-allowPotentiallyMisencodedQuals', FLAG),
        # Type of BAQ calculation to apply in the engine.
        ('BAQCalculationType', '-baq', TEXT),
        # BAQ gap open penalty (Phred scaled).
        ('BAQGapOpenPenalty', '-baqGOP', NUMBER),
        # Quality used for reads missing base quality scores.
        ('DefaultBaseQualities', '-DBQ', NUMBER),
        # Disable printing of base insertion/deletion tags (with -BQSR).
        ('DisableIndelQuals', '-DIQ', FLAG),
        # Coverage to downsample to at any given locus.
        ('DownsampletoCoverage', '-dcov', NUMBER),
        # Fraction [0.0-1.0] of reads to downsample to.
        ('DownsampletoFraction', '-dfrac', NUMBER),
        # Type of read downsampling to employ at a given locus.
        ('DownsamplingType', '-dt', NULLABLE),
        # Print the OQ tag with the original base qualities (with -BQSR).
        ('EmitOriginalQuals', '-EOQ', FLAG),
        # Fix mis-encoded base quality scores.
        ('FixMisencodedQuals', '-fixMisencodedQuals', FLAG),
        # Interval merging rule for abutting intervals.
        ('IntervalMerging', '-im', TEXT),
        # Basepairs of padding around each -L interval.
        ('IntervalPadding', '-ip', NUMBER),
        # Set-merging approach for combining -L / -XL inputs.
        ('IntervalSetRule', '-isr', TEXT),
        # Keep program records from the SAM header.
        ('KeepProgramRecords', '-kpr', FLAG),
        # Stop cleanly once this runtime has been exceeded.
        ('MaxRuntime', '-maxRuntime', NUMBER),
        # Time unit for maxRuntime.
        ('MaxRuntimeUnits', '-maxRuntimeUnits', TEXT),
        # Make GATK use a different random seed on every run.
        ('NonDeterministicRandomSeed', '-ndrs', FLAG),
        # Pedigree string for samples.
        ('PedigreeString', '-pedString', TEXT),
        # How strictly to validate pedigree information.
        ('PedigreeValidationType', '-pedValidationType', TEXT),
        # Kind of GATK run report to generate (e.g. STANDARD, NO_ET).
        ('PhoneHome', '-et', TEXT),
        # Bases below this quality are not recalibrated (with -BQSR).
        ('PreserveQscoresLessThan', '-preserveQ', NUMBER),
        # Filtration criteria applied to each read individually.
        ('ReadFilter', '-rf', TEXT),
        # Read groups to filter out.
        ('ReadGroupBlackList', '-rgbl', TEXT),
        # Remove program records from the SAM header.
        ('RemoveProgramRecords', '-rpr', FLAG),
        # Arbitrary tag identifying this run as part of a group of runs.
        ('Tag', '-tag', TEXT),
        # Enable unsafe operations (expert users only).
        ('Unsafe', '-U', NULLABLE),
        # Boolean switch: emitted bare.  The parameter value itself must
        # not follow it on the command line.
        ('UseLegacyDownsampler', '-use_legacy_downsampler', FLAG),
        # Use original base qualities from the OQ tag when present.
        ('UseOriginalQualities', '-OQ', FLAG),
        # How strict validation should be.
        ('ValidationStrictness', '-S', TEXT),
        # ---- VariantFiltration arguments ----
        # Number of SNPs which make up a cluster.
        ('ClusterSize', '-cluster', NUMBER),
        # Window size (in bases) in which to evaluate clustered SNPs.
        ('ClusterWindowSize', '-window', NUMBER),
        # Comma-separated names for the INFO-field filter expressions.
        ('FiltersName', '--filterName', SPLIT),
        # Comma-separated INFO-field filter expressions.
        ('Filters', '-filter', SPLIT),
        # Comma-separated names for the FORMAT-field filter expressions.
        ('GenotypefiltersName', '--genotypeFilterName', SPLIT),
        # Comma-separated FORMAT (sample/genotype-level) expressions.
        ('Genotypefilters', '-G_filter', SPLIT),
        # Remove previous filters applied to the VCF.
        ('InvalidatePreviousFilters', '--invalidatePreviousFilters', FLAG),
        # Bases beyond 'mask' rod records within which to filter variants.
        ('MaskExtension', '-maskExtend', NUMBER),
        # FILTER text used when a 'mask' rod overlaps a variant call.
        ('MaskName', '--maskName', TEXT),
        # Missing values in JEXL expressions count as failing.
        ('MissingValuesInExpressionsShouldEvaluateAsFailing',
         '--missingValuesInExpressionsShouldEvaluateAsFailing', FLAG),
    ]

    options = []
    for attr, arg, kind in spec:
        value = getattr(self.params, attr)
        if not value or (kind == NULLABLE and value == 'null'):
            continue
        if kind == FLAG:
            options.append(arg)
        elif kind == NUMBER:
            options.extend([arg, str(value)])
        elif kind == SPLIT:
            for item in value.split(','):
                options.extend([arg, item])
        else:
            options.extend([arg, value])

    # The output is named after the input VCF.
    base_path = os.path.splitext(self.inputs.vcf)[0]
    out_file_name = base_path + ".filtered.vcf"

    if self.inputs.Gatk_key:
        options.extend(['-K', self.inputs.Gatk_key])
    if self.inputs.exclude_intervals:
        for interval in self.inputs.exclude_intervals:
            options.extend(['-XL', interval])
    if self.inputs.exome_bed:
        for interval in self.inputs.exome_bed:
            options.extend(['-L', interval])
    if self.inputs.mask:
        options.extend(['--mask', self.inputs.mask])

    run_cmd = [
        'java', '-Xmx2g', '-Djava.io.tmpdir=/tmp', '-jar',
        '/opt/bin/GenomeAnalysisTK.jar', '-T', 'VariantFiltration'
    ]
    run_cmd.extend(options)
    run_cmd.extend([
        '-R', '/opt/db/human_g1k_v37_decoy.fasta', '--variant',
        self.inputs.vcf, '-o', out_file_name
    ])
    Process(*run_cmd).run()

    self.outputs.out = out_file_name
    self.outputs.out.meta = self.inputs.vcf.make_metadata()
def execute(self):
    """Run GATK BaseRecalibrator over every BAM in ``self.inputs.bam``.

    Wrapper parameters from ``self.params`` are translated into GATK
    command-line arguments; parameters that are falsy (including a numeric
    0) are omitted.  Any BAM without a ``.bai`` / ``.bam.bai`` file next to
    it is indexed with ``samtools index`` first (through
    ``/opt/bin/multi_process``).  The recalibration table is written next
    to the first BAM as ``<name>.dedup.realn.bam.table`` and registered as
    ``self.outputs.grp``.
    """
    # How a parameter is rendered on the command line:
    #   FLAG     - bare switch, emitted when the parameter is truthy
    #   TEXT     - switch followed by the value exactly as given
    #   NUMBER   - switch followed by str(value) so argv holds only strings
    #   NULLABLE - like TEXT, but the literal string 'null' means "unset"
    FLAG, TEXT, NUMBER, NULLABLE = 'flag', 'text', 'number', 'nullable'

    # (params attribute, GATK argument, kind) -- order is the argv order.
    spec = [
        # ---- BaseRecalibrator arguments ----
        # Binary tag covariate name, if using it.
        ('BinaryTagName', '-bintag', TEXT),
        # Covariate to be used in the recalibration.
        ('Covariate', '-cov', TEXT),
        # Default quality for the base deletions covariate.
        ('DeletionsDefaultQuality', '-ddq', NUMBER),
        # k-mer context size for base insertions and deletions.
        ('IndelsContextSize', '-ics', NUMBER),
        # Default quality for the base insertions covariate.
        ('InsertionsDefaultQuality', '-idq', NUMBER),
        # Minimum quality for bases in the read tail to be considered.
        ('LowQualityTail', '-lqt', NUMBER),
        # Maximum cycle value permitted for the Cycle covariate.
        ('MaximumCycleValue', '-maxCycle', NUMBER),
        # k-mer context size for base mismatches.
        ('MismatchesContextSize', '-mcs', NUMBER),
        # Default quality for the base mismatches covariate.
        ('MismatchesDefaultQuality', '-mdq', NUMBER),
        # Number of distinct quality scores in the quantized output.
        ('QuantizingLevels', '-ql', NUMBER),
        # Behaviour on no-calls in color space (THROW_EXCEPTION,
        # LEAVE_READ_UNRECALIBRATED or PURGE_READ).
        ('SolidNocallStrategy', '--solid_nocall_strategy', TEXT),
        # Recalibration of SOLiD bases where the reference was inserted
        # (DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N or REMOVE_REF_BIAS).
        ('SolidRecalMode', '-sMode', TEXT),
        # BQSR BAQ gap open penalty (Phred scaled).
        ('BqsrBaqGapOpenPenalty', '-bqsrBAQGOP', NUMBER),
        # Allow running without a dbsnp rod (unsafe, expert users only).
        ('RunWithoutDbsnpPotentiallyRuiningQuality',
         '-run_without_dbsnp_potentially_ruining_quality', FLAG),
        # ---- GATK engine arguments ----
        # NOTE(review): emits the same switch as NonDeterministicRandomSeed
        # below, although the description ("eliminates randomization")
        # reads like the opposite -- confirm the intended GATK argument.
        ('DisableRandomization', '-ndrs', FLAG),
        # Do not fail on base qualities that look mis-encoded.
        ('AllowPotentiallyMisencodedQuals',
         '-allowPotentiallyMisencodedQuals', FLAG),
        # Type of BAQ calculation to apply in the engine.
        ('BAQCalculationType', '-baq', TEXT),
        # BAQ gap open penalty (Phred scaled).
        ('BAQGapOpenPenalty', '-baqGOP', NUMBER),
        # Quality used for reads missing base quality scores.
        ('DefaultBaseQualities', '-DBQ', NUMBER),
        # Disable printing of base insertion/deletion tags (with -BQSR).
        ('DisableIndelQuals', '-DIQ', FLAG),
        # Coverage to downsample to at any given locus.
        ('DownsampletoCoverage', '-dcov', NUMBER),
        # Fraction [0.0-1.0] of reads to downsample to.
        ('DownsampletoFraction', '-dfrac', NUMBER),
        # Type of read downsampling to employ at a given locus.
        ('DownsamplingType', '-dt', NULLABLE),
        # Print the OQ tag with the original base qualities (with -BQSR).
        ('EmitOriginalQuals', '-EOQ', FLAG),
        # Fix mis-encoded base quality scores.
        ('FixMisencodedQuals', '-fixMisencodedQuals', FLAG),
        # Interval merging rule for abutting intervals.
        ('IntervalMerging', '-im', TEXT),
        # Basepairs of padding around each -L interval.
        ('IntervalPadding', '-ip', NUMBER),
        # Set-merging approach for combining -L / -XL inputs.
        ('IntervalSetRule', '-isr', TEXT),
        # Keep program records from the SAM header.
        ('KeepProgramRecords', '-kpr', FLAG),
        # Stop cleanly once this runtime has been exceeded.
        ('MaxRuntime', '-maxRuntime', NUMBER),
        # Time unit for maxRuntime.
        ('MaxRuntimeUnits', '-maxRuntimeUnits', TEXT),
        # Make GATK use a different random seed on every run.
        ('NonDeterministicRandomSeed', '-ndrs', FLAG),
        # Pedigree string for samples.
        ('PedigreeString', '-pedString', TEXT),
        # How strictly to validate pedigree information.
        ('PedigreeValidationType', '-pedValidationType', TEXT),
        # Kind of GATK run report to generate (e.g. STANDARD, NO_ET).
        ('PhoneHome', '-et', TEXT),
        # Bases below this quality are not recalibrated (with -BQSR).
        ('PreserveQscoresLessThan', '-preserveQ', NUMBER),
        # Filtration criteria applied to each read individually.
        ('ReadFilter', '-rf', TEXT),
        # Read groups to filter out.
        ('ReadGroupBlackList', '-rgbl', TEXT),
        # Remove program records from the SAM header.
        ('RemoveProgramRecords', '-rpr', FLAG),
        # Arbitrary tag identifying this run as part of a group of runs.
        ('Tag', '-tag', TEXT),
        # Enable unsafe operations (expert users only).
        ('Unsafe', '-U', NULLABLE),
        # Boolean switch: emitted bare.  The parameter value itself must
        # not follow it on the command line.
        ('UseLegacyDownsampler', '-use_legacy_downsampler', FLAG),
        # Use original base qualities from the OQ tag when present.
        ('UseOriginalQualities', '-OQ', FLAG),
        # How strict validation should be.
        ('ValidationStrictness', '-S', TEXT),
    ]

    options = []
    for attr, arg, kind in spec:
        value = getattr(self.params, attr)
        if not value or (kind == NULLABLE and value == 'null'):
            continue
        if kind == FLAG:
            options.append(arg)
        elif kind == NUMBER:
            options.extend([arg, str(value)])
        else:
            options.extend([arg, value])

    # The recalibration table is named after the first input BAM.
    base_path = os.path.splitext(self.inputs.bam[0])[0]
    out_file_name = base_path + ".dedup.realn.bam.table"

    if self.inputs.bai:
        # Refresh the timestamps of the supplied index files --
        # presumably so they are not older than their BAMs; confirm.
        Process('touch', *list(self.inputs.bai)).run()

    if self.inputs.Gatk_key:
        options.extend(['-K', self.inputs.Gatk_key])
    if self.inputs.exclude_intervals:
        for interval in self.inputs.exclude_intervals:
            options.extend(['-XL', interval])

    # NOTE(review): '--disable_indel_quals' is always passed here, so a
    # truthy DisableIndelQuals adds '-DIQ' on top of it.
    run_cmd = [
        'java', '-Xmx16g', '-Djava.io.tmpdir=/extra/tmp', '-jar',
        '/opt/bin/GenomeAnalysisTK.jar', '-T', 'BaseRecalibrator', '-nct',
        '8', '--disable_indel_quals'
    ]
    run_cmd.extend(options)

    # Queue a 'samtools index' job for every BAM that has no index yet.
    # NOTE(review): the job file is opened in append mode, so entries left
    # over from an earlier run in the same directory would run again.
    with open('somefile_temp_Samtool_Index.txt', 'a') as job_file:
        for bam in self.inputs.bam:
            bam_base = os.path.splitext(bam)[0]
            has_index = (os.path.exists(bam_base + '.bai')
                         or os.path.exists(bam_base + '.bam.bai'))
            if not has_index:
                job_file.write('samtools index ' + bam + '\n')
    # Run the queued jobs only after the job file has been closed, so
    # every line is on disk when multi_process reads it.
    Process('/opt/bin/multi_process', "-c", '25', '-i',
            "somefile_temp_Samtool_Index.txt").run()

    run_cmd.extend(
        ['-R', '/opt/db/human_g1k_v37_decoy.fasta', '-o', out_file_name])
    for bam in self.inputs.bam:
        run_cmd.extend(['-I', bam])
    run_cmd.extend(['--knownSites', '/opt/db/dbsnp_137.b37.vcf'])
    # Target intervals are optional; tolerate a missing/empty input just as
    # the other optional inputs above do.
    for interval in self.inputs.exome_bed or []:
        run_cmd.extend(['-L', interval])
    Process(*run_cmd).run()

    self.outputs.grp = out_file_name
    # NOTE(review): metadata is taken from the whole ``bam`` input, not
    # from ``bam[0]``; this relies on the input list object providing
    # make_metadata() -- confirm against the SDK.
    self.outputs.grp.meta = self.inputs.bam.make_metadata()
def execute(self):
    """Clip and filter the input reads with ea-utils ``fastq-mcf``.

    Steps:

    1. Translate ``self.params`` into ``fastq-mcf`` command-line options.
    2. Run ``fastq_detect.pl`` on the first reads file and choose the
       quality-scale label stored in the output metadata.
    3. Run ``fastq-mcf`` on every file in ``self.inputs.reads``.

    Outputs registered:

    * ``self.outputs.out_fq``      - one ``<name>.clip<ext>`` per reads file.
    * ``self.outputs.out_skip``    - one ``<name>.clip<ext>.skip`` per reads
      file, only when ``Saveskippedreads`` is set.
    * ``self.outputs.out_summary`` - ``<name>.fastq-mcf_summary.txt``, the path
      handed to ``Process`` as ``stdout=``; ``<name>`` comes from the *last*
      reads file while its metadata is copied from the *first* one.

    ``self.inputs.reads`` must contain at least one file: the first entry is
    indexed unconditionally.
    """
    NUMBER, TEXT, FLAG = 'number', 'text', 'flag'

    # (params attribute, fastq-mcf switch, kind); emitted in this order.
    # Values in parentheses are the fastq-mcf defaults quoted by the tool help.
    option_table = (
        ('Minimumlengthmatch', '-s', NUMBER),          # float: log scale for adapter minimum-length-match (2.2)
        ('Adapteroccurrencethreshold', '-t', NUMBER),  # float: % occurrence threshold before adapter clipping (0.25)
        ('Mincliplength', '-m', NUMBER),               # integer: minimum clip length, overrides scaled auto (1)
        ('Maxadapterdifference', '-p', NUMBER),        # integer: maximum adapter difference percentage (10)
        ('Setalldefaultparameterstozerodonothing', '-0', FLAG),  # boolean: default False
        ('Minremainingsequencelength', '-l', NUMBER),  # integer: minimum remaining sequence length (19)
        ('Maxremainingsequencelength', '-L', NUMBER),  # integer: maximum remaining sequence length
        ('Removeduplicatereads', '-D', NUMBER),        # integer: Read_1 has an identical N bases (0)
        ('sKewPercentage', '-k', NUMBER),              # integer: skew percentage causing cycle removal (2); 0 turns it off
        ('Badreadpercentagethreshold', '-x', NUMBER),  # integer: 'N' percentage causing cycle removal (20); 0 turns it off
        ('Qualitythreshold', '-q', NUMBER),            # integer: quality threshold causing base removal (7)
        ('Trimmingwindowsize', '-w', NUMBER),          # integer: window size for quality trimming (1)
        ('Removehomopolymerreads', '-H', FLAG),        # boolean: remove >95% homopolymer reads
        ('IlluminaPF', '-U', FLAG),                    # boolean: enable Illumina PF filtering (-u, disabled, is the default)
        ('DonttrimNs', '-R', FLAG),                    # boolean: don't remove N's from the fronts/ends of reads
        ('Subsampling', '-C', NUMBER),                 # integer: number of reads to use for subsampling (300k)
        ('Phredscale', '-P', NUMBER),                  # integer: Phred scale (auto-determined)
        ('Dontclip', '-n', FLAG),                      # boolean: just output what would be done
        ('Onlykeepclippedreads', '-K', FLAG),          # boolean: only keep clipped reads
        ('Saveskippedreads', '-S', FLAG),              # boolean: output skipped reads
        ('Minimummeanqualityscore', '--qual-mean', NUMBER),  # float: evaluated after clipping/trimming
        ('Minimummeanqualityscoreappliestosecondnonbarcodereadonly',
         '--mate-qual-mean', NUMBER),                  # float: evaluated after clipping/trimming
        ('Qualitygreaterthanthreshold', '--qual-gt', TEXT),  # string "NUM,THR": at least NUM quals > THR
        ('Qualitygreaterthanthresholdappliestosecondnonbarcodereadonly',
         '--mate-qual-gt', TEXT),                      # string "NUM,THR": at least NUM quals > THR
        ('MaximumNcallsinareadcanbea', '--max-ns', NUMBER),  # float: evaluated after clipping/trimming
        ('MaximumNcallsinareadcanbeaappliestosecondnonbarcodereadonly',
         '--mate-max-ns', NUMBER),                     # float: evaluated after clipping/trimming
        ('Homopolymerfilterpercentageasnumber', '--homopolymer-pct', NUMBER),  # integer
        ('Complexityfilterpercent', '--lowcomplex-pct', NUMBER),  # integer: complexity filter percent (95)
        ('AdjustcycleCYCnegativeoffsetfromendbyamountAMT', '--cycle-adjust', TEXT),  # string "CYC,AMT"
        ('AdjustscoreSCOREbyamountAMT', '--phred-adjust', TEXT),  # string "SCORE,AMT"
    )

    # For these parameters an explicit 0 is the documented way to switch the
    # filter off, so it has to reach fastq-mcf instead of being treated as
    # "not set".
    # NOTE(review): assumes an unset parameter is None rather than 0 - confirm
    # against the parameter declarations.
    zero_is_valid = frozenset(['sKewPercentage', 'Badreadpercentagethreshold'])

    options = []
    for attr, switch, kind in option_table:
        value = getattr(self.params, attr)
        explicit_zero = (attr in zero_is_valid and value is not None
                         and value is not False and value == 0)
        if not (value or explicit_zero):
            continue
        if kind == FLAG:
            options.append(switch)
        elif kind == NUMBER:
            # Command-line arguments must be strings, whatever the param type.
            options.extend([switch, str(value)])
        else:
            options.extend([switch, value])

    reads = self.inputs.reads

    # NOTE(review): presumably fastq_detect.pl leaves a report.txt behind only
    # for Phred+64 input - confirm; a stale report.txt would also match.
    Process('perl', '/opt/bin/fastq_detect.pl', reads[0], '1000').run()
    if os.path.exists('report.txt'):
        quality_scale = 'Phred+64'
    else:
        quality_scale = 'Phred+33'

    save_skipped = self.params.Saveskippedreads
    input_files = []
    base_name = None
    for read_file in reads:
        base_name, extension = os.path.splitext(read_file)
        clipped_name = base_name + '.clip' + extension
        input_files.append(read_file)
        options.extend(['-o', clipped_name])
        self.outputs.out_fq.add_file(clipped_name)
        self.outputs.out_fq[-1].meta = read_file.make_metadata(
            _quality_scale=quality_scale)
        if save_skipped:
            self.outputs.out_skip.add_file(clipped_name + '.skip')
            self.outputs.out_skip[-1].meta = read_file.make_metadata(
                _quality_scale=quality_scale)

    run_cmd = ['/opt/bin/ea-utils/fastq-mcf']
    run_cmd.extend(options)
    if self.inputs.adapter:
        run_cmd.append(self.inputs.adapter)
    else:
        # No adapter file supplied: /dev/null takes the adapter position.
        # NOTE(review): -f is presumably fastq-mcf's force-output switch - confirm.
        run_cmd.extend(['-f', '/dev/null'])
    run_cmd.extend(input_files)

    # The summary keeps the historical name: based on the last reads file.
    summary_name = base_name + '.fastq-mcf_summary.txt'
    Process(*run_cmd, stdout=summary_name).run()
    self.outputs.out_summary = summary_name
    self.outputs.out_summary.meta = reads[0].make_metadata()