def mapEachInterval(self, workflow=None, intervalData=None, chromosome=None,
        VCFJobData=None, passingData=None, mapEachChromosomeData=None,
        transferOutput=False, **keywords):
    """
    Add the genotype-refinement jobs for one genomic interval to the workflow DAG.

    Pipeline built here (each job's output feeds the next):
      Part 1: select high-coverage samples from the interval VCF, phase them with
        Beagle, tabix-index the result, then subset distant members — producing a
        reference panel (the panel is currently NOT passed to the Part-2 Beagle
        job; see the commented-out refPanelFile line below).
      Part 2: Beagle-phase all samples, tabix-index, merge PL fields back from the
        pre-Beagle VCF via a GATK walker, replicate individuals that appear in
        multiple families, run TrioCaller, patch missing INFO header descriptions,
        then add a replicate-concordance stat job and a replicate-merging job.

    :param workflow: defaults to self when None.
    :param intervalData: unused here except via passingData (see the disabled
        block below); kept for interface compatibility with sibling map* methods.
    :param chromosome: falls back to passingData.chromosome when None.
    :param VCFJobData: carries .file (interval VCF), .jobLs (producer jobs),
        .tbi_F (tabix index) — project PassingData convention.
    :param passingData: carries topOutputDirJob, intervalFileBasenamePrefix,
        span, noOfIndividuals.
    :returns: PassingData with attributes jobDataLs, beagleJob, refineGenotypeJob,
        trioCallerReplicateConcordanceJob, mergeVCFReplicateColumnsJob.

    2013.04.30
    """
    if workflow is None:
        workflow = self
    returnData = PassingData(no_of_jobs = 0)
    returnData.jobDataLs = []

    topOutputDirJob = passingData.topOutputDirJob

    intervalFileBasenamePrefix = passingData.intervalFileBasenamePrefix
    span = passingData.span
    noOfIndividuals= passingData.noOfIndividuals
    SNPVCFFile = VCFJobData.file
    SNPVCFJobLs = VCFJobData.jobLs
    # Disabled 2013.06.19 (kept as a string literal for history):
    """
    ### 2013.06.19 intervalData does not exsit for input that is entirely composed of VCF files (SplitVCFFile job does not return intervals)
    if intervalData.file:
        mpileupInterval = intervalData.interval
        bcftoolsInterval = intervalData.file
    else:
        mpileupInterval = intervalData.interval
        bcftoolsInterval = intervalData.interval
    intervalFileBasenameSignature = intervalData.intervalFileBasenameSignature
    overlapInterval = intervalData.overlapInterval
    overlapFileBasenameSignature = intervalData.overlapIntervalFnameSignature
    span = intervalData.span
    """
    if chromosome is None:
        chromosome = getattr(passingData, 'chromosome', None)

    #noOfIndividuals
    # scale each job's walltime/memory by input volume (samples x sites)
    realInputVolume = noOfIndividuals * span
    baseInputVolume = 600*2000	#600 individuals at 2000 sites
    # NOTE(review): these two locals appear unused — every job below recomputes
    # its own walltime/job_max_memory; confirm before removing.
    walltime = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
        baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
        minJobPropertyValue=60, maxJobPropertyValue=1200).value
    job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
        baseInputVolume=baseInputVolume, baseJobPropertyValue=4000,
        minJobPropertyValue=4000, maxJobPropertyValue=10000).value

    #splitVCFJob = passingData.mapEachVCFData.splitVCFJob

    #### Part 1 generate high-quality reference panel through Beagle on high-coverage individuals
    # extractRefPanelSampleIDJob outputs sample IDs with replicate tags
    # select the high-coverage members
    outputVCF = File(os.path.join(self.highCoveragePanelDirJob.output,
        '%s.minCoverageForRefPanel%s.vcf'%(intervalFileBasenamePrefix, self.minCoverageForRefPanel)))
    #selectVariants would re-generate AC, AF so that TrioCaller could read it.
    #samtools uses 'AC1' instead of AC, 'AF1' instead of AF.
    selectHighCoverageSampleJob = self.addSelectVariantsJob(SelectVariantsJava=self.SelectVariantsJava,
        inputF=VCFJobData.file, outputF=outputVCF,
        refFastaFList=self.registerReferenceData.refFastaFList,
        sampleIDKeepFile=self.extractRefPanelSampleIDJob.output,
        parentJobLs=[self.highCoveragePanelDirJob, self.extractRefPanelSampleIDJob] + VCFJobData.jobLs,
        extraDependentInputLs=[VCFJobData.tbi_F], transferOutput=False,
        extraArguments=None,
        job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=4000,
            minJobPropertyValue=4000, maxJobPropertyValue=9000).value,
        walltime= self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
            minJobPropertyValue=60, maxJobPropertyValue=1200).value)

    # run Beagle on the high-coverage subset (pedigree-aware phasing)
    outputFnamePrefix = os.path.join(self.highCoveragePanelDirJob.folder, "%s.minCoverage%s.beagled"%\
        (intervalFileBasenamePrefix, self.minCoverageForRefPanel))
    beagleOnHighCoverageJob = self.addBeagle4Job(executable=self.BeagleOnHCMOnkeys,
        inputFile=selectHighCoverageSampleJob.output, refPanelFile=None,
        pedFile = self.outputPedigreeOfHghCoverageSamplesJob.output,
        outputFnamePrefix=outputFnamePrefix,
        burninIterations=7, phaseIterations=10,
        noOfSamplingHaplotypesPerSample=4,
        parentJobLs=[self.highCoveragePanelDirJob, selectHighCoverageSampleJob,
            self.outputPedigreeOfHghCoverageSamplesJob],
        transferOutput=False,
        extraArguments=None, extraArgumentList=None,
        extraOutputLs=None, extraDependentInputLs=None,
        no_of_cpus=None,
        job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=4000,
            minJobPropertyValue=4000, maxJobPropertyValue=13000).value,
        walltime= self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
            minJobPropertyValue=60, maxJobPropertyValue=1200).value)

    #index .vcf.gz, output of beagle, without index, GATK can't work on gzipped vcf
    tabixIndexFile = File('%s.tbi'%(beagleOnHighCoverageJob.output.name))
    tabixOnHighCoverageVCFJob = self.addGenericJob(executable=self.tabix,
        inputFile=beagleOnHighCoverageJob.output, inputArgumentOption="",
        outputFile=None, outputArgumentOption="-o",
        extraDependentInputLs=None,
        extraOutputLs=[tabixIndexFile], transferOutput=False, frontArgumentList=["-p vcf"],
        extraArguments=None,
        extraArgumentList=[],
        parentJobLs=[beagleOnHighCoverageJob, self.highCoveragePanelDirJob],
        no_of_cpus=None,
        job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=4000,
            minJobPropertyValue=2000, maxJobPropertyValue=5000).value,
        walltime= self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
            minJobPropertyValue=60, maxJobPropertyValue=600).value)

    # select the high-coverage members (distant members -> the reference panel)
    outputVCF = File(os.path.join(self.highCoveragePanelDirJob.output,
        '%s.minCoverage%s.maxPairwiseKinship%s.refPanel.beagled.vcf'%\
        (intervalFileBasenamePrefix, self.minCoverageForRefPanel, self.maxPairwiseKinship)))
    #selectVariants would re-generate AC, AF so that TrioCaller could read it.
    #samtools uses 'AC1' instead of AC, 'AF1' instead of AF.
    selectDistantMembersVariantsJob = self.addSelectVariantsJob(SelectVariantsJava=self.SelectVariantsJava,
        inputF=beagleOnHighCoverageJob.output, outputF=outputVCF,
        refFastaFList=self.registerReferenceData.refFastaFList,
        sampleIDKeepFile=self.selectDistantMembersFromGenotypeFileJob.output,
        parentJobLs=[self.highCoveragePanelDirJob, beagleOnHighCoverageJob,
            self.selectDistantMembersFromGenotypeFileJob,
            tabixOnHighCoverageVCFJob],
        extraDependentInputLs=[tabixOnHighCoverageVCFJob.output], transferOutput=False,
        extraArguments=None,
        job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=4000,
            minJobPropertyValue=4000, maxJobPropertyValue=7000).value,
        walltime= self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
            minJobPropertyValue=60, maxJobPropertyValue=1200).value)

    ##### Part 2 run Beagle on everyone with reference panel
    # run Beagle
    # NOTE(review): refPanelFile is None — the Part-1 panel
    # (selectDistantMembersVariantsJob.output, see commented line) is not wired
    # in; confirm whether this is intentional.
    #refPanelFile=selectDistantMembersVariantsJob.output,\
    outputFnamePrefix = os.path.join(self.mapDirJob.folder, '%s.beagled'%(intervalFileBasenamePrefix))
    beagleJob = self.addBeagle4Job(executable=self.BeagleJava,
        inputFile=VCFJobData.file, refPanelFile=None,
        pedFile=self.outputPedigreeJob.output,
        outputFnamePrefix=outputFnamePrefix,
        burninIterations=7, phaseIterations=10,
        noOfSamplingHaplotypesPerSample=4, duoscale=2, trioscale=2,
        extraArguments=None, extraArgumentList=None,
        parentJobLs=[self.mapDirJob,
            self.outputPedigreeJob] + VCFJobData.jobLs,
        transferOutput=False, no_of_cpus=None,
        job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=4000,
            minJobPropertyValue=4000, maxJobPropertyValue=13000).value,
        walltime= self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
            minJobPropertyValue=60, maxJobPropertyValue=1200).value,
        )
    returnData.beagleJob = beagleJob

    #index .vcf.gz, output of beagle, without index, GATK can't work on gzipped vcf
    tabixIndexFile = File('%s.tbi'%(beagleJob.output.name))
    tabixJob = self.addGenericJob(executable=self.tabix,
        inputFile=beagleJob.output, inputArgumentOption="",
        outputFile=None, outputArgumentOption="-o",
        extraDependentInputLs=None,
        extraOutputLs=[tabixIndexFile], transferOutput=False, frontArgumentList=["-p vcf"],
        extraArguments=None,
        extraArgumentList=[],
        parentJobLs=[beagleJob, self.mapDirJob],
        no_of_cpus=None,
        job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=4000,
            minJobPropertyValue=2000, maxJobPropertyValue=4000).value,
        walltime= self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
            minJobPropertyValue=60, maxJobPropertyValue=600).value)

    # borrow PL from the pre-Beagle VCF so downstream callers see genotype likelihoods
    outputFile = File(os.path.join(self.mapDirJob.folder, '%s.beagled.withPL.vcf'%(intervalFileBasenamePrefix)))
    combineBeagleAndPreBeagleVariantsJob = self.addGATKJob(executable=self.CombineBeagleAndPreBeagleVariantsJava,
        GenomeAnalysisTKJar=self.GenomeAnalysisTKJar,
        GATKAnalysisType="CombineBeagleAndPreBeagleVariants",
        inputFile=None, inputArgumentOption=None,
        refFastaFList=self.registerReferenceData.refFastaFList,
        inputFileList=None, argumentForEachFileInInputFileList="--variant",
        interval=None, outputFile=outputFile, outputArgumentOption="--out",
        frontArgumentList=None, extraArguments=None,
        extraArgumentList=["--variant:first", beagleJob.output, "--variant:second", VCFJobData.file,
            "-genotypeMergeOptions PRIORITIZE", "-priority first,second"],
        extraOutputLs=None,
        extraDependentInputLs=[beagleJob.output, VCFJobData.file] + tabixJob.outputLs,
        parentJobLs=[beagleJob, tabixJob]+ VCFJobData.jobLs, transferOutput=False,
        no_of_cpus=None,
        key2ObjectForJob=None,
        job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=4000,
            minJobPropertyValue=2000, maxJobPropertyValue=4000).value,
        walltime= self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
            minJobPropertyValue=60, maxJobPropertyValue=600).value)
    #do not use "--variant:beagle" to name your vcf file as GATK would think it's in Beagle format

    #TrioCaller
    # 2013.06.11 replicate individuals who appear in more than 1 families
    round1_IndividualsReplicatedVCF = File( os.path.join(self.mapDirJob.folder,
        '%s.replicate.vcf'%(intervalFileBasenamePrefix)))
    replicateVCFGenotypeColumnsJob = self.addReplicateVCFGenotypeColumnsJob(
        executable=self.ReplicateVCFGenotypeColumns,
        inputF=combineBeagleAndPreBeagleVariantsJob.output,
        sampleID2FamilyCountF=self.outputReplicatePedigreeJob.sampleID2FamilyCountF,
        outputF=round1_IndividualsReplicatedVCF,
        replicateIndividualTag=self.replicateIndividualTag,
        parentJobLs=[self.outputReplicatePedigreeJob, self.mapDirJob, combineBeagleAndPreBeagleVariantsJob],
        extraDependentInputLs=None,
        transferOutput=False,
        extraArguments=None,
        job_max_memory=self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=4000,
            minJobPropertyValue=4000, maxJobPropertyValue=9000).value,
        walltime= self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
            minJobPropertyValue=60, maxJobPropertyValue=1200).value,
        )

    # TrioCaller on the replicated, Beagle-phased VCF (inputPhased=True)
    refineGenotypeOutputF = File(os.path.join(self.mapDirJob.folder,
        '%s.trioCaller.vcf'%(intervalFileBasenamePrefix)))
    refineGenotypeJob = self.addTrioCallerJob(trioCallerWrapper=self.trioCallerWrapper,
        trioCallerPath=self.trioCallerPath,
        inputVCF=replicateVCFGenotypeColumnsJob.output,
        pedFile=self.outputReplicatePedigreeJob.output, outputVCF=refineGenotypeOutputF,
        inputPhased=True,
        parentJobLs=[self.mapDirJob, replicateVCFGenotypeColumnsJob, self.outputReplicatePedigreeJob],
        extraDependentInputLs=[], transferOutput=False,
        extraArguments=None,
        job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=4000,
            minJobPropertyValue=4000, maxJobPropertyValue=9000).value,
        walltime= self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
            minJobPropertyValue=60, maxJobPropertyValue=1200).value)	#1.2G memory for 12K loci
    returnData.refineGenotypeJob = refineGenotypeJob

    """
    2013.07.10 the TrioCaller VCF has some info tags that are not described in VCF header
    """
    outputFile = File(os.path.join(self.mapDirJob.folder,
        '%s.extraInfoDesc.vcf'%(intervalFileBasenamePrefix)))
    addInfoDescJob = self.addGenericJob(executable=self.AddMissingInfoDescriptionToVCFHeader,
        inputFile=refineGenotypeJob.output,
        inputArgumentOption="-i",
        outputFile=outputFile, outputArgumentOption="-o",
        parentJobLs=[self.mapDirJob, refineGenotypeJob],
        extraDependentInputLs=None, extraOutputLs=None,
        frontArgumentList=None, extraArguments=None, extraArgumentList=None,
        transferOutput=False, sshDBTunnel=None,
        key2ObjectForJob=None, objectWithDBArguments=None,
        no_of_cpus=None, job_max_memory=self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=2000,
            minJobPropertyValue=1000, maxJobPropertyValue=3000).value,
        walltime=self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
            minJobPropertyValue=60, maxJobPropertyValue=500).value,
        max_walltime=None)

    # a CheckGenotypeConcordanceAmongReplicates.py job
    trioCallerReplicateConcordanceFile = File(os.path.join(self.statDirJob.folder,
        '%s.trioCaller.concordance.tsv'%(intervalFileBasenamePrefix)))
    # NOTE(review): baseJobPropertyValue (6000) is below minJobPropertyValue
    # (9000) in the memory scaling below, so the minimum dominates — confirm intended.
    returnData.trioCallerReplicateConcordanceJob = self.addGATKJob(executable=self.CalculateConcordanceJava,
        GenomeAnalysisTKJar=self.GenomeAnalysisTKJar,
        GATKAnalysisType="CalculateConcordanceAmongReplicates",
        inputFile=refineGenotypeJob.output, inputArgumentOption="--variant",
        refFastaFList=self.registerReferenceData.refFastaFList,
        interval=None,
        outputFile=trioCallerReplicateConcordanceFile, outputArgumentOption="--concordanceStatFname",
        frontArgumentList=None, extraArguments="--replicateIndividualTag %s"%(self.replicateIndividualTag),
        extraArgumentList=None, extraOutputLs=None,
        parentJobLs=[self.statDirJob, refineGenotypeJob],
        transferOutput=False,
        no_of_cpus=None,
        job_max_memory=self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=6000,
            minJobPropertyValue=9000, maxJobPropertyValue=16000).value,
        walltime=self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
            minJobPropertyValue=60, maxJobPropertyValue=1200).value)

    #2013.06.14
    #merge replicates to generate consensus call
    # (not haplotype-based, as different recombination points across replicate haplotypes make it non-trivial )
    mergeReplicateOutputF = File(os.path.join(self.mapDirJob.folder,
        '%s.replicatesMerged.vcf'%(intervalFileBasenamePrefix)))
    returnData.mergeVCFReplicateColumnsJob = self.addMergeVCFReplicateGenotypeColumnsJob(
        executable=self.MergeVCFReplicateHaplotypesJava,
        GenomeAnalysisTKJar=self.GenomeAnalysisTKJar,
        inputF=addInfoDescJob.output, outputF=mergeReplicateOutputF,
        replicateIndividualTag=self.replicateIndividualTag,
        refFastaFList=self.registerReferenceData.refFastaFList,
        parentJobLs=[self.mapDirJob,
            addInfoDescJob],
        extraDependentInputLs=[], transferOutput=False,
        extraArguments=None,
        analysis_type='MergeVCFReplicateGenotypeColumns',
        job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=4000,
            minJobPropertyValue=5000, maxJobPropertyValue=9000).value,
        walltime= self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
            minJobPropertyValue=60, maxJobPropertyValue=1200).value)
    return returnData
def addRefineGenotypeJobsViaBeagle(self, inputFile=None, vcfBaseFname=None, outputDirJob=None, statDirJob=None,
        refFastaFList=None, intervalData=None,
        baseInputVolume=450*2000000, realInputVolume=None,
        parentJobLs=None,
        transferOutput=False,
        no_of_cpus=None, job_max_memory=2000, walltime=180,
        max_walltime=None, **keywords):
    """
    Add Beagle-based genotype-refinement jobs for one input VCF:
    pedigree output (created once and cached on self.outputPedigreeJob),
    Beagle phasing, tabix indexing of the Beagle output, and a GATK
    CombineBeagleAndPreBeagleVariants job that restores PL fields from the
    pre-Beagle VCF.

    :param inputFile: the pre-Beagle VCF File object to refine.
    :param vcfBaseFname: basename prefix used for all output file names.
    :param outputDirJob: mkdir-style job whose folder receives the outputs.
    :param statDirJob: accepted for interface symmetry; not used in this body.
    :param realInputVolume: input-size estimate used to scale per-job
        walltime/memory against baseInputVolume.
    :param parentJobLs: upstream jobs producing inputFile; must be a list
        (it is concatenated below, so None would raise TypeError).
    :returns: PassingData with .beagleJob and .refineGenotypeJob (the final
        combine job, with .intervalData attached for downstream use).
    """
    returnData = PassingData()
    # Create the pedigree-output job only once per workflow and cache it on
    # self, so later calls reuse the same job.
    if not hasattr(self, "outputPedigreeJob"):
        #output pedigree, with no replicating certain individuals, no trio/duo splitting
        #plink format
        #for Beagle to read in
        pedigreeFileFormat = 4
        inputFileBasenamePrefix = utils.getFileBasenamePrefixFromPath(inputFile.name)
        pedFile = File(os.path.join(outputDirJob.output, 'pedigree.%s.format%s.txt'%\
            (inputFileBasenamePrefix, pedigreeFileFormat)))
        #sampleID2FamilyCountF = File(os.path.join(self.auxDirJob.output, 'pedigree.sampleID2FamilyCount.%s.format%s.txt'%\
        #	(inputFileBasenamePrefix, pedigreeFileFormat)))
        self.outputPedigreeJob = self.addOutputVRCPedigreeInTFAMGivenOrderFromFileJob(executable=self.OutputVRCPedigreeInTFAMGivenOrderFromFile,
            inputFile=inputFile, outputFile=pedFile,
            sampleID2FamilyCountF=None,
            polymuttDatFile = None,
            outputFileFormat=pedigreeFileFormat,
            replicateIndividualTag=self.replicateIndividualTag,
            treatEveryOneIndependent=self.treatEveryOneIndependent,
            parentJobLs=parentJobLs + [outputDirJob],
            extraDependentInputLs=None, transferOutput=True,
            extraArguments=None, job_max_memory=2000, sshDBTunnel=self.needSSHDBTunnel)

    ##### Part 2 run Beagle on everyone with reference panel
    # run Beagle (no reference panel wired in; see commented line below)
    #refPanelFile=selectDistantMembersVariantsJob.output,\
    outputFnamePrefix = os.path.join(outputDirJob.folder, '%s.beagled'%(vcfBaseFname))
    beagleJob = self.addBeagle4Job(executable=self.BeagleJava,
        inputFile=inputFile, refPanelFile=None,
        pedFile=self.outputPedigreeJob.output,
        outputFnamePrefix=outputFnamePrefix,
        burninIterations=7, phaseIterations=10,
        noOfSamplingHaplotypesPerSample=4, duoscale=2, trioscale=2,
        extraArguments=None, extraArgumentList=None,
        parentJobLs=[outputDirJob,
            self.outputPedigreeJob] + parentJobLs,
        transferOutput=False, no_of_cpus=None,
        job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=4000,
            minJobPropertyValue=4000, maxJobPropertyValue=13000).value,
        walltime= self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
            minJobPropertyValue=60, maxJobPropertyValue=1200).value,
        )
    returnData.beagleJob = beagleJob

    #index .vcf.gz, output of beagle, without index, GATK can't work on gzipped vcf
    tabixIndexFile = File('%s.tbi'%(beagleJob.output.name))
    # extraOutputLs re-lists beagleJob.output so tabixJob.outputLs carries both
    # the VCF and its .tbi for the combine job's extraDependentInputLs below.
    tabixJob = self.addGenericJob(executable=self.tabix,
        inputFile=beagleJob.output, inputArgumentOption="",
        outputFile=None, outputArgumentOption="-o",
        extraDependentInputLs=None,
        extraOutputLs=[beagleJob.output, tabixIndexFile], transferOutput=False,
        frontArgumentList=["-p vcf"],
        extraArguments=None,
        extraArgumentList=None,
        parentJobLs=[beagleJob, outputDirJob],
        no_of_cpus=None,
        job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=4000,
            minJobPropertyValue=2000, maxJobPropertyValue=4000).value,
        walltime= self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
            minJobPropertyValue=60, maxJobPropertyValue=180).value)

    # borrow PL from the pre-Beagle VCF so the refined genotypes keep likelihoods
    outputFile = File(os.path.join(outputDirJob.folder, '%s.beagled.withPL.vcf'%(vcfBaseFname)))
    combineBeagleAndPreBeagleVariantsJob = self.addGATKJob(executable=self.CombineBeagleAndPreBeagleVariantsJava,
        GenomeAnalysisTKJar=self.GenomeAnalysisTKJar,
        GATKAnalysisType="CombineBeagleAndPreBeagleVariants",
        inputFile=None, inputArgumentOption=None,
        refFastaFList=refFastaFList,
        inputFileList=None, argumentForEachFileInInputFileList="--variant",
        interval=None, outputFile=outputFile, outputArgumentOption="--out",
        frontArgumentList=None, extraArguments=None,
        extraArgumentList=["--variant:first", beagleJob.output, "--variant:second", inputFile,
            "-genotypeMergeOptions PRIORITIZE", "-priority first,second"],
        extraOutputLs=None,
        extraDependentInputLs=[inputFile] + tabixJob.outputLs,
        parentJobLs=[beagleJob, tabixJob]+ parentJobLs, transferOutput=False,
        no_of_cpus=None,
        key2ObjectForJob=None,
        job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=4000,
            minJobPropertyValue=2000, maxJobPropertyValue=4000).value,
        walltime= self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
            minJobPropertyValue=60, maxJobPropertyValue=600).value)
    #do not use "--variant:beagle" to name your vcf file as GATK would think it's in Beagle format
    returnData.refineGenotypeJob = combineBeagleAndPreBeagleVariantsJob	#the final genotype job
    returnData.refineGenotypeJob.intervalData = intervalData	#attached so that it could be used by downstream jobs
    return returnData