def mapEachInterval(self, workflow=None, intervalData=None, chromosome=None,
        VCFJobData=None, passingData=None, mapEachChromosomeData=None,
        transferOutput=False, **keywords):
    """
    Add the genotype-refinement jobs for one genomic interval to the workflow DAG.

    Pipeline built here (each job's output feeds the next):
      Part 1: select high-coverage samples from the interval VCF, phase them with
        Beagle, tabix-index the result, then subset distant members — producing a
        reference panel (the panel is currently NOT passed to the Part-2 Beagle
        job; see the commented-out refPanelFile line below).
      Part 2: Beagle-phase all samples, tabix-index, merge PL fields back from the
        pre-Beagle VCF via a GATK walker, replicate individuals that appear in
        multiple families, run TrioCaller, patch missing INFO header descriptions,
        then add a replicate-concordance stat job and a replicate-merging job.

    :param workflow: defaults to self when None.
    :param intervalData: unused here except via passingData (see the disabled
        block below); kept for interface compatibility with sibling map* methods.
    :param chromosome: falls back to passingData.chromosome when None.
    :param VCFJobData: carries .file (interval VCF), .jobLs (producer jobs),
        .tbi_F (tabix index) — project PassingData convention.
    :param passingData: carries topOutputDirJob, intervalFileBasenamePrefix,
        span, noOfIndividuals.
    :returns: PassingData with attributes jobDataLs, beagleJob, refineGenotypeJob,
        trioCallerReplicateConcordanceJob, mergeVCFReplicateColumnsJob.

    2013.04.30
    """
    if workflow is None:
        workflow = self
    returnData = PassingData(no_of_jobs = 0)
    returnData.jobDataLs = []

    topOutputDirJob = passingData.topOutputDirJob

    intervalFileBasenamePrefix = passingData.intervalFileBasenamePrefix
    span = passingData.span
    noOfIndividuals= passingData.noOfIndividuals
    SNPVCFFile = VCFJobData.file
    SNPVCFJobLs = VCFJobData.jobLs
    # Disabled 2013.06.19 (kept as a string literal for history):
    """
    ### 2013.06.19 intervalData does not exsit for input that is entirely composed of VCF files (SplitVCFFile job does not return intervals)
    if intervalData.file:
        mpileupInterval = intervalData.interval
        bcftoolsInterval = intervalData.file
    else:
        mpileupInterval = intervalData.interval
        bcftoolsInterval = intervalData.interval
    intervalFileBasenameSignature = intervalData.intervalFileBasenameSignature
    overlapInterval = intervalData.overlapInterval
    overlapFileBasenameSignature = intervalData.overlapIntervalFnameSignature
    span = intervalData.span
    """
    if chromosome is None:
        chromosome = getattr(passingData, 'chromosome', None)

    #noOfIndividuals
    # scale each job's walltime/memory by input volume (samples x sites)
    realInputVolume = noOfIndividuals * span
    baseInputVolume = 600*2000	#600 individuals at 2000 sites
    # NOTE(review): these two locals appear unused — every job below recomputes
    # its own walltime/job_max_memory; confirm before removing.
    walltime = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
        baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
        minJobPropertyValue=60, maxJobPropertyValue=1200).value
    job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
        baseInputVolume=baseInputVolume, baseJobPropertyValue=4000,
        minJobPropertyValue=4000, maxJobPropertyValue=10000).value

    #splitVCFJob = passingData.mapEachVCFData.splitVCFJob

    #### Part 1 generate high-quality reference panel through Beagle on high-coverage individuals
    # extractRefPanelSampleIDJob outputs sample IDs with replicate tags
    # select the high-coverage members
    outputVCF = File(os.path.join(self.highCoveragePanelDirJob.output,
        '%s.minCoverageForRefPanel%s.vcf'%(intervalFileBasenamePrefix, self.minCoverageForRefPanel)))
    #selectVariants would re-generate AC, AF so that TrioCaller could read it.
    #samtools uses 'AC1' instead of AC, 'AF1' instead of AF.
    selectHighCoverageSampleJob = self.addSelectVariantsJob(SelectVariantsJava=self.SelectVariantsJava,
        inputF=VCFJobData.file, outputF=outputVCF,
        refFastaFList=self.registerReferenceData.refFastaFList,
        sampleIDKeepFile=self.extractRefPanelSampleIDJob.output,
        parentJobLs=[self.highCoveragePanelDirJob, self.extractRefPanelSampleIDJob] + VCFJobData.jobLs,
        extraDependentInputLs=[VCFJobData.tbi_F], transferOutput=False,
        extraArguments=None,
        job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=4000,
            minJobPropertyValue=4000, maxJobPropertyValue=9000).value,
        walltime= self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
            minJobPropertyValue=60, maxJobPropertyValue=1200).value)

    # run Beagle on the high-coverage subset (pedigree-aware phasing)
    outputFnamePrefix = os.path.join(self.highCoveragePanelDirJob.folder, "%s.minCoverage%s.beagled"%\
        (intervalFileBasenamePrefix, self.minCoverageForRefPanel))
    beagleOnHighCoverageJob = self.addBeagle4Job(executable=self.BeagleOnHCMOnkeys,
        inputFile=selectHighCoverageSampleJob.output, refPanelFile=None,
        pedFile = self.outputPedigreeOfHghCoverageSamplesJob.output,
        outputFnamePrefix=outputFnamePrefix,
        burninIterations=7, phaseIterations=10,
        noOfSamplingHaplotypesPerSample=4,
        parentJobLs=[self.highCoveragePanelDirJob, selectHighCoverageSampleJob,
            self.outputPedigreeOfHghCoverageSamplesJob],
        transferOutput=False,
        extraArguments=None, extraArgumentList=None,
        extraOutputLs=None, extraDependentInputLs=None,
        no_of_cpus=None,
        job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=4000,
            minJobPropertyValue=4000, maxJobPropertyValue=13000).value,
        walltime= self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
            minJobPropertyValue=60, maxJobPropertyValue=1200).value)

    #index .vcf.gz, output of beagle, without index, GATK can't work on gzipped vcf
    tabixIndexFile = File('%s.tbi'%(beagleOnHighCoverageJob.output.name))
    tabixOnHighCoverageVCFJob = self.addGenericJob(executable=self.tabix,
        inputFile=beagleOnHighCoverageJob.output, inputArgumentOption="",
        outputFile=None, outputArgumentOption="-o",
        extraDependentInputLs=None,
        extraOutputLs=[tabixIndexFile], transferOutput=False, frontArgumentList=["-p vcf"],
        extraArguments=None,
        extraArgumentList=[],
        parentJobLs=[beagleOnHighCoverageJob, self.highCoveragePanelDirJob],
        no_of_cpus=None,
        job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=4000,
            minJobPropertyValue=2000, maxJobPropertyValue=5000).value,
        walltime= self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
            minJobPropertyValue=60, maxJobPropertyValue=600).value)

    # select the high-coverage members (distant members -> the reference panel)
    outputVCF = File(os.path.join(self.highCoveragePanelDirJob.output,
        '%s.minCoverage%s.maxPairwiseKinship%s.refPanel.beagled.vcf'%\
        (intervalFileBasenamePrefix, self.minCoverageForRefPanel, self.maxPairwiseKinship)))
    #selectVariants would re-generate AC, AF so that TrioCaller could read it.
    #samtools uses 'AC1' instead of AC, 'AF1' instead of AF.
    selectDistantMembersVariantsJob = self.addSelectVariantsJob(SelectVariantsJava=self.SelectVariantsJava,
        inputF=beagleOnHighCoverageJob.output, outputF=outputVCF,
        refFastaFList=self.registerReferenceData.refFastaFList,
        sampleIDKeepFile=self.selectDistantMembersFromGenotypeFileJob.output,
        parentJobLs=[self.highCoveragePanelDirJob, beagleOnHighCoverageJob,
            self.selectDistantMembersFromGenotypeFileJob,
            tabixOnHighCoverageVCFJob],
        extraDependentInputLs=[tabixOnHighCoverageVCFJob.output], transferOutput=False,
        extraArguments=None,
        job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=4000,
            minJobPropertyValue=4000, maxJobPropertyValue=7000).value,
        walltime= self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
            minJobPropertyValue=60, maxJobPropertyValue=1200).value)

    ##### Part 2 run Beagle on everyone with reference panel
    # run Beagle
    # NOTE(review): refPanelFile is None — the Part-1 panel
    # (selectDistantMembersVariantsJob.output, see commented line) is not wired
    # in; confirm whether this is intentional.
    #refPanelFile=selectDistantMembersVariantsJob.output,\
    outputFnamePrefix = os.path.join(self.mapDirJob.folder, '%s.beagled'%(intervalFileBasenamePrefix))
    beagleJob = self.addBeagle4Job(executable=self.BeagleJava,
        inputFile=VCFJobData.file, refPanelFile=None,
        pedFile=self.outputPedigreeJob.output,
        outputFnamePrefix=outputFnamePrefix,
        burninIterations=7, phaseIterations=10,
        noOfSamplingHaplotypesPerSample=4, duoscale=2, trioscale=2,
        extraArguments=None, extraArgumentList=None,
        parentJobLs=[self.mapDirJob,
            self.outputPedigreeJob] + VCFJobData.jobLs,
        transferOutput=False, no_of_cpus=None,
        job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=4000,
            minJobPropertyValue=4000, maxJobPropertyValue=13000).value,
        walltime= self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
            minJobPropertyValue=60, maxJobPropertyValue=1200).value,
        )
    returnData.beagleJob = beagleJob

    #index .vcf.gz, output of beagle, without index, GATK can't work on gzipped vcf
    tabixIndexFile = File('%s.tbi'%(beagleJob.output.name))
    tabixJob = self.addGenericJob(executable=self.tabix,
        inputFile=beagleJob.output, inputArgumentOption="",
        outputFile=None, outputArgumentOption="-o",
        extraDependentInputLs=None,
        extraOutputLs=[tabixIndexFile], transferOutput=False, frontArgumentList=["-p vcf"],
        extraArguments=None,
        extraArgumentList=[],
        parentJobLs=[beagleJob, self.mapDirJob],
        no_of_cpus=None,
        job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=4000,
            minJobPropertyValue=2000, maxJobPropertyValue=4000).value,
        walltime= self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
            minJobPropertyValue=60, maxJobPropertyValue=600).value)

    # borrow PL from the pre-Beagle VCF so downstream callers see genotype likelihoods
    outputFile = File(os.path.join(self.mapDirJob.folder, '%s.beagled.withPL.vcf'%(intervalFileBasenamePrefix)))
    combineBeagleAndPreBeagleVariantsJob = self.addGATKJob(executable=self.CombineBeagleAndPreBeagleVariantsJava,
        GenomeAnalysisTKJar=self.GenomeAnalysisTKJar,
        GATKAnalysisType="CombineBeagleAndPreBeagleVariants",
        inputFile=None, inputArgumentOption=None,
        refFastaFList=self.registerReferenceData.refFastaFList,
        inputFileList=None, argumentForEachFileInInputFileList="--variant",
        interval=None, outputFile=outputFile, outputArgumentOption="--out",
        frontArgumentList=None, extraArguments=None,
        extraArgumentList=["--variant:first", beagleJob.output, "--variant:second", VCFJobData.file,
            "-genotypeMergeOptions PRIORITIZE", "-priority first,second"],
        extraOutputLs=None,
        extraDependentInputLs=[beagleJob.output, VCFJobData.file] + tabixJob.outputLs,
        parentJobLs=[beagleJob, tabixJob]+ VCFJobData.jobLs, transferOutput=False,
        no_of_cpus=None,
        key2ObjectForJob=None,
        job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=4000,
            minJobPropertyValue=2000, maxJobPropertyValue=4000).value,
        walltime= self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
            minJobPropertyValue=60, maxJobPropertyValue=600).value)
    #do not use "--variant:beagle" to name your vcf file as GATK would think it's in Beagle format

    #TrioCaller
    # 2013.06.11 replicate individuals who appear in more than 1 families
    round1_IndividualsReplicatedVCF = File( os.path.join(self.mapDirJob.folder,
        '%s.replicate.vcf'%(intervalFileBasenamePrefix)))
    replicateVCFGenotypeColumnsJob = self.addReplicateVCFGenotypeColumnsJob(
        executable=self.ReplicateVCFGenotypeColumns,
        inputF=combineBeagleAndPreBeagleVariantsJob.output,
        sampleID2FamilyCountF=self.outputReplicatePedigreeJob.sampleID2FamilyCountF,
        outputF=round1_IndividualsReplicatedVCF,
        replicateIndividualTag=self.replicateIndividualTag,
        parentJobLs=[self.outputReplicatePedigreeJob, self.mapDirJob, combineBeagleAndPreBeagleVariantsJob],
        extraDependentInputLs=None,
        transferOutput=False,
        extraArguments=None,
        job_max_memory=self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=4000,
            minJobPropertyValue=4000, maxJobPropertyValue=9000).value,
        walltime= self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
            minJobPropertyValue=60, maxJobPropertyValue=1200).value,
        )

    # TrioCaller on the replicated, Beagle-phased VCF (inputPhased=True)
    refineGenotypeOutputF = File(os.path.join(self.mapDirJob.folder,
        '%s.trioCaller.vcf'%(intervalFileBasenamePrefix)))
    refineGenotypeJob = self.addTrioCallerJob(trioCallerWrapper=self.trioCallerWrapper,
        trioCallerPath=self.trioCallerPath,
        inputVCF=replicateVCFGenotypeColumnsJob.output,
        pedFile=self.outputReplicatePedigreeJob.output, outputVCF=refineGenotypeOutputF,
        inputPhased=True,
        parentJobLs=[self.mapDirJob, replicateVCFGenotypeColumnsJob, self.outputReplicatePedigreeJob],
        extraDependentInputLs=[], transferOutput=False,
        extraArguments=None,
        job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=4000,
            minJobPropertyValue=4000, maxJobPropertyValue=9000).value,
        walltime= self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
            minJobPropertyValue=60, maxJobPropertyValue=1200).value)	#1.2G memory for 12K loci
    returnData.refineGenotypeJob = refineGenotypeJob

    """
    2013.07.10 the TrioCaller VCF has some info tags that are not described in VCF header
    """
    outputFile = File(os.path.join(self.mapDirJob.folder,
        '%s.extraInfoDesc.vcf'%(intervalFileBasenamePrefix)))
    addInfoDescJob = self.addGenericJob(executable=self.AddMissingInfoDescriptionToVCFHeader,
        inputFile=refineGenotypeJob.output,
        inputArgumentOption="-i",
        outputFile=outputFile, outputArgumentOption="-o",
        parentJobLs=[self.mapDirJob, refineGenotypeJob],
        extraDependentInputLs=None, extraOutputLs=None,
        frontArgumentList=None, extraArguments=None, extraArgumentList=None,
        transferOutput=False, sshDBTunnel=None,
        key2ObjectForJob=None, objectWithDBArguments=None,
        no_of_cpus=None, job_max_memory=self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=2000,
            minJobPropertyValue=1000, maxJobPropertyValue=3000).value,
        walltime=self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
            minJobPropertyValue=60, maxJobPropertyValue=500).value,
        max_walltime=None)

    # a CheckGenotypeConcordanceAmongReplicates.py job
    trioCallerReplicateConcordanceFile = File(os.path.join(self.statDirJob.folder,
        '%s.trioCaller.concordance.tsv'%(intervalFileBasenamePrefix)))
    # NOTE(review): baseJobPropertyValue (6000) is below minJobPropertyValue
    # (9000) in the memory scaling below, so the minimum dominates — confirm intended.
    returnData.trioCallerReplicateConcordanceJob = self.addGATKJob(executable=self.CalculateConcordanceJava,
        GenomeAnalysisTKJar=self.GenomeAnalysisTKJar,
        GATKAnalysisType="CalculateConcordanceAmongReplicates",
        inputFile=refineGenotypeJob.output, inputArgumentOption="--variant",
        refFastaFList=self.registerReferenceData.refFastaFList,
        interval=None,
        outputFile=trioCallerReplicateConcordanceFile, outputArgumentOption="--concordanceStatFname",
        frontArgumentList=None, extraArguments="--replicateIndividualTag %s"%(self.replicateIndividualTag),
        extraArgumentList=None, extraOutputLs=None,
        parentJobLs=[self.statDirJob, refineGenotypeJob],
        transferOutput=False,
        no_of_cpus=None,
        job_max_memory=self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=6000,
            minJobPropertyValue=9000, maxJobPropertyValue=16000).value,
        walltime=self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
            minJobPropertyValue=60, maxJobPropertyValue=1200).value)

    #2013.06.14
    #merge replicates to generate consensus call
    # (not haplotype-based, as different recombination points across replicate haplotypes make it non-trivial )
    mergeReplicateOutputF = File(os.path.join(self.mapDirJob.folder,
        '%s.replicatesMerged.vcf'%(intervalFileBasenamePrefix)))
    returnData.mergeVCFReplicateColumnsJob = self.addMergeVCFReplicateGenotypeColumnsJob(
        executable=self.MergeVCFReplicateHaplotypesJava,
        GenomeAnalysisTKJar=self.GenomeAnalysisTKJar,
        inputF=addInfoDescJob.output, outputF=mergeReplicateOutputF,
        replicateIndividualTag=self.replicateIndividualTag,
        refFastaFList=self.registerReferenceData.refFastaFList,
        parentJobLs=[self.mapDirJob,
            addInfoDescJob],
        extraDependentInputLs=[], transferOutput=False,
        extraArguments=None,
        analysis_type='MergeVCFReplicateGenotypeColumns',
        job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=4000,
            minJobPropertyValue=5000, maxJobPropertyValue=9000).value,
        walltime= self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
            minJobPropertyValue=60, maxJobPropertyValue=1200).value)
    return returnData
def addRefineGenotypeJobsViaBeagle(self, inputFile=None, vcfBaseFname=None, outputDirJob=None, statDirJob=None,
        refFastaFList=None, intervalData=None,
        baseInputVolume=450*2000000, realInputVolume=None,
        parentJobLs=None,
        transferOutput=False,
        no_of_cpus=None, job_max_memory=2000, walltime=180,
        max_walltime=None, **keywords):
    """
    Add Beagle-based genotype-refinement jobs for one input VCF:
    pedigree output (created once and cached on self.outputPedigreeJob),
    Beagle phasing, tabix indexing of the Beagle output, and a GATK
    CombineBeagleAndPreBeagleVariants job that restores PL fields from the
    pre-Beagle VCF.

    :param inputFile: the pre-Beagle VCF File object to refine.
    :param vcfBaseFname: basename prefix used for all output file names.
    :param outputDirJob: mkdir-style job whose folder receives the outputs.
    :param statDirJob: accepted for interface symmetry; not used in this body.
    :param realInputVolume: input-size estimate used to scale per-job
        walltime/memory against baseInputVolume.
    :param parentJobLs: upstream jobs producing inputFile; must be a list
        (it is concatenated below, so None would raise TypeError).
    :returns: PassingData with .beagleJob and .refineGenotypeJob (the final
        combine job, with .intervalData attached for downstream use).
    """
    returnData = PassingData()
    # Create the pedigree-output job only once per workflow and cache it on
    # self, so later calls reuse the same job.
    if not hasattr(self, "outputPedigreeJob"):
        #output pedigree, with no replicating certain individuals, no trio/duo splitting
        #plink format
        #for Beagle to read in
        pedigreeFileFormat = 4
        inputFileBasenamePrefix = utils.getFileBasenamePrefixFromPath(inputFile.name)
        pedFile = File(os.path.join(outputDirJob.output, 'pedigree.%s.format%s.txt'%\
            (inputFileBasenamePrefix, pedigreeFileFormat)))
        #sampleID2FamilyCountF = File(os.path.join(self.auxDirJob.output, 'pedigree.sampleID2FamilyCount.%s.format%s.txt'%\
        #	(inputFileBasenamePrefix, pedigreeFileFormat)))
        self.outputPedigreeJob = self.addOutputVRCPedigreeInTFAMGivenOrderFromFileJob(executable=self.OutputVRCPedigreeInTFAMGivenOrderFromFile,
            inputFile=inputFile, outputFile=pedFile,
            sampleID2FamilyCountF=None,
            polymuttDatFile = None,
            outputFileFormat=pedigreeFileFormat,
            replicateIndividualTag=self.replicateIndividualTag,
            treatEveryOneIndependent=self.treatEveryOneIndependent,
            parentJobLs=parentJobLs + [outputDirJob],
            extraDependentInputLs=None, transferOutput=True,
            extraArguments=None, job_max_memory=2000, sshDBTunnel=self.needSSHDBTunnel)

    ##### Part 2 run Beagle on everyone with reference panel
    # run Beagle (no reference panel wired in; see commented line below)
    #refPanelFile=selectDistantMembersVariantsJob.output,\
    outputFnamePrefix = os.path.join(outputDirJob.folder, '%s.beagled'%(vcfBaseFname))
    beagleJob = self.addBeagle4Job(executable=self.BeagleJava,
        inputFile=inputFile, refPanelFile=None,
        pedFile=self.outputPedigreeJob.output,
        outputFnamePrefix=outputFnamePrefix,
        burninIterations=7, phaseIterations=10,
        noOfSamplingHaplotypesPerSample=4, duoscale=2, trioscale=2,
        extraArguments=None, extraArgumentList=None,
        parentJobLs=[outputDirJob,
            self.outputPedigreeJob] + parentJobLs,
        transferOutput=False, no_of_cpus=None,
        job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=4000,
            minJobPropertyValue=4000, maxJobPropertyValue=13000).value,
        walltime= self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
            minJobPropertyValue=60, maxJobPropertyValue=1200).value,
        )
    returnData.beagleJob = beagleJob

    #index .vcf.gz, output of beagle, without index, GATK can't work on gzipped vcf
    tabixIndexFile = File('%s.tbi'%(beagleJob.output.name))
    # extraOutputLs re-lists beagleJob.output so tabixJob.outputLs carries both
    # the VCF and its .tbi for the combine job's extraDependentInputLs below.
    tabixJob = self.addGenericJob(executable=self.tabix,
        inputFile=beagleJob.output, inputArgumentOption="",
        outputFile=None, outputArgumentOption="-o",
        extraDependentInputLs=None,
        extraOutputLs=[beagleJob.output, tabixIndexFile], transferOutput=False,
        frontArgumentList=["-p vcf"],
        extraArguments=None,
        extraArgumentList=None,
        parentJobLs=[beagleJob, outputDirJob],
        no_of_cpus=None,
        job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=4000,
            minJobPropertyValue=2000, maxJobPropertyValue=4000).value,
        walltime= self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
            minJobPropertyValue=60, maxJobPropertyValue=180).value)

    # borrow PL from the pre-Beagle VCF so the refined genotypes keep likelihoods
    outputFile = File(os.path.join(outputDirJob.folder, '%s.beagled.withPL.vcf'%(vcfBaseFname)))
    combineBeagleAndPreBeagleVariantsJob = self.addGATKJob(executable=self.CombineBeagleAndPreBeagleVariantsJava,
        GenomeAnalysisTKJar=self.GenomeAnalysisTKJar,
        GATKAnalysisType="CombineBeagleAndPreBeagleVariants",
        inputFile=None, inputArgumentOption=None,
        refFastaFList=refFastaFList,
        inputFileList=None, argumentForEachFileInInputFileList="--variant",
        interval=None, outputFile=outputFile, outputArgumentOption="--out",
        frontArgumentList=None, extraArguments=None,
        extraArgumentList=["--variant:first", beagleJob.output, "--variant:second", inputFile,
            "-genotypeMergeOptions PRIORITIZE", "-priority first,second"],
        extraOutputLs=None,
        extraDependentInputLs=[inputFile] + tabixJob.outputLs,
        parentJobLs=[beagleJob, tabixJob]+ parentJobLs, transferOutput=False,
        no_of_cpus=None,
        key2ObjectForJob=None,
        job_max_memory = self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=4000,
            minJobPropertyValue=2000, maxJobPropertyValue=4000).value,
        walltime= self.scaleJobWalltimeOrMemoryBasedOnInput(realInputVolume=realInputVolume,
            baseInputVolume=baseInputVolume, baseJobPropertyValue=60,
            minJobPropertyValue=60, maxJobPropertyValue=600).value)
    #do not use "--variant:beagle" to name your vcf file as GATK would think it's in Beagle format
    returnData.refineGenotypeJob = combineBeagleAndPreBeagleVariantsJob	#the final genotype job
    returnData.refineGenotypeJob.intervalData = intervalData	#attached so that it could be used by downstream jobs
    return returnData