Code example #1
    def __init__(self, **keywords):
        """
        2011-7-11
        Constructor: resolve inputDir to an absolute path and expand smartpca_path via insertHomePath().
        """
        AbstractVervetWorkflow.__init__(self, **keywords)
        self.inputDir = os.path.abspath(self.inputDir)
        self.smartpca_path = self.insertHomePath(self.smartpca_path, self.home_path)
Code example #2
	def __init__(self, **keywords):
		"""
		2011-7-11
		Constructor: resolve inputDir to an absolute path.
		"""
		AbstractVervetWorkflow.__init__(self, **keywords)
		
		self.inputDir = os.path.abspath(self.inputDir)
Code example #3
    def __init__(self, **keywords):
        """
        2011-7-11
        Constructor: parse ind_seq_id_ls from its string form into a list of integers via getListOutOfStr().
        """
        AbstractVervetWorkflow.__init__(self, **keywords)

        if self.ind_seq_id_ls:
            self.ind_seq_id_ls = getListOutOfStr(self.ind_seq_id_ls, data_type=int)
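For context, getListOutOfStr() comes from the author's pymodule utilities and is not shown above. A minimal stand-in, assuming it only splits a comma-delimited option string into a typed list (the real helper may also expand ranges such as "1-5"), could look like this:

def getListOutOfStr(list_in_str, data_type=int, separator=','):
    """Hypothetical stand-in: turn "231,232,240" into [231, 232, 240]."""
    if not list_in_str:
        return []
    return [data_type(token) for token in str(list_in_str).split(separator) if token.strip()]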
Code example #4
	def __init__(self, **keywords):
		"""
		2011-8-3
		Constructor: map each integer data-source code (1-5) to the method that adds its processing jobs.
		"""
		AbstractVervetWorkflow.__init__(self, **keywords)
		self.addJobsDict = {1: self.addJobsToProcessWUSTLData,
						2: self.addJobsToProcessMcGillData,
						3: self.addJobsToProcessSouthAfricanRNAData,
						4: self.addJobsToProcessSouthAfricanDNAData,
						5: self.addJobsToProcessUNGCVervetData}
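addJobsDict above is a dispatch table keyed by an integer data-source code. A self-contained sketch of how such a table is typically consumed (the class, handler bodies, and run() below are hypothetical, not taken from the original workflow):

class DispatchSketch(object):
    """Minimal illustration of the dispatch-table pattern; only two handlers are shown."""
    def __init__(self):
        self.addJobsDict = {1: self.addJobsToProcessWUSTLData,
                            2: self.addJobsToProcessMcGillData}

    def addJobsToProcessWUSTLData(self):
        return "WUSTL jobs added"

    def addJobsToProcessMcGillData(self):
        return "McGill jobs added"

    def run(self, dataSourceCode):
        # Look the handler up by its integer code and call it; fail loudly on unknown codes.
        try:
            return self.addJobsDict[dataSourceCode]()
        except KeyError:
            raise ValueError("unsupported data source code: %s" % dataSourceCode)

# DispatchSketch().run(1)  ->  "WUSTL jobs added"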
Code example #5
    def registerCustomExecutables(self, workflow=None):
        """
        2012.3.14
        Register CountFastqReadBaseCount.py and PutReadBaseCountIntoDB.py as workflow executables.
        """
        if workflow is None:
            workflow = self
        AbstractVervetWorkflow.registerCustomExecutables(self, workflow=workflow)

        self.addOneExecutableFromPathAndAssignProperClusterSize(
            path=os.path.join(self.vervetSrcPath, "mapper/CountFastqReadBaseCount.py"),
            name="CountFastqReadBaseCount",
            clusterSizeMultipler=1,
        )

        self.addOneExecutableFromPathAndAssignProperClusterSize(
            path=os.path.join(self.vervetSrcPath, "db/input/PutReadBaseCountIntoDB.py"),
            name="PutReadBaseCountIntoDB",
            clusterSizeMultipler=0.2,
        )
Code example #6
	def registerCustomExecutables(self, workflow=None):
		"""
		2011-11-28
		Register AggregateAndHClusterDistanceMatrix.py as a Pegasus executable, with no job clustering (multiplier 0).
		"""
		if workflow is None:
			workflow = self
		AbstractVervetWorkflow.registerCustomExecutables(self, workflow)
		
		namespace = workflow.namespace
		version = workflow.version
		operatingSystem = workflow.operatingSystem
		architecture = workflow.architecture
		clusters_size = workflow.clusters_size
		site_handler = workflow.site_handler
		vervetSrcPath = self.vervetSrcPath
		
		#2012.8.7 each element is a tuple of (executable, clusterSizeMultiplier); use 0 if no clustering is needed.
		executableClusterSizeMultiplierList = []
		AggregateAndHClusterDistanceMatrix = Executable(namespace=namespace, name="AggregateAndHClusterDistanceMatrix", \
											version=version, \
											os=operatingSystem, arch=architecture, installed=True)
		AggregateAndHClusterDistanceMatrix.addPFN(PFN("file://" + os.path.join(vervetSrcPath, "reducer/AggregateAndHClusterDistanceMatrix.py"), \
													site_handler))
		executableClusterSizeMultiplierList.append((AggregateAndHClusterDistanceMatrix, 0))
		self.addExecutableAndAssignProperClusterSize(executableClusterSizeMultiplierList, defaultClustersSize=self.clusters_size)
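Code example #5 uses a newer helper (addOneExecutableFromPathAndAssignProperClusterSize) that presumably wraps the verbose registration pattern shown above. A standalone sketch of that underlying pattern, assuming the Pegasus 4.x DAX3 Python API (the path, namespace, version, and clusters.size value are placeholders):

import os
from Pegasus.DAX3 import ADAG, Executable, PFN, Profile

dax = ADAG("exampleWorkflow")
executable = Executable(namespace="workflow", name="AggregateAndHClusterDistanceMatrix",
                        version="1.0", os="linux", arch="x86_64", installed=True)
# Physical file name: where the script lives on the execution site ("local" here).
executable.addPFN(PFN("file://" + os.path.join("/path/to/vervet/src",
                                               "reducer/AggregateAndHClusterDistanceMatrix.py"), "local"))
# addExecutableAndAssignProperClusterSize() presumably attaches a pegasus clusters.size
# profile when the multiplier is non-zero; a multiplier of 0 (as above) means no clustering.
executable.addProfile(Profile("pegasus", "clusters.size", "20"))
dax.addExecutable(executable)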
Code example #7
	def preReduce(self, workflow=None, outputDirPrefix="", passingData=None, transferOutput=True, **keywords):
		"""
		2012.9.17
		"""
		parentPreReduceData = AbstractVervetWorkflow.preReduce(self, workflow=workflow, outputDirPrefix=outputDirPrefix, passingData=passingData, \
							transferOutput=transferOutput, **keywords)
		returnData = PassingData(no_of_jobs = 0)
		returnData.jobDataLs = []
		
		callOutputDir = "call"
		callOutputDirJob = self.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=callOutputDir)
		passingData.callOutputDirJob = callOutputDirJob
		
		matrixDir = "pairwiseDistMatrix"
		matrixDirJob = self.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=matrixDir)
		passingData.matrixDirJob = matrixDirJob
		
		reduceOutputDirJob = passingData.reduceOutputDirJob
		#2012.10.9 reduceOutputDirJob was added to passingData during AbstractVCFWorkflow.preReduce()
		
		#reduceOutputDir = "aggregateData"
		#reduceOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=reduceOutputDir)
		#passingData.reduceOutputDirJob = reduceOutputDirJob
		
		figureFnamePrefix = os.path.join(reduceOutputDirJob.output, 'aggregateDistanceMatrix')
		aggregateDistanceMatrixOutputF = File('%s.tsv'%(figureFnamePrefix))
		PCAFile = File('%s_PCA.tsv'%(figureFnamePrefix))
		aggregateAndHClusterDistanceMatrixJob = self.addStatMergeJob(workflow, statMergeProgram=workflow.AggregateAndHClusterDistanceMatrix, \
									outputF=aggregateDistanceMatrixOutputF, \
									parentJobLs=[reduceOutputDirJob],extraOutputLs=[PCAFile, File('%s.png'%(figureFnamePrefix)), \
																				File('%s.svg'%(figureFnamePrefix))], \
									extraDependentInputLs=[], transferOutput=True, extraArguments="-f %s"%(figureFnamePrefix))
		returnData.aggregateAndHClusterDistanceMatrixJob = aggregateAndHClusterDistanceMatrixJob
		
		#2012.9.5 add the job to append meta info (country, sex, latitude, etc. of each monkey)
		outputF = File('%s_withMetaInfo.tsv'%(figureFnamePrefix))
		appendInfo2PCAOutputJob = self.addGenericDBJob(executable=self.AppendInfo2SmartPCAOutput, inputFile=PCAFile, \
				outputFile=outputF, \
				parentJobLs=[aggregateAndHClusterDistanceMatrixJob], extraDependentInputLs=None, \
				extraOutputLs=None,\
				transferOutput=True, \
				extraArgumentList=None, extraArguments=None, sshDBTunnel=self.needSSHDBTunnel, \
				key2ObjectForJob=None, job_max_memory=2000)
		
		
		return returnData
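PassingData, used above to bundle the jobs created during preReduce(), is another pymodule utility not shown in these examples. Assuming it is essentially an attribute bag that turns keyword arguments into instance attributes, a minimal stand-in would be:

class PassingData(object):
    """Hypothetical stand-in: a flexible container whose keyword arguments become attributes."""
    def __init__(self, **keywords):
        for name, value in keywords.items():
            setattr(self, name, value)

returnData = PassingData(no_of_jobs=0)
returnData.jobDataLs = []    # attributes can also be attached after construction
returnData.jobDataLs.append(PassingData(jobLs=[], fileLs=[]))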
Code example #8
	def preReduce(self, workflow=None, outputDirPrefix="", passingData=None, transferOutput=True, **keywords):
		"""
		2013.05.01
			1. a job that outputs the pedigree from db, with members from the VCF file. used by various filter programs and TrioCaller
			2. a job that extracts the high-coverage individuals from the VCF file
			3. figure out the existence of beagle unrelated cohort, trio cohort, pair/duo cohort for high-coverage group and all individuals
				need the pedigree graph, a VCF file => all sample IDs and only high-coverage individuals
				
		"""
		returnData = AbstractVervetWorkflow.preReduce(self, workflow=workflow, outputDirPrefix=outputDirPrefix, \
													passingData=passingData, transferOutput=transferOutput, **keywords)
		
		self.statDirJob = self.addMkDirJob(outputDir="%sStat"%(outputDirPrefix))
		self.highCoveragePanelDirJob = self.addMkDirJob(outputDir="%sHighCoveragePanel"%(outputDirPrefix))
		self.auxDirJob = self.addMkDirJob(outputDir="%sAuxilliary"%(outputDirPrefix))
		
		self.beagleReduceDirJob = self.addMkDirJob(outputDir="%sReduceBeagle"%(outputDirPrefix))
		# self.reduceOutputDirJob would contain non-replicate VCF files;
		# this folder stores all the reduced VCF files with replicates among samples.
		self.replicateVCFDirJob = self.addMkDirJob(outputDir="%sReplicateVCF"%(outputDirPrefix))
		
		self.pedigreeKinshipFile = self.registerOneInputFile(inputFname=self.pedigreeKinshipFilePath, \
										folderName='aux')
		
		inputFileBasenamePrefix = utils.getFileBasenamePrefixFromPath(self.firstVCFJobData.file.name)
		# output pedigree to get pedigree file (for TrioCaller etc. that requires pedigree to be split into trios/duos) and sampleID2FamilyCountF 
		#		(for ReplicateVCFGenotypeColumns job, setting TrioCaller up)
		pedigreeFileFormat = 2
		pedFile = File(os.path.join(self.auxDirJob.output, 'pedigree.replicates.%s.format%s.txt'%\
								(inputFileBasenamePrefix, pedigreeFileFormat)))
		sampleID2FamilyCountF = File(os.path.join(self.auxDirJob.output, 'pedigree.replicates.sampleID2FamilyCount.%s.format%s.txt'%\
												(inputFileBasenamePrefix, pedigreeFileFormat)))
		self.outputReplicatePedigreeJob = self.addOutputVRCPedigreeInTFAMGivenOrderFromFileJob(executable=self.OutputVRCPedigreeInTFAMGivenOrderFromFile, \
				inputFile=self.firstVCFJobData.file, outputFile=pedFile, \
				sampleID2FamilyCountF=sampleID2FamilyCountF,\
				polymuttDatFile = None,\
				outputFileFormat=pedigreeFileFormat, \
				replicateIndividualTag=self.replicateIndividualTag,\
				treatEveryOneIndependent=self.treatEveryOneIndependent,\
				parentJobLs=self.firstVCFJobData.jobLs + [self.auxDirJob], \
				extraDependentInputLs=None, transferOutput=True, \
				extraArguments=None, job_max_memory=2000, sshDBTunnel=self.needSSHDBTunnel)
		
		#output pedigree, without replicating any individuals and without trio/duo splitting
		pedigreeFileFormat = 4
		pedFile = File(os.path.join(self.auxDirJob.output, 'pedigree.%s.format%s.txt'%\
								(inputFileBasenamePrefix, pedigreeFileFormat)))
		#sampleID2FamilyCountF = File(os.path.join(self.auxDirJob.output, 'pedigree.sampleID2FamilyCount.%s.format%s.txt'%\
		#						(inputFileBasenamePrefix, pedigreeFileFormat)))
		self.outputPedigreeJob = self.addOutputVRCPedigreeInTFAMGivenOrderFromFileJob(executable=self.OutputVRCPedigreeInTFAMGivenOrderFromFile, \
				inputFile=self.firstVCFJobData.file, outputFile=pedFile, \
				sampleID2FamilyCountF=None,\
				polymuttDatFile = None,\
				outputFileFormat=pedigreeFileFormat, \
				replicateIndividualTag=self.replicateIndividualTag,\
				treatEveryOneIndependent=self.treatEveryOneIndependent,\
				parentJobLs=self.firstVCFJobData.jobLs + [self.auxDirJob], \
				extraDependentInputLs=None, transferOutput=True, \
				extraArguments=None, job_max_memory=2000, sshDBTunnel=self.needSSHDBTunnel)
		
		#ExtractSamplesFromVCF: keep samples with coverage >= min_coverage
		# the input VCF does not contain replicates.
		outputFile = File(os.path.join(self.auxDirJob.output, '%s.minCoverage%s.sampleIDList.tsv'%\
									(inputFileBasenamePrefix, self.minCoverageForRefPanel)))
		extractRefPanelSampleIDJob = self.addExtractSampleIDJob(inputFile=self.firstVCFJobData.file, \
							outputFile=outputFile,\
							min_coverage=self.minCoverageForRefPanel, outputFormat=3,\
							returnData=returnData,\
							transferOutput=True, \
							parentJobLs=self.firstVCFJobData.jobLs + [self.auxDirJob])
		self.extractRefPanelSampleIDJob = extractRefPanelSampleIDJob
		
		
		# GATK SelectVariants: extract the high-coverage individuals into a new VCF.
		#	SelectVariants re-generates AC, AF so that TrioCaller can read them.
		#	samtools uses 'AC1' instead of AC and 'AF1' instead of AF.
		#		?can it deal with Platypus output, which does not have AC/AF/DP?
		# selectHighCoverageSampleJob is needed here because a VCF of high-coverage members is required
		# 	by outputPedigreeOfHghCoverageSamplesJob.
		#
		highCoverageSampleVCF = File(os.path.join(self.auxDirJob.output, '%s.minCoverage%s.vcf'%\
												(inputFileBasenamePrefix, self.minCoverageForRefPanel)))
		selectHighCoverageSampleJob = self.addSelectVariantsJob(SelectVariantsJava=self.SelectVariantsJava, \
				inputF=self.firstVCFJobData.file, \
				outputF=highCoverageSampleVCF, \
				refFastaFList=self.registerReferenceData.refFastaFList, \
				sampleIDKeepFile=self.extractRefPanelSampleIDJob.output,\
				parentJobLs=[self.auxDirJob, self.extractRefPanelSampleIDJob]+self.firstVCFJobData.jobLs, \
				extraDependentInputLs=[self.firstVCFJobData.tbi_F], transferOutput=transferOutput, \
				extraArguments=None, job_max_memory=2000)
		
		# output a plink pedigree that contains these HC members only
		# output pedigree to get pedigree file (for GATK, TrioCaller, own programs) and sampleID2FamilyCountF (for ReplicateVCFGenotypeColumns job)
		# find a way to cache this job (used for same set of samples, but different chromosome intervals)
		pedigreeFileFormat = 4
		pedFile = File(os.path.join(self.auxDirJob.output, 'pedigree.minCoverage%s.%s.format%s.txt'%\
								(self.minCoverageForRefPanel, inputFileBasenamePrefix, pedigreeFileFormat)))
		#sampleID2FamilyCountF = File(os.path.join(self.auxDirJob.output, 'pedigree.minCoverage%s.sampleID2FamilyCount.%s.format%s.txt'%\
		#									(self.minCoverageForRefPanel, inputFileBasenamePrefix, pedigreeFileFormat)))
		self.outputPedigreeOfHghCoverageSamplesJob = self.addOutputVRCPedigreeInTFAMGivenOrderFromFileJob(executable=self.OutputVRCPedigreeInTFAMGivenOrderFromFile, \
				inputFile=selectHighCoverageSampleJob.output, outputFile=pedFile, \
				sampleID2FamilyCountF=None,\
				polymuttDatFile = None,\
				outputFileFormat=pedigreeFileFormat, replicateIndividualTag=self.replicateIndividualTag,\
				treatEveryOneIndependent=self.treatEveryOneIndependent,\
				parentJobLs=[self.auxDirJob, selectHighCoverageSampleJob], \
				extraDependentInputLs=[], transferOutput=True, \
				extraArguments=None, job_max_memory=2000, sshDBTunnel=self.needSSHDBTunnel)
		
		#a job that outputs alignment coverage (alignment.read_group, median_depth)
		alignmentDepthFile = File(os.path.join(self.auxDirJob.folder, '%s.alignmentDepth.tsv'%(inputFileBasenamePrefix)))
		self.outputAlignmentDepthJob = self.addOutputVCFAlignmentDepthRangeJob(executable=self.OutputVCFAlignmentDepthRange, \
						inputFile=self.firstVCFJobData.file, \
						ref_ind_seq_id=self.ref_ind_seq_id, depthFoldChange=None, minGQ=None,\
						outputFile=alignmentDepthFile, outputFileFormat=1,\
						extraArgumentList=None,\
						parentJobLs=[self.auxDirJob]+self.firstVCFJobData.jobLs, \
						extraDependentInputLs=None, transferOutput=True, \
						job_max_memory=2000, sshDBTunnel=self.needSSHDBTunnel)
		
		
		#a SelectDistantMembersFromGenotypeFile.py job to generate a ref panel for 2nd-round beagle
		# need the pedigree file
		# produces a list of samples
		phasedRefPanelSampleListFile = File(os.path.join(self.auxDirJob.folder, '%s.RefPanel.sampleList.maxPairwiseKinship%s.tsv'%\
														(inputFileBasenamePrefix, self.maxPairwiseKinship)))
		self.selectDistantMembersFromGenotypeFileJob = self.addGenericJob(executable=self.SelectDistantMembersFromGenotypeFile, \
						inputFile=selectHighCoverageSampleJob.output,
						outputFile=phasedRefPanelSampleListFile, outputArgumentOption="-o", \
						extraDependentInputLs=[self.pedigreeKinshipFile], \
						extraOutputLs=None, transferOutput=False, frontArgumentList=None, \
						extraArguments=None, \
						extraArgumentList=["--maxPairwiseKinship %s"%(self.maxPairwiseKinship), "--sampleSize 90", \
							"--pedigreeKinshipFile", self.pedigreeKinshipFile, \
							"--replicateIndividualTag", self.replicateIndividualTag,\
							"--individualAlignmentCoverageFname", self.outputAlignmentDepthJob.output, \
							"--pedigreeFname", self.outputPedigreeJob.output], \
						parentJobLs=[selectHighCoverageSampleJob, self.outputAlignmentDepthJob,  self.outputPedigreeJob,\
									self.auxDirJob],\
						no_of_cpus=None, job_max_memory = 4000, walltime= 120)
		
		"""
		
		#analyze the pedigree graph to figure out singletons, trios, duos
		self.alignmentLs = self.db.getAlignmentsFromVCFFile(inputFname=yh_pegasus.getAbsPathOutOfFile(self.firstVCFJobData.file))
		#2013.06.14 approach below does not work because pedigree of extracting-high-coverage + replication is different from that of replication + extracting-high-coverage (=reality).
		# some replicates might end up as singletons in the latter, while not so in the former.
		#
		self.highCoverageAlignmentLs = self.db.filterAlignments(alignmentLs=self.alignmentLs, min_coverage=self.minCoverageForRefPanel, \
			max_coverage=None, individual_site_id=None, \
			sequence_filtered=None, individual_site_id_set=None, \
			mask_genotype_method_id=None, parent_individual_alignment_id=None,\
			country_id_set=None, tax_id_set=None, excludeContaminant=False, excludeTissueIDSet=None,\
			local_realigned=None, reduce_reads=None, report=False)
		
		"""
		
		#a stat merge job (keeping track of how many mendel-error sites were filtered)
		filterByRemoveMendelErrorSiteStatMergeFile = File(os.path.join(self.statDirJob.folder, 'filterByRemoveMendelErrorSiteStatMerge.tsv'))
		self.filterByRemoveMendelErrorSiteStatMergeJob = self.addStatMergeJob(statMergeProgram=self.ReduceMatrixByChosenColumn, \
								outputF=filterByRemoveMendelErrorSiteStatMergeFile, \
								transferOutput=False, parentJobLs=[self.statDirJob],\
								extraArguments="--keyColumnLs 1 --valueColumnLs 2-4")
								#column 1 is the chromosome length, which is set to be the same for all inputs.
								#columns 2-4 are #sitesInInput1, #sitesInInput2, #overlapping
		returnData.jobDataLs.append(PassingData(jobLs=[self.filterByRemoveMendelErrorSiteStatMergeJob], \
											fileLs=[self.filterByRemoveMendelErrorSiteStatMergeJob.output]))
		#concordance stat reduce jobs
		#reduce the replicate concordance results from before TrioCaller (after beagle phasing)
		#
		"""
		outputFile = File(os.path.join(self.statDirJob.folder, 'beaglePhaseReplicateConcordance.allSites.tsv'))
		reduceBeaglePhaseReplicateConcordanceJob_AllSites = self.addStatMergeJob(statMergeProgram=self.ReduceMatrixBySumSameKeyColsAndThenDivide, \
							outputF=outputFile, \
							extraArguments='--keyColumnLs 0,1 --valueColumnLs 2,3', transferOutput=False)
		outputFile = File(os.path.join(self.statDirJob.folder, 'beaglePhaseReplicateConcordance.homo.tsv'))
		reduceBeaglePhaseReplicateConcordanceJob_HomoOnly = self.addStatMergeJob(statMergeProgram=self.ReduceMatrixBySumSameKeyColsAndThenDivide, \
							outputF=outputFile, \
							extraArguments='--keyColumnLs 0,1 --valueColumnLs 5,6', transferOutput=False)
		outputFile = File(os.path.join(self.statDirJob.folder, 'beaglePhaseReplicateConcordance.tsv'))
		concatenateTwoBeaglePhaseConcordanceResultJob = self.addStatMergeJob(statMergeProgram=self.ReduceMatrixByMergeColumnsWithSameKey, \
							outputF=outputFile, \
							extraArguments='--keyColumnLs 0,1 --valueColumnLs 2,3,4', transferOutput=False)
		self.addInputToStatMergeJob(statMergeJob=concatenateTwoBeaglePhaseConcordanceResultJob, \
							parentJobLs=[reduceBeaglePhaseReplicateConcordanceJob_AllSites])
		self.addInputToStatMergeJob(statMergeJob=concatenateTwoBeaglePhaseConcordanceResultJob, \
							parentJobLs=[reduceBeaglePhaseReplicateConcordanceJob_HomoOnly])
		returnData.jobDataLs.append(PassingData(jobLs=[concatenateTwoBeaglePhaseConcordanceResultJob], \
											fileLs=[concatenateTwoBeaglePhaseConcordanceResultJob.output]))
		#pass to self, as they will be used in reduceEachVCF()
		self.reduceBeaglePhaseReplicateConcordanceJob_AllSites = reduceBeaglePhaseReplicateConcordanceJob_AllSites
		self.reduceBeaglePhaseReplicateConcordanceJob_HomoOnly = reduceBeaglePhaseReplicateConcordanceJob_HomoOnly
		"""
		
		#reduce replicate concordance results from after-TrioCaller VCFs 
		outputFile = File(os.path.join(self.statDirJob.folder, 'trioCallerReplicateConcordance.allSites.tsv'))
		reduceTrioCallerReplicateConcordanceJob_AllSites = self.addStatMergeJob(statMergeProgram=self.ReduceMatrixBySumSameKeyColsAndThenDivide, \
							outputF=outputFile, \
							extraArguments='--keyColumnLs 0,1 --valueColumnLs 2,3', transferOutput=False)
		outputFile = File(os.path.join(self.statDirJob.folder, 'trioCallerReplicateConcordance.homo.tsv'))
		reduceTrioCallerReplicateConcordanceJob_HomoOnly = self.addStatMergeJob(statMergeProgram=self.ReduceMatrixBySumSameKeyColsAndThenDivide, \
							outputF=outputFile, \
							extraArguments='--keyColumnLs 0,1 --valueColumnLs 5,6', transferOutput=False)
		
		outputFile = File(os.path.join(self.statDirJob.folder, 'trioCallerReplicateConcordance.tsv'))
		concatenateTwoTrioCallerConcordanceResultJob = self.addStatMergeJob(statMergeProgram=self.ReduceMatrixByMergeColumnsWithSameKey, \
							outputF=outputFile, \
							extraArguments='--keyColumnLs 0,1 --valueColumnLs 2,3,4', transferOutput=False)
		
		self.addInputToStatMergeJob(statMergeJob=concatenateTwoTrioCallerConcordanceResultJob, \
							parentJobLs=[reduceTrioCallerReplicateConcordanceJob_AllSites])
		self.addInputToStatMergeJob(statMergeJob=concatenateTwoTrioCallerConcordanceResultJob, \
							parentJobLs=[reduceTrioCallerReplicateConcordanceJob_HomoOnly])
		returnData.jobDataLs.append(PassingData(jobLs=[concatenateTwoTrioCallerConcordanceResultJob], \
											fileLs=[concatenateTwoTrioCallerConcordanceResultJob.output]))
		#pass to self, as they will be used in reduceEachVCF()
		self.reduceTrioCallerReplicateConcordanceJob_AllSites = reduceTrioCallerReplicateConcordanceJob_AllSites
		self.reduceTrioCallerReplicateConcordanceJob_HomoOnly = reduceTrioCallerReplicateConcordanceJob_HomoOnly
		
		return returnData
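The "pass to self" comments above point at a map-reduce style fan-in: preReduce() creates one merge job per statistic and stores it on self, and reduceEachVCF() (called once per VCF, not shown here) later wires each per-VCF concordance output into those shared merge jobs via addInputToStatMergeJob(). A toy, workflow-free sketch of that pattern (all names hypothetical):

class FanInSketch(object):
    def preReduce(self):
        # stands in for the ReduceMatrix* stat-merge jobs created in preReduce()
        self.concordanceMergeInputs = []

    def reduceEachVCF(self, perVCFOutput):
        # stands in for addInputToStatMergeJob(): each per-VCF file becomes one merge input
        self.concordanceMergeInputs.append(perVCFOutput)

workflow = FanInSketch()
workflow.preReduce()
for chromosome in ("CAE1", "CAE2", "CAE3"):
    workflow.reduceEachVCF("%s.trioCallerReplicateConcordance.tsv" % chromosome)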