def __init__(self, **keywords):
    """
    2011-7-11
    """
    AbstractVervetWorkflow.__init__(self, **keywords)
    self.inputDir = os.path.abspath(self.inputDir)
    self.smartpca_path = self.insertHomePath(self.smartpca_path, self.home_path)
def __init__(self, **keywords):
    """
    2011-7-11
    """
    AbstractVervetWorkflow.__init__(self, **keywords)
    self.inputDir = os.path.abspath(self.inputDir)
def __init__(self, **keywords):
    """
    2011-7-11
    """
    AbstractVervetWorkflow.__init__(self, **keywords)
    if self.ind_seq_id_ls:
        self.ind_seq_id_ls = getListOutOfStr(self.ind_seq_id_ls, data_type=int)
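# Hedged usage sketch: ind_seq_id_ls arrives as a comma/dash-separated string from the
# command line and is converted into a list of integers. The range-expansion behavior is
# an assumption about the pymodule helper getListOutOfStr(), not verified here:
#
#   getListOutOfStr("231,233-235", data_type=int)  # presumably -> [231, 233, 234, 235]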
def __init__(self, **keywords):
    """
    2011-8-3
    """
    AbstractVervetWorkflow.__init__(self, **keywords)
    self.addJobsDict = {
        1: self.addJobsToProcessWUSTLData,
        2: self.addJobsToProcessMcGillData,
        3: self.addJobsToProcessSouthAfricanRNAData,
        4: self.addJobsToProcessSouthAfricanDNAData,
        5: self.addJobsToProcessUNGCVervetData,
    }
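# Hedged dispatch sketch: addJobsDict maps an integer data-source code to the method that
# adds the corresponding jobs. The option name (self.dataType) is a hypothetical placeholder,
# not part of the original code:
#
#   addJobsFunction = self.addJobsDict.get(self.dataType)
#   if addJobsFunction is None:
#       sys.stderr.write("Unsupported data type %s.\n" % self.dataType)
#       sys.exit(3)
#   addJobsFunction()  # actual arguments depend on the workflow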
def registerCustomExecutables(self, workflow=None):
    """
    2012.3.14
    """
    if workflow is None:
        workflow = self
    AbstractVervetWorkflow.registerCustomExecutables(self, workflow=workflow)
    self.addOneExecutableFromPathAndAssignProperClusterSize(
        path=os.path.join(self.vervetSrcPath, "mapper/CountFastqReadBaseCount.py"),
        name="CountFastqReadBaseCount", clusterSizeMultipler=1)
    self.addOneExecutableFromPathAndAssignProperClusterSize(
        path=os.path.join(self.vervetSrcPath, "db/input/PutReadBaseCountIntoDB.py"),
        name="PutReadBaseCountIntoDB", clusterSizeMultipler=0.2)
def registerCustomExecutables(self, workflow=None):
    """
    2011-11-28
    """
    if workflow is None:
        workflow = self
    AbstractVervetWorkflow.registerCustomExecutables(self, workflow)

    namespace = workflow.namespace
    version = workflow.version
    operatingSystem = workflow.operatingSystem
    architecture = workflow.architecture
    clusters_size = workflow.clusters_size
    site_handler = workflow.site_handler
    vervetSrcPath = self.vervetSrcPath

    #2012.8.7 each cell is a tuple of (executable, clusterSizeMultipler); multiplier is 0 if you do not need clustering
    executableClusterSizeMultiplierList = []

    AggregateAndHClusterDistanceMatrix = Executable(namespace=namespace, name="AggregateAndHClusterDistanceMatrix",
        version=version, os=operatingSystem, arch=architecture, installed=True)
    AggregateAndHClusterDistanceMatrix.addPFN(PFN("file://" + os.path.join(vervetSrcPath, "reducer/AggregateAndHClusterDistanceMatrix.py"),
        site_handler))
    executableClusterSizeMultiplierList.append((AggregateAndHClusterDistanceMatrix, 0))

    self.addExecutableAndAssignProperClusterSize(executableClusterSizeMultiplierList, defaultClustersSize=self.clusters_size)
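# Note (assumption based on usage in preReduce() below): once registered via
# addExecutableAndAssignProperClusterSize(), the executable becomes available as an
# attribute of the workflow and is passed to job-construction helpers, e.g.:
#
#   self.addStatMergeJob(workflow, statMergeProgram=workflow.AggregateAndHClusterDistanceMatrix, ...)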
def preReduce(self, workflow=None, outputDirPrefix="", passingData=None, transferOutput=True, **keywords):
    """
    2012.9.17
    """
    parentPreReduceData = AbstractVervetWorkflow.preReduce(self, workflow=workflow, outputDirPrefix=outputDirPrefix,
        passingData=passingData, transferOutput=transferOutput, **keywords)
    returnData = PassingData(no_of_jobs=0)
    returnData.jobDataLs = []

    callOutputDir = "call"
    callOutputDirJob = self.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=callOutputDir)
    passingData.callOutputDirJob = callOutputDirJob

    matrixDir = "pairwiseDistMatrix"
    matrixDirJob = self.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=matrixDir)
    passingData.matrixDirJob = matrixDirJob

    #2012.10.9 reduceOutputDirJob was added to passingData during AbstractVCFWorkflow.preReduce()
    reduceOutputDirJob = passingData.reduceOutputDirJob
    #reduceOutputDir = "aggregateData"
    #reduceOutputDirJob = yh_pegasus.addMkDirJob(workflow, mkdir=workflow.mkdirWrap, outputDir=reduceOutputDir)
    #passingData.reduceOutputDirJob = reduceOutputDirJob

    figureFnamePrefix = os.path.join(reduceOutputDirJob.output, 'aggregateDistanceMatrix')
    aggregateDistanceMatrixOutputF = File('%s.tsv'%(figureFnamePrefix))
    PCAFile = File('%s_PCA.tsv'%(figureFnamePrefix))
    aggregateAndHClusterDistanceMatrixJob = self.addStatMergeJob(workflow,
        statMergeProgram=workflow.AggregateAndHClusterDistanceMatrix,
        outputF=aggregateDistanceMatrixOutputF,
        parentJobLs=[reduceOutputDirJob],
        extraOutputLs=[PCAFile, File('%s.png'%(figureFnamePrefix)), File('%s.svg'%(figureFnamePrefix))],
        extraDependentInputLs=[], transferOutput=True,
        extraArguments="-f %s"%(figureFnamePrefix))
    returnData.aggregateAndHClusterDistanceMatrixJob = aggregateAndHClusterDistanceMatrixJob

    #2012.9.5 add the job to append meta info (country, sex, latitude, etc. of each monkey)
    outputF = File('%s_withMetaInfo.tsv'%(figureFnamePrefix))
    appendInfo2PCAOutputJob = self.addGenericDBJob(executable=self.AppendInfo2SmartPCAOutput, inputFile=PCAFile,
        outputFile=outputF,
        parentJobLs=[aggregateAndHClusterDistanceMatrixJob], extraDependentInputLs=None,
        extraOutputLs=None, transferOutput=True,
        extraArgumentList=None, extraArguments=None, sshDBTunnel=self.needSSHDBTunnel,
        key2ObjectForJob=None, job_max_memory=2000)
    return returnData
def preReduce(self, workflow=None, outputDirPrefix="", passingData=None, transferOutput=True, **keywords):
    """
    2013.05.01
        1. a job that outputs the pedigree from db, with members from the VCF file;
            used by various filter programs and TrioCaller
        2. a job that extracts the high-coverage individuals from the VCF file
        3. figure out the existence of the beagle unrelated cohort, trio cohort, and pair/duo cohort
            for the high-coverage group and for all individuals.
            This needs the pedigree graph and a VCF file => all sample IDs and only high-coverage individuals.
    """
    returnData = AbstractVervetWorkflow.preReduce(self, workflow=workflow, outputDirPrefix=outputDirPrefix,
        passingData=passingData, transferOutput=transferOutput, **keywords)

    self.statDirJob = self.addMkDirJob(outputDir="%sStat"%(outputDirPrefix))
    self.highCoveragePanelDirJob = self.addMkDirJob(outputDir="%sHighCoveragePanel"%(outputDirPrefix))
    self.auxDirJob = self.addMkDirJob(outputDir="%sAuxilliary"%(outputDirPrefix))
    self.beagleReduceDirJob = self.addMkDirJob(outputDir="%sReduceBeagle"%(outputDirPrefix))
    # self.reduceOutputDirJob would contain non-replicate VCF files.
    # this folder would store all the reduced VCF files with replicates among samples.
    self.replicateVCFDirJob = self.addMkDirJob(outputDir="%sReplicateVCF"%(outputDirPrefix))

    self.pedigreeKinshipFile = self.registerOneInputFile(inputFname=self.pedigreeKinshipFilePath,
        folderName='aux')

    inputFileBasenamePrefix = utils.getFileBasenamePrefixFromPath(self.firstVCFJobData.file.name)

    # output pedigree to get pedigree file (for TrioCaller etc. that requires the pedigree to be split into trios/duos)
    #  and sampleID2FamilyCountF (for the ReplicateVCFGenotypeColumns job, setting TrioCaller up)
    pedigreeFileFormat = 2
    pedFile = File(os.path.join(self.auxDirJob.output, 'pedigree.replicates.%s.format%s.txt'%
        (inputFileBasenamePrefix, pedigreeFileFormat)))
    sampleID2FamilyCountF = File(os.path.join(self.auxDirJob.output, 'pedigree.replicates.sampleID2FamilyCount.%s.format%s.txt'%
        (inputFileBasenamePrefix, pedigreeFileFormat)))
    self.outputReplicatePedigreeJob = self.addOutputVRCPedigreeInTFAMGivenOrderFromFileJob(
        executable=self.OutputVRCPedigreeInTFAMGivenOrderFromFile,
        inputFile=self.firstVCFJobData.file, outputFile=pedFile,
        sampleID2FamilyCountF=sampleID2FamilyCountF,
        polymuttDatFile=None,
        outputFileFormat=pedigreeFileFormat,
        replicateIndividualTag=self.replicateIndividualTag,
        treatEveryOneIndependent=self.treatEveryOneIndependent,
        parentJobLs=self.firstVCFJobData.jobLs + [self.auxDirJob],
        extraDependentInputLs=None, transferOutput=True,
        extraArguments=None, job_max_memory=2000, sshDBTunnel=self.needSSHDBTunnel)

    # output pedigree without replicating certain individuals and without trio/duo splitting
    pedigreeFileFormat = 4
    pedFile = File(os.path.join(self.auxDirJob.output, 'pedigree.%s.format%s.txt'%
        (inputFileBasenamePrefix, pedigreeFileFormat)))
    #sampleID2FamilyCountF = File(os.path.join(self.auxDirJob.output, 'pedigree.sampleID2FamilyCount.%s.format%s.txt'%
    #    (inputFileBasenamePrefix, pedigreeFileFormat)))
    self.outputPedigreeJob = self.addOutputVRCPedigreeInTFAMGivenOrderFromFileJob(
        executable=self.OutputVRCPedigreeInTFAMGivenOrderFromFile,
        inputFile=self.firstVCFJobData.file, outputFile=pedFile,
        sampleID2FamilyCountF=None,
        polymuttDatFile=None,
        outputFileFormat=pedigreeFileFormat,
        replicateIndividualTag=self.replicateIndividualTag,
        treatEveryOneIndependent=self.treatEveryOneIndependent,
        parentJobLs=self.firstVCFJobData.jobLs + [self.auxDirJob],
        extraDependentInputLs=None, transferOutput=True,
        extraArguments=None, job_max_memory=2000, sshDBTunnel=self.needSSHDBTunnel)

    # ExtractSamplesFromVCF: samples with coverage >= min_coverage.
    #  the input VCF does not contain replicates.
    outputFile = File(os.path.join(self.auxDirJob.output, '%s.minCoverage%s.sampleIDList.tsv'%
        (inputFileBasenamePrefix, self.minCoverageForRefPanel)))
    extractRefPanelSampleIDJob = self.addExtractSampleIDJob(inputFile=self.firstVCFJobData.file,
        outputFile=outputFile,
        min_coverage=self.minCoverageForRefPanel, outputFormat=3,
        returnData=returnData,
        transferOutput=True,
        parentJobLs=self.firstVCFJobData.jobLs + [self.auxDirJob])
    self.extractRefPanelSampleIDJob = extractRefPanelSampleIDJob

    # GATK SelectVariants: select the high-coverage individuals out into a new VCF.
    #  SelectVariants re-generates AC and AF so that TrioCaller can read them.
    #  samtools uses 'AC1' instead of AC, 'AF1' instead of AF.
    #  ?can it deal with Platypus output, which does not have AC/AF/DP?
    #  selectHighCoverageSampleJob is needed here because a VCF file of high-coverage members is needed
    #   for outputPedigreeOfHghCoverageSamplesJob.
    highCoverageSampleVCF = File(os.path.join(self.auxDirJob.output, '%s.minCoverage%s.vcf'%
        (inputFileBasenamePrefix, self.minCoverageForRefPanel)))
    selectHighCoverageSampleJob = self.addSelectVariantsJob(SelectVariantsJava=self.SelectVariantsJava,
        inputF=self.firstVCFJobData.file,
        outputF=highCoverageSampleVCF,
        refFastaFList=self.registerReferenceData.refFastaFList,
        sampleIDKeepFile=self.extractRefPanelSampleIDJob.output,
        parentJobLs=[self.auxDirJob, self.extractRefPanelSampleIDJob] + self.firstVCFJobData.jobLs,
        extraDependentInputLs=[self.firstVCFJobData.tbi_F], transferOutput=transferOutput,
        extraArguments=None, job_max_memory=2000)

    # output a plink pedigree that contains these high-coverage members only.
    #  output pedigree to get the pedigree file (for GATK, TrioCaller, own programs) and sampleID2FamilyCountF
    #  (for the ReplicateVCFGenotypeColumns job).
    #  find a way to cache this job (used for the same set of samples, but different chromosome intervals).
    pedigreeFileFormat = 4
    pedFile = File(os.path.join(self.auxDirJob.output, 'pedigree.minCoverage%s.%s.format%s.txt'%
        (self.minCoverageForRefPanel, inputFileBasenamePrefix, pedigreeFileFormat)))
    #sampleID2FamilyCountF = File(os.path.join(self.auxDirJob.output, 'pedigree.minCoverage%s.sampleID2FamilyCount.%s.format%s.txt'%
    #    (self.minCoverageForRefPanel, inputFileBasenamePrefix, pedigreeFileFormat)))
    self.outputPedigreeOfHghCoverageSamplesJob = self.addOutputVRCPedigreeInTFAMGivenOrderFromFileJob(
        executable=self.OutputVRCPedigreeInTFAMGivenOrderFromFile,
        inputFile=selectHighCoverageSampleJob.output, outputFile=pedFile,
        sampleID2FamilyCountF=None,
        polymuttDatFile=None,
        outputFileFormat=pedigreeFileFormat, replicateIndividualTag=self.replicateIndividualTag,
        treatEveryOneIndependent=self.treatEveryOneIndependent,
        parentJobLs=[self.auxDirJob, selectHighCoverageSampleJob],
        extraDependentInputLs=[], transferOutput=True,
        extraArguments=None, job_max_memory=2000, sshDBTunnel=self.needSSHDBTunnel)

    # a job that outputs alignment coverage (alignment.read_group, median_depth)
    alignmentDepthFile = File(os.path.join(self.auxDirJob.folder, '%s.alignmentDepth.tsv'%(inputFileBasenamePrefix)))
    self.outputAlignmentDepthJob = self.addOutputVCFAlignmentDepthRangeJob(executable=self.OutputVCFAlignmentDepthRange,
        inputFile=self.firstVCFJobData.file,
        ref_ind_seq_id=self.ref_ind_seq_id, depthFoldChange=None, minGQ=None,
        outputFile=alignmentDepthFile, outputFileFormat=1,
        extraArgumentList=None,
        parentJobLs=[self.auxDirJob] + self.firstVCFJobData.jobLs,
        extraDependentInputLs=None, transferOutput=True,
        job_max_memory=2000, sshDBTunnel=self.needSSHDBTunnel)

    # a SelectDistantMembersFromGenotypeFile.py job to generate a reference panel for 2nd-round beagle.
    #  needs the pedigree file; produces a list of samples.
    phasedRefPanelSampleListFile = File(os.path.join(self.auxDirJob.folder, '%s.RefPanel.sampleList.maxPairwiseKinship%s.tsv'%
        (inputFileBasenamePrefix, self.maxPairwiseKinship)))
    self.selectDistantMembersFromGenotypeFileJob = self.addGenericJob(executable=self.SelectDistantMembersFromGenotypeFile,
        inputFile=selectHighCoverageSampleJob.output, outputFile=phasedRefPanelSampleListFile, outputArgumentOption="-o",
        extraDependentInputLs=[self.pedigreeKinshipFile],
        extraOutputLs=None, transferOutput=False, frontArgumentList=None,
        extraArguments=None,
        extraArgumentList=["--maxPairwiseKinship %s"%(self.maxPairwiseKinship), "--sampleSize 90",
            "--pedigreeKinshipFile", self.pedigreeKinshipFile,
            "--replicateIndividualTag", self.replicateIndividualTag,
            "--individualAlignmentCoverageFname", self.outputAlignmentDepthJob.output,
            "--pedigreeFname", self.outputPedigreeJob.output],
        parentJobLs=[selectHighCoverageSampleJob, self.outputAlignmentDepthJob, self.outputPedigreeJob,
            self.auxDirJob],
        no_of_cpus=None, job_max_memory=4000, walltime=120)

    """
    #analyze the pedigree graph to figure out singletons, trios, duos
    self.alignmentLs = self.db.getAlignmentsFromVCFFile(inputFname=yh_pegasus.getAbsPathOutOfFile(self.firstVCFJobData.file))
    #2013.06.14 the approach below does not work because the pedigree of extracting-high-coverage + replication
    #  differs from that of replication + extracting-high-coverage (=reality).
    #  Some replicates might end up as singletons in the latter, while not so in the former.
    self.highCoverageAlignmentLs = self.db.filterAlignments(alignmentLs=self.alignmentLs, min_coverage=self.minCoverageForRefPanel,
        max_coverage=None, individual_site_id=None,
        sequence_filtered=None, individual_site_id_set=None,
        mask_genotype_method_id=None, parent_individual_alignment_id=None,
        country_id_set=None, tax_id_set=None, excludeContaminant=False, excludeTissueIDSet=None,
        local_realigned=None, reduce_reads=None, report=False)
    """

    # a stat merge job (keeping track of how many mendel-error sites were filtered)
    filterByRemoveMendelErrorSiteStatMergeFile = File(os.path.join(self.statDirJob.folder, 'filterByRemoveMendelErrorSiteStatMerge.tsv'))
    self.filterByRemoveMendelErrorSiteStatMergeJob = self.addStatMergeJob(statMergeProgram=workflow.ReduceMatrixByChosenColumn,
        outputF=filterByRemoveMendelErrorSiteStatMergeFile,
        transferOutput=False, parentJobLs=[self.statDirJob],
        extraArguments="--keyColumnLs 1 --valueColumnLs 2-4")
    # column 1 is the chromosome length, which is set to be the same for all rows.
    # columns 2-4 are #sitesInInput1, #sitesInInput2, #overlapping.
    returnData.jobDataLs.append(PassingData(jobLs=[self.filterByRemoveMendelErrorSiteStatMergeJob],
        fileLs=[self.filterByRemoveMendelErrorSiteStatMergeJob.output]))

    # concordance stat reduce jobs
    # reduce the replicate concordance results from before TrioCaller (after beagle phasing)
    """
    outputFile = File(os.path.join(self.statDirJob.folder, 'beaglePhaseReplicateConcordance.allSites.tsv'))
    reduceBeaglePhaseReplicateConcordanceJob_AllSites = self.addStatMergeJob(statMergeProgram=self.ReduceMatrixBySumSameKeyColsAndThenDivide,
        outputF=outputFile,
        extraArguments='--keyColumnLs 0,1 --valueColumnLs 2,3', transferOutput=False)

    outputFile = File(os.path.join(self.statDirJob.folder, 'beaglePhaseReplicateConcordance.homo.tsv'))
    reduceBeaglePhaseReplicateConcordanceJob_HomoOnly = self.addStatMergeJob(statMergeProgram=self.ReduceMatrixBySumSameKeyColsAndThenDivide,
        outputF=outputFile,
        extraArguments='--keyColumnLs 0,1 --valueColumnLs 5,6', transferOutput=False)

    outputFile = File(os.path.join(self.statDirJob.folder, 'beaglePhaseReplicateConcordance.tsv'))
    concatenateTwoBeaglePhaseConcordanceResultJob = self.addStatMergeJob(statMergeProgram=self.ReduceMatrixByMergeColumnsWithSameKey,
        outputF=outputFile,
        extraArguments='--keyColumnLs 0,1 --valueColumnLs 2,3,4', transferOutput=False)
    self.addInputToStatMergeJob(statMergeJob=concatenateTwoBeaglePhaseConcordanceResultJob,
        parentJobLs=[reduceBeaglePhaseReplicateConcordanceJob_AllSites])
    self.addInputToStatMergeJob(statMergeJob=concatenateTwoBeaglePhaseConcordanceResultJob,
        parentJobLs=[reduceBeaglePhaseReplicateConcordanceJob_HomoOnly])

    returnData.jobDataLs.append(PassingData(jobLs=[concatenateTwoBeaglePhaseConcordanceResultJob],
        fileLs=[concatenateTwoBeaglePhaseConcordanceResultJob.output]))
    #pass to self, as they will be used in reduceEachVCF()
    self.reduceBeaglePhaseReplicateConcordanceJob_AllSites = reduceBeaglePhaseReplicateConcordanceJob_AllSites
    self.reduceBeaglePhaseReplicateConcordanceJob_HomoOnly = reduceBeaglePhaseReplicateConcordanceJob_HomoOnly
    """

    # reduce replicate concordance results from after-TrioCaller VCFs
    outputFile = File(os.path.join(self.statDirJob.folder, 'trioCallerReplicateConcordance.allSites.tsv'))
    reduceTrioCallerReplicateConcordanceJob_AllSites = self.addStatMergeJob(statMergeProgram=self.ReduceMatrixBySumSameKeyColsAndThenDivide,
        outputF=outputFile,
        extraArguments='--keyColumnLs 0,1 --valueColumnLs 2,3', transferOutput=False)

    outputFile = File(os.path.join(self.statDirJob.folder, 'trioCallerReplicateConcordance.homo.tsv'))
    reduceTrioCallerReplicateConcordanceJob_HomoOnly = self.addStatMergeJob(statMergeProgram=self.ReduceMatrixBySumSameKeyColsAndThenDivide,
        outputF=outputFile,
        extraArguments='--keyColumnLs 0,1 --valueColumnLs 5,6', transferOutput=False)

    outputFile = File(os.path.join(self.statDirJob.folder, 'trioCallerReplicateConcordance.tsv'))
    concatenateTwoTrioCallerConcordanceResultJob = self.addStatMergeJob(statMergeProgram=self.ReduceMatrixByMergeColumnsWithSameKey,
        outputF=outputFile,
        extraArguments='--keyColumnLs 0,1 --valueColumnLs 2,3,4', transferOutput=False)
    self.addInputToStatMergeJob(statMergeJob=concatenateTwoTrioCallerConcordanceResultJob,
        parentJobLs=[reduceTrioCallerReplicateConcordanceJob_AllSites])
    self.addInputToStatMergeJob(statMergeJob=concatenateTwoTrioCallerConcordanceResultJob,
        parentJobLs=[reduceTrioCallerReplicateConcordanceJob_HomoOnly])

    returnData.jobDataLs.append(PassingData(jobLs=[concatenateTwoTrioCallerConcordanceResultJob],
        fileLs=[concatenateTwoTrioCallerConcordanceResultJob.output]))
    #pass to self, as they will be used in reduceEachVCF()
    self.reduceTrioCallerReplicateConcordanceJob_AllSites = reduceTrioCallerReplicateConcordanceJob_AllSites
    self.reduceTrioCallerReplicateConcordanceJob_HomoOnly = reduceTrioCallerReplicateConcordanceJob_HomoOnly

    return returnData