def runStats(self, taskPrefix="", dependencies=None):
    """
    Schedule alignment-stats generation for every input BAM, then merge and summarize

    One stats task is added per BAM in normalBamList + tumorBamList, each writing
    into a temporary directory named <stats path> + ".tmpdir". A merge task then
    combines the per-BAM files into the final stats file, and a summary task
    reformats the merged stats for human review. The temporary directory is
    removed after the merge unless self.params.isRetainTempFiles is set.

    taskPrefix -- prefix joined onto every task label via preJoin
    dependencies -- upstream dependencies of the directory-creation task

    Returns a set holding only the merge task.
    """
    mergedStatsPath = self.paths.getStatsPath()
    mergedStatsName = os.path.basename(mergedStatsPath)
    scratchDir = mergedStatsPath + ".tmpdir"

    mkdirCmd = getMkdirCmd() + [scratchDir]
    mkdirTask = self.addTask(preJoin(taskPrefix, "makeTmpDir"),
                             mkdirCmd,
                             dependencies=dependencies,
                             isForceLocal=True)

    # one stats file (and one task) per input BAM, normals first
    perBamStatsFiles = []
    perBamTasks = set()
    allBams = self.params.normalBamList + self.params.tumorBamList
    for bamIndex, bamPath in enumerate(allBams):
        indexStr = str(bamIndex).zfill(3)
        bamStatsFile = os.path.join(scratchDir,
                                    mergedStatsName + "." + indexStr + ".xml")
        perBamStatsFiles.append(bamStatsFile)

        statsCmd = [self.params.mantaStatsBin,
                    "--output-file", bamStatsFile,
                    "--align-file", bamPath]
        perBamTasks.add(
            self.addTask(preJoin(taskPrefix, "generateStats_" + indexStr),
                         statsCmd,
                         dependencies=mkdirTask))

    mergeCmd = [self.params.mantaMergeStatsBin, "--output-file", mergedStatsPath]
    for bamStatsFile in perBamStatsFiles:
        mergeCmd += ["--align-stats-file", bamStatsFile]

    mergeTask = self.addTask(preJoin(taskPrefix, "mergeStats"),
                             mergeCmd,
                             dependencies=perBamTasks,
                             isForceLocal=True)

    if not self.params.isRetainTempFiles:
        self.addTask(preJoin(taskPrefix, "rmTmpDir"),
                     getRmdirCmd() + [scratchDir],
                     dependencies=mergeTask,
                     isForceLocal=True)

    # summarize stats in format that's easier for human review
    # NOTE(review): the "--align-stats " literal carries a trailing space in the
    # original code; it is kept verbatim here -- confirm whether it is intended.
    summaryCmd = [self.params.mantaStatsSummaryBin,
                  "--align-stats ", mergedStatsPath,
                  "--output-file", self.paths.getStatsSummaryPath()]
    self.addTask(preJoin(taskPrefix, "summarizeStats"),
                 summaryCmd,
                 dependencies=mergeTask)

    # downstream steps only need the merged stats, not the cleanup/summary tasks
    return set([mergeTask])
def getSequenceErrorEstimates(self, taskPrefix="", dependencies=None):
    """
    Count sequence errors and use these to estimate error parameters

    Creates the temporary error-estimation directory, then adds one
    EstimateSequenceErrorWorkflowForSample sub-workflow per entry in
    self.params.bamList. The temporary directory is removed once all sample
    workflows finish unless self.params.isRetainTempFiles is set.

    taskPrefix -- prefix joined onto every task label via preJoin
    dependencies -- upstream dependencies of the directory-creation task

    Returns the set of per-sample workflow tasks.
    """
    makeDirLabel = preJoin(taskPrefix, "makeTmpDir")
    scratchDir = self.paths.getTmpErrorEstimationDir()
    self.addTask(makeDirLabel,
                 getMkdirCmd() + [scratchDir],
                 dependencies=dependencies,
                 isForceLocal=True)

    intervals = getErrorEstimationIntervals(self.params)
    assert len(intervals) != 0

    # The count and estimation processes are currently independent for each sample
    perSampleTasks = set()
    sampleCount = len(self.params.bamList)
    for sampleIndex in range(sampleCount):
        sampleLabel = preJoin(taskPrefix, "Sample" + str(sampleIndex).zfill(3))
        sampleWorkflow = EstimateSequenceErrorWorkflowForSample(
            self.params, self.paths, intervals, sampleIndex)
        perSampleTasks.add(
            self.addWorkflowTask(sampleLabel,
                                 sampleWorkflow,
                                 dependencies=makeDirLabel))

    if not self.params.isRetainTempFiles:
        self.addTask(preJoin(taskPrefix, "removeTmpDir"),
                     getRmdirCmd() + [scratchDir],
                     dependencies=perSampleTasks,
                     isForceLocal=True)

    return perSampleTasks
def callGenome(self, taskPrefix="", dependencies=None):
    """
    run variant caller on all genome segments

    Adds one group of tasks per genome segment (via callGenomeSegment), a
    checkpoint task that waits on all of them, and then the finishing tasks:
    the "denovo" VCF concatenation, the run-stats merge and, when
    self.params.isOutputCallableRegions is set, the callable-regions BED
    concatenation. The temporary segment directory is removed after the
    finishing tasks unless self.params.isRetainTempFiles is set.

    taskPrefix -- prefix joined onto every task label via preJoin
    dependencies -- upstream dependencies of the directory-creation task

    Returns the set of finishing tasks.
    Raises Exception when no segment tasks were produced.
    """
    scratchDir = self.paths.getTmpSegmentDir()
    mkdirTask = self.addTask(preJoin(taskPrefix, "makeTmpDir"),
                             getMkdirCmd() + [scratchDir],
                             dependencies=dependencies,
                             isForceLocal=True)

    perSegmentTasks = set()
    segFiles = TempSegmentFiles()
    for gseg in getNextGenomeSegment(self.params):
        perSegmentTasks |= callGenomeSegment(self, gseg, segFiles,
                                             dependencies=mkdirTask)

    if not perSegmentTasks:
        raise Exception(
            "No genome regions to analyze. Possible target region parse error.")

    # create a checkpoint for all segments:
    checkpointTask = self.addTask(
        preJoin(taskPrefix, "completedAllGenomeSegments"),
        dependencies=perSegmentTasks)

    finishTasks = set()

    denovoTask = self.concatIndexVcf(taskPrefix, checkpointTask,
                                     segFiles.denovo,
                                     self.paths.getDenovoOutputPath(),
                                     "denovo")
    finishTasks.add(denovoTask)

    # merge segment stats:
    statsTask = self.mergeRunStats(taskPrefix, checkpointTask, segFiles.stats)
    finishTasks.add(statsTask)

    if self.params.isOutputCallableRegions:
        callableTask = self.concatIndexBed(taskPrefix, checkpointTask,
                                           segFiles.callable,
                                           self.paths.getRegionOutputPath(),
                                           "callableRegions")
        finishTasks.add(callableTask)

    if not self.params.isRetainTempFiles:
        self.addTask(preJoin(taskPrefix, "rmTmpDir"),
                     getRmdirCmd() + [scratchDir],
                     dependencies=finishTasks,
                     isForceLocal=True)

    return finishTasks
def callGenome(self, taskPrefix="", dependencies=None):
    """
    run counter on all genome segments

    Adds one group of tasks per genome segment (via callGenomeSegment), a
    checkpoint task that waits on all of them, and then the finishing tasks:
    the sequence-error-count merge and, when
    self.params.isReportObservedIndels is set, the observed-indel BED
    concatenation. The temporary segment directory is removed after the
    finishing tasks unless self.params.isRetainTempFiles is set.

    taskPrefix -- prefix joined onto every task label via preJoin
    dependencies -- upstream dependencies of the directory-creation task

    Returns the set of finishing tasks.
    Raises Exception when no segment tasks were produced.
    """
    scratchDir = self.paths.getTmpSegmentDir()
    mkdirTask = self.addTask(preJoin(taskPrefix, "makeTmpDir"),
                             getMkdirCmd() + [scratchDir],
                             dependencies=dependencies,
                             isForceLocal=True)

    perSegmentTasks = set()
    segFiles = TempSegmentFiles()
    for gseg in getNextGenomeSegment(self.params):
        perSegmentTasks |= callGenomeSegment(self, gseg, segFiles,
                                             dependencies=mkdirTask)

    if not perSegmentTasks:
        raise Exception(
            "No genome regions to analyze. Possible target region parse error.")

    # create a checkpoint for all segments:
    checkpointTask = self.addTask(
        preJoin(taskPrefix, "completedAllGenomeSegments"),
        dependencies=perSegmentTasks)

    finishTasks = set()

    # merge segment stats:
    countsTask = mergeSequenceErrorCounts(self, taskPrefix, checkpointTask,
                                          segFiles.counts)
    finishTasks.add(countsTask)

    if self.params.isReportObservedIndels:
        indelBedTask = self.concatIndexBed(taskPrefix, checkpointTask,
                                           segFiles.observedIndelBed,
                                           self.paths.getObservedIndelBedPath(),
                                           "observedIndels")
        finishTasks.add(indelBedTask)

    if not self.params.isRetainTempFiles:
        self.addTask(preJoin(taskPrefix, "rmTmpDir"),
                     getRmdirCmd() + [scratchDir],
                     dependencies=finishTasks,
                     isForceLocal=True)

    return finishTasks
def callGenome(self, taskPrefix="", dependencies=None):
    """
    run strelka on all genome segments

    Adds one group of tasks per genome segment group (via callGenomeSegment),
    a checkpoint task that waits on all of them, and then the finishing tasks:
    SNV and indel VCF concatenation, the run-stats merge, optionally the
    callable-regions BED concatenation, and optionally one realigned-BAM
    concatenation each for the "normal" and "tumor" labels. The temporary
    segment directory is removed after the finishing tasks unless
    self.params.isRetainTempFiles is set.

    taskPrefix -- prefix joined onto every task label via preJoin
    dependencies -- upstream dependencies of the directory-creation task

    Returns the set of finishing tasks.
    Raises Exception when no segment tasks were produced.
    """
    scratchDir = self.paths.getTmpSegmentDir()
    mkdirTask = self.addTask(preJoin(taskPrefix, "makeTmpDir"),
                             getMkdirCmd() + [scratchDir],
                             dependencies=dependencies,
                             isForceLocal=True)

    perSegmentTasks = set()
    segFiles = TempVariantCallingSegmentFiles()
    for gsegGroup in self.getStrelkaGenomeSegmentGroupIterator():
        perSegmentTasks |= callGenomeSegment(self, gsegGroup, segFiles,
                                             dependencies=mkdirTask)

    if not perSegmentTasks:
        raise Exception(
            "No genome regions to analyze. Possible target region parse error.")

    # create a checkpoint for all segments:
    checkpointTask = self.addTask(
        preJoin(taskPrefix, "completedAllGenomeSegments"),
        dependencies=perSegmentTasks)

    finishTasks = set()

    snvTask = self.concatIndexVcf(taskPrefix, checkpointTask, segFiles.snv,
                                  self.paths.getSnvOutputPath(), "SNV")
    finishTasks.add(snvTask)

    indelTask = self.concatIndexVcf(taskPrefix, checkpointTask, segFiles.indel,
                                    self.paths.getIndelOutputPath(), "Indel")
    finishTasks.add(indelTask)

    # merge segment stats:
    finishTasks.add(self.mergeRunStats(taskPrefix, checkpointTask, segFiles.stats))

    if self.params.isOutputCallableRegions:
        callableTask = self.concatIndexBed(taskPrefix, checkpointTask,
                                           segFiles.callable,
                                           self.paths.getRegionOutputPath(),
                                           "callableRegions")
        finishTasks.add(callableTask)

    if self.params.isWriteRealignedBam:
        # concatenate the per-segment realigned BAMs, normal first, then tumor
        realignedInputs = (("normal", segFiles.normalRealign),
                           ("tumor", segFiles.tumorRealign))
        for label, segmentList in realignedInputs:
            realignedBamPath = self.paths.getRealignedBamPath(label)
            catCmd = bamListCatCmd(self.params.samtoolsBin, segmentList,
                                   realignedBamPath)
            catLabel = preJoin(taskPrefix, "realignedBamCat_" + label)
            finishTasks.add(
                self.addTask(catLabel, catCmd, dependencies=checkpointTask))

    if not self.params.isRetainTempFiles:
        self.addTask(preJoin(taskPrefix, "removeTmpDir"),
                     getRmdirCmd() + [scratchDir],
                     dependencies=finishTasks,
                     isForceLocal=True)

    return finishTasks
def runStats(self, taskPrefix="", dependencies=None):
    """
    Add the tasks that compute, merge and summarize per-BAM alignment statistics

    For each BAM in normalBamList + tumorBamList a stats task writes an XML
    file into <stats path> + ".tmpdir". The files are merged into the final
    stats path, and a further task writes a summary that is easier for human
    review. Unless self.params.isRetainTempFiles is set, the temporary
    directory is removed once the merge has finished.

    taskPrefix -- prefix joined onto every task label via preJoin
    dependencies -- upstream dependencies of the directory-creation task

    Returns a one-element set containing the merge task.
    """
    finalStatsPath = self.paths.getStatsPath()
    finalStatsName = os.path.basename(finalStatsPath)
    tmpDir = finalStatsPath + ".tmpdir"

    makeTmpDirCmd = getMkdirCmd() + [tmpDir]
    makeTmpDirTask = self.addTask(preJoin(taskPrefix, "makeTmpDir"),
                                  makeTmpDirCmd,
                                  dependencies=dependencies,
                                  isForceLocal=True)

    bamStatsFiles = []
    bamStatsTasks = set()
    inputBams = self.params.normalBamList + self.params.tumorBamList

    for bamIndex, bamPath in enumerate(inputBams):
        paddedIndex = str(bamIndex).zfill(3)
        outFile = os.path.join(tmpDir, finalStatsName + "." + paddedIndex + ".xml")
        bamStatsFiles.append(outFile)

        generateCmd = [self.params.mantaStatsBin]
        generateCmd += ["--output-file", outFile]
        generateCmd += ["--align-file", bamPath]

        generateLabel = preJoin(taskPrefix, "generateStats_" + paddedIndex)
        bamStatsTasks.add(
            self.addTask(generateLabel, generateCmd, dependencies=makeTmpDirTask))

    mergeCmd = [self.params.mantaMergeStatsBin]
    mergeCmd += ["--output-file", finalStatsPath]
    for outFile in bamStatsFiles:
        mergeCmd += ["--align-stats-file", outFile]

    mergeTask = self.addTask(preJoin(taskPrefix, "mergeStats"),
                             mergeCmd,
                             dependencies=bamStatsTasks,
                             isForceLocal=True)

    waitOn = set([mergeTask])

    if not self.params.isRetainTempFiles:
        removeTmpDirCmd = getRmdirCmd() + [tmpDir]
        self.addTask(preJoin(taskPrefix, "rmTmpDir"),
                     removeTmpDirCmd,
                     dependencies=mergeTask,
                     isForceLocal=True)

    # summarize stats in format that's easier for human review
    # NOTE(review): "--align-stats " is spelled with a trailing space in the
    # original code and is reproduced as-is -- confirm whether it is intended.
    summarizeCmd = [self.params.mantaStatsSummaryBin]
    summarizeCmd += ["--align-stats ", finalStatsPath]
    summarizeCmd += ["--output-file", self.paths.getStatsSummaryPath()]
    self.addTask(preJoin(taskPrefix, "summarizeStats"),
                 summarizeCmd,
                 dependencies=mergeTask)

    return waitOn
def callGenome(self, taskPrefix="", dependencies=None):
    """
    run variant caller on all genome segments

    Adds one group of tasks per genome segment group (via callGenomeSegment),
    a checkpoint task that waits on all of them, and then the finishing tasks:
    the "variants" VCF concatenation, one gVCF concatenation per sample, the
    run-stats merge and, when self.params.isWriteRealignedBam is set, the
    realigned-BAM concatenation. For sample 0 an extra task symlinks the gVCF
    (and its .tbi) to the legacy filename; that task is not part of the
    returned set. The temporary segment directory is removed after the
    finishing tasks unless self.params.isRetainTempFiles is set.

    taskPrefix -- prefix joined onto every task label via preJoin
    dependencies -- upstream dependencies of the directory-creation task

    Returns the set of finishing tasks.
    Raises Exception when no segment tasks were produced.
    """
    scratchDir = self.paths.getTmpSegmentDir()
    mkdirTask = self.addTask(preJoin(taskPrefix, "makeTmpDir"),
                             getMkdirCmd() + [scratchDir],
                             dependencies=dependencies,
                             isForceLocal=True)

    perSegmentTasks = set()
    sampleCount = len(self.params.bamList)

    segFiles = TempVariantCallingSegmentFiles(sampleCount)

    segmentGroups = self.getStrelkaGenomeSegmentGroupIterator(
        contigsExcludedFromGrouping=self.params.callContinuousVf)
    for gsegGroup in segmentGroups:
        perSegmentTasks |= callGenomeSegment(self, gsegGroup, segFiles,
                                             dependencies=mkdirTask)

    if not perSegmentTasks:
        raise Exception(
            "No genome regions to analyze. Possible target region parse error.")

    # create a checkpoint for all segments:
    checkpointTask = self.addTask(
        preJoin(taskPrefix, "completedAllGenomeSegments"),
        dependencies=perSegmentTasks)

    finishTasks = set()

    # merge various VCF outputs
    variantsTask = self.concatIndexVcf(taskPrefix, checkpointTask,
                                       segFiles.variants,
                                       self.paths.getVariantsOutputPath(),
                                       "variants")
    finishTasks.add(variantsTask)

    for sampleIndex in range(sampleCount):
        gvcfTask = self.concatIndexVcf(taskPrefix, checkpointTask,
                                       segFiles.sample[sampleIndex].gvcf,
                                       self.paths.getGvcfOutputPath(sampleIndex),
                                       gvcfSampleLabel(sampleIndex))
        finishTasks.add(gvcfTask)

        if sampleIndex != 0:
            continue

        # only the first sample gets the legacy-named symlinks
        gvcfPath = self.paths.getGvcfOutputPath(sampleIndex)
        gvcfDir = os.path.dirname(gvcfPath)
        gvcfName = os.path.basename(gvcfPath)

        def linkLegacy(extension):
            linkTarget = quote(gvcfName + extension)
            linkName = quote(self.paths.getGvcfLegacyFilename() + extension)
            return "ln -s " + linkTarget + " " + linkName

        # link both the gVCF itself and its tabix index
        linkCmd = " && ".join([linkLegacy(""), linkLegacy(".tbi")])
        self.addTask(preJoin(taskPrefix, "addLegacyOutputLink"),
                     linkCmd,
                     dependencies=gvcfTask,
                     isForceLocal=True,
                     cwd=gvcfDir)

    # merge segment stats:
    finishTasks.add(self.mergeRunStats(taskPrefix, checkpointTask, segFiles.stats))

    if self.params.isWriteRealignedBam:
        def finishBam(tmpList, output, label):
            catCmd = bamListCatCmd(self.params.samtoolsBin, tmpList, output)
            finalizeLabel = preJoin(taskPrefix, label + "_finalizeBAM")
            finishTasks.add(
                self.addTask(finalizeLabel, catCmd, dependencies=checkpointTask))

        finishBam(segFiles.bamRealign, self.paths.getRealignedBamPath(), "realigned")

    if not self.params.isRetainTempFiles:
        self.addTask(preJoin(taskPrefix, "removeTmpDir"),
                     getRmdirCmd() + [scratchDir],
                     dependencies=finishTasks,
                     isForceLocal=True)

    return finishTasks
def runHyGen(self, taskPrefix="", dependencies=None):
    """
    Run hypothesis generation on each SV locus

    One hypothesis-generation task is added per work bin
    (self.params.nonlocalWorkBins bins in total). The per-bin output VCF
    paths are recorded on self (candidateVcfPaths, diploidVcfPaths,
    somaticVcfPaths, tumorVcfPaths, rnaVcfPaths). After the bin loop the VCFs
    are sorted via sortAllVcfs, the per-bin edge runtime logs are sorted, and
    the per-bin edge stats are merged. When self.params.isGenerateSupportBam
    is set, supporting-evidence BAMs are additionally sorted per bin and
    merged per sample.

    taskPrefix -- prefix joined onto every task label via preJoin
    dependencies -- upstream dependencies of the directory-creation task

    Returns the set of tasks later steps must wait on: the per-bin generation
    tasks, plus (in support-BAM mode) the BAM sort, VCF sort and BAM merge tasks.
    """

    import copy

    statsPath = self.paths.getStatsPath()
    graphPath = self.paths.getGraphPath()
    hygenDir = self.paths.getHyGenDir()

    makeHyGenDirCmd = getMkdirCmd() + [hygenDir]
    dirTask = self.addTask(preJoin(taskPrefix, "makeHyGenDir"),
                           makeHyGenDirCmd,
                           dependencies=dependencies,
                           isForceLocal=True)

    # these are used for truthiness only (they hold list lengths, not bools)
    isTumorNormal = (len(self.params.normalBamList) and len(self.params.tumorBamList))
    isTumorOnly = ((not isTumorNormal) and len(self.params.tumorBamList))

    # Setup remote read retrieval for insertions (independent of the bin, so
    # the helper is defined once rather than once per loop iteration):
    def isEnableRemoteReadRetrieval():
        if isTumorOnly or isTumorNormal:
            return self.params.enableRemoteReadRetrievalForInsertionsInCancerCallingModes
        else:
            return self.params.enableRemoteReadRetrievalForInsertionsInGermlineCallingModes

    hygenTasks = set()
    if self.params.isGenerateSupportBam:
        sortBamVcfTasks = set()

    self.candidateVcfPaths = []
    self.diploidVcfPaths = []
    self.somaticVcfPaths = []
    self.tumorVcfPaths = []
    self.rnaVcfPaths = []

    edgeRuntimeLogPaths = []
    edgeStatsLogPaths = []

    for binId in range(self.params.nonlocalWorkBins):
        binStr = str(binId).zfill(4)
        self.candidateVcfPaths.append(self.paths.getHyGenCandidatePath(binStr))
        if isTumorOnly:
            self.tumorVcfPaths.append(self.paths.getHyGenTumorPath(binStr))
        elif self.params.isRNA:
            self.rnaVcfPaths.append(self.paths.getHyGenRnaPath(binStr))
        else:
            self.diploidVcfPaths.append(self.paths.getHyGenDiploidPath(binStr))
            if isTumorNormal:
                self.somaticVcfPaths.append(self.paths.getHyGenSomaticPath(binStr))

        hygenCmd = [self.params.mantaHyGenBin]
        hygenCmd.extend(["--align-stats", statsPath])
        hygenCmd.extend(["--graph-file", graphPath])
        hygenCmd.extend(["--bin-index", str(binId)])
        hygenCmd.extend(["--bin-count", str(self.params.nonlocalWorkBins)])
        hygenCmd.extend(["--max-edge-count", str(self.params.graphNodeMaxEdgeCount)])
        hygenCmd.extend(["--min-candidate-sv-size", self.params.minCandidateVariantSize])
        hygenCmd.extend(["--min-candidate-spanning-count", self.params.minCandidateSpanningCount])
        hygenCmd.extend(["--min-scored-sv-size", self.params.minScoredVariantSize])
        hygenCmd.extend(["--ref", self.params.referenceFasta])
        hygenCmd.extend(["--candidate-output-file", self.candidateVcfPaths[-1]])

        # tumor-only mode
        if isTumorOnly:
            hygenCmd.extend(["--tumor-output-file", self.tumorVcfPaths[-1]])
        elif self.params.isRNA:
            hygenCmd.extend(["--rna-output-file", self.rnaVcfPaths[-1]])
        else:
            hygenCmd.extend(["--diploid-output-file", self.diploidVcfPaths[-1]])
            hygenCmd.extend(["--min-qual-score", self.params.minDiploidVariantScore])
            hygenCmd.extend(["--min-pass-qual-score", self.params.minPassDiploidVariantScore])
            hygenCmd.extend(["--min-pass-gt-score", self.params.minPassDiploidGTScore])
            # tumor/normal mode
            if isTumorNormal:
                hygenCmd.extend(["--somatic-output-file", self.somaticVcfPaths[-1]])
                hygenCmd.extend(["--min-somatic-score", self.params.minSomaticScore])
                hygenCmd.extend(["--min-pass-somatic-score", self.params.minPassSomaticScore])

        if isEnableRemoteReadRetrieval():
            hygenCmd.append("--enable-remote-read-retrieval")

        if self.params.isHighDepthFilter:
            hygenCmd.extend(["--chrom-depth", self.paths.getChromDepth()])

        edgeRuntimeLogPaths.append(self.paths.getHyGenEdgeRuntimeLogPath(binStr))
        hygenCmd.extend(["--edge-runtime-log", edgeRuntimeLogPaths[-1]])

        edgeStatsLogPaths.append(self.paths.getHyGenEdgeStatsPath(binStr))
        hygenCmd.extend(["--edge-stats-log", edgeStatsLogPaths[-1]])

        if self.params.isGenerateSupportBam:
            hygenCmd.extend(["--evidence-bam-stub", self.paths.getSupportBamStub(binStr)])

        for bamPath in self.params.normalBamList:
            hygenCmd.extend(["--align-file", bamPath])
        for bamPath in self.params.tumorBamList:
            hygenCmd.extend(["--tumor-align-file", bamPath])

        if self.params.isIgnoreAnomProperPair:
            hygenCmd.append("--ignore-anom-proper-pair")

        if self.params.useOverlapPairEvidence:
            hygenCmd.append("--use-overlapping-pair")

        if self.params.isRNA:
            hygenCmd.append("--rna")
        if self.params.isUnstrandedRNA:
            hygenCmd.append("--unstranded")

        if self.params.isOutputContig:
            hygenCmd.append("--output-contigs")

        hygenTask = preJoin(taskPrefix, "generateCandidateSV_" + binStr)
        hygenTasks.add(self.addTask(hygenTask, hygenCmd, dependencies=dirTask,
                                    memMb=self.params.hyGenMemMb))

        # TODO: if the bam is large, for efficiency, consider
        # 1) filtering the bin-specific bam first w.r.t. the final candidate vcf
        # 2) then sort the bin-specific bam and merge them
        # This would require moving the filter/sort bam jobs outside the hygen loop
        if self.params.isGenerateSupportBam:
            bamIndex = 0
            # sort supporting bams extracted from normal samples
            bamIndex = sortBams(self, sortBamVcfTasks,
                                taskPrefix=taskPrefix, binStr=binStr,
                                isNormal=True, bamIdx=bamIndex,
                                dependencies=hygenTask)
            # sort supporting bams extracted from tumor samples
            bamIndex = sortBams(self, sortBamVcfTasks,
                                taskPrefix=taskPrefix, binStr=binStr,
                                isNormal=False, bamIdx=bamIndex,
                                dependencies=hygenTask)

    vcfTasks = sortAllVcfs(self, taskPrefix=taskPrefix, dependencies=hygenTasks)
    nextStepWait = copy.deepcopy(hygenTasks)

    if self.params.isGenerateSupportBam:
        # Fold the VCF sort tasks into the set in place. The previous
        # 'sortBamVcfTasks.union(vcfTasks)' returned a new set that was
        # discarded, so the VCF tasks were silently left out of both the merge
        # dependencies and the returned wait set.
        sortBamVcfTasks.update(vcfTasks)

        mergeBamTasks = set()
        bamCount = 0
        # merge supporting bams for each normal sample
        bamCount = mergeSupportBams(self, mergeBamTasks,
                                    taskPrefix=taskPrefix,
                                    isNormal=True, bamIdx=bamCount,
                                    dependencies=sortBamVcfTasks)

        # merge supporting bams for each tumor sample
        bamCount = mergeSupportBams(self, mergeBamTasks,
                                    taskPrefix=taskPrefix,
                                    isNormal=False, bamIdx=bamCount,
                                    dependencies=sortBamVcfTasks)

        nextStepWait = nextStepWait.union(sortBamVcfTasks)
        nextStepWait = nextStepWait.union(mergeBamTasks)

    #
    # sort the edge runtime logs
    #
    logListFile = self.paths.getEdgeRuntimeLogListPath()
    logListTask = preJoin(taskPrefix, "sortEdgeRuntimeLogsInputList")
    self.addWorkflowTask(logListTask,
                         listFileWorkflow(logListFile, edgeRuntimeLogPaths),
                         dependencies=hygenTasks)

    def getEdgeLogSortCmd(logListFile, outPath):
        cmd = [sys.executable, self.params.mantaSortEdgeLogs, "-f", logListFile, "-o", outPath]
        return cmd

    edgeSortCmd = getEdgeLogSortCmd(logListFile, self.paths.getSortedEdgeRuntimeLogPath())
    self.addTask(preJoin(taskPrefix, "sortEdgeRuntimeLogs"), edgeSortCmd,
                 dependencies=logListTask, isForceLocal=True)

    #
    # merge all edge stats
    #
    statsFileList = self.paths.getStatsFileListPath()
    statsListTask = preJoin(taskPrefix, "mergeEdgeStatsInputList")
    self.addWorkflowTask(statsListTask,
                         listFileWorkflow(statsFileList, edgeStatsLogPaths),
                         dependencies=hygenTasks)

    edgeStatsMergeTask = preJoin(taskPrefix, "mergeEdgeStats")
    edgeStatsMergeCmd = [self.params.mantaStatsMergeBin]
    edgeStatsMergeCmd.extend(["--stats-file-list", statsFileList])
    edgeStatsMergeCmd.extend(["--output-file", self.paths.getFinalEdgeStatsPath()])
    edgeStatsMergeCmd.extend(["--report-file", self.paths.getFinalEdgeStatsReportPath()])
    self.addTask(edgeStatsMergeTask, edgeStatsMergeCmd,
                 dependencies=statsListTask, isForceLocal=True)

    if not self.params.isRetainTempFiles:
        # we could delete the temp hygenDir directory here, but it is used for debug so frequently it doesn't seem worth it at present.
        # rmDirCmd = getRmdirCmd() + [hygenDir]
        # rmDirTask=self.addTask(preJoin(taskPrefix,"removeTmpDir"),rmDirCmd,dependencies=TBD_XXX_MANY)
        pass

    return nextStepWait
def runLocusGraph(self, taskPrefix="", dependencies=None):
    """
    Create the full SV locus graph

    A partial graph is built for every genome segment group, the partial
    graphs are merged into the final graph file, the merged graph is checked
    by a separate task, and global graph statistics are written. The
    temporary graph directory is removed after the merge unless
    self.params.isRetainTempFiles is set.

    taskPrefix -- prefix joined onto every task label via preJoin
    dependencies -- upstream dependencies of the directory-creation task

    Returns a set holding only the graph check task.
    Raises Exception when there are no segment groups to build graphs for.
    """
    statsPath = self.paths.getStatsPath()
    graphPath = self.paths.getGraphPath()
    graphStatsPath = self.paths.getGraphStatsPath()
    scratchDir = self.paths.getTmpGraphDir()

    mkdirCmd = getMkdirCmd() + [scratchDir]
    mkdirTask = self.addTask(preJoin(taskPrefix, "makeGraphTmpDir"),
                             mkdirCmd,
                             dependencies=dependencies,
                             isForceLocal=True)

    partialGraphFiles = []
    partialGraphTasks = set()

    for gsegGroup in getGenomeSegmentGroups(getNextGenomeSegment(self.params)):
        assert len(gsegGroup) != 0

        # label the group by its first segment, or by "first_to_last" when it spans several
        groupId = gsegGroup[0].id
        if len(gsegGroup) > 1:
            groupId = groupId + "_to_" + gsegGroup[-1].id

        partialGraphFile = self.paths.getTmpGraphFile(groupId)
        partialGraphFiles.append(partialGraphFile)

        graphCmd = [self.params.mantaGraphBin,
                    "--output-file", partialGraphFile,
                    "--align-stats", statsPath]
        for gseg in gsegGroup:
            graphCmd += ["--region", gseg.bamRegion]
        graphCmd += ["--min-candidate-sv-size", self.params.minCandidateVariantSize,
                     "--min-edge-observations", self.params.minEdgeObservations,
                     "--ref", self.params.referenceFasta]
        for bamPath in self.params.normalBamList:
            graphCmd += ["--align-file", bamPath]
        for bamPath in self.params.tumorBamList:
            graphCmd += ["--tumor-align-file", bamPath]

        if self.params.isHighDepthFilter:
            graphCmd += ["--chrom-depth", self.paths.getChromDepth()]
        if self.params.isIgnoreAnomProperPair:
            graphCmd.append("--ignore-anom-proper-pair")
        if self.params.isRNA:
            graphCmd.append("--rna")

        partialGraphTasks.add(
            self.addTask(preJoin(taskPrefix, "makeLocusGraph_" + groupId),
                         graphCmd,
                         dependencies=mkdirTask,
                         memMb=self.params.estimateMemMb))

    if not partialGraphFiles:
        raise Exception(
            "No SV Locus graphs to create. Possible target region parse error.")

    # the merge step reads its inputs from a list file written by a sub-workflow
    graphListFile = self.paths.getTmpGraphFileListPath()
    graphListLabel = preJoin(taskPrefix, "mergeLocusGraphInputList")
    self.addWorkflowTask(graphListLabel,
                         listFileWorkflow(graphListFile, partialGraphFiles),
                         dependencies=partialGraphTasks)

    mergeCmd = [self.params.mantaGraphMergeBin,
                "--output-file", graphPath,
                "--graph-file-list", graphListFile]
    mergeTask = self.addTask(preJoin(taskPrefix, "mergeLocusGraph"),
                             mergeCmd,
                             dependencies=graphListLabel,
                             memMb=self.params.mergeMemMb)

    # Run a separate process to rigorously check that the final graph is valid, the sv candidate generators will check as well, but
    # this makes the check much more clear:
    checkCmd = [self.params.mantaGraphCheckBin, "--graph-file", graphPath]
    checkTask = self.addTask(preJoin(taskPrefix, "checkLocusGraph"),
                             checkCmd,
                             dependencies=mergeTask,
                             memMb=self.params.mergeMemMb)

    if not self.params.isRetainTempFiles:
        self.addTask(preJoin(taskPrefix, "removeTmpDir"),
                     getRmdirCmd() + [scratchDir],
                     dependencies=mergeTask)

    graphStatsCmd = [self.params.mantaGraphStatsBin, "--global",
                     "--graph-file", graphPath,
                     "--output-file", graphStatsPath]
    self.addTask(preJoin(taskPrefix, "locusGraphStats"),
                 graphStatsCmd,
                 dependencies=mergeTask,
                 memMb=self.params.mergeMemMb)

    # later steps wait on the validated graph only
    return set([checkTask])
def runHyGen(self, taskPrefix="", dependencies=None):
    """
    Run hypothesis generation on each SV locus

    One hypothesis-generation task is added per work bin
    (self.params.nonlocalWorkBins bins in total). The per-bin output VCF
    paths are recorded on self (candidateVcfPaths, diploidVcfPaths,
    somaticVcfPaths, tumorVcfPaths, rnaVcfPaths). After the bin loop the VCFs
    are sorted via sortAllVcfs, the per-bin edge runtime logs are sorted, and
    the per-bin edge stats are merged. When self.params.isGenerateSupportBam
    is set, supporting-evidence BAMs are additionally sorted per bin and
    merged per sample.

    taskPrefix -- prefix joined onto every task label via preJoin
    dependencies -- upstream dependencies of the directory-creation task

    Returns the set of tasks later steps must wait on: the per-bin generation
    tasks, plus (in support-BAM mode) the BAM sort, VCF sort and BAM merge tasks.
    """

    import copy

    statsPath = self.paths.getStatsPath()
    graphPath = self.paths.getGraphPath()
    hygenDir = self.paths.getHyGenDir()

    makeHyGenDirCmd = getMkdirCmd() + [hygenDir]
    dirTask = self.addTask(preJoin(taskPrefix, "makeHyGenDir"),
                           makeHyGenDirCmd,
                           dependencies=dependencies,
                           isForceLocal=True)

    # these are used for truthiness only (they hold list lengths, not bools)
    isTumorNormal = (len(self.params.normalBamList)
                     and len(self.params.tumorBamList))
    isTumorOnly = ((not isTumorNormal) and len(self.params.tumorBamList))

    # Setup remote read retrieval for insertions (independent of the bin, so
    # the helper is defined once rather than once per loop iteration):
    def isEnableRemoteReadRetrieval():
        if isTumorOnly or isTumorNormal:
            return self.params.enableRemoteReadRetrievalForInsertionsInCancerCallingModes
        else:
            return self.params.enableRemoteReadRetrievalForInsertionsInGermlineCallingModes

    hygenTasks = set()
    if self.params.isGenerateSupportBam:
        sortBamVcfTasks = set()

    self.candidateVcfPaths = []
    self.diploidVcfPaths = []
    self.somaticVcfPaths = []
    self.tumorVcfPaths = []
    self.rnaVcfPaths = []

    edgeRuntimeLogPaths = []
    edgeStatsLogPaths = []

    for binId in range(self.params.nonlocalWorkBins):
        binStr = str(binId).zfill(4)
        self.candidateVcfPaths.append(self.paths.getHyGenCandidatePath(binStr))
        if isTumorOnly:
            self.tumorVcfPaths.append(self.paths.getHyGenTumorPath(binStr))
        elif self.params.isRNA:
            self.rnaVcfPaths.append(self.paths.getHyGenRnaPath(binStr))
        else:
            self.diploidVcfPaths.append(self.paths.getHyGenDiploidPath(binStr))
            if isTumorNormal:
                self.somaticVcfPaths.append(
                    self.paths.getHyGenSomaticPath(binStr))

        hygenCmd = [self.params.mantaHyGenBin]
        hygenCmd.extend(["--align-stats", statsPath])
        hygenCmd.extend(["--graph-file", graphPath])
        hygenCmd.extend(["--bin-index", str(binId)])
        hygenCmd.extend(["--bin-count", str(self.params.nonlocalWorkBins)])
        hygenCmd.extend(
            ["--max-edge-count", str(self.params.graphNodeMaxEdgeCount)])
        hygenCmd.extend(
            ["--min-candidate-sv-size", self.params.minCandidateVariantSize])
        hygenCmd.extend([
            "--min-candidate-spanning-count",
            self.params.minCandidateSpanningCount
        ])
        hygenCmd.extend(
            ["--min-scored-sv-size", self.params.minScoredVariantSize])
        hygenCmd.extend(["--ref", self.params.referenceFasta])
        hygenCmd.extend(
            ["--candidate-output-file", self.candidateVcfPaths[-1]])

        # tumor-only mode
        if isTumorOnly:
            hygenCmd.extend(["--tumor-output-file", self.tumorVcfPaths[-1]])
        elif self.params.isRNA:
            hygenCmd.extend(["--rna-output-file", self.rnaVcfPaths[-1]])
        else:
            hygenCmd.extend(
                ["--diploid-output-file", self.diploidVcfPaths[-1]])
            hygenCmd.extend(
                ["--min-qual-score", self.params.minDiploidVariantScore])
            hygenCmd.extend([
                "--min-pass-qual-score",
                self.params.minPassDiploidVariantScore
            ])
            hygenCmd.extend(
                ["--min-pass-gt-score", self.params.minPassDiploidGTScore])
            # tumor/normal mode
            if isTumorNormal:
                hygenCmd.extend(
                    ["--somatic-output-file", self.somaticVcfPaths[-1]])
                hygenCmd.extend(
                    ["--min-somatic-score", self.params.minSomaticScore])
                hygenCmd.extend([
                    "--min-pass-somatic-score",
                    self.params.minPassSomaticScore
                ])

        if isEnableRemoteReadRetrieval():
            hygenCmd.append("--enable-remote-read-retrieval")

        if self.params.isHighDepthFilter:
            hygenCmd.extend(["--chrom-depth", self.paths.getChromDepth()])

        edgeRuntimeLogPaths.append(
            self.paths.getHyGenEdgeRuntimeLogPath(binStr))
        hygenCmd.extend(["--edge-runtime-log", edgeRuntimeLogPaths[-1]])

        edgeStatsLogPaths.append(self.paths.getHyGenEdgeStatsPath(binStr))
        hygenCmd.extend(["--edge-stats-log", edgeStatsLogPaths[-1]])

        if self.params.isGenerateSupportBam:
            hygenCmd.extend(
                ["--evidence-bam-stub", self.paths.getSupportBamStub(binStr)])

        for bamPath in self.params.normalBamList:
            hygenCmd.extend(["--align-file", bamPath])
        for bamPath in self.params.tumorBamList:
            hygenCmd.extend(["--tumor-align-file", bamPath])

        if self.params.isIgnoreAnomProperPair:
            hygenCmd.append("--ignore-anom-proper-pair")

        if self.params.useOverlapPairEvidence:
            hygenCmd.append("--use-overlapping-pair")

        if self.params.isRNA:
            hygenCmd.append("--rna")
        if self.params.isUnstrandedRNA:
            hygenCmd.append("--unstranded")

        if self.params.isOutputContig:
            hygenCmd.append("--output-contigs")

        hygenTask = preJoin(taskPrefix, "generateCandidateSV_" + binStr)
        hygenTasks.add(
            self.addTask(hygenTask,
                         hygenCmd,
                         dependencies=dirTask,
                         memMb=self.params.hyGenMemMb))

        # TODO: if the bam is large, for efficiency, consider
        # 1) filtering the bin-specific bam first w.r.t. the final candidate vcf
        # 2) then sort the bin-specific bam and merge them
        # This would require moving the filter/sort bam jobs outside the hygen loop
        if self.params.isGenerateSupportBam:
            bamIndex = 0
            # sort supporting bams extracted from normal samples
            bamIndex = sortBams(self,
                                sortBamVcfTasks,
                                taskPrefix=taskPrefix,
                                binStr=binStr,
                                isNormal=True,
                                bamIdx=bamIndex,
                                dependencies=hygenTask)
            # sort supporting bams extracted from tumor samples
            bamIndex = sortBams(self,
                                sortBamVcfTasks,
                                taskPrefix=taskPrefix,
                                binStr=binStr,
                                isNormal=False,
                                bamIdx=bamIndex,
                                dependencies=hygenTask)

    vcfTasks = sortAllVcfs(self, taskPrefix=taskPrefix, dependencies=hygenTasks)
    nextStepWait = copy.deepcopy(hygenTasks)

    if self.params.isGenerateSupportBam:
        # Fold the VCF sort tasks into the set in place. The previous
        # 'sortBamVcfTasks.union(vcfTasks)' returned a new set that was
        # discarded, so the VCF tasks were silently left out of both the merge
        # dependencies and the returned wait set.
        sortBamVcfTasks.update(vcfTasks)

        mergeBamTasks = set()
        bamCount = 0
        # merge supporting bams for each normal sample
        bamCount = mergeSupportBams(self,
                                    mergeBamTasks,
                                    taskPrefix=taskPrefix,
                                    isNormal=True,
                                    bamIdx=bamCount,
                                    dependencies=sortBamVcfTasks)

        # merge supporting bams for each tumor sample
        bamCount = mergeSupportBams(self,
                                    mergeBamTasks,
                                    taskPrefix=taskPrefix,
                                    isNormal=False,
                                    bamIdx=bamCount,
                                    dependencies=sortBamVcfTasks)

        nextStepWait = nextStepWait.union(sortBamVcfTasks)
        nextStepWait = nextStepWait.union(mergeBamTasks)

    #
    # sort the edge runtime logs
    #
    logListFile = self.paths.getEdgeRuntimeLogListPath()
    logListTask = preJoin(taskPrefix, "sortEdgeRuntimeLogsInputList")
    self.addWorkflowTask(logListTask,
                         listFileWorkflow(logListFile, edgeRuntimeLogPaths),
                         dependencies=hygenTasks)

    def getEdgeLogSortCmd(logListFile, outPath):
        cmd = [
            sys.executable, self.params.mantaSortEdgeLogs, "-f", logListFile,
            "-o", outPath
        ]
        return cmd

    edgeSortCmd = getEdgeLogSortCmd(logListFile,
                                    self.paths.getSortedEdgeRuntimeLogPath())
    self.addTask(preJoin(taskPrefix, "sortEdgeRuntimeLogs"),
                 edgeSortCmd,
                 dependencies=logListTask,
                 isForceLocal=True)

    #
    # merge all edge stats
    #
    statsFileList = self.paths.getStatsFileListPath()
    statsListTask = preJoin(taskPrefix, "mergeEdgeStatsInputList")
    self.addWorkflowTask(statsListTask,
                         listFileWorkflow(statsFileList, edgeStatsLogPaths),
                         dependencies=hygenTasks)

    edgeStatsMergeTask = preJoin(taskPrefix, "mergeEdgeStats")
    edgeStatsMergeCmd = [self.params.mantaStatsMergeBin]
    edgeStatsMergeCmd.extend(["--stats-file-list", statsFileList])
    edgeStatsMergeCmd.extend(
        ["--output-file", self.paths.getFinalEdgeStatsPath()])
    edgeStatsMergeCmd.extend(
        ["--report-file", self.paths.getFinalEdgeStatsReportPath()])
    self.addTask(edgeStatsMergeTask,
                 edgeStatsMergeCmd,
                 dependencies=statsListTask,
                 isForceLocal=True)

    if not self.params.isRetainTempFiles:
        # we could delete the temp hygenDir directory here, but it is used for debug so frequently it doesn't seem worth it at present.
        # rmDirCmd = getRmdirCmd() + [hygenDir]
        # rmDirTask=self.addTask(preJoin(taskPrefix,"removeTmpDir"),rmDirCmd,dependencies=TBD_XXX_MANY)
        pass

    return nextStepWait
def runLocusGraph(self, taskPrefix="", dependencies=None):
    """
    Create the full SV locus graph

    Adds one graph-building task per genome segment group, writing partial
    graphs into a temporary directory. The partial graphs are then merged
    into the final graph, which is validated by a dedicated check task and
    summarized by a graph-stats task. Unless self.params.isRetainTempFiles
    is set, the temporary directory is removed once the merge has finished.

    taskPrefix -- prefix joined onto every task label via preJoin
    dependencies -- upstream dependencies of the directory-creation task

    Returns a one-element set containing the graph check task.
    Raises Exception when no partial graphs were scheduled.
    """
    alignStatsPath = self.paths.getStatsPath()
    finalGraphPath = self.paths.getGraphPath()
    finalGraphStatsPath = self.paths.getGraphStatsPath()
    tmpDir = self.paths.getTmpGraphDir()

    makeTmpDirCmd = getMkdirCmd() + [tmpDir]
    makeTmpDirTask = self.addTask(preJoin(taskPrefix, "makeGraphTmpDir"),
                                  makeTmpDirCmd,
                                  dependencies=dependencies,
                                  isForceLocal=True)

    segmentGraphFiles = []
    segmentGraphTasks = set()

    segmentGroups = getGenomeSegmentGroups(getNextGenomeSegment(self.params))
    for gsegGroup in segmentGroups:
        assert len(gsegGroup) != 0

        # single-segment groups use that segment's id; larger groups use "first_to_last"
        gid = gsegGroup[0].id
        if len(gsegGroup) > 1:
            gid = gid + "_to_" + gsegGroup[-1].id

        segmentGraphFiles.append(self.paths.getTmpGraphFile(gid))

        cmd = [self.params.mantaGraphBin]
        cmd += ["--output-file", segmentGraphFiles[-1]]
        cmd += ["--align-stats", alignStatsPath]
        for gseg in gsegGroup:
            cmd += ["--region", gseg.bamRegion]
        cmd += ["--min-candidate-sv-size", self.params.minCandidateVariantSize]
        cmd += ["--min-edge-observations", self.params.minEdgeObservations]
        cmd += ["--ref", self.params.referenceFasta]
        for bamPath in self.params.normalBamList:
            cmd += ["--align-file", bamPath]
        for bamPath in self.params.tumorBamList:
            cmd += ["--tumor-align-file", bamPath]

        if self.params.isHighDepthFilter:
            cmd += ["--chrom-depth", self.paths.getChromDepth()]

        if self.params.isIgnoreAnomProperPair:
            cmd += ["--ignore-anom-proper-pair"]
        if self.params.isRNA:
            cmd += ["--rna"]

        segmentLabel = preJoin(taskPrefix, "makeLocusGraph_" + gid)
        segmentGraphTasks.add(
            self.addTask(segmentLabel,
                         cmd,
                         dependencies=makeTmpDirTask,
                         memMb=self.params.estimateMemMb))

    if not segmentGraphFiles:
        raise Exception(
            "No SV Locus graphs to create. Possible target region parse error.")

    # write the list of partial graphs to a file consumed by the merge command
    listPath = self.paths.getTmpGraphFileListPath()
    listTaskLabel = preJoin(taskPrefix, "mergeLocusGraphInputList")
    self.addWorkflowTask(listTaskLabel,
                         listFileWorkflow(listPath, segmentGraphFiles),
                         dependencies=segmentGraphTasks)

    mergeCmd = [self.params.mantaGraphMergeBin]
    mergeCmd += ["--output-file", finalGraphPath]
    mergeCmd += ["--graph-file-list", listPath]
    mergeTask = self.addTask(preJoin(taskPrefix, "mergeLocusGraph"),
                             mergeCmd,
                             dependencies=listTaskLabel,
                             memMb=self.params.mergeMemMb)

    # Run a separate process to rigorously check that the final graph is valid, the sv candidate generators will check as well, but
    # this makes the check much more clear:
    checkCmd = [self.params.mantaGraphCheckBin]
    checkCmd += ["--graph-file", finalGraphPath]
    checkTask = self.addTask(preJoin(taskPrefix, "checkLocusGraph"),
                             checkCmd,
                             dependencies=mergeTask,
                             memMb=self.params.mergeMemMb)

    if not self.params.isRetainTempFiles:
        removeTmpDirCmd = getRmdirCmd() + [tmpDir]
        self.addTask(preJoin(taskPrefix, "removeTmpDir"),
                     removeTmpDirCmd,
                     dependencies=mergeTask)

    statsCmd = [self.params.mantaGraphStatsBin, "--global"]
    statsCmd += ["--graph-file", finalGraphPath]
    statsCmd += ["--output-file", finalGraphStatsPath]
    self.addTask(preJoin(taskPrefix, "locusGraphStats"),
                 statsCmd,
                 dependencies=mergeTask,
                 memMb=self.params.mergeMemMb)

    waitOn = set()
    waitOn.add(checkTask)
    return waitOn
def runHyGen(self, taskPrefix="", dependencies=None) :
    """
    Run hypothesis generation on each SV locus

    One hypothesis generation task is scheduled per work bin. The per-bin
    VCF outputs are then sorted, compressed and indexed, small indels are
    extracted from the sorted candidates, the edge runtime logs are sorted
    and the edge stats are merged.

    taskPrefix -- prefix combined with each task label via preJoin()
    dependencies -- dependencies of the output directory creation task

    Returns the set of tasks the next step waits on: the hypothesis
    generation tasks plus every tabix task scheduled here.
    """

    import copy

    def taskLabel(name):
        return preJoin(taskPrefix, name)

    statsPath = self.paths.getStatsPath()
    graphPath = self.paths.getGraphPath()
    hygenDir = self.paths.getHyGenDir()

    mkdirCmd = getMkdirCmd() + [hygenDir]
    dirTask = self.addTask(taskLabel("makeHyGenDir"), mkdirCmd,
                           dependencies=dependencies, isForceLocal=True)

    # both values are only ever used for their truthiness
    normalBamCount = len(self.params.normalBamList)
    tumorBamCount = len(self.params.tumorBamList)
    isSomatic = (normalBamCount and tumorBamCount)
    isTumorOnly = ((not isSomatic) and tumorBamCount)

    if self.getRunMode() == "sge":
        hyGenMemMb = self.params.hyGenSGEMemMb
    else:
        hyGenMemMb = self.params.hyGenLocalMemMb

    hygenTasks = set()

    # per-bin output files, collected for the sort/merge steps below
    candidateVcfPaths = []
    diploidVcfPaths = []
    somaticVcfPaths = []
    tumorVcfPaths = []
    edgeRuntimeLogPaths = []
    edgeStatsLogPaths = []

    binCount = self.params.nonlocalWorkBins
    for binId in range(binCount):
        binStr = str(binId).zfill(4)

        candidatePath = self.paths.getHyGenCandidatePath(binStr)
        candidateVcfPaths.append(candidatePath)

        # register the outputs of the active analysis mode and collect the
        # matching command-line arguments in one place
        if isTumorOnly:
            # tumor-only mode
            tumorPath = self.paths.getHyGenTumorPath(binStr)
            tumorVcfPaths.append(tumorPath)
            modeArgs = ["--tumor-output-file", tumorPath]
        else:
            diploidPath = self.paths.getHyGenDiploidPath(binStr)
            diploidVcfPaths.append(diploidPath)
            modeArgs = ["--diploid-output-file", diploidPath,
                        "--min-qual-score", self.params.minDiploidVariantScore,
                        "--min-pass-qual-score", self.params.minPassDiploidVariantScore,
                        "--min-pass-gt-score", self.params.minPassDiploidGTScore]
            if isSomatic:
                # tumor/normal mode
                somaticPath = self.paths.getHyGenSomaticPath(binStr)
                somaticVcfPaths.append(somaticPath)
                modeArgs += ["--somatic-output-file", somaticPath,
                             "--min-somatic-score", self.params.minSomaticScore,
                             "--min-pass-somatic-score", self.params.minPassSomaticScore]

                # temporary fix for FFPE:
                # NOTE(review): nesting reconstructed from a flattened
                # source - confirm this flag belongs to tumor/normal mode only
                modeArgs.append("--skip-remote-reads")

        hygenCmd = [self.params.mantaHyGenBin,
                    "--align-stats", statsPath,
                    "--graph-file", graphPath,
                    "--bin-index", str(binId),
                    "--bin-count", str(binCount),
                    "--min-candidate-sv-size", self.params.minCandidateVariantSize,
                    "--min-candidate-spanning-count", self.params.minCandidateSpanningCount,
                    "--min-scored-sv-size", self.params.minScoredVariantSize,
                    "--ref", self.params.referenceFasta,
                    "--candidate-output-file", candidatePath]
        hygenCmd += modeArgs

        if self.params.isHighDepthFilter:
            hygenCmd += ["--chrom-depth", self.paths.getChromDepth()]

        edgeRuntimeLogPath = self.paths.getHyGenEdgeRuntimeLogPath(binStr)
        edgeRuntimeLogPaths.append(edgeRuntimeLogPath)
        hygenCmd += ["--edge-runtime-log", edgeRuntimeLogPath]

        edgeStatsLogPath = self.paths.getHyGenEdgeStatsPath(binStr)
        edgeStatsLogPaths.append(edgeStatsLogPath)
        hygenCmd += ["--edge-stats-log", edgeStatsLogPath]

        for bamPath in self.params.normalBamList:
            hygenCmd += ["--align-file", bamPath]
        for bamPath in self.params.tumorBamList:
            hygenCmd += ["--tumor-align-file", bamPath]

        # argument-free switches which are only appended when enabled
        optionalFlags = (
            (self.params.isIgnoreAnomProperPair, "--ignore-anom-proper-pair"),
            (self.params.isRNA, "--rna"),
            (self.params.isUnstrandedRNA, "--unstranded"),
        )
        for (isEnabled, flag) in optionalFlags:
            if isEnabled:
                hygenCmd.append(flag)

        hygenTaskLabel = taskLabel("generateCandidateSV_" + binStr)
        hygenTasks.add(self.addTask(hygenTaskLabel, hygenCmd,
                                    dependencies=dirTask, memMb=hyGenMemMb))

    # tabix tasks are added to this copy below, hygenTasks itself stays
    # untouched because it is still used as a dependency set
    nextStepWait = copy.deepcopy(hygenTasks)

    def getVcfTabixCmd(vcfPath):
        return [self.params.tabixBin, "-f", "-p", "vcf", vcfPath]

    def getVcfSortCmd(vcfPaths, outPath, isDiploid):
        # the shell command is assembled from fragments and joined at the end
        fragments = [
            "\"%s\" -E \"%s\" -u " % (sys.executable, self.params.mantaSortVcf),
            " ".join(quoteStringList(vcfPaths)),
        ]

        # apply the ploidy filter to diploid variants: the sorted output is
        # written to a temp file which the filter then reads
        if isDiploid:
            tempVcf = self.paths.getTempDiploidPath()
            fragments.append(" > \"%s\"" % (tempVcf))
            fragments.append(" && \"%s\" -E \"%s\" \"%s\"" % (sys.executable, self.params.mantaPloidyFilter, tempVcf))

        fragments.append(" | \"%s\" -c > \"%s\"" % (self.params.bgzipBin, outPath))

        # the temp file is removed once the compressed output is written
        if isDiploid:
            fragments.append(" && " + " ".join(getRmCmd()) + " \"%s\"" % (self.paths.getTempDiploidPath()))

        return "".join(fragments)

    def sortVcfs(pathList, outPath, sortName, isDiploid=False):
        if not pathList:
            return set()

        # make header modifications to first vcf in list of files to be
        # sorted: the file is rewritten into a temp file which then
        # replaces the original
        # NOTE(review): the config command line is embedded in double quotes
        # without escaping - confirm it cannot contain shell metacharacters
        headerFixTask = taskLabel("fixVcfHeader_" + sortName)
        firstVcf = pathList[0]
        reheaderTmp = firstVcf + ".reheader.tmp"
        headerFixCmd = "\"%s\" -E \"%s\"" % (sys.executable, self.params.vcfCmdlineSwapper)
        headerFixCmd += ' "' + " ".join(self.params.configCommandLine) + '"'
        headerFixCmd += " < \"%s\" > \"%s\"" % (firstVcf, reheaderTmp)
        headerFixCmd += " && " + " ".join(getMvCmd()) + " \"%s\" \"%s\"" % (reheaderTmp, firstVcf)
        self.addTask(headerFixTask, headerFixCmd,
                     dependencies=hygenTasks, isForceLocal=True)

        sortCmd = getVcfSortCmd(pathList, outPath, isDiploid)
        sortTask = self.addTask(taskLabel("sort_" + sortName), sortCmd,
                                dependencies=headerFixTask)
        tabixTask = self.addTask(taskLabel("tabix_" + sortName),
                                 getVcfTabixCmd(outPath),
                                 dependencies=sortTask, isForceLocal=True)
        nextStepWait.add(tabixTask)
        return sortTask

    candSortTask = sortVcfs(candidateVcfPaths, self.paths.getSortedCandidatePath(), "sortCandidateSV")

    # the remaining outputs are sorted the same way; their path lists are
    # empty (and skipped by sortVcfs) for modes which do not produce them
    remainingSorts = (
        (diploidVcfPaths, self.paths.getSortedDiploidPath, "sortDiploidSV", True),
        (somaticVcfPaths, self.paths.getSortedSomaticPath, "sortSomaticSV", False),
        (tumorVcfPaths, self.paths.getSortedTumorPath, "sortTumorSV", False),
    )
    for (vcfPaths, getSortedPath, sortName, isDiploid) in remainingSorts:
        sortVcfs(vcfPaths, getSortedPath(), sortName, isDiploid=isDiploid)

    # extract small indels from the sorted candidates; variants up to one
    # below the minimum scored size are requested, and nothing is scheduled
    # when that limit falls below 1
    smallInPath = self.paths.getSortedCandidatePath()
    smallOutPath = self.paths.getSortedCandidateSmallIndelsPath()
    maxSize = int(self.params.minScoredVariantSize) - 1
    if maxSize >= 1:
        smallCmd = "\"%s\" -dc \"%s\"" % (self.params.bgzipBin, smallInPath)
        smallCmd += " | \"%s\" -E \"%s\" --maxSize %i" % (sys.executable, self.params.mantaExtraSmallVcf, maxSize)
        smallCmd += " | \"%s\" -c > \"%s\"" % (self.params.bgzipBin, smallOutPath)
        smallTask = self.addTask(taskLabel("extractSmallIndels"), smallCmd,
                                 dependencies=candSortTask, isForceLocal=True)
        nextStepWait.add(self.addTask(smallTask + "_tabix",
                                      getVcfTabixCmd(smallOutPath),
                                      dependencies=smallTask,
                                      isForceLocal=True))

    # sort edge logs:
    edgeSortLabel = taskLabel("sortEdgeRuntimeLogs")
    edgeSortCmd = [sys.executable, "-E", self.params.mantaSortEdgeLogs,
                   "-o", self.paths.getSortedEdgeRuntimeLogPath()] + edgeRuntimeLogPaths
    self.addTask(edgeSortLabel, edgeSortCmd,
                 dependencies=hygenTasks, isForceLocal=True)

    # merge edge stats:
    edgeStatsMergeLabel = taskLabel("mergeEdgeStats")
    edgeStatsMergeCmd = [self.params.mantaStatsMergeBin]
    for statsFile in edgeStatsLogPaths:
        edgeStatsMergeCmd += ["--stats-file", statsFile]
    edgeStatsMergeCmd += ["--output-file", self.paths.getFinalEdgeStatsPath(),
                          "--report-file", self.paths.getFinalEdgeStatsReportPath()]
    self.addTask(edgeStatsMergeLabel, edgeStatsMergeCmd,
                 dependencies=hygenTasks, isForceLocal=True)

    return nextStepWait
def runLocusGraph(self,taskPrefix="",dependencies=None):
    """
    Create the full SV locus graph

    A partial graph is built for every group of genome segments inside a
    temporary directory, the partial graphs are merged into the final graph
    file, and the merged graph is then checked and summarized by separate
    tasks. The temporary directory is removed once the merge has finished.

    taskPrefix -- prefix combined with each task label via preJoin()
    dependencies -- dependencies of the temporary directory creation task

    Returns a set containing only the graph check task.
    Raises Exception if no genome segment group was produced.
    """

    statsPath=self.paths.getStatsPath()
    graphPath=self.paths.getGraphPath()
    graphStatsPath=self.paths.getGraphStatsPath()

    # partial graphs live in "<graph file name>.tmpdir" under the work dir
    graphFilename=os.path.basename(graphPath)
    tmpGraphDir=os.path.join(self.params.workDir,graphFilename+".tmpdir")

    makeTmpDirCmd = getMkdirCmd() + [tmpGraphDir]
    dirTask=self.addTask(preJoin(taskPrefix,"makeTmpDir"), makeTmpDirCmd, dependencies=dependencies, isForceLocal=True)

    tmpGraphFiles = []
    graphTasks = set()

    def getGenomeSegmentGroups(params, minSegmentGroupSize=200000) :
        """
        Iterate segment groups and 'clump' small contigs together

        Consecutive segments are collected into one group for as long as
        the group's cumulative size stays within minSegmentGroupSize. A
        segment which would exceed that limit closes the current group and
        starts the next one, so no empty group is ever yielded.
        """
        group = []
        headSize = 0
        # the segments are taken from the params argument rather than from
        # the enclosing self.params, so the argument is actually honored
        for gseg in getNextGenomeSegment(params) :
            gsegSize = gseg.size()
            if headSize+gsegSize <= minSegmentGroupSize :
                group.append(gseg)
                headSize += gsegSize
            else :
                if group :
                    yield group
                group = [gseg]
                headSize = gsegSize
        if group :
            yield group

    for gsegGroup in getGenomeSegmentGroups(self.params) :
        assert(len(gsegGroup) != 0)

        # a group is named after its first segment, or "first_to_last" when
        # it holds more than one segment
        gid=gsegGroup[0].id
        if len(gsegGroup) > 1 :
            gid += "_to_"+gsegGroup[-1].id

        tmpGraphFiles.append(os.path.join(tmpGraphDir,graphFilename+"."+gid+".bin"))

        graphCmd = [ self.params.mantaGraphBin ]
        graphCmd.extend(["--output-file", tmpGraphFiles[-1]])
        graphCmd.extend(["--align-stats",statsPath])
        for gseg in gsegGroup :
            graphCmd.extend(["--region",gseg.bamRegion])
        graphCmd.extend(["--min-candidate-sv-size", self.params.minCandidateVariantSize])
        graphCmd.extend(["--min-edge-observations", self.params.minEdgeObservations])
        graphCmd.extend(["--ref",self.params.referenceFasta])
        for bamPath in self.params.normalBamList :
            graphCmd.extend(["--align-file",bamPath])
        for bamPath in self.params.tumorBamList :
            graphCmd.extend(["--tumor-align-file",bamPath])

        if self.params.isHighDepthFilter :
            graphCmd.extend(["--chrom-depth", self.paths.getChromDepth()])

        if self.params.isIgnoreAnomProperPair :
            graphCmd.append("--ignore-anom-proper-pair")
        if self.params.isRNA :
            graphCmd.append("--rna")

        graphTaskLabel=preJoin(taskPrefix,"makeLocusGraph_"+gid)
        graphTasks.add(self.addTask(graphTaskLabel,graphCmd,dependencies=dirTask,memMb=self.params.estimateMemMb))

    if not tmpGraphFiles :
        raise Exception("No SV Locus graphs to create. Possible target region parse error.")

    # every partial graph is handed to the merge as its own argument
    mergeCmd = [ self.params.mantaGraphMergeBin ]
    mergeCmd.extend(["--output-file", graphPath])
    for gfile in tmpGraphFiles :
        mergeCmd.extend(["--graph-file", gfile])

    mergeTask = self.addTask(preJoin(taskPrefix,"mergeLocusGraph"),mergeCmd,dependencies=graphTasks,memMb=self.params.mergeMemMb)

    # Run a separate process to rigorously check that the final graph is valid, the sv candidate generators will check as well, but
    # this makes the check much more clear:
    checkCmd = [ self.params.mantaGraphCheckBin ]
    checkCmd.extend(["--graph-file", graphPath])
    checkTask = self.addTask(preJoin(taskPrefix,"checkLocusGraph"),checkCmd,dependencies=mergeTask,memMb=self.params.mergeMemMb)

    # cleanup only waits for the merge: the check and stats commands are
    # given the merged graph file only
    rmGraphTmpCmd = getRmdirCmd() + [tmpGraphDir]
    self.addTask(preJoin(taskPrefix,"rmTmpDir"),rmGraphTmpCmd,dependencies=mergeTask)

    graphStatsCmd = [self.params.mantaGraphStatsBin,"--global"]
    graphStatsCmd.extend(["--graph-file",graphPath])
    graphStatsCmd.extend(["--output-file",graphStatsPath])

    self.addTask(preJoin(taskPrefix,"locusGraphStats"),graphStatsCmd,dependencies=mergeTask,memMb=self.params.mergeMemMb)

    # only the check task is handed to the next step
    nextStepWait = set()
    nextStepWait.add(checkTask)

    return nextStepWait
def runHyGen(self, taskPrefix="", dependencies=None) :
    """
    Run hypothesis generation on each SV locus

    One hypothesis generation task is scheduled per work bin. Afterwards
    the VCF sorting is delegated to sortAllVcfs(), and the per-bin edge
    runtime logs and edge stats are sorted/merged through list files.

    The per-bin VCF output paths are stored on self (candidateVcfPaths,
    diploidVcfPaths, somaticVcfPaths, tumorVcfPaths).

    taskPrefix -- prefix combined with each task label via preJoin()
    dependencies -- dependencies of the output directory creation task

    Returns a deep copy of the set of hypothesis generation tasks.
    """

    import copy

    def taskLabel(name):
        return preJoin(taskPrefix, name)

    statsPath = self.paths.getStatsPath()
    graphPath = self.paths.getGraphPath()
    hygenDir = self.paths.getHyGenDir()

    mkdirCmd = getMkdirCmd() + [hygenDir]
    dirTask = self.addTask(taskLabel("makeHyGenDir"), mkdirCmd,
                           dependencies=dependencies, isForceLocal=True)

    # both values are only ever used for their truthiness
    normalBamCount = len(self.params.normalBamList)
    tumorBamCount = len(self.params.tumorBamList)
    isSomatic = (normalBamCount and tumorBamCount)
    isTumorOnly = ((not isSomatic) and tumorBamCount)

    if self.getRunMode() == "sge":
        hyGenMemMb = self.params.hyGenSGEMemMb
    else:
        hyGenMemMb = self.params.hyGenLocalMemMb

    hygenTasks = set()

    # per-bin VCF outputs are kept on the instance
    # NOTE(review): presumably read by sortAllVcfs() - confirm
    self.candidateVcfPaths = []
    self.diploidVcfPaths = []
    self.somaticVcfPaths = []
    self.tumorVcfPaths = []

    edgeRuntimeLogPaths = []
    edgeStatsLogPaths = []

    binCount = self.params.nonlocalWorkBins
    for binId in range(binCount):
        binStr = str(binId).zfill(4)

        candidatePath = self.paths.getHyGenCandidatePath(binStr)
        self.candidateVcfPaths.append(candidatePath)

        # register the outputs of the active analysis mode and collect the
        # matching command-line arguments in one place
        if isTumorOnly:
            # tumor-only mode
            tumorPath = self.paths.getHyGenTumorPath(binStr)
            self.tumorVcfPaths.append(tumorPath)
            modeArgs = ["--tumor-output-file", tumorPath]
        else:
            diploidPath = self.paths.getHyGenDiploidPath(binStr)
            self.diploidVcfPaths.append(diploidPath)
            modeArgs = ["--diploid-output-file", diploidPath,
                        "--min-qual-score", self.params.minDiploidVariantScore,
                        "--min-pass-qual-score", self.params.minPassDiploidVariantScore,
                        "--min-pass-gt-score", self.params.minPassDiploidGTScore]
            if isSomatic:
                # tumor/normal mode
                somaticPath = self.paths.getHyGenSomaticPath(binStr)
                self.somaticVcfPaths.append(somaticPath)
                modeArgs += ["--somatic-output-file", somaticPath,
                             "--min-somatic-score", self.params.minSomaticScore,
                             "--min-pass-somatic-score", self.params.minPassSomaticScore]

                # temporary fix for FFPE:
                # NOTE(review): nesting reconstructed from a flattened
                # source - confirm this flag belongs to tumor/normal mode only
                modeArgs.append("--skip-remote-reads")

        hygenCmd = [self.params.mantaHyGenBin,
                    "--align-stats", statsPath,
                    "--graph-file", graphPath,
                    "--bin-index", str(binId),
                    "--bin-count", str(binCount),
                    "--min-candidate-sv-size", self.params.minCandidateVariantSize,
                    "--min-candidate-spanning-count", self.params.minCandidateSpanningCount,
                    "--min-scored-sv-size", self.params.minScoredVariantSize,
                    "--ref", self.params.referenceFasta,
                    "--candidate-output-file", candidatePath]
        hygenCmd += modeArgs

        if self.params.isHighDepthFilter:
            hygenCmd += ["--chrom-depth", self.paths.getChromDepth()]

        edgeRuntimeLogPath = self.paths.getHyGenEdgeRuntimeLogPath(binStr)
        edgeRuntimeLogPaths.append(edgeRuntimeLogPath)
        hygenCmd += ["--edge-runtime-log", edgeRuntimeLogPath]

        edgeStatsLogPath = self.paths.getHyGenEdgeStatsPath(binStr)
        edgeStatsLogPaths.append(edgeStatsLogPath)
        hygenCmd += ["--edge-stats-log", edgeStatsLogPath]

        for bamPath in self.params.normalBamList:
            hygenCmd += ["--align-file", bamPath]
        for bamPath in self.params.tumorBamList:
            hygenCmd += ["--tumor-align-file", bamPath]

        # argument-free switches which are only appended when enabled
        optionalFlags = (
            (self.params.isIgnoreAnomProperPair, "--ignore-anom-proper-pair"),
            (self.params.isRNA, "--rna"),
            (self.params.isUnstrandedRNA, "--unstranded"),
        )
        for (isEnabled, flag) in optionalFlags:
            if isEnabled:
                hygenCmd.append(flag)

        hygenTask = taskLabel("generateCandidateSV_" + binStr)
        hygenTasks.add(self.addTask(hygenTask, hygenCmd,
                                    dependencies=dirTask, memMb=hyGenMemMb))

    nextStepWait = copy.deepcopy(hygenTasks)

    # NOTE(review): the return value of sortAllVcfs() is discarded, so the
    # next step only waits on the generation tasks - confirm this is intended
    sortAllVcfs(self, taskPrefix=taskPrefix, dependencies=hygenTasks)

    #
    # sort the edge runtime logs
    #
    # the per-bin log paths reach the sort script through a list file
    logListFile = self.paths.getEdgeRuntimeLogListPath()
    logListTask = taskLabel("sortEdgeRuntimeLogsInputList")
    self.addWorkflowTask(logListTask,
                         listFileWorkflow(logListFile, edgeRuntimeLogPaths),
                         dependencies=hygenTasks)

    sortedEdgeLogPath = self.paths.getSortedEdgeRuntimeLogPath()
    edgeSortCmd = [sys.executable, "-E", self.params.mantaSortEdgeLogs,
                   "-f", logListFile, "-o", sortedEdgeLogPath]
    self.addTask(taskLabel("sortEdgeRuntimeLogs"), edgeSortCmd,
                 dependencies=logListTask, isForceLocal=True)

    #
    # merge all edge stats
    #
    # same list file approach for the per-bin stats files
    statsFileList = self.paths.getStatsFileListPath()
    statsListTask = taskLabel("mergeEdgeStatsInputList")
    self.addWorkflowTask(statsListTask,
                         listFileWorkflow(statsFileList, edgeStatsLogPaths),
                         dependencies=hygenTasks)

    edgeStatsMergeTask = taskLabel("mergeEdgeStats")
    edgeStatsMergeCmd = [self.params.mantaStatsMergeBin,
                         "--stats-file-list", statsFileList,
                         "--output-file", self.paths.getFinalEdgeStatsPath(),
                         "--report-file", self.paths.getFinalEdgeStatsReportPath()]
    self.addTask(edgeStatsMergeTask, edgeStatsMergeCmd,
                 dependencies=statsListTask, isForceLocal=True)

    if not self.params.isRetainTempFiles:
        # The temp hygenDir directory could be deleted here, but it is used
        # for debugging so frequently that removing it does not seem worth
        # it at present, so no cleanup task is scheduled.
        pass

    return nextStepWait