def _getDepthShared(self,taskPrefix, dependencies, bamList, outputPath, depthFunc) :
    """
    Estimate chromosome depth for each bam in bamList and merge the results into outputPath

    depthFunc is called once per bam as depthFunc(self, label, dependencies, bamFile, outFile);
    whatever it returns is converted with setzer() and becomes a dependency of the merge task.

    Returns a set containing the merge task.
    """
    baseName = os.path.basename(outputPath)
    scratchDir = outputPath + ".tmpdir"

    mkdirCmd = getMkdirCmd() + [scratchDir]
    mkdirTask = self.addTask(preJoin(taskPrefix, "makeTmpDir"), mkdirCmd,
                             dependencies=dependencies, isForceLocal=True)

    sampleFiles = []
    sampleTasks = set()
    for (sampleNum, bamFile) in enumerate(bamList) :
        sampleTag = str(sampleNum).zfill(3)
        sampleFile = os.path.join(scratchDir, baseName + "." + sampleTag + ".txt")
        sampleFiles.append(sampleFile)
        sampleTasks |= setzer(depthFunc(self, taskPrefix + "_sample" + sampleTag, mkdirTask, bamFile, sampleFile))

    mergeCmd = [ self.params.mergeChromDepth, "--out", outputPath ]
    for sampleFile in sampleFiles :
        mergeCmd += ["--in", sampleFile]

    mergeTask = self.addTask(preJoin(taskPrefix, "mergeChromDepth"), mergeCmd,
                             dependencies=sampleTasks, isForceLocal=True)

    if not self.params.isRetainTempFiles :
        # cleanup of the scratch directory waits on the merge task
        self.addTask(preJoin(taskPrefix, "removeTmpDir"), getRmdirCmd() + [scratchDir],
                     dependencies=mergeTask, isForceLocal=True)

    return set([mergeTask])
def mergeSupportBams(self, mergeBamTasks, taskPrefix="", isNormal=True, bamIdx=0, dependencies=None) :
    """
    Add a merge task followed by an index task for the evidence bam of each sample

    The normal bam list is processed when isNormal is true, the tumor bam list otherwise.
    Every task returned by addTask is added to mergeBamTasks. bamIdx is the index used
    for the first sample; the index following the last processed sample is returned.
    """
    if isNormal :
        samplePaths = self.params.normalBamList
    else :
        samplePaths = self.params.tumorBamList

    for samplePath in samplePaths :
        finalBam = self.paths.getFinalSupportBamPath(samplePath, bamIdx)

        # merge support bams
        mergeArgs = [ sys.executable,
                      self.params.mantaMergeBam,
                      self.params.samtoolsBin,
                      self.paths.getSortedSupportBamMask(bamIdx),
                      finalBam,
                      self.paths.getSupportBamListPath(bamIdx) ]
        mergeLabel = preJoin(taskPrefix, "merge_evidenceBam_%s" % (bamIdx))
        mergeTask = self.addTask(mergeLabel, mergeArgs, dependencies=dependencies)
        mergeBamTasks.add(mergeTask)

        # index the filtered bam
        ### TODO still needs to handle the case where the final support bam does not exist
        indexArgs = [ self.params.samtoolsBin, "index", finalBam ]
        indexLabel = preJoin(taskPrefix, "index_evidenceBam_%s" % (bamIdx))
        indexTask = self.addTask(indexLabel, indexArgs, dependencies=mergeTask)
        mergeBamTasks.add(indexTask)

        bamIdx += 1

    return bamIdx
def runLocusGraph(self,taskPrefix="",dependencies=None):
    """
    Create the full SV locus graph

    One graph task is added per genome segment. The per-segment graph files are merged
    into the final graph, which is then validated by a separate check task. A graph
    statistics task is also added after the merge.

    Returns a set containing the graph check task.
    """
    alignStatsFile = self.paths.getStatsPath()
    finalGraph = self.paths.getGraphPath()
    finalGraphStats = self.paths.getGraphStatsPath()

    graphBase = os.path.basename(finalGraph)
    segmentDir = os.path.join(self.params.workDir, graphBase + ".tmpdir")

    mkdirTask = self.addTask(preJoin(taskPrefix, "makeTmpDir"), "mkdir -p " + segmentDir,
                             dependencies=dependencies, isForceLocal=True)

    segmentGraphs = []
    segmentTasks = set()
    for gseg in getNextGenomeSegment(self.params) :
        segmentGraph = os.path.join(segmentDir, graphBase + "." + gseg.id + ".bin")
        segmentGraphs.append(segmentGraph)

        segmentCmd = [ self.params.mantaGraphBin,
                       "--output-file", segmentGraph,
                       "--align-stats", alignStatsFile,
                       "--region", gseg.bamRegion,
                       "--min-candidate-sv-size", self.params.minCandidateVariantSize ]
        for normalBam in self.params.normalBamList :
            segmentCmd += ["--align-file", normalBam]
        for tumorBam in self.params.tumorBamList :
            segmentCmd += ["--tumor-align-file", tumorBam]

        segmentLabel = preJoin(taskPrefix, "makeLocusGraph_" + gseg.id)
        segmentTasks.add(self.addTask(segmentLabel, segmentCmd, dependencies=mkdirTask,
                                      memMb=self.params.estimateMemMb))

    mergeCmd = [ self.params.mantaGraphMergeBin, "--output-file", finalGraph ]
    for segmentGraph in segmentGraphs :
        mergeCmd += ["--graph-file", segmentGraph]

    mergeTask = self.addTask(preJoin(taskPrefix, "mergeLocusGraph"), mergeCmd,
                             dependencies=segmentTasks, memMb=self.params.mergeMemMb)

    # The sv candidate generators validate the graph as well, but running a dedicated
    # check process makes a graph validity failure much easier to identify:
    checkCmd = [ self.params.mantaGraphCheckBin, "--graph-file", finalGraph ]
    checkTask = self.addTask(preJoin(taskPrefix, "checkLocusGraph"), checkCmd,
                             dependencies=mergeTask, memMb=self.params.mergeMemMb)

    # note that no task is added to remove the temporary segment graph directory

    # the stats command relies on shell redirection, so it is built as a single string
    statsCmd = self.params.mantaGraphStatsBin + " --global" + " --graph-file " + finalGraph + " >| " + finalGraphStats
    self.addTask(preJoin(taskPrefix, "locusGraphStats"), statsCmd,
                 dependencies=mergeTask, memMb=self.params.mergeMemMb)

    return set([checkTask])
def depthFunc(self,taskPrefix,dependencies,bamFile,outFile) :
    """
    Estimate chromosome depth for a single bam file, one task per group of contigs

    The per-group outputs are written to a temporary directory next to outFile and
    concatenated into outFile by a final task. Returns a set containing that task.
    """

    def getChromosomeGroups(params) :
        """
        Iterate through chromosomes/contigs and group small contigs together.
        This functions as a generator yielding successive contig groups, each
        a list of (chromIndex, chromLabel) tuples.
        """
        minSize = 200000
        chromCount = len(params.chromSizes)
        assert(len(params.chromOrder) == chromCount)

        pending = []
        pendingSize = 0
        for chromIndex in range(chromCount) :
            chromLabel = params.chromOrder[chromIndex]
            if chromLabel in params.chromIsSkipped :
                continue

            chromSize = params.chromSizes[chromLabel]
            if pendingSize + chromSize > minSize :
                # current contig does not fit: emit the group collected so far and start a new one
                if len(pending) != 0 :
                    yield(pending)
                pending = [(chromIndex, chromLabel)]
                pendingSize = chromSize
            else :
                pending.append((chromIndex, chromLabel))
                pendingSize += chromSize

        if len(pending) != 0 :
            yield(pending)

    baseName = os.path.basename(outFile)
    scratchDir = os.path.join(outFile + ".tmpdir")

    mkdirCmd = getMkdirCmd() + [scratchDir]
    mkdirTask = self.addTask(preJoin(taskPrefix, "makeTmpDir"), mkdirCmd,
                             dependencies=dependencies, isForceLocal=True)

    groupFiles = []
    groupTasks = set()
    for chromGroup in getChromosomeGroups(self.params) :
        assert(len(chromGroup) > 0)

        (firstIndex, firstLabel) = chromGroup[0]
        groupId = getRobustChromId(firstIndex, firstLabel)
        if len(chromGroup) > 1 :
            (lastIndex, lastLabel) = chromGroup[-1]
            groupId += "_to_" + getRobustChromId(lastIndex, lastLabel)

        groupFile = os.path.join(scratchDir, baseName + "_" + groupId)
        groupFiles.append(groupFile)

        groupCmd = [ self.params.getChromDepthBin,
                     "--ref", self.params.referenceFasta,
                     "--align-file", bamFile,
                     "--output", groupFile ]
        for (_, groupChromLabel) in chromGroup :
            groupCmd += ["--chrom", groupChromLabel]

        groupTasks.add(self.addTask(preJoin(taskPrefix, "estimateChromDepth_" + groupId), groupCmd,
                                    dependencies=mkdirTask, memMb=self.params.estimateMemMb))

    assert(len(groupFiles) != 0)

    catCmd = [ self.params.catScript, "--output", outFile ] + groupFiles
    catTask = self.addTask(preJoin(taskPrefix, "catChromDepth"), catCmd,
                           dependencies=groupTasks, isForceLocal=True)

    return set([catTask])
def runLocusGraph(self,taskPrefix="",dependencies=None):
    """
    Create the full SV locus graph

    One graph task is added per genome segment, followed by a task merging the
    per-segment graph files into the final graph and a graph statistics task.

    Returns a set containing the merge task.
    """
    alignStatsFile = self.paths.getStatsPath()
    finalGraph = self.paths.getGraphPath()
    finalGraphStats = self.paths.getGraphStatsPath()

    graphBase = os.path.basename(finalGraph)
    segmentDir = os.path.join(self.params.workDir, graphBase + ".tmpdir")

    mkdirTask = self.addTask(preJoin(taskPrefix, "makeTmpDir"), "mkdir -p " + segmentDir,
                             dependencies=dependencies, isForceLocal=True)

    segmentGraphs = []
    segmentTasks = set()
    for gseg in getNextGenomeSegment(self.params) :
        segmentGraph = os.path.join(segmentDir, graphBase + "." + gseg.id + ".bin")
        segmentGraphs.append(segmentGraph)

        segmentCmd = [ self.params.mantaGraphBin,
                       "--output-file", segmentGraph,
                       "--align-stats", alignStatsFile,
                       "--region", gseg.bamRegion ]
        for normalBam in self.params.normalBamList :
            segmentCmd += ["--align-file", normalBam]
        for tumorBam in self.params.tumorBamList :
            segmentCmd += ["--tumor-align-file", tumorBam]

        segmentLabel = preJoin(taskPrefix, "makeLocusGraph_" + gseg.id)
        segmentTasks.add(self.addTask(segmentLabel, segmentCmd, dependencies=mkdirTask))

    mergeCmd = [ self.params.mantaGraphMergeBin, "--output-file", finalGraph ]
    for segmentGraph in segmentGraphs :
        mergeCmd += ["--graph-file", segmentGraph]

    mergeTask = self.addTask(preJoin(taskPrefix, "mergeLocusGraph"), mergeCmd, dependencies=segmentTasks)

    # note that no task is added to remove the temporary segment graph directory

    # the stats command relies on shell redirection, so it is built as a single string
    statsCmd = self.params.mantaGraphStatsBin + " --global" + " --graph-file " + finalGraph + " >| " + finalGraphStats
    self.addTask(preJoin(taskPrefix, "locusGraphStats"), statsCmd, dependencies=mergeTask)

    return set([mergeTask])
def extractSmall(inPath, outPath) :
    """
    Add a task extracting small indels from inPath to outPath

    The size limit is one less than self.params.minScoredVariantSize; nothing is
    added when that limit is below 1. self, taskPrefix, candSortTask and
    nextStepWait are taken from the enclosing scope.
    """
    sizeLimit = int(self.params.minScoredVariantSize) - 1
    if sizeLimit >= 1 :
        extractCmd = getExtractSmallCmd(sizeLimit, inPath, outPath)
        extractLabel = preJoin(taskPrefix, "extractSmallIndels")
        extractTask = self.addTask(extractLabel, extractCmd,
                                   dependencies=candSortTask, isForceLocal=True)
        nextStepWait.add(extractTask)
def runStats(self,taskPrefix="",dependencies=None) :
    """
    Generate alignment statistics for each input bam and merge them into one stats file

    One stats task is added per bam (normal bams first, then tumor bams), each writing
    into a temporary directory next to the final stats path. A merge task combines the
    per-bam files; the temporary directory is removed afterwards unless
    self.params.isRetainTempFiles is set. A summary task is added after the merge but
    is not part of the returned set.

    Returns a set containing the merge task.
    """
    statsPath=self.paths.getStatsPath()
    statsFilename=os.path.basename(statsPath)
    tmpStatsDir=statsPath+".tmpdir"

    makeTmpStatsDirCmd = getMkdirCmd() + [tmpStatsDir]
    dirTask=self.addTask(preJoin(taskPrefix,"makeTmpDir"), makeTmpStatsDirCmd, dependencies=dependencies, isForceLocal=True)

    tmpStatsFiles = []
    statsTasks = set()
    for (bamIndex,bamPath) in enumerate(self.params.normalBamList + self.params.tumorBamList) :
        indexStr = str(bamIndex).zfill(3)
        tmpStatsFiles.append(os.path.join(tmpStatsDir,statsFilename+"."+ indexStr +".xml"))

        cmd = [ self.params.mantaStatsBin ]
        cmd.extend(["--output-file",tmpStatsFiles[-1]])
        cmd.extend(["--align-file",bamPath])
        statsTasks.add(self.addTask(preJoin(taskPrefix,"generateStats_"+indexStr),cmd,dependencies=dirTask))

    cmd = [ self.params.mantaMergeStatsBin ]
    cmd.extend(["--output-file",statsPath])
    for tmpStatsFile in tmpStatsFiles :
        cmd.extend(["--align-stats-file",tmpStatsFile])

    mergeTask = self.addTask(preJoin(taskPrefix,"mergeStats"),cmd,dependencies=statsTasks,isForceLocal=True)

    nextStepWait = set()
    nextStepWait.add(mergeTask)

    if not self.params.isRetainTempFiles :
        rmStatsTmpCmd = getRmdirCmd() + [tmpStatsDir]
        self.addTask(preJoin(taskPrefix,"rmTmpDir"),rmStatsTmpCmd,dependencies=mergeTask, isForceLocal=True)

    # summarize stats in format that's easier for human review
    cmd = [self.params.mantaStatsSummaryBin]
    # the option token must not carry a trailing space: if the list is executed
    # without a shell, the space would become part of the option name
    cmd.extend(["--align-stats", statsPath])
    cmd.extend(["--output-file", self.paths.getStatsSummaryPath()])
    self.addTask(preJoin(taskPrefix,"summarizeStats"),cmd,dependencies=mergeTask)

    return nextStepWait
def sortVcfs(pathList, outPath, label, isDiploid=False) :
    """
    Add the header-fix, sort and tabix tasks for the vcf files in pathList

    The header of the first vcf in pathList is rewritten in place before sorting.
    The tabix task is added to nextStepWait. Returns the sort task, or an empty
    set when pathList is empty. self, taskPrefix, hygenTasks and nextStepWait
    are taken from the enclosing scope.
    """
    if len(pathList) == 0 :
        return set()

    # make header modifications to first vcf in list of files to be sorted:
    firstVcf = pathList[0]
    reheaderTmp = firstVcf + ".reheader.tmp"
    headerFixLabel = preJoin(taskPrefix, "fixVcfHeader_" + label)

    headerFixPieces = [
        '"%s" -E "%s"' % (sys.executable, self.params.vcfCmdlineSwapper),
        ' "' + " ".join(self.params.configCommandLine) + '"',
        ' < "%s" > "%s"' % (firstVcf, reheaderTmp),
        " && " + " ".join(getMvCmd()),
        ' "%s" "%s"' % (reheaderTmp, firstVcf),
    ]
    self.addTask(headerFixLabel, "".join(headerFixPieces), dependencies=hygenTasks, isForceLocal=True)

    sortCmd = getVcfSortCmd(pathList, outPath, isDiploid)
    sortTask = self.addTask(preJoin(taskPrefix, "sort_" + label), sortCmd, dependencies=headerFixLabel)

    tabixTask = self.addTask(preJoin(taskPrefix, "tabix_" + label), getVcfTabixCmd(outPath),
                             dependencies=sortTask, isForceLocal=True)
    nextStepWait.add(tabixTask)

    return sortTask
def catRealignedBam(label, segmentList):
    """
    Add a task concatenating the realigned segment bams in segmentList into the
    realigned bam path for label.

    The new task is added to finishTasks. self, taskPrefix, finishTasks and
    completeSegmentsTask are taken from the enclosing scope.
    """
    mergedBam = self.paths.getRealignedBamPath(label)
    catCmd = bamListCatCmd(self.params.samtoolsBin, segmentList, mergedBam)
    catTask = self.addTask(preJoin(taskPrefix, "realignedBamCat_" + label), catCmd,
                           dependencies=completeSegmentsTask)
    finishTasks.add(catTask)
def _runDepthShared(self,taskPrefix,dependencies, depthFunc) :
    """
    Estimate chrom depth using the specified depthFunc to compute per-sample depth

    The normal bams are used when any are present, otherwise the tumor bams. An empty
    set is returned when neither list has entries; otherwise a set containing the task
    which merges the per-sample depth files is returned.
    """
    if len(self.params.normalBamList) > 0 :
        sampleBams = self.params.normalBamList
    elif len(self.params.tumorBamList) > 0 :
        sampleBams = self.params.tumorBamList
    else :
        return set()

    depthPath = self.paths.getChromDepth()
    depthBase = os.path.basename(depthPath)
    scratchDir = depthPath + ".tmpdir"

    mkdirTask = self.addTask(preJoin(taskPrefix, "makeTmpDir"), "mkdir -p " + scratchDir,
                             dependencies=dependencies, isForceLocal=True)

    sampleFiles = []
    sampleTasks = set()
    for (sampleNum, bamFile) in enumerate(sampleBams) :
        sampleTag = str(sampleNum).zfill(3)
        sampleFile = os.path.join(scratchDir, depthBase + "." + sampleTag + ".txt")
        sampleFiles.append(sampleFile)
        sampleTasks |= setzer(depthFunc(self, taskPrefix + "_sample" + sampleTag, mkdirTask, bamFile, sampleFile))

    mergeCmd = [ self.params.mergeChromDepth, "--out", depthPath ]
    for sampleFile in sampleFiles :
        mergeCmd += ["--in", sampleFile]

    mergeTask = self.addTask(preJoin(taskPrefix, "mergeChromDepth"), mergeCmd,
                             dependencies=sampleTasks, isForceLocal=True)

    # the scratch directory is always removed after the merge in this version
    self.addTask(preJoin(taskPrefix, "rmTmpDir"), "rm -rf " + scratchDir,
                 dependencies=mergeTask, isForceLocal=True)

    return set([mergeTask])
def sortRealignBam(label, sortList) :
    """
    Add a task sorting the realigned segment bam for label

    The sorted bam path is appended to sortList and the sort task label is added
    to nextStepWait. The unsorted bam is deleted by the same command once the sort
    succeeds. self, gid, taskPrefix, callTask and nextStepWait are taken from the
    enclosing scope.
    """
    unsortedBam = self.paths.getTmpUnsortRealignBamPath(gid, label)
    sortedBam = self.paths.getTmpRealignBamPath(gid, label)
    sortList.append(sortedBam)

    # the sort command is given the output name with its last 4 characters
    # (the ".bam" suffix) removed
    sortedPrefix = sortedBam[:-4]

    sortArgs = (self.params.samtoolsBin, unsortedBam, sortedPrefix, unsortedBam)
    sortCmd = '"%s" sort "%s" "%s" && rm -f "%s"' % sortArgs

    sortLabel = preJoin(taskPrefix, "sortRealignedSegment_" + label + "_" + gid)
    self.addTask(sortLabel, sortCmd, dependencies=callTask, memMb=self.params.callMemMb)
    nextStepWait.add(sortLabel)
def estimateParametersFromErrorCounts(self, sampleIndex, taskPrefix="", dependencies=None) :
    """
    Estimate variant error parameters from sequencing error count data

    Adds a single task reading the error counts file of sampleIndex and writing the
    indel error model file of the same sample. Returns the value returned by addTask.
    """
    estimateLabel = preJoin(taskPrefix, "estimateVariantErrorRates")
    estimateCmd = [
        self.params.estimateVariantErrorRatesBin,
        "--counts-file", self.paths.getErrorCountsOutputPath(sampleIndex),
        "--theta-file", self.params.thetaParamFile,
        "--output-file", self.paths.getIndelErrorModelPath(sampleIndex),
        "--fallback-file", self.params.indelErrorRateDefault,
    ]
    return self.addTask(estimateLabel, estimateCmd, dependencies=dependencies, isForceLocal=True)
def summarizeStats(self, taskPrefix="", dependencies=None) :
    """
    Add a task summarizing the alignment stats file in a format that's easier
    for human review

    The summary is written to self.paths.getStatsSummaryPath().
    Returns a set containing the summary task.
    """
    statsPath=self.paths.getStatsPath()

    summaryTasks = set()

    cmd = [self.params.mantaStatsSummaryBin]
    # the option token must not carry a trailing space: if the list is executed
    # without a shell, the space would become part of the option name
    cmd.extend(["--align-stats", statsPath])
    cmd.extend(["--output-file", self.paths.getStatsSummaryPath()])
    summarizeTask = self.addTask(preJoin(taskPrefix,"summarizeStats"),cmd,dependencies=dependencies, isForceLocal=True)
    summaryTasks.add(summarizeTask)

    return summaryTasks
def finishVcf(tmpList, output, label) :
    """
    Add a task producing the final tabix-indexed vcf from the segment files in tmpList

    Multiple segment files are concatenated into output; a single segment file is
    moved to output instead. The task is added to finishTasks. self, taskPrefix,
    finishTasks and completeSegmentsTask are taken from the enclosing scope.
    """
    assert(len(tmpList) > 0)

    if len(tmpList) > 1 :
        finalizeCmd = " ".join([self.params.bgcatBin, "-o", output] + list(tmpList))
    else :
        finalizeCmd = "mv -f %s %s" % (tmpList[0], output)

    # index the finished file in the same shell command
    finalizeCmd = finalizeCmd + " && %s -p vcf %s" % (self.params.tabixBin, output)

    finalizeTask = self.addTask(preJoin(taskPrefix, label + "_finalizeVCF"), finalizeCmd,
                                dependencies=completeSegmentsTask)
    finishTasks.add(finalizeTask)
def callGenome(self,taskPrefix="",dependencies=None):
    """
    run variant caller on all genome segments

    A task is added per genome segment, followed by a checkpoint task depending on all
    of them, a task finalizing the gVCF output, and a task removing the temporary
    segment directory.

    Returns the set of finalize tasks.
    """
    segmentDir = self.paths.getTmpSegmentDir()
    mkdirTask = self.addTask(preJoin(taskPrefix, "makeTmpDir"), "mkdir -p " + segmentDir,
                             dependencies=dependencies, isForceLocal=True)

    segmentTasks = set()
    segFiles = TempSegmentFiles()
    for gseg in getNextGenomeSegment(self.params) :
        segmentTasks |= callGenomeSegment(self, gseg, segFiles, dependencies=mkdirTask)

    # create a checkpoint for all segments:
    completeSegmentsTask = self.addTask(preJoin(taskPrefix, "completedAllGenomeSegments"),
                                        dependencies=segmentTasks)

    finishTasks = set()

    def finishVcf(tmpList, output, label) :
        """
        Concatenate (or move, for a single file) the segment vcfs into output and index it
        """
        assert(len(tmpList) > 0)

        if len(tmpList) > 1 :
            finalizeCmd = " ".join([self.params.bgcatBin, "-o", output] + list(tmpList))
        else :
            finalizeCmd = "mv -f %s %s" % (tmpList[0], output)

        finalizeCmd = finalizeCmd + " && %s -p vcf %s" % (self.params.tabixBin, output)

        finalizeTask = self.addTask(preJoin(taskPrefix, label + "_finalizeVCF"), finalizeCmd,
                                    dependencies=completeSegmentsTask)
        finishTasks.add(finalizeTask)

    finishVcf(segFiles.gvcf, self.paths.getGvcfOutputPath(), "gVCF")

    # the temporary directory is removed once all finalize tasks are done
    self.addTask(preJoin(taskPrefix, "cleanTmpDir"), "rm -rf " + segmentDir,
                 dependencies=finishTasks, isForceLocal=True)

    return finishTasks
def countGenomeSegment(self, sampleIndex, gseg, segFiles, taskPrefix="", dependencies=None):
    """
    Extract sequencing error count data from the genome segment specified by gseg.bamRegion

    The paths of this segment's counts file and non-empty site counts file are appended
    to segFiles.counts and segFiles.nonEmptySiteCounts respectively.

    Returns the label of the added task.
    """
    segmentId = gseg.id

    countCmd = [
        self.params.getCountsBin,
        "--region", gseg.bamRegion,
        "--ref", self.params.referenceFasta,
        "-genome-size", str(self.params.totalKnownReferenceSize),
        "-max-indel-size", "50",
    ]

    countsFile = self.paths.getTmpSegmentErrorCountsPath(sampleIndex, segmentId)
    segFiles.counts.append(countsFile)
    countCmd += ["--counts-file", segFiles.counts[-1]]

    siteCountsFile = self.paths.getTmpSegmentNonemptySiteCountsPath(sampleIndex, segmentId)
    segFiles.nonEmptySiteCounts.append(siteCountsFile)
    countCmd += ["--nonempty-site-count-file", segFiles.nonEmptySiteCounts[-1]]

    countCmd += ["--align-file", self.params.bamList[sampleIndex]]

    if self.params.isHighDepthFilter:
        countCmd += ["--chrom-depth-file", self.paths.getChromDepth()]

    # optional multi-valued inputs: each value is passed with its own copy of the flag,
    # a list set to None is skipped
    optionalInputs = (
        (self.params.indelCandidatesList, '--candidate-indel-input-vcf'),
        (self.params.forcedGTList, '--force-output-vcf'),
    )
    for (values, flag) in optionalInputs:
        if values is None:
            continue
        for value in values:
            countCmd += [flag, value]

    countLabel = preJoin(taskPrefix, "countErrors_" + gseg.id)
    self.addTask(countLabel, countCmd, dependencies=dependencies, memMb=self.params.callMemMb)

    return countLabel
def mergeSequenceErrorCounts(self, taskPrefix, dependencies, runStatsLogPaths):
    """
    Add a task merging the counts files listed in runStatsLogPaths into the
    error counts output path.

    Returns the value returned by addTask.
    """
    mergeCmd = [self.params.mergeCountsBin]
    for countsFile in runStatsLogPaths:
        mergeCmd += ["--counts-file", countsFile]
    mergeCmd += ["--output-file", self.paths.getErrorCountsOutputPath()]

    return self.addTask(preJoin(taskPrefix, "mergeCounts"), mergeCmd,
                        dependencies=dependencies, isForceLocal=True)
def catRealignedBam(sampleIndex):
    """
    Add a task concatenating the realigned segment bams of one sample into that
    sample's realigned bam path.

    The new task is added to finishTasks. self, segFiles, taskPrefix, finishTasks
    and completeSegmentsTask are taken from the enclosing scope.
    """
    sampleSegments = segFiles.sample[sampleIndex].bamRealign
    mergedBam = self.paths.getRealignedBamPath(sampleIndex)
    catCmd = bamListCatCmd(self.params.samtoolsBin, sampleSegments, mergedBam)

    catLabel = preJoin(taskPrefix, "realignedBamCat_" + self.paths.sampleLabel(sampleIndex))
    catTask = self.addTask(catLabel, catCmd, dependencies=completeSegmentsTask)
    finishTasks.add(catTask)
def mergeSupportBams(self, mergeBamTasks, taskPrefix="", isNormal=True, bamIdx=0, dependencies=None):
    """
    Add a merge task followed by an index task for the evidence bam of each sample

    The normal bam list is processed when isNormal is true, the tumor bam list otherwise.
    Every task returned by addTask is added to mergeBamTasks. bamIdx is the index used
    for the first sample; the index following the last processed sample is returned.
    """
    if isNormal:
        samplePaths = self.params.normalBamList
    else:
        samplePaths = self.params.tumorBamList

    for samplePath in samplePaths:
        finalBam = self.paths.getFinalSupportBamPath(samplePath, bamIdx)

        # merge support bams
        mergeArgs = [
            sys.executable,
            self.params.mantaMergeBam,
            self.params.samtoolsBin,
            self.paths.getSortedSupportBamMask(bamIdx),
            finalBam,
            self.paths.getSupportBamListPath(bamIdx),
        ]
        mergeLabel = preJoin(taskPrefix, "merge_evidenceBam_%s" % (bamIdx))
        mergeTask = self.addTask(mergeLabel, mergeArgs, dependencies=dependencies)
        mergeBamTasks.add(mergeTask)

        # index the filtered bam
        ### TODO still needs to handle the case where the final support bam does not exist
        indexArgs = [self.params.samtoolsBin, "index", finalBam]
        indexLabel = preJoin(taskPrefix, "index_evidenceBam_%s" % (bamIdx))
        indexTask = self.addTask(indexLabel, indexArgs, dependencies=mergeTask)
        mergeBamTasks.add(indexTask)

        bamIdx += 1

    return bamIdx
def extractSmall(inPath, outPath):
    """
    Add a task extracting small indels from inPath to outPath, followed by a
    tabix task on outPath.

    The size limit is one less than self.params.minScoredVariantSize; nothing is
    added when that limit is below 1. Only the tabix task is added to nextStepWait.
    self, taskPrefix, candSortTask and nextStepWait are taken from the enclosing scope.
    """
    sizeLimit = int(self.params.minScoredVariantSize) - 1
    if sizeLimit >= 1:
        extractCmd = getExtractSmallCmd(sizeLimit, inPath, outPath)
        extractTask = self.addTask(preJoin(taskPrefix, "extractSmallIndels"), extractCmd,
                                   dependencies=candSortTask, isForceLocal=True)

        # the tabix label is derived from the value returned for the extraction task
        tabixTask = self.addTask(extractTask + "_tabix", getVcfTabixCmd(outPath),
                                 dependencies=extractTask, isForceLocal=True)
        nextStepWait.add(tabixTask)
def sortVcfs(pathList, outPath, label, isDiploid=False, isCandidate=False):
    """
    Add the header-fix, input-list, sort and tabix tasks for the vcf files in pathList

    The header of the first vcf in pathList is rewritten in place, the vcf paths are
    written to a list file by a sub-workflow task, and the sort command reads that
    list file. The tabix task is added to nextStepWait. Returns the sort task, or an
    empty set when pathList is empty. self, taskPrefix, dependencies and nextStepWait
    are taken from the enclosing scope.
    """
    if len(pathList) == 0:
        return set()

    # make header modifications to first vcf in list of files to be sorted:
    firstVcf = pathList[0]
    reheaderTmp = firstVcf + ".reheader.tmp"
    headerFixLabel = preJoin(taskPrefix, "fixVcfHeader_" + label)

    headerFixPieces = [
        '"%s" "%s"' % (sys.executable, self.params.vcfCmdlineSwapper),
        ' "' + " ".join(self.params.configCommandLine) + '"',
        ' < "%s" > "%s"' % (firstVcf, reheaderTmp),
        " && " + " ".join(getMvCmd()),
        ' "%s" "%s"' % (reheaderTmp, firstVcf),
    ]
    self.addTask(headerFixLabel, "".join(headerFixPieces),
                 dependencies=dependencies, isForceLocal=True)

    listFile = self.paths.getVcfListPath(label)
    listTask = self.addWorkflowTask(preJoin(taskPrefix, label + "InputList"),
                                    listFileWorkflow(listFile, pathList),
                                    dependencies=headerFixLabel)

    sortCmd = getVcfSortCmd(listFile, outPath, isDiploid, isCandidate)
    sortTask = self.addTask(preJoin(taskPrefix, "sort_" + label), sortCmd, dependencies=listTask)

    tabixTask = self.addTask(preJoin(taskPrefix, "tabix_" + label), getVcfTabixCmd(outPath),
                             dependencies=sortTask, isForceLocal=True)
    nextStepWait.add(tabixTask)

    return sortTask
def runStats(self,taskPrefix="",dependencies=None) :
    """
    Add one task generating alignment statistics from all normal and tumor bams,
    plus a task writing a summary of those statistics.

    Returns a set containing only the stats generation task.
    """
    statsPath = self.paths.getStatsPath()

    statsCmd = [ self.params.mantaStatsBin, "--output-file", statsPath ]
    for normalBam in self.params.normalBamList :
        statsCmd += ["--align-file", normalBam]
    for tumorBam in self.params.tumorBamList :
        statsCmd += ["--tumor-align-file", tumorBam]

    statsTask = self.addTask(preJoin(taskPrefix, "generateStats"), statsCmd, dependencies=dependencies)

    # summarize stats for humans, no need for follow-up tasks to wait for this:
    summaryCmd = self.params.mantaStatsSummaryBin + " --align-stats " + statsPath + " > " + self.paths.getStatsSummaryPath()
    self.addTask(preJoin(taskPrefix, "summarizeStats"), summaryCmd, dependencies=statsTask)

    return set([statsTask])
def depthFunc(self,taskPrefix,dependencies,bamFile,outFile) :
    """
    Estimate chromosome depth for a single bam file, one task per chromosome

    The per-chromosome outputs are written to a temporary directory next to outFile
    and concatenated into outFile by a final task. Returns a set containing that task.
    """
    baseName = os.path.basename(outFile)
    scratchDir = os.path.join(outFile + ".tmpdir")

    mkdirTask = self.addTask(preJoin(taskPrefix, "makeTmpDir"), "mkdir -p " + scratchDir,
                             dependencies=dependencies, isForceLocal=True)

    chromFiles = []
    chromTasks = set()
    for (chromIndex, chromLabel) in enumerate(self.params.chromOrder) :
        chromId = getRobustChromId(chromIndex, chromLabel)
        chromFile = os.path.join(scratchDir, baseName + "_" + chromId)
        chromFiles.append(chromFile)

        chromCmd = [ self.params.mantaGetChromDepthBin,
                     "--align-file", bamFile,
                     "--chrom", chromLabel,
                     "--output", chromFile ]
        chromTasks.add(self.addTask(preJoin(taskPrefix, "estimateChromDepth_" + chromId), chromCmd,
                                    dependencies=mkdirTask))

    # concatenation is a shell command: every path is wrapped in single quotes
    quotedInputs = ["'%s'" % (chromFile) for chromFile in chromFiles]
    catCmd = "cat " + " ".join(quotedInputs) + " > '%s'" % (outFile)
    catTask = self.addTask(preJoin(taskPrefix, "catChromDepth"), catCmd,
                           dependencies=chromTasks, isForceLocal=True)

    return set([catTask])
def callGenome(self,taskPrefix="",dependencies=None):
    """
    run counter on all genome segments

    A task set is added per genome segment, followed by a checkpoint task depending on
    all of them, a task merging the segment counts and, when
    self.params.isReportObservedIndels is set, a task concatenating the observed indel
    bed files. The temporary segment directory is removed afterwards unless
    self.params.isRetainTempFiles is set.

    Raises Exception when no segment tasks were created.
    Returns the set of merge/concatenation tasks.
    """
    segmentDir = self.paths.getTmpSegmentDir()
    mkdirTask = self.addTask(preJoin(taskPrefix, "makeTmpDir"), getMkdirCmd() + [segmentDir],
                             dependencies=dependencies, isForceLocal=True)

    allSegmentTasks = set()
    segFiles = TempSegmentFiles()
    for gseg in getNextGenomeSegment(self.params) :
        allSegmentTasks |= callGenomeSegment(self, gseg, segFiles, dependencies=mkdirTask)

    if not allSegmentTasks :
        raise Exception("No genome regions to analyze. Possible target region parse error.")

    # create a checkpoint for all segments:
    completeSegmentsTask = self.addTask(preJoin(taskPrefix, "completedAllGenomeSegments"),
                                        dependencies=allSegmentTasks)

    finishTasks = set()

    # merge segment stats:
    mergeCountsTask = mergeSequenceAlleleCounts(self, taskPrefix, completeSegmentsTask, segFiles.counts)
    finishTasks.add(mergeCountsTask)

    if self.params.isReportObservedIndels :
        bedTask = self.concatIndexBed(taskPrefix, completeSegmentsTask, segFiles.observedIndelBed,
                                      self.paths.getObservedIndelBedPath(), "observedIndels")
        finishTasks.add(bedTask)

    if not self.params.isRetainTempFiles :
        self.addTask(preJoin(taskPrefix, "rmTmpDir"), getRmdirCmd() + [segmentDir],
                     dependencies=finishTasks, isForceLocal=True)

    return finishTasks
def summarizeStats(self, taskPrefix="", dependencies=None):
    """
    Add a task summarizing the alignment stats file in a format that's easier
    for human review

    The summary is written to self.paths.getStatsSummaryPath().
    Returns a set containing the summary task.
    """
    statsPath = self.paths.getStatsPath()

    summaryTasks = set()

    cmd = [self.params.mantaStatsSummaryBin]
    # the option token must not carry a trailing space: if the list is executed
    # without a shell, the space would become part of the option name
    cmd.extend(["--align-stats", statsPath])
    cmd.extend(["--output-file", self.paths.getStatsSummaryPath()])
    summarizeTask = self.addTask(preJoin(taskPrefix, "summarizeStats"), cmd,
                                 dependencies=dependencies, isForceLocal=True)
    summaryTasks.add(summarizeTask)

    return summaryTasks
def runStats(self,taskPrefix="",dependencies=None):
    """
    Add one task generating alignment statistics from all normal and tumor bams

    Returns a set containing the stats generation task.
    """
    statsCmd = [ self.params.mantaStatsBin, "--output-file", self.paths.getStatsPath() ]
    for normalBam in self.params.normalBamList :
        statsCmd += ["--align-file", normalBam]
    for tumorBam in self.params.tumorBamList :
        statsCmd += ["--tumor-align-file", tumorBam]

    statsTask = self.addTask(preJoin(taskPrefix, "generateStats"), statsCmd, dependencies=dependencies)
    return set([statsTask])
def sortRealignBam(sampleIndex) :
    """
    Sort each realigned bam output

    The sorted bam path is appended to the sample's bamRealign list in segFiles and
    the sort task label is added to nextStepWait. The unsorted bam is deleted by the
    same command once the sort succeeds. self, segFiles, genomeSegmentLabel,
    taskPrefix, segTaskLabel and nextStepWait are taken from the enclosing scope.
    """
    sampleSortList = segFiles.sample[sampleIndex].bamRealign

    unsortedBam = self.paths.getTmpUnsortRealignBamPath(genomeSegmentLabel, sampleIndex)
    sortedBam = self.paths.getTmpSortRealignBamPath(genomeSegmentLabel, sampleIndex)
    sampleSortList.append(sortedBam)

    sortArgs = (self.params.samtoolsBin, unsortedBam, sortedBam, unsortedBam)
    sortCmd = '"%s" sort "%s" -o "%s" && rm -f "%s"' % sortArgs

    sortLabel = preJoin(taskPrefix,
                        "sortRealignedSegment_" + genomeSegmentLabel + "_" + self.paths.sampleLabel(sampleIndex))
    self.addTask(sortLabel, sortCmd, dependencies=segTaskLabel, memMb=self.params.callMemMb)
    nextStepWait.add(sortLabel)
def mergeRunStats(self, taskPrefix, dependencies, runStatsLogPaths):
    """
    merge run stats:

    Adds a task combining the stats files in runStatsLogPaths into the run stats
    path and the run stats report path. Returns the value returned by addTask.
    """
    mergeCmd = [self.params.statsMergeBin]
    for statsFile in runStatsLogPaths:
        mergeCmd += ["--stats-file", statsFile]
    mergeCmd += ["--output-file", self.paths.getRunStatsPath()]
    mergeCmd += ["--report-file", self.paths.getRunStatsReportPath()]

    return self.addTask(preJoin(taskPrefix, "mergeRunStats"), mergeCmd,
                        dependencies=dependencies, isForceLocal=True)
def sortRealignBam(label, sortList):
    """
    Add a task sorting the realigned segment bam for label

    The sorted bam path is appended to sortList and the sort task label is added
    to nextStepWait. The unsorted bam is deleted by the same command once the sort
    succeeds. self, genomeSegmentLabel, taskPrefix, callTask and nextStepWait are
    taken from the enclosing scope.
    """
    unsortedBam = self.paths.getTmpUnsortRealignBamPath(genomeSegmentLabel, label)
    sortedBam = self.paths.getTmpSortRealignBamPath(genomeSegmentLabel, label)
    sortList.append(sortedBam)

    sortArgs = (self.params.samtoolsBin, unsortedBam, sortedBam, unsortedBam)
    sortCmd = '"%s" sort "%s" -o "%s" && rm -f "%s"' % sortArgs

    sortLabel = preJoin(taskPrefix, "sortRealignedSegment_" + genomeSegmentLabel + "_" + label)
    self.addTask(sortLabel, sortCmd, dependencies=callTask, memMb=self.params.callMemMb)
    nextStepWait.add(sortLabel)
def getSequenceErrorEstimatesForSample(self, estimationIntervals, sampleIndex, taskPrefix="", dependencies=None):
    """
    Count sequencing errors in one sample and use these to estimate sample error parameters

    Counting covers either all eligible evidence or continues only until a target is
    reached, depending on self.params.isErrorEstimationFromAllData. The segment counts
    are merged and the error parameters estimated from the merged counts.

    Returns a set containing the estimation task.
    """
    segFiles = TempSequenceAlleleCountsSegmentFiles()

    if self.params.isErrorEstimationFromAllData:
        # get error counts from full data set:
        countFunc = countAllEligibleSequenceEvidence
    else:
        # Launch tasks until the required counts are found
        countFunc = countSequenceEvidenceUntilTargetIsReached

    countTasks = set()
    countTasks |= countFunc(self, estimationIntervals, sampleIndex, segFiles, taskPrefix, dependencies)

    # create a checkpoint for all segments:
    checkpointTask = self.addTask(preJoin(taskPrefix, "completedAllGenomeSegments"),
                                  dependencies=countTasks)

    # merge segment stats:
    mergedCountsTask = mergeSequenceAlleleCounts(self, sampleIndex, segFiles.counts,
                                                 taskPrefix=taskPrefix,
                                                 dependencies=checkpointTask)

    # get error parameters:
    estimationTask = estimateParametersFromAlleleCounts(self, sampleIndex,
                                                        taskPrefix=taskPrefix,
                                                        dependencies=mergedCountsTask)

    return set([estimationTask])
def callGenomeSegment(self, gseg, segFiles, taskPrefix="", dependencies=None) :
    """
    Add the task running self.params.snoiseBin on one genome segment

    The command output is piped through self.params.bgzip9Bin into a temporary
    per-segment gVCF file, whose path is appended to segFiles.gvcf. Every segment
    after the first is run with --skip-vcf-header.

    Returns a set containing the label of the added task.
    """
    isFirstSegment = (len(segFiles.gvcf) == 0)

    segStr = str(gseg.id)

    segCmd = [ self.params.snoiseBin ]

    segCmd.extend(["--region", gseg.chromLabel + ":" + str(gseg.beginPos) + "-" + str(gseg.endPos)])
    # every item must be a string, the list is joined into a shell command below
    segCmd.extend(["-min-mapping-quality", str(self.params.minMapq)])
    segCmd.extend(["--ref", self.params.referenceFasta ])
    segCmd.extend(["-max-window-mismatch", "2", "20" ])
    segCmd.extend(["-genome-size", str(self.params.totalKnownReferenceSize)] )
    segCmd.extend(["-max-indel-size", "50"] )

    segCmd.extend(['-min-qscore','17'])
    segCmd.extend(['-bsnp-ssd-no-mismatch', '0.35'])
    segCmd.extend(['-bsnp-ssd-one-mismatch', '0.6'])
    segCmd.extend(['-min-vexp', '0.25'])

    for bamPath in self.params.bamList :
        segCmd.extend(["--align-file",bamPath])

    if not isFirstSegment :
        segCmd.append("--skip-vcf-header")

    if self.params.indelCandidates is not None :
        segCmd.extend(['--candidate-indel-input-vcf', self.params.indelCandidates])

    # vcf is written to stdout so we need shell features:
    segCmd = " ".join(segCmd)

    segFiles.gvcf.append(self.paths.getTmpSegmentGvcfPath(segStr))
    segCmd += " | %s -c >| %s" % (self.params.bgzip9Bin, segFiles.gvcf[-1])

    nextStepWait = set()

    # use the stringified segment id so the label is built the same way as the output path
    setTaskLabel=preJoin(taskPrefix,"callGenomeSegment_"+segStr)
    self.addTask(setTaskLabel,segCmd,dependencies=dependencies,memMb=self.params.callMemMb)
    nextStepWait.add(setTaskLabel)

    return nextStepWait
def mergeSequenceAlleleCounts(self, sampleIndex, segmentAlleleCountsFiles, taskPrefix="", dependencies=None):
    """
    Given sequencing error counts generated from multiple genome regions, merge these
    into a single error count set

    The merged counts are written to the allele counts output path of sampleIndex.
    Returns the value returned by addTask.
    """
    mergeCmd = [self.params.mergeCountsBin,
                "--output-file", self.paths.getAlleleCountsOutputPath(sampleIndex)]
    for countsFile in segmentAlleleCountsFiles:
        mergeCmd += ["--counts-file", countsFile]

    return self.addTask(preJoin(taskPrefix, "mergeCounts"), mergeCmd,
                        dependencies=dependencies, isForceLocal=True)
def compressRawVcf(rawVcfFilename, label) :
    """
    Process each raw vcf file with header modifications and bgzip compression

    The header modification step is only part of the pipeline for the first segment.
    The compression task label is added to nextStepWait and the path of the compressed
    file (rawVcfFilename + ".gz") is returned. self, isFirstSegment, taskPrefix,
    genomeSegmentLabel, segTaskLabel and nextStepWait are taken from the enclosing scope.
    """
    gzPath = rawVcfFilename + ".gz"

    # shell pipeline stages, joined with " | " below
    stages = ["cat " + quote(rawVcfFilename)]

    if isFirstSegment :
        headerFixStage = '"%s" -E "%s"' % (sys.executable, self.params.vcfCmdlineSwapper)
        headerFixStage += ' "' + " ".join(self.params.configCommandLine) + '"'
        stages.append(headerFixStage)

    stages.append('"%s" -c >| "%s"' % (self.params.bgzip9Bin, gzPath))
    pipelineCmd = " | ".join(stages)

    compressLabel = preJoin(taskPrefix, "compressGenomeSegment_" + genomeSegmentLabel + "_" + label)
    self.addTask(compressLabel, pipelineCmd, dependencies=segTaskLabel, memMb=self.params.callMemMb)
    nextStepWait.add(compressLabel)

    return gzPath
def runDepth(self,taskPrefix="",dependencies=None) :
    """
    estimate chrom depth

    The first normal bam is used when any normal bams are present, otherwise the first
    tumor bam. Returns an empty set when neither list has entries, otherwise a set
    containing the depth estimation task.
    """
    if len(self.params.normalBamList) > 0 :
        depthBam = self.params.normalBamList[0]
    elif len(self.params.tumorBamList) > 0 :
        depthBam = self.params.tumorBamList[0]
    else :
        return set()

    # output is captured through shell redirection, so the command is a single string
    depthCmd = "%s -E %s --bam '%s' > %s" % (sys.executable, self.params.getChromDepth,
                                              depthBam, self.paths.getChromDepth())

    depthTask = self.addTask(preJoin(taskPrefix, "estimateChromDepth"), depthCmd, dependencies=dependencies)
    return set([depthTask])
def sortEvidenceBams(self, sortBamTasks, taskPrefix="", binStr="", dependencies=None):
    """
    Add one evidence bam sort task per input bam (normal bams followed by tumor bams)

    Bams are identified by their position in the combined list. Every task returned
    by addTask is added to sortBamTasks.
    """
    bamCount = len(self.params.normalBamList + self.params.tumorBamList)

    for bamIdx in range(bamCount):
        unsortedBam = self.paths.getSupportBamPath(bamIdx, binStr)
        sortedBam = self.paths.getSortedSupportBamPath(bamIdx, binStr)

        # first check the existence of the supporting bam
        # then sort the bam only if it exists
        sortArgs = [
            sys.executable,
            self.params.mantaSortBam,
            self.params.samtoolsBin,
            unsortedBam,
            sortedBam,
        ]

        sortLabel = preJoin(taskPrefix, "sortEvidenceBam_%s_%s" % (binStr, bamIdx))
        sortTask = self.addTask(sortLabel, sortArgs, dependencies=dependencies)
        sortBamTasks.add(sortTask)
def sortBams(self, sortBamTasks, taskPrefix="", binStr="", isNormal=True, bamIdx=0, dependencies=None):
    """
    Schedule one evidence-bam sort task per entry of the normal (isNormal
    true) or tumor (isNormal false) BAM list for the work bin binStr.

    Samples are numbered consecutively starting at bamIdx. The value
    returned by self.addTask for every scheduled task is added to
    sortBamTasks.

    Returns the index following the last sample handled, so a second call
    can continue the numbering.
    """
    bamList = self.params.normalBamList if isNormal else self.params.tumorBamList

    sampleIdx = bamIdx
    for _ in bamList:
        unsortedPath = self.paths.getSupportBamPath(sampleIdx, binStr)
        sortedPath = self.paths.getSortedSupportBamPath(sampleIdx, binStr)

        # NOTE(review): the original comment says the sort script first
        # checks that the supporting bam exists and only then sorts it;
        # that check is not visible here
        sortCmd = [
            sys.executable,
            self.params.mantaSortBam,
            self.params.samtoolsBin,
            unsortedPath,
            sortedPath,
        ]
        sortLabel = preJoin(taskPrefix, "sortEvidenceBam_%s_%s" % (binStr, sampleIdx))
        sortTask = self.addTask(sortLabel, sortCmd, dependencies=dependencies)
        sortBamTasks.add(sortTask)
        sampleIdx += 1

    return sampleIdx
def launchNextTask():
    """
    Launch the next task in queue for this sample.

    The index of the next estimation interval is the number of tasks
    already recorded in allTasks.

    Returns False when there are no more jobs to launch, True once a new
    counting task and its tracking workflow task have been scheduled.
    """
    nextIndex = len(allTasks)
    if not nextIndex < len(estimationIntervals):
        return False

    segment = estimationIntervals[nextIndex]
    segmentTask = countGenomeSegment(self, sampleIndex, segment, segFiles,
                                     taskPrefix=taskPrefix, dependencies=dependencies)

    allTasks.add(segmentTask)
    taskByIndex.append(segmentTask)

    # follow-up workflow task which depends on the counting task; it is
    # handed the most recently appended nonEmptySiteCounts entry
    trackLabel = preJoin(taskPrefix, "trackCounts_" + segment.id)
    trackWorkflow = UpdateCompletedTaskTrackerWorkflow(nextIndex,
                                                       segFiles.nonEmptySiteCounts[-1],
                                                       completedTaskTracker)
    self.addWorkflowTask(trackLabel, trackWorkflow, dependencies=segmentTask, isEphemeral=True)
    return True
def callGenomeSegment(self, gsegGroup, segFiles, taskPrefix="", dependencies=None):
    """
    Schedule the somatic caller for one group of genome segments, plus the
    follow-up tasks which fix the vcf header, compress the segment output
    and (optionally) sort the realigned BAMs.

    Args:
        gsegGroup: non-empty sequence of genome segments; every segment must
            provide `id` and `bamRegion`.
        segFiles: accumulator of per-segment output paths. This function
            appends to its `snv`, `indel` and `stats` lists and, depending
            on the configuration, to `callable`, `normalRealign` and
            `tumorRealign`.
        taskPrefix: prefix handed to preJoin for every task label.
        dependencies: dependencies of the caller task itself.

    Returns:
        set of task labels later steps have to wait for: the compression
        task and, when realigned BAMs are written, both sort tasks.
    """
    assert len(gsegGroup) != 0
    gid = gsegGroup[0].id
    if len(gsegGroup) > 1:
        gid += "_to_" + gsegGroup[-1].id

    # no snv path recorded yet means this is the first segment scheduled;
    # every later segment is called with "--strelka-skip-header"
    isFirstSegment = (len(segFiles.snv) == 0)

    segCmd = [self.params.strelkaSomaticBin]

    for gseg in gsegGroup:
        segCmd.extend(["--region", gseg.bamRegion])

    segCmd.append("-filter-unanchored")
    segCmd.extend(["-min-mapping-quality", str(self.params.minTier1Mapq)])
    segCmd.extend(["-min-qscore", "0"])
    segCmd.extend(["--ref", self.params.referenceFasta])
    segCmd.extend(["-max-window-mismatch", "3", "20"])
    segCmd.extend(["-genome-size", str(self.params.knownSize)])
    segCmd.extend(["-max-indel-size", "50"])
    segCmd.extend(["-indel-nonsite-match-prob", "0.5"])
    segCmd.extend(["--somatic-snv-rate", str(self.params.ssnvPrior)])
    segCmd.extend(["--shared-site-error-rate", str(self.params.ssnvNoise)])
    segCmd.extend(["--shared-site-error-strand-bias-fraction", str(self.params.ssnvNoiseStrandBiasFrac)])
    segCmd.extend(["--somatic-indel-rate", str(self.params.sindelPrior)])
    segCmd.extend(["--shared-indel-error-factor", str(self.params.sindelNoiseFactor)])
    segCmd.extend(["--tier2-min-mapping-quality", str(self.params.minTier2Mapq)])
    segCmd.extend(["--tier2-mismatch-density-filter-count", "10"])
    segCmd.append("--tier2-no-filter-unanchored")
    segCmd.extend(["--tier2-indel-nonsite-match-prob", "0.25"])
    segCmd.append("--tier2-include-singleton")
    segCmd.append("--tier2-include-anomalous")

    segCmd.extend(["--strelka-snv-max-filtered-basecall-frac", str(self.params.snvMaxFilteredBasecallFrac)])
    segCmd.extend(["--strelka-snv-max-spanning-deletion-frac", str(self.params.snvMaxSpanningDeletionFrac)])
    segCmd.extend(["--strelka-snv-min-qss-ref", str(self.params.ssnvQuality_LowerBound)])

    segCmd.extend(["--strelka-indel-max-window-filtered-basecall-frac", str(self.params.indelMaxWindowFilteredBasecallFrac)])
    segCmd.extend(["--strelka-indel-min-qsi-ref", str(self.params.sindelQuality_LowerBound)])

    if self.params.indelErrorModelName is not None:
        segCmd.extend(['--indel-error-model-name', self.params.indelErrorModelName])
    if self.params.inputIndelErrorModelsFile is not None:
        segCmd.extend(['--indel-error-models-file', self.params.inputIndelErrorModelsFile])

    segCmd.extend(["--ssnv-contam-tolerance", str(self.params.ssnvContamTolerance)])
    segCmd.extend(["--indel-contam-tolerance", str(self.params.indelContamTolerance)])

    if self.params.isEVS:
        if self.params.somaticSnvScoringModelFile is not None:
            segCmd.extend(['--somatic-snv-scoring-model-file', self.params.somaticSnvScoringModelFile])
        if self.params.somaticIndelScoringModelFile is not None:
            segCmd.extend(['--somatic-indel-scoring-model-file', self.params.somaticIndelScoringModelFile])

    if self.params.isReportEVSFeatures:
        segCmd.append("--report-evs-features")

    for bamPath in self.params.normalBamList:
        segCmd.extend(["--normal-align-file", bamPath])
    for bamPath in self.params.tumorBamList:
        segCmd.extend(["--tumor-align-file", bamPath])

    # segFiles records the compressed (".gz") names, which only exist after
    # the compression task below has run
    tmpSnvPath = self.paths.getTmpSegmentSnvPath(gid)
    segFiles.snv.append(tmpSnvPath + ".gz")
    # the option name previously carried a trailing space
    segCmd.extend(["--somatic-snv-file", tmpSnvPath])

    tmpIndelPath = self.paths.getTmpSegmentIndelPath(gid)
    segFiles.indel.append(tmpIndelPath + ".gz")
    segCmd.extend(["--somatic-indel-file", tmpIndelPath])

    if self.params.isOutputCallableRegions:
        tmpCallablePath = self.paths.getTmpSegmentRegionPath(gid)
        segFiles.callable.append(tmpCallablePath + ".gz")
        segCmd.extend(["--somatic-callable-regions-file", tmpCallablePath])

    if self.params.isWriteRealignedBam:
        segCmd.extend(["-realigned-read-file", self.paths.getTmpUnsortRealignBamPath(gid, "normal")])
        segCmd.extend(["--tumor-realigned-read-file", self.paths.getTmpUnsortRealignBamPath(gid, "tumor")])

    def addListCmdOption(optList, arg):
        """Append `arg val` to the command for every val in optList (None is treated as empty)."""
        if optList is None:
            return
        for val in optList:
            segCmd.extend([arg, val])

    addListCmdOption(self.params.indelCandidatesList, '--candidate-indel-input-vcf')
    addListCmdOption(self.params.forcedGTList, '--force-output-vcf')
    addListCmdOption(self.params.noiseVcfList, '--noise-vcf')

    segFiles.stats.append(self.paths.getTmpRunStatsPath(gid))
    segCmd.extend(["--stats-file", segFiles.stats[-1]])

    if not isFirstSegment:
        segCmd.append("--strelka-skip-header")

    if self.params.isHighDepthFilter:
        segCmd.extend(["--strelka-chrom-depth-file", self.paths.getChromDepth()])
        # converted with str() like every other numeric parameter above
        segCmd.extend(["--strelka-max-depth-factor", str(self.params.depthFilterMultiple)])

    if self.params.extraVariantCallerArguments is not None:
        for arg in self.params.extraVariantCallerArguments.strip().split():
            segCmd.append(arg)

    nextStepWait = set()

    callTask = preJoin(taskPrefix, "callGenomeSegment_" + gid)
    self.addTask(callTask, segCmd, dependencies=dependencies, memMb=self.params.callMemMb)

    # fix vcf header to use parent pyflow cmdline instead of random segment command:
    compressWaitFor = callTask
    if isFirstSegment:
        headerFixTask = preJoin(taskPrefix, "fixVcfHeader_" + gid)

        def getHeaderFixCmd(fileName):
            """Rewrite fileName via a temporary file which is then moved over the original."""
            tmpName = fileName + ".reheader.tmp"
            cmd = "\"%s\" -E \"%s\"" % (sys.executable, self.params.vcfCmdlineSwapper)
            cmd += ' "' + " ".join(self.params.configCommandLine) + '"'
            cmd += " < \"%s\" > \"%s\" && mv \"%s\" \"%s\"" % (fileName, tmpName, tmpName, fileName)
            return cmd

        headerFixCmd = getHeaderFixCmd(tmpSnvPath)
        headerFixCmd += " && "
        headerFixCmd += getHeaderFixCmd(tmpIndelPath)

        self.addTask(headerFixTask, headerFixCmd, dependencies=callTask, isForceLocal=True)
        compressWaitFor = headerFixTask

    compressTask = preJoin(taskPrefix, "compressSegmentOutput_" + gid)
    compressCmd = "\"%s\" \"%s\" && \"%s\" \"%s\"" % (self.params.bgzipBin, tmpSnvPath, self.params.bgzipBin, tmpIndelPath)
    if self.params.isOutputCallableRegions:
        compressCmd += " && \"%s\" \"%s\"" % (self.params.bgzipBin, self.paths.getTmpSegmentRegionPath(gid))

    self.addTask(compressTask, compressCmd, dependencies=compressWaitFor, isForceLocal=True)
    nextStepWait.add(compressTask)

    if self.params.isWriteRealignedBam:
        def sortRealignBam(label, sortList):
            """Schedule the sort of one realigned BAM and record its sorted path in sortList."""
            unsortedBam = self.paths.getTmpUnsortRealignBamPath(gid, label)
            sortedBam = self.paths.getTmpRealignBamPath(gid, label)
            sortList.append(sortedBam)

            # the sort command is handed the output name without its ".bam" suffix
            sortedPrefix = sortedBam[:-4]
            sortCmd = "\"%s\" sort \"%s\" \"%s\" && rm -f \"%s\"" % (self.params.samtoolsBin, unsortedBam, sortedPrefix, unsortedBam)

            sortTaskLabel = preJoin(taskPrefix, "sortRealignedSegment_" + label + "_" + gid)
            self.addTask(sortTaskLabel, sortCmd, dependencies=callTask, memMb=self.params.callMemMb)
            nextStepWait.add(sortTaskLabel)

        sortRealignBam("normal", segFiles.normalRealign)
        sortRealignBam("tumor", segFiles.tumorRealign)

    return nextStepWait
def depthFunc(self, taskPrefix, dependencies, bamFile, outFile):
    """
    Estimate chromosome depth for one BAM file.

    The work is scattered over groups of chromosomes. Each group writes its
    result into a temporary directory named outFile + ".tmpdir", and a final
    force-local task concatenates the per-group results into outFile.

    Returns a set holding the value returned by self.addTask for the
    concatenation task.
    """
    resultName = os.path.basename(outFile)
    scratchDir = os.path.join(outFile + ".tmpdir")

    mkdirCmd = getMkdirCmd() + [scratchDir]
    mkdirTask = self.addTask(preJoin(taskPrefix, "makeTmpDir"), mkdirCmd,
                             dependencies=dependencies, isForceLocal=True)

    def iterChromGroups(params):
        """
        Generator over groups of (chromIndex, chromLabel) pairs, following
        params.chromOrder and omitting labels found in params.chromIsSkipped.

        Consecutive chromosomes share a group for as long as their summed
        size does not exceed sizeLimit.
        """
        sizeLimit = 200000

        chromCount = len(params.chromSizes)
        assert (len(params.chromOrder) == chromCount)

        pending = []
        pendingSize = 0
        for chromIndex in range(chromCount):
            chromLabel = params.chromOrder[chromIndex]
            if chromLabel in params.chromIsSkipped:
                continue

            chromSize = params.chromSizes[chromLabel]
            if pendingSize + chromSize > sizeLimit:
                # this chromosome does not fit: flush what was collected
                # so far and start a new group with it
                if pending:
                    yield pending
                pending = []
                pendingSize = 0
            pending.append((chromIndex, chromLabel))
            pendingSize += chromSize

        if pending:
            yield pending

    groupOutputs = []
    groupTasks = set()
    for chromGroup in iterChromGroups(self.params):
        assert (len(chromGroup) > 0)

        firstIndex, firstLabel = chromGroup[0]
        groupId = getRobustChromId(firstIndex, firstLabel)
        if len(chromGroup) > 1:
            lastIndex, lastLabel = chromGroup[-1]
            groupId += "_to_" + getRobustChromId(lastIndex, lastLabel)

        groupOutput = os.path.join(scratchDir, resultName + "_" + groupId)
        groupOutputs.append(groupOutput)

        depthCmd = [self.params.getChromDepthBin, "--align-file", bamFile, "--output", groupOutput]
        for _, chromLabel in chromGroup:
            depthCmd += ["--chrom", chromLabel]

        groupTask = self.addTask(preJoin(taskPrefix, "estimateChromDepth_" + groupId), depthCmd,
                                 dependencies=mkdirTask)
        groupTasks.add(groupTask)

    concatCmd = [self.params.catScript, "--output", outFile] + groupOutputs
    concatTask = self.addTask(preJoin(taskPrefix, "catChromDepth"), concatCmd,
                              dependencies=groupTasks, isForceLocal=True)

    return set([concatTask])
def callGenomeSegment(self, gsegGroup, segFiles, taskPrefix="", dependencies=None):
    """
    Schedule the germline caller for one group of genome segments, plus the
    follow-up tasks which compress every raw vcf it writes and (optionally)
    sort the realigned BAM.

    Args:
        gsegGroup: non-empty sequence of genome segments; every segment must
            provide `id` and `chromLabel`.
        segFiles: accumulator of per-segment output paths. This function
            appends to `stats`, `variants`, `sample[i].gvcf` and, when
            realigned BAMs are written, `bamRealign`.
        taskPrefix: prefix handed to preJoin for every task label.
        dependencies: dependencies of the caller task itself.

    Returns:
        set of task labels later steps have to wait for (all compression
        tasks plus the optional BAM sort task).
    """
    assert (len(gsegGroup) != 0)
    gid = gsegGroup[0].id
    if len(gsegGroup) > 1:
        gid += "_to_" + gsegGroup[-1].id

    # no variants path recorded yet means this is the first segment
    # scheduled; every later segment is called with "--gvcf-skip-header"
    isFirstSegment = (len(segFiles.variants) == 0)

    segCmd = [self.params.strelkaGermlineBin]

    self.appendCommonGenomeSegmentCommandOptions(gsegGroup, segCmd)

    segCmd.extend(["-min-mapping-quality", self.params.minMapq])
    segCmd.extend(["-max-window-mismatch", "2", "20"])

    segCmd.extend(["--gvcf-output-prefix", self.paths.getTmpSegmentGvcfPrefix(gid)])
    segCmd.extend(['--gvcf-min-gqx', '15'])
    segCmd.extend(['--gvcf-min-homref-gqx', '15'])
    segCmd.extend(['--gvcf-max-snv-strand-bias', '10'])
    segCmd.extend(['-min-qscore', '17'])
    segCmd.extend(['-bsnp-ssd-no-mismatch', '0.35'])
    segCmd.extend(['-bsnp-ssd-one-mismatch', '0.6'])
    segCmd.extend(['-min-vexp', '0.25'])
    segCmd.extend(['--enable-read-backed-phasing'])

    segFiles.stats.append(self.paths.getTmpRunStatsPath(gid))
    segCmd.extend(["--stats-file", segFiles.stats[-1]])

    if self.params.isRNA:
        segCmd.extend(['-bsnp-diploid-het-bias', '0.45'])
        segCmd.extend(['--use-rna-scoring'])
        segCmd.extend(['--retain-optimal-soft-clipping'])

    # Empirical Variant Scoring(EVS):
    if self.params.isEVS:
        if self.params.snvScoringModelFile is not None:
            segCmd.extend(['--snv-scoring-model-file', self.params.snvScoringModelFile])
        if self.params.indelScoringModelFile is not None:
            segCmd.extend(['--indel-scoring-model-file', self.params.indelScoringModelFile])

    for bamPath in self.params.bamList:
        segCmd.extend(["--align-file", bamPath])

    if not isFirstSegment:
        segCmd.append("--gvcf-skip-header")
    elif self.params.callContinuousVf:
        # truthiness instead of len(): callContinuousVf may be None (the
        # loop over gsegGroup below guards for that), and len(None) raises
        segCmd.extend(["--gvcf-include-header", "VF"])

    if self.params.isHighDepthFilter:
        segCmd.extend(["--chrom-depth-file", self.paths.getChromDepth()])

    # TODO STREL-125 come up with new solution for outbams
    if self.params.isWriteRealignedBam:
        segCmd.extend(["-realigned-read-file", self.paths.getTmpUnsortRealignBamPath(gid)])

    if self.params.noCompressBed is not None:
        segCmd.extend(['--nocompress-bed', self.params.noCompressBed])

    if self.params.ploidyFilename is not None:
        segCmd.extend(['--ploidy-region-vcf', self.params.ploidyFilename])

    for gseg in gsegGroup:
        # we have special logic to prevent the continuousVF targets from being grouped, the assertion here
        # verifies that this is working as expected:
        if self.params.callContinuousVf is not None and gseg.chromLabel in self.params.callContinuousVf:
            assert (len(gsegGroup) == 1)
            segCmd.append('--call-continuous-vf')

    if self.params.isEstimateSequenceError:
        # one error model file per input BAM
        for bamIndex in range(len(self.params.bamList)):
            segCmd.extend(['--indel-error-models-file', self.paths.getIndelErrorModelPath(bamIndex)])
    else:
        segCmd.extend(['--indel-error-models-file', self.params.indelErrorRateDefault])

    segCmd.extend(['--theta-file', self.params.thetaParamFile])

    segTaskLabel = preJoin(taskPrefix, "callGenomeSegment_" + gid)
    self.addTask(segTaskLabel, segCmd, dependencies=dependencies, memMb=self.params.callMemMb)

    # clean up and compress genome segment files:
    nextStepWait = set()

    def compressRawVcf(rawVcfFilename, label):
        """
        process each raw vcf file with header modifications and bgzip compression

        The header modification stage is only used for the first segment.
        Returns the compressed output path (rawVcfFilename + ".gz").
        """
        compressedVariantsPath = rawVcfFilename + ".gz"
        compressCmd = "cat " + quote(rawVcfFilename)

        if isFirstSegment:
            def getHeaderFixCmd():
                cmd = "\"%s\" -E \"%s\"" % (sys.executable, self.params.vcfCmdlineSwapper)
                cmd += ' "' + " ".join(self.params.configCommandLine) + '"'
                return cmd
            compressCmd += " | " + getHeaderFixCmd()

        compressCmd += " | \"%s\" -c >| \"%s\"" % (self.params.bgzip9Bin, compressedVariantsPath)

        compressTaskLabel = preJoin(taskPrefix, "compressGenomeSegment_" + gid + "_" + label)
        self.addTask(compressTaskLabel, compressCmd, dependencies=segTaskLabel, memMb=self.params.callMemMb)
        nextStepWait.add(compressTaskLabel)
        return compressedVariantsPath

    rawVariantsPath = self.paths.getTmpSegmentVariantsPath(gid)
    compressedVariantsPath = compressRawVcf(rawVariantsPath, "variants")
    segFiles.variants.append(compressedVariantsPath)

    sampleCount = len(self.params.bamList)
    for sampleIndex in range(sampleCount):
        rawVariantsPath = self.paths.getTmpSegmentGvcfPath(gid, sampleIndex)
        compressedVariantsPath = compressRawVcf(rawVariantsPath, gvcfSampleLabel(sampleIndex))
        segFiles.sample[sampleIndex].gvcf.append(compressedVariantsPath)

    if self.params.isWriteRealignedBam:
        def sortRealignBam(sortList):
            """Schedule the sort of the realigned BAM and record its sorted path in sortList."""
            unsortedBam = self.paths.getTmpUnsortRealignBamPath(gid)
            sortedBam = self.paths.getTmpRealignBamPath(gid)
            sortList.append(sortedBam)

            # the sort command is handed the output name without its ".bam" suffix
            sortedPrefix = sortedBam[:-4]
            sortCmd = "\"%s\" sort \"%s\" \"%s\" && rm -f \"%s\"" % (self.params.samtoolsBin, unsortedBam, sortedPrefix, unsortedBam)

            sortTaskLabel = preJoin(taskPrefix, "sortRealignedSegment_" + gid)
            self.addTask(sortTaskLabel, sortCmd, dependencies=segTaskLabel, memMb=self.params.callMemMb)
            nextStepWait.add(sortTaskLabel)

        sortRealignBam(segFiles.bamRealign)

    return nextStepWait
def finishBam(tmpList, output, label):
    """
    Schedule the task which builds `output` from the BAMs listed in tmpList
    (command supplied by bamListCatCmd), to run after completeSegmentsTask.

    The value returned by self.addTask is added to the enclosing finishTasks
    set. Nothing is returned.
    """
    catCmd = bamListCatCmd(self.params.samtoolsBin, tmpList, output)
    finalizeLabel = preJoin(taskPrefix, label + "_finalizeBAM")
    finalizeTask = self.addTask(finalizeLabel, catCmd, dependencies=completeSegmentsTask)
    finishTasks.add(finalizeTask)
def callGenome(self, taskPrefix="", dependencies=None):
    """
    run variant caller on all genome segments

    Schedules, in order: creation of the temporary segment directory, one
    caller run per genome segment group, a checkpoint task over all
    segments, the tasks which merge the per-segment outputs (variants vcf,
    per-sample gvcf, run stats and optionally the realigned BAM) and, unless
    temporary files are retained, removal of the temporary directory.

    Args:
        taskPrefix: prefix handed to preJoin for every task label, including
            the per-segment tasks.
        dependencies: dependencies of the first (mkdir) task.

    Returns:
        set of the merge/finalize tasks later steps have to wait for.

    Raises:
        Exception: when no segment task was scheduled at all.
    """
    tmpSegmentDir = self.paths.getTmpSegmentDir()
    dirTask = self.addTask(preJoin(taskPrefix, "makeTmpDir"), getMkdirCmd() + [tmpSegmentDir],
                           dependencies=dependencies, isForceLocal=True)

    segmentTasks = set()
    sampleCount = len(self.params.bamList)

    segFiles = TempVariantCallingSegmentFiles(sampleCount)

    for gsegGroup in self.getStrelkaGenomeSegmentGroupIterator(contigsExcludedFromGrouping=self.params.callContinuousVf):
        # taskPrefix is forwarded so that the per-segment task labels carry
        # the same prefix as every other task scheduled here
        segmentTasks |= callGenomeSegment(self, gsegGroup, segFiles,
                                          taskPrefix=taskPrefix, dependencies=dirTask)

    if len(segmentTasks) == 0:
        raise Exception("No genome regions to analyze. Possible target region parse error.")

    # create a checkpoint for all segments:
    completeSegmentsTask = self.addTask(preJoin(taskPrefix, "completedAllGenomeSegments"),
                                        dependencies=segmentTasks)

    finishTasks = set()

    # merge various VCF outputs
    finishTasks.add(self.concatIndexVcf(taskPrefix, completeSegmentsTask, segFiles.variants,
                                        self.paths.getVariantsOutputPath(), "variants"))

    for sampleIndex in range(sampleCount):
        concatTask = self.concatIndexVcf(taskPrefix, completeSegmentsTask, segFiles.sample[sampleIndex].gvcf,
                                         self.paths.getGvcfOutputPath(sampleIndex), gvcfSampleLabel(sampleIndex))
        finishTasks.add(concatTask)

        if sampleIndex == 0:
            # the first sample's gvcf (and its .tbi index) additionally gets
            # a symlink under the legacy filename, created from within the
            # output directory
            outputPath = self.paths.getGvcfOutputPath(sampleIndex)
            outputDirname = os.path.dirname(outputPath)
            outputBasename = os.path.basename(outputPath)

            def linkLegacy(extension):
                return "ln -s " + quote(outputBasename + extension) + " " + quote(self.paths.getGvcfLegacyFilename() + extension)

            linkCmd = linkLegacy("") + " && " + linkLegacy(".tbi")
            self.addTask(preJoin(taskPrefix, "addLegacyOutputLink"), linkCmd, dependencies=concatTask,
                         isForceLocal=True, cwd=outputDirname)

    # merge segment stats:
    finishTasks.add(self.mergeRunStats(taskPrefix, completeSegmentsTask, segFiles.stats))

    if self.params.isWriteRealignedBam:
        def finishBam(tmpList, output, label):
            """Schedule the task which builds `output` from the BAMs in tmpList."""
            cmd = bamListCatCmd(self.params.samtoolsBin, tmpList, output)
            finishTasks.add(self.addTask(preJoin(taskPrefix, label + "_finalizeBAM"), cmd,
                                         dependencies=completeSegmentsTask))

        finishBam(segFiles.bamRealign, self.paths.getRealignedBamPath(), "realigned")

    if not self.params.isRetainTempFiles:
        # cleanup waits for every finalize task but is itself not waited on
        rmTmpCmd = getRmdirCmd() + [tmpSegmentDir]
        self.addTask(preJoin(taskPrefix, "removeTmpDir"), rmTmpCmd, dependencies=finishTasks, isForceLocal=True)

    nextStepWait = finishTasks

    return nextStepWait
def callGenomeSegment(self, gseg, segFiles, taskPrefix="", dependencies=None):
    """
    Schedule the counting binary (self.params.getCountsBin) on a single
    genome segment and, when observed indels are reported, the task which
    compresses the resulting bed file.

    Args:
        gseg: genome segment providing `id`, `chromLabel`, `beginPos` and
            `endPos`.
        segFiles: accumulator of per-segment output paths. This function
            appends to `counts` and, when observed indels are reported, to
            `observedIndelBed`.
        taskPrefix: prefix handed to preJoin for every task label.
        dependencies: dependencies of the counting task itself.

    Returns:
        set of task labels later steps have to wait for.
    """
    # string form of the segment id, used for paths and task labels alike
    segStr = str(gseg.id)

    segCmd = [self.params.getCountsBin]

    segCmd.extend(["--region", gseg.chromLabel + ":" + str(gseg.beginPos) + "-" + str(gseg.endPos)])
    segCmd.extend(["--ref", self.params.referenceFasta])
    segCmd.extend(["-genome-size", str(self.params.knownSize)])
    segCmd.extend(["-max-indel-size", "50"])

    segFiles.counts.append(self.paths.getTmpSegmentCountsPath(segStr))
    segCmd.extend(["--counts-file", segFiles.counts[-1]])

    for bamPath in self.params.bamList:
        segCmd.extend(["--align-file", bamPath])

    if self.params.isHighDepthFilter:
        segCmd.extend(["--chrom-depth-file", self.paths.getChromDepth()])

    def addListCmdOption(optList, arg):
        """Append `arg val` to the command for every val in optList (None is treated as empty)."""
        if optList is None:
            return
        for val in optList:
            segCmd.extend([arg, val])

    addListCmdOption(self.params.indelCandidatesList, '--candidate-indel-input-vcf')
    addListCmdOption(self.params.forcedGTList, '--force-output-vcf')
    addListCmdOption(self.params.excludedRegions, "--excluded-regions-bed-file")

    if self.params.knownVariants is not None:
        segCmd.extend(["--known-variants-vcf-file", self.params.knownVariants])

    if self.params.isReportObservedIndels:
        # segFiles records the compressed (".gz") name, which only exists
        # after the compression task below has run
        tmpObservedIndelBedPath = self.paths.getTmpObservedIndelBedPath(segStr)
        segFiles.observedIndelBed.append(tmpObservedIndelBedPath + ".gz")
        segCmd.extend(['--observation-bed-file', tmpObservedIndelBedPath])

    if self.params.extraCountsArguments is not None:
        for arg in self.params.extraCountsArguments.strip().split():
            segCmd.append(arg)

    nextStepWait = set()

    # labels use segStr rather than the raw gseg.id so that a non-string id
    # cannot raise a TypeError during concatenation
    setTaskLabel = preJoin(taskPrefix, "countGenomeSegment_" + segStr)
    self.addTask(setTaskLabel, segCmd, dependencies=dependencies, memMb=self.params.callMemMb)
    nextStepWait.add(setTaskLabel)

    if self.params.isReportObservedIndels:
        compressTask = preJoin(taskPrefix, "compressSegmentOutput_" + segStr)
        compressCmd = "\"%s\" \"%s\"" % (self.params.bgzipBin, tmpObservedIndelBedPath)
        self.addTask(compressTask, compressCmd, dependencies=setTaskLabel, isForceLocal=True)
        nextStepWait.add(compressTask)

    return nextStepWait
def runHyGen(self, taskPrefix="", dependencies=None):
    """
    Run hypothesis generation on each SV locus.

    One hypothesis generation task is scheduled per work bin
    (self.params.nonlocalWorkBins in total). Once all of them are done, the
    per-bin VCFs are sorted into the final candidate / diploid / somatic /
    tumor VCFs, small indels are extracted from the sorted candidates, the
    edge runtime logs are sorted and the edge stats are merged.

    Returns the set of tasks later steps have to wait for: the hypothesis
    generation tasks, the VCF sort tasks and the small indel extraction
    task. The two edge log tasks are scheduled but not included.
    """
    import copy

    alignStatsFile = self.paths.getStatsPath()
    svGraphFile = self.paths.getGraphPath()
    hygenDir = self.paths.getHyGenDir()

    dirTask = self.addTask(preJoin(taskPrefix, "makeHyGenDir"),
                           "mkdir -p " + hygenDir,
                           dependencies=dependencies, isForceLocal=True)

    # both flags are only ever used as truth values
    isSomatic = (len(self.params.normalBamList) and len(self.params.tumorBamList))
    isTumorOnly = ((not isSomatic) and len(self.params.tumorBamList))

    localMemMb = self.params.hyGenLocalMemMb
    hyGenMemMb = self.params.hyGenSGEMemMb if self.getRunMode() == "sge" else localMemMb

    hygenTasks = set()
    candidateVcfPaths = []
    diploidVcfPaths = []
    somaticVcfPaths = []
    tumorVcfPaths = []

    edgeRuntimeLogPaths = []
    edgeStatsLogPaths = []

    for binId in range(self.params.nonlocalWorkBins):
        binStr = str(binId).zfill(4)

        candidateVcfPaths.append(self.paths.getHyGenCandidatePath(binStr))
        if isTumorOnly:
            tumorVcfPaths.append(self.paths.getHyGenTumorPath(binStr))
        else:
            diploidVcfPaths.append(self.paths.getHyGenDiploidPath(binStr))
        if isSomatic:
            somaticVcfPaths.append(self.paths.getHyGenSomaticPath(binStr))

        hygenCmd = [
            self.params.mantaHyGenBin,
            "--align-stats", alignStatsFile,
            "--graph-file", svGraphFile,
            "--bin-index", str(binId),
            "--bin-count", str(self.params.nonlocalWorkBins),
            "--min-candidate-sv-size", self.params.minCandidateVariantSize,
            "--min-candidate-spanning-count", self.params.minCandidateSpanningCount,
            "--min-scored-sv-size", self.params.minScoredVariantSize,
            "--ref", self.params.referenceFasta,
            "--candidate-output-file", candidateVcfPaths[-1],
        ]

        if isTumorOnly:
            # tumor-only mode
            hygenCmd += ["--tumor-output-file", tumorVcfPaths[-1]]
        else:
            hygenCmd += [
                "--diploid-output-file", diploidVcfPaths[-1],
                "--min-qual-score", self.params.minDiploidVariantScore,
                "--min-pass-qual-score", self.params.minPassDiploidVariantScore,
                "--min-pass-gt-score", self.params.minPassDiploidGTScore,
            ]

        if isSomatic:
            # tumor/normal mode
            hygenCmd += [
                "--somatic-output-file", somaticVcfPaths[-1],
                "--min-somatic-score", self.params.minSomaticScore,
                "--min-pass-somatic-score", self.params.minPassSomaticScore,
                # temporary fix for FFPE:
                "--skip-remote-reads",
            ]

        if self.params.isHighDepthFilter:
            hygenCmd += ["--chrom-depth", self.paths.getChromDepth()]

        edgeRuntimeLogPaths.append(self.paths.getHyGenEdgeRuntimeLogPath(binStr))
        hygenCmd += ["--edge-runtime-log", edgeRuntimeLogPaths[-1]]

        edgeStatsLogPaths.append(self.paths.getHyGenEdgeStatsPath(binStr))
        hygenCmd += ["--edge-stats-log", edgeStatsLogPaths[-1]]

        for normalBam in self.params.normalBamList:
            hygenCmd += ["--align-file", normalBam]
        for tumorBam in self.params.tumorBamList:
            hygenCmd += ["--tumor-align-file", tumorBam]

        if self.params.isIgnoreAnomProperPair:
            hygenCmd.append("--ignore-anom-proper-pair")
        if self.params.isRNA:
            hygenCmd.append("--rna")
        if self.params.isUnstrandedRNA:
            hygenCmd.append("--unstranded")

        binLabel = preJoin(taskPrefix, "generateCandidateSV_" + binStr)
        hygenTasks.add(self.addTask(binLabel, hygenCmd, dependencies=dirTask, memMb=hyGenMemMb))

    nextStepWait = copy.deepcopy(hygenTasks)

    def sortVcfs(pathList, outPath, label, isDiploid=False):
        """
        Schedule the task which sorts the VCFs in pathList into the
        bgzip-compressed, tabix-indexed file outPath.

        Returns the task label, or an empty set when pathList is empty and
        nothing was scheduled.
        """
        if len(pathList) == 0:
            return set()

        vcfSortCmd = "%s -E %s -u " % (sys.executable, self.params.mantaSortVcf)
        vcfSortCmd += " ".join(pathList)

        # apply the ploidy filter to diploid variants
        if isDiploid:
            ploidyInput = self.paths.getTempDiploidPath()
            vcfSortCmd += " > %s" % (ploidyInput)
            vcfSortCmd += " && %s %s" % (self.params.mantaPloidyFilter, ploidyInput)

        vcfSortCmd += " | %s -c > %s && %s -p vcf %s" % (self.params.bgzipBin, outPath, self.params.tabixBin, outPath)

        if isDiploid:
            vcfSortCmd += " && rm -f %s" % (self.paths.getTempDiploidPath())

        vcfSortLabel = preJoin(taskPrefix, label)
        nextStepWait.add(self.addTask(vcfSortLabel, vcfSortCmd, dependencies=hygenTasks))
        return vcfSortLabel

    candSortTask = sortVcfs(candidateVcfPaths, self.paths.getSortedCandidatePath(), "sortCandidateSV")
    sortVcfs(diploidVcfPaths, self.paths.getSortedDiploidPath(), "sortDiploidSV", isDiploid=True)
    sortVcfs(somaticVcfPaths, self.paths.getSortedSomaticPath(), "sortSomaticSV")
    sortVcfs(tumorVcfPaths, self.paths.getSortedTumorPath(), "sortTumorSV")

    def extractSmall(inPath, outPath):
        """
        Schedule the task which runs the small-variant extraction script on
        the sorted candidate VCF with --maxSize set to
        minScoredVariantSize - 1. Nothing is scheduled when that value is
        below 1.
        """
        sizeLimit = int(self.params.minScoredVariantSize) - 1
        if sizeLimit < 1:
            return

        extractCmd = "%s -dc %s" % (self.params.bgzipBin, inPath)
        extractCmd += " | %s -E %s --maxSize %i" % (sys.executable, self.params.mantaExtraSmallVcf, sizeLimit)
        extractCmd += " | %s -c > %s" % (self.params.bgzipBin, outPath)
        extractCmd += " && %s -p vcf %s" % (self.params.tabixBin, outPath)

        extractLabel = preJoin(taskPrefix, "extractSmallIndels")
        nextStepWait.add(self.addTask(extractLabel, extractCmd, dependencies=candSortTask, isForceLocal=True))

    extractSmall(self.paths.getSortedCandidatePath(), self.paths.getSortedCandidateSmallIndelsPath())

    # sort edge logs:
    runtimeSortLabel = preJoin(taskPrefix, "sortEdgeRuntimeLogs")
    runtimeSortCmd = "sort -rnk2 " + " ".join(edgeRuntimeLogPaths) + " >| " + self.paths.getSortedEdgeRuntimeLogPath()
    self.addTask(runtimeSortLabel, runtimeSortCmd, dependencies=hygenTasks, isForceLocal=True)

    # merge edge stats:
    statsMergeLabel = preJoin(taskPrefix, "mergeEdgeStats")
    statsMergeCmd = [self.params.mantaStatsMergeBin]
    for edgeStatsFile in edgeStatsLogPaths:
        statsMergeCmd += ["--stats-file", edgeStatsFile]
    statsMergeCmd += ["--output-file", self.paths.getFinalEdgeStatsPath()]
    statsMergeCmd += ["--report-file", self.paths.getFinalEdgeStatsReportPath()]
    self.addTask(statsMergeLabel, statsMergeCmd, dependencies=hygenTasks, isForceLocal=True)

    return nextStepWait
def runHyGen(self, taskPrefix="", dependencies=None):
    """
    Run hypothesis generation on each SV locus

    One hypothesis generation task is scheduled per work bin
    (self.params.nonlocalWorkBins in total). When support BAM generation is
    enabled, the per-bin evidence BAMs are sorted as soon as their bin has
    finished, and merged per sample once every sort task and every VCF sort
    task is done. Finally the edge runtime logs are sorted and the edge
    stats are merged.

    The per-bin VCF path lists are stored on self (candidateVcfPaths,
    diploidVcfPaths, somaticVcfPaths, tumorVcfPaths, rnaVcfPaths).

    Args:
        taskPrefix: prefix handed to preJoin for every task label.
        dependencies: dependencies of the first (mkdir) task.

    Returns:
        set of tasks later steps have to wait for: the hypothesis generation
        tasks and, when support BAMs are generated, also the BAM sort, VCF
        sort and BAM merge tasks.
    """
    import copy

    statsPath = self.paths.getStatsPath()
    graphPath = self.paths.getGraphPath()
    hygenDir = self.paths.getHyGenDir()

    makeHyGenDirCmd = getMkdirCmd() + [hygenDir]
    dirTask = self.addTask(preJoin(taskPrefix, "makeHyGenDir"), makeHyGenDirCmd,
                           dependencies=dependencies, isForceLocal=True)

    # both flags are only ever used as truth values
    isTumorNormal = (len(self.params.normalBamList) and len(self.params.tumorBamList))
    isTumorOnly = ((not isTumorNormal) and len(self.params.tumorBamList))

    hygenTasks = set()
    if self.params.isGenerateSupportBam:
        sortBamVcfTasks = set()

    self.candidateVcfPaths = []
    self.diploidVcfPaths = []
    self.somaticVcfPaths = []
    self.tumorVcfPaths = []
    self.rnaVcfPaths = []

    edgeRuntimeLogPaths = []
    edgeStatsLogPaths = []

    # Setup remote read retrieval for insertions:
    def isEnableRemoteReadRetrieval():
        """Select the cancer or the germline setting depending on the calling mode."""
        if isTumorOnly or isTumorNormal:
            return self.params.enableRemoteReadRetrievalForInsertionsInCancerCallingModes
        else:
            return self.params.enableRemoteReadRetrievalForInsertionsInGermlineCallingModes

    for binId in range(self.params.nonlocalWorkBins):
        binStr = str(binId).zfill(4)

        self.candidateVcfPaths.append(self.paths.getHyGenCandidatePath(binStr))
        if isTumorOnly:
            self.tumorVcfPaths.append(self.paths.getHyGenTumorPath(binStr))
        elif self.params.isRNA:
            self.rnaVcfPaths.append(self.paths.getHyGenRnaPath(binStr))
        else:
            self.diploidVcfPaths.append(self.paths.getHyGenDiploidPath(binStr))
            if isTumorNormal:
                self.somaticVcfPaths.append(self.paths.getHyGenSomaticPath(binStr))

        hygenCmd = [self.params.mantaHyGenBin]
        hygenCmd.extend(["--align-stats", statsPath])
        hygenCmd.extend(["--graph-file", graphPath])
        hygenCmd.extend(["--bin-index", str(binId)])
        hygenCmd.extend(["--bin-count", str(self.params.nonlocalWorkBins)])
        hygenCmd.extend(["--max-edge-count", str(self.params.graphNodeMaxEdgeCount)])
        hygenCmd.extend(["--min-candidate-sv-size", self.params.minCandidateVariantSize])
        hygenCmd.extend(["--min-candidate-spanning-count", self.params.minCandidateSpanningCount])
        hygenCmd.extend(["--min-scored-sv-size", self.params.minScoredVariantSize])
        hygenCmd.extend(["--ref", self.params.referenceFasta])
        hygenCmd.extend(["--candidate-output-file", self.candidateVcfPaths[-1]])

        # tumor-only mode
        if isTumorOnly:
            hygenCmd.extend(["--tumor-output-file", self.tumorVcfPaths[-1]])
        elif self.params.isRNA:
            hygenCmd.extend(["--rna-output-file", self.rnaVcfPaths[-1]])
        else:
            hygenCmd.extend(["--diploid-output-file", self.diploidVcfPaths[-1]])
            hygenCmd.extend(["--min-qual-score", self.params.minDiploidVariantScore])
            hygenCmd.extend(["--min-pass-qual-score", self.params.minPassDiploidVariantScore])
            hygenCmd.extend(["--min-pass-gt-score", self.params.minPassDiploidGTScore])
            # tumor/normal mode
            if isTumorNormal:
                hygenCmd.extend(["--somatic-output-file", self.somaticVcfPaths[-1]])
                hygenCmd.extend(["--min-somatic-score", self.params.minSomaticScore])
                hygenCmd.extend(["--min-pass-somatic-score", self.params.minPassSomaticScore])

        if isEnableRemoteReadRetrieval():
            hygenCmd.append("--enable-remote-read-retrieval")

        if self.params.isHighDepthFilter:
            hygenCmd.extend(["--chrom-depth", self.paths.getChromDepth()])

        edgeRuntimeLogPaths.append(self.paths.getHyGenEdgeRuntimeLogPath(binStr))
        hygenCmd.extend(["--edge-runtime-log", edgeRuntimeLogPaths[-1]])

        edgeStatsLogPaths.append(self.paths.getHyGenEdgeStatsPath(binStr))
        hygenCmd.extend(["--edge-stats-log", edgeStatsLogPaths[-1]])

        if self.params.isGenerateSupportBam:
            hygenCmd.extend(["--evidence-bam-stub", self.paths.getSupportBamStub(binStr)])

        for bamPath in self.params.normalBamList:
            hygenCmd.extend(["--align-file", bamPath])
        for bamPath in self.params.tumorBamList:
            hygenCmd.extend(["--tumor-align-file", bamPath])

        if self.params.isIgnoreAnomProperPair:
            hygenCmd.append("--ignore-anom-proper-pair")

        if self.params.useOverlapPairEvidence:
            hygenCmd.append("--use-overlapping-pair")

        if self.params.isRNA:
            hygenCmd.append("--rna")
        if self.params.isUnstrandedRNA:
            hygenCmd.append("--unstranded")

        if self.params.isOutputContig:
            hygenCmd.append("--output-contigs")

        hygenTask = preJoin(taskPrefix, "generateCandidateSV_" + binStr)
        hygenTasks.add(self.addTask(hygenTask, hygenCmd, dependencies=dirTask, memMb=self.params.hyGenMemMb))

        # TODO: if the bam is large, for efficiency, consider
        # 1) filtering the bin-specific bam first w.r.t. the final candidate vcf
        # 2) then sort the bin-specific bam and merge them
        # This would require moving the filter/sort bam jobs outside the hygen loop
        if self.params.isGenerateSupportBam:
            bamIndex = 0
            # sort supporting bams extracted from normal samples
            bamIndex = sortBams(self, sortBamVcfTasks, taskPrefix=taskPrefix, binStr=binStr,
                                isNormal=True, bamIdx=bamIndex, dependencies=hygenTask)
            # sort supporting bams extracted from tumor samples
            bamIndex = sortBams(self, sortBamVcfTasks, taskPrefix=taskPrefix, binStr=binStr,
                                isNormal=False, bamIdx=bamIndex, dependencies=hygenTask)

    vcfTasks = sortAllVcfs(self, taskPrefix=taskPrefix, dependencies=hygenTasks)
    nextStepWait = copy.deepcopy(hygenTasks)

    if self.params.isGenerateSupportBam:
        # update() modifies the set in place; the previous union() call
        # returned a new set which was thrown away, so the vcf sort tasks
        # never became dependencies of the merge tasks below
        sortBamVcfTasks.update(vcfTasks)

        mergeBamTasks = set()
        bamCount = 0
        # merge supporting bams for each normal sample
        bamCount = mergeSupportBams(self, mergeBamTasks, taskPrefix=taskPrefix,
                                    isNormal=True, bamIdx=bamCount, dependencies=sortBamVcfTasks)
        # merge supporting bams for each tumor sample
        bamCount = mergeSupportBams(self, mergeBamTasks, taskPrefix=taskPrefix,
                                    isNormal=False, bamIdx=bamCount, dependencies=sortBamVcfTasks)

        nextStepWait = nextStepWait.union(sortBamVcfTasks)
        nextStepWait = nextStepWait.union(mergeBamTasks)

    #
    # sort the edge runtime logs
    #
    logListFile = self.paths.getEdgeRuntimeLogListPath()
    logListTask = preJoin(taskPrefix, "sortEdgeRuntimeLogsInputList")
    self.addWorkflowTask(logListTask, listFileWorkflow(logListFile, edgeRuntimeLogPaths), dependencies=hygenTasks)

    def getEdgeLogSortCmd(logListFile, outPath):
        cmd = [sys.executable, self.params.mantaSortEdgeLogs, "-f", logListFile, "-o", outPath]
        return cmd

    edgeSortCmd = getEdgeLogSortCmd(logListFile, self.paths.getSortedEdgeRuntimeLogPath())
    self.addTask(preJoin(taskPrefix, "sortEdgeRuntimeLogs"), edgeSortCmd, dependencies=logListTask, isForceLocal=True)

    #
    # merge all edge stats
    #
    statsFileList = self.paths.getStatsFileListPath()
    statsListTask = preJoin(taskPrefix, "mergeEdgeStatsInputList")
    self.addWorkflowTask(statsListTask, listFileWorkflow(statsFileList, edgeStatsLogPaths), dependencies=hygenTasks)

    edgeStatsMergeTask = preJoin(taskPrefix, "mergeEdgeStats")
    edgeStatsMergeCmd = [self.params.mantaStatsMergeBin]
    edgeStatsMergeCmd.extend(["--stats-file-list", statsFileList])
    edgeStatsMergeCmd.extend(["--output-file", self.paths.getFinalEdgeStatsPath()])
    edgeStatsMergeCmd.extend(["--report-file", self.paths.getFinalEdgeStatsReportPath()])
    self.addTask(edgeStatsMergeTask, edgeStatsMergeCmd, dependencies=statsListTask, isForceLocal=True)

    if not self.params.isRetainTempFiles:
        # we could delete the temp hygenDir directory here, but it is used for debug so frequently it doesn't seem worth it at present.
        pass

    return nextStepWait
def runHyGen(self, taskPrefix="", dependencies=None) :
    """
    Run hypothesis generation on each SV locus

    One candidate generation task is added per work bin
    (self.params.nonlocalWorkBins bins), followed by one sort task for each
    non-empty list of per-bin VCFs and a single task which sorts the per-bin
    edge logs into one file.

    @param taskPrefix: prefix joined (via preJoin) onto every task label
    @param dependencies: dependencies of the initial directory creation task
    @return set of the values returned by addTask for the per-bin tasks and
            the VCF sort tasks (the edge log sort task is not included)
    """

    import copy

    params = self.params
    paths = self.paths

    statsFile = paths.getStatsPath()
    graphFile = paths.getGraphPath()
    workDir = paths.getHyGenDir()

    mkdirTask = self.addTask(preJoin(taskPrefix, "makeHyGenDir"),
                             "mkdir -p " + workDir,
                             dependencies=dependencies,
                             isForceLocal=True)

    # only the truth value is used: true when both normal and tumor bams exist
    isSomatic = (len(params.normalBamList) and len(params.tumorBamList))

    # the "sge" run mode has its own memory setting
    taskMemMb = params.hyGenLocalMemMb
    if self.getRunMode() == "sge" :
        taskMemMb = params.hyGenSGEMemMb

    binTasks = set()
    candidateVcfs = []
    diploidVcfs = []
    somaticVcfs = []
    edgeLogs = []

    for binIndex in range(params.nonlocalWorkBins) :
        binLabel = str(binIndex).zfill(4)

        # register this bin's output files
        candidateVcf = paths.getHyGenCandidatePath(binLabel)
        candidateVcfs.append(candidateVcf)
        diploidVcf = paths.getHyGenDiploidPath(binLabel)
        diploidVcfs.append(diploidVcf)
        if isSomatic :
            somaticVcfs.append(paths.getHyGenSomaticPath(binLabel))
        edgeLog = paths.getHyGenEdgeLogPath(binLabel)
        edgeLogs.append(edgeLog)

        binCmd = [
            params.mantaHyGenBin,
            "--align-stats", statsFile,
            "--graph-file", graphFile,
            "--bin-index", str(binIndex),
            "--bin-count", str(params.nonlocalWorkBins),
            "--min-candidate-sv-size", params.minCandidateVariantSize,
            "--min-scored-sv-size", params.minScoredVariantSize,
            "--ref", params.referenceFasta,
            "--candidate-output-file", candidateVcf,
            "--diploid-output-file", diploidVcf,
            "--min-qual-score", params.minDiploidVariantScore,
            "--min-pass-gt-score", params.minPassGTScore,
        ]

        if isSomatic :
            binCmd += ["--somatic-output-file", somaticVcfs[-1]]
            binCmd += ["--min-somatic-score", params.minSomaticScore]
            binCmd += ["--min-pass-somatic-score", params.minPassSomaticScore]

        if params.isHighDepthFilter :
            binCmd += ["--chrom-depth", paths.getChromDepth()]

        binCmd += ["--edge-runtime-log", edgeLog]

        for normalBam in params.normalBamList :
            binCmd += ["--align-file", normalBam]
        for tumorBam in params.tumorBamList :
            binCmd += ["--tumor-align-file", tumorBam]

        if params.isIgnoreAnomProperPair :
            binCmd.append("--ignore-anom-proper-pair")
        if params.isRNA :
            binCmd.append("--rna")

        binTaskLabel = preJoin(taskPrefix, "generateCandidateSV_" + binLabel)
        binTasks.add(self.addTask(binTaskLabel, binCmd,
                                  dependencies=mkdirTask,
                                  memMb=taskMemMb))

    # independent copy: sort tasks are added to the return set only, so the
    # dependency set handed to later tasks stays limited to the bin tasks
    nextStepWait = copy.deepcopy(binTasks)

    def getVcfSortCmd(vcfPaths, outPath) :
        """
        Build the shell pipeline: sort script -> bgzip -> tabix index
        """
        pieces = [
            "%s -E %s -u " % (sys.executable, params.mantaSortVcf),
            " ".join(vcfPaths),
            " | %s -c > %s && %s -p vcf %s" % (params.bgzipBin, outPath, params.tabixBin, outPath),
        ]
        return "".join(pieces)

    def sortVcfs(pathList, outPath, label) :
        """
        Add a sort task for pathList unless the list is empty
        """
        if not len(pathList) :
            return
        vcfSortCmd = getVcfSortCmd(pathList, outPath)
        vcfSortLabel = preJoin(taskPrefix, label)
        nextStepWait.add(self.addTask(vcfSortLabel, vcfSortCmd, dependencies=binTasks))

    sortVcfs(candidateVcfs, paths.getSortedCandidatePath(), "sortCandidateSV")
    sortVcfs(diploidVcfs, paths.getSortedDiploidPath(), "sortDiploidSV")
    sortVcfs(somaticVcfs, paths.getSortedSomaticPath(), "sortSomaticSV")

    # sort edge logs:
    edgeLogSortLabel = preJoin(taskPrefix, "sortEdgeLogs")
    edgeLogSortCmd = "sort -rnk2 " + " ".join(edgeLogs) + " >| " + paths.getSortedEdgeLogPath()
    self.addTask(edgeLogSortLabel, edgeLogSortCmd, dependencies=binTasks, isForceLocal=True)

    return nextStepWait
def runLocusGraph(self,taskPrefix="",dependencies=None):
    """
    Create the full SV locus graph

    A partial graph is built for every genome segment group, the partial
    graphs are merged (their paths are passed via a list file), and the
    merged graph is then checked and summarized by separate tasks.

    @param taskPrefix: prefix joined (via preJoin) onto every task label
    @param dependencies: dependencies of the initial directory creation task
    @return set containing only the graph check task
    """

    params = self.params
    paths = self.paths

    statsFile = paths.getStatsPath()
    mergedGraph = paths.getGraphPath()
    graphStatsFile = paths.getGraphStatsPath()

    scratchDir = paths.getTmpGraphDir()

    mkdirCmd = getMkdirCmd() + [scratchDir]
    mkdirTask = self.addTask(preJoin(taskPrefix, "makeGraphTmpDir"),
                             mkdirCmd,
                             dependencies=dependencies,
                             isForceLocal=True)

    partialGraphs = []
    partialGraphTasks = set()

    for segGroup in getGenomeSegmentGroups(getNextGenomeSegment(params)) :
        assert(len(segGroup) != 0)

        # label the group by its first segment, or "first_to_last" for several
        groupId = segGroup[0].id
        if len(segGroup) > 1 :
            groupId = groupId + "_to_" + segGroup[-1].id

        partialGraph = paths.getTmpGraphFile(groupId)
        partialGraphs.append(partialGraph)

        segCmd = [
            params.mantaGraphBin,
            "--output-file", partialGraph,
            "--align-stats", statsFile,
        ]
        for seg in segGroup :
            segCmd += ["--region", seg.bamRegion]
        segCmd += ["--min-candidate-sv-size", params.minCandidateVariantSize]
        segCmd += ["--min-edge-observations", params.minEdgeObservations]
        segCmd += ["--ref", params.referenceFasta]
        for normalBam in params.normalBamList :
            segCmd += ["--align-file", normalBam]
        for tumorBam in params.tumorBamList :
            segCmd += ["--tumor-align-file", tumorBam]

        if params.isHighDepthFilter :
            segCmd += ["--chrom-depth", paths.getChromDepth()]
        if params.isIgnoreAnomProperPair :
            segCmd.append("--ignore-anom-proper-pair")
        if params.isRNA :
            segCmd.append("--rna")

        segTaskLabel = preJoin(taskPrefix, "makeLocusGraph_" + groupId)
        partialGraphTasks.add(self.addTask(segTaskLabel, segCmd,
                                           dependencies=mkdirTask,
                                           memMb=params.estimateMemMb))

    if not len(partialGraphs) :
        raise Exception("No SV Locus graphs to create. Possible target region parse error.")

    # the merge step reads the partial graph paths from a list file
    graphListFile = paths.getTmpGraphFileListPath()
    graphListTask = preJoin(taskPrefix, "mergeLocusGraphInputList")
    self.addWorkflowTask(graphListTask,
                         listFileWorkflow(graphListFile, partialGraphs),
                         dependencies=partialGraphTasks)

    mergeCmd = [
        params.mantaGraphMergeBin,
        "--output-file", mergedGraph,
        "--graph-file-list", graphListFile,
    ]
    mergeTask = self.addTask(preJoin(taskPrefix, "mergeLocusGraph"),
                             mergeCmd,
                             dependencies=graphListTask,
                             memMb=params.mergeMemMb)

    # Validate the final graph in its own process: the sv candidate
    # generators check it as well, but a dedicated task makes a failure
    # much easier to attribute.
    checkCmd = [params.mantaGraphCheckBin, "--graph-file", mergedGraph]
    checkTask = self.addTask(preJoin(taskPrefix, "checkLocusGraph"),
                             checkCmd,
                             dependencies=mergeTask,
                             memMb=params.mergeMemMb)

    if not params.isRetainTempFiles :
        rmdirCmd = getRmdirCmd() + [scratchDir]
        self.addTask(preJoin(taskPrefix, "removeTmpDir"), rmdirCmd, dependencies=mergeTask)

    statsCmd = [
        params.mantaGraphStatsBin, "--global",
        "--graph-file", mergedGraph,
        "--output-file", graphStatsFile,
    ]
    self.addTask(preJoin(taskPrefix, "locusGraphStats"),
                 statsCmd,
                 dependencies=mergeTask,
                 memMb=params.mergeMemMb)

    # downstream steps wait on the check task only
    return set([checkTask])
def sortVcfs(pathList, outPath, label, isDiploid=False) :
    """
    Add a sort task for pathList followed by a tabix task for outPath

    The tabix task result is added to nextStepWait.

    @return the sort task, or an empty set when pathList is empty
    """
    if not len(pathList) :
        return set()

    vcfSortCmd = getVcfSortCmd(pathList, outPath, isDiploid)
    vcfSortTask = self.addTask(preJoin(taskPrefix, "sort_" + label),
                               vcfSortCmd,
                               dependencies=hygenTasks)

    # indexing runs locally once the sorted file exists
    tabixTask = self.addTask(preJoin(taskPrefix, "tabix_" + label),
                             getVcfTabixCmd(outPath),
                             dependencies=vcfSortTask,
                             isForceLocal=True)
    nextStepWait.add(tabixTask)
    return vcfSortTask
def runHyGen(self, taskPrefix="", dependencies=None) :
    """
    Run hypothesis generation on each SV locus

    One candidate generation task is added per work bin
    (self.params.nonlocalWorkBins bins); the per-bin candidate VCFs (and
    somatic VCFs, when both normal and tumor bams are given) are then
    consolidated by sort tasks.

    @param taskPrefix: prefix joined (via preJoin) onto every task label
    @param dependencies: dependencies of the initial directory creation task
    @return set of the values returned by addTask for the per-bin tasks and
            the sort tasks
    """

    statsPath=self.paths.getStatsPath()
    graphPath=self.paths.getGraphPath()
    hygenDir=self.paths.getHyGenDir()

    dirTask=self.addTask(preJoin(taskPrefix,"makeHyGenDir"), "mkdir -p "+ hygenDir, dependencies=dependencies, isForceLocal=True)

    # only the truth value is used: true when both normal and tumor bams exist
    isSomatic = (len(self.params.normalBamList) and len(self.params.tumorBamList))

    hygenTasks=set()
    candidateVcfPaths = []
    somaticVcfPaths = []

    for binId in range(self.params.nonlocalWorkBins) :
        binStr = str(binId).zfill(4)
        candidateVcfPaths.append(self.paths.getHyGenCandidatePath(binStr))
        if isSomatic :
            somaticVcfPaths.append(self.paths.getHyGenSomaticPath(binStr))

        hygenCmd = [ self.params.mantaHyGenBin ]
        hygenCmd.extend(["--align-stats",statsPath])
        hygenCmd.extend(["--graph-file",graphPath])
        hygenCmd.extend(["--bin-index", str(binId)])
        hygenCmd.extend(["--bin-count", str(self.params.nonlocalWorkBins)])
        hygenCmd.extend(["--min-candidate-sv-size", self.params.minCandidateVariantSize])
        hygenCmd.extend(["--min-scored-sv-size", self.params.minScoredVariantSize])
        hygenCmd.extend(["--ref",self.params.referenceFasta])
        hygenCmd.extend(["--candidate-output-file", candidateVcfPaths[-1]])
        if isSomatic :
            hygenCmd.extend(["--somatic-output-file", somaticVcfPaths[-1]])

        if not self.params.isExome :
            hygenCmd.extend(["--chrom-depth", self.paths.getChromDepth()])

        for bamPath in self.params.normalBamList :
            hygenCmd.extend(["--align-file", bamPath])
        for bamPath in self.params.tumorBamList :
            hygenCmd.extend(["--tumor-align-file", bamPath])

        hygenTaskLabel=preJoin(taskPrefix,"generateCandidateSV_"+binStr)
        hygenTasks.add(self.addTask(hygenTaskLabel,hygenCmd,dependencies=dirTask, memMb=self.params.hyGenMemMb))

    # Take a copy rather than an alias: the sort tasks added below must not
    # end up in hygenTasks, otherwise each later sort task would also list
    # the earlier sort tasks among its dependencies.
    nextStepWait = set(hygenTasks)

    def getVcfSortCmd(vcfPaths, outPath) :
        """
        Build the shell pipeline: sort script -> bgzip -> tabix index
        """
        cmd  = "%s -E %s " % (sys.executable,self.params.mantaSortVcf)
        cmd += " ".join(vcfPaths)
        cmd += " | %s -c > %s && %s -p vcf %s" % (self.params.bgzipBin, outPath, self.params.tabixBin, outPath)
        return cmd

    # consolidate output:
    if len(candidateVcfPaths) :
        outPath = self.paths.getSortedCandidatePath()
        candSortCmd = getVcfSortCmd(candidateVcfPaths,outPath)
        candSortLabel=preJoin(taskPrefix,"sortCandidateSV")
        nextStepWait.add(self.addTask(candSortLabel,candSortCmd,dependencies=hygenTasks))

    if len(somaticVcfPaths) :
        outPath = self.paths.getSortedSomaticPath()
        somaticSortCmd = getVcfSortCmd(somaticVcfPaths,outPath)
        somaticSortLabel=preJoin(taskPrefix,"sortSomaticSV")
        nextStepWait.add(self.addTask(somaticSortLabel,somaticSortCmd,dependencies=hygenTasks))

    return nextStepWait
def sortVcfs(pathList, outPath, label, isDiploid=False) :
    """
    Add a sort task for pathList and register it in nextStepWait

    @return the sort task label, or an empty set when pathList is empty
    """
    if not len(pathList) :
        return set()

    vcfSortCmd = getVcfSortCmd(pathList, outPath, isDiploid)
    vcfSortLabel = preJoin(taskPrefix, label)
    vcfSortTask = self.addTask(vcfSortLabel, vcfSortCmd, dependencies=hygenTasks)
    nextStepWait.add(vcfSortTask)
    return vcfSortLabel
def runHyGen(self, taskPrefix="", dependencies=None) :
    """
    Run hypothesis generation on each SV locus

    One candidate generation task is added per work bin
    (self.params.nonlocalWorkBins bins). The per-bin VCF paths are stored on
    self (candidateVcfPaths, diploidVcfPaths, somaticVcfPaths, tumorVcfPaths)
    before sortAllVcfs is called. The per-bin edge runtime logs and edge
    stats files are then consolidated, with their paths passed via list files.

    @param taskPrefix: prefix joined (via preJoin) onto every task label
    @param dependencies: dependencies of the initial directory creation task
    @return a copy of the set of per-bin task results
    """

    import copy

    params = self.params
    paths = self.paths

    statsFile = paths.getStatsPath()
    graphFile = paths.getGraphPath()
    workDir = paths.getHyGenDir()

    mkdirCmd = getMkdirCmd() + [workDir]
    mkdirTask = self.addTask(preJoin(taskPrefix, "makeHyGenDir"),
                             mkdirCmd,
                             dependencies=dependencies,
                             isForceLocal=True)

    # only truth values are used below
    isSomatic = (len(params.normalBamList) and len(params.tumorBamList))
    isTumorOnly = ((not isSomatic) and len(params.tumorBamList))

    # the "sge" run mode has its own memory setting
    taskMemMb = params.hyGenLocalMemMb
    if self.getRunMode() == "sge" :
        taskMemMb = params.hyGenSGEMemMb

    binTasks = set()

    # the vcf path lists live on self
    self.candidateVcfPaths = []
    self.diploidVcfPaths = []
    self.somaticVcfPaths = []
    self.tumorVcfPaths = []

    runtimeLogs = []
    statsLogs = []

    for binIndex in range(params.nonlocalWorkBins) :
        binLabel = str(binIndex).zfill(4)

        self.candidateVcfPaths.append(paths.getHyGenCandidatePath(binLabel))
        if isTumorOnly :
            self.tumorVcfPaths.append(paths.getHyGenTumorPath(binLabel))
        else :
            self.diploidVcfPaths.append(paths.getHyGenDiploidPath(binLabel))
            if isSomatic :
                self.somaticVcfPaths.append(paths.getHyGenSomaticPath(binLabel))

        binCmd = [
            params.mantaHyGenBin,
            "--align-stats", statsFile,
            "--graph-file", graphFile,
            "--bin-index", str(binIndex),
            "--bin-count", str(params.nonlocalWorkBins),
            "--min-candidate-sv-size", params.minCandidateVariantSize,
            "--min-candidate-spanning-count", params.minCandidateSpanningCount,
            "--min-scored-sv-size", params.minScoredVariantSize,
            "--ref", params.referenceFasta,
            "--candidate-output-file", self.candidateVcfPaths[-1],
        ]

        # tumor-only mode
        if isTumorOnly :
            binCmd += ["--tumor-output-file", self.tumorVcfPaths[-1]]
        else :
            binCmd += ["--diploid-output-file", self.diploidVcfPaths[-1]]
            binCmd += ["--min-qual-score", params.minDiploidVariantScore]
            binCmd += ["--min-pass-qual-score", params.minPassDiploidVariantScore]
            binCmd += ["--min-pass-gt-score", params.minPassDiploidGTScore]

            # tumor/normal mode
            if isSomatic :
                binCmd += ["--somatic-output-file", self.somaticVcfPaths[-1]]
                binCmd += ["--min-somatic-score", params.minSomaticScore]
                binCmd += ["--min-pass-somatic-score", params.minPassSomaticScore]

                # temporary fix for FFPE:
                binCmd.append("--skip-remote-reads")

        if params.isHighDepthFilter :
            binCmd += ["--chrom-depth", paths.getChromDepth()]

        runtimeLogs.append(paths.getHyGenEdgeRuntimeLogPath(binLabel))
        binCmd += ["--edge-runtime-log", runtimeLogs[-1]]

        statsLogs.append(paths.getHyGenEdgeStatsPath(binLabel))
        binCmd += ["--edge-stats-log", statsLogs[-1]]

        for normalBam in params.normalBamList :
            binCmd += ["--align-file", normalBam]
        for tumorBam in params.tumorBamList :
            binCmd += ["--tumor-align-file", tumorBam]

        if params.isIgnoreAnomProperPair :
            binCmd.append("--ignore-anom-proper-pair")
        if params.isRNA :
            binCmd.append("--rna")
        if params.isUnstrandedRNA :
            binCmd.append("--unstranded")

        binTaskLabel = preJoin(taskPrefix, "generateCandidateSV_" + binLabel)
        binTasks.add(self.addTask(binTaskLabel, binCmd,
                                  dependencies=mkdirTask,
                                  memMb=taskMemMb))

    nextStepWait = copy.deepcopy(binTasks)

    sortAllVcfs(self, taskPrefix=taskPrefix, dependencies=binTasks)

    #
    # sort the edge runtime logs
    #
    runtimeLogList = paths.getEdgeRuntimeLogListPath()
    runtimeLogListTask = preJoin(taskPrefix, "sortEdgeRuntimeLogsInputList")
    self.addWorkflowTask(runtimeLogListTask,
                         listFileWorkflow(runtimeLogList, runtimeLogs),
                         dependencies=binTasks)

    def getEdgeLogSortCmd(logListFile, outPath) :
        """
        Argument list for the edge log sort script, reading its inputs from a list file
        """
        return [sys.executable, "-E", params.mantaSortEdgeLogs, "-f", logListFile, "-o", outPath]

    runtimeLogSortCmd = getEdgeLogSortCmd(runtimeLogList, paths.getSortedEdgeRuntimeLogPath())
    self.addTask(preJoin(taskPrefix, "sortEdgeRuntimeLogs"),
                 runtimeLogSortCmd,
                 dependencies=runtimeLogListTask,
                 isForceLocal=True)

    #
    # merge all edge stats
    #
    statsList = paths.getStatsFileListPath()
    statsListTask = preJoin(taskPrefix, "mergeEdgeStatsInputList")
    self.addWorkflowTask(statsListTask,
                         listFileWorkflow(statsList, statsLogs),
                         dependencies=binTasks)

    statsMergeLabel = preJoin(taskPrefix, "mergeEdgeStats")
    statsMergeCmd = [
        params.mantaStatsMergeBin,
        "--stats-file-list", statsList,
        "--output-file", paths.getFinalEdgeStatsPath(),
        "--report-file", paths.getFinalEdgeStatsReportPath(),
    ]
    self.addTask(statsMergeLabel, statsMergeCmd, dependencies=statsListTask, isForceLocal=True)

    if not params.isRetainTempFiles :
        # The temp hygenDir directory could be deleted here, but it is used
        # for debugging so often that removal does not seem worth it for now.
        # rmDirCmd = getRmdirCmd() + [hygenDir]
        # rmDirTask=self.addTask(preJoin(taskPrefix,"rmTmpDir"),rmDirCmd,dependencies=TBD_XXX_MANY)
        pass

    return nextStepWait
def runLocusGraph(self,taskPrefix="",dependencies=None):
    """
    Create the full SV locus graph

    A partial graph is built for every genome segment group, the partial
    graphs are merged into the final graph, and the merged graph is then
    checked and summarized by separate tasks. The temporary directory is
    removed once the merge has completed.

    @param taskPrefix: prefix joined (via preJoin) onto every task label
    @param dependencies: dependencies of the initial directory creation task
    @return set containing only the graph check task
    @raise Exception: if no genome segment groups are produced
    """

    statsPath=self.paths.getStatsPath()
    graphPath=self.paths.getGraphPath()
    graphStatsPath=self.paths.getGraphStatsPath()

    graphFilename=os.path.basename(graphPath)
    tmpGraphDir=os.path.join(self.params.workDir,graphFilename+".tmpdir")

    makeTmpDirCmd = getMkdirCmd() + [tmpGraphDir]
    dirTask=self.addTask(preJoin(taskPrefix,"makeTmpDir"), makeTmpDirCmd, dependencies=dependencies, isForceLocal=True)

    tmpGraphFiles = []
    graphTasks = set()

    def getGenomeSegmentGroups(params) :
        """
        Iterate segment groups and 'clump' small contigs together

        Consecutive segments are accumulated into one group while their
        summed size stays within minSegmentGroupSize; a segment which would
        exceed the limit closes the current group and starts a new one.
        """
        minSegmentGroupSize=200000
        group = []
        headSize = 0
        # use the argument rather than reaching back to self.params, so the
        # helper honors whatever parameter object the caller passes in
        for gseg in getNextGenomeSegment(params) :
            if headSize+gseg.size() <= minSegmentGroupSize :
                group.append(gseg)
                headSize += gseg.size()
            else :
                if len(group) != 0 : yield(group)
                group = [gseg]
                headSize = gseg.size()
        if len(group) != 0 : yield(group)

    for gsegGroup in getGenomeSegmentGroups(self.params) :
        assert(len(gsegGroup) != 0)

        # label the group by its first segment, or "first_to_last" for several
        gid=gsegGroup[0].id
        if len(gsegGroup) > 1 :
            gid += "_to_"+gsegGroup[-1].id

        tmpGraphFiles.append(os.path.join(tmpGraphDir,graphFilename+"."+gid+".bin"))
        graphCmd = [ self.params.mantaGraphBin ]
        graphCmd.extend(["--output-file", tmpGraphFiles[-1]])
        graphCmd.extend(["--align-stats",statsPath])
        for gseg in gsegGroup :
            graphCmd.extend(["--region",gseg.bamRegion])
        graphCmd.extend(["--min-candidate-sv-size", self.params.minCandidateVariantSize])
        graphCmd.extend(["--min-edge-observations", self.params.minEdgeObservations])
        graphCmd.extend(["--ref",self.params.referenceFasta])
        for bamPath in self.params.normalBamList :
            graphCmd.extend(["--align-file",bamPath])
        for bamPath in self.params.tumorBamList :
            graphCmd.extend(["--tumor-align-file",bamPath])

        if self.params.isHighDepthFilter :
            graphCmd.extend(["--chrom-depth", self.paths.getChromDepth()])
        if self.params.isIgnoreAnomProperPair :
            graphCmd.append("--ignore-anom-proper-pair")
        if self.params.isRNA :
            graphCmd.append("--rna")

        graphTaskLabel=preJoin(taskPrefix,"makeLocusGraph_"+gid)
        graphTasks.add(self.addTask(graphTaskLabel,graphCmd,dependencies=dirTask,memMb=self.params.estimateMemMb))

    if len(tmpGraphFiles) == 0 :
        raise Exception("No SV Locus graphs to create. Possible target region parse error.")

    mergeCmd = [ self.params.mantaGraphMergeBin ]
    mergeCmd.extend(["--output-file", graphPath])
    for gfile in tmpGraphFiles :
        mergeCmd.extend(["--graph-file", gfile])

    mergeTask = self.addTask(preJoin(taskPrefix,"mergeLocusGraph"),mergeCmd,dependencies=graphTasks,memMb=self.params.mergeMemMb)

    # Run a separate process to rigorously check that the final graph is valid, the sv candidate generators will check as well, but
    # this makes the check much more clear:
    checkCmd = [ self.params.mantaGraphCheckBin ]
    checkCmd.extend(["--graph-file", graphPath])
    checkTask = self.addTask(preJoin(taskPrefix,"checkLocusGraph"),checkCmd,dependencies=mergeTask,memMb=self.params.mergeMemMb)

    # the partial graphs are no longer needed once the merge has finished
    rmGraphTmpCmd = getRmdirCmd() + [tmpGraphDir]
    self.addTask(preJoin(taskPrefix,"rmTmpDir"),rmGraphTmpCmd,dependencies=mergeTask)

    graphStatsCmd  = [self.params.mantaGraphStatsBin,"--global"]
    graphStatsCmd.extend(["--graph-file",graphPath])
    graphStatsCmd.extend(["--output-file",graphStatsPath])
    self.addTask(preJoin(taskPrefix,"locusGraphStats"),graphStatsCmd,dependencies=mergeTask,memMb=self.params.mergeMemMb)

    # downstream steps wait on the check task only
    nextStepWait = set()
    nextStepWait.add(checkTask)
    return nextStepWait
def depthFunc(self,taskPrefix,dependencies,bamFile,outFile) :
    """
    Add a task which runs the chrom depth script on one bam file

    The script's standard output is redirected to outFile.

    @param taskPrefix: prefix joined (via preJoin) onto the task label
    @param dependencies: dependencies of the new task
    @param bamFile: bam file passed to the script via --bam
    @param outFile: file receiving the script output
    @return the value returned by self.addTask
    """
    cmd  = "%s -E '%s'" % (sys.executable, self.params.getChromDepth)
    cmd += " --bam '%s'" % (bamFile)
    # quote the redirect target like the other paths, so that an output path
    # containing whitespace is not split by the shell
    cmd += " > '%s'" % (outFile)
    return self.addTask(preJoin(taskPrefix,"estimateChromDepth"),cmd,dependencies=dependencies)
def runLocusGraph(self,taskPrefix="",dependencies=None):
    """
    Create the full SV locus graph

    A partial graph is built for every genome segment, the partial graphs
    are merged into the final graph, and the merged graph is then checked
    and summarized by separate tasks. The temporary directory is removed
    once the merge has completed.

    @param taskPrefix: prefix joined (via preJoin) onto every task label
    @param dependencies: dependencies of the initial directory creation task
    @return set containing only the graph check task
    @raise Exception: if no genome segments are produced
    """

    params = self.params
    paths = self.paths

    statsFile = paths.getStatsPath()
    mergedGraph = paths.getGraphPath()
    graphStatsFile = paths.getGraphStatsPath()

    # partial graphs are written to "<workDir>/<graph file name>.tmpdir"
    graphBaseName = os.path.basename(mergedGraph)
    scratchDir = os.path.join(params.workDir, graphBaseName + ".tmpdir")

    mkdirTask = self.addTask(preJoin(taskPrefix, "makeTmpDir"),
                             "mkdir -p " + scratchDir,
                             dependencies=dependencies,
                             isForceLocal=True)

    partialGraphs = []
    partialGraphTasks = set()

    for seg in getNextGenomeSegment(params) :
        partialGraph = os.path.join(scratchDir, graphBaseName + "." + seg.id + ".bin")
        partialGraphs.append(partialGraph)

        segCmd = [
            params.mantaGraphBin,
            "--output-file", partialGraph,
            "--align-stats", statsFile,
            "--region", seg.bamRegion,
            "--min-candidate-sv-size", params.minCandidateVariantSize,
            "--min-edge-observations", params.minEdgeObservations,
            "--ref", params.referenceFasta,
        ]
        for normalBam in params.normalBamList :
            segCmd += ["--align-file", normalBam]
        for tumorBam in params.tumorBamList :
            segCmd += ["--tumor-align-file", tumorBam]

        if params.isHighDepthFilter :
            segCmd += ["--chrom-depth", paths.getChromDepth()]
        if params.isIgnoreAnomProperPair :
            segCmd.append("--ignore-anom-proper-pair")
        if params.isRNA :
            segCmd.append("--rna")

        segTaskLabel = preJoin(taskPrefix, "makeLocusGraph_" + seg.pyflowId)
        partialGraphTasks.add(self.addTask(segTaskLabel, segCmd,
                                           dependencies=mkdirTask,
                                           memMb=params.estimateMemMb))

    if not len(partialGraphs) :
        raise Exception("No SV Locus graphs to create. Possible target region parse error.")

    mergeCmd = [params.mantaGraphMergeBin, "--output-file", mergedGraph]
    for partialGraph in partialGraphs :
        mergeCmd += ["--graph-file", partialGraph]

    mergeTask = self.addTask(preJoin(taskPrefix, "mergeLocusGraph"),
                             mergeCmd,
                             dependencies=partialGraphTasks,
                             memMb=params.mergeMemMb)

    # Validate the final graph in its own process: the sv candidate
    # generators check it as well, but a dedicated task makes a failure
    # much easier to attribute.
    checkCmd = [params.mantaGraphCheckBin, "--graph-file", mergedGraph]
    checkTask = self.addTask(preJoin(taskPrefix, "checkLocusGraph"),
                             checkCmd,
                             dependencies=mergeTask,
                             memMb=params.mergeMemMb)

    # the partial graphs are no longer needed once the merge has finished
    self.addTask(preJoin(taskPrefix, "rmTmpDir"),
                 "rm -rf " + scratchDir,
                 dependencies=mergeTask)

    # shell command string: the stats are written via redirection
    statsCmd = params.mantaGraphStatsBin
    statsCmd = statsCmd + " --global"
    statsCmd = statsCmd + " --graph-file " + mergedGraph
    statsCmd = statsCmd + " >| " + graphStatsFile
    self.addTask(preJoin(taskPrefix, "locusGraphStats"),
                 statsCmd,
                 dependencies=mergeTask,
                 memMb=params.mergeMemMb)

    # downstream steps wait on the check task only
    return set([checkTask])
def runHyGen(self, taskPrefix="", dependencies=None) :
    """
    Run hypothesis generation on each SV locus

    One candidate generation task is added per work bin
    (self.params.nonlocalWorkBins bins). Each non-empty list of per-bin VCFs
    then gets a header fix task, a sort task and a tabix task. A further
    task extracts small variants from the sorted candidate VCF. The per-bin
    edge runtime logs and edge stats files are consolidated at the end.

    @param taskPrefix: prefix joined (via preJoin) onto every task label
    @param dependencies: dependencies of the initial directory creation task
    @return set of the values returned by addTask for the per-bin tasks and
            all tabix tasks
    """

    import copy

    params = self.params
    paths = self.paths

    statsFile = paths.getStatsPath()
    graphFile = paths.getGraphPath()
    workDir = paths.getHyGenDir()

    mkdirCmd = getMkdirCmd() + [workDir]
    mkdirTask = self.addTask(preJoin(taskPrefix, "makeHyGenDir"),
                             mkdirCmd,
                             dependencies=dependencies,
                             isForceLocal=True)

    # only truth values are used below
    isSomatic = (len(params.normalBamList) and len(params.tumorBamList))
    isTumorOnly = ((not isSomatic) and len(params.tumorBamList))

    # the "sge" run mode has its own memory setting
    taskMemMb = params.hyGenLocalMemMb
    if self.getRunMode() == "sge" :
        taskMemMb = params.hyGenSGEMemMb

    binTasks = set()
    candidateVcfs = []
    diploidVcfs = []
    somaticVcfs = []
    tumorVcfs = []

    runtimeLogs = []
    statsLogs = []

    for binIndex in range(params.nonlocalWorkBins) :
        binLabel = str(binIndex).zfill(4)

        candidateVcfs.append(paths.getHyGenCandidatePath(binLabel))
        if isTumorOnly :
            tumorVcfs.append(paths.getHyGenTumorPath(binLabel))
        else :
            diploidVcfs.append(paths.getHyGenDiploidPath(binLabel))
            if isSomatic :
                somaticVcfs.append(paths.getHyGenSomaticPath(binLabel))

        binCmd = [
            params.mantaHyGenBin,
            "--align-stats", statsFile,
            "--graph-file", graphFile,
            "--bin-index", str(binIndex),
            "--bin-count", str(params.nonlocalWorkBins),
            "--min-candidate-sv-size", params.minCandidateVariantSize,
            "--min-candidate-spanning-count", params.minCandidateSpanningCount,
            "--min-scored-sv-size", params.minScoredVariantSize,
            "--ref", params.referenceFasta,
            "--candidate-output-file", candidateVcfs[-1],
        ]

        # tumor-only mode
        if isTumorOnly :
            binCmd += ["--tumor-output-file", tumorVcfs[-1]]
        else :
            binCmd += ["--diploid-output-file", diploidVcfs[-1]]
            binCmd += ["--min-qual-score", params.minDiploidVariantScore]
            binCmd += ["--min-pass-qual-score", params.minPassDiploidVariantScore]
            binCmd += ["--min-pass-gt-score", params.minPassDiploidGTScore]

            # tumor/normal mode
            if isSomatic :
                binCmd += ["--somatic-output-file", somaticVcfs[-1]]
                binCmd += ["--min-somatic-score", params.minSomaticScore]
                binCmd += ["--min-pass-somatic-score", params.minPassSomaticScore]

                # temporary fix for FFPE:
                binCmd.append("--skip-remote-reads")

        if params.isHighDepthFilter :
            binCmd += ["--chrom-depth", paths.getChromDepth()]

        runtimeLogs.append(paths.getHyGenEdgeRuntimeLogPath(binLabel))
        binCmd += ["--edge-runtime-log", runtimeLogs[-1]]

        statsLogs.append(paths.getHyGenEdgeStatsPath(binLabel))
        binCmd += ["--edge-stats-log", statsLogs[-1]]

        for normalBam in params.normalBamList :
            binCmd += ["--align-file", normalBam]
        for tumorBam in params.tumorBamList :
            binCmd += ["--tumor-align-file", tumorBam]

        if params.isIgnoreAnomProperPair :
            binCmd.append("--ignore-anom-proper-pair")
        if params.isRNA :
            binCmd.append("--rna")
        if params.isUnstrandedRNA :
            binCmd.append("--unstranded")

        binTaskLabel = preJoin(taskPrefix, "generateCandidateSV_" + binLabel)
        binTasks.add(self.addTask(binTaskLabel, binCmd,
                                  dependencies=mkdirTask,
                                  memMb=taskMemMb))

    # independent copy: later tasks are added to the return set only
    nextStepWait = copy.deepcopy(binTasks)

    def getVcfSortCmd(vcfPaths, outPath, isDiploid) :
        """
        Build the shell pipeline which sorts vcfPaths into a bgzipped outPath

        For diploid output the sorted records first go to a temporary file,
        which is run through the ploidy filter and removed afterwards.
        """
        pieces = ['"%s" -E "%s" -u ' % (sys.executable, params.mantaSortVcf)]
        pieces.append(" ".join(quoteStringList(vcfPaths)))

        # apply the ploidy filter to diploid variants
        if isDiploid :
            tempVcf = paths.getTempDiploidPath()
            pieces.append(' > "%s"' % (tempVcf))
            pieces.append(' && "%s" -E "%s" "%s"' % (sys.executable, params.mantaPloidyFilter, tempVcf))

        pieces.append(' | "%s" -c > "%s"' % (params.bgzipBin, outPath))

        if isDiploid :
            pieces.append(" && " + " ".join(getRmCmd()) + ' "%s"' % (paths.getTempDiploidPath()))
        return "".join(pieces)

    def getVcfTabixCmd(vcfPath) :
        """
        Argument list which (re)builds the tabix index of vcfPath
        """
        return [params.tabixBin, "-f", "-p", "vcf", vcfPath]

    def sortVcfs(pathList, outPath, label, isDiploid=False) :
        """
        Add header fix, sort and tabix tasks for pathList

        @return the sort task, or an empty set when pathList is empty
        """
        if not len(pathList) :
            return set()

        # make header modifications to first vcf in list of files to be sorted:
        reheaderTask = preJoin(taskPrefix, "fixVcfHeader_" + label)

        def getHeaderFixCmd(fileName) :
            """
            Pipe fileName through the cmdline swapper into a temp file, then move it back
            """
            tmpName = fileName + ".reheader.tmp"
            pieces = [
                '"%s" -E "%s"' % (sys.executable, params.vcfCmdlineSwapper),
                ' "' + " ".join(params.configCommandLine) + '"',
                ' < "%s" > "%s"' % (fileName, tmpName),
                " && " + " ".join(getMvCmd()) + ' "%s" "%s"' % (tmpName, fileName),
            ]
            return "".join(pieces)

        self.addTask(reheaderTask,
                     getHeaderFixCmd(pathList[0]),
                     dependencies=binTasks,
                     isForceLocal=True)

        vcfSortCmd = getVcfSortCmd(pathList, outPath, isDiploid)
        vcfSortTask = self.addTask(preJoin(taskPrefix, "sort_" + label),
                                   vcfSortCmd,
                                   dependencies=reheaderTask)

        tabixTask = self.addTask(preJoin(taskPrefix, "tabix_" + label),
                                 getVcfTabixCmd(outPath),
                                 dependencies=vcfSortTask,
                                 isForceLocal=True)
        nextStepWait.add(tabixTask)
        return vcfSortTask

    candSortTask = sortVcfs(candidateVcfs, paths.getSortedCandidatePath(), "sortCandidateSV")
    sortVcfs(diploidVcfs, paths.getSortedDiploidPath(), "sortDiploidSV", isDiploid=True)
    sortVcfs(somaticVcfs, paths.getSortedSomaticPath(), "sortSomaticSV")
    sortVcfs(tumorVcfs, paths.getSortedTumorPath(), "sortTumorSV")

    def getExtractSmallCmd(maxSize, inPath, outPath) :
        """
        Build the shell pipeline: decompress -> extraction script (--maxSize) -> bgzip
        """
        pieces = [
            '"%s" -dc "%s"' % (params.bgzipBin, inPath),
            ' | "%s" -E "%s" --maxSize %i' % (sys.executable, params.mantaExtraSmallVcf, maxSize),
            ' | "%s" -c > "%s"' % (params.bgzipBin, outPath),
        ]
        return "".join(pieces)

    def extractSmall(inPath, outPath) :
        """
        Add extraction and tabix tasks for variants below the scored size
        """
        # size cap is one below the minimum scored size; nothing to do below 1
        maxSize = int(params.minScoredVariantSize) - 1
        if maxSize < 1 :
            return
        extractCmd = getExtractSmallCmd(maxSize, inPath, outPath)
        extractTask = self.addTask(preJoin(taskPrefix, "extractSmallIndels"),
                                   extractCmd,
                                   dependencies=candSortTask,
                                   isForceLocal=True)
        nextStepWait.add(self.addTask(extractTask + "_tabix",
                                      getVcfTabixCmd(outPath),
                                      dependencies=extractTask,
                                      isForceLocal=True))

    extractSmall(paths.getSortedCandidatePath(), paths.getSortedCandidateSmallIndelsPath())

    # sort edge logs:
    def getEdgeLogSortCmd(logPaths, outPath) :
        """
        Argument list for the edge log sort script, with inputs given as arguments
        """
        return [sys.executable, "-E", params.mantaSortEdgeLogs, "-o", outPath] + list(logPaths)

    runtimeLogSortLabel = preJoin(taskPrefix, "sortEdgeRuntimeLogs")
    runtimeLogSortCmd = getEdgeLogSortCmd(runtimeLogs, paths.getSortedEdgeRuntimeLogPath())
    self.addTask(runtimeLogSortLabel, runtimeLogSortCmd, dependencies=binTasks, isForceLocal=True)

    # merge edge stats:
    statsMergeLabel = preJoin(taskPrefix, "mergeEdgeStats")
    statsMergeCmd = [params.mantaStatsMergeBin]
    for statsLog in statsLogs :
        statsMergeCmd += ["--stats-file", statsLog]
    statsMergeCmd += ["--output-file", paths.getFinalEdgeStatsPath()]
    statsMergeCmd += ["--report-file", paths.getFinalEdgeStatsReportPath()]
    self.addTask(statsMergeLabel, statsMergeCmd, dependencies=binTasks, isForceLocal=True)

    return nextStepWait