def callGenome(self, taskPrefix="", dependencies=None):
    """
    Schedule the variant caller over every genome segment, followed by the
    merge steps and the optional removal of the temporary segment directory.

    taskPrefix is combined with each task label through preJoin.
    dependencies is forwarded to the temporary directory creation task,
    which every per-segment task in turn depends on.

    Raises Exception if no per-segment tasks were produced.
    Returns the set of finishing (concatenation/merge) tasks.
    """
    segmentTmpDir = self.paths.getTmpSegmentDir()
    mkdirTask = self.addTask(preJoin(taskPrefix, "makeTmpDir"),
                             getMkdirCmd() + [segmentTmpDir],
                             dependencies=dependencies,
                             isForceLocal=True)

    # collect the tasks (and temporary output files) of every genome segment
    perSegmentTasks = set()
    tmpFiles = TempSegmentFiles()
    for segment in getNextGenomeSegment(self.params):
        perSegmentTasks |= callGenomeSegment(self, segment, tmpFiles,
                                             dependencies=mkdirTask)

    if not perSegmentTasks:
        raise Exception("No genome regions to analyze. Possible target region parse error.")

    # single checkpoint which depends on all of the segment tasks
    allSegmentsDone = self.addTask(preJoin(taskPrefix, "completedAllGenomeSegments"),
                                   dependencies=perSegmentTasks)

    finishTasks = set()

    denovoTask = self.concatIndexVcf(taskPrefix, allSegmentsDone, tmpFiles.denovo,
                                     self.paths.getDenovoOutputPath(), "denovo")
    finishTasks.add(denovoTask)

    # merge the per-segment run statistics
    statsTask = self.mergeRunStats(taskPrefix, allSegmentsDone, tmpFiles.stats)
    finishTasks.add(statsTask)

    if self.params.isOutputCallableRegions:
        callableTask = self.concatIndexBed(taskPrefix, allSegmentsDone, tmpFiles.callable,
                                           self.paths.getRegionOutputPath(), "callableRegions")
        finishTasks.add(callableTask)

    # the temporary directory is removed only after every finishing task
    if not self.params.isRetainTempFiles:
        self.addTask(preJoin(taskPrefix, "rmTmpDir"),
                     getRmdirCmd() + [segmentTmpDir],
                     dependencies=finishTasks,
                     isForceLocal=True)

    return finishTasks
def getStrelkaGenomeSegmentGroupIterator(self, contigsExcludedFromGrouping=None):
    """
    Build the genome segment group iterator used for germline and somatic
    calling.

    Segments produced by getNextGenomeSegment are first passed through
    self.filterUncalledChromosomeSegments, then handed to
    getGenomeSegmentGroups (together with contigsExcludedFromGrouping) so
    that small segments are clumped together into groups.
    """
    allSegments = getNextGenomeSegment(self.params)
    calledSegments = self.filterUncalledChromosomeSegments(allSegments)
    return getGenomeSegmentGroups(calledSegments, contigsExcludedFromGrouping)
def callGenome(self, taskPrefix="", dependencies=None):
    """
    Schedule the counter over every genome segment, followed by the merge
    steps and the optional removal of the temporary segment directory.

    taskPrefix is combined with each task label through preJoin.
    dependencies is forwarded to the temporary directory creation task,
    which every per-segment task in turn depends on.

    Raises Exception if no per-segment tasks were produced.
    Returns the set of finishing (merge/concatenation) tasks.
    """
    segmentTmpDir = self.paths.getTmpSegmentDir()
    mkdirTask = self.addTask(preJoin(taskPrefix, "makeTmpDir"),
                             getMkdirCmd() + [segmentTmpDir],
                             dependencies=dependencies,
                             isForceLocal=True)

    # collect the tasks (and temporary output files) of every genome segment
    perSegmentTasks = set()
    tmpFiles = TempSegmentFiles()
    for segment in getNextGenomeSegment(self.params):
        perSegmentTasks |= callGenomeSegment(self, segment, tmpFiles,
                                             dependencies=mkdirTask)

    if not perSegmentTasks:
        raise Exception("No genome regions to analyze. Possible target region parse error.")

    # single checkpoint which depends on all of the segment tasks
    allSegmentsDone = self.addTask(preJoin(taskPrefix, "completedAllGenomeSegments"),
                                   dependencies=perSegmentTasks)

    finishTasks = set()

    # merge the per-segment counts
    countsTask = mergeSequenceErrorCounts(self, taskPrefix, allSegmentsDone, tmpFiles.counts)
    finishTasks.add(countsTask)

    if self.params.isReportObservedIndels:
        indelBedTask = self.concatIndexBed(taskPrefix, allSegmentsDone, tmpFiles.observedIndelBed,
                                           self.paths.getObservedIndelBedPath(), "observedIndels")
        finishTasks.add(indelBedTask)

    # the temporary directory is removed only after every finishing task
    if not self.params.isRetainTempFiles:
        self.addTask(preJoin(taskPrefix, "rmTmpDir"),
                     getRmdirCmd() + [segmentTmpDir],
                     dependencies=finishTasks,
                     isForceLocal=True)

    return finishTasks
def callGenome(self, taskPrefix="", dependencies=None):
    """
    Schedule the variant caller over every genome segment, then finalize the
    gVCF output and clean up the temporary segment directory.

    taskPrefix is combined with each task label through preJoin.
    dependencies is forwarded to the temporary directory creation task,
    which every per-segment task in turn depends on.

    Returns the set of finalization tasks.
    """
    segmentTmpDir = self.paths.getTmpSegmentDir()
    mkdirTask = self.addTask(preJoin(taskPrefix, "makeTmpDir"),
                             "mkdir -p " + segmentTmpDir,
                             dependencies=dependencies,
                             isForceLocal=True)

    perSegmentTasks = set()
    tmpFiles = TempSegmentFiles()
    for segment in getNextGenomeSegment(self.params):
        perSegmentTasks |= callGenomeSegment(self, segment, tmpFiles,
                                             dependencies=mkdirTask)

    # single checkpoint which depends on all of the segment tasks
    allSegmentsDone = self.addTask(preJoin(taskPrefix, "completedAllGenomeSegments"),
                                   dependencies=perSegmentTasks)

    finishTasks = set()

    def buildFinalizeCmd(segmentVcfs, outputPath):
        # Shell command which produces outputPath from the per-segment files
        # (concatenate when there are several, plain move when there is
        # exactly one) and then indexes it with tabix.
        assert len(segmentVcfs) > 0
        if len(segmentVcfs) == 1:
            cmd = "mv -f %s %s" % (segmentVcfs[0], outputPath)
        else:
            cmd = " ".join([self.params.bgcatBin, "-o", outputPath] + list(segmentVcfs))
        return cmd + " && %s -p vcf %s" % (self.params.tabixBin, outputPath)

    gvcfCmd = buildFinalizeCmd(tmpFiles.gvcf, self.paths.getGvcfOutputPath())
    finishTasks.add(self.addTask(preJoin(taskPrefix, "gVCF" + "_finalizeVCF"),
                                 gvcfCmd,
                                 dependencies=allSegmentsDone))

    # the temporary directory is removed only after every finalization task
    self.addTask(preJoin(taskPrefix, "cleanTmpDir"),
                 "rm -rf " + segmentTmpDir,
                 dependencies=finishTasks,
                 isForceLocal=True)

    return finishTasks
def getGenomeSegmentGroups(params):
    """
    Iterate segment groups and 'clump' small contigs together

    Generator over lists of genome segments. Segments are read in order from
    getNextGenomeSegment(params) and accumulated into the current group for
    as long as the summed segment size stays at or below
    minSegmentGroupSize. A segment which would push the group over that
    limit closes the current group and starts a new one, so a single
    segment larger than the limit is yielded as a group of its own.

    params is passed straight to getNextGenomeSegment.
    """
    minSegmentGroupSize = 200000

    group = []
    groupSize = 0
    # this is a free function: there is no 'self' here, so the segments
    # must come from the 'params' argument itself
    for gseg in getNextGenomeSegment(params):
        segSize = gseg.size()
        if groupSize + segSize <= minSegmentGroupSize:
            group.append(gseg)
            groupSize += segSize
        else:
            # the very first segment may already exceed the limit, in
            # which case there is no pending group to emit yet
            if group:
                yield group
            group = [gseg]
            groupSize = segSize

    # emit whatever is left over after the final segment
    if group:
        yield group
def getGenomeSegmentGroups(params):
    """
    Iterate segment groups and 'clump' small contigs together

    Yields lists of consecutive genome segments taken from
    getNextGenomeSegment(params). A group keeps growing while its total
    size (sum of gseg.size()) does not exceed minSegmentGroupSize; the
    first segment that would exceed it starts the next group. A segment
    that is larger than the limit by itself therefore forms a
    single-element group.

    params is passed straight to getNextGenomeSegment.
    """
    minSegmentGroupSize = 200000

    pending = []
    pendingSize = 0
    # 'self' does not exist in this free function; iterate the segments
    # described by the 'params' argument
    for gseg in getNextGenomeSegment(params):
        gsegSize = gseg.size()
        if pendingSize + gsegSize > minSegmentGroupSize:
            # close the current group (if any) and restart from this segment
            if pending:
                yield pending
            pending = [gseg]
            pendingSize = gsegSize
        else:
            pending.append(gseg)
            pendingSize += gsegSize

    # flush the final, possibly undersized, group
    if pending:
        yield pending
def runLocusGraph(self, taskPrefix="", dependencies=None):
    """
    Create the full SV locus graph

    One graph-building task is scheduled per genome segment group, each
    writing to its own temporary graph file. The temporary files are listed
    in a file-list, merged into the final graph, and the merged graph is
    then checked and summarized.

    taskPrefix is combined with each task label through preJoin.
    dependencies is forwarded to the temporary directory creation task.

    Raises Exception if no temporary graph files were scheduled.
    Returns a set containing only the graph check task.
    """
    statsPath = self.paths.getStatsPath()
    graphPath = self.paths.getGraphPath()
    graphStatsPath = self.paths.getGraphStatsPath()
    tmpGraphDir = self.paths.getTmpGraphDir()

    mkdirTask = self.addTask(preJoin(taskPrefix, "makeGraphTmpDir"),
                             getMkdirCmd() + [tmpGraphDir],
                             dependencies=dependencies,
                             isForceLocal=True)

    tmpGraphFiles = []
    graphTasks = set()

    for segmentGroup in getGenomeSegmentGroups(getNextGenomeSegment(self.params)):
        assert len(segmentGroup) != 0

        # group id: first segment id, extended with the last one for multi-segment groups
        groupId = segmentGroup[0].id
        if len(segmentGroup) > 1:
            groupId += "_to_" + segmentGroup[-1].id

        tmpGraphFile = self.paths.getTmpGraphFile(groupId)
        tmpGraphFiles.append(tmpGraphFile)

        graphCmd = [self.params.mantaGraphBin,
                    "--output-file", tmpGraphFile,
                    "--align-stats", statsPath]
        for segment in segmentGroup:
            graphCmd += ["--region", segment.bamRegion]
        graphCmd += ["--min-candidate-sv-size", self.params.minCandidateVariantSize]
        graphCmd += ["--min-edge-observations", self.params.minEdgeObservations]
        graphCmd += ["--ref", self.params.referenceFasta]
        for normalBam in self.params.normalBamList:
            graphCmd += ["--align-file", normalBam]
        for tumorBam in self.params.tumorBamList:
            graphCmd += ["--tumor-align-file", tumorBam]
        if self.params.isHighDepthFilter:
            graphCmd += ["--chrom-depth", self.paths.getChromDepth()]
        if self.params.isIgnoreAnomProperPair:
            graphCmd.append("--ignore-anom-proper-pair")
        if self.params.isRNA:
            graphCmd.append("--rna")

        graphTasks.add(self.addTask(preJoin(taskPrefix, "makeLocusGraph_" + groupId),
                                    graphCmd,
                                    dependencies=mkdirTask,
                                    memMb=self.params.estimateMemMb))

    if not tmpGraphFiles:
        raise Exception("No SV Locus graphs to create. Possible target region parse error.")

    # the merge step reads its inputs from a file-list, written once all graph tasks are done
    graphFileListPath = self.paths.getTmpGraphFileListPath()
    graphFileListTask = preJoin(taskPrefix, "mergeLocusGraphInputList")
    self.addWorkflowTask(graphFileListTask,
                         listFileWorkflow(graphFileListPath, tmpGraphFiles),
                         dependencies=graphTasks)

    mergeCmd = [self.params.mantaGraphMergeBin,
                "--output-file", graphPath,
                "--graph-file-list", graphFileListPath]
    mergeTask = self.addTask(preJoin(taskPrefix, "mergeLocusGraph"),
                             mergeCmd,
                             dependencies=graphFileListTask,
                             memMb=self.params.mergeMemMb)

    # Run a separate process to rigorously check that the final graph is valid, the sv candidate
    # generators will check as well, but this makes the check much more clear:
    checkCmd = [self.params.mantaGraphCheckBin, "--graph-file", graphPath]
    checkTask = self.addTask(preJoin(taskPrefix, "checkLocusGraph"),
                             checkCmd,
                             dependencies=mergeTask,
                             memMb=self.params.mergeMemMb)

    if not self.params.isRetainTempFiles:
        self.addTask(preJoin(taskPrefix, "removeTmpDir"),
                     getRmdirCmd() + [tmpGraphDir],
                     dependencies=mergeTask)

    graphStatsCmd = [self.params.mantaGraphStatsBin, "--global",
                     "--graph-file", graphPath,
                     "--output-file", graphStatsPath]
    self.addTask(preJoin(taskPrefix, "locusGraphStats"),
                 graphStatsCmd,
                 dependencies=mergeTask,
                 memMb=self.params.mergeMemMb)

    # only the check task is handed back for later steps to wait on
    return set([checkTask])
def runLocusGraph(self, taskPrefix="", dependencies=None):
    """
    Create the full SV locus graph

    One graph-building task is scheduled per genome segment, each writing
    to its own file in a temporary directory under the work directory. The
    per-segment graphs are merged into the final graph, which is then
    checked and summarized; the temporary directory is removed after the
    merge.

    taskPrefix is combined with each task label through preJoin.
    dependencies is forwarded to the temporary directory creation task.

    Raises Exception if no temporary graph files were scheduled.
    Returns a set containing only the graph check task.
    """
    statsPath = self.paths.getStatsPath()
    graphPath = self.paths.getGraphPath()
    graphStatsPath = self.paths.getGraphStatsPath()

    # the temporary directory is named after the final graph file
    graphFilename = os.path.basename(graphPath)
    tmpGraphDir = os.path.join(self.params.workDir, graphFilename + ".tmpdir")

    mkdirTask = self.addTask(preJoin(taskPrefix, "makeTmpDir"),
                             "mkdir -p " + tmpGraphDir,
                             dependencies=dependencies,
                             isForceLocal=True)

    tmpGraphFiles = []
    graphTasks = set()

    for segment in getNextGenomeSegment(self.params):
        segmentGraphFile = os.path.join(tmpGraphDir, graphFilename + "." + segment.id + ".bin")
        tmpGraphFiles.append(segmentGraphFile)

        graphCmd = [self.params.mantaGraphBin,
                    "--output-file", segmentGraphFile,
                    "--align-stats", statsPath,
                    "--region", segment.bamRegion,
                    "--min-candidate-sv-size", self.params.minCandidateVariantSize,
                    "--min-edge-observations", self.params.minEdgeObservations,
                    "--ref", self.params.referenceFasta]
        for normalBam in self.params.normalBamList:
            graphCmd += ["--align-file", normalBam]
        for tumorBam in self.params.tumorBamList:
            graphCmd += ["--tumor-align-file", tumorBam]
        if self.params.isHighDepthFilter:
            graphCmd += ["--chrom-depth", self.paths.getChromDepth()]
        if self.params.isIgnoreAnomProperPair:
            graphCmd.append("--ignore-anom-proper-pair")
        if self.params.isRNA:
            graphCmd.append("--rna")

        segmentTaskLabel = preJoin(taskPrefix, "makeLocusGraph_" + segment.pyflowId)
        graphTasks.add(self.addTask(segmentTaskLabel,
                                    graphCmd,
                                    dependencies=mkdirTask,
                                    memMb=self.params.estimateMemMb))

    if not tmpGraphFiles:
        raise Exception("No SV Locus graphs to create. Possible target region parse error.")

    # every per-segment graph file is passed to the merge on its command line
    mergeCmd = [self.params.mantaGraphMergeBin, "--output-file", graphPath]
    for segmentGraphFile in tmpGraphFiles:
        mergeCmd += ["--graph-file", segmentGraphFile]
    mergeTask = self.addTask(preJoin(taskPrefix, "mergeLocusGraph"),
                             mergeCmd,
                             dependencies=graphTasks,
                             memMb=self.params.mergeMemMb)

    # Run a separate process to rigorously check that the final graph is valid, the sv candidate
    # generators will check as well, but this makes the check much more clear:
    checkCmd = [self.params.mantaGraphCheckBin, "--graph-file", graphPath]
    checkTask = self.addTask(preJoin(taskPrefix, "checkLocusGraph"),
                             checkCmd,
                             dependencies=mergeTask,
                             memMb=self.params.mergeMemMb)

    self.addTask(preJoin(taskPrefix, "rmTmpDir"),
                 "rm -rf " + tmpGraphDir,
                 dependencies=mergeTask)

    # the stats command is a shell string: its output is redirected into graphStatsPath
    graphStatsCmd = " ".join([self.params.mantaGraphStatsBin,
                              "--global",
                              "--graph-file", graphPath,
                              ">|", graphStatsPath])
    self.addTask(preJoin(taskPrefix, "locusGraphStats"),
                 graphStatsCmd,
                 dependencies=mergeTask,
                 memMb=self.params.mergeMemMb)

    # only the check task is handed back for later steps to wait on
    return set([checkTask])
def runLocusGraph(self, taskPrefix="", dependencies=None):
    """
    Create the full SV locus graph

    Schedules one graph-building task per genome segment (each writing a
    separate file inside a temporary directory under the work directory),
    a merge of those files into the final graph, a validity check and a
    statistics report on the merged graph, and the removal of the
    temporary directory once the merge is complete.

    taskPrefix is combined with each task label through preJoin.
    dependencies is forwarded to the temporary directory creation task.

    Raises Exception if no temporary graph files were scheduled.
    Returns a set containing only the graph check task.
    """
    paths = self.paths
    statsPath = paths.getStatsPath()
    graphPath = paths.getGraphPath()
    graphStatsPath = paths.getGraphStatsPath()

    # temporary directory name is derived from the final graph file name
    graphFilename = os.path.basename(graphPath)
    tmpGraphDir = os.path.join(self.params.workDir, graphFilename + ".tmpdir")

    makeDirTask = self.addTask(preJoin(taskPrefix, "makeTmpDir"),
                               "mkdir -p " + tmpGraphDir,
                               dependencies=dependencies,
                               isForceLocal=True)

    graphTasks = set()
    tmpGraphFiles = []

    for gseg in getNextGenomeSegment(self.params):
        gsegGraphFile = os.path.join(tmpGraphDir, graphFilename + "." + gseg.id + ".bin")
        tmpGraphFiles.append(gsegGraphFile)

        cmd = [self.params.mantaGraphBin]
        cmd += ["--output-file", gsegGraphFile]
        cmd += ["--align-stats", statsPath]
        cmd += ["--region", gseg.bamRegion]
        cmd += ["--min-candidate-sv-size", self.params.minCandidateVariantSize]
        cmd += ["--min-edge-observations", self.params.minEdgeObservations]
        cmd += ["--ref", self.params.referenceFasta]
        for bam in self.params.normalBamList:
            cmd += ["--align-file", bam]
        for bam in self.params.tumorBamList:
            cmd += ["--tumor-align-file", bam]
        if self.params.isHighDepthFilter:
            cmd += ["--chrom-depth", paths.getChromDepth()]
        if self.params.isIgnoreAnomProperPair:
            cmd += ["--ignore-anom-proper-pair"]
        if self.params.isRNA:
            cmd += ["--rna"]

        label = preJoin(taskPrefix, "makeLocusGraph_" + gseg.pyflowId)
        graphTasks.add(self.addTask(label, cmd,
                                    dependencies=makeDirTask,
                                    memMb=self.params.estimateMemMb))

    if not tmpGraphFiles:
        raise Exception("No SV Locus graphs to create. Possible target region parse error.")

    # merge: one --graph-file argument per temporary graph
    mergeCmd = [self.params.mantaGraphMergeBin, "--output-file", graphPath]
    for tmpGraphFile in tmpGraphFiles:
        mergeCmd += ["--graph-file", tmpGraphFile]
    mergeTask = self.addTask(preJoin(taskPrefix, "mergeLocusGraph"),
                             mergeCmd,
                             dependencies=graphTasks,
                             memMb=self.params.mergeMemMb)

    # Run a separate process to rigorously check that the final graph is valid, the sv candidate
    # generators will check as well, but this makes the check much more clear:
    checkTask = self.addTask(preJoin(taskPrefix, "checkLocusGraph"),
                             [self.params.mantaGraphCheckBin, "--graph-file", graphPath],
                             dependencies=mergeTask,
                             memMb=self.params.mergeMemMb)

    self.addTask(preJoin(taskPrefix, "rmGraphTmp"),
                 "rm -rf " + tmpGraphDir,
                 dependencies=mergeTask)

    # shell-string command whose output is redirected into graphStatsPath
    statsCmdWords = [self.params.mantaGraphStatsBin,
                     "--global",
                     "--graph-file", graphPath,
                     ">|", graphStatsPath]
    self.addTask(preJoin(taskPrefix, "locusGraphStats"),
                 " ".join(statsCmdWords),
                 dependencies=mergeTask,
                 memMb=self.params.mergeMemMb)

    # only the check task is handed back for later steps to wait on
    return set([checkTask])
def runLocusGraph(self, taskPrefix="", dependencies=None):
    """
    Create the full SV locus graph

    Schedules one graph-building task per genome segment group (each
    writing its own temporary graph file), a workflow task which writes the
    list of those files, a merge of the listed files into the final graph,
    and a validity check plus a statistics report on the merged graph. The
    temporary graph directory is removed after the merge unless
    self.params.isRetainTempFiles is set.

    taskPrefix is combined with each task label through preJoin.
    dependencies is forwarded to the temporary directory creation task.

    Raises Exception if no temporary graph files were scheduled.
    Returns a set containing only the graph check task.
    """
    paths = self.paths
    statsPath = paths.getStatsPath()
    graphPath = paths.getGraphPath()
    graphStatsPath = paths.getGraphStatsPath()
    tmpGraphDir = paths.getTmpGraphDir()

    makeDirTask = self.addTask(preJoin(taskPrefix, "makeGraphTmpDir"),
                               getMkdirCmd() + [tmpGraphDir],
                               dependencies=dependencies,
                               isForceLocal=True)

    graphTasks = set()
    tmpGraphFiles = []

    for gsegGroup in getGenomeSegmentGroups(getNextGenomeSegment(self.params)):
        assert len(gsegGroup) != 0

        # label the group by its first segment, plus the last one when it spans several
        firstId = gsegGroup[0].id
        if len(gsegGroup) > 1:
            gid = firstId + "_to_" + gsegGroup[-1].id
        else:
            gid = firstId

        groupGraphFile = paths.getTmpGraphFile(gid)
        tmpGraphFiles.append(groupGraphFile)

        cmd = [self.params.mantaGraphBin]
        cmd += ["--output-file", groupGraphFile]
        cmd += ["--align-stats", statsPath]
        for gseg in gsegGroup:
            cmd += ["--region", gseg.bamRegion]
        cmd += ["--min-candidate-sv-size", self.params.minCandidateVariantSize]
        cmd += ["--min-edge-observations", self.params.minEdgeObservations]
        cmd += ["--ref", self.params.referenceFasta]
        for bam in self.params.normalBamList:
            cmd += ["--align-file", bam]
        for bam in self.params.tumorBamList:
            cmd += ["--tumor-align-file", bam]
        if self.params.isHighDepthFilter:
            cmd += ["--chrom-depth", paths.getChromDepth()]
        if self.params.isIgnoreAnomProperPair:
            cmd += ["--ignore-anom-proper-pair"]
        if self.params.isRNA:
            cmd += ["--rna"]

        label = preJoin(taskPrefix, "makeLocusGraph_" + gid)
        graphTasks.add(self.addTask(label, cmd,
                                    dependencies=makeDirTask,
                                    memMb=self.params.estimateMemMb))

    if not tmpGraphFiles:
        raise Exception("No SV Locus graphs to create. Possible target region parse error.")

    # write the list of temporary graph files which the merge step consumes
    listPath = paths.getTmpGraphFileListPath()
    listTask = preJoin(taskPrefix, "mergeLocusGraphInputList")
    self.addWorkflowTask(listTask,
                         listFileWorkflow(listPath, tmpGraphFiles),
                         dependencies=graphTasks)

    mergeTask = self.addTask(preJoin(taskPrefix, "mergeLocusGraph"),
                             [self.params.mantaGraphMergeBin,
                              "--output-file", graphPath,
                              "--graph-file-list", listPath],
                             dependencies=listTask,
                             memMb=self.params.mergeMemMb)

    # Run a separate process to rigorously check that the final graph is valid, the sv candidate
    # generators will check as well, but this makes the check much more clear:
    checkTask = self.addTask(preJoin(taskPrefix, "checkLocusGraph"),
                             [self.params.mantaGraphCheckBin, "--graph-file", graphPath],
                             dependencies=mergeTask,
                             memMb=self.params.mergeMemMb)

    if not self.params.isRetainTempFiles:
        self.addTask(preJoin(taskPrefix, "removeTmpDir"),
                     getRmdirCmd() + [tmpGraphDir],
                     dependencies=mergeTask)

    self.addTask(preJoin(taskPrefix, "locusGraphStats"),
                 [self.params.mantaGraphStatsBin, "--global",
                  "--graph-file", graphPath,
                  "--output-file", graphStatsPath],
                 dependencies=mergeTask,
                 memMb=self.params.mergeMemMb)

    # only the check task is handed back for later steps to wait on
    return set([checkTask])