Example #1
0
def runStats(self, taskPrefix="", dependencies=None):
    """
    Schedule alignment-stats generation for every input BAM, then merge and
    summarize the results.

    One stats task is added per entry of normalBamList + tumorBamList; each
    writes an xml file into a scratch directory placed beside the final stats
    file. A merge task combines those files into self.paths.getStatsPath().
    A summary task and (unless isRetainTempFiles is set) a scratch-directory
    removal task are both scheduled after the merge.

    Arguments:
      taskPrefix   -- prefix combined with each task label via preJoin
      dependencies -- upstream task(s) the scratch-directory creation waits on

    Returns:
      a set holding only the merge task; the summary and cleanup tasks are
      intentionally not part of the returned wait set.
    """

    statsPath = self.paths.getStatsPath()
    statsFilename = os.path.basename(statsPath)

    # per-BAM stats files are staged here before being merged
    tmpStatsDir = statsPath + ".tmpdir"

    makeTmpStatsDirCmd = getMkdirCmd() + [tmpStatsDir]
    dirTask = self.addTask(preJoin(taskPrefix, "makeTmpDir"),
                           makeTmpStatsDirCmd,
                           dependencies=dependencies,
                           isForceLocal=True)

    tmpStatsFiles = []
    statsTasks = set()

    # normal BAMs are indexed first, tumor BAMs continue the same numbering
    for (bamIndex, bamPath) in enumerate(self.params.normalBamList +
                                         self.params.tumorBamList):
        indexStr = str(bamIndex).zfill(3)
        tmpStatsFiles.append(
            os.path.join(tmpStatsDir, statsFilename + "." + indexStr + ".xml"))

        cmd = [self.params.mantaStatsBin]
        cmd.extend(["--output-file", tmpStatsFiles[-1]])
        cmd.extend(["--align-file", bamPath])

        statsTasks.add(
            self.addTask(preJoin(taskPrefix, "generateStats_" + indexStr),
                         cmd,
                         dependencies=dirTask))

    cmd = [self.params.mantaMergeStatsBin]
    cmd.extend(["--output-file", statsPath])
    for tmpStatsFile in tmpStatsFiles:
        cmd.extend(["--align-stats-file", tmpStatsFile])

    mergeTask = self.addTask(preJoin(taskPrefix, "mergeStats"),
                             cmd,
                             dependencies=statsTasks,
                             isForceLocal=True)

    nextStepWait = set()
    nextStepWait.add(mergeTask)

    if not self.params.isRetainTempFiles:
        # the scratch directory is only needed until the merge has finished
        rmStatsTmpCmd = getRmdirCmd() + [tmpStatsDir]
        self.addTask(preJoin(taskPrefix, "rmTmpDir"),
                     rmStatsTmpCmd,
                     dependencies=mergeTask,
                     isForceLocal=True)

    # summarize stats in format that's easier for human review
    cmd = [self.params.mantaStatsSummaryBin]
    # The option name must not carry trailing whitespace: it is its own list
    # element, so (unless the task runner strips arguments) a stray space
    # would become part of the option name the binary sees.
    cmd.extend(["--align-stats", statsPath])
    cmd.extend(["--output-file", self.paths.getStatsSummaryPath()])
    self.addTask(preJoin(taskPrefix, "summarizeStats"),
                 cmd,
                 dependencies=mergeTask)

    return nextStepWait
def getSequenceErrorEstimates(self, taskPrefix="", dependencies=None):
    """
    Schedule sequence error counting and error-parameter estimation.

    A scratch directory is created first, then one
    EstimateSequenceErrorWorkflowForSample sub-workflow is added per entry of
    self.params.bamList. Unless isRetainTempFiles is set, the scratch directory
    is removed once every per-sample sub-workflow has finished.

    Arguments:
      taskPrefix   -- prefix combined with each task label via preJoin
      dependencies -- upstream task(s) the scratch-directory creation waits on

    Returns:
      the set of per-sample workflow tasks
    """

    makeDirLabel = preJoin(taskPrefix, "makeTmpDir")
    scratchDir = self.paths.getTmpErrorEstimationDir()
    self.addTask(makeDirLabel,
                 getMkdirCmd() + [scratchDir],
                 dependencies=dependencies,
                 isForceLocal=True)

    intervals = getErrorEstimationIntervals(self.params)
    assert len(intervals) > 0

    # Counting and estimation are independent per sample, so each sample gets
    # its own sub-workflow gated only on the scratch directory.
    perSampleTasks = set()
    sampleCount = len(self.params.bamList)
    for index in range(sampleCount):
        label = preJoin(taskPrefix, "Sample" + str(index).zfill(3))
        sampleWorkflow = EstimateSequenceErrorWorkflowForSample(
            self.params, self.paths, intervals, index)
        task = self.addWorkflowTask(label,
                                    sampleWorkflow,
                                    dependencies=makeDirLabel)
        perSampleTasks.add(task)

    if not self.params.isRetainTempFiles:
        cleanupCmd = getRmdirCmd() + [scratchDir]
        self.addTask(preJoin(taskPrefix, "removeTmpDir"),
                     cleanupCmd,
                     dependencies=perSampleTasks,
                     isForceLocal=True)

    return perSampleTasks
def callGenome(self, taskPrefix="", dependencies=None):
    """
    Schedule the variant caller over all genome segments and finalize outputs.

    Steps added to the workflow:
      1. create the scratch segment directory
      2. one set of tasks per genome segment (via callGenomeSegment)
      3. a checkpoint task that depends on every segment task
      4. finishing tasks off the checkpoint: denovo VCF concatenation, run
         stats merge and, if isOutputCallableRegions, callable-region BED
         concatenation
      5. scratch directory removal after the finishing tasks, unless
         isRetainTempFiles is set

    Arguments:
      taskPrefix   -- prefix combined with each task label via preJoin
      dependencies -- upstream task(s) the scratch-directory creation waits on

    Returns:
      the set of finishing tasks

    Raises:
      Exception if no segment tasks were produced
    """

    segmentDir = self.paths.getTmpSegmentDir()
    mkdirLabel = preJoin(taskPrefix, "makeTmpDir")
    mkdirCmd = getMkdirCmd() + [segmentDir]
    mkdirTask = self.addTask(mkdirLabel,
                             mkdirCmd,
                             dependencies=dependencies,
                             isForceLocal=True)

    perSegmentFiles = TempSegmentFiles()
    perSegmentTasks = set()
    for segment in getNextGenomeSegment(self.params):
        perSegmentTasks |= callGenomeSegment(self,
                                             segment,
                                             perSegmentFiles,
                                             dependencies=mkdirTask)

    if not perSegmentTasks:
        raise Exception(
            "No genome regions to analyze. Possible target region parse error."
        )

    # single checkpoint so each finishing task has one dependency to wait on
    checkpointLabel = preJoin(taskPrefix, "completedAllGenomeSegments")
    checkpoint = self.addTask(checkpointLabel, dependencies=perSegmentTasks)

    finalTasks = set()

    denovoTask = self.concatIndexVcf(taskPrefix, checkpoint,
                                     perSegmentFiles.denovo,
                                     self.paths.getDenovoOutputPath(),
                                     "denovo")
    finalTasks.add(denovoTask)

    # combine the per-segment run stats
    statsTask = self.mergeRunStats(taskPrefix, checkpoint,
                                   perSegmentFiles.stats)
    finalTasks.add(statsTask)

    if self.params.isOutputCallableRegions:
        regionTask = self.concatIndexBed(taskPrefix, checkpoint,
                                         perSegmentFiles.callable,
                                         self.paths.getRegionOutputPath(),
                                         "callableRegions")
        finalTasks.add(regionTask)

    if not self.params.isRetainTempFiles:
        cleanupCmd = getRmdirCmd() + [segmentDir]
        self.addTask(preJoin(taskPrefix, "rmTmpDir"),
                     cleanupCmd,
                     dependencies=finalTasks,
                     isForceLocal=True)

    return finalTasks
def callGenome(self, taskPrefix="", dependencies=None):
    """
    Schedule the sequence error counter over all genome segments and finalize
    its outputs.

    Steps added to the workflow:
      1. create the scratch segment directory
      2. one set of tasks per genome segment (via callGenomeSegment)
      3. a checkpoint task that depends on every segment task
      4. finishing tasks off the checkpoint: merge of the per-segment error
         counts and, if isReportObservedIndels, observed-indel BED
         concatenation
      5. scratch directory removal after the finishing tasks, unless
         isRetainTempFiles is set

    Arguments:
      taskPrefix   -- prefix combined with each task label via preJoin
      dependencies -- upstream task(s) the scratch-directory creation waits on

    Returns:
      the set of finishing tasks

    Raises:
      Exception if no segment tasks were produced
    """

    segmentDir = self.paths.getTmpSegmentDir()
    mkdirLabel = preJoin(taskPrefix, "makeTmpDir")
    mkdirCmd = getMkdirCmd() + [segmentDir]
    mkdirTask = self.addTask(mkdirLabel,
                             mkdirCmd,
                             dependencies=dependencies,
                             isForceLocal=True)

    perSegmentFiles = TempSegmentFiles()
    perSegmentTasks = set()
    for segment in getNextGenomeSegment(self.params):
        perSegmentTasks |= callGenomeSegment(self,
                                             segment,
                                             perSegmentFiles,
                                             dependencies=mkdirTask)

    if not perSegmentTasks:
        raise Exception(
            "No genome regions to analyze. Possible target region parse error."
        )

    # single checkpoint so each finishing task has one dependency to wait on
    checkpointLabel = preJoin(taskPrefix, "completedAllGenomeSegments")
    checkpoint = self.addTask(checkpointLabel, dependencies=perSegmentTasks)

    finalTasks = set()

    # combine the per-segment error counts
    countsTask = mergeSequenceErrorCounts(self, taskPrefix, checkpoint,
                                          perSegmentFiles.counts)
    finalTasks.add(countsTask)

    if self.params.isReportObservedIndels:
        indelBedTask = self.concatIndexBed(
            taskPrefix, checkpoint,
            perSegmentFiles.observedIndelBed,
            self.paths.getObservedIndelBedPath(),
            "observedIndels")
        finalTasks.add(indelBedTask)

    if not self.params.isRetainTempFiles:
        cleanupCmd = getRmdirCmd() + [segmentDir]
        self.addTask(preJoin(taskPrefix, "rmTmpDir"),
                     cleanupCmd,
                     dependencies=finalTasks,
                     isForceLocal=True)

    return finalTasks
def callGenome(self,taskPrefix="",dependencies=None):
    """
    Schedule strelka over all genome segment groups and finalize its outputs.

    Steps added to the workflow:
      1. create the scratch segment directory
      2. one set of tasks per genome segment group (via callGenomeSegment)
      3. a checkpoint task that depends on every segment task
      4. finishing tasks off the checkpoint: SNV and indel VCF concatenation,
         run stats merge, callable-region BED concatenation (if
         isOutputCallableRegions) and normal/tumor realigned BAM
         concatenation (if isWriteRealignedBam)
      5. scratch directory removal after the finishing tasks, unless
         isRetainTempFiles is set

    Arguments:
      taskPrefix   -- prefix combined with each task label via preJoin
      dependencies -- upstream task(s) the scratch-directory creation waits on

    Returns:
      the set of finishing tasks

    Raises:
      Exception if no segment tasks were produced
    """

    segmentDir = self.paths.getTmpSegmentDir()
    mkdirLabel = preJoin(taskPrefix, "makeTmpDir")
    mkdirTask = self.addTask(mkdirLabel,
                             getMkdirCmd() + [segmentDir],
                             dependencies=dependencies,
                             isForceLocal=True)

    perSegmentFiles = TempVariantCallingSegmentFiles()
    perSegmentTasks = set()
    for segmentGroup in self.getStrelkaGenomeSegmentGroupIterator():
        perSegmentTasks |= callGenomeSegment(self,
                                             segmentGroup,
                                             perSegmentFiles,
                                             dependencies=mkdirTask)

    if not perSegmentTasks:
        raise Exception("No genome regions to analyze. Possible target region parse error.")

    # single checkpoint so each finishing task has one dependency to wait on
    checkpointLabel = preJoin(taskPrefix, "completedAllGenomeSegments")
    checkpoint = self.addTask(checkpointLabel, dependencies=perSegmentTasks)

    finalTasks = set()

    snvTask = self.concatIndexVcf(taskPrefix, checkpoint,
                                  perSegmentFiles.snv,
                                  self.paths.getSnvOutputPath(), "SNV")
    finalTasks.add(snvTask)

    indelTask = self.concatIndexVcf(taskPrefix, checkpoint,
                                    perSegmentFiles.indel,
                                    self.paths.getIndelOutputPath(), "Indel")
    finalTasks.add(indelTask)

    # combine the per-segment run stats
    finalTasks.add(self.mergeRunStats(taskPrefix, checkpoint,
                                      perSegmentFiles.stats))

    if self.params.isOutputCallableRegions:
        regionTask = self.concatIndexBed(taskPrefix, checkpoint,
                                         perSegmentFiles.callable,
                                         self.paths.getRegionOutputPath(),
                                         "callableRegions")
        finalTasks.add(regionTask)

    if self.params.isWriteRealignedBam:
        # one concatenated realigned BAM per sample role
        realignedSegments = (("normal", perSegmentFiles.normalRealign),
                             ("tumor", perSegmentFiles.tumorRealign))
        for sampleLabel, segmentBams in realignedSegments:
            mergedBamPath = self.paths.getRealignedBamPath(sampleLabel)
            catCmd = bamListCatCmd(self.params.samtoolsBin, segmentBams,
                                   mergedBamPath)
            catLabel = preJoin(taskPrefix, "realignedBamCat_" + sampleLabel)
            finalTasks.add(self.addTask(catLabel, catCmd,
                                        dependencies=checkpoint))

    if not self.params.isRetainTempFiles:
        cleanupCmd = getRmdirCmd() + [segmentDir]
        self.addTask(preJoin(taskPrefix, "removeTmpDir"),
                     cleanupCmd,
                     dependencies=finalTasks,
                     isForceLocal=True)

    return finalTasks
Example #6
0
def runStats(self,taskPrefix="",dependencies=None) :
    """
    Schedule alignment-stats generation for every input BAM, then merge and
    summarize the results.

    One stats task is added per entry of normalBamList + tumorBamList; each
    writes an xml file into a scratch directory placed beside the final stats
    file. A merge task combines those files into self.paths.getStatsPath().
    A summary task and (unless isRetainTempFiles is set) a scratch-directory
    removal task are both scheduled after the merge.

    Arguments:
      taskPrefix   -- prefix combined with each task label via preJoin
      dependencies -- upstream task(s) the scratch-directory creation waits on

    Returns:
      a set holding only the merge task; the summary and cleanup tasks are
      intentionally not part of the returned wait set.
    """

    statsPath = self.paths.getStatsPath()
    statsFilename = os.path.basename(statsPath)

    # per-BAM stats files are staged here before being merged
    tmpStatsDir = statsPath + ".tmpdir"

    makeTmpStatsDirCmd = getMkdirCmd() + [tmpStatsDir]
    dirTask = self.addTask(preJoin(taskPrefix, "makeTmpDir"), makeTmpStatsDirCmd,
                           dependencies=dependencies, isForceLocal=True)

    tmpStatsFiles = []
    statsTasks = set()

    # normal BAMs are indexed first, tumor BAMs continue the same numbering
    for (bamIndex, bamPath) in enumerate(self.params.normalBamList + self.params.tumorBamList) :
        indexStr = str(bamIndex).zfill(3)
        tmpStatsFiles.append(os.path.join(tmpStatsDir, statsFilename + "." + indexStr + ".xml"))

        cmd = [self.params.mantaStatsBin]
        cmd.extend(["--output-file", tmpStatsFiles[-1]])
        cmd.extend(["--align-file", bamPath])

        statsTasks.add(self.addTask(preJoin(taskPrefix, "generateStats_" + indexStr), cmd,
                                    dependencies=dirTask))

    cmd = [self.params.mantaMergeStatsBin]
    cmd.extend(["--output-file", statsPath])
    for tmpStatsFile in tmpStatsFiles :
        cmd.extend(["--align-stats-file", tmpStatsFile])

    mergeTask = self.addTask(preJoin(taskPrefix, "mergeStats"), cmd,
                             dependencies=statsTasks, isForceLocal=True)

    nextStepWait = set()
    nextStepWait.add(mergeTask)

    if not self.params.isRetainTempFiles :
        # the scratch directory is only needed until the merge has finished
        rmStatsTmpCmd = getRmdirCmd() + [tmpStatsDir]
        self.addTask(preJoin(taskPrefix, "rmTmpDir"), rmStatsTmpCmd,
                     dependencies=mergeTask, isForceLocal=True)

    # summarize stats in format that's easier for human review
    cmd = [self.params.mantaStatsSummaryBin]
    # The option name must not carry trailing whitespace: it is its own list
    # element, so (unless the task runner strips arguments) a stray space
    # would become part of the option name the binary sees.
    cmd.extend(["--align-stats", statsPath])
    cmd.extend(["--output-file", self.paths.getStatsSummaryPath()])
    self.addTask(preJoin(taskPrefix, "summarizeStats"), cmd, dependencies=mergeTask)

    return nextStepWait
Example #7
0
def callGenome(self, taskPrefix="", dependencies=None):
    """
    Schedule the variant caller over all genome segment groups and finalize
    its outputs.

    Steps added to the workflow:
      1. create the scratch segment directory
      2. one set of tasks per genome segment group (via callGenomeSegment);
         self.params.callContinuousVf is passed to the group iterator as the
         contigs excluded from grouping
      3. a checkpoint task that depends on every segment task
      4. finishing tasks off the checkpoint: variants VCF concatenation, one
         gVCF concatenation per sample, run stats merge and, if
         isWriteRealignedBam, realigned BAM concatenation
      5. for sample 0 only, a task that symlinks the gVCF and its .tbi under
         the legacy filename (not part of the returned set)
      6. scratch directory removal after the finishing tasks, unless
         isRetainTempFiles is set

    Arguments:
      taskPrefix   -- prefix combined with each task label via preJoin
      dependencies -- upstream task(s) the scratch-directory creation waits on

    Returns:
      the set of finishing tasks

    Raises:
      Exception if no segment tasks were produced
    """

    segmentDir = self.paths.getTmpSegmentDir()
    mkdirLabel = preJoin(taskPrefix, "makeTmpDir")
    mkdirTask = self.addTask(mkdirLabel,
                             getMkdirCmd() + [segmentDir],
                             dependencies=dependencies,
                             isForceLocal=True)

    nSamples = len(self.params.bamList)
    perSegmentFiles = TempVariantCallingSegmentFiles(nSamples)

    segmentGroups = self.getStrelkaGenomeSegmentGroupIterator(
        contigsExcludedFromGrouping=self.params.callContinuousVf)
    perSegmentTasks = set()
    for segmentGroup in segmentGroups:
        perSegmentTasks |= callGenomeSegment(self,
                                             segmentGroup,
                                             perSegmentFiles,
                                             dependencies=mkdirTask)

    if not perSegmentTasks:
        raise Exception(
            "No genome regions to analyze. Possible target region parse error."
        )

    # single checkpoint so each finishing task has one dependency to wait on
    checkpointLabel = preJoin(taskPrefix, "completedAllGenomeSegments")
    checkpoint = self.addTask(checkpointLabel, dependencies=perSegmentTasks)

    finalTasks = set()

    # concatenate the per-segment VCF outputs
    variantsTask = self.concatIndexVcf(taskPrefix, checkpoint,
                                       perSegmentFiles.variants,
                                       self.paths.getVariantsOutputPath(),
                                       "variants")
    finalTasks.add(variantsTask)

    for sampleIndex in range(nSamples):
        gvcfTask = self.concatIndexVcf(
            taskPrefix, checkpoint,
            perSegmentFiles.sample[sampleIndex].gvcf,
            self.paths.getGvcfOutputPath(sampleIndex),
            gvcfSampleLabel(sampleIndex))
        finalTasks.add(gvcfTask)

        # only the first sample gets the legacy-named links
        if sampleIndex != 0:
            continue

        gvcfPath = self.paths.getGvcfOutputPath(sampleIndex)
        gvcfDir = os.path.dirname(gvcfPath)
        gvcfName = os.path.basename(gvcfPath)

        # link both the gVCF itself and its .tbi companion; the links are
        # relative because the task runs with cwd set to the output directory
        linkParts = []
        for suffix in ("", ".tbi"):
            linkTarget = quote(gvcfName + suffix)
            linkName = quote(self.paths.getGvcfLegacyFilename() + suffix)
            linkParts.append("ln -s " + linkTarget + " " + linkName)
        linkCmd = " && ".join(linkParts)

        self.addTask(preJoin(taskPrefix, "addLegacyOutputLink"),
                     linkCmd,
                     dependencies=gvcfTask,
                     isForceLocal=True,
                     cwd=gvcfDir)

    # combine the per-segment run stats
    finalTasks.add(
        self.mergeRunStats(taskPrefix, checkpoint, perSegmentFiles.stats))

    if self.params.isWriteRealignedBam:
        realignedSegments = perSegmentFiles.bamRealign
        realignedOutput = self.paths.getRealignedBamPath()
        catCmd = bamListCatCmd(self.params.samtoolsBin, realignedSegments,
                               realignedOutput)
        catLabel = preJoin(taskPrefix, "realigned_finalizeBAM")
        finalTasks.add(
            self.addTask(catLabel, catCmd, dependencies=checkpoint))

    if not self.params.isRetainTempFiles:
        cleanupCmd = getRmdirCmd() + [segmentDir]
        self.addTask(preJoin(taskPrefix, "removeTmpDir"),
                     cleanupCmd,
                     dependencies=finalTasks,
                     isForceLocal=True)

    return finalTasks
Example #8
0
def runHyGen(self, taskPrefix="", dependencies=None) :
    """
    Run hypothesis generation on each SV locus

    One hypothesis-generation task is scheduled per work bin
    (self.params.nonlocalWorkBins bins in total). The per-bin output paths are
    recorded on self (candidateVcfPaths, diploidVcfPaths, somaticVcfPaths,
    tumorVcfPaths, rnaVcfPaths); which of these are populated depends on the
    calling mode:
      - tumor-only (tumor BAMs but no normal BAMs): candidate + tumor
      - RNA (isRNA set, not tumor-only):            candidate + rna
      - otherwise:                                  candidate + diploid, plus
                                                    somatic in tumor/normal mode

    After the per-bin tasks, the per-bin VCFs are sorted (sortAllVcfs), the
    edge runtime logs are sorted and the edge stats are merged. When
    isGenerateSupportBam is set, per-bin supporting BAMs are also sorted and
    then merged per sample.

    Arguments:
      taskPrefix   -- prefix combined with each task label via preJoin
      dependencies -- upstream task(s) the output-directory creation waits on

    Returns:
      a set of tasks for the next step to wait on: a copy of the per-bin
      hypothesis-generation tasks, extended (only when isGenerateSupportBam is
      set) with the BAM sort tasks, the VCF sort tasks and the BAM merge
      tasks. The edge log / edge stats tasks are not included.
    """

    import copy

    statsPath = self.paths.getStatsPath()
    graphPath = self.paths.getGraphPath()
    hygenDir = self.paths.getHyGenDir()

    makeHyGenDirCmd = getMkdirCmd() + [hygenDir]
    dirTask = self.addTask(preJoin(taskPrefix, "makeHyGenDir"), makeHyGenDirCmd,
                           dependencies=dependencies, isForceLocal=True)

    # both flags are used for truthiness only (they hold list lengths / bools)
    isTumorNormal = (len(self.params.normalBamList) and len(self.params.tumorBamList))
    isTumorOnly = ((not isTumorNormal) and len(self.params.tumorBamList))

    hygenTasks = set()
    if self.params.isGenerateSupportBam :
        # handed to sortBams below; presumably sortBams adds its tasks to this
        # set in place (only its return value, the running BAM index, is used here)
        sortBamVcfTasks = set()

    self.candidateVcfPaths = []
    self.diploidVcfPaths = []
    self.somaticVcfPaths = []
    self.tumorVcfPaths = []
    self.rnaVcfPaths = []

    edgeRuntimeLogPaths = []
    edgeStatsLogPaths = []

    # Setup remote read retrieval for insertions (does not vary per bin):
    def isEnableRemoteReadRetrieval() :
        if isTumorOnly or isTumorNormal :
            return self.params.enableRemoteReadRetrievalForInsertionsInCancerCallingModes
        else :
            return self.params.enableRemoteReadRetrievalForInsertionsInGermlineCallingModes

    for binId in range(self.params.nonlocalWorkBins) :
        binStr = str(binId).zfill(4)
        self.candidateVcfPaths.append(self.paths.getHyGenCandidatePath(binStr))
        if isTumorOnly :
            self.tumorVcfPaths.append(self.paths.getHyGenTumorPath(binStr))
        elif self.params.isRNA :
            self.rnaVcfPaths.append(self.paths.getHyGenRnaPath(binStr))
        else :
            self.diploidVcfPaths.append(self.paths.getHyGenDiploidPath(binStr))
            if isTumorNormal :
                self.somaticVcfPaths.append(self.paths.getHyGenSomaticPath(binStr))

        hygenCmd = [self.params.mantaHyGenBin]
        hygenCmd.extend(["--align-stats", statsPath])
        hygenCmd.extend(["--graph-file", graphPath])
        hygenCmd.extend(["--bin-index", str(binId)])
        hygenCmd.extend(["--bin-count", str(self.params.nonlocalWorkBins)])
        hygenCmd.extend(["--max-edge-count", str(self.params.graphNodeMaxEdgeCount)])
        hygenCmd.extend(["--min-candidate-sv-size", self.params.minCandidateVariantSize])
        hygenCmd.extend(["--min-candidate-spanning-count", self.params.minCandidateSpanningCount])
        hygenCmd.extend(["--min-scored-sv-size", self.params.minScoredVariantSize])
        hygenCmd.extend(["--ref", self.params.referenceFasta])
        hygenCmd.extend(["--candidate-output-file", self.candidateVcfPaths[-1]])

        # tumor-only mode
        if isTumorOnly :
            hygenCmd.extend(["--tumor-output-file", self.tumorVcfPaths[-1]])
        elif self.params.isRNA :
            hygenCmd.extend(["--rna-output-file", self.rnaVcfPaths[-1]])
        else :
            hygenCmd.extend(["--diploid-output-file", self.diploidVcfPaths[-1]])
            hygenCmd.extend(["--min-qual-score", self.params.minDiploidVariantScore])
            hygenCmd.extend(["--min-pass-qual-score", self.params.minPassDiploidVariantScore])
            hygenCmd.extend(["--min-pass-gt-score", self.params.minPassDiploidGTScore])
            # tumor/normal mode
            if isTumorNormal :
                hygenCmd.extend(["--somatic-output-file", self.somaticVcfPaths[-1]])
                hygenCmd.extend(["--min-somatic-score", self.params.minSomaticScore])
                hygenCmd.extend(["--min-pass-somatic-score", self.params.minPassSomaticScore])

        if isEnableRemoteReadRetrieval() :
            hygenCmd.append("--enable-remote-read-retrieval")

        if self.params.isHighDepthFilter :
            hygenCmd.extend(["--chrom-depth", self.paths.getChromDepth()])

        edgeRuntimeLogPaths.append(self.paths.getHyGenEdgeRuntimeLogPath(binStr))
        hygenCmd.extend(["--edge-runtime-log", edgeRuntimeLogPaths[-1]])

        edgeStatsLogPaths.append(self.paths.getHyGenEdgeStatsPath(binStr))
        hygenCmd.extend(["--edge-stats-log", edgeStatsLogPaths[-1]])

        if self.params.isGenerateSupportBam :
            hygenCmd.extend(["--evidence-bam-stub", self.paths.getSupportBamStub(binStr)])

        for bamPath in self.params.normalBamList :
            hygenCmd.extend(["--align-file", bamPath])
        for bamPath in self.params.tumorBamList :
            hygenCmd.extend(["--tumor-align-file", bamPath])

        if self.params.isIgnoreAnomProperPair :
            hygenCmd.append("--ignore-anom-proper-pair")

        if self.params.useOverlapPairEvidence :
            hygenCmd.append("--use-overlapping-pair")

        if self.params.isRNA :
            hygenCmd.append("--rna")
            if self.params.isUnstrandedRNA :
                hygenCmd.append("--unstranded")

        if self.params.isOutputContig :
            hygenCmd.append("--output-contigs")

        hygenTask = preJoin(taskPrefix, "generateCandidateSV_" + binStr)
        hygenTasks.add(self.addTask(hygenTask, hygenCmd, dependencies=dirTask,
                                    memMb=self.params.hyGenMemMb))

        # TODO: if the bam is large, for efficiency, consider
        # 1) filtering the bin-specific bam first w.r.t. the final candidate vcf
        # 2) then sort the bin-specific bam and merge them
        # This would require moving the filter/sort bam jobs outside the hygen loop
        if self.params.isGenerateSupportBam :
            bamIndex = 0
            # sort supporting bams extracted from normal samples
            bamIndex = sortBams(self, sortBamVcfTasks,
                                taskPrefix=taskPrefix, binStr=binStr,
                                isNormal=True, bamIdx=bamIndex,
                                dependencies=hygenTask)
            # sort supporting bams extracted from tumor samples
            bamIndex = sortBams(self, sortBamVcfTasks,
                                taskPrefix=taskPrefix, binStr=binStr,
                                isNormal=False, bamIdx=bamIndex,
                                dependencies=hygenTask)

    vcfTasks = sortAllVcfs(self, taskPrefix=taskPrefix, dependencies=hygenTasks)
    nextStepWait = copy.deepcopy(hygenTasks)

    if self.params.isGenerateSupportBam :
        # set.union returns a new set rather than modifying in place, so the
        # result must be re-bound; otherwise the VCF sort tasks are silently
        # left out of the dependencies of the BAM merge below.
        sortBamVcfTasks = sortBamVcfTasks.union(vcfTasks)
        mergeBamTasks = set()
        bamCount = 0
        # merge supporting bams for each normal sample
        bamCount = mergeSupportBams(self, mergeBamTasks, taskPrefix=taskPrefix,
                                    isNormal=True, bamIdx=bamCount,
                                    dependencies=sortBamVcfTasks)

        # merge supporting bams for each tumor sample
        bamCount = mergeSupportBams(self, mergeBamTasks, taskPrefix=taskPrefix,
                                    isNormal=False, bamIdx=bamCount,
                                    dependencies=sortBamVcfTasks)

        nextStepWait = nextStepWait.union(sortBamVcfTasks)
        nextStepWait = nextStepWait.union(mergeBamTasks)

    #
    # sort the edge runtime logs
    #
    logListFile = self.paths.getEdgeRuntimeLogListPath()
    logListTask = preJoin(taskPrefix, "sortEdgeRuntimeLogsInputList")
    self.addWorkflowTask(logListTask, listFileWorkflow(logListFile, edgeRuntimeLogPaths),
                         dependencies=hygenTasks)

    def getEdgeLogSortCmd(logListFile, outPath) :
        # the sort script is run with the same interpreter as this workflow
        cmd = [sys.executable, self.params.mantaSortEdgeLogs, "-f", logListFile, "-o", outPath]
        return cmd

    edgeSortCmd = getEdgeLogSortCmd(logListFile, self.paths.getSortedEdgeRuntimeLogPath())
    self.addTask(preJoin(taskPrefix, "sortEdgeRuntimeLogs"), edgeSortCmd,
                 dependencies=logListTask, isForceLocal=True)

    #
    # merge all edge stats
    #
    statsFileList = self.paths.getStatsFileListPath()
    statsListTask = preJoin(taskPrefix, "mergeEdgeStatsInputList")
    self.addWorkflowTask(statsListTask, listFileWorkflow(statsFileList, edgeStatsLogPaths),
                         dependencies=hygenTasks)

    edgeStatsMergeTask = preJoin(taskPrefix, "mergeEdgeStats")
    edgeStatsMergeCmd = [self.params.mantaStatsMergeBin]
    edgeStatsMergeCmd.extend(["--stats-file-list", statsFileList])
    edgeStatsMergeCmd.extend(["--output-file", self.paths.getFinalEdgeStatsPath()])
    edgeStatsMergeCmd.extend(["--report-file", self.paths.getFinalEdgeStatsReportPath()])
    self.addTask(edgeStatsMergeTask, edgeStatsMergeCmd, dependencies=statsListTask,
                 isForceLocal=True)

    # NOTE: the temp hygenDir directory is deliberately left in place even when
    # isRetainTempFiles is unset: it is used for debug so frequently that
    # removing it doesn't seem worth it at present.

    return nextStepWait
Example #9
0
def runLocusGraph(self,taskPrefix="",dependencies=None):
    """
    Build the full SV locus graph.

    One graph-building task is scheduled per genome segment group, each
    writing a partial graph into a scratch directory. The partial graphs are
    listed in a file, merged into self.paths.getGraphPath(), and the merged
    graph is then validated by a separate check task. A global graph stats
    task and (unless isRetainTempFiles is set) a scratch-directory removal
    task are also scheduled after the merge.

    Arguments:
      taskPrefix   -- prefix combined with each task label via preJoin
      dependencies -- upstream task(s) the scratch-directory creation waits on

    Returns:
      a set holding only the graph check task

    Raises:
      Exception if no partial graphs were scheduled
    """

    alignStatsPath = self.paths.getStatsPath()
    mergedGraphPath = self.paths.getGraphPath()
    mergedGraphStatsPath = self.paths.getGraphStatsPath()

    scratchDir = self.paths.getTmpGraphDir()

    mkdirCmd = getMkdirCmd() + [scratchDir]
    mkdirTask = self.addTask(preJoin(taskPrefix, "makeGraphTmpDir"),
                             mkdirCmd,
                             dependencies=dependencies,
                             isForceLocal=True)

    partialGraphFiles = []
    partialGraphTasks = set()

    for segmentGroup in getGenomeSegmentGroups(getNextGenomeSegment(self.params)):
        assert len(segmentGroup) > 0

        # label the group by its first segment, or by "first_to_last" for multi-segment groups
        groupId = segmentGroup[0].id
        if len(segmentGroup) > 1:
            groupId = groupId + "_to_" + segmentGroup[-1].id

        partialGraphFile = self.paths.getTmpGraphFile(groupId)
        partialGraphFiles.append(partialGraphFile)

        buildCmd = [self.params.mantaGraphBin,
                    "--output-file", partialGraphFile,
                    "--align-stats", alignStatsPath]
        for segment in segmentGroup:
            buildCmd += ["--region", segment.bamRegion]
        buildCmd += ["--min-candidate-sv-size", self.params.minCandidateVariantSize,
                     "--min-edge-observations", self.params.minEdgeObservations,
                     "--ref", self.params.referenceFasta]
        for normalBam in self.params.normalBamList:
            buildCmd += ["--align-file", normalBam]
        for tumorBam in self.params.tumorBamList:
            buildCmd += ["--tumor-align-file", tumorBam]

        if self.params.isHighDepthFilter:
            buildCmd += ["--chrom-depth", self.paths.getChromDepth()]

        if self.params.isIgnoreAnomProperPair:
            buildCmd.append("--ignore-anom-proper-pair")
        if self.params.isRNA:
            buildCmd.append("--rna")

        buildLabel = preJoin(taskPrefix, "makeLocusGraph_" + groupId)
        partialGraphTasks.add(self.addTask(buildLabel,
                                           buildCmd,
                                           dependencies=mkdirTask,
                                           memMb=self.params.estimateMemMb))

    if not partialGraphFiles:
        raise Exception("No SV Locus graphs to create. Possible target region parse error.")

    # the merge binary reads its inputs from a list file rather than from the command line
    graphListFile = self.paths.getTmpGraphFileListPath()
    graphListLabel = preJoin(taskPrefix, "mergeLocusGraphInputList")
    self.addWorkflowTask(graphListLabel,
                         listFileWorkflow(graphListFile, partialGraphFiles),
                         dependencies=partialGraphTasks)

    mergeCmd = [self.params.mantaGraphMergeBin,
                "--output-file", mergedGraphPath,
                "--graph-file-list", graphListFile]
    mergeTask = self.addTask(preJoin(taskPrefix, "mergeLocusGraph"),
                             mergeCmd,
                             dependencies=graphListLabel,
                             memMb=self.params.mergeMemMb)

    # Validate the merged graph in its own process. The sv candidate generators
    # check it as well, but a dedicated step makes a failure much clearer.
    checkCmd = [self.params.mantaGraphCheckBin, "--graph-file", mergedGraphPath]
    checkTask = self.addTask(preJoin(taskPrefix, "checkLocusGraph"),
                             checkCmd,
                             dependencies=mergeTask,
                             memMb=self.params.mergeMemMb)

    if not self.params.isRetainTempFiles:
        cleanupCmd = getRmdirCmd() + [scratchDir]
        self.addTask(preJoin(taskPrefix, "removeTmpDir"),
                     cleanupCmd,
                     dependencies=mergeTask)

    statsCmd = [self.params.mantaGraphStatsBin, "--global",
                "--graph-file", mergedGraphPath,
                "--output-file", mergedGraphStatsPath]
    self.addTask(preJoin(taskPrefix, "locusGraphStats"),
                 statsCmd,
                 dependencies=mergeTask,
                 memMb=self.params.mergeMemMb)

    return set([checkTask])
Example #10
0
def runHyGen(self, taskPrefix="", dependencies=None):
    """
    Run hypothesis generation on each SV locus

    One candidate generation task is scheduled for each of the
    self.params.nonlocalWorkBins work bins. After that the vcfs are sorted
    (sortAllVcfs), the supporting-evidence bams are optionally sorted and
    merged, the per-bin edge runtime logs are sorted and the per-bin edge
    stats are merged.

    The per-bin vcf output paths are recorded on self as candidateVcfPaths,
    diploidVcfPaths, somaticVcfPaths, tumorVcfPaths and rnaVcfPaths.

    taskPrefix -- prefix joined onto every task label via preJoin
    dependencies -- upstream task(s) the first task (directory creation)
                    waits for

    Returns the set of tasks that following steps should wait for: the
    per-bin generation tasks, plus the bam sort, vcf sort and bam merge
    tasks when self.params.isGenerateSupportBam is set.
    """

    import copy

    statsPath = self.paths.getStatsPath()
    graphPath = self.paths.getGraphPath()
    hygenDir = self.paths.getHyGenDir()

    makeHyGenDirCmd = getMkdirCmd() + [hygenDir]
    dirTask = self.addTask(preJoin(taskPrefix, "makeHyGenDir"),
                           makeHyGenDirCmd,
                           dependencies=dependencies,
                           isForceLocal=True)

    # Calling mode is derived from which bam lists are populated.
    isTumorNormal = (len(self.params.normalBamList)
                     and len(self.params.tumorBamList))
    isTumorOnly = ((not isTumorNormal) and len(self.params.tumorBamList))

    # Setup remote read retrieval for insertions. The setting differs between
    # cancer (tumor-only or tumor/normal) and germline calling modes but is
    # the same for every bin, so it is resolved once before the loop.
    if isTumorOnly or isTumorNormal:
        isEnableRemoteReadRetrieval = self.params.enableRemoteReadRetrievalForInsertionsInCancerCallingModes
    else:
        isEnableRemoteReadRetrieval = self.params.enableRemoteReadRetrievalForInsertionsInGermlineCallingModes

    hygenTasks = set()
    if self.params.isGenerateSupportBam:
        sortBamVcfTasks = set()

    self.candidateVcfPaths = []
    self.diploidVcfPaths = []
    self.somaticVcfPaths = []
    self.tumorVcfPaths = []
    self.rnaVcfPaths = []

    edgeRuntimeLogPaths = []
    edgeStatsLogPaths = []

    for binId in range(self.params.nonlocalWorkBins):
        binStr = str(binId).zfill(4)
        self.candidateVcfPaths.append(self.paths.getHyGenCandidatePath(binStr))
        if isTumorOnly:
            self.tumorVcfPaths.append(self.paths.getHyGenTumorPath(binStr))
        elif self.params.isRNA:
            self.rnaVcfPaths.append(self.paths.getHyGenRnaPath(binStr))
        else:
            self.diploidVcfPaths.append(self.paths.getHyGenDiploidPath(binStr))
            if isTumorNormal:
                self.somaticVcfPaths.append(
                    self.paths.getHyGenSomaticPath(binStr))

        hygenCmd = [self.params.mantaHyGenBin]
        hygenCmd.extend(["--align-stats", statsPath])
        hygenCmd.extend(["--graph-file", graphPath])
        hygenCmd.extend(["--bin-index", str(binId)])
        hygenCmd.extend(["--bin-count", str(self.params.nonlocalWorkBins)])
        hygenCmd.extend(
            ["--max-edge-count",
             str(self.params.graphNodeMaxEdgeCount)])
        hygenCmd.extend(
            ["--min-candidate-sv-size", self.params.minCandidateVariantSize])
        hygenCmd.extend([
            "--min-candidate-spanning-count",
            self.params.minCandidateSpanningCount
        ])
        hygenCmd.extend(
            ["--min-scored-sv-size", self.params.minScoredVariantSize])
        hygenCmd.extend(["--ref", self.params.referenceFasta])
        hygenCmd.extend(
            ["--candidate-output-file", self.candidateVcfPaths[-1]])

        # tumor-only mode
        if isTumorOnly:
            hygenCmd.extend(["--tumor-output-file", self.tumorVcfPaths[-1]])
        elif self.params.isRNA:
            hygenCmd.extend(["--rna-output-file", self.rnaVcfPaths[-1]])
        else:
            hygenCmd.extend(
                ["--diploid-output-file", self.diploidVcfPaths[-1]])
            hygenCmd.extend(
                ["--min-qual-score", self.params.minDiploidVariantScore])
            hygenCmd.extend([
                "--min-pass-qual-score", self.params.minPassDiploidVariantScore
            ])
            hygenCmd.extend(
                ["--min-pass-gt-score", self.params.minPassDiploidGTScore])
            # tumor/normal mode
            if isTumorNormal:
                hygenCmd.extend(
                    ["--somatic-output-file", self.somaticVcfPaths[-1]])
                hygenCmd.extend(
                    ["--min-somatic-score", self.params.minSomaticScore])
                hygenCmd.extend([
                    "--min-pass-somatic-score", self.params.minPassSomaticScore
                ])

        if isEnableRemoteReadRetrieval:
            hygenCmd.append("--enable-remote-read-retrieval")

        if self.params.isHighDepthFilter:
            hygenCmd.extend(["--chrom-depth", self.paths.getChromDepth()])

        edgeRuntimeLogPaths.append(
            self.paths.getHyGenEdgeRuntimeLogPath(binStr))
        hygenCmd.extend(["--edge-runtime-log", edgeRuntimeLogPaths[-1]])

        edgeStatsLogPaths.append(self.paths.getHyGenEdgeStatsPath(binStr))
        hygenCmd.extend(["--edge-stats-log", edgeStatsLogPaths[-1]])

        if self.params.isGenerateSupportBam:
            hygenCmd.extend(
                ["--evidence-bam-stub",
                 self.paths.getSupportBamStub(binStr)])

        for bamPath in self.params.normalBamList:
            hygenCmd.extend(["--align-file", bamPath])
        for bamPath in self.params.tumorBamList:
            hygenCmd.extend(["--tumor-align-file", bamPath])

        if self.params.isIgnoreAnomProperPair:
            hygenCmd.append("--ignore-anom-proper-pair")

        if self.params.useOverlapPairEvidence:
            hygenCmd.append("--use-overlapping-pair")

        if self.params.isRNA:
            hygenCmd.append("--rna")
            if self.params.isUnstrandedRNA:
                hygenCmd.append("--unstranded")

        if self.params.isOutputContig:
            hygenCmd.append("--output-contigs")

        hygenTask = preJoin(taskPrefix, "generateCandidateSV_" + binStr)
        hygenTasks.add(
            self.addTask(hygenTask,
                         hygenCmd,
                         dependencies=dirTask,
                         memMb=self.params.hyGenMemMb))

        # TODO: if the bam is large, for efficiency, consider
        # 1) filtering the bin-specific bam first w.r.t. the final candidate vcf
        # 2) then sort the bin-specific bam and merge them
        # This would require moving the filter/sort bam jobs outside the hygen loop
        if self.params.isGenerateSupportBam:
            # The index returned by the normal-sample call is passed on as
            # the starting index of the tumor-sample call.
            bamIndex = 0
            # sort supporting bams extracted from normal samples
            bamIndex = sortBams(self,
                                sortBamVcfTasks,
                                taskPrefix=taskPrefix,
                                binStr=binStr,
                                isNormal=True,
                                bamIdx=bamIndex,
                                dependencies=hygenTask)
            # sort supporting bams extracted from tumor samples
            bamIndex = sortBams(self,
                                sortBamVcfTasks,
                                taskPrefix=taskPrefix,
                                binStr=binStr,
                                isNormal=False,
                                bamIdx=bamIndex,
                                dependencies=hygenTask)

    vcfTasks = sortAllVcfs(self,
                           taskPrefix=taskPrefix,
                           dependencies=hygenTasks)
    nextStepWait = copy.deepcopy(hygenTasks)

    if self.params.isGenerateSupportBam:
        # set.union() returns a new set and leaves the receiver untouched, so
        # the vcf sort tasks have to be merged in place for the bam merge
        # tasks below (and nextStepWait) to actually depend on them.
        sortBamVcfTasks.update(vcfTasks)
        mergeBamTasks = set()
        bamCount = 0
        # merge supporting bams for each normal sample
        bamCount = mergeSupportBams(self,
                                    mergeBamTasks,
                                    taskPrefix=taskPrefix,
                                    isNormal=True,
                                    bamIdx=bamCount,
                                    dependencies=sortBamVcfTasks)

        # merge supporting bams for each tumor sample
        bamCount = mergeSupportBams(self,
                                    mergeBamTasks,
                                    taskPrefix=taskPrefix,
                                    isNormal=False,
                                    bamIdx=bamCount,
                                    dependencies=sortBamVcfTasks)

        nextStepWait = nextStepWait.union(sortBamVcfTasks)
        nextStepWait = nextStepWait.union(mergeBamTasks)

    #
    # sort the edge runtime logs
    #
    logListFile = self.paths.getEdgeRuntimeLogListPath()
    logListTask = preJoin(taskPrefix, "sortEdgeRuntimeLogsInputList")
    self.addWorkflowTask(logListTask,
                         listFileWorkflow(logListFile, edgeRuntimeLogPaths),
                         dependencies=hygenTasks)

    def getEdgeLogSortCmd(logListFile, outPath):
        """Command which sorts the logs named in logListFile into outPath."""
        cmd = [
            sys.executable, self.params.mantaSortEdgeLogs, "-f", logListFile,
            "-o", outPath
        ]
        return cmd

    edgeSortCmd = getEdgeLogSortCmd(logListFile,
                                    self.paths.getSortedEdgeRuntimeLogPath())
    self.addTask(preJoin(taskPrefix, "sortEdgeRuntimeLogs"),
                 edgeSortCmd,
                 dependencies=logListTask,
                 isForceLocal=True)

    #
    # merge all edge stats
    #
    statsFileList = self.paths.getStatsFileListPath()
    statsListTask = preJoin(taskPrefix, "mergeEdgeStatsInputList")
    self.addWorkflowTask(statsListTask,
                         listFileWorkflow(statsFileList, edgeStatsLogPaths),
                         dependencies=hygenTasks)

    edgeStatsMergeTask = preJoin(taskPrefix, "mergeEdgeStats")
    edgeStatsMergeCmd = [self.params.mantaStatsMergeBin]
    edgeStatsMergeCmd.extend(["--stats-file-list", statsFileList])
    edgeStatsMergeCmd.extend(
        ["--output-file", self.paths.getFinalEdgeStatsPath()])
    edgeStatsMergeCmd.extend(
        ["--report-file",
         self.paths.getFinalEdgeStatsReportPath()])
    self.addTask(edgeStatsMergeTask,
                 edgeStatsMergeCmd,
                 dependencies=statsListTask,
                 isForceLocal=True)

    # NOTE: when temp files are not retained we could delete the temp hygenDir
    # directory here, but it is used for debug so frequently it doesn't seem
    # worth it at present. Doing so would need a removal task which depends on
    # every task reading from hygenDir.

    return nextStepWait
Example #11
0
def runLocusGraph(self, taskPrefix="", dependencies=None):
    """
    Create the full SV locus graph

    A partial graph is built for every genome segment group, the partial
    graphs are merged into the final graph file, and the merged graph is then
    checked and summarized by two further tasks.

    taskPrefix -- prefix joined onto every task label via preJoin
    dependencies -- upstream task(s) the first task (temp dir creation)
                    waits for

    Returns a set containing only the graph check task.

    Raises Exception when no genome segment group was produced.
    """

    params = self.params
    paths = self.paths

    alignStatsFile = paths.getStatsPath()
    finalGraphFile = paths.getGraphPath()
    graphStatsFile = paths.getGraphStatsPath()
    scratchDir = paths.getTmpGraphDir()

    mkdirCmd = getMkdirCmd() + [scratchDir]
    mkdirTask = self.addTask(preJoin(taskPrefix, "makeGraphTmpDir"),
                             mkdirCmd,
                             dependencies=dependencies,
                             isForceLocal=True)

    partialGraphFiles = []
    buildTasks = set()

    for segGroup in getGenomeSegmentGroups(getNextGenomeSegment(params)):
        assert len(segGroup) > 0

        # Label the group by its first segment, or by "<first>_to_<last>"
        # when it holds more than one segment.
        groupId = segGroup[0].id
        if len(segGroup) > 1:
            groupId = groupId + "_to_" + segGroup[-1].id

        partialFile = paths.getTmpGraphFile(groupId)
        partialGraphFiles.append(partialFile)

        buildCmd = [
            params.mantaGraphBin,
            "--output-file", partialFile,
            "--align-stats", alignStatsFile,
        ]
        for seg in segGroup:
            buildCmd += ["--region", seg.bamRegion]
        buildCmd += [
            "--min-candidate-sv-size", params.minCandidateVariantSize,
            "--min-edge-observations", params.minEdgeObservations,
            "--ref", params.referenceFasta,
        ]
        for normalBam in params.normalBamList:
            buildCmd += ["--align-file", normalBam]
        for tumorBam in params.tumorBamList:
            buildCmd += ["--tumor-align-file", tumorBam]

        if params.isHighDepthFilter:
            buildCmd += ["--chrom-depth", paths.getChromDepth()]
        if params.isIgnoreAnomProperPair:
            buildCmd.append("--ignore-anom-proper-pair")
        if params.isRNA:
            buildCmd.append("--rna")

        buildLabel = preJoin(taskPrefix, "makeLocusGraph_" + groupId)
        buildTask = self.addTask(buildLabel,
                                 buildCmd,
                                 dependencies=mkdirTask,
                                 memMb=params.estimateMemMb)
        buildTasks.add(buildTask)

    if not partialGraphFiles:
        raise Exception(
            "No SV Locus graphs to create. Possible target region parse error."
        )

    # The partial graph files are handed to the merge binary through a list
    # file rather than as individual command-line arguments.
    listFile = paths.getTmpGraphFileListPath()
    listTaskLabel = preJoin(taskPrefix, "mergeLocusGraphInputList")
    self.addWorkflowTask(listTaskLabel,
                         listFileWorkflow(listFile, partialGraphFiles),
                         dependencies=buildTasks)

    mergeCmd = [
        params.mantaGraphMergeBin,
        "--output-file", finalGraphFile,
        "--graph-file-list", listFile,
    ]
    mergeTask = self.addTask(preJoin(taskPrefix, "mergeLocusGraph"),
                             mergeCmd,
                             dependencies=listTaskLabel,
                             memMb=params.mergeMemMb)

    # Run a separate process to rigorously check that the final graph is
    # valid. The sv candidate generators will check as well, but this makes
    # the check much more clear.
    checkCmd = [params.mantaGraphCheckBin, "--graph-file", finalGraphFile]
    checkTask = self.addTask(preJoin(taskPrefix, "checkLocusGraph"),
                             checkCmd,
                             dependencies=mergeTask,
                             memMb=params.mergeMemMb)

    # The partial graphs are only removed when temp files are not retained.
    if not params.isRetainTempFiles:
        cleanupCmd = getRmdirCmd() + [scratchDir]
        self.addTask(preJoin(taskPrefix, "removeTmpDir"),
                     cleanupCmd,
                     dependencies=mergeTask)

    statsCmd = [
        params.mantaGraphStatsBin,
        "--global",
        "--graph-file", finalGraphFile,
        "--output-file", graphStatsFile,
    ]
    self.addTask(preJoin(taskPrefix, "locusGraphStats"),
                 statsCmd,
                 dependencies=mergeTask,
                 memMb=params.mergeMemMb)

    # Only the check task is reported back to the caller.
    return set([checkTask])
Example #12
0
def runHyGen(self, taskPrefix="", dependencies=None) :
    """
    Run hypothesis generation on each SV locus

    One candidate generation task is scheduled for each of the
    self.params.nonlocalWorkBins work bins. The per-bin vcfs are then
    header-fixed, sorted, compressed and tabix-indexed per output category,
    small indels are extracted from the sorted candidate vcf, the edge runtime
    logs are sorted and the edge stats are merged.

    taskPrefix -- prefix joined onto every task label via preJoin
    dependencies -- upstream task(s) the first task (directory creation)
                    waits for

    Returns the set of tasks that following steps should wait for: the
    per-bin generation tasks plus every tabix task added here.
    """

    import copy

    statsPath=self.paths.getStatsPath()
    graphPath=self.paths.getGraphPath()
    hygenDir=self.paths.getHyGenDir()

    makeHyGenDirCmd = getMkdirCmd() + [hygenDir]
    dirTask = self.addTask(preJoin(taskPrefix,"makeHyGenDir"), makeHyGenDirCmd, dependencies=dependencies, isForceLocal=True)

    # Calling mode is derived from which bam lists are populated.
    isSomatic = (len(self.params.normalBamList) and len(self.params.tumorBamList))
    isTumorOnly = ((not isSomatic) and len(self.params.tumorBamList))

    # The memory request depends on the run mode ("sge" vs. anything else).
    hyGenMemMb = self.params.hyGenLocalMemMb
    if self.getRunMode() == "sge" :
        hyGenMemMb = self.params.hyGenSGEMemMb

    hygenTasks=set()
    candidateVcfPaths = []
    diploidVcfPaths = []
    somaticVcfPaths = []
    tumorVcfPaths = []

    edgeRuntimeLogPaths = []
    edgeStatsLogPaths = []

    for binId in range(self.params.nonlocalWorkBins) :
        binStr = str(binId).zfill(4)
        candidateVcfPaths.append(self.paths.getHyGenCandidatePath(binStr))
        if isTumorOnly :
            tumorVcfPaths.append(self.paths.getHyGenTumorPath(binStr))
        else:
            diploidVcfPaths.append(self.paths.getHyGenDiploidPath(binStr))
            if isSomatic :
                somaticVcfPaths.append(self.paths.getHyGenSomaticPath(binStr))

        hygenCmd = [ self.params.mantaHyGenBin ]
        hygenCmd.extend(["--align-stats",statsPath])
        hygenCmd.extend(["--graph-file",graphPath])
        hygenCmd.extend(["--bin-index", str(binId)])
        hygenCmd.extend(["--bin-count", str(self.params.nonlocalWorkBins)])
        hygenCmd.extend(["--min-candidate-sv-size", self.params.minCandidateVariantSize])
        hygenCmd.extend(["--min-candidate-spanning-count", self.params.minCandidateSpanningCount])
        hygenCmd.extend(["--min-scored-sv-size", self.params.minScoredVariantSize])
        hygenCmd.extend(["--ref",self.params.referenceFasta])
        hygenCmd.extend(["--candidate-output-file", candidateVcfPaths[-1]])

        # tumor-only mode
        if isTumorOnly :
            hygenCmd.extend(["--tumor-output-file", tumorVcfPaths[-1]])
        else:
            hygenCmd.extend(["--diploid-output-file", diploidVcfPaths[-1]])
            hygenCmd.extend(["--min-qual-score", self.params.minDiploidVariantScore])
            hygenCmd.extend(["--min-pass-qual-score", self.params.minPassDiploidVariantScore])
            hygenCmd.extend(["--min-pass-gt-score", self.params.minPassDiploidGTScore])
            # tumor/normal mode
            if isSomatic :
                # (this line was previously indented with a tab mixed into
                # the spaces, which Python 3 rejects)
                hygenCmd.extend(["--somatic-output-file", somaticVcfPaths[-1]])
                hygenCmd.extend(["--min-somatic-score", self.params.minSomaticScore])
                hygenCmd.extend(["--min-pass-somatic-score", self.params.minPassSomaticScore])

                # temporary fix for FFPE:
                hygenCmd.append("--skip-remote-reads")

        if self.params.isHighDepthFilter :
            hygenCmd.extend(["--chrom-depth", self.paths.getChromDepth()])

        edgeRuntimeLogPaths.append(self.paths.getHyGenEdgeRuntimeLogPath(binStr))
        hygenCmd.extend(["--edge-runtime-log", edgeRuntimeLogPaths[-1]])

        edgeStatsLogPaths.append(self.paths.getHyGenEdgeStatsPath(binStr))
        hygenCmd.extend(["--edge-stats-log", edgeStatsLogPaths[-1]])

        for bamPath in self.params.normalBamList :
            hygenCmd.extend(["--align-file", bamPath])
        for bamPath in self.params.tumorBamList :
            hygenCmd.extend(["--tumor-align-file", bamPath])

        if self.params.isIgnoreAnomProperPair :
            hygenCmd.append("--ignore-anom-proper-pair")
        if self.params.isRNA :
            hygenCmd.append("--rna")
        if self.params.isUnstrandedRNA :
            hygenCmd.append("--unstranded")

        hygenTaskLabel=preJoin(taskPrefix,"generateCandidateSV_"+binStr)
        hygenTasks.add(self.addTask(hygenTaskLabel,hygenCmd,dependencies=dirTask, memMb=hyGenMemMb))

    # Start from a copy so that the tabix tasks added by the helpers below do
    # not leak into hygenTasks, which is still used as a dependency set.
    nextStepWait = copy.deepcopy(hygenTasks)

    def getVcfSortCmd(vcfPaths, outPath, isDiploid) :
        """
        Build the shell command string which sorts vcfPaths and writes the
        bgzip-compressed result to outPath.

        For diploid output the sorted vcf is first written to a temp file,
        passed through the ploidy filter, and the temp file is removed again
        at the end of the command.
        """
        cmd  = "\"%s\" -E \"%s\" -u " % (sys.executable,self.params.mantaSortVcf)
        cmd += " ".join(quoteStringList(vcfPaths))

        # apply the ploidy filter to diploid variants
        if isDiploid:
            tempVcf = self.paths.getTempDiploidPath()
            cmd += " > \"%s\"" % (tempVcf)
            cmd += " && \"%s\" -E \"%s\" \"%s\"" % (sys.executable, self.params.mantaPloidyFilter, tempVcf)

        cmd += " | \"%s\" -c > \"%s\"" % (self.params.bgzipBin, outPath)

        if isDiploid:
            cmd += " && " + " ".join(getRmCmd()) + " \"%s\"" % (self.paths.getTempDiploidPath())
        return cmd

    def getVcfTabixCmd(vcfPath) :
        """Build the tabix command (argument list) indexing vcfPath."""
        return [self.params.tabixBin,"-f","-p","vcf", vcfPath]


    def sortVcfs(pathList, outPath, label, isDiploid=False) :
        """
        Schedule header fix, sort and tabix tasks for one vcf category.

        The tabix task is added to nextStepWait. Returns the sort task, or
        an empty set when pathList is empty and nothing was scheduled.
        """
        if len(pathList) == 0 : return set()

        # make header modifications to first vcf in list of files to be sorted:
        headerFixTask=preJoin(taskPrefix,"fixVcfHeader_"+label)
        def getHeaderFixCmd(fileName) :
            # The rewritten header goes to a temp file which is then moved
            # over the original.
            tmpName=fileName+".reheader.tmp"
            cmd  = "\"%s\" -E \"%s\"" % (sys.executable, self.params.vcfCmdlineSwapper)
            cmd += ' "' + " ".join(self.params.configCommandLine) + '"'
            cmd += " < \"%s\" > \"%s\"" % (fileName,tmpName)
            cmd += " && " + " ".join(getMvCmd()) +  " \"%s\" \"%s\"" % (tmpName, fileName)
            return cmd

        self.addTask(headerFixTask,getHeaderFixCmd(pathList[0]),dependencies=hygenTasks,isForceLocal=True)

        sortCmd = getVcfSortCmd(pathList, outPath, isDiploid)
        sortTask=self.addTask(preJoin(taskPrefix,"sort_"+label),sortCmd,dependencies=headerFixTask)
        nextStepWait.add(self.addTask(preJoin(taskPrefix,"tabix_"+label),getVcfTabixCmd(outPath),dependencies=sortTask,isForceLocal=True))
        return sortTask

    # Only the candidate sort task is kept; small indel extraction depends
    # on it.
    candSortTask = sortVcfs(candidateVcfPaths,
                            self.paths.getSortedCandidatePath(),
                            "sortCandidateSV")
    sortVcfs(diploidVcfPaths,
             self.paths.getSortedDiploidPath(),
             "sortDiploidSV",
             isDiploid=True)
    sortVcfs(somaticVcfPaths,
             self.paths.getSortedSomaticPath(),
             "sortSomaticSV")
    sortVcfs(tumorVcfPaths,
             self.paths.getSortedTumorPath(),
             "sortTumorSV")

    def getExtractSmallCmd(maxSize, inPath, outPath) :
        """
        Build the shell command string which decompresses inPath, pipes it
        through the small vcf extraction script with --maxSize, and writes
        the bgzip-compressed result to outPath.
        """
        cmd  = "\"%s\" -dc \"%s\"" % (self.params.bgzipBin, inPath)
        cmd += " | \"%s\" -E \"%s\" --maxSize %i" % (sys.executable, self.params.mantaExtraSmallVcf, maxSize)
        cmd += " | \"%s\" -c > \"%s\"" % (self.params.bgzipBin, outPath)
        return cmd

    def extractSmall(inPath, outPath) :
        """
        Schedule small indel extraction plus tabix indexing; the tabix task
        is added to nextStepWait.
        """
        # Extracted records are limited to sizes below the minimum scored
        # variant size; nothing is scheduled when that leaves no size range.
        maxSize = int(self.params.minScoredVariantSize) - 1
        if maxSize < 1 : return
        smallCmd = getExtractSmallCmd(maxSize, inPath, outPath)
        smallLabel=self.addTask(preJoin(taskPrefix,"extractSmallIndels"), smallCmd, dependencies=candSortTask, isForceLocal=True)
        nextStepWait.add(self.addTask(smallLabel+"_tabix", getVcfTabixCmd(outPath), dependencies=smallLabel, isForceLocal=True))

    extractSmall(self.paths.getSortedCandidatePath(), self.paths.getSortedCandidateSmallIndelsPath())

    # sort edge logs:
    def getEdgeLogSortCmd(logPaths, outPath) :
        """Build the command (argument list) sorting logPaths into outPath."""
        cmd  = [sys.executable,"-E",self.params.mantaSortEdgeLogs,"-o",outPath]
        cmd.extend(logPaths)
        return cmd

    edgeSortLabel=preJoin(taskPrefix,"sortEdgeRuntimeLogs")
    edgeSortCmd=getEdgeLogSortCmd(edgeRuntimeLogPaths,self.paths.getSortedEdgeRuntimeLogPath())
    self.addTask(edgeSortLabel, edgeSortCmd, dependencies=hygenTasks, isForceLocal=True)

    # merge edge stats:
    edgeStatsMergeLabel=preJoin(taskPrefix,"mergeEdgeStats")
    edgeStatsMergeCmd=[self.params.mantaStatsMergeBin]
    for statsFile in edgeStatsLogPaths :
        edgeStatsMergeCmd.extend(["--stats-file",statsFile])
    edgeStatsMergeCmd.extend(["--output-file",self.paths.getFinalEdgeStatsPath()])
    edgeStatsMergeCmd.extend(["--report-file",self.paths.getFinalEdgeStatsReportPath()])
    self.addTask(edgeStatsMergeLabel, edgeStatsMergeCmd, dependencies=hygenTasks, isForceLocal=True)

    # The edge log sort and edge stats merge tasks are not part of the
    # returned wait set.
    return nextStepWait
Example #13
0
def runLocusGraph(self,taskPrefix="",dependencies=None):
    """
    Create the full SV locus graph

    A partial graph is built for every genome segment group in a temporary
    directory under the work dir, the partial graphs are merged into the
    final graph file, and the merged graph is then checked and summarized by
    two further tasks. The temporary directory is removed after the merge.

    taskPrefix -- prefix joined onto every task label via preJoin
    dependencies -- upstream task(s) the first task (temp dir creation)
                    waits for

    Returns a set containing only the graph check task.

    Raises Exception when no genome segment group was produced.
    """

    params = self.params

    alignStatsFile = self.paths.getStatsPath()
    finalGraphFile = self.paths.getGraphPath()
    graphStatsFile = self.paths.getGraphStatsPath()

    # Partial graphs live in "<workDir>/<graph file name>.tmpdir".
    graphBaseName = os.path.basename(finalGraphFile)
    scratchDir = os.path.join(params.workDir, graphBaseName + ".tmpdir")

    mkdirCmd = getMkdirCmd() + [scratchDir]
    mkdirTask = self.addTask(preJoin(taskPrefix, "makeTmpDir"),
                             mkdirCmd,
                             dependencies=dependencies,
                             isForceLocal=True)

    def clumpSegments(segments) :
        """
        Iterate segment groups and 'clump' small contigs together

        Consecutive segments are collected into one group for as long as
        their summed size stays within the size limit; a segment which would
        exceed the limit starts a new group.
        """
        sizeLimit = 200000
        clump = []
        clumpSize = 0
        for seg in segments :
            segSize = seg.size()
            if clumpSize + segSize > sizeLimit :
                # close the current group (if any) and start a new one
                if clump :
                    yield clump
                clump = [seg]
                clumpSize = segSize
            else :
                clump.append(seg)
                clumpSize += segSize
        if clump :
            yield clump

    partialGraphFiles = []
    buildTasks = set()

    for segGroup in clumpSegments(getNextGenomeSegment(params)) :
        assert len(segGroup) > 0

        # Label the group by its first segment, or by "<first>_to_<last>"
        # when it holds more than one segment.
        groupId = segGroup[0].id
        if len(segGroup) > 1 :
            groupId = groupId + "_to_" + segGroup[-1].id

        partialFile = os.path.join(scratchDir, graphBaseName + "." + groupId + ".bin")
        partialGraphFiles.append(partialFile)

        buildCmd = [
            params.mantaGraphBin,
            "--output-file", partialFile,
            "--align-stats", alignStatsFile,
        ]
        for seg in segGroup :
            buildCmd += ["--region", seg.bamRegion]
        buildCmd += [
            "--min-candidate-sv-size", params.minCandidateVariantSize,
            "--min-edge-observations", params.minEdgeObservations,
            "--ref", params.referenceFasta,
        ]
        for normalBam in params.normalBamList :
            buildCmd += ["--align-file", normalBam]
        for tumorBam in params.tumorBamList :
            buildCmd += ["--tumor-align-file", tumorBam]

        if params.isHighDepthFilter :
            buildCmd += ["--chrom-depth", self.paths.getChromDepth()]
        if params.isIgnoreAnomProperPair :
            buildCmd.append("--ignore-anom-proper-pair")
        if params.isRNA :
            buildCmd.append("--rna")

        buildLabel = preJoin(taskPrefix, "makeLocusGraph_" + groupId)
        buildTask = self.addTask(buildLabel,
                                 buildCmd,
                                 dependencies=mkdirTask,
                                 memMb=params.estimateMemMb)
        buildTasks.add(buildTask)

    if not partialGraphFiles :
        raise Exception("No SV Locus graphs to create. Possible target region parse error.")

    # Every partial graph is passed to the merge binary as its own
    # --graph-file argument.
    mergeCmd = [params.mantaGraphMergeBin, "--output-file", finalGraphFile]
    for partialFile in partialGraphFiles :
        mergeCmd += ["--graph-file", partialFile]

    mergeTask = self.addTask(preJoin(taskPrefix, "mergeLocusGraph"),
                             mergeCmd,
                             dependencies=buildTasks,
                             memMb=params.mergeMemMb)

    # Run a separate process to rigorously check that the final graph is
    # valid. The sv candidate generators will check as well, but this makes
    # the check much more clear.
    checkCmd = [params.mantaGraphCheckBin, "--graph-file", finalGraphFile]
    checkTask = self.addTask(preJoin(taskPrefix, "checkLocusGraph"),
                             checkCmd,
                             dependencies=mergeTask,
                             memMb=params.mergeMemMb)

    # The partial graphs are removed once the merge task is done.
    cleanupCmd = getRmdirCmd() + [scratchDir]
    self.addTask(preJoin(taskPrefix, "rmTmpDir"),
                 cleanupCmd,
                 dependencies=mergeTask)

    statsCmd = [
        params.mantaGraphStatsBin,
        "--global",
        "--graph-file", finalGraphFile,
        "--output-file", graphStatsFile,
    ]
    self.addTask(preJoin(taskPrefix, "locusGraphStats"),
                 statsCmd,
                 dependencies=mergeTask,
                 memMb=params.mergeMemMb)

    # Only the check task is reported back to the caller.
    return set([checkTask])
Example #14
0
def runHyGen(self, taskPrefix="", dependencies=None) :
    """
    Schedule SV hypothesis generation over the locus graph, one task per work bin

    The work is split into self.params.nonlocalWorkBins bins. After the
    per-bin tasks are registered, follow-up tasks are added to sort the VCF
    outputs, sort the per-bin edge runtime logs and merge the per-bin edge
    stats.

    As a side effect, self.candidateVcfPaths, self.diploidVcfPaths,
    self.somaticVcfPaths and self.tumorVcfPaths are reset and refilled with
    the per-bin output paths which apply to the current sample configuration.

    @param taskPrefix prefix given to preJoin for every task label created here
    @param dependencies dependencies of the first task (hygen directory creation)

    @return a deep copy of the set of values returned by addTask for the per-bin tasks
    """

    import copy

    statsPath = self.paths.getStatsPath()
    graphPath = self.paths.getGraphPath()
    hygenDir = self.paths.getHyGenDir()

    mkdirCmd = getMkdirCmd() + [hygenDir]
    dirTask = self.addTask(preJoin(taskPrefix,"makeHyGenDir"), mkdirCmd,
                           dependencies=dependencies, isForceLocal=True)

    # sample configuration: tumor+normal (somatic), tumor-only, or normal-only
    normalCount = len(self.params.normalBamList)
    tumorCount = len(self.params.tumorBamList)
    isSomatic = (normalCount > 0) and (tumorCount > 0)
    isTumorOnly = (tumorCount > 0) and (not isSomatic)

    # the sge run mode has its own memory setting
    taskMemMb = self.params.hyGenLocalMemMb
    if self.getRunMode() == "sge" :
        taskMemMb = self.params.hyGenSGEMemMb

    self.candidateVcfPaths = []
    self.diploidVcfPaths = []
    self.somaticVcfPaths = []
    self.tumorVcfPaths = []

    hygenTasks = set()
    runtimeLogs = []
    statsLogs = []

    binCount = self.params.nonlocalWorkBins
    for binIndex in range(binCount) :
        binLabel = "%04i" % (binIndex)

        candidatePath = self.paths.getHyGenCandidatePath(binLabel)
        self.candidateVcfPaths.append(candidatePath)

        # arguments shared by every sample configuration
        binCmd = [ self.params.mantaHyGenBin,
                   "--align-stats", statsPath,
                   "--graph-file", graphPath,
                   "--bin-index", str(binIndex),
                   "--bin-count", str(binCount),
                   "--min-candidate-sv-size", self.params.minCandidateVariantSize,
                   "--min-candidate-spanning-count", self.params.minCandidateSpanningCount,
                   "--min-scored-sv-size", self.params.minScoredVariantSize,
                   "--ref", self.params.referenceFasta,
                   "--candidate-output-file", candidatePath ]

        if isTumorOnly :
            # tumor-only mode
            tumorPath = self.paths.getHyGenTumorPath(binLabel)
            self.tumorVcfPaths.append(tumorPath)
            binCmd += ["--tumor-output-file", tumorPath]
        else :
            diploidPath = self.paths.getHyGenDiploidPath(binLabel)
            self.diploidVcfPaths.append(diploidPath)
            binCmd += [ "--diploid-output-file", diploidPath,
                        "--min-qual-score", self.params.minDiploidVariantScore,
                        "--min-pass-qual-score", self.params.minPassDiploidVariantScore,
                        "--min-pass-gt-score", self.params.minPassDiploidGTScore ]

            if isSomatic :
                # tumor/normal mode
                somaticPath = self.paths.getHyGenSomaticPath(binLabel)
                self.somaticVcfPaths.append(somaticPath)
                binCmd += [ "--somatic-output-file", somaticPath,
                            "--min-somatic-score", self.params.minSomaticScore,
                            "--min-pass-somatic-score", self.params.minPassSomaticScore ]

                # temporary fix for FFPE:
                binCmd += ["--skip-remote-reads"]

        if self.params.isHighDepthFilter :
            binCmd += ["--chrom-depth", self.paths.getChromDepth()]

        # per-bin logs, collected here so they can be sorted/merged after all bins complete
        runtimeLogPath = self.paths.getHyGenEdgeRuntimeLogPath(binLabel)
        runtimeLogs.append(runtimeLogPath)
        binCmd += ["--edge-runtime-log", runtimeLogPath]

        statsLogPath = self.paths.getHyGenEdgeStatsPath(binLabel)
        statsLogs.append(statsLogPath)
        binCmd += ["--edge-stats-log", statsLogPath]

        # normal alignment files are listed first, followed by tumor alignment files
        bamArgTable = ( ("--align-file", self.params.normalBamList),
                        ("--tumor-align-file", self.params.tumorBamList) )
        for (bamFlag, bamList) in bamArgTable :
            for bamPath in bamList :
                binCmd += [bamFlag, bamPath]

        # optional switches, appended in a fixed order when enabled
        switchTable = ( (self.params.isIgnoreAnomProperPair, "--ignore-anom-proper-pair"),
                        (self.params.isRNA, "--rna"),
                        (self.params.isUnstrandedRNA, "--unstranded") )
        for (isEnabled, switch) in switchTable :
            if isEnabled :
                binCmd.append(switch)

        binTaskLabel = preJoin(taskPrefix,"generateCandidateSV_"+binLabel)
        hygenTasks.add(self.addTask(binTaskLabel, binCmd, dependencies=dirTask, memMb=taskMemMb))

    # the value handed back to the caller is a copy which is independent of the working set
    nextStepWait = copy.deepcopy(hygenTasks)

    sortAllVcfs(self, taskPrefix=taskPrefix, dependencies=hygenTasks)

    #
    # sort the edge runtime logs
    #
    logListFile = self.paths.getEdgeRuntimeLogListPath()
    logListTask = preJoin(taskPrefix,"sortEdgeRuntimeLogsInputList")
    self.addWorkflowTask(logListTask, listFileWorkflow(logListFile, runtimeLogs), dependencies=hygenTasks)

    sortedLogPath = self.paths.getSortedEdgeRuntimeLogPath()
    sortLogsCmd = [ sys.executable, "-E", self.params.mantaSortEdgeLogs,
                    "-f", logListFile,
                    "-o", sortedLogPath ]
    self.addTask(preJoin(taskPrefix,"sortEdgeRuntimeLogs"), sortLogsCmd,
                 dependencies=logListTask, isForceLocal=True)

    #
    # merge all edge stats
    #
    statsFileList = self.paths.getStatsFileListPath()
    statsListTask = preJoin(taskPrefix,"mergeEdgeStatsInputList")
    self.addWorkflowTask(statsListTask, listFileWorkflow(statsFileList, statsLogs), dependencies=hygenTasks)

    mergeStatsLabel = preJoin(taskPrefix,"mergeEdgeStats")
    mergeStatsCmd = [ self.params.mantaStatsMergeBin,
                      "--stats-file-list", statsFileList,
                      "--output-file", self.paths.getFinalEdgeStatsPath(),
                      "--report-file", self.paths.getFinalEdgeStatsReportPath() ]
    self.addTask(mergeStatsLabel, mergeStatsCmd, dependencies=statsListTask, isForceLocal=True)

    if not self.params.isRetainTempFiles :
        # Intentionally a no-op: the temp hygenDir directory could be removed here, but it is
        # used for debug so frequently that removing it doesn't seem worth it at present.
        pass

    return nextStepWait