Esempio n. 1
0
    def validateAndSanitizeExistingOptions(self, options):
        """
        Normalize and validate the option values which have already been set.

        Rewrites runDir (and callRegionsBed, when given) as absolute paths,
        lower-cases alignerMode, and derives options.genomeRegionList from
        options.regionStrList. Checks guarded by an "is not None" test are
        skipped for unset options.

        Raises OptParseException for an aligner mode outside of
        self.validAlignerModes or a missing reference fasta index file.
        """
        options.runDir = os.path.abspath(options.runDir)

        # aligner mode is matched against the valid modes in lower case
        alignerMode = options.alignerMode
        if alignerMode is not None:
            alignerMode = alignerMode.lower()
            options.alignerMode = alignerMode
            if alignerMode not in self.validAlignerModes:
                raise OptParseException("Invalid aligner mode: '%s'" %
                                        alignerMode)

        options.referenceFasta = validateFixExistingFileArg(
            options.referenceFasta, "reference")

        # the fasta index is expected next to the reference, with a ".fai" suffix
        referenceFasta = options.referenceFasta
        if referenceFasta is not None:
            fastaIndexPath = referenceFasta + ".fai"
            if not os.path.isfile(fastaIndexPath):
                raise OptParseException(
                    "Can't find expected fasta index file: '%s'" %
                    (fastaIndexPath))

        # the call regions bed file is checked together with its tabix index
        if options.callRegionsBed is not None:
            callRegionsPath = os.path.abspath(options.callRegionsBed)
            options.callRegionsBed = callRegionsPath
            checkTabixIndexedFile(callRegionsPath, "call-regions bed")

        # parse every region argument; no arguments means no region restriction
        regionArgs = options.regionStrList
        if (regionArgs is not None) and (len(regionArgs) > 0):
            parsedRegions = []
            for regionArg in regionArgs:
                parsedRegions.append(parseGenomeRegion(regionArg))
            options.genomeRegionList = parsedRegions
        else:
            options.genomeRegionList = None
Esempio n. 2
0
    def validateAndSanitizeOptions(self, options):
        """
        Validate the complete option set and normalize path-like options.

        Requires a run directory and a reference fasta file, rejects a run
        directory which already holds the workflow script, checks for the
        fasta index, and verifies that every chromosome named by the call
        regions bed file or by a region argument is present in the reference.

        Raises OptParseException on any failed check.
        """
        assertOptionExists(options.runDir, "run directory")
        options.runDir = os.path.abspath(options.runDir)

        # refuse to configure a second analysis into the same run directory
        existingScript = os.path.join(options.runDir,
                                      options.workflowScriptName)
        if os.path.exists(existingScript):
            raise OptParseException(
                "Run directory already contains workflow script file '%s'. Each analysis must be configured in a separate directory."
                % (existingScript))

        # the reference fasta must be specified and pass the file-argument check
        assertOptionExists(options.referenceFasta, "reference fasta file")
        options.referenceFasta = validateFixExistingFileArg(
            options.referenceFasta, "reference")

        # the fasta index is expected next to the reference, with a ".fai" suffix
        fastaIndexPath = options.referenceFasta + ".fai"
        if not os.path.isfile(fastaIndexPath):
            raise OptParseException(
                "Can't find expected fasta index file: '%s'" %
                (fastaIndexPath))

        # the call regions bed file is checked together with its tabix index
        options.callRegionsBed = checkFixTabixIndexedFileOption(
            options.callRegionsBed, "call-regions bed")

        regionArgs = options.regionStrList
        if (regionArgs is not None) and (len(regionArgs) > 0):
            options.genomeRegionList = [
                parseGenomeRegion(regionArg) for regionArg in regionArgs
            ]
        else:
            options.genomeRegionList = None

        # chromosome names only need validation when some region source is present
        hasCallRegions = options.callRegionsBed is not None
        hasGenomeRegions = options.genomeRegionList is not None
        if not (hasCallRegions or hasGenomeRegions):
            return

        refChromInfo = getFastaInfo(options.referenceFasta)

        if hasCallRegions:
            bedChroms = getTabixChromSet(options.tabixBin,
                                         options.callRegionsBed)
            for chrom in bedChroms:
                if chrom in refChromInfo: continue
                raise OptParseException(
                    "Chromosome label '%s', in call regions bed file '%s', not found in reference genome."
                    % (chrom, options.callRegionsBed))

        if hasGenomeRegions:
            # region arguments and parsed regions correspond one-to-one
            for (regionArg, genomeRegion) in zip(options.regionStrList,
                                                 options.genomeRegionList):
                chrom = genomeRegion["chrom"]
                if chrom in refChromInfo: continue
                raise OptParseException(
                    "Chromosome label '%s', parsed from region argument '%s', not found in reference genome."
                    % (chrom, regionArg))
Esempio n. 3
0
    def validateAndSanitizeExistingOptions(self, options):
        """
        Normalize and validate the option values which have already been set.

        Makes runDir absolute, checks the reference fasta and its index,
        passes the candidate indel and forced genotype vcf lists through
        checkFixTabixListOption, and derives options.genomeRegionList from
        options.regionStrList. A single region argument may hold several
        regions joined by '+'.

        Raises OptParseException when the fasta index file is missing.
        """
        options.runDir = os.path.abspath(options.runDir)

        options.referenceFasta = validateFixExistingFileArg(
            options.referenceFasta, "reference")

        # the fasta index is expected next to the reference, with a ".fai" suffix
        referenceFasta = options.referenceFasta
        if referenceFasta is not None:
            fastaIndexPath = referenceFasta + ".fai"
            if not os.path.isfile(fastaIndexPath):
                raise OptParseException(
                    "Can't find expected fasta index file: '%s'" %
                    (fastaIndexPath))

        checkFixTabixListOption(options.indelCandidatesList,
                                "candidate indel vcf")
        checkFixTabixListOption(options.forcedGTList, "forced genotype vcf")

        regionArgs = options.regionStrList
        if (regionArgs is not None) and (len(regionArgs) > 0):
            parsedRegions = []
            for regionArg in regionArgs:
                # each '+'-separated piece is parsed as its own region
                for regionPiece in regionArg.split("+"):
                    parsedRegions.append(parseGenomeRegion(regionPiece))
            options.genomeRegionList = parsedRegions
        else:
            options.genomeRegionList = None
Esempio n. 4
0
    def validateAndSanitizeOptions(self, options):
        """
        Validate the complete option set and normalize path-like options.

        Requires a run directory and reference fasta file, checks for the
        fasta index, disables sequence error estimation (with a warning on
        stderr) when the reference is too small or too fragmented, validates
        the tabix-indexed inputs, parses the region arguments and validates
        the scoring model files.

        Raises OptParseException when the fasta index file is missing.
        """
        assertOptionExists(options.runDir, "run directory")
        options.runDir = os.path.abspath(options.runDir)

        assertOptionExists(options.referenceFasta, "reference fasta file")
        options.referenceFasta = validateFixExistingFileArg(
            options.referenceFasta, "reference fasta file")

        # the fasta index is expected next to the reference, with a ".fai" suffix
        fastaIndexPath = options.referenceFasta + ".fai"
        if not os.path.isfile(fastaIndexPath):
            raise OptParseException(
                "Can't find expected fasta index file: '%s'" %
                (fastaIndexPath))

        if options.isEstimateSequenceError:
            # Dynamic error estimation is only feasible on a large enough reference:
            # - only contigs of at least errorEstimationMinChromMb megabases count
            # - the counted contigs must sum to at least errorEstimationMinTotalMb megabases
            basesPerMegabase = 1000000
            minChromSize = options.errorEstimationMinChromMb * basesPerMegabase
            minTotalSize = options.errorEstimationMinTotalMb * basesPerMegabase

            # contig sizes come from the fasta index
            (_, chromSizes) = getFastaChromOrderSize(fastaIndexPath)

            usableSize = sum(contigSize for contigSize in chromSizes.values()
                             if contigSize >= minChromSize)

            if usableSize < minTotalSize:
                sys.stderr.write(
                    "WARNING: Cannot estimate sequence errors from data due to small or overly fragmented reference sequence. Sequence error estimation disabled.\n"
                )
                options.isEstimateSequenceError = False

        checkFixTabixListOption(options.indelCandidatesList,
                                "candidate indel vcf")
        checkFixTabixListOption(options.forcedGTList, "forced genotype vcf")
        options.callRegionsBed = checkFixTabixIndexedFileOption(
            options.callRegionsBed, "call-regions bed")

        regionArgs = options.regionStrList
        if (regionArgs is not None) and (len(regionArgs) > 0):
            parsedRegions = []
            for regionArg in regionArgs:
                # a single argument may hold several regions joined by '+'
                for regionPiece in regionArg.split("+"):
                    parsedRegions.append(parseGenomeRegion(regionPiece))
            options.genomeRegionList = parsedRegions
        else:
            options.genomeRegionList = None

        options.snvScoringModelFile = validateFixExistingFileArg(
            options.snvScoringModelFile, "SNV empirical scoring model file")
        options.indelScoringModelFile = validateFixExistingFileArg(
            options.indelScoringModelFile,
            "Indel empirical scoring model file")
 def singleAppender(bamList, label):
     """
     Register at most one alignment file for the given sample label.

     Does nothing when bamList is None. Raises OptParseException when the
     list holds more than one entry, otherwise hands the list to
     bamSetChecker.appendBams.
     """
     if bamList is not None:
         if len(bamList) > 1:
             raise OptParseException(
                 "More than one %s sample BAM/CRAM files specified" %
                 (label))
         bamSetChecker.appendBams(bamList, label)
    def validateOptionExistence(self, options):
        """
        Check that the required sample alignment files have been specified.

        Exactly one proband and exactly two parent BAM/CRAM files are
        required; sibling files are optional. All samples are then handed to
        a BamSetChecker together with the reference fasta.

        Raises OptParseException when the proband or parent file count is
        wrong, including when the corresponding option was never set.
        """
        StrelkaSharedWorkflowOptionsBase.validateOptionExistence(self, options)

        def safeLen(x):
            # an option which was never set is None rather than an empty list
            if x is None: return 0
            return len(x)

        if safeLen(options.probandBamList) != 1:
            raise OptParseException(
                "Must specify one proband sample BAM/CRAM file")

        if safeLen(options.parentBamList) != 2:
            raise OptParseException(
                "Must specify two parent sample BAM/CRAM files")

        bcheck = BamSetChecker()
        bcheck.appendBams(options.probandBamList, "proband")
        bcheck.appendBams(options.parentBamList, "parent")
        bcheck.appendBams(options.siblingBamList, "sibling", isAllowEmpty=True)
        bcheck.check(options.htsfileBin, options.referenceFasta)
Esempio n. 7
0
    def validateAndSanitizeOptions(self, options):
        """
        Validate the sample configuration after the shared base-class checks.

        Enforces the allowed normal/tumor sample combinations and the RNA
        mode constraints, validates the optional existing alignment stats
        file, grooms both BAM lists and runs the BamSetChecker over all
        specified samples.

        Raises OptParseException for any unsupported sample combination.
        """
        MantaWorkflowOptionsBase.validateAndSanitizeOptions(self, options)

        def countOf(bamList):
            # an unset list option counts as empty
            return 0 if bamList is None else len(bamList)

        normalCount = countOf(options.normalBamList)
        tumorCount = countOf(options.tumorBamList)

        if (normalCount == 0) and (tumorCount == 0):
            raise OptParseException(
                "No normal or tumor sample alignment files specified")

        if tumorCount > 1:
            raise OptParseException("Can't accept more then one tumor sample")

        if (tumorCount > 0) and (normalCount > 1):
            raise OptParseException(
                "Can't accept multiple normal samples for tumor subtraction")

        if options.isRNA:
            if (normalCount != 1) or (tumorCount != 0):
                raise OptParseException(
                    "RNA mode currently requires exactly one normal sample")
        elif options.isUnstrandedRNA:
            raise OptParseException(
                "Unstranded only applied for RNA inputs")

        if options.existingAlignStatsFile is not None:
            options.existingAlignStatsFile = validateFixExistingFileArg(
                options.existingAlignStatsFile, "existing align stats")

        groomBamList(options.normalBamList, "normal sample")
        groomBamList(options.tumorBamList, "tumor sample")

        # only non-empty sample lists are registered with the checker
        bamSetChecker = BamSetChecker()
        for (sampleBams, sampleLabel) in ((options.normalBamList, "Normal"),
                                          (options.tumorBamList, "Tumor")):
            if countOf(sampleBams) > 0:
                bamSetChecker.appendBams(sampleBams, sampleLabel)
        bamSetChecker.check(options.htsfileBin, options.referenceFasta)
Esempio n. 8
0
    def validateAndSanitizeExistingOptions(self, options):
        """
        Normalize and validate the option values which have already been set.

        Every entry of the normal and tumor BAM lists is replaced in place by
        the result of validateFixExistingFileArg and must have a ".bai" index
        file beside it. The aligner mode, the reference fasta and its index
        are validated, the region arguments are parsed, and finally the
        base-class validation is run.

        Raises OptParseException for a missing BAM index, an invalid aligner
        mode or a missing fasta index file.
        """
        def groomBamList(bamList, sampleLabel):
            # unset list options are skipped entirely
            if bamList is None: return
            fileLabel = "%s BAM file" % (sampleLabel)
            for (position, rawPath) in enumerate(bamList):
                fixedPath = validateFixExistingFileArg(rawPath, fileLabel)
                bamList[position] = fixedPath
                # the index is expected at the BAM path plus ".bai"
                indexPath = fixedPath + ".bai"
                if not os.path.isfile(indexPath):
                    raise OptParseException(
                        "Can't find expected BAM index file: '%s'" %
                        (indexPath))

        groomBamList(options.normalBamList, "normal sample")
        groomBamList(options.tumorBamList, "tumor sample")

        # aligner mode is matched against the valid modes in lower case
        alignerMode = options.alignerMode
        if alignerMode is not None:
            alignerMode = alignerMode.lower()
            options.alignerMode = alignerMode
            if alignerMode not in self.validAlignerModes:
                raise OptParseException("Invalid aligner mode: '%s'" %
                                        alignerMode)

        options.referenceFasta = validateFixExistingFileArg(
            options.referenceFasta, "reference")

        # the fasta index is expected next to the reference, with a ".fai" suffix
        referenceFasta = options.referenceFasta
        if referenceFasta is not None:
            fastaIndexPath = referenceFasta + ".fai"
            if not os.path.isfile(fastaIndexPath):
                raise OptParseException(
                    "Can't find expected fasta index file: '%s'" %
                    (fastaIndexPath))

        regionArgs = options.regionStrList
        if (regionArgs is not None) and (len(regionArgs) > 0):
            parsedRegions = []
            for regionArg in regionArgs:
                parsedRegions.append(parseGenomeRegion(regionArg))
            options.genomeRegionList = parsedRegions
        else:
            options.genomeRegionList = None

        MantaWorkflowOptionsBase.validateAndSanitizeExistingOptions(
            self, options)
Esempio n. 9
0
    def validateOptionExistence(self,options) :
        """
        Check that at least one normal or tumor sample BAM file was given.

        Both sample lists are handed to a BamSetChecker together with the
        reference fasta before the base-class existence checks are run.

        Raises OptParseException when neither list holds any file.
        """
        def isMissing(bamList) :
            # an unset list option counts as empty
            return (bamList is None) or (len(bamList) == 0)

        if isMissing(options.normalBamList) and isMissing(options.tumorBamList) :
            raise OptParseException("No normal & tumor sample BAM files specified")

        bamSetChecker = BamSetChecker()
        for (sampleBams, sampleLabel) in ((options.normalBamList,"Normal"),
                                          (options.tumorBamList,"Tumor")) :
            bamSetChecker.appendBams(sampleBams,sampleLabel)
        bamSetChecker.check(options.samtoolsBin,
                            options.referenceFasta)

        MantaWorkflowOptionsBase.validateOptionExistence(self,options)
Esempio n. 10
0
    def validateOptionExistence(self, options):
        """
        Check that all required options are present and the BAM set is sane.

        Requires at least one normal sample BAM, an aligner mode and a
        reference fasta file, runs the base-class existence checks, then
        passes every normal and tumor BAM with its sample label to
        checkChromSet and rejects BAM files listed more than once.

        Raises OptParseException when no normal BAM is given or when a BAM
        file is repeated.
        """
        normalBams = options.normalBamList
        if (normalBams is None) or (len(normalBams) == 0):
            raise OptParseException("No normal sample BAM files specified")

        assertOptionExists(options.alignerMode, "aligner mode")
        assertOptionExists(options.referenceFasta, "reference fasta file")

        MantaWorkflowOptionsBase.validateOptionExistence(self, options)

        # collect every input BAM with its sample label, normal samples first,
        # for the chromosome set check against the reference
        bamList = []
        bamLabels = []
        for (sampleBams, sampleLabel) in ((options.normalBamList, "Normal"),
                                          (options.tumorBamList, "Tumor")):
            if sampleBams is None: continue
            for sampleBam in sampleBams:
                bamList.append(sampleBam)
                bamLabels.append(sampleLabel)

        checkChromSet(options.samtoolsBin,
                      options.referenceFasta,
                      bamList,
                      bamLabels,
                      isReferenceLocked=True)

        # reject the first BAM file which has already been seen
        seenBams = set()
        for bamFile in bamList:
            if bamFile in seenBams:
                raise OptParseException("Repeated input BAM file: %s" %
                                        (bamFile))
            seenBams.add(bamFile)
Esempio n. 11
0
    def validateOptionExistence(self, options):
        """
        Check that a supported combination of sample files was specified.

        At least one normal or tumor alignment file is required, at most one
        tumor sample is accepted, and a tumor sample may be combined with at
        most one normal sample. Both lists are then handed to a BamSetChecker
        before the base-class existence checks are run.

        Raises OptParseException for any unsupported combination.
        """
        def countOf(bamList):
            # an unset list option counts as empty
            return 0 if bamList is None else len(bamList)

        normalCount = countOf(options.normalBamList)
        tumorCount = countOf(options.tumorBamList)

        if (normalCount == 0) and (tumorCount == 0):
            raise OptParseException(
                "No normal or tumor sample alignment files specified")

        if tumorCount > 1:
            raise OptParseException("Can't accept more then one tumor sample")

        if (tumorCount > 0) and (normalCount > 1):
            raise OptParseException(
                "Can't accept multiple normal samples for tumor subtraction")

        bamSetChecker = BamSetChecker()
        for (sampleBams, sampleLabel) in ((options.normalBamList, "Normal"),
                                          (options.tumorBamList, "Tumor")):
            bamSetChecker.appendBams(sampleBams, sampleLabel)
        bamSetChecker.check(options.htsfileBin, options.referenceFasta)

        MantaWorkflowOptionsBase.validateOptionExistence(self, options)
Esempio n. 12
0
    def validateAndSanitizeOptions(self, options):
        """
        Validate the workflow-specific options after the shared checks.

        Validates the ploidy and no-compress bed files, fills in the default
        SNV and indel scoring model files (RNA or germline) when none was
        given, switches off sequence error estimation for exome and RNA
        inputs, grooms the input BAM list and runs the BamSetChecker.

        Raises OptParseException when no input alignment file is specified.
        """
        StrelkaSharedWorkflowOptionsBase.validateAndSanitizeOptions(
            self, options)

        options.ploidyFilename = checkFixTabixIndexedFileOption(
            options.ploidyFilename, "ploidy file")
        options.noCompressBed = checkFixTabixIndexedFileOption(
            options.noCompressBed, "no-compress bed")

        # fall back to the RNA or germline model when no model file was given
        if options.snvScoringModelFile is None:
            options.snvScoringModelFile = (
                options.rnaSnvScoringModelFile if options.isRNA
                else options.germlineSnvScoringModelFile)

        if options.indelScoringModelFile is None:
            options.indelScoringModelFile = (
                options.rnaIndelScoringModelFile if options.isRNA
                else options.germlineIndelScoringModelFile)

        # dynamic error estimation is disabled for both exome and RNA inputs
        if options.isExome or options.isRNA:
            options.isEstimateSequenceError = False

        groomBamList(options.bamList, "input")

        inputBams = options.bamList
        if (inputBams is None) or (len(inputBams) == 0):
            raise OptParseException(
                "No input sample alignment files specified")

        checker = BamSetChecker()
        checker.appendBams(options.bamList, "Input")
        checker.check(options.htsfileBin, options.referenceFasta)
Esempio n. 13
0
    def validateAndSanitizeOptions(self, options):
        """
        Validate the complete option set and normalize path-like options.

        Requires a run directory and reference fasta file, rejects a run
        directory which already holds the workflow script, checks for the
        fasta index, disables sequence error estimation (with a warning on
        stderr) when the reference is too small or too fragmented, validates
        the tabix-indexed inputs, parses the region arguments, verifies that
        all referenced chromosomes exist in the reference, and validates the
        scoring model files.

        Raises OptParseException on any failed check.
        """
        assertOptionExists(options.runDir, "run directory")
        options.runDir = os.path.abspath(options.runDir)

        # refuse to configure a second analysis into the same run directory
        existingScript = os.path.join(options.runDir,
                                      options.workflowScriptName)
        if os.path.exists(existingScript):
            raise OptParseException(
                "Run directory already contains workflow script file '%s'. Each analysis must be configured in a separate directory."
                % (existingScript))

        assertOptionExists(options.referenceFasta, "reference fasta file")
        options.referenceFasta = validateFixExistingFileArg(
            options.referenceFasta, "reference fasta file")

        # the fasta index is expected next to the reference, with a ".fai" suffix
        fastaIndexPath = options.referenceFasta + ".fai"
        if not os.path.isfile(fastaIndexPath):
            raise OptParseException(
                "Can't find expected fasta index file: '%s'" %
                (fastaIndexPath))

        if options.isEstimateSequenceError:
            # Dynamic error estimation is only feasible on a large enough reference:
            # - only contigs of at least errorEstimationMinChromMb megabases count
            # - the counted contigs must sum to at least errorEstimationMinTotalMb megabases
            basesPerMegabase = 1000000
            minChromSize = options.errorEstimationMinChromMb * basesPerMegabase
            minTotalSize = options.errorEstimationMinTotalMb * basesPerMegabase

            # contig sizes come from the fasta index
            (_, chromSizes) = getFastaChromOrderSize(fastaIndexPath)

            usableSize = sum(contigSize for contigSize in chromSizes.values()
                             if contigSize >= minChromSize)

            if usableSize < minTotalSize:
                sys.stderr.write(
                    "WARNING: Cannot estimate sequence errors from data due to small or overly fragmented reference sequence. Sequence error estimation disabled.\n"
                )
                options.isEstimateSequenceError = False

        checkFixTabixListOption(options.indelCandidatesList,
                                "candidate indel vcf")
        checkFixTabixListOption(options.forcedGTList, "forced genotype vcf")
        options.callRegionsBed = checkFixTabixIndexedFileOption(
            options.callRegionsBed, "call-regions bed")

        def iterRegionArgs():
            """
            Yield every individual region string, expanding the (intentionally undocumented/possibly deprecated)
            '+' entry format which packs multiple regions into a single argument.
            """
            for regionArg in options.regionStrList:
                for regionPiece in regionArg.split("+"):
                    yield regionPiece

        regionArgs = options.regionStrList
        if (regionArgs is not None) and (len(regionArgs) > 0):
            options.genomeRegionList = [
                parseGenomeRegion(regionPiece)
                for regionPiece in iterRegionArgs()
            ]
        else:
            options.genomeRegionList = None

        # chromosome names only need validation when some region source is present
        hasCallRegions = options.callRegionsBed is not None
        hasGenomeRegions = options.genomeRegionList is not None
        if hasCallRegions or hasGenomeRegions:
            refChromInfo = getFastaInfo(options.referenceFasta)

            if hasCallRegions:
                bedChroms = getTabixChromSet(options.tabixBin,
                                             options.callRegionsBed)
                for chrom in bedChroms:
                    if chrom in refChromInfo: continue
                    raise OptParseException(
                        "Chromosome label '%s', in call regions bed file '%s', not found in reference genome."
                        % (chrom, options.callRegionsBed))

            if hasGenomeRegions:
                for (regionIndex,
                     genomeRegion) in enumerate(options.genomeRegionList):
                    chrom = genomeRegion["chrom"]
                    if chrom in refChromInfo: continue
                    # report the expanded region string the bad label came from
                    expandedArgs = list(iterRegionArgs())
                    raise OptParseException(
                        "Chromosome label '%s', parsed from region argument '%s', not found in reference genome."
                        % (chrom, expandedArgs[regionIndex]))

        options.snvScoringModelFile = validateFixExistingFileArg(
            options.snvScoringModelFile, "SNV empirical scoring model file")
        options.indelScoringModelFile = validateFixExistingFileArg(
            options.indelScoringModelFile,
            "Indel empirical scoring model file")
Esempio n. 14
0
    def getRunOptions(self, primary_section, version=None, configHelp=None):
        """
        Primary client code interface to the finished product.
        Do not override this method.

        Option values are layered in increasing order of precedence: the
        defaults hard coded in python, the 'global' ini file found beside
        the configure script, the user config file (if one is given) and
        finally the command line.

        Returns a tuple of (1) the options object holding all of the primary
        run options gathered from the primary section of the ini file and
        the command line, and (2) an inifile hash-of-hashes reflecting all
        sections of the ini file.

        Any OptParseException raised while processing the options is passed,
        together with the parser, to noArgOrError.
        """
        def updateIniSections(data, newData):
            # merge newData into data section by section; values in newData win
            for k in newData.keys():
                if k not in data: data[k] = {}
                for kk in newData[k].keys():
                    data[k][kk] = newData[k][kk]

        # first level of options are those hard coded into the python code as defaults,
        # these have the lowest precedence:
        #
        iniSections = {primary_section: self.getOptionDefaults()}

        # next is the 'global' ini file, in the same directory as the configure
        # script:
        realArg0 = os.path.realpath(sys.argv[0])
        cmdlineScriptName = os.path.basename(realArg0)
        configFileName = cmdlineScriptName + ".ini"

        cmdlineScriptDir = os.path.abspath(os.path.dirname(realArg0))
        globalConfigPath = os.path.join(cmdlineScriptDir, configFileName)
        updateIniSections(iniSections, getIniSections(globalConfigPath))

        parser = self._getOptionParser(iniSections[primary_section],
                                       configFileName,
                                       cmdlineScriptDir,
                                       version=version,
                                       configHelp=configHelp)
        (options, args) = parser.parse_args()

        try:
            if options.userConfigPath:
                if not os.path.isfile(options.userConfigPath):
                    raise OptParseException("Can't find config file: '%s'" %
                                            (options.userConfigPath))

                updateIniSections(iniSections,
                                  getIniSections(options.userConfigPath))

                # reparse with updated default values:
                parser = self._getOptionParser(iniSections[primary_section],
                                               configFileName,
                                               cmdlineScriptDir,
                                               version=version,
                                               configHelp=configHelp)
                (options, args) = parser.parse_args()
            else:
                # without a user config file the global config file is mandatory
                if not os.path.isfile(globalConfigPath):
                    raise OptParseException(
                        "Can't find default config file: '%s'" %
                        (globalConfigPath))

            if options.isAllHelp:
                # this second call to getOptionParser is only here to provide the extended help option:
                parser = self._getOptionParser(iniSections[primary_section],
                                               configFileName,
                                               cmdlineScriptDir,
                                               True,
                                               version=version,
                                               configHelp=configHelp)
                parser.print_help()
                sys.exit(2)

            # positional arguments are not accepted
            nargs = len(args)
            if nargs:
                plural = ""
                if nargs > 1: plural = "s"
                raise OptParseException("%i unrecognized argument%s:\n%s" %
                                        (nargs, plural, "\n".join(
                                            ["'" + arg + "'"
                                             for arg in args])))

            self.validateAndSanitizeOptions(options)

            # write options object back into full iniSections object:
            #
            # items() is used instead of iteritems() so that this loop runs on
            # both python 2 and python 3
            for k, v in vars(options).items():
                if k == "isAllHelp": continue
                iniSections[primary_section][k] = v

        except OptParseException as e:
            noArgOrError(parser, str(e))

        return options, iniSections
 def checkRequired(bamList,label):
     """
     Raise OptParseException unless bamList holds at least one entry.
     """
     hasBams = (bamList is not None) and (len(bamList) != 0)
     if not hasBams :
         raise OptParseException("No %s sample BAM/CRAM files specified" % (label))
Esempio n. 16
0
 def checkForBamIndex(bamFile):
     """
     Raise OptParseException unless the file bamFile + ".bai" exists.
     """
     indexPath = bamFile + ".bai"
     if os.path.isfile(indexPath):
         return
     raise OptParseException(
         "Can't find expected BAM index file: '%s'" % (indexPath))