Example #1
0
    def validateAndSanitizeOptions(self, options):
        """
        Check the supplied options object and normalize its values in place.

        - runDir and referenceFasta are required; runDir is made absolute and
          referenceFasta is passed through validateFixExistingFileArg.
        - A '.fai' index must exist next to the reference fasta.
        - Sequence error estimation is switched off (with a warning on stderr)
          when the reference does not provide enough sequence, as configured
          by errorEstimationMinChromMb / errorEstimationMinTotalMb.
        - regionStrList entries are split on '+' and parsed into
          options.genomeRegionList (None when no regions were given).

        Raises OptParseException if the fasta index file cannot be found.
        """

        assertOptionExists(options.runDir, "run directory")
        options.runDir = os.path.abspath(options.runDir)

        assertOptionExists(options.referenceFasta, "reference fasta file")
        options.referenceFasta = validateFixExistingFileArg(
            options.referenceFasta, "reference fasta file")

        # the fasta index is expected to sit right next to the fasta file
        fastaIndexPath = options.referenceFasta + ".fai"
        if not os.path.isfile(fastaIndexPath):
            raise OptParseException(
                "Can't find expected fasta index file: '%s'" % (fastaIndexPath))

        if options.isEstimateSequenceError:
            # Dynamic error estimation is only feasible when the reference is
            # large enough: only contigs of at least minChromSize bases count,
            # and together they must reach at least minTotalSize bases.
            megabase = 1000000
            minChromSize = options.errorEstimationMinChromMb * megabase
            minTotalSize = options.errorEstimationMinTotalMb * megabase

            # contig sizes come from the fasta index
            (_, contigSizes) = getFastaChromOrderSize(fastaIndexPath)

            usableSize = sum(size for size in contigSizes.values()
                             if size >= minChromSize)

            if usableSize < minTotalSize:
                sys.stderr.write(
                    "WARNING: Cannot estimate sequence errors from data due to small or overly fragmented reference sequence. Sequence error estimation disabled.\n"
                )
                options.isEstimateSequenceError = False

        checkFixTabixListOption(options.indelCandidatesList,
                                "candidate indel vcf")
        checkFixTabixListOption(options.forcedGTList, "forced genotype vcf")
        options.callRegionsBed = checkFixTabixIndexedFileOption(
            options.callRegionsBed, "call-regions bed")

        if (options.regionStrList is not None) and (len(options.regionStrList)
                                                    != 0):
            # a single region argument may hold several regions joined by '+'
            parsedRegions = []
            for regionArg in options.regionStrList:
                for regionStr in regionArg.split("+"):
                    parsedRegions.append(parseGenomeRegion(regionStr))
            options.genomeRegionList = parsedRegions
        else:
            options.genomeRegionList = None

        options.snvScoringModelFile = validateFixExistingFileArg(
            options.snvScoringModelFile, "SNV empirical scoring model file")
        options.indelScoringModelFile = validateFixExistingFileArg(
            options.indelScoringModelFile,
            "Indel empirical scoring model file")
Example #2
0
    def __init__(self, params):
        """
        Store the run parameters and prepare the run directory layout.

        The params object is kept on self.params and updated in place:
        selected boolean options are normalized, missing bam lists become
        empty lists, and the run/work/results directory paths are derived
        from params.runDir (each one is passed to ensureDir). The reference
        fasta and its '.fai' index are checked, the chromosome order/sizes
        are read from the index, and the call regions are computed.

        Raises Exception if params.referenceFasta is None.
        """

        cleanPyEnv()

        self.params = params

        # normalize boolean option input:
        safeSetBool(self.params, "enableRemoteReadRetrievalForInsertionsInGermlineCallingModes")
        safeSetBool(self.params, "enableRemoteReadRetrievalForInsertionsInCancerCallingModes")
        safeSetBool(self.params, "useOverlapPairEvidence")

        # Use RNA option for minCandidate size
        if self.params.isRNA:
            self.params.minCandidateVariantSize = self.params.rnaMinCandidateVariantSize

        # format bam lists:
        if self.params.normalBamList is None:
            self.params.normalBamList = []
        if self.params.tumorBamList is None:
            self.params.tumorBamList = []

        # make sure run directory is setup:
        self.params.runDir = os.path.abspath(self.params.runDir)
        ensureDir(self.params.runDir)

        # everything that's not intended to be a final result should dump directories/files in workDir
        self.params.workDir = os.path.join(self.params.runDir, "workspace")
        ensureDir(self.params.workDir)

        # all finalized pretty results get transferred to resultsDir
        self.params.resultsDir = os.path.join(self.params.runDir, "results")
        ensureDir(self.params.resultsDir)
        self.params.statsDir = os.path.join(self.params.resultsDir, "stats")
        ensureDir(self.params.statsDir)
        self.params.variantsDir = os.path.join(self.params.resultsDir, "variants")
        ensureDir(self.params.variantsDir)
        self.params.evidenceDir = os.path.join(self.params.resultsDir, "evidence")
        ensureDir(self.params.evidenceDir)
        #         self.params.reportsDir=os.path.join(self.params.resultsDir,"reports")
        #         ensureDir(self.params.reportsDir)

        # The None test has to come before the index path is built: appending
        # ".fai" to None would raise a TypeError and hide the intended message.
        if self.params.referenceFasta is None:
            raise Exception("No reference fasta defined.")

        indexRefFasta = self.params.referenceFasta + ".fai"
        checkFile(self.params.referenceFasta, "reference fasta")
        checkFile(indexRefFasta, "reference fasta index")

        # read fasta index
        (self.params.chromOrder, self.params.chromSizes) = getFastaChromOrderSize(indexRefFasta)
        # determine subset of chroms where we can skip calling entirely
        (self.params.callRegionList, self.params.chromIsSkipped) = getCallRegions(self.params)

        self.paths = PathInfo(self.params)

        self.params.isHighDepthFilter = (not (self.params.isExome or self.params.isRNA))
        self.params.isIgnoreAnomProperPair = (self.params.isRNA)

        # always use overlapping pairs for RNA calling
        if self.params.isRNA:
            self.params.useOverlapPairEvidence = True
Example #3
0
    def __init__(self, params, iniSections):
        """
        Store the run parameters and prepare the run directory layout.

        params is kept on self.params and updated in place; iniSections is
        stored unchanged on self.iniSections. Missing bam lists become empty
        lists, and the run/work/results directory paths are derived from
        params.runDir (each one is passed to ensureDir). The reference fasta
        and its '.fai' index are checked, the chromosome order/sizes are read
        from the index, and the call regions are computed.

        Raises Exception if params.referenceFasta is None.
        """

        cleanPyEnv()

        self.params = params
        self.iniSections = iniSections

        # Use RNA option for minCandidate size
        if self.params.isRNA:
            self.params.minCandidateVariantSize = self.params.rnaMinCandidateVariantSize

        # format bam lists:
        if self.params.normalBamList is None:
            self.params.normalBamList = []
        if self.params.tumorBamList is None:
            self.params.tumorBamList = []

        # make sure run directory is setup:
        self.params.runDir = os.path.abspath(self.params.runDir)
        ensureDir(self.params.runDir)

        # everything that's not intended to be a final result should dump directories/files in workDir
        self.params.workDir = os.path.join(self.params.runDir, "workspace")
        ensureDir(self.params.workDir)

        # all finalized pretty results get transferred to resultsDir
        self.params.resultsDir = os.path.join(self.params.runDir, "results")
        ensureDir(self.params.resultsDir)
        self.params.statsDir = os.path.join(self.params.resultsDir, "stats")
        ensureDir(self.params.statsDir)
        self.params.variantsDir = os.path.join(self.params.resultsDir,
                                               "variants")
        ensureDir(self.params.variantsDir)
        self.params.evidenceDir = os.path.join(self.params.resultsDir,
                                               "evidence")
        ensureDir(self.params.evidenceDir)
        #         self.params.reportsDir=os.path.join(self.params.resultsDir,"reports")
        #         ensureDir(self.params.reportsDir)

        # The None test has to come before the index path is built: appending
        # ".fai" to None would raise a TypeError and hide the intended message.
        if self.params.referenceFasta is None:
            raise Exception("No reference fasta defined.")

        indexRefFasta = self.params.referenceFasta + ".fai"
        checkFile(self.params.referenceFasta, "reference fasta")
        checkFile(indexRefFasta, "reference fasta index")

        # read fasta index
        (self.params.chromOrder,
         self.params.chromSizes) = getFastaChromOrderSize(indexRefFasta)
        # determine subset of chroms where we can skip calling entirely
        (self.params.callRegionList,
         self.params.chromIsSkipped) = getCallRegions(self.params)

        self.paths = PathInfo(self.params)

        self.params.isHighDepthFilter = (not (self.params.isExome
                                              or self.params.isRNA))
        self.params.isIgnoreAnomProperPair = (self.params.isRNA)
Example #4
0
    def __init__(self, params, iniSections):
        """
        Store the run parameters and prepare the run directory layout.

        PYTHONPATH and PYTHONHOME are removed from os.environ first. params is
        kept on self.params and updated in place; iniSections is stored
        unchanged on self.iniSections. The run/work/results directory paths
        are derived from params.runDir (each one is passed to ensureDir), the
        reference fasta and its '.fai' index are checked, the bam lists are
        built from params.normalBam / params.tumorBam, and the chromosome
        order/sizes are read from the fasta index.

        Raises Exception if params.referenceFasta is None.
        """

        # clear out some potentially destabilizing env variables:
        clearList = ["PYTHONPATH", "PYTHONHOME"]
        for key in clearList:
            if key in os.environ:
                del os.environ[key]

        self.params = params
        self.iniSections = iniSections

        # make sure run directory is setup:
        self.params.runDir = os.path.abspath(self.params.runDir)
        ensureDir(self.params.runDir)

        # everything that's not intended to be a final result should dump directories/files in workDir
        self.params.workDir = os.path.join(self.params.runDir, "workspace")
        ensureDir(self.params.workDir)

        # all finalized pretty results get transferred to resultsDir
        self.params.resultsDir = os.path.join(self.params.runDir, "results")
        ensureDir(self.params.resultsDir)
        self.params.statsDir = os.path.join(self.params.resultsDir, "stats")
        ensureDir(self.params.statsDir)
        self.params.variantsDir = os.path.join(self.params.resultsDir, "variants")
        ensureDir(self.params.variantsDir)
        #         self.params.reportsDir=os.path.join(self.params.resultsDir,"reports")
        #         ensureDir(self.params.reportsDir)

        # The None test has to come before the index path is built: appending
        # ".fai" to None would raise a TypeError and hide the intended message.
        if self.params.referenceFasta is None:
            raise Exception("No reference fasta defined.")

        indexRefFasta = self.params.referenceFasta + ".fai"
        checkFile(self.params.referenceFasta, "reference fasta")
        checkFile(indexRefFasta, "reference fasta index")

        # each list holds the corresponding single bam, or is empty if it was not given
        self.params.normalBamList = [
            bam for bam in (self.params.normalBam,) if bam is not None
        ]
        self.params.tumorBamList = [
            bam for bam in (self.params.tumorBam,) if bam is not None
        ]

        # read fasta index
        (self.params.chromOrder, self.params.chromSizes) = getFastaChromOrderSize(indexRefFasta)

        # sanity check some parameter typing:
        self.params.binSize = int(self.params.binSize)
        self.params.nonlocalWorkBins = int(self.params.nonlocalWorkBins)

        self.paths = PathInfo(self.params)
Example #5
0
    def __init__(self, params, iniSections):
        """
        Store the run parameters and prepare the run directory layout.

        PYTHONPATH and PYTHONHOME are removed from os.environ first. params is
        kept on self.params and updated in place; iniSections is stored
        unchanged on self.iniSections. Missing bam lists become empty lists,
        the run/work/results directory paths are derived from params.runDir
        (each one is passed to ensureDir), the reference fasta and its '.fai'
        index are checked, and the chromosome order/sizes are read from the
        index. params.scanSize is derived from params.scanSizeMb.

        Raises Exception if params.referenceFasta is None.
        """

        # clear out some potentially destabilizing env variables:
        clearList = ["PYTHONPATH", "PYTHONHOME"]
        for key in clearList:
            if key in os.environ:
                del os.environ[key]

        self.params = params
        self.iniSections = iniSections

        # format bam lists:
        if self.params.normalBamList is None:
            self.params.normalBamList = []
        if self.params.tumorBamList is None:
            self.params.tumorBamList = []

        # make sure run directory is setup:
        self.params.runDir = os.path.abspath(self.params.runDir)
        ensureDir(self.params.runDir)

        # everything that's not intended to be a final result should dump directories/files in workDir
        self.params.workDir = os.path.join(self.params.runDir, "workspace")
        ensureDir(self.params.workDir)

        # all finalized pretty results get transferred to resultsDir
        self.params.resultsDir = os.path.join(self.params.runDir, "results")
        ensureDir(self.params.resultsDir)
        self.params.statsDir = os.path.join(self.params.resultsDir, "stats")
        ensureDir(self.params.statsDir)
        self.params.variantsDir = os.path.join(self.params.resultsDir, "variants")
        ensureDir(self.params.variantsDir)
        #         self.params.reportsDir=os.path.join(self.params.resultsDir,"reports")
        #         ensureDir(self.params.reportsDir)

        # The None test has to come before the index path is built: appending
        # ".fai" to None would raise a TypeError and hide the intended message.
        if self.params.referenceFasta is None:
            raise Exception("No reference fasta defined.")

        indexRefFasta = self.params.referenceFasta + ".fai"
        checkFile(self.params.referenceFasta, "reference fasta")
        checkFile(indexRefFasta, "reference fasta index")

        # read fasta index
        (self.params.chromOrder, self.params.chromSizes) = getFastaChromOrderSize(indexRefFasta)

        # sanity check some parameter typing:
        MEGABASE = 1000000
        self.params.scanSize = int(self.params.scanSizeMb) * MEGABASE
        self.params.nonlocalWorkBins = int(self.params.nonlocalWorkBins)

        self.paths = PathInfo(self.params)

        self.params.isHighDepthFilter = (not (self.params.isExome or self.params.isRNA))
        self.params.isIgnoreAnomProperPair = (self.params.isRNA)
Example #6
0
    def __init__(self, params, iniSections):
        """
        Store the run parameters and prepare the run directory layout.

        params is kept on self.params and updated in place; iniSections is
        stored unchanged on self.iniSections. Missing bam lists become empty
        lists, the run/work/results directory paths are derived from
        params.runDir (each one is passed to ensureDir), the reference fasta
        and its '.fai' index are checked, and the chromosome order/sizes are
        read from the index.

        Raises Exception if params.referenceFasta is None.
        """

        cleanPyEnv()

        self.params = params
        self.iniSections = iniSections

        # format bam lists:
        if self.params.normalBamList is None:
            self.params.normalBamList = []
        if self.params.tumorBamList is None:
            self.params.tumorBamList = []

        # make sure run directory is setup:
        self.params.runDir = os.path.abspath(self.params.runDir)
        ensureDir(self.params.runDir)

        # everything that's not intended to be a final result should dump directories/files in workDir
        self.params.workDir = os.path.join(self.params.runDir, "workspace")
        ensureDir(self.params.workDir)

        # all finalized pretty results get transferred to resultsDir
        self.params.resultsDir = os.path.join(self.params.runDir, "results")
        ensureDir(self.params.resultsDir)
        self.params.statsDir = os.path.join(self.params.resultsDir, "stats")
        ensureDir(self.params.statsDir)
        self.params.variantsDir = os.path.join(self.params.resultsDir, "variants")
        ensureDir(self.params.variantsDir)
        #         self.params.reportsDir=os.path.join(self.params.resultsDir,"reports")
        #         ensureDir(self.params.reportsDir)

        # The None test has to come before the index path is built: appending
        # ".fai" to None would raise a TypeError and hide the intended message.
        if self.params.referenceFasta is None:
            raise Exception("No reference fasta defined.")

        indexRefFasta = self.params.referenceFasta + ".fai"
        checkFile(self.params.referenceFasta, "reference fasta")
        checkFile(indexRefFasta, "reference fasta index")

        # read fasta index
        (self.params.chromOrder, self.params.chromSizes) = getFastaChromOrderSize(indexRefFasta)

        self.paths = PathInfo(self.params)

        self.params.isHighDepthFilter = (not (self.params.isExome or self.params.isRNA))
        self.params.isIgnoreAnomProperPair = (self.params.isRNA)
    def __init__(self, params, PathInfoType):
        """
        Store the run parameters and prepare the run directory layout.

        params is kept on self.params and updated in place: the
        run/work/results directory paths are derived from params.runDir (each
        one is passed to ensureDir). self.paths is built by calling
        PathInfoType with the params. The reference fasta and its '.fai' index
        are checked, the chromosome order/sizes are read from the index, and
        params.chromIsSkipped is filled in from getChromIsSkipped.

        Raises Exception if params.referenceFasta is None.
        """

        cleanPyEnv()

        self.params = params

        # make sure run directory is setup:
        self.params.runDir = os.path.abspath(self.params.runDir)
        ensureDir(self.params.runDir)

        # everything that's not intended to be a final result should dump directories/files in workDir
        self.params.workDir = os.path.join(self.params.runDir, "workspace")
        ensureDir(self.params.workDir)

        # all finalized pretty results get transferred to resultsDir
        self.params.resultsDir = os.path.join(self.params.runDir, "results")
        ensureDir(self.params.resultsDir)
        self.params.variantsDir = os.path.join(self.params.resultsDir,
                                               "variants")
        ensureDir(self.params.variantsDir)

        # timings and other stats go into statsDir
        self.params.statsDir = os.path.join(self.params.resultsDir, "stats")
        ensureDir(self.params.statsDir)

        self.paths = PathInfoType(self.params)

        # The None test has to come before the index path is built: appending
        # ".fai" to None would raise a TypeError and hide the intended message.
        if self.params.referenceFasta is None:
            raise Exception("No reference fasta defined.")

        referenceFastaIndex = self.params.referenceFasta + ".fai"
        checkFile(self.params.referenceFasta, "reference fasta")
        checkFile(referenceFastaIndex, "reference fasta index")

        # read fasta index
        (self.params.chromOrder,
         self.params.chromSizes) = getFastaChromOrderSize(referenceFastaIndex)

        # determine subset of chroms where we can skip calling entirely
        self.params.chromIsSkipped = getChromIsSkipped(self)

        self.params.isHighDepthFilter = (not (self.params.isExome
                                              or self.params.isRNA))
Example #8
0
    def validateAndSanitizeOptions(self, options):
        """
        Check the supplied options object and normalize its values in place.

        - runDir and referenceFasta are required; runDir is made absolute and
          must not already contain the workflow script file.
        - A '.fai' index must exist next to the reference fasta.
        - Sequence error estimation is switched off (with a warning on stderr)
          when the reference does not provide enough sequence, as configured
          by errorEstimationMinChromMb / errorEstimationMinTotalMb.
        - regionStrList entries are split on '+' and parsed into
          options.genomeRegionList (None when no regions were given).
        - Chromosome names from the call-regions bed file and from the region
          arguments must be present in the reference genome.

        Raises OptParseException when any of these checks fails.
        """

        assertOptionExists(options.runDir, "run directory")
        options.runDir = os.path.abspath(options.runDir)

        scriptPath = os.path.join(options.runDir, options.workflowScriptName)
        if os.path.exists(scriptPath):
            raise OptParseException(
                "Run directory already contains workflow script file '%s'. Each analysis must be configured in a separate directory."
                % (scriptPath))

        assertOptionExists(options.referenceFasta, "reference fasta file")
        options.referenceFasta = validateFixExistingFileArg(
            options.referenceFasta, "reference fasta file")

        # the fasta index is expected to sit right next to the fasta file
        fastaIndexPath = options.referenceFasta + ".fai"
        if not os.path.isfile(fastaIndexPath):
            raise OptParseException(
                "Can't find expected fasta index file: '%s'" % (fastaIndexPath))

        if options.isEstimateSequenceError:
            # Dynamic error estimation is only feasible when the reference is
            # large enough: only contigs of at least minChromSize bases count,
            # and together they must reach at least minTotalSize bases.
            megabase = 1000000
            minChromSize = options.errorEstimationMinChromMb * megabase
            minTotalSize = options.errorEstimationMinTotalMb * megabase

            # contig sizes come from the fasta index
            (_, contigSizes) = getFastaChromOrderSize(fastaIndexPath)

            usableSize = sum(size for size in contigSizes.values()
                             if size >= minChromSize)

            if usableSize < minTotalSize:
                sys.stderr.write(
                    "WARNING: Cannot estimate sequence errors from data due to small or overly fragmented reference sequence. Sequence error estimation disabled.\n"
                )
                options.isEstimateSequenceError = False

        checkFixTabixListOption(options.indelCandidatesList,
                                "candidate indel vcf")
        checkFixTabixListOption(options.forcedGTList, "forced genotype vcf")
        options.callRegionsBed = checkFixTabixIndexedFileOption(
            options.callRegionsBed, "call-regions bed")

        def iterRegionStrings():
            """
            Yield every individual region string from regionStrList, expanding
            the (intentionally undocumented/possibly deprecated) '+' entry
            format that packs multiple regions into a single argument.
            """
            for regionArg in options.regionStrList:
                for regionStr in regionArg.split("+"):
                    yield regionStr

        if (options.regionStrList is not None) and (len(options.regionStrList)
                                                    != 0):
            parsedRegions = []
            for regionStr in iterRegionStrings():
                parsedRegions.append(parseGenomeRegion(regionStr))
            options.genomeRegionList = parsedRegions
        else:
            options.genomeRegionList = None

        # validate chromosome names appearing in region tags and callRegions bed file
        hasCallRegionsBed = options.callRegionsBed is not None
        hasGenomeRegions = options.genomeRegionList is not None
        if hasCallRegionsBed or hasGenomeRegions:
            refChromInfo = getFastaInfo(options.referenceFasta)

            if hasCallRegionsBed:
                bedChroms = getTabixChromSet(options.tabixBin,
                                             options.callRegionsBed)
                for bedChrom in bedChroms:
                    if bedChrom in refChromInfo:
                        continue
                    raise OptParseException(
                        "Chromosome label '%s', in call regions bed file '%s', not found in reference genome."
                        % (bedChrom, options.callRegionsBed))

            if hasGenomeRegions:
                for (regionIndex,
                     region) in enumerate(options.genomeRegionList):
                    regionChrom = region["chrom"]
                    if regionChrom in refChromInfo:
                        continue
                    # recover the original argument text for the error message
                    regionStrings = list(iterRegionStrings())
                    raise OptParseException(
                        "Chromosome label '%s', parsed from region argument '%s', not found in reference genome."
                        % (regionChrom, regionStrings[regionIndex]))

        options.snvScoringModelFile = validateFixExistingFileArg(
            options.snvScoringModelFile, "SNV empirical scoring model file")
        options.indelScoringModelFile = validateFixExistingFileArg(
            options.indelScoringModelFile,
            "Indel empirical scoring model file")