Exemple #1
0
    def validateAndSanitizeOptions(self, options):
        """
        Check the run options and normalize them in place

        - options.runDir is converted to an absolute path
        - options.referenceFasta and options.callRegionsBed are replaced by the
          values returned from their respective check helpers
        - options.genomeRegionList is set to the parsed form of
          options.regionStrList, or to None when no region strings were given

        Raises OptParseException when the run directory already contains the
        workflow script, when the fasta index file is missing, or when a
        chromosome label from the call regions bed file or from a region
        argument is not found in the reference.
        """

        assertOptionExists(options.runDir, "run directory")
        options.runDir = os.path.abspath(options.runDir)

        # refuse to reuse a directory which has already been configured
        scriptPath = os.path.join(options.runDir, options.workflowScriptName)
        if os.path.exists(scriptPath):
            raise OptParseException(
                "Run directory already contains workflow script file '%s'. Each analysis must be configured in a separate directory."
                % (scriptPath))

        # the reference fasta file must be specified and exist
        assertOptionExists(options.referenceFasta, "reference fasta file")
        options.referenceFasta = validateFixExistingFileArg(
            options.referenceFasta, "reference")

        # its index is expected right next to it
        fastaIndexPath = options.referenceFasta + ".fai"
        if not os.path.isfile(fastaIndexPath):
            raise OptParseException(
                "Can't find expected fasta index file: '%s'" %
                (fastaIndexPath))

        # bed file of call regions (and its index file)
        options.callRegionsBed = checkFixTabixIndexedFileOption(
            options.callRegionsBed, "call-regions bed")

        regionStrs = options.regionStrList
        if (regionStrs is not None) and (len(regionStrs) != 0):
            options.genomeRegionList = [
                parseGenomeRegion(regionStr) for regionStr in regionStrs
            ]
        else:
            options.genomeRegionList = None

        # chromosome names only need validation if some region selection exists
        hasCallRegions = options.callRegionsBed is not None
        hasGenomeRegions = options.genomeRegionList is not None
        if not (hasCallRegions or hasGenomeRegions):
            return

        refChromInfo = getFastaInfo(options.referenceFasta)

        if hasCallRegions:
            bedChroms = getTabixChromSet(options.tabixBin,
                                         options.callRegionsBed)
            for bedChrom in bedChroms:
                if bedChrom in refChromInfo:
                    continue
                raise OptParseException(
                    "Chromosome label '%s', in call regions bed file '%s', not found in reference genome."
                    % (bedChrom, options.callRegionsBed))

        if hasGenomeRegions:
            # region strings and parsed regions are parallel lists
            for (regionStr, region) in zip(options.regionStrList,
                                           options.genomeRegionList):
                regionChrom = region["chrom"]
                if regionChrom in refChromInfo:
                    continue
                raise OptParseException(
                    "Chromosome label '%s', parsed from region argument '%s', not found in reference genome."
                    % (regionChrom, regionStr))
def getChromIsSkipped(self):
    r"""
    Determine subset of chroms from chromOrder which are completely skipped over

    Here "skipped" means that not a single base on the chrom is requested for
    calling or error estimation.

    Reads the following from self.params: genomeRegionList, callRegionsBed,
    chromOrder, tabixBin, isEstimateSequenceError, errorEstimationMinChromMb
    and chromSizes.

    \return The set of chromLabels which are skipped
    """
    # NOTE: the docstring is a raw string so that the doxygen-style '\return'
    # tag is not interpreted as a carriage-return escape sequence.

    params = self.params

    # return empty set when no region selections have been made:
    if (params.genomeRegionList is None) and (params.callRegionsBed is None):
        return set()

    chromIsSkipped = set()

    # first check chromosome coverage of "regions" arguments:
    # start from every chromosome and drop those named by a region
    if params.genomeRegionList is not None:
        chromIsSkipped = set(params.chromOrder)
        for genomeRegion in params.genomeRegionList:
            chromIsSkipped.discard(genomeRegion["chrom"])

    # further refine coverage based on callRegions BED file
    if params.callRegionsBed is not None:
        callRegionsChroms = getTabixChromSet(params.tabixBin,
                                             params.callRegionsBed)

        # Skip the union of:
        # 1. chromosomes skipped already due to region arguments
        # 2. chromosomes skipped due to callRegions bed track
        chromIsSkipped |= set(params.chromOrder) - callRegionsChroms

    # if sequencing error estimation is turned on, make sure estimation targets are not skipped:
    if params.isEstimateSequenceError:
        megabase = 1000000
        errorEstimationMinChromSize = params.errorEstimationMinChromMb * megabase

        for chrom in params.chromSizes:
            # chromosomes below the size threshold are not estimation targets
            if params.chromSizes[chrom] < errorEstimationMinChromSize:
                continue
            chromIsSkipped.discard(chrom)

    return chromIsSkipped
Exemple #3
0
    def validateAndSanitizeOptions(self, options):
        """
        Validate the configuration options, rewriting several of them in place

        On return options.runDir is an absolute path, options.referenceFasta and
        options.callRegionsBed hold the values returned by their check helpers,
        and options.genomeRegionList holds the parsed options.regionStrList
        entries (None if there are no entries).

        An OptParseException is raised if the workflow script already exists in
        the run directory, if the reference fasta index file is not found, or
        if any chromosome label taken from the call regions bed file or the
        region arguments is absent from the reference.
        """

        assertOptionExists(options.runDir, "run directory")
        options.runDir = os.path.abspath(options.runDir)

        # each analysis gets its own run directory
        existingScript = os.path.join(options.runDir,
                                      options.workflowScriptName)
        if os.path.exists(existingScript):
            message = "Run directory already contains workflow script file '%s'. Each analysis must be configured in a separate directory." % (
                existingScript)
            raise OptParseException(message)

        # check reference fasta file exists
        assertOptionExists(options.referenceFasta, "reference fasta file")
        options.referenceFasta = validateFixExistingFileArg(
            options.referenceFasta, "reference")

        # check for reference fasta index file:
        indexPath = options.referenceFasta + ".fai"
        if not os.path.isfile(indexPath):
            raise OptParseException(
                "Can't find expected fasta index file: '%s'" % (indexPath))

        # check for bed file of call regions and its index file
        options.callRegionsBed = checkFixTabixIndexedFileOption(
            options.callRegionsBed, "call-regions bed")

        noRegionStrs = (options.regionStrList is None) or (len(
            options.regionStrList) == 0)
        options.genomeRegionList = None
        if not noRegionStrs:
            options.genomeRegionList = [
                parseGenomeRegion(entry) for entry in options.regionStrList
            ]

        # nothing to cross-check against the reference without a region selection
        if (options.callRegionsBed is None) and (options.genomeRegionList is
                                                 None):
            return

        # validate chromosome names appearing in region tags and callRegions bed file
        knownChroms = getFastaInfo(options.referenceFasta)

        if options.callRegionsBed is not None:
            for label in getTabixChromSet(options.tabixBin,
                                          options.callRegionsBed):
                if label not in knownChroms:
                    raise OptParseException(
                        "Chromosome label '%s', in call regions bed file '%s', not found in reference genome."
                        % (label, options.callRegionsBed))

        if options.genomeRegionList is not None:
            # walk the parsed regions alongside the strings they came from
            for (sourceStr, parsedRegion) in zip(options.regionStrList,
                                                 options.genomeRegionList):
                label = parsedRegion["chrom"]
                if label not in knownChroms:
                    raise OptParseException(
                        "Chromosome label '%s', parsed from region argument '%s', not found in reference genome."
                        % (label, sourceStr))
Exemple #4
0
    def validateAndSanitizeOptions(self, options):
        """
        Check the run options and normalize them in place

        - options.runDir is converted to an absolute path
        - options.referenceFasta, options.callRegionsBed,
          options.snvScoringModelFile and options.indelScoringModelFile are
          replaced by the values returned from their check helpers
        - options.isEstimateSequenceError is switched off (with a warning on
          stderr) when the reference contigs of sufficient size do not add up to
          the required total size
        - options.genomeRegionList is set to the parsed region arguments, where
          a single argument may hold several '+'-separated regions, or to None
          when no region arguments were given

        Raises OptParseException when the run directory already contains the
        workflow script, when the fasta index file is missing, or when a
        chromosome label from the call regions bed file or from a region
        argument is not found in the reference.
        """

        assertOptionExists(options.runDir, "run directory")
        options.runDir = os.path.abspath(options.runDir)

        # refuse to reuse a directory which has already been configured
        scriptPath = os.path.join(options.runDir, options.workflowScriptName)
        if os.path.exists(scriptPath):
            raise OptParseException(
                "Run directory already contains workflow script file '%s'. Each analysis must be configured in a separate directory."
                % (scriptPath))

        assertOptionExists(options.referenceFasta, "reference fasta file")
        options.referenceFasta = validateFixExistingFileArg(
            options.referenceFasta, "reference fasta file")

        # check for reference fasta index file:
        faiPath = options.referenceFasta + ".fai"
        if not os.path.isfile(faiPath):
            raise OptParseException(
                "Can't find expected fasta index file: '%s'" % (faiPath))

        if options.isEstimateSequenceError:
            # Dynamic error estimation is only feasible if the reference contigs
            # which individually reach the minimum chrom size also reach the
            # minimum total size when summed up
            megabase = 1000000
            minChromSize = options.errorEstimationMinChromMb * megabase
            minTotalSize = options.errorEstimationMinTotalMb * megabase

            # read fasta index
            (_, chromSizes) = getFastaChromOrderSize(faiPath)

            totalEstimationSize = sum(size for size in chromSizes.values()
                                      if size >= minChromSize)

            if totalEstimationSize < minTotalSize:
                sys.stderr.write(
                    "WARNING: Cannot estimate sequence errors from data due to small or overly fragmented reference sequence. Sequence error estimation disabled.\n"
                )
                options.isEstimateSequenceError = False

        checkFixTabixListOption(options.indelCandidatesList,
                                "candidate indel vcf")
        checkFixTabixListOption(options.forcedGTList, "forced genotype vcf")
        options.callRegionsBed = checkFixTabixIndexedFileOption(
            options.callRegionsBed, "call-regions bed")

        # Expand the (intentionally undocumented/possibly deprecated) '+' entry
        # format, which specifies multiple regions in a single argument, and
        # parse each resulting region string as it is produced
        expandedRegionStrs = []
        if (options.regionStrList is not None) and (len(options.regionStrList)
                                                    != 0):
            parsedRegions = []
            for regionArg in options.regionStrList:
                for regionStr in regionArg.split("+"):
                    expandedRegionStrs.append(regionStr)
                    parsedRegions.append(parseGenomeRegion(regionStr))
            options.genomeRegionList = parsedRegions
        else:
            options.genomeRegionList = None

        # validate chromosome names appearing in region tags and callRegions bed file
        hasCallRegions = options.callRegionsBed is not None
        hasGenomeRegions = options.genomeRegionList is not None
        if hasCallRegions or hasGenomeRegions:
            refChromInfo = getFastaInfo(options.referenceFasta)

            if hasCallRegions:
                for bedChrom in getTabixChromSet(options.tabixBin,
                                                 options.callRegionsBed):
                    if bedChrom in refChromInfo:
                        continue
                    raise OptParseException(
                        "Chromosome label '%s', in call regions bed file '%s', not found in reference genome."
                        % (bedChrom, options.callRegionsBed))

            if hasGenomeRegions:
                # expanded region strings and parsed regions are parallel lists
                for (regionStr, region) in zip(expandedRegionStrs,
                                               options.genomeRegionList):
                    regionChrom = region["chrom"]
                    if regionChrom in refChromInfo:
                        continue
                    raise OptParseException(
                        "Chromosome label '%s', parsed from region argument '%s', not found in reference genome."
                        % (regionChrom, regionStr))

        options.snvScoringModelFile = validateFixExistingFileArg(
            options.snvScoringModelFile, "SNV empirical scoring model file")
        options.indelScoringModelFile = validateFixExistingFileArg(
            options.indelScoringModelFile,
            "Indel empirical scoring model file")
Exemple #5
0
def getCallRegions(params):
    r"""
    determine
    1) a set of genomic regions for calling
    2) a set of chromosomes that are completely skipped over,
       where "skipped" means that not a single base on the chrom is requested for calling

    Reads genomeRegionList, callRegionsBed, chromOrder, tabixBin and chromSizes
    from params. When both region arguments and a call regions bed file are
    given, missing "start"/"end" values of the entries in
    params.genomeRegionList are filled in place. When only region arguments are
    given, params.genomeRegionList itself is returned unmodified.

    Raises Exception if a chromosome from the region arguments or from the call
    regions bed file is not found in params.chromOrder.

    \return a list of genomic regions for calling
    \return a set of chromLabels which are skipped
    """
    # NOTE: the docstring is a raw string so that the doxygen-style '\return'
    # tags are not interpreted as carriage-return escape sequences.

    # when no region selections have been made:
    if (params.genomeRegionList is None) and (params.callRegionsBed is None):
        return ([], set())

    # build the chromosome set once, membership tests below are then O(1)
    allChroms = set(params.chromOrder)

    # check chromosome coverage of "regions" arguments
    chromIsSkipped = set(allChroms)
    if params.genomeRegionList is not None:
        for genomeRegion in params.genomeRegionList:
            chrom = genomeRegion["chrom"]

            if chrom not in allChroms:
                raise Exception("Unexpected chromosome '%s' in the argument of target regions (--region)" %
                                (chrom))

            chromIsSkipped.discard(chrom)

    if params.callRegionsBed is None:
        return (params.genomeRegionList, chromIsSkipped)

    # check chromosome coverage based on callRegions BED file
    callChromList = []
    chromIsSkippedByBed = set(allChroms)

    for chrom in getTabixChromSet(params.tabixBin, params.callRegionsBed):
        if chrom not in allChroms:
            raise Exception("Unexpected chromosome '%s' in the bed file of call regions %s " %
                            (chrom, params.callRegionsBed))

        callChromList.append(chrom)
        chromIsSkippedByBed.discard(chrom)

    callRegionList = []
    if params.genomeRegionList is None:
        chromIsSkipped = chromIsSkippedByBed

        # no region arguments: query the bed file over each of its whole chromosomes
        for chrom in callChromList:
            chromRegion = {"chrom": chrom, "start": 1, "end": params.chromSizes[chrom]}
            callRegionList.extend(getOverlapCallRegions(params, chromRegion))
    else:
        # a chromosome is skipped if either selection leaves it out
        chromIsSkipped = chromIsSkipped | chromIsSkippedByBed

        for genomeRegion in params.genomeRegionList:
            chrom = genomeRegion["chrom"]
            # open-ended regions are extended to the chromosome boundaries
            if genomeRegion["start"] is None:
                genomeRegion["start"] = 1
            if genomeRegion["end"] is None:
                genomeRegion["end"] = params.chromSizes[chrom]

            callRegionList.extend(getOverlapCallRegions(params, genomeRegion))

    return (callRegionList, chromIsSkipped)
Exemple #6
0
def getCallRegions(params):
    r"""
    determine
    1) a set of genomic regions for calling
    2) a set of chromosomes that are completely skipped over,
       where "skipped" means that not a single base on the chrom is requested for calling

    The following attributes of params are read: genomeRegionList,
    callRegionsBed, chromOrder, tabixBin and chromSizes. If region arguments
    are combined with a call regions bed file, entries of
    params.genomeRegionList with a "start" or "end" of None are updated in
    place. If only region arguments are present, params.genomeRegionList is
    returned as is.

    An Exception is raised for any chromosome, from either the region arguments
    or the call regions bed file, which does not appear in params.chromOrder.

    \return a list of genomic regions for calling
    \return a set of chromLabels which are skipped
    """
    # NOTE: raw docstring, otherwise the '\r' of the doxygen-style '\return'
    # tags is turned into a carriage-return character.

    hasRegionArgs = params.genomeRegionList is not None
    hasCallRegionsBed = params.callRegionsBed is not None

    # when no region selections have been made:
    if not (hasRegionArgs or hasCallRegionsBed):
        return ([], set())

    # a set gives constant-time membership checks in the loops below
    knownChroms = set(params.chromOrder)

    # check chromosome coverage of "regions" arguments
    chromIsSkipped = set(knownChroms)
    if hasRegionArgs:
        for genomeRegion in params.genomeRegionList:
            chrom = genomeRegion["chrom"]

            if chrom not in knownChroms:
                raise Exception(
                    "Unexpected chromosome '%s' in the argument of target regions (--region)"
                    % (chrom))

            chromIsSkipped.discard(chrom)

    if not hasCallRegionsBed:
        return (params.genomeRegionList, chromIsSkipped)

    # check chromosome coverage based on callRegions BED file
    callChromList = []
    chromIsSkipped2 = set(knownChroms)

    for chrom in getTabixChromSet(params.tabixBin, params.callRegionsBed):
        if chrom not in knownChroms:
            raise Exception(
                "Unexpected chromosome '%s' in the bed file of call regions %s "
                % (chrom, params.callRegionsBed))

        callChromList.append(chrom)
        chromIsSkipped2.discard(chrom)

    callRegionList = []
    if not hasRegionArgs:
        chromIsSkipped = chromIsSkipped2

        # only the bed file restricts calling: query it across each whole chromosome
        for chrom in callChromList:
            chromRegion = {
                "chrom": chrom,
                "start": 1,
                "end": params.chromSizes[chrom]
            }
            callRegionList.extend(getOverlapCallRegions(params, chromRegion))
    else:
        # skipped by the region arguments or skipped by the bed file
        chromIsSkipped = chromIsSkipped | chromIsSkipped2

        for genomeRegion in params.genomeRegionList:
            chrom = genomeRegion["chrom"]
            # regions without explicit bounds span the whole chromosome
            if genomeRegion["start"] is None:
                genomeRegion["start"] = 1
            if genomeRegion["end"] is None:
                genomeRegion["end"] = params.chromSizes[chrom]

            callRegionList.extend(getOverlapCallRegions(params, genomeRegion))

    return (callRegionList, chromIsSkipped)