def validateAndSanitizeOptions(self, options):
    """
    Check the configured options and rewrite them in place in sanitized form.

    The run directory is made absolute, the reference fasta and its ".fai"
    index are checked, sequence error estimation is switched off when the
    reference does not meet the configured size thresholds, and the
    tabix-indexed inputs, region arguments and scoring model files are
    passed through their respective validation helpers.

    Raises:
        OptParseException: if the fasta index file is not found next to the
            reference fasta (the helper functions may raise as well).
    """
    # run directory: required, stored as an absolute path
    assertOptionExists(options.runDir, "run directory")
    options.runDir = os.path.abspath(options.runDir)

    # reference fasta: required, and must be accompanied by an index
    fastaLabel = "reference fasta file"
    assertOptionExists(options.referenceFasta, fastaLabel)
    options.referenceFasta = validateFixExistingFileArg(options.referenceFasta, fastaLabel)

    faiPath = options.referenceFasta + ".fai"
    if not os.path.isfile(faiPath):
        raise OptParseException("Can't find expected fasta index file: '%s'" % (faiPath))

    if options.isEstimateSequenceError:
        # Error estimation only uses contigs of at least
        # errorEstimationMinChromMb megabases, and those contigs together
        # must cover at least errorEstimationMinTotalMb megabases.
        basesPerMb = 1000000
        minContigSize = options.errorEstimationMinChromMb * basesPerMb
        minUsableSize = options.errorEstimationMinTotalMb * basesPerMb

        # contig sizes come from the fasta index
        (_, contigSizes) = getFastaChromOrderSize(faiPath)
        usableSize = sum(size for size in contigSizes.values() if size >= minContigSize)

        if usableSize < minUsableSize:
            sys.stderr.write(
                "WARNING: Cannot estimate sequence errors from data due to small "
                "or overly fragmented reference sequence. "
                "Sequence error estimation disabled.\n")
            options.isEstimateSequenceError = False

    checkFixTabixListOption(options.indelCandidatesList, "candidate indel vcf")
    checkFixTabixListOption(options.forcedGTList, "forced genotype vcf")
    options.callRegionsBed = checkFixTabixIndexedFileOption(options.callRegionsBed, "call-regions bed")

    # A single region argument may hold several regions separated by '+'.
    if (options.regionStrList is not None) and (len(options.regionStrList) != 0):
        parsedRegions = []
        for regionArg in options.regionStrList:
            for regionTag in regionArg.split("+"):
                parsedRegions.append(parseGenomeRegion(regionTag))
        options.genomeRegionList = parsedRegions
    else:
        options.genomeRegionList = None

    options.snvScoringModelFile = validateFixExistingFileArg(
        options.snvScoringModelFile, "SNV empirical scoring model file")
    options.indelScoringModelFile = validateFixExistingFileArg(
        options.indelScoringModelFile, "Indel empirical scoring model file")
def __init__(self, params):
    """
    Store the run parameters and prepare the run directory layout.

    Several values on ``params`` are normalized in place, the run,
    workspace and results directories are passed to ensureDir, the
    reference fasta and its ".fai" index are checked with checkFile, and
    chromosome order/size and call-region information are attached to
    ``params``.

    Raises:
        Exception: if ``params.referenceFasta`` is None.
    """
    cleanPyEnv()

    self.params = params

    # normalize boolean option input:
    safeSetBool(self.params, "enableRemoteReadRetrievalForInsertionsInGermlineCallingModes")
    safeSetBool(self.params, "enableRemoteReadRetrievalForInsertionsInCancerCallingModes")
    safeSetBool(self.params, "useOverlapPairEvidence")

    # Use RNA option for minCandidate size
    if self.params.isRNA:
        self.params.minCandidateVariantSize = self.params.rnaMinCandidateVariantSize

    # format bam lists:
    if self.params.normalBamList is None:
        self.params.normalBamList = []
    if self.params.tumorBamList is None:
        self.params.tumorBamList = []

    # make sure run directory is setup:
    self.params.runDir = os.path.abspath(self.params.runDir)
    ensureDir(self.params.runDir)

    # everything that's not intended to be a final result should dump directories/files in workDir
    self.params.workDir = os.path.join(self.params.runDir, "workspace")
    ensureDir(self.params.workDir)

    # all finalized pretty results get transferred to resultsDir
    self.params.resultsDir = os.path.join(self.params.runDir, "results")
    ensureDir(self.params.resultsDir)
    self.params.statsDir = os.path.join(self.params.resultsDir, "stats")
    ensureDir(self.params.statsDir)
    self.params.variantsDir = os.path.join(self.params.resultsDir, "variants")
    ensureDir(self.params.variantsDir)
    self.params.evidenceDir = os.path.join(self.params.resultsDir, "evidence")
    ensureDir(self.params.evidenceDir)

    # The None check must come before the index path is derived: appending
    # ".fai" to None raises a TypeError that would hide the intended message.
    if self.params.referenceFasta is None:
        raise Exception("No reference fasta defined.")
    indexRefFasta = self.params.referenceFasta + ".fai"
    checkFile(self.params.referenceFasta, "reference fasta")
    checkFile(indexRefFasta, "reference fasta index")

    # read fasta index
    (self.params.chromOrder, self.params.chromSizes) = getFastaChromOrderSize(indexRefFasta)

    # determine subset of chroms where we can skip calling entirely
    (self.params.callRegionList, self.params.chromIsSkipped) = getCallRegions(self.params)

    self.paths = PathInfo(self.params)

    self.params.isHighDepthFilter = (not (self.params.isExome or self.params.isRNA))
    self.params.isIgnoreAnomProperPair = (self.params.isRNA)

    # always use overlapping pairs for RNA calling
    if self.params.isRNA:
        self.params.useOverlapPairEvidence = True
def __init__(self, params, iniSections):
    """
    Store the run parameters and ini sections, and prepare the run directories.

    Values on ``params`` are normalized in place, the run, workspace and
    results directories are passed to ensureDir, the reference fasta and
    its ".fai" index are checked with checkFile, and chromosome order/size
    and call-region information are attached to ``params``.

    Raises:
        Exception: if ``params.referenceFasta`` is None.
    """
    cleanPyEnv()

    self.params = params
    self.iniSections = iniSections

    # Use RNA option for minCandidate size
    if self.params.isRNA:
        self.params.minCandidateVariantSize = self.params.rnaMinCandidateVariantSize

    # format bam lists:
    if self.params.normalBamList is None:
        self.params.normalBamList = []
    if self.params.tumorBamList is None:
        self.params.tumorBamList = []

    # make sure run directory is setup:
    self.params.runDir = os.path.abspath(self.params.runDir)
    ensureDir(self.params.runDir)

    # everything that's not intended to be a final result should dump directories/files in workDir
    self.params.workDir = os.path.join(self.params.runDir, "workspace")
    ensureDir(self.params.workDir)

    # all finalized pretty results get transferred to resultsDir
    self.params.resultsDir = os.path.join(self.params.runDir, "results")
    ensureDir(self.params.resultsDir)
    self.params.statsDir = os.path.join(self.params.resultsDir, "stats")
    ensureDir(self.params.statsDir)
    self.params.variantsDir = os.path.join(self.params.resultsDir, "variants")
    ensureDir(self.params.variantsDir)
    self.params.evidenceDir = os.path.join(self.params.resultsDir, "evidence")
    ensureDir(self.params.evidenceDir)

    # The None check must come before the index path is derived: appending
    # ".fai" to None raises a TypeError that would hide the intended message.
    if self.params.referenceFasta is None:
        raise Exception("No reference fasta defined.")
    indexRefFasta = self.params.referenceFasta + ".fai"
    checkFile(self.params.referenceFasta, "reference fasta")
    checkFile(indexRefFasta, "reference fasta index")

    # read fasta index
    (self.params.chromOrder, self.params.chromSizes) = getFastaChromOrderSize(indexRefFasta)

    # determine subset of chroms where we can skip calling entirely
    (self.params.callRegionList, self.params.chromIsSkipped) = getCallRegions(self.params)

    self.paths = PathInfo(self.params)

    self.params.isHighDepthFilter = (not (self.params.isExome or self.params.isRNA))
    self.params.isIgnoreAnomProperPair = (self.params.isRNA)
def __init__(self, params, iniSections):
    """
    Store the run parameters and ini sections, and prepare the run directories.

    PYTHONPATH and PYTHONHOME are removed from the process environment,
    the run, workspace and results directories are passed to ensureDir,
    the reference fasta and its ".fai" index are checked with checkFile,
    the single normal/tumor bam options are converted to lists, and
    chromosome order/size information is attached to ``params``.

    Raises:
        Exception: if ``params.referenceFasta`` is None.
    """
    # clear out some potentially destabilizing env variables:
    clearList = ["PYTHONPATH", "PYTHONHOME"]
    for key in clearList:
        if key in os.environ:
            del os.environ[key]

    self.params = params
    self.iniSections = iniSections

    # make sure run directory is setup:
    self.params.runDir = os.path.abspath(self.params.runDir)
    ensureDir(self.params.runDir)

    # everything that's not intended to be a final result should dump directories/files in workDir
    self.params.workDir = os.path.join(self.params.runDir, "workspace")
    ensureDir(self.params.workDir)

    # all finalized pretty results get transferred to resultsDir
    self.params.resultsDir = os.path.join(self.params.runDir, "results")
    ensureDir(self.params.resultsDir)
    self.params.statsDir = os.path.join(self.params.resultsDir, "stats")
    ensureDir(self.params.statsDir)
    self.params.variantsDir = os.path.join(self.params.resultsDir, "variants")
    ensureDir(self.params.variantsDir)

    # The None check must come before the index path is derived: appending
    # ".fai" to None raises a TypeError that would hide the intended message.
    if self.params.referenceFasta is None:
        raise Exception("No reference fasta defined.")
    indexRefFasta = self.params.referenceFasta + ".fai"
    checkFile(self.params.referenceFasta, "reference fasta")
    checkFile(indexRefFasta, "reference fasta index")

    # wrap the single-bam options into lists, dropping unset (None) entries
    self.params.normalBamList = [bam for bam in (self.params.normalBam,) if bam is not None]
    self.params.tumorBamList = [bam for bam in (self.params.tumorBam,) if bam is not None]

    # read fasta index
    (self.params.chromOrder, self.params.chromSizes) = getFastaChromOrderSize(indexRefFasta)

    # sanity check some parameter typing:
    self.params.binSize = int(self.params.binSize)
    self.params.nonlocalWorkBins = int(self.params.nonlocalWorkBins)

    self.paths = PathInfo(self.params)
def __init__(self, params, iniSections):
    """
    Store the run parameters and ini sections, and prepare the run directories.

    PYTHONPATH and PYTHONHOME are removed from the process environment,
    the run, workspace and results directories are passed to ensureDir,
    the reference fasta and its ".fai" index are checked with checkFile,
    and chromosome order/size information plus a few derived settings are
    attached to ``params``.

    Raises:
        Exception: if ``params.referenceFasta`` is None.
    """
    # clear out some potentially destabilizing env variables:
    clearList = ["PYTHONPATH", "PYTHONHOME"]
    for key in clearList:
        if key in os.environ:
            del os.environ[key]

    self.params = params
    self.iniSections = iniSections

    # format bam lists:
    if self.params.normalBamList is None:
        self.params.normalBamList = []
    if self.params.tumorBamList is None:
        self.params.tumorBamList = []

    # make sure run directory is setup:
    self.params.runDir = os.path.abspath(self.params.runDir)
    ensureDir(self.params.runDir)

    # everything that's not intended to be a final result should dump directories/files in workDir
    self.params.workDir = os.path.join(self.params.runDir, "workspace")
    ensureDir(self.params.workDir)

    # all finalized pretty results get transferred to resultsDir
    self.params.resultsDir = os.path.join(self.params.runDir, "results")
    ensureDir(self.params.resultsDir)
    self.params.statsDir = os.path.join(self.params.resultsDir, "stats")
    ensureDir(self.params.statsDir)
    self.params.variantsDir = os.path.join(self.params.resultsDir, "variants")
    ensureDir(self.params.variantsDir)

    # The None check must come before the index path is derived: appending
    # ".fai" to None raises a TypeError that would hide the intended message.
    if self.params.referenceFasta is None:
        raise Exception("No reference fasta defined.")
    indexRefFasta = self.params.referenceFasta + ".fai"
    checkFile(self.params.referenceFasta, "reference fasta")
    checkFile(indexRefFasta, "reference fasta index")

    # read fasta index
    (self.params.chromOrder, self.params.chromSizes) = getFastaChromOrderSize(indexRefFasta)

    # sanity check some parameter typing:
    MEGABASE = 1000000
    # scanSizeMb is given in megabases; scanSize holds the same value in bases
    self.params.scanSize = int(self.params.scanSizeMb) * MEGABASE
    self.params.nonlocalWorkBins = int(self.params.nonlocalWorkBins)

    self.paths = PathInfo(self.params)

    self.params.isHighDepthFilter = (not (self.params.isExome or self.params.isRNA))
    self.params.isIgnoreAnomProperPair = (self.params.isRNA)
def __init__(self, params, iniSections):
    """
    Store the run parameters and ini sections, and prepare the run directories.

    The run, workspace and results directories are passed to ensureDir,
    the reference fasta and its ".fai" index are checked with checkFile,
    and chromosome order/size information plus a few derived settings are
    attached to ``params``.

    Raises:
        Exception: if ``params.referenceFasta`` is None.
    """
    cleanPyEnv()

    self.params = params
    self.iniSections = iniSections

    # format bam lists:
    if self.params.normalBamList is None:
        self.params.normalBamList = []
    if self.params.tumorBamList is None:
        self.params.tumorBamList = []

    # make sure run directory is setup:
    self.params.runDir = os.path.abspath(self.params.runDir)
    ensureDir(self.params.runDir)

    # everything that's not intended to be a final result should dump directories/files in workDir
    self.params.workDir = os.path.join(self.params.runDir, "workspace")
    ensureDir(self.params.workDir)

    # all finalized pretty results get transferred to resultsDir
    self.params.resultsDir = os.path.join(self.params.runDir, "results")
    ensureDir(self.params.resultsDir)
    self.params.statsDir = os.path.join(self.params.resultsDir, "stats")
    ensureDir(self.params.statsDir)
    self.params.variantsDir = os.path.join(self.params.resultsDir, "variants")
    ensureDir(self.params.variantsDir)

    # The None check must come before the index path is derived: appending
    # ".fai" to None raises a TypeError that would hide the intended message.
    if self.params.referenceFasta is None:
        raise Exception("No reference fasta defined.")
    indexRefFasta = self.params.referenceFasta + ".fai"
    checkFile(self.params.referenceFasta, "reference fasta")
    checkFile(indexRefFasta, "reference fasta index")

    # read fasta index
    (self.params.chromOrder, self.params.chromSizes) = getFastaChromOrderSize(indexRefFasta)

    self.paths = PathInfo(self.params)

    self.params.isHighDepthFilter = (not (self.params.isExome or self.params.isRNA))
    self.params.isIgnoreAnomProperPair = (self.params.isRNA)
def __init__(self, params, PathInfoType):
    """
    Store the run parameters and prepare the run directory layout.

    The run, workspace and results directories are passed to ensureDir,
    ``self.paths`` is built by calling ``PathInfoType`` with the
    parameters, the reference fasta and its ".fai" index are checked with
    checkFile, and chromosome order/size and skip information are attached
    to ``params``.

    Args:
        params: run parameter object, updated in place.
        PathInfoType: callable invoked as ``PathInfoType(params)`` to
            build ``self.paths``.

    Raises:
        Exception: if ``params.referenceFasta`` is None.
    """
    cleanPyEnv()

    self.params = params

    # make sure run directory is setup:
    self.params.runDir = os.path.abspath(self.params.runDir)
    ensureDir(self.params.runDir)

    # everything that's not intended to be a final result should dump directories/files in workDir
    self.params.workDir = os.path.join(self.params.runDir, "workspace")
    ensureDir(self.params.workDir)

    # all finalized pretty results get transferred to resultsDir
    self.params.resultsDir = os.path.join(self.params.runDir, "results")
    ensureDir(self.params.resultsDir)
    self.params.variantsDir = os.path.join(self.params.resultsDir, "variants")
    ensureDir(self.params.variantsDir)

    # timings and other stats go into statsDir
    self.params.statsDir = os.path.join(self.params.resultsDir, "stats")
    ensureDir(self.params.statsDir)

    self.paths = PathInfoType(self.params)

    # The None check must come before the index path is derived: appending
    # ".fai" to None raises a TypeError that would hide the intended message.
    if self.params.referenceFasta is None:
        raise Exception("No reference fasta defined.")
    referenceFastaIndex = self.params.referenceFasta + ".fai"
    checkFile(self.params.referenceFasta, "reference fasta")
    checkFile(referenceFastaIndex, "reference fasta index")

    # read fasta index
    (self.params.chromOrder, self.params.chromSizes) = getFastaChromOrderSize(referenceFastaIndex)

    # determine subset of chroms where we can skip calling entirely
    self.params.chromIsSkipped = getChromIsSkipped(self)

    self.params.isHighDepthFilter = (not (self.params.isExome or self.params.isRNA))
def validateAndSanitizeOptions(self, options):
    """
    Check the configured options and rewrite them in place in sanitized form.

    The run directory is made absolute and must not already hold a
    workflow script. The reference fasta and its ".fai" index are checked,
    and sequence error estimation is switched off when the reference does
    not meet the configured size thresholds. Tabix-indexed inputs and
    region arguments are validated, with their chromosome labels checked
    against the reference, and the scoring model files are validated.

    Raises:
        OptParseException: if the run directory already contains the
            workflow script, the fasta index is missing, or a chromosome
            label from the call-regions bed file or a region argument is
            not found in the reference (the helper functions may raise as
            well).
    """
    # run directory: required, stored as an absolute path
    assertOptionExists(options.runDir, "run directory")
    options.runDir = os.path.abspath(options.runDir)

    # refuse to configure on top of an existing analysis
    existingScript = os.path.join(options.runDir, options.workflowScriptName)
    if os.path.exists(existingScript):
        raise OptParseException(
            ("Run directory already contains workflow script file '%s'. "
             "Each analysis must be configured in a separate directory.")
            % (existingScript))

    # reference fasta: required, and must be accompanied by an index
    fastaLabel = "reference fasta file"
    assertOptionExists(options.referenceFasta, fastaLabel)
    options.referenceFasta = validateFixExistingFileArg(options.referenceFasta, fastaLabel)

    faiPath = options.referenceFasta + ".fai"
    if not os.path.isfile(faiPath):
        raise OptParseException("Can't find expected fasta index file: '%s'" % (faiPath))

    if options.isEstimateSequenceError:
        # Error estimation only uses contigs of at least
        # errorEstimationMinChromMb megabases, and those contigs together
        # must cover at least errorEstimationMinTotalMb megabases.
        basesPerMb = 1000000
        minContigSize = options.errorEstimationMinChromMb * basesPerMb
        minUsableSize = options.errorEstimationMinTotalMb * basesPerMb

        # contig sizes come from the fasta index
        (_, contigSizes) = getFastaChromOrderSize(faiPath)
        usableSize = sum(size for size in contigSizes.values() if size >= minContigSize)

        if usableSize < minUsableSize:
            sys.stderr.write(
                "WARNING: Cannot estimate sequence errors from data due to small "
                "or overly fragmented reference sequence. "
                "Sequence error estimation disabled.\n")
            options.isEstimateSequenceError = False

    checkFixTabixListOption(options.indelCandidatesList, "candidate indel vcf")
    checkFixTabixListOption(options.forcedGTList, "forced genotype vcf")
    options.callRegionsBed = checkFixTabixIndexedFileOption(options.callRegionsBed, "call-regions bed")

    def iterRegionTags():
        """
        Yield every individual region string from regionStrList, expanding
        the (intentionally undocumented/possibly deprecated) '+' format
        which packs multiple regions into a single argument.
        """
        for regionArg in options.regionStrList:
            for regionTag in regionArg.split("+"):
                yield regionTag

    if (options.regionStrList is not None) and (len(options.regionStrList) != 0):
        options.genomeRegionList = [parseGenomeRegion(tag) for tag in iterRegionTags()]
    else:
        options.genomeRegionList = None

    # validate chromosome names appearing in region tags and callRegions bed file
    isBedGiven = options.callRegionsBed is not None
    isRegionGiven = options.genomeRegionList is not None
    if isBedGiven or isRegionGiven:
        refChromInfo = getFastaInfo(options.referenceFasta)

        if isBedGiven:
            for bedChrom in getTabixChromSet(options.tabixBin, options.callRegionsBed):
                if bedChrom in refChromInfo:
                    continue
                raise OptParseException(
                    "Chromosome label '%s', in call regions bed file '%s', not found in reference genome."
                    % (bedChrom, options.callRegionsBed))

        if isRegionGiven:
            for (regionIndex, region) in enumerate(options.genomeRegionList):
                regionChrom = region["chrom"]
                if regionChrom in refChromInfo:
                    continue
                # parsed regions and region tags share the same order, so
                # the index recovers the argument text for the message
                regionTag = list(iterRegionTags())[regionIndex]
                raise OptParseException(
                    "Chromosome label '%s', parsed from region argument '%s', not found in reference genome."
                    % (regionChrom, regionTag))

    options.snvScoringModelFile = validateFixExistingFileArg(
        options.snvScoringModelFile, "SNV empirical scoring model file")
    options.indelScoringModelFile = validateFixExistingFileArg(
        options.indelScoringModelFile, "Indel empirical scoring model file")