def validateAndSanitizeExistingOptions(self, options):
    """
    Normalize option values which point at existing files and expand
    region arguments.

    Mutates ``options`` in place:
    - makes runDir absolute
    - validates the reference fasta path and requires a samtools ".fai"
      index next to it
    - validates tabix-indexed candidate-indel and forced-genotype vcf lists
    - expands regionStrList into genomeRegionList (or None when no
      regions were given)

    Raises OptParseException when the fasta index is missing.
    """
    # NOTE(review): runDir is not None-checked before abspath here,
    # unlike sibling implementations — presumably guaranteed upstream.
    options.runDir = os.path.abspath(options.runDir)

    options.referenceFasta = validateFixExistingFileArg(
        options.referenceFasta, "reference")

    # the reference fasta must come with a samtools ".fai" index:
    if options.referenceFasta is not None:
        fastaIndexPath = options.referenceFasta + ".fai"
        if not os.path.isfile(fastaIndexPath):
            raise OptParseException(
                "Can't find expected fasta index file: '%s'" % (fastaIndexPath))

    checkFixTabixListOption(options.indelCandidatesList, "candidate indel vcf")
    checkFixTabixListOption(options.forcedGTList, "forced genotype vcf")

    if not options.regionStrList:
        options.genomeRegionList = None
    else:
        # a single argument may pack several regions joined by '+'
        parsedRegions = []
        for regionArg in options.regionStrList:
            for regionToken in regionArg.split("+"):
                parsedRegions.append(parseGenomeRegion(regionToken))
        options.genomeRegionList = parsedRegions
def validateAndSanitizeOptions(self, options):
    """
    Validate tumor/normal alignment-file options on top of the shared
    workflow validation.

    Requires at least one tumor BAM/CRAM, allows at most one BAM/CRAM
    per sample (normal input is optional), and runs the combined BAM
    set through BamSetChecker against the reference.

    Raises OptParseException on any violation.
    """
    StrelkaSharedWorkflowOptionsBase.validateAndSanitizeOptions(self, options)

    checkFixTabixListOption(options.noiseVcfList, "noise vcf")

    groomBamList(options.normalBamList, "normal sample")
    groomBamList(options.tumorBamList, "tumor sample")

    def requireNonEmpty(bamList, label):
        # at least one alignment file must be provided for this sample
        if not bamList:
            raise OptParseException(
                "No %s sample BAM/CRAM files specified" % (label))

    # only the tumor sample is mandatory; normal may be absent
    requireNonEmpty(options.tumorBamList, "tumor")

    bamSetChecker = BamSetChecker()

    def appendSingleBam(bamList, label):
        # each sample contributes at most one alignment file
        if bamList is None:
            return
        if len(bamList) > 1:
            raise OptParseException(
                "More than one %s sample BAM/CRAM files specified" % (label))
        bamSetChecker.appendBams(bamList, label)

    appendSingleBam(options.normalBamList, "normal")
    appendSingleBam(options.tumorBamList, "tumor")
    bamSetChecker.check(options.htsfileBin, options.referenceFasta)
def validateAndSanitizeOptions(self, options):
    """
    Validate and normalize workflow configuration options.

    Mutates ``options`` in place: resolves runDir, validates the
    reference fasta (and requires its ".fai" index), optionally disables
    dynamic sequence-error estimation when the reference is too small or
    fragmented, validates tabix-indexed vcf/bed inputs, expands region
    arguments into genomeRegionList, and validates the empirical scoring
    model file paths.

    Raises OptParseException when a required option is missing or an
    expected file cannot be found.
    """
    assertOptionExists(options.runDir, "run directory")
    options.runDir = os.path.abspath(options.runDir)

    assertOptionExists(options.referenceFasta, "reference fasta file")
    options.referenceFasta = validateFixExistingFileArg(
        options.referenceFasta, "reference fasta file")

    # the reference fasta must come with a samtools ".fai" index:
    referenceFastaIndex = options.referenceFasta + ".fai"
    if not os.path.isfile(referenceFastaIndex):
        raise OptParseException(
            "Can't find expected fasta index file: '%s'" % (referenceFastaIndex))

    if options.isEstimateSequenceError:
        # Dynamic error estimation is only feasible on a sufficiently
        # large reference:
        # - only contigs of at least errorEstimationMinChromMb count
        # - their combined length must reach errorEstimationMinTotalMb
        class Constants:
            Megabase = 1000000
            minChromSize = options.errorEstimationMinChromMb * Megabase
            minTotalSize = options.errorEstimationMinTotalMb * Megabase

        # contig sizes come from the fasta index file
        (_, chromSizes) = getFastaChromOrderSize(referenceFastaIndex)
        totalEstimationSize = sum(
            size for size in chromSizes.values()
            if size >= Constants.minChromSize)

        if totalEstimationSize < Constants.minTotalSize:
            sys.stderr.write(
                "WARNING: Cannot estimate sequence errors from data due to small or overly fragmented reference sequence. "
                "Sequence error estimation disabled.\n")
            options.isEstimateSequenceError = False

    checkFixTabixListOption(options.indelCandidatesList, "candidate indel vcf")
    checkFixTabixListOption(options.forcedGTList, "forced genotype vcf")

    options.callRegionsBed = checkFixTabixIndexedFileOption(
        options.callRegionsBed, "call-regions bed")

    if not options.regionStrList:
        options.genomeRegionList = None
    else:
        # a single argument may pack several regions joined by '+'
        options.genomeRegionList = [
            parseGenomeRegion(regionToken)
            for regionArg in options.regionStrList
            for regionToken in regionArg.split("+")]

    options.snvScoringModelFile = validateFixExistingFileArg(
        options.snvScoringModelFile, "SNV empirical scoring model file")
    options.indelScoringModelFile = validateFixExistingFileArg(
        options.indelScoringModelFile, "Indel empirical scoring model file")
def validateAndSanitizeExistingOptions(self, options):
    """
    Extend the shared existing-options validation with somatic-specific
    inputs.

    Mutates ``options`` in place: grooms the normal/tumor BAM lists,
    validates the tabix-indexed noise vcf list, and validates the
    somatic SNV/indel empirical scoring model file paths.
    """
    StrelkaSharedWorkflowOptionsBase.validateAndSanitizeExistingOptions(
        self, options)

    groomBamList(options.normalBamList, "normal sample")
    groomBamList(options.tumorBamList, "tumor sample")

    checkFixTabixListOption(options.noiseVcfList, "noise vcf")

    options.somaticSnvScoringModelFile = validateFixExistingFileArg(
        options.somaticSnvScoringModelFile,
        "Somatic SNV empirical scoring file")
    options.somaticIndelScoringModelFile = validateFixExistingFileArg(
        options.somaticIndelScoringModelFile,
        "Somatic indel empirical scoring file")
def validateAndSanitizeOptions(self, options):
    """
    Validate and normalize workflow configuration options.

    Mutates ``options`` in place: resolves runDir (refusing to reuse a
    directory that already holds a workflow script), validates the
    reference fasta (and requires its ".fai" index), optionally disables
    dynamic sequence-error estimation when the reference is too small or
    fragmented, validates tabix-indexed vcf/bed inputs, expands region
    arguments into genomeRegionList, checks that every chromosome label
    from region arguments and the call-regions bed exists in the
    reference, and validates the empirical scoring model file paths.

    Raises OptParseException when a required option is missing, an
    expected file cannot be found, or a chromosome label is unknown.
    """
    assertOptionExists(options.runDir, "run directory")
    options.runDir = os.path.abspath(options.runDir)

    # refuse to configure twice into the same run directory:
    workflowScriptPath = os.path.join(options.runDir, options.workflowScriptName)
    if os.path.exists(workflowScriptPath):
        raise OptParseException(
            "Run directory already contains workflow script file '%s'. Each analysis must be configured in a separate directory." % (workflowScriptPath))

    assertOptionExists(options.referenceFasta, "reference fasta file")
    options.referenceFasta = validateFixExistingFileArg(
        options.referenceFasta, "reference fasta file")

    # the reference fasta must come with a samtools ".fai" index:
    referenceFastaIndex = options.referenceFasta + ".fai"
    if not os.path.isfile(referenceFastaIndex):
        raise OptParseException(
            "Can't find expected fasta index file: '%s'" % (referenceFastaIndex))

    if options.isEstimateSequenceError:
        # Dynamic error estimation is only feasible on a sufficiently
        # large reference:
        # - only contigs of at least errorEstimationMinChromMb count
        # - their combined length must reach errorEstimationMinTotalMb
        class Constants:
            Megabase = 1000000
            minChromSize = options.errorEstimationMinChromMb * Megabase
            minTotalSize = options.errorEstimationMinTotalMb * Megabase

        # contig sizes come from the fasta index file
        (_, chromSizes) = getFastaChromOrderSize(referenceFastaIndex)
        totalEstimationSize = sum(
            size for size in chromSizes.values()
            if size >= Constants.minChromSize)

        if totalEstimationSize < Constants.minTotalSize:
            sys.stderr.write(
                "WARNING: Cannot estimate sequence errors from data due to small or overly fragmented reference sequence. "
                "Sequence error estimation disabled.\n")
            options.isEstimateSequenceError = False

    checkFixTabixListOption(options.indelCandidatesList, "candidate indel vcf")
    checkFixTabixListOption(options.forcedGTList, "forced genotype vcf")

    options.callRegionsBed = checkFixTabixIndexedFileOption(
        options.callRegionsBed, "call-regions bed")

    if not options.regionStrList:
        options.genomeRegionList = None
        expandedRegionStrs = []
    else:
        # '+' inside one argument is an (intentionally undocumented,
        # possibly deprecated) way to pack multiple regions into a
        # single entry; keep the expanded strings so that error
        # messages below can echo the argument that produced a region
        expandedRegionStrs = [
            regionToken
            for regionArg in options.regionStrList
            for regionToken in regionArg.split("+")]
        options.genomeRegionList = [
            parseGenomeRegion(regionToken)
            for regionToken in expandedRegionStrs]

    # verify that chromosome labels appearing in region arguments and the
    # call-regions bed file exist in the reference genome:
    if (options.callRegionsBed is not None) or (options.genomeRegionList is not None):
        refChromInfo = getFastaInfo(options.referenceFasta)

        if options.callRegionsBed is not None:
            for chrom in getTabixChromSet(options.tabixBin, options.callRegionsBed):
                if chrom not in refChromInfo:
                    raise OptParseException(
                        "Chromosome label '%s', in call regions bed file '%s', not found in reference genome." % (chrom, options.callRegionsBed))

        if options.genomeRegionList is not None:
            for (regionIndex, genomeRegion) in enumerate(options.genomeRegionList):
                chrom = genomeRegion["chrom"]
                if chrom not in refChromInfo:
                    raise OptParseException(
                        "Chromosome label '%s', parsed from region argument '%s', not found in reference genome." % (chrom, expandedRegionStrs[regionIndex]))

    options.snvScoringModelFile = validateFixExistingFileArg(
        options.snvScoringModelFile, "SNV empirical scoring model file")
    options.indelScoringModelFile = validateFixExistingFileArg(
        options.indelScoringModelFile, "Indel empirical scoring model file")