def _add_common_chunk_options(p):
    """Attach the options shared by the chunking tools to ``p``.

    The helpers run in a fixed sequence: debug option, max-nchunks option,
    chunk output directory option, then the chunk JSON report option.  The
    return values of the last two helpers are threaded through, and the
    result of the final helper is returned.
    """
    # Order matters!
    add_debug_option(p)
    add_max_nchunks_option(p)
    with_output_dir = _add_chunk_output_dir_option(p)
    return add_output_chunk_json_report_option(with_output_dir)
def add_options_to_parser(p):
    """
    API function for extending main pbreport arg parser (independently of
    tool contract interface).

    A fresh parser wrapper is obtained from ``_get_parser_core()`` and its
    inner argparse parser is replaced by ``p``; the module docstring becomes
    the description of ``p``, the debug option is added, and the wrapper is
    handed to ``_add_options_to_parser``.  ``p`` itself is returned.
    """
    wrapper = _get_parser_core()
    # Swap the caller's parser into the wrapper -- presumably so that the
    # shared option-registration code below populates ``p`` directly.
    wrapper.arg_parser.parser = p
    p.description = __doc__
    add_debug_option(p)
    _add_options_to_parser(wrapper)
    return p
def _get_more_options(parser):
    """
    Advanced options that won't be exposed via tool contract interface.

    Registers the output-file, calculation, parameter, computation-management
    and debugging options on ``parser`` (an argparse-style object providing
    ``add_argument``) and returns the same ``parser``.

    Changes relative to the previous implementation:

    * ``--skipUnrecognizedContigs`` no longer uses ``type=bool``.
      ``bool("False")`` is ``True``, so the option could never be turned off
      by value; explicit false spellings are now honoured while every other
      value keeps evaluating to ``True``.
    * The window file given to ``-W`` is closed after it has been read.
    """

    def slurpWindowFile(fname):
        # Collapse a one-window-per-line file into the comma-delimited form
        # accepted by -w; the context manager guarantees the handle is closed.
        with open(fname) as handle:
            return ",".join(line.strip() for line in handle)

    def parseBoolean(value):
        # Only the spellings below mean "false"; anything else stays truthy,
        # matching what bool() did for every non-empty string.
        return value.strip().lower() not in ("", "0", "false", "f", "no", "n", "off")

    parser.add_argument(
        '--outfile',
        dest='outfile',
        default=None,
        help='Use this option to generate all possible output files. '
             'Argument here is the root filename of the output files.')

    # FIXME: Need to add an extra check for this; it can only be used if --useLDA flag is set.
    parser.add_argument(
        '--m5Cgff',
        dest='m5Cgff',
        default=None,
        help='Name of output GFF file containing m5C scores')

    # FIXME: Make sure that this is specified if --useLDA flag is set.
    # NOTE: the flag is spelled '--m5Cclassifer' (sic) while dest is
    # 'm5Cclassifier'; the spelling is kept for command-line compatibility.
    parser.add_argument(
        '--m5Cclassifer',
        dest='m5Cclassifier',
        default=None,
        help='Specify csv file containing a 127 x 2 matrix')

    parser.add_argument(
        '--csv_h5',
        dest='csv_h5',
        default=None,
        help='Name of csv output to be written in hdf5 format.')

    parser.add_argument(
        '--pickle',
        dest='pickle',
        default=None,
        help='Name of output pickle file.')

    parser.add_argument(
        '--summary_h5',
        dest='summary_h5',
        default=None,
        help='Name of output summary h5 file.')

    parser.add_argument(
        '--ms_csv',
        dest='ms_csv',
        default=None,
        help='Multisite detection CSV file.')

    # Calculation options:
    parser.add_argument(
        '--control',
        dest='control',
        default=None,
        type=validateNoneOrFile,
        help='cmph.h5 file containing a control sample. '
             'Tool will perform a case-control analysis')

    # Temporary addition to test LDA for Ca5C detection:
    parser.add_argument(
        '--useLDA',
        action="store_true",
        dest='useLDA',
        default=False,
        help='Set this flag to debug LDA for m5C/Ca5C detection')

    # Parameter options:
    parser.add_argument(
        '--paramsPath',
        dest='paramsPath',
        default=_getResourcePath(),
        type=validateNoneOrDir,
        help='Directory containing in-silico trained model for each chemistry')

    parser.add_argument(
        '--minCoverage',
        dest='minCoverage',
        default=3,
        type=int,
        help='Minimum coverage required to call a modified base')

    parser.add_argument(
        '--maxQueueSize',
        dest='maxQueueSize',
        default=20,
        type=int,
        help='Max Queue Size')

    parser.add_argument(
        '--maxCoverage',
        dest='maxCoverage',
        type=int,
        default=-1,
        help='Maximum coverage to use at each site')

    parser.add_argument(
        '--mapQvThreshold',
        dest='mapQvThreshold',
        type=float,
        default=-1.0)

    parser.add_argument(
        '--ipdModel',
        dest='ipdModel',
        default=None,
        help='Alternate synthetic IPD model HDF5 file')

    parser.add_argument(
        '--modelIters',
        dest='modelIters',
        type=int,
        default=-1,
        help='[Internal] Number of GBM model iteration to use')

    parser.add_argument(
        '--cap_percentile',
        dest='cap_percentile',
        type=float,
        default=99.0,
        help='Global IPD percentile to cap IPDs at')

    parser.add_argument(
        "--methylMinCov",
        type=int,
        dest='methylMinCov',
        default=10,
        help="Do not try to estimate methylFraction unless coverage is at least this.")

    parser.add_argument(
        "--identifyMinCov",
        type=int,
        dest='identifyMinCov',
        default=5,
        help="Do not try to identify the modification type unless coverage is at least this.")

    parser.add_argument(
        "--maxAlignments",
        type=int,
        dest="maxAlignments",
        default=1500,
        help="Maximum number of alignments to use for a given window")

    # Computation management options:
    parser.add_argument(
        "-w", "--referenceWindow", "--referenceWindows",
        "--refContigs",  # backwards compatibility
        type=str,
        dest='referenceWindowsAsString',
        default=None,
        help="The window (or multiple comma-delimited windows) of the reference to "
             "be processed, in the format refGroup[:refStart-refEnd] "
             "(default: entire reference).")

    parser.add_argument(
        "--refContigIndex",
        type=int,
        dest='refContigIndex',
        default=-1,
        help="For debugging purposes only - rather than enter a reference contig name, simply enter an index")

    # -W writes to the same dest as -w: the file contents are turned into the
    # equivalent comma-delimited window string.
    parser.add_argument(
        "-W", "--referenceWindowsFile",
        "--refContigsFile",  # backwards compatibility
        type=slurpWindowFile,
        dest='referenceWindowsAsString',
        default=None,
        help="A file containing reference window designations, one per line")

    parser.add_argument(
        "--skipUnrecognizedContigs",
        type=parseBoolean,
        default=False,
        help="Whether to skip, or abort, unrecognized contigs in the -w/-W flags")

    # FIXME shouldn't it always do this?
    parser.add_argument(
        "--alignmentSetRefWindows",
        action="store_true",
        dest="referenceWindowsFromAlignment",
        help="Use refWindows in dataset")

    # Debugging help options:
    parser.add_argument(
        "--threaded", "-T",
        action="store_true",
        dest="threaded",
        default=False,
        help="Run threads instead of processes (for debugging purposes only)")

    parser.add_argument(
        "--profile",
        action="store_true",
        dest="doProfiling",
        default=False,
        help="Enable Python-level profiling (using cProfile).")

    add_debug_option(parser)

    parser.add_argument(
        "--seed",
        action="store",
        dest="randomSeed",
        type=int,
        default=None,
        help="Random seed (for development and debugging purposes only)")

    parser.add_argument(
        "--referenceStride",
        action="store",
        type=int,
        default=1000,
        help="Size of reference window in internal "
             "parallelization. For testing purposes only.")

    return parser
def add_options_to_argument_parser(parser):
    """Register the variant-caller command-line options on ``parser``.

    Options are organised into argument groups (basic, parallelism, output
    filtering, read selection, algorithm, debugging, advanced) and the same
    ``parser`` object is returned.

    Changes relative to the previous implementation:

    * ``--readStratum`` parsing builds a real list.  ``len()`` of a ``map``
      object raises ``TypeError`` on Python 3, which made argparse reject
      every value.  Malformed input now raises ``ValueError`` instead of
      tripping an ``assert``, so argparse reports a normal usage error and
      the check survives ``python -O``.
    * The window file given to ``-W`` is closed after it has been read.
    * The ``-o`` help text gained the missing space between its two
      sentences.
    """

    def canonicalizedFilePath(path):
        # Expand "~" and make the path absolute.
        return os.path.abspath(os.path.expanduser(path))

    def slurpWindowFile(fname):
        # Collapse a one-window-per-line file into the comma-delimited form
        # accepted by -w; the context manager guarantees the handle is closed.
        with open(fname) as handle:
            return ",".join(line.strip() for line in handle)

    def parseReadStratum(s):
        # int() raises ValueError for non-numeric fields; argparse converts a
        # ValueError from a type callable into a usage error.
        rs = [int(field) for field in s.split("/")]
        if len(rs) != 2 or not rs[0] < rs[1]:
            raise ValueError("expected 'n/N' with n < N, got %r" % (s,))
        return rs

    basics = parser.add_argument_group("Basic required options")
    basics.add_argument(
        "inputFilename",
        type=canonicalizedFilePath,
        help="The input cmp.h5 or BAM alignment file")
    basics.add_argument(
        "--referenceFilename", "--reference", "-r",
        action="store",
        dest="referenceFilename",
        type=canonicalizedFilePath,
        required=True,
        help="The filename of the reference FASTA file")
    basics.add_argument(
        "-o", "--outputFilename",
        dest="outputFilenames",
        required=True,
        type=str,
        action="append",
        default=[],
        help="The output filename(s), as a comma-separated list. "
             "Valid output formats are .fa/.fasta, .fq/.fastq, .gff, .vcf")

    parallelism = parser.add_argument_group("Parallelism")
    parallelism.add_argument(
        "-j", "--numWorkers",
        dest="numWorkers",
        type=int,
        default=1,
        help="The number of worker processes to be used")

    filtering = parser.add_argument_group("Output filtering")
    filtering.add_argument(
        "--minConfidence", "-q",
        action="store",
        dest="minConfidence",
        type=int,
        default=Constants.DEFAULT_MIN_CONFIDENCE,
        help="The minimum confidence for a variant call to be output to variants.{gff,vcf}")
    filtering.add_argument(
        "--minCoverage", "-x",
        action="store",
        dest="minCoverage",
        default=Constants.DEFAULT_MIN_COVERAGE,
        type=int,
        help="The minimum site coverage that must be achieved for variant calls and "
             "consensus to be calculated for a site.")
    filtering.add_argument(
        "--noEvidenceConsensusCall",
        action="store",
        choices=["nocall", "reference", "lowercasereference"],
        default="lowercasereference",
        help="The consensus base that will be output for sites with no effective coverage.")

    readSelection = parser.add_argument_group("Read selection/filtering")
    readSelection.add_argument(
        "--coverage", "-X",
        action="store",
        dest="coverage",
        type=int,
        default=Constants.DEFAULT_MAX_COVERAGE,
        help="A designation of the maximum coverage level to be used for analysis."
             " Exact interpretation is algorithm-specific.")
    readSelection.add_argument(
        "--minMapQV", "-m",
        action="store",
        dest="minMapQV",
        type=float,
        default=Constants.DEFAULT_MIN_MAPQV,
        help="The minimum MapQV for reads that will be used for analysis.")
    # Since the reference isn't loaded at options processing time, we
    # can't grok the referenceWindow specified until later.  We store
    # it as a string (referenceWindowsAsString) and it will later be
    # interpreted and stored as a proper window tuple (referenceWindow)
    readSelection.add_argument(
        "--referenceWindow", "--referenceWindows", "-w",
        action="store",
        dest="referenceWindowsAsString",
        type=str,
        help="The window (or multiple comma-delimited windows) of the reference to "
             "be processed, in the format refGroup:refStart-refEnd "
             "(default: entire reference).",
        default=None)
    readSelection.add_argument(
        "--alignmentSetRefWindows",
        action="store_true",
        dest="referenceWindowsFromAlignment",
        help="The window (or multiple comma-delimited windows) of the reference to "
             "be processed, in the format refGroup:refStart-refEnd "
             "will be pulled from the alignment file.",
        default=False)
    # -W writes to the same dest as -w: the file contents are turned into the
    # equivalent comma-delimited window string.
    readSelection.add_argument(
        "--referenceWindowsFile", "-W",
        action="store",
        dest="referenceWindowsAsString",
        type=slurpWindowFile,
        help="A file containing reference window designations, one per line",
        default=None)
    readSelection.add_argument(
        "--barcode",
        type=str,
        dest="_barcode",
        help="Only process reads with the given barcode name.")
    readSelection.add_argument(
        "--readStratum",
        help="A string of the form 'n/N', where n, and N are integers, 0 <= n < N, designating"
             " that the reads are to be deterministically split into N strata of roughly even"
             " size, and stratum n is to be used for variant and consensus calling. This is"
             " mostly useful for Quiver development.",
        dest="readStratum",
        default=None,
        type=parseReadStratum)
    readSelection.add_argument(
        "--minReadScore",
        action="store",
        dest="minReadScore",
        type=float,
        default=Constants.DEFAULT_MIN_READSCORE,
        help="The minimum ReadScore for reads that will be used for analysis (arrow-only).")
    readSelection.add_argument(
        "--minSnr",
        action="store",
        dest="minHqRegionSnr",
        type=float,
        default=Constants.DEFAULT_MIN_HQREGIONSNR,
        help="The minimum acceptable signal-to-noise over all channels for reads that will be used for analysis (arrow-only).")
    readSelection.add_argument(
        "--minZScore",
        action="store",
        dest="minZScore",
        type=float,
        default=Constants.DEFAULT_MIN_ZSCORE,
        help="The minimum acceptable z-score for reads that will be used for analysis (arrow-only).")
    readSelection.add_argument(
        "--minAccuracy",
        action="store",
        dest="minAccuracy",
        type=float,
        default=Constants.DEFAULT_MIN_ACCURACY,
        help="The minimum acceptable window-global alignment accuracy for reads that will be used for the analysis (arrow-only).")

    algorithm = parser.add_argument_group("Algorithm and parameter settings")
    algorithm.add_argument(
        "--algorithm",
        action="store",
        dest="algorithm",
        type=str,
        choices=Constants.ALGORITHM_CHOICES,
        default=Constants.DEFAULT_ALGORITHM)
    algorithm.add_argument(
        "--parametersFile", "-P",
        dest="parametersFile",
        type=str,
        default=None,
        help="Parameter set filename (such as ArrowParameters.json or "
             "QuiverParameters.ini), or directory D such that either "
             "D/*/GenomicConsensus/QuiverParameters.ini, "
             "or D/GenomicConsensus/QuiverParameters.ini, is found. In the "
             "former case, the lexically largest path is chosen.")
    algorithm.add_argument(
        "--parametersSpec", "-p",
        action="store",
        dest="parametersSpec",
        type=str,
        default="auto",
        help="Name of parameter set (chemistry.model) to select from the "
             "parameters file, or just the name of the chemistry, in which "
             "case the best available model is chosen. Default is 'auto', "
             "which selects the best parameter set from the alignment data")
    algorithm.add_argument(
        "--maskRadius",
        dest="maskRadius",
        type=int,
        default=Constants.DEFAULT_MASK_RADIUS,
        help="Radius of window to use when excluding local regions for "
             "exceeding maskMinErrorRate, where 0 disables any filtering (arrow-only).")
    algorithm.add_argument(
        "--maskErrorRate",
        dest="maskErrorRate",
        type=float,
        default=Constants.DEFAULT_MASK_ERROR_RATE,
        help="Maximum local error rate before the local region defined by "
             "maskRadius is excluded from polishing (arrow-only).")

    debugging = parser.add_argument_group("Verbosity and debugging/profiling")
    add_debug_option(debugging)
    debugging.add_argument(
        "--notrace",
        action="store_true",
        dest="notrace",
        default=False,
        help="Suppress stacktrace for exceptions (to simplify testing)")
    debugging.add_argument(
        "--pdbAtStartup",
        action="store_true",
        dest="pdbAtStartup",
        default=False,
        help="Drop into Python debugger at startup (requires ipdb)")
    debugging.add_argument(
        "--profile",
        action="store_true",
        dest="doProfiling",
        default=False,
        help="Enable Python-level profiling (using cProfile).")
    debugging.add_argument(
        "--annotateGFF",
        action="store_true",
        help="Augment GFF variant records with additional information")
    debugging.add_argument(
        "--reportEffectiveCoverage",
        action="store_true",
        help="Additionally record the *post-filtering* coverage at variant sites")

    advanced = parser.add_argument_group("Advanced configuration options")
    advanced.add_argument(
        "--diploid",
        action="store_true",
        help="Enable detection of heterozygous variants (experimental)")
    advanced.add_argument(
        "--queueSize", "-Q",
        action="store",
        dest="queueSize",
        type=int,
        default=200)
    advanced.add_argument(
        "--threaded", "-T",
        action="store_true",
        dest="threaded",
        default=False,
        help="Run threads instead of processes (for debugging purposes only)")
    advanced.add_argument(
        "--referenceChunkSize", "-C",
        action="store",
        dest="referenceChunkSize",
        type=int,
        default=500)
    # fancyChunking defaults to True; --simpleChunking is the switch that
    # clears the same dest.
    advanced.add_argument(
        "--fancyChunking",
        default=True,
        action="store_true",
        help="Adaptive reference chunking designed to handle coverage cutouts better")
    advanced.add_argument(
        "--simpleChunking",
        dest="fancyChunking",
        action="store_false",
        help="Disable adaptive reference chunking")
    advanced.add_argument(
        "--referenceChunkOverlap",
        action="store",
        dest="referenceChunkOverlap",
        type=int,
        default=5)
    advanced.add_argument(
        "--autoDisableHdf5ChunkCache",
        action="store",
        type=int,
        default=500,
        help="Disable the HDF5 chunk cache when the number of datasets in the cmp.h5 "
             "exceeds the given threshold")
    advanced.add_argument(
        "--aligner", "-a",
        action="store",
        choices=["affine", "simple"],
        default="affine",
        help="The pairwise alignment algorithm that will be used to produce variant calls"
             " from the consensus (Quiver only).")
    advanced.add_argument(
        "--refineDinucleotideRepeats",
        dest="refineDinucleotideRepeats",
        action="store_true",
        help="Require quiver maximum likelihood search to try one less/more repeat copy in"
             " dinucleotide repeats, which seem to be the most frequent cause of suboptimal"
             " convergence (getting trapped in local optimum) (Quiver only)")
    advanced.add_argument(
        "--noRefineDinucleotideRepeats",
        dest="refineDinucleotideRepeats",
        action="store_false",
        help="Disable dinucleotide refinement")
    # Both flags above share one dest; make "enabled" the effective default.
    advanced.set_defaults(refineDinucleotideRepeats=True)
    advanced.add_argument(
        "--fast",
        dest="fastMode",
        action="store_true",
        help="Cut some corners to run faster. Unsupported!")
    advanced.add_argument(
        "--skipUnrecognizedContigs",
        action="store_true",
        help="Do not abort when told to process a reference window (via -w/--referenceWindow[s]) "
             "that has no aligned coverage. Outputs emptyish files if there are no remaining "
             "non-degenerate windows. Only intended for use by smrtpipe scatter/gather.")

    return parser
def constructOptionParser(parser, C=Constants, ccs_mode=False):
    """
    Add PBAlignRunner arguments to the parser.

    ``parser`` is the wrapper object exposing ``tool_contract_parser`` and
    ``arg_parser.parser``.  Command-line options are registered on the
    wrapped argparse parser; a subset is mirrored on the tool contract
    parser using ids taken from ``C``.  When ``ccs_mode`` is true, the
    ``noSplitSubreads`` and ``concordant`` tool-contract booleans are not
    registered.

    Returns the wrapped argparse parser (not the wrapper itself).
    """
    tcp = parser.tool_contract_parser
    arg_parser = parser.arg_parser.parser

    arg_parser.formatter_class = argparse.ArgumentDefaultsHelpFormatter
    add_debug_option(arg_parser)

    # ---- Optional input -------------------------------------------------
    optional_inputs = arg_parser.add_argument_group("Optional input arguments")
    optional_inputs.add_argument(
        "--regionTable",
        action="store",
        dest="regionTable",
        type=str,
        default=None,
        help="Specify a region table for filtering reads.")
    optional_inputs.add_argument(
        "--configFile",
        action="store",
        dest="configFile",
        type=str,
        default=None,
        help="Specify a set of user-defined argument values.")
    optional_inputs.add_argument(
        "--pulseFile",
        action="store",
        dest="pulseFile",
        type=str,
        default=None,
        help=("When input reads are in fasta format and output is a cmp.h5\n"
              "this option can specify pls.h5 or bas.h5 or \n"
              "FOFN files from which pulse metrics can be loaded for Quiver."))

    # ---- Aligner selection and aligner options --------------------------
    alignment = arg_parser.add_argument_group("Alignment options")
    alignment.add_argument(
        "--algorithm",
        action="store",
        dest="algorithm",
        type=str,
        choices=ALGORITHM_CANDIDATES,
        default=ALGORITHM_CANDIDATES[0],
        help="Select an aligorithm from {0}.\n".format(ALGORITHM_CANDIDATES))
    alignment.add_argument(
        "--maxHits",
        action="store",
        dest="maxHits",
        type=int,
        default=None,  # Set as None instead of a real number.
        help=("The maximum number of matches of each read to the \n"
              "reference sequence that will be evaluated."))
    alignment.add_argument(
        "--minAnchorSize",
        action="store",
        dest="minAnchorSize",
        type=int,
        default=None,  # Set as None to avoid conflicts with --algorithmOptions
        help=("The minimum anchor size defines the length of the read\n"
              "that must match against the reference sequence."))

    # Use ccs or not?
    alignment.add_argument(
        "--useccs",
        action="store",
        type=str,
        choices=["useccs", "useccsall", "useccsdenovo"],
        default=None,
        help=("Map the ccsSequence to the genome first, then align\n"
              "subreads to the interval that the CCS reads mapped to.\n"
              " useccs: only maps subreads that span the length of\n"
              " the template.\n"
              " useccsall: maps all subreads.\n"
              " useccsdenovo: maps ccs only."))

    # The same text serves as CLI help and as tool-contract description.
    no_split_help = ("Do not split reads into subreads even if subread \n"
                     "regions are available.")
    alignment.add_argument(
        "--noSplitSubreads",
        action="store_true",
        dest="noSplitSubreads",
        default=DEFAULT_OPTIONS["noSplitSubreads"],
        help=no_split_help)
    if not ccs_mode:
        tcp.add_boolean(
            C.NO_SPLIT_ID,
            "noSplitSubreads",
            default=DEFAULT_OPTIONS["noSplitSubreads"],
            name="Align unsplit polymerase reads",
            description=no_split_help)

    alignment.add_argument(
        "--concordant",
        action="store_true",
        dest="concordant",
        default=DEFAULT_OPTIONS["concordant"],
        help="Map subreads of a ZMW to the same genomic location.\n")
    if not ccs_mode:
        tcp.add_boolean(
            C.CONCORDANT_ID,
            "concordant",
            default=DEFAULT_OPTIONS["concordant"],
            name="Concordant alignment",
            description="Map subreads of a ZMW to the same genomic location")

    alignment.add_argument(
        "--nproc",
        action="store",
        dest="nproc",
        type=int,
        default=DEFAULT_OPTIONS["nproc"],
        help="Number of threads.")
    alignment.add_argument(
        "--algorithmOptions",
        action="append",
        dest="algorithmOptions",
        type=str,
        default=None,
        help="Pass alignment options through.")
    # XXX the arguments used in SMRTpipe 2.3 are different from the defaults
    # for the command line tool
    tcp.add_str(
        C.ALGORITHM_OPTIONS_ID,
        "algorithmOptions",
        default=C.ALGORITHM_OPTIONS_DEFAULT,
        name="Algorithm options",
        description="List of space-separated arguments passed to BLASR")

    # ---- Filtering criteria and hit policy ------------------------------
    filters = arg_parser.add_argument_group("Filter criteria options")
    filters.add_argument(
        "--maxDivergence",
        action="store",
        dest="maxDivergence",
        type=float,
        default=DEFAULT_OPTIONS["maxDivergence"],
        help=("The maximum allowed percentage divergence of a read \n"
              "from the reference sequence."))

    filters.add_argument(
        "--minAccuracy",
        action="store",
        dest="minAccuracy",
        type=float,
        default=DEFAULT_OPTIONS["minAccuracy"],
        help=("The minimum concordance of alignments that\n"
              "will be evaluated."))
    tcp.add_float(
        C.MIN_ACCURACY_ID,
        "minAccuracy",
        default=DEFAULT_OPTIONS["minAccuracy"],
        name="Min. concordance",
        description="Minimum required alignment concordance")

    filters.add_argument(
        "--minLength",
        action="store",
        dest="minLength",
        type=int,
        default=DEFAULT_OPTIONS["minLength"],
        help=("The minimum aligned read length of alignments that\n"
              "will be evaluated."))
    tcp.add_int(
        C.MIN_LENGTH_ID,
        "minLength",
        default=DEFAULT_OPTIONS["minLength"],
        name="Min. length",
        description="Minimum required alignment length")

    # NOTE: the --scoreFunction and --scoreMatrix options are not registered
    # here; their definitions are disabled.

    filters.add_argument(
        "--scoreCutoff",
        action="store",
        dest="scoreCutoff",
        type=int,
        default=None,
        help="The worst score to output an alignment.\n")

    # Shared between the CLI help and the tool-contract description.
    hit_policy_help = ("Specify a policy for how to treat multiple hit\n"
                       " random : selects a random hit.\n"
                       " all : selects all hits.\n"
                       " allbest : selects all the best score hits.\n"
                       " randombest: selects a random hit from all best score hits.\n"
                       " leftmost : selects a hit which has the best score and the\n"
                       " smallest mapping coordinate in any reference.\n")
    filters.add_argument(
        "--hitPolicy",
        action="store",
        dest="hitPolicy",
        type=str,
        choices=HITPOLICY_CANDIDATES,
        default=DEFAULT_OPTIONS["hitPolicy"],
        help=hit_policy_help)
    tcp.add_str(
        C.HIT_POLICY_ID,
        "hitPolicy",
        default=DEFAULT_OPTIONS["hitPolicy"],
        name="Hit policy",
        description=hit_policy_help)

    filters.add_argument(
        "--filterAdapterOnly",
        action="store_true",
        dest="filterAdapterOnly",
        default=DEFAULT_OPTIONS["filterAdapterOnly"],
        help=("If specified, do not report adapter-only hits using\n"
              "annotations with the reference entry."))

    # ---- Output ---------------------------------------------------------
    # CMP H5 output has been deprecated, let's hide associated options.
    # Every option in this group is registered with help=argparse.SUPPRESS;
    # the intended meaning of each one is kept as a comment only.
    cmp_h5 = arg_parser.add_argument_group("Options for cmp.h5")

    # Specify the ReadType attribute in the cmp.h5 output.
    cmp_h5.add_argument(
        "--readType",
        action="store",
        dest="readType",
        type=str,
        default=DEFAULT_OPTIONS["readType"],
        help=argparse.SUPPRESS)

    # The output cmp.h5 file which will be sorted, loaded with pulse QV
    # information, and repacked, so that it can be consumed by quiver
    # directly. This requires the input file to be in PacBio bas/pls.h5
    # format, and --useccs must be None.
    cmp_h5.add_argument(
        "--forQuiver",
        action="store_true",
        dest="forQuiver",
        default=DEFAULT_OPTIONS["forQuiver"],
        help=argparse.SUPPRESS)

    # Similar to --forQuiver, the only difference is that --useccs can be
    # specified.
    cmp_h5.add_argument(
        "--loadQVs",
        action="store_true",
        dest="loadQVs",
        default=DEFAULT_OPTIONS["loadQVs"],
        help=argparse.SUPPRESS)

    # Load pulse information using -byread option instead of -bymetric.
    # Only works when --forQuiver or --loadQVs are set.
    cmp_h5.add_argument(
        "--byread",
        action="store_true",
        dest="byread",
        default=DEFAULT_OPTIONS["byread"],
        help=argparse.SUPPRESS)

    # Load the specified (comma-delimited list of) metrics instead of the
    # default metrics required by quiver. This option only works when
    # --forQuiver or --loadQVs are set.
    cmp_h5.add_argument(
        "--metrics",
        action="store",
        dest="metrics",
        type=str,
        default=DEFAULT_OPTIONS["metrics"],
        help=argparse.SUPPRESS)

    # ---- Miscellaneous --------------------------------------------------
    misc = arg_parser.add_argument_group("Miscellaneous options")
    misc.add_argument(
        "--unaligned",
        action="store",
        dest="unaligned",
        type=str,
        default=DEFAULT_OPTIONS["unaligned"],
        help="Output names of unaligned reads to specified file.")
    misc.add_argument(
        "--seed",
        action="store",
        dest="seed",
        type=int,
        default=DEFAULT_OPTIONS["seed"],
        help=("Initialize the random number generator with a none-zero \n"
              "integer. Zero means that current system time is used.\n"))
    misc.add_argument(
        "--tmpDir",
        action="store",
        dest="tmpDir",
        type=str,
        default=DEFAULT_OPTIONS["tmpDir"],
        help="Specify a directory for saving temporary files.\n")
    # Keep all temporary & intermediate files.
    misc.add_argument(
        "--keepTmpFiles",
        action="store_true",
        dest="keepTmpFiles",
        default=False,
        help=argparse.SUPPRESS)

    return arg_parser
def _f(p):
    """Add the debug option to ``p``, then apply the gather options.

    ``output_file_msg``, ``input_file_msg``, ``validate_file`` and
    ``chunk_key_func`` are not defined in this function -- presumably they
    are supplied by an enclosing scope (TODO confirm).  The callable built by
    ``__gather_options`` is invoked on ``p`` and its result is returned.
    """
    add_debug_option(p)
    add_gather_options = __gather_options(
        output_file_msg,
        input_file_msg,
        validate_file,
        chunk_key_func,
    )
    return add_gather_options(p)
def _get_more_options(parser):
    """
    Advanced options that won't be exposed via tool contract interface.

    Registers the output-file, calculation, parameter, computation-management
    and debugging options on ``parser`` (an argparse-style object providing
    ``add_argument``) and returns the same ``parser``.

    Changes relative to the previous implementation:

    * ``--skipUnrecognizedContigs`` no longer uses ``type=bool``.
      ``bool("False")`` is ``True``, so the option could never be turned off
      by value; explicit false spellings are now honoured while every other
      value keeps evaluating to ``True``.
    * The window file given to ``-W`` is closed after it has been read.
    """

    def slurpWindowFile(fname):
        # Collapse a one-window-per-line file into the comma-delimited form
        # accepted by -w; the context manager guarantees the handle is closed.
        with open(fname) as handle:
            return ",".join(line.strip() for line in handle)

    def parseBoolean(value):
        # Only the spellings below mean "false"; anything else stays truthy,
        # matching what bool() did for every non-empty string.
        return value.strip().lower() not in ("", "0", "false", "f", "no", "n", "off")

    parser.add_argument(
        '--outfile',
        dest='outfile',
        default=None,
        help='Use this option to generate all possible output files. '
             'Argument here is the root filename of the output files.')

    # FIXME: Need to add an extra check for this; it can only be used if --useLDA flag is set.
    parser.add_argument(
        '--m5Cgff',
        dest='m5Cgff',
        default=None,
        help='Name of output GFF file containing m5C scores')

    # FIXME: Make sure that this is specified if --useLDA flag is set.
    # NOTE: the flag is spelled '--m5Cclassifer' (sic) while dest is
    # 'm5Cclassifier'; the spelling is kept for command-line compatibility.
    parser.add_argument(
        '--m5Cclassifer',
        dest='m5Cclassifier',
        default=None,
        help='Specify csv file containing a 127 x 2 matrix')

    parser.add_argument(
        '--csv_h5',
        dest='csv_h5',
        default=None,
        help='Name of csv output to be written in hdf5 format.')

    parser.add_argument(
        '--pickle',
        dest='pickle',
        default=None,
        help='Name of output pickle file.')

    parser.add_argument(
        '--summary_h5',
        dest='summary_h5',
        default=None,
        help='Name of output summary h5 file.')

    parser.add_argument(
        '--ms_csv',
        dest='ms_csv',
        default=None,
        help='Multisite detection CSV file.')

    # Calculation options:
    parser.add_argument(
        '--control',
        dest='control',
        default=None,
        type=validateNoneOrFile,
        help='cmph.h5 file containing a control sample. '
             'Tool will perform a case-control analysis')

    # Temporary addition to test LDA for Ca5C detection:
    parser.add_argument(
        '--useLDA',
        action="store_true",
        dest='useLDA',
        default=False,
        help='Set this flag to debug LDA for m5C/Ca5C detection')

    # Parameter options:
    # The default path spec is computed once so that the option default and
    # the text embedded in its help message cannot disagree.
    defaultParamsPathSpec = _getResourcePathSpec()
    parser.add_argument(
        '--paramsPath',
        dest='paramsPath',
        default=defaultParamsPathSpec,
        type=validateNoneOrPathSpec,
        help='List of :-delimited directory paths containing in-silico trained models (default is "%s")'
             % defaultParamsPathSpec)

    parser.add_argument(
        '--minCoverage',
        dest='minCoverage',
        default=3,
        type=int,
        help='Minimum coverage required to call a modified base')

    parser.add_argument(
        '--maxQueueSize',
        dest='maxQueueSize',
        default=20,
        type=int,
        help='Max Queue Size')

    parser.add_argument(
        '--maxCoverage',
        dest='maxCoverage',
        type=int,
        default=-1,
        help='Maximum coverage to use at each site')

    parser.add_argument(
        '--mapQvThreshold',
        dest='mapQvThreshold',
        type=float,
        default=-1.0)

    parser.add_argument(
        '--ipdModel',
        dest='ipdModel',
        default=None,
        type=validateNoneOrFile,
        help='Alternate synthetic IPD model HDF5 file')

    parser.add_argument(
        '--modelIters',
        dest='modelIters',
        type=int,
        default=-1,
        help='[Internal] Number of GBM model iteration to use')

    parser.add_argument(
        '--cap_percentile',
        dest='cap_percentile',
        type=float,
        default=99.0,
        help='Global IPD percentile to cap IPDs at')

    parser.add_argument(
        "--methylMinCov",
        type=int,
        dest='methylMinCov',
        default=10,
        help="Do not try to estimate methylFraction unless coverage is at least this.")

    parser.add_argument(
        "--identifyMinCov",
        type=int,
        dest='identifyMinCov',
        default=5,
        help="Do not try to identify the modification type unless coverage is at least this.")

    parser.add_argument(
        "--maxAlignments",
        type=int,
        dest="maxAlignments",
        default=1500,
        help="Maximum number of alignments to use for a given window")

    # Computation management options:
    parser.add_argument(
        "-w", "--referenceWindow", "--referenceWindows",
        "--refContigs",  # backwards compatibility
        type=str,
        dest='referenceWindowsAsString',
        default=None,
        help="The window (or multiple comma-delimited windows) of the reference to "
             "be processed, in the format refGroup[:refStart-refEnd] "
             "(default: entire reference).")

    parser.add_argument(
        "--refContigIndex",
        type=int,
        dest='refContigIndex',
        default=-1,
        help="For debugging purposes only - rather than enter a reference contig name, simply enter an index")

    # -W writes to the same dest as -w: the file contents are turned into the
    # equivalent comma-delimited window string.
    parser.add_argument(
        "-W", "--referenceWindowsFile",
        "--refContigsFile",  # backwards compatibility
        type=slurpWindowFile,
        dest='referenceWindowsAsString',
        default=None,
        help="A file containing reference window designations, one per line")

    parser.add_argument(
        "--skipUnrecognizedContigs",
        type=parseBoolean,
        default=False,
        help="Whether to skip, or abort, unrecognized contigs in the -w/-W flags")

    # FIXME shouldn't it always do this?
    parser.add_argument(
        "--alignmentSetRefWindows",
        action="store_true",
        dest="referenceWindowsFromAlignment",
        help="Use refWindows in dataset")

    # Debugging help options:
    parser.add_argument(
        "--threaded", "-T",
        action="store_true",
        dest="threaded",
        default=False,
        help="Run threads instead of processes (for debugging purposes only)")

    parser.add_argument(
        "--profile",
        action="store_true",
        dest="doProfiling",
        default=False,
        help="Enable Python-level profiling (using cProfile).")

    add_debug_option(parser)

    parser.add_argument(
        "--seed",
        action="store",
        dest="randomSeed",
        type=int,
        default=None,
        help="Random seed (for development and debugging purposes only)")

    parser.add_argument(
        "--referenceStride",
        action="store",
        type=int,
        default=1000,
        help="Size of reference window in internal "
             "parallelization. For testing purposes only.")

    return parser
def constructOptionParser(parser, C=Constants, ccs_mode=False):
    """
    Register the PBAlignRunner command-line options on a parser wrapper.

    Every option is added to the underlying argument parser
    (``parser.arg_parser.parser``).  A subset is additionally declared on
    ``parser.tool_contract_parser``.

    :param parser: wrapper object exposing ``tool_contract_parser`` and
        ``arg_parser.parser`` (presumably a PbParser -- confirm with caller).
    :param C: namespace supplying the tool-contract option ids and
        ``ALGORITHM_OPTIONS_DEFAULT``.
    :param ccs_mode: when true, the ``noSplitSubreads`` and ``concordant``
        tool-contract booleans are not declared.
    :return: the underlying argument parser with all options registered.
    """
    # Keep the wrapper's two faces apart: tool contract vs. plain argparse.
    contract = parser.tool_contract_parser
    arg_parser = parser.arg_parser.parser
    # arg_parser.argument_default = argparse.SUPPRESS  (left disabled)
    arg_parser.formatter_class = argparse.ArgumentDefaultsHelpFormatter
    add_debug_option(arg_parser)

    # ------------------------------------------------------------------
    # Optional input.
    # ------------------------------------------------------------------
    optional_inputs = arg_parser.add_argument_group("Optional input arguments")
    optional_inputs.add_argument(
        "--regionTable",
        dest="regionTable",
        type=str,
        default=None,
        action="store",
        help="Specify a region table for filtering reads.")
    optional_inputs.add_argument(
        "--configFile",
        dest="configFile",
        type=str,
        default=None,
        action="store",
        help="Specify a set of user-defined argument values.")
    pulse_help = (
        "When input reads are in fasta format and output is a cmp.h5\n"
        "this option can specify pls.h5 or bas.h5 or \n"
        "FOFN files from which pulse metrics can be loaded for Quiver.")
    optional_inputs.add_argument(
        "--pulseFile",
        dest="pulseFile",
        type=str,
        default=None,
        action="store",
        help=pulse_help)

    # ------------------------------------------------------------------
    # Aligner selection and aligner options.
    # ------------------------------------------------------------------
    alignment = arg_parser.add_argument_group("Alignment options")
    algorithm_help = "Select an aligorithm from {0}.\n".format(
        ALGORITHM_CANDIDATES)
    alignment.add_argument(
        "--algorithm",
        dest="algorithm",
        type=str,
        action="store",
        choices=ALGORITHM_CANDIDATES,
        default=ALGORITHM_CANDIDATES[0],
        help=algorithm_help)

    max_hits_help = (
        "The maximum number of matches of each read to the \n"
        "reference sequence that will be evaluated.")
    alignment.add_argument(
        "--maxHits",
        dest="maxHits",
        type=int,
        default=None,  # Deliberately None rather than a real number.
        action="store",
        help=max_hits_help)

    min_anchor_help = (
        "The minimum anchor size defines the length of the read\n"
        "that must match against the reference sequence.")
    alignment.add_argument(
        "--minAnchorSize",
        dest="minAnchorSize",
        type=int,
        default=None,  # None avoids conflicts with --algorithmOptions.
        action="store",
        help=min_anchor_help)

    # Whether (and how) CCS reads are used.
    useccs_help = (
        "Map the ccsSequence to the genome first, then align\n"
        "subreads to the interval that the CCS reads mapped to.\n"
        " useccs: only maps subreads that span the length of\n"
        " the template.\n"
        " useccsall: maps all subreads.\n"
        " useccsdenovo: maps ccs only.")
    alignment.add_argument(
        "--useccs",
        type=str,
        choices=["useccs", "useccsall", "useccsdenovo"],
        action="store",
        default=None,
        help=useccs_help)

    no_split_help = (
        "Do not split reads into subreads even if subread \n"
        "regions are available.")
    alignment.add_argument(
        "--noSplitSubreads",
        dest="noSplitSubreads",
        default=DEFAULT_OPTIONS["noSplitSubreads"],
        action="store_true",
        help=no_split_help)
    if not ccs_mode:
        contract.add_boolean(
            C.NO_SPLIT_ID,
            "noSplitSubreads",
            default=DEFAULT_OPTIONS["noSplitSubreads"],
            name="Align unsplit polymerase reads",
            description=no_split_help)

    concordant_help = "Map subreads of a ZMW to the same genomic location.\n"
    alignment.add_argument(
        "--concordant",
        dest="concordant",
        default=DEFAULT_OPTIONS["concordant"],
        action="store_true",
        help=concordant_help)
    if not ccs_mode:
        contract.add_boolean(
            C.CONCORDANT_ID,
            "concordant",
            default=DEFAULT_OPTIONS["concordant"],
            name="Concordant alignment",
            description="Map subreads of a ZMW to the same genomic location")

    nproc_help = "Number of threads."
    alignment.add_argument(
        "--nproc",
        type=int,
        dest="nproc",
        default=DEFAULT_OPTIONS["nproc"],
        action="store",
        help=nproc_help)

    alignment.add_argument(
        "--algorithmOptions",
        type=str,
        dest="algorithmOptions",
        default=None,
        action="append",
        help="Pass alignment options through.")
    # XXX the arguments used in SMRTpipe 2.3 are different from the defaults
    # for the command line tool
    contract.add_str(
        C.ALGORITHM_OPTIONS_ID,
        "algorithmOptions",
        default=C.ALGORITHM_OPTIONS_DEFAULT,
        name="Algorithm options",
        description="List of space-separated arguments passed to BLASR")

    # ------------------------------------------------------------------
    # Filtering criteria and hit policy.
    # ------------------------------------------------------------------
    filters = arg_parser.add_argument_group("Filter criteria options")
    max_divergence_help = (
        "The maximum allowed percentage divergence of a read \n"
        "from the reference sequence.")
    filters.add_argument(
        "--maxDivergence",
        dest="maxDivergence",
        type=float,
        default=DEFAULT_OPTIONS["maxDivergence"],
        action="store",
        help=max_divergence_help)

    min_accuracy_help = (
        "The minimum concordance of alignments that\n"
        "will be evaluated.")
    filters.add_argument(
        "--minAccuracy",
        dest="minAccuracy",
        type=float,
        default=DEFAULT_OPTIONS["minAccuracy"],
        action="store",
        help=min_accuracy_help)
    contract.add_float(
        C.MIN_ACCURACY_ID,
        "minAccuracy",
        default=DEFAULT_OPTIONS["minAccuracy"],
        name="Min. concordance",
        description="Minimum required alignment concordance")

    min_length_help = (
        "The minimum aligned read length of alignments that\n"
        "will be evaluated.")
    filters.add_argument(
        "--minLength",
        dest="minLength",
        type=int,
        default=DEFAULT_OPTIONS["minLength"],
        action="store",
        help=min_length_help)
    contract.add_int(
        C.MIN_LENGTH_ID,
        "minLength",
        default=DEFAULT_OPTIONS["minLength"],
        name="Min. length",
        description="Minimum required alignment length")

    # NOTE: two further options used to be declared at this point and are
    # currently disabled: --scoreFunction (alignerscore / editdist /
    # blasrscore, restricted to SCOREFUNCTION_CANDIDATES) and --scoreMatrix
    # (a user-defined 5x5 ACGTN score matrix given as a quoted,
    # space-separated string).

    filters.add_argument(
        "--scoreCutoff",
        dest="scoreCutoff",
        type=int,
        default=None,
        action="store",
        help="The worst score to output an alignment.\n")

    hit_policy_help = (
        "Specify a policy for how to treat multiple hit\n"
        " random : selects a random hit.\n"
        " all : selects all hits.\n"
        " allbest : selects all the best score hits.\n"
        " randombest: selects a random hit from all best score hits.\n"
        " leftmost : selects a hit which has the best score and the\n"
        " smallest mapping coordinate in any reference.\n")
    filters.add_argument(
        "--hitPolicy",
        dest="hitPolicy",
        type=str,
        choices=HITPOLICY_CANDIDATES,
        default=DEFAULT_OPTIONS["hitPolicy"],
        action="store",
        help=hit_policy_help)
    contract.add_str(
        C.HIT_POLICY_ID,
        "hitPolicy",
        default=DEFAULT_OPTIONS["hitPolicy"],
        name="Hit policy",
        description=hit_policy_help)

    adapter_only_help = (
        "If specified, do not report adapter-only hits using\n"
        "annotations with the reference entry.")
    filters.add_argument(
        "--filterAdapterOnly",
        dest="filterAdapterOnly",
        default=DEFAULT_OPTIONS["filterAdapterOnly"],
        action="store_true",
        help=adapter_only_help)

    # ------------------------------------------------------------------
    # Output.  CMP H5 output has been deprecated, so the options below are
    # registered with help=argparse.SUPPRESS.  The descriptive strings are
    # still built (as before) but are not passed to argparse.
    # ------------------------------------------------------------------
    cmph5_options = arg_parser.add_argument_group("Options for cmp.h5")
    hidden_help = "Specify the ReadType attribute in the cmp.h5 output.\n"
    cmph5_options.add_argument(
        "--readType",
        dest="readType",
        type=str,
        action="store",
        default=DEFAULT_OPTIONS["readType"],
        help=argparse.SUPPRESS)

    hidden_help = (
        "The output cmp.h5 file which will be sorted, loaded\n"
        "with pulse QV information, and repacked, so that it \n"
        "can be consumed by quiver directly. This requires\n"
        "the input file to be in PacBio bas/pls.h5 format,\n"
        "and --useccs must be None.")
    cmph5_options.add_argument(
        "--forQuiver",
        dest="forQuiver",
        action="store_true",
        default=DEFAULT_OPTIONS["forQuiver"],
        help=argparse.SUPPRESS)

    hidden_help = (
        "Similar to --forQuiver, the only difference is that \n"
        "--useccs can be specified.")
    cmph5_options.add_argument(
        "--loadQVs",
        dest="loadQVs",
        action="store_true",
        default=DEFAULT_OPTIONS["loadQVs"],
        help=argparse.SUPPRESS)

    hidden_help = (
        "Load pulse information using -byread option instead\n"
        "of -bymetric. Only works when --forQuiver or \n"
        "--loadQVs are set.")
    cmph5_options.add_argument(
        "--byread",
        dest="byread",
        action="store_true",
        default=DEFAULT_OPTIONS["byread"],
        help=argparse.SUPPRESS)

    hidden_help = (
        "Load the specified (comma-delimited list of) metrics\n"
        "instead of the default metrics required by quiver.\n"
        "This option only works when --forQuiver or \n"
        "--loadQVs are set.")
    cmph5_options.add_argument(
        "--metrics",
        dest="metrics",
        type=str,
        action="store",
        default=DEFAULT_OPTIONS["metrics"],
        help=argparse.SUPPRESS)

    # ------------------------------------------------------------------
    # Miscellaneous.
    # ------------------------------------------------------------------
    miscellaneous = arg_parser.add_argument_group("Miscellaneous options")
    unaligned_help = "Output names of unaligned reads to specified file."
    miscellaneous.add_argument(
        "--unaligned",
        dest="unaligned",
        type=str,
        action="store",
        default=DEFAULT_OPTIONS["unaligned"],
        help=unaligned_help)

    seed_help = (
        "Initialize the random number generator with a none-zero \n"
        "integer. Zero means that current system time is used.\n")
    miscellaneous.add_argument(
        "--seed",
        dest="seed",
        type=int,
        default=DEFAULT_OPTIONS["seed"],
        action="store",
        help=seed_help)

    tmp_dir_help = "Specify a directory for saving temporary files.\n"
    miscellaneous.add_argument(
        "--tmpDir",
        dest="tmpDir",
        type=str,
        action="store",
        default=DEFAULT_OPTIONS["tmpDir"],
        help=tmp_dir_help)

    # Keep all temporary & intermediate files (hidden from --help).
    miscellaneous.add_argument(
        "--keepTmpFiles",
        dest="keepTmpFiles",
        action="store_true",
        default=False,
        help=argparse.SUPPRESS)

    return arg_parser
def add_options_to_argument_parser(parser):
    """
    Register the consensus/variant-calling command-line options on `parser`.

    Options are organised into argument groups (basic, parallelism, output
    filtering, read selection, algorithm, debugging, advanced).

    :param parser: an argparse-style parser providing ``add_argument_group``.
    :return: the same `parser`, with all options added.
    """

    def canonicalizedFilePath(path):
        """Expand a leading ``~`` and return the absolute form of `path`."""
        return os.path.abspath(os.path.expanduser(path))

    basics = parser.add_argument_group("Basic required options")
    basics.add_argument(
        "inputFilename",
        type=canonicalizedFilePath,
        help="The input cmp.h5 or BAM alignment file")
    basics.add_argument(
        "--referenceFilename", "--reference", "-r",
        action="store",
        dest="referenceFilename",
        type=canonicalizedFilePath,
        required=True,
        help="The filename of the reference FASTA file")
    basics.add_argument(
        "-o", "--outputFilename",
        dest="outputFilenames",
        required=True,
        type=str,
        action="append",
        default=[],
        # A separating space was missing between the two sentences.
        help=("The output filename(s), as a comma-separated list. "
              "Valid output formats are .fa/.fasta, .fq/.fastq, .gff, .vcf"))

    parallelism = parser.add_argument_group("Parallelism")
    parallelism.add_argument(
        "-j", "--numWorkers",
        dest="numWorkers",
        type=int,
        default=1,
        help="The number of worker processes to be used")

    filtering = parser.add_argument_group("Output filtering")
    filtering.add_argument(
        "--minConfidence", "-q",
        action="store",
        dest="minConfidence",
        type=int,
        default=Constants.DEFAULT_MIN_CONFIDENCE,
        help="The minimum confidence for a variant call to be output to variants.{gff,vcf}")
    filtering.add_argument(
        "--minCoverage", "-x",
        action="store",
        dest="minCoverage",
        default=Constants.DEFAULT_MIN_COVERAGE,
        type=int,
        help=("The minimum site coverage that must be achieved for variant calls and "
              "consensus to be calculated for a site."))
    filtering.add_argument(
        "--noEvidenceConsensusCall",
        action="store",
        choices=["nocall", "reference", "lowercasereference"],
        default="lowercasereference",
        help="The consensus base that will be output for sites with no effective coverage.")

    readSelection = parser.add_argument_group("Read selection/filtering")
    readSelection.add_argument(
        "--coverage", "-X",
        action="store",
        dest="coverage",
        type=int,
        default=Constants.DEFAULT_MAX_COVERAGE,
        help=("A designation of the maximum coverage level to be used for analysis."
              " Exact interpretation is algorithm-specific."))
    readSelection.add_argument(
        "--minMapQV", "-m",
        action="store",
        dest="minMapQV",
        type=float,
        default=Constants.DEFAULT_MIN_MAPQV,
        help="The minimum MapQV for reads that will be used for analysis.")

    # Since the reference isn't loaded at options processing time, we
    # can't grok the referenceWindow specified until later.  We store
    # it as a string (referenceWindowsAsString) and it will later be
    # interpreted and stored as a proper window tuple (referenceWindow)
    readSelection.add_argument(
        "--referenceWindow", "--referenceWindows", "-w",
        action="store",
        dest="referenceWindowsAsString",
        type=str,
        help=("The window (or multiple comma-delimited windows) of the reference to "
              "be processed, in the format refGroup:refStart-refEnd "
              "(default: entire reference)."),
        default=None)
    readSelection.add_argument(
        "--alignmentSetRefWindows",
        action="store_true",
        dest="referenceWindowsFromAlignment",
        help=("The window (or multiple comma-delimited windows) of the reference to "
              "be processed, in the format refGroup:refStart-refEnd "
              "will be pulled from the alignment file."),
        default=False)

    def slurpWindowFile(fname):
        """Read `fname` and join its stripped lines with commas."""
        # Use a context manager so the handle is closed deterministically
        # instead of being left to the garbage collector.
        with open(fname) as handle:
            return ",".join(line.strip() for line in handle)

    readSelection.add_argument(
        "--referenceWindowsFile", "-W",
        action="store",
        dest="referenceWindowsAsString",
        type=slurpWindowFile,
        help="A file containing reference window designations, one per line",
        default=None)
    readSelection.add_argument(
        "--barcode",
        type=str,
        dest="_barcode",
        help="Only process reads with the given barcode name.")

    def parseReadStratum(s):
        """
        Parse a string of the form 'n/N' into the list ``[n, N]``.

        :raises ValueError: if `s` is not two '/'-separated integers, or
            if ``0 <= n < N`` does not hold.  argparse turns a ValueError
            raised by a ``type=`` callable into a regular usage error.
        """
        # Build a real list: on Python 3 `map` returns an iterator, which
        # supports neither len() nor indexing.
        rs = [int(field) for field in s.split("/")]
        # Explicit checks instead of `assert`, which is stripped under -O.
        if len(rs) != 2:
            raise ValueError(
                "read stratum must have the form 'n/N': %r" % (s,))
        if not 0 <= rs[0] < rs[1]:
            raise ValueError(
                "read stratum must satisfy 0 <= n < N: %r" % (s,))
        return rs

    readSelection.add_argument(
        "--readStratum",
        help=("A string of the form 'n/N', where n, and N are integers, 0 <= n < N, designating"
              " that the reads are to be deterministically split into N strata of roughly even"
              " size, and stratum n is to be used for variant and consensus calling. This is"
              " mostly useful for Quiver development."),
        dest="readStratum",
        default=None,
        type=parseReadStratum)
    readSelection.add_argument(
        "--minReadScore",
        action="store",
        dest="minReadScore",
        type=float,
        default=Constants.DEFAULT_MIN_READSCORE,
        help="The minimum ReadScore for reads that will be used for analysis (arrow-only).")
    readSelection.add_argument(
        "--minSnr",
        action="store",
        dest="minHqRegionSnr",
        type=float,
        default=Constants.DEFAULT_MIN_HQREGIONSNR,
        help="The minimum acceptable signal-to-noise over all channels for reads that will be used for analysis (arrow-only).")
    readSelection.add_argument(
        "--minZScore",
        action="store",
        dest="minZScore",
        type=float,
        default=Constants.DEFAULT_MIN_ZSCORE,
        help="The minimum acceptable z-score for reads that will be used for analysis (arrow-only).")
    readSelection.add_argument(
        "--minAccuracy",
        action="store",
        dest="minAccuracy",
        type=float,
        default=Constants.DEFAULT_MIN_ACCURACY,
        help="The minimum acceptable window-global alignment accuracy for reads that will be used for the analysis (arrow-only).")

    algorithm = parser.add_argument_group("Algorithm and parameter settings")
    algorithm.add_argument(
        "--algorithm",
        action="store",
        dest="algorithm",
        type=str,
        choices=Constants.ALGORITHM_CHOICES,
        default=Constants.DEFAULT_ALGORITHM)
    algorithm.add_argument(
        "--parametersFile", "-P",
        dest="parametersFile",
        type=str,
        default=None,
        help=("Parameter set filename (such as ArrowParameters.json or "
              "QuiverParameters.ini), or directory D such that either "
              "D/*/GenomicConsensus/QuiverParameters.ini, "
              "or D/GenomicConsensus/QuiverParameters.ini, is found. In the "
              "former case, the lexically largest path is chosen."))
    algorithm.add_argument(
        "--parametersSpec", "-p",
        action="store",
        dest="parametersSpec",
        type=str,
        default="auto",
        help=("Name of parameter set (chemistry.model) to select from the "
              "parameters file, or just the name of the chemistry, in which "
              "case the best available model is chosen. Default is 'auto', "
              "which selects the best parameter set from the alignment data"))
    algorithm.add_argument(
        "--maskRadius",
        dest="maskRadius",
        type=int,
        default=Constants.DEFAULT_MASK_RADIUS,
        help=("Radius of window to use when excluding local regions for "
              "exceeding maskMinErrorRate, where 0 disables any filtering (arrow-only)."))
    algorithm.add_argument(
        "--maskErrorRate",
        dest="maskErrorRate",
        type=float,
        default=Constants.DEFAULT_MASK_ERROR_RATE,
        help=("Maximum local error rate before the local region defined by "
              "maskRadius is excluded from polishing (arrow-only)."))

    debugging = parser.add_argument_group("Verbosity and debugging/profiling")
    add_debug_option(debugging)
    debugging.add_argument(
        "--notrace",
        action="store_true",
        dest="notrace",
        default=False,
        help="Suppress stacktrace for exceptions (to simplify testing)")
    debugging.add_argument(
        "--pdbAtStartup",
        action="store_true",
        dest="pdbAtStartup",
        default=False,
        help="Drop into Python debugger at startup (requires ipdb)")
    debugging.add_argument(
        "--profile",
        action="store_true",
        dest="doProfiling",
        default=False,
        help="Enable Python-level profiling (using cProfile).")
    debugging.add_argument(
        "--dumpEvidence", "-d",
        dest="dumpEvidence",
        nargs="?",
        default=None,
        const="variants",
        choices=["variants", "all", "outliers"])
    debugging.add_argument(
        "--evidenceDirectory",
        default="evidence_dump")
    debugging.add_argument(
        "--annotateGFF",
        action="store_true",
        help="Augment GFF variant records with additional information")
    debugging.add_argument(
        "--reportEffectiveCoverage",
        action="store_true",
        help="Additionally record the *post-filtering* coverage at variant sites")

    advanced = parser.add_argument_group("Advanced configuration options")
    advanced.add_argument(
        "--diploid",
        action="store_true",
        help="Enable detection of heterozygous variants (experimental)")
    advanced.add_argument(
        "--queueSize", "-Q",
        action="store",
        dest="queueSize",
        type=int,
        default=200)
    advanced.add_argument(
        "--threaded", "-T",
        action="store_true",
        dest="threaded",
        default=False,
        help="Run threads instead of processes (for debugging purposes only)")
    advanced.add_argument(
        "--referenceChunkSize", "-C",
        action="store",
        dest="referenceChunkSize",
        type=int,
        default=500)
    # fancyChunking already defaults to True, so passing --fancyChunking
    # changes nothing; --simpleChunking is the switch that turns it off.
    advanced.add_argument(
        "--fancyChunking",
        default=True,
        action="store_true",
        help="Adaptive reference chunking designed to handle coverage cutouts better")
    advanced.add_argument(
        "--simpleChunking",
        dest="fancyChunking",
        action="store_false",
        help="Disable adaptive reference chunking")
    advanced.add_argument(
        "--referenceChunkOverlap",
        action="store",
        dest="referenceChunkOverlap",
        type=int,
        default=5)
    advanced.add_argument(
        "--autoDisableHdf5ChunkCache",
        action="store",
        type=int,
        default=500,
        help=("Disable the HDF5 chunk cache when the number of datasets in the cmp.h5 "
              "exceeds the given threshold"))
    advanced.add_argument(
        "--aligner", "-a",
        action="store",
        choices=["affine", "simple"],
        default="affine",
        help=("The pairwise alignment algorithm that will be used to produce variant calls"
              " from the consensus (Quiver only)."))
    advanced.add_argument(
        "--refineDinucleotideRepeats",
        dest="refineDinucleotideRepeats",
        action="store_true",
        help=("Require quiver maximum likelihood search to try one less/more repeat copy in"
              " dinucleotide repeats, which seem to be the most frequent cause of suboptimal"
              " convergence (getting trapped in local optimum) (Quiver only)"))
    advanced.add_argument(
        "--noRefineDinucleotideRepeats",
        dest="refineDinucleotideRepeats",
        action="store_false",
        help="Disable dinucleotide refinement")
    # Both flags above share one dest; pin its default explicitly.
    advanced.set_defaults(refineDinucleotideRepeats=True)
    advanced.add_argument(
        "--fast",
        dest="fastMode",
        action="store_true",
        help="Cut some corners to run faster. Unsupported!")
    advanced.add_argument(
        "--skipUnrecognizedContigs",
        action="store_true",
        help=("Do not abort when told to process a reference window (via -w/--referenceWindow[s]) "
              "that has no aligned coverage. Outputs emptyish files if there are no remaining "
              "non-degenerate windows. Only intended for use by smrtpipe scatter/gather."))

    return parser
def _example_parser():
    """
    Build the mock example parser.

    Starts from ``get_default_argparser("1.0.0", "Example Mock Parser")``,
    passes it through ``CU.add_debug_option`` and adds a single positional
    ``example_file`` string argument.

    :return: the parser returned by ``CU.add_debug_option``, with the
        positional argument added.
    """
    base_parser = get_default_argparser("1.0.0", "Example Mock Parser")
    mock_parser = CU.add_debug_option(base_parser)
    mock_parser.add_argument(
        'example_file',
        help="No testing of existence",
        type=str)
    return mock_parser
def add_args_run_diagnstic(p):
    """
    Add the options of the diagnostic run sub-command to `p`.

    The registration helpers are applied in a fixed order: required preset
    XML, debug, output directory, simple mode.  Their return values are
    ignored.

    :param p: parser handed to each registration helper.
    :return: the same `p` object.
    """
    registrars = (
        _add_required_preset_xml_option,
        add_debug_option,
        _add_output_dir_option,
        _add_simple_mode_option,
    )
    for register in registrars:
        register(p)
    return p