def add_graph_args(cls, parser):
    """Register graph args: common options, ceph options, then lazy-read knobs.

    The lazy-ceph options are only added when no earlier call has registered
    them (checked via the parser default for ``lazy_ceph``).
    """
    cls.add_common_graph_args(parser=parser)
    cls.add_ceph_args(parser=parser)
    # Note: this option is inverted. lazy ceph reads are the default
    if parser.get_default("lazy_ceph") is None:
        parser.add_argument("--eager-ceph",
                            dest="lazy_ceph",
                            action="store_false",
                            default=True,
                            help="use the eager read version of the ceph pipeline (not lazy)")
        parser.add_argument("--ceph-lazy-records-per-segment",
                            dest="ceph_records_per_segment",
                            type=parse.numeric_min_checker(
                                minimum=1000,
                                message="minimum of 1000 for records per segment"),
                            default=250000,
                            help="number of records for each segment in the ceph lazy reader")
        parser.add_argument("--ceph-lazy-segments",
                            dest="ceph_num_lazy_segments",
                            type=parse.numeric_min_checker(
                                minimum=1,
                                message="must have at least one lazy segment"),
                            default=2,
                            help="number of lazy segments for each asynchronous ceph lazy column")
def add_common_args(parser):
    """Register arguments shared by the fused align-sort/merge (FAS/M) pipelines.

    Covers stage counts, open-request throttling, credit linking, and
    per-stage exit-rate counters.
    """
    # fixed: validator messages previously said "at least 1" although the
    # enforced minimum is 0 (non-negative)
    parser.add_argument("--align-stages", dest="align_stages", default=0,
                        type=numeric_min_checker(0, "must have a non-negative number of align fused_align_sort stages"),
                        help="number of align stages")
    parser.add_argument("--merge-stages", dest="merge_stages", default=0,
                        type=numeric_min_checker(0, "must have a non-negative number of merge fused_align_sort stages"),
                        help="number of merge stages")
    parser.add_argument("--combo-stages", dest="combo_stages", default=0,
                        type=numeric_min_checker(0, "must have non-negative number of combo stages for FAS/M"),
                        help="number of combo fused-align-sort/merge stages")
    parser.add_argument("--parallel-open-requests",
                        type=numeric_min_checker(1, "must have at least 1 parallel open request"),
                        help="if specified, the number of parallel open requests")
    parser.add_argument("--parallel-open-request-expansion-factor", default=1.5,
                        type=numeric_min_checker(0.1, numeric_type=float,
                                                 message="must have at least 0.1 expansion factor"),
                        help="the expansion factor to multiply the number of client slots by to bound the capacity in the global pipeline. Not used if parallel_open_requests is set")
    parser.add_argument("--credit-link", default=credit_link_successive,
                        choices=(credit_link_end_to_end, credit_link_successive),
                        help="Type of credit linking to use between successive stages")
    parser.add_argument("--align-counters", default=False, action="store_true",
                        help="track the exit rate of the align/sort stages")
    parser.add_argument("--merge-counters", default=False, action="store_true",
                        help="track the exit rate of the merge stages")
def add_arguments(cls, parser):
    """Register master-side arguments: record args, summary options, worker
    polling intervals, and Pyro nameserver overrides."""
    cls.add_record_args(parser=parser)
    parser.add_argument("--summary", default=False, action="store_true",
                        help="record a Tensorflow graph summary")
    parser.add_argument("--summary-interval", default=1,
                        type=parse.numeric_min_checker(
                            numeric_type=float, minimum=0.1,
                            message="Can't have too small of an interval"),
                        help="interval in seconds for recording summary intervals")
    # related to the timing for the master worker
    parser.add_argument("--master-startup-poll-interval", default=1,
                        type=parse.numeric_min_checker(
                            minimum=0.1, numeric_type=float,
                            message="must have a sensible (>100ms) wait time for startup check"),
                        help="the amount of time to wait when checking for worker status on startup")
    # fixed: message/help previously copy-pasted from the startup option
    parser.add_argument("--master-shutdown-interval", default=1,
                        type=parse.numeric_min_checker(
                            minimum=0.1, numeric_type=float,
                            message="must have a sensible (>100ms) wait time for shutdown check"),
                        help="the amount of time to wait when checking for worker status on shutdown")
    # related to the pyro server which to connect
    parser.add_argument("-n", "--pyro-number",
                        default=random.randint(0, 2**30), type=int,
                        help="number to assign to this server in the naming system")
    parser.add_argument("--pyro-ns-port", type=int,
                        help="override default Pyro4 nameserver port")
    # fixed: help previously said "port"
    parser.add_argument("--pyro-ns-host",
                        help="override default Pyro4 nameserver host")
def add_args(parser):
    """Register worker-process arguments: stats/output locations, nameserver
    registration, and run-loop timing."""
    parser.add_argument("--record-stats", default=False, action="store_true",
                        help="store statistics for this process into the output directory")
    parser.add_argument("-o", "--output-directory", default=".",
                        type=parse.path_exists_checker(check_dir=True),
                        help="path in which to store the directory of outputs")
    parser.add_argument("-n", "--number",
                        default=random.randint(0, 2**30), type=int,
                        help="number to assign to this server in the naming system")
    parser.add_argument("--safe-register", default=False, action="store_true",
                        help="error if the name already exists in the name server")
    parser.add_argument("--pyro-ns-port", type=int,
                        help="override default Pyro4 nameserver port")
    # fixed: help previously said "port"
    parser.add_argument("--pyro-ns-host",
                        help="override default Pyro4 nameserver host")
    parser.add_argument("-i", "--run-sleep-interval", dest="run_sleep_interval",
                        default=2,
                        type=parse.numeric_min_checker(
                            0.5, numeric_type=float,
                            message="must wait at least 0.5 seconds"),
                        help="number of seconds to sleep while in the run loop")
    parser.add_argument("-w", "--worker-name", default="",
                        help="if set, use this exact name to register on the nameserver. An error will occur if this name is already taken")
    parser.add_argument("--startup-sleep", default=3,
                        type=parse.numeric_min_checker(
                            numeric_type=float, minimum=1,
                            message="must wait at least 1 second after worker starts"),
                        help="number of seconds to sleep after session is initialized")
def add_graph_args(self, parser):
    """Register coverage-report graph arguments (bedtools-genomecov-style flags)."""
    # fixed: "paralellism" typo in help text
    parser.add_argument("-p", "--parse-parallel", default=1,
                        type=numeric_min_checker(minimum=1, message="read parallelism"),
                        help="total parallelism level for reading data from disk")
    parser.add_argument("-i", "--dataset-dir", type=path_exists_checker(),
                        help="Directory containing ALL of the chunk files")
    parser.add_argument("-scale", "--scale", default=1, type=int,
                        help="Each coverage value is multiplied by this factor before being reported. Default is 1")
    parser.add_argument("-max", "--max", default=-1, type=int,
                        help="Combine all positions with a depth >= max into a single bin in the histogram")
    parser.add_argument("-bg", "--bg", default=False, action="store_true",
                        help="Report depth in BedGraph format")
    parser.add_argument("-d", "--d", default=False, action="store_true",
                        help="Report the depth at each genome position with 1-based coordinates")
    parser.add_argument("-strand", "--strand", default='B',
                        help="Calculate coverage of intervals from a specific strand")
    parser.add_argument("-bga", "--bga", default=False, action="store_true",
                        help="Report depth in BedGraph format along with zero-entries")
    parser.add_argument("-dz", "--dz", default=False, action="store_true",
                        help="Report the depth at each genome position with 0-based coordinates")
def add_common_graph_args(cls, parser):
    """Register the common (prefixed) alignment-pipeline graph arguments.

    All options go through cls.prefix_option with cls.local_dest as the
    prefix, covering stage parallelism, SNAP aligner settings, and queue
    capacities.
    """
    prefix = cls.local_dest
    cls.prefix_option(parser=parser, prefix=prefix, argument="read-parallel",
                      type=numeric_min_checker(1, "must have >0 parallel read stages"),
                      default=2, help="number of read stages to run in parallel")
    cls.prefix_option(parser=parser, prefix=prefix, argument="decompress-parallel",
                      type=numeric_min_checker(1, "must have >0 parallel decomp stages"),
                      default=3, help="number of decompress stages to run in parallel")
    cls.prefix_option(parser=parser, prefix=prefix, argument="align-parallel",
                      type=numeric_min_checker(1, "must have >0 parallel align stages"),
                      default=8, help="number of parallel align stages")
    # default leaves two cores free for the rest of the pipeline
    cls.prefix_option(parser=parser, prefix=prefix, argument="aligner-threads",
                      type=numeric_min_checker(1, "must have >0 parallel aligner threads"),
                      default=multiprocessing.cpu_count() - 2,
                      help="number of aligner threads for shared aligner")
    cls.prefix_option(parser=parser, prefix=prefix, argument="compress-parallel",
                      type=numeric_min_checker(1, "must have >0 parallel compress stages"),
                      default=2, help="number of parallel compress stages")
    cls.prefix_option(parser=parser, prefix=prefix, argument="write-parallel",
                      type=numeric_min_checker(1, "must have >0 parallel write stages"),
                      default=2, help="number of parallel write stages")
    cls.prefix_option(parser=parser, prefix=prefix, argument="deep-verify",
                      default=False, action='store_true',
                      help="verify record integrity")
    cls.prefix_option(parser=parser, prefix=prefix, argument="paired",
                      default=False, action='store_true',
                      help="interpret dataset as interleaved paired dataset")
    # fixed: help previously read "SNAP algorithm specific self." (bad
    # find/replace of "args") and contained a stray embedded line break
    cls.prefix_option(parser=parser, prefix=prefix, argument="snap-args",
                      type=str, default="",
                      help="SNAP algorithm specific args. Pass with enclosing \" \". E.g. \"-om 5 -omax 1\" . See SNAP documentation for all options.")
    cls.prefix_option(parser=parser, prefix=prefix, argument="subchunking",
                      type=numeric_min_checker(100, "don't go lower than 100 for subchunking size"),
                      default=5000,
                      help="the size of each subchunk (in number of reads)")
    # Note: can't have path-exists checker for this because the path might be on a remote machine
    cls.prefix_option(parser=parser, prefix=prefix, argument="index-path",
                      default="/home/whitlock/tf/ref_index",
                      help="location of the ref index on all machines. Make sure all machines have this path!")
    cls.prefix_option(parser=parser, prefix=prefix, argument="max-secondary",
                      type=numeric_min_checker(0, "must have a non-negative number of secondary results"),
                      default=0, help="Max secondary results to store. >= 0 ")
    cls.prefix_option(parser=parser, prefix=prefix, argument="global-batch",
                      type=numeric_min_checker(1, "must have >=1 batch from global gate"),
                      default=2,
                      help="batch size for dequeuing from the upstream central gate. Doesn't affect correctness")
    # all options below here are rather verbose, for length of queues
    # cls.prefix_option(parser=parser, prefix=prefix, argument="head-gate-capacity", type=numeric_min_checker(1, "must have >= 1 capacity"), help="length of capacity for head gate")
    cls.prefix_option(parser=parser, prefix=prefix, argument="pre-decomp-capacity",
                      type=numeric_min_checker(1, "must have >= 1 capacity"),
                      help="length of post-read, pre-decomp queues")
    cls.prefix_option(parser=parser, prefix=prefix, argument="pre-align-capacity",
                      type=numeric_min_checker(1, "must have >= 1 capacity"),
                      help="length of post-decomp, pre-align queues")
    cls.prefix_option(parser=parser, prefix=prefix, argument="pre-compress-capacity",
                      type=numeric_min_checker(1, "must have >= 1 capacity"),
                      help="length of post-align, pre-compress queues")
    cls.prefix_option(parser=parser, prefix=prefix, argument="pre-write-capacity",
                      type=numeric_min_checker(1, "must have >= 1 capacity"),
                      help="length of post-align, pre-write queues")
    cls.prefix_option(parser=parser, prefix=prefix, argument="final-sink-capacity",
                      type=numeric_min_checker(1, "must have >= 1 capacity"),
                      help="capacity of final queue of this stage")
    cls.prefix_option(parser=parser, prefix=prefix, argument="log-goodput",
                      default=False, action="store_true",
                      help="log the goodput events")
    cls.prefix_option(parser=parser, prefix=prefix, argument="log-directory",
                      default="/home/whitlock/tf/shell",
                      help="the directory to log all events to, if log_goodput is enabled")
def add_graph_args(self, parser):
    """Register Ceph connection arguments on top of the parent graph args."""
    super().add_graph_args(parser=parser)
    parser.add_argument("--ceph-cluster-name", type=non_empty_string_checker,
                        default="ceph", help="name for the ceph cluster")
    parser.add_argument("--ceph-user-name", type=non_empty_string_checker,
                        default="client.dcsl1024", help="ceph username")
    parser.add_argument("--ceph-conf-path",
                        type=path_exists_checker(check_dir=False),
                        default="/etc/ceph/ceph.conf",
                        help="path for the ceph configuration")
    parser.add_argument("--ceph-read-chunk-size", default=(2**26),
                        type=numeric_min_checker(
                            128, "must have a reasonably large minimum read size from Ceph"),
                        help="minimum size to read from ceph storage, in bytes")
    # fixed: help text previously had an unbalanced parenthesis
    parser.add_argument("--ceph-pool-name",
                        help="override the pool name to use (if specified or not in the json file)")
def _make_graph_args(cls, parser):
    """Attach the Incrementer's graph args plus the stage-count option."""
    simple_stage.Incrementer.add_graph_args(parser=parser)
    parser.add_argument(
        "--stages",
        type=numeric_min_checker(minimum=1, message="need at least one stage!"),
        default=1,
        help="number of stages to run in parallel")
def make_graph_args(cls, parser):
    """Build the graph args and add the client-parallelism cap."""
    cls._make_graph_args(parser=parser)
    parser.add_argument(
        "--max-parallel-clients",
        type=numeric_min_checker(1, "must allow at least one parallel client"),
        default=8,
        help="number of parallel clients this App should allow")
def add_graph_args(self, parser):
    """Register read/write parallelism and dataset-location arguments."""
    # fixed: "paralellism" typo in help text
    parser.add_argument("-p", "--parse-parallel", default=1,
                        type=numeric_min_checker(minimum=1, message="read parallelism"),
                        help="total parallelism level for reading data from disk")
    parser.add_argument("-w", "--write-parallel", default=1,
                        type=numeric_min_checker(minimum=1, message="number of writers min"),
                        help="number of writers to use")
    parser.add_argument("-d", "--dataset-dir", type=path_exists_checker(),
                        help="Directory containing ALL of the chunk files")
def add_max_secondary(parser):
    """Add the shared --max-secondary option (non-negative, default 0)."""
    parser.add_argument(
        "-s",
        "--max-secondary",
        default=0,
        type=numeric_min_checker(
            0, "must have a non-negative number of secondary results"),
        help="Max secondary results to store. >= 0 ")
class Ceph:
    """Mixin holding the Ceph CLI option specs and instance-attribute plumbing."""

    # One spec per Ceph command-line option; consumed by add_ceph_args.
    full_ceph_attributes = (
        {
            "attribute": "ceph_cluster_name",
            "type": non_empty_string_checker,
            "default": "ceph",
            "help": "name for the ceph cluster",
        },
        {
            "attribute": "ceph_user_name",
            "type": non_empty_string_checker,
            "default": "client.dcsl1024",
            "help": "ceph username",
        },
        {
            "attribute": "ceph_pool_name",
            "type": non_empty_string_checker,
            "default": "dcsl1024",
            "help": "ceph pool name",
        },
        {
            "attribute": "ceph_conf_path",
            "type": path_exists_checker(check_dir=False),
            "default": "/etc/ceph/ceph.conf",
            "help": "ceph_configuration_path",
        },
        {
            "attribute": "ceph_read_chunk_size",
            "type": numeric_min_checker(
                128, "must have a reasonably large minimum read size from Ceph"),
            "default": (2**26),
            "help": "minimum size to read from ceph storage, in bytes",
        },
    )

    # Flat attribute names, derived from the spec table above so the two
    # definitions can never drift apart.
    ceph_attributes = tuple(spec["attribute"] for spec in full_ceph_attributes)

    @classmethod
    def add_ceph_args(cls, parser):
        """Register any Ceph options that `parser` does not already have."""
        for spec in cls.full_ceph_attributes:
            dest = spec["attribute"]
            # a non-None default means a previous call already added it
            if parser.get_default(dest) is not None:
                continue
            parser.add_argument("--" + dest.replace("_", "-"),
                                dest=dest,
                                type=spec["type"],
                                default=spec["default"],
                                help=spec["help"])

    def add_ceph_attrs(self, args):
        """Copy the parsed Ceph argument values onto this instance."""
        for attr in self.ceph_attributes:
            setattr(self, attr, getattr(args, attr))
def add_graph_args(cls, parser):
    """Register increment-stage arguments: step size, queue chain length,
    and chain parallelism."""
    # fixed: message previously said "positive" although the enforced
    # minimum is 0 (non-negative)
    parser.add_argument("--increment",
                        type=numeric_min_checker(
                            minimum=0,
                            message="must increment by a non-negative amount"),
                        default=1,
                        help="amount to increment by")
    parser.add_argument("--queue-chain",
                        type=numeric_min_checker(
                            minimum=0,
                            message="must have non-negative queue chain length"),
                        default=1,
                        help="length of local queue length (with queue runners)")
    parser.add_argument("--parallel-chains",
                        type=numeric_min_checker(minimum=1, message="must have >=1 chains"),
                        default=1,
                        help="number of chains to run in parallel")
def add_graph_args(self, parser):
    """Register fastq-to-record conversion arguments.

    TODO sane defaults depending on num schedulable cores
    """
    parser.add_argument("-c", "--chunk", default=100000,
                        type=numeric_min_checker(1, "chunk size"),
                        help="chunk size to create records")
    parser.add_argument("-p", "--parallel-conversion", default=1,
                        type=numeric_min_checker(1, "parallel conversion"),
                        help="number of parallel converters")
    parser.add_argument("-n", "--name", required=True,
                        help="name for the record")
    parser.add_argument("-o", "--out", default=".",
                        help="directory to write the final record to")
    parser.add_argument("-w", "--write", default=1,
                        type=numeric_min_checker(1, "write parallelism"),
                        help="number of parallel writers")
    parser.add_argument("--paired", default=False, action='store_true',
                        help="interpret fastq files as paired, requires an even number of files for positional args fastq_files")
    parser.add_argument("--compress-parallel", default=1,
                        type=numeric_min_checker(0, "compress parallelism"),
                        help="number of parallel compression pipelines")
    parser.add_argument("fastq_files", nargs="+",
                        help="the fastq file to convert")
def add_graph_args(self, parser):
    """Register fasta-to-record conversion arguments.

    TODO sane defaults depending on num schedulable cores
    """
    parser.add_argument("-c", "--chunk", default=100000,
                        type=numeric_min_checker(1, "chunk size"),
                        help="chunk size to create records")
    parser.add_argument("--dna", action='store_true',
                        help="Set if the input fasta is DNA nucleotides")
    parser.add_argument("--protein", action='store_true',
                        help="Set if the input fasta is protein amino acids")
    parser.add_argument("-p", "--parallel-conversion", default=1,
                        type=numeric_min_checker(1, "parallel conversion"),
                        help="number of parallel converters")
    parser.add_argument("-n", "--name", required=True,
                        help="name for the record")
    parser.add_argument("-o", "--out", default=".",
                        help="directory to write the final record to")
    parser.add_argument("-w", "--write", default=2,
                        type=numeric_min_checker(1, "write parallelism"),
                        help="number of parallel writers")
    parser.add_argument("--compress-parallel", default=10,
                        type=numeric_min_checker(1, "compress parallelism"),
                        help="number of parallel compression pipelines")
    parser.add_argument("fasta_file",
                        help="the fasta file to convert")
def add_arguments(cls, parser):
    """Register record args plus TensorFlow summary recording options."""
    cls.add_record_args(parser=parser)
    parser.add_argument("--summary",
                        action="store_true",
                        default=False,
                        help="record a Tensorflow graph summary")
    parser.add_argument("--summary-interval",
                        type=parse.numeric_min_checker(
                            numeric_type=float,
                            minimum=0.1,
                            message="Can't have too small of an interval"),
                        default=1,
                        help="interval in seconds for recording summary intervals")
def queue_only_args(parser):
    """Register the queue-host index and TF cluster-definition arguments."""
    parser.add_argument("-Q", "--queue-index",
                        default=0,
                        type=parse.numeric_min_checker(
                            minimum=0,
                            message="queue index must be non-negative"),
                        help="task index for cluster node that hosts the queues")
    parser.add_argument("-C", "--cluster",
                        dest="cluster_members",
                        nargs='+',
                        required=True,
                        type=parse_cluster_def_member,
                        help="TF Cluster definition")
def add_graph_args(self, parser):
    """Register feature-extraction graph arguments."""
    # fixed: "paralellism" typo in help text
    parser.add_argument("-p", "--parse-parallel", default=1,
                        type=numeric_min_checker(minimum=1, message="read parallelism"),
                        help="total parallelism level for reading data from disk")
    parser.add_argument("-i", "--dataset-dir", type=path_exists_checker(),
                        help="Directory containing ALL of the chunk files")
    parser.add_argument("-feature", "--feature", default='B',
                        help="Feature name")
    parser.add_argument("-o", "--output", help="output directory")
def add_graph_args(self, parser):
    """Register sort-phase graph arguments (adds the common args to all graphs)."""
    parser.add_argument("-r", "--sort-read-parallel", default=1,
                        type=numeric_min_checker(
                            minimum=1, message="read parallelism min for sort phase"),
                        help="total parallelism level for local read pipeline for sort phase")
    # fixed: help previously duplicated the read option's text
    parser.add_argument("-p", "--sort-process-parallel", default=1,
                        type=numeric_min_checker(
                            minimum=1, message="process parallelism min for sort phase"),
                        help="total parallelism level for local processing pipeline for sort phase")
    parser.add_argument("-k", "--compress-parallel", default=1,
                        type=numeric_min_checker(
                            minimum=1, message="compress parallelism min for post merge write"),
                        help="total parallelism level for compression")
    parser.add_argument("-c", "--column-grouping", default=5,
                        type=numeric_min_checker(minimum=1, message="column grouping min"),
                        help="grouping factor for parallel chunk sort")
    parser.add_argument("-s", "--sort-parallel", default=1,
                        type=numeric_min_checker(minimum=1, message="sorting pipeline min"),
                        help="number of sorting pipelines to run in parallel")
    parser.add_argument("-w", "--write-parallel", default=1,
                        type=numeric_min_checker(minimum=1, message="writing pipeline min"),
                        help="number of writing pipelines to run in parallel")
    parser.add_argument("-b", "--order-by", default="location",
                        choices=["location", "metadata"],
                        help="sort by this parameter [location | metadata]")
def add_graph_args(self, parser):
    """Register BWA alignment graph arguments (adds the common args to all graphs)."""
    parser.add_argument("-p", "--parallel", type=int, default=2,
                        help="parallel decompression")
    parser.add_argument("-e", "--enqueue", type=int, default=1,
                        help="parallel enqueuing")
    parser.add_argument("-m", "--mmap-queue", type=int, default=2,
                        help="size of the mmaped file record queue")
    parser.add_argument("-a", "--aligners", default=2,
                        type=numeric_min_checker(1, "number of aligners"),
                        help="number of aligners")
    parser.add_argument("-t", "--aligner-threads",
                        type=numeric_min_checker(1, "number of aligner threads"),
                        default=multiprocessing.cpu_count(),
                        help="the number of threads to use for alignment. >= 1 or >= 3 if paired [num_cpus]")
    parser.add_argument("-r", "--thread-ratio", type=float, default=0.35,
                        help="Ratio of aligner threads to finalize threads")
    parser.add_argument("-x", "--subchunking", type=int, default=5000,
                        help="the size of each subchunk (in number of reads) [5000]")
    parser.add_argument("-w", "--writers", type=int, default=1,
                        help="the number of writer pipelines ")
    parser.add_argument("-c", "--compress-parallel", type=int, default=2,
                        help="parallel compression of output. 0 for uncompressed.")
    parser.add_argument("-i", "--index-path",
                        default="/scratch/bwa_index/hs38DH.fa")
    # fixed: "chimaric" typo and unbalanced parenthesis in help text.
    # NOTE(review): no type= here, so a CLI-supplied value stays a string
    # while the default is int — confirm downstream converts it.
    parser.add_argument("-s", "--max-secondary", default=1,
                        help="Max secondary results to store. >= 1 (required for chimeric results)")
    parser.add_argument("--paired", default=False, action='store_true',
                        help="interpret dataset as interleaved paired dataset")
    parser.add_argument("--null", type=float, required=False,
                        help="use the null aligner instead of actually aligning")
    parser.add_argument("--deep-verify", default=False, action='store_true',
                        help="verify record integrity")
    # TODO this is rigid, needs to be changed to get from the queue service!
    parser.add_argument("--bwa-args", default="",
                        help="BWA algorithm options")
def add_default_module_args(parser):
    """Register the cluster task index and the shared queue-only args."""
    parser.add_argument("-T", "--task-index",
                        required=True,
                        type=numeric_min_checker(
                            minimum=0,
                            message="task index must be non-negative"),
                        help="TF Cluster task index")
    dist_common.queue_only_args(parser=parser)
def add_graph_args(self, parser):
    """Register Ceph-backed SNAP alignment graph arguments (adds the common
    args to all graphs)."""
    parser.add_argument("-p", "--parallel", default=2,
                        type=numeric_min_checker(1, "parallel decompression"),
                        help="parallel decompression")
    parser.add_argument("-e", "--enqueue", default=1,
                        type=numeric_min_checker(1, "parallel enqueuing"),
                        help="parallel enqueuing / reading from Ceph")
    parser.add_argument("-a", "--aligners", default=1,
                        type=numeric_min_checker(1, "number of aligners"),
                        help="number of aligners")
    parser.add_argument("-t", "--aligner-threads",
                        type=numeric_min_checker(1, "threads per aligner"),
                        default=multiprocessing.cpu_count(),
                        help="the number of threads to use per aligner")
    # fixed: validator message previously claimed a minimum of 100 although
    # the enforced minimum is 1. NOTE(review): the sibling pipeline enforces
    # 100 here — confirm whether this minimum should also be 100.
    parser.add_argument("-x", "--subchunking", default=5000,
                        type=numeric_min_checker(1, "must have a positive subchunking size"),
                        help="the size of each subchunk (in number of reads)")
    parser.add_argument("-w", "--writers", default=1,
                        type=numeric_min_checker(0, "must have a non-negative number of writers"),
                        help="the number of writer pipelines")
    parser.add_argument("-c", "--compress-parallel", type=int, default=2,
                        help="compress output in parallel. 0 for uncompressed [2]")
    parser.add_argument("--assemblers", default=1,
                        type=numeric_min_checker(1, "must have at least one assembler node"),
                        help="level of parallelism for assembling records")
    parser.add_argument("--deep-verify", default=False, action='store_true',
                        help="verify record integrity")
    parser.add_argument("--paired", default=False, action='store_true',
                        help="interpret dataset as interleaved paired dataset")
    parser.add_argument("-i", "--index-path", type=path_exists_checker(),
                        default="/scratch/stuart/ref_index",
                        help="location of the ref index on all machines. Make sure all machines have this path!")
    self.add_max_secondary(parser=parser)
    parser.add_argument("--snap-args", type=str, default="",
                        help="SNAP algorithm specific args. Pass with enclosing \" \". E.g. \"-om 5 -omax 1\" . See SNAP documentation for all options.")
def add_default_module_args(parser):
    """Register the queue index and queue-service host/port arguments."""
    parser.add_argument("-Q", "--queue-index", default=0,
                        type=parse.numeric_min_checker(
                            minimum=0, message="queue index must be non-negative"),
                        help="task index for cluster node that hosts the queues")
    # TODO we want to have sensible defaults for this eventually!
    parser.add_argument("--queue-host", required=True,
                        help="host running the queue service")
    # fixed: minimum was 0, contradicting the message "port must be >0";
    # port 0 is not a valid port to connect to
    parser.add_argument("--queue-port", required=True,
                        type=parse.numeric_min_checker(1, "port must be >0"),
                        help="port of the host running the queue service")
def add_args(parser):
    """Register experiment-runner arguments: SSH user, startup/shutdown
    timing, runtime override, and output locations."""
    parser.add_argument("-u", "--username", default="whitlock",
                        help="username to ssh into all machines. Have ssh keys to access the machines via this username!!")
    parser.add_argument("--worker-startup-delay", default=8,
                        type=parse.numeric_min_checker(
                            minimum=0.1, numeric_type=float,
                            message="need a minimal startup delay of 0.1"),
                        help="seconds to wait from starting up workers until starting the master")
    # fixed: "startup up" typo
    parser.add_argument("--worker-sleep-interval", default=5,
                        type=parse.numeric_min_checker(
                            minimum=1.0, numeric_type=float,
                            message="minimum of 1 sec startup delay"),
                        help="delay when starting up a worker")
    # fixed: help text was previously truncated ("startup interval for ")
    parser.add_argument("--worker-startup-sleep-interval", default=2,
                        type=parse.numeric_min_checker(
                            minimum=0.5, numeric_type=float,
                            message="0.5s min for startup interval"),
                        help="startup interval for workers")
    # fixed: help previously duplicated --nice-kill-delay's text
    parser.add_argument("--record-finish-delay", default=5,
                        type=parse.numeric_min_checker(
                            minimum=0.1, numeric_type=float,
                            message="minimum nice kill delay 0.1"),
                        help="delay after the experiment finishes before recording results")
    parser.add_argument("--experiment-time", dest="runtime",
                        type=parse.numeric_min_checker(
                            minimum=min_runtime, numeric_type=float,
                            message="minimum runtime for the experiment"),
                        help="time to run the experiment for. Overrides what is specified in the file.")
    parser.add_argument("--nice-kill-delay", default=5,
                        type=parse.numeric_min_checker(
                            minimum=0.1, numeric_type=float,
                            message="minimum nice kill delay 0.1"),
                        help="delay after nicely killing cluster after experiment")
    parser.add_argument("-o", "--output", default="results",
                        type=lambda p: pathlib.Path(p).absolute(),
                        help="path to output the results to")
    parser.add_argument("experiment", type=pathlib.Path,
                        help="path to a JSON file describing the experiment setup")