# Declarative description of the "checkpoint" command group and its
# download / describe / delete subcommands.
main_cmd = Cmd(
    "c|heckpoint",
    None,
    "manage checkpoints",
    [
        # download: positional UUID plus optional output directory / quiet flag.
        Cmd(
            "download",
            download,
            "download checkpoint from persistent storage",
            [
                Arg("uuid", type=str, help="Download a checkpoint by specifying its UUID."),
                Arg("-o", "--output-dir", type=str, help="Desired output directory for the checkpoint."),
                Arg("-q", "--quiet", action="store_true", help="Only print the path to the checkpoint."),
            ],
        ),
        # describe: a single positional UUID.
        Cmd(
            "describe",
            describe,
            "describe checkpoint",
            [
                Arg("uuid", type=str, help="checkpoint uuid to describe"),
            ],
        ),
        # delete: comma-separated UUID list; --yes skips the prompts.
        Cmd(
            "delete",
            delete_checkpoints,
            "delete checkpoints",
            [
                Arg("checkpoints_uuids", help="comma-separated list of checkpoints to delete"),
                Arg(
                    "--yes",
                    action="store_true",
                    default=False,
                    help="automatically answer yes to prompts",
                ),
            ],
        ),
    ],
)
# Declarative description of the "local" deployment command group:
# cluster-up/down, master-up/down, logs, and agent-up/down.
args_description = Cmd(
    "local",
    None,
    "local help",
    [
        # ---- cluster-up -----------------------------------------------------
        Cmd(
            "cluster-up",
            handle_cluster_up,
            "Create a Determined cluster",
            [
                Group(
                    Arg(
                        "--master-config-path",
                        type=Path,
                        default=None,
                        help="path to master configuration",
                    ),
                    Arg(
                        "--storage-host-path",
                        type=Path,
                        default=DEFAULT_STORAGE_HOST_PATH,
                        help="Storage location for cluster data (e.g. checkpoints)",
                    ),
                ),
                Arg("--agents", type=int, default=1, help="number of agents to start (on this machine)"),
                Arg("--master-port", type=int, default=8080, help="port to expose master on"),
                Arg("--cluster-name", type=str, default="determined", help="name for the cluster resources"),
                Arg("--det-version", type=str, default=None, help="version or commit to use"),
                Arg("--db-password", type=str, default="postgres", help="password for master database"),
                Arg("--delete-db", action="store_true", help="remove current master database"),
                # GPU support defaults to on unless the platform string contains "darwin".
                BoolOptArg(
                    "--gpu",
                    "--no-gpu",
                    dest="gpu",
                    default=("darwin" not in sys.platform),
                    true_help="enable GPU support for agent",
                    false_help="disable GPU support for agent",
                ),
                Arg(
                    "--no-autorestart",
                    action="store_true",
                    help="disable container auto-restart (recommended for local development)",
                ),
                # NOTE(review): default is None here but str(Path.home()) in master-up,
                # while the help text advertises the home directory -- confirm that
                # the handler substitutes the home directory for None.
                Arg(
                    "--auto-bind-mount",
                    type=str,
                    default=None,
                    help="directory to mount into task containers (default: user's home directory)",
                ),
                Arg(
                    "--no-auto-bind-mount",
                    action="store_true",
                    help="disable mounting user's home directory into task containers",
                ),
            ],
        ),
        # ---- cluster-down ---------------------------------------------------
        Cmd(
            "cluster-down",
            handle_cluster_down,
            "Stop a Determined cluster",
            [
                Arg("--cluster-name", type=str, default="determined", help="name for the cluster resources"),
                Arg("--delete-db", action="store_true", help="remove current master database"),
            ],
        ),
        # ---- master-up ------------------------------------------------------
        Cmd(
            "master-up",
            handle_master_up,
            "Start a Determined master",
            [
                # NOTE(review): these two paths are parsed as str here but as Path in
                # cluster-up -- confirm the handlers expect the differing types.
                Group(
                    Arg(
                        "--master-config-path",
                        type=str,
                        default=None,
                        help="path to master configuration",
                    ),
                    Arg(
                        "--storage-host-path",
                        type=str,
                        default=DEFAULT_STORAGE_HOST_PATH,
                        help="Storage location for cluster data (e.g. checkpoints)",
                    ),
                ),
                Arg("--master-port", type=int, default=8080, help="port to expose master on"),
                Arg("--master-name", type=str, default="determined", help="name for the cluster resources"),
                Arg("--det-version", type=str, default=None, help="version or commit to use"),
                Arg("--db-password", type=str, default="postgres", help="password for master database"),
                Arg("--delete-db", action="store_true", help="remove current master database"),
                Arg(
                    "--no-autorestart",
                    action="store_true",
                    help="disable container auto-restart (recommended for local development)",
                ),
                Arg(
                    "--auto-bind-mount",
                    type=str,
                    default=str(Path.home()),
                    help="directory to mount into task containers (default: user's home directory)",
                ),
                Arg(
                    "--no-auto-bind-mount",
                    action="store_true",
                    help="disable mounting user's home directory into task containers",
                ),
                Arg("--cluster-name", type=str, default="determined", help="name for the cluster resources"),
            ],
        ),
        # ---- master-down ----------------------------------------------------
        Cmd(
            "master-down",
            handle_master_down,
            "Stop a Determined master",
            [
                Arg("--master-name", type=str, default="determined", help="name for the cluster resources"),
                Arg("--delete-db", action="store_true", help="remove current master database"),
                Arg("--cluster-name", type=str, default="determined", help="name for the cluster resources"),
            ],
        ),
        # ---- logs -----------------------------------------------------------
        Cmd(
            "logs",
            handle_logs,
            "Show the logs of a Determined cluster",
            [
                Arg("--cluster-name", type=str, default="determined", help="name for the cluster resources"),
                Arg("--no-follow", action="store_true", help="disable following logs"),
            ],
        ),
        # ---- agent-up -------------------------------------------------------
        Cmd(
            "agent-up",
            handle_agent_up,
            "Start a Determined agent",
            [
                Arg("master_host", type=str, help="master hostname"),
                Arg("--master-port", type=int, default=8080, help="master port"),
                Arg("--det-version", type=str, default=None, help="version or commit to use"),
                Arg("--agent-name", type=str, default="det-agent", help="agent name"),
                Arg("--agent-label", type=str, default=None, help="agent label"),
                Arg("--agent-resource-pool", type=str, default=None, help="agent resource pool"),
                BoolOptArg(
                    "--gpu",
                    "--no-gpu",
                    dest="gpu",
                    default=("darwin" not in sys.platform),
                    true_help="enable GPU support for agent",
                    false_help="disable GPU support for agent",
                ),
                Arg(
                    "--no-autorestart",
                    action="store_true",
                    help="disable container auto-restart (recommended for local development)",
                ),
                Arg("--cluster-name", type=str, default="determined", help="name for the cluster resources"),
            ],
        ),
        # ---- agent-down -----------------------------------------------------
        Cmd(
            "agent-down",
            handle_agent_down,
            "Stop a Determined agent",
            [
                Arg("--agent-name", type=str, default="det-agent", help="agent name"),
                Arg("--all", action="store_true", help="stop all running agents"),
                Arg("--cluster-name", type=str, default="determined", help="name for the cluster resources"),
            ],
        ),
    ],
)
# Declarative description of the "template" command group (alias "tpl").
# fmt: off
args_description = [
    Cmd("template tpl", None, "manage config templates", [
        # Listing is the default action when no subcommand is given.
        Cmd("list ls", list_template, "list config templates",
            [
                Arg("-d", "--details",
                    action="store_true",
                    help="show the configs of the templates"),
            ],
            is_default=True),
        Cmd("describe", describe_template, "describe config template",
            [
                Arg("template_name", type=str, help="template name"),
            ]),
        # The template file is opened for reading by argparse itself.
        Cmd("set", set_template, "set config template",
            [
                Arg("template_name", help="template name"),
                Arg("template_file",
                    type=FileType("r"),
                    help="config template file (.yaml)"),
            ]),
        Cmd("remove rm", remove_templates, "remove config template",
            [
                Arg("template_name", help="template name"),
            ]),
    ])
]  # type: List[Any]
# fmt: on
@authentication_required
def remove_client(parsed_args: Namespace) -> None:
    """Delete the OAuth client identified by ``parsed_args.client_id``.

    Issues a DELETE against the master's ``oauth2/clients/<id>`` endpoint.

    Raises:
        EnterpriseOnlyError: if the master answers with ``NotFoundException``.
    """
    client_path = "oauth2/clients/{}".format(parsed_args.client_id)
    try:
        api.delete(parsed_args.master, client_path)
    except NotFoundException:
        raise EnterpriseOnlyError("API not found: oauth2/clients")


# Declarative description of the "oauth" command group.
# fmt: off
args_description = [
    Cmd("oauth", None, "manage OAuth", [
        Cmd("client", None, "manage clients", [
            # Listing is the default action for "oauth client".
            Cmd("list", list_clients, "list OAuth client applications",
                [],
                is_default=True),
            Cmd("add", add_client, "add OAuth client application",
                [
                    Arg("name", type=str, help="descriptive name"),
                    Arg("domain", type=str, help="redirect domain"),
                ]),
            Cmd("remove", remove_client, "remove OAuth client application",
                [
                    Arg("client_id", help="OAuth client ID to remove"),
                ]),
        ])
    ])
]  # type: List[Any]
# fmt: on
# Declarative description of the "checkpoint" command group
# (download and describe subcommands).
args_description = Cmd(
    "c|heckpoint",
    None,
    "manage checkpoints",
    [
        # download: positional UUID plus optional output directory / quiet flag.
        Cmd(
            "download",
            download,
            "download checkpoint from persistent storage",
            [
                Arg("uuid", type=str, help="Download a checkpoint by specifying its UUID."),
                Arg("-o", "--output-dir", type=str, help="Desired output directory for the checkpoint."),
                Arg("-q", "--quiet", action="store_true", help="Only print the path to the checkpoint."),
            ],
        ),
        # describe: a single positional UUID.
        Cmd(
            "describe",
            describe,
            "describe checkpoint",
            [
                Arg("uuid", type=str, help="checkpoint uuid to describe"),
            ],
        ),
    ],
)
# Declarative description of the "trial" command group:
# describe, download, logs, and kill.
Cmd(
    "t|rial",
    None,
    "manage trials",
    [
        Cmd(
            "describe",
            describe_trial,
            "describe trial",
            [
                Arg("trial_id", type=int, help="trial ID"),
                Arg("--metrics", action="store_true", help="display full metrics"),
                Group(
                    Arg("--csv", action="store_true", help="print as CSV"),
                    Arg("--json", action="store_true", help="print JSON"),
                ),
            ],
        ),
        Cmd(
            "download",
            download,
            "download checkpoint for trial",
            [
                Arg("trial_id", type=int, help="trial ID"),
                # The group is declared with required=True: one selector is mandatory.
                Group(
                    Arg(
                        "--best",
                        action="store_true",
                        help="download the checkpoint with the best validation metric",
                    ),
                    Arg(
                        "--latest",
                        action="store_true",
                        help="download the most recent checkpoint",
                    ),
                    Arg(
                        "--uuid",
                        type=str,
                        help="download a checkpoint by specifying its UUID",
                    ),
                    required=True,
                ),
                Arg(
                    "-o",
                    "--output-dir",
                    type=str,
                    default=None,
                    help="Desired output directory for the checkpoint",
                ),
                Arg(
                    "--sort-by",
                    type=str,
                    default=None,
                    help="The name of the validation metric to sort on. This argument is only "
                    "used with --best. If --best is passed without --sort-by, the "
                    "experiment's searcher metric is assumed. If this argument is specified, "
                    "--smaller-is-better must also be specified.",
                ),
                # NOTE(review): distutils was removed from the standard library in
                # Python 3.12 (PEP 632); this converter needs a replacement before
                # the CLI can run on that version.
                Arg(
                    "--smaller-is-better",
                    type=lambda s: bool(distutils.util.strtobool(s)),
                    default=None,
                    help="The sort order for metrics when using --best with --sort-by. For "
                    "example, 'accuracy' would require passing '--smaller-is-better false'. If "
                    "--sort-by is specified, this argument must be specified.",
                ),
                Arg(
                    "-q",
                    "--quiet",
                    action="store_true",
                    help="only print the path to the checkpoint",
                ),
            ],
        ),
        Cmd(
            "logs",
            trial_logs,
            "fetch trial logs",
            [
                Arg("trial_id", type=int, help="trial ID"),
                Arg(
                    "-f",
                    "--follow",
                    action="store_true",
                    help="follow the logs of a running trial, similar to tail -f",
                ),
                Group(
                    Arg(
                        "--head",
                        type=int,
                        help="number of lines to show, counting from the beginning "
                        "of the log (default is all)",
                    ),
                    Arg(
                        "--tail",
                        type=int,
                        help="number of lines to show, counting from the end "
                        "of the log (default is all)",
                    ),
                ),
                # Repeatable filters: each occurrence appends to the dest list.
                Arg(
                    "--agent-id",
                    dest="agent_ids",
                    action="append",
                    help="agents to show logs from (repeat for multiple values)",
                ),
                Arg(
                    "--container-id",
                    dest="container_ids",
                    action="append",
                    help="containers to show logs from (repeat for multiple values)",
                ),
                # Help text previously duplicated the --container-id wording.
                Arg(
                    "--rank-id",
                    dest="rank_ids",
                    type=int,
                    action="append",
                    help="ranks to show logs from (repeat for multiple values)",
                ),
                Arg(
                    "--timestamp-before",
                    help="show logs only from before (RFC 3339 format)",
                ),
                Arg(
                    "--timestamp-after",
                    help="show logs only from after (RFC 3339 format)",
                ),
                Arg(
                    "--level",
                    dest="level",
                    help="show logs with this level or higher "
                    + "(TRACE, DEBUG, INFO, WARNING, ERROR, CRITICAL)",
                ),
                Arg(
                    "--source",
                    dest="sources",
                    action="append",
                    help="sources to show logs from (repeat for multiple values)",
                ),
                Arg(
                    "--stdtype",
                    dest="stdtypes",
                    action="append",
                    help="output stream to show logs from (repeat for multiple values)",
                ),
            ],
        ),
        Cmd(
            "kill", kill_trial, "forcibly terminate a trial", [Arg("trial_id", help="trial ID")]
        ),
    ],
),
# Declarative description of the "command" group (alias "cmd").
Cmd("command cmd", None, "manage commands", [
    # Listing is the default action when no subcommand is given.
    Cmd("list ls", command.list_tasks, "list commands",
        [
            Arg("-q", "--quiet", action="store_true", help="only display the IDs"),
            Arg("--all", "-a", action="store_true",
                help="show all commands (including other users')"),
            Group(
                Arg("--csv", action="store_true", help="print as CSV"),
                Arg("--json", action="store_true", help="print as JSON"),
            ),
        ],
        is_default=True),
    Cmd("config", command.config, "display command config",
        [
            Arg("command_id", type=str, help="command ID"),
        ]),
    Cmd("run", run_command, "create command",
        [
            # REMAINDER: everything after the options is taken as the entrypoint.
            Arg("entrypoint", type=str, nargs=REMAINDER,
                help="entrypoint command and arguments to execute"),
            Arg("--config-file", default=None, type=FileType("r"),
                help="command config file (.yaml)"),
            Arg("-v", "--volume", action="append", default=[], help=VOLUME_DESC),
            Arg("-c", "--context", default=None, type=Path, help=CONTEXT_DESC),
            Arg("--config", action="append", default=[], help=CONFIG_DESC),
            Arg("--template", type=str,
                help="name of template to apply to the command configuration"),
            Arg("-d", "--detach", action="store_true",
                help="run in the background and print the ID"),
        ]),
    # The lambda defers the lookup of task.logs until the command is invoked.
    Cmd("logs", lambda *args, **kwargs: task.logs(*args, **kwargs), "fetch command logs",
        [
            Arg("task_id", help="command ID", metavar="command_id"),
            *task.common_log_options,
        ]),
    Cmd("kill", command.kill, "forcibly terminate a command",
        [
            Arg("command_id", help="command ID", nargs=ONE_OR_MORE),
            Arg("-f", "--force", action="store_true", help="ignore errors"),
        ]),
    Cmd("set", None, "set command attributes",
        [
            Cmd("priority", command.set_priority, "set command priority",
                [
                    Arg("command_id", help="command ID"),
                    Arg("priority", type=int, help="priority"),
                ]),
        ]),
])
# Declarative description of the "project" command group. Every subcommand
# identifies its target by workspace name (and project name where relevant).
Cmd(
    "p|roject",
    None,
    "manage projects",
    [
        Cmd(
            "list",
            list_workspace_projects,
            "list the projects associated with a workspace",
            [
                Arg("workspace_name", type=str, help="name of the workspace"),
                # This command lists projects, so the help text says "projects"
                # (it previously said "workspaces").
                Arg(
                    "--sort-by",
                    type=str,
                    choices=["id", "name"],
                    default="id",
                    help="sort projects by the given field",
                ),
                Arg(
                    "--order-by",
                    type=str,
                    choices=["asc", "desc"],
                    default="asc",
                    help="order projects in either ascending or descending order",
                ),
                *pagination_args,
                Arg("--json", action="store_true", help="print as JSON"),
            ],
        ),
        Cmd(
            "list-experiments",
            list_project_experiments,
            "list the experiments associated with a project",
            [
                Arg("workspace_name", type=str, help="name of the workspace"),
                Arg("project_name", type=str, help="name of the project"),
                Arg(
                    "--all",
                    "-a",
                    action="store_true",
                    default=False,
                    help="show all experiments (including archived and other users')",
                ),
                # This command lists experiments, so the help text says
                # "experiments" (it previously said "workspaces").
                Arg(
                    "--sort-by",
                    type=str,
                    choices=["id", "name"],
                    default="id",
                    help="sort experiments by the given field",
                ),
                Arg(
                    "--order-by",
                    type=str,
                    choices=["asc", "desc"],
                    default="asc",
                    help="order experiments in either ascending or descending order",
                ),
                *pagination_args,
                Arg("--json", action="store_true", help="print as JSON"),
            ],
        ),
        Cmd(
            "create",
            create_project,
            "create project",
            [
                Arg("workspace_name", type=str, help="name of the workspace"),
                Arg("name", type=str, help="name of the project"),
                Arg("--description", type=str, help="description of the project"),
                Arg("--json", action="store_true", help="print as JSON"),
            ],
        ),
        Cmd(
            "delete",
            delete_project,
            "delete project",
            [
                Arg("workspace_name", type=str, help="name of the workspace"),
                Arg("project_name", type=str, help="name of the project"),
                Arg(
                    "--yes",
                    action="store_true",
                    default=False,
                    help="automatically answer yes to prompts",
                ),
            ],
        ),
        Cmd(
            "archive",
            archive_project,
            "archive project",
            [
                Arg("workspace_name", type=str, help="name of the workspace"),
                Arg("project_name", type=str, help="name of the project"),
            ],
        ),
        Cmd(
            "unarchive",
            unarchive_project,
            "unarchive project",
            [
                Arg("workspace_name", type=str, help="name of the workspace"),
                Arg("project_name", type=str, help="name of the project"),
            ],
        ),
        Cmd(
            "describe",
            describe_project,
            "describe project",
            [
                Arg("workspace_name", type=str, help="name of the workspace"),
                Arg("project_name", type=str, help="name of the project"),
                Arg(
                    "--all",
                    "-a",
                    action="store_true",
                    default=False,
                    help="show all experiments (including archived and other users')",
                ),
                Arg("--json", action="store_true", help="print as JSON"),
            ],
        ),
        Cmd(
            "edit",
            edit_project,
            "edit project",
            [
                Arg("workspace_name", type=str, help="current name of the workspace"),
                Arg("project_name", type=str, help="name of the project"),
                # The underscore spelling is kept: renaming the flag would break
                # existing invocations.
                Arg("--new_name", type=str, help="new name of the project"),
                Arg("--description", type=str, help="description of the project"),
                Arg("--json", action="store_true", help="print as JSON"),
            ],
        ),
    ],
)
Arg( "--image-repo-prefix", type=str, default="determinedai", help= "Docker image repository to use for determined-master and determined-agent images", ), local_args_description, aws_args_description, gcp_args_description, ] DEPLOY_CMD_NAME = "d|eploy" args_description = Cmd( DEPLOY_CMD_NAME, None, "manage deployments", args_subs, ) def main() -> None: """Deprecated entry point for standalone `det-deploy`.""" parser = argparse.ArgumentParser( description="Manage Determined deployments.", formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) add_args(parser, args_subs) parsed_args = parser.parse_args() v = vars(parsed_args) if not v.get("func"):
# Declarative description of the "gke-experimental" command group:
# "up" creates a GKE cluster, "down" deletes one.
args_description = Cmd(
    "gke-experimental",
    None,
    "GKE help",
    [
        Cmd(
            "up",
            handle_up,
            "create gke cluster",
            [
                ArgGroup(
                    "required named arguments",
                    None,
                    [
                        Arg(
                            "--cluster-id",
                            type=str,
                            default=None,
                            required=True,
                            help="a unique name for the gke cluster",
                        ),
                    ],
                ),
                ArgGroup(
                    "optional named arguments",
                    None,
                    [
                        Arg(
                            "--agent-node-pool-name",
                            "--gpu-node-pool-name",
                            type=str,
                            default=None,
                            help="a unique name for the GPU node pool",
                        ),
                        Arg(
                            "--gcs-bucket-name",
                            type=str,
                            default=None,
                            help="a unique name for the GCS bucket that will store your"
                            " checkpoints",
                        ),
                        Arg(
                            "--gpu-type",
                            type=str,
                            default=defaults.GPU_TYPE,
                            required=False,
                            help="accelerator type to use for agents",
                        ),
                        Arg(
                            "--cpu-only",
                            required=False,
                            help="Flag to create a CPU Only Determined Instance.",
                            action="store_true",
                        ),
                        Arg(
                            "--gpus-per-node",
                            type=int,
                            default=defaults.GPUS_PER_NODE,
                            required=False,
                            help="number of GPUs per node",
                        ),
                        Arg(
                            "--helm-dir",
                            type=str,
                            default="helm/charts/determined",
                            required=False,
                            help="directory containing Helm Chart, values.yaml and templates.",
                        ),
                        # Hidden from --help output via argparse.SUPPRESS.
                        Arg(
                            "--det-version",
                            type=str,
                            default=None,
                            help=argparse.SUPPRESS,
                        ),
                        Arg(
                            "--no-managed-bucket",
                            required=False,
                            help="flag that indicates GCS checkpointing bucket already exists",
                            action="store_true",
                        ),
                        Arg(
                            "--zone",
                            type=str,
                            default=defaults.ZONE,
                            help="zone to create cluster in",
                        ),
                        Arg(
                            "--master-machine-type",
                            type=str,
                            default=defaults.MASTER_MACHINE_TYPE,
                            help="machine type to use for master node group",
                        ),
                        Arg(
                            "--agent-machine-type",
                            "--machine-type",
                            type=str,
                            default=defaults.AGENT_MACHINE_TYPE,
                            help="machine type to use for agent node group",
                        ),
                        Arg(
                            "--max-gpu-nodes",
                            "--max-nodes",
                            type=int,
                            default=defaults.MAX_GPU_NODES,
                            help="maximum number of nodes for the GPU node group",
                        ),
                        Arg(
                            "--max-cpu-nodes",
                            type=int,
                            default=defaults.MAX_CPU_NODES,
                            help="maximum number of nodes for the CPU node group",
                        ),
                        # Help text previously said "GPU node pool" (copy-paste slip).
                        Arg(
                            "--cpu-node-pool-name",
                            type=str,
                            default=None,
                            help="a unique name for the CPU node pool",
                        ),
                        Arg(
                            "--multiple-node-pools",
                            required=False,
                            help="flag that indicates multiple node pools should be used - one"
                            " for CPU only tasks and one for GPU tasks",
                            action="store_true",
                        ),
                        # A space was missing between the first two sentences of
                        # the concatenated help text ("Nodepool.If").
                        Arg(
                            "--gpu-coscheduler",
                            "--coscheduler",
                            required=False,
                            help="Enables the lightweight coscheduling plugin for Kubernetes that"
                            " provides priority-based gang scheduling for the GPU Agent Nodepool."
                            " If this argument is set, cluster autoscaling is disabled, and"
                            " --max-gpu-nodes nodes are statically allocated for the GPU Agent Node"
                            " pool at creation time.",
                            action="store_true",
                        ),
                        # The concatenated help text previously contained a double
                        # space ("for the  GPU Agent").
                        Arg(
                            "--preemption",
                            "--preemptive-scheduler",
                            required=False,
                            help="Enables the priority-based scheduler with preemption on the GPU"
                            " Agent Nodepool. If this argument is set, cluster autoscaling is"
                            " disabled, and --max-gpu-nodes nodes are statically allocated for the"
                            " GPU Agent Node pool at creation time.",
                            action="store_true",
                        ),
                    ],
                ),
            ],
        ),
        Cmd(
            "down",
            handle_down,
            "delete gke cluster",
            [
                ArgGroup(
                    "required named arguments",
                    None,
                    [
                        Arg(
                            "--cluster-id",
                            type=str,
                            default=None,
                            required=True,
                            help="the gke cluster to delete",
                        )
                    ],
                ),
                ArgGroup(
                    "optional named arguments",
                    None,
                    [
                        Arg(
                            "--region",
                            type=str,
                            default="us-west1",
                            help="region containing cluster to delete",
                        ),
                        Arg(
                            "--no-managed-bucket",
                            required=False,
                            help="GCS checkpointing bucket is managed externally",
                            action="store_true",
                        ),
                        Arg(
                            "--gcs-bucket-name",
                            type=str,
                            default=None,
                            help="a unique name for the GCS bucket that will store your"
                            " checkpoints",
                        ),
                    ],
                ),
            ],
        ),
    ],
)
# Declarative description of the "notebook" command group.
Cmd("notebook", None, "manage notebooks", [
    # Listing is the default action when no subcommand is given.
    Cmd("list ls", command.list, "list notebooks",
        [
            Arg("-q", "--quiet", action="store_true", help="only display the IDs"),
            Arg("--all", "-a", action="store_true",
                help="show all notebooks (including other users')"),
        ],
        is_default=True),
    Cmd("config", command.config, "display notebook config",
        [
            Arg("id", type=str, help="notebook ID"),
        ]),
    Cmd("start", start_notebook, "start a new notebook",
        [
            Arg("--config-file", default=None, type=FileType("r"),
                help="command config file (.yaml)"),
            Arg("-v", "--volume", action="append", default=[], help=VOLUME_DESC),
            Arg("-c", "--context", default=None, type=Path, help=CONTEXT_DESC),
            Arg("--config", action="append", default=[], help=CONFIG_DESC),
            Arg("--template", type=str,
                help="name of template to apply to the notebook configuration"),
            Arg("--no-browser", action="store_true",
                help="don't open the notebook in a browser after startup"),
            Arg("-d", "--detach", action="store_true",
                help="run in the background and print the ID"),
            Arg("--preview", action="store_true",
                help="preview the notebook configuration"),
        ]),
    Cmd("open", open_notebook, "open an existing notebook",
        [
            Arg("notebook_id", help="notebook ID"),
        ]),
    Cmd("logs", command.tail_logs, "fetch notebook logs",
        [
            Arg("notebook_id", help="notebook ID"),
            Arg("-f", "--follow", action="store_true",
                help="follow the logs of a notebook, similar to tail -f"),
            # Unlike --follow, --tail carries a numeric default (200 lines).
            Arg("--tail", type=int, default=200,
                help="number of lines to show, counting from the end "
                     "of the log"),
        ]),
    Cmd("kill", command.kill, "kill a notebook",
        [
            Arg("notebook_id", help="notebook ID", nargs=ONE_OR_MORE),
            Arg("-f", "--force", action="store_true", help="ignore errors"),
        ]),
])
# Declarative description of the "notebook" command group, including the
# task-based "logs" subcommand and "set priority".
Cmd("notebook", None, "manage notebooks", [
    # Listing is the default action when no subcommand is given.
    Cmd("list ls", command.list_tasks, "list notebooks",
        [
            Arg("-q", "--quiet", action="store_true", help="only display the IDs"),
            Arg("--all", "-a", action="store_true",
                help="show all notebooks (including other users')"),
        ],
        is_default=True),
    Cmd("config", command.config, "display notebook config",
        [
            Arg("notebook_id", type=str, help="notebook ID"),
        ]),
    Cmd("start", start_notebook, "start a new notebook",
        [
            Arg("--config-file", default=None, type=FileType("r"),
                help="command config file (.yaml)"),
            Arg("-v", "--volume", action="append", default=[], help=VOLUME_DESC),
            Arg("-c", "--context", default=None, type=Path, help=CONTEXT_DESC),
            Arg("--config", action="append", default=[], help=CONFIG_DESC),
            Arg("--template", type=str,
                help="name of template to apply to the notebook configuration"),
            Arg("--no-browser", action="store_true",
                help="don't open the notebook in a browser after startup"),
            Arg("-d", "--detach", action="store_true",
                help="run in the background and print the ID"),
            Arg("--preview", action="store_true",
                help="preview the notebook configuration"),
        ]),
    Cmd("open", open_notebook, "open an existing notebook",
        [
            Arg("notebook_id", help="notebook ID"),
        ]),
    # The lambda defers the lookup of task.logs until the command is invoked.
    Cmd("logs", lambda *args, **kwargs: task.logs(*args, **kwargs), "fetch notebook logs",
        [
            Arg("task_id", help="notebook ID", metavar="notebook_id"),
            *task.common_log_options,
        ]),
    Cmd("kill", command.kill, "kill a notebook",
        [
            Arg("notebook_id", help="notebook ID", nargs=ONE_OR_MORE),
            Arg("-f", "--force", action="store_true", help="ignore errors"),
        ]),
    Cmd("set", None, "set notebook attributes",
        [
            Cmd("priority", command.set_priority, "set notebook priority",
                [
                    Arg("notebook_id", help="notebook ID"),
                    Arg("priority", type=int, help="priority"),
                ]),
        ]),
])
# Declarative description of the "aws" deployment command group:
# list / down / up for CloudFormation stacks.
args_description = Cmd(
    "aws",
    None,
    "AWS help",
    [
        # ---- list -----------------------------------------------------------
        Cmd(
            "list",
            handle_list,
            "list CloudFormation stacks",
            [
                Arg("--region", type=str, default=None, help="AWS region"),
                Arg("--profile", type=str, default=None, help="AWS profile"),
            ],
        ),
        # ---- down -----------------------------------------------------------
        Cmd(
            "down",
            handle_down,
            "delete CloudFormation stack",
            [
                ArgGroup(
                    "required named arguments",
                    None,
                    [
                        Arg(
                            "--cluster-id",
                            type=str,
                            required=True,
                            help="stack name for CloudFormation cluster",
                        ),
                    ],
                ),
                Arg("--region", type=str, default=None, help="AWS region"),
                Arg("--profile", type=str, default=None, help="AWS profile"),
            ],
        ),
        # ---- up -------------------------------------------------------------
        Cmd(
            "up",
            handle_up,
            "deploy/update CloudFormation stack",
            [
                ArgGroup(
                    "required named arguments",
                    None,
                    [
                        Arg(
                            "--cluster-id",
                            type=str,
                            required=True,
                            help="stack name for CloudFormation cluster",
                        ),
                        Arg(
                            "--keypair",
                            type=str,
                            required=True,
                            help="aws ec2 keypair for master and agent",
                        ),
                    ],
                ),
                Arg("--region", type=str, default=None, help="AWS region"),
                Arg("--profile", type=str, default=None, help="AWS profile"),
                Arg("--master-instance-type", type=str, help="instance type for master"),
                Arg(
                    "--enable-cors",
                    action="store_true",
                    help="allow CORS requests or not: true/false",
                ),
                # The three TLS-related options are declared without help text.
                Arg("--master-tls-cert"),
                Arg("--master-tls-key"),
                Arg("--master-cert-name"),
                Arg(
                    "--gpu-agent-instance-type",
                    type=str,
                    help="instance type for agent in the GPU resource pool",
                ),
                Arg(
                    "--cpu-agent-instance-type",
                    type=str,
                    help="instance type for agent in the CPU resource pool",
                ),
                # The help text enumerates the allowed deployment types.
                Arg(
                    "--deployment-type",
                    type=str,
                    choices=constants.deployment_types.DEPLOYMENT_TYPES,
                    default=constants.defaults.DEPLOYMENT_TYPE,
                    help="deployment type - "
                    f'must be one of [{", ".join(constants.deployment_types.DEPLOYMENT_TYPES)}]',
                ),
                Arg("--inbound-cidr", type=str, help="inbound IP Range in CIDR format"),
                Arg(
                    "--agent-subnet-id",
                    type=str,
                    help="subnet to deploy agents into. Optional. "
                    "Only used with simple deployment type",
                ),
                # Hidden from --help output via argparse.SUPPRESS.
                Arg("--det-version", type=str, help=argparse.SUPPRESS),
                Arg(
                    "--db-password",
                    type=str,
                    default=constants.defaults.DB_PASSWORD,
                    help="password for master database",
                ),
                Arg("--max-idle-agent-period", type=str, help="max agent idle time"),
                Arg("--max-agent-starting-period", type=str, help="max agent starting time"),
                Arg(
                    "--max-cpu-containers-per-agent",
                    type=int,
                    help="maximum number of cpu containers on agent in the CPU resource pool",
                ),
                Arg(
                    "--min-dynamic-agents",
                    type=int,
                    help="minimum number of dynamic agent instances at one time",
                ),
                Arg(
                    "--max-dynamic-agents",
                    type=int,
                    help="maximum number of dynamic agent instances at one time",
                ),
                Arg("--spot", action="store_true", help="whether to use spot instances or not"),
                # The type callables are produced by validator factories.
                Arg(
                    "--spot-max-price",
                    type=validate_spot_max_price(),
                    help="maximum hourly price for the spot instance "
                    "(do not include the dollar sign)",
                ),
                Arg(
                    "--scheduler-type",
                    type=validate_scheduler_type(),
                    default="fair_share",
                    help="scheduler to use (defaults to fair_share).",
                ),
                Arg(
                    "--preemption-enabled",
                    type=str,
                    default="false",
                    help="whether task preemption is supported in the scheduler "
                    "(only configurable for priority scheduler).",
                ),
                Arg("--dry-run", action="store_true", help="print deployment template"),
                Arg("--cpu-env-image", type=str, help="Docker image for CPU tasks"),
                Arg("--gpu-env-image", type=str, help="Docker image for GPU tasks"),
                Arg(
                    "--log-group-prefix",
                    type=str,
                    help="prefix for output CloudWatch log group",
                ),
                # store_const: passing the flag stores the string "true".
                Arg(
                    "--retain-log-group",
                    action="store_const",
                    const="true",
                    help="whether to retain CloudWatch log group after the stack is deleted"
                    " (only available for the simple template)",
                ),
            ],
        ),
    ],
)
# Declarative description of the "agent" command group.
Cmd("a|gent", None, "manage agents", [
    # Listing is the default action when no subcommand is given.
    Cmd("list", list_agents, "list agents",
        [
            Group(
                Arg("--csv", action="store_true", help="print as CSV"),
                Arg("--json", action="store_true", help="print as JSON"),
            ),
        ],
        is_default=True),
    # enable/disable share one handler factory, called with True / False.
    Cmd("enable", patch_agent(True), "enable agent",
        [
            Group(
                Arg("agent_id", help="agent ID", nargs="?", completer=agent_id_completer),
                Arg("--all", action="store_true", help="enable all agents"),
            ),
        ]),
    Cmd("disable", patch_agent(False), "disable agent",
        [
            Group(
                Arg("agent_id", help="agent ID", nargs="?", completer=agent_id_completer),
                Arg("--all", action="store_true", help="disable all agents"),
            ),
            Arg("--drain", action="store_true",
                help="enter drain mode, allowing the tasks currently running on "
                     "the disabled agents to finish. will also print these tasks, if any"),
            Group(
                Arg("--csv", action="store_true", help="print as CSV"),
                Arg("--json", action="store_true", help="print as JSON"),
            ),
        ]),
]),
# Declarative description of the "shell" command group.
Cmd("shell", None, "manage shells", [
    # Listing is the default action when no subcommand is given.
    Cmd("list", partial(command.list_tasks), "list shells",
        [
            Arg("-q", "--quiet", action="store_true", help="only display the IDs"),
            Arg("--all", "-a", action="store_true",
                help="show all shells (including other users')"),
            Group(format_args["json"], format_args["csv"]),
        ],
        is_default=True),
    Cmd("config", partial(command.config), "display shell config",
        [
            Arg("shell_id", type=str, help="shell ID"),
        ]),
    Cmd("start", start_shell, "start a new shell",
        [
            Arg("ssh_opts", nargs="*",
                help="additional SSH options when connecting to the shell"),
            Arg("--config-file", default=None, type=FileType("r"),
                help="command config file (.yaml)"),
            Arg("-v", "--volume", action="append", default=[], help=VOLUME_DESC),
            Arg("-c", "--context", default=None, type=Path, help=CONTEXT_DESC),
            Arg("--config", action="append", default=[], help=CONFIG_DESC),
            Arg("-p", "--passphrase", action="store_true",
                help="passphrase to encrypt the shell private key"),
            Arg("--template", type=str,
                help="name of template to apply to the shell configuration"),
            Arg("-d", "--detach", action="store_true",
                help="run in the background and print the ID"),
            Arg("--show-ssh-command", action="store_true",
                help="show ssh command (e.g. for use in IDE) when starting the shell"),
        ]),
    Cmd("open", open_shell, "open an existing shell",
        [
            Arg("shell_id", help="shell ID"),
            Arg("ssh_opts", nargs="*",
                help="additional SSH options when connecting to the shell"),
            Arg("--show-ssh-command", action="store_true",
                help="show ssh command (e.g. for use in IDE) when starting the shell"),
        ]),
    Cmd("show_ssh_command", show_ssh_command, "print the ssh command",
        [
            Arg("shell_id", help="shell ID"),
            Arg("ssh_opts", nargs="*",
                help="additional SSH options when connecting to the shell"),
        ]),
    Cmd("logs", partial(task.logs), "fetch shell logs",
        [
            Arg("task_id", help="shell ID", metavar="shell_id"),
            *task.common_log_options,
        ]),
    Cmd("kill", partial(command.kill), "kill a shell",
        [
            Arg("shell_id", help="shell ID", nargs=ONE_OR_MORE),
            Arg("-f", "--force", action="store_true", help="ignore errors"),
        ]),
    Cmd("set", None, "set shell attributes",
        [
            Cmd("priority", partial(command.set_priority), "set shell priority",
                [
                    Arg("shell_id", help="shell ID"),
                    Arg("priority", type=int, help="priority"),
                ]),
        ]),
])
# Declarative description of the "aws" deployment command group:
# list / down / up for CloudFormation stacks, plus a command that dumps the
# default master config template.
args_description = Cmd(
    "aws",
    None,
    "AWS help",
    [
        # ---- list -----------------------------------------------------------
        Cmd(
            "list",
            handle_list,
            "list CloudFormation stacks",
            [
                Arg("--region", type=str, default=None, help="AWS region"),
                Arg("--profile", type=str, default=None, help="AWS profile"),
            ],
        ),
        # ---- down -----------------------------------------------------------
        Cmd(
            "down",
            handle_down,
            "delete CloudFormation stack",
            [
                ArgGroup(
                    "required named arguments",
                    None,
                    [
                        Arg(
                            "--cluster-id",
                            type=str,
                            required=True,
                            help="stack name for CloudFormation cluster",
                        ),
                    ],
                ),
                Arg("--region", type=str, default=None, help="AWS region"),
                Arg("--profile", type=str, default=None, help="AWS profile"),
                Arg(
                    "--no-prompt",
                    action="store_true",
                    help="no prompt when deleting resources",
                ),
            ],
        ),
        # ---- up -------------------------------------------------------------
        Cmd(
            "up",
            handle_up,
            "deploy/update CloudFormation stack",
            [
                ArgGroup(
                    "required named arguments",
                    None,
                    [
                        Arg(
                            "--cluster-id",
                            type=str,
                            required=True,
                            help="stack name for CloudFormation cluster",
                        ),
                        Arg(
                            "--keypair",
                            type=str,
                            required=True,
                            help="aws ec2 keypair for master and agent",
                        ),
                    ],
                ),
                Arg("--region", type=str, default=None, help="AWS region"),
                Arg("--profile", type=str, default=None, help="AWS profile"),
                Arg("--master-instance-type", type=str, help="instance type for master"),
                Arg(
                    "--enable-cors",
                    action="store_true",
                    help="allow CORS requests or not: true/false",
                ),
                # The three TLS-related options are declared without help text.
                Arg("--master-tls-cert"),
                Arg("--master-tls-key"),
                Arg("--master-cert-name"),
                # Each of the next two options is registered under two flag names.
                Arg(
                    "--compute-agent-instance-type",
                    "--gpu-agent-instance-type",
                    type=str,
                    help="instance type for agents in the compute resource pool",
                ),
                Arg(
                    "--aux-agent-instance-type",
                    "--cpu-agent-instance-type",
                    type=str,
                    help="instance type for agents in the auxiliary resource pool",
                ),
                Arg(
                    "--deployment-type",
                    type=str,
                    choices=constants.deployment_types.DEPLOYMENT_TYPES,
                    default=constants.defaults.DEPLOYMENT_TYPE,
                    help="deployment type",
                ),
                Arg("--inbound-cidr", type=str, help="inbound IP Range in CIDR format"),
                Arg(
                    "--agent-subnet-id",
                    type=str,
                    help="subnet to deploy agents into. Optional. "
                    "Only used with simple deployment type",
                ),
                # Hidden from --help output via argparse.SUPPRESS.
                Arg("--det-version", type=str, help=argparse.SUPPRESS),
                Arg(
                    "--db-password",
                    type=str,
                    default=constants.defaults.DB_PASSWORD,
                    help="password for master database",
                ),
                Arg("--max-idle-agent-period", type=str, help="max agent idle time"),
                Arg("--max-agent-starting-period", type=str, help="max agent starting time"),
                Arg(
                    "--max-aux-containers-per-agent",
                    "--max-cpu-containers-per-agent",
                    type=int,
                    help="maximum number of containers on agents in the auxiliary resource pool",
                ),
                Arg(
                    "--min-dynamic-agents",
                    type=int,
                    help="minimum number of dynamic agent instances at one time",
                ),
                Arg(
                    "--max-dynamic-agents",
                    type=int,
                    help="maximum number of dynamic agent instances at one time",
                ),
                Arg("--spot", action="store_true", help="whether to use spot instances or not"),
                # The type callable is produced by a validator factory.
                Arg(
                    "--spot-max-price",
                    type=validate_spot_max_price(),
                    help="maximum hourly price for spot instances "
                    "(do not include the dollar sign)",
                ),
                Arg(
                    "--scheduler-type",
                    type=str,
                    choices=["fair_share", "priority", "round_robin"],
                    default="fair_share",
                    help="scheduler to use",
                ),
                Arg(
                    "--preemption-enabled",
                    type=str,
                    default="false",
                    help="whether task preemption is supported in the scheduler "
                    "(only configurable for priority scheduler).",
                ),
                Arg("--dry-run", action="store_true", help="print deployment template"),
                Arg("--cpu-env-image", type=str, help="Docker image for CPU tasks"),
                Arg("--gpu-env-image", type=str, help="Docker image for GPU tasks"),
                Arg(
                    "--log-group-prefix",
                    type=str,
                    help="prefix for output CloudWatch log group",
                ),
                # store_const: passing the flag stores the string "true".
                Arg(
                    "--retain-log-group",
                    action="store_const",
                    const="true",
                    help="whether to retain CloudWatch log group after the stack is deleted"
                    " (only available for the simple template)",
                ),
                Arg(
                    "--master-config-template-path",
                    type=Path,
                    default=None,
                    help="path to master yaml template",
                ),
                Arg(
                    "--efs-id",
                    type=str,
                    help="preexisting EFS that will be mounted into the task containers; "
                    "if not provided, a new EFS instance will be created. The agents must be "
                    "able to connect to the EFS instance.",
                ),
                Arg(
                    "--fsx-id",
                    type=str,
                    help="preexisting FSx that will be mounted into the task containers; "
                    "if not provided, a new FSx instance will be created. The agents must be "
                    "able to connect to the FSx instance.",
                ),
                Arg(
                    "--no-prompt",
                    action="store_true",
                    help="no prompt when deployment would delete existing database",
                ),
            ],
        ),
        # ---- dump-master-config-template ------------------------------------
        Cmd(
            "dump-master-config-template",
            handle_dump_master_config_template,
            "dump default master config template",
            [
                Arg(
                    "--deployment-type",
                    type=str,
                    choices=constants.deployment_types.DEPLOYMENT_TYPES,
                    default=constants.defaults.DEPLOYMENT_TYPE,
                    help="deployment type",
                ),
            ],
        ),
    ],
)
# fetch logs. response = api.get( args.master, "logs", params={"greater_than_id": str(latest_log_id)} ) latest_log_id = process_response(response, latest_log_id) except KeyboardInterrupt: break # fmt: off args_description = [ Cmd("master", None, "manage master", [ Cmd("config", config, "fetch master config", [ Group(format_args["json"], format_args["yaml"]) ]), Cmd("info", get_master, "fetch master info", [ Group(format_args["json"], format_args["yaml"]) ]), Cmd("logs", logs, "fetch master logs", [ Arg("-f", "--follow", action="store_true", help="follow the logs of master, similar to tail -f"), Arg("--tail", type=int, help="number of lines to show, counting from the end " "of the log (default is all)") ]), ]) ] # type: List[Any] # fmt: on
from determined.common.declarative_argparse import Cmd

# Top-level "d|eploy" command declaration. No handler is attached (None) and
# the child list is empty in this module.
deploy_cmd = Cmd("d|eploy", None, "manage deployments", [])
help="master address", metavar="address", default=get_default_master_address()), Arg("-v", "--version", action="version", help="print CLI version and exit", version="%(prog)s {}".format(determined.__version__)), experiment.args_description, checkpoint.args_description, Cmd( "task", None, "manage tasks (commands, experiments, notebooks, shells, tensorboards)", [ Cmd("list", list_tasks, "list tasks in cluster", [ Arg("--csv", action="store_true", help="print as CSV"), ], is_default=True), ]), Cmd("preview-search", preview_search, "preview search", [ Arg("config_file", type=FileType("r"), help="experiment config file (.yaml)") ]), deploy_args_description, ] # type: List[object] # fmt: on
# "res|ources" command group: no handler of its own, two subcommands.
Cmd(
    "res|ources",
    None,
    "query historical resource allocation",
    [
        # Takes two positional timestamps plus an optional output switch.
        Cmd(
            "raw",
            raw,
            "get raw allocation information",
            [
                Arg("timestamp_after"),
                Arg("timestamp_before"),
                Arg(
                    "--json",
                    action="store_true",
                    help="output JSON rather than CSV",
                ),
            ],
        ),
        # Takes two positional dates plus output/aggregation switches.
        Cmd(
            "agg|regated",
            aggregated,
            "get aggregated allocation information",
            [
                Arg("start_date", help="first date to include"),
                Arg("end_date", help="last date to include"),
                Arg(
                    "--json",
                    action="store_true",
                    help="output JSON rather than CSV",
                ),
                Arg("--monthly", action="store_true", help="aggregate by month rather than by day"),
            ],
        ),
    ],
)
# "m|odel" command group: no handler of its own, five subcommands.
Cmd(
    "m|odel",
    None,
    "manage models",
    [
        # Registered with is_default=True.
        Cmd(
            "list",
            list_models,
            "list all models in the registry",
            [
                Arg(
                    "--sort-by",
                    type=str,
                    choices=[
                        "name",
                        "description",
                        "creation_time",
                        "last_updated_time",
                    ],
                    default="last_updated_time",
                    help="sort models by the given field",
                ),
                Arg(
                    "--order-by",
                    type=str,
                    choices=["asc", "desc"],
                    default="asc",
                    help="order models in either ascending or descending order",
                ),
                Arg("--json", action="store_true", help="print as JSON"),
            ],
            is_default=True,
        ),
        Cmd(
            "register-version",
            register_version,
            "register a new version of a model",
            [
                Arg("name", type=str, help="name of the model"),
                Arg(
                    "uuid",
                    type=str,
                    help="uuid to register as the next version of the model",
                ),
                Arg("--json", action="store_true", help="print as JSON"),
            ],
        ),
        Cmd(
            "describe",
            describe,
            "describe model",
            [
                Arg("name", type=str, help="model to describe"),
                Arg("--json", action="store_true", help="print as JSON"),
                Arg(
                    "--version",
                    type=int,
                    default=0,
                    help="model version information to include in output",
                ),
            ],
        ),
        Cmd(
            "list-versions",
            list_versions,
            "list the versions of a model",
            [
                Arg("name", type=str, help="unique name of the model"),
                Arg("--json", action="store_true", help="print as JSON"),
            ],
        ),
        Cmd(
            "create",
            create,
            "create model",
            [
                Arg("name", type=str, help="unique name of the model"),
                Arg("--description", type=str, help="description of the model"),
                Arg("--json", action="store_true", help="print as JSON"),
            ],
        ),
    ],
)
# "command cmd" command group: no handler of its own, five subcommands.
Cmd(
    "command cmd",
    None,
    "manage commands",
    [
        # Registered with is_default=True.
        Cmd(
            "list ls",
            command.list,
            "list commands",
            [
                Arg("-q", "--quiet", action="store_true", help="only display the IDs"),
                Arg(
                    "--all",
                    "-a",
                    action="store_true",
                    help="show all commands (including other users')",
                ),
            ],
            is_default=True,
        ),
        Cmd(
            "config",
            command.config,
            "display command config",
            [
                Arg("id", type=str, help="command ID"),
            ],
        ),
        Cmd(
            "run",
            run_command,
            "create command",
            [
                # nargs=REMAINDER: everything after the options is collected
                # into the entrypoint.
                Arg(
                    "entrypoint",
                    type=str,
                    nargs=REMAINDER,
                    help="entrypoint command and arguments to execute",
                ),
                Arg(
                    "--config-file",
                    default=None,
                    type=FileType("r"),
                    help="command config file (.yaml)",
                ),
                Arg("-v", "--volume", action="append", default=[], help=VOLUME_DESC),
                Arg("-c", "--context", default=None, type=Path, help=CONTEXT_DESC),
                Arg("--config", action="append", default=[], help=CONFIG_DESC),
                Arg(
                    "--template",
                    type=str,
                    help="name of template to apply to the command configuration",
                ),
                Arg(
                    "-d",
                    "--detach",
                    action="store_true",
                    help="run in the background and print the ID",
                ),
            ],
        ),
        Cmd(
            "logs",
            command.tail_logs,
            "fetch command logs",
            [
                Arg("command_id", help="command ID"),
                Arg(
                    "-f",
                    "--follow",
                    action="store_true",
                    help="follow the logs of a command, similar to tail -f",
                ),
                Arg(
                    "--tail",
                    type=int,
                    default=200,
                    help="number of lines to show, counting from the end "
                    "of the log",
                ),
            ],
        ),
        Cmd(
            "kill",
            command.kill,
            "forcibly terminate a command",
            [
                Arg("command_id", help="command ID", nargs=ONE_OR_MORE),
                Arg("-f", "--force", action="store_true", help="ignore errors"),
            ],
        ),
    ],
)
def _str_to_bool(value: str) -> bool:
    """Convert a command-line string into a boolean.

    ``type=bool`` must not be used as an argparse converter: argparse would
    call ``bool()`` on the raw text, and every non-empty string -- including
    "false" -- is truthy.

    Args:
        value: raw text supplied on the command line (or a string default).

    Returns:
        True for "true"/"t"/"yes"/"y"/"on"/"1"; False for
        "false"/"f"/"no"/"n"/"off"/"0" and for the empty string. Matching is
        case-insensitive and ignores surrounding whitespace.

    Raises:
        argparse.ArgumentTypeError: if the text is not a recognised boolean.
    """
    normalized = value.strip().lower()
    if normalized in ("true", "t", "yes", "y", "on", "1"):
        return True
    # The empty string stays False, as it was under ``bool("")``.
    if normalized in ("false", "f", "no", "n", "off", "0", ""):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value (true/false), got {!r}".format(value)
    )


# Declarative description of the "gcp" command and its "down", "up" and
# "dump-master-config-template" subcommands.
args_description = Cmd(
    "gcp",
    None,
    "GCP help",
    [
        Cmd(
            "down",
            handle_down,
            "delete gcp cluster",
            [
                ArgGroup(
                    "optional named arguments",
                    None,
                    [
                        Arg(
                            "--local-state-path",
                            type=str,
                            default=os.getcwd(),
                            help="local directory for storing cluster state",
                        ),
                        Arg(
                            "--yes",
                            action="store_true",
                            help="no prompt when deleting resources",
                        ),
                        # Hidden flag that writes to the same destination as --yes.
                        Arg(
                            "--no-prompt",
                            dest="yes",
                            action="store_true",
                            help=argparse.SUPPRESS,
                        ),
                    ],
                ),
            ],
        ),
        Cmd(
            "up",
            handle_up,
            "create gcp cluster",
            [
                ArgGroup(
                    "required named arguments",
                    None,
                    [
                        Arg(
                            "--cluster-id",
                            type=validate_cluster_id(),
                            default=None,
                            required=True,
                            help="unique identifier to name and tag resources",
                        ),
                        Arg(
                            "--project-id",
                            type=str,
                            default=None,
                            required=True,
                            help="project ID to create the cluster in",
                        ),
                    ],
                ),
                ArgGroup(
                    "optional named arguments",
                    None,
                    [
                        Arg(
                            "--dry-run",
                            action="store_true",
                            help="return the infrastructure plan to be executed "
                            "based on your arguments",
                        ),
                        Arg(
                            "--keypath",
                            type=str,
                            default=None,
                            help="path to service account key if not using default credentials",
                        ),
                        Arg(
                            "--network",
                            type=str,
                            default="det-default",
                            help="network name to create "
                            "(the network should not already exist in the project)",
                        ),
                        Arg(
                            "--filestore-address",
                            type=str,
                            default="",
                            help="the address of an existing Filestore in the format of "
                            "'ip-address:/file-share'; if not provided and the no-filestore "
                            "flag is not set, a new Filestore instance will be created",
                        ),
                        Arg(
                            "--no-filestore",
                            help="whether to create a new Filestore if filestore-address "
                            "parameter is not set",
                            action="store_true",
                        ),
                        Arg(
                            "--det-version",
                            type=str,
                            default=determined.__version__,
                            help=argparse.SUPPRESS,
                        ),
                        Arg(
                            "--region",
                            type=str,
                            default=constants.defaults.REGION,
                            help="region to create the cluster in (defaults to us-west1)",
                        ),
                        Arg(
                            "--zone",
                            type=str,
                            default=None,
                            help="zone to create the cluster in (defaults to `region`-b)",
                        ),
                        Arg(
                            "--environment-image",
                            type=str,
                            default=constants.defaults.ENVIRONMENT_IMAGE,
                            help=argparse.SUPPRESS,
                        ),
                        Arg(
                            "--local-state-path",
                            type=str,
                            default=os.getcwd(),
                            help="local directory for storing cluster state",
                        ),
                        # Parsed with _str_to_bool so that "false" really means False.
                        Arg(
                            "--preemptible",
                            type=_str_to_bool,
                            default=False,
                            help="whether to use preemptible instances for dynamic agents",
                        ),
                        Arg(
                            "--operation-timeout-period",
                            type=str,
                            default=constants.defaults.OPERATION_TIMEOUT_PERIOD,
                            help="operation timeout before retrying, e.g. 5m for 5 minutes",
                        ),
                        Arg(
                            "--master-instance-type",
                            type=str,
                            default=constants.defaults.MASTER_INSTANCE_TYPE,
                            help="instance type for master",
                        ),
                        Arg(
                            "--compute-agent-instance-type",
                            "--gpu-agent-instance-type",
                            type=str,
                            default=constants.defaults.COMPUTE_AGENT_INSTANCE_TYPE,
                            help="instance type for agents in the compute resource pool",
                        ),
                        Arg(
                            "--aux-agent-instance-type",
                            "--cpu-agent-instance-type",
                            type=str,
                            default=constants.defaults.AUX_AGENT_INSTANCE_TYPE,
                            help="instance type for agents in the auxiliary resource pool",
                        ),
                        Arg(
                            "--db-password",
                            type=str,
                            default=constants.defaults.DB_PASSWORD,
                            help="password for master database",
                        ),
                        Arg(
                            "--max-aux-containers-per-agent",
                            "--max-cpu-containers-per-agent",
                            type=int,
                            default=constants.defaults.MAX_AUX_CONTAINERS_PER_AGENT,
                            help="maximum number of containers on agents in the "
                            "auxiliary resource pool",
                        ),
                        Arg(
                            "--max-idle-agent-period",
                            type=str,
                            default=constants.defaults.MAX_IDLE_AGENT_PERIOD,
                            help="max agent idle time before it is shut down, "
                            "e.g. 30m for 30 minutes",
                        ),
                        Arg(
                            "--max-agent-starting-period",
                            type=str,
                            default=constants.defaults.MAX_AGENT_STARTING_PERIOD,
                            help="max agent starting time before retrying, e.g. 30m for 30 minutes",
                        ),
                        Arg(
                            "--port",
                            type=int,
                            default=constants.defaults.PORT,
                            help="port to use for communication on master instance",
                        ),
                        Arg(
                            "--gpu-type",
                            type=str,
                            default=constants.defaults.GPU_TYPE,
                            help="type of GPU to use on agents",
                        ),
                        Arg(
                            "--gpu-num",
                            type=int,
                            default=constants.defaults.GPU_NUM,
                            help="number of GPUs per agent instance",
                        ),
                        Arg(
                            "--min-dynamic-agents",
                            type=int,
                            default=constants.defaults.MIN_DYNAMIC_AGENTS,
                            help="minimum number of dynamic agent instances at one time",
                        ),
                        Arg(
                            "--max-dynamic-agents",
                            type=int,
                            default=constants.defaults.MAX_DYNAMIC_AGENTS,
                            help="maximum number of dynamic agent instances at one time",
                        ),
                        Arg(
                            "--static-agents",
                            type=int,
                            default=constants.defaults.STATIC_AGENTS,
                            help=argparse.SUPPRESS,
                        ),
                        Arg(
                            "--min-cpu-platform-master",
                            type=str,
                            default=constants.defaults.MIN_CPU_PLATFORM_MASTER,
                            help="minimum cpu platform for master instances",
                        ),
                        Arg(
                            "--min-cpu-platform-agent",
                            type=str,
                            default=constants.defaults.MIN_CPU_PLATFORM_AGENT,
                            help="minimum cpu platform for agent instances",
                        ),
                        Arg(
                            "--scheduler-type",
                            type=str,
                            choices=["fair_share", "priority", "round_robin"],
                            default=constants.defaults.SCHEDULER_TYPE,
                            help="scheduler to use",
                        ),
                        # Parsed with _str_to_bool so that "false" really means False.
                        Arg(
                            "--preemption-enabled",
                            type=_str_to_bool,
                            default=constants.defaults.PREEMPTION_ENABLED,
                            help="whether task preemption is supported in the scheduler "
                            "(only configurable for priority scheduler).",
                        ),
                        Arg(
                            "--cpu-env-image",
                            type=str,
                            default="",
                            help="Docker image for CPU tasks",
                        ),
                        Arg(
                            "--gpu-env-image",
                            type=str,
                            default="",
                            help="Docker image for GPU tasks",
                        ),
                        Arg(
                            "--master-config-template-path",
                            type=Path,
                            default=None,
                            help="path to master yaml template",
                        ),
                        Arg(
                            "--tf-state-gcs-bucket-name",
                            type=str,
                            default=None,
                            help="use the GCS bucket to store the terraform state "
                            "instead of a local directory",
                        ),
                    ],
                ),
            ],
        ),
        Cmd(
            "dump-master-config-template",
            handle_dump_master_config_template,
            "dump default master config template",
            [],
        ),
    ],
)
help="run as the given user", metavar="username", default=None), Arg("-m", "--master", help="master address", metavar="address", default=get_default_master_address()), Arg("-v", "--version", action="version", help="print CLI version and exit", version="%(prog)s {}".format(determined.__version__)), Cmd("preview-search", preview_search, "preview search", [ Arg("config_file", type=FileType("r"), help="experiment config file (.yaml)") ]), deploy_cmd, ] # type: List[object] # fmt: on all_args_description = (args_description + experiment_args_description + checkpoint_args_description + master_args_description + model_args_description + agent_args_description + notebook_args_description + job_args_description + resources_args_description + shell_args_description + task_args_description + template_args_description + tensorboard_args_description + trial_args_description + remote_args_description + user_args_description +
# "shell" command group: no handler of its own, six subcommands.
Cmd(
    "shell",
    None,
    "manage shells",
    [
        # Registered with is_default=True.
        Cmd(
            "list",
            command.list,
            "list shells",
            [
                Arg("-q", "--quiet", action="store_true", help="only display the IDs"),
                Arg(
                    "--all",
                    "-a",
                    action="store_true",
                    help="show all shells (including other users')",
                ),
            ],
            is_default=True,
        ),
        Cmd(
            "config",
            command.config,
            "display shell config",
            [
                Arg("id", type=str, help="shell ID"),
            ],
        ),
        Cmd(
            "start",
            start_shell,
            "start a new shell",
            [
                Arg(
                    "ssh_opts",
                    nargs="*",
                    help="additional SSH options when connecting to the shell",
                ),
                Arg(
                    "--config-file",
                    default=None,
                    type=FileType("r"),
                    help="command config file (.yaml)",
                ),
                Arg("-v", "--volume", action="append", default=[], help=VOLUME_DESC),
                Arg("-c", "--context", default=None, type=Path, help=CONTEXT_DESC),
                Arg("--config", action="append", default=[], help=CONFIG_DESC),
                Arg(
                    "-p",
                    "--passphrase",
                    action="store_true",
                    help="passphrase to encrypt the shell private key",
                ),
                Arg(
                    "--template",
                    type=str,
                    help="name of template to apply to the shell configuration",
                ),
                Arg(
                    "-d",
                    "--detach",
                    action="store_true",
                    help="run in the background and print the ID",
                ),
            ],
        ),
        Cmd(
            "open",
            open_shell,
            "open an existing shell",
            [
                Arg("shell_id", help="shell ID"),
                Arg(
                    "ssh_opts",
                    nargs="*",
                    help="additional SSH options when connecting to the shell",
                ),
            ],
        ),
        Cmd(
            "logs",
            command.tail_logs,
            "fetch shell logs",
            [
                Arg("shell_id", help="shell ID"),
                Arg(
                    "-f",
                    "--follow",
                    action="store_true",
                    help="follow the logs of a shell, similar to tail -f",
                ),
                Arg(
                    "--tail",
                    type=int,
                    default=200,
                    help="number of lines to show, counting from the end "
                    "of the log",
                ),
            ],
        ),
        Cmd(
            "kill",
            command.kill,
            "kill a shell",
            [
                Arg("shell_id", help="shell ID", nargs=ONE_OR_MORE),
                Arg("-f", "--force", action="store_true", help="ignore errors"),
            ],
        ),
    ],
)
print( termcolor.colored( "CLI version {} is less than master version {}. " "Consider upgrading the CLI.".format(client_version, master_version), "yellow", ), file=sys.stderr, ) elif version.Version(client_version) > version.Version(master_version): print( termcolor.colored( "Master version {} is less than CLI version {}. " "Consider upgrading the master.".format( master_version, client_version), "yellow", ), file=sys.stderr, ) def describe_version(parsed_args: argparse.Namespace) -> None: info = get_version(parsed_args.master) print(render.format_object_as_yaml(info)) args_description = [ Cmd("version", describe_version, "show version information", []) ] # type: List[Any]
# Declarative description of the "e|xperiment" command and all of its
# subcommands, grouped into inspection, creation, lifecycle management and
# attribute-setting commands.
args_description = Cmd(
    "e|xperiment",
    None,
    "manage experiments",
    [
        # Inspection commands.
        Cmd(
            "list",
            list_experiments,
            "list experiments",
            [
                Arg(
                    "--all",
                    "-a",
                    action="store_true",
                    help="show all experiments (including archived and other users')",
                ),
                Arg("--csv", action="store_true", help="print as CSV"),
            ],
            is_default=True,
        ),
        Cmd("config", config, "display experiment config", [experiment_id_arg("experiment ID")]),
        Cmd(
            "describe",
            describe,
            "describe experiment",
            [
                Arg("experiment_ids", help="comma-separated list of experiment IDs to describe"),
                Arg("--metrics", action="store_true", help="display full metrics"),
                Group(
                    Arg("--csv", action="store_true", help="print as CSV"),
                    Arg("--json", action="store_true", help="print as JSON"),
                    Arg("--outdir", type=Path, help="directory to save output"),
                ),
            ],
        ),
        Cmd(
            "download-model-def",
            download_model_def,
            "download model definition",
            [
                experiment_id_arg("experiment ID"),
                Arg("--output-dir", type=Path, help="output directory", default="."),
            ],
        ),
        Cmd(
            "list-trials lt",
            list_trials,
            "list trials of experiment",
            [
                experiment_id_arg("experiment ID"),
                Arg("--csv", action="store_true", help="print as CSV"),
            ],
        ),
        Cmd(
            "list-checkpoints lc",
            checkpoint.list,
            "list checkpoints of experiment",
            [
                experiment_id_arg("experiment ID"),
                Arg(
                    "--best",
                    type=int,
                    help="Return the best N checkpoints for this experiment. "
                    "If this flag is used, only checkpoints with an associated "
                    "validation metric will be considered.",
                ),
                Arg("--csv", action="store_true", help="print as CSV"),
            ],
        ),
        # Create command.
        Cmd(
            "create",
            create,
            "create experiment",
            [
                Arg("config_file", type=FileType("r"), help="experiment config file (.yaml)"),
                Arg("model_def", type=Path, help="file or directory containing model definition"),
                Arg(
                    "-g",
                    "--git",
                    action="store_true",
                    help="Associate git metadata with this experiment. This "
                    "flag assumes that git is installed, a .git repository "
                    "exists in the model definition directory, and that the "
                    "git working tree of that repository is empty.",
                ),
                Arg(
                    "--local",
                    action="store_true",
                    help="Create the experiment in local mode instead of submitting it to the "
                    "cluster. For more information, see documentation on det.experimental.create()",
                ),
                Arg(
                    "--template",
                    type=str,
                    help="name of template to apply to the experiment configuration",
                ),
                Group(
                    Arg(
                        "-f",
                        "--follow-first-trial",
                        action="store_true",
                        help="follow the logs of the first trial that is created",
                    ),
                    Arg("--paused", action="store_true", help="do not activate the experiment"),
                    Arg(
                        "-t",
                        "--test-mode",
                        action="store_true",
                        help="Test the experiment configuration and model "
                        "definition by creating and scheduling a very small "
                        "experiment. This command will verify that a training "
                        "workload and validation workload run successfully and that "
                        "checkpoints can be saved. The test experiment will "
                        "be archived on creation.",
                    ),
                ),
            ],
        ),
        # Lifecycle management commands.
        Cmd(
            "activate",
            activate,
            "activate experiment",
            [experiment_id_arg("experiment ID to activate")],
        ),
        Cmd("cancel", cancel, "cancel experiment", [experiment_id_arg("experiment ID to cancel")]),
        Cmd("pause", pause, "pause experiment", [experiment_id_arg("experiment ID to pause")]),
        Cmd(
            "archive",
            archive,
            "archive experiment",
            [experiment_id_arg("experiment ID to archive")],
        ),
        Cmd(
            "unarchive",
            unarchive,
            "unarchive experiment",
            [experiment_id_arg("experiment ID to unarchive")],
        ),
        Cmd(
            "delete",
            delete_experiment,
            "delete experiment",
            [
                Arg("experiment_id", help="delete experiment"),
                Arg(
                    "--yes",
                    action="store_true",
                    default=False,
                    help="automatically answer yes to prompts",
                ),
            ],
        ),
        Cmd(
            "download",
            download,
            "download checkpoints for an experiment",
            [
                experiment_id_arg("experiment ID to download"),
                Arg(
                    "-o",
                    "--output-dir",
                    type=str,
                    default="checkpoints",
                    help="Desired top level directory for the checkpoints. "
                    "Checkpoints will be downloaded to "
                    "<output_dir>/<checkpoint_uuid>/<checkpoint_files>.",
                ),
                Arg(
                    "--top-n",
                    type=int,
                    default=1,
                    # Each fragment ends with a space so the sentences do not
                    # run together in the rendered help.
                    help="The number of checkpoints to download for the "
                    "experiment. The checkpoints are sorted by validation "
                    "metric as defined by --sort-by and --smaller-is-better. "
                    "This command will select the best N checkpoints from the "
                    "top performing N trials of the experiment.",
                ),
                Arg(
                    "--sort-by",
                    type=str,
                    default=None,
                    help="The name of the validation metric to sort on. Without --sort-by, the "
                    "experiment's searcher metric is assumed. If this argument is specified, "
                    "--smaller-is-better must also be specified.",
                ),
                # strtobool returns 0/1; wrap in bool() to store a real boolean.
                Arg(
                    "--smaller-is-better",
                    type=lambda s: bool(distutils.util.strtobool(s)),
                    default=None,
                    help="The sort order for metrics when using --sort-by. For "
                    "example, 'accuracy' would require passing '--smaller-is-better false'. If "
                    "--sort-by is specified, this argument must be specified.",
                ),
                Arg(
                    "-q",
                    "--quiet",
                    action="store_true",
                    help="Only print the paths to the checkpoints.",
                ),
            ],
        ),
        Cmd(
            "kill", kill_experiment, "kill experiment", [Arg("experiment_id", help="experiment ID")]
        ),
        Cmd(
            "wait",
            wait,
            "wait for experiment to reach terminal state",
            [
                experiment_id_arg("experiment ID"),
                Arg(
                    "--polling-interval",
                    type=int,
                    default=5,
                    help="the interval (in seconds) to poll for updated state",
                ),
            ],
        ),
        # Attribute setting commands.
        Cmd(
            "label",
            None,
            "manage experiment labels",
            [
                Cmd(
                    "add",
                    add_label,
                    "add label",
                    [experiment_id_arg("experiment ID"), Arg("label", help="label")],
                ),
                Cmd(
                    "remove",
                    remove_label,
                    "remove label",
                    [experiment_id_arg("experiment ID"), Arg("label", help="label")],
                ),
            ],
        ),
        Cmd(
            "set",
            None,
            "set experiment attributes",
            [
                Cmd(
                    "description",
                    set_description,
                    "set experiment description",
                    [
                        experiment_id_arg("experiment ID to modify"),
                        Arg("description", help="experiment description"),
                    ],
                ),
                Cmd(
                    "gc-policy",
                    set_gc_policy,
                    "set experiment GC policy and run GC",
                    [
                        experiment_id_arg("experiment ID to modify"),
                        Arg(
                            "--save-experiment-best",
                            type=int,
                            required=True,
                            help="number of best checkpoints per experiment " "to save",
                        ),
                        Arg(
                            "--save-trial-best",
                            type=int,
                            required=True,
                            help="number of best checkpoints per trial to save",
                        ),
                        Arg(
                            "--save-trial-latest",
                            type=int,
                            required=True,
                            help="number of latest checkpoints per trial to save",
                        ),
                        Arg(
                            "--yes",
                            action="store_true",
                            default=False,
                            help="automatically answer yes to prompts",
                        ),
                    ],
                ),
                Cmd(
                    "max-slots",
                    set_max_slots,
                    "set `max_slots` of experiment",
                    [
                        experiment_id_arg("experiment ID to modify"),
                        Arg("max_slots", type=none_or_int, help="max slots"),
                    ],
                ),
                Cmd(
                    "weight",
                    set_weight,
                    "set `weight` of experiment",
                    [
                        experiment_id_arg("experiment ID to modify"),
                        Arg("weight", type=float, help="weight"),
                    ],
                ),
            ],
        ),
    ],
)
# The `tail` parameter only makes sense the first time we # fetch logs. response = api.get( args.master, "logs", params={"greater_than_id": str(latest_log_id)}) latest_log_id = process_response(response, latest_log_id) except KeyboardInterrupt: break # fmt: off args_description = [ Cmd("m|aster", None, "manage master", [ Cmd("config", config, "fetch master config as JSON", []), Cmd("logs", logs, "fetch master logs", [ Arg("-f", "--follow", action="store_true", help="follow the logs of master, similar to tail -f"), Arg("--tail", type=int, help="number of lines to show, counting from the end " "of the log (default is all)") ]), ]) ] # type: List[Any] # fmt: on
# "tensorboard" command group: no handler of its own, seven subcommands.
Cmd(
    "tensorboard",
    None,
    "manage TensorBoard instances",
    [
        # Registered with is_default=True.
        Cmd(
            "list ls",
            partial(command.list_tasks),
            "list TensorBoard instances",
            [
                Arg("-q", "--quiet", action="store_true", help="only display the IDs"),
                Arg(
                    "--all",
                    "-a",
                    action="store_true",
                    help="show all TensorBoards (including other users')",
                ),
                Group(format_args["json"], format_args["csv"]),
            ],
            is_default=True,
        ),
        Cmd(
            "start",
            start_tensorboard,
            "start new TensorBoard instance",
            [
                Arg(
                    "experiment_ids",
                    type=int,
                    nargs="*",
                    help="experiment IDs to load into TensorBoard. At most 100 trials from "
                    "the specified experiment will be loaded into TensorBoard. If the "
                    "experiment has more trials, the 100 best-performing trials will "
                    "be used.",
                ),
                Arg(
                    "-t",
                    "--trial-ids",
                    nargs=ONE_OR_MORE,
                    type=int,
                    help="trial IDs to load into TensorBoard; at most 100 trials are "
                    "allowed per TensorBoard instance",
                ),
                Arg(
                    "--config-file",
                    default=None,
                    type=FileType("r"),
                    help="command config file (.yaml)",
                ),
                Arg("-c", "--context", default=None, type=Path, help=CONTEXT_DESC),
                Arg("--config", action="append", default=[], help=CONFIG_DESC),
                Arg(
                    "--no-browser",
                    action="store_true",
                    help="don't open TensorBoard in a browser after startup",
                ),
                Arg(
                    "-d",
                    "--detach",
                    action="store_true",
                    help="run in the background and print the ID",
                ),
            ],
        ),
        Cmd(
            "config",
            partial(command.config),
            "display TensorBoard config",
            [Arg("tensorboard_id", type=str, help="TensorBoard ID")],
        ),
        Cmd(
            "open",
            open_tensorboard,
            "open existing TensorBoard instance",
            [Arg("tensorboard_id", help="TensorBoard ID")],
        ),
        Cmd(
            "logs",
            partial(task.logs),
            "fetch TensorBoard instance logs",
            [
                # Stored as task_id but displayed as tensorboard_id (metavar).
                Arg("task_id", help="TensorBoard ID", metavar="tensorboard_id"),
                *task.common_log_options,
            ],
        ),
        Cmd(
            "kill",
            partial(command.kill),
            "kill TensorBoard instance",
            [
                Arg("tensorboard_id", help="TensorBoard ID", nargs=ONE_OR_MORE),
                Arg("-f", "--force", action="store_true", help="ignore errors"),
            ],
        ),
        Cmd(
            "set",
            None,
            "set TensorBoard attributes",
            [
                Cmd(
                    "priority",
                    partial(command.set_priority),
                    "set TensorBoard priority",
                    [
                        Arg("tensorboard_id", help="TensorBoard ID"),
                        Arg("priority", type=int, help="priority"),
                    ],
                ),
            ],
        ),
    ],
)
def _str_to_bool(value: str) -> bool:
    """Convert a command-line string into a boolean.

    ``type=bool`` must not be used as an argparse converter: argparse would
    call ``bool()`` on the raw text, and every non-empty string -- including
    "false" -- is truthy.

    Args:
        value: raw text supplied on the command line (or a string default).

    Returns:
        True for "true"/"t"/"yes"/"y"/"on"/"1"; False for
        "false"/"f"/"no"/"n"/"off"/"0" and for the empty string. Matching is
        case-insensitive and ignores surrounding whitespace.

    Raises:
        argparse.ArgumentTypeError: if the text is not a recognised boolean.
    """
    normalized = value.strip().lower()
    if normalized in ("true", "t", "yes", "y", "on", "1"):
        return True
    # The empty string stays False, as it was under ``bool("")``.
    if normalized in ("false", "f", "no", "n", "off", "0", ""):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value (true/false), got {!r}".format(value)
    )


# Declarative description of the "gcp" command and its "down" and "up"
# subcommands.
args_description = Cmd(
    "gcp",
    None,
    "gcp_help",
    [
        Cmd(
            "down",
            handle_down,
            "delete gcp cluster",
            [
                ArgGroup(
                    "optional named arguments",
                    None,
                    [
                        Arg(
                            "--local-state-path",
                            type=str,
                            default=os.getcwd(),
                            help="local directory for storing cluster state",
                        ),
                    ],
                ),
            ],
        ),
        Cmd(
            "up",
            handle_up,
            "create gcp cluster",
            [
                ArgGroup(
                    "required named arguments",
                    None,
                    [
                        Arg(
                            "--cluster-id",
                            type=validate_cluster_id(),
                            default=None,
                            required=True,
                            help="unique identifier to name and tag resources",
                        ),
                        Arg(
                            "--project-id",
                            type=str,
                            default=None,
                            required=True,
                            help="project ID to create the cluster in",
                        ),
                    ],
                ),
                ArgGroup(
                    "optional named arguments",
                    None,
                    [
                        Arg(
                            "--dry-run",
                            action="store_true",
                            help="return the infrastructure plan to be executed "
                            "based on your arguments",
                        ),
                        Arg(
                            "--keypath",
                            type=str,
                            default=None,
                            help="path to service account key if not using default credentials",
                        ),
                        Arg(
                            "--network",
                            type=str,
                            default="det-default",
                            help="network name to create "
                            "(the network should not already exist in the project)",
                        ),
                        Arg(
                            "--det-version",
                            type=str,
                            default=determined.__version__,
                            help=argparse.SUPPRESS,
                        ),
                        Arg(
                            "--region",
                            type=str,
                            default=constants.defaults.REGION,
                            help="region to create the cluster in (defaults to us-west1)",
                        ),
                        Arg(
                            "--zone",
                            type=str,
                            default=None,
                            help="zone to create the cluster in (defaults to `region`-b)",
                        ),
                        Arg(
                            "--environment-image",
                            type=str,
                            default=constants.defaults.ENVIRONMENT_IMAGE,
                            help=argparse.SUPPRESS,
                        ),
                        Arg(
                            "--local-state-path",
                            type=str,
                            default=os.getcwd(),
                            help="local directory for storing cluster state",
                        ),
                        # Parsed with _str_to_bool so that "false" really means False.
                        Arg(
                            "--preemptible",
                            type=_str_to_bool,
                            default=False,
                            help="whether to use preemptible instances for dynamic agents",
                        ),
                        Arg(
                            "--operation-timeout-period",
                            type=str,
                            default=constants.defaults.OPERATION_TIMEOUT_PERIOD,
                            help="operation timeout before retrying, e.g. 5m for 5 minutes",
                        ),
                        Arg(
                            "--master-instance-type",
                            type=str,
                            default=constants.defaults.MASTER_INSTANCE_TYPE,
                            help="instance type for master",
                        ),
                        Arg(
                            "--cpu-agent-instance-type",
                            type=str,
                            default=constants.defaults.CPU_AGENT_INSTANCE_TYPE,
                            help="instance type for agents in the CPU resource pool",
                        ),
                        Arg(
                            "--gpu-agent-instance-type",
                            type=str,
                            default=constants.defaults.GPU_AGENT_INSTANCE_TYPE,
                            help="instance type for agents in the GPU resource pool",
                        ),
                        Arg(
                            "--db-password",
                            type=str,
                            default=constants.defaults.DB_PASSWORD,
                            help="password for master database",
                        ),
                        # NOTE(review): a container count parsed as str looks
                        # unintended; confirm what handle_up expects before
                        # switching this to int.
                        Arg(
                            "--max-cpu-containers-per-agent",
                            type=str,
                            default=constants.defaults.MAX_CPU_CONTAINERS_PER_AGENT,
                            help="max CPU containers running for agents in the CPU resource pool",
                        ),
                        Arg(
                            "--max-idle-agent-period",
                            type=str,
                            default=constants.defaults.MAX_IDLE_AGENT_PERIOD,
                            help="max agent idle time before it is shut down, "
                            "e.g. 30m for 30 minutes",
                        ),
                        Arg(
                            "--max-agent-starting-period",
                            type=str,
                            default=constants.defaults.MAX_AGENT_STARTING_PERIOD,
                            help="max agent starting time before retrying, e.g. 30m for 30 minutes",
                        ),
                        Arg(
                            "--port",
                            type=int,
                            default=constants.defaults.PORT,
                            help="port to use for communication on master instance",
                        ),
                        Arg(
                            "--gpu-type",
                            type=str,
                            default=constants.defaults.GPU_TYPE,
                            help="type of GPU to use on agents",
                        ),
                        Arg(
                            "--gpu-num",
                            type=int,
                            default=constants.defaults.GPU_NUM,
                            help="number of GPUs per agent instance",
                        ),
                        Arg(
                            "--min-dynamic-agents",
                            type=int,
                            default=constants.defaults.MIN_DYNAMIC_AGENTS,
                            help="minimum number of dynamic agent instances at one time",
                        ),
                        Arg(
                            "--max-dynamic-agents",
                            type=int,
                            default=constants.defaults.MAX_DYNAMIC_AGENTS,
                            help="maximum number of dynamic agent instances at one time",
                        ),
                        Arg(
                            "--static-agents",
                            type=int,
                            default=constants.defaults.STATIC_AGENTS,
                            help=argparse.SUPPRESS,
                        ),
                        Arg(
                            "--min-cpu-platform-master",
                            type=str,
                            default=constants.defaults.MIN_CPU_PLATFORM_MASTER,
                            help="minimum cpu platform for master instances",
                        ),
                        Arg(
                            "--min-cpu-platform-agent",
                            type=str,
                            default=constants.defaults.MIN_CPU_PLATFORM_AGENT,
                            help="minimum cpu platform for agent instances",
                        ),
                        Arg(
                            "--scheduler-type",
                            type=validate_scheduler_type(),
                            default=constants.defaults.SCHEDULER_TYPE,
                            help="scheduler to use (defaults to fair_share).",
                        ),
                        # Parsed with _str_to_bool so that "false" really means False.
                        Arg(
                            "--preemption-enabled",
                            type=_str_to_bool,
                            default=constants.defaults.PREEMPTION_ENABLED,
                            help="whether task preemption is supported in the scheduler "
                            "(only configurable for priority scheduler).",
                        ),
                        Arg(
                            "--cpu-env-image",
                            type=str,
                            default="",
                            help="Docker image for CPU tasks",
                        ),
                        Arg(
                            "--gpu-env-image",
                            type=str,
                            default="",
                            help="Docker image for GPU tasks",
                        ),
                    ],
                ),
            ],
        ),
    ],
)