def logs(args: Namespace) -> None:
    """Print the master's logs, optionally following them like ``tail -f``."""

    def process_response(response: Response, latest_log_id: int) -> int:
        # Logs are expected in strictly increasing-ID order; track the max
        # ID seen so the next poll can ask only for newer entries.
        for log in response.json():
            check_gt(log["id"], latest_log_id)
            latest_log_id = log["id"]
            print("{} [{}]: {}".format(log["time"], log["level"], log["message"]))
        return latest_log_id

    params = {}
    if args.tail:
        params["tail"] = args.tail
    response = api.get(args.master, "logs", params=params)
    latest_log_id = process_response(response, -1)

    # "Follow" mode is implemented as a loop in the CLI. We assume that
    # newer log messages have a numerically larger ID than older log
    # messages, so we keep track of the max ID seen so far.
    if args.follow:
        while True:
            try:
                # Poll for new logs every 100 ms.
                time.sleep(0.1)
                # The `tail` parameter only makes sense the first time we
                # fetch logs.
                response = api.get(
                    args.master, "logs",
                    params={"greater_than_id": str(latest_log_id)})
                latest_log_id = process_response(response, latest_log_id)
            except KeyboardInterrupt:
                # Ctrl-C cleanly exits follow mode instead of crashing.
                break
def get_version(self, version: int = 0) -> Checkpoint:
    """
    Return the :class:`~determined.experimental.Checkpoint` corresponding to
    the specified version of the model. When ``version`` is 0 (the default),
    the latest model version is returned.

    Arguments:
        version (int, optional): The model version number requested.
    """
    if version == 0:
        # Ask for one version ordered by version number descending
        # (order_by=2), i.e. the latest one.
        resp = api.get(
            self._master,
            "/api/v1/models/{}/versions/".format(self.name),
            {
                "limit": 1,
                "order_by": 2
            },
        )
        data = resp.json()
        latest_version = data["versions"][0]
        return Checkpoint.from_json(
            {
                **latest_version["checkpoint"],
                "version": latest_version["version"],
                "model_name": data["model"]["name"],
            },
            # Fix: pass the master address, as the non-default branch (and
            # get_versions) does; it was missing here.
            self._master,
        )
    else:
        resp = api.get(
            self._master,
            "/api/v1/models/{}/versions/{}".format(self.name, version))
        data = resp.json()
        return Checkpoint.from_json(data["version"]["checkpoint"], self._master)
def list_slots(args: argparse.Namespace) -> None:
    """Print a table of every slot on every agent, with its occupying task."""
    task_res = api.get(args.master, "tasks")
    agent_res = api.get(args.master, "agents")
    agents = agent_res.json()
    tasks = task_res.json()

    # Map container ID -> owning task's name and ID so each slot's container
    # can be resolved to the task occupying it.
    cont_names = {}
    for task in tasks.values():
        for cont in task["containers"]:
            cont_names[cont["id"]] = {"name": task["name"], "id": task["id"]}

    headers = ["Agent Name", "Slot ID", "Enabled", "Task ID", "Task Name", "Type", "Device"]
    values = [
        [
            local_id(agent_id),
            local_id(slot_id),
            slot["enabled"],
            # A slot with no container is free.
            cont_names[slot["container"]["id"]]["id"] if slot["container"] else "FREE",
            cont_names[slot["container"]["id"]]["name"] if slot["container"] else "None",
            slot["device"]["type"],
            slot["device"]["brand"],
        ]
        for agent_id, agent in sorted(agents.items())
        for slot_id, slot in sorted(agent["slots"].items())
    ]

    print(tabulate.tabulate(values, headers, tablefmt="presto"), flush=False)
def get_version(self, version: int = 0) -> Checkpoint:
    """
    Retrieve the checkpoint corresponding to the specified version of the
    model. If no version is specified the latest model version is returned.

    Arguments:
        version (int, optional): The model version number requested.
    """
    if version == 0:
        # Ask for one version ordered by version number descending
        # (order_by=2), i.e. the latest one.
        resp = api.get(
            self._master,
            "/api/v1/models/{}/versions/".format(self.name),
            {
                "limit": 1,
                "order_by": 2
            },
        )
        data = resp.json()
        latest_version = data["versions"][0]
        return Checkpoint.from_json(
            {
                **latest_version["checkpoint"],
                "version": latest_version["version"],
                "model_name": data["model"]["name"],
            },
            # Fix: pass the master address, as the non-default branch does;
            # it was missing here.
            self._master,
        )
    else:
        resp = api.get(
            self._master,
            "/api/v1/models/{}/versions/{}".format(self.name, version))
        data = resp.json()
        return Checkpoint.from_json(data["version"]["checkpoint"], self._master)
def describe_trial(args: Namespace) -> None:
    """Show details for a single trial, optionally including per-step metrics."""
    if args.metrics:
        r = api.get(args.master, "trials/{}/metrics".format(args.trial_id))
    else:
        r = api.get(args.master, "trials/{}".format(args.trial_id))
    trial = r.json()

    if args.json:
        # Raw JSON dump short-circuits all table rendering.
        print(json.dumps(trial, indent=4))
        return

    # Print information about the trial itself.
    headers = [
        "Experiment ID",
        "State",
        "H-Params",
        "Start Time",
        "End Time",
    ]
    values = [[
        trial["experiment_id"],
        trial["state"],
        json.dumps(trial["hparams"], indent=4),
        render.format_time(trial["start_time"]),
        render.format_time(trial["end_time"]),
    ]]
    render.tabulate_or_csv(headers, values, args.csv)

    # Print information about individual steps.
    headers = [
        "Step #",
        "State",
        "Start Time",
        "End Time",
        "Checkpoint",
        "Checkpoint UUID",
        "Checkpoint Metadata",
        "Validation",
        "Validation Metrics",
    ]
    if args.metrics:
        headers.append("Step Metrics")

    values = [[
        s["id"],
        s["state"],
        render.format_time(s["start_time"]),
        render.format_time(s["end_time"]),
        *format_checkpoint(s["checkpoint"]),
        *format_validation(s["validation"]),
        # Per-step metrics column appears only when --metrics was given.
        *([json.dumps(s["metrics"], indent=4)] if args.metrics else []),
    ] for s in trial["steps"]]

    print()
    print("Steps:")
    render.tabulate_or_csv(headers, values, args.csv)
def list_slots(args: argparse.Namespace) -> None:
    """List every slot on every agent, as JSON or a table/CSV."""
    task_res = api.get(args.master, "tasks")
    agent_res = api.get(args.master, "agents")
    agents = agent_res.json()
    tasks = task_res.json()

    # Map container ID -> owning task's name and ID so each slot's container
    # can be resolved to the task occupying it.
    c_names = {}
    for task in tasks.values():
        for cont in task["containers"]:
            c_names[cont["id"]] = {"name": task["name"], "id": task["id"]}

    slots = [
        OrderedDict([
            ("agent_id", local_id(agent_id)),
            ("resource_pool", agent["resource_pool"]),
            ("slot_id", local_id(slot_id)),
            ("enabled", slot["enabled"]),
            (
                "task_id",
                # A slot with no container is free.
                c_names[slot["container"]["id"]]["id"] if slot["container"] else "FREE",
            ),
            (
                "task_name",
                c_names[slot["container"]["id"]]["name"] if slot["container"] else "None",
            ),
            ("type", slot["device"]["type"]),
            ("device", slot["device"]["brand"]),
        ])
        for agent_id, agent in sorted(agents.items())
        for slot_id, slot in sorted(agent["slots"].items())
    ]

    if args.json:
        print(json.dumps(slots, indent=4))
        return

    headers = [
        "Agent ID",
        "Resource Pool",
        "Slot ID",
        "Enabled",
        "Task ID",
        "Task Name",
        "Type",
        "Device",
    ]
    values = [s.values() for s in slots]
    render.tabulate_or_csv(headers, values, args.csv)
def list_agents(args: argparse.Namespace) -> None:
    """List all agents registered with the master, as JSON or a table/CSV."""
    agents_by_id = api.get(args.master, "agents").json()

    rows = []
    for agent_id, agent in sorted(agents_by_id.items()):
        rows.append(
            OrderedDict(
                [
                    ("id", local_id(agent_id)),
                    ("registered_time", render.format_time(agent["registered_time"])),
                    ("num_slots", len(agent["slots"])),
                    ("num_containers", agent["num_containers"]),
                    ("label", agent["label"]),
                ]
            )
        )

    if args.json:
        print(json.dumps(rows, indent=4))
        return

    headers = ["Agent ID", "Registered Time", "Slots", "Containers", "Label"]
    render.tabulate_or_csv(headers, [row.values() for row in rows], args.csv)
def top_n_checkpoints(
    self, limit: int, sort_by: Optional[str] = None, smaller_is_better: Optional[bool] = None
) -> List[checkpoint.Checkpoint]:
    """
    Return the N :class:`~determined.experimental.Checkpoint` instances with
    the best validation metric values as defined by the ``sort_by`` and
    ``smaller_is_better`` arguments. This method will return the best
    checkpoint from the top N performing distinct trials of the experiment.

    Arguments:
        limit (int): The maximum number of checkpoints to return.
        sort_by (string, optional): The name of the validation metric to use
            for sorting checkpoints. If this parameter is unset, the metric
            defined in the experiment configuration searcher field will be
            used.
        smaller_is_better (bool, optional): Specifies whether to sort the
            metric above in ascending or descending order. If ``sort_by`` is
            unset, this parameter is ignored. By default, the value of
            ``smaller_is_better`` from the experiment's configuration is used.
    """
    r = api.get(self._master, "checkpoints", params={"experiment_id": self.id}).json()

    if not r:
        # Fix: self.id identifies an experiment here (see the request params),
        # so the message should not say "trial".
        raise AssertionError("No checkpoint found for experiment {}".format(self.id))

    if not sort_by:
        # Fall back to the searcher settings from the experiment config.
        sort_by = r[0]["experiment_config"]["searcher"]["metric"]
        smaller_is_better = r[0]["experiment_config"]["searcher"]["smaller_is_better"]

    r.sort(
        reverse=not smaller_is_better,
        key=lambda x: x["metrics"]["validation_metrics"][sort_by]
    )

    # NOTE(review): unlike the v1-API variant, this does not deduplicate by
    # trial before truncating, so the "distinct trials" claim in the docstring
    # may not hold — confirm against the endpoint's semantics.
    return [checkpoint.from_json(ckpt, self._master) for ckpt in r[:limit]]
def list_tasks(args: Namespace) -> None:
    """Print a table of all tasks known to the master."""
    r = api.get(args.master, "tasks")

    def agent_info(t: Dict[str, Any]) -> Union[str, List[str]]:
        # Summarize which agent(s) a task runs on: "unassigned" when it has
        # no containers, a bare agent name when it has exactly one, otherwise
        # the list of agent names.
        containers = t.get("containers", [])
        if not containers:
            return "unassigned"
        if len(containers) == 1:
            agent = containers[0]["agent"]  # type: str
            return agent
        return [c["agent"] for c in containers]

    tasks = r.json()
    headers = [
        "ID", "Name", "Slots Needed", "Registered Time", "Agent", "Priority"
    ]
    values = [[
        task["id"],
        task["name"],
        task["slots_needed"],
        render.format_time(task["registered_time"]),
        agent_info(task),
        # Priority is only meaningful under the priority scheduler.
        task["priority"] if task["scheduler_type"] == "priority" else "N/A",
    ] for task_id, task in sorted(
        tasks.items(),
        # NOTE(review): sorts by the *formatted* time string — assumes
        # render.format_time output orders lexicographically; confirm.
        key=lambda tup: (render.format_time(tup[1]["registered_time"]), ),
    )]

    render.tabulate_or_csv(headers, values, args.csv)
def get_models(
    self,
    sort_by: ModelSortBy = ModelSortBy.NAME,
    order_by: ModelOrderBy = ModelOrderBy.ASCENDING,
    name: str = "",
    description: str = "",
) -> List[Model]:
    """
    Get a list of all models in the model registry.

    Arguments:
        sort_by: Which field to sort by. See
            :class:`~determined.experimental.ModelSortBy`.
        order_by: Whether to sort in ascending or descending order. See
            :class:`~determined.experimental.ModelOrderBy`.
        name: If this parameter is set, models will be filtered to only
            include models with names matching this parameter.
        description: If this parameter is set, models will be filtered to
            only include models with descriptions matching this parameter.
    """
    master = self._session._master
    query = {
        "sort_by": sort_by.value,
        "order_by": order_by.value,
        "name": name,
        "description": description,
    }
    body = api.get(master, "/api/v1/models/", params=query).json()
    return [Model.from_json(m, master) for m in body.get("models")]
def get_model(self, name: str) -> Model:
    """
    Get the :class:`~determined.experimental.Model` representing the
    model with the provided name.
    """
    master = self._session._master
    body = api.get(master, "/api/v1/models/{}".format(name)).json()
    return Model.from_json(body.get("model"), master)
def top_n_checkpoints(
    self,
    limit: int,
    sort_by: Optional[str] = None,
    smaller_is_better: Optional[bool] = None,
) -> List[checkpoint.Checkpoint]:
    """
    Return the N :class:`~determined.experimental.Checkpoint` instances with
    the best validation metrics, as defined by the ``sort_by`` and
    ``smaller_is_better`` arguments. This method will return the best
    checkpoint from the top N best-performing distinct trials of the
    experiment. Only checkpoints in a ``COMPLETED`` state with a matching
    ``COMPLETED`` validation are considered.

    Arguments:
        limit (int): The maximum number of checkpoints to return.
        sort_by (string, optional): The name of the validation metric to use
            for sorting checkpoints. If this parameter is unset, the metric
            defined in the experiment configuration searcher field will be
            used.
        smaller_is_better (bool, optional): Specifies whether to sort the
            metric above in ascending or descending order. If ``sort_by`` is
            unset, this parameter is ignored. By default, the value of
            ``smaller_is_better`` from the experiment's configuration is used.
    """
    r = api.get(
        self._master,
        "/api/v1/experiments/{}/checkpoints".format(self.id),
        params={
            # Restrict to completed checkpoints with completed validations.
            "states": checkpoint.CheckpointState.COMPLETED.value,
            "validation_states": checkpoint.CheckpointState.COMPLETED.value,
        },
    )
    checkpoints = r.json()["checkpoints"]

    if not checkpoints:
        raise AssertionError(
            "No checkpoint found for experiment {}".format(self.id))

    if not sort_by:
        # Fall back to the searcher settings recorded in the experiment
        # config of the first checkpoint.
        sort_by = checkpoints[0]["experimentConfig"]["searcher"]["metric"]
        smaller_is_better = checkpoints[0]["experimentConfig"]["searcher"][
            "smaller_is_better"]

    checkpoints.sort(
        reverse=not smaller_is_better,
        key=lambda x: x["metrics"]["validationMetrics"][sort_by])

    # Ensure returned checkpoints are from distinct trials.
    t_ids = set()
    checkpoint_refs = []
    for ckpt in checkpoints:
        if ckpt["trialId"] not in t_ids:
            checkpoint_refs.append(
                checkpoint.Checkpoint.from_json(ckpt, self._master))
            t_ids.add(ckpt["trialId"])
    return checkpoint_refs[:limit]
def print_logs( offset: Optional[int], limit: Optional[int] = 5000, follow: bool = False ) -> None: query = {} # type: Dict[str, Any] if offset is not None: query["offset"] = offset if limit is not None: query["limit"] = limit if follow: query["follow"] = "true" for f in [ "agent_ids", "container_ids", "rank_ids", "sources", "stdtypes", "timestamp_before", "timestamp_after", ]: if getattr(args, f, None) is not None: query[f] = getattr(args, f) if getattr(args, "level", None) is not None: query["levels"] = to_levels_above(args.level) path = "/api/v1/trials/{}/logs?{}".format(args.trial_id, urlencode(query, doseq=True)) with api.get(args.master, path, stream=True) as r: for line in r.iter_lines(): log = simplejson.loads(line)["result"] print(log["message"], end="")
def list_versions(args: Namespace) -> None:
    """List all versions of a registered model, as JSON or a table."""
    if args.json:
        r = api.get(args.master, "models/{}/versions".format(args.name))
        data = r.json()
        print(json.dumps(data, indent=2))
    else:
        model = Determined(args.master).get_model(args.name)
        render_model(model)
        print("\n")

        headers = [
            "Version #",
            "Trial ID",
            "Batch #",
            "Checkpoint UUID",
            "Validation Metrics",
            "Metadata",
        ]

        values = [[
            ckpt.version,
            ckpt.trial_id,
            ckpt.batch_number,
            ckpt.uuid,
            json.dumps(ckpt.validation, indent=2),
            json.dumps(ckpt.metadata, indent=2),
        ] for ckpt in model.get_versions()]

        render.tabulate_or_csv(headers, values, False)
def request_valid_trials_snapshot(experiment_id):  # type: ignore
    """Exercise the trials-snapshot streaming endpoint and validate its output.

    Returns None on success, or an ``(error_message, results)`` tuple on the
    first validation failure.
    """
    response = api.get(
        conf.make_master_url(),
        "api/v1/experiments/{}/metrics-stream/trials-snapshot".format(
            experiment_id),
        params={
            "metric_name": "accuracy",
            "metric_type": "METRIC_TYPE_VALIDATION",
            "batches_processed": 200,
            "period_seconds": 1,
        },
    )
    # Each streamed line is a JSON message wrapping a "result" payload.
    results = [
        message["result"]
        for message in map(json.loads, response.text.splitlines())
    ]

    # First let's verify an empty response was sent back before any real work was done
    if results[0]["trials"] != []:
        return ("unexpected trials in first response", results)

    # Then we verify that we receive the expected number of trials and the right types
    trials = set()
    for result in results[1:]:  # idiom fix: iterate directly, no index loop
        for trial in result["trials"]:
            trials.add(trial["trialId"])
            hparam_error = validate_hparam_types(trial["hparams"])
            if hparam_error is not None:
                return (hparam_error, results)
            # Idiom fix: isinstance() instead of comparing type() to float.
            if not isinstance(trial["metric"], float):
                return ("metric of unexpected type", results)
    if len(trials) != 5:
        return ("unexpected number of trials received", results)
    return None
def request_valid_metric_batches(experiment_id):  # type: ignore
    """Exercise the metrics-stream/batches endpoint and validate its output.

    Returns None on success, or an ``(error_message, results)`` tuple on the
    first validation failure.
    """
    response = api.get(
        conf.make_master_url(),
        "api/v1/experiments/{}/metrics-stream/batches".format(experiment_id),
        params={
            "metric_name": "accuracy",
            "metric_type": "METRIC_TYPE_VALIDATION",
            "period_seconds": 1,
        },
    )
    # Each streamed line is a JSON message wrapping a "result" payload.
    results = [
        message["result"]
        for message in map(json.loads, response.text.splitlines())
    ]

    # First let's verify an empty response was sent back before any real work was done
    if results[0]["batches"] != []:
        return ("unexpected batches in first response", results)

    # Then we verify that all expected responses are eventually received exactly once
    accumulated = set()
    for i in range(1, len(results)):
        for batch in results[i]["batches"]:
            if batch in accumulated:
                return ("batch appears twice", results)
            accumulated.add(batch)
    if accumulated != {200, 400}:
        return ("unexpected set of batches", results)
    return None
def request_metric_names(experiment_id):  # type: ignore
    """Exercise the metrics-stream/metric-names endpoint and validate its output.

    Returns None on success, or an ``(error_message, results)`` tuple on the
    first validation failure.
    """
    response = api.get(
        conf.make_master_url(),
        "api/v1/experiments/{}/metrics-stream/metric-names".format(experiment_id),
        params={"period_seconds": 1},
    )
    # Each streamed line is a JSON message wrapping a "result" payload.
    results = [message["result"] for message in map(json.loads, response.text.splitlines())]

    # First let's verify an empty response was sent back before any real work was done
    if results[0]["searcherMetric"] != "validation_loss":
        return ("unexpected searcher metric in first response", results)
    if results[0]["trainingMetrics"] != []:
        return ("unexpected training metric in first response", results)
    if results[0]["validationMetrics"] != []:
        return ("unexpected validation metric in first response", results)

    # Then we verify that all expected responses are eventually received exactly once
    accumulated_training = set()
    accumulated_validation = set()
    for i in range(1, len(results)):
        for training in results[i]["trainingMetrics"]:
            if training in accumulated_training:
                return ("training metric appeared twice", results)
            accumulated_training.add(training)
        for validation in results[i]["validationMetrics"]:
            if validation in accumulated_validation:
                # Fix: this branch checks validation metrics; the message was
                # copy-pasted from the training loop above.
                return ("validation metric appeared twice", results)
            accumulated_validation.add(validation)

    if accumulated_training != {"loss"}:
        return ("unexpected set of training metrics", results)
    if accumulated_validation != {"validation_loss", "accuracy"}:
        return ("unexpected set of validation metrics", results)
    return None
def experiment_has_active_workload(experiment_id: int) -> bool:
    """Return True iff any task of the experiment has at least one container."""
    tasks = api.get(conf.make_master_url(), "tasks").json()
    needle = "Experiment {}".format(experiment_id)
    return any(
        needle in task["name"] and len(task["containers"]) > 0
        for task in tasks.values()
    )
def open_tensorboard(args: Namespace) -> None:
    """Open the given TensorBoard instance in a local browser tab."""
    path = "tensorboard/{}".format(args.tensorboard_id)
    resp_json = api.get(args.master, path).json()
    tb = render.unmarshal(Command, resp_json)
    # Only a running TensorBoard exposes a usable service address.
    check_eq(tb.state, "RUNNING", "TensorBoard must be in a running state")
    api.open(args.master, resp_json["service_address"])
def get_versions(
        self, order_by: ModelOrderBy = ModelOrderBy.DESC) -> List[Checkpoint]:
    """
    Get a list of checkpoints corresponding to versions of this model. The
    models are sorted by version number and are returned in descending
    order by default.

    Arguments:
        order_by (enum): A member of the ModelOrderBy enum.
    """
    resp = api.get(
        self._master,
        "/api/v1/models/{}/versions/".format(self.name),
        params={"order_by": order_by.value},
    )
    data = resp.json()

    # Merge each version's number and the model name into the checkpoint
    # payload so Checkpoint.from_json has the full context.
    return [
        Checkpoint.from_json(
            {
                **version["checkpoint"],
                "version": version["version"],
                "model_name": data["model"]["name"],
            },
            self._master,
        )
        for version in data["versions"]
    ]
def open_shell(args: Namespace) -> None:
    """Attach to a running shell command over SSH."""
    shell = render.unmarshal(
        Command,
        api.get(args.master, "shells/{}".format(args.shell_id)).json())
    # Only a running shell can be connected to.
    check_eq(shell.state, "RUNNING", "Shell must be in a running state")
    agent_user = get_agent_user(args.master)
    _open_shell(shell, agent_user, args.ssh_opts)
def list(args: Namespace) -> None:
    """List an experiment's checkpoints, optionally only the best N.

    NOTE: shadows the builtin ``list``; the name is kept for CLI-dispatch
    compatibility.
    """
    params = {}
    if args.best is not None:
        if args.best < 0:
            raise AssertionError("--best must be a non-negative integer")
        params["best"] = args.best

    r = api.get(args.master,
                "experiments/{}/checkpoints".format(args.experiment_id),
                params=params).json()
    # The searcher metric names which validation metric the table shows.
    searcher_metric = r["metric_name"]

    headers = [
        "Trial ID", "Step ID", "State", "Validation Metric", "UUID",
        "Resources", "Size"
    ]
    values = [[
        c["trial_id"],
        c["step_id"],
        c["state"],
        api.metric.get_validation_metric(searcher_metric, c["step"]["validation"]),
        c["uuid"],
        render.format_resources(c["resources"]),
        render.format_resource_sizes(c["resources"]),
    ] for c in r["checkpoints"]]

    render.tabulate_or_csv(headers, values, args.csv)
def list_experiments(args: Namespace) -> None:
    """List experiments: all of them with --all, otherwise the current user's."""
    params = {}
    if args.all:
        params["filter"] = "all"
    else:
        # Default to experiments owned by the currently authenticated user.
        params["user"] = api.Authentication.instance().get_session_user()
    r = api.get(args.master, "experiments", params=params)

    def format_experiment(e: Any) -> List[Any]:
        result = [
            e["id"],
            e["owner"]["username"],
            e["config"]["description"],
            e["state"],
            render.format_percent(e["progress"]),
            render.format_time(e["start_time"]),
            render.format_time(e["end_time"]),
        ]
        if args.all:
            # The archived flag is only shown in the --all view.
            result.append(e["archived"])
        return result

    headers = [
        "ID", "Owner", "Description", "State", "Progress", "Start Time", "End Time"
    ]
    if args.all:
        headers.append("Archived")

    values = [format_experiment(e) for e in r.json()]
    render.tabulate_or_csv(headers, values, args.csv)
def set_gc_policy(args: Namespace) -> None:
    """Change an experiment's checkpoint GC policy, previewing the deletions
    and asking for confirmation unless --yes was given."""
    policy = {
        "save_experiment_best": args.save_experiment_best,
        "save_trial_best": args.save_trial_best,
        "save_trial_latest": args.save_trial_latest,
    }
    if not args.yes:
        # Preview which checkpoints the new policy would delete.
        r = api.get(
            args.master, "experiments/{}/preview_gc".format(args.experiment_id), params=policy
        )
        response = r.json()
        checkpoints = response["checkpoints"]
        metric_name = response["metric_name"]

        headers = [
            "Trial ID",
            "Step ID",
            "State",
            "Validation Metric\n({})".format(metric_name),
            "UUID",
            "Resources",
        ]
        values = [
            [
                c["trial_id"],
                c["step_id"],
                c["state"],
                api.metric.get_validation_metric(metric_name, c["step"]["validation"]),
                c["uuid"],
                render.format_resources(c["resources"]),
            ]
            for c in sorted(checkpoints, key=lambda c: (c["trial_id"], c["step_id"]))
            # Only checkpoints that have a validation appear in the table.
            if "step" in c and c["step"].get("validation") is not None
        ]

        if len(values) != 0:
            print(
                "The following checkpoints with validation will be deleted "
                "by applying this GC Policy:"
            )
            print(tabulate.tabulate(values, headers, tablefmt="presto"), flush=FLUSH)
        print(
            "This policy will delete {} checkpoints with "
            "validations and {} checkpoints without validations.".format(
                len(values), len(checkpoints) - len(values)
            )
        )

    if args.yes or render.yes_or_no(
        "Changing the checkpoint garbage collection policy of an "
        "experiment may result\n"
        "in the unrecoverable deletion of checkpoints. Do you wish to "
        "proceed?"
    ):
        patch_experiment(args, "change gc policy of", {"checkpoint_storage": policy})
        print("Set GC policy of experiment {} to\n{}".format(args.experiment_id, pformat(policy)))
    else:
        print("Aborting operations.")
def get_model(self, name: str) -> Model:
    """
    Get the :class:`~determined.experimental.Model` from the model registry
    with the provided name. If no model with that name is found in the
    registry, an exception is raised.
    """
    master = self._session._master
    body = api.get(master, "/api/v1/models/{}".format(name)).json()
    return Model.from_json(body.get("model"), master)
def print_logs(limit: Optional[int] = None) -> None: nonlocal offset, state path = "trials/{}/logsv2?offset={}".format(args.trial_id, offset) if limit: path = "{}&limit=?".format(limit) for log in api.get(args.master, path).json(): print(log["message"], end="") offset, state = log["id"], log["state"]
def get_checkpoint(self, uuid: str) -> Checkpoint:
    """
    Get the :class:`~determined.experimental.Checkpoint` representing the
    checkpoint with the provided UUID.
    """
    master = self._session._master
    payload = api.get(master, "checkpoints/{}".format(uuid)).json()
    return Checkpoint.from_json(payload, master=master)
def get_num_running_commands() -> int:
    """Count the commands currently in the RUNNING state on the master."""
    auth.initialize_session(conf.make_master_url(), try_reauth=True)
    resp = api.get(conf.make_master_url(), "commands")
    assert resp.status_code == requests.codes.ok, resp.text
    running = [c for c in resp.json().values() if c["state"] == "RUNNING"]
    return len(running)
def list_template(args: Namespace) -> None:
    """List task templates, with full configs when --details is given."""
    templates = [
        render.unmarshal(TemplateAll, t, {"config": _parse_config})
        for t in api.get(args.master, path="templates").json()
    ]
    if args.details:
        render.render_objects(TemplateAll, templates, table_fmt="grid")
    else:
        render.render_objects(TemplateClean, templates)
def download_model_def(args: Namespace) -> None:
    """Download an experiment's model definition into ``args.output_dir``.

    The filename is taken from the response's Content-Disposition header;
    a BadResponseException is raised when the header is not a well-formed
    attachment.
    """
    resp = api.get(args.master, "experiments/{}/model_def".format(args.experiment_id))
    value, params = cgi.parse_header(resp.headers["Content-Disposition"])
    if value == "attachment" and "filename" in params:
        with args.output_dir.joinpath(params["filename"]).open("wb") as f:
            f.write(resp.content)
    else:
        raise api.errors.BadResponseException(
            "Unexpected Content-Disposition header format. {}: {}".format(value, params)
        )