def logs(args: Namespace) -> None:
    def process_response(response: Response, latest_log_id: int) -> int:
        for log in response.json():
            check_gt(log["id"], latest_log_id)
            latest_log_id = log["id"]
            print("{} [{}]: {}".format(log["time"], log["level"], log["message"]))
        return latest_log_id

    params = {}
    if args.tail:
        params["tail"] = args.tail

    response = api.get(args.master, "logs", params=params)
    latest_log_id = process_response(response, -1)

    # "Follow" mode is implemented as a loop in the CLI. We assume that
    # newer log messages have a numerically larger ID than older log
    # messages, so we keep track of the max ID seen so far.
    if args.follow:
        while True:
            try:
                # Poll for new logs every 100 ms.
                time.sleep(0.1)
                # The `tail` parameter only makes sense the first time we
                # fetch logs.
                response = api.get(
                    args.master, "logs", params={"greater_than_id": str(latest_log_id)}
                )
                latest_log_id = process_response(response, latest_log_id)
            except KeyboardInterrupt:
                break

def list_slots(args: argparse.Namespace) -> None:
    task_res = api.get(args.master, "tasks")
    agent_res = api.get(args.master, "agents")

    agents = agent_res.json()
    allocations = task_res.json()

    c_names = {
        r["container_id"]: {"name": a["name"], "allocation_id": a["allocation_id"]}
        for a in allocations.values()
        for r in a["resources"]
        if r["container_id"]
    }

    slots = [
        OrderedDict(
            [
                ("agent_id", local_id(agent_id)),
                ("resource_pool", agent["resource_pool"]),
                ("slot_id", local_id(slot_id)),
                ("enabled", slot["enabled"]),
                ("draining", slot.get("draining", False)),
                (
                    "allocation_id",
                    c_names[slot["container"]["id"]]["allocation_id"]
                    if slot["container"]
                    else "FREE",
                ),
                (
                    "task_name",
                    c_names[slot["container"]["id"]]["name"]
                    if slot["container"]
                    else "None",
                ),
                ("type", slot["device"]["type"]),
                ("device", slot["device"]["brand"]),
            ]
        )
        for agent_id, agent in sorted(agents.items())
        for slot_id, slot in sorted(agent["slots"].items())
    ]

    headers = [
        "Agent ID",
        "Resource Pool",
        "Slot ID",
        "Enabled",
        "Draining",
        "Allocation ID",
        "Task Name",
        "Type",
        "Device",
    ]

    if args.json:
        print(json.dumps(slots, indent=4))
        return

    values = [s.values() for s in slots]
    render.tabulate_or_csv(headers, values, args.csv)

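# A hedged, worked example of the mapping list_slots builds: c_names keys
# each container ID to the owning task's name and allocation ID. The field
# values below are invented for illustration.
#
#   allocations = {
#       "task-1": {
#           "name": "exp-1-trial-2",
#           "allocation_id": "alloc-1",
#           "resources": [{"container_id": "c-abc"}],
#       }
#   }
#   =>  c_names == {"c-abc": {"name": "exp-1-trial-2", "allocation_id": "alloc-1"}}
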
def describe_trial(args: Namespace) -> None:
    if args.metrics:
        r = api.get(args.master, "trials/{}/metrics".format(args.trial_id))
    else:
        r = api.get(args.master, "trials/{}".format(args.trial_id))

    trial = r.json()

    if args.json:
        print(json.dumps(trial, indent=4))
        return

    # Print information about the trial itself.
    headers = ["Experiment ID", "State", "H-Params", "Start Time", "End Time"]
    values = [
        [
            trial["experiment_id"],
            trial["state"],
            json.dumps(trial["hparams"], indent=4),
            render.format_time(trial["start_time"]),
            render.format_time(trial["end_time"]),
        ]
    ]
    render.tabulate_or_csv(headers, values, args.csv)

    # Print information about individual steps.
    headers = [
        "# of Batches",
        "State",
        "Start Time",
        "End Time",
        "Checkpoint",
        "Checkpoint UUID",
        "Checkpoint Metadata",
        "Validation",
        "Validation Metrics",
    ]
    if args.metrics:
        headers.append("Workload Metrics")

    values = [
        [
            s["prior_batches_processed"] + s["num_batches"],
            s["state"],
            render.format_time(s["start_time"]),
            render.format_time(s["end_time"]),
            *format_checkpoint(s["checkpoint"]),
            *format_validation(s["validation"]),
            *([json.dumps(s["metrics"], indent=4)] if args.metrics else []),
        ]
        for s in trial["steps"]
    ]

    print()
    print("Workloads:")
    render.tabulate_or_csv(headers, values, args.csv)

def patch(args: argparse.Namespace) -> None:
    # Note: `enabled` is not defined in this function; it is a free variable,
    # presumably bound by an enclosing factory (see the sketch below).
    check_false(args.all and args.agent_id)

    if not (args.all or args.agent_id):
        print("Error: must specify exactly one of `--all` or agent_id", file=sys.stderr)
        sys.exit(1)

    if args.agent_id:
        agent_ids = [args.agent_id]
    else:
        r = api.get(args.master, "agents")
        agent_ids = sorted(local_id(a) for a in r.json().keys())

    drain_mode = None if enabled else args.drain

    for agent_id in agent_ids:
        action = "enable" if enabled else "disable"
        path = f"api/v1/agents/{agent_id}/{action}"

        payload = None
        if not enabled and drain_mode:
            payload = {
                "drain": drain_mode,
            }

        api.post(args.master, path, payload)
        status = "Disabled" if not enabled else "Enabled"
        print(f"{status} agent {agent_id}.", file=sys.stderr)

    # When draining, check if there are any tasks currently running on
    # these slots, and list them.
    if drain_mode:
        rsp = api.get(args.master, "tasks")
        tasks_data = {
            k: t
            for (k, t) in rsp.json().items()
            if any(a in agent_ids for r in t.get("resources", []) for a in r["agent_devices"])
        }

        if not (args.json or args.csv):
            if tasks_data:
                print("Tasks still in progress on draining nodes.")
            else:
                print("No tasks in progress on draining nodes.")

        cli_task.render_tasks(args, tasks_data)

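# `patch` reads `enabled` as a free variable, which suggests it is produced
# by a factory that binds one closure per enable/disable subcommand. A
# minimal sketch of that pattern; the name `make_patch_agent` is
# illustrative, not taken from the source.
from typing import Callable

def make_patch_agent(enabled: bool) -> Callable[[argparse.Namespace], None]:
    def patch(args: argparse.Namespace) -> None:
        ...  # the body above, with `enabled` captured from this scope

    return patch

enable_agent = make_patch_agent(True)    # handler for `agent enable`
disable_agent = make_patch_agent(False)  # handler for `agent disable`
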
def test_agent_version() -> None:
    # DET_AGENT_VERSION is available and specifies the agent version in cross-version tests;
    # for other tests, this evaluates to the current version.
    target_version = os.environ.get("DET_AGENT_VERSION") or determined.__version__
    agents = api.get(conf.make_master_url(), "agents").json()

    assert all(agent["version"] == target_version for agent in agents.values())

def experiment_has_active_workload(experiment_id: int) -> bool:
    r = api.get(conf.make_master_url(), "tasks").json()
    for task in r.values():
        if "Experiment {}".format(experiment_id) in task["name"] and len(task["containers"]) > 0:
            return True
    return False

def get_models(
    self,
    sort_by: model.ModelSortBy = model.ModelSortBy.NAME,
    order_by: model.ModelOrderBy = model.ModelOrderBy.ASCENDING,
    name: str = "",
    description: str = "",
) -> List[model.Model]:
    """
    Get a list of all models in the model registry.

    Arguments:
        sort_by: Which field to sort by. See
            :class:`~determined.experimental.ModelSortBy`.
        order_by: Whether to sort in ascending or descending order. See
            :class:`~determined.experimental.ModelOrderBy`.
        name: If this parameter is set, models will be filtered to only
            include models with names matching this parameter.
        description: If this parameter is set, models will be filtered to
            only include models with descriptions matching this parameter.
    """
    r = api.get(
        self._session._master,
        "/api/v1/models/",
        params={
            "sort_by": sort_by.value,
            "order_by": order_by.value,
            "name": name,
            "description": description,
        },
    )

    models = r.json().get("models")
    return [model.Model.from_json(m, self._session._master) for m in models]

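# A hedged usage sketch for get_models, assuming it is exposed on a client
# object such as determined.experimental.Determined (the docstring above
# references that module); the master URL and model name are illustrative.
from determined.experimental import Determined, ModelSortBy

client = Determined(master="http://localhost:8080")
for m in client.get_models(sort_by=ModelSortBy.NAME, name="mnist"):
    print(m.name)
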
def trial_metrics(trial_id: int) -> Dict[str, Any]:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )
    r = api.get(conf.make_master_url(), "trials/{}/metrics".format(trial_id))
    json = r.json()  # type: Dict[str, Any]
    return json

def list_agents(args: argparse.Namespace) -> None:
    r = api.get(args.master, "agents")

    agents = r.json()
    agents = [
        OrderedDict(
            [
                ("id", local_id(agent_id)),
                ("registered_time", render.format_time(agent["registered_time"])),
                ("num_slots", len(agent["slots"])),
                ("num_containers", agent["num_containers"]),
                ("resource_pool", agent["resource_pool"]),
                ("label", agent["label"]),
                ("addresses", ", ".join(agent["addresses"])),
            ]
        )
        for agent_id, agent in sorted(agents.items())
    ]

    if args.json:
        print(json.dumps(agents, indent=4))
        return

    headers = [
        "Agent ID",
        "Registered Time",
        "Slots",
        "Containers",
        "Resource Pool",
        "Label",
        "Addresses",
    ]
    values = [a.values() for a in agents]

    render.tabulate_or_csv(headers, values, args.csv)

def get_profiling_metrics(trial_id: int, metric_type: str) -> List[float]:
    """
    Call the profiler API and return a list of metric values for the given
    trial ID and metric type.
    """
    with api.get(
        conf.make_master_url(),
        "api/v1/trials/{}/profiler/metrics?{}".format(
            trial_id,
            urlencode(
                {
                    "labels.name": metric_type,
                    "labels.metricType": "PROFILER_METRIC_TYPE_SYSTEM",
                    "follow": "true",
                }
            ),
        ),
        stream=True,
    ) as r:
        return [
            batch
            for batches in [
                json.loads(line)["result"]["batch"]["values"] for line in r.iter_lines()
            ]
            for batch in batches
        ]

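# The nested comprehension above flattens one level: each streamed NDJSON
# line carries a batch whose "values" field is a list of floats. An
# equivalent explicit-loop form, shown for readability (the helper name is
# illustrative, not from the source):
from typing import Any, List

def _collect_metric_values(r: Any) -> List[float]:
    values: List[float] = []
    for line in r.iter_lines():
        # Each line is a JSON object shaped {"result": {"batch": {"values": [...]}}}.
        values.extend(json.loads(line)["result"]["batch"]["values"])
    return values
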
def raw(args: Namespace) -> None:
    params = {
        "timestamp_after": args.timestamp_after,
        "timestamp_before": args.timestamp_before,
    }
    path = "api/v1/resources/allocation/raw" if args.json else "resources/allocation/raw"
    print_response(api.get(args.master, path, params=params))

def get_versions(self, order_by: ModelOrderBy = ModelOrderBy.DESC) -> List[Checkpoint]:
    """
    Get a list of checkpoints corresponding to versions of this model. The
    versions are sorted by version number and are returned in descending
    order by default.

    Arguments:
        order_by (enum): A member of the :class:`ModelOrderBy` enum.
    """
    resp = api.get(
        self._master,
        "/api/v1/models/{}/versions/".format(self.name),
        params={"order_by": order_by.value},
    )
    data = resp.json()

    return [
        Checkpoint.from_json(
            {
                **version["checkpoint"],
                "model_version": version["version"],
                "model_name": data["model"]["name"],
            },
            self._master,
        )
        for version in data["modelVersions"]
    ]

def open_tensorboard(args: Namespace) -> None:
    resp = api.get(args.master, "tensorboard/{}".format(args.tensorboard_id)).json()
    tensorboard = render.unmarshal(Command, resp)
    check_eq(tensorboard.state, "RUNNING", "TensorBoard must be in a running state")
    api.open(args.master, resp["service_address"])

def request_profiling_system_metrics(trial_id: int, metric_name: str) -> None:
    def validate_gpu_metric_batch(batch: Dict[str, Any]) -> None:
        num_values = len(batch["values"])
        num_batch_indexes = len(batch["batches"])
        num_timestamps = len(batch["timestamps"])
        if not (num_values == num_batch_indexes == num_timestamps):
            pytest.fail(
                f"mismatched lists: not ({num_values} == {num_batch_indexes} == {num_timestamps})"
            )

        if num_values == 0:
            pytest.fail(f"received batch of size 0, something went wrong: {batch}")

    with api.get(
        conf.make_master_url(),
        "api/v1/trials/{}/profiler/metrics?{}".format(
            trial_id,
            to_query_params(PROFILER_METRIC_TYPE_SYSTEM, metric_name),
        ),
        stream=True,
    ) as r:
        for line in r.iter_lines():
            batch = simplejson.loads(line)["result"]["batch"]
            validate_gpu_metric_batch(batch)

def start_shell(args: Namespace) -> None:
    data = {}
    if args.passphrase:
        data["passphrase"] = getpass.getpass("Enter new passphrase: ")

    config = parse_config(args.config_file, None, args.config, args.volume)
    resp = launch_command(
        args.master,
        "api/v1/shells",
        config,
        args.template,
        context_path=args.context,
        data=data,
    )["shell"]

    if args.detach:
        print(resp["id"])
        return

    ready = False
    with api.ws(args.master, "shells/{}/events".format(resp["id"])) as ws:
        for msg in ws:
            if msg["service_ready_event"]:
                ready = True
                break
            render_event_stream(msg)

    if ready:
        shell = api.get(args.master, "api/v1/shells/{}".format(resp["id"])).json()["shell"]
        check_eq(shell["state"], "STATE_RUNNING", "Shell must be in a running state")
        _open_shell(args.master, shell, args.ssh_opts)

def num_experiments() -> int:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )
    r = api.get(conf.make_master_url(), "experiments")
    assert r.status_code == requests.codes.ok, r.text
    return len(r.json())

def wait_for_master_url(
    master_url: str,
    timeout: int = DEFAULT_TIMEOUT,
    cert: Optional[certs.Cert] = None,
) -> None:
    POLL_INTERVAL = 2
    polling = False
    start_time = time.time()

    try:
        while time.time() - start_time < timeout:
            try:
                r = api.get(master_url, "info", authenticated=False, cert=cert)
                if r.status_code == requests.codes.ok:
                    return
            except api.errors.MasterNotFoundException:
                pass
            if not polling:
                polling = True
                print("Waiting for master instance to be available...", end="", flush=True)
            time.sleep(POLL_INTERVAL)
            print(".", end="", flush=True)
        raise MasterTimeoutExpired
    finally:
        if polling:
            print()

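# Illustrative usage of wait_for_master_url: block startup code until the
# master answers GET /info, or give up after two minutes. The URL and
# timeout below are assumptions for the example.
try:
    wait_for_master_url("http://localhost:8080", timeout=120)
    print("master is up")
except MasterTimeoutExpired:
    print("master did not come up within 120s")
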
def list(args: Namespace) -> None:
    params = {}
    if args.best is not None:
        if args.best < 0:
            raise AssertionError("--best must be a non-negative integer")
        params["best"] = args.best

    r = api.get(
        args.master, "experiments/{}/checkpoints".format(args.experiment_id), params=params
    ).json()
    searcher_metric = r["metric_name"]

    headers = [
        "Trial ID",
        "# of Batches",
        "State",
        "Validation Metric",
        "UUID",
        "Resources",
        "Size",
    ]
    values = [
        [
            c["trial_id"],
            c["step"]["total_batches"],
            c["state"],
            api.metric.get_validation_metric(searcher_metric, c["step"]["validation"]),
            c["uuid"],
            render.format_resources(c["resources"]),
            render.format_resource_sizes(c["resources"]),
        ]
        for c in r["checkpoints"]
    ]

    render.tabulate_or_csv(headers, values, args.csv)

def list_versions(args: Namespace) -> None:
    if args.json:
        r = api.get(args.master, "models/{}/versions".format(args.name))
        data = r.json()
        print(json.dumps(data, indent=2))
    else:
        model = Determined(args.master).get_model(args.name)
        render_model(model)
        print("\n")

        headers = [
            "Version #",
            "Trial ID",
            "Batch #",
            "Checkpoint UUID",
            "Validation Metrics",
            "Metadata",
        ]

        values = [
            [
                ckpt.model_version,
                ckpt.trial_id,
                ckpt.batch_number,
                ckpt.uuid,
                json.dumps(ckpt.validation, indent=2),
                json.dumps(ckpt.metadata, indent=2),
            ]
            for ckpt in model.get_versions()
        ]

        render.tabulate_or_csv(headers, values, False)

def num_agents() -> int:
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )
    r = api.get(conf.make_master_url(), "agents")
    assert r.status_code == requests.codes.ok, r.text
    return len(r.json())

def request_metric_names(experiment_id):  # type: ignore
    response = api.get(
        conf.make_master_url(),
        "api/v1/experiments/{}/metrics-stream/metric-names".format(experiment_id),
        params={"period_seconds": 1},
    )
    results = [message["result"] for message in map(json.loads, response.text.splitlines())]

    # First let's verify an empty response was sent back before any real work was done
    if results[0]["searcherMetric"] != "validation_loss":
        return ("unexpected searcher metric in first response", results)
    if results[0]["trainingMetrics"] != []:
        return ("unexpected training metric in first response", results)
    if results[0]["validationMetrics"] != []:
        return ("unexpected validation metric in first response", results)

    # Then we verify that all expected responses are eventually received exactly once
    accumulated_training = set()
    accumulated_validation = set()
    for i in range(1, len(results)):
        for training in results[i]["trainingMetrics"]:
            if training in accumulated_training:
                return ("training metric appeared twice", results)
            accumulated_training.add(training)
        for validation in results[i]["validationMetrics"]:
            if validation in accumulated_validation:
                return ("validation metric appeared twice", results)
            accumulated_validation.add(validation)

    if accumulated_training != {"loss"}:
        return ("unexpected set of training metrics", results)
    if accumulated_validation != {"validation_loss", "accuracy"}:
        return ("unexpected set of validation metrics", results)
    return None

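# The metrics-stream endpoints used here return newline-delimited JSON,
# each line wrapping its payload in a "result" envelope. A small sketch of
# the parsing step these helpers share (the helper name is illustrative):
from typing import Any, Dict, List

def _parse_stream_results(body: str) -> List[Dict[str, Any]]:
    # One standalone JSON object per non-empty line; unwrap in arrival order.
    return [json.loads(line)["result"] for line in body.splitlines() if line]
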
def list_tasks(args: Namespace) -> None:
    api_path = RemoteTaskNewAPIs[args._command]
    api_full_path = "api/v1/{}".format(api_path)
    table_header = RemoteTaskListTableHeaders[args._command]

    if args.all:
        params = {}  # type: Dict[str, Any]
    else:
        params = {"users": [authentication.must_cli_auth().get_session_user()]}

    res = api.get(args.master, api_full_path, params=params).json()[api_path]

    if args.quiet:
        for command in res:
            print(command["id"])
        return

    for item in res:
        if item["state"].startswith("STATE_"):
            item["state"] = item["state"][6:]

    if getattr(args, "json", None):
        print(json.dumps(res, indent=4))
        return

    values = render.select_values(res, table_header)

    render.tabulate_or_csv(table_header, values, getattr(args, "csv", False))

def request_valid_trials_snapshot(experiment_id):  # type: ignore
    response = api.get(
        conf.make_master_url(),
        "api/v1/experiments/{}/metrics-stream/trials-snapshot".format(experiment_id),
        params={
            "metric_name": "accuracy",
            "metric_type": "METRIC_TYPE_VALIDATION",
            "batches_processed": 200,
            "period_seconds": 1,
        },
    )
    results = [message["result"] for message in map(json.loads, response.text.splitlines())]

    # First let's verify an empty response was sent back before any real work was done
    if results[0]["trials"] != []:
        return ("unexpected trials in first response", results)

    # Then we verify that we receive the expected number of trials and the right types
    trials = set()
    for i in range(1, len(results)):
        for trial in results[i]["trials"]:
            trials.add(trial["trialId"])
            hparam_error = validate_hparam_types(trial["hparams"])
            if hparam_error is not None:
                return (hparam_error, results)
            if type(trial["metric"]) != float:
                return ("metric of unexpected type", results)
    if len(trials) != 5:
        return ("unexpected number of trials received", results)
    return None

def request_valid_metric_batches(experiment_id):  # type: ignore
    response = api.get(
        conf.make_master_url(),
        "api/v1/experiments/{}/metrics-stream/batches".format(experiment_id),
        params={
            "metric_name": "accuracy",
            "metric_type": "METRIC_TYPE_VALIDATION",
            "period_seconds": 1,
        },
    )
    results = [message["result"] for message in map(json.loads, response.text.splitlines())]

    # First let's verify an empty response was sent back before any real work was done
    if results[0]["batches"] != []:
        return ("unexpected batches in first response", results)

    # Then we verify that all expected responses are eventually received exactly once
    accumulated = set()
    for i in range(1, len(results)):
        for batch in results[i]["batches"]:
            if batch in accumulated:
                return ("batch appears twice", results)
            accumulated.add(batch)
    if accumulated != {200, 400}:
        return ("unexpected set of batches", results)
    return None

def do_GET(self) -> None:
    """Serve a GET request."""
    try:
        token = parse_qs(urlparse(self.path).query)["token"][0]
        tmp_auth = {"Cookie": "auth={token}".format(token=token)}
        me = api.get(
            master_url, "/users/me", headers=tmp_auth, authenticated=False
        ).json()

        token_store = authentication.TokenStore(master_url)
        token_store.set_token(me["username"], token)
        token_store.set_active(me["username"])

        print("Authenticated as {}.".format(me["username"]))

        self.send_response(200)
        self.send_header("Content-type", "text/html")
        self.end_headers()
        self.wfile.write(b"You can close this window now.")
        close_cb(0)
    except Exception as e:
        print("Error authenticating: {}.".format(e))
        close_cb(1)

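# do_GET reads like a method on an http.server.BaseHTTPRequestHandler used
# as a one-shot browser-login callback, with `master_url` and `close_cb`
# captured from an enclosing scope. A speculative sketch of that wiring;
# every name other than do_GET is an assumption.
from http.server import BaseHTTPRequestHandler, HTTPServer

def handle_login_callback(master_url: str, port: int) -> int:
    exit_code = 0

    def close_cb(code: int) -> None:
        nonlocal exit_code
        exit_code = code

    class Handler(BaseHTTPRequestHandler):
        def do_GET(self) -> None:
            ...  # the body above, with master_url/close_cb captured here

    HTTPServer(("localhost", port), Handler).handle_request()  # serve one request
    return exit_code
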
def get_command(command_id: str) -> Any:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )
    r = api.get(conf.make_master_url(), "api/v1/commands/" + command_id)
    assert r.status_code == requests.codes.ok, r.text
    return r.json()["command"]

def set_gc_policy(args: Namespace) -> None:
    policy = {
        "save_experiment_best": args.save_experiment_best,
        "save_trial_best": args.save_trial_best,
        "save_trial_latest": args.save_trial_latest,
    }

    if not args.yes:
        r = api.get(
            args.master, "experiments/{}/preview_gc".format(args.experiment_id), params=policy
        )
        response = r.json()
        checkpoints = response["checkpoints"]
        metric_name = response["metric_name"]

        headers = [
            "Trial ID",
            "# of Batches",
            "State",
            "Validation Metric\n({})".format(metric_name),
            "UUID",
            "Resources",
        ]
        values = [
            [
                c["trial_id"],
                c["step"]["total_batches"],
                c["state"],
                api.metric.get_validation_metric(metric_name, c["step"]["validation"]),
                c["uuid"],
                render.format_resources(c["resources"]),
            ]
            for c in sorted(checkpoints, key=lambda c: (c["trial_id"], c["step_id"]))
            if "step" in c and c["step"].get("validation") is not None
        ]

        if len(values) != 0:
            print(
                "The following checkpoints with validation will be deleted "
                "by applying this GC Policy:"
            )
            print(tabulate.tabulate(values, headers, tablefmt="presto"), flush=FLUSH)
        print(
            "This policy will delete {} checkpoints with "
            "validations and {} checkpoints without validations.".format(
                len(values), len(checkpoints) - len(values)
            )
        )

    if args.yes or render.yes_or_no(
        "Changing the checkpoint garbage collection policy of an "
        "experiment may result\n"
        "in the unrecoverable deletion of checkpoints. Do you wish to "
        "proceed?"
    ):
        patch_experiment(args, "change gc policy of", {"checkpoint_storage": policy})
        print("Set GC policy of experiment {} to\n{}".format(args.experiment_id, pformat(policy)))
    else:
        print("Aborting operations.")

def experiment_json(experiment_id: int) -> Dict[str, Any]:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )
    r = api.get(conf.make_master_url(), "experiments/{}".format(experiment_id))
    assert r.status_code == requests.codes.ok, r.text
    json = r.json()  # type: Dict[str, Any]
    return json

def open_tensorboard(args: Namespace) -> None:
    tensorboard_id = command.expand_uuid_prefixes(args)
    resp = api.get(
        args.master, "api/v1/tensorboards/{}".format(tensorboard_id)
    ).json()["tensorboard"]
    check_eq(resp["state"], "STATE_RUNNING", "TensorBoard must be in a running state")
    api.browser_open(args.master, resp["serviceAddress"])

def get_num_running_commands() -> int:
    # TODO: refactor tests to not use cli singleton auth.
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )
    r = api.get(conf.make_master_url(), "api/v1/commands")
    assert r.status_code == requests.codes.ok, r.text
    return len(
        [command for command in r.json()["commands"] if command["state"] == "STATE_RUNNING"]
    )