Example 1
def logs(args: Namespace) -> None:
    def process_response(response: Response, latest_log_id: int) -> int:
        for log in response.json():
            check_gt(log["id"], latest_log_id)
            latest_log_id = log["id"]
            print("{} [{}]: {}".format(log["time"], log["level"], log["message"]))
        return latest_log_id

    params = {}
    if args.tail:
        params["tail"] = args.tail

    response = api.get(args.master, "logs", params=params)
    latest_log_id = process_response(response, -1)

    # "Follow" mode is implemented as a loop in the CLI. We assume that
    # newer log messages have a numerically larger ID than older log
    # messages, so we keep track of the max ID seen so far.
    if args.follow:
        while True:
            try:
                # Poll for new logs every 100 ms.
                time.sleep(0.1)

                # The `tail` parameter only makes sense the first time we
                # fetch logs.
                response = api.get(
                    args.master, "logs", params={"greater_than_id": str(latest_log_id)}
                )
                latest_log_id = process_response(response, latest_log_id)
            except KeyboardInterrupt:
                break
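
For context, a minimal invocation sketch. The `Namespace` fields shown (`master`, `tail`, `follow`) are exactly the ones the function reads; the values are illustrative, and a real CLI would populate them via argparse:

from argparse import Namespace

# Hypothetical values; a real CLI parses these from sys.argv.
logs(Namespace(master="localhost:8080", tail=100, follow=False))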
Example 2
def list_slots(args: argparse.Namespace) -> None:
    task_res = api.get(args.master, "tasks")
    agent_res = api.get(args.master, "agents")

    agents = agent_res.json()
    allocations = task_res.json()

    # Map container ID -> owning allocation's task name and allocation ID.
    c_names = {
        r["container_id"]: {
            "name": a["name"],
            "allocation_id": a["allocation_id"]
        }
        for a in allocations.values() for r in a["resources"]
        if r["container_id"]
    }

    slots = [
        OrderedDict([
            ("agent_id", local_id(agent_id)),
            ("resource_pool", agent["resource_pool"]),
            ("slot_id", local_id(slot_id)),
            ("enabled", slot["enabled"]),
            ("draining", slot.get("draining", False)),
            (
                "allocation_id",
                c_names[slot["container"]["id"]]["allocation_id"]
                if slot["container"] else "FREE",
            ),
            (
                "task_name",
                c_names[slot["container"]["id"]]["name"]
                if slot["container"] else "None",
            ),
            ("type", slot["device"]["type"]),
            ("device", slot["device"]["brand"]),
        ]) for agent_id, agent in sorted(agents.items())
        for slot_id, slot in sorted(agent["slots"].items())
    ]

    headers = [
        "Agent ID",
        "Resource Pool",
        "Slot ID",
        "Enabled",
        "Draining",
        "Allocation ID",
        "Task Name",
        "Type",
        "Device",
    ]

    if args.json:
        print(json.dumps(slots, indent=4))
        return

    values = [s.values() for s in slots]

    render.tabulate_or_csv(headers, values, args.csv)
Example 3
def describe_trial(args: Namespace) -> None:
    if args.metrics:
        r = api.get(args.master, "trials/{}/metrics".format(args.trial_id))
    else:
        r = api.get(args.master, "trials/{}".format(args.trial_id))

    trial = r.json()

    if args.json:
        print(json.dumps(trial, indent=4))
        return

    # Print information about the trial itself.
    headers = [
        "Experiment ID",
        "State",
        "H-Params",
        "Start Time",
        "End Time",
    ]
    values = [[
        trial["experiment_id"],
        trial["state"],
        json.dumps(trial["hparams"], indent=4),
        render.format_time(trial["start_time"]),
        render.format_time(trial["end_time"]),
    ]]
    render.tabulate_or_csv(headers, values, args.csv)

    # Print information about individual steps.
    headers = [
        "# of Batches",
        "State",
        "Start Time",
        "End Time",
        "Checkpoint",
        "Checkpoint UUID",
        "Checkpoint Metadata",
        "Validation",
        "Validation Metrics",
    ]
    if args.metrics:
        headers.append("Workload Metrics")

    values = [[
        s["prior_batches_processed"] + s["num_batches"],
        s["state"],
        render.format_time(s["start_time"]),
        render.format_time(s["end_time"]),
        *format_checkpoint(s["checkpoint"]),
        *format_validation(s["validation"]),
        *([json.dumps(s["metrics"], indent=4)] if args.metrics else []),
    ] for s in trial["steps"]]

    print()
    print("Workloads:")
    render.tabulate_or_csv(headers, values, args.csv)
Example 4
    def patch(args: argparse.Namespace) -> None:
        check_false(args.all and args.agent_id)

        if not (args.all or args.agent_id):
            print("Error: must specify exactly one of `--all` or agent_id",
                  file=sys.stderr)
            sys.exit(1)

        if args.agent_id:
            agent_ids = [args.agent_id]
        else:
            r = api.get(args.master, "agents")
            agent_ids = sorted(local_id(a) for a in r.json().keys())

        # `enabled` is bound in the enclosing scope: True for "enable", False for "disable".
        drain_mode = None if enabled else args.drain

        for agent_id in agent_ids:
            action = "enable" if enabled else "disable"
            path = f"api/v1/agents/{agent_id}/{action}"

            payload = None
            if not enabled and drain_mode:
                payload = {
                    "drain": drain_mode,
                }

            api.post(args.master, path, payload)
            status = "Disabled" if not enabled else "Enabled"
            print(f"{status} agent {agent_id}.", file=sys.stderr)

        # When draining, check whether any tasks are still running on
        # these slots, and list them.
        if drain_mode:
            rsp = api.get(args.master, "tasks")
            tasks_data = {
                k: t
                for (k, t) in rsp.json().items()
                if any(a in agent_ids for r in t.get("resources", [])
                       for a in r["agent_devices"])
            }

            if not (args.json or args.csv):
                if tasks_data:
                    print("Tasks still in progress on draining nodes.")
                else:
                    print("No tasks in progress on draining nodes.")

            cli_task.render_tasks(args, tasks_data)
Example 5
def test_agent_version() -> None:
    # In cross-version tests, DET_AGENT_VERSION is set and specifies the agent
    # version; in other tests it is unset and this falls back to the current version.
    target_version = os.environ.get("DET_AGENT_VERSION") or determined.__version__
    agents = api.get(conf.make_master_url(), "agents").json()

    assert all(agent["version"] == target_version for agent in agents.values())
Example 6
def experiment_has_active_workload(experiment_id: int) -> bool:
    r = api.get(conf.make_master_url(), "tasks").json()
    for task in r.values():
        if "Experiment {}".format(experiment_id) in task["name"] and len(task["containers"]) > 0:
            return True

    return False
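
Since the helper returns a plain boolean, a natural companion is a polling loop. The sketch below is an assumption rather than code from the source; the interval and timeout are illustrative:

import time

def wait_for_idle(experiment_id: int, timeout: float = 300.0) -> None:
    # Poll the helper above until no container is running for the experiment.
    deadline = time.time() + timeout
    while experiment_has_active_workload(experiment_id):
        if time.time() > deadline:
            raise TimeoutError("experiment {} still has an active workload".format(experiment_id))
        time.sleep(1)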
Example 7
    def get_models(
        self,
        sort_by: model.ModelSortBy = model.ModelSortBy.NAME,
        order_by: model.ModelOrderBy = model.ModelOrderBy.ASCENDING,
        name: str = "",
        description: str = "",
    ) -> List[model.Model]:
        """
        Get a list of all models in the model registry.

        Arguments:
            sort_by: Which field to sort by. See :class:`~determined.experimental.ModelSortBy`.
            order_by: Whether to sort in ascending or descending order. See
                :class:`~determined.experimental.ModelOrderBy`.
            name: If this parameter is set, models will be filtered to only
                include models with names matching this parameter.
            description: If this parameter is set, models will be filtered to
                only include models with descriptions matching this parameter.
        """
        r = api.get(
            self._session._master,
            "/api/v1/models/",
            params={
                "sort_by": sort_by.value,
                "order_by": order_by.value,
                "name": name,
                "description": description,
            },
        )

        models = r.json().get("models")
        return [
            model.Model.from_json(m, self._session._master) for m in models
        ]
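
A hedged usage sketch, assuming a client object `client` exposing this method and that the returned `Model` objects carry a `name` attribute (the filter value is illustrative):

# List registered models whose name matches "mnist", sorted by name.
for m in client.get_models(sort_by=model.ModelSortBy.NAME, name="mnist"):
    print(m.name)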
Example 8
def trial_metrics(trial_id: int) -> Dict[str, Any]:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True)
    r = api.get(conf.make_master_url(), "trials/{}/metrics".format(trial_id))
    metrics = r.json()  # type: Dict[str, Any]
    return metrics
Example 9
def list_agents(args: argparse.Namespace) -> None:
    r = api.get(args.master, "agents")

    agents = r.json()
    agents = [
        OrderedDict([
            ("id", local_id(agent_id)),
            ("registered_time", render.format_time(agent["registered_time"])),
            ("num_slots", len(agent["slots"])),
            ("num_containers", agent["num_containers"]),
            ("resource_pool", agent["resource_pool"]),
            ("label", agent["label"]),
            ("addresses", ", ".join(agent["addresses"])),
        ]) for agent_id, agent in sorted(agents.items())
    ]

    if args.json:
        print(json.dumps(agents, indent=4))
        return

    headers = [
        "Agent ID",
        "Registered Time",
        "Slots",
        "Containers",
        "Resource Pool",
        "Label",
        "Addresses",
    ]
    values = [a.values() for a in agents]

    render.tabulate_or_csv(headers, values, args.csv)
Example 10
def get_profiling_metrics(trial_id: int, metric_type: str) -> List[float]:
    """
    Calls profiler API to return a list of metric values given trial ID and metric type
    """
    with api.get(
        conf.make_master_url(),
        "api/v1/trials/{}/profiler/metrics?{}".format(
            trial_id,
            urlencode(
                {
                    "labels.name": metric_type,
                    "labels.metricType": "PROFILER_METRIC_TYPE_SYSTEM",
                    "follow": "true",
                }
            ),
        ),
        stream=True,
    ) as r:
        return [
            batch
            for batches in [
                json.loads(line)["result"]["batch"]["values"] for line in r.iter_lines()
            ]
            for batch in batches
        ]
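
The flattening comprehension above is dense; under the same imports and response schema, it can be unrolled into an explicit loop, as in this sketch:

def get_profiling_metrics_unrolled(trial_id: int, metric_type: str) -> List[float]:
    # Behaviorally equivalent to the comprehension above: each streamed line
    # carries a batch of values, which we accumulate in order.
    values = []  # type: List[float]
    with api.get(
        conf.make_master_url(),
        "api/v1/trials/{}/profiler/metrics?{}".format(
            trial_id,
            urlencode(
                {
                    "labels.name": metric_type,
                    "labels.metricType": "PROFILER_METRIC_TYPE_SYSTEM",
                    "follow": "true",
                }
            ),
        ),
        stream=True,
    ) as r:
        for line in r.iter_lines():
            values.extend(json.loads(line)["result"]["batch"]["values"])
    return values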
Example 11
def raw(args: Namespace) -> None:
    params = {
        "timestamp_after": args.timestamp_after,
        "timestamp_before": args.timestamp_before
    }
    path = "api/v1/resources/allocation/raw" if args.json else "resources/allocation/raw"
    print_response(api.get(args.master, path, params=params))
Example 12
    def get_versions(self, order_by: ModelOrderBy = ModelOrderBy.DESC) -> List[Checkpoint]:
        """
        Get a list of checkpoints corresponding to versions of this model. The
        versions are sorted by version number and are returned in descending
        order by default.

        Arguments:
            order_by (enum): A member of the :class:`ModelOrderBy` enum.
        """
        resp = api.get(
            self._master,
            "/api/v1/models/{}/versions/".format(self.name),
            params={"order_by": order_by.value},
        )
        data = resp.json()

        return [
            Checkpoint.from_json(
                {
                    **version["checkpoint"],
                    "model_version": version["version"],
                    "model_name": data["model"]["name"],
                },
                self._master,
            )
            for version in data["modelVersions"]
        ]
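
A short usage sketch, assuming `m` is a model object exposing this method; the attributes printed here (`model_version`, `uuid`) are the same ones Example 19 reads from the returned checkpoints:

# Newest version first, per the ModelOrderBy.DESC default.
for ckpt in m.get_versions():
    print(ckpt.model_version, ckpt.uuid)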
Example 13
def open_tensorboard(args: Namespace) -> None:
    resp = api.get(args.master,
                   "tensorboard/{}".format(args.tensorboard_id)).json()
    tensorboard = render.unmarshal(Command, resp)
    check_eq(tensorboard.state, "RUNNING",
             "TensorBoard must be in a running state")
    api.open(args.master, resp["service_address"])
Example 14
def request_profiling_system_metrics(trial_id: int, metric_name: str) -> None:
    def validate_gpu_metric_batch(batch: Dict[str, Any]) -> None:
        num_values = len(batch["values"])
        num_batch_indexes = len(batch["batches"])
        num_timestamps = len(batch["timestamps"])
        if not (num_values == num_batch_indexes == num_timestamps):
            pytest.fail(
                f"mismatched lists: not ({num_values} == {num_batch_indexes} == {num_timestamps})"
            )

        if num_values == 0:
            pytest.fail(
                f"received batch of size 0, something went wrong: {batch}")

    with api.get(
            conf.make_master_url(),
            "api/v1/trials/{}/profiler/metrics?{}".format(
                trial_id,
                to_query_params(PROFILER_METRIC_TYPE_SYSTEM, metric_name),
            ),
            stream=True,
    ) as r:
        for line in r.iter_lines():
            batch = simplejson.loads(line)["result"]["batch"]
            validate_gpu_metric_batch(batch)
Example 15
def start_shell(args: Namespace) -> None:
    data = {}
    if args.passphrase:
        data["passphrase"] = getpass.getpass("Enter new passphrase: ")
    config = parse_config(args.config_file, None, args.config, args.volume)
    resp = launch_command(
        args.master,
        "api/v1/shells",
        config,
        args.template,
        context_path=args.context,
        data=data,
    )["shell"]

    if args.detach:
        print(resp["id"])
        return

    ready = False
    with api.ws(args.master, "shells/{}/events".format(resp["id"])) as ws:
        for msg in ws:
            if msg["service_ready_event"]:
                ready = True
                break
            render_event_stream(msg)
    if ready:
        shell = api.get(args.master, "api/v1/shells/{}".format(resp["id"])).json()["shell"]
        check_eq(shell["state"], "STATE_RUNNING", "Shell must be in a running state")
        _open_shell(args.master, shell, args.ssh_opts)
Example 16
def num_experiments() -> int:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True)
    r = api.get(conf.make_master_url(), "experiments")
    assert r.status_code == requests.codes.ok, r.text
    return len(r.json())
Example 17
def wait_for_master_url(
    master_url: str,
    timeout: int = DEFAULT_TIMEOUT,
    cert: Optional[certs.Cert] = None,
) -> None:
    POLL_INTERVAL = 2
    polling = False
    start_time = time.time()

    try:
        while time.time() - start_time < timeout:
            try:
                r = api.get(master_url, "info", authenticated=False, cert=cert)
                if r.status_code == requests.codes.ok:
                    return
            except api.errors.MasterNotFoundException:
                pass
            if not polling:
                polling = True
                print("Waiting for master instance to be available...",
                      end="",
                      flush=True)
            time.sleep(POLL_INTERVAL)
            print(".", end="", flush=True)

        raise MasterTimeoutExpired
    finally:
        if polling:
            print()
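
A minimal caller sketch (the URL and timeout are illustrative):

import sys

try:
    wait_for_master_url("http://localhost:8080", timeout=60)
except MasterTimeoutExpired:
    print("master did not become available in time", file=sys.stderr)
    sys.exit(1)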
Example 18
def list(args: Namespace) -> None:
    params = {}
    if args.best is not None:
        if args.best < 0:
            raise AssertionError("--best must be a non-negative integer")
        params["best"] = args.best

    r = api.get(args.master,
                "experiments/{}/checkpoints".format(args.experiment_id),
                params=params).json()
    searcher_metric = r["metric_name"]

    headers = [
        "Trial ID",
        "# of Batches",
        "State",
        "Validation Metric",
        "UUID",
        "Resources",
        "Size",
    ]
    values = [[
        c["trial_id"],
        c["step"]["total_batches"],
        c["state"],
        api.metric.get_validation_metric(searcher_metric,
                                         c["step"]["validation"]),
        c["uuid"],
        render.format_resources(c["resources"]),
        render.format_resource_sizes(c["resources"]),
    ] for c in r["checkpoints"]]

    render.tabulate_or_csv(headers, values, args.csv)
Example 19
def list_versions(args: Namespace) -> None:
    if args.json:
        r = api.get(args.master, "models/{}/versions".format(args.name))
        data = r.json()
        print(json.dumps(data, indent=2))

    else:
        model = Determined(args.master).get_model(args.name)
        render_model(model)
        print("\n")

        headers = [
            "Version #",
            "Trial ID",
            "Batch #",
            "Checkpoint UUID",
            "Validation Metrics",
            "Metadata",
        ]

        values = [
            [
                ckpt.model_version,
                ckpt.trial_id,
                ckpt.batch_number,
                ckpt.uuid,
                json.dumps(ckpt.validation, indent=2),
                json.dumps(ckpt.metadata, indent=2),
            ]
            for ckpt in model.get_versions()
        ]

        render.tabulate_or_csv(headers, values, False)
Example 20
def num_agents() -> int:
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True)
    r = api.get(conf.make_master_url(), "agents")
    assert r.status_code == requests.codes.ok, r.text

    return len(r.json())
Example 21
def request_metric_names(experiment_id):  # type: ignore
    response = api.get(
        conf.make_master_url(),
        "api/v1/experiments/{}/metrics-stream/metric-names".format(experiment_id),
        params={"period_seconds": 1},
    )
    results = [message["result"] for message in map(json.loads, response.text.splitlines())]

    # First let's verify an empty response was sent back before any real work was done
    if results[0]["searcherMetric"] != "validation_loss":
        return ("unexpected searcher metric in first response", results)
    if results[0]["trainingMetrics"] != []:
        return ("unexpected training metric in first response", results)
    if results[0]["validationMetrics"] != []:
        return ("unexpected validation metric in first response", results)

    # Then we verify that all expected responses are eventually received exactly once
    accumulated_training = set()
    accumulated_validation = set()
    for i in range(1, len(results)):
        for training in results[i]["trainingMetrics"]:
            if training in accumulated_training:
                return ("training metric appeared twice", results)
            accumulated_training.add(training)
        for validation in results[i]["validationMetrics"]:
            if validation in accumulated_validation:
                return ("training metric appeared twice", results)
            accumulated_validation.add(validation)

    if accumulated_training != {"loss"}:
        return ("unexpected set of training metrics", results)
    if accumulated_validation != {"validation_loss", "accuracy"}:
        return ("unexpected set of validation metrics", results)
    return None
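
The return convention (None on success, an (error_message, results) tuple on failure) suggests a caller along these lines; wrapping the result in `pytest.fail` is an assumption:

error = request_metric_names(experiment_id)
if error is not None:
    message, results = error
    pytest.fail("{}: {}".format(message, results))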
Example 22
def list_tasks(args: Namespace) -> None:
    api_path = RemoteTaskNewAPIs[args._command]
    api_full_path = "api/v1/{}".format(api_path)
    table_header = RemoteTaskListTableHeaders[args._command]

    if args.all:
        params = {}  # type: Dict[str, Any]
    else:
        params = {"users": [authentication.must_cli_auth().get_session_user()]}

    res = api.get(args.master, api_full_path, params=params).json()[api_path]

    if args.quiet:
        for command in res:
            print(command["id"])
        return

    for item in res:
        # Strip the "STATE_" prefix for display.
        if item["state"].startswith("STATE_"):
            item["state"] = item["state"][6:]

    if getattr(args, "json", None):
        print(json.dumps(res, indent=4))
        return

    values = render.select_values(res, table_header)

    render.tabulate_or_csv(table_header, values, getattr(args, "csv", False))
Example 23
def request_valid_trials_snapshot(experiment_id):  # type: ignore
    response = api.get(
        conf.make_master_url(),
        "api/v1/experiments/{}/metrics-stream/trials-snapshot".format(experiment_id),
        params={
            "metric_name": "accuracy",
            "metric_type": "METRIC_TYPE_VALIDATION",
            "batches_processed": 200,
            "period_seconds": 1,
        },
    )
    results = [message["result"] for message in map(json.loads, response.text.splitlines())]

    # First let's verify an empty response was sent back before any real work was done
    if results[0]["trials"] != []:
        return ("unexpected trials in first response", results)

    # Then we verify that we receive the expected number of trials and the right types
    trials = set()
    for i in range(1, len(results)):
        for trial in results[i]["trials"]:
            trials.add(trial["trialId"])
            hparam_error = validate_hparam_types(trial["hparams"])
            if hparam_error is not None:
                return (hparam_error, results)
            if type(trial["metric"]) != float:
                return ("metric of unexpected type", results)
    if len(trials) != 5:
        return ("unexpected number of trials received", results)
    return None
Example 24
def request_valid_metric_batches(experiment_id):  # type: ignore
    response = api.get(
        conf.make_master_url(),
        "api/v1/experiments/{}/metrics-stream/batches".format(experiment_id),
        params={
            "metric_name": "accuracy",
            "metric_type": "METRIC_TYPE_VALIDATION",
            "period_seconds": 1,
        },
    )
    results = [message["result"] for message in map(json.loads, response.text.splitlines())]

    # First let's verify an empty response was sent back before any real work was done
    if results[0]["batches"] != []:
        return ("unexpected batches in first response", results)

    # Then we verify that all expected responses are eventually received exactly once
    accumulated = set()
    for i in range(1, len(results)):
        for batch in results[i]["batches"]:
            if batch in accumulated:
                return ("batch appears twice", results)
            accumulated.add(batch)
    if accumulated != {200, 400}:
        return ("unexpected set of batches", results)
    return None
Example 25
        def do_GET(self) -> None:
            """Serve a GET request."""
            try:
                token = parse_qs(urlparse(self.path).query)["token"][0]

                tmp_auth = {"Cookie": "auth={token}".format(token=token)}
                me = api.get(master_url,
                             "/users/me",
                             headers=tmp_auth,
                             authenticated=False).json()

                token_store = authentication.TokenStore(master_url)
                token_store.set_token(me["username"], token)
                token_store.set_active(me["username"])

                print("Authenticated as {}.".format(me["username"]))

                self.send_response(200)
                self.send_header("Content-type", "text/html")
                self.end_headers()
                self.wfile.write(b"You can close this window now.")
                close_cb(0)
            except Exception as e:
                print("Error authenticating: {}.".format(e))
                close_cb(1)
Example 26
def get_command(command_id: str) -> Any:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True)
    r = api.get(conf.make_master_url(), "api/v1/commands/" + command_id)
    assert r.status_code == requests.codes.ok, r.text
    return r.json()["command"]
Example 27
def set_gc_policy(args: Namespace) -> None:
    policy = {
        "save_experiment_best": args.save_experiment_best,
        "save_trial_best": args.save_trial_best,
        "save_trial_latest": args.save_trial_latest,
    }

    if not args.yes:
        r = api.get(
            args.master, "experiments/{}/preview_gc".format(args.experiment_id), params=policy
        )
        response = r.json()
        checkpoints = response["checkpoints"]
        metric_name = response["metric_name"]

        headers = [
            "Trial ID",
            "# of Batches",
            "State",
            "Validation Metric\n({})".format(metric_name),
            "UUID",
            "Resources",
        ]
        values = [
            [
                c["trial_id"],
                c["step"]["total_batches"],
                c["state"],
                api.metric.get_validation_metric(metric_name, c["step"]["validation"]),
                c["uuid"],
                render.format_resources(c["resources"]),
            ]
            for c in sorted(checkpoints, key=lambda c: (c["trial_id"], c["step_id"]))
            if "step" in c and c["step"].get("validation") is not None
        ]

        if len(values) != 0:
            print(
                "The following checkpoints with validation will be deleted "
                "by applying this GC Policy:"
            )
            print(tabulate.tabulate(values, headers, tablefmt="presto"), flush=FLUSH)
        print(
            "This policy will delete {} checkpoints with "
            "validations and {} checkpoints without validations.".format(
                len(values), len(checkpoints) - len(values)
            )
        )

    if args.yes or render.yes_or_no(
        "Changing the checkpoint garbage collection policy of an "
        "experiment may result\n"
        "in the unrecoverable deletion of checkpoints.  Do you wish to "
        "proceed?"
    ):
        patch_experiment(args, "change gc policy of", {"checkpoint_storage": policy})
        print("Set GC policy of experiment {} to\n{}".format(args.experiment_id, pformat(policy)))
    else:
        print("Aborting operations.")
Example 28
def experiment_json(experiment_id: int) -> Dict[str, Any]:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True)
    r = api.get(conf.make_master_url(), "experiments/{}".format(experiment_id))
    assert r.status_code == requests.codes.ok, r.text
    data = r.json()  # type: Dict[str, Any]
    return data
Example 29
def open_tensorboard(args: Namespace) -> None:
    tensorboard_id = command.expand_uuid_prefixes(args)
    resp = api.get(
        args.master,
        "api/v1/tensorboards/{}".format(tensorboard_id)).json()["tensorboard"]
    check_eq(resp["state"], "STATE_RUNNING",
             "TensorBoard must be in a running state")
    api.browser_open(args.master, resp["serviceAddress"])
Example 30
def get_num_running_commands() -> int:
    # TODO: refactor tests to not use cli singleton auth.
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(conf.make_master_url(), try_reauth=True)
    r = api.get(conf.make_master_url(), "api/v1/commands")
    assert r.status_code == requests.codes.ok, r.text

    return len([command for command in r.json()["commands"] if command["state"] == "STATE_RUNNING"])
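
Built on the helper above, a hedged polling sketch for tests that need to wait until a given number of commands are running (the interval and timeout are illustrative):

import time

def wait_for_running_commands(expected: int, timeout: float = 60.0) -> None:
    # Poll until the master reports at least `expected` running commands.
    deadline = time.time() + timeout
    while get_num_running_commands() < expected:
        if time.time() > deadline:
            raise TimeoutError("timed out waiting for {} running commands".format(expected))
        time.sleep(1)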