Exemple #1
0
def describe_trial(args: Namespace) -> None:
    """Print a summary of one trial and its workloads (or raw JSON)."""
    path = "trials/{}".format(args.trial_id)
    if args.metrics:
        path += "/metrics"
    resp = api.get(args.master, path)
    trial = resp.json()

    # Raw JSON output short-circuits all table rendering.
    if args.json:
        print(json.dumps(trial, indent=4))
        return

    # Table 1: the trial itself.
    trial_headers = [
        "Experiment ID",
        "State",
        "H-Params",
        "Start Time",
        "End Time",
    ]
    trial_row = [
        trial["experiment_id"],
        trial["state"],
        json.dumps(trial["hparams"], indent=4),
        render.format_time(trial["start_time"]),
        render.format_time(trial["end_time"]),
    ]
    render.tabulate_or_csv(trial_headers, [trial_row], args.csv)

    # Table 2: one row per step.
    step_headers = [
        "# of Batches",
        "State",
        "Start Time",
        "End Time",
        "Checkpoint",
        "Checkpoint UUID",
        "Checkpoint Metadata",
        "Validation",
        "Validation Metrics",
    ]
    if args.metrics:
        step_headers.append("Workload Metrics")

    step_rows = []
    for step in trial["steps"]:
        row = [
            step["prior_batches_processed"] + step["num_batches"],
            step["state"],
            render.format_time(step["start_time"]),
            render.format_time(step["end_time"]),
        ]
        row.extend(format_checkpoint(step["checkpoint"]))
        row.extend(format_validation(step["validation"]))
        if args.metrics:
            row.append(json.dumps(step["metrics"], indent=4))
        step_rows.append(row)

    print()
    print("Workloads:")
    render.tabulate_or_csv(step_headers, step_rows, args.csv)
Exemple #2
0
 def format_experiment(e: Any) -> List[Any]:
     """Build one table row from a raw experiment JSON document."""
     row = [
         e["id"],
         e["owner"]["username"],
         e["config"]["description"],
         e["state"],
         render.format_percent(e["progress"]),
         render.format_time(e["start_time"]),
         render.format_time(e["end_time"]),
         e["config"]["resources"].get("resource_pool"),
     ]
     # The archived flag is only shown when listing all experiments.
     if args.all:
         row.append(e["archived"])
     return row
Exemple #3
0
 def format_experiment(e: Any) -> List[Any]:
     """Convert a bindings experiment object into one table row."""
     row = [
         e.id,
         e.username,
         e.name,
         e.forkedFrom,
         e.state.value.replace("STATE_", ""),
         render.format_percent(e.progress),
         render.format_time(e.startTime),
         render.format_time(e.endTime),
         e.resourcePool,
     ]
     # The archived flag is only shown when listing all experiments.
     if args.all:
         row.append(e.archived)
     return row
Exemple #4
0
def list_agents(args: argparse.Namespace) -> None:
    """List all agents known to the master, as JSON or a table/CSV."""
    resp = api.get(args.master, "agents")

    # Build one ordered row per agent, sorted by agent id for stable output.
    rows = []
    for agent_id, agent in sorted(resp.json().items()):
        rows.append(
            OrderedDict([
                ("id", local_id(agent_id)),
                ("registered_time", render.format_time(agent["registered_time"])),
                ("num_slots", len(agent["slots"])),
                ("num_containers", agent["num_containers"]),
                ("resource_pool", agent["resource_pool"]),
                ("label", agent["label"]),
                ("addresses", ", ".join(agent["addresses"])),
            ])
        )

    if args.json:
        print(json.dumps(rows, indent=4))
        return

    headers = [
        "Agent ID",
        "Registered Time",
        "Slots",
        "Containers",
        "Resource Pool",
        "Label",
        "Addresses",
    ]
    render.tabulate_or_csv(headers, [row.values() for row in rows], args.csv)
Exemple #5
0
def list_trials(args: Namespace) -> None:
    """List the trials of one experiment in tabular or CSV form."""
    resp = api.get(args.master, "experiments/{}/summary".format(args.experiment_id))
    experiment = resp.json()

    headers = ["Trial ID", "State", "H-Params", "Start Time", "End Time", "# of Batches"]
    rows = []
    for trial in experiment["trials"]:
        rows.append([
            trial["id"],
            trial["state"],
            json.dumps(trial["hparams"], indent=4),
            render.format_time(trial["start_time"]),
            render.format_time(trial["end_time"]),
            trial["total_batches_processed"],
        ])

    render.tabulate_or_csv(headers, rows, args.csv)
Exemple #6
0
def render_tasks(args: Namespace, tasks: Dict[str, Dict[str, Any]]) -> None:
    """Render the given tasks as JSON, a table, or CSV."""

    def agent_info(t: Dict[str, Any]) -> Union[str, List[str]]:
        # Collect every agent device across all of the task's resources.
        resources = t.get("resources", [])
        if not resources:
            return "unassigned"
        agents = [a for r in resources for a in r["agent_devices"]]
        # A single agent is rendered bare rather than as a one-element list.
        return agents[0] if len(agents) == 1 else agents

    if args.json:
        print(json.dumps(tasks, indent=4))
        return

    headers = [
        "Task ID",
        "Allocation ID",
        "Name",
        "Slots Needed",
        "Registered Time",
        "Agent",
        "Priority",
        "Resource Pool",
    ]

    # Order rows by formatted registration time for stable output.
    ordered = sorted(
        tasks.items(),
        key=lambda kv: render.format_time(kv[1]["registered_time"]),
    )
    rows = []
    for _, task in ordered:
        rows.append([
            task["task_id"],
            task["allocation_id"],
            task["name"],
            task["slots_needed"],
            render.format_time(task["registered_time"]),
            agent_info(task),
            # Priority is only meaningful under the priority scheduler.
            task["priority"] if task["scheduler_type"] == "priority" else "N/A",
            task["resource_pool"],
        ])

    render.tabulate_or_csv(headers, rows, args.csv)
Exemple #7
0
def list_tasks(args: Namespace) -> None:
    """Fetch and print all tasks registered with the master."""
    resp = api.get(args.master, "tasks")

    def agent_info(t: Dict[str, Any]) -> Union[str, List[str]]:
        # Unassigned tasks have no containers yet.
        containers = t.get("containers", [])
        if not containers:
            return "unassigned"
        names = [c["agent"] for c in containers]
        # A single agent is rendered bare rather than as a one-element list.
        return names[0] if len(names) == 1 else names

    tasks = resp.json()
    headers = [
        "ID",
        "Name",
        "Slots Needed",
        "Registered Time",
        "Agent",
        "Priority",
        "Resource Pool",
    ]
    # Order rows by formatted registration time for stable output.
    ordered = sorted(
        tasks.items(),
        key=lambda kv: render.format_time(kv[1]["registered_time"]),
    )
    rows = []
    for _, task in ordered:
        rows.append([
            task["id"],
            task["name"],
            task["slots_needed"],
            render.format_time(task["registered_time"]),
            agent_info(task),
            # Priority is only meaningful under the priority scheduler.
            task["priority"] if task["scheduler_type"] == "priority" else "N/A",
            task["resource_pool"],
        ])

    render.tabulate_or_csv(headers, rows, args.csv)
Exemple #8
0
def list_trials(args: Namespace) -> None:
    """List every trial of an experiment via the paginated REST bindings."""
    trials: List[bindings.trialv1Trial] = limit_offset_paginator(
        bindings.get_GetExperimentTrials,
        "trials",
        setup_session(args),
        experimentId=args.experiment_id,
        limit=args.limit,
        offset=args.offset,
    )

    headers = [
        "Trial ID", "State", "H-Params", "Start Time", "End Time",
        "# of Batches"
    ]
    rows = []
    for trial in trials:
        rows.append([
            trial.id,
            # API states are prefixed with "STATE_"; strip that for display.
            trial.state.value.replace("STATE_", ""),
            json.dumps(trial.hparams, indent=4),
            render.format_time(trial.startTime),
            render.format_time(trial.endTime),
            trial.totalBatchesProcessed,
        ])

    render.tabulate_or_csv(headers, rows, args.csv)
Exemple #9
0
def list_agents(args: argparse.Namespace) -> None:
    """List agents reported by the master, as JSON or a table/CSV."""
    resp = bindings.get_GetAgents(setup_session(args))

    # Build one ordered row per agent, sorted by agent id for stable output.
    # Several fields are optional in the response; render blanks for them.
    rows = []
    for a in sorted(resp.agents or [], key=attrgetter("id")):
        rows.append(OrderedDict([
            ("id", local_id(a.id)),
            ("version", a.version),
            ("registered_time", render.format_time(a.registeredTime)),
            ("num_slots", len(a.slots) if a.slots is not None else ""),
            ("num_containers",
             len(a.containers) if a.containers is not None else ""),
            (
                "resource_pools",
                ", ".join(a.resourcePools)
                if a.resourcePools is not None else "",
            ),
            ("enabled", a.enabled),
            ("draining", a.draining),
            ("label", a.label),
            ("addresses",
             ", ".join(a.addresses) if a.addresses is not None else ""),
        ]))

    if args.json:
        print(json.dumps(rows, indent=4))
        return

    headers = [
        "Agent ID",
        "Version",
        "Registered Time",
        "Slots",
        "Containers",
        "Resource Pool",
        "Enabled",
        "Draining",
        "Label",
        "Addresses",
    ]
    render.tabulate_or_csv(headers, [row.values() for row in rows], args.csv)
Exemple #10
0
def describe(args: Namespace) -> None:
    """Describe one or more experiments (comma-separated IDs in args.experiment_ids).

    Prints three tables -- experiments, trials, and workloads -- to stdout, or
    writes them as CSV files when --outdir is given.  With --metrics, scalar
    training/validation metrics are fetched and added as extra columns.
    """
    docs = []
    for experiment_id in args.experiment_ids.split(","):
        # The metrics summary endpoint returns the same document plus metrics.
        if args.metrics:
            r = api.get(args.master, "experiments/{}/metrics/summary".format(experiment_id))
        else:
            r = api.get(args.master, "experiments/{}".format(experiment_id))
        docs.append(r.json())

    if args.json:
        print(json.dumps(docs, indent=4))
        return

    # Display overall experiment information.
    headers = [
        "Experiment ID",
        "State",
        "Progress",
        "Start Time",
        "End Time",
        "Description",
        "Archived",
        "Resource Pool",
        "Labels",
    ]
    values = [
        [
            doc["id"],
            doc["state"],
            render.format_percent(doc["progress"]),
            render.format_time(doc.get("start_time")),
            render.format_time(doc.get("end_time")),
            doc["config"].get("description"),
            doc["archived"],
            doc["config"]["resources"].get("resource_pool"),
            ", ".join(sorted(doc["config"].get("labels") or [])),
        ]
        for doc in docs
    ]
    if not args.outdir:
        outfile = None
        print("Experiment:")
    else:
        outfile = args.outdir.joinpath("experiments.csv")
    render.tabulate_or_csv(headers, values, args.csv, outfile)

    # Display trial-related information.
    headers = ["Trial ID", "Experiment ID", "State", "Start Time", "End Time", "H-Params"]
    values = [
        [
            trial["id"],
            doc["id"],
            trial["state"],
            render.format_time(trial.get("start_time")),
            render.format_time(trial.get("end_time")),
            json.dumps(trial["hparams"], indent=4),
        ]
        for doc in docs
        for trial in doc["trials"]
    ]
    if not args.outdir:
        outfile = None
        print("\nTrials:")
    else:
        outfile = args.outdir.joinpath("trials.csv")
    render.tabulate_or_csv(headers, values, args.csv, outfile)

    # Display step-related information.
    if args.metrics:
        # Accumulate the scalar training and validation metric names from all provided experiments.
        t_metrics_names = sorted({n for doc in docs for n in scalar_training_metrics_names(doc)})
        t_metrics_headers = ["Training Metric: {}".format(name) for name in t_metrics_names]

        v_metrics_names = sorted({n for doc in docs for n in scalar_validation_metrics_names(doc)})
        v_metrics_headers = ["Validation Metric: {}".format(name) for name in v_metrics_names]
    else:
        # Define the name lists even without --metrics so the row-building
        # loop below can reference them unconditionally (previously a step
        # carrying a "metrics" key without --metrics raised NameError).
        t_metrics_names = []
        t_metrics_headers = []
        v_metrics_names = []
        v_metrics_headers = []

    headers = (
        ["Trial ID", "# of Batches", "State", "Start Time", "End Time"]
        + t_metrics_headers
        + [
            "Checkpoint State",
            "Checkpoint Start Time",
            "Checkpoint End Time",
            "Validation State",
            "Validation Start Time",
            "Validation End Time",
        ]
        + v_metrics_headers
    )

    values = []
    for doc in docs:
        for trial in doc["trials"]:
            for step in trial["steps"]:
                if step.get("metrics"):
                    avg_metrics = step["metrics"]["avg_metrics"]
                    # .get() yields None for metrics this step did not report.
                    t_metrics_fields = [avg_metrics.get(name) for name in t_metrics_names]
                else:
                    # Pad with blanks so the row stays aligned with the metric
                    # headers even when a step reported no metrics (matches
                    # the behavior of the bindings-based describe).
                    t_metrics_fields = [None] * len(t_metrics_names)

                checkpoint = step.get("checkpoint")
                if checkpoint:
                    checkpoint_state = checkpoint["state"]
                    checkpoint_start_time = checkpoint.get("start_time")
                    checkpoint_end_time = checkpoint.get("end_time")
                else:
                    checkpoint_state = None
                    checkpoint_start_time = None
                    checkpoint_end_time = None

                validation = step.get("validation")
                if validation:
                    validation_state = validation["state"]
                    validation_start_time = validation.get("start_time")
                    validation_end_time = validation.get("end_time")
                else:
                    validation_state = None
                    validation_start_time = None
                    validation_end_time = None

                if args.metrics:
                    v_metrics_fields = [
                        api.metric.get_validation_metric(name, validation)
                        for name in v_metrics_names
                    ]
                else:
                    v_metrics_fields = []

                row = (
                    [
                        step["trial_id"],
                        step["total_batches"],
                        step["state"],
                        render.format_time(step.get("start_time")),
                        render.format_time(step.get("end_time")),
                    ]
                    + t_metrics_fields
                    + [
                        checkpoint_state,
                        render.format_time(checkpoint_start_time),
                        render.format_time(checkpoint_end_time),
                        validation_state,
                        render.format_time(validation_start_time),
                        render.format_time(validation_end_time),
                    ]
                    + v_metrics_fields
                )
                values.append(row)

    if not args.outdir:
        outfile = None
        print("\nWorkloads:")
    else:
        outfile = args.outdir.joinpath("workloads.csv")
    render.tabulate_or_csv(headers, values, args.csv, outfile)
Exemple #11
0
def describe(args: Namespace) -> None:
    """Describe one or more experiments via the REST bindings.

    Prints three tables -- experiments, trials, and a merged per-workload
    table -- to stdout, or writes them as CSV files when --outdir is given.
    With --metrics, scalar training/validation metric columns are added.
    With --json, the raw experiment documents are dumped instead.
    """
    session = setup_session(args)
    exps = []
    for experiment_id in args.experiment_ids.split(","):
        r = bindings.get_GetExperiment(session, experimentId=experiment_id)
        if args.json:
            exps.append(r.to_json())
        else:
            exps.append(r.experiment)

    # Raw JSON output short-circuits all table rendering.
    if args.json:
        print(json.dumps(exps, indent=4))
        return

    # Display overall experiment information.
    headers = [
        "Experiment ID",
        "State",
        "Progress",
        "Start Time",
        "End Time",
        "Name",
        "Description",
        "Archived",
        "Resource Pool",
        "Labels",
    ]
    values = [[
        exp.id,
        exp.state.value.replace("STATE_", ""),
        render.format_percent(exp.progress),
        render.format_time(exp.startTime),
        render.format_time(exp.endTime),
        exp.name,
        exp.description,
        exp.archived,
        exp.resourcePool,
        ", ".join(sorted(exp.labels or [])),
    ] for exp in exps]
    if not args.outdir:
        outfile = None
        print("Experiment:")
    else:
        outfile = args.outdir.joinpath("experiments.csv")
    render.tabulate_or_csv(headers, values, args.csv, outfile)

    # Display trial-related information.
    trials_for_experiment: Dict[str, Sequence[bindings.trialv1Trial]] = {}
    for exp in exps:
        trials_for_experiment[exp.id] = bindings.get_GetExperimentTrials(
            session, experimentId=exp.id).trials

    headers = [
        "Trial ID", "Experiment ID", "State", "Start Time", "End Time",
        "H-Params"
    ]
    values = [[
        trial.id,
        exp.id,
        trial.state.value.replace("STATE_", ""),
        render.format_time(trial.startTime),
        render.format_time(trial.endTime),
        json.dumps(trial.hparams, indent=4),
    ] for exp in exps for trial in trials_for_experiment[exp.id]]
    if not args.outdir:
        outfile = None
        print("\nTrials:")
    else:
        outfile = args.outdir.joinpath("trials.csv")
    render.tabulate_or_csv(headers, values, args.csv, outfile)

    # Display step-related information.
    t_metrics_headers: List[str] = []
    t_metrics_names: List[str] = []
    v_metrics_headers: List[str] = []
    v_metrics_names: List[str] = []
    if args.metrics:
        # Accumulate the scalar training and validation metric names from all provided experiments.
        for exp in exps:
            # NOTE(review): only the first trial of each experiment is sampled
            # for metric names, and [0] raises IndexError if an experiment has
            # no trials -- confirm that is the intended behavior.
            sample_trial = trials_for_experiment[exp.id][0]
            sample_workloads = bindings.get_GetTrial(
                session, trialId=sample_trial.id).workloads
            t_metrics_names += scalar_training_metrics_names(sample_workloads)
            v_metrics_names += scalar_validation_metrics_names(
                sample_workloads)
        # De-duplicate and sort so column order is deterministic.
        t_metrics_names = sorted(set(t_metrics_names))
        t_metrics_headers = [
            "Training Metric: {}".format(name) for name in t_metrics_names
        ]
        v_metrics_names = sorted(set(v_metrics_names))
        v_metrics_headers = [
            "Validation Metric: {}".format(name) for name in v_metrics_names
        ]

    headers = (["Trial ID", "# of Batches", "State", "Report Time"] +
               t_metrics_headers + [
                   "Checkpoint State",
                   "Checkpoint Report Time",
                   "Validation State",
                   "Validation Report Time",
               ] + v_metrics_headers)

    # Rows keyed by totalBatches; training, checkpoint, and validation
    # workloads that share a batch count are merged into one step-like row.
    # Column layout per row: [0]=trial id, [1]=# of batches, [2]=state,
    # [3]=report time, [4 : 4+len(t_metrics_names)]=training metrics, then
    # checkpoint state, checkpoint time, validation state, validation time,
    # then the validation metrics.
    wl_output: Dict[int, List[Any]] = {}
    for exp in exps:
        for trial in trials_for_experiment[exp.id]:
            workloads = bindings.get_GetTrial(session,
                                              trialId=trial.id).workloads
            for workload in workloads:
                t_metrics_fields = []
                # wl_detail ends up holding the last populated sub-workload
                # (training, then checkpoint, then validation) and supplies
                # the row's batch count / state / report time.
                wl_detail: Optional[
                    Union[bindings.v1MetricsWorkload,
                          bindings.v1CheckpointWorkload]] = None
                if workload.training:
                    wl_detail = workload.training
                    for name in t_metrics_names:
                        if wl_detail.metrics and (name in wl_detail.metrics):
                            t_metrics_fields.append(wl_detail.metrics[name])
                        else:
                            t_metrics_fields.append(None)
                else:
                    # Pad so the row stays aligned with the metric headers.
                    t_metrics_fields = [None for name in t_metrics_names]

                if workload.checkpoint:
                    wl_detail = workload.checkpoint

                if workload.checkpoint and wl_detail:
                    checkpoint_state = wl_detail.state.value
                    checkpoint_end_time = wl_detail.endTime
                else:
                    checkpoint_state = ""
                    checkpoint_end_time = None

                v_metrics_fields = []
                if workload.validation:
                    wl_detail = workload.validation
                    validation_state = wl_detail.state.value
                    validation_end_time = wl_detail.endTime
                    for name in v_metrics_names:
                        if wl_detail.metrics and (name in wl_detail.metrics):
                            v_metrics_fields.append(wl_detail.metrics[name])
                        else:
                            v_metrics_fields.append(None)
                else:
                    validation_state = ""
                    validation_end_time = None
                    v_metrics_fields = [None for name in v_metrics_names]

                if wl_detail:
                    if wl_detail.totalBatches in wl_output:
                        # condense training, checkpoints, validation workloads into one step-like
                        # row for compatibility with previous versions of describe
                        merge_row = wl_output[wl_detail.totalBatches]
                        # Keep the latest report time across merged workloads.
                        merge_row[3] = max(
                            merge_row[3],
                            render.format_time(wl_detail.endTime))
                        # Only fill training-metric cells that are still empty.
                        for idx, tfield in enumerate(t_metrics_fields):
                            if tfield and merge_row[4 + idx] is None:
                                merge_row[4 + idx] = tfield
                        # Index of the "Checkpoint State" column in the row.
                        start_checkpoint = 4 + len(t_metrics_fields)
                        if checkpoint_state:
                            merge_row[
                                start_checkpoint] = checkpoint_state.replace(
                                    "STATE_", "")
                            merge_row[start_checkpoint +
                                      1] = render.format_time(
                                          checkpoint_end_time)
                        if validation_end_time:
                            merge_row[start_checkpoint +
                                      3] = render.format_time(
                                          validation_end_time)
                        if validation_state:
                            merge_row[start_checkpoint +
                                      2] = validation_state.replace(
                                          "STATE_", "")
                        # Only fill validation-metric cells that are still empty.
                        for idx, vfield in enumerate(v_metrics_fields):
                            if vfield and merge_row[start_checkpoint + idx +
                                                    4] is None:
                                merge_row[start_checkpoint + idx + 4] = vfield
                    else:
                        # First workload seen for this batch count: start a row.
                        row = ([
                            trial.id,
                            wl_detail.totalBatches,
                            wl_detail.state.value.replace("STATE_", ""),
                            render.format_time(wl_detail.endTime),
                        ] + t_metrics_fields + [
                            checkpoint_state.replace("STATE_", ""),
                            render.format_time(checkpoint_end_time),
                            validation_state.replace("STATE_", ""),
                            render.format_time(validation_end_time),
                        ] + v_metrics_fields)
                        wl_output[wl_detail.totalBatches] = row

    if not args.outdir:
        outfile = None
        print("\nWorkloads:")
    else:
        outfile = args.outdir.joinpath("workloads.csv")
    # Emit rows sorted by batch count (row index 1).
    values = sorted(wl_output.values(), key=lambda a: int(a[1]))
    render.tabulate_or_csv(headers, values, args.csv, outfile)