Example #1
0
 def format_experiment(e: Any) -> List[Any]:
     result = [
         e["id"],
         e["owner"]["username"],
         e["config"]["description"],
         e["state"],
         render.format_percent(e["progress"]),
         render.format_time(e["start_time"]),
         render.format_time(e["end_time"]),
         e["config"]["resources"].get("resource_pool"),
     ]
     if args.all:
         result.append(e["archived"])
     return result
Example #2
0
 def format_experiment(e: Any) -> List[Any]:
     result = [
         e.id,
         e.username,
         e.name,
         e.forkedFrom,
         e.state.value.replace("STATE_", ""),
         render.format_percent(e.progress),
         render.format_time(e.startTime),
         render.format_time(e.endTime),
         e.resourcePool,
     ]
     if args.all:
         result.append(e.archived)
     return result
Example #3
0
def describe(args: Namespace) -> None:
    docs = []
    for experiment_id in args.experiment_ids.split(","):
        if args.metrics:
            r = api.get(args.master, "experiments/{}/metrics/summary".format(experiment_id))
        else:
            r = api.get(args.master, "experiments/{}".format(experiment_id))
        docs.append(r.json())

    if args.json:
        print(json.dumps(docs, indent=4))
        return

    # Display overall experiment information.
    headers = [
        "Experiment ID",
        "State",
        "Progress",
        "Start Time",
        "End Time",
        "Description",
        "Archived",
        "Resource Pool",
        "Labels",
    ]
    values = [
        [
            doc["id"],
            doc["state"],
            render.format_percent(doc["progress"]),
            render.format_time(doc.get("start_time")),
            render.format_time(doc.get("end_time")),
            doc["config"].get("description"),
            doc["archived"],
            doc["config"]["resources"].get("resource_pool"),
            ", ".join(sorted(doc["config"].get("labels") or [])),
        ]
        for doc in docs
    ]
    if not args.outdir:
        outfile = None
        print("Experiment:")
    else:
        outfile = args.outdir.joinpath("experiments.csv")
    render.tabulate_or_csv(headers, values, args.csv, outfile)

    # Display trial-related information.
    headers = ["Trial ID", "Experiment ID", "State", "Start Time", "End Time", "H-Params"]
    values = [
        [
            trial["id"],
            doc["id"],
            trial["state"],
            render.format_time(trial.get("start_time")),
            render.format_time(trial.get("end_time")),
            json.dumps(trial["hparams"], indent=4),
        ]
        for doc in docs
        for trial in doc["trials"]
    ]
    if not args.outdir:
        outfile = None
        print("\nTrials:")
    else:
        outfile = args.outdir.joinpath("trials.csv")
    render.tabulate_or_csv(headers, values, args.csv, outfile)

    # Display step-related information.
    if args.metrics:
        # Accumulate the scalar training and validation metric names from all provided experiments.
        t_metrics_names = sorted({n for doc in docs for n in scalar_training_metrics_names(doc)})
        t_metrics_headers = ["Training Metric: {}".format(name) for name in t_metrics_names]

        v_metrics_names = sorted({n for doc in docs for n in scalar_validation_metrics_names(doc)})
        v_metrics_headers = ["Validation Metric: {}".format(name) for name in v_metrics_names]
    else:
        t_metrics_headers = []
        v_metrics_headers = []

    headers = (
        ["Trial ID", "# of Batches", "State", "Start Time", "End Time"]
        + t_metrics_headers
        + [
            "Checkpoint State",
            "Checkpoint Start Time",
            "Checkpoint End Time",
            "Validation State",
            "Validation Start Time",
            "Validation End Time",
        ]
        + v_metrics_headers
    )

    values = []
    for doc in docs:
        for trial in doc["trials"]:
            for step in trial["steps"]:
                t_metrics_fields = []
                if step.get("metrics"):
                    avg_metrics = step["metrics"]["avg_metrics"]
                    for name in t_metrics_names:
                        if name in avg_metrics:
                            t_metrics_fields.append(avg_metrics[name])
                        else:
                            t_metrics_fields.append(None)

                checkpoint = step.get("checkpoint")
                if checkpoint:
                    checkpoint_state = checkpoint["state"]
                    checkpoint_start_time = checkpoint.get("start_time")
                    checkpoint_end_time = checkpoint.get("end_time")
                else:
                    checkpoint_state = None
                    checkpoint_start_time = None
                    checkpoint_end_time = None

                validation = step.get("validation")
                if validation:
                    validation_state = validation["state"]
                    validation_start_time = validation.get("start_time")
                    validation_end_time = validation.get("end_time")
                else:
                    validation_state = None
                    validation_start_time = None
                    validation_end_time = None

                if args.metrics:
                    v_metrics_fields = [
                        api.metric.get_validation_metric(name, validation)
                        for name in v_metrics_names
                    ]
                else:
                    v_metrics_fields = []

                row = (
                    [
                        step["trial_id"],
                        step["total_batches"],
                        step["state"],
                        render.format_time(step.get("start_time")),
                        render.format_time(step.get("end_time")),
                    ]
                    + t_metrics_fields
                    + [
                        checkpoint_state,
                        render.format_time(checkpoint_start_time),
                        render.format_time(checkpoint_end_time),
                        validation_state,
                        render.format_time(validation_start_time),
                        render.format_time(validation_end_time),
                    ]
                    + v_metrics_fields
                )
                values.append(row)

    if not args.outdir:
        outfile = None
        print("\nWorkloads:")
    else:
        outfile = args.outdir.joinpath("workloads.csv")
    render.tabulate_or_csv(headers, values, args.csv, outfile)
Example #4
0
def describe(args: Namespace) -> None:
    session = setup_session(args)
    exps = []
    for experiment_id in args.experiment_ids.split(","):
        r = bindings.get_GetExperiment(session, experimentId=experiment_id)
        if args.json:
            exps.append(r.to_json())
        else:
            exps.append(r.experiment)

    if args.json:
        print(json.dumps(exps, indent=4))
        return

    # Display overall experiment information.
    headers = [
        "Experiment ID",
        "State",
        "Progress",
        "Start Time",
        "End Time",
        "Name",
        "Description",
        "Archived",
        "Resource Pool",
        "Labels",
    ]
    values = [[
        exp.id,
        exp.state.value.replace("STATE_", ""),
        render.format_percent(exp.progress),
        render.format_time(exp.startTime),
        render.format_time(exp.endTime),
        exp.name,
        exp.description,
        exp.archived,
        exp.resourcePool,
        ", ".join(sorted(exp.labels or [])),
    ] for exp in exps]
    if not args.outdir:
        outfile = None
        print("Experiment:")
    else:
        outfile = args.outdir.joinpath("experiments.csv")
    render.tabulate_or_csv(headers, values, args.csv, outfile)

    # Display trial-related information.
    trials_for_experiment: Dict[str, Sequence[bindings.trialv1Trial]] = {}
    for exp in exps:
        trials_for_experiment[exp.id] = bindings.get_GetExperimentTrials(
            session, experimentId=exp.id).trials

    headers = [
        "Trial ID", "Experiment ID", "State", "Start Time", "End Time",
        "H-Params"
    ]
    values = [[
        trial.id,
        exp.id,
        trial.state.value.replace("STATE_", ""),
        render.format_time(trial.startTime),
        render.format_time(trial.endTime),
        json.dumps(trial.hparams, indent=4),
    ] for exp in exps for trial in trials_for_experiment[exp.id]]
    if not args.outdir:
        outfile = None
        print("\nTrials:")
    else:
        outfile = args.outdir.joinpath("trials.csv")
    render.tabulate_or_csv(headers, values, args.csv, outfile)

    # Display step-related information.
    t_metrics_headers: List[str] = []
    t_metrics_names: List[str] = []
    v_metrics_headers: List[str] = []
    v_metrics_names: List[str] = []
    if args.metrics:
        # Accumulate the scalar training and validation metric names from all provided experiments.
        for exp in exps:
            sample_trial = trials_for_experiment[exp.id][0]
            sample_workloads = bindings.get_GetTrial(
                session, trialId=sample_trial.id).workloads
            t_metrics_names += scalar_training_metrics_names(sample_workloads)
            v_metrics_names += scalar_validation_metrics_names(
                sample_workloads)
        t_metrics_names = sorted(set(t_metrics_names))
        t_metrics_headers = [
            "Training Metric: {}".format(name) for name in t_metrics_names
        ]
        v_metrics_names = sorted(set(v_metrics_names))
        v_metrics_headers = [
            "Validation Metric: {}".format(name) for name in v_metrics_names
        ]

    headers = (["Trial ID", "# of Batches", "State", "Report Time"] +
               t_metrics_headers + [
                   "Checkpoint State",
                   "Checkpoint Report Time",
                   "Validation State",
                   "Validation Report Time",
               ] + v_metrics_headers)

    wl_output: Dict[int, List[Any]] = {}
    for exp in exps:
        for trial in trials_for_experiment[exp.id]:
            workloads = bindings.get_GetTrial(session,
                                              trialId=trial.id).workloads
            for workload in workloads:
                t_metrics_fields = []
                wl_detail: Optional[
                    Union[bindings.v1MetricsWorkload,
                          bindings.v1CheckpointWorkload]] = None
                if workload.training:
                    wl_detail = workload.training
                    for name in t_metrics_names:
                        if wl_detail.metrics and (name in wl_detail.metrics):
                            t_metrics_fields.append(wl_detail.metrics[name])
                        else:
                            t_metrics_fields.append(None)
                else:
                    t_metrics_fields = [None for name in t_metrics_names]

                if workload.checkpoint:
                    wl_detail = workload.checkpoint

                if workload.checkpoint and wl_detail:
                    checkpoint_state = wl_detail.state.value
                    checkpoint_end_time = wl_detail.endTime
                else:
                    checkpoint_state = ""
                    checkpoint_end_time = None

                v_metrics_fields = []
                if workload.validation:
                    wl_detail = workload.validation
                    validation_state = wl_detail.state.value
                    validation_end_time = wl_detail.endTime
                    for name in v_metrics_names:
                        if wl_detail.metrics and (name in wl_detail.metrics):
                            v_metrics_fields.append(wl_detail.metrics[name])
                        else:
                            v_metrics_fields.append(None)
                else:
                    validation_state = ""
                    validation_end_time = None
                    v_metrics_fields = [None for name in v_metrics_names]

                if wl_detail:
                    if wl_detail.totalBatches in wl_output:
                        # condense training, checkpoints, validation workloads into one step-like
                        # row for compatibility with previous versions of describe
                        merge_row = wl_output[wl_detail.totalBatches]
                        merge_row[3] = max(
                            merge_row[3],
                            render.format_time(wl_detail.endTime))
                        for idx, tfield in enumerate(t_metrics_fields):
                            if tfield and merge_row[4 + idx] is None:
                                merge_row[4 + idx] = tfield
                        start_checkpoint = 4 + len(t_metrics_fields)
                        if checkpoint_state:
                            merge_row[
                                start_checkpoint] = checkpoint_state.replace(
                                    "STATE_", "")
                            merge_row[start_checkpoint +
                                      1] = render.format_time(
                                          checkpoint_end_time)
                        if validation_end_time:
                            merge_row[start_checkpoint +
                                      3] = render.format_time(
                                          validation_end_time)
                        if validation_state:
                            merge_row[start_checkpoint +
                                      2] = validation_state.replace(
                                          "STATE_", "")
                        for idx, vfield in enumerate(v_metrics_fields):
                            if vfield and merge_row[start_checkpoint + idx +
                                                    4] is None:
                                merge_row[start_checkpoint + idx + 4] = vfield
                    else:
                        row = ([
                            trial.id,
                            wl_detail.totalBatches,
                            wl_detail.state.value.replace("STATE_", ""),
                            render.format_time(wl_detail.endTime),
                        ] + t_metrics_fields + [
                            checkpoint_state.replace("STATE_", ""),
                            render.format_time(checkpoint_end_time),
                            validation_state.replace("STATE_", ""),
                            render.format_time(validation_end_time),
                        ] + v_metrics_fields)
                        wl_output[wl_detail.totalBatches] = row

    if not args.outdir:
        outfile = None
        print("\nWorkloads:")
    else:
        outfile = args.outdir.joinpath("workloads.csv")
    values = sorted(wl_output.values(), key=lambda a: int(a[1]))
    render.tabulate_or_csv(headers, values, args.csv, outfile)