def verify_completed_experiment_metadata(
    experiment_id: int,
    num_expected_trials: Optional[int],
    has_zeroth_step: bool = False,
) -> None:
    """Assert that a finished experiment left consistent metadata behind.

    Checks, in order: the trial counts (when an expected count is given),
    that every trial, step, validation and checkpoint is in an accepted
    terminal state, that step IDs are contiguous, that each trial's last
    step has a checkpoint, and that all cluster slots become free. Finally
    runs the describe/list CLI sanity tests.

    Args:
        experiment_id: ID of the experiment to inspect.
        num_expected_trials: Exact number of trials (and completed trials)
            to require, or None to skip the count checks.
        has_zeroth_step: When true, step IDs must start at 0; otherwise
            they must start at 1.

    Raises:
        AssertionError: If any check fails, or if the slots are still not
            free after 30 polls spaced one second apart.
    """
    # A `num_expected_trials` of None means the number of trials is
    # non-deterministic, so the count checks are skipped.
    if num_expected_trials is not None:
        assert num_trials(experiment_id) == num_expected_trials
        assert num_completed_trials(experiment_id) == num_expected_trials

    trials = experiment_trials(experiment_id)
    assert len(trials) > 0

    first_step_id = 0 if has_zeroth_step else 1
    checkpoint_end_states = {"COMPLETED", "DELETED"}

    for trial in trials:
        # Report the failing trial before aborting the test.
        if trial["state"] != "COMPLETED":
            report_failed_trial(trial["id"], trial["state"])
            pytest.fail(f"Trial {trial['id']} was not COMPLETED but {trial['state']}")

        steps = trial["steps"]
        assert len(steps) > 0

        # Step IDs must be ascending and contiguous from the first ID.
        step_ids = [step["id"] for step in steps]
        assert step_ids == sorted(step_ids)
        expected_ids = list(range(first_step_id, first_step_id + len(step_ids)))
        assert step_ids == expected_ids

        for step in steps:
            assert step["state"] == "COMPLETED"

            # Validations and checkpoints are optional on a step.
            validation = step["validation"]
            if validation:
                assert validation["state"] == "COMPLETED"

            checkpoint = step["checkpoint"]
            if checkpoint:
                assert checkpoint["state"] in checkpoint_end_states

    # Every trial must end on a step that has a checkpoint.
    for trial in trials:
        assert trial["steps"][-1]["checkpoint"]

    # Freeing the slots requires terminating the experiment's last
    # container, which can take a while, so poll once per second.
    slots_freed = False
    for _ in range(30):
        slots_freed = cluster.num_free_slots() == cluster.num_slots()
        if slots_freed:
            break
        time.sleep(1)
    if not slots_freed:
        raise AssertionError("Slots failed to free after experiment {}".format(experiment_id))

    # Sanity check that basic CLI commands do not raise on the finished
    # experiment.
    run_describe_cli_tests(experiment_id)
    run_list_cli_tests(experiment_id)
def verify_completed_experiment_metadata(
    experiment_id: int,
    num_expected_trials: Optional[int],
) -> None:
    """Assert that a finished experiment left consistent metadata behind.

    Checks, in order: the trial counts (when an expected count is given),
    that every trial, step, validation and checkpoint is in an accepted
    terminal state, that step IDs run 1..N without gaps, that each trial's
    last step has a checkpoint, and that all cluster slots become free.
    Finally runs the describe/list CLI sanity tests.

    Args:
        experiment_id: ID of the experiment to inspect.
        num_expected_trials: Exact number of trials (and completed trials)
            to require, or None to skip the count checks.

    Raises:
        AssertionError: If any check fails, or if the slots are still not
            free after 30 polls spaced one second apart.
    """
    # A `num_expected_trials` of None means the number of trials is
    # non-deterministic, so the count checks are skipped.
    if num_expected_trials is not None:
        assert num_trials(experiment_id) == num_expected_trials
        assert num_completed_trials(experiment_id) == num_expected_trials

    trials = experiment_trials(experiment_id)
    assert len(trials) > 0

    checkpoint_end_states = {"COMPLETED", "DELETED"}

    for trial in trials:
        assert trial.state == "COMPLETED"

        steps = trial.steps
        assert len(steps) > 0

        # Step IDs must be ascending, start at 1 and have no gaps.
        step_ids = [step.id for step in steps]
        assert step_ids == sorted(step_ids)
        expected_ids = list(range(1, len(step_ids) + 1))
        assert step_ids == expected_ids

        for step in steps:
            assert step.state == "COMPLETED"

            # Validations and checkpoints are optional on a step.
            validation = step.validation
            if validation:
                assert validation.state == "COMPLETED"

            checkpoint = step.checkpoint
            if checkpoint:
                assert checkpoint.state in checkpoint_end_states

    # Every trial must end on a step that has a checkpoint.
    for trial in trials:
        final_step = trial.steps[-1]
        assert final_step.checkpoint

    # Freeing the slots requires terminating the experiment's last
    # container, which can take a while, so poll once per second.
    slots_freed = False
    for _ in range(30):
        slots_freed = cluster.num_free_slots() == cluster.num_slots()
        if slots_freed:
            break
        time.sleep(1)
    if not slots_freed:
        raise AssertionError(
            "Slots failed to free after experiment {}".format(experiment_id))

    # Sanity check that basic CLI commands do not raise on the finished
    # experiment.
    run_describe_cli_tests(experiment_id)
    run_list_cli_tests(experiment_id)