def test_noop_pause_of_experiment_without_trials() -> None:
    """
    Pause and reactivate a no-op experiment whose trial is never scheduled.

    The experiment requests 100 slots per trial (presumably more than the
    test cluster provides -- confirm against the cluster config), so it
    should sit in the queue. The test checks that it reaches PAUSED, returns
    to ACTIVE, and stays ACTIVE across five one-second polls before it is
    cancelled.
    """
    unschedulable_slots = 100
    stability_polls = 5

    experiment_config = conf.load_config(
        conf.fixtures_path("no_op/single-one-short-step.yaml")
    )
    experiment_config["max_restarts"] = 0
    experiment_config["resources"] = {"slots_per_trial": unschedulable_slots}

    with tempfile.NamedTemporaryFile() as config_file:
        # Close the writer before submitting so the YAML is fully flushed
        # to disk when create_experiment reads it by path.
        with open(config_file.name, "w") as writer:
            yaml.dump(experiment_config, writer)
        model_dir = conf.fixtures_path("no_op")
        experiment_id = exp.create_experiment(config_file.name, model_dir, None)

    exp.pause_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "PAUSED")

    exp.activate_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "ACTIVE")

    # The experiment must remain ACTIVE on every poll, not just the first.
    for _ in range(stability_polls):
        assert exp.experiment_state(experiment_id) == "ACTIVE"
        time.sleep(1)

    exp.cancel_single(experiment_id)
def test_agent_restart_exp_container_failure(
    managed_cluster_restarts: ManagedCluster,
) -> None:
    """
    Check that an experiment is re-allocated after its agent and container die.

    Starts a no-op experiment, waits for workload progress, then kills the
    agent and the experiment's single local container. After the agent is
    restarted, the experiment must still be active with the same task id but
    a different allocation id, and must eventually reach the completed state.
    """
    managed_cluster_restarts.ensure_agent_ok()
    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        exp.wait_for_experiment_workload_progress(exp_id)

        container_ids = list(_local_container_ids_for_experiment(exp_id))
        if len(container_ids) != 1:
            pytest.fail(
                f"unexpected number of local containers for the experiment: {len(container_ids)}"
            )

        # Record the task id / allocation id before the failure is injected.
        tasks_before = _task_list_json(managed_cluster_restarts.master_url)
        assert len(tasks_before) == 1
        task_before = next(iter(tasks_before.values()))

        managed_cluster_restarts.kill_agent()
        kill_command = ["docker", "kill", container_ids[0]]
        subprocess.run(kill_command, check=True, stdout=subprocess.PIPE)
    except Exception:
        # Bring the agent back before propagating the error.
        managed_cluster_restarts.restart_agent()
        raise
    else:
        managed_cluster_restarts.restart_agent()

        # As soon as the agent is back, the original allocation should be
        # considered dead, but the new one should be allocated.
        assert exp.experiment_state(exp_id) == EXP_STATE.STATE_ACTIVE

        tasks_after = _task_list_json(managed_cluster_restarts.master_url)
        assert len(tasks_after) == 1
        task_after = next(iter(tasks_after.values()))

        # Same task, fresh allocation.
        assert task_before["task_id"] == task_after["task_id"]
        assert task_before["allocation_id"] != task_after["allocation_id"]

        exp.wait_for_experiment_state(exp_id, EXP_STATE.STATE_COMPLETED)