import contextlib
import json
import os
import shutil
import subprocess
import tempfile
import time
from typing import Any, Dict, Iterator, List, cast

import pytest
import yaml

from determined.common import check
from determined.common.api import bindings
from determined.common.api.bindings import determinedexperimentv1State

from tests import config as conf
from tests import experiment as exp

# Assumed import locations for the test harness; adjust to match the actual
# e2e test layout if it differs.
from tests.api_utils import determined_test_session
from .managed_cluster import ManagedCluster

# Alias used throughout these tests.
EXP_STATE = determinedexperimentv1State

# How long (in seconds) to wait for the rank-1 container to exit and free its
# slot. The exact value used by the real module is an assumption.
RANK_ONE_WAIT_TIME = 300


def test_master_restart_reattach_recover_experiment(
    managed_cluster_restarts: ManagedCluster, downtime: int
) -> None:
    _sanity_check(managed_cluster_restarts)
    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )

        # TODO(ilia): don't wait for progress.
        exp.wait_for_experiment_workload_progress(exp_id)

        if downtime >= 0:
            managed_cluster_restarts.kill_master()
            time.sleep(downtime)
            managed_cluster_restarts.restart_master()

        exp.wait_for_experiment_state(
            exp_id, EXP_STATE.STATE_COMPLETED, max_wait_secs=downtime + 60
        )
        trials = exp.experiment_trials(exp_id)

        assert len(trials) == 1
        train_wls = exp.workloads_with_training(trials[0].workloads)
        assert len(train_wls) == 5
    except Exception:
        managed_cluster_restarts.restart_master()
        managed_cluster_restarts.restart_agent()
        raise
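
# `_sanity_check` is called above but not defined in this excerpt. A minimal
# sketch under the assumption that it just gates on reattach support and
# verifies the agent is healthy, mirroring the inline checks in the tests
# below; the real helper may differ.
def _sanity_check(managed_cluster_restarts: ManagedCluster) -> None:
    # Master-restart tests only make sense when allocation reattach is enabled.
    if not managed_cluster_restarts.reattach:
        pytest.skip()

    managed_cluster_restarts.ensure_agent_ok()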

def test_disable_agent_experiment_resume() -> None:
    """
    Start an experiment with max_restarts=0 and ensure that being killed due to
    an explicit agent disable/enable (without draining) does not count toward
    the number of restarts.
    """
    slots = _fetch_slots()
    assert len(slots) == 1
    agent_id = slots[0]["agent_id"]

    exp_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        ["--config", "max_restarts=0"],
    )
    exp.wait_for_experiment_workload_progress(exp_id)

    with _disable_agent(agent_id):
        # Wait for the allocation to go away.
        for _ in range(20):
            slots = _fetch_slots()
            print(slots)
            if not any(s["allocation_id"] != "FREE" for s in slots):
                break
            time.sleep(1)
        else:
            pytest.fail("Experiment stayed scheduled after agent was disabled")

    exp.wait_for_experiment_state(exp_id, determinedexperimentv1State.STATE_COMPLETED)
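
# `_fetch_slots` and `_disable_agent` are used throughout but not defined in
# this excerpt. Plausible sketches built on the `det slot list` and
# `det agent disable/enable` CLI commands; the flags and return shapes are
# assumptions, not the canonical helpers.
def _fetch_slots() -> List[Dict[str, Any]]:
    command = ["det", "-m", conf.make_master_url(), "slot", "list", "--json"]
    output = subprocess.check_output(command).decode()
    return cast(List[Dict[str, Any]], json.loads(output))


@contextlib.contextmanager
def _disable_agent(agent_id: str, drain: bool = False) -> Iterator[None]:
    # Disable the agent on entry (optionally draining) and re-enable it on
    # exit, even if the body raises.
    flags = ["--drain"] if drain else []
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "agent", "disable", *flags, agent_id]
    )
    try:
        yield
    finally:
        subprocess.check_call(
            ["det", "-m", conf.make_master_url(), "agent", "enable", agent_id]
        )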

def test_master_restart_kill_works(managed_cluster_restarts: ManagedCluster) -> None:
    _sanity_check(managed_cluster_restarts)
    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-many-long-steps.yaml"),
            conf.fixtures_path("no_op"),
            ["--config", "searcher.max_length.batches=10000", "--config", "max_restarts=0"],
        )

        exp.wait_for_experiment_workload_progress(exp_id)

        managed_cluster_restarts.kill_master()
        time.sleep(0)
        managed_cluster_restarts.restart_master()

        command = ["det", "-m", conf.make_master_url(), "e", "kill", str(exp_id)]
        subprocess.check_call(command)

        exp.wait_for_experiment_state(exp_id, EXP_STATE.STATE_CANCELED, max_wait_secs=10)

        managed_cluster_restarts.ensure_agent_ok()
    except Exception:
        managed_cluster_restarts.restart_master()
        managed_cluster_restarts.restart_agent()
        raise  # Re-raise so the test fails instead of silently passing.

def test_agent_reconnect_keep_experiment(managed_cluster_restarts: ManagedCluster) -> None:
    managed_cluster_restarts.ensure_agent_ok()

    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        exp.wait_for_experiment_workload_progress(exp_id)

        managed_cluster_restarts.kill_proxy()
        time.sleep(1)
        managed_cluster_restarts.restart_proxy()

        exp.wait_for_experiment_state(exp_id, EXP_STATE.STATE_COMPLETED)
        trials = exp.experiment_trials(exp_id)

        assert len(trials) == 1
        train_wls = exp.workloads_with_training(trials[0].workloads)
        assert len(train_wls) == 5
    except Exception:
        managed_cluster_restarts.restart_proxy(wait_for_reconnect=False)
        managed_cluster_restarts.restart_agent()
        raise

def test_agent_restart_recover_experiment(
    managed_cluster_restarts: ManagedCluster, downtime: int
) -> None:
    if not managed_cluster_restarts.reattach:
        pytest.skip()

    managed_cluster_restarts.ensure_agent_ok()
    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        exp.wait_for_experiment_workload_progress(exp_id)

        if downtime >= 0:
            managed_cluster_restarts.kill_agent()
            time.sleep(downtime)
            managed_cluster_restarts.restart_agent(wait_for_amnesia=False)

        exp.wait_for_experiment_state(exp_id, EXP_STATE.STATE_COMPLETED)
        trials = exp.experiment_trials(exp_id)

        assert len(trials) == 1
        train_wls = exp.workloads_with_training(trials[0].workloads)
        assert len(train_wls) == 5
    except Exception:
        managed_cluster_restarts.restart_agent()
        raise

def test_drain_agent_sched() -> None:
    """
    Start an experiment and drain its agent. Start a second experiment and make
    sure it schedules on the second agent *before* the first one has finished.
    """
    slots = _wait_for_slots(2)
    assert len(slots) == 2

    exp_id1 = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    exp.wait_for_experiment_workload_progress(exp_id1)

    slots = _fetch_slots()
    used_slots = [s for s in slots if s["allocation_id"] != "FREE"]
    assert len(used_slots) == 1
    agent_id1 = used_slots[0]["agent_id"]

    with _disable_agent(agent_id1, drain=True):
        exp_id2 = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        exp.wait_for_experiment_state(exp_id2, determinedexperimentv1State.STATE_ACTIVE)

        # Wait for a state when *BOTH* experiments are scheduled.
        for _ in range(20):
            slots = _fetch_slots()
            assert len(slots) == 2
            used_slots = [s for s in slots if s["allocation_id"] != "FREE"]
            if len(used_slots) == 2:
                # All good.
                break
            time.sleep(1)
        else:
            pytest.fail(
                "Second experiment didn't schedule on the second agent "
                "while the first agent was draining"
            )

        exp.wait_for_experiment_state(exp_id1, determinedexperimentv1State.STATE_COMPLETED)
        exp.wait_for_experiment_state(exp_id2, determinedexperimentv1State.STATE_COMPLETED)

        trials1 = exp.experiment_trials(exp_id1)
        trials2 = exp.experiment_trials(exp_id2)
        assert len(trials1) == len(trials2) == 1
        assert len(trials1[0].workloads) == len(trials2[0].workloads) == 7
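
# `_wait_for_slots` is used above but not shown here. A plausible sketch: poll
# `_fetch_slots` until at least `count` slots register with the master, failing
# the test after a timeout. The timeout value is an assumption.
def _wait_for_slots(count: int, max_ticks: int = 60) -> List[Dict[str, Any]]:
    for _ in range(max_ticks):
        slots = _fetch_slots()
        if len(slots) >= count:
            return slots
        time.sleep(1)
    pytest.fail(f"did not detect {count} slots within {max_ticks} seconds")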

def test_agent_restart_exp_container_failure(
    managed_cluster_restarts: ManagedCluster,
) -> None:
    managed_cluster_restarts.ensure_agent_ok()
    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        exp.wait_for_experiment_workload_progress(exp_id)
        container_ids = list(_local_container_ids_for_experiment(exp_id))
        if len(container_ids) != 1:
            pytest.fail(
                f"unexpected number of local containers for the experiment: {len(container_ids)}"
            )

        # Get task id / allocation id.
        tasks_data = _task_list_json(managed_cluster_restarts.master_url)
        assert len(tasks_data) == 1
        exp_task_before = list(tasks_data.values())[0]

        managed_cluster_restarts.kill_agent()
        subprocess.run(["docker", "kill", container_ids[0]], check=True, stdout=subprocess.PIPE)
    except Exception:
        managed_cluster_restarts.restart_agent()
        raise
    else:
        managed_cluster_restarts.restart_agent()

        # As soon as the agent is back, the original allocation should be considered dead,
        # but the new one should be allocated.
        state = exp.experiment_state(exp_id)
        assert state == EXP_STATE.STATE_ACTIVE
        tasks_data = _task_list_json(managed_cluster_restarts.master_url)
        assert len(tasks_data) == 1
        exp_task_after = list(tasks_data.values())[0]

        assert exp_task_before["task_id"] == exp_task_after["task_id"]
        assert exp_task_before["allocation_id"] != exp_task_after["allocation_id"]

        exp.wait_for_experiment_state(exp_id, EXP_STATE.STATE_COMPLETED)
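
# `_task_list_json` and `_local_container_ids_for_experiment` are used above
# but defined elsewhere. Sketches under two assumptions: `det task list --json`
# returns a mapping of allocation ids to task entries, and local trial
# containers carry the trial id in their docker names. Both may differ in a
# real deployment.
def _task_list_json(master_url: str) -> Dict[str, Dict[str, Any]]:
    command = ["det", "-m", master_url, "task", "list", "--json"]
    output = subprocess.check_output(command).decode()
    return cast(Dict[str, Dict[str, Any]], json.loads(output))


def _local_container_ids_for_experiment(exp_id: int) -> Iterator[str]:
    trial_ids = {t.trial.id for t in exp.experiment_trials(exp_id)}
    lines = (
        subprocess.check_output(["docker", "ps", "--format", "{{.ID}}\t{{.Names}}"])
        .decode()
        .strip()
        .splitlines()
    )
    for line in lines:
        container_id, _, container_name = line.partition("\t")
        if any(f"trial-{trial_id}" in container_name for trial_id in trial_ids):
            yield container_id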

def test_noop_pause() -> None:
    """
    Walk through starting, pausing, and resuming a single no-op experiment.
    """
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    exp.wait_for_experiment_state(
        experiment_id, bindings.determinedexperimentv1State.STATE_ACTIVE
    )

    # Wait for the only trial to get scheduled.
    exp.wait_for_experiment_active_workload(experiment_id)

    # Wait for the only trial to show progress, indicating the image is built and running.
    exp.wait_for_experiment_workload_progress(experiment_id)

    # Pause the experiment. Note that Determined does not currently differentiate
    # between a "stopping paused" and a "paused" state, so we follow this check
    # up by ensuring the experiment cleared all scheduled workloads.
    exp.pause_experiment(experiment_id)
    exp.wait_for_experiment_state(
        experiment_id, bindings.determinedexperimentv1State.STATE_PAUSED
    )

    # Wait at most 20 seconds for the experiment to clear all workloads (each
    # train step should take 5 seconds).
    for _ in range(20):
        workload_active = exp.experiment_has_active_workload(experiment_id)
        if not workload_active:
            break
        else:
            time.sleep(1)
    check.true(
        not workload_active,
        "The experiment did not clear all workloads within 20 seconds.",
    )

    # Resume the experiment and wait for completion.
    exp.activate_experiment(experiment_id)
    exp.wait_for_experiment_state(
        experiment_id, bindings.determinedexperimentv1State.STATE_COMPLETED
    )

def test_drain_agent() -> None:
    """
    Start an experiment, `disable --drain` the agent once the trial is running,
    and make sure the experiment still finishes, but new ones won't schedule.
    """
    slots = _fetch_slots()
    assert len(slots) == 1
    agent_id = slots[0]["agent_id"]

    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    exp.wait_for_experiment_state(experiment_id, determinedexperimentv1State.STATE_ACTIVE)
    exp.wait_for_experiment_active_workload(experiment_id)
    exp.wait_for_experiment_workload_progress(experiment_id)

    # Disable the agent and quickly enable it back.
    with _disable_agent(agent_id, drain=True):
        pass

    # Try to launch another experiment. It shouldn't get scheduled because the
    # slot is still busy with the first experiment.
    experiment_id_no_start = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    time.sleep(5)
    exp.wait_for_experiment_state(
        experiment_id_no_start, determinedexperimentv1State.STATE_ACTIVE
    )

    with _disable_agent(agent_id, drain=True):
        # Check for 15 seconds that it doesn't get scheduled into the same slot.
        for _ in range(15):
            trials = exp.experiment_trials(experiment_id_no_start)
            assert len(trials) == 0
            time.sleep(1)

        # Ensure the first one has finished with the correct number of workloads.
        exp.wait_for_experiment_state(experiment_id, determinedexperimentv1State.STATE_COMPLETED)
        trials = exp.experiment_trials(experiment_id)
        assert len(trials) == 1
        assert len(trials[0].workloads) == 7

        # Ensure the slot is empty.
        slots = _fetch_slots()
        assert len(slots) == 1
        assert slots[0]["enabled"] is False
        assert slots[0]["draining"] is True
        assert slots[0]["allocation_id"] == "FREE"

        # Check agent state.
        command = ["det", "-m", conf.make_master_url(), "agent", "list", "--json"]
        output = subprocess.check_output(command).decode()
        agent_data = cast(List[Dict[str, Any]], json.loads(output))[0]
        assert agent_data["id"] == agent_id
        assert agent_data["enabled"] is False
        assert agent_data["draining"] is True

        exp.cancel_single(experiment_id_no_start)

def test_allocation_resources_incremental_release() -> None:
    """
    Start a two-container experiment and ensure one container exits before the
    other. Ensure its resources are released and schedulable without the other
    container needing to be released.
    """
    cleanup_exp_ids = []

    try:
        slots = _wait_for_slots(2)
        assert len(slots) == 2

        with tempfile.TemporaryDirectory() as context_dir, open(
            os.path.join(context_dir, "const.yaml"), "w"
        ) as config_file:
            # Launch an experiment that has one resource (docker container) that exits immediately.
            config_obj = conf.load_config(conf.fixtures_path("no_op/single.yaml"))
            config_obj["resources"] = {**config_obj.get("resources", {}), "slots": 2}
            config_obj["hyperparameters"] = {
                **config_obj.get("hyperparameters", {}),
                "non_chief_exit_immediately": True,
            }
            yaml.dump(config_obj, config_file)

            shutil.copy(
                conf.fixtures_path("no_op/model_def.py"),
                os.path.join(context_dir, "model_def.py"),
            )

            exp_id = exp.create_experiment(config_file.name, context_dir, None)
            cleanup_exp_ids.append(exp_id)

        # Wait for the experiment to start and run for a bit.
        exp.wait_for_experiment_state(exp_id, determinedexperimentv1State.STATE_ACTIVE)
        exp.wait_for_experiment_active_workload(exp_id)

        # And wait for exactly one of the resources to free, while one is still in use.
        confirmations = 0
        for _ in range(RANK_ONE_WAIT_TIME):
            free_agents = list_free_agents()
            if len(free_agents) == 1:
                confirmations += 1

            if confirmations == 2:
                # Just for the race where one container has exited and the other hasn't
                # quite yet, but is going to, make sure we see it at least twice.
                break

            # Still waiting on partial exit.
            time.sleep(1)
        else:
            pytest.fail(
                "exactly one agent did not free after {} seconds".format(RANK_ONE_WAIT_TIME)
            )

        # Ensure we can schedule on the free slot, not only that the API says it's available.
        exp_id_2 = exp.create_experiment(
            conf.fixtures_path("no_op/single.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        cleanup_exp_ids.append(exp_id_2)

        exp.wait_for_experiment_workload_progress(exp_id_2)
        exp.wait_for_experiment_state(exp_id_2, determinedexperimentv1State.STATE_COMPLETED)
        cleanup_exp_ids = cleanup_exp_ids[:-1]

        # And check the hung experiment is still holding on to its hung slot.
        free_agents = list_free_agents()
        if len(free_agents) != 1:
            pytest.fail(f"should still have exactly one agent scheduled: {free_agents}")
    finally:
        for exp_id in cleanup_exp_ids:
            bindings.post_KillExperiment(determined_test_session(), id=exp_id)
            exp.wait_for_experiment_state(exp_id, determinedexperimentv1State.STATE_CANCELED)
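
# `list_free_agents` is used above but defined elsewhere in the real module.
# A hedged sketch using the generated REST bindings: an agent counts as "free"
# when it is running no containers. The exact bindings call and field names
# are assumptions.
def list_free_agents() -> List[bindings.v1Agent]:
    agents = bindings.get_GetAgents(determined_test_session())
    if not agents.agents:
        pytest.fail("no agents registered with the master")
    return [agent for agent in agents.agents if not agent.containers]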