def test_noop_pause_of_experiment_without_trials() -> None:
    """
    Pause and then re-activate a no-op experiment that can never get a trial.

    The experiment requests far more slots per trial than are expected to be
    available, so it should sit in ACTIVE without being scheduled.
    """
    experiment_config = conf.load_config(
        conf.fixtures_path("no_op/single-one-short-step.yaml")
    )
    # Slot request chosen so that the trial cannot be placed.
    unsatisfiable_slots = 100
    experiment_config["max_restarts"] = 0
    experiment_config["resources"] = {"slots_per_trial": unsatisfiable_slots}

    with tempfile.NamedTemporaryFile() as tmp:
        # Re-open by name in text mode; closing the handle flushes the YAML
        # to disk before the experiment is created from it.
        with open(tmp.name, "w") as handle:
            yaml.dump(experiment_config, handle)
        experiment_id = exp.create_experiment(
            tmp.name,
            conf.fixtures_path("no_op"),
            None,
        )

    exp.pause_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "PAUSED")

    exp.activate_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "ACTIVE")

    # The experiment must remain ACTIVE across five checks, one second apart.
    for _ in range(5):
        current_state = exp.experiment_state(experiment_id)
        assert current_state == "ACTIVE"
        time.sleep(1)

    exp.cancel_single(experiment_id)
def test_cancel_one_experiment() -> None:
    """Create a single long-running no-op experiment and cancel it right away."""
    config_path = conf.fixtures_path("no_op/single-many-long-steps.yaml")
    model_dir = conf.fixtures_path("no_op")

    experiment_id = exp.create_experiment(config_path, model_dir)
    exp.cancel_single(experiment_id)
def test_cancel_ten_experiments() -> None:
    """Create ten long-running no-op experiments, then cancel each in turn."""
    # All ten experiments are created before the first cancellation is issued.
    created_ids = [
        exp.create_experiment(
            conf.fixtures_path("no_op/single-many-long-steps.yaml"),
            conf.fixtures_path("no_op"),
        )
        for _ in range(10)
    ]

    for created_id in created_ids:
        exp.cancel_single(created_id)
def test_noop_experiment_config_override() -> None:
    """
    Create a no-op experiment with a ``--config`` override for the experiment
    seed and check that the experiment's reported config carries that seed.
    """
    base_config = conf.load_config(
        conf.fixtures_path("no_op/single-one-short-step.yaml")
    )
    override_args = ["--config", "reproducibility.experiment_seed=8200"]

    with tempfile.NamedTemporaryFile() as tmp:
        # Re-open by name in text mode; closing the handle flushes the YAML
        # to disk before the experiment is created from it.
        with open(tmp.name, "w") as handle:
            yaml.dump(base_config, handle)
        experiment_id = exp.create_experiment(
            tmp.name,
            conf.fixtures_path("no_op"),
            override_args,
        )

    reported_config = exp.experiment_config_json(experiment_id)
    reported_seed = reported_config["reproducibility"]["experiment_seed"]
    assert reported_seed == 8200

    exp.cancel_single(experiment_id)
def test_cancel_one_active_experiment_ready() -> None:
    """
    Cancel the mnist_pytorch tutorial experiment once it has completed a
    workload, then assert that a final checkpoint was performed.
    """
    config_path = conf.tutorials_path("mnist_pytorch/const.yaml")
    model_dir = conf.tutorials_path("mnist_pytorch")
    experiment_id = exp.create_experiment(config_path, model_dir)

    # Poll once per second until a workload has completed.
    # NOTE(review): this wait has no upper bound; if no workload ever
    # completes, the test only stops via an external timeout.
    while not exp.experiment_has_completed_workload(experiment_id):
        time.sleep(1)

    exp.cancel_single(experiment_id, should_have_trial=True)
    exp.assert_performed_final_checkpoint(experiment_id)
def test_cancel_one_active_experiment() -> None:
    """
    Cancel a long-running no-op experiment once it has an active workload.

    Raises AssertionError if no workload becomes active within 15 polls
    spaced one second apart.
    """
    config_path = conf.fixtures_path("no_op/single-many-long-steps.yaml")
    model_dir = conf.fixtures_path("no_op")
    experiment_id = exp.create_experiment(config_path, model_dir)

    workload_active = False
    for _ in range(15):
        workload_active = exp.experiment_has_active_workload(experiment_id)
        if workload_active:
            break
        time.sleep(1)

    if not workload_active:
        raise AssertionError("no workload active after 15 seconds")

    exp.cancel_single(experiment_id, should_have_trial=True)
def test_drain_agent() -> None:
    """
    Start an experiment, `disable --drain` the agent once the trial is running,
    make sure the experiment still finishes, but the new ones won't schedule.

    The test expects exactly one slot to be reported by `_fetch_slots()`.
    """
    slots = _fetch_slots()
    assert len(slots) == 1
    agent_id = slots[0]["agent_id"]

    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    exp.wait_for_experiment_state(experiment_id, determinedexperimentv1State.STATE_ACTIVE)
    exp.wait_for_experiment_active_workload(experiment_id)
    exp.wait_for_experiment_workload_progress(experiment_id)

    # Disable and quickly enable it back.
    with _disable_agent(agent_id, drain=True):
        pass

    # Try to launch another experiment. It shouldn't get scheduled because the
    # slot is still busy with the first experiment.
    experiment_id_no_start = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    time.sleep(5)
    exp.wait_for_experiment_state(experiment_id_no_start, determinedexperimentv1State.STATE_ACTIVE)

    with _disable_agent(agent_id, drain=True):
        # Check for 15 seconds it doesn't get scheduled into the same slot.
        # Sleep between polls so the check really spans ~15 seconds instead of
        # issuing 15 back-to-back requests.
        for _ in range(15):
            trials = exp.experiment_trials(experiment_id_no_start)
            assert len(trials) == 0
            time.sleep(1)

        # Ensure the first one has finished with the correct number of workloads.
        exp.wait_for_experiment_state(experiment_id, determinedexperimentv1State.STATE_COMPLETED)
        trials = exp.experiment_trials(experiment_id)
        assert len(trials) == 1
        assert len(trials[0].workloads) == 7

        # Ensure the slot is empty.
        slots = _fetch_slots()
        assert len(slots) == 1
        assert slots[0]["enabled"] is False
        assert slots[0]["draining"] is True
        assert slots[0]["allocation_id"] == "FREE"

        # Check agent state.
        command = ["det", "-m", conf.make_master_url(), "agent", "list", "--json"]
        output = subprocess.check_output(command).decode()
        # Only the first agent in the CLI's JSON list is inspected.
        agent_data = cast(List[Dict[str, Any]], json.loads(output))[0]
        assert agent_data["id"] == agent_id
        assert agent_data["enabled"] is False
        assert agent_data["draining"] is True

        # Cancel the never-started experiment while still inside the
        # `_disable_agent` block (presumably the agent is re-enabled on exit,
        # per the "enable it back" step above — confirm against the helper).
        exp.cancel_single(experiment_id_no_start)