def get_cfg(self):
    """Build a ``RunConfig`` holding one value of each primitive kind.

    Returns:
        A config with a str (``run_as``), an int (``cluster_id``),
        a float (``priority``) and a bool (``preemtible``) entry.
    """
    entries = (
        ("run_as", "root"),
        ("cluster_id", 123),
        ("priority", 0.5),
        # key spelling is part of the stored data; keep it verbatim
        ("preemtible", True),
    )
    run_cfg = RunConfig()
    for key, value in entries:
        run_cfg.set(key, value)
    return run_cfg
def test_role_preproc_called(self):
    """A dryrun submission calls ``pre_proc`` exactly once on the app's single role."""
    run_cfg = RunConfig()
    run_cfg.set("foo", "bar")

    app = MagicMock()
    app.roles = [MagicMock()]

    scheduler = SchedulerTest.MockScheduler("test_session")
    scheduler.submit_dryrun(app, run_cfg)

    # look the role up after the call, exactly as the assertion target
    app.roles[0].pre_proc.assert_called_once()
def test_invalid_dryrun_cfg(self):
    """``submit_dryrun`` must raise ``InvalidRunConfigException`` for bad configs.

    Two configs are checked: one with nothing set, and one where ``foo``
    is set to an int.
    """
    scheduler_mock = SchedulerTest.MockScheduler("test_session")
    app_mock = MagicMock()

    # Configs are built outside the assertRaises blocks so that only the
    # submit_dryrun() call can satisfy the expected exception.
    empty_cfg = RunConfig()
    with self.assertRaises(InvalidRunConfigException):
        scheduler_mock.submit_dryrun(app_mock, empty_cfg)

    bad_type_cfg = RunConfig()
    bad_type_cfg.set("foo", 100)
    with self.assertRaises(InvalidRunConfigException):
        # submit the config under test (previously empty_cfg was submitted
        # again, so the bad-type case was never exercised)
        scheduler_mock.submit_dryrun(app_mock, bad_type_cfg)
def test_submit(self):
    """Submit a succeeding app, then a failing one, and check ids and final states."""
    # make sure the macro substitution works
    # touch a file called {app_id}_{replica_id} in the img_root directory (self.test_dir)
    replica_count = 2
    touched_name = f"{macros.app_id}_{macros.replica_id}"
    touch_role = (
        Role("role1")
        .runs("touch.sh", join(f"{macros.img_root}", touched_name))
        .on(self.test_container)
        .replicas(replica_count)
    )
    app = Application(name="test_app").of(touch_role)
    expected_app_id = make_unique(app.name)
    with patch(LOCAL_SCHEDULER_MAKE_UNIQUE, return_value=expected_app_id):
        cfg = RunConfig({"log_dir": self.test_dir})
        app_id = self.scheduler.submit(app, cfg)

    self.assertEqual(f"{expected_app_id}", app_id)
    self.assertEqual(AppState.SUCCEEDED, self.wait(app_id).state)

    # one touched file per replica, named after the substituted macros
    for replica_id in range(replica_count):
        touched_path = join(self.test_dir, f"{expected_app_id}_{replica_id}")
        self.assertTrue(os.path.isfile(touched_path))

    # second round: same app name, but the entrypoint exits non-zero
    failing_role = Role("role1").runs("fail.sh").on(self.test_container).replicas(2)
    app = Application(name="test_app").of(failing_role)
    expected_app_id = make_unique(app.name)
    with patch(LOCAL_SCHEDULER_MAKE_UNIQUE, return_value=expected_app_id):
        app_id = self.scheduler.submit(app, cfg)

    self.assertEqual(f"{expected_app_id}", app_id)
    self.assertEqual(AppState.FAILED, self.wait(app_id).state)
def test_log_iterator(self):
    """Iterate the logs of ``echo_range.sh`` plainly, with time bounds, and with a regex."""
    echo_role = (
        Role("role1")
        .runs("echo_range.sh", "10", "0.5")
        .on(self.test_container)
        .replicas(1)
    )
    cfg = RunConfig({"log_dir": join(self.test_dir, "log")})
    app_id = self.scheduler.submit(Application(name="test_app").of(echo_role), cfg)

    plain_lines = self.scheduler.log_iter(app_id, "role1", k=0)
    for idx, text in enumerate(plain_lines):
        self.assertEqual(str(idx), text)

    # since and until ignored
    bounded_lines = self.scheduler.log_iter(
        app_id, "role1", k=0, since=datetime.now(), until=datetime.now()
    )
    for idx, text in enumerate(bounded_lines):
        self.assertEqual(str(idx), text)

    # the regex keeps only the even digits, so the n-th match is 2*n
    even_lines = self.scheduler.log_iter(app_id, "role1", k=0, regex=r"[02468]")
    for idx, text in enumerate(even_lines):
        self.assertEqual(str(idx * 2), text)
def test_submit_dryrun_with_log_dir(self, img_fetcher_fetch_mock):
    """With ``log_dir`` set, each replica's dryrun entry carries stdout/stderr paths."""
    trainer = Role("trainer").runs("trainer.par").on(self.test_container).replicas(2)
    app = Application(name="test_app").of(trainer)

    info = self.scheduler.submit_dryrun(app, RunConfig({"log_dir": "/tmp"}))
    print(info)

    trainer_info = info.request[0]["trainer"]
    self.assertEqual(2, len(trainer_info))

    for replica_id in range(2):
        replica_dir = (
            f"/tmp/{self.scheduler.session_name}/test_app_##/trainer/{replica_id}"
        )
        expected = {
            "args": ["trainer.par"],
            "env": {},
            "stdout": f"{replica_dir}/stdout.log",
            "stderr": f"{replica_dir}/stderr.log",
        }
        self.assertEqual(expected, trainer_info[replica_id])
def test_submit_dryrun(self, img_fetcher_fetch_mock):
    """Dryrun of a master + 2-replica trainer app yields per-replica args and env."""
    master_role = Role("master").runs("master.par", "arg1", ENV_VAR_1="VAL1").on(
        self.test_container
    )
    trainer_role = (
        Role("trainer").runs("trainer.par").on(self.test_container).replicas(2)
    )
    app = Application(name="test_app").of(master_role, trainer_role)

    info = self.scheduler.submit_dryrun(app, RunConfig())
    print(info)

    # one request entry per role, in declaration order
    self.assertEqual(2, len(info.request))
    master_replicas = info.request[0]["master"]
    trainer_replicas = info.request[1]["trainer"]
    self.assertEqual(1, len(master_replicas))
    self.assertEqual(2, len(trainer_replicas))

    expected_master = {
        "args": ["master.par", "arg1"],
        "env": {"ENV_VAR_1": "VAL1"},
    }
    self.assertEqual(expected_master, master_replicas[0])

    for replica_id in (0, 1):
        self.assertEqual(
            {"args": ["trainer.par"], "env": {}}, trainer_replicas[replica_id]
        )
def test_cache_evict(self):
    """With ``cache_size=1``, the first finished app is gone after a second one is submitted."""
    scheduler = LocalScheduler(session_name="test_session", cache_size=1)
    cfg = RunConfig({"log_dir": self.test_dir})

    first_file = join(self.test_dir, "test_file_1")
    first_role = Role("role1").runs("touch.sh", first_file).on(self.test_container)
    first_app = Application(name="touch_test_file1").of(first_role)

    second_file = join(self.test_dir, "test_file_2")
    second_role = Role("role2").runs("touch.sh", second_file).on(self.test_container)
    second_app = Application(name="touch_test_file2").of(second_role)

    first_id = scheduler.submit(first_app, cfg)
    self.assertEqual(AppState.SUCCEEDED, self.wait(first_id, scheduler).state)

    second_id = scheduler.submit(second_app, cfg)
    self.assertEqual(AppState.SUCCEEDED, self.wait(second_id, scheduler).state)

    # app1 should've been evicted
    self.assertIsNone(scheduler.describe(first_id))
    self.assertIsNone(self.wait(first_id, scheduler))

    self.assertIsNotNone(scheduler.describe(second_id))
    self.assertIsNotNone(self.wait(second_id, scheduler))
def _get_app_log_dir(self, app_id: str, cfg: RunConfig) -> Optional[str]:
    """Return the log directory for ``app_id``, or ``None`` when no ``log_dir`` is configured.

    Args:
        app_id: id of the application; used as the last path component.
        cfg: run config that is queried for the ``log_dir`` entry.

    Returns:
        ``<log_dir>/<session_name>/<app_id>`` when ``cfg`` has a truthy
        ``log_dir``; otherwise ``None``.
    """
    # pyre-ignore [6]: type check already done by runopt.resolve
    base_dir: str = cfg.get("log_dir")
    if not base_dir:
        return None
    return os.path.join(base_dir, self.session_name, app_id)
def test_submit_with_log_dir_stdout(self):
    """For stdout and stderr alike, check the SUCCESS file and each replica's captured output."""
    num_replicas = 2

    for std_stream in ["stdout", "stderr"]:
        with self.subTest(std_stream=std_stream):
            log_dir = join(self.test_dir, f"test_{std_stream}_log")
            echo_role = (
                Role("role1")
                .runs(f"echo_{std_stream}.sh", "hello_world")
                .on(self.test_container)
                .replicas(num_replicas)
            )
            app = Application(name="test_app").of(echo_role)
            app_id = self.scheduler.submit(app, RunConfig({"log_dir": log_dir}))
            self.wait(app_id)

            # the SUCCESS marker is a JSON document describing the finished run
            success_file = join(
                log_dir, self.scheduler.session_name, app_id, "SUCCESS"
            )
            with open(success_file, "r") as f:
                sf_json = json.load(f)

            self.assertEqual(app_id, sf_json["app_id"])
            self.assertEqual(
                join(log_dir, self.scheduler.session_name, app_id),
                sf_json["log_dir"],
            )
            self.assertEqual(AppState.SUCCEEDED.name, sf_json["final_state"])

            replica_infos = sf_json["roles"]["role1"]
            for replica_id in range(num_replicas):
                self._assert_file_content(
                    replica_infos[replica_id][std_stream], "hello_world\n"
                )
def test_submit_dryrun_with_log_dir_cfg(self, img_fetcher_fetch_mock):
    """With ``log_dir`` set, dryrun reports log paths per replica without creating them."""
    trainer = Role("trainer").runs("trainer.par").on(self.test_container).replicas(2)
    app = Application(name="test_app").of(trainer)

    info = self.scheduler.submit_dryrun(app, RunConfig({"log_dir": self.test_dir}))
    print(info)

    self.assertEqual(2, len(info.request[0]["trainer"]))

    app_log_dir = join(self.test_dir, self.scheduler.session_name, "test_app_##")

    for role_idx, role in enumerate(app.roles):
        role_name = role.name
        role_info = info.request[role_idx][role_name]
        for replica_id in range(role.num_replicas):
            replica_log_dir = join(app_log_dir, role_name, str(replica_id))

            # dryrun should NOT create any directories
            self.assertFalse(os.path.isdir(replica_log_dir))

            # role.env is unpacked last so it wins over the error-file entry
            expected_env = {
                ERR_FILE_ENV: join(replica_log_dir, "error.json"),
                **role.env,
            }
            expected_replica = {
                "args": [role.entrypoint, *role.args],
                "env": expected_env,
                "stdout": join(replica_log_dir, "stdout.log"),
                "stderr": join(replica_log_dir, "stderr.log"),
            }
            self.assertEqual(expected_replica, role_info[replica_id])
def test_submit_inherit_parent_envs(self):
    """Every log line of ``echo_env_foo.sh`` must be ``bar`` and the app must succeed."""
    echo_role = Role("echo_foo").runs("echo_env_foo.sh").on(self.test_container)
    app = Application(name="check_foo_env_var").of(echo_role)
    cfg = RunConfig({"log_dir": self.test_dir})

    app_id = self.scheduler.submit(app, cfg)

    log_lines = self.scheduler.log_iter(app_id, "echo_foo")
    for text in log_lines:
        self.assertEqual("bar", text)

    final_desc = self.wait(app_id, self.scheduler)
    self.assertEqual(AppState.SUCCEEDED, final_desc.state)
def test_log_iterator_no_log_dir(self):
    """``log_iter`` must raise ``RuntimeError`` for an app submitted without ``log_dir``."""
    role = (
        Role("role1")
        .runs("echo_range.sh", "10", "0.5")
        .on(self.test_container)
        .replicas(1)
    )
    app = Application(name="test_app").of(role)

    # Submit outside the assertRaises block: a RuntimeError raised by
    # submit() must surface as a test error instead of silently satisfying
    # the assertion that is meant for log_iter().
    app_id = self.scheduler.submit(app, RunConfig())

    # ``msg`` is only the text shown if the assertion fails; it is not
    # matched against the raised exception.
    with self.assertRaises(RuntimeError, msg="log_dir must be set to iterate logs"):
        self.scheduler.log_iter(app_id, "role1", k=0)
def test_exists(self):
    """``exists`` is true right after submit and stays true after the app is cancelled."""
    sleeper = Role("role1").runs("sleep.sh", "10").on(self.test_container).replicas(1)
    app = Application(name="test_app").of(sleeper)

    app_id = self.scheduler.submit(app, RunConfig({"log_dir": self.test_dir}))
    self.assertTrue(self.scheduler.exists(app_id))

    self.scheduler.cancel(app_id)
    self.assertTrue(self.scheduler.exists(app_id))
def test_cache_full(self):
    """A second submit to a scheduler built with ``cache_size=1`` raises ``IndexError``."""
    cfg = RunConfig({"log_dir": self.test_dir})
    sleeper = Role("role1").runs("sleep.sh", "10").on(self.test_container).replicas(1)
    app = Application(name="test_app").of(sleeper)

    scheduler = LocalScheduler(session_name="test_session", cache_size=1)
    scheduler.submit(app, cfg)

    with self.assertRaises(IndexError):
        scheduler.submit(app, cfg)
def test_describe(self):
    """``describe`` returns ``None`` for an unknown id and RUNNING right after submit."""
    sleeper = Role("role1").runs("sleep.sh", "2").on(self.test_container).replicas(1)
    app = Application(name="test_app").of(sleeper)
    cfg = RunConfig({"log_dir": self.test_dir})

    # nothing has been submitted under this id yet
    self.assertIsNone(self.scheduler.describe("test_app_0"))

    app_id = self.scheduler.submit(app, cfg)
    running_desc = self.scheduler.describe(app_id)
    self.assertEqual(AppState.RUNNING, running_desc.state)

    self.assertEqual(AppState.SUCCEEDED, self.wait(app_id).state)
def test_runopts_resolve_override(self):
    """Values set explicitly on the config are the ones returned after ``resolve``."""
    explicit = {
        "run_as": "foobar",
        "priority": 20,
        "cluster_id": "test_cluster",
    }
    opts = self.get_runopts()
    cfg = RunConfig()
    for key, value in explicit.items():
        cfg.set(key, value)

    resolved = opts.resolve(cfg)

    self.assertEqual("foobar", resolved.get("run_as"))
    self.assertEqual(20, resolved.get("priority"))
    self.assertEqual("test_cluster", resolved.get("cluster_id"))
def test_cancel(self):
    """A running app reports CANCELLED once ``cancel`` has been called on it."""
    sleeper = Role("role1").runs("sleep.sh", "10").on(self.test_container).replicas(1)
    app = Application(name="test_app").of(sleeper)

    app_id = self.scheduler.submit(app, RunConfig())
    before = self.scheduler.describe(app_id)
    self.assertEqual(AppState.RUNNING, before.state)

    self.scheduler.cancel(app_id)
    after = self.scheduler.describe(app_id)
    self.assertEqual(AppState.CANCELLED, after.state)
def test_schedule_fail(self, record_tsm_mock):
    """When ``_schedule`` raises, ``schedule`` propagates the error and the recorder is called."""
    dryrun_info = AppDryRunInfo("test", lambda x: "test")
    dryrun_info._scheduler = "default"
    dryrun_info._cfg = RunConfig({"image_fetcher": "dir"})

    session = DummySession("test_session")

    with self.assertRaises(RuntimeError):
        with patch.object(session, "_schedule") as schedule_mock:
            schedule_mock.side_effect = RuntimeError("test error")
            session.schedule(dryrun_info)

    # checked after the failure has propagated out of both context managers
    record_tsm_mock.assert_called()
def test_get_schedulers(self):
    """Running with ``scheduler="local"`` must dispatch to the "local" scheduler mock."""
    default_sched_mock = MagicMock()
    local_sched_mock = MagicMock()
    schedulers = {"default": default_sched_mock, "local": local_sched_mock}
    session = StandaloneSession(name="test_session", schedulers=schedulers)

    role = Role(name="sleep").runs("sleep.sh", "60").on(self.test_container)
    app = Application("sleeper").of(role)
    cfg = RunConfig()
    session.run(app, scheduler="local", cfg=cfg)

    # ``called_once_with`` is not an assertion: on a MagicMock it only
    # resolved to an auto-created attribute, so nothing was verified (and
    # newer Python versions reject the misspelled name outright).
    # NOTE(review): this assumes session.run() forwards to
    # scheduler.submit(app, cfg) -- confirm against the session
    # implementation and retarget the assertion if it dispatches through a
    # different scheduler method.
    local_sched_mock.submit.assert_called_once_with(app, cfg)
def _get_img_fetcher(self, cfg: RunConfig) -> ImageFetcher:
    """Look up the image fetcher selected by the ``image_fetcher`` entry of ``cfg``.

    Args:
        cfg: run config queried for ``image_fetcher``.

    Returns:
        The entry of ``self._img_fetchers()`` registered under that type.

    Raises:
        InvalidRunConfigException: if no (truthy) fetcher is registered
            under the configured type.
    """
    fetcher_type = cfg.get("image_fetcher")
    available = self._img_fetchers()
    # pyre-ignore [6]: type check already done by runopt.resolve
    selected = available.get(fetcher_type, None)
    if selected:
        return selected

    raise InvalidRunConfigException(
        f"Unsupported image fetcher type: {fetcher_type}. Must be one of: {available.keys()}",
        cfg,
        self.run_opts(),
    )
def setUp(self):
    """Create a temp dir with helper shell scripts, a local scheduler, config and container."""
    # the first positional argument of mkdtemp is the suffix; spell it out
    self.test_dir = tempfile.mkdtemp(suffix="StandaloneSessionTest")

    helper_scripts = (
        ("touch.sh", ["touch $1"]),
        ("fail.sh", ["exit 1"]),
        ("sleep.sh", ["sleep $1"]),
    )
    for script_name, script_lines in helper_scripts:
        write_shell_script(self.test_dir, script_name, script_lines)

    self.scheduler = LocalScheduler(SESSION_NAME)
    self.cfg = RunConfig({"image_fetcher": "dir"})

    # resource ignored for local scheduler; adding as an example
    self.test_container = Container(image=self.test_dir).require(resource.SMALL)
def test_submit_multiple_roles(self):
    """An app with two ``touch.sh`` roles succeeds and both target files exist afterwards."""
    first_file = join(self.test_dir, "test_file_1")
    second_file = join(self.test_dir, "test_file_2")

    first_role = (
        Role("role1").runs("touch.sh", first_file).on(self.test_container).replicas(1)
    )
    second_role = (
        Role("role2").runs("touch.sh", second_file).on(self.test_container).replicas(1)
    )
    app = Application(name="test_app").of(first_role, second_role)

    app_id = self.scheduler.submit(app, RunConfig({"log_dir": self.test_dir}))

    self.assertEqual(AppState.SUCCEEDED, self.wait(app_id).state)
    for touched in (first_file, second_file):
        self.assertTrue(os.path.isfile(touched))
def test_runopts_resolve_bad_type(self):
    """``resolve`` raises ``InvalidRunConfigException`` when ``cluster_id`` is set to an int."""
    cfg = RunConfig()
    cfg.set("run_as", "foobar")
    cfg.set("cluster_id", 123)

    runopts = self.get_runopts()
    with self.assertRaises(InvalidRunConfigException):
        runopts.resolve(cfg)
def test_runopts_resolve_missing_required(self):
    """``resolve`` raises ``InvalidRunConfigException`` for a config that omits ``run_as``."""
    cfg = RunConfig()
    for key, value in (("priority", 20), ("cluster_id", "test_cluster")):
        cfg.set(key, value)

    runopts = self.get_runopts()
    with self.assertRaises(InvalidRunConfigException):
        runopts.resolve(cfg)
def test_schedule_success(self, record_tsm_mock):
    """A successful ``schedule`` records a "schedule" event carrying the app id and run config."""
    cfg = RunConfig({"image_fetcher": "dir"})
    dryrun_info = AppDryRunInfo("test", lambda x: "test")
    dryrun_info._scheduler = "default"
    dryrun_info._cfg = cfg

    session = DummySession("test_session")
    app_handle = session.schedule(dryrun_info)

    recorded_event = record_tsm_mock.call_args[0][0]  # first arg
    _, _, app_id = parse_app_handle(app_handle)

    expected_event = session._generate_tsm_event(
        "schedule", "default", app_id, runcfg=json.dumps(cfg.cfgs)
    )
    self.assert_tsm_event(expected_event, recorded_event)
def test_submit_dryrun_without_log_dir_cfg(self, img_fetcher_fetch_mock):
    """Without ``log_dir``, dryrun params carry no stdout/stderr and create no directories."""
    master_role = Role("master").runs("master.par", "arg1", ENV_VAR_1="VAL1").on(
        self.test_container
    )
    trainer_role = (
        Role("trainer").runs("trainer.par").on(self.test_container).replicas(2)
    )
    app = Application(name="test_app").of(master_role, trainer_role)

    info = self.scheduler.submit_dryrun(app, RunConfig())
    # intentional print (to make sure it actually prints with no errors)
    print(info)

    request = info.request
    role_params = request.role_params
    role_log_dirs = request.role_log_dirs
    self.assertEqual(2, len(role_params))
    self.assertEqual(2, len(role_log_dirs))

    master_params = role_params["master"]
    trainer_params = role_params["trainer"]
    app_log_dir = request.log_dir
    self.assertEqual(1, len(master_params))
    self.assertEqual(2, len(trainer_params))

    for role in app.roles:
        params_of_role = role_params[role.name]
        log_dirs_of_role = role_log_dirs[role.name]

        for replica_id in range(role.num_replicas):
            params = params_of_role[replica_id]
            replica_log_dir = log_dirs_of_role[replica_id]

            # dryrun should NOT create any directories
            self.assertFalse(os.path.isdir(replica_log_dir))
            self.assertTrue(replica_log_dir.startswith(app_log_dir))

            self.assertEqual([role.entrypoint, *role.args], params.args)

            # role.env is unpacked last so it wins over the error-file entry
            expected_env = {
                ERR_FILE_ENV: join(replica_log_dir, "error.json"),
                **role.env,
            }
            self.assertEqual(expected_env, params.env)

            self.assertIsNone(params.stdout)
            self.assertIsNone(params.stderr)
def _get_app_log_dir(self, app_id: str, cfg: RunConfig) -> Tuple[str, bool]:
    """
    Computes the app's log dir together with a ``should_redirect_std`` flag.

    stdout/stderr are redirected to log files ONLY when the user supplied
    ``log_dir`` in the cfg:

    1. ``cfg.get("log_dir")`` is set   -> (dir under the user's log dir, True)
    2. ``cfg.get("log_dir")`` is unset -> (dir under a fresh ``tsm_`` temp dir, False)

    In both cases the returned path is ``<base>/<session_name>/<app_id>``.
    """
    user_log_dir = cfg.get("log_dir")
    if user_log_dir:
        base_dir, should_redirect = user_log_dir, True
    else:
        # no user-provided location: fall back to a newly created temp dir
        base_dir, should_redirect = tempfile.mkdtemp(prefix="tsm_"), False

    app_log_dir = os.path.join(str(base_dir), self.session_name, app_id)
    return app_log_dir, should_redirect
def test_runopts_resolve_minimal(self):
    """Resolving a config with only ``run_as`` fills in defaults without mutating the input."""
    cfg = RunConfig()
    cfg.set("run_as", "foobar")

    resolved = self.get_runopts().resolve(cfg)

    # resolved view: explicit value kept, priority resolved to 10, cluster_id unset
    self.assertEqual("foobar", resolved.get("run_as"))
    self.assertEqual(10, resolved.get("priority"))
    self.assertIsNone(resolved.get("cluster_id"))

    # make sure original config is untouched
    self.assertEqual("foobar", cfg.get("run_as"))
    for untouched_key in ("priority", "cluster_id"):
        self.assertIsNone(cfg.get(untouched_key))
def test_serde(self):
    """
    Round-trips the config through ``dataclasses.asdict`` and back into a
    ``RunConfig``, then checks every value survived.
    """
    original = self.get_cfg()
    as_dict = dataclasses.asdict(original)
    restored = RunConfig(**as_dict)

    self.assertEqual("root", restored.get("run_as"))
    self.assertEqual(123, restored.get("cluster_id"))
    self.assertEqual(0.5, restored.get("priority"))
    self.assertTrue(restored.get("preemtible"))