def test_submit(self):
    """Submit a passing app and then a failing app; check ids and final states.

    The first app also exercises macro substitution: each replica touches a
    file called ``{app_id}_{replica_id}`` in the img_root directory
    (``self.test_dir``) and the test verifies those files exist afterwards.
    """
    replica_count = 2
    touched_name = f"{macros.app_id}_{macros.replica_id}"
    touch_role = Role("role1").runs(
        "touch.sh", join(f"{macros.img_root}", touched_name)
    )
    touch_role = touch_role.on(self.test_container).replicas(replica_count)
    passing_app = Application(name="test_app").of(touch_role)

    # Patch the target named by LOCAL_SCHEDULER_MAKE_UNIQUE to return a known
    # id so the id returned by submit() can be asserted exactly.
    expected_app_id = make_unique(passing_app.name)
    with patch(LOCAL_SCHEDULER_MAKE_UNIQUE, return_value=expected_app_id):
        cfg = RunConfig({"log_dir": self.test_dir})
        app_id = self.scheduler.submit(passing_app, cfg)

    self.assertEqual(f"{expected_app_id}", app_id)
    passing_result = self.wait(app_id)
    self.assertEqual(AppState.SUCCEEDED, passing_result.state)

    # One touched file per replica is expected.
    for replica_id in range(replica_count):
        touched_path = join(self.test_dir, f"{expected_app_id}_{replica_id}")
        self.assertTrue(os.path.isfile(touched_path))

    # Second half: an app whose entrypoint is fail.sh must end up FAILED.
    fail_role = Role("role1").runs("fail.sh")
    fail_role = fail_role.on(self.test_container).replicas(2)
    failing_app = Application(name="test_app").of(fail_role)

    expected_app_id = make_unique(failing_app.name)
    with patch(LOCAL_SCHEDULER_MAKE_UNIQUE, return_value=expected_app_id):
        app_id = self.scheduler.submit(failing_app, cfg)

    self.assertEqual(f"{expected_app_id}", app_id)
    failing_result = self.wait(app_id)
    self.assertEqual(AppState.FAILED, failing_result.state)
def test_submit(self):
    """Submit a passing and a failing app in HEADLESS mode; check ids and states.

    The first app also exercises macro substitution: each replica touches a
    file called ``{app_id}_{replica_id}`` in the img_root directory
    (``self.test_dir``) and the test verifies those files exist afterwards.
    """
    replica_count = 2
    touched_name = f"{macros.app_id}_{macros.replica_id}"
    touch_target = os.path.join(f"{macros.img_root}", touched_name)
    touch_role = Role("role1").runs("touch.sh", touch_target)
    touch_role = touch_role.on(self.test_container).replicas(replica_count)
    passing_app = Application(name="test_app").of(touch_role)

    app_id = self.scheduler.submit(passing_app, RunMode.HEADLESS)
    self.assertEqual("test_app_0", app_id)
    passing_result = self.scheduler.wait(app_id)
    self.assertEqual(AppState.SUCCEEDED, passing_result.state)

    # One touched file per replica is expected.
    for replica_id in range(replica_count):
        touched_path = os.path.join(self.test_dir, f"{app_id}_{replica_id}")
        self.assertTrue(os.path.isfile(touched_path))

    # Second submission under the same app name: fail.sh must end up FAILED.
    fail_role = Role("role1").runs("fail.sh")
    fail_role = fail_role.on(self.test_container).replicas(2)
    failing_app = Application(name="test_app").of(fail_role)

    app_id = self.scheduler.submit(failing_app, RunMode.HEADLESS)
    self.assertEqual("test_app_1", app_id)
    failing_result = self.scheduler.wait(app_id)
    self.assertEqual(AppState.FAILED, failing_result.state)
def test_cache_evict(self):
    """With cache_size=1, a second finished app evicts the first one."""
    small_cache_scheduler = LocalScheduler(session_name="test_session", cache_size=1)
    cfg = RunConfig({"log_dir": self.test_dir})

    first_file = join(self.test_dir, "test_file_1")
    second_file = join(self.test_dir, "test_file_2")
    first_role = Role("role1").runs("touch.sh", first_file).on(self.test_container)
    second_role = Role("role2").runs("touch.sh", second_file).on(self.test_container)
    first_app = Application(name="touch_test_file1").of(first_role)
    second_app = Application(name="touch_test_file2").of(second_role)

    # Run both apps to completion, one after the other.
    first_id = small_cache_scheduler.submit(first_app, cfg)
    first_result = self.wait(first_id, small_cache_scheduler)
    self.assertEqual(AppState.SUCCEEDED, first_result.state)

    second_id = small_cache_scheduler.submit(second_app, cfg)
    second_result = self.wait(second_id, small_cache_scheduler)
    self.assertEqual(AppState.SUCCEEDED, second_result.state)

    # app1 should've been evicted
    self.assertIsNone(small_cache_scheduler.describe(first_id))
    self.assertIsNone(self.wait(first_id, small_cache_scheduler))

    # app2 is still known to the scheduler
    self.assertIsNotNone(small_cache_scheduler.describe(second_id))
    self.assertIsNotNone(self.wait(second_id, small_cache_scheduler))
def test_submit_dryrun_with_log_dir(self, img_fetcher_fetch_mock):
    """A dryrun with ``log_dir`` set reports per-replica stdout/stderr paths.

    ``img_fetcher_fetch_mock`` is unused here; presumably it is injected by a
    patch decorator outside this view — TODO confirm.
    """
    trainer_role = Role("trainer").runs("trainer.par")
    trainer_role = trainer_role.on(self.test_container).replicas(2)
    app = Application(name="test_app").of(trainer_role)

    dryrun_info = self.scheduler.submit_dryrun(app, RunConfig({"log_dir": "/tmp"}))
    print(dryrun_info)

    replicas = dryrun_info.request[0]["trainer"]
    self.assertEqual(2, len(replicas))

    # Both replicas share the same shape; only the replica index in the log
    # paths differs.
    session = self.scheduler.session_name
    for replica_id in (0, 1):
        expected_request = {
            "args": ["trainer.par"],
            "env": {},
            "stdout": f"/tmp/{session}/test_app_##/trainer/{replica_id}/stdout.log",
            "stderr": f"/tmp/{session}/test_app_##/trainer/{replica_id}/stderr.log",
        }
        self.assertEqual(expected_request, replicas[replica_id])
def test_submit_dryrun_with_log_dir_cfg(self, img_fetcher_fetch_mock):
    """A dryrun reports log paths and the error-file env var without creating dirs.

    ``img_fetcher_fetch_mock`` is unused here; presumably it is injected by a
    patch decorator outside this view — TODO confirm.
    """
    trainer_role = Role("trainer").runs("trainer.par")
    trainer_role = trainer_role.on(self.test_container).replicas(2)
    app = Application(name="test_app").of(trainer_role)

    dryrun_info = self.scheduler.submit_dryrun(
        app, RunConfig({"log_dir": self.test_dir})
    )
    print(dryrun_info)

    self.assertEqual(2, len(dryrun_info.request[0]["trainer"]))

    app_log_dir = join(self.test_dir, self.scheduler.session_name, "test_app_##")
    for role_idx, role in enumerate(app.roles):
        replica_requests = dryrun_info.request[role_idx][role.name]
        for replica_id in range(role.num_replicas):
            replica_log_dir = join(app_log_dir, role.name, str(replica_id))

            # dryrun should NOT create any directories
            self.assertFalse(os.path.isdir(replica_log_dir))

            expected_env = {
                ERR_FILE_ENV: join(replica_log_dir, "error.json"),
                **role.env,
            }
            expected_request = {
                "args": [role.entrypoint, *role.args],
                "env": expected_env,
                "stdout": join(replica_log_dir, "stdout.log"),
                "stderr": join(replica_log_dir, "stderr.log"),
            }
            self.assertEqual(expected_request, replica_requests[replica_id])
def test_submit_dryrun(self, img_fetcher_fetch_mock):
    """A dryrun of a two-role app yields one request entry per role.

    ``img_fetcher_fetch_mock`` is unused here; presumably it is injected by a
    patch decorator outside this view — TODO confirm.
    """
    master_role = Role("master").runs("master.par", "arg1", ENV_VAR_1="VAL1")
    master_role = master_role.on(self.test_container)
    trainer_role = Role("trainer").runs("trainer.par")
    trainer_role = trainer_role.on(self.test_container).replicas(2)
    app = Application(name="test_app").of(master_role, trainer_role)

    # No log_dir in the config: the expected requests carry only args and env.
    dryrun_info = self.scheduler.submit_dryrun(app, RunConfig())
    print(dryrun_info)

    self.assertEqual(2, len(dryrun_info.request))
    master_replicas = dryrun_info.request[0]["master"]
    trainer_replicas = dryrun_info.request[1]["trainer"]
    self.assertEqual(1, len(master_replicas))
    self.assertEqual(2, len(trainer_replicas))

    expected_master = {
        "args": ["master.par", "arg1"],
        "env": {
            "ENV_VAR_1": "VAL1",
        },
    }
    self.assertEqual(expected_master, master_replicas[0])

    for replica_id in (0, 1):
        self.assertEqual(
            {"args": ["trainer.par"], "env": {}}, trainer_replicas[replica_id]
        )
def test_log_iterator(self):
    """Iterate the logs of an ``echo_range.sh`` app plainly, with since/until, and with a regex."""
    echo_role = Role("role1").runs("echo_range.sh", "10", "0.5")
    echo_role = echo_role.on(self.test_container).replicas(1)
    cfg = RunConfig({"log_dir": join(self.test_dir, "log")})
    app_id = self.scheduler.submit(Application(name="test_app").of(echo_role), cfg)

    # Plain iteration: line N is expected to be the string "N".
    plain_lines = self.scheduler.log_iter(app_id, "role1", k=0)
    for idx, log_line in enumerate(plain_lines):
        self.assertEqual(str(idx), log_line)

    # since and until ignored
    windowed_lines = self.scheduler.log_iter(
        app_id, "role1", k=0, since=datetime.now(), until=datetime.now()
    )
    for idx, log_line in enumerate(windowed_lines):
        self.assertEqual(str(idx), log_line)

    # With the regex filter, the N-th yielded line is expected to be "2N".
    filtered_lines = self.scheduler.log_iter(app_id, "role1", k=0, regex=r"[02468]")
    for idx, log_line in enumerate(filtered_lines):
        self.assertEqual(str(idx * 2), log_line)
def test_submit_with_log_dir_stdout(self):
    """For stdout and stderr alike, check the SUCCESS file and the captured stream.

    Each sub-test runs ``echo_<stream>.sh hello_world`` with two replicas,
    then reads the ``SUCCESS`` JSON file from the app's log directory and
    checks its fields and the content of each replica's stream file.
    """
    replica_count = 2
    for stream_name in ["stdout", "stderr"]:
        with self.subTest(std_stream=stream_name):
            log_dir = join(self.test_dir, f"test_{stream_name}_log")
            cfg = RunConfig({"log_dir": log_dir})

            echo_role = Role("role1").runs(f"echo_{stream_name}.sh", "hello_world")
            echo_role = echo_role.on(self.test_container).replicas(replica_count)
            app = Application(name="test_app").of(echo_role)

            app_id = self.scheduler.submit(app, cfg)
            self.wait(app_id)

            success_path = join(
                log_dir, self.scheduler.session_name, app_id, "SUCCESS"
            )
            with open(success_file := success_path, "r") as success_fp:
                summary = json.load(success_fp)

            self.assertEqual(app_id, summary["app_id"])
            self.assertEqual(
                join(log_dir, self.scheduler.session_name, app_id),
                summary["log_dir"],
            )
            self.assertEqual(AppState.SUCCEEDED.name, summary["final_state"])

            role_replicas = summary["roles"]["role1"]
            for replica_id in range(replica_count):
                stream_file = role_replicas[replica_id][stream_name]
                self._assert_file_content(stream_file, "hello_world\n")
def test_validate_no_resource(self):
    """Running an app whose container declares no resource raises ``ValueError``."""
    mock_session = self.MockSession()
    with self.assertRaises(ValueError):
        bare_container = Container("no resource")
        echo_role = Role("no resource").runs("echo", "hello_world")
        echo_role = echo_role.on(bare_container)
        mock_session.run(Application("no resource").of(echo_role))
def test_application(self):
    """An ``Application`` built with ``.of(role)`` exposes its name and that single role."""
    sleep_role = Role("trainer").runs("/bin/sleep", "10")
    sleep_role = sleep_role.on(Container(image="test_image")).replicas(2)
    built_app = Application(name="test_app").of(sleep_role)

    self.assertEqual("test_app", built_app.name)
    roles = built_app.roles
    self.assertEqual(1, len(roles))
    self.assertEqual(sleep_role, roles[0])
def test_evict_non_existent_app(self):
    """Apps evicted from the scheduler's cache also drop out of the session.

    Tests that apps previously run with this session that are finished and
    eventually removed by the scheduler also get removed from the session
    after a status() API has been called on the app.
    """
    scheduler = LocalScheduler(self.image_fetcher, cache_size=1)
    session = StandaloneSession(
        name="test_session", scheduler=scheduler, wait_interval=1
    )
    test_file = os.path.join(self.test_dir, "test_file")
    role = Role(name="touch").runs("touch.sh", test_file).on(self.test_container)
    app = Application("touch_test_file").of(role)

    # local scheduler was setup with a cache size of 1
    # run the same app twice (the first will be removed from the scheduler's cache)
    # then validate that the first one will drop from the session's app cache as well
    app_id1 = session.run(app)
    session.wait(app_id1)

    app_id2 = session.run(app)
    session.wait(app_id2)

    apps = session.list()

    self.assertEqual(1, len(apps))
    # assertIn / assertNotIn report the container contents on failure, unlike
    # assertTrue(x in y) which only reports "False is not true".
    self.assertNotIn(app_id1, apps)
    self.assertIn(app_id2, apps)
def test_wait_timeout(self):
    """Waiting 1s on an app that runs ``sleep.sh 10`` raises ``TimeoutError``."""
    sleep_role = Role("role1").runs("sleep.sh", "10")
    sleep_role = sleep_role.on(self.test_container).replicas(1)
    sleeper = Application(name="test_app").of(sleep_role)

    sleeper_id = self.scheduler.submit(sleeper, RunMode.MANAGED)
    with self.assertRaises(TimeoutError):
        self.scheduler.wait(sleeper_id, timeout=1)
def test_describe(self):
    """``describe`` is None before submit, RUNNING right after, SUCCEEDED after wait."""
    sleep_role = Role("role1").runs("sleep.sh", "2")
    sleep_role = sleep_role.on(self.test_container).replicas(1)
    sleeper = Application(name="test_app").of(sleep_role)

    # Nothing has been submitted yet, so this id is unknown.
    self.assertIsNone(self.scheduler.describe("test_app_0"))

    sleeper_id = self.scheduler.submit(sleeper, RunMode.HEADLESS)
    running_desc = self.scheduler.describe(sleeper_id)
    self.assertEqual(AppState.RUNNING, running_desc.state)

    final_desc = self.scheduler.wait(sleeper_id)
    self.assertEqual(AppState.SUCCEEDED, final_desc.state)
def test_cache_full(self):
    """With cache_size=1, a second submit while the first app is submitted raises ``IndexError``."""
    tiny_scheduler = LocalScheduler(self.image_fetcher, cache_size=1)

    sleep_role = Role("role1").runs("sleep.sh", "10")
    sleep_role = sleep_role.on(self.test_container).replicas(1)
    sleeper = Application(name="test_app").of(sleep_role)

    # The first submission occupies the single cache slot.
    tiny_scheduler.submit(sleeper, RunMode.MANAGED)
    with self.assertRaises(IndexError):
        tiny_scheduler.submit(sleeper, RunMode.MANAGED)
def test_validate_invalid_replicas(self):
    """Running an app whose role requests zero replicas raises ``ValueError``."""
    mock_session = self.MockSession()
    with self.assertRaises(ValueError):
        resource = Resource(cpu=1, gpu=0, memMB=500)
        torch_container = Container("torch").require(resource)
        echo_role = Role("no container").runs("echo", "hello_world")
        echo_role = echo_role.on(torch_container).replicas(0)
        mock_session.run(Application("no container").of(echo_role))
def test_exists(self):
    """``exists`` is True for a submitted app and stays True after it is cancelled."""
    sleep_role = Role("role1").runs("sleep.sh", "10")
    sleep_role = sleep_role.on(self.test_container).replicas(1)
    sleeper = Application(name="test_app").of(sleep_role)

    sleeper_id = self.scheduler.submit(sleeper, RunMode.HEADLESS)
    self.assertTrue(self.scheduler.exists(sleeper_id))

    # The test expects a cancelled app to still be reported as existing.
    self.scheduler.cancel(sleeper_id)
    self.assertTrue(self.scheduler.exists(sleeper_id))
def test_submit_inherit_parent_envs(self):
    """Every log line of ``echo_env_foo.sh`` is ``"bar"`` and the app succeeds.

    Presumably the test fixture sets ``FOO=bar`` in the parent environment
    outside this view — TODO confirm.
    """
    echo_role = Role("echo_foo").runs("echo_env_foo.sh").on(self.test_container)
    env_app = Application(name="check_foo_env_var").of(echo_role)
    cfg = RunConfig({"log_dir": self.test_dir})

    env_app_id = self.scheduler.submit(env_app, cfg)
    for log_line in self.scheduler.log_iter(env_app_id, "echo_foo"):
        self.assertEqual("bar", log_line)

    final_desc = self.wait(env_app_id, self.scheduler)
    self.assertEqual(AppState.SUCCEEDED, final_desc.state)
def test_exists(self):
    """``exists`` is True for a submitted app and stays True after it is cancelled."""
    sleep_role = Role("role1").runs("sleep.sh", "10")
    sleep_role = sleep_role.on(self.test_container).replicas(1)
    sleeper = Application(name="test_app").of(sleep_role)

    sleeper_id = self.scheduler.submit(sleeper, RunConfig({"log_dir": self.test_dir}))
    self.assertTrue(self.scheduler.exists(sleeper_id))

    # The test expects a cancelled app to still be reported as existing.
    self.scheduler.cancel(sleeper_id)
    self.assertTrue(self.scheduler.exists(sleeper_id))
def test_submit(self):
    """Submit a passing and a failing app in HEADLESS mode; check ids and states."""
    touched_path = os.path.join(self.test_dir, "test_file")
    touch_role = Role("role1").runs("touch.sh", touched_path)
    touch_role = touch_role.on(self.test_container).replicas(2)
    passing_app = Application(name="test_app").of(touch_role)

    app_id = self.scheduler.submit(passing_app, RunMode.HEADLESS)
    self.assertEqual("test_app_0", app_id)
    passing_result = self.scheduler.wait(app_id)
    self.assertEqual(AppState.SUCCEEDED, passing_result.state)
    self.assertTrue(os.path.isfile(touched_path))

    # Second submission under the same app name: fail.sh must end up FAILED.
    fail_role = Role("role1").runs("fail.sh")
    fail_role = fail_role.on(self.test_container).replicas(2)
    failing_app = Application(name="test_app").of(fail_role)

    app_id = self.scheduler.submit(failing_app, RunMode.HEADLESS)
    self.assertEqual("test_app_1", app_id)
    failing_result = self.scheduler.wait(app_id)
    self.assertEqual(AppState.FAILED, failing_result.state)
def test_cache_full(self):
    """With cache_size=1, a second submit while the first app is submitted raises ``IndexError``."""
    tiny_scheduler = LocalScheduler(session_name="test_session", cache_size=1)

    sleep_role = Role("role1").runs("sleep.sh", "10")
    sleep_role = sleep_role.on(self.test_container).replicas(1)
    sleeper = Application(name="test_app").of(sleep_role)
    cfg = RunConfig({"log_dir": self.test_dir})

    # The first submission occupies the single cache slot.
    tiny_scheduler.submit(sleeper, cfg)
    with self.assertRaises(IndexError):
        tiny_scheduler.submit(sleeper, cfg)
def test_log_iterator_no_log_dir(self):
    """``log_iter`` raises ``RuntimeError`` for an app submitted without a ``log_dir``."""
    role = (
        Role("role1")
        .runs("echo_range.sh", "10", "0.5")
        .on(self.test_container)
        .replicas(1)
    )
    app = Application(name="test_app").of(role)

    # Submit outside the assertRaises body: only log_iter() is expected to
    # raise. With submit() inside, a RuntimeError from submission would make
    # the test pass without log_iter() ever being exercised.
    app_id = self.scheduler.submit(app, RunConfig())

    # NOTE: ``msg`` is the message shown if the assertion fails; it is not
    # matched against the raised exception's text.
    with self.assertRaises(RuntimeError, msg="log_dir must be set to iterate logs"):
        self.scheduler.log_iter(app_id, "role1", k=0)
def test_dryrun(self):
    """``session.dryrun`` forwards the app and cfg to the named scheduler's ``submit_dryrun``."""
    mocked_scheduler = MagicMock()
    schedulers = {"default": mocked_scheduler}
    session = StandaloneSession(
        name=SESSION_NAME, schedulers=schedulers, wait_interval=1
    )

    echo_role = Role(name="touch").runs("echo", "hello world")
    echo_role = echo_role.on(self.test_container)
    echo_app = Application("name").of(echo_role)

    session.dryrun(echo_app, "default", cfg=self.cfg)
    mocked_scheduler.submit_dryrun.assert_called_once_with(echo_app, self.cfg)
def test_cancel(self):
    """A submitted app is RUNNING, and CANCELLED once ``cancel`` has been called."""
    sleep_role = Role("role1").runs("sleep.sh", "10")
    sleep_role = sleep_role.on(self.test_container).replicas(1)
    sleeper = Application(name="test_app").of(sleep_role)

    sleeper_id = self.scheduler.submit(sleeper, RunMode.HEADLESS)
    running_desc = self.scheduler.describe(sleeper_id)
    self.assertEqual(AppState.RUNNING, running_desc.state)

    self.scheduler.cancel(sleeper_id)
    cancelled_desc = self.scheduler.describe(sleeper_id)
    self.assertEqual(AppState.CANCELLED, cancelled_desc.state)
def test_status(self, _):
    """Session status is RUNNING after ``run`` and CANCELLED after ``stop``.

    The ``_`` parameter is unused; presumably it is a mock injected by a patch
    decorator outside this view — TODO confirm.
    """
    schedulers = {"default": self.scheduler}
    session = StandaloneSession(
        name=SESSION_NAME, schedulers=schedulers, wait_interval=1
    )
    sleep_role = Role(name="sleep").runs("sleep.sh", "60").on(self.test_container)
    sleeper = Application("sleeper").of(sleep_role)

    handle = session.run(sleeper, cfg=self.cfg)
    running_status = session.status(handle)
    self.assertEqual(AppState.RUNNING, running_status.state)

    session.stop(handle)
    stopped_status = session.status(handle)
    self.assertEqual(AppState.CANCELLED, stopped_status.state)
def test_run(self):
    """An app run through the session that touches a file finishes SUCCEEDED."""
    session = StandaloneSession(
        name="test_session", scheduler=self.scheduler, wait_interval=1
    )
    touched_path = os.path.join(self.test_dir, "test_file")
    touch_role = Role(name="touch").runs("touch.sh", touched_path)
    touch_role = touch_role.on(self.test_container)
    touch_app = Application("name").of(touch_role)

    run_id = session.run(touch_app)
    final_status = session.wait(run_id)
    self.assertEqual(AppState.SUCCEEDED, final_status.state)
def test_status(self):
    """Session status is RUNNING after ``run`` and CANCELLED after ``stop``."""
    session = StandaloneSession(
        name="test_session", scheduler=self.scheduler, wait_interval=1
    )
    sleep_role = Role(name="sleep").runs("sleep.sh", "60").on(self.test_container)
    sleeper = Application("sleeper").of(sleep_role)

    sleeper_id = session.run(sleeper)
    running_status = session.status(sleeper_id)
    self.assertEqual(AppState.RUNNING, running_status.state)

    session.stop(sleeper_id)
    stopped_status = session.status(sleeper_id)
    self.assertEqual(AppState.CANCELLED, stopped_status.state)
def test_describe(self, _):
    """``session.describe`` returns the submitted app, and None for an unknown handle.

    The ``_`` parameter is unused; presumably it is a mock injected by a patch
    decorator outside this view — TODO confirm.
    """
    schedulers = {"default": self.scheduler}
    session = StandaloneSession(name=SESSION_NAME, schedulers=schedulers)

    sleep_role = Role(name="sleep").runs("sleep.sh", "60").on(self.test_container)
    sleeper = Application("sleeper").of(sleep_role)

    handle = session.run(sleeper, cfg=self.cfg)
    described = session.describe(handle)
    self.assertEqual(sleeper, described)

    # unknown app should return None
    unknown = session.describe("default://session1/unknown_app")
    self.assertIsNone(unknown)
def test_get_schedulers(self):
    """``session.run(..., scheduler="local")`` submits to the scheduler registered as "local"."""
    default_sched_mock = MagicMock()
    local_sched_mock = MagicMock()
    schedulers = {"default": default_sched_mock, "local": local_sched_mock}
    session = StandaloneSession(name="test_session", schedulers=schedulers)

    role = Role(name="sleep").runs("sleep.sh", "60").on(self.test_container)
    app = Application("sleeper").of(role)
    cfg = RunConfig()
    session.run(app, scheduler="local", cfg=cfg)

    # ``called_once_with`` is not a Mock assertion: on a MagicMock it merely
    # auto-creates a child mock and verifies nothing. Use the real assertion
    # so the test fails if the "local" scheduler was not used.
    local_sched_mock.submit.assert_called_once_with(app, cfg)
def test_run(self, _):
    """A session with one scheduler backend runs a file-touching app to SUCCEEDED.

    The ``_`` parameter is unused; presumably it is a mock injected by a patch
    decorator outside this view — TODO confirm.
    """
    touched_path = os.path.join(self.test_dir, "test_file")
    schedulers = {"default": self.scheduler}
    session = StandaloneSession(
        name=SESSION_NAME, schedulers=schedulers, wait_interval=1
    )
    backends = session.scheduler_backends()
    self.assertEqual(1, len(backends))

    touch_role = Role(name="touch").runs("touch.sh", touched_path)
    touch_role = touch_role.on(self.test_container)
    touch_app = Application("name").of(touch_role)

    handle = session.run(touch_app, cfg=self.cfg)
    final_status = session.wait(handle)
    self.assertEqual(AppState.SUCCEEDED, final_status.state)
def describe(self, app_handle: AppHandle) -> Optional[Application]:
    """Return the application behind ``app_handle``, or ``None`` if it is unknown.

    The handle is first resolved to a scheduler and app id. An app recorded
    in ``self._apps`` under this handle is returned as-is; otherwise the
    scheduler is asked to describe the app, and an ``Application`` named after
    the app id is rebuilt from the roles of that description.

    Args:
        app_handle: handle of the application to look up.

    Returns:
        The matching ``Application``, or ``None`` when neither ``self._apps``
        nor the scheduler knows the app.
    """
    # Resolved before the cache lookup, so it runs even on a cache hit.
    scheduler, app_id = self._scheduler_app_id(app_handle, check_session=False)

    # if the app is in the apps list, then short circuit everything and return it
    cached_app = self._apps.get(app_handle)
    if cached_app:
        return cached_app

    description = scheduler.describe(app_id)
    if not description:
        return None
    return Application(name=app_id).of(*description.roles)