Beispiel #1
0
    def test_submit(self):
        """Submit a succeeding app and then a failing one; check ids, states and touched files."""
        # Each replica touches a file named {app_id}_{replica_id} inside img_root
        # (self.test_dir); finding those files shows the macros were substituted.
        num_replicas = 2
        touched_name = f"{macros.app_id}_{macros.replica_id}"
        touched_path = join(f"{macros.img_root}", touched_name)
        touch_role = Role("role1").runs("touch.sh", touched_path)
        touch_role = touch_role.on(self.test_container).replicas(num_replicas)
        touch_app = Application(name="test_app").of(touch_role)

        # Pin the id the scheduler will hand out so that it can be asserted on.
        expected_app_id = make_unique(touch_app.name)
        with patch(LOCAL_SCHEDULER_MAKE_UNIQUE, return_value=expected_app_id):
            cfg = RunConfig({"log_dir": self.test_dir})
            app_id = self.scheduler.submit(touch_app, cfg)

        self.assertEqual(f"{expected_app_id}", app_id)
        self.assertEqual(AppState.SUCCEEDED, self.wait(app_id).state)

        for replica_id in range(num_replicas):
            replica_file = join(self.test_dir,
                                f"{expected_app_id}_{replica_id}")
            self.assertTrue(os.path.isfile(replica_file))

        # Second round: same app name, but the entrypoint is fail.sh.
        failing_role = Role("role1").runs("fail.sh")
        failing_role = failing_role.on(self.test_container).replicas(2)
        failing_app = Application(name="test_app").of(failing_role)
        expected_app_id = make_unique(failing_app.name)
        with patch(LOCAL_SCHEDULER_MAKE_UNIQUE, return_value=expected_app_id):
            app_id = self.scheduler.submit(failing_app, cfg)

        self.assertEqual(f"{expected_app_id}", app_id)
        self.assertEqual(AppState.FAILED, self.wait(app_id).state)
Beispiel #2
0
    def test_submit(self):
        """Submit a succeeding and a failing app in HEADLESS mode; check ids, states and files."""
        # Each replica touches a file named {app_id}_{replica_id} inside img_root
        # (self.test_dir); finding those files shows the macros were substituted.
        num_replicas = 2
        touched_name = f"{macros.app_id}_{macros.replica_id}"
        touched_path = os.path.join(f"{macros.img_root}", touched_name)
        touch_role = Role("role1").runs("touch.sh", touched_path)
        touch_role = touch_role.on(self.test_container).replicas(num_replicas)
        touch_app = Application(name="test_app").of(touch_role)

        ok_app_id = self.scheduler.submit(touch_app, RunMode.HEADLESS)

        self.assertEqual("test_app_0", ok_app_id)
        ok_state = self.scheduler.wait(ok_app_id).state
        self.assertEqual(AppState.SUCCEEDED, ok_state)

        for replica_id in range(num_replicas):
            replica_file = os.path.join(self.test_dir,
                                        f"{ok_app_id}_{replica_id}")
            self.assertTrue(os.path.isfile(replica_file))

        # Second round: same app name, but the entrypoint is fail.sh.
        failing_role = Role("role1").runs("fail.sh")
        failing_role = failing_role.on(self.test_container).replicas(2)
        failing_app = Application(name="test_app").of(failing_role)
        failed_app_id = self.scheduler.submit(failing_app, RunMode.HEADLESS)

        self.assertEqual("test_app_1", failed_app_id)
        failed_state = self.scheduler.wait(failed_app_id).state
        self.assertEqual(AppState.FAILED, failed_state)
Beispiel #3
0
    def test_cache_evict(self):
        """With cache_size=1, a second finished app pushes the first out of the scheduler."""
        scheduler = LocalScheduler(session_name="test_session", cache_size=1)
        cfg = RunConfig({"log_dir": self.test_dir})

        first_target = join(self.test_dir, "test_file_1")
        first_role = Role("role1").runs("touch.sh", first_target)
        first_app = Application(name="touch_test_file1").of(
            first_role.on(self.test_container))

        second_target = join(self.test_dir, "test_file_2")
        second_role = Role("role2").runs("touch.sh", second_target)
        second_app = Application(name="touch_test_file2").of(
            second_role.on(self.test_container))

        # Run the two apps one after the other, each to completion.
        app_id1 = scheduler.submit(first_app, cfg)
        first_desc = self.wait(app_id1, scheduler)
        self.assertEqual(AppState.SUCCEEDED, first_desc.state)

        app_id2 = scheduler.submit(second_app, cfg)
        second_desc = self.wait(app_id2, scheduler)
        self.assertEqual(AppState.SUCCEEDED, second_desc.state)

        # app1 should've been evicted
        self.assertIsNone(scheduler.describe(app_id1))
        self.assertIsNone(self.wait(app_id1, scheduler))

        # app2 is still known to the scheduler
        self.assertIsNotNone(scheduler.describe(app_id2))
        self.assertIsNotNone(self.wait(app_id2, scheduler))
Beispiel #4
0
    def test_submit_dryrun_with_log_dir(self, img_fetcher_fetch_mock):
        """Dryrun with log_dir=/tmp: each trainer replica request carries stdout/stderr paths."""
        trainer = Role("trainer").runs("trainer.par").on(self.test_container)
        trainer = trainer.replicas(2)
        app = Application(name="test_app").of(trainer)

        info = self.scheduler.submit_dryrun(app, RunConfig({"log_dir": "/tmp"}))
        print(info)
        trainer_info = info.request[0]["trainer"]
        self.assertEqual(2, len(trainer_info))

        # Expected request per replica, in replica order.
        expected_replica_0 = {
            "args": ["trainer.par"],
            "env": {},
            "stdout":
            f"/tmp/{self.scheduler.session_name}/test_app_##/trainer/0/stdout.log",
            "stderr":
            f"/tmp/{self.scheduler.session_name}/test_app_##/trainer/0/stderr.log",
        }
        self.assertEqual(expected_replica_0, trainer_info[0])

        expected_replica_1 = {
            "args": ["trainer.par"],
            "env": {},
            "stdout":
            f"/tmp/{self.scheduler.session_name}/test_app_##/trainer/1/stdout.log",
            "stderr":
            f"/tmp/{self.scheduler.session_name}/test_app_##/trainer/1/stderr.log",
        }
        self.assertEqual(expected_replica_1, trainer_info[1])
    def test_submit_dryrun_with_log_dir_cfg(self, img_fetcher_fetch_mock):
        """Dryrun with log_dir set: requests carry per-replica log paths, no directory is made."""
        trainer = Role("trainer").runs("trainer.par").on(self.test_container)
        trainer = trainer.replicas(2)
        app = Application(name="test_app").of(trainer)

        info = self.scheduler.submit_dryrun(
            app, RunConfig({"log_dir": self.test_dir}))
        print(info)
        self.assertEqual(2, len(info.request[0]["trainer"]))

        app_log_dir = join(self.test_dir, self.scheduler.session_name,
                           "test_app_##")

        for role_idx, role in enumerate(app.roles):
            role_name = role.name
            replica_infos = info.request[role_idx][role_name]
            for replica_id in range(role.num_replicas):
                replica_log_dir = join(app_log_dir, role_name,
                                       str(replica_id))
                # dryrun should NOT create any directories
                self.assertFalse(os.path.isdir(replica_log_dir))

                # The error-file variable comes first; the role's own env is merged over it.
                expected_env = {
                    ERR_FILE_ENV: join(replica_log_dir, "error.json"),
                    **role.env,
                }
                expected_request = {
                    "args": [role.entrypoint, *role.args],
                    "env": expected_env,
                    "stdout": join(replica_log_dir, "stdout.log"),
                    "stderr": join(replica_log_dir, "stderr.log"),
                }
                self.assertEqual(expected_request, replica_infos[replica_id])
Beispiel #6
0
    def test_submit_dryrun(self, img_fetcher_fetch_mock):
        """Dryrun of a master + two-replica trainer app yields one request entry per role."""
        master = Role("master").runs("master.par", "arg1", ENV_VAR_1="VAL1")
        master = master.on(self.test_container)
        trainer = Role("trainer").runs("trainer.par").on(self.test_container)
        trainer = trainer.replicas(2)
        app = Application(name="test_app").of(master, trainer)

        info = self.scheduler.submit_dryrun(app, RunConfig())
        print(info)

        # One entry per role, in the order the roles were added to the app.
        self.assertEqual(2, len(info.request))
        master_info = info.request[0]["master"]
        trainer_info = info.request[1]["trainer"]
        self.assertEqual(1, len(master_info))
        self.assertEqual(2, len(trainer_info))

        expected_master = {
            "args": ["master.par", "arg1"],
            "env": {
                "ENV_VAR_1": "VAL1",
            },
        }
        self.assertEqual(expected_master, master_info[0])

        # Both trainer replicas are expected to look the same.
        for replica_id in range(2):
            self.assertEqual({"args": ["trainer.par"], "env": {}},
                             trainer_info[replica_id])
Beispiel #7
0
    def test_log_iterator(self):
        """Iterate one replica's log: plainly, with since/until, and with a regex filter."""
        role = Role("role1").runs("echo_range.sh", "10", "0.5")
        role = role.on(self.test_container).replicas(1)

        cfg = RunConfig({"log_dir": join(self.test_dir, "log")})
        app_id = self.scheduler.submit(
            Application(name="test_app").of(role), cfg)

        # Plain iteration: line number i is expected to read str(i).
        plain_lines = self.scheduler.log_iter(app_id, "role1", k=0)
        for line_no, line in enumerate(plain_lines):
            self.assertEqual(str(line_no), line)

        # since and until ignored
        bounded_lines = self.scheduler.log_iter(app_id,
                                                "role1",
                                                k=0,
                                                since=datetime.now(),
                                                until=datetime.now())
        for line_no, line in enumerate(bounded_lines):
            self.assertEqual(str(line_no), line)

        # With the regex the i-th yielded line is expected to read str(2 * i).
        filtered_lines = self.scheduler.log_iter(app_id,
                                                 "role1",
                                                 k=0,
                                                 regex=r"[02468]")
        for match_no, line in enumerate(filtered_lines):
            self.assertEqual(str(match_no * 2), line)
Beispiel #8
0
    def test_submit_with_log_dir_stdout(self):
        """Check the SUCCESS file and per-replica stream logs for stdout- and stderr-echo apps."""
        num_replicas = 2

        for std_stream in ("stdout", "stderr"):
            with self.subTest(std_stream=std_stream):
                log_dir = join(self.test_dir, f"test_{std_stream}_log")
                cfg = RunConfig({"log_dir": log_dir})

                role = Role("role1").runs(f"echo_{std_stream}.sh",
                                          "hello_world")
                role = role.on(self.test_container).replicas(num_replicas)
                app = Application(name="test_app").of(role)

                app_id = self.scheduler.submit(app, cfg)
                self.wait(app_id)

                # The SUCCESS file is read from <log_dir>/<session_name>/<app_id>/.
                success_file = join(log_dir, self.scheduler.session_name,
                                    app_id, "SUCCESS")
                with open(success_file, "r") as f:
                    sf_json = json.load(f)

                self.assertEqual(app_id, sf_json["app_id"])
                expected_app_log_dir = join(log_dir,
                                            self.scheduler.session_name,
                                            app_id)
                self.assertEqual(expected_app_log_dir, sf_json["log_dir"])
                self.assertEqual(AppState.SUCCEEDED.name,
                                 sf_json["final_state"])

                # Each replica entry points at a log file holding the echoed text.
                replica_infos = sf_json["roles"]["role1"]
                for replica_id in range(num_replicas):
                    stream_log = replica_infos[replica_id][std_stream]
                    self._assert_file_content(stream_log, "hello_world\n")
Beispiel #9
0
 def test_validate_no_resource(self):
     """Building and running an app on a container without resources must raise ValueError."""
     session = self.MockSession()
     with self.assertRaises(ValueError):
         bare_container = Container("no resource")
         role = Role("no resource").runs("echo", "hello_world")
         role = role.on(bare_container)
         session.run(Application("no resource").of(role))
Beispiel #10
0
 def test_application(self):
     """An Application keeps the name it was given and the single role attached via of()."""
     trainer = Role("trainer").runs("/bin/sleep", "10")
     trainer = trainer.on(Container(image="test_image")).replicas(2)
     app = Application(name="test_app").of(trainer)

     self.assertEqual("test_app", app.name)
     roles = app.roles
     self.assertEqual(1, len(roles))
     self.assertEqual(trainer, roles[0])
    def test_evict_non_existent_app(self):
        """Apps evicted from the scheduler's cache must also vanish from session.list()."""
        # tests that apps previously run with this session that are finished and eventually
        # removed by the scheduler also get removed from the session after a status() API has been
        # called on the app
        session = StandaloneSession(
            name="test_session",
            scheduler=LocalScheduler(self.image_fetcher, cache_size=1),
            wait_interval=1)
        touched = os.path.join(self.test_dir, "test_file")
        role = Role(name="touch").runs("touch.sh", touched)
        app = Application("touch_test_file").of(role.on(self.test_container))

        # local scheduler was setup with a cache size of 1
        # run the same app twice (the first will be removed from the scheduler's cache)
        # then validate that the first one will drop from the session's app cache as well
        app_id1 = session.run(app)
        session.wait(app_id1)

        app_id2 = session.run(app)
        session.wait(app_id2)

        listed = session.list()

        self.assertEqual(1, len(listed))
        self.assertNotIn(app_id1, listed)
        self.assertIn(app_id2, listed)
    def test_wait_timeout(self):
        """Waiting 1 second on an app running ``sleep.sh 10`` must raise TimeoutError."""
        sleeper = Role("role1").runs("sleep.sh", "10")
        sleeper = sleeper.on(self.test_container).replicas(1)
        app_id = self.scheduler.submit(
            Application(name="test_app").of(sleeper), RunMode.MANAGED)

        with self.assertRaises(TimeoutError):
            self.scheduler.wait(app_id, timeout=1)
 def test_describe(self):
     """describe() is None before submit, RUNNING right after, and wait() ends SUCCEEDED."""
     sleeper = Role("role1").runs("sleep.sh", "2")
     sleeper = sleeper.on(self.test_container).replicas(1)
     app = Application(name="test_app").of(sleeper)

     # Nothing has been submitted yet, so this id is not known.
     self.assertIsNone(self.scheduler.describe("test_app_0"))

     app_id = self.scheduler.submit(app, RunMode.HEADLESS)
     running_desc = self.scheduler.describe(app_id)
     self.assertEqual(AppState.RUNNING, running_desc.state)

     final_desc = self.scheduler.wait(app_id)
     self.assertEqual(AppState.SUCCEEDED, final_desc.state)
    def test_cache_full(self):
        """With cache_size=1, a second submit after a ``sleep.sh 10`` app raises IndexError."""
        scheduler = LocalScheduler(self.image_fetcher, cache_size=1)

        sleeper = Role("role1").runs("sleep.sh", "10")
        sleeper = sleeper.on(self.test_container).replicas(1)
        app = Application(name="test_app").of(sleeper)

        # The first submission takes the only cache slot.
        scheduler.submit(app, RunMode.MANAGED)
        with self.assertRaises(IndexError):
            scheduler.submit(app, RunMode.MANAGED)
Beispiel #15
0
 def test_validate_invalid_replicas(self):
     """Building and running an app whose role asks for 0 replicas must raise ValueError."""
     session = self.MockSession()
     with self.assertRaises(ValueError):
         resource = Resource(cpu=1, gpu=0, memMB=500)
         container = Container("torch").require(resource)
         role = Role("no container").runs("echo", "hello_world")
         role = role.on(container).replicas(0)
         session.run(Application("no container").of(role))
    def test_exists(self):
        """exists() is expected to be True for a submitted app both before and after cancel()."""
        sleeper = Role("role1").runs("sleep.sh", "10")
        sleeper = sleeper.on(self.test_container).replicas(1)
        app_id = self.scheduler.submit(
            Application(name="test_app").of(sleeper), RunMode.HEADLESS)

        self.assertTrue(self.scheduler.exists(app_id))
        # Cancelling the app is not expected to make it unknown to the scheduler.
        self.scheduler.cancel(app_id)
        self.assertTrue(self.scheduler.exists(app_id))
Beispiel #17
0
    def test_submit_inherit_parent_envs(self):
        """Every log line of the echo_env_foo.sh app must read "bar" and the app must succeed."""
        echo_role = Role("echo_foo").runs("echo_env_foo.sh")
        echo_role = echo_role.on(self.test_container)
        app = Application(name="check_foo_env_var").of(echo_role)

        cfg = RunConfig({"log_dir": self.test_dir})
        app_id = self.scheduler.submit(app, cfg)

        log_lines = self.scheduler.log_iter(app_id, "echo_foo")
        for log_line in log_lines:
            self.assertEqual("bar", log_line)

        final_desc = self.wait(app_id, self.scheduler)
        self.assertEqual(AppState.SUCCEEDED, final_desc.state)
Beispiel #18
0
    def test_exists(self):
        """exists() is expected to be True for a submitted app both before and after cancel()."""
        sleeper = Role("role1").runs("sleep.sh", "10")
        sleeper = sleeper.on(self.test_container).replicas(1)
        app = Application(name="test_app").of(sleeper)
        app_id = self.scheduler.submit(app,
                                       RunConfig({"log_dir": self.test_dir}))

        self.assertTrue(self.scheduler.exists(app_id))
        # Cancelling the app is not expected to make it unknown to the scheduler.
        self.scheduler.cancel(app_id)
        self.assertTrue(self.scheduler.exists(app_id))
    def test_submit(self):
        """Submit a file-touching app and a failing app; check ids, final states and the file."""
        touched = os.path.join(self.test_dir, "test_file")
        touch_role = Role("role1").runs("touch.sh", touched)
        touch_role = touch_role.on(self.test_container).replicas(2)
        touch_app = Application(name="test_app").of(touch_role)

        ok_app_id = self.scheduler.submit(touch_app, RunMode.HEADLESS)

        self.assertEqual("test_app_0", ok_app_id)
        ok_state = self.scheduler.wait(ok_app_id).state
        self.assertEqual(AppState.SUCCEEDED, ok_state)
        self.assertTrue(os.path.isfile(touched))

        # Second round: same app name, but the entrypoint is fail.sh.
        failing_role = Role("role1").runs("fail.sh")
        failing_role = failing_role.on(self.test_container).replicas(2)
        failing_app = Application(name="test_app").of(failing_role)
        failed_app_id = self.scheduler.submit(failing_app, RunMode.HEADLESS)

        self.assertEqual("test_app_1", failed_app_id)
        failed_state = self.scheduler.wait(failed_app_id).state
        self.assertEqual(AppState.FAILED, failed_state)
Beispiel #20
0
    def test_cache_full(self):
        """With cache_size=1, a second submit after a ``sleep.sh 10`` app raises IndexError."""
        scheduler = LocalScheduler(session_name="test_session", cache_size=1)
        cfg = RunConfig({"log_dir": self.test_dir})

        sleeper = Role("role1").runs("sleep.sh", "10")
        sleeper = sleeper.on(self.test_container).replicas(1)
        app = Application(name="test_app").of(sleeper)

        # The first submission takes the only cache slot.
        scheduler.submit(app, cfg)
        with self.assertRaises(IndexError):
            scheduler.submit(app, cfg)
Beispiel #21
0
    def test_log_iterator_no_log_dir(self):
        """Requesting logs of an app submitted with an empty RunConfig must raise RuntimeError."""
        role = (Role("role1").runs("echo_range.sh", "10",
                                   "0.5").on(self.test_container).replicas(1))

        app = Application(name="test_app").of(role)

        # Submit outside the assertRaises block: otherwise a RuntimeError raised by
        # submit() itself would satisfy the assertion that is meant for log_iter().
        app_id = self.scheduler.submit(app, RunConfig())

        # NOTE: ``msg`` is only the text reported when nothing is raised; it is not
        # matched against the exception's message.
        with self.assertRaises(RuntimeError,
                               msg="log_dir must be set to iterate logs"):
            self.scheduler.log_iter(app_id, "role1", k=0)
 def test_dryrun(self):
     """session.dryrun() must call the named scheduler's submit_dryrun once with app and cfg."""
     scheduler_mock = MagicMock()
     schedulers = {"default": scheduler_mock}
     session = StandaloneSession(name=SESSION_NAME,
                                 schedulers=schedulers,
                                 wait_interval=1)

     echo_role = Role(name="touch").runs("echo", "hello world")
     app = Application("name").of(echo_role.on(self.test_container))

     session.dryrun(app, "default", cfg=self.cfg)
     scheduler_mock.submit_dryrun.assert_called_once_with(app, self.cfg)
 def test_cancel(self):
     """A submitted ``sleep.sh 10`` app is RUNNING and becomes CANCELLED after cancel()."""
     sleeper = Role("role1").runs("sleep.sh", "10")
     sleeper = sleeper.on(self.test_container).replicas(1)
     app_id = self.scheduler.submit(
         Application(name="test_app").of(sleeper), RunMode.HEADLESS)

     state_before = self.scheduler.describe(app_id).state
     self.assertEqual(AppState.RUNNING, state_before)

     self.scheduler.cancel(app_id)
     state_after = self.scheduler.describe(app_id).state
     self.assertEqual(AppState.CANCELLED, state_after)
 def test_status(self, _):
     """status() reports RUNNING for a freshly run sleeper and CANCELLED after stop()."""
     schedulers = {"default": self.scheduler}
     session = StandaloneSession(
         name=SESSION_NAME, schedulers=schedulers, wait_interval=1
     )
     sleeper = Role(name="sleep").runs("sleep.sh", "60")
     app = Application("sleeper").of(sleeper.on(self.test_container))

     app_handle = session.run(app, cfg=self.cfg)
     state_before = session.status(app_handle).state
     self.assertEqual(AppState.RUNNING, state_before)

     session.stop(app_handle)
     state_after = session.status(app_handle).state
     self.assertEqual(AppState.CANCELLED, state_after)
    def test_run(self):
        """Running a touch.sh app through the session must finish in the SUCCEEDED state."""
        session = StandaloneSession(name="test_session",
                                    scheduler=self.scheduler,
                                    wait_interval=1)
        touched = os.path.join(self.test_dir, "test_file")
        touch_role = Role(name="touch").runs("touch.sh", touched)
        app = Application("name").of(touch_role.on(self.test_container))

        app_id = session.run(app)
        final_state = session.wait(app_id).state
        self.assertEqual(AppState.SUCCEEDED, final_state)
 def test_status(self):
     """status() reports RUNNING for a freshly run sleeper and CANCELLED after stop()."""
     session = StandaloneSession(name="test_session",
                                 scheduler=self.scheduler,
                                 wait_interval=1)
     sleeper = Role(name="sleep").runs("sleep.sh", "60")
     app = Application("sleeper").of(sleeper.on(self.test_container))

     app_id = session.run(app)
     state_before = session.status(app_id).state
     self.assertEqual(AppState.RUNNING, state_before)

     session.stop(app_id)
     state_after = session.status(app_id).state
     self.assertEqual(AppState.CANCELLED, state_after)
    def test_describe(self, _):
        """describe() returns the app that was run, and None for a handle it does not know."""
        schedulers = {"default": self.scheduler}
        session = StandaloneSession(name=SESSION_NAME, schedulers=schedulers)
        sleeper = Role(name="sleep").runs("sleep.sh", "60")
        app = Application("sleeper").of(sleeper.on(self.test_container))

        app_handle = session.run(app, cfg=self.cfg)
        described = session.describe(app_handle)
        self.assertEqual(app, described)

        # unknown app should return None
        unknown = session.describe("default://session1/unknown_app")
        self.assertIsNone(unknown)
    def test_get_schedulers(self):
        """run(..., scheduler="local") must submit the app to the scheduler registered as "local"."""
        default_sched_mock = MagicMock()
        local_sched_mock = MagicMock()
        schedulers = {"default": default_sched_mock, "local": local_sched_mock}
        session = StandaloneSession(name="test_session", schedulers=schedulers)

        role = Role(name="sleep").runs("sleep.sh",
                                       "60").on(self.test_container)
        app = Application("sleeper").of(role)
        cfg = RunConfig()
        session.run(app, scheduler="local", cfg=cfg)
        # ``called_once_with`` is not an assertion method: on a MagicMock it just
        # creates a child mock (and raises AttributeError on Python 3.12+), so the
        # previous check verified nothing. Use the real assertion instead.
        local_sched_mock.submit.assert_called_once_with(app, cfg)
    def test_run(self, _):
        """A session with one scheduler runs a touch.sh app through to the SUCCEEDED state."""
        schedulers = {"default": self.scheduler}
        session = StandaloneSession(
            name=SESSION_NAME, schedulers=schedulers, wait_interval=1
        )
        # Exactly one backend was registered above.
        backends = session.scheduler_backends()
        self.assertEqual(1, len(backends))

        touched = os.path.join(self.test_dir, "test_file")
        touch_role = Role(name="touch").runs("touch.sh", touched)
        app = Application("name").of(touch_role.on(self.test_container))

        app_handle = session.run(app, cfg=self.cfg)
        final_state = session.wait(app_handle).state
        self.assertEqual(AppState.SUCCEEDED, final_state)
    def describe(self, app_handle: AppHandle) -> Optional[Application]:
        """Return the application behind ``app_handle``, or ``None`` if it cannot be found.

        The handle is always resolved into a scheduler and app id first. The
        session's own ``_apps`` table is then consulted, and only on a miss is
        the scheduler asked to describe the app.
        """
        scheduler, app_id = self._scheduler_app_id(app_handle, check_session=False)

        # An app already tracked by this session is returned as-is.
        tracked_app = self._apps.get(app_handle, None)
        if tracked_app:
            return tracked_app

        # Otherwise rebuild an Application from the roles the scheduler reports.
        description = scheduler.describe(app_id)
        if description:
            return Application(name=app_id).of(*description.roles)
        return None