Beispiel #1
0
 def get_cfg(self):
     """Return a RunConfig holding one str, one int, one float and one bool value."""
     entries = {
         "run_as": "root",
         "cluster_id": 123,
         "priority": 0.5,
         "preemtible": True,
     }
     run_cfg = RunConfig()
     # dicts preserve insertion order, so the values are set in the order listed above
     for key, value in entries.items():
         run_cfg.set(key, value)
     return run_cfg
Beispiel #2
0
    def test_role_preproc_called(self):
        """A dryrun submission must call ``pre_proc`` exactly once on the app's role."""
        scheduler = SchedulerTest.MockScheduler("test_session")

        app = MagicMock()
        app.roles = [MagicMock()]

        run_cfg = RunConfig()
        run_cfg.set("foo", "bar")

        scheduler.submit_dryrun(app, run_cfg)

        # look the role up only after the dryrun, then verify the hook fired once
        app.roles[0].pre_proc.assert_called_once()
Beispiel #3
0
    def test_invalid_dryrun_cfg(self):
        """submit_dryrun must raise InvalidRunConfigException for invalid configs.

        Two configs are exercised: an empty one, and one whose ``foo`` value
        is an int (``bad_type_cfg``).
        """
        scheduler_mock = SchedulerTest.MockScheduler("test_session")
        app_mock = MagicMock()

        # build each config outside the assertRaises body so that only
        # submit_dryrun itself is expected to raise
        empty_cfg = RunConfig()
        with self.assertRaises(InvalidRunConfigException):
            scheduler_mock.submit_dryrun(app_mock, empty_cfg)

        bad_type_cfg = RunConfig()
        bad_type_cfg.set("foo", 100)
        with self.assertRaises(InvalidRunConfigException):
            # submit the bad-type config itself, not the empty one a second time
            scheduler_mock.submit_dryrun(app_mock, bad_type_cfg)
Beispiel #4
0
    def test_submit(self):
        """Submit a succeeding app and then a failing one; check ids and final states."""
        num_replicas = 2
        cfg = RunConfig({"log_dir": self.test_dir})

        # make sure the macro substitution works
        # touch a file called {app_id}_{replica_id} in the img_root directory (self.test_dir)
        touched_file = join(f"{macros.img_root}", f"{macros.app_id}_{macros.replica_id}")
        ok_role = Role("role1").runs("touch.sh", touched_file)
        ok_role = ok_role.on(self.test_container).replicas(num_replicas)
        ok_app = Application(name="test_app").of(ok_role)

        ok_expected_id = make_unique(ok_app.name)
        with patch(LOCAL_SCHEDULER_MAKE_UNIQUE, return_value=ok_expected_id):
            ok_app_id = self.scheduler.submit(ok_app, cfg)

        self.assertEqual(f"{ok_expected_id}", ok_app_id)
        self.assertEqual(AppState.SUCCEEDED, self.wait(ok_app_id).state)

        # every replica should have touched its own file
        for replica_id in range(num_replicas):
            touched = join(self.test_dir, f"{ok_expected_id}_{replica_id}")
            self.assertTrue(os.path.isfile(touched))

        # second app: its entrypoint is fail.sh, so it is expected to end up FAILED
        bad_role = Role("role1").runs("fail.sh")
        bad_role = bad_role.on(self.test_container).replicas(2)
        bad_app = Application(name="test_app").of(bad_role)

        bad_expected_id = make_unique(bad_app.name)
        with patch(LOCAL_SCHEDULER_MAKE_UNIQUE, return_value=bad_expected_id):
            bad_app_id = self.scheduler.submit(bad_app, cfg)

        self.assertEqual(f"{bad_expected_id}", bad_app_id)
        self.assertEqual(AppState.FAILED, self.wait(bad_app_id).state)
Beispiel #5
0
    def test_log_iterator(self):
        """Iterate the logs of replica 0 plainly, with since/until, and with a regex filter."""
        echo_role = Role("role1").runs("echo_range.sh", "10", "0.5")
        echo_role = echo_role.on(self.test_container).replicas(1)

        log_dir = join(self.test_dir, "log")
        app = Application(name="test_app").of(echo_role)
        app_id = self.scheduler.submit(app, RunConfig({"log_dir": log_dir}))

        # unfiltered: the n-th line is expected to be the string form of n
        plain_lines = self.scheduler.log_iter(app_id, "role1", k=0)
        for idx, log_line in enumerate(plain_lines):
            self.assertEqual(str(idx), log_line)

        # since and until ignored
        windowed_lines = self.scheduler.log_iter(
            app_id,
            "role1",
            k=0,
            since=datetime.now(),
            until=datetime.now(),
        )
        for idx, log_line in enumerate(windowed_lines):
            self.assertEqual(str(idx), log_line)

        # regex filter: the n-th matching line is expected to be str(2 * n)
        even_lines = self.scheduler.log_iter(app_id, "role1", k=0, regex=r"[02468]")
        for idx, log_line in enumerate(even_lines):
            self.assertEqual(str(idx * 2), log_line)
Beispiel #6
0
    def test_submit_dryrun_with_log_dir(self, img_fetcher_fetch_mock):
        """With log_dir set, each replica's dryrun request carries stdout/stderr log paths."""
        cfg = RunConfig({"log_dir": "/tmp"})
        trainer = Role("trainer").runs("trainer.par")
        trainer = trainer.on(self.test_container).replicas(2)
        app = Application(name="test_app").of(trainer)

        info = self.scheduler.submit_dryrun(app, cfg)
        print(info)

        trainer_info = info.request[0]["trainer"]
        self.assertEqual(2, len(trainer_info))

        expected_replica_0 = {
            "args": ["trainer.par"],
            "env": {},
            "stdout":
            f"/tmp/{self.scheduler.session_name}/test_app_##/trainer/0/stdout.log",
            "stderr":
            f"/tmp/{self.scheduler.session_name}/test_app_##/trainer/0/stderr.log",
        }
        self.assertEqual(expected_replica_0, trainer_info[0])

        expected_replica_1 = {
            "args": ["trainer.par"],
            "env": {},
            "stdout":
            f"/tmp/{self.scheduler.session_name}/test_app_##/trainer/1/stdout.log",
            "stderr":
            f"/tmp/{self.scheduler.session_name}/test_app_##/trainer/1/stderr.log",
        }
        self.assertEqual(expected_replica_1, trainer_info[1])
Beispiel #7
0
    def test_submit_dryrun(self, img_fetcher_fetch_mock):
        """A dryrun of a two-role app yields one request entry per role and replica."""
        master = Role("master").runs("master.par", "arg1", ENV_VAR_1="VAL1")
        master = master.on(self.test_container)
        trainer = Role("trainer").runs("trainer.par")
        trainer = trainer.on(self.test_container).replicas(2)
        app = Application(name="test_app").of(master, trainer)

        info = self.scheduler.submit_dryrun(app, RunConfig())
        print(info)

        request = info.request
        self.assertEqual(2, len(request))

        # roles are looked up by position first, then by role name
        master_info = request[0]["master"]
        trainer_info = request[1]["trainer"]
        self.assertEqual(1, len(master_info))
        self.assertEqual(2, len(trainer_info))

        expected_master = {
            "args": ["master.par", "arg1"],
            "env": {
                "ENV_VAR_1": "VAL1",
            },
        }
        self.assertEqual(expected_master, master_info[0])

        for replica_id in (0, 1):
            self.assertEqual(
                {"args": ["trainer.par"], "env": {}},
                trainer_info[replica_id],
            )
Beispiel #8
0
    def test_cache_evict(self):
        """With cache_size=1, a second finished app replaces the first one in the scheduler."""
        scheduler = LocalScheduler(session_name="test_session", cache_size=1)
        cfg = RunConfig({"log_dir": self.test_dir})

        test_file1 = join(self.test_dir, "test_file_1")
        test_file2 = join(self.test_dir, "test_file_2")
        role1 = Role("role1").runs("touch.sh", test_file1).on(self.test_container)
        role2 = Role("role2").runs("touch.sh", test_file2).on(self.test_container)
        app1 = Application(name="touch_test_file1").of(role1)
        app2 = Application(name="touch_test_file2").of(role2)

        # run the apps one after the other; each must succeed before the next is submitted
        app_ids = []
        for app in (app1, app2):
            submitted_id = scheduler.submit(app, cfg)
            state = self.wait(submitted_id, scheduler).state
            self.assertEqual(AppState.SUCCEEDED, state)
            app_ids.append(submitted_id)
        evicted_id, cached_id = app_ids

        # app1 should've been evicted
        self.assertIsNone(scheduler.describe(evicted_id))
        self.assertIsNone(self.wait(evicted_id, scheduler))

        self.assertIsNotNone(scheduler.describe(cached_id))
        self.assertIsNotNone(self.wait(cached_id, scheduler))
Beispiel #9
0
 def _get_app_log_dir(self, app_id: str, cfg: RunConfig) -> Optional[str]:
     """Return ``<log_dir>/<session_name>/<app_id>``, or None when ``log_dir`` is unset/empty in cfg."""
     # pyre-ignore [6]: type check already done by runopt.resolve
     base_dir: str = cfg.get("log_dir")
     if not base_dir:
         return None
     return os.path.join(base_dir, self.session_name, app_id)
Beispiel #10
0
    def test_submit_with_log_dir_stdout(self):
        """For stdout and stderr alike, check the SUCCESS file and the per-replica log content."""
        num_replicas = 2

        for std_stream in ("stdout", "stderr"):
            with self.subTest(std_stream=std_stream):
                log_dir = join(self.test_dir, f"test_{std_stream}_log")
                cfg = RunConfig({"log_dir": log_dir})

                role = Role("role1").runs(f"echo_{std_stream}.sh", "hello_world")
                role = role.on(self.test_container).replicas(num_replicas)
                app = Application(name="test_app").of(role)

                app_id = self.scheduler.submit(app, cfg)
                self.wait(app_id)

                # the SUCCESS file is read from <log_dir>/<session_name>/<app_id>/
                app_log_dir = join(log_dir, self.scheduler.session_name, app_id)
                with open(join(app_log_dir, "SUCCESS"), "r") as f:
                    sf_json = json.load(f)

                self.assertEqual(app_id, sf_json["app_id"])
                self.assertEqual(app_log_dir, sf_json["log_dir"])
                self.assertEqual(AppState.SUCCEEDED.name, sf_json["final_state"])

                role_replicas = sf_json["roles"]["role1"]
                for replica_id in range(num_replicas):
                    stream_path = role_replicas[replica_id][std_stream]
                    self._assert_file_content(stream_path, "hello_world\n")
    def test_submit_dryrun_with_log_dir_cfg(self, img_fetcher_fetch_mock):
        """Dryrun with a log_dir: requests point at per-replica log paths, but no dir is created."""
        trainer = Role("trainer").runs("trainer.par")
        trainer = trainer.on(self.test_container).replicas(2)
        app = Application(name="test_app").of(trainer)

        info = self.scheduler.submit_dryrun(app, RunConfig({"log_dir": self.test_dir}))
        print(info)

        self.assertEqual(2, len(info.request[0]["trainer"]))

        app_log_dir = join(self.test_dir, self.scheduler.session_name, "test_app_##")

        for role_idx, role in enumerate(app.roles):
            role_name = role.name
            role_info = info.request[role_idx][role_name]
            for replica_id in range(role.num_replicas):
                replica_log_dir = join(app_log_dir, role_name, str(replica_id))
                # dryrun should NOT create any directories
                self.assertFalse(os.path.isdir(replica_log_dir))

                expected_replica = {
                    "args": [role.entrypoint, *role.args],
                    "env": {
                        ERR_FILE_ENV: join(replica_log_dir, "error.json"),
                        **role.env,
                    },
                    "stdout": join(replica_log_dir, "stdout.log"),
                    "stderr": join(replica_log_dir, "stderr.log"),
                }
                self.assertEqual(expected_replica, role_info[replica_id])
Beispiel #12
0
    def test_submit_inherit_parent_envs(self):
        """Every log line of the echo_env_foo.sh app must be "bar" and the app must succeed."""
        echo_role = Role("echo_foo").runs("echo_env_foo.sh")
        echo_role = echo_role.on(self.test_container)
        app = Application(name="check_foo_env_var").of(echo_role)

        cfg = RunConfig({"log_dir": self.test_dir})
        app_id = self.scheduler.submit(app, cfg)

        log_lines = self.scheduler.log_iter(app_id, "echo_foo")
        for log_line in log_lines:
            self.assertEqual("bar", log_line)

        final_desc = self.wait(app_id, self.scheduler)
        self.assertEqual(AppState.SUCCEEDED, final_desc.state)
Beispiel #13
0
    def test_log_iterator_no_log_dir(self):
        """Submitting without log_dir and then calling log_iter must raise RuntimeError."""
        echo_role = Role("role1").runs("echo_range.sh", "10", "0.5")
        echo_role = echo_role.on(self.test_container).replicas(1)
        app = Application(name="test_app").of(echo_role)

        # NOTE: ``msg`` is the message shown if the assertion fails; it is not
        # matched against the text of the raised exception
        expect_runtime_error = self.assertRaises(
            RuntimeError,
            msg="log_dir must be set to iterate logs",
        )
        with expect_runtime_error:
            submitted_id = self.scheduler.submit(app, RunConfig())
            self.scheduler.log_iter(submitted_id, "role1", k=0)
Beispiel #14
0
    def test_exists(self):
        """exists() must return True for a submitted app both before and after cancel()."""
        sleeper = Role("role1").runs("sleep.sh", "10")
        sleeper = sleeper.on(self.test_container).replicas(1)
        app = Application(name="test_app").of(sleeper)

        app_id = self.scheduler.submit(app, RunConfig({"log_dir": self.test_dir}))
        self.assertTrue(self.scheduler.exists(app_id))

        # cancelling must not make the app disappear
        self.scheduler.cancel(app_id)
        self.assertTrue(self.scheduler.exists(app_id))
Beispiel #15
0
    def test_cache_full(self):
        """With cache_size=1, a second submit right after the first must raise IndexError."""
        cfg = RunConfig({"log_dir": self.test_dir})
        sleeper = Role("role1").runs("sleep.sh", "10")
        sleeper = sleeper.on(self.test_container).replicas(1)
        app = Application(name="test_app").of(sleeper)

        scheduler = LocalScheduler(session_name="test_session", cache_size=1)
        # the first submission is not waited on before the second one is attempted
        scheduler.submit(app, cfg)
        with self.assertRaises(IndexError):
            scheduler.submit(app, cfg)
Beispiel #16
0
 def test_describe(self):
     """describe() returns None for an unknown id, RUNNING after submit, SUCCEEDED after wait."""
     cfg = RunConfig({"log_dir": self.test_dir})
     sleeper = Role("role1").runs("sleep.sh", "2")
     sleeper = sleeper.on(self.test_container).replicas(1)
     app = Application(name="test_app").of(sleeper)

     # nothing has been submitted yet, so this id must be unknown
     self.assertIsNone(self.scheduler.describe("test_app_0"))

     app_id = self.scheduler.submit(app, cfg)
     running_desc = self.scheduler.describe(app_id)
     self.assertEqual(AppState.RUNNING, running_desc.state)

     final_desc = self.wait(app_id)
     self.assertEqual(AppState.SUCCEEDED, final_desc.state)
Beispiel #17
0
    def test_runopts_resolve_override(self):
        """Values set explicitly on the config must come back unchanged from resolve()."""
        overrides = {
            "run_as": "foobar",
            "priority": 20,
            "cluster_id": "test_cluster",
        }

        cfg = RunConfig()
        for key, value in overrides.items():
            cfg.set(key, value)

        resolved = self.get_runopts().resolve(cfg)

        for key, expected in overrides.items():
            self.assertEqual(expected, resolved.get(key))
Beispiel #18
0
 def test_cancel(self):
     """A submitted app is RUNNING until cancel() is called, then CANCELLED."""
     sleeper = Role("role1").runs("sleep.sh", "10")
     sleeper = sleeper.on(self.test_container).replicas(1)
     app = Application(name="test_app").of(sleeper)

     app_id = self.scheduler.submit(app, RunConfig())
     state_before = self.scheduler.describe(app_id).state
     self.assertEqual(AppState.RUNNING, state_before)

     self.scheduler.cancel(app_id)
     state_after = self.scheduler.describe(app_id).state
     self.assertEqual(AppState.CANCELLED, state_after)
 def test_schedule_fail(self, record_tsm_mock):
     """When _schedule raises, schedule() propagates the error and the record mock is still called."""
     dryrun_info = AppDryRunInfo("test", lambda x: "test")
     dryrun_info._scheduler = "default"
     dryrun_info._cfg = RunConfig({"image_fetcher": "dir"})

     session = DummySession("test_session")
     # assertRaises is the outer context, the patch of _schedule the inner one
     with self.assertRaises(RuntimeError), patch.object(
         session, "_schedule"
     ) as schedule_mock:
         schedule_mock.side_effect = RuntimeError("test error")
         session.schedule(dryrun_info)

     record_tsm_mock.assert_called()
    def test_get_schedulers(self):
        """Running an app with scheduler="local" must submit it to the "local" scheduler."""
        default_sched_mock = MagicMock()
        local_sched_mock = MagicMock()
        schedulers = {"default": default_sched_mock, "local": local_sched_mock}
        session = StandaloneSession(name="test_session", schedulers=schedulers)

        role = Role(name="sleep").runs("sleep.sh",
                                       "60").on(self.test_container)
        app = Application("sleeper").of(role)
        cfg = RunConfig()
        session.run(app, scheduler="local", cfg=cfg)
        # use the real mock assertion: ``called_once_with`` is not an assertion
        # method, so on a MagicMock it verifies nothing at all
        local_sched_mock.submit.assert_called_once_with(app, cfg)
Beispiel #21
0
 def _get_img_fetcher(self, cfg: RunConfig) -> ImageFetcher:
     """Look up the image fetcher named by the ``image_fetcher`` entry of cfg.

     Raises:
         InvalidRunConfigException: if no (truthy) fetcher is registered
             under the requested type.
     """
     requested_type = cfg.get("image_fetcher")
     available = self._img_fetchers()
     # pyre-ignore [6]: type check already done by runopt.resolve
     selected = available.get(requested_type)
     if selected:
         return selected

     raise InvalidRunConfigException(
         f"Unsupported image fetcher type: {requested_type}. Must be one of: {available.keys()}",
         cfg,
         self.run_opts(),
     )
    def setUp(self):
        """Create a temp dir with helper shell scripts, a local scheduler, a cfg and a test container."""
        self.test_dir = tempfile.mkdtemp("StandaloneSessionTest")

        # script name -> script lines, written into the test dir in this order
        scripts = {
            "touch.sh": ["touch $1"],
            "fail.sh": ["exit 1"],
            "sleep.sh": ["sleep $1"],
        }
        for script_name, script_lines in scripts.items():
            write_shell_script(self.test_dir, script_name, script_lines)

        self.scheduler = LocalScheduler(SESSION_NAME)
        self.cfg = RunConfig({"image_fetcher": "dir"})

        # resource ignored for local scheduler; adding as an example
        container = Container(image=self.test_dir)
        self.test_container = container.require(resource.SMALL)
Beispiel #23
0
    def test_submit_multiple_roles(self):
        """An app with two touch.sh roles must succeed and leave both touched files behind."""
        test_file1 = join(self.test_dir, "test_file_1")
        test_file2 = join(self.test_dir, "test_file_2")

        role1 = Role("role1").runs("touch.sh", test_file1)
        role1 = role1.on(self.test_container).replicas(1)
        role2 = Role("role2").runs("touch.sh", test_file2)
        role2 = role2.on(self.test_container).replicas(1)

        app = Application(name="test_app").of(role1, role2)
        app_id = self.scheduler.submit(app, RunConfig({"log_dir": self.test_dir}))

        self.assertEqual(AppState.SUCCEEDED, self.wait(app_id).state)
        for touched_file in (test_file1, test_file2):
            self.assertTrue(os.path.isfile(touched_file))
Beispiel #24
0
    def test_runopts_resolve_bad_type(self):
        """resolve() must raise InvalidRunConfigException when cluster_id is given as an int."""
        cfg = RunConfig()
        for key, value in (("run_as", "foobar"), ("cluster_id", 123)):
            cfg.set(key, value)

        opts = self.get_runopts()
        with self.assertRaises(InvalidRunConfigException):
            opts.resolve(cfg)
Beispiel #25
0
    def test_runopts_resolve_missing_required(self):
        """resolve() must raise InvalidRunConfigException when run_as is not set on the config."""
        cfg = RunConfig()
        for key, value in (("priority", 20), ("cluster_id", "test_cluster")):
            cfg.set(key, value)

        opts = self.get_runopts()
        with self.assertRaises(InvalidRunConfigException):
            opts.resolve(cfg)
 def test_schedule_success(self, record_tsm_mock):
     """A successful schedule() must record a "schedule" event carrying the app id and run cfg."""
     cfg = RunConfig({"image_fetcher": "dir"})
     dryrun_info = AppDryRunInfo("test", lambda x: "test")
     dryrun_info._scheduler = "default"
     dryrun_info._cfg = cfg

     session = DummySession("test_session")
     app_handle = session.schedule(dryrun_info)

     recorded_event = record_tsm_mock.call_args[0][0]  # first arg
     _, _, app_id = parse_app_handle(app_handle)

     expected_event = session._generate_tsm_event(
         "schedule", "default", app_id, runcfg=json.dumps(cfg.cfgs)
     )
     self.assert_tsm_event(expected_event, recorded_event)
Beispiel #27
0
    def test_submit_dryrun_without_log_dir_cfg(self, img_fetcher_fetch_mock):
        """Dryrun without log_dir: replicas get log dirs under the app log dir but no stdout/stderr."""
        master = Role("master").runs("master.par", "arg1", ENV_VAR_1="VAL1")
        master = master.on(self.test_container)
        trainer = Role("trainer").runs("trainer.par")
        trainer = trainer.on(self.test_container).replicas(2)
        app = Application(name="test_app").of(master, trainer)

        info = self.scheduler.submit_dryrun(app, RunConfig())
        # intentional print (to make sure it actually prints with no errors)
        print(info)

        request = info.request
        role_params = request.role_params
        role_log_dirs = request.role_log_dirs
        for per_role in (role_params, role_log_dirs):
            self.assertEqual(2, len(per_role))

        master_params = role_params["master"]
        trainer_params = role_params["trainer"]
        app_log_dir = request.log_dir

        self.assertEqual(1, len(master_params))
        self.assertEqual(2, len(trainer_params))

        for role in app.roles:
            params_of_role = role_params[role.name]
            log_dirs_of_role = role_log_dirs[role.name]

            for replica_id in range(role.num_replicas):
                param = params_of_role[replica_id]
                log_dir = log_dirs_of_role[replica_id]

                # dryrun should NOT create any directories
                self.assertFalse(os.path.isdir(log_dir))
                self.assertTrue(log_dir.startswith(app_log_dir))

                self.assertEqual([role.entrypoint, *role.args], param.args)

                expected_env = {
                    ERR_FILE_ENV: join(log_dir, "error.json"),
                    **role.env,
                }
                self.assertEqual(expected_env, param.env)

                # without a log_dir in the cfg no std stream targets are set
                self.assertIsNone(param.stdout)
                self.assertIsNone(param.stderr)
Beispiel #28
0
    def _get_app_log_dir(self, app_id: str, cfg: RunConfig) -> Tuple[str, bool]:
        """
        Returns ``(<base>/<session_name>/<app_id>, should_redirect_std)``.

        stdout/err are meant to be redirected to a log file ONLY when the
        log dir was provided by the user in the cfg:

        1. ``cfg.get("log_dir")`` is truthy -> base is that dir, flag is True
        2. otherwise -> base is a freshly created temp dir (``tsm_`` prefix),
           flag is False
        """
        user_log_dir = cfg.get("log_dir")
        if user_log_dir:
            app_log_dir = os.path.join(str(user_log_dir), self.session_name, app_id)
            return app_log_dir, True

        # no user-provided log dir: fall back to an auto-generated temp dir
        tmp_log_dir = tempfile.mkdtemp(prefix="tsm_")
        return os.path.join(tmp_log_dir, self.session_name, app_id), False
Beispiel #29
0
    def test_runopts_resolve_minimal(self):
        """With only run_as set, resolve() yields priority 10 and no cluster_id, leaving cfg untouched."""
        cfg = RunConfig()
        cfg.set("run_as", "foobar")

        resolved = self.get_runopts().resolve(cfg)

        self.assertEqual("foobar", resolved.get("run_as"))
        self.assertEqual(10, resolved.get("priority"))
        self.assertIsNone(resolved.get("cluster_id"))

        # make sure original config is untouched
        self.assertEqual("foobar", cfg.get("run_as"))
        for unset_key in ("priority", "cluster_id"):
            self.assertIsNone(cfg.get(unset_key))
Beispiel #30
0
    def test_serde(self):
        """
        tests trivial serialization into dict then back
        """
        original = self.get_cfg()
        as_dict = dataclasses.asdict(original)
        restored = RunConfig(**as_dict)

        expected_values = {
            "run_as": "root",
            "cluster_id": 123,
            "priority": 0.5,
        }
        for key, expected in expected_values.items():
            self.assertEqual(expected, restored.get(key))
        self.assertTrue(restored.get("preemtible"))