Exemple #1
0
 def test_get_resource_none(self):
     """A scheduler with no registered resource resolves to ``NULL_RESOURCE``.

     The container only carries entries for ``"default"`` and
     ``"test_scheduler"``; the lookup uses a name that is in neither.
     """
     per_scheduler = {
         "default": Resource(cpu=1, gpu=2, memMB=128),
         "test_scheduler": Resource(cpu=1, gpu=2, memMB=256),
     }
     container = Container("torch").require(per_scheduler)
     looked_up = container.get_resource("non-existent")
     self.assertEqual(NULL_RESOURCE, looked_up)
Exemple #2
0
 def test_get_resource_incorrect_input(self):
     """Passing a resource mapping together with a scheduler name is an error."""
     small = Resource(cpu=1, gpu=2, memMB=128)
     large = Resource(cpu=1, gpu=2, memMB=256)
     mapping = {"default": small, "test_scheduler": large}
     # A mapping already names its schedulers, so the extra positional
     # scheduler argument is expected to be rejected with ValueError.
     with self.assertRaises(ValueError):
         Container("torch").require(mapping, "new_scheduler")
Exemple #3
0
 def test_application(self):
     """An ``Application`` built via ``of`` exposes its name and its one role."""
     container = Container(image="test_image")
     role = (
         Role("trainer")
         .runs("/bin/sleep", "10")
         .on(container)
         .replicas(2)
     )
     app = Application(name="test_app").of(role)

     self.assertEqual("test_app", app.name)
     self.assertEqual(1, len(app.roles))
     self.assertEqual(role, app.roles[0])
Exemple #4
0
 def test_validate_no_resource(self):
     """Running an app whose container has no resources raises ``ValueError``.

     The fixture is built outside the ``assertRaises`` block so that only
     ``session.run`` can satisfy the expectation. A ``ValueError`` raised
     while constructing the container, role, or application surfaces as a
     test error instead of silently making the test pass.
     """
     session = self.MockSession()
     container = Container("no resource")
     role = Role("no resource").runs("echo", "hello_world").on(container)
     app = Application("no resource").of(role)
     with self.assertRaises(ValueError):
         session.run(app)
Exemple #5
0
 def test_get_resource_mapping(self):
     """A mapping with an ``ALL`` entry serves schedulers lacking their own."""
     default_res = Resource(cpu=1, gpu=2, memMB=128)
     fallback_res = Resource(cpu=1, gpu=2, memMB=256)
     mapping = {"default": default_res, ALL: fallback_res}
     container = Container("torch").require(mapping)

     self.assertEqual(2, len(container.resources))
     # Names present in the mapping resolve to their own entry ...
     self.assertEqual(default_res, container.get_resource("default"))
     self.assertEqual(fallback_res, container.get_resource(ALL))
     # ... and a name without an entry is expected to get the ALL resource.
     self.assertEqual(
         fallback_res, container.get_resource("unknown_scheduler")
     )
Exemple #6
0
 def test_create_container_with_resource(self):
     """Chained ``require`` calls register one resource per scheduler name."""
     first = Resource(cpu=1, gpu=2, memMB=128)
     second = Resource(cpu=1, gpu=2, memMB=256)

     container = Container("torch")
     container = container.require(first, "default")
     container = container.require(second, "test_scheduler")

     self.assertEqual(2, len(container.resources))
     self.assertEqual(first, container.resources["default"])
     self.assertEqual(second, container.resources["test_scheduler"])
Exemple #7
0
 def test_validate_invalid_replicas(self):
     """Building and running an app with a zero-replica role raises ``ValueError``."""
     session = self.MockSession()
     with self.assertRaises(ValueError):
         container = Container("torch")
         container = container.require(Resource(cpu=1, gpu=0, memMB=500))
         role = Role("no container").runs("echo", "hello_world")
         # Zero replicas is the invalid value this test is about.
         role = role.on(container).replicas(0)
         app = Application("no container").of(role)
         session.run(app)
    def setUp(self):
        """Create a scratch image directory, helper scripts, and a scheduler.

        The temporary directory is registered for removal as soon as it
        exists. ``unittest`` runs registered cleanups even when a later step
        of ``setUp`` raises (in which case ``tearDown`` is skipped), so the
        directory no longer leaks on a partially failed fixture.
        """
        import shutil  # function-scope import keeps this block self-contained

        # NOTE: the positional argument of mkdtemp is the *suffix* of the
        # generated directory name.
        self.test_dir = tempfile.mkdtemp("LocalSchedulerTest")
        # ignore_errors=True makes this a no-op when the directory has
        # already been removed by the time cleanups run.
        self.addCleanup(shutil.rmtree, self.test_dir, ignore_errors=True)

        scripts = (
            ("touch.sh", ["touch $1"]),
            ("fail.sh", ["exit 1"]),
            ("sleep.sh", ["sleep $1"]),
        )
        for script_name, script_lines in scripts:
            write_shell_script(self.test_dir, script_name, script_lines)

        self.image_fetcher = LocalDirectoryImageFetcher()
        self.scheduler = LocalScheduler(self.image_fetcher)

        self.test_container = Container(image=self.test_dir)
    def setUp(self):
        """Create a scratch image directory, helper scripts, scheduler and config.

        The temporary directory is registered for removal as soon as it
        exists. ``unittest`` runs registered cleanups even when a later step
        of ``setUp`` raises (in which case ``tearDown`` is skipped), so the
        directory no longer leaks on a partially failed fixture.
        """
        import shutil  # function-scope import keeps this block self-contained

        # NOTE: the positional argument of mkdtemp is the *suffix* of the
        # generated directory name.
        self.test_dir = tempfile.mkdtemp("StandaloneSessionTest")
        # ignore_errors=True makes this a no-op when the directory has
        # already been removed by the time cleanups run.
        self.addCleanup(shutil.rmtree, self.test_dir, ignore_errors=True)

        scripts = (
            ("touch.sh", ["touch $1"]),
            ("fail.sh", ["exit 1"]),
            ("sleep.sh", ["sleep $1"]),
        )
        for script_name, script_lines in scripts:
            write_shell_script(self.test_dir, script_name, script_lines)

        self.scheduler = LocalScheduler(SESSION_NAME)
        self.cfg = RunConfig({"image_fetcher": "dir"})

        # resource ignored for local scheduler; adding as an example
        self.test_container = Container(image=self.test_dir).require(resource.SMALL)
Exemple #10
0
    def test_json_serialization(self):
        """
        Round-trips an ElasticRole through its ``dataclasses.asdict`` form.

        The dict form is turned back into a plain Role and compared with the
        source object. An ElasticRole is a builder convenience that lets users
        create a Role whose entrypoint is
        ``torchelastic.distributed.launch``.
        """
        small = Resource(cpu=1, gpu=0, memMB=512)
        container = Container(
            image="user_image", resources={"default": small}
        ).ports(tensorboard=8080)
        elastic_role = (
            ElasticRole(
                "test_role",
                nnodes="2:4",
                rdzv_backend="etcd",
                rdzv_id="foobar",
            )
            .runs("user_script.py", "--script_arg", "foo")
            .on(container)
            .replicas(3)
        )

        # this is effectively JSON
        role_fields = dataclasses.asdict(elastic_role)
        container_fields = role_fields.pop("container")
        # Each per-scheduler resource is a plain dict at this point; turn it
        # back into a Resource before rebuilding the Container.
        container_fields["resources"] = {
            sched: Resource(**resource_fields)
            for sched, resource_fields in container_fields.pop(
                "resources"
            ).items()
        }

        role = Role(**role_fields, container=Container(**container_fields))

        self.assertEqual(container, role.container)
        self.assertEqual(elastic_role.name, role.name)
        self.assertEqual(elastic_role.entrypoint, role.entrypoint)
        self.assertEqual(elastic_role.args, role.args)
        self.assertEqual(
            dataclasses.asdict(elastic_role), dataclasses.asdict(role)
        )
Exemple #11
0
    def setUp(self):
        """Create a scratch image directory, helper scripts, and a scheduler.

        The temporary directory is registered for removal as soon as it
        exists. ``unittest`` runs registered cleanups even when a later step
        of ``setUp`` raises (in which case ``tearDown`` is skipped), so the
        directory no longer leaks on a partially failed fixture.
        """
        import shutil  # function-scope import keeps this block self-contained

        # NOTE: the positional argument of mkdtemp is the *suffix* of the
        # generated directory name.
        self.test_dir = tempfile.mkdtemp("StandaloneSessionTest")
        # ignore_errors=True makes this a no-op when the directory has
        # already been removed by the time cleanups run.
        self.addCleanup(shutil.rmtree, self.test_dir, ignore_errors=True)

        scripts = (
            ("touch.sh", ["touch $1"]),
            ("fail.sh", ["exit 1"]),
            ("sleep.sh", ["sleep $1"]),
        )
        for script_name, script_lines in scripts:
            write_shell_script(self.test_dir, script_name, script_lines)

        self.image_fetcher = LocalDirectoryImageFetcher()
        self.scheduler = LocalScheduler(self.image_fetcher)

        # resource ignored for local scheduler; adding as an example
        self.test_container = Container(image=self.test_dir).require(
            Resource.SMALL)
Exemple #12
0
    def test_build_role(self):
        """The fluent ``Role`` builder records each value it was given."""
        # Equivalent command line: ENV_VAR_1=FOOBAR /bin/echo hello world
        container = Container(image="test_image")
        container.ports(foo=8080)
        role = (
            Role("trainer")
            .runs("/bin/echo", "hello", "world", ENV_VAR_1="FOOBAR")
            .on(container)
            .replicas(2)
        )

        self.assertEqual("trainer", role.name)
        self.assertEqual("/bin/echo", role.entrypoint)
        # Keyword arguments of ``runs`` end up in env, positionals in args.
        self.assertEqual({"ENV_VAR_1": "FOOBAR"}, role.env)
        self.assertEqual(["hello", "world"], role.args)
        self.assertEqual(container, role.container)
        self.assertEqual(2, role.num_replicas)
Exemple #13
0
    def setUp(self):
        """Create a scratch image directory, helper scripts, and a scheduler.

        The temporary directory is registered for removal as soon as it
        exists. ``unittest`` runs registered cleanups even when a later step
        of ``setUp`` raises (in which case ``tearDown`` is skipped), so the
        directory no longer leaks on a partially failed fixture.
        """
        import shutil  # function-scope import keeps this block self-contained

        # NOTE: the positional argument of mkdtemp is the *suffix* of the
        # generated directory name.
        self.test_dir = tempfile.mkdtemp("LocalSchedulerTest")
        # ignore_errors=True makes this a no-op when the directory has
        # already been removed by the time cleanups run.
        self.addCleanup(shutil.rmtree, self.test_dir, ignore_errors=True)

        scripts = (
            ("touch.sh", ["touch $1"]),
            ("fail.sh", ["exit 1"]),
            ("sleep.sh", ["sleep $1"]),
            ("echo.sh", ["echo $1"]),
            ("echo_stderr.sh", ["echo $1 1>&2"]),
            (
                "echo_range.sh",
                ["for i in $(seq 0 $1); do echo $i 1>&2; sleep $2; done"],
            ),
        )
        for script_name, script_lines in scripts:
            write_shell_script(self.test_dir, script_name, script_lines)

        self.scheduler = LocalScheduler(session_name="test_session")
        self.test_container = Container(image=self.test_dir)
Exemple #14
0
 def setUp(self):
     """Create a scratch image directory, helper scripts, and a scheduler.

     The temporary directory is registered for removal as soon as it
     exists. ``unittest`` runs registered cleanups even when a later step
     of ``setUp`` raises (in which case ``tearDown`` is skipped), so the
     directory no longer leaks on a partially failed fixture.
     """
     import shutil  # function-scope import keeps this block self-contained

     # The directory name is prefixed with the concrete test class name.
     self.test_dir = tempfile.mkdtemp(prefix=f"{self.__class__.__name__}_")
     # ignore_errors=True makes this a no-op when the directory has
     # already been removed by the time cleanups run.
     self.addCleanup(shutil.rmtree, self.test_dir, ignore_errors=True)

     scripts = (
         ("touch.sh", ["touch $1"]),
         ("fail.sh", ["exit 1"]),
         ("sleep.sh", ["sleep $1"]),
         ("echo_stdout.sh", ["echo $1"]),
         ("echo_stderr.sh", ["echo $1 1>&2"]),
         (
             "echo_range.sh",
             ["for i in $(seq 0 $1); do echo $i 1>&2; sleep $2; done"],
         ),
         ("echo_env_foo.sh", ["echo $FOO 1>&2"]),
     )
     for script_name, script_lines in scripts:
         write_shell_script(self.test_dir, script_name, script_lines)

     self.scheduler = LocalScheduler(session_name="test_session")
     self.test_container = Container(image=self.test_dir)
Exemple #15
0
 def test_build_elastic_role(self):
     """``ElasticRole`` wraps the user command in a torchelastic launch."""
     # Expected command, as asserted below:
     #   python -m torchelastic.distributed.launch
     #              --nnodes 2:4
     #              --max_restarts 3
     #              --no_python
     #              --rdzv_backend etcd
     #              --rdzv_id ${app_id}
     #              --role elastic_trainer
     #              /bin/echo hello world
     container = Container(image="test_image")
     container.ports(foo=8080)
     role = (
         ElasticRole(
             "elastic_trainer",
             nnodes="2:4",
             max_restarts=3,
             no_python=True,
         )
         .runs("/bin/echo", "hello", "world", ENV_VAR_1="FOOBAR")
         .on(container)
         .replicas(2)
     )

     self.assertEqual("elastic_trainer", role.name)
     self.assertEqual("python", role.entrypoint)

     launcher_args = ["-m", "torchelastic.distributed.launch"]
     launcher_flags = [
         "--nnodes", "2:4",
         "--max_restarts", "3",
         "--no_python",
         "--rdzv_backend", "etcd",
         "--rdzv_id", macros.app_id,
         "--role", "elastic_trainer",
     ]
     user_command = ["/bin/echo", "hello", "world"]
     self.assertEqual(
         launcher_args + launcher_flags + user_command,
         role.args,
     )

     self.assertEqual({"ENV_VAR_1": "FOOBAR"}, role.env)
     self.assertEqual(container, role.container)
     self.assertEqual(2, role.num_replicas)
Exemple #16
0
    def test_build_role(self):
        """The fluent ``Role`` builder records retry and deployment settings too."""
        # Equivalent command line: ENV_VAR_1=FOOBAR /bin/echo hello world
        container = Container(image="test_image")
        container.ports(foo=8080)
        role = (
            Role("trainer")
            .runs("/bin/echo", "hello", "world", ENV_VAR_1="FOOBAR")
            .on(container)
            .replicas(2)
            .with_retry_policy(RetryPolicy.REPLICA, max_retries=5)
            .with_deployment_preference(DeploymentPreference.SERVICE)
        )

        self.assertEqual("trainer", role.name)
        self.assertEqual("/bin/echo", role.entrypoint)
        self.assertEqual({"ENV_VAR_1": "FOOBAR"}, role.env)
        self.assertEqual(["hello", "world"], role.args)
        self.assertEqual(container, role.container)
        self.assertEqual(2, role.num_replicas)
        self.assertEqual(5, role.max_retries)
        self.assertEqual(RetryPolicy.REPLICA, role.retry_policy)
        self.assertEqual(
            DeploymentPreference.SERVICE, role.deployment_preference
        )
Exemple #17
0
 def test_create_container_with_resource(self):
     """A second ``require`` call replaces the resource set by the first."""
     first = Resource(cpu=1, gpu=2, memMB=128)
     second = Resource(cpu=1, gpu=2, memMB=256)

     container = Container("torch")
     container = container.require(first)
     container = container.require(second)

     self.assertEqual(second, container.resources)
Exemple #18
0
 def test_get_resource_specific(self):
     """A resource bound to one scheduler is not visible to other schedulers."""
     wanted = Resource(cpu=1, gpu=2, memMB=128)
     container = Container("torch").require(wanted, scheduler="foobar")

     for_foobar = container.get_resource("foobar")
     self.assertEqual(wanted, for_foobar)

     # Any other scheduler name is expected to see the null resource.
     for_other = container.get_resource("any_scheduler")
     self.assertEqual(NULL_RESOURCE, for_other)
Exemple #19
0
 def test_get_resource_all(self):
     """A resource required without a scheduler name serves any scheduler."""
     wanted = Resource(cpu=1, gpu=2, memMB=128)
     container = Container("torch").require(wanted)
     resolved = container.get_resource("any_scheduler")
     self.assertEqual(wanted, resolved)
Exemple #20
0
 def test_create_container_no_backend(self):
     """``require`` without a scheduler name stores the resource under ``ALL``."""
     only_resource = Resource(cpu=1, gpu=2, memMB=128)
     container = Container("torch").require(only_resource)

     self.assertEqual(1, len(container.resources))
     self.assertEqual(only_resource, container.resources[ALL])