def test_get_resource_none(self):
    """Expect ``NULL_RESOURCE`` when the requested scheduler has no entry."""
    default_res = Resource(cpu=1, gpu=2, memMB=128)
    scheduler_res = Resource(cpu=1, gpu=2, memMB=256)
    per_scheduler = {"default": default_res, "test_scheduler": scheduler_res}
    container = Container("torch").require(per_scheduler)

    # "non-existent" matches neither key of the mapping registered above.
    looked_up = container.get_resource("non-existent")
    self.assertEqual(NULL_RESOURCE, looked_up)
def test_get_resource_incorrect_input(self):
    """Expect ``ValueError`` when ``require`` gets a mapping plus a second argument."""
    default_res = Resource(cpu=1, gpu=2, memMB=128)
    scheduler_res = Resource(cpu=1, gpu=2, memMB=256)
    per_scheduler = {"default": default_res, "test_scheduler": scheduler_res}

    with self.assertRaises(ValueError):
        # The mapping already names its schedulers; the extra positional
        # argument is what this test expects to be rejected.
        Container("torch").require(per_scheduler, "new_scheduler")
def test_application(self):
    """Check the name and role list of an ``Application`` built from one role."""
    container = Container(image="test_image")
    trainer = Role("trainer").runs("/bin/sleep", "10")
    trainer = trainer.on(container).replicas(2)

    app = Application(name="test_app").of(trainer)

    self.assertEqual("test_app", app.name)
    self.assertEqual(1, len(app.roles))
    self.assertEqual(trainer, app.roles[0])
def test_validate_no_resource(self):
    """Expect ``ValueError`` for an app whose container has no resource set."""
    session = self.MockSession()

    # Construction and submission both sit inside the context manager, so the
    # expected error may come from any of these steps.
    with self.assertRaises(ValueError):
        bare_container = Container("no resource")
        role = Role("no resource").runs("echo", "hello_world")
        role = role.on(bare_container)
        session.run(Application("no resource").of(role))
def test_get_resource_mapping(self):
    """Check lookups against a mapping with a ``"default"`` and an ``ALL`` entry."""
    default_res = Resource(cpu=1, gpu=2, memMB=128)
    all_res = Resource(cpu=1, gpu=2, memMB=256)
    per_scheduler = {"default": default_res, ALL: all_res}
    container = Container("torch").require(per_scheduler)

    self.assertEqual(2, len(container.resources))
    self.assertEqual(default_res, container.get_resource("default"))
    self.assertEqual(all_res, container.get_resource(ALL))
    # A scheduler without its own entry is expected to get the ALL resource.
    self.assertEqual(all_res, container.get_resource("unknown_scheduler"))
def test_create_container_with_resource(self):
    """Check that two ``require`` calls with different scheduler names both register."""
    default_res = Resource(cpu=1, gpu=2, memMB=128)
    scheduler_res = Resource(cpu=1, gpu=2, memMB=256)

    container = Container("torch")
    container = container.require(default_res, "default")
    container = container.require(scheduler_res, "test_scheduler")

    self.assertEqual(2, len(container.resources))
    self.assertEqual(default_res, container.resources["default"])
    self.assertEqual(scheduler_res, container.resources["test_scheduler"])
def test_validate_invalid_replicas(self):
    """Expect ``ValueError`` for an app whose role asks for zero replicas."""
    session = self.MockSession()

    # Everything from construction to submission is inside the context
    # manager, so the expected error may come from any of these steps.
    with self.assertRaises(ValueError):
        container = Container("torch")
        container = container.require(Resource(cpu=1, gpu=0, memMB=500))
        role = Role("no container").runs("echo", "hello_world")
        role = role.on(container).replicas(0)
        session.run(Application("no container").of(role))
def setUp(self):
    """Create a temp directory with helper scripts, a scheduler and a container."""
    # The first positional parameter of mkdtemp is the directory-name suffix.
    self.test_dir = tempfile.mkdtemp(suffix="LocalSchedulerTest")

    # (file name, script lines) pairs handed to write_shell_script in order.
    scripts = (
        ("touch.sh", ["touch $1"]),
        ("fail.sh", ["exit 1"]),
        ("sleep.sh", ["sleep $1"]),
    )
    for script_name, script_lines in scripts:
        write_shell_script(self.test_dir, script_name, script_lines)

    self.image_fetcher = LocalDirectoryImageFetcher()
    self.scheduler = LocalScheduler(self.image_fetcher)
    # The temp directory itself is used as the container image.
    self.test_container = Container(image=self.test_dir)
def setUp(self):
    """Create a temp directory with helper scripts, a scheduler, config and container."""
    # The first positional parameter of mkdtemp is the directory-name suffix.
    self.test_dir = tempfile.mkdtemp(suffix="StandaloneSessionTest")

    # (file name, script lines) pairs handed to write_shell_script in order.
    scripts = (
        ("touch.sh", ["touch $1"]),
        ("fail.sh", ["exit 1"]),
        ("sleep.sh", ["sleep $1"]),
    )
    for script_name, script_lines in scripts:
        write_shell_script(self.test_dir, script_name, script_lines)

    self.scheduler = LocalScheduler(SESSION_NAME)
    self.cfg = RunConfig({"image_fetcher": "dir"})

    # resource ignored for local scheduler; adding as an example
    image_container = Container(image=self.test_dir)
    self.test_container = image_container.require(resource.SMALL)
def test_json_serialization(self):
    """
    Round-trips an ElasticRole through a plain dict (effectively JSON) and
    rebuilds it as a Role.

    An ElasticRole is a builder utility that makes it easy to create a Role
    whose entrypoint is ``torchelastic.distributed.launch``, so the rebuilt
    Role is expected to compare equal field by field.
    """
    resource = Resource(cpu=1, gpu=0, memMB=512)
    container = Container(image="user_image", resources={"default": resource})
    container = container.ports(tensorboard=8080)

    elastic_role = ElasticRole(
        "test_role",
        nnodes="2:4",
        rdzv_backend="etcd",
        rdzv_id="foobar",
    )
    elastic_role = elastic_role.runs("user_script.py", "--script_arg", "foo")
    elastic_role = elastic_role.on(container).replicas(3)

    # asdict() flattens nested dataclasses into dicts, so the container and
    # its resources have to be rebuilt by hand before constructing the Role.
    role_fields = dataclasses.asdict(elastic_role)
    container_fields = role_fields.pop("container")
    resource_fields = container_fields.pop("resources")
    container_fields["resources"] = {
        sched: Resource(**fields) for sched, fields in resource_fields.items()
    }

    role = Role(**role_fields, container=Container(**container_fields))

    self.assertEqual(container, role.container)
    self.assertEqual(elastic_role.name, role.name)
    self.assertEqual(elastic_role.entrypoint, role.entrypoint)
    self.assertEqual(elastic_role.args, role.args)
    self.assertEqual(dataclasses.asdict(elastic_role), dataclasses.asdict(role))
def setUp(self):
    """Create a temp directory with helper scripts, a scheduler and a container."""
    # The first positional parameter of mkdtemp is the directory-name suffix.
    self.test_dir = tempfile.mkdtemp(suffix="StandaloneSessionTest")

    # (file name, script lines) pairs handed to write_shell_script in order.
    scripts = (
        ("touch.sh", ["touch $1"]),
        ("fail.sh", ["exit 1"]),
        ("sleep.sh", ["sleep $1"]),
    )
    for script_name, script_lines in scripts:
        write_shell_script(self.test_dir, script_name, script_lines)

    self.image_fetcher = LocalDirectoryImageFetcher()
    self.scheduler = LocalScheduler(self.image_fetcher)

    # resource ignored for local scheduler; adding as an example
    image_container = Container(image=self.test_dir)
    self.test_container = image_container.require(Resource.SMALL)
def test_build_role(self):
    """Check every field populated by the fluent ``Role`` builder."""
    container = Container(image="test_image")
    container.ports(foo=8080)

    # Equivalent command line: ENV_VAR_1=FOOBAR /bin/echo hello world
    trainer = Role("trainer").runs(
        "/bin/echo",
        "hello",
        "world",
        ENV_VAR_1="FOOBAR",
    )
    trainer = trainer.on(container).replicas(2)

    self.assertEqual("trainer", trainer.name)
    self.assertEqual("/bin/echo", trainer.entrypoint)
    # Keyword arguments to runs() are expected in env, positionals in args.
    self.assertEqual({"ENV_VAR_1": "FOOBAR"}, trainer.env)
    self.assertEqual(["hello", "world"], trainer.args)
    self.assertEqual(container, trainer.container)
    self.assertEqual(2, trainer.num_replicas)
def setUp(self):
    """Create a temp directory with helper scripts, a scheduler and a container."""
    # The first positional parameter of mkdtemp is the directory-name suffix.
    self.test_dir = tempfile.mkdtemp(suffix="LocalSchedulerTest")

    # (file name, script lines) pairs handed to write_shell_script in order.
    scripts = (
        ("touch.sh", ["touch $1"]),
        ("fail.sh", ["exit 1"]),
        ("sleep.sh", ["sleep $1"]),
        ("echo.sh", ["echo $1"]),
        ("echo_stderr.sh", ["echo $1 1>&2"]),
        (
            "echo_range.sh",
            ["for i in $(seq 0 $1); do echo $i 1>&2; sleep $2; done"],
        ),
    )
    for script_name, script_lines in scripts:
        write_shell_script(self.test_dir, script_name, script_lines)

    self.scheduler = LocalScheduler(session_name="test_session")
    # The temp directory itself is used as the container image.
    self.test_container = Container(image=self.test_dir)
def setUp(self):
    """Create a temp directory with helper scripts, a scheduler and a container."""
    # Prefix the directory with the concrete test class name.
    dir_prefix = f"{self.__class__.__name__}_"
    self.test_dir = tempfile.mkdtemp(prefix=dir_prefix)

    # (file name, script lines) pairs handed to write_shell_script in order.
    scripts = (
        ("touch.sh", ["touch $1"]),
        ("fail.sh", ["exit 1"]),
        ("sleep.sh", ["sleep $1"]),
        ("echo_stdout.sh", ["echo $1"]),
        ("echo_stderr.sh", ["echo $1 1>&2"]),
        (
            "echo_range.sh",
            ["for i in $(seq 0 $1); do echo $i 1>&2; sleep $2; done"],
        ),
        ("echo_env_foo.sh", ["echo $FOO 1>&2"]),
    )
    for script_name, script_lines in scripts:
        write_shell_script(self.test_dir, script_name, script_lines)

    self.scheduler = LocalScheduler(session_name="test_session")
    # The temp directory itself is used as the container image.
    self.test_container = Container(image=self.test_dir)
def test_build_elastic_role(self):
    """Check the entrypoint, args, env and sizing produced by ``ElasticRole``."""
    container = Container(image="test_image")
    container.ports(foo=8080)

    elastic_trainer = ElasticRole(
        "elastic_trainer",
        nnodes="2:4",
        max_restarts=3,
        no_python=True,
    )
    elastic_trainer = elastic_trainer.runs(
        "/bin/echo",
        "hello",
        "world",
        ENV_VAR_1="FOOBAR",
    )
    elastic_trainer = elastic_trainer.on(container).replicas(2)

    # Expected command line:
    #   python -m torchelastic.distributed.launch
    #     --nnodes 2:4
    #     --max_restarts 3
    #     --no_python
    #     --rdzv_backend etcd
    #     --rdzv_id <macros.app_id>
    #     --role elastic_trainer
    #     /bin/echo hello world
    expected_args = [
        "-m",
        "torchelastic.distributed.launch",
        "--nnodes",
        "2:4",
        "--max_restarts",
        "3",
        "--no_python",
        "--rdzv_backend",
        "etcd",
        "--rdzv_id",
        macros.app_id,
        "--role",
        "elastic_trainer",
        "/bin/echo",
        "hello",
        "world",
    ]

    self.assertEqual("elastic_trainer", elastic_trainer.name)
    self.assertEqual("python", elastic_trainer.entrypoint)
    self.assertEqual(expected_args, elastic_trainer.args)
    self.assertEqual({"ENV_VAR_1": "FOOBAR"}, elastic_trainer.env)
    self.assertEqual(container, elastic_trainer.container)
    self.assertEqual(2, elastic_trainer.num_replicas)
def test_build_role(self):
    """Check the ``Role`` builder fields, including retry policy and deployment."""
    container = Container(image="test_image")
    container.ports(foo=8080)

    # Equivalent command line: ENV_VAR_1=FOOBAR /bin/echo hello world
    trainer = Role("trainer").runs(
        "/bin/echo",
        "hello",
        "world",
        ENV_VAR_1="FOOBAR",
    )
    trainer = trainer.on(container).replicas(2)
    trainer = trainer.with_retry_policy(RetryPolicy.REPLICA, max_retries=5)
    trainer = trainer.with_deployment_preference(DeploymentPreference.SERVICE)

    self.assertEqual("trainer", trainer.name)
    self.assertEqual("/bin/echo", trainer.entrypoint)
    # Keyword arguments to runs() are expected in env, positionals in args.
    self.assertEqual({"ENV_VAR_1": "FOOBAR"}, trainer.env)
    self.assertEqual(["hello", "world"], trainer.args)
    self.assertEqual(container, trainer.container)
    self.assertEqual(2, trainer.num_replicas)
    self.assertEqual(5, trainer.max_retries)
    self.assertEqual(RetryPolicy.REPLICA, trainer.retry_policy)
    self.assertEqual(
        DeploymentPreference.SERVICE,
        trainer.deployment_preference,
    )
def test_create_container_with_resource(self):
    """Expect the second of two ``require`` calls to define ``resources``."""
    first_res = Resource(cpu=1, gpu=2, memMB=128)
    second_res = Resource(cpu=1, gpu=2, memMB=256)

    container = Container("torch")
    container = container.require(first_res)
    container = container.require(second_res)

    self.assertEqual(second_res, container.resources)
def test_get_resource_specific(self):
    """Check that a resource required for one scheduler is only seen by it."""
    foobar_res = Resource(cpu=1, gpu=2, memMB=128)
    container = Container("torch").require(foobar_res, scheduler="foobar")

    matched = container.get_resource("foobar")
    self.assertEqual(foobar_res, matched)

    # Any other scheduler name is expected to get the null resource.
    unmatched = container.get_resource("any_scheduler")
    self.assertEqual(NULL_RESOURCE, unmatched)
def test_get_resource_all(self):
    """Expect a resource required without a scheduler to match any scheduler."""
    shared_res = Resource(cpu=1, gpu=2, memMB=128)
    container = Container("torch").require(shared_res)

    looked_up = container.get_resource("any_scheduler")
    self.assertEqual(shared_res, looked_up)
def test_create_container_no_backend(self):
    """Expect a resource required without a scheduler to be stored under ``ALL``."""
    only_res = Resource(cpu=1, gpu=2, memMB=128)
    container = Container("torch").require(only_res)

    self.assertEqual(1, len(container.resources))
    self.assertEqual(only_res, container.resources[ALL])