Beispiel #1
0
    def test_launch_elastic(self):
        """Run one elastic agent with ``--nnodes=1:2`` and four workers.

        Only a single agent is started even though up to two nodes are
        allowed, so the expected world size equals the per-node process count.
        """
        nproc_per_node = 4
        min_nodes, max_nodes = 1, 2
        run_id = str(uuid.uuid4().int)

        launcher_opts = [
            f"--nnodes={min_nodes}:{max_nodes}",
            f"--nproc_per_node={nproc_per_node}",
            "--rdzv_backend=etcd",
            f"--rdzv_endpoint={self._etcd_endpoint}",
            f"--rdzv_id={run_id}",
            "--monitor_interval=1",
            "--start_method=fork",
        ]
        script_and_opts = [
            path("bin/test_script.py"),
            f"--touch_file_dir={self.test_dir}",
        ]
        launch.main(launcher_opts + script_and_opts)

        # Every worker is expected to touch a file named after its global
        # rank, so the directory listing must match the set of ranks exactly.
        expected_ranks = set(map(str, range(nproc_per_node)))
        self.assertSetEqual(expected_ranks, set(os.listdir(self.test_dir)))
Beispiel #2
0
    def test_launch_elastic_multiple_agents(self):
        """Run two agents against the same rendezvous and check all ranks ran.

        One agent runs in this process; the remaining ``nnodes - 1`` agents
        run in separate ``mp.Process`` instances using the same arguments.
        """
        nnodes = 2
        nproc_per_node = 4
        min_nodes, max_nodes = 1, 2
        run_id = str(uuid.uuid4().int)
        args = [
            f"--nnodes={min_nodes}:{max_nodes}",
            f"--nproc_per_node={nproc_per_node}",
            "--rdzv_backend=etcd",
            f"--rdzv_endpoint={self._etcd_endpoint}",
            f"--rdzv_id={run_id}",
            "--monitor_interval=1",
            "--start_method=fork",
            path("bin/test_script.py"),
            f"--touch_file_dir={self.test_dir}",
        ]

        # Start the peer agents first, then run the last agent inline.
        peer_agents = [
            mp.Process(target=launch.main, args=[args])
            for _ in range(nnodes - 1)
        ]
        for agent in peer_agents:
            agent.start()
        launch.main(args)

        # Every peer agent must have terminated cleanly.
        for agent in peer_agents:
            agent.join()
            self.assertEqual(0, agent.exitcode)

        # Every worker is expected to touch a file named after its global rank.
        expected_ranks = {str(rank) for rank in range(nnodes * nproc_per_node)}
        touched = set(os.listdir(self.test_dir))
        self.assertSetEqual(expected_ranks, touched)
Beispiel #3
0
    def test_launch_elastic_agent_raise_exception(self, record_mock,
                                                  mock_agent_run):
        """
        Verify that an exception raised by the (mocked) agent run propagates
        out of the launcher unchanged, and that ``record_mock`` is invoked
        exactly once.
        """
        min_nodes, max_nodes = 1, 2
        nproc_per_node = 4
        run_id = str(uuid.uuid4().int)

        launcher_opts = [
            f"--nnodes={min_nodes}:{max_nodes}",
            f"--nproc_per_node={nproc_per_node}",
            "--rdzv_backend=etcd",
            f"--rdzv_endpoint={self._etcd_endpoint}",
            f"--rdzv_id={run_id}",
            "--monitor_interval=1",
            "--max_restarts=0",
            "--start_method=fork",
        ]
        script_and_opts = [
            path("bin/test_script.py"),
            f"--touch_file_dir={self.test_dir}",
        ]
        args = launcher_opts + script_and_opts

        # Make the mocked agent run fail with a recognisable exception type.
        mock_agent_run.side_effect = MockException
        with self.assertRaises(MockException):
            launch.main(args)

        record_mock.assert_called_once()
Beispiel #4
0
    def test_launch_elastic_worker_raise_exception(self, record_mock):
        """
        Verify that when the worker program fails (the script is run with
        ``--fail``), the launcher raises ``ChildFailedError`` and
        ``record_mock`` is invoked exactly once.
        """
        min_nodes, max_nodes = 1, 2
        nproc_per_node = 4
        run_id = str(uuid.uuid4().int)

        launcher_opts = [
            f"--nnodes={min_nodes}:{max_nodes}",
            f"--nproc_per_node={nproc_per_node}",
            "--rdzv_backend=etcd",
            f"--rdzv_endpoint={self._etcd_endpoint}",
            f"--rdzv_id={run_id}",
            "--monitor_interval=1",
            "--max_restarts=0",
            "--start_method=fork",
        ]
        failing_script = [path("bin/test_script.py"), "--fail"]
        args = launcher_opts + failing_script

        with self.assertRaises(ChildFailedError):
            launch.main(args)

        record_mock.assert_called_once()
Beispiel #5
0
    def test_launch_with_env_vars(self):
        """Configure the launcher purely through ``PET_*`` environment variables.

        First checks that combining ``PET_NO_PYTHON`` with ``PET_MODULE=1``
        is rejected with ``ValueError``, then runs the bash test script with
        ``PET_MODULE=0`` and verifies that every worker ran.

        All environment changes are scoped with ``mock.patch.dict`` so the
        ``PET_*`` settings do not leak into other tests that run in the same
        process.
        """
        # Imported locally so this method does not depend on how (or whether)
        # the surrounding module imports unittest.mock.
        from unittest import mock

        run_id = str(uuid.uuid4().int)
        nnodes = 1
        nproc_per_node = 4
        world_size = nnodes * nproc_per_node

        launcher_env = {
            "PET_NNODES": str(nnodes),
            "PET_NPROC_PER_NODE": str(nproc_per_node),
            "PET_RDZV_BACKEND": "etcd",
            "PET_RDZV_ENDPOINT": self._etcd_endpoint,
            "PET_RDZV_ID": run_id,
            "PET_MONITOR_INTERVAL": "1",
            "PET_START_METHOD": "fork",
            "PET_NO_PYTHON": "1",
        }

        # patch.dict restores os.environ to its prior state on exit, including
        # keys (such as PET_MODULE) that are added inside the block.
        with mock.patch.dict(os.environ, launcher_env):
            script_args = [path("bin/test_script.sh"), f"{self.test_dir}"]

            with self.assertRaises(ValueError):
                # --no_python cannot be used with --module
                os.environ["PET_MODULE"] = "1"
                launch.main(script_args)

            os.environ["PET_MODULE"] = "0"
            launch.main(script_args)

        # make sure all the workers ran
        # each worker touches a file with its global rank as the name
        self.assertSetEqual({str(i)
                             for i in range(world_size)},
                            set(os.listdir(self.test_dir)))
Beispiel #6
0
    def _test_nproc_launch_configuration(self, nproc_type, expected_number):
        """Launch the bash test script with ``--nproc_per_node={nproc_type}``.

        Args:
            nproc_type: value passed verbatim to ``--nproc_per_node``.
            expected_number: number of workers per node expected to result
                from ``nproc_type``.
        """
        nnodes = 1
        run_id = str(uuid.uuid4().int)

        launcher_opts = [
            f"--nnodes={nnodes}",
            f"--nproc_per_node={nproc_type}",
            "--rdzv_backend=etcd",
            f"--rdzv_endpoint={self._etcd_endpoint}",
            f"--rdzv_id={run_id}",
            "--monitor_interval=1",
            "--start_method=fork",
            "--no_python",
        ]
        bash_script = [path("bin/test_script.sh"), f"{self.test_dir}"]
        launch.main(launcher_opts + bash_script)

        # Every worker is expected to touch a file named after its global rank.
        expected_ranks = set(map(str, range(nnodes * expected_number)))
        touched = set(os.listdir(self.test_dir))
        self.assertSetEqual(expected_ranks, touched)
Beispiel #7
0
    def test_launch_user_script_bash(self):
        """Launch a bash script with ``--no_python`` and check all ranks ran.

        Also checks that adding ``--module`` on top of ``--no_python`` is
        rejected with ``ValueError``.
        """
        nnodes, nproc_per_node = 1, 4
        run_id = str(uuid.uuid4().int)

        launcher_opts = [
            f"--nnodes={nnodes}",
            f"--nproc_per_node={nproc_per_node}",
            "--rdzv_backend=etcd",
            f"--rdzv_endpoint={self._etcd_endpoint}",
            f"--rdzv_id={run_id}",
            "--monitor_interval=1",
            "--start_method=fork",
            "--no_python",
        ]
        bash_script = [path("bin/test_script.sh"), f"{self.test_dir}"]

        # --no_python and --module are mutually exclusive.
        conflicting = launcher_opts + ["--module"] + bash_script
        with self.assertRaises(ValueError):
            launch.main(conflicting)

        launch.main(launcher_opts + bash_script)

        # Every worker is expected to touch a file named after its global rank.
        expected_ranks = {str(rank) for rank in range(nnodes * nproc_per_node)}
        self.assertSetEqual(expected_ranks, set(os.listdir(self.test_dir)))
Beispiel #8
0
    def test_launch_user_script_python_caffe2_bc(self):
        """Launch using ``--master_addr``/``--master_port``/``--node_rank``.

        No rendezvous backend options are passed; the master port is taken
        from a socket obtained via ``get_socket_with_port()``.
        """
        nnodes, nproc_per_node = 1, 4

        # The socket is only used to discover a port number; it is closed
        # before the launcher runs.
        with closing(get_socket_with_port()) as sock:
            master_port = sock.getsockname()[1]

        launcher_opts = [
            f"--nnodes={nnodes}",
            f"--nproc_per_node={nproc_per_node}",
            "--monitor_interval=1",
            "--start_method=fork",
            "--master_addr=localhost",
            f"--master_port={master_port}",
            "--node_rank=0",
        ]
        script_and_opts = [
            path("bin/test_script.py"),
            f"--touch_file_dir={self.test_dir}",
        ]
        launch.main(launcher_opts + script_and_opts)

        # Every worker is expected to touch a file named after its global rank.
        expected_ranks = set(map(str, range(nnodes * nproc_per_node)))
        touched = set(os.listdir(self.test_dir))
        self.assertSetEqual(expected_ranks, touched)
Beispiel #9
0
 def test_init_method_env_with_torchelastic(self):
     """Run the init-method test script with ``--init_method=env://``.

     The test passes if the launcher returns without raising.
     """
     master_port = get_free_port()
     launcher_opts = [
         "--run_path",
         "--nnodes=1",
         "--nproc_per_node=4",
         "--master_addr=localhost",
         f"--master_port={master_port}",
         "--monitor_interval=1",
     ]
     script_and_opts = [
         path("bin/test_script_init_method.py"),
         "--init_method=env://",
     ]
     launch.main(launcher_opts + script_and_opts)
Beispiel #10
0
    def test_is_torchelastic_launched(self):
        """Check that a launched script reports it was started by torchelastic.

        The test script writes its result to ``out_file``; the first line of
        that file must be exactly ``"True"``.
        """
        out_file = os.path.join(self.test_dir, "out")

        args = [
            "--run_path",
            "--nnodes=1",
            "--nproc_per_node=1",
            "--monitor_interval=1",
            path("bin/test_script_is_torchelastic_launched.py"),
            f"--out_file={out_file}",
        ]
        launch.main(args)

        with open(out_file, "r") as fp:
            reported = fp.readline()
        self.assertEqual("True", reported)
Beispiel #11
0
    def test_launch_run_path(self):
        """Launch the python test script with ``--run_path``.

        Verifies that one file per global rank shows up in ``self.test_dir``.
        """
        nnodes, nproc_per_node = 1, 4

        launcher_opts = [
            "--run_path",
            f"--nnodes={nnodes}",
            f"--nproc_per_node={nproc_per_node}",
            "--monitor_interval=1",
            "--start_method=fork",
        ]
        script_and_opts = [
            path("bin/test_script.py"),
            f"--touch_file_dir={self.test_dir}",
        ]
        launch.main(launcher_opts + script_and_opts)

        # Every worker is expected to touch a file named after its global rank.
        expected_ranks = set(map(str, range(nnodes * nproc_per_node)))
        self.assertSetEqual(expected_ranks, set(os.listdir(self.test_dir)))
Beispiel #12
0
 def test_launch_shutdown(self, agent_mock_cls):
     """Check that the rendezvous handler is shut down after a launch.

     The agent class is mocked so that ``run()`` reports
     ``WorkerState.SUCCEEDED``, and ``get_rendezvous_handler`` is patched to
     hand back a mock whose ``shutdown`` call count is asserted.
     """
     nnodes, nproc_per_node = 1, 4
     args = [
         f"--nnodes={nnodes}",
         f"--nproc_per_node={nproc_per_node}",
         "--monitor_interval=1",
         "--start_method=fork",
         path("bin/test_script.py"),
         f"--touch_file_dir={self.test_dir}",
     ]

     # Agent stub: instantiating the mocked class yields an object whose
     # run() returns a successful result.
     fake_agent = Mock()
     fake_agent.run.return_value = RunResult(WorkerState.SUCCEEDED)
     agent_mock_cls.return_value = fake_agent

     rdzv_handler_mock = Mock()
     handler_factory = (
         "torch.distributed.elastic.rendezvous.registry.get_rendezvous_handler"
     )
     with patch(handler_factory, return_value=rdzv_handler_mock):
         launch.main(args)
         rdzv_handler_mock.shutdown.assert_called_once()
Beispiel #13
0
    def _test_launch_user_script_python(self):
        """Launch the python test script using the ``spawn`` start method.

        Uses the etcd rendezvous backend with a fixed single node and checks
        that one file per global rank shows up in ``self.test_dir``.
        """
        nnodes, nproc_per_node = 1, 4
        run_id = str(uuid.uuid4().int)

        launcher_opts = [
            f"--nnodes={nnodes}",
            f"--nproc_per_node={nproc_per_node}",
            "--rdzv_backend=etcd",
            f"--rdzv_endpoint={self._etcd_endpoint}",
            f"--rdzv_id={run_id}",
            "--monitor_interval=1",
            "--start_method=spawn",
        ]
        script_and_opts = [
            path("bin/test_script.py"),
            f"--touch_file_dir={self.test_dir}",
        ]
        launch.main(launcher_opts + script_and_opts)

        # Every worker is expected to touch a file named after its global rank.
        expected_ranks = {str(rank) for rank in range(nnodes * nproc_per_node)}
        touched = set(os.listdir(self.test_dir))
        self.assertSetEqual(expected_ranks, touched)
Beispiel #14
0
def launch_in_proc(args):
    """Invoke ``launch.main`` with ``args``.

    NOTE(review): presumably exists as a module-level function so it can be
    used as a subprocess target -- confirm against the callers.
    """
    launch.main(args)