def run(args):
    """Launch the entrypoint described by ``args`` through ``elastic_launch``.

    When ``args.standalone`` is set, a local ``EtcdServer`` is started and
    ``args.rdzv_backend``, ``args.rdzv_endpoint`` and ``args.rdzv_id`` are
    overwritten so that the launch rendezvouses on that server. The server
    is stopped when this function exits, whether it returns normally or
    raises.

    Args:
        args: Parsed launcher arguments. Mutated in place in standalone mode.
    """
    # A sentinel (rather than re-reading args.standalone in ``finally``) ties
    # the cleanup to whether a server was actually started.
    etcd_server = None
    if args.standalone:
        etcd_server = EtcdServer()
        etcd_server.start()

    # Everything that runs after a successful start() lives under the
    # try/finally, so the server is also stopped when building the launch
    # config fails, not only when the launch itself fails.
    try:
        if etcd_server is not None:
            args.rdzv_backend = "etcd"
            args.rdzv_endpoint = etcd_server.get_endpoint()
            args.rdzv_id = str(uuid.uuid4())
            log.info(
                f"\n**************************************\n"
                f"Rendezvous info:\n"
                f"--rdzv_backend={args.rdzv_backend} "
                f"--rdzv_endpoint={args.rdzv_endpoint} "
                f"--rdzv_id={args.rdzv_id}\n"
                f"**************************************\n"
            )

        config, cmd = config_from_args(args)

        # cmd[0] is the entrypoint; the remaining items are its arguments.
        elastic_launch(
            config=config,
            entrypoint=cmd[0],
        )(*cmd[1:])
    finally:
        if etcd_server is not None:
            etcd_server.stop()
Beispiel #2
0
 def elastic_launch_wrapper():
     """Run the Python test script through ``elastic_launch``.

     The launch is wrapped in a plain function so that multiprocessing
     reports the correct exit code.
     """
     launch_config = self.get_test_launch_config(
         min_nodes, max_nodes, nproc_per_node, run_id
     )
     launcher = elastic_launch(launch_config, sys.executable)
     launcher("-u", path("bin/test_script.py"), f"--touch_file_dir={self.test_dir}")
Beispiel #3
0
    def test_launch_elastic(self):
        """Launch the test script with 1-2 nodes and check that all workers ran."""
        workers_per_node = 4

        launch_config = self.get_test_launch_config(1, 2, workers_per_node)
        launcher = elastic_launch(launch_config, sys.executable)
        launcher("-u", path("bin/test_script.py"), f"--touch_file_dir={self.test_dir}")

        # The expected world size is the per-node worker count.
        self.check_works_ran(workers_per_node)
Beispiel #4
0
    def test_launch_script_bash(self):
        """Launch the bash test script on one node and check that all workers ran."""
        node_count, workers_per_node = 1, 4

        launch_config = self.get_test_launch_config(
            node_count, node_count, workers_per_node
        )
        launcher = elastic_launch(launch_config, path("bin/test_script.sh"))
        launcher(f"{self.test_dir}")

        self.check_works_ran(node_count * workers_per_node)
Beispiel #5
0
 def test_launch_elastic_agent_raise_exception(self, record_mock, mock_agent_run):
     """
     Checks that an exception raised by the agent propagates out of the
     launcher unchanged, and that ``record_mock`` is called exactly once.
     """
     mock_agent_run.side_effect = MockException

     # Config creation and launcher construction stay inside the context
     # manager so the same set of calls is covered by assertRaises.
     with self.assertRaises(MockException):
         launch_config = self.get_test_launch_config(1, 2, 4)
         launcher = elastic_launch(launch_config, sys.executable)
         launcher("-u", path("bin/test_script.py"), f"--touch_file_dir={self.test_dir}")

     record_mock.assert_called_once()
Beispiel #6
0
    def test_launch_script_python_local_rank_transfer(self):
        """Launch the Python test script on one node and check that all workers ran."""
        node_count, workers_per_node = 1, 4

        launch_config = self.get_test_launch_config(
            node_count, node_count, workers_per_node
        )
        launcher = elastic_launch(launch_config, sys.executable)
        launcher("-u", path("bin/test_script.py"), f"--touch_file_dir={self.test_dir}")

        # Every worker touches a file named after its global rank, so the
        # check below confirms that each of them actually ran.
        self.check_works_ran(node_count * workers_per_node)
Beispiel #7
0
def elastic_launch_wrapper(
    test_dir: str,
    rdzv_endpoint: str,
    min_nodes: int,
    max_nodes: int,
    nproc_per_node: int,
    run_id: str,
):
    """Run the Python test script through ``elastic_launch``.

    The launch is wrapped in a plain function so that multiprocessing
    reports the correct exit code.

    Args:
        test_dir: Directory passed to the script as ``--touch_file_dir``.
        rdzv_endpoint: Rendezvous endpoint forwarded to the launch config.
        min_nodes: Minimum node count forwarded to the launch config.
        max_nodes: Maximum node count forwarded to the launch config.
        nproc_per_node: Per-node process count forwarded to the launch config.
        run_id: Run identifier forwarded to the launch config.
    """
    launch_config = get_test_launch_config(
        rdzv_endpoint, min_nodes, max_nodes, nproc_per_node, run_id
    )
    launcher = elastic_launch(launch_config, sys.executable)
    launcher("-u", path("bin/test_script.py"), f"--touch_file_dir={test_dir}")
Beispiel #8
0
    def test_launch_elastic_worker_raise_exception(self, record_mock):
        """
        Asserts that when the worker program fails, the launcher raises an
        exception to indicate that a worker process failed.
        """
        workers_per_node = 4

        # Config creation and launcher construction stay inside the context
        # manager so the same set of calls is covered by assertRaises.
        with self.assertRaises(ChildFailedError):
            launch_config = self.get_test_launch_config(1, 2, workers_per_node)
            launcher = elastic_launch(launch_config, sys.executable)
            launcher("-u", path("bin/test_script.py"), "--fail")

        record_mock.assert_called_once()
Beispiel #9
0
    def test_launch_shutdown(self, agent_mock_cls):
        """Check that the rendezvous handler is shut down once after a successful run."""
        # The mocked agent class hands out an agent whose run() reports success.
        succeeding_agent = Mock()
        succeeding_agent.run.return_value = RunResult(WorkerState.SUCCEEDED)
        agent_mock_cls.return_value = succeeding_agent

        rdzv_handler_mock = Mock()
        handler_factory_target = (
            "torch.distributed.elastic.rendezvous.registry.get_rendezvous_handler"
        )
        with patch(handler_factory_target) as handler_factory_mock:
            handler_factory_mock.return_value = rdzv_handler_mock

            launch_config = self.get_test_launch_config(1, 1, 4)
            launcher = elastic_launch(launch_config, sys.executable)
            launcher("-u", path("bin/test_script.py"), f"--touch_file_dir={self.test_dir}")

            rdzv_handler_mock.shutdown.assert_called_once()
Beispiel #10
0
    def test_launch_dist_sum_with_static_rdzv(self):
        """Run ``_dist_sum`` on one node with the ``static`` rendezvous backend.

        Each worker is expected to return the sum of ``range(nproc_per_node)``.
        """
        node_count, workers_per_node = 1, 4

        # Read the port number from the socket, then close the socket.
        with closing(get_socket_with_port()) as port_holder:
            free_port = port_holder.getsockname()[1]

        launch_config = get_test_launch_config(
            f"127.0.0.1:{free_port}",
            node_count,
            node_count,
            workers_per_node,
            rdzv_backend="static",
            config={"rank": 0},
        )
        results = elastic_launch(launch_config, _dist_sum)()

        expected = [sum(range(workers_per_node))] * workers_per_node
        self.assertEqual(expected, sorted(results.values()))
Beispiel #11
0
def run(args):
    """Launch the command described by ``args`` through ``elastic_launch``.

    In standalone mode the rendezvous settings on ``args`` are overwritten
    with the ``c10d`` backend at ``localhost:29400`` and a fresh random id,
    and the resulting settings are logged.

    Args:
        args: Parsed launcher arguments. Mutated in place in standalone mode.
    """
    if args.standalone:
        args.rdzv_backend = "c10d"
        args.rdzv_endpoint = "localhost:29400"
        args.rdzv_id = str(uuid.uuid4())
        rendezvous_banner = (
            f"\n**************************************\n"
            f"Rendezvous info:\n"
            f"--rdzv_backend={args.rdzv_backend} "
            f"--rdzv_endpoint={args.rdzv_endpoint} "
            f"--rdzv_id={args.rdzv_id}\n"
            f"**************************************\n"
        )
        log.info(rendezvous_banner)

    config, entrypoint, entrypoint_args = config_from_args(args)
    launcher = elastic_launch(config=config, entrypoint=entrypoint)
    launcher(*entrypoint_args)
Beispiel #12
0
def launch(
        fn,
        n_gpu_per_machine,
        n_machine=1,
        machine_rank=0,
        dist_url=None,
        launch_config=None,
        args=(),
):
    """Run ``fn`` directly, via ``elastic_launch``, or via ``mp.spawn``.

    The world size is ``n_machine * n_gpu_per_machine``. When it is not
    greater than 1, ``fn(*args)`` is called in the current process.
    Otherwise ``OMP_NUM_THREADS`` is set to ``"1"`` if it is not already
    set, and the work is handed to ``elastic_launch`` (when
    ``launch_config`` is given) or to ``mp.spawn``.

    Args:
        fn: Callable to run.
        n_gpu_per_machine: Number of processes spawned on this machine.
        n_machine: Number of machines taking part.
        machine_rank: Rank of this machine, forwarded to the workers.
        dist_url: Init URL forwarded to the workers. ``"auto"`` picks a free
            local TCP port and is only accepted for single-machine jobs.
        launch_config: If not ``None``, the config passed to
            ``elastic_launch``; the ``mp.spawn`` path is skipped.
        args: Positional arguments for ``fn``.

    Raises:
        ValueError: If ``dist_url`` is ``"auto"`` in a multi-machine job, or
            starts with ``file://`` in a multi-machine job.
    """
    world_size = n_machine * n_gpu_per_machine

    # Single-process case: no distributed setup, just call the function.
    if not world_size > 1:
        fn(*args)
        return

    os.environ.setdefault("OMP_NUM_THREADS", "1")

    if launch_config is not None:
        launcher = elastic_launch(config=launch_config, entrypoint=elastic_worker)
        launcher(fn, args)
        return

    if dist_url == "auto":
        if n_machine != 1:
            raise ValueError(
                'dist_url="auto" not supported in multi-machine jobs')
        free_port = find_free_port()
        dist_url = f"tcp://127.0.0.1:{free_port}"

    if n_machine > 1 and dist_url.startswith("file://"):
        raise ValueError(
            "file:// is not a reliable init method in multi-machine jobs. Prefer tcp://"
        )

    worker_args = (
        fn, world_size, n_gpu_per_machine, machine_rank, dist_url, args,
    )
    mp.spawn(
        distributed_worker,
        nprocs=n_gpu_per_machine,
        args=worker_args,
        daemon=False,
    )
Beispiel #13
0
    def test_launch_function(self):
        """Launch ``simple_rank_scale`` on one node and check the per-worker results."""
        node_count, workers_per_node = 1, 4

        launch_config = self.get_test_launch_config(
            node_count, node_count, workers_per_node
        )
        results = elastic_launch(launch_config, simple_rank_scale)()

        self.assertEqual([10, 11, 12, 13], sorted(results.values()))