Beispiel #1
0
    def test_get_or_default(self):
        """``get`` yields the stored value for a known key, else the given fallback."""
        rdzv_kwargs = dict(
            backend="foobar",
            endpoint="localhost",
            run_id="1234",
            min_nodes=1,
            max_nodes=1,
            timeout1=10,
        )
        params = RendezvousParameters(**rdzv_kwargs)

        # "timeout1" was supplied, so the fallback (20) must be ignored.
        stored = params.get("timeout1", 20)
        self.assertEqual(10, stored)

        # "timeout2" was never supplied, so the fallback (60) comes back.
        fallback = params.get("timeout2", 60)
        self.assertEqual(60, fallback)
Beispiel #2
0
    def _get_worker_spec(
        self,
        max_restarts=1,
        monitor_interval=1.0,
        role="test_trainer",
        local_world_size=8,
    ):
        """Build a single-node ``WorkerSpec`` that rendezvouses via the test's etcd server.

        Each call generates a fresh random run id. The worker function is
        ``do_nothing`` and takes no arguments.
        """
        job_id = str(uuid.uuid4().int)
        etcd_endpoint = self._etcd_server.get_endpoint()

        params = RendezvousParameters(
            backend="etcd",
            endpoint=etcd_endpoint,
            run_id=job_id,
            min_nodes=1,
            max_nodes=1,
        )
        handler = rdzv_registry.get_rendezvous_handler(params)

        return WorkerSpec(
            role=role,
            local_world_size=local_world_size,
            fn=do_nothing,
            args=(),
            rdzv_handler=handler,
            max_restarts=max_restarts,
            monitor_interval=monitor_interval,
        )
Beispiel #3
0
    def _get_worker_spec(
        self,
        fn=None,
        cmd=None,
        args=(),
        max_restarts=1,
        num_agents=1,
        monitor_interval=0.1,
        local_world_size=8,
    ):
        """Build a ``test_trainer`` ``WorkerSpec`` for exactly ``num_agents`` nodes.

        The rendezvous goes through the test's etcd server under a fresh random
        run id; ``fn``, ``cmd`` and ``args`` are forwarded to the spec unchanged.
        """
        job_id = str(uuid.uuid4().int)

        params = RendezvousParameters(
            backend="etcd",
            endpoint=f"{self._etcd_server.get_endpoint()}",
            run_id=job_id,
            # min == max: the rendezvous is for a fixed number of agents.
            min_nodes=num_agents,
            max_nodes=num_agents,
        )
        handler = rdzv_registry.get_rendezvous_handler(params)

        return WorkerSpec(
            role="test_trainer",
            local_world_size=local_world_size,
            fn=fn,
            cmd=cmd,
            args=args,
            rdzv_handler=handler,
            max_restarts=max_restarts,
            monitor_interval=monitor_interval,
        )
        def run_agent(run_id,
                      etcd_host,
                      etcd_port,
                      start_method,
                      worker_fn,
                      worker_args=()):
            """Run one ``LocalElasticAgent`` that joins a fixed two-node etcd rendezvous.

            The agent drives a single local worker executing
            ``worker_fn(*worker_args)``; the result of ``agent.run()`` is discarded.
            """
            params = RendezvousParameters(
                backend="etcd",
                endpoint=f"{etcd_host}:{etcd_port}",
                run_id=run_id,
                min_nodes=2,
                max_nodes=2,
            )
            handler = rdzv_registry.get_rendezvous_handler(params)

            worker_spec = WorkerSpec(
                role="test_trainer",
                local_world_size=1,
                fn=worker_fn,
                args=worker_args,
                rdzv_handler=handler,
                max_restarts=3,
                monitor_interval=1,
            )

            LocalElasticAgent(worker_spec, start_method).run()
 def get_worker_spec(
     self,
     node_config: Conf,
     min_nodes=1,
     max_nodes=1,
     max_restarts=0,
     monitor_interval=0.01,
 ):
     """Translate ``node_config`` into a ``WorkerSpec`` using the test's etcd rendezvous.

     Role, local world size, entrypoint, args, redirects and tee are copied from
     ``node_config``; the rendezvous uses ``self._run_id`` and the given node bounds.
     """
     params = RendezvousParameters(
         backend="etcd",
         endpoint=self._etcd_server.get_endpoint(),
         run_id=self._run_id,
         min_nodes=min_nodes,
         max_nodes=max_nodes,
     )
     handler = rdzv_registry.get_rendezvous_handler(params)

     worker_spec = WorkerSpec(
         role=node_config.role,
         local_world_size=node_config.local_world_size,
         entrypoint=node_config.entrypoint,
         args=node_config.args,
         rdzv_handler=handler,
         max_restarts=max_restarts,
         monitor_interval=monitor_interval,
         redirects=node_config.redirects,
         tee=node_config.tee,
     )
     return worker_spec
Beispiel #6
0
    def test_no_factory_method_found(self):
        """Creating a handler for a backend that was never registered raises ``ValueError``."""
        empty_factory = RendezvousHandlerFactory()
        params = RendezvousParameters(
            backend="mock",
            endpoint="",
            run_id="foobar",
            min_nodes=1,
            max_nodes=2,
        )

        # Nothing was registered under "mock", so the lookup must fail.
        self.assertRaises(ValueError, empty_factory.create_rdzv_handler, params)
Beispiel #7
0
    def test_create_rdzv_handler(self):
        """A factory registered under ``"mock"`` is used to build the handler."""
        rdzv_params = RendezvousParameters(
            backend="mock", endpoint="", run_id="foobar", min_nodes=1, max_nodes=2
        )

        factory = RendezvousHandlerFactory()
        factory.register("mock", create_mock_rdzv_handler)
        mock_rdzv_handler = factory.create_rdzv_handler(rdzv_params)

        # assertIsInstance reports the actual type on failure, unlike
        # assertTrue(isinstance(...)), which only says "False is not true".
        self.assertIsInstance(mock_rdzv_handler, MockRendezvousHandler)
Beispiel #8
0
 def _create_params(self) -> RendezvousParameters:
     """Assemble ``RendezvousParameters`` from the fixture's attributes.

     Entries in ``self._kwargs`` are passed through as additional keyword arguments.
     """
     params = RendezvousParameters(
         backend=self._backend_name,
         endpoint=self._endpoint,
         run_id=self._run_id,
         min_nodes=self._min_num_nodes,
         max_nodes=self._max_num_nodes,
         **self._kwargs,
     )
     return params
    def setUp(self) -> None:
        """Create parameters with a mixed-case protocol and a string read timeout.

        Also stores the protocol (``"http"``) and read timeout (``10``) the tests
        expect -- presumably the normalized forms of the raw values given here.
        """
        server_endpoint = self._server.get_endpoint()
        self._params = RendezvousParameters(
            backend="dummy_backend",
            endpoint=server_endpoint,
            run_id="dummy_run_id",
            min_nodes=1,
            max_nodes=1,
            protocol="hTTp",
            read_timeout="10",
        )

        self._expected_protocol, self._expected_read_timeout = "http", 10
 def test_etcd_rdzv_basic_params(self):
     """
     The handler can be created when only the required
     rendezvous parameters are given.
     """
     etcd_endpoint = f"{self._etcd_server.get_endpoint()}"
     job_id = f"{uuid.uuid4()}"
     params = RendezvousParameters(
         backend="etcd",
         endpoint=etcd_endpoint,
         run_id=job_id,
         min_nodes=1,
         max_nodes=1,
     )

     handler = create_rdzv_handler(params)
     self.assertIsNotNone(handler)
    def setUp(self) -> None:
        """Create parameters carrying raw string options for a store-based backend.

        Also stores the host, port, store type and read timeout the tests expect --
        presumably what the raw endpoint and option strings should be parsed into.
        """
        raw_options = dict(
            is_host="true",
            store_type="tCp",
            read_timeout="10",
        )
        self._params = RendezvousParameters(
            backend="dummy_backend",
            endpoint="127.0.0.1:29400",
            run_id="dummy_run_id",
            min_nodes=1,
            max_nodes=1,
            **raw_options,
        )

        self._expected_endpoint_host, self._expected_endpoint_port = "127.0.0.1", 29400
        self._expected_store_type = TCPStore
        self._expected_read_timeout = timedelta(seconds=10)
    def test_get_backend(self):
        """A handler built from etcd parameters reports ``"etcd"`` as its backend."""
        job_id = str(uuid.uuid4())
        etcd_endpoint = f"{self._etcd_server.get_endpoint()}"
        params = RendezvousParameters(
            backend="etcd",
            endpoint=etcd_endpoint,
            run_id=job_id,
            min_nodes=1,
            max_nodes=1,
            timeout=60,
            last_call_timeout=30,
            protocol="http",
        )

        handler = create_rdzv_handler(params)
        backend_name = handler.get_backend()

        self.assertEqual("etcd", backend_name)
Beispiel #13
0
    def setUp(self) -> None:
        """Create a dummy backend and parameters with string-valued port and timeouts.

        Also stores the store port (``1234``) and close timeout (``70``) the tests
        expect -- presumably the integer forms of the strings given here.
        """
        self._backend = DummyRendezvousBackend()

        string_options = dict(
            store_port="1234",
            join_timeout="50",
            last_call_timeout="60",
            close_timeout="70",
            store_timeout="80",
        )
        self._params = RendezvousParameters(
            backend=self._backend.name,
            endpoint="dummy_end_point",
            run_id="dummy_run_id",
            min_nodes=3,
            max_nodes=6,
            **string_options,
        )

        self._expected_store_port, self._expected_close_timeout = 1234, 70
Beispiel #14
0
    def test_get(self):
        """``get`` without a default raises ``KeyError`` for absent or ``None`` values."""
        rdzv_kwargs = dict(
            backend="foobar",
            endpoint="localhost",
            run_id="1234",
            min_nodes=1,
            max_nodes=1,
            timeout1=None,
            timeout2=10,
        )
        params = RendezvousParameters(**rdzv_kwargs)

        # "timeout3" was never supplied.
        self.assertRaises(KeyError, params.get, "timeout3")

        # "timeout1" was supplied, but as None.
        self.assertRaises(KeyError, params.get, "timeout1")

        stored = params.get("timeout2")
        self.assertEqual(10, stored)
Beispiel #15
0
def _run_agent(
    run_id,
    etcd_host,
    etcd_port,
    min_size,
    max_size,
    func_to_run,
    args,
    local_world_size=8,
    role="test_trainer",
    output_dict=None,
    agent_barrier_timeout=300,
):
    """Run a forked ``LocalElasticAgent`` against an etcd rendezvous.

    When ``output_dict`` is given, the agent's result is stored in it as a
    ``(role, result)`` tuple under a freshly generated random key.
    """
    params = RendezvousParameters(
        backend="etcd",
        endpoint=f"{etcd_host}:{etcd_port}",
        run_id=run_id,
        min_nodes=min_size,
        max_nodes=max_size,
    )
    handler = rdzv_registry.get_rendezvous_handler(params)

    worker_spec = WorkerSpec(
        role=role,
        local_world_size=local_world_size,
        fn=func_to_run,
        args=args,
        rdzv_handler=handler,
        max_restarts=2,
        monitor_interval=1,
    )

    agent = LocalElasticAgent(
        worker_spec,
        start_method="fork",
        exit_barrier_timeout=agent_barrier_timeout,
    )
    run_result = agent.run()

    if output_dict is None:
        return

    # A random key keeps results from several agents in one dict from colliding.
    result_key = str(uuid.uuid4().int)
    output_dict[result_key] = (role, run_result)
Beispiel #16
0
def main(args=None):
    """Parse launcher arguments and run the training command under a ``LocalElasticAgent``.

    Steps, in order: parse arguments, optionally start a standalone etcd
    server, default ``OMP_NUM_THREADS``, build the worker command line, create
    the rendezvous handler, and run the agent. If the run is reported as
    failed, the failure with the lowest key in ``group_result.failures`` is
    processed and logged, and the process exits via ``sys.exit`` with the
    absolute value of that failure's exit code.

    Args:
        args: argument list handed to ``parse_args``; ``None`` lets
            ``parse_args`` pick its own default.

    Raises:
        AssertionError: if the parsed node range or ``max_restarts`` is invalid.
        ValueError: if ``--no_python`` and ``--module`` are combined.
        SystemExit: when the worker group failed.
    """
    # If ``args`` is not passed, parse_args presumably falls back to
    # ``sys.argv[1:]`` (argparse's default) -- verify against parse_args.
    args = parse_args(args)

    min_nodes, max_nodes = parse_min_max_nnodes(args.nnodes)
    assert 0 < min_nodes <= max_nodes
    assert args.max_restarts >= 0

    # Standalone mode: start a local etcd server and override whatever
    # rendezvous settings were passed on the command line with it.
    # NOTE(review): an exception raised between this start() and the ``try``
    # below would leave the server running, since stop() is only called from
    # that block's ``finally``.
    if args.standalone:
        etcd_server = EtcdServer()
        etcd_server.start()
        args.rdzv_backend = "etcd"
        args.rdzv_endpoint = etcd_server.get_endpoint()
        args.rdzv_id = str(uuid.uuid4())
        log.info(f"\n**************************************\n"
                 f"Rendezvous info:\n"
                 f"--rdzv_backend={args.rdzv_backend} "
                 f"--rdzv_endpoint={args.rdzv_endpoint} "
                 f"--rdzv_id={args.rdzv_id}\n"
                 f"**************************************\n")

    nproc_per_node = determine_local_world_size(args.nproc_per_node)
    omp_num_threads = None
    # Only default OMP_NUM_THREADS when the user has not set it and more than
    # one process will run on this node.
    if "OMP_NUM_THREADS" not in os.environ and nproc_per_node > 1:
        omp_num_threads = 1
        print(
            f"*****************************************\n"
            f"Setting OMP_NUM_THREADS environment variable for each process to be "
            f"{omp_num_threads} in default, to avoid your system being overloaded, "
            f"please further tune the variable for optimal performance in "
            f"your application as needed. \n"
            f"*****************************************")
        # This env variable will be passed down to the subprocesses
        os.environ["OMP_NUM_THREADS"] = str(omp_num_threads)

    # Build the worker command: "<python> -u [-m] script args..." or, with
    # --no_python, just "script args...".
    with_python = not args.no_python
    cmd = []
    if with_python:
        cmd = [sys.executable, "-u"]
        if args.module:
            cmd.append("-m")
    else:
        if args.module:
            raise ValueError("Don't use both the '--no_python' flag"
                             " and the '--module' flag at the same time.")

    cmd.append(args.training_script)
    cmd.extend(args.training_script_args)

    # Extra backend-specific settings come from --rdzv_conf and are passed
    # through as keyword arguments.
    rdzv_parameters = RendezvousParameters(
        backend=args.rdzv_backend,
        endpoint=args.rdzv_endpoint,
        run_id=args.rdzv_id,
        min_nodes=min_nodes,
        max_nodes=max_nodes,
        **_parse_rdzv_conf(args.rdzv_conf),
    )

    rdzv_handler = rdzv_registry.get_rendezvous_handler(rdzv_parameters)

    try:
        spec = WorkerSpec(
            role=args.role,
            local_world_size=nproc_per_node,
            cmd=cmd,
            rdzv_handler=rdzv_handler,
            max_restarts=args.max_restarts,
            monitor_interval=args.monitor_interval,
        )
        metrics.initialize_metrics()
        elastic_agent = LocalElasticAgent(spec, start_method=args.start_method)
        group_result = elastic_agent.run(spec.role)
        if group_result.is_failed():
            # Report the failure with the smallest key (presumably the lowest
            # worker rank).
            min_rank = min(group_result.failures.keys())
            failure = group_result.failures[min_rank]
            # Note: this line will raise an exception to indicate to the
            # scheduler process that something went wrong.
            # If any workers wrote the error file, it will be propagated
            # to the scheduler specific destination.
            process_failure(failure)
            msg = f"""
*********************************************************************** \n
***********************USER CODE FAILED WITH ERROR****************** \n\n
{get_failure_message(failure)} \n
******************************************************************** \n\n
******************************************************************** \n
            """
            log.warning(msg)
            # Expected (0-127), 0 - success, anything else - failure
            sys.exit(abs(failure.exit_code))
    finally:
        # Runs on success, on failure and on the SystemExit raised above.
        rdzv_handler.shutdown()
        if args.standalone:
            etcd_server.stop()
        cleanup()
Beispiel #17
0
def main(args=None):
    """Parse launcher arguments and run the training command under a ``LocalElasticAgent``.

    Steps, in order: parse arguments, optionally start a standalone etcd
    server, default ``OMP_NUM_THREADS``, build the worker command line, create
    the rendezvous handler, and run the agent while recording agent status
    events.

    Args:
        args: argument list handed to ``parse_args``; ``None`` lets
            ``parse_args`` pick its own default.

    Raises:
        AssertionError: if the parsed node range or ``max_restarts`` is invalid.
        ValueError: if ``--no_python`` and ``--module`` are combined.
        ChildFailedError: when the run result is reported as failed.
    """
    # If ``args`` is not passed, parse_args presumably falls back to
    # ``sys.argv[1:]`` (argparse's default) -- verify against parse_args.
    args = parse_args(args)
    min_nodes, max_nodes = parse_min_max_nnodes(args.nnodes)
    assert 0 < min_nodes <= max_nodes
    assert args.max_restarts >= 0

    # Stays None until the agent is constructed, so the ``except Exception``
    # handler below can tell which kind of failure event to record.
    elastic_agent = None

    # Standalone mode: start a local etcd server and override whatever
    # rendezvous settings were passed on the command line with it.
    # NOTE(review): an exception raised between this start() and the ``try``
    # below would leave the server running, since stop() is only called from
    # that block's ``finally``.
    if args.standalone:
        etcd_server = EtcdServer()
        etcd_server.start()
        args.rdzv_backend = "etcd"
        args.rdzv_endpoint = etcd_server.get_endpoint()
        args.rdzv_id = str(uuid.uuid4())
        log.info(f"\n**************************************\n"
                 f"Rendezvous info:\n"
                 f"--rdzv_backend={args.rdzv_backend} "
                 f"--rdzv_endpoint={args.rdzv_endpoint} "
                 f"--rdzv_id={args.rdzv_id}\n"
                 f"**************************************\n")

    nproc_per_node = determine_local_world_size(args.nproc_per_node)
    # Only default OMP_NUM_THREADS when the user has not set it and more than
    # one process will run on this node.
    if "OMP_NUM_THREADS" not in os.environ and nproc_per_node > 1:
        omp_num_threads = 1
        print(
            f"*****************************************\n"
            f"Setting OMP_NUM_THREADS environment variable for each process to be "
            f"{omp_num_threads} in default, to avoid your system being overloaded, "
            f"please further tune the variable for optimal performance in "
            f"your application as needed. \n"
            f"*****************************************")
        # This env variable will be passed down to the subprocesses
        os.environ["OMP_NUM_THREADS"] = str(omp_num_threads)

    # Build the worker command: "<python> -u [-m] script args..." or, with
    # --no_python, just "script args...".
    with_python = not args.no_python
    cmd = []
    if with_python:
        cmd = [sys.executable, "-u"]
        if args.module:
            cmd.append("-m")
    else:
        if args.module:
            raise ValueError("Don't use both the '--no_python' flag"
                             " and the '--module' flag at the same time.")

    cmd.append(args.training_script)
    cmd.extend(args.training_script_args)

    # Extra backend-specific settings come from --rdzv_conf and are passed
    # through as keyword arguments.
    rdzv_parameters = RendezvousParameters(
        backend=args.rdzv_backend,
        endpoint=args.rdzv_endpoint,
        run_id=args.rdzv_id,
        min_nodes=min_nodes,
        max_nodes=max_nodes,
        **_parse_rendezvous_config(args.rdzv_conf),
    )

    rdzv_handler = rdzv_registry.get_rendezvous_handler(rdzv_parameters)
    try:
        # The first element of the command is the entrypoint; the rest are
        # passed as its arguments.
        spec = WorkerSpec(
            role=args.role,
            local_world_size=nproc_per_node,
            entrypoint=cmd[0],
            args=(*cmd[1:], ),
            rdzv_handler=rdzv_handler,
            max_restarts=args.max_restarts,
            monitor_interval=args.monitor_interval,
            redirects=Std.from_str(args.redirects),
            tee=Std.from_str(args.tee),
        )
        metrics.initialize_metrics()
        elastic_agent = LocalElasticAgent(spec=spec,
                                          start_method=args.start_method,
                                          log_dir=args.log_dir)
        run_result = elastic_agent.run(spec.role)
        # NOTE(review): the SUCCEEDED agent event is recorded before the result
        # is inspected, so it is emitted even when the run is reported as
        # failed -- presumably it tracks the agent's run() returning rather
        # than worker success; confirm.
        events.record(
            elastic_agent.get_agent_status_event(WorkerState.SUCCEEDED))
        if run_result.is_failed():
            # ChildFailedError is treated specially by @record
            # if the error files for the failed children exist
            # @record will copy the first error (root cause)
            # to the error file of the launcher process
            raise ChildFailedError(
                name=args.training_script,
                failures=run_result.failures,
            )
    except ChildFailedError:
        # Worker failures propagate unchanged; no FAILED agent event is
        # recorded for them.
        raise
    except Exception:
        # Any other error: record a FAILED event from the agent if it was
        # created, otherwise an event built from the arguments alone.
        if elastic_agent:
            events.record(
                elastic_agent.get_agent_status_event(WorkerState.FAILED))
        else:
            events.record(_construct_event(args))
        raise
    finally:
        rdzv_handler.shutdown()
        if args.standalone:
            etcd_server.stop()
Beispiel #18
0
def create_rdzv_handler(params: RendezvousParameters) -> RendezvousHandler:
    """
    Build an etcd-backed rendezvous handler from ``params``.

    Usage:

    ::

    rdzv_params = RendezvousParameters(
                        backend="etcd",
                        endpoint="192.168.0.42:2379",
                        run_id="123",
                        min_nodes=4,
                        max_nodes=8,
                        timeout=300,
                        last_call_timeout=30,
                        etcd_prefix="custom_prefix",
                        protocol="https",
                        cacert="/etc/kubernetes/certs/ca.crt",
                        cert="/etc/kubernetes/certs/client.crt",
                        key="/etc/kubernetes/certs/client.key")
    # -- or --
    rdzv_params = RendezvousParameters(
                        backend="etcd",
                        endpoint="192.168.0.42:2379",
                        run_id="123",
                        min_nodes=4,
                        max_nodes=8)

    etcd_rdzv_handler = create_rdzv_handler(rdzv_params)


    Where:
        run_id - unique id for this training job instance.
        min_nodes - min number of workers expected to join the rendezvous.
        max_nodes - max number of workers allowed to join the rendezvous
                    (documented as defaulting to the min value if not
                    specified).
        timeout - total timeout within which next_rendezvous is expected to
                  succeed; a RendezvousTimeoutException is raised otherwise.
                  Read from ``params.timeout`` (documented default: 600
                  seconds, i.e. 10 minutes).
        last_call_timeout - additional wait amount ("last call") after the
                            min number of workers has been reached. Read
                            from ``params.last_call_timeout`` (documented
                            default: 30 seconds).
        etcd_prefix - path prefix (from etcd root) inside which all etcd
                      nodes will be created. Default is "/torchelastic/p2p".
        protocol - http (default) or https to access etcd.
        cacert - CA cert to access etcd, only makes sense with https.
        cert - client cert to access etcd, only makes sense with https.
        key - client key to access etcd, only makes sense with https.
    """
    rendezvous = EtcdRendezvous(
        client=_create_etcd_client(params),
        prefix=params.get("etcd_prefix", "/torchelastic/p2p"),
        run_id=params.run_id,
        num_min_workers=params.min_nodes,
        num_max_workers=params.max_nodes,
        timeout=params.timeout,
        last_call_timeout=params.last_call_timeout,
    )
    return EtcdRendezvousHandler(rdzv_impl=rendezvous)
Beispiel #19
0
def create_rdzv_handler(rdzv_params: RendezvousParameters):
    """
    Build an etcd-backed rendezvous handler from ``rdzv_params``.

    Usage:

    ::

    rdzv_params = RendezvousParameters(
                        backend="etcd",
                        endpoint="192.168.0.42:2379",
                        run_id="123",
                        min_nodes=4,
                        max_nodes=8,
                        timeout=300,
                        last_call_timeout=30,
                        etcd_prefix="custom_prefix",
                        protocol="https",
                        cacert="/etc/kubernetes/certs/ca.crt",
                        cert="/etc/kubernetes/certs/client.crt",
                        key="/etc/kubernetes/certs/client.key")
    # -- or --
    rdzv_params = RendezvousParameters(
                        backend="etcd",
                        endpoint="192.168.0.42:2379",
                        run_id="123",
                        min_nodes=4,
                        max_nodes=8)

    etcd_rdzv_handler = create_rdzv_handler(rdzv_params)


    Where:
        endpoint - "host" or "host:port" of a single etcd server; port 2379
                   is used when none is given.
        run_id - unique id for this training job instance.
        min_nodes - min number of workers expected to join the rendezvous;
                    must be at least 1.
        max_nodes - max number of workers allowed to join the rendezvous;
                    must not be less than min_nodes.
        timeout - total timeout within which next_rendezvous is expected to
                  succeed; a RendezvousTimeoutException is raised otherwise.
                  Defaults to CONST_DEFAULT_OVERALL_TIMEOUT (documented as
                  600 seconds, i.e. 10 minutes).
        last_call_timeout - additional wait amount ("last call") after the
                            min number of workers has been reached. Defaults
                            to CONST_DEFAULT_LAST_CALL_TIMEOUT (documented
                            as 30 seconds).
        etcd_prefix - path prefix (from etcd root) inside which all etcd
                      nodes will be created. Default is "/torchelastic/p2p".
        protocol - http (default) or https to access etcd.
        cacert - CA cert to access etcd, only makes sense with https.
        cert - client cert to access etcd, only makes sense with https.
        key - client key to access etcd, only makes sense with https.
    """
    import re

    # The current endpoint format carries exactly one etcd host, optionally
    # followed by ":<port>".
    raw_endpoint = rdzv_params.endpoint
    host_and_port = re.match(r"(.+):(\d+)$", raw_endpoint)
    if host_and_port is None:
        # No explicit port: fall back to etcd's default.
        host, port = raw_endpoint, 2379
    else:
        host, port = host_and_port.group(1), int(host_and_port.group(2))
    etcd_endpoints = ((host, port), )

    # The run id uniquely identifies this training job instance; typically a
    # job id or name assigned by the scheduler or the user.
    job_id = rdzv_params.run_id

    prefix = rdzv_params.get("etcd_prefix", "/torchelastic/p2p")
    num_min = rdzv_params.min_nodes
    num_max = rdzv_params.max_nodes

    assert num_min >= 1, "Min number of workers should be at least 1"
    assert num_max >= num_min, "Max number of workers cannot be less than min number of workers"

    overall_timeout = rdzv_params.get("timeout", CONST_DEFAULT_OVERALL_TIMEOUT)
    final_call_timeout = rdzv_params.get(
        "last_call_timeout", CONST_DEFAULT_LAST_CALL_TIMEOUT
    )

    # Remaining settings are handed to the rendezvous as extra keyword
    # arguments, whatever _parse_etcd_client_params extracts from the configs.
    client_kwargs = _parse_etcd_client_params(rdzv_params.configs)

    rendezvous = EtcdRendezvous(
        endpoints=etcd_endpoints,
        prefix=prefix,
        run_id=job_id,
        num_min_workers=num_min,
        num_max_workers=num_max,
        timeout=overall_timeout,
        last_call_timeout=final_call_timeout,
        **client_kwargs,
    )
    return EtcdRendezvousHandler(rdzv_impl=rendezvous)
Beispiel #20
0
def main(args=None):
    """Parse launcher arguments and run the training command under a ``LocalElasticAgent``.

    Steps, in order: parse arguments, optionally start a standalone etcd
    server, choose an ``OMP_NUM_THREADS`` value, build the worker command
    line, create the rendezvous handler, and run the agent with
    ``wrapper_fn`` as the worker function.

    The rendezvous handler and, in standalone mode, the etcd server are shut
    down in a ``finally`` clause, so they are released even when the agent
    raises.

    Args:
        args: argument list handed to ``parse_args``; ``None`` lets
            ``parse_args`` pick its own default.

    Raises:
        AssertionError: if the parsed node range or ``max_restarts`` is invalid.
        ValueError: if ``--no_python`` and ``--module`` are combined.
    """
    # If ``args`` is not passed, parse_args presumably falls back to
    # ``sys.argv[1:]`` (argparse's default) -- verify against parse_args.
    args = parse_args(args)

    min_nodes, max_nodes = parse_min_max_nnodes(args.nnodes)
    assert 0 < min_nodes <= max_nodes
    assert args.max_restarts >= 0

    # Standalone mode: start a local etcd server and override whatever
    # rendezvous settings were passed on the command line with it.
    if args.standalone:
        etcd_server = EtcdServer()
        etcd_server.start()
        args.rdzv_backend = "etcd"
        args.rdzv_endpoint = etcd_server.get_endpoint()
        args.rdzv_id = str(uuid.uuid4())
        log.info(
            f"\n**************************************\n"
            f"Rendezvous info:\n"
            f"--rdzv_backend={args.rdzv_backend} "
            f"--rdzv_endpoint={args.rdzv_endpoint} "
            f"--rdzv_id={args.rdzv_id}\n"
            f"**************************************\n"
        )

    nproc_per_node = determine_local_world_size(args.nproc_per_node)
    # The value is not exported to os.environ here; it is handed to
    # wrapper_fn as its first argument (None means "leave it alone").
    omp_num_threads = None
    if "OMP_NUM_THREADS" not in os.environ and nproc_per_node > 1:
        omp_num_threads = 1
        print(
            f"*****************************************\n"
            f"Setting OMP_NUM_THREADS environment variable for each process to be "
            f"{omp_num_threads} in default, to avoid your system being overloaded, "
            f"please further tune the variable for optimal performance in "
            f"your application as needed. \n"
            f"*****************************************"
        )

    # Build the worker command: "<python> -u [-m] script args..." or, with
    # --no_python, just "script args...".
    with_python = not args.no_python
    cmd = []
    if with_python:
        cmd = [sys.executable, "-u"]
        if args.module:
            cmd.append("-m")
    else:
        if args.module:
            raise ValueError(
                "Don't use both the '--no_python' flag"
                " and the '--module' flag at the same time."
            )

    cmd.append(args.training_script)
    cmd.extend(args.training_script_args)

    # Extra backend-specific settings come from --rdzv_conf and are passed
    # through as keyword arguments.
    rdzv_parameters = RendezvousParameters(
        backend=args.rdzv_backend,
        endpoint=args.rdzv_endpoint,
        run_id=args.rdzv_id,
        min_nodes=min_nodes,
        max_nodes=max_nodes,
        **_parse_rdzv_conf(args.rdzv_conf),
    )

    rdzv_handler = rdzv_registry.get_rendezvous_handler(rdzv_parameters)

    try:
        spec = WorkerSpec(
            role=args.role,
            local_world_size=nproc_per_node,
            fn=wrapper_fn,
            args=(omp_num_threads, cmd),
            rdzv_handler=rdzv_handler,
            max_restarts=args.max_restarts,
            monitor_interval=args.monitor_interval,
        )
        metrics.initialize_metrics()
        elastic_agent = LocalElasticAgent(spec, start_method=args.start_method)
        elastic_agent.run(spec.role)
    finally:
        rdzv_handler.shutdown()
        # Stop the standalone etcd server here rather than after the try
        # block; otherwise an exception from the agent would skip the stop
        # and leave the server running.
        if args.standalone:
            etcd_server.stop()