Beispiel #1
0
    def test_get_or_default(self):
        # Verifies ``get(key, default)``: a config stored as ``None`` and a
        # config that was never supplied both yield the default, while a
        # config holding a real value wins over the default.
        extra_configs = {"timeout1": None, "timeout2": 10}
        rdzv_params = RendezvousParameters(
            backend="foobar",
            endpoint="localhost",
            run_id="1234",
            min_nodes=1,
            max_nodes=1,
            **extra_configs,
        )

        # Each row is (expected value, config key, default handed to ``get``).
        cases = (
            (30, "timeout1", 30),  # stored as None -> default
            (10, "timeout2", 20),  # stored value -> default ignored
            (60, "timeout3", 60),  # never supplied -> default
        )
        for expected, key, fallback in cases:
            self.assertEqual(expected, rdzv_params.get(key, fallback))
Beispiel #2
0
    def test_get(self):
        # Verifies ``get(key)`` without a default: a config that was never
        # supplied and a config stored as ``None`` must both raise
        # ``KeyError``; a config holding a real value is returned.
        extra_configs = {"timeout1": None, "timeout2": 10}
        rdzv_params = RendezvousParameters(
            backend="foobar",
            endpoint="localhost",
            run_id="1234",
            min_nodes=1,
            max_nodes=1,
            **extra_configs,
        )

        # "timeout3" was never supplied; "timeout1" was supplied as None.
        for unavailable_key in ("timeout3", "timeout1"):
            with self.assertRaises(KeyError):
                rdzv_params.get(unavailable_key)

        self.assertEqual(10, rdzv_params.get("timeout2"))
Beispiel #3
0
def create_rdzv_handler(rdzv_params: RendezvousParameters):
    """
    Build an ``EtcdRendezvousHandler`` backed by an ``EtcdRendezvous`` that is
    configured from ``rdzv_params``.

    Usage:

    ::

        rdzv_params = RendezvousParameters(
            backend="etcd",
            endpoint="192.168.0.42:2379",
            run_id="123",
            min_nodes=4,
            max_nodes=8,
            timeout=300,
            last_call_timeout=30,
            etcd_prefix="custom_prefix",
            protocol="https",
            cacert="/etc/kubernetes/certs/ca.crt",
            cert="/etc/kubernetes/certs/client.crt",
            key="/etc/kubernetes/certs/client.key",
        )
        # -- or, relying on the defaults --
        rdzv_params = RendezvousParameters(
            backend="etcd",
            endpoint="192.168.0.42:2379",
            run_id="123",
            min_nodes=4,
            max_nodes=8,
        )

        etcd_rdzv_handler = create_rdzv_handler(rdzv_params)

    Where:
        endpoint - etcd host, optionally followed by ``:<port>``. Port 2379
            is used when no port is given. Only a single host is supported.
        run_id - unique id for this training job instance (typically a job
            id or name assigned by the scheduler or the user).
        min_nodes - min number of workers expected to join the rendezvous;
            must be at least 1.
        max_nodes - max number of workers allowed to join the rendezvous;
            must not be less than ``min_nodes``.
        timeout - total timeout within which next_rendezvous is expected to
            succeed; a RendezvousTimeoutException is raised otherwise.
            Defaults to ``CONST_DEFAULT_OVERALL_TIMEOUT`` (expected to be
            600 seconds, i.e. 10 minutes - confirm against the constant).
        last_call_timeout - additional wait amount ("last call") after the
            min number of workers has been reached. Defaults to
            ``CONST_DEFAULT_LAST_CALL_TIMEOUT`` (expected to be 30 seconds -
            confirm against the constant).
        etcd_prefix - path prefix (from etcd root), inside which all etcd
            nodes will be created. Default is "/torchelastic/p2p".
        protocol - http (default) or https to access etcd.
        cacert - CA cert to access etcd, only makes sense with https.
        cert - client cert to access etcd, only makes sense with https.
        key - client key to access etcd, only makes sense with https.

    The last four options are not read here directly: ``rdzv_params.configs``
    is handed to ``_parse_etcd_client_params`` and whatever it returns is
    forwarded to ``EtcdRendezvous`` as keyword arguments.

    Raises:
        AssertionError: if ``min_nodes`` is below 1 or ``max_nodes`` is below
            ``min_nodes``.
    """
    import re

    # The current endpoint format only allows a single etcd host.
    raw_endpoint = rdzv_params.endpoint
    port_match = re.match(r"(.+):(\d+)$", raw_endpoint)
    if port_match is None:
        # No trailing ":<port>" -> use the default etcd port.
        host, port = raw_endpoint, 2379
    else:
        host, port = port_match.group(1), int(port_match.group(2))
    etcd_endpoints = ((host, port),)

    # Unique identifier of this training job instance.
    job_run_id = rdzv_params.run_id

    key_prefix = rdzv_params.get("etcd_prefix", "/torchelastic/p2p")
    fewest_workers = rdzv_params.min_nodes
    most_workers = rdzv_params.max_nodes

    assert fewest_workers >= 1, "Min number of workers should be at least 1"
    assert most_workers >= fewest_workers, (
        "Max number of workers cannot be less than min number of workers"
    )

    overall_timeout = rdzv_params.get("timeout", CONST_DEFAULT_OVERALL_TIMEOUT)
    final_call_timeout = rdzv_params.get(
        "last_call_timeout", CONST_DEFAULT_LAST_CALL_TIMEOUT
    )

    # Extra keyword arguments for the etcd client (see docstring).
    client_options = _parse_etcd_client_params(rdzv_params.configs)

    return EtcdRendezvousHandler(
        rdzv_impl=EtcdRendezvous(
            endpoints=etcd_endpoints,
            prefix=key_prefix,
            run_id=job_run_id,
            num_min_workers=fewest_workers,
            num_max_workers=most_workers,
            timeout=overall_timeout,
            last_call_timeout=final_call_timeout,
            **client_options,
        )
    )
Beispiel #4
0
def create_rdzv_handler(params: RendezvousParameters) -> RendezvousHandler:
    """
    Build an ``EtcdRendezvousHandler`` backed by an ``EtcdRendezvous`` that is
    configured from ``params``.

    Usage:

    ::

        params = RendezvousParameters(
            backend="etcd",
            endpoint="192.168.0.42:2379",
            run_id="123",
            min_nodes=4,
            max_nodes=8,
            timeout=300,
            last_call_timeout=30,
            etcd_prefix="custom_prefix",
            protocol="https",
            cacert="/etc/kubernetes/certs/ca.crt",
            cert="/etc/kubernetes/certs/client.crt",
            key="/etc/kubernetes/certs/client.key",
        )
        # -- or, with only the required settings --
        params = RendezvousParameters(
            backend="etcd",
            endpoint="192.168.0.42:2379",
            run_id="123",
            min_nodes=4,
            max_nodes=8,
        )

        etcd_rdzv_handler = create_rdzv_handler(params)

    Where:
        run_id - unique id for this training job instance.
        min_nodes - min number of workers expected to join the rendezvous.
        max_nodes - max number of workers allowed to join the rendezvous.
        timeout - total timeout within which next_rendezvous is expected to
            succeed; a RendezvousTimeoutException is raised otherwise.
            Read from ``params.timeout``; this function applies no default
            of its own.
        last_call_timeout - additional wait amount ("last call") after the
            min number of workers has been reached. Read from
            ``params.last_call_timeout``; this function applies no default
            of its own.
        etcd_prefix - path prefix (from etcd root), inside which all etcd
            nodes will be created. Default is "/torchelastic/p2p".
        protocol - http (default) or https to access etcd.
        cacert - CA cert to access etcd, only makes sense with https.
        cert - client cert to access etcd, only makes sense with https.
        key - client key to access etcd, only makes sense with https.

    NOTE(review): ``endpoint``, ``protocol``, ``cacert``, ``cert`` and ``key``
    are not read in this function; presumably ``_create_etcd_client`` consumes
    them - confirm against that helper.
    """
    # The etcd client is created first, from the full parameter object.
    etcd_client = _create_etcd_client(params)

    rendezvous = EtcdRendezvous(
        client=etcd_client,
        prefix=params.get("etcd_prefix", "/torchelastic/p2p"),
        run_id=params.run_id,
        num_min_workers=params.min_nodes,
        num_max_workers=params.max_nodes,
        timeout=params.timeout,
        last_call_timeout=params.last_call_timeout,
    )
    return EtcdRendezvousHandler(rdzv_impl=rendezvous)