Code Example #1
    def testDontScaleBelowTarget(self):
        config = SMALL_CLUSTER.copy()
        config["min_workers"] = 0
        config["max_workers"] = 2
        config["target_utilization_fraction"] = 0.5
        config_path = self.write_config(config)
        self.provider = MockProvider()
        lm = LoadMetrics()
        autoscaler = StandardAutoscaler(
            config_path, lm, max_failures=0, update_interval_s=0)
        assert len(self.provider.non_terminated_nodes({})) == 0
        autoscaler.update()
        assert autoscaler.num_launches_pending.value == 0
        assert len(self.provider.non_terminated_nodes({})) == 0

        # Scales up as nodes are reported as used
        local_ip = services.get_node_ip_address()
        lm.update(local_ip, {"CPU": 2}, {"CPU": 0})  # head
        # 1.0 nodes used => target nodes = 2 => target workers = 1
        autoscaler.update()
        self.waitForNodes(1)

        # Mark the new node as idle and never used.
        # Should hold steady, as the target is still 2.
        lm.update("172.0.0.0", {"CPU": 0}, {"CPU": 0})
        lm.last_used_time_by_ip["172.0.0.0"] = 0
        autoscaler.update()
        assert len(self.provider.non_terminated_nodes({})) == 1

        # Reduce load on head => target nodes = 1 => target workers = 0
        lm.update(local_ip, {"CPU": 2}, {"CPU": 1})
        autoscaler.update()
        assert len(self.provider.non_terminated_nodes({})) == 0
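
For clarity, here is a minimal sketch of the utilization arithmetic the inline comments above describe. The helper name and the rounding rule are assumptions for illustration, not the actual StandardAutoscaler code:

import math

def target_workers(nodes_used, target_utilization_fraction, max_workers):
    # ceil(nodes_used / fraction) gives the target node count; one of those
    # nodes is the head, so the worker target is one less.
    target_nodes = math.ceil(nodes_used / target_utilization_fraction)
    return min(max_workers, max(0, target_nodes - 1))

assert target_workers(1.0, 0.5, max_workers=2) == 1  # "target workers = 1"
assert target_workers(0.5, 0.5, max_workers=2) == 0  # after the head load drops
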
Code Example #2
File: log_monitor.py Project: robertnishihara/ray
 def __init__(self, logs_dir, redis_address, redis_password=None):
     """Initialize the log monitor object."""
     self.ip = services.get_node_ip_address()
     self.logs_dir = logs_dir
     self.redis_client = ray.services.create_redis_client(
         redis_address, password=redis_password)
     self.log_filenames = set()
     self.open_file_infos = []
     self.closed_file_infos = []
     self.can_open_more_files = True
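
A hypothetical instantiation of this class, assuming a local Redis server is reachable at the default port (the log path and address are illustrative, not taken from the source):

monitor = LogMonitor(logs_dir="/tmp/ray/logs",
                     redis_address="127.0.0.1:6379")
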
Code Example #3
File: cluster_utils.py Project: jamescasbon/ray
    def add_node(self, **override_kwargs):
        """Adds a node to the local Ray Cluster.

        All nodes are by default started with the following settings:
            cleanup=True,
            num_cpus=1,
            object_store_memory=100 * (2**20) # 100 MB

        Args:
            override_kwargs: Keyword arguments used in `start_ray_head`
                and `start_ray_node`. Overrides defaults.

        Returns:
            Node object of the added Ray node.
        """
        node_kwargs = {
            "num_cpus": 1,
            "object_store_memory": 100 * (2**20)  # 100 MB
        }
        node_kwargs.update(override_kwargs)
        ray_params = RayParams(
            node_ip_address=services.get_node_ip_address(), **node_kwargs)

        if self.head_node is None:
            ray_params.update(include_webui=False)
            address_info = services.start_ray_head(ray_params, cleanup=True)
            self.redis_address = address_info["redis_address"]
            # TODO(rliaw): Find a more stable way than modifying global state.
            process_dict_copy = services.all_processes.copy()
            for key in services.all_processes:
                services.all_processes[key] = []
            node = Node(address_info, process_dict_copy)
            self.head_node = node
        else:
            ray_params.update(redis_address=self.redis_address)
            address_info = services.start_ray_node(ray_params, cleanup=True)
            # TODO(rliaw): Find a more stable way than modifying global state.
            process_dict_copy = services.all_processes.copy()
            for key in services.all_processes:
                services.all_processes[key] = []
            node = Node(address_info, process_dict_copy)
            self.worker_nodes[node] = address_info
        logger.info("Starting Node with raylet socket {}".format(
            address_info["raylet_socket_name"]))

        return node
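
A hedged usage sketch, assuming a Cluster class that exposes the add_node method above (the class name and arguments here are illustrative):

cluster = Cluster()
head = cluster.add_node()              # first call starts the head node
worker = cluster.add_node(num_cpus=4)  # overrides the num_cpus=1 default
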
Code Example #4
    def testScaleUpBasedOnLoad(self):
        config = SMALL_CLUSTER.copy()
        config["min_workers"] = 1
        config["max_workers"] = 10
        config["target_utilization_fraction"] = 0.5
        config_path = self.write_config(config)
        self.provider = MockProvider()
        lm = LoadMetrics()
        autoscaler = StandardAutoscaler(
            config_path, lm, max_failures=0, update_interval_s=0)
        assert len(self.provider.non_terminated_nodes({})) == 0
        autoscaler.update()
        self.waitForNodes(1)
        autoscaler.update()
        assert autoscaler.num_launches_pending.value == 0
        assert len(self.provider.non_terminated_nodes({})) == 1

        # Scales up as nodes are reported as used
        local_ip = services.get_node_ip_address()
        lm.update(local_ip, {"CPU": 2}, {"CPU": 0})  # head
        lm.update("172.0.0.0", {"CPU": 2}, {"CPU": 0})  # worker 1
        autoscaler.update()
        self.waitForNodes(3)
        lm.update("172.0.0.1", {"CPU": 2}, {"CPU": 0})
        autoscaler.update()
        self.waitForNodes(5)

        # Holds steady when load is removed
        lm.update("172.0.0.0", {"CPU": 2}, {"CPU": 2})
        lm.update("172.0.0.1", {"CPU": 2}, {"CPU": 2})
        autoscaler.update()
        assert autoscaler.num_launches_pending.value == 0
        assert len(self.provider.non_terminated_nodes({})) == 5

        # Scales down as nodes become unused
        lm.last_used_time_by_ip["172.0.0.0"] = 0
        lm.last_used_time_by_ip["172.0.0.1"] = 0
        autoscaler.update()
        assert autoscaler.num_launches_pending.value == 0
        assert len(self.provider.non_terminated_nodes({})) == 3
        lm.last_used_time_by_ip["172.0.0.2"] = 0
        lm.last_used_time_by_ip["172.0.0.3"] = 0
        autoscaler.update()
        assert autoscaler.num_launches_pending.value == 0
        assert len(self.provider.non_terminated_nodes({})) == 1
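
The same arithmetic as in the sketch under Code Example #1 reproduces this test's expectations: with three fully used nodes, ceil(3.0 / 0.5) = 6 target nodes, i.e. 5 workers, matching the waitForNodes(5) call above.
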
Code Example #5
File: worker.py Project: compsciencelab/pytorchrl
 def get_node_ip(self):
     """Returns the IP address of the current node."""
     return get_node_ip_address()
Code Example #6
File: autoscaler.py Project: wanghuimu/ray
 def __init__(self):
     self.last_used_time_by_ip = {}
     self.last_heartbeat_time_by_ip = {}
     self.static_resources_by_ip = {}
     self.dynamic_resources_by_ip = {}
     self.local_ip = services.get_node_ip_address()
Code Example #7
File: xgboost_ray.py Project: heuermh/modin
def _train(
        dtrain,
        nthread,
        evenly_data_distribution,
        params: Dict,
        *args,
        evals=(),
        **kwargs,
):
    s = time.time()

    X, y = dtrain
    assert len(X) == len(y)

    X_row_parts = unwrap_partitions(X,
                                    axis=0,
                                    bind_ip=not evenly_data_distribution)
    y_row_parts = unwrap_partitions(y,
                                    axis=0,
                                    bind_ip=not evenly_data_distribution)
    assert len(X_row_parts) == len(y_row_parts), "Unaligned train data"

    # Create remote actors
    actors = create_actors(nthread=nthread)

    add_as_eval_method = None
    if evals:
        for (eval_data, method) in evals[:]:
            if eval_data is dtrain:
                add_as_eval_method = method
                evals.remove((eval_data, method))

        for ((eval_X, eval_y), eval_method) in evals:
            # Split data across workers
            _split_data_across_actors(
                actors,
                lambda actor, *X_y: actor.add_eval_data.remote(
                    *X_y, eval_method=eval_method),
                unwrap_partitions(eval_X,
                                  axis=0,
                                  bind_ip=not evenly_data_distribution),
                unwrap_partitions(eval_y,
                                  axis=0,
                                  bind_ip=not evenly_data_distribution),
                evenly_data_distribution=evenly_data_distribution,
            )

    # Split data across workers
    _split_data_across_actors(
        actors,
        lambda actor, *X_y: actor.set_train_data.remote(
            *X_y, add_as_eval_method=add_as_eval_method),
        X_row_parts,
        y_row_parts,
        evenly_data_distribution=evenly_data_distribution,
    )
    LOGGER.info(f"Data preparation time: {time.time() - s} s")

    s = time.time()
    with RabitContextManager(len(actors), get_node_ip_address()) as env:
        rabit_args = [("%s=%s" % item).encode() for item in env.items()]

        # Train
        fut = [
            actor.train.remote(rabit_args, params, *args, **kwargs)
            for _, actor in actors.items()
        ]

        # All results should be the same because of Rabit tracking, so we
        # just return the first one.
        result = ray.get(fut[0])
        LOGGER.info(f"Training time: {time.time() - s} s")
        return result
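
To make the rabit_args encoding above concrete, a small self-contained sketch (the DMLC tracker variable names are the usual Rabit ones, shown here only for illustration):

env = {"DMLC_TRACKER_URI": "10.0.0.1", "DMLC_TRACKER_PORT": 9091}
rabit_args = [("%s=%s" % item).encode() for item in env.items()]
assert rabit_args == [b"DMLC_TRACKER_URI=10.0.0.1", b"DMLC_TRACKER_PORT=9091"]
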
Code Example #8
def start(node_ip_address, redis_address, redis_port, num_redis_shards,
          redis_max_clients, redis_shard_ports, object_manager_port,
          object_store_memory, num_workers, num_cpus, num_gpus, resources,
          head, no_ui, block, plasma_directory, huge_pages, autoscaling_config,
          use_raylet, no_redirect_worker_output, no_redirect_output,
          logging_level, logging_format):
    # Convert hostnames to numerical IP address.
    if node_ip_address is not None:
        node_ip_address = services.address_to_ip(node_ip_address)
    if redis_address is not None:
        redis_address = services.address_to_ip(redis_address)

    level = logging.getLevelName(logging_level.upper())
    logging.basicConfig(level=level, format=logging_format)
    logger = logging.getLogger(__name__)

    if use_raylet is None and os.environ.get("RAY_USE_XRAY") == "1":
        # This environment variable is used in our testing setup.
        logger.info("Detected environment variable 'RAY_USE_XRAY'.")
        use_raylet = True

    try:
        resources = json.loads(resources)
    except Exception as e:
        raise Exception("Unable to parse the --resources argument using "
                        "json.loads. Try using a format like\n\n"
                        "    --resources='{\"CustomResource1\": 3, "
                        "\"CustomReseource2\": 2}'")

    assert "CPU" not in resources, "Use the --num-cpus argument."
    assert "GPU" not in resources, "Use the --num-gpus argument."
    if num_cpus is not None:
        resources["CPU"] = num_cpus
    if num_gpus is not None:
        resources["GPU"] = num_gpus

    if head:
        # Start Ray on the head node.
        if redis_shard_ports is not None:
            redis_shard_ports = redis_shard_ports.split(",")
            # Infer the number of Redis shards from the ports if the number is
            # not provided.
            if num_redis_shards is None:
                num_redis_shards = len(redis_shard_ports)
            # Check that the arguments match.
            if len(redis_shard_ports) != num_redis_shards:
                raise Exception("If --redis-shard-ports is provided, it must "
                                "have the form '6380,6381,6382', and the "
                                "number of ports provided must equal "
                                "--num-redis-shards (which is 1 if not "
                                "provided)")

        if redis_address is not None:
            raise Exception("If --head is passed in, a Redis server will be "
                            "started, so a Redis address should not be "
                            "provided.")

        # Get the node IP address if one is not provided.
        if node_ip_address is None:
            node_ip_address = services.get_node_ip_address()
        logger.info(
            "Using IP address {} for this node.".format(node_ip_address))

        address_info = {}
        # Use the provided object manager port if there is one.
        if object_manager_port is not None:
            address_info["object_manager_ports"] = [object_manager_port]
        if address_info == {}:
            address_info = None

        address_info = services.start_ray_head(
            address_info=address_info,
            node_ip_address=node_ip_address,
            redis_port=redis_port,
            redis_shard_ports=redis_shard_ports,
            object_store_memory=object_store_memory,
            num_workers=num_workers,
            cleanup=False,
            redirect_worker_output=not no_redirect_worker_output,
            redirect_output=not no_redirect_output,
            resources=resources,
            num_redis_shards=num_redis_shards,
            redis_max_clients=redis_max_clients,
            redis_protected_mode=False,
            include_webui=(not no_ui),
            plasma_directory=plasma_directory,
            huge_pages=huge_pages,
            autoscaling_config=autoscaling_config,
            use_raylet=use_raylet)
        logger.info(address_info)
        logger.info(
            "\nStarted Ray on this node. You can add additional nodes to "
            "the cluster by calling\n\n"
            "    ray start --redis-address {}\n\n"
            "from the node you wish to add. You can connect a driver to the "
            "cluster from Python by running\n\n"
            "    import ray\n"
            "    ray.init(redis_address=\"{}\")\n\n"
            "If you have trouble connecting from a different machine, check "
            "that your firewall is configured properly. If you wish to "
            "terminate the processes that have been started, run\n\n"
            "    ray stop".format(address_info["redis_address"],
                                  address_info["redis_address"]))
    else:
        # Start Ray on a non-head node.
        if redis_port is not None:
            raise Exception("If --head is not passed in, --redis-port is not "
                            "allowed")
        if redis_shard_ports is not None:
            raise Exception("If --head is not passed in, --redis-shard-ports "
                            "is not allowed")
        if redis_address is None:
            raise Exception("If --head is not passed in, --redis-address must "
                            "be provided.")
        if num_redis_shards is not None:
            raise Exception("If --head is not passed in, --num-redis-shards "
                            "must not be provided.")
        if redis_max_clients is not None:
            raise Exception("If --head is not passed in, --redis-max-clients "
                            "must not be provided.")
        if no_ui:
            raise Exception("If --head is not passed in, the --no-ui flag is "
                            "not relevant.")
        redis_ip_address, redis_port = redis_address.split(":")

        # Wait for the Redis server to start, and throw an exception if we
        # can't connect to it.
        services.wait_for_redis_to_start(redis_ip_address, int(redis_port))

        # Create a Redis client.
        redis_client = services.create_redis_client(redis_address)

        # Check that the version information on this node matches the version
        # information that the cluster was started with.
        services.check_version_info(redis_client)

        # Get the node IP address if one is not provided.
        if node_ip_address is None:
            node_ip_address = services.get_node_ip_address(redis_address)
        logger.info(
            "Using IP address {} for this node.".format(node_ip_address))
        # Check that there aren't already Redis clients with the same IP
        # address connected with this Redis instance. This raises an exception
        # if the Redis server already has clients on this node.
        check_no_existing_redis_clients(node_ip_address, redis_client)
        address_info = services.start_ray_node(
            node_ip_address=node_ip_address,
            redis_address=redis_address,
            object_manager_ports=[object_manager_port],
            num_workers=num_workers,
            object_store_memory=object_store_memory,
            cleanup=False,
            redirect_worker_output=not no_redirect_worker_output,
            redirect_output=not no_redirect_output,
            resources=resources,
            plasma_directory=plasma_directory,
            huge_pages=huge_pages,
            use_raylet=use_raylet)
        logger.info(address_info)
        logger.info("\nStarted Ray on this node. If you wish to terminate the "
                    "processes that have been started, run\n\n"
                    "    ray stop")

    if block:
        import time
        while True:
            time.sleep(30)
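
For context, the two invocations this command backs look roughly like this (flag names as used in this version of Ray; the address is illustrative):

    ray start --head --redis-port=6379          # on the head node
    ray start --redis-address=10.0.0.1:6379     # on each worker node
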
Code Example #9
File: scripts.py Project: zzz622848/ray
def start(node_ip_address, redis_address, redis_port, num_redis_shards,
          redis_max_clients, redis_password, redis_shard_ports,
          object_manager_port, node_manager_port, object_store_memory,
          redis_max_memory, num_cpus, num_gpus, resources, head, include_webui,
          block, plasma_directory, huge_pages, autoscaling_config,
          no_redirect_worker_output, no_redirect_output,
          plasma_store_socket_name, raylet_socket_name, temp_dir, include_java,
          java_worker_options, load_code_from_local, internal_config):
    # Convert hostnames to numerical IP address.
    if node_ip_address is not None:
        node_ip_address = services.address_to_ip(node_ip_address)
    if redis_address is not None:
        redis_address = services.address_to_ip(redis_address)

    try:
        resources = json.loads(resources)
    except Exception:
        raise Exception("Unable to parse the --resources argument using "
                        "json.loads. Try using a format like\n\n"
                        "    --resources='{\"CustomResource1\": 3, "
                        "\"CustomReseource2\": 2}'")

    redirect_worker_output = None if not no_redirect_worker_output else True
    redirect_output = None if not no_redirect_output else True
    ray_params = ray.parameter.RayParams(
        node_ip_address=node_ip_address,
        object_manager_port=object_manager_port,
        node_manager_port=node_manager_port,
        object_store_memory=object_store_memory,
        redis_password=redis_password,
        redirect_worker_output=redirect_worker_output,
        redirect_output=redirect_output,
        num_cpus=num_cpus,
        num_gpus=num_gpus,
        resources=resources,
        plasma_directory=plasma_directory,
        huge_pages=huge_pages,
        plasma_store_socket_name=plasma_store_socket_name,
        raylet_socket_name=raylet_socket_name,
        temp_dir=temp_dir,
        include_java=include_java,
        include_webui=include_webui,
        java_worker_options=java_worker_options,
        load_code_from_local=load_code_from_local,
        _internal_config=internal_config)

    if head:
        # Start Ray on the head node.
        if redis_shard_ports is not None:
            redis_shard_ports = redis_shard_ports.split(",")
            # Infer the number of Redis shards from the ports if the number is
            # not provided.
            if num_redis_shards is None:
                num_redis_shards = len(redis_shard_ports)
            # Check that the arguments match.
            if len(redis_shard_ports) != num_redis_shards:
                raise Exception("If --redis-shard-ports is provided, it must "
                                "have the form '6380,6381,6382', and the "
                                "number of ports provided must equal "
                                "--num-redis-shards (which is 1 if not "
                                "provided)")

        if redis_address is not None:
            raise Exception("If --head is passed in, a Redis server will be "
                            "started, so a Redis address should not be "
                            "provided.")

        # Get the node IP address if one is not provided.
        ray_params.update_if_absent(
            node_ip_address=services.get_node_ip_address())
        logger.info("Using IP address {} for this node.".format(
            ray_params.node_ip_address))
        ray_params.update_if_absent(
            redis_port=redis_port,
            redis_shard_ports=redis_shard_ports,
            redis_max_memory=redis_max_memory,
            num_redis_shards=num_redis_shards,
            redis_max_clients=redis_max_clients,
            autoscaling_config=autoscaling_config,
            include_java=False,
        )

        node = ray.node.Node(ray_params, head=True, shutdown_at_exit=False)
        redis_address = node.redis_address

        logger.info(
            "\nStarted Ray on this node. You can add additional nodes to "
            "the cluster by calling\n\n"
            "    ray start --redis-address {}{}{}\n\n"
            "from the node you wish to add. You can connect a driver to the "
            "cluster from Python by running\n\n"
            "    import ray\n"
            "    ray.init(redis_address=\"{}{}{}\")\n\n"
            "If you have trouble connecting from a different machine, check "
            "that your firewall is configured properly. If you wish to "
            "terminate the processes that have been started, run\n\n"
            "    ray stop".format(
                redis_address, " --redis-password " if redis_password else "",
                redis_password if redis_password else "", redis_address,
                "\", redis_password=\"" if redis_password else "",
                redis_password if redis_password else ""))
    else:
        # Start Ray on a non-head node.
        if redis_port is not None:
            raise Exception("If --head is not passed in, --redis-port is not "
                            "allowed")
        if redis_shard_ports is not None:
            raise Exception("If --head is not passed in, --redis-shard-ports "
                            "is not allowed")
        if redis_address is None:
            raise Exception("If --head is not passed in, --redis-address must "
                            "be provided.")
        if num_redis_shards is not None:
            raise Exception("If --head is not passed in, --num-redis-shards "
                            "must not be provided.")
        if redis_max_clients is not None:
            raise Exception("If --head is not passed in, --redis-max-clients "
                            "must not be provided.")
        if include_webui:
            raise Exception("If --head is not passed in, the --include-webui "
                            "flag is not relevant.")
        if include_java is not None:
            raise ValueError("--include-java should only be set for the head "
                             "node.")

        redis_ip_address, redis_port = redis_address.split(":")

        # Wait for the Redis server to start, and throw an exception if we
        # can't connect to it.
        services.wait_for_redis_to_start(redis_ip_address,
                                         int(redis_port),
                                         password=redis_password)

        # Create a Redis client.
        redis_client = services.create_redis_client(redis_address,
                                                    password=redis_password)

        # Check that the version information on this node matches the version
        # information that the cluster was started with.
        services.check_version_info(redis_client)

        # Get the node IP address if one is not provided.
        ray_params.update_if_absent(
            node_ip_address=services.get_node_ip_address(redis_address))
        logger.info("Using IP address {} for this node.".format(
            ray_params.node_ip_address))
        # Check that there aren't already Redis clients with the same IP
        # address connected with this Redis instance. This raises an exception
        # if the Redis server already has clients on this node.
        check_no_existing_redis_clients(ray_params.node_ip_address,
                                        redis_client)
        ray_params.update(redis_address=redis_address)
        node = ray.node.Node(ray_params, head=False, shutdown_at_exit=False)
        logger.info("\nStarted Ray on this node. If you wish to terminate the "
                    "processes that have been started, run\n\n"
                    "    ray stop")

    if block:
        import time
        while True:
            time.sleep(30)
Code Example #10
File: util.py Project: peppy0510/ray
def with_head_node_ip(cmds):
    head_ip = services.get_node_ip_address()
    out = []
    for cmd in cmds:
        out.append("export RAY_HEAD_IP={}; {}".format(head_ip, cmd))
    return out
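
An illustrative input/output pair (10.0.0.1 stands in for whatever services.get_node_ip_address() returns on the head node):

cmds = with_head_node_ip(["ray stop", "uptime"])
# -> ["export RAY_HEAD_IP=10.0.0.1; ray stop",
#     "export RAY_HEAD_IP=10.0.0.1; uptime"]
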
Code Example #11
File: scripts.py Project: adgirish/ray
def start(node_ip_address, redis_address, redis_port, num_redis_shards,
          redis_max_clients, redis_shard_ports, object_manager_port,
          object_store_memory, num_workers, num_cpus, num_gpus, resources,
          head, no_ui, block, plasma_directory, huge_pages,
          autoscaling_config):
    # Convert hostnames to numerical IP address.
    if node_ip_address is not None:
        node_ip_address = services.address_to_ip(node_ip_address)
    if redis_address is not None:
        redis_address = services.address_to_ip(redis_address)

    try:
        resources = json.loads(resources)
    except Exception as e:
        raise Exception("Unable to parse the --resources argument using "
                        "json.loads. Try using a format like\n\n"
                        "    --resources='{\"CustomResource1\": 3, "
                        "\"CustomReseource2\": 2}'")

    assert "CPU" not in resources, "Use the --num-cpus argument."
    assert "GPU" not in resources, "Use the --num-gpus argument."
    if num_cpus is not None:
        resources["CPU"] = num_cpus
    if num_gpus is not None:
        resources["GPU"] = num_gpus

    if head:
        # Start Ray on the head node.
        if redis_shard_ports is not None:
            redis_shard_ports = redis_shard_ports.split(",")
            # Infer the number of Redis shards from the ports if the number is
            # not provided.
            if num_redis_shards is None:
                num_redis_shards = len(redis_shard_ports)
            # Check that the arguments match.
            if len(redis_shard_ports) != num_redis_shards:
                raise Exception("If --redis-shard-ports is provided, it must "
                                "have the form '6380,6381,6382', and the "
                                "number of ports provided must equal "
                                "--num-redis-shards (which is 1 if not "
                                "provided)")

        if redis_address is not None:
            raise Exception("If --head is passed in, a Redis server will be "
                            "started, so a Redis address should not be "
                            "provided.")

        # Get the node IP address if one is not provided.
        if node_ip_address is None:
            node_ip_address = services.get_node_ip_address()
        print("Using IP address {} for this node.".format(node_ip_address))

        address_info = {}
        # Use the provided object manager port if there is one.
        if object_manager_port is not None:
            address_info["object_manager_ports"] = [object_manager_port]
        if address_info == {}:
            address_info = None

        address_info = services.start_ray_head(
            address_info=address_info,
            node_ip_address=node_ip_address,
            redis_port=redis_port,
            redis_shard_ports=redis_shard_ports,
            object_store_memory=object_store_memory,
            num_workers=num_workers,
            cleanup=False,
            redirect_output=True,
            resources=resources,
            num_redis_shards=num_redis_shards,
            redis_max_clients=redis_max_clients,
            include_webui=(not no_ui),
            plasma_directory=plasma_directory,
            huge_pages=huge_pages,
            autoscaling_config=autoscaling_config)
        print(address_info)
        print("\nStarted Ray on this node. You can add additional nodes to "
              "the cluster by calling\n\n"
              "    ray start --redis-address {}\n\n"
              "from the node you wish to add. You can connect a driver to the "
              "cluster from Python by running\n\n"
              "    import ray\n"
              "    ray.init(redis_address=\"{}\")\n\n"
              "If you have trouble connecting from a different machine, check "
              "that your firewall is configured properly. If you wish to "
              "terminate the processes that have been started, run\n\n"
              "    ray stop".format(address_info["redis_address"],
                                    address_info["redis_address"]))
    else:
        # Start Ray on a non-head node.
        if redis_port is not None:
            raise Exception("If --head is not passed in, --redis-port is not "
                            "allowed")
        if redis_shard_ports is not None:
            raise Exception("If --head is not passed in, --redis-shard-ports "
                            "is not allowed")
        if redis_address is None:
            raise Exception("If --head is not passed in, --redis-address must "
                            "be provided.")
        if num_redis_shards is not None:
            raise Exception("If --head is not passed in, --num-redis-shards "
                            "must not be provided.")
        if redis_max_clients is not None:
            raise Exception("If --head is not passed in, --redis-max-clients "
                            "must not be provided.")
        if no_ui:
            raise Exception("If --head is not passed in, the --no-ui flag is "
                            "not relevant.")
        redis_ip_address, redis_port = redis_address.split(":")

        # Wait for the Redis server to start, and throw an exception if we
        # can't connect to it.
        services.wait_for_redis_to_start(redis_ip_address, int(redis_port))

        # Create a Redis client.
        redis_client = services.create_redis_client(redis_address)

        # Check that the version information on this node matches the version
        # information that the cluster was started with.
        services.check_version_info(redis_client)

        # Get the node IP address if one is not provided.
        if node_ip_address is None:
            node_ip_address = services.get_node_ip_address(redis_address)
        print("Using IP address {} for this node.".format(node_ip_address))
        # Check that there aren't already Redis clients with the same IP
        # address connected with this Redis instance. This raises an exception
        # if the Redis server already has clients on this node.
        check_no_existing_redis_clients(node_ip_address, redis_client)
        address_info = services.start_ray_node(
            node_ip_address=node_ip_address,
            redis_address=redis_address,
            object_manager_ports=[object_manager_port],
            num_workers=num_workers,
            object_store_memory=object_store_memory,
            cleanup=False,
            redirect_output=True,
            resources=resources,
            plasma_directory=plasma_directory,
            huge_pages=huge_pages)
        print(address_info)
        print("\nStarted Ray on this node. If you wish to terminate the "
              "processes that have been started, run\n\n"
              "    ray stop")

    if block:
        import time
        while True:
            time.sleep(30)
Code Example #12
    def __init__(self,

                 # --- general
                 name,
                 log_verbose,
                 log_export_freq,
                 checkpoint_freq,
                 export_hands_freq,
                 eval_agent_export_freq,

                 # --- env
                 game_cls,
                 env_bldr_cls,
                 start_chips,

                 # --- Evaluation
                 eval_modes_of_algo,
                 eval_stack_sizes,

                 module_args,

                 # --- Computing
                 path_data=None,
                 local_crayon_server_docker_address="localhost",
                 device_inference="cpu",
                 DISTRIBUTED=False,
                 CLUSTER=False,
                 DEBUGGING=False,
                 TESTING=False,

                 # --- Only relevant if running distributed
                 redis_head_adr=None,  # (str) IP under which the ray redis server can be reached
                 ):
        """
        Args:
            name (str):                             Under this name all logs, data, and checkpoints will appear.
            log_verbose (bool):                     Whether the program should write detailed logs to TensorBoard.
            log_export_freq:                        Every X iterations, logs are pushed to the Crayon Docker container.
            checkpoint_freq (int):                  Every X iterations, make a recoverable copy of the training state.
            eval_agent_export_freq (int):           Every X iterations, an EvalAgent instance of the algo is exported.
            game_cls (PokerEnv subclass):           Class (not instance) to be trained in.
            env_bldr_cls (EnvBuilder subclass):     Class (not instance) to wrap the environment.
            start_chips (int):                      Standard stack size to initialize all players with.
            eval_modes_of_algo (tuple):             Tuple of the algo-specific EvalAgent's eval modes.
            eval_stack_sizes (tuple):               Tuple of lists of ints. If None, defaults to what's used in
                                                    training_profile.env_bldr.
            module_args (dict):                     All modules or parts of algorithms may have their own args. These
                                                    are stored in separate objects and accessible under a certain
                                                    string key in the "module_args" dict.
            path_data:                              Path to store data (e.g. checkpoints) the algorithm generates.
                                                    If None, we will store data in a folder we create in your home dir.
            local_crayon_server_docker_address:     IP of the Crayon Docker container (default: localhost).
            device_inference:                       "cpu" or "cuda". This device will be used for batched NN inference.
            DISTRIBUTED (bool):                     Whether ray should be used at all.
            CLUSTER:                                Requires "DISTRIBUTED==True".
                                                    If True, runs on many machines; if False, runs on local CPUs/GPUs.
            DEBUGGING (bool):                       Whether to use assert statements for debugging.
            redis_head_adr:                         Only applicable if "CLUSTER==True". IP address under which the ray
                                                    head can be found.

        """

        # Assert basic modules were passed
        assert "env" in module_args

        # t_prof
        self.name = name
        self.log_verbose = log_verbose
        self.log_export_freq = log_export_freq
        self.checkpoint_freq = checkpoint_freq
        self.export_hands_freq = export_hands_freq
        self.eval_agent_export_freq = eval_agent_export_freq

        self.module_args = module_args

        if CLUSTER:
            if redis_head_adr:
                self.redis_head_adr = redis_head_adr
            else:
                from ray import services

                self.redis_head_adr = services.get_node_ip_address() + ":6379"
        self.local_crayon_server_docker_address = local_crayon_server_docker_address

        self.DISTRIBUTED = DISTRIBUTED or CLUSTER
        self.CLUSTER = CLUSTER
        self.DEBUGGING = DEBUGGING
        self.TESTING = TESTING
        self.HAVE_GPU = torch.cuda.is_available()

        self.n_seats = self.module_args["env"].n_seats

        # Eval
        self.eval_modes_of_algo = eval_modes_of_algo
        if eval_stack_sizes is None:
            if start_chips is None:
                self.eval_stack_sizes = [[game_cls.DEFAULT_STACK_SIZE for _ in range(self.n_seats)]]
            else:
                self.eval_stack_sizes = [copy.deepcopy(self.module_args["env"].starting_stack_sizes_list)]
        else:
            assert isinstance(eval_stack_sizes, tuple)
            assert isinstance(eval_stack_sizes[0], list)
            self.eval_stack_sizes = list(eval_stack_sizes)

        self.game_cls_str = game_cls.__name__
        self.env_builder_cls_str = env_bldr_cls.__name__

        assert isinstance(device_inference, str), "Please pass a string (either 'cpu' or 'cuda')!"
        self.device_inference = torch.device(device_inference)

        # Paths
        def get_root_path():
            return "C:\\" if os.name == 'nt' else os.path.expanduser('~/')

        self._data_path = path_data if path_data is not None else os.path.join(get_root_path(), "poker_ai_data")
        if self.TESTING:
            self._data_path = os.path.join(self._data_path, "testing")
        self.path_agent_export_storage = ospj(self._data_path, "eval_agent")
        self.path_log_storage = ospj(self._data_path, "logs")
        self.path_checkpoint = ospj(self._data_path, "checkpoint")
        self.path_export_hands = ospj(self._data_path, "export_hands")
        self.path_trainingprofiles = ospj(self._data_path, "TrainingProfiles")

        for p in [self._data_path,
                  self.path_agent_export_storage,
                  self.path_log_storage,
                  self.path_checkpoint,
                  self.path_export_hands,
                  self.path_trainingprofiles,
                  ]:
            if (not os.path.exists(p)) and (not os.path.isfile(p)):
                os.makedirs(p)
Code Example #13
 def ip(self):
     import ray.services as rservices
     return rservices.get_node_ip_address()
Code Example #14
def start(node_ip_address, address, port, redis_password, redis_shard_ports,
          object_manager_port, node_manager_port, gcs_server_port,
          min_worker_port, max_worker_port, memory, object_store_memory,
          redis_max_memory, num_cpus, num_gpus, resources, head,
          include_dashboard, dashboard_host, dashboard_port, block,
          plasma_directory, autoscaling_config, no_redirect_worker_output,
          no_redirect_output, plasma_store_socket_name, raylet_socket_name,
          temp_dir, java_worker_options, load_code_from_local,
          code_search_path, system_config, lru_evict,
          enable_object_reconstruction, metrics_export_port, log_style,
          log_color, verbose):
    """Start Ray processes manually on the local machine."""
    cli_logger.configure(log_style, log_color, verbose)

    if gcs_server_port and not head:
        raise ValueError(
            "gcs_server_port can be only assigned when you specify --head.")

    # Convert hostnames to numerical IP address.
    if node_ip_address is not None:
        node_ip_address = services.address_to_ip(node_ip_address)

    redis_address = None
    if address is not None:
        (redis_address, redis_address_ip,
         redis_address_port) = services.validate_redis_address(address)

    try:
        resources = json.loads(resources)
    except Exception:
        cli_logger.error("`{}` is not a valid JSON string.",
                         cf.bold("--resources"))
        cli_logger.abort(
            "Valid values look like this: `{}`",
            cf.bold("--resources='\"CustomResource3\": 1, "
                    "\"CustomResource2\": 2}'"))

        raise Exception("Unable to parse the --resources argument using "
                        "json.loads. Try using a format like\n\n"
                        "    --resources='{\"CustomResource1\": 3, "
                        "\"CustomReseource2\": 2}'")

    redirect_worker_output = None if not no_redirect_worker_output else True
    redirect_output = None if not no_redirect_output else True
    ray_params = ray.parameter.RayParams(
        node_ip_address=node_ip_address,
        min_worker_port=min_worker_port,
        max_worker_port=max_worker_port,
        object_manager_port=object_manager_port,
        node_manager_port=node_manager_port,
        gcs_server_port=gcs_server_port,
        memory=memory,
        object_store_memory=object_store_memory,
        redis_password=redis_password,
        redirect_worker_output=redirect_worker_output,
        redirect_output=redirect_output,
        num_cpus=num_cpus,
        num_gpus=num_gpus,
        resources=resources,
        plasma_directory=plasma_directory,
        huge_pages=False,
        plasma_store_socket_name=plasma_store_socket_name,
        raylet_socket_name=raylet_socket_name,
        temp_dir=temp_dir,
        include_dashboard=include_dashboard,
        dashboard_host=dashboard_host,
        dashboard_port=dashboard_port,
        java_worker_options=java_worker_options,
        load_code_from_local=load_code_from_local,
        code_search_path=code_search_path,
        _system_config=system_config,
        lru_evict=lru_evict,
        enable_object_reconstruction=enable_object_reconstruction,
        metrics_export_port=metrics_export_port)
    if head:
        num_redis_shards = None
        # Start Ray on the head node.
        if redis_shard_ports is not None:
            redis_shard_ports = redis_shard_ports.split(",")
            # Infer the number of Redis shards from the ports if the number is
            # not provided.
            num_redis_shards = len(redis_shard_ports)

        if redis_address is not None:
            cli_logger.abort(
                "`{}` starts a new Redis server, `{}` should not be set.",
                cf.bold("--head"), cf.bold("--address"))

            raise Exception("If --head is passed in, a Redis server will be "
                            "started, so a Redis address should not be "
                            "provided.")

        # Get the node IP address if one is not provided.
        ray_params.update_if_absent(
            node_ip_address=services.get_node_ip_address())
        cli_logger.labeled_value("Local node IP", ray_params.node_ip_address)
        cli_logger.old_info(logger, "Using IP address {} for this node.",
                            ray_params.node_ip_address)
        ray_params.update_if_absent(
            redis_port=port,
            redis_shard_ports=redis_shard_ports,
            redis_max_memory=redis_max_memory,
            num_redis_shards=num_redis_shards,
            redis_max_clients=None,
            autoscaling_config=autoscaling_config,
        )

        node = ray.node.Node(
            ray_params, head=True, shutdown_at_exit=block, spawn_reaper=block)
        redis_address = node.redis_address

        # this is a noop if new-style is not set, so the old logger calls
        # are still in place
        cli_logger.newline()
        startup_msg = "Ray runtime started."
        cli_logger.success("-" * len(startup_msg))
        cli_logger.success(startup_msg)
        cli_logger.success("-" * len(startup_msg))
        cli_logger.newline()
        with cli_logger.group("Next steps"):
            cli_logger.print(
                "To connect to this Ray runtime from another node, run")
            cli_logger.print(
                cf.bold("  ray start --address='{}'{}"), redis_address,
                f" --redis-password='******'"
                if redis_password else "")
            cli_logger.newline()
            cli_logger.print("Alternatively, use the following Python code:")
            with cli_logger.indented():
                with cf.with_style("monokai") as c:
                    cli_logger.print("{} ray", c.magenta("import"))
                    cli_logger.print(
                        "ray{}init(address{}{}{})", c.magenta("."),
                        c.magenta("="), c.yellow("'auto'"),
                        ", _redis_password{}{}".format(
                            c.magenta("="),
                            c.yellow("'" + redis_password + "'"))
                        if redis_password else "")
            cli_logger.newline()
            cli_logger.print(
                cf.underlined("If connection fails, check your "
                              "firewall settings and "
                              "network configuration."))
            cli_logger.newline()
            cli_logger.print("To terminate the Ray runtime, run")
            cli_logger.print(cf.bold("  ray stop"))

        cli_logger.old_info(
            logger,
            "\nStarted Ray on this node. You can add additional nodes to "
            "the cluster by calling\n\n"
            "    ray start --address='{}'{}\n\n"
            "from the node you wish to add. You can connect a driver to the "
            "cluster from Python by running\n\n"
            "    import ray\n"
            "    ray.init(address='auto'{})\n\n"
            "If you have trouble connecting from a different machine, check "
            "that your firewall is configured properly. If you wish to "
            "terminate the processes that have been started, run\n\n"
            "    ray stop".format(
                redis_address, " --redis-password='******'"
                if redis_password else "",
                ", _redis_password='******'"
                if redis_password else ""))
    else:
        # Start Ray on a non-head node.
        if not (port is None):
            cli_logger.abort("`{}` should not be specified without `{}`.",
                             cf.bold("--port"), cf.bold("--head"))

            raise Exception("If --head is not passed in, --port is not "
                            "allowed.")
        if redis_shard_ports is not None:
            cli_logger.abort("`{}` should not be specified without `{}`.",
                             cf.bold("--redis-shard-ports"), cf.bold("--head"))

            raise Exception("If --head is not passed in, --redis-shard-ports "
                            "is not allowed.")
        if redis_address is None:
            cli_logger.abort("`{}` is required unless starting with `{}`.",
                             cf.bold("--address"), cf.bold("--head"))

            raise Exception("If --head is not passed in, --address must "
                            "be provided.")
        if include_dashboard:
            cli_logger.abort("`{}` should not be specified without `{}`.",
                             cf.bold("--include-dashboard"), cf.bold("--head"))

            raise ValueError(
                "If --head is not passed in, the --include-dashboard"
                "flag is not relevant.")

        # Wait for the Redis server to start, and throw an exception if we
        # can't connect to it.
        services.wait_for_redis_to_start(
            redis_address_ip, redis_address_port, password=redis_password)

        # Create a Redis client.
        redis_client = services.create_redis_client(
            redis_address, password=redis_password)

        # Check that the version information on this node matches the version
        # information that the cluster was started with.
        services.check_version_info(redis_client)

        # Get the node IP address if one is not provided.
        ray_params.update_if_absent(
            node_ip_address=services.get_node_ip_address(redis_address))

        cli_logger.labeled_value("Local node IP", ray_params.node_ip_address)
        cli_logger.old_info(logger, "Using IP address {} for this node.",
                            ray_params.node_ip_address)

        # Check that there aren't already Redis clients with the same IP
        # address connected with this Redis instance. This raises an exception
        # if the Redis server already has clients on this node.
        check_no_existing_redis_clients(ray_params.node_ip_address,
                                        redis_client)
        ray_params.update(redis_address=redis_address)
        node = ray.node.Node(
            ray_params, head=False, shutdown_at_exit=block, spawn_reaper=block)

        cli_logger.newline()
        startup_msg = "Ray runtime started."
        cli_logger.success("-" * len(startup_msg))
        cli_logger.success(startup_msg)
        cli_logger.success("-" * len(startup_msg))
        cli_logger.newline()
        cli_logger.print("To terminate the Ray runtime, run")
        cli_logger.print(cf.bold("  ray stop"))

        cli_logger.old_info(
            logger, "\nStarted Ray on this node. If you wish to terminate the "
            "processes that have been started, run\n\n"
            "    ray stop")

    if block:
        cli_logger.newline()
        with cli_logger.group(cf.bold("--block")):
            cli_logger.print(
                "This command will now block until terminated by a signal.")
            cli_logger.print(
                "Runing subprocesses are monitored and a message will be "
                "printed if any of them terminate unexpectedly.")

        while True:
            time.sleep(1)
            deceased = node.dead_processes()
            if len(deceased) > 0:
                cli_logger.newline()
                cli_logger.error("Some Ray subprcesses exited unexpectedly:")
                cli_logger.old_error(logger,
                                     "Ray processes died unexpectedly:")

                with cli_logger.indented():
                    for process_type, process in deceased:
                        cli_logger.error(
                            "{}",
                            cf.bold(str(process_type)),
                            _tags={"exit code": str(process.returncode)})
                        cli_logger.old_error(
                            logger, "\t{} died with exit code {}".format(
                                process_type, process.returncode))

                # shutdown_at_exit will handle cleanup.
                cli_logger.newline()
                cli_logger.error("Remaining processes will be killed.")
                cli_logger.old_error(
                    logger, "Killing remaining processes and exiting...")
                sys.exit(1)
Code Example #15
File: autoscaler.py Project: adgirish/ray
 def __init__(self):
     self.last_used_time_by_ip = {}
     self.last_heartbeat_time_by_ip = {}
     self.static_resources_by_ip = {}
     self.dynamic_resources_by_ip = {}
     self.local_ip = services.get_node_ip_address()
Code Example #16
File: syncer.py Project: zachkeer/ray
 def __init__(self, local_dir, remote_dir, sync_client):
     self.local_ip = services.get_node_ip_address()
     self.worker_ip = None
     super(NodeSyncer, self).__init__(local_dir, remote_dir, sync_client)
Code Example #17
File: autoscaler.py Project: adgirish/ray
def with_head_node_ip(cmds):
    head_ip = services.get_node_ip_address()
    out = []
    for cmd in cmds:
        out.append("export RAY_HEAD_IP={}; {}".format(head_ip, cmd))
    return out
Code Example #18
    # Note that we redirect stdout and stderr to /dev/null because otherwise
    # attempts to print may cause exceptions if a process is started inside of an
    # SSH connection and the SSH connection dies. TODO(rkn): This is a temporary
    # fix. We should actually redirect stdout and stderr to Redis in some way.

    if args.head:
        # Start Ray on the head node.
        if args.redis_address is not None:
            raise Exception(
                "If --head is passed in, a Redis server will be started, so a Redis address should not be provided."
            )

        # Get the node IP address if one is not provided.
        if args.node_ip_address is None:
            node_ip_address = services.get_node_ip_address()
        else:
            node_ip_address = args.node_ip_address
        print("Using IP address {} for this node.".format(node_ip_address))

        if args.redis_port is not None:
            address_info = {
                "redis_address": "{}:{}".format(node_ip_address,
                                                args.redis_port)
            }
        else:
            address_info = None

        address_info = services.start_ray_head(address_info=address_info,
                                               node_ip_address=node_ip_address,
                                               num_workers=args.num_workers,
Code Example #19
File: scripts.py Project: zhuohan123/ray
def start(node_ip_address, redis_address, address, redis_port, port,
          num_redis_shards, redis_max_clients, redis_password,
          redis_shard_ports, object_manager_port, node_manager_port,
          min_worker_port, max_worker_port, memory, object_store_memory,
          redis_max_memory, num_cpus, num_gpus, resources, head, include_webui,
          webui_host, block, plasma_directory, huge_pages, autoscaling_config,
          no_redirect_worker_output, no_redirect_output,
          plasma_store_socket_name, raylet_socket_name, temp_dir, include_java,
          java_worker_options, load_code_from_local, internal_config):
    """Start Ray processes manually on the local machine."""
    if redis_address is not None:
        raise DeprecationWarning("The --redis-address argument is "
                                 "deprecated. Please use --address instead.")
    if redis_port is not None:
        logger.warn("The --redis-port argument will be deprecated soon. "
                    "Please use --port instead.")
        if port is not None and port != redis_port:
            raise ValueError("Cannot specify both --port and --redis-port "
                             "as port is a rename of deprecated redis-port")

    # Convert hostnames to numerical IP address.
    if node_ip_address is not None:
        node_ip_address = services.address_to_ip(node_ip_address)

    if redis_address is not None or address is not None:
        (redis_address, redis_address_ip,
         redis_address_port) = services.validate_redis_address(
             address, redis_address)

    try:
        resources = json.loads(resources)
    except Exception:
        raise Exception("Unable to parse the --resources argument using "
                        "json.loads. Try using a format like\n\n"
                        "    --resources='{\"CustomResource1\": 3, "
                        "\"CustomReseource2\": 2}'")

    redirect_worker_output = None if not no_redirect_worker_output else True
    redirect_output = None if not no_redirect_output else True
    ray_params = ray.parameter.RayParams(
        node_ip_address=node_ip_address,
        min_worker_port=min_worker_port,
        max_worker_port=max_worker_port,
        object_manager_port=object_manager_port,
        node_manager_port=node_manager_port,
        memory=memory,
        object_store_memory=object_store_memory,
        redis_password=redis_password,
        redirect_worker_output=redirect_worker_output,
        redirect_output=redirect_output,
        num_cpus=num_cpus,
        num_gpus=num_gpus,
        resources=resources,
        plasma_directory=plasma_directory,
        huge_pages=huge_pages,
        plasma_store_socket_name=plasma_store_socket_name,
        raylet_socket_name=raylet_socket_name,
        temp_dir=temp_dir,
        include_java=include_java,
        include_webui=include_webui,
        webui_host=webui_host,
        java_worker_options=java_worker_options,
        load_code_from_local=load_code_from_local,
        _internal_config=internal_config)
    if head:
        # Start Ray on the head node.
        if redis_shard_ports is not None:
            redis_shard_ports = redis_shard_ports.split(",")
            # Infer the number of Redis shards from the ports if the number is
            # not provided.
            if num_redis_shards is None:
                num_redis_shards = len(redis_shard_ports)
            # Check that the arguments match.
            if len(redis_shard_ports) != num_redis_shards:
                raise Exception("If --redis-shard-ports is provided, it must "
                                "have the form '6380,6381,6382', and the "
                                "number of ports provided must equal "
                                "--num-redis-shards (which is 1 if not "
                                "provided)")

        if redis_address is not None:
            raise Exception("If --head is passed in, a Redis server will be "
                            "started, so a Redis address should not be "
                            "provided.")

        # Get the node IP address if one is not provided.
        ray_params.update_if_absent(
            node_ip_address=services.get_node_ip_address())
        logger.info("Using IP address {} for this node.".format(
            ray_params.node_ip_address))
        ray_params.update_if_absent(
            redis_port=port or redis_port,
            redis_shard_ports=redis_shard_ports,
            redis_max_memory=redis_max_memory,
            num_redis_shards=num_redis_shards,
            redis_max_clients=redis_max_clients,
            autoscaling_config=autoscaling_config,
            include_java=False,
        )

        node = ray.node.Node(ray_params,
                             head=True,
                             shutdown_at_exit=block,
                             spawn_reaper=block)
        redis_address = node.redis_address

        logger.info(
            "\nStarted Ray on this node. You can add additional nodes to "
            "the cluster by calling\n\n"
            "    ray start --address='{}'{}\n\n"
            "from the node you wish to add. You can connect a driver to the "
            "cluster from Python by running\n\n"
            "    import ray\n"
            "    ray.init(address='auto'{})\n\n"
            "If you have trouble connecting from a different machine, check "
            "that your firewall is configured properly. If you wish to "
            "terminate the processes that have been started, run\n\n"
            "    ray stop".format(
                redis_address, " --redis-password='******'" if redis_password else "", ", redis_password='******'" if redis_password else ""))
    else:
        # Start Ray on a non-head node.
        if not (redis_port is None and port is None):
            raise Exception(
                "If --head is not passed in, --port and --redis-port are not "
                "allowed.")
        if redis_shard_ports is not None:
            raise Exception("If --head is not passed in, --redis-shard-ports "
                            "is not allowed.")
        if redis_address is None:
            raise Exception("If --head is not passed in, --address must "
                            "be provided.")
        if num_redis_shards is not None:
            raise Exception("If --head is not passed in, --num-redis-shards "
                            "must not be provided.")
        if redis_max_clients is not None:
            raise Exception("If --head is not passed in, --redis-max-clients "
                            "must not be provided.")
        if include_webui:
            raise Exception("If --head is not passed in, the --include-webui "
                            "flag is not relevant.")
        if include_java is not None:
            raise ValueError("--include-java should only be set for the head "
                             "node.")

        # Wait for the Redis server to be started, and throw an exception
        # if we can't connect to it.
        services.wait_for_redis_to_start(redis_address_ip,
                                         redis_address_port,
                                         password=redis_password)

        # Create a Redis client.
        redis_client = services.create_redis_client(redis_address,
                                                    password=redis_password)

        # Check that the version information on this node matches the version
        # information that the cluster was started with.
        services.check_version_info(redis_client)

        # Get the node IP address if one is not provided.
        ray_params.update_if_absent(
            node_ip_address=services.get_node_ip_address(redis_address))
        logger.info("Using IP address {} for this node.".format(
            ray_params.node_ip_address))
        # Check that there aren't already Redis clients with the same IP
        # address connected with this Redis instance. This raises an exception
        # if the Redis server already has clients on this node.
        check_no_existing_redis_clients(ray_params.node_ip_address,
                                        redis_client)
        ray_params.update(redis_address=redis_address)
        node = ray.node.Node(ray_params,
                             head=False,
                             shutdown_at_exit=block,
                             spawn_reaper=block)
        logger.info("\nStarted Ray on this node. If you wish to terminate the "
                    "processes that have been started, run\n\n"
                    "    ray stop")

    if block:
        while True:
            time.sleep(1)
            deceased = node.dead_processes()
            if len(deceased) > 0:
                logger.error("Ray processes died unexpectedly:")
                for process_type, process in deceased:
                    logger.error("\t{} died with exit code {}".format(
                        process_type, process.returncode))
                # shutdown_at_exit will handle cleanup.
                logger.error("Killing remaining processes and exiting...")
                sys.exit(1)
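
The log message above doubles as usage documentation: workers join with `ray start --address=...` and drivers connect with `ray.init(address='auto')`. A minimal driver-side sketch for a password-protected cluster (the password value is a placeholder):

import ray

# Connect a driver to the running cluster; address="auto" resolves the
# Redis address of the local cluster, and redis_password must match the
# value given to `ray start --head`.
ray.init(address="auto", redis_password="my-cluster-password")

# Sanity check: list the nodes that have joined the cluster.
print(ray.nodes())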
Code example #20
def start(node_ip_address, redis_address, redis_port, num_redis_shards,
          redis_max_clients, object_manager_port, num_workers, num_cpus,
          num_gpus, num_custom_resource, head, no_ui, block, plasma_directory,
          huge_pages):
    # Note that we redirect stdout and stderr to /dev/null because otherwise
    # attempts to print may cause exceptions if a process is started inside of
    # an SSH connection and the SSH connection dies. TODO(rkn): This is a
    # temporary fix. We should actually redirect stdout and stderr to Redis in
    # some way.

    # Convert hostnames to numerical IP address.
    if node_ip_address is not None:
        node_ip_address = services.address_to_ip(node_ip_address)
    if redis_address is not None:
        redis_address = services.address_to_ip(redis_address)

    if head:
        # Start Ray on the head node.
        if redis_address is not None:
            raise Exception("If --head is passed in, a Redis server will be "
                            "started, so a Redis address should not be "
                            "provided.")

        # Get the node IP address if one is not provided.
        if node_ip_address is None:
            node_ip_address = services.get_node_ip_address()
        print("Using IP address {} for this node.".format(node_ip_address))

        address_info = {}
        # Use the provided object manager port if there is one.
        if object_manager_port is not None:
            address_info["object_manager_ports"] = [object_manager_port]
        if address_info == {}:
            address_info = None

        address_info = services.start_ray_head(
            address_info=address_info,
            node_ip_address=node_ip_address,
            redis_port=redis_port,
            num_workers=num_workers,
            cleanup=False,
            redirect_output=True,
            num_cpus=num_cpus,
            num_gpus=num_gpus,
            num_custom_resource=num_custom_resource,
            num_redis_shards=num_redis_shards,
            redis_max_clients=redis_max_clients,
            include_webui=(not no_ui),
            plasma_directory=plasma_directory,
            huge_pages=huge_pages)
        print(address_info)
        print("\nStarted Ray on this node. You can add additional nodes to "
              "the cluster by calling\n\n"
              "    ray start --redis-address {}\n\n"
              "from the node you wish to add. You can connect a driver to the "
              "cluster from Python by running\n\n"
              "    import ray\n"
              "    ray.init(redis_address=\"{}\")\n\n"
              "If you have trouble connecting from a different machine, check "
              "that your firewall is configured properly. If you wish to "
              "terminate the processes that have been started, run\n\n"
              "    ray stop".format(address_info["redis_address"],
                                    address_info["redis_address"]))
    else:
        # Start Ray on a non-head node.
        if redis_port is not None:
            raise Exception("If --head is not passed in, --redis-port is not "
                            "allowed")
        if redis_address is None:
            raise Exception("If --head is not passed in, --redis-address must "
                            "be provided.")
        if num_redis_shards is not None:
            raise Exception("If --head is not passed in, --num-redis-shards "
                            "must not be provided.")
        if redis_max_clients is not None:
            raise Exception("If --head is not passed in, --redis-max-clients "
                            "must not be provided.")
        if no_ui:
            raise Exception("If --head is not passed in, the --no-ui flag is "
                            "not relevant.")
        redis_ip_address, redis_port = redis_address.split(":")
        # Wait for the Redis server to be started, and throw an exception
        # if we can't connect to it.
        services.wait_for_redis_to_start(redis_ip_address, int(redis_port))
        # Get the node IP address if one is not provided.
        if node_ip_address is None:
            node_ip_address = services.get_node_ip_address(redis_address)
        print("Using IP address {} for this node.".format(node_ip_address))
        # Check that there aren't already Redis clients with the same IP
        # address connected with this Redis instance. This raises an exception
        # if the Redis server already has clients on this node.
        check_no_existing_redis_clients(node_ip_address, redis_address)
        address_info = services.start_ray_node(
            node_ip_address=node_ip_address,
            redis_address=redis_address,
            object_manager_ports=[object_manager_port],
            num_workers=num_workers,
            cleanup=False,
            redirect_output=True,
            num_cpus=num_cpus,
            num_gpus=num_gpus,
            num_custom_resource=num_custom_resource,
            plasma_directory=plasma_directory,
            huge_pages=huge_pages)
        print(address_info)
        print("\nStarted Ray on this node. If you wish to terminate the "
              "processes that have been started, run\n\n"
              "    ray stop")

    if block:
        import time
        while True:
            time.sleep(30)
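
Both versions of `start` guard against double-starting Ray on a node via `check_no_existing_redis_clients`, whose implementation is not shown on this page. Below is a hypothetical sketch of such a check, assuming a redis-py client and the CLIENT LIST command; it is not Ray's actual implementation:

import redis

def check_no_existing_redis_clients(node_ip_address, redis_client):
    # Hypothetical: walk the clients connected to Redis and refuse to
    # proceed if one is already connected from this node's IP address.
    for client in redis_client.client_list():
        client_ip = client.get("addr", "").split(":")[0]
        if client_ip == node_ip_address:
            raise Exception("Redis already has a client from {}; is Ray "
                            "already running on this node?".format(
                                node_ip_address))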
Code example #21
File: hello.py Project: ashbyhobart/holder
import time
from ray.services import get_node_ip_address  # assumed; the original snippet omitted its imports

def f():
    time.sleep(0.01)
    return get_node_ip_address()
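
A task like `f` is commonly used to probe which nodes a cluster schedules work on; the short sleep spreads tasks across workers instead of letting one worker absorb them all. A usage sketch, assuming a running cluster (the task count is arbitrary):

import ray

ray.init(address="auto")
remote_f = ray.remote(f)

# Launch many short tasks and collect the distinct node IPs they ran on.
ip_addresses = set(ray.get([remote_f.remote() for _ in range(100)]))
print("Tasks were scheduled on:", ip_addresses)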
Code example #22
def _train(
        dtrain,
        num_actors,
        params: Dict,
        *args,
        evals=(),
        **kwargs,
):
    """
    Run distributed training of an XGBoost model on the Ray backend.

    The function distributes `dtrain` evenly across workers, assigning
    partitions by node IP address (if partitions are spread unevenly
    across nodes, some are re-distributed between nodes), runs `xgb.train`
    on each worker's subset of `dtrain`, and reduces the per-worker
    training results through the Rabit context.

    Parameters
    ----------
    dtrain : modin.experimental.xgboost.DMatrix
        Data to be trained against.
    num_actors : int, optional
        Number of actors for training. If unspecified, this value will be
        computed automatically.
    params : dict
        Booster params.
    *args : iterable
        Other parameters for `xgboost.train`.
    evals : list of pairs (modin.experimental.xgboost.DMatrix, str), default: empty
        List of validation sets for which metrics will be evaluated during
        training. Validation metrics help track the performance of the model.
    **kwargs : dict
        Other parameters are the same as `xgboost.train`.

    Returns
    -------
    dict
        A dictionary with trained booster and dict of
        evaluation results
        as {"booster": xgboost.Booster, "history": dict}.
    """
    s = time.time()

    X_row_parts, y_row_parts = dtrain

    assert len(X_row_parts) == len(y_row_parts), "Unaligned train data"

    num_actors = _get_num_actors(
        num_actors if isinstance(num_actors, int) else "default_train")

    if num_actors > len(X_row_parts):
        num_actors = len(X_row_parts)

    actors, pg = create_actors(num_actors)

    add_as_eval_method = None
    if evals:
        for (eval_data, method) in evals[:]:
            if eval_data is dtrain:
                add_as_eval_method = method
                evals.remove((eval_data, method))

        for ((eval_X, eval_y), eval_method) in evals:
            # Split data across workers
            _split_data_across_actors(
                actors,
                lambda actor, *X_y: actor.add_eval_data.remote(
                    *X_y, eval_method=eval_method),
                eval_X,
                y_parts=eval_y,
            )

    # Split data across workers
    _split_data_across_actors(
        actors,
        lambda actor, *X_y: actor.set_train_data.remote(
            *X_y, add_as_eval_method=add_as_eval_method),
        X_row_parts,
        y_parts=y_row_parts,
    )
    LOGGER.info(f"Data preparation time: {time.time() - s} s")

    s = time.time()
    with RabitContextManager(len(actors), get_node_ip_address()) as env:
        rabit_args = [("%s=%s" % item).encode() for item in env.items()]

        # Train
        fut = [
            actor.train.remote(rabit_args, params, *args, **kwargs)
            for actor in actors
        ]
        # All results should be the same because of Rabit tracking. So we just
        # return the first one.
        result = ray.get(fut[0])
        remove_placement_group(pg)
        LOGGER.info(f"Training time: {time.time() - s} s")
        return result
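
Based only on the docstring above, a call to `_train` could look like the following sketch; the `DMatrix` import path, dataset file, and booster params are illustrative assumptions, not verified against modin's experimental API:

import modin.pandas as pd
from modin.experimental.xgboost import DMatrix  # assumed import path

df = pd.read_csv("train.csv")  # hypothetical dataset
y = df.pop("label")
dtrain = DMatrix(df, y)

# Extra keyword arguments such as num_boost_round are forwarded to xgb.train.
result = _train(dtrain, 4, {"objective": "binary:logistic"},
                num_boost_round=10)
booster, history = result["booster"], result["history"]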