Exemple #1
0
    def add_node(self, **override_kwargs):
        """Start one more Ray node in this local cluster and return it.

        While ``self.head_node`` is None the node is started as the head via
        ``services.start_ray_head`` and its Redis address is remembered on
        ``self.redis_address``. Afterwards every call starts a further node
        via ``services.start_ray_node`` against that Redis address and
        records it in ``self.worker_nodes``.

        Settings applied unless overridden:
            cleanup=True,
            resources={"CPU": 1},
            object_store_memory=100 * (2**20) # 100 MB

        Args:
            override_kwargs: Keyword arguments forwarded to
                `start_ray_head` / `start_ray_node`. They take precedence
                over the settings listed above.

        Returns:
            Node object of the added Ray node.
        """
        launch_options = dict(
            cleanup=True,
            resources={"CPU": 1},
            object_store_memory=100 * (2**20))  # 100 MB
        launch_options.update(override_kwargs)

        starting_head = self.head_node is None
        if starting_head:
            address_info = services.start_ray_head(
                node_ip_address=services.get_node_ip_address(),
                include_webui=False,
                **launch_options)
            self.redis_address = address_info["redis_address"]
        else:
            address_info = services.start_ray_node(
                services.get_node_ip_address(), self.redis_address,
                **launch_options)

        # TODO(rliaw): Find a more stable way than modifying global state.
        # Keep a shallow copy of the global process table for this node, then
        # point every key of the global table at a fresh empty list.
        started_processes = services.all_processes.copy()
        for process_type in services.all_processes:
            services.all_processes[process_type] = []
        node = Node(started_processes)

        if starting_head:
            self.head_node = node
        else:
            self.worker_nodes[node] = address_info

        logger.info("Starting Node with raylet socket {}".format(
            address_info["raylet_socket_names"]))

        return node
Exemple #2
0
    def add_node(self, **override_kwargs):
        """Start one more Ray node in this local cluster and return it.

        A ``RayParams`` object is built from the defaults below plus
        ``override_kwargs``. While ``self.head_node`` is None the node is
        started as the head via ``services.start_ray_head`` (with the web UI
        disabled) and its Redis address is remembered on
        ``self.redis_address``. Afterwards every call starts a further node
        via ``services.start_ray_node`` against that Redis address and
        records it in ``self.worker_nodes``. Both start calls are made with
        ``cleanup=True``.

        Settings applied unless overridden:
            num_cpus=1,
            object_store_memory=100 * (2**20) # 100 MB

        Args:
            override_kwargs: Keyword arguments passed to ``RayParams``.
                They take precedence over the settings listed above.

        Returns:
            Node object of the added Ray node.
        """
        param_overrides = dict(
            num_cpus=1,
            object_store_memory=100 * (2**20))  # 100 MB
        param_overrides.update(override_kwargs)
        ray_params = RayParams(
            node_ip_address=services.get_node_ip_address(), **param_overrides)

        starting_head = self.head_node is None
        if starting_head:
            ray_params.update(include_webui=False)
            address_info = services.start_ray_head(ray_params, cleanup=True)
            self.redis_address = address_info["redis_address"]
        else:
            ray_params.update(redis_address=self.redis_address)
            address_info = services.start_ray_node(ray_params, cleanup=True)

        # TODO(rliaw): Find a more stable way than modifying global state.
        # Keep a shallow copy of the global process table for this node, then
        # point every key of the global table at a fresh empty list.
        started_processes = services.all_processes.copy()
        for process_type in services.all_processes:
            services.all_processes[process_type] = []
        node = Node(address_info, started_processes)

        if starting_head:
            self.head_node = node
        else:
            self.worker_nodes[node] = address_info

        logger.info("Starting Node with raylet socket {}".format(
            address_info["raylet_socket_name"]))

        return node
Exemple #3
0
def start(node_ip_address, redis_address, redis_port, num_redis_shards,
          redis_max_clients, redis_password, redis_shard_ports,
          object_manager_port, node_manager_port, object_store_memory,
          redis_max_memory, num_workers, num_cpus, num_gpus, resources, head,
          no_ui, block, plasma_directory, huge_pages, autoscaling_config,
          no_redirect_worker_output, no_redirect_output,
          plasma_store_socket_name, raylet_socket_name, temp_dir,
          internal_config):
    """Start the Ray processes for this machine as a head or non-head node.

    With ``head`` set, a head node is started through
    ``services.start_ray_head``. Otherwise the function waits for the Redis
    server at ``redis_address``, checks version information and existing
    clients, and starts a non-head node through ``services.start_ray_node``.
    In both cases the resulting address info and a usage hint are logged.

    Args:
        node_ip_address: Hostname or IP of this node. Resolved with
            ``services.address_to_ip`` when given; otherwise filled in from
            ``services.get_node_ip_address``.
        redis_address: Address of an existing Redis server. It is split on
            ":" into host and port. Must be None with ``head`` and is
            required without it.
        redis_port: Head-node only; rejected without ``head``.
        num_redis_shards: Head-node only. Inferred from
            ``redis_shard_ports`` when None.
        redis_max_clients: Head-node only; rejected without ``head``.
        redis_password: Passed to ``RayParams`` and used when connecting to
            Redis; also echoed in the logged usage hint.
        redis_shard_ports: Comma-separated port list, head-node only. Its
            length must equal ``num_redis_shards``.
        object_manager_port: Passed to ``RayParams``.
        node_manager_port: Passed to ``RayParams``.
        object_store_memory: Passed to ``RayParams``.
        redis_max_memory: Head-node only; set on the params if absent.
        num_workers: Passed to ``RayParams``.
        num_cpus: Passed to ``RayParams``.
        num_gpus: Passed to ``RayParams``.
        resources: JSON text, parsed with ``json.loads`` and passed to
            ``RayParams``.
        head: Truthy to start a head node.
        no_ui: Head-node only; ``include_webui`` is set to ``not no_ui``.
            Rejected (when truthy) without ``head``.
        block: Truthy to keep this call from returning (sleeps forever).
        plasma_directory: Passed to ``RayParams``.
        huge_pages: Passed to ``RayParams``.
        autoscaling_config: Head-node only; set on the params if absent.
        no_redirect_worker_output: ``RayParams`` receives
            ``redirect_worker_output=not no_redirect_worker_output``.
        no_redirect_output: ``RayParams`` receives
            ``redirect_output=not no_redirect_output``.
        plasma_store_socket_name: Passed to ``RayParams``.
        raylet_socket_name: Passed to ``RayParams``.
        temp_dir: Passed to ``RayParams``.
        internal_config: Passed to ``RayParams`` as ``_internal_config``.

    Raises:
        Exception: If ``resources`` cannot be parsed as JSON, or if an
            argument is not allowed for the selected mode.
    """
    # Convert hostnames to numerical IP address.
    if node_ip_address is not None:
        node_ip_address = services.address_to_ip(node_ip_address)
    if redis_address is not None:
        redis_address = services.address_to_ip(redis_address)

    try:
        resources = json.loads(resources)
    except Exception:
        raise Exception("Unable to parse the --resources argument using "
                        "json.loads. Try using a format like\n\n"
                        "    --resources='{\"CustomResource1\": 3, "
                        "\"CustomResource2\": 2}'")

    ray_params = RayParams(
        node_ip_address=node_ip_address,
        object_manager_port=object_manager_port,
        node_manager_port=node_manager_port,
        num_workers=num_workers,
        object_store_memory=object_store_memory,
        redis_password=redis_password,
        redirect_worker_output=not no_redirect_worker_output,
        redirect_output=not no_redirect_output,
        num_cpus=num_cpus,
        num_gpus=num_gpus,
        resources=resources,
        plasma_directory=plasma_directory,
        huge_pages=huge_pages,
        plasma_store_socket_name=plasma_store_socket_name,
        raylet_socket_name=raylet_socket_name,
        temp_dir=temp_dir,
        _internal_config=internal_config)

    if head:
        # Start Ray on the head node.
        if redis_shard_ports is not None:
            redis_shard_ports = redis_shard_ports.split(",")
            # Infer the number of Redis shards from the ports if the number is
            # not provided.
            if num_redis_shards is None:
                num_redis_shards = len(redis_shard_ports)
            # Check that the arguments match.
            if len(redis_shard_ports) != num_redis_shards:
                raise Exception("If --redis-shard-ports is provided, it must "
                                "have the form '6380,6381,6382', and the "
                                "number of ports provided must equal "
                                "--num-redis-shards (which is 1 if not "
                                "provided)")

        if redis_address is not None:
            raise Exception("If --head is passed in, a Redis server will be "
                            "started, so a Redis address should not be "
                            "provided.")

        # Get the node IP address if one is not provided.
        ray_params.update_if_absent(
            node_ip_address=services.get_node_ip_address())
        logger.info("Using IP address {} for this node.".format(
            ray_params.node_ip_address))
        ray_params.update_if_absent(redis_port=redis_port,
                                    redis_shard_ports=redis_shard_ports,
                                    redis_max_memory=redis_max_memory,
                                    num_redis_shards=num_redis_shards,
                                    redis_max_clients=redis_max_clients,
                                    include_webui=(not no_ui),
                                    autoscaling_config=autoscaling_config)

        address_info = services.start_ray_head(ray_params, cleanup=False)
        logger.info(address_info)
        # The password fragments are empty strings when no password is set,
        # so the hint then shows only the Redis address.
        logger.info(
            "\nStarted Ray on this node. You can add additional nodes to "
            "the cluster by calling\n\n"
            "    ray start --redis-address {}{}{}\n\n"
            "from the node you wish to add. You can connect a driver to the "
            "cluster from Python by running\n\n"
            "    import ray\n"
            "    ray.init(redis_address=\"{}{}{}\")\n\n"
            "If you have trouble connecting from a different machine, check "
            "that your firewall is configured properly. If you wish to "
            "terminate the processes that have been started, run\n\n"
            "    ray stop".format(
                address_info["redis_address"],
                " --redis-password " if redis_password else "",
                redis_password if redis_password else "",
                address_info["redis_address"],
                "\", redis_password=\"" if redis_password else "",
                redis_password if redis_password else ""))
    else:
        # Start Ray on a non-head node.
        if redis_port is not None:
            raise Exception("If --head is not passed in, --redis-port is not "
                            "allowed")
        if redis_shard_ports is not None:
            raise Exception("If --head is not passed in, --redis-shard-ports "
                            "is not allowed")
        if redis_address is None:
            raise Exception("If --head is not passed in, --redis-address must "
                            "be provided.")
        if num_redis_shards is not None:
            raise Exception("If --head is not passed in, --num-redis-shards "
                            "must not be provided.")
        if redis_max_clients is not None:
            raise Exception("If --head is not passed in, --redis-max-clients "
                            "must not be provided.")
        if no_ui:
            raise Exception("If --head is not passed in, the --no-ui flag is "
                            "not relevant.")
        redis_ip_address, redis_port = redis_address.split(":")

        # Wait for the Redis server to be started. And throw an exception if we
        # can't connect to it.
        services.wait_for_redis_to_start(redis_ip_address,
                                         int(redis_port),
                                         password=redis_password)

        # Create a Redis client.
        redis_client = services.create_redis_client(redis_address,
                                                    password=redis_password)

        # Check that the version information on this node matches the version
        # information that the cluster was started with.
        services.check_version_info(redis_client)

        # Get the node IP address if one is not provided.
        ray_params.update_if_absent(
            node_ip_address=services.get_node_ip_address(redis_address))
        logger.info("Using IP address {} for this node.".format(
            ray_params.node_ip_address))
        # Check that there aren't already Redis clients with the same IP
        # address connected with this Redis instance. This raises an exception
        # if the Redis server already has clients on this node.
        check_no_existing_redis_clients(ray_params.node_ip_address,
                                        redis_client)
        ray_params.redis_address = redis_address
        address_info = services.start_ray_node(ray_params, cleanup=False)
        logger.info(address_info)
        logger.info("\nStarted Ray on this node. If you wish to terminate the "
                    "processes that have been started, run\n\n"
                    "    ray stop")

    if block:
        # Keep the calling process alive indefinitely.
        import time
        while True:
            time.sleep(30)
Exemple #4
0
def start(node_ip_address, redis_address, redis_port, num_redis_shards,
          redis_max_clients, redis_shard_ports, object_manager_port,
          object_store_memory, num_workers, num_cpus, num_gpus, resources,
          head, no_ui, block, plasma_directory, huge_pages, autoscaling_config,
          use_raylet):
    """Start the Ray processes for this machine as a head or non-head node.

    With ``head`` set, a head node is started through
    ``services.start_ray_head``. Otherwise the function waits for the Redis
    server at ``redis_address``, checks version information and existing
    clients, and starts a non-head node through ``services.start_ray_node``.
    In both cases the resulting address info and a usage hint are printed.

    Args:
        node_ip_address: Hostname or IP of this node. Resolved with
            ``services.address_to_ip`` when given; otherwise looked up with
            ``services.get_node_ip_address``.
        redis_address: Address of an existing Redis server. It is split on
            ":" into host and port. Must be None with ``head`` and is
            required without it.
        redis_port: Head-node only; rejected without ``head``.
        num_redis_shards: Head-node only. Inferred from
            ``redis_shard_ports`` when None.
        redis_max_clients: Head-node only; rejected without ``head``.
        redis_shard_ports: Comma-separated port list, head-node only. Its
            length must equal ``num_redis_shards``.
        object_manager_port: Optional object manager port forwarded to the
            started node.
        object_store_memory: Forwarded to the started node.
        num_workers: Forwarded to the started node.
        num_cpus: Stored as ``resources["CPU"]`` when not None.
        num_gpus: Stored as ``resources["GPU"]`` when not None.
        resources: JSON text, parsed with ``json.loads``. Must not contain
            the keys "CPU" or "GPU".
        head: Truthy to start a head node.
        no_ui: Head-node only; ``include_webui`` is set to ``not no_ui``.
            Rejected (when truthy) without ``head``.
        block: Truthy to keep this call from returning (sleeps forever).
        plasma_directory: Forwarded to the started node.
        huge_pages: Forwarded to the started node.
        autoscaling_config: Forwarded to ``start_ray_head``.
        use_raylet: Forwarded to the started node. When None and the
            environment variable ``RAY_USE_XRAY`` equals "1", it is set to
            True.

    Raises:
        Exception: If ``resources`` cannot be parsed as JSON, or if an
            argument is not allowed for the selected mode.
        AssertionError: If ``resources`` contains "CPU" or "GPU".
    """
    # Convert hostnames to numerical IP address.
    if node_ip_address is not None:
        node_ip_address = services.address_to_ip(node_ip_address)
    if redis_address is not None:
        redis_address = services.address_to_ip(redis_address)

    if use_raylet is None and os.environ.get("RAY_USE_XRAY") == "1":
        # This environment variable is used in our testing setup.
        print("Detected environment variable 'RAY_USE_XRAY'.")
        use_raylet = True

    try:
        resources = json.loads(resources)
    except Exception:
        raise Exception("Unable to parse the --resources argument using "
                        "json.loads. Try using a format like\n\n"
                        "    --resources='{\"CustomResource1\": 3, "
                        "\"CustomResource2\": 2}'")

    # Raised explicitly (rather than via `assert`) so the check is not
    # stripped when Python runs with -O; the exception type is unchanged.
    if "CPU" in resources:
        raise AssertionError("Use the --num-cpus argument.")
    if "GPU" in resources:
        raise AssertionError("Use the --num-gpus argument.")
    if num_cpus is not None:
        resources["CPU"] = num_cpus
    if num_gpus is not None:
        resources["GPU"] = num_gpus

    if head:
        # Start Ray on the head node.
        if redis_shard_ports is not None:
            redis_shard_ports = redis_shard_ports.split(",")
            # Infer the number of Redis shards from the ports if the number is
            # not provided.
            if num_redis_shards is None:
                num_redis_shards = len(redis_shard_ports)
            # Check that the arguments match.
            if len(redis_shard_ports) != num_redis_shards:
                raise Exception("If --redis-shard-ports is provided, it must "
                                "have the form '6380,6381,6382', and the "
                                "number of ports provided must equal "
                                "--num-redis-shards (which is 1 if not "
                                "provided)")

        if redis_address is not None:
            raise Exception("If --head is passed in, a Redis server will be "
                            "started, so a Redis address should not be "
                            "provided.")

        # Get the node IP address if one is not provided.
        if node_ip_address is None:
            node_ip_address = services.get_node_ip_address()
        print("Using IP address {} for this node.".format(node_ip_address))

        # Use the provided object manager port if there is one; otherwise
        # pass None instead of an empty dict.
        if object_manager_port is not None:
            address_info = {"object_manager_ports": [object_manager_port]}
        else:
            address_info = None

        address_info = services.start_ray_head(
            address_info=address_info,
            node_ip_address=node_ip_address,
            redis_port=redis_port,
            redis_shard_ports=redis_shard_ports,
            object_store_memory=object_store_memory,
            num_workers=num_workers,
            cleanup=False,
            redirect_worker_output=True,
            redirect_output=True,
            resources=resources,
            num_redis_shards=num_redis_shards,
            redis_max_clients=redis_max_clients,
            include_webui=(not no_ui),
            plasma_directory=plasma_directory,
            huge_pages=huge_pages,
            autoscaling_config=autoscaling_config,
            use_raylet=use_raylet)
        print(address_info)
        print("\nStarted Ray on this node. You can add additional nodes to "
              "the cluster by calling\n\n"
              "    ray start --redis-address {}\n\n"
              "from the node you wish to add. You can connect a driver to the "
              "cluster from Python by running\n\n"
              "    import ray\n"
              "    ray.init(redis_address=\"{}\")\n\n"
              "If you have trouble connecting from a different machine, check "
              "that your firewall is configured properly. If you wish to "
              "terminate the processes that have been started, run\n\n"
              "    ray stop".format(address_info["redis_address"],
                                    address_info["redis_address"]))
    else:
        # Start Ray on a non-head node.
        if redis_port is not None:
            raise Exception("If --head is not passed in, --redis-port is not "
                            "allowed")
        if redis_shard_ports is not None:
            raise Exception("If --head is not passed in, --redis-shard-ports "
                            "is not allowed")
        if redis_address is None:
            raise Exception("If --head is not passed in, --redis-address must "
                            "be provided.")
        if num_redis_shards is not None:
            raise Exception("If --head is not passed in, --num-redis-shards "
                            "must not be provided.")
        if redis_max_clients is not None:
            raise Exception("If --head is not passed in, --redis-max-clients "
                            "must not be provided.")
        if no_ui:
            raise Exception("If --head is not passed in, the --no-ui flag is "
                            "not relevant.")
        redis_ip_address, redis_port = redis_address.split(":")

        # Wait for the Redis server to be started. And throw an exception if we
        # can't connect to it.
        services.wait_for_redis_to_start(redis_ip_address, int(redis_port))

        # Create a Redis client.
        redis_client = services.create_redis_client(redis_address)

        # Check that the version information on this node matches the version
        # information that the cluster was started with.
        services.check_version_info(redis_client)

        # Get the node IP address if one is not provided.
        if node_ip_address is None:
            node_ip_address = services.get_node_ip_address(redis_address)
        print("Using IP address {} for this node.".format(node_ip_address))
        # Check that there aren't already Redis clients with the same IP
        # address connected with this Redis instance. This raises an exception
        # if the Redis server already has clients on this node.
        check_no_existing_redis_clients(node_ip_address, redis_client)
        address_info = services.start_ray_node(
            node_ip_address=node_ip_address,
            redis_address=redis_address,
            object_manager_ports=[object_manager_port],
            num_workers=num_workers,
            object_store_memory=object_store_memory,
            cleanup=False,
            redirect_worker_output=True,
            redirect_output=True,
            resources=resources,
            plasma_directory=plasma_directory,
            huge_pages=huge_pages,
            use_raylet=use_raylet)
        print(address_info)
        print("\nStarted Ray on this node. If you wish to terminate the "
              "processes that have been started, run\n\n"
              "    ray stop")

    if block:
        # Keep the calling process alive indefinitely.
        import time
        while True:
            time.sleep(30)
Exemple #5
0
def start(node_ip_address, redis_address, redis_port, num_redis_shards,
          redis_max_clients, object_manager_port, num_workers, num_cpus,
          num_gpus, resources, head, no_ui, block, plasma_directory,
          huge_pages, autoscaling_config):
    """Start the Ray processes for this machine as a head or non-head node.

    With ``head`` set, a head node is started through
    ``services.start_ray_head``. Otherwise the function waits for the Redis
    server at ``redis_address``, checks version information and existing
    clients, and starts a non-head node through ``services.start_ray_node``.
    In both cases the resulting address info and a usage hint are printed.

    Args:
        node_ip_address: Hostname or IP of this node. Resolved with
            ``services.address_to_ip`` when given; otherwise looked up with
            ``services.get_node_ip_address``.
        redis_address: Address of an existing Redis server. It is split on
            ":" into host and port. Must be None with ``head`` and is
            required without it.
        redis_port: Head-node only; rejected without ``head``.
        num_redis_shards: Head-node only; rejected without ``head``.
        redis_max_clients: Head-node only; rejected without ``head``.
        object_manager_port: Optional object manager port forwarded to the
            started node.
        num_workers: Forwarded to the started node.
        num_cpus: Stored as ``resources["CPU"]`` when not None.
        num_gpus: Stored as ``resources["GPU"]`` when not None.
        resources: JSON text, parsed with ``json.loads``. Must not contain
            the keys "CPU" or "GPU".
        head: Truthy to start a head node.
        no_ui: Head-node only; ``include_webui`` is set to ``not no_ui``.
            Rejected (when truthy) without ``head``.
        block: Truthy to keep this call from returning (sleeps forever).
        plasma_directory: Forwarded to the started node.
        huge_pages: Forwarded to the started node.
        autoscaling_config: Forwarded to ``start_ray_head``.

    Raises:
        Exception: If ``resources`` cannot be parsed as JSON, or if an
            argument is not allowed for the selected mode.
        AssertionError: If ``resources`` contains "CPU" or "GPU".
    """
    # NOTE(review): both start calls below pass redirect_output=True. The
    # original note here says stdout and stderr are redirected to /dev/null
    # because attempts to print may cause exceptions if a process is started
    # inside of an SSH connection and the SSH connection dies; where the
    # redirection happens is not visible in this function.
    # TODO(rkn): This is a temporary fix. We should actually redirect stdout
    # and stderr to Redis in some way.

    # Convert hostnames to numerical IP address.
    if node_ip_address is not None:
        node_ip_address = services.address_to_ip(node_ip_address)
    if redis_address is not None:
        redis_address = services.address_to_ip(redis_address)

    try:
        resources = json.loads(resources)
    except Exception:
        raise Exception("Unable to parse the --resources argument using "
                        "json.loads. Try using a format like\n\n"
                        "    --resources='{\"CustomResource1\": 3, "
                        "\"CustomResource2\": 2}'")

    # Raised explicitly (rather than via `assert`) so the check is not
    # stripped when Python runs with -O; the exception type is unchanged.
    if "CPU" in resources:
        raise AssertionError("Use the --num-cpus argument.")
    if "GPU" in resources:
        raise AssertionError("Use the --num-gpus argument.")
    if num_cpus is not None:
        resources["CPU"] = num_cpus
    if num_gpus is not None:
        resources["GPU"] = num_gpus

    if head:
        # Start Ray on the head node.
        if redis_address is not None:
            raise Exception("If --head is passed in, a Redis server will be "
                            "started, so a Redis address should not be "
                            "provided.")

        # Get the node IP address if one is not provided.
        if node_ip_address is None:
            node_ip_address = services.get_node_ip_address()
        print("Using IP address {} for this node.".format(node_ip_address))

        # Use the provided object manager port if there is one; otherwise
        # pass None instead of an empty dict.
        if object_manager_port is not None:
            address_info = {"object_manager_ports": [object_manager_port]}
        else:
            address_info = None

        address_info = services.start_ray_head(
            address_info=address_info,
            node_ip_address=node_ip_address,
            redis_port=redis_port,
            num_workers=num_workers,
            cleanup=False,
            redirect_output=True,
            resources=resources,
            num_redis_shards=num_redis_shards,
            redis_max_clients=redis_max_clients,
            include_webui=(not no_ui),
            plasma_directory=plasma_directory,
            huge_pages=huge_pages,
            autoscaling_config=autoscaling_config)
        print(address_info)
        print("\nStarted Ray on this node. You can add additional nodes to "
              "the cluster by calling\n\n"
              "    ray start --redis-address {}\n\n"
              "from the node you wish to add. You can connect a driver to the "
              "cluster from Python by running\n\n"
              "    import ray\n"
              "    ray.init(redis_address=\"{}\")\n\n"
              "If you have trouble connecting from a different machine, check "
              "that your firewall is configured properly. If you wish to "
              "terminate the processes that have been started, run\n\n"
              "    ray stop".format(address_info["redis_address"],
                                    address_info["redis_address"]))
    else:
        # Start Ray on a non-head node.
        if redis_port is not None:
            raise Exception("If --head is not passed in, --redis-port is not "
                            "allowed")
        if redis_address is None:
            raise Exception("If --head is not passed in, --redis-address must "
                            "be provided.")
        if num_redis_shards is not None:
            raise Exception("If --head is not passed in, --num-redis-shards "
                            "must not be provided.")
        if redis_max_clients is not None:
            raise Exception("If --head is not passed in, --redis-max-clients "
                            "must not be provided.")
        if no_ui:
            raise Exception("If --head is not passed in, the --no-ui flag is "
                            "not relevant.")
        redis_ip_address, redis_port = redis_address.split(":")

        # Wait for the Redis server to be started. And throw an exception if we
        # can't connect to it.
        services.wait_for_redis_to_start(redis_ip_address, int(redis_port))

        # Create a Redis client.
        redis_client = services.create_redis_client(redis_address)

        # Check that the version information on this node matches the version
        # information that the cluster was started with.
        services.check_version_info(redis_client)

        # Get the node IP address if one is not provided.
        if node_ip_address is None:
            node_ip_address = services.get_node_ip_address(redis_address)
        print("Using IP address {} for this node.".format(node_ip_address))
        # Check that there aren't already Redis clients with the same IP
        # address connected with this Redis instance. This raises an exception
        # if the Redis server already has clients on this node.
        check_no_existing_redis_clients(node_ip_address, redis_client)
        address_info = services.start_ray_node(
            node_ip_address=node_ip_address,
            redis_address=redis_address,
            object_manager_ports=[object_manager_port],
            num_workers=num_workers,
            cleanup=False,
            redirect_output=True,
            resources=resources,
            plasma_directory=plasma_directory,
            huge_pages=huge_pages)
        print(address_info)
        print("\nStarted Ray on this node. If you wish to terminate the "
              "processes that have been started, run\n\n"
              "    ray stop")

    if block:
        # Keep the calling process alive indefinitely.
        import time
        while True:
            time.sleep(30)
Exemple #6
0
                "If --head is not passed in, --num-redis-shards must "
                "not be provided.")
        redis_ip_address, redis_port = args.redis_address.split(":")
        # Wait for the Redis server to be started. And throw an exception if we
        # can't connect to it.
        services.wait_for_redis_to_start(redis_ip_address, int(redis_port))
        # Get the node IP address if one is not provided.
        if args.node_ip_address is None:
            node_ip_address = services.get_node_ip_address(args.redis_address)
        else:
            node_ip_address = args.node_ip_address
        print("Using IP address {} for this node.".format(node_ip_address))
        # Check that there aren't already Redis clients with the same IP address
        # connected with this Redis instance. This raises an exception if the Redis
        # server already has clients on this node.
        check_no_existing_redis_clients(node_ip_address, args.redis_address)
        address_info = services.start_ray_node(
            node_ip_address=node_ip_address,
            redis_address=args.redis_address,
            object_manager_ports=[args.object_manager_port],
            num_workers=args.num_workers,
            cleanup=False,
            redirect_output=True,
            num_cpus=args.num_cpus,
            num_gpus=args.num_gpus)
        print(address_info)
        print(
            "\nStarted Ray on this node. If you wish to terminate the processes "
            "that have been started, run\n\n"
            "    ./scripts/stop_ray.sh")
Exemple #7
0
def start(node_ip_address, redis_address, redis_port, num_redis_shards,
          object_manager_port, num_workers, num_cpus, num_gpus, head, block):
    """Start the Ray processes for this machine as a head or non-head node.

    With ``head`` set, a head node is started through
    ``services.start_ray_head``. Otherwise the function waits for the Redis
    server at ``redis_address``, checks for existing clients from this node,
    and starts a non-head node through ``services.start_ray_node``. In both
    cases the resulting address info and a usage hint are printed.

    Args:
        node_ip_address: IP address of this node, or None to have it looked
            up with ``services.get_node_ip_address``.
        redis_address: Address of an existing Redis server. It is split on
            ":" into host and port. Must be None with ``head`` and is
            required without it.
        redis_port: Head-node only; rejected without ``head``.
        num_redis_shards: Head-node only; rejected without ``head``.
        object_manager_port: Optional object manager port forwarded to the
            started node.
        num_workers: Forwarded to the started node.
        num_cpus: Forwarded to the started node.
        num_gpus: Forwarded to the started node.
        head: Truthy to start a head node.
        block: Truthy to keep this call from returning (sleeps forever).

    Raises:
        Exception: If an argument is not allowed for the selected mode.
    """
    # NOTE(review): both start calls below pass redirect_output=True. The
    # original note here says stdout and stderr are redirected to /dev/null
    # because attempts to print may cause exceptions if a process is started
    # inside of an SSH connection and the SSH connection dies; where the
    # redirection happens is not visible in this function.
    # TODO(rkn): This is a temporary fix. We should actually redirect stdout
    # and stderr to Redis in some way.

    if head:
        # Head node: a Redis server is started here, so none may be given.
        if redis_address is not None:
            raise Exception("If --head is passed in, a Redis server will be "
                            "started, so a Redis address should not be "
                            "provided.")

        # Fall back to the detected address when none was supplied.
        if node_ip_address is None:
            node_ip_address = services.get_node_ip_address()
        print("Using IP address {} for this node.".format(node_ip_address))

        # Hand over the object manager port when there is one; otherwise
        # pass None rather than an empty dict.
        if object_manager_port is None:
            requested_addresses = None
        else:
            requested_addresses = {
                "object_manager_ports": [object_manager_port]
            }

        address_info = services.start_ray_head(
            address_info=requested_addresses,
            node_ip_address=node_ip_address,
            redis_port=redis_port,
            num_workers=num_workers,
            cleanup=False,
            redirect_output=True,
            num_cpus=num_cpus,
            num_gpus=num_gpus,
            num_redis_shards=num_redis_shards)
        print(address_info)

        head_hint = (
            "\nStarted Ray on this node. You can add additional nodes to the "
            "cluster by calling\n\n"
            "    ray start --redis-address {}\n\n"
            "from the node you wish to add. You can connect a driver to the "
            "cluster from Python by running\n\n"
            "    import ray\n"
            "    ray.init(redis_address=\"{}\")\n\n"
            "If you have trouble connecting from a different machine, check "
            "that your firewall is configured properly. If you wish to "
            "terminate the processes that have been started, run\n\n"
            "    ray stop")
        print(head_hint.format(address_info["redis_address"],
                               address_info["redis_address"]))
    else:
        # Non-head node: reject the head-only options, in this order.
        if redis_port is not None:
            raise Exception("If --head is not passed in, --redis-port is not "
                            "allowed")
        if redis_address is None:
            raise Exception("If --head is not passed in, --redis-address must "
                            "be provided.")
        if num_redis_shards is not None:
            raise Exception("If --head is not passed in, --num-redis-shards "
                            "must not be provided.")

        # Wait for the Redis server to be started. An exception is thrown if
        # we can't connect to it.
        redis_host, redis_port_text = redis_address.split(":")
        services.wait_for_redis_to_start(redis_host, int(redis_port_text))

        # Fall back to the detected address when none was supplied.
        if node_ip_address is None:
            node_ip_address = services.get_node_ip_address(redis_address)
        print("Using IP address {} for this node.".format(node_ip_address))

        # Check that there aren't already Redis clients with the same IP
        # address connected with this Redis instance. This raises an
        # exception if the Redis server already has clients on this node.
        check_no_existing_redis_clients(node_ip_address, redis_address)

        address_info = services.start_ray_node(
            node_ip_address=node_ip_address,
            redis_address=redis_address,
            object_manager_ports=[object_manager_port],
            num_workers=num_workers,
            cleanup=False,
            redirect_output=True,
            num_cpus=num_cpus,
            num_gpus=num_gpus)
        print(address_info)

        worker_hint = (
            "\nStarted Ray on this node. If you wish to terminate the "
            "processes that have been started, run\n\n"
            "    ray stop")
        print(worker_hint)

    if block:
        # Keep the calling process alive indefinitely.
        import time
        while True:
            time.sleep(30)
Exemple #8
0
            raise Exception(
                "If --head is not passed in, --redis-address must be provided."
            )
        redis_ip_address, redis_port = args.redis_address.split(":")
        # Wait for the Redis server to be started. And throw an exception if we
        # can't connect to it.
        services.wait_for_redis_to_start(redis_ip_address, int(redis_port))
        # Get the node IP address if one is not provided.
        if args.node_ip_address is None:
            node_ip_address = services.get_node_ip_address(args.redis_address)
        else:
            node_ip_addess = args.node_ip_address
        print("Using IP address {} for this node.".format(node_ip_address))
        # Check that there aren't already Redis clients with the same IP address
        # connected with this Redis instance. This raises an exception if the Redis
        # server already has clients on this node.
        check_no_existing_redis_clients(node_ip_address, args.redis_address)
        address_info = services.start_ray_node(
            node_ip_address=node_ip_address,
            redis_address=args.redis_address,
            num_workers=args.num_workers,
            cleanup=False,
            redirect_output=True)
        print(address_info)
        print(
            "\nStarted {} workers on this node. A different number of workers "
            "can be set with the --num-workers flag (but you have to first "
            "terminate the existing cluster). If you wish to terminate the "
            "processes that have been started, run\n\n"
            "    ./scripts/stop_ray.sh".format(args.num_workers))
Exemple #9
0
def start(node_ip_address, redis_address, redis_port, num_redis_shards,
          redis_max_clients, redis_shard_ports, object_manager_port,
          object_store_memory, num_workers, num_cpus, num_gpus, resources,
          head, no_ui, block, plasma_directory, huge_pages,
          autoscaling_config):
    """Start the Ray processes on this machine as a head or non-head node.

    With ``head`` set, ``services.start_ray_head`` is called and instructions
    for attaching further nodes and drivers are printed. Otherwise this
    waits for the Redis server at ``redis_address``, checks the version
    information stored there, checks that no Redis clients from this node are
    already connected, and calls ``services.start_ray_node``.

    Args:
        node_ip_address: Hostname or IP address of this node. If None, it is
            looked up with ``services.get_node_ip_address``.
        redis_address: "host:port" of an existing Redis server
            (--redis-address). Required without ``head``; rejected with it.
        redis_port: Port forwarded to ``services.start_ray_head``
            (--redis-port). Only allowed with ``head``.
        num_redis_shards: Number of Redis shards (--num-redis-shards). Only
            allowed with ``head``. Inferred from ``redis_shard_ports`` when
            it is None and ports are given.
        redis_max_clients: Forwarded to ``services.start_ray_head``
            (--redis-max-clients). Only allowed with ``head``.
        redis_shard_ports: Comma-separated ports such as "6380,6381,6382"
            (--redis-shard-ports). Only allowed with ``head``.
        object_manager_port: Object manager port forwarded to the services
            layer, or None.
        object_store_memory: Forwarded unchanged to the services layer.
        num_workers: Forwarded unchanged to the services layer.
        num_cpus: If not None, stored under the "CPU" key of the resources.
        num_gpus: If not None, stored under the "GPU" key of the resources.
        resources: JSON text encoding an object of custom resources
            (--resources). It must not contain "CPU" or "GPU".
        head: True to start a head node (--head).
        no_ui: True to start the head node without the web UI (--no-ui).
            Rejected without ``head``.
        block: If True, never return; sleep in a loop after starting.
        plasma_directory: Forwarded unchanged to the services layer.
        huge_pages: Forwarded unchanged to the services layer.
        autoscaling_config: Forwarded unchanged to
            ``services.start_ray_head``.

    Raises:
        Exception: If ``resources`` is not JSON text encoding an object, or
            if the combination of options is invalid for the chosen mode.
        AssertionError: If ``resources`` contains "CPU" or "GPU".
    """
    # Convert hostnames to numerical IP address.
    if node_ip_address is not None:
        node_ip_address = services.address_to_ip(node_ip_address)
    if redis_address is not None:
        redis_address = services.address_to_ip(redis_address)

    resources_format_hint = ("Try using a format like\n\n"
                             "    --resources='{\"CustomResource1\": 3, "
                             "\"CustomResource2\": 2}'")
    try:
        resources = json.loads(resources)
    except Exception:
        raise Exception("Unable to parse the --resources argument using "
                        "json.loads. " + resources_format_hint)
    # json.loads also accepts numbers, lists, strings and null; the code
    # below needs a mapping, so reject anything else with a usable message
    # instead of failing later with an unrelated TypeError.
    if not isinstance(resources, dict):
        raise Exception("The --resources argument must be a JSON object "
                        "mapping resource names to quantities. " +
                        resources_format_hint)

    assert "CPU" not in resources, "Use the --num-cpus argument."
    assert "GPU" not in resources, "Use the --num-gpus argument."
    if num_cpus is not None:
        resources["CPU"] = num_cpus
    if num_gpus is not None:
        resources["GPU"] = num_gpus

    if head:
        # Start Ray on the head node.
        if redis_shard_ports is not None:
            redis_shard_ports = redis_shard_ports.split(",")
            # Infer the number of Redis shards from the ports if the number is
            # not provided.
            if num_redis_shards is None:
                num_redis_shards = len(redis_shard_ports)
            # Check that the arguments match.
            if len(redis_shard_ports) != num_redis_shards:
                raise Exception("If --redis-shard-ports is provided, it must "
                                "have the form '6380,6381,6382', and the "
                                "number of ports provided must equal "
                                "--num-redis-shards (which is 1 if not "
                                "provided)")

        if redis_address is not None:
            raise Exception("If --head is passed in, a Redis server will be "
                            "started, so a Redis address should not be "
                            "provided.")

        # Get the node IP address if one is not provided.
        if node_ip_address is None:
            node_ip_address = services.get_node_ip_address()
        print("Using IP address {} for this node.".format(node_ip_address))

        # Use the provided object manager port if there is one; otherwise
        # pass None so the services layer picks its own defaults.
        address_info = None
        if object_manager_port is not None:
            address_info = {"object_manager_ports": [object_manager_port]}

        address_info = services.start_ray_head(
            address_info=address_info,
            node_ip_address=node_ip_address,
            redis_port=redis_port,
            redis_shard_ports=redis_shard_ports,
            object_store_memory=object_store_memory,
            num_workers=num_workers,
            cleanup=False,
            redirect_output=True,
            resources=resources,
            num_redis_shards=num_redis_shards,
            redis_max_clients=redis_max_clients,
            include_webui=(not no_ui),
            plasma_directory=plasma_directory,
            huge_pages=huge_pages,
            autoscaling_config=autoscaling_config)
        print(address_info)
        print("\nStarted Ray on this node. You can add additional nodes to "
              "the cluster by calling\n\n"
              "    ray start --redis-address {}\n\n"
              "from the node you wish to add. You can connect a driver to the "
              "cluster from Python by running\n\n"
              "    import ray\n"
              "    ray.init(redis_address=\"{}\")\n\n"
              "If you have trouble connecting from a different machine, check "
              "that your firewall is configured properly. If you wish to "
              "terminate the processes that have been started, run\n\n"
              "    ray stop".format(address_info["redis_address"],
                                    address_info["redis_address"]))
    else:
        # Start Ray on a non-head node. Options that only make sense when
        # starting a head node are rejected up front.
        if redis_port is not None:
            raise Exception("If --head is not passed in, --redis-port is not "
                            "allowed")
        if redis_shard_ports is not None:
            raise Exception("If --head is not passed in, --redis-shard-ports "
                            "is not allowed")
        if redis_address is None:
            raise Exception("If --head is not passed in, --redis-address must "
                            "be provided.")
        if num_redis_shards is not None:
            raise Exception("If --head is not passed in, --num-redis-shards "
                            "must not be provided.")
        if redis_max_clients is not None:
            raise Exception("If --head is not passed in, --redis-max-clients "
                            "must not be provided.")
        if no_ui:
            raise Exception("If --head is not passed in, the --no-ui flag is "
                            "not relevant.")
        redis_ip_address, redis_port = redis_address.split(":")

        # Wait for the Redis server to be started. And throw an exception if we
        # can't connect to it.
        services.wait_for_redis_to_start(redis_ip_address, int(redis_port))

        # Create a Redis client.
        redis_client = services.create_redis_client(redis_address)

        # Check that the version information on this node matches the version
        # information that the cluster was started with.
        services.check_version_info(redis_client)

        # Get the node IP address if one is not provided.
        if node_ip_address is None:
            node_ip_address = services.get_node_ip_address(redis_address)
        print("Using IP address {} for this node.".format(node_ip_address))
        # Check that there aren't already Redis clients with the same IP
        # address connected with this Redis instance. This raises an exception
        # if the Redis server already has clients on this node.
        check_no_existing_redis_clients(node_ip_address, redis_client)
        address_info = services.start_ray_node(
            node_ip_address=node_ip_address,
            redis_address=redis_address,
            object_manager_ports=[object_manager_port],
            num_workers=num_workers,
            object_store_memory=object_store_memory,
            cleanup=False,
            redirect_output=True,
            resources=resources,
            plasma_directory=plasma_directory,
            huge_pages=huge_pages)
        print(address_info)
        print("\nStarted Ray on this node. If you wish to terminate the "
              "processes that have been started, run\n\n"
              "    ray stop")

    if block:
        # Keep this process alive indefinitely.
        import time
        while True:
            time.sleep(30)
Exemple #10
0
                "If --head is not passed in, --redis-address must be provided."
            )
        redis_ip_address, redis_port = args.redis_address.split(":")
        # Wait for the Redis server to be started. And throw an exception if we
        # can't connect to it.
        services.wait_for_redis_to_start(redis_ip_address, int(redis_port))
        # Get the node IP address if one is not provided.
        if args.node_ip_address is None:
            node_ip_address = services.get_node_ip_address(args.redis_address)
        else:
            node_ip_address = args.node_ip_address
        print("Using IP address {} for this node.".format(node_ip_address))
        # Check that there aren't already Redis clients with the same IP address
        # connected with this Redis instance. This raises an exception if the Redis
        # server already has clients on this node.
        check_no_existing_redis_clients(node_ip_address, args.redis_address)
        address_info = services.start_ray_node(
            node_ip_address=node_ip_address,
            redis_address=args.redis_address,
            object_manager_port=args.object_manager_port,
            num_workers=args.num_workers,
            cleanup=False,
            redirect_output=False)
        print(address_info)
        print(
            "\nStarted {} workers on this node. A different number of workers "
            "can be set with the --num-workers flag (but you have to first "
            "terminate the existing cluster). If you wish to terminate the "
            "processes that have been started, run\n\n"
            "    ./scripts/stop_ray.sh".format(args.num_workers))