def add_node(self, **override_kwargs):
    """Start one more node in the local Ray cluster and return it.

    The first node added becomes the head node. Every later node is
    started as a worker that is pointed at the head node's Redis address.

    Unless overridden, a node is started with:
        cleanup=True,
        num_cpus=1,
        object_store_memory=100 * (2**20)  # 100 MB

    Args:
        override_kwargs: Keyword arguments used in `start_ray_head` and
            `start_ray_node` (passed through `RayParams`). They take
            precedence over the defaults above.

    Returns:
        Node object of the added Ray node.
    """
    defaults = {
        "num_cpus": 1,
        "object_store_memory": 100 * (2**20),  # 100 MB
    }
    merged_kwargs = dict(defaults, **override_kwargs)
    ray_params = RayParams(
        node_ip_address=services.get_node_ip_address(), **merged_kwargs)

    starting_head = self.head_node is None
    if starting_head:
        ray_params.update(include_webui=False)
        address_info = services.start_ray_head(ray_params, cleanup=True)
        self.redis_address = address_info["redis_address"]
    else:
        ray_params.update(redis_address=self.redis_address)
        address_info = services.start_ray_node(ray_params, cleanup=True)

    # TODO(rliaw): Find a more stable way than modifying global state.
    # Snapshot the global process table for the new Node, then reset every
    # entry of the global table to an empty list.
    started_processes = services.all_processes.copy()
    for component in services.all_processes:
        services.all_processes[component] = []
    node = Node(address_info, started_processes)

    if starting_head:
        self.head_node = node
    else:
        self.worker_nodes[node] = address_info

    logger.info("Starting Node with raylet socket {}".format(
        address_info["raylet_socket_name"]))
    return node
def ray_start_workers_separate():
    """Start a local Ray instance with one CPU, yield, then shut Ray down.

    Written as a yield-style fixture: the statements before the ``yield``
    are the setup and the statements after it are the teardown.

    Yields:
        None.
    """
    # Start the Ray processes.
    init_kwargs = dict(num_cpus=1, start_ray_local=True, redirect_output=True)
    ray.worker._init(RayParams(**init_kwargs))

    yield

    # Teardown: runs once the consumer of the fixture is done.
    ray.shutdown()
def _test_component_failed(component_type):
    """Kill a component on all worker nodes and check workload succeeds.

    Args:
        component_type: Key into ``ray.services.all_processes`` selecting
            the list of processes to kill. The first process in that list
            is left alone; every other one is killed in turn.
    """
    # Start 4 local schedulers with 8 CPUs each.
    scheduler_count = 4
    cpus_per_scheduler = 8
    internal_config = json.dumps({
        "initial_reconstruction_timeout_milliseconds": 1000,
        "num_heartbeats_timeout": 10,
    })
    ray.worker._init(
        RayParams(
            num_local_schedulers=scheduler_count,
            start_ray_local=True,
            num_cpus=[cpus_per_scheduler] * scheduler_count,
            redirect_output=True,
            _internal_config=internal_config))

    # Remote functions used to build a workload with many dependencies.
    @ray.remote
    def f(x):
        return x

    @ray.remote
    def g(*xs):
        return 1

    # Kill the component on all nodes except the head node as the tasks
    # execute. Do this in a loop while submitting tasks between each
    # component failure.
    time.sleep(0.1)
    worker_node_processes = ray.services.all_processes[component_type][1:]
    for victim in worker_node_processes:
        # Submit a round of tasks with many dependencies: first a long
        # chain in which every task consumes the previous result...
        chained = 1
        for _ in range(1000):
            chained = f.remote(chained)

        # ...then tasks that each depend on everything submitted so far.
        fanned = [g.remote(1)]
        for _ in range(100):
            fanned.append(g.remote(*fanned))
            fanned.append(g.remote(1))

        # Kill a component on one of the nodes: ask politely first, then
        # force it, and confirm that the process has really exited.
        victim.terminate()
        time.sleep(1)
        victim.kill()
        victim.wait()
        assert victim.poll() is not None

        # Make sure that we can still get the objects after the
        # executing tasks died.
        ray.get(chained)
        ray.get(fanned)
def ray_start_combination(request):
    """Start a local multi-scheduler Ray instance, yield, then shut it down.

    Args:
        request: Object whose ``param`` attribute is indexable;
            ``param[0]`` is the number of local schedulers and
            ``param[1]`` the number of workers per scheduler.

    Yields:
        Tuple ``(num_local_schedulers, num_workers_per_scheduler)``. Only
        the scheduler count is used to configure Ray here; the worker
        count is passed through to the consumer unchanged.
    """
    params = request.param
    num_local_schedulers = params[0]
    num_workers_per_scheduler = params[1]

    # Start the Ray processes.
    ray.worker._init(
        RayParams(
            start_ray_local=True,
            num_local_schedulers=num_local_schedulers,
            num_cpus=10))

    yield num_local_schedulers, num_workers_per_scheduler

    # Teardown: runs once the consumer of the fixture is done.
    ray.shutdown()
def ray_start_two_nodes():
    """Start a local Ray instance with two local schedulers and no CPUs.

    The internal config sets ``num_heartbeats_timeout`` to 40. Ray is
    shut down again after the ``yield``.

    Yields:
        None.
    """
    # Start the Ray processes.
    internal_config = json.dumps({"num_heartbeats_timeout": 40})
    ray.worker._init(
        RayParams(
            start_ray_local=True,
            num_local_schedulers=2,
            num_cpus=0,
            _internal_config=internal_config))

    yield

    # Teardown: runs once the consumer of the fixture is done.
    ray.shutdown()
def ray_start_workers_separate_multinode(request):
    """Start a local multi-scheduler Ray instance, yield, then shut it down.

    Args:
        request: Object whose ``param`` attribute is indexable;
            ``param[0]`` is the number of local schedulers and
            ``param[1]`` the ``num_cpus`` value given to each of them.

    Yields:
        Tuple ``(num_local_schedulers, num_initial_workers)``.
    """
    params = request.param
    num_local_schedulers = params[0]
    num_initial_workers = params[1]

    # Start the Ray processes, with one num_cpus entry per local scheduler.
    cpus_per_scheduler = [num_initial_workers] * num_local_schedulers
    ray.worker._init(
        RayParams(
            num_local_schedulers=num_local_schedulers,
            start_ray_local=True,
            num_cpus=cpus_per_scheduler,
            redirect_output=True))

    yield num_local_schedulers, num_initial_workers

    # Teardown: runs once the consumer of the fixture is done.
    ray.shutdown()
def add_node(self, **override_kwargs):
    """Start one more node in the local Ray cluster and return it.

    The first node added becomes the head node. Every later node is
    started as a worker that is pointed at the head node's Redis address.

    Unless overridden, a node is started with:
        cleanup=True,
        resources={"CPU": 1},
        object_store_memory=100 * (2**20)  # 100 MB

    Args:
        override_kwargs: Keyword arguments used in `start_ray_head` and
            `start_ray_node` (passed through `RayParams`). They take
            precedence over the defaults above.

    Returns:
        Node object of the added Ray node.
    """
    defaults = {
        "resources": {"CPU": 1},
        "object_store_memory": 100 * (2**20),  # 100 MB
    }
    merged_kwargs = dict(defaults, **override_kwargs)
    ray_params = RayParams(
        node_ip_address=services.get_node_ip_address(), **merged_kwargs)

    starting_head = self.head_node is None
    if starting_head:
        ray_params.update(include_webui=False)
        address_info = services.start_ray_head(ray_params, cleanup=True)
        self.redis_address = address_info["redis_address"]
    else:
        ray_params.update(redis_address=self.redis_address)
        address_info = services.start_ray_node(ray_params, cleanup=True)

    # TODO(rliaw): Find a more stable way than modifying global state.
    # Snapshot the global process table for the new Node, then reset every
    # entry of the global table to an empty list.
    started_processes = services.all_processes.copy()
    for component in services.all_processes:
        services.all_processes[component] = []
    node = Node(address_info, started_processes)

    if starting_head:
        self.head_node = node
    else:
        self.worker_nodes[node] = address_info

    logger.info("Starting Node with raylet socket {}".format(
        address_info["raylet_socket_names"]))
    return node
def start(node_ip_address, redis_address, redis_port, num_redis_shards,
          redis_max_clients, redis_password, redis_shard_ports,
          object_manager_port, node_manager_port, object_store_memory,
          redis_max_memory, num_workers, num_cpus, num_gpus, resources, head,
          no_ui, block, plasma_directory, huge_pages, autoscaling_config,
          no_redirect_worker_output, no_redirect_output,
          plasma_store_socket_name, raylet_socket_name, temp_dir,
          internal_config):
    """Start the Ray processes on this machine as a head or non-head node.

    With ``head`` set, a new cluster is started through
    ``services.start_ray_head`` and instructions for joining it are logged.
    Otherwise the node waits for the Redis server at ``redis_address``,
    checks the version information stored there and joins through
    ``services.start_ray_node``. Error messages refer to the parameters by
    their ``--flag`` spelling.

    Args:
        node_ip_address: Address of this node, or None to have it
            detected. A given value is passed through
            ``services.address_to_ip``.
        redis_address: ``ip:port`` of an existing Redis server. Required
            without ``head``; must be None with ``head``.
        redis_port: Redis port for the head node; must be None without
            ``head``.
        num_redis_shards: Number of Redis shards for the head node; when
            None it is inferred from ``redis_shard_ports`` if those are
            given. Must be None without ``head``.
        redis_max_clients: Head-node Redis setting; must be None without
            ``head``.
        redis_password: Password used for Redis; also echoed in the logged
            connection instructions.
        redis_shard_ports: Comma-separated string of shard ports for the
            head node; must be None without ``head``.
        object_manager_port, node_manager_port, object_store_memory,
        num_workers, num_cpus, num_gpus, plasma_directory, huge_pages,
        plasma_store_socket_name, raylet_socket_name, temp_dir: Forwarded
            unchanged to ``RayParams``.
        redis_max_memory, autoscaling_config: Forwarded to ``RayParams``
            on the head node only.
        resources: JSON string, parsed with ``json.loads``.
        head: Whether to start a head node.
        no_ui: Head node only; the web UI is included unless this is set.
            Rejected without ``head``.
        block: If true, never return: sleep in a loop after start-up.
        no_redirect_worker_output, no_redirect_output: Negated into the
            ``redirect_worker_output`` / ``redirect_output`` parameters.
        internal_config: Passed to ``RayParams`` as ``_internal_config``.

    Raises:
        Exception: If ``resources`` cannot be parsed or an argument is not
            allowed in the selected mode.
    """
    # Convert hostnames to numerical IP address.
    if node_ip_address is not None:
        node_ip_address = services.address_to_ip(node_ip_address)
    if redis_address is not None:
        redis_address = services.address_to_ip(redis_address)

    try:
        resources = json.loads(resources)
    except Exception:
        unparsable_resources = (
            "Unable to parse the --resources argument using "
            "json.loads. Try using a format like\n\n"
            " --resources='{\"CustomResource1\": 3, "
            "\"CustomReseource2\": 2}'")
        raise Exception(unparsable_resources)

    # Parameters that apply to head and non-head nodes alike.
    common_params = dict(
        node_ip_address=node_ip_address,
        object_manager_port=object_manager_port,
        node_manager_port=node_manager_port,
        num_workers=num_workers,
        object_store_memory=object_store_memory,
        redis_password=redis_password,
        redirect_worker_output=not no_redirect_worker_output,
        redirect_output=not no_redirect_output,
        num_cpus=num_cpus,
        num_gpus=num_gpus,
        resources=resources,
        plasma_directory=plasma_directory,
        huge_pages=huge_pages,
        plasma_store_socket_name=plasma_store_socket_name,
        raylet_socket_name=raylet_socket_name,
        temp_dir=temp_dir,
        _internal_config=internal_config)
    ray_params = RayParams(**common_params)

    if head:
        # Start Ray on the head node.
        if redis_shard_ports is not None:
            redis_shard_ports = redis_shard_ports.split(",")
            if num_redis_shards is None:
                # Infer the number of Redis shards from the ports if the
                # number is not provided.
                num_redis_shards = len(redis_shard_ports)
            elif len(redis_shard_ports) != num_redis_shards:
                # The explicit shard count and the port list disagree.
                raise Exception("If --redis-shard-ports is provided, it must "
                                "have the form '6380,6381,6382', and the "
                                "number of ports provided must equal "
                                "--num-redis-shards (which is 1 if not "
                                "provided)")

        if redis_address is not None:
            raise Exception("If --head is passed in, a Redis server will be "
                            "started, so a Redis address should not be "
                            "provided.")

        # Get the node IP address if one is not provided.
        ray_params.update_if_absent(
            node_ip_address=services.get_node_ip_address())
        logger.info("Using IP address {} for this node.".format(
            ray_params.node_ip_address))

        head_only_params = dict(
            redis_port=redis_port,
            redis_shard_ports=redis_shard_ports,
            redis_max_memory=redis_max_memory,
            num_redis_shards=num_redis_shards,
            redis_max_clients=redis_max_clients,
            include_webui=(not no_ui),
            autoscaling_config=autoscaling_config)
        ray_params.update_if_absent(**head_only_params)

        address_info = services.start_ray_head(ray_params, cleanup=False)
        logger.info(address_info)

        # The password parts of the instructions are only shown when a
        # password is actually set.
        started_address = address_info["redis_address"]
        if redis_password:
            cli_password_flag = " --redis-password "
            init_password_kwarg = "\", redis_password=\""
            shown_password = redis_password
        else:
            cli_password_flag = init_password_kwarg = shown_password = ""

        instructions = (
            "\nStarted Ray on this node. You can add additional nodes to "
            "the cluster by calling\n\n"
            " ray start --redis-address {}{}{}\n\n"
            "from the node you wish to add. You can connect a driver to the "
            "cluster from Python by running\n\n"
            " import ray\n"
            " ray.init(redis_address=\"{}{}{}\")\n\n"
            "If you have trouble connecting from a different machine, check "
            "that your firewall is configured properly. If you wish to "
            "terminate the processes that have been started, run\n\n"
            " ray stop")
        logger.info(
            instructions.format(started_address, cli_password_flag,
                                shown_password, started_address,
                                init_password_kwarg, shown_password))
    else:
        # Start Ray on a non-head node. Reject, in this order, every
        # argument combination that only makes sense for a head node.
        non_head_violations = (
            (redis_port is not None,
             "If --head is not passed in, --redis-port is not "
             "allowed"),
            (redis_shard_ports is not None,
             "If --head is not passed in, --redis-shard-ports "
             "is not allowed"),
            (redis_address is None,
             "If --head is not passed in, --redis-address must "
             "be provided."),
            (num_redis_shards is not None,
             "If --head is not passed in, --num-redis-shards "
             "must not be provided."),
            (redis_max_clients is not None,
             "If --head is not passed in, --redis-max-clients "
             "must not be provided."),
            (no_ui,
             "If --head is not passed in, the --no-ui flag is "
             "not relevant."),
        )
        for violated, message in non_head_violations:
            if violated:
                raise Exception(message)

        redis_host, redis_port_text = redis_address.split(":")

        # Wait for the Redis server to be started. And throw an exception if
        # we can't connect to it.
        services.wait_for_redis_to_start(
            redis_host, int(redis_port_text), password=redis_password)

        # Create a Redis client.
        redis_client = services.create_redis_client(
            redis_address, password=redis_password)

        # Check that the version information on this node matches the
        # version information that the cluster was started with.
        services.check_version_info(redis_client)

        # Get the node IP address if one is not provided.
        ray_params.update_if_absent(
            node_ip_address=services.get_node_ip_address(redis_address))
        logger.info("Using IP address {} for this node.".format(
            ray_params.node_ip_address))

        # Check that there aren't already Redis clients with the same IP
        # address connected with this Redis instance. This raises an
        # exception if the Redis server already has clients on this node.
        check_no_existing_redis_clients(ray_params.node_ip_address,
                                        redis_client)

        ray_params.redis_address = redis_address
        address_info = services.start_ray_node(ray_params, cleanup=False)
        logger.info(address_info)
        logger.info("\nStarted Ray on this node. If you wish to terminate the "
                    "processes that have been started, run\n\n"
                    " ray stop")

    if block:
        # Keep this call from returning by sleeping in an endless loop.
        import time
        while True:
            time.sleep(30)
# Worker start-up: parse the command line, attach to the node that is already
# running and connect this process to it in worker mode. `parser` is defined
# outside this excerpt.
args = parser.parse_args()

# Connection details handed to `ray.worker.connect` below.
info = {
    "node_ip_address": args.node_ip_address,
    "redis_address": args.redis_address,
    "redis_password": args.redis_password,
    "store_socket_name": args.object_store_name,
    "raylet_socket_name": args.raylet_name,
}

ray.utils.setup_logger(args.logging_level, args.logging_format)

# The same addresses and socket names, this time as RayParams for the Node.
ray_params = RayParams(node_ip_address=args.node_ip_address,
                       redis_address=args.redis_address,
                       redis_password=args.redis_password,
                       plasma_store_socket_name=args.object_store_name,
                       raylet_socket_name=args.raylet_name,
                       temp_dir=args.temp_dir,
                       load_code_from_local=args.load_code_from_local)

# NOTE(review): connect_only=True presumably attaches to the existing node
# processes instead of starting new ones -- confirm against ray.node.Node.
node = ray.node.Node(ray_params,
                     head=False,
                     shutdown_at_exit=False,
                     connect_only=True)
# Publish the node object through the worker module's global.
ray.worker._global_node = node

# TODO(suquark): Use "node" as the input of "connect".
ray.worker.connect(info,
                   redis_password=args.redis_password,
                   mode=ray.WORKER_MODE,
                   load_code_from_local=args.load_code_from_local)
# Fold the flat list [key, value, key, value, ...] into `internal_config`
# (both names are defined outside this excerpt). Lists with fewer than two
# entries are ignored.
# NOTE(review): a list with an odd number (> 1) of entries raises IndexError
# on the last key -- confirm the producer always emits complete pairs.
if len(config_list) > 1:
    i = 0
    while i < len(config_list):
        internal_config[config_list[i]] = config_list[i + 1]
        i += 2

# Fall back to the node's own address when no raylet address was given.
raylet_ip_address = args.raylet_ip_address
if raylet_ip_address is None:
    raylet_ip_address = args.node_ip_address

ray_params = RayParams(
    node_ip_address=args.node_ip_address,
    raylet_ip_address=raylet_ip_address,
    node_manager_port=args.node_manager_port,
    redis_address=args.redis_address,
    redis_password=args.redis_password,
    plasma_store_socket_name=args.object_store_name,
    raylet_socket_name=args.raylet_name,
    temp_dir=args.temp_dir,
    load_code_from_local=args.load_code_from_local,
    _internal_config=json.dumps(internal_config),
)

# NOTE(review): connect_only=True presumably attaches to the existing node
# processes instead of starting new ones -- confirm against ray.node.Node.
node = ray.node.Node(ray_params,
                     head=False,
                     shutdown_at_exit=False,
                     spawn_reaper=False,
                     connect_only=True)
# Publish the node object through the worker module's global, then connect
# this process in worker mode with the parsed internal config.
ray.worker._global_node = node
ray.worker.connect(node,
                   mode=ray.WORKER_MODE,
                   internal_config=internal_config)
# Fall back to the node's own address when no raylet address was given
# (`raylet_ip_address`, `args` and `mode` are defined outside this excerpt).
if raylet_ip_address is None:
    raylet_ip_address = args.node_ip_address

# Make every entry of the ":"-separated code search path importable. An entry
# that is a file contributes its containing directory instead.
code_search_path = args.code_search_path
if code_search_path is not None:
    for p in code_search_path.split(":"):
        if os.path.isfile(p):
            p = os.path.dirname(p)
        sys.path.append(p)

ray_params = RayParams(
    node_ip_address=args.node_ip_address,
    raylet_ip_address=raylet_ip_address,
    node_manager_port=args.node_manager_port,
    redis_address=args.redis_address,
    redis_password=args.redis_password,
    plasma_store_socket_name=args.object_store_name,
    raylet_socket_name=args.raylet_name,
    temp_dir=args.temp_dir,
    load_code_from_local=args.load_code_from_local,
    metrics_agent_port=args.metrics_agent_port,
)

# NOTE(review): connect_only=True presumably attaches to the existing node
# processes instead of starting new ones -- confirm against ray.node.Node.
node = ray.node.Node(ray_params,
                     head=False,
                     shutdown_at_exit=False,
                     spawn_reaper=False,
                     connect_only=True)
# Publish the node object through the worker module's global, then connect
# this process in the previously selected mode.
ray.worker._global_node = node
ray.worker.connect(node, mode=mode)

# Setup log file.
def main(args):
    """Run this process as a Ray worker of the requested type.

    Sets up logging, maps ``args.worker_type`` to a worker mode, prepares
    external storage for spill/restore workers, connects to the node that
    is already running and then serves until the process is terminated.
    Standard output and standard error are redirected to the worker
    interceptor loggers while the worker is serving.

    Args:
        args: Parsed command-line arguments. The attributes read here are
            ``logging_level``, ``logging_format``, ``worker_type``,
            ``object_spilling_config``, ``raylet_ip_address``,
            ``node_ip_address``, ``code_search_path``,
            ``node_manager_port``, ``redis_address``, ``redis_password``,
            ``object_store_name``, ``raylet_name``, ``temp_dir``,
            ``load_code_from_local`` and ``metrics_agent_port``.

    Raises:
        ValueError: If ``args.worker_type`` is not ``"WORKER"``,
            ``"SPILL_WORKER"`` or ``"RESTORE_WORKER"``.
    """
    ray.ray_logging.setup_logger(args.logging_level, args.logging_format)

    worker_type = args.worker_type
    if worker_type == "WORKER":
        mode = ray.WORKER_MODE
    elif worker_type == "SPILL_WORKER":
        mode = ray.SPILL_WORKER_MODE
    elif worker_type == "RESTORE_WORKER":
        mode = ray.RESTORE_WORKER_MODE
    else:
        raise ValueError("Unknown worker type: " + args.worker_type)

    is_io_worker = (mode == ray.RESTORE_WORKER_MODE
                    or mode == ray.SPILL_WORKER_MODE)

    # NOTE(suquark): We must initialize the external storage before we
    # connect to raylet. Otherwise we may receive requests before the
    # external storage is initialized.
    if is_io_worker:
        from ray import external_storage
        encoded_config = args.object_spilling_config
        if encoded_config:
            # The config is base64-encoded JSON.
            object_spilling_config = json.loads(
                base64.b64decode(encoded_config))
        else:
            object_spilling_config = {}
        external_storage.setup_external_storage(object_spilling_config)

    # Fall back to the node's own address when no raylet address is given.
    raylet_ip_address = args.raylet_ip_address
    if raylet_ip_address is None:
        raylet_ip_address = args.node_ip_address

    # Make every entry of the ":"-separated code search path importable. An
    # entry that is a file contributes its containing directory instead.
    code_search_path = args.code_search_path
    if code_search_path is not None:
        for entry in code_search_path.split(":"):
            sys.path.append(
                os.path.dirname(entry) if os.path.isfile(entry) else entry)

    ray_params = RayParams(
        node_ip_address=args.node_ip_address,
        raylet_ip_address=raylet_ip_address,
        node_manager_port=args.node_manager_port,
        redis_address=args.redis_address,
        redis_password=args.redis_password,
        plasma_store_socket_name=args.object_store_name,
        raylet_socket_name=args.raylet_name,
        temp_dir=args.temp_dir,
        load_code_from_local=args.load_code_from_local,
        metrics_agent_port=args.metrics_agent_port,
    )

    node = ray.node.Node(
        ray_params,
        head=False,
        shutdown_at_exit=False,
        spawn_reaper=False,
        connect_only=True)
    ray.worker._global_node = node
    ray.worker.connect(node, mode=mode)

    # Redirect stdout and stderr to the default worker interceptor logger.
    # NOTE: We deprecated redirect_worker_output arg,
    # so we don't need to handle here.
    stdout_interceptor = StandardStreamInterceptor(
        setup_and_get_worker_interceptor_logger(is_for_stdout=True),
        intercept_stdout=True)
    stderr_interceptor = StandardStreamInterceptor(
        setup_and_get_worker_interceptor_logger(is_for_stdout=False),
        intercept_stdout=False)

    with redirect_stdout(stdout_interceptor), \
            redirect_stderr(stderr_interceptor):
        if mode == ray.WORKER_MODE:
            ray.worker.global_worker.main_loop()
        elif is_io_worker:
            # It is handled by another thread in the C++ core worker.
            # We just need to keep the worker alive.
            while True:
                time.sleep(100000)
        else:
            raise ValueError(f"Unexcepted worker mode: {mode}")
def start(node_ip_address, redis_address, redis_port, num_redis_shards,
          redis_max_clients, redis_password, redis_shard_ports,
          object_manager_port, node_manager_port, object_store_memory,
          redis_max_memory, num_workers, num_cpus, num_gpus, resources, head,
          no_ui, block, plasma_directory, huge_pages, autoscaling_config,
          no_redirect_worker_output, no_redirect_output,
          plasma_store_socket_name, raylet_socket_name, temp_dir,
          internal_config):
    """Start the Ray processes on this machine as a head or non-head node.

    With ``head`` set, a new cluster is started through
    ``services.start_ray_head`` and instructions for joining it are logged.
    Otherwise the node waits for the Redis server at ``redis_address``,
    checks the version information stored there and joins through
    ``services.start_ray_node``. Error messages refer to the parameters by
    their ``--flag`` spelling.

    Args:
        node_ip_address: Address of this node, or None to have it
            detected. A given value is passed through
            ``services.address_to_ip``.
        redis_address: ``ip:port`` of an existing Redis server. Required
            without ``head``; must be None with ``head``.
        redis_port: Redis port for the head node; must be None without
            ``head``.
        num_redis_shards: Number of Redis shards for the head node; when
            None it is inferred from ``redis_shard_ports`` if those are
            given. Must be None without ``head``.
        redis_max_clients: Head-node Redis setting; must be None without
            ``head``.
        redis_password: Password used for Redis; also echoed in the logged
            connection instructions.
        redis_shard_ports: Comma-separated string of shard ports for the
            head node; must be None without ``head``.
        object_manager_port, node_manager_port, object_store_memory,
        num_workers, num_cpus, num_gpus, plasma_directory, huge_pages,
        plasma_store_socket_name, raylet_socket_name, temp_dir: Forwarded
            unchanged to ``RayParams``.
        redis_max_memory, autoscaling_config: Forwarded to ``RayParams``
            on the head node only.
        resources: JSON string, parsed with ``json.loads``.
        head: Whether to start a head node.
        no_ui: Head node only; the web UI is included unless this is set.
            Rejected without ``head``.
        block: If true, never return: sleep in a loop after start-up.
        no_redirect_worker_output, no_redirect_output: Negated into the
            ``redirect_worker_output`` / ``redirect_output`` parameters.
        internal_config: Passed to ``RayParams`` as ``_internal_config``.

    Raises:
        Exception: If ``resources`` cannot be parsed or an argument is not
            allowed in the selected mode.
    """
    # Convert hostnames to numerical IP address.
    if node_ip_address is not None:
        node_ip_address = services.address_to_ip(node_ip_address)
    if redis_address is not None:
        redis_address = services.address_to_ip(redis_address)

    try:
        resources = json.loads(resources)
    except Exception:
        unparsable_resources = (
            "Unable to parse the --resources argument using "
            "json.loads. Try using a format like\n\n"
            " --resources='{\"CustomResource1\": 3, "
            "\"CustomReseource2\": 2}'")
        raise Exception(unparsable_resources)

    # Parameters that apply to head and non-head nodes alike.
    common_params = dict(
        node_ip_address=node_ip_address,
        object_manager_port=object_manager_port,
        node_manager_port=node_manager_port,
        num_workers=num_workers,
        object_store_memory=object_store_memory,
        redis_password=redis_password,
        redirect_worker_output=not no_redirect_worker_output,
        redirect_output=not no_redirect_output,
        num_cpus=num_cpus,
        num_gpus=num_gpus,
        resources=resources,
        plasma_directory=plasma_directory,
        huge_pages=huge_pages,
        plasma_store_socket_name=plasma_store_socket_name,
        raylet_socket_name=raylet_socket_name,
        temp_dir=temp_dir,
        _internal_config=internal_config)
    ray_params = RayParams(**common_params)

    if head:
        # Start Ray on the head node.
        if redis_shard_ports is not None:
            redis_shard_ports = redis_shard_ports.split(",")
            if num_redis_shards is None:
                # Infer the number of Redis shards from the ports if the
                # number is not provided.
                num_redis_shards = len(redis_shard_ports)
            elif len(redis_shard_ports) != num_redis_shards:
                # The explicit shard count and the port list disagree.
                raise Exception("If --redis-shard-ports is provided, it must "
                                "have the form '6380,6381,6382', and the "
                                "number of ports provided must equal "
                                "--num-redis-shards (which is 1 if not "
                                "provided)")

        if redis_address is not None:
            raise Exception("If --head is passed in, a Redis server will be "
                            "started, so a Redis address should not be "
                            "provided.")

        # Get the node IP address if one is not provided.
        ray_params.update_if_absent(
            node_ip_address=services.get_node_ip_address())
        logger.info("Using IP address {} for this node.".format(
            ray_params.node_ip_address))

        head_only_params = dict(
            redis_port=redis_port,
            redis_shard_ports=redis_shard_ports,
            redis_max_memory=redis_max_memory,
            num_redis_shards=num_redis_shards,
            redis_max_clients=redis_max_clients,
            include_webui=(not no_ui),
            autoscaling_config=autoscaling_config)
        ray_params.update_if_absent(**head_only_params)

        address_info = services.start_ray_head(ray_params, cleanup=False)
        logger.info(address_info)

        # The password parts of the instructions are only shown when a
        # password is actually set.
        started_address = address_info["redis_address"]
        if redis_password:
            cli_password_flag = " --redis-password "
            init_password_kwarg = "\", redis_password=\""
            shown_password = redis_password
        else:
            cli_password_flag = init_password_kwarg = shown_password = ""

        instructions = (
            "\nStarted Ray on this node. You can add additional nodes to "
            "the cluster by calling\n\n"
            " ray start --redis-address {}{}{}\n\n"
            "from the node you wish to add. You can connect a driver to the "
            "cluster from Python by running\n\n"
            " import ray\n"
            " ray.init(redis_address=\"{}{}{}\")\n\n"
            "If you have trouble connecting from a different machine, check "
            "that your firewall is configured properly. If you wish to "
            "terminate the processes that have been started, run\n\n"
            " ray stop")
        logger.info(
            instructions.format(started_address, cli_password_flag,
                                shown_password, started_address,
                                init_password_kwarg, shown_password))
    else:
        # Start Ray on a non-head node. Reject, in this order, every
        # argument combination that only makes sense for a head node.
        non_head_violations = (
            (redis_port is not None,
             "If --head is not passed in, --redis-port is not "
             "allowed"),
            (redis_shard_ports is not None,
             "If --head is not passed in, --redis-shard-ports "
             "is not allowed"),
            (redis_address is None,
             "If --head is not passed in, --redis-address must "
             "be provided."),
            (num_redis_shards is not None,
             "If --head is not passed in, --num-redis-shards "
             "must not be provided."),
            (redis_max_clients is not None,
             "If --head is not passed in, --redis-max-clients "
             "must not be provided."),
            (no_ui,
             "If --head is not passed in, the --no-ui flag is "
             "not relevant."),
        )
        for violated, message in non_head_violations:
            if violated:
                raise Exception(message)

        redis_host, redis_port_text = redis_address.split(":")

        # Wait for the Redis server to be started. And throw an exception if
        # we can't connect to it.
        services.wait_for_redis_to_start(
            redis_host, int(redis_port_text), password=redis_password)

        # Create a Redis client.
        redis_client = services.create_redis_client(
            redis_address, password=redis_password)

        # Check that the version information on this node matches the
        # version information that the cluster was started with.
        services.check_version_info(redis_client)

        # Get the node IP address if one is not provided.
        ray_params.update_if_absent(
            node_ip_address=services.get_node_ip_address(redis_address))
        logger.info("Using IP address {} for this node.".format(
            ray_params.node_ip_address))

        # Check that there aren't already Redis clients with the same IP
        # address connected with this Redis instance. This raises an
        # exception if the Redis server already has clients on this node.
        check_no_existing_redis_clients(ray_params.node_ip_address,
                                        redis_client)

        ray_params.redis_address = redis_address
        address_info = services.start_ray_node(ray_params, cleanup=False)
        logger.info(address_info)
        logger.info("\nStarted Ray on this node. If you wish to terminate the "
                    "processes that have been started, run\n\n"
                    " ray stop")

    if block:
        # Keep this call from returning by sleeping in an endless loop.
        import time
        while True:
            time.sleep(30)
def ray_start_reconstruction(request):
    """Start a Ray cluster with hand-started Redis and Plasma stores.

    Redis and one Plasma store per local scheduler are started explicitly;
    the remaining services are started by ``ray.worker._init``. After the
    ``yield`` the fixture checks that all processes are still alive and,
    when ``RAY_USE_NEW_GCS`` is ``"on"``, that every local scheduler had a
    task scheduled or submitted. The cluster is shut down even when one of
    these checks fails, so a failing check cannot leak processes into
    later tests.

    Args:
        request: Object whose ``param`` attribute is the number of local
            schedulers to start.

    Yields:
        Tuple ``(redis_ip_address, redis_port, plasma_store_memory,
        num_local_schedulers)``.

    Raises:
        AssertionError: During teardown, if a process died or not every
            local scheduler was used.
    """
    num_local_schedulers = request.param

    # Start the Redis global state store.
    node_ip_address = "127.0.0.1"
    redis_address, redis_shards = ray.services.start_redis(node_ip_address)
    redis_ip_address = ray.services.get_ip_address(redis_address)
    redis_port = ray.services.get_port(redis_address)
    time.sleep(0.1)

    # Start the Plasma store instances with a total of 1GB memory, split
    # evenly between the local schedulers.
    plasma_store_memory = 10**9
    plasma_addresses = []
    object_store_memory = plasma_store_memory // num_local_schedulers
    for i in range(num_local_schedulers):
        store_stdout_file, store_stderr_file = (
            ray.tempfile_services.new_plasma_store_log_file(i, True))
        plasma_addresses.append(
            ray.services.start_plasma_store(
                node_ip_address,
                redis_address,
                object_store_memory=object_store_memory,
                store_stdout_file=store_stdout_file,
                store_stderr_file=store_stderr_file))

    # Start the rest of the services in the Ray cluster.
    address_info = {
        "redis_address": redis_address,
        "redis_shards": redis_shards,
        "object_store_addresses": plasma_addresses
    }
    ray_params = RayParams(
        address_info=address_info,
        start_ray_local=True,
        num_local_schedulers=num_local_schedulers,
        num_cpus=[1] * num_local_schedulers,
        redirect_output=True,
        _internal_config=json.dumps({
            "initial_reconstruction_timeout_milliseconds": 200
        }))
    ray.worker._init(ray_params)

    try:
        yield (redis_ip_address, redis_port, plasma_store_memory,
               num_local_schedulers)

        # The code after the yield will run as teardown code.
        assert ray.services.all_processes_alive()

        state = ray.experimental.state.GlobalState()
        state._initialize_global_state(redis_ip_address, redis_port)

        if os.environ.get("RAY_USE_NEW_GCS") == "on":
            # Determine the IDs of all local schedulers that had a task
            # scheduled or submitted.
            tasks = state.task_table()
            local_scheduler_ids = {
                task["LocalSchedulerID"]
                for task in tasks.values()
            }

            # Make sure that all nodes in the cluster were used by checking
            # that the set of local scheduler IDs that had a task scheduled
            # or submitted is equal to the total number of local schedulers
            # started. We add one to the total number of local schedulers to
            # account for NIL_LOCAL_SCHEDULER_ID. This is the local
            # scheduler ID associated with the driver task, since it is not
            # scheduled by a particular local scheduler.
            assert len(local_scheduler_ids) == num_local_schedulers + 1
    finally:
        # Clean up the Ray cluster, whether or not the checks above passed.
        ray.shutdown()