def create_cluster(num_nodes):
    """Build a cluster of ``num_nodes`` nodes and connect Ray to it.

    Every node is added with ``object_store_memory=10**9`` and one custom
    resource whose name is the node's index as a string (``"0"``, ``"1"``,
    ...) with quantity 100.

    Returns the ``Cluster`` object. No teardown is performed here.
    """
    new_cluster = Cluster()
    for node_index in range(num_nodes):
        index_resource = {str(node_index): 100}
        new_cluster.add_node(
            resources=index_resource, object_store_memory=10**9)
    ray.init(redis_address=new_cluster.redis_address)
    return new_cluster
def ray_start_reconstruction(request):
    """Start a cluster whose object store budget is split across its nodes.

    ``request.param`` is the total number of nodes (the head node plus
    ``num_nodes - 1`` added nodes). Each node gets one CPU, an equal integer
    share of a 0.5 * 10**9 byte object store budget, and an initial
    reconstruction timeout of 200 ms.

    Yields ``(plasma_store_memory, num_nodes, cluster)``; the statements
    after the yield shut down Ray and the cluster.
    """
    num_nodes = request.param
    plasma_store_memory = int(0.5 * 10**9)
    store_memory_per_node = plasma_store_memory // num_nodes
    reconstruction_config = json.dumps({
        "initial_reconstruction_timeout_milliseconds": 200
    })

    head_node_args = {
        "num_cpus": 1,
        "object_store_memory": store_memory_per_node,
        "redis_max_memory": 10**7,
        "_internal_config": reconstruction_config,
    }
    cluster = Cluster(initialize_head=True, head_node_args=head_node_args)

    # The head node already counts as one of the num_nodes nodes.
    for _ in range(num_nodes - 1):
        cluster.add_node(
            num_cpus=1,
            object_store_memory=store_memory_per_node,
            _internal_config=reconstruction_config)
    ray.init(redis_address=cluster.redis_address)

    yield plasma_store_memory, num_nodes, cluster

    # Clean up the Ray cluster.
    ray.shutdown()
    cluster.shutdown()
def ray_start_empty_cluster():
    """Yield a default-constructed ``Cluster`` and shut it down afterwards.

    No nodes are added and ``ray.init`` is not called here; that is left to
    the consumer of the yielded cluster.
    """
    empty_cluster = Cluster()
    yield empty_cluster

    # Teardown: everything below runs once the consumer is done.
    ray.shutdown()
    empty_cluster.shutdown()
def test_redis_password_cluster(self, password, shutdown_only):
    """Run one remote task on a cluster started with a Redis password.

    The same ``redis_password`` is given to the head node and to one
    additional node; the test passes if the remote call can be fetched.
    """
    @ray.remote
    def f():
        return 1

    password_args = dict(redis_password=password)
    cluster = Cluster(
        initialize_head=True,
        connect=True,
        head_node_args=password_args)
    cluster.add_node(**password_args)

    # Fetching the result is the actual check: it raises if the task fails.
    ray.get(f.remote())
def ray_start_workers_separate_multinode(request):
    """Start ``num_nodes`` nodes with ``num_initial_workers`` CPUs each.

    ``request.param`` is indexed as ``(num_nodes, num_initial_workers)``.
    Yields that same pair; the statements after the yield shut down Ray and
    the cluster.
    """
    params = request.param
    num_nodes = params[0]
    num_initial_workers = params[1]

    # Bring up the Ray processes, one node at a time.
    multinode_cluster = Cluster()
    for _ in range(num_nodes):
        multinode_cluster.add_node(num_cpus=num_initial_workers)
    ray.init(redis_address=multinode_cluster.redis_address)

    yield num_nodes, num_initial_workers

    # Teardown.
    ray.shutdown()
    multinode_cluster.shutdown()
def cluster_start():
    """Yield a connected single-CPU head-node cluster, then tear it down.

    The head node is started with ``num_heartbeats_timeout`` set to 10 in
    its internal config.
    """
    head_node_args = {
        "num_cpus": 1,
        "_internal_config": json.dumps({
            "num_heartbeats_timeout": 10
        }),
    }
    # connect=True asks the Cluster to connect on construction, so there is
    # no explicit ray.init call here.
    started_cluster = Cluster(
        initialize_head=True, connect=True, head_node_args=head_node_args)

    yield started_cluster

    ray.shutdown()
    started_cluster.shutdown()
def run(args, parser):
    """Build the experiment spec, start Ray, and launch the experiments.

    Args:
        args: Parsed command-line namespace. When ``args.config_file`` is
            set the experiments are read from that YAML file; otherwise a
            single experiment named ``args.experiment_name`` is assembled
            from the individual command-line options.
        parser: The argument parser; ``parser.error`` is called when an
            experiment has no ``run`` or no ``env`` entry.

    When ``args.ray_num_nodes`` is set, a local ``Cluster`` with that many
    nodes is created and Ray connects to it; otherwise ``ray.init`` is
    called with ``args.redis_address`` and the resource options.
    """
    if args.config_file:
        with open(args.config_file) as f:
            # safe_load only builds plain Python data and needs no Loader
            # argument; bare yaml.load(f) can construct arbitrary objects
            # and is rejected by recent PyYAML releases.
            experiments = yaml.safe_load(f)
    else:
        # Note: keep this in sync with tune/config_parser.py
        experiments = {
            args.experiment_name: {  # i.e. log to ~/ray_results/default
                "run": args.run,
                "checkpoint_freq": args.checkpoint_freq,
                "local_dir": args.local_dir,
                "resources_per_trial": (
                    args.resources_per_trial and
                    resources_to_json(args.resources_per_trial)),
                "stop": args.stop,
                "config": dict(args.config, env=args.env),
                "restore": args.restore,
                "num_samples": args.num_samples,
                "upload_dir": args.upload_dir,
            }
        }

    for exp in experiments.values():
        if not exp.get("run"):
            parser.error("the following arguments are required: --run")
        # The env may be given at the top level or inside "config".
        if not exp.get("env") and not exp.get("config", {}).get("env"):
            parser.error("the following arguments are required: --env")

    if args.ray_num_nodes:
        cluster = Cluster()
        for _ in range(args.ray_num_nodes):
            cluster.add_node(
                num_cpus=args.ray_num_cpus or 1,
                num_gpus=args.ray_num_gpus or 0,
                object_store_memory=args.ray_object_store_memory,
                redis_max_memory=args.ray_redis_max_memory)
        ray.init(redis_address=cluster.redis_address)
    else:
        ray.init(
            redis_address=args.redis_address,
            object_store_memory=args.ray_object_store_memory,
            redis_max_memory=args.ray_redis_max_memory,
            num_cpus=args.ray_num_cpus,
            num_gpus=args.ray_num_gpus)

    run_experiments(
        experiments,
        scheduler=_make_scheduler(args),
        queue_trials=args.queue_trials,
        resume=args.resume)
def start_connected_longer_cluster():
    """Yield a connected one-CPU head node with ``num_heartbeats_timeout`` 20.

    The statements after the yield shut down Ray and the cluster.
    """
    longer_timeout_config = json.dumps({
        "num_heartbeats_timeout": 20
    })
    head_node_args = {
        "num_cpus": 1,
        "_internal_config": longer_timeout_config,
    }
    cluster = Cluster(
        initialize_head=True, connect=True, head_node_args=head_node_args)

    yield cluster

    # Teardown.
    ray.shutdown()
    cluster.shutdown()
def start_connected_cluster():
    """Yield a connected one-CPU head node with ``num_heartbeats_timeout`` 10.

    The statements after the yield shut down Ray and the cluster.
    """
    heartbeat_config = json.dumps({
        "num_heartbeats_timeout": 10
    })
    head_node_args = {
        "num_cpus": 1,
        "_internal_config": heartbeat_config,
    }
    # Start the Ray processes; connect=True also connects this driver.
    cluster = Cluster(
        initialize_head=True, connect=True, head_node_args=head_node_args)

    yield cluster

    # Teardown.
    ray.shutdown()
    cluster.shutdown()
def ray_start_two_nodes():
    """Yield a cluster of two zero-CPU nodes, then shut everything down.

    Both nodes are added with ``num_heartbeats_timeout`` set to 40 in their
    internal config.
    """
    node_config = json.dumps({
        "num_heartbeats_timeout": 40
    })

    # Start the Ray processes.
    two_node_cluster = Cluster()
    for _ in range(2):
        two_node_cluster.add_node(num_cpus=0, _internal_config=node_config)
    ray.init(redis_address=two_node_cluster.redis_address)

    yield two_node_cluster

    # Teardown.
    ray.shutdown()
    two_node_cluster.shutdown()
def start_connected_emptyhead_cluster():
    """Yield a connected cluster whose head node has zero CPUs.

    The head node uses ``num_heartbeats_timeout`` 10. ``_register_all()`` is
    invoked before yielding; the statements after the yield shut down Ray
    and the cluster.
    """
    empty_head_args = {
        "num_cpus": 0,
        "_internal_config": json.dumps({
            "num_heartbeats_timeout": 10
        }),
    }
    cluster = Cluster(
        initialize_head=True, connect=True, head_node_args=empty_head_args)

    # Pytest doesn't play nicely with imports, so register explicitly.
    _register_all()

    yield cluster

    # Teardown.
    ray.shutdown()
    cluster.shutdown()
def ray_start_combination(request):
    """Start a head node plus ``num_nodes - 1`` extra nodes, 10 CPUs each.

    ``request.param`` is indexed as ``(num_nodes, num_workers_per_scheduler)``.
    Yields ``(num_nodes, num_workers_per_scheduler, cluster)``; the
    statements after the yield shut down Ray and the cluster.
    """
    params = request.param
    num_nodes = params[0]
    num_workers_per_scheduler = params[1]

    # Start the Ray processes; only the head node caps Redis memory.
    head_node_args = {
        "num_cpus": 10,
        "redis_max_memory": 10**7,
    }
    cluster = Cluster(initialize_head=True, head_node_args=head_node_args)
    for _ in range(num_nodes - 1):
        cluster.add_node(num_cpus=10)
    ray.init(redis_address=cluster.redis_address)

    yield num_nodes, num_workers_per_scheduler, cluster

    # Teardown.
    ray.shutdown()
    cluster.shutdown()
def ray_initialize_cluster():
    """Yield a four-node cluster with eight CPUs per node, then tear down.

    Every node uses an initial reconstruction timeout of 1000 ms and
    ``num_heartbeats_timeout`` 10.
    """
    num_nodes = 4
    num_workers_per_scheduler = 8
    node_config = json.dumps({
        "initial_reconstruction_timeout_milliseconds": 1000,
        "num_heartbeats_timeout": 10,
    })

    cluster = Cluster()
    for _ in range(num_nodes):
        cluster.add_node(
            num_cpus=num_workers_per_scheduler, _internal_config=node_config)
    ray.init(redis_address=cluster.redis_address)

    yield cluster

    ray.shutdown()
    cluster.shutdown()
def test_cluster():
    """Nodes added to a cluster are alive, and dead once removed."""
    cluster = Cluster(initialize_head=False)
    first_node = cluster.add_node()
    second_node = cluster.add_node()

    assert first_node.remaining_processes_alive()
    assert second_node.remaining_processes_alive()

    # Remove in reverse order of creation.
    cluster.remove_node(second_node)
    cluster.remove_node(first_node)

    for removed_node in (first_node, second_node):
        assert not removed_node.any_processes_alive()
def ray_start_cluster():
    """Yield a connected cluster of one head and three more 4-CPU nodes.

    All four nodes share the same arguments: an initial reconstruction
    timeout of 1000 ms and ``num_heartbeats_timeout`` 10. The generator
    waits for the nodes before yielding and shuts everything down after.
    """
    internal_config = json.dumps({
        "initial_reconstruction_timeout_milliseconds": 1000,
        "num_heartbeats_timeout": 10
    })
    node_args = {
        "num_cpus": 4,
        "_internal_config": internal_config,
    }

    cluster = Cluster(
        initialize_head=True, connect=True, head_node_args=node_args)
    # The returned node handles are only held, not otherwise used here.
    workers = [cluster.add_node(**node_args) for _ in range(3)]
    cluster.wait_for_nodes()

    yield cluster

    ray.shutdown()
    cluster.shutdown()
def test_connect_with_disconnected_node(shutdown_only):
    """Check which node removals are reported as removed-node errors.

    Two non-graceful removals must each add one ``REMOVED_NODE_ERROR``; a
    graceful removal must not add a third, and no
    ``RAYLET_CONNECTION_ERROR`` may be reported at the end.
    """
    node_config = json.dumps({
        "num_heartbeats_timeout": 50,
        "heartbeat_timeout_milliseconds": 10,
    })
    cluster = Cluster()
    cluster.add_node(num_cpus=0, _internal_config=node_config)
    ray.init(redis_address=cluster.redis_address)

    # Nothing has been removed yet.
    assert len(relevant_errors(ray_constants.REMOVED_NODE_ERROR)) == 0

    # Each of these nodes is killed by SIGKILL, so ray_monitor marks it
    # dead and the error count grows by one per iteration.
    for expected_error_count in (1, 2):
        killed_node = cluster.add_node(
            num_cpus=0, _internal_config=node_config)
        cluster.remove_node(killed_node, allow_graceful=False)
        wait_for_errors(
            ray_constants.REMOVED_NODE_ERROR,
            expected_error_count,
            timeout=2)

    # This node is killed by SIGTERM; ray_monitor does not mark it again,
    # so waiting for a third error has to time out.
    graceful_node = cluster.add_node(num_cpus=0, _internal_config=node_config)
    cluster.remove_node(graceful_node, allow_graceful=True)
    with pytest.raises(Exception, match=('Timing out of wait.')):
        wait_for_errors(ray_constants.REMOVED_NODE_ERROR, 3, timeout=2)

    # There is no connection error to a dead node.
    assert len(relevant_errors(ray_constants.RAYLET_CONNECTION_ERROR)) == 0
def test_shutdown():
    """``Cluster.shutdown`` leaves no process alive on any added node."""
    cluster = Cluster(initialize_head=False)
    added_nodes = (cluster.add_node(), cluster.add_node())

    cluster.shutdown()

    for stopped_node in added_nodes:
        assert not stopped_node.any_processes_alive()
def run(args, parser):
    """Assemble the experiments, start Ray, and hand off to run_experiments.

    Args:
        args: Parsed command-line namespace. With ``args.config_file`` the
            experiments come from that YAML file and the first experiment
            is given the demo path and the callback set; otherwise a single
            experiment named ``args.experiment_name`` is built from the
            command-line options.
        parser: The argument parser; ``parser.error`` is called when an
            experiment has no ``run`` or no ``env`` entry.

    Raises:
        ValueError: If ``args.trace`` is set for an experiment whose config
            does not have ``eager`` enabled.
    """
    if not args.config_file:
        # Note: keep this in sync with tune/config_parser.py
        cli_experiment = {  # i.e. log to ~/ray_results/default
            "run": args.run,
            "checkpoint_freq": args.checkpoint_freq,
            "keep_checkpoints_num": args.keep_checkpoints_num,
            "checkpoint_score_attr": args.checkpoint_score_attr,
            "local_dir": args.local_dir,
            "resources_per_trial": (
                args.resources_per_trial and
                resources_to_json(args.resources_per_trial)),
            "stop": args.stop,
            "config": dict(args.config, env=args.env),
            "restore": args.restore,
            "num_samples": args.num_samples,
            "upload_dir": args.upload_dir,
        }
        experiments = {args.experiment_name: cli_experiment}
    else:
        with open(args.config_file) as config_stream:
            experiments = yaml.safe_load(config_stream)

        # Add callbacks for the self-defined metric and for saving
        # successful transitions from RL agents. Only the first experiment
        # in the file is touched.
        first_name = next(iter(experiments))
        first_config = experiments[first_name]["config"]
        first_config["optimizer"]["robot_demo_path"] = dir_path
        first_config["callbacks"] = {
            "on_episode_start": on_episode_start,
            "on_episode_step": on_episode_step,
            "on_episode_end": on_episode_end,
            "on_sample_end": on_sample_end,
            "on_train_result": on_train_result,
            "on_postprocess_traj": on_postprocess_traj
        }

    for spec in experiments.values():
        if not spec.get("run"):
            parser.error("the following arguments are required: --run")
        # The env may be given at the top level or inside "config".
        env_name = spec.get("env") or spec.get("config", {}).get("env")
        if not env_name:
            parser.error("the following arguments are required: --env")
        if args.eager:
            spec["config"]["eager"] = True
        if args.trace:
            if not spec["config"].get("eager"):
                raise ValueError("Must enable --eager to enable tracing.")
            spec["config"]["eager_tracing"] = True

    # NOTE: log_to_driver=False (which would disable the loggings, see
    # https://github.com/ray-project/ray/issues/5048) is not passed to
    # either ray.init call below.
    if args.ray_num_nodes:
        local_cluster = Cluster()
        for _ in range(args.ray_num_nodes):
            local_cluster.add_node(
                num_cpus=args.ray_num_cpus or 1,
                num_gpus=args.ray_num_gpus or 0,
                object_store_memory=args.ray_object_store_memory,
                memory=args.ray_memory,
                redis_max_memory=args.ray_redis_max_memory)
        ray.init(address=local_cluster.address)
    else:
        ray.init(
            address=args.ray_address,
            object_store_memory=args.ray_object_store_memory,
            memory=args.ray_memory,
            redis_max_memory=args.ray_redis_max_memory,
            num_cpus=args.ray_num_cpus,
            num_gpus=args.ray_num_gpus)

    run_experiments(
        experiments,
        scheduler=_make_scheduler(args),
        queue_trials=args.queue_trials,
        resume=args.resume)
from ray.tune.schedulers import PopulationBasedTraining
from ray.tests.cluster_utils import Cluster

# Sizing of the simulated cluster; these values feed both the memory check
# and the add_node calls below.
num_redis_shards = 5
redis_max_memory = 10**8
object_store_memory = 10**8
num_nodes = 3

message = ("Make sure there is enough memory on this machine to run this "
           "workload. We divide the system memory by 2 to provide a buffer.")
# Refuse to start unless the combined object store and Redis budgets fit in
# half of the system memory.
assert (num_nodes * object_store_memory +
        num_redis_shards * redis_max_memory <
        ray.utils.get_system_memory() / 2), message

# Simulate a cluster on one machine.
cluster = Cluster()
for i in range(num_nodes):
    # Only the first node (i == 0) is given a Redis port and a shard count;
    # every later node passes None for both.
    cluster.add_node(redis_port=6379 if i == 0 else None,
                     num_redis_shards=num_redis_shards if i == 0 else None,
                     num_cpus=10,
                     num_gpus=0,
                     resources={str(i): 2},
                     object_store_memory=object_store_memory,
                     redis_max_memory=redis_max_memory)
ray.init(address=cluster.address)

# Run the workload.
pbt = PopulationBasedTraining(time_attr="training_iteration",
                              metric="episode_reward_mean",
                              mode="max",
from ray.tune import run_experiments
from ray.tests.cluster_utils import Cluster

# Sizing of the simulated cluster; these values feed both the memory check
# and the add_node calls below.
num_redis_shards = 5
redis_max_memory = 10**8
object_store_memory = 10**8
num_nodes = 1

message = ("Make sure there is enough memory on this machine to run this "
           "workload. We divide the system memory by 2 to provide a buffer.")
# Refuse to start unless the combined object store and Redis budgets fit in
# half of the system memory.
assert (num_nodes * object_store_memory +
        num_redis_shards * redis_max_memory <
        ray.utils.get_system_memory() / 2), message

# Simulate a cluster on one machine.
cluster = Cluster()
for i in range(num_nodes):
    # Only the first node (i == 0) is given a Redis port and a shard count;
    # every later node passes None for both.
    cluster.add_node(
        redis_port=6379 if i == 0 else None,
        num_redis_shards=num_redis_shards if i == 0 else None,
        num_cpus=10,
        num_gpus=0,
        resources={str(i): 2},
        object_store_memory=object_store_memory,
        redis_max_memory=redis_max_memory)
ray.init(redis_address=cluster.redis_address)

# Run the workload.
run_experiments({
    "impala": {
import ray
from ray.tests.cluster_utils import Cluster

# Sizing of the simulated cluster; these values feed both the memory check
# and the add_node calls below.
num_redis_shards = 5
redis_max_memory = 10**8
object_store_memory = 10**8
num_nodes = 10

message = ("Make sure there is enough memory on this machine to run this "
           "workload. We divide the system memory by 2 to provide a buffer.")
# Refuse to start unless the combined object store and Redis budgets fit in
# half of the system memory.
assert (num_nodes * object_store_memory +
        num_redis_shards * redis_max_memory <
        ray.utils.get_system_memory() / 2), message

# Simulate a cluster on one machine.
cluster = Cluster()
for i in range(num_nodes):
    # Only the first node (i == 0) is given a Redis port and a shard count;
    # every later node passes None for both.
    cluster.add_node(redis_port=6379 if i == 0 else None,
                     num_redis_shards=num_redis_shards if i == 0 else None,
                     num_cpus=2,
                     num_gpus=0,
                     resources={str(i): 2},
                     object_store_memory=object_store_memory,
                     redis_max_memory=redis_max_memory)
ray.init(redis_address=cluster.redis_address)

# Run the workload.


@ray.remote
def f(*xs):
def run(args, parser):
    """Create the arena experiments, start Ray, then evaluate or train.

    Args:
        args: Parsed command-line namespace. ``args.config_file`` selects
            YAML-defined experiments over ones built by ``create_exps``;
            ``args.ray_num_nodes`` selects a local ``Cluster`` over a plain
            ``ray.init``; ``args.eval`` selects interactive evaluation over
            ``run_experiments``.
        parser: The argument parser, forwarded to ``create_arena_exps``.

    Raises:
        ValueError: In evaluation mode when there is no experiment at all.
        SystemExit: In evaluation mode when the user does not confirm the
            scheduled sampling.
    """
    # Local import so the block does not depend on a file-level import.
    import sys

    # create exps from configs
    if args.config_file:
        # load configs from yaml
        with open(args.config_file) as f:
            exps = yaml.safe_load(f)
    else:
        exps = create_exps(
            args=args,
        )

    arena_exps = create_arena_exps(
        exps=exps,
        args=args,
        parser=parser,
    )

    # config ray cluster
    if args.ray_num_nodes:
        cluster = Cluster()
        for _ in range(args.ray_num_nodes):
            cluster.add_node(
                num_cpus=args.ray_num_cpus or 1,
                num_gpus=args.ray_num_gpus or 0,
                object_store_memory=args.ray_object_store_memory,
                memory=args.ray_memory,
                redis_max_memory=args.ray_redis_max_memory,
            )
        ray.init(
            address=cluster.redis_address,
        )
    else:
        ray.init(
            address=args.ray_address,
            object_store_memory=args.ray_object_store_memory,
            memory=args.ray_memory,
            redis_max_memory=args.ray_redis_max_memory,
            num_cpus=args.ray_num_cpus,
            num_gpus=args.ray_num_gpus,
        )

    arena_exp_keys = list(arena_exps.keys())

    if len(arena_exp_keys) > 1:
        logger.warning(
            "There are multiple experiments scheduled, ray==0.7.4 will run them one by one, instead of cocurrently. "
            "However, recent ray can run them cocurrently. But the recent ray has failed our test (the rllib is broken)"
            "This is mainly due to there are grid search used in configs that is not supported by original rllib. "
        )

    if not args.eval:
        # Training path: hand all experiments over and finish.
        run_experiments(
            arena_exps,
            scheduler=_make_scheduler(args),
            queue_trials=args.queue_trials,
            resume=args.resume,
        )
        return

    # evaluate policies
    if len(arena_exp_keys) < 1:
        raise ValueError("no experiment is available to evaluate")

    if len(arena_exp_keys) > 1:
        # Several experiments: let the user pick which one to evaluate.
        arena_exp_key = inquire_select(
            choices=arena_exp_keys,
            key="arena_exp_key",
        )
    else:
        # if there is just one arena_exps
        arena_exp_key = arena_exp_keys[0]

    logger.info("Evaluating arena_exp_key: {}".format(
        arena_exp_key,
    ))

    arena_exp = arena_exps[arena_exp_key]

    answers = prompt(
        [{
            'type': 'input',
            'name': 'eval_log_path',
            'message': 'Where do you want to log the results of this evaluation?',
            'default': '../eval_log_path/'
        }],
        style=custom_style_2,
    )

    prepare_path(answers['eval_log_path'])

    from ray.rllib.evaluation.rollout_worker import RolloutWorker

    # worker = ArenaRolloutWorker(
    # TODO: RolloutWorker does not support monitor for multi-agent envs
    worker = RolloutWorker(
        env_creator=lambda _: ArenaRllibEnv(
            env=arena_exp["env"],
            env_config=arena_exp["config"]["env_config"],
        ),
        policy=arena_exp["config"]["multiagent"]["policies"],
        policy_mapping_fn=arena_exp["config"]["multiagent"]["policy_mapping_fn"],
        batch_mode="complete_episodes",
        batch_steps=500,
        num_envs=1,
        monitor_path=answers['eval_log_path'],
    )

    # Time one sample() call; it is the per-sampling cost used in the
    # estimate shown to the user below.
    logger.info("Testing worker...")
    sample_start = time.time()
    worker.sample()
    sample_time = time.time() - sample_start
    logger.info("Finish testing worker.")

    policy_ids = list(worker.policy_map.keys())

    checkpoints = inquire_checkpoints(
        local_dir=arena_exp["local_dir"],
        policy_ids=policy_ids,
    )

    checkpoint_paths = checkpoints_2_checkpoint_paths(checkpoints)

    # Number of checkpoint paths chosen per policy.
    num_checkpoint_paths = {
        policy_id: len(checkpoint_paths_per_policy_id)
        for policy_id, checkpoint_paths_per_policy_id
        in checkpoint_paths.items()
    }

    # One sampling per combination of checkpoints across policies.
    num_sampling = np.prod(list(num_checkpoint_paths.values()))

    confirm = inquire_confirm("You have scheduled {} sampling, each sampling will take {} minutes, which means {} hours in total.".format(
        num_sampling,
        sample_time / 60.0,
        num_sampling * sample_time / 60.0 / 60.0,
    ))
    if not confirm:
        # The os module has no exit(); sys.exit() is the call that stops
        # the program here.
        sys.exit()

    result_matrix = run_result_matrix(
        checkpoint_paths=checkpoint_paths,
        worker=worker,
    )

    result_matrix = np.asarray(result_matrix)

    vis_result_matrix(
        result_matrix=result_matrix,
        log_path=answers['eval_log_path'],
    )