def start_connected_cluster():
    """Fixture: start a Ray head node and connect the driver to it."""
    cluster = Cluster(initialize_head=True, connect=True)
    yield cluster
    # Everything below the yield runs as teardown.
    ray.shutdown()
    cluster.shutdown()
def create_cluster(num_nodes):
    """Start ``num_nodes`` nodes, each advertising a custom resource named
    after its index, connect the driver, and return the Cluster."""
    cluster = Cluster()
    for index in range(num_nodes):
        cluster.add_node(
            resources={str(index): 100}, object_store_memory=10**9)
    ray.init(redis_address=cluster.redis_address)
    return cluster
def ray_start_reconstruction(request):
    """Parametrized fixture: a cluster of ``request.param`` nodes whose total
    plasma memory is split evenly across nodes, with a fast (200 ms)
    reconstruction timeout on every raylet."""
    num_nodes = request.param
    plasma_store_memory = 10**9
    per_node_memory = plasma_store_memory // num_nodes
    # Same raylet config on every node (head and workers alike).
    internal_config = json.dumps({
        "initial_reconstruction_timeout_milliseconds": 200
    })
    cluster = Cluster(
        initialize_head=True,
        head_node_args={
            "num_cpus": 1,
            "object_store_memory": per_node_memory,
            "redis_max_memory": 10**7,
            "redirect_output": True,
            "_internal_config": internal_config,
        })
    for _ in range(num_nodes - 1):
        cluster.add_node(
            num_cpus=1,
            object_store_memory=per_node_memory,
            redirect_output=True,
            _internal_config=internal_config)
    ray.init(redis_address=cluster.redis_address)
    yield plasma_store_memory, num_nodes, cluster
    # Clean up the Ray cluster.
    ray.shutdown()
    cluster.shutdown()
def ray_start_empty_cluster():
    """Fixture: yield a Cluster object with no nodes added yet."""
    bare_cluster = Cluster()
    yield bare_cluster
    # Teardown: runs after the test body finishes.
    ray.shutdown()
    bare_cluster.shutdown()
def test_redis_password_cluster(self, password, shutdown_only): @ray.remote def f(): return 1 node_args = {"redis_password": password} cluster = Cluster( initialize_head=True, connect=True, head_node_args=node_args) cluster.add_node(**node_args) object_id = f.remote() ray.get(object_id)
def cluster_start():
    """Fixture: connected one-CPU head node with a short (10-heartbeat)
    failure-detection timeout."""
    head_args = {
        "resources": dict(CPU=1),
        "_internal_config": json.dumps({"num_heartbeats_timeout": 10}),
    }
    cluster = Cluster(
        initialize_head=True, connect=True, head_node_args=head_args)
    yield cluster
    # Teardown.
    ray.shutdown()
    cluster.shutdown()
def start_connected_cluster():
    """Fixture: connected single-node cluster (CPU=1) with a short
    heartbeat timeout; tears everything down afterwards."""
    heartbeat_config = json.dumps({"num_heartbeats_timeout": 10})
    cluster = Cluster(
        initialize_head=True,
        connect=True,
        head_node_args={
            "resources": dict(CPU=1),
            "_internal_config": heartbeat_config,
        })
    yield cluster
    # Runs as teardown once the test is done.
    ray.shutdown()
    cluster.shutdown()
def ray_start_workers_separate_multinode(request):
    """Parametrized fixture: param is (num_nodes, workers per node)."""
    num_nodes = request.param[0]
    num_initial_workers = request.param[1]
    # Start the Ray processes: one node per iteration.
    cluster = Cluster()
    remaining = num_nodes
    while remaining:
        cluster.add_node(num_cpus=num_initial_workers)
        remaining -= 1
    ray.init(redis_address=cluster.redis_address)
    yield num_nodes, num_initial_workers
    # Teardown after the test body.
    ray.shutdown()
    cluster.shutdown()
def start_connected_longer_cluster():
    """Creates a cluster with a longer timeout (20 heartbeats)."""
    cluster = Cluster(
        initialize_head=True,
        connect=True,
        head_node_args={
            "resources": dict(CPU=1),
            "_internal_config": json.dumps({"num_heartbeats_timeout": 20}),
        })
    yield cluster
    # Teardown code.
    ray.shutdown()
    cluster.shutdown()
def test_redis_password_cluster(self, password, shutdown_only): @ray.remote def f(): return 1 node_args = {"redis_password": password} cluster = Cluster(initialize_head=True, connect=True, head_node_args=node_args) cluster.add_node(**node_args) object_id = f.remote() ray.get(object_id)
def run(args, parser):
    """Entry point: build the experiment spec (from a YAML file or from CLI
    flags), start or connect to Ray, and launch the experiments.

    Args:
        args: Parsed argparse namespace with the training CLI options.
        parser: The argparse parser, used only to report argument errors.
    """
    if args.config_file:
        with open(args.config_file) as f:
            # FIX: yaml.load without an explicit Loader is deprecated and can
            # execute arbitrary Python from a crafted config; safe_load only
            # constructs plain data types.
            experiments = yaml.safe_load(f)
    else:
        # Note: keep this in sync with tune/config_parser.py
        experiments = {
            args.experiment_name: {  # i.e. log to ~/ray_results/default
                "run": args.run,
                "checkpoint_freq": args.checkpoint_freq,
                "local_dir": args.local_dir,
                "resources_per_trial": (
                    args.resources_per_trial and
                    resources_to_json(args.resources_per_trial)),
                "stop": args.stop,
                "config": dict(args.config, env=args.env),
                "restore": args.restore,
                "num_samples": args.num_samples,
                "upload_dir": args.upload_dir,
            }
        }

    # Validate required fields before spinning anything up.
    for exp in experiments.values():
        if not exp.get("run"):
            parser.error("the following arguments are required: --run")
        if not exp.get("env") and not exp.get("config", {}).get("env"):
            parser.error("the following arguments are required: --env")

    if args.ray_num_nodes:
        # Emulate a multi-node cluster on this machine.
        # NOTE(review): passing "num_cpus"/"num_gpus" inside `resources`
        # likely creates *custom* resources with those names rather than
        # CPU/GPU slots -- confirm against Cluster.add_node before relying
        # on it; they should probably be plain keyword arguments.
        cluster = Cluster()
        for _ in range(args.ray_num_nodes):
            cluster.add_node(
                resources={
                    "num_cpus": args.ray_num_cpus or 1,
                    "num_gpus": args.ray_num_gpus or 0,
                },
                object_store_memory=args.ray_object_store_memory,
                redis_max_memory=args.ray_redis_max_memory)
        ray.init(redis_address=cluster.redis_address)
    else:
        ray.init(
            redis_address=args.redis_address,
            object_store_memory=args.ray_object_store_memory,
            redis_max_memory=args.ray_redis_max_memory,
            num_cpus=args.ray_num_cpus,
            num_gpus=args.ray_num_gpus)

    run_experiments(
        experiments,
        scheduler=_make_scheduler(args),
        queue_trials=args.queue_trials,
        resume=args.resume)
def test_cluster(): """Basic test for adding and removing nodes in cluster.""" g = Cluster(initialize_head=False) node = g.add_node() node2 = g.add_node() assert node.all_processes_alive() assert node2.all_processes_alive() g.remove_node(node2) g.remove_node(node) assert not any(node.any_processes_alive() for node in g.list_all_nodes())
def cluster_start():
    """Fixture: connected head node with 1 CPU and a 10-heartbeat timeout."""
    head_args = {
        "num_cpus": 1,
        "_internal_config": json.dumps({"num_heartbeats_timeout": 10}),
    }
    cluster = Cluster(
        initialize_head=True, connect=True, head_node_args=head_args)
    yield cluster
    # Teardown.
    ray.shutdown()
    cluster.shutdown()
def start_connected_cluster():
    """Fixture: connected 1-CPU head node; tears everything down after."""
    heartbeat_config = json.dumps({"num_heartbeats_timeout": 10})
    cluster = Cluster(
        initialize_head=True,
        connect=True,
        head_node_args={"num_cpus": 1, "_internal_config": heartbeat_config})
    yield cluster
    # The code after the yield runs as teardown.
    ray.shutdown()
    cluster.shutdown()
def start_connected_longer_cluster():
    """Creates a cluster with a longer timeout (20 heartbeats)."""
    heartbeat_config = json.dumps({"num_heartbeats_timeout": 20})
    cluster = Cluster(
        initialize_head=True,
        connect=True,
        head_node_args={"num_cpus": 1, "_internal_config": heartbeat_config})
    yield cluster
    # The code after the yield runs as teardown.
    ray.shutdown()
    cluster.shutdown()
def start_connected_emptyhead_cluster():
    """Starts head with no resources."""
    head_args = {
        "resources": dict(CPU=0),
        "_internal_config": json.dumps({"num_heartbeats_timeout": 10}),
    }
    cluster = Cluster(
        initialize_head=True, connect=True, head_node_args=head_args)
    # Pytest doesn't play nicely with imports.
    _register_all()
    yield cluster
    # Teardown after the test body.
    ray.shutdown()
    cluster.shutdown()
def test_cluster(): """Basic test for adding and removing nodes in cluster.""" g = Cluster(initialize_head=False) node = g.add_node() node2 = g.add_node() assert node.remaining_processes_alive() assert node2.remaining_processes_alive() g.remove_node(node2) g.remove_node(node) assert not any(n.any_processes_alive() for n in [node, node2])
def start_connected_emptyhead_cluster():
    """Starts head with no resources."""
    heartbeat_config = json.dumps({"num_heartbeats_timeout": 10})
    cluster = Cluster(
        initialize_head=True,
        connect=True,
        head_node_args={"num_cpus": 0, "_internal_config": heartbeat_config})
    # Pytest doesn't play nicely with imports.
    _register_all()
    yield cluster
    # Everything after the yield is teardown.
    ray.shutdown()
    cluster.shutdown()
def ray_start_combination(request):
    """Parametrized fixture: param is (num_nodes, workers per scheduler)."""
    num_nodes = request.param[0]
    num_workers_per_scheduler = request.param[1]
    # Head node counts as the first node; add the rest.
    cluster = Cluster(
        initialize_head=True,
        head_node_args={"num_cpus": 10, "redis_max_memory": 10**7})
    for _ in range(num_nodes - 1):
        cluster.add_node(num_cpus=10)
    ray.init(redis_address=cluster.redis_address)
    yield num_nodes, num_workers_per_scheduler
    # Teardown.
    ray.shutdown()
    cluster.shutdown()
def ray_initialize_cluster():
    """Fixture: 4 nodes with 8 CPUs each, fast reconstruction/heartbeat
    settings; yields None and tears the cluster down afterwards."""
    num_nodes = 4
    num_workers_per_scheduler = 8
    shared_config = json.dumps({
        "initial_reconstruction_timeout_milliseconds": 1000,
        "num_heartbeats_timeout": 10,
    })
    cluster = Cluster()
    for _ in range(num_nodes):
        cluster.add_node(
            num_cpus=num_workers_per_scheduler,
            _internal_config=shared_config)
    ray.init(redis_address=cluster.redis_address)
    yield None
    ray.shutdown()
    cluster.shutdown()
def _start_new_cluster(): cluster = Cluster(initialize_head=True, connect=True, head_node_args={ "resources": dict(CPU=1), "_internal_config": json.dumps({"num_heartbeats_timeout": 10}) }) # Pytest doesn't play nicely with imports _register_all() return cluster
def test_cluster(): """Basic test for adding and removing nodes in cluster.""" g = Cluster(initialize_head=False) node = g.add_node() node2 = g.add_node() assert node.all_processes_alive() assert node2.all_processes_alive() g.remove_node(node2) g.remove_node(node) assert not any(n.any_processes_alive() for n in [node, node2])
def ray_start_two_nodes():
    """Fixture: two zero-CPU nodes with a generous (40) heartbeat timeout."""
    heartbeat_config = json.dumps({"num_heartbeats_timeout": 40})
    cluster = Cluster()
    for _ in range(2):
        cluster.add_node(num_cpus=0, _internal_config=heartbeat_config)
    ray.init(redis_address=cluster.redis_address)
    yield cluster
    # The code after the yield runs as teardown.
    ray.shutdown()
    cluster.shutdown()
def ray_start_cluster():
    """Fixture: head plus 4 worker nodes, each advertising resource CPU=8,
    with fast reconstruction and heartbeat settings."""
    node_args = {
        "resources": dict(CPU=8),
        "_internal_config": json.dumps({
            "initial_reconstruction_timeout_milliseconds": 1000,
            "num_heartbeats_timeout": 10,
        }),
    }
    cluster = Cluster(
        initialize_head=True, connect=True, head_node_args=node_args)
    for _ in range(4):
        cluster.add_node(**node_args)
    cluster.wait_for_nodes()
    yield cluster
    ray.shutdown()
    cluster.shutdown()
def ray_start_combination(request):
    """Parametrized fixture yielding (num_nodes, workers, cluster)."""
    num_nodes = request.param[0]
    num_workers_per_scheduler = request.param[1]
    cluster = Cluster(
        initialize_head=True,
        head_node_args={"num_cpus": 10, "redis_max_memory": 10**7})
    # The head counts as one node; add the remainder.
    extra = num_nodes - 1
    while extra > 0:
        cluster.add_node(num_cpus=10)
        extra -= 1
    ray.init(redis_address=cluster.redis_address)
    yield num_nodes, num_workers_per_scheduler, cluster
    # Teardown.
    ray.shutdown()
    cluster.shutdown()
def ray_start_cluster():
    """Fixture: head plus 3 worker nodes, 4 CPUs each."""
    shared_config = json.dumps({
        "initial_reconstruction_timeout_milliseconds": 1000,
        "num_heartbeats_timeout": 10,
    })
    node_args = {"num_cpus": 4, "_internal_config": shared_config}
    cluster = Cluster(
        initialize_head=True, connect=True, head_node_args=node_args)
    for _ in range(3):
        cluster.add_node(**node_args)
    cluster.wait_for_nodes()
    yield cluster
    ray.shutdown()
    cluster.shutdown()
def ray_start_cluster():
    """Fixture: head plus 4 worker nodes, 8 CPUs each."""
    shared_config = json.dumps({
        "initial_reconstruction_timeout_milliseconds": 1000,
        "num_heartbeats_timeout": 10,
    })
    node_args = {"num_cpus": 8, "_internal_config": shared_config}
    cluster = Cluster(
        initialize_head=True, connect=True, head_node_args=node_args)
    for _ in range(4):
        cluster.add_node(**node_args)
    cluster.wait_for_nodes()
    yield cluster
    ray.shutdown()
    cluster.shutdown()
def ray_initialize_cluster():
    """Fixture: 4 nodes with 8 CPUs each; yields the Cluster object."""
    num_nodes = 4
    num_workers_per_scheduler = 8
    shared_config = json.dumps({
        "initial_reconstruction_timeout_milliseconds": 1000,
        "num_heartbeats_timeout": 10,
    })
    cluster = Cluster()
    for _ in range(num_nodes):
        cluster.add_node(
            num_cpus=num_workers_per_scheduler,
            _internal_config=shared_config)
    ray.init(redis_address=cluster.redis_address)
    yield cluster
    ray.shutdown()
    cluster.shutdown()
def test_shutdown(): g = Cluster(initialize_head=False) node = g.add_node() node2 = g.add_node() g.shutdown() assert not any(n.any_processes_alive() for n in [node, node2])
def run(args, parser):
    """Entry point: build the experiment spec (from a YAML file or from CLI
    flags), attach a capital-return metrics callback, start/connect Ray,
    and launch the experiments.

    Args:
        args: Parsed argparse namespace with the training CLI options.
        parser: The argparse parser, used only to report argument errors.
    """
    if args.config_file:
        with open(args.config_file) as f:
            # FIX: yaml.load without an explicit Loader is deprecated and can
            # execute arbitrary Python from a crafted config; safe_load only
            # constructs plain data types.
            experiments = yaml.safe_load(f)
    else:
        # Note: keep this in sync with tune/config_parser.py
        experiments = {
            args.experiment_name: {  # i.e. log to ~/ray_results/default
                "run": args.run,
                "checkpoint_freq": args.checkpoint_freq,
                "local_dir": args.local_dir,
                "resources_per_trial": (
                    args.resources_per_trial and
                    resources_to_json(args.resources_per_trial)),
                "stop": args.stop,
                "config": dict(args.config, env=args.env),
                "restore": args.restore,
                "num_samples": args.num_samples,
                "upload_dir": args.upload_dir,
            }
        }

    # NOTE(review): the constants below look pasted from Ray's defaults and
    # appear unused inside this function -- confirm before deleting.
    # The default maximum number of bytes to allocate to the object store
    # unless overridden by the user.
    DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES = 20 * 10**9
    # The smallest cap on the memory used by the object store that we allow.
    OBJECT_STORE_MINIMUM_MEMORY_BYTES = 10**7
    # The default maximum number of bytes that the non-primary Redis shards
    # are allowed to use unless overridden by the user.
    DEFAULT_REDIS_MAX_MEMORY_BYTES = 10**10
    # The smallest cap on the memory used by Redis that we allow.
    REDIS_MINIMUM_MEMORY_BYTES = 10**7

    def on_episode_end(info):
        # Record the relative capital gain for envs that expose `capital`.
        episode = info["episode"]
        env = info['env'].get_unwrapped()[0]
        if hasattr(env, 'capital'):
            capital_return = (
                env.capital - env.initial_funds) / env.initial_funds
            episode.custom_metrics['capital_return'] = capital_return

    # Install the callback on the (single) configured experiment.
    key = list(experiments.keys())[0]
    experiments[key]["config"]["callbacks"] = {
        "on_episode_end": tune.function(on_episode_end)
    }

    # Validate required fields before spinning anything up.
    for exp in experiments.values():
        if not exp.get("run"):
            parser.error("the following arguments are required: --run")
        if not exp.get("env") and not exp.get("config", {}).get("env"):
            parser.error("the following arguments are required: --env")

    if args.ray_num_nodes:
        # Emulate a multi-node cluster on this machine.
        # NOTE(review): "num_cpus"/"num_gpus" keys inside `resources` likely
        # define custom resources, not CPU/GPU slots -- verify against
        # Cluster.add_node.
        cluster = Cluster()
        for _ in range(args.ray_num_nodes):
            cluster.add_node(
                resources={
                    "num_cpus": args.ray_num_cpus or 1,
                    "num_gpus": args.ray_num_gpus or 0,
                },
                object_store_memory=args.ray_object_store_memory,
                redis_max_memory=args.ray_redis_max_memory)
        ray.init(redis_address=cluster.redis_address)
    else:
        print('init')
        ray.init(
            redis_address=args.redis_address,
            object_store_memory=int(0.5 * 10**9),
            redis_max_memory=int(0.5 * 10**9),
            num_cpus=args.ray_num_cpus,
            num_gpus=args.ray_num_gpus)

    run_experiments(
        experiments,
        scheduler=_make_scheduler(args),
        queue_trials=args.queue_trials,
        resume=args.resume)