def test_driver_lives_sequential(shutdown_only):
    """Kill the node's helper processes one after another.

    The test contains no assertions: it passes as long as none of the kill
    calls raises while the driver is connected.
    """
    ray.init(num_cpus=1)
    node = ray.worker._global_node

    # Take the processes down sequentially, in this fixed order.
    node.kill_raylet()
    node.kill_plasma_store()
    node.kill_log_monitor()
    node.kill_monitor()
    node.kill_raylet_monitor()
def testCachingReusables(self):
    """Check that reusable variables may be defined before the driver connects.

    Two reusables are registered before ``ray.init``; remote functions then
    read them. ``use_bar`` appends to ``bar`` and is expected to return
    ``[1]`` on both calls.
    """
    def foo_initializer():
        return 1

    def bar_initializer():
        return []

    def bar_reinitializer(bar):
        return []

    # Define the reusables before the driver is connected.
    ray.reusables.foo = ray.Reusable(foo_initializer)
    ray.reusables.bar = ray.Reusable(bar_initializer, bar_reinitializer)

    @ray.remote
    def use_foo():
        return ray.reusables.foo

    @ray.remote
    def use_bar():
        ray.reusables.bar.append(1)
        return ray.reusables.bar

    ray.init(start_ray_local=True, num_workers=2)
    try:
        self.assertEqual(ray.get(use_foo.remote()), 1)
        self.assertEqual(ray.get(use_foo.remote()), 1)
        self.assertEqual(ray.get(use_bar.remote()), [1])
        self.assertEqual(ray.get(use_bar.remote()), [1])
    finally:
        # Always tear Ray down, even when an assertion above fails, so a
        # failing test does not leave processes behind for the next one.
        ray.worker.cleanup()
def test_connect_with_disconnected_node(shutdown_only):
    """Check which node removals are reported as REMOVED_NODE_ERROR.

    Two nodes removed non-gracefully must each produce one error; a node
    removed gracefully must not produce a third. No RAYLET_CONNECTION_ERROR
    may be reported at the end.
    """
    internal_config = json.dumps({
        "num_heartbeats_timeout": 50,
        "heartbeat_timeout_milliseconds": 10,
    })
    cluster = Cluster()
    cluster.add_node(num_cpus=0, _internal_config=internal_config)
    ray.init(redis_address=cluster.redis_address)

    # Nothing has been removed yet, so no removal errors are expected.
    assert len(relevant_errors(ray_constants.REMOVED_NODE_ERROR)) == 0

    # Each of these nodes is killed by SIGKILL, so ray_monitor will mark it
    # dead; the error count grows by one per node.
    for expected_error_count in (1, 2):
        killed_node = cluster.add_node(
            num_cpus=0, _internal_config=internal_config)
        cluster.remove_node(killed_node, allow_graceful=False)
        wait_for_errors(
            ray_constants.REMOVED_NODE_ERROR, expected_error_count, timeout=2)

    # This node is killed by SIGTERM, ray_monitor will not mark it again, so
    # waiting for a third error has to time out.
    graceful_node = cluster.add_node(
        num_cpus=0, _internal_config=internal_config)
    cluster.remove_node(graceful_node, allow_graceful=True)
    with pytest.raises(Exception, match=('Timing out of wait.')):
        wait_for_errors(ray_constants.REMOVED_NODE_ERROR, 3, timeout=2)

    # There is no connection error to a dead node.
    assert len(relevant_errors(ray_constants.RAYLET_CONNECTION_ERROR)) == 0
def ray_start_object_store_memory():
    """Generator fixture: run Ray with one CPU and a 10**6-byte object store.

    Yields ``None``; Ray is shut down when the generator is resumed.
    """
    ray.init(num_cpus=1, object_store_memory=10**6)
    yield
    # Teardown: everything after the yield runs once the consumer is done.
    ray.shutdown()
def testPutGet(self):
    """Round-trip ints, floats, strings and lists through ray.put/ray.get.

    For each value family, 100 values (indexed by ``i`` in ``range(100)``)
    are stored and fetched, and the fetched value must equal the original.
    """
    ray.init(start_ray_local=True, num_workers=0)
    try:
        # One factory per value family; families are exercised in this order.
        value_factories = [
            lambda i: i * 10 ** 6,        # integers
            lambda i: i * 10 ** 6 * 1.0,  # floats
            lambda i: "h" * i,            # strings of growing length
            lambda i: [1] * i,            # lists of growing length
        ]
        for make_value in value_factories:
            for i in range(100):
                value_before = make_value(i)
                objectid = ray.put(value_before)
                value_after = ray.get(objectid)
                self.assertEqual(value_before, value_after)
    finally:
        # Always tear Ray down, even when an assertion above fails.
        ray.worker.cleanup()
def testWait(self):
    """Exercise ray.wait with default arguments, num_returns and timeout.

    The remote task sleeps for the requested delay and returns 1, so the
    expected ready/remaining split follows from the delays and the single
    worker started below.
    """
    ray.init(start_ray_local=True, num_workers=1)
    try:
        @ray.remote
        def f(delay):
            time.sleep(delay)
            return 1

        # Default call: exactly one object is returned as ready.
        objectids = [f.remote(1.0), f.remote(0.5), f.remote(0.5),
                     f.remote(0.5)]
        ready_ids, remaining_ids = ray.wait(objectids)
        self.assertEqual(len(ready_ids), 1)
        self.assertEqual(len(remaining_ids), 3)

        # Asking for all four returns them all, in submission order.
        ready_ids, remaining_ids = ray.wait(objectids, num_returns=4)
        self.assertEqual(ready_ids, objectids)
        self.assertEqual(remaining_ids, [])

        # With a 1.75s timeout only three of the 0.5s tasks can finish.
        objectids = [f.remote(0.5), f.remote(0.5), f.remote(0.5),
                     f.remote(0.5)]
        start_time = time.time()
        ready_ids, remaining_ids = ray.wait(objectids, timeout=1.75,
                                            num_returns=4)
        self.assertLess(time.time() - start_time, 2)
        self.assertEqual(len(ready_ids), 3)
        self.assertEqual(len(remaining_ids), 1)
        ray.wait(objectids)

        # A generous timeout must not delay the return of the first result.
        objectids = [f.remote(1.0), f.remote(0.5), f.remote(0.5),
                     f.remote(0.5)]
        start_time = time.time()
        ready_ids, remaining_ids = ray.wait(objectids, timeout=5)
        self.assertLess(time.time() - start_time, 5)
        self.assertEqual(len(ready_ids), 1)
        self.assertEqual(len(remaining_ids), 3)
    finally:
        # Always tear Ray down, even when an assertion above fails.
        ray.worker.cleanup()
def create_cluster(num_nodes):
    """Build a cluster of ``num_nodes`` nodes and connect the driver to it.

    Node ``k`` is given 100 units of a custom resource named ``str(k)`` and an
    object store of 10**9 bytes.

    Returns:
        The created ``Cluster``.
    """
    cluster = Cluster()
    for node_index in range(num_nodes):
        node_resources = {str(node_index): 100}
        cluster.add_node(
            resources=node_resources, object_store_memory=10**9)
    ray.init(redis_address=cluster.redis_address)
    return cluster
def testRecursiveObjects(self):
    """Check that ray.put raises for self-referential objects.

    Covers a list containing itself, an object that is its own field, two
    objects referencing each other, and a dict containing itself.
    """
    ray.init(start_ray_local=True, num_workers=0)
    try:
        class ClassA(object):
            pass

        ray.register_class(ClassA)

        # Make a list that contains itself.
        self_list = []
        self_list.append(self_list)

        # Make an object that contains itself as a field.
        a1 = ClassA()
        a1.field = a1

        # Make two objects that contain each other as fields.
        a2 = ClassA()
        a3 = ClassA()
        a2.field = a3
        a3.field = a2

        # Make a dictionary that contains itself.
        d1 = {}
        d1["key"] = d1

        # Check that exceptions are thrown when we serialize the recursive
        # objects. The callable and its argument are passed directly instead
        # of through a lambda that closes over the loop variable.
        recursive_objects = [self_list, a1, a2, a3, d1]
        for obj in recursive_objects:
            self.assertRaises(Exception, ray.put, obj)
    finally:
        # Always tear Ray down, even when an assertion above fails.
        ray.worker.cleanup()
def ray_start_driver_put_errors():
    """Generator fixture: run Ray with one CPU and a 10**9-byte object store.

    Yields the object store size so the consumer can size its objects against
    it; Ray is shut down when the generator is resumed.
    """
    object_store_bytes = 10**9
    ray.init(num_cpus=1, object_store_memory=object_store_bytes)
    yield object_store_bytes
    # Teardown: everything after the yield runs once the consumer is done.
    ray.shutdown()
def testFailImportingRemoteFunction(self):
    """Check that a failed remote-function import is reported in task_info.

    ``Foo`` reduces to a call of ``reducer``, which raises, so rebuilding the
    object from its pickled form fails with a known message. The test polls
    ``ray.task_info()`` until that failure is recorded.
    """
    ray.init(start_ray_local=True, num_workers=2,
             driver_mode=ray.SILENT_MODE)
    try:
        # This example is somewhat contrived. It should be successfully
        # pickled, and then it should throw an exception when it is
        # unpickled. This may depend a bit on the specifics of our pickler.
        def reducer(*args):
            raise Exception("There is a problem here.")

        class Foo(object):
            def __init__(self):
                self.__name__ = "Foo_object"
                self.func_doc = ""
                self.__globals__ = {}

            def __reduce__(self):
                return reducer, ()

            def __call__(self):
                return

        ray.remote(Foo())

        # Poll for up to ~10 seconds for the failure to be recorded.
        for _ in range(100):
            if len(ray.task_info()["failed_remote_function_imports"]) >= 1:
                break
            time.sleep(0.1)

        failed_imports = ray.task_info()["failed_remote_function_imports"]
        # assertIn reports both operands on failure, unlike assertTrue.
        self.assertIn("There is a problem here.",
                      failed_imports[0]["error_message"])
    finally:
        # Always tear Ray down, even when an assertion above fails.
        ray.worker.cleanup()
def init():
    """Generator fixture: start Ray (4 CPUs) and the async API.

    Debug mode is switched off on the current asyncio event loop before
    yielding. On resume, the async API is shut down first, then Ray.
    """
    ray.init(num_cpus=4)
    async_api.init()
    event_loop = asyncio.get_event_loop()
    event_loop.set_debug(False)
    yield
    # Teardown in reverse order of setup.
    async_api.shutdown()
    ray.shutdown()
def launch(self):
    """Run the experiment end to end; only the master node gets past setup.

    Registers the environment creator and obtains the Ray init config on
    every node. Non-master nodes return right after that. The master node
    starts the driver, runs the experiment, signals termination when other
    hosts exist, and finally saves the checkpoint and serving model.
    """
    self.register_env_creator()

    # All worker nodes will block at this step during training
    ray_init_kwargs = self.ray_init_config()
    if not self.is_master_node:
        return

    # Start the driver on master node
    ray.init(**ray_init_kwargs)

    experiment_config = self.customize_experiment_config(
        self.get_experiment_config())
    print("Running experiment with config %s" %
          json.dumps(experiment_config, indent=2))
    run_experiments(experiment_config)

    # Every host after the first one is treated as a worker.
    worker_host_names = self.get_all_host_names()[1:]
    # If distributed job, send TERMINATION_SIGNAL to all workers.
    if len(worker_host_names) > 0:
        self.sage_cluster_communicator.create_s3_signal(TERMINATION_SIGNAL)

    training_section = experiment_config["training"]
    algorithm = training_section["run"]
    training_config = training_section["config"]
    env_name = training_config["env"]
    self.save_checkpoint_and_serving_model(
        algorithm=algorithm, env_string=env_name, config=training_config)
def testMethods(self):
    """Compare remote array helpers (eye, zeros, qr) against numpy."""
    modules_to_reload = [
        ra.core, ra.random, ra.linalg, da.core, da.random, da.linalg
    ]
    for module in modules_to_reload:
        reload(module)
    ray.init()

    # test eye
    eye_result = ray.get(ra.eye.remote(3))
    assert_almost_equal(eye_result, np.eye(3))

    # test zeros
    zeros_result = ray.get(ra.zeros.remote([3, 4, 5]))
    assert_equal(zeros_result, np.zeros([3, 4, 5]))

    # test qr - pass by value
    matrix = np.random.normal(size=[10, 11])
    q_id, r_id = ra.linalg.qr.remote(matrix)
    q_factor = ray.get(q_id)
    r_factor = ray.get(r_id)
    assert_almost_equal(np.dot(q_factor, r_factor), matrix)

    # test qr - pass by objectid
    matrix_id = ra.random.normal.remote([10, 13])
    q_id, r_id = ra.linalg.qr.remote(matrix_id)
    matrix = ray.get(matrix_id)
    q_factor = ray.get(q_id)
    r_factor = ray.get(r_id)
    assert_almost_equal(np.dot(q_factor, r_factor), matrix)
def router():
    """Generator fixture: start a router with four registered test actors.

    Yields the router handle; Ray is shut down when the generator is resumed.
    """
    # We need at least 5 workers so resource won't be oversubscribed
    ray.init(num_cpus=5)

    # start_router is equivalent to doing these steps by hand:
    #
    #   handle = DeadlineAwareRouter.remote("DefaultTestRouter")
    #   ray.experimental.register_actor("DefaultTestRouter", handle)
    #   handle.start.remote()
    handle = start_router(DeadlineAwareRouter, "DefaultRouter")

    # (actor name, actor class, extra keyword arguments for register_actor)
    registrations = (
        ("VAdder", VectorizedAdder, {"init_kwargs": {"scaler_increment": 1}}),
        ("SAdder", ScalerAdder, {"init_kwargs": {"scaler_increment": 2}}),
        ("SleepFirst", SleepOnFirst, {"init_kwargs": {"sleep_time": 1}}),
        ("SleepCounter", SleepCounter, {"max_batch_size": 1}),
    )
    for actor_name, actor_class, extra_kwargs in registrations:
        handle.register_actor.remote(actor_name, actor_class, **extra_kwargs)

    yield handle
    ray.shutdown()
def testFailedTask(self):
    """Check that failing remote functions surface their error messages.

    Failures must show up both in the pushed "task" errors and as exceptions
    raised by ``ray.get`` on the results.
    """
    reload(test_functions)
    ray.init(num_workers=3, driver_mode=ray.SILENT_MODE)

    # Two fire-and-forget failures must produce two "task" errors.
    test_functions.throw_exception_fct1.remote()
    test_functions.throw_exception_fct1.remote()
    wait_for_errors(b"task", 2)
    self.assertEqual(len(relevant_errors(b"task")), 2)
    for task in relevant_errors(b"task"):
        self.assertIn(b"Test function 1 intentionally failed.",
                      task.get(b"message"))

    # ray.get should throw an exception. assertRaises fails the test with a
    # clear message if it does not, replacing the former
    # try/except/else + assertTrue(False) construct.
    x = test_functions.throw_exception_fct2.remote()
    with self.assertRaises(Exception) as raised:
        ray.get(x)
    self.assertIn("Test function 2 intentionally failed.",
                  str(raised.exception))

    # Every one of the three return values must raise on ray.get.
    x, y, z = test_functions.throw_exception_fct3.remote(1.0)
    for ref in [x, y, z]:
        with self.assertRaises(Exception) as raised:
            ray.get(ref)
        self.assertIn("Test function 3 intentionally failed.",
                      str(raised.exception))
def testFailedActorInit(self):
    """Check that actor constructor and method failures are both reported."""
    ray.init(num_workers=0, driver_mode=ray.SILENT_MODE)

    error_message1 = "actor constructor failed"
    error_message2 = "actor method failed"

    @ray.remote
    class FailedActor(object):
        def __init__(self):
            raise Exception(error_message1)

        def get_val(self):
            return 1

        def fail_method(self):
            raise Exception(error_message2)

    def reported_message(position):
        # Text of the error at ``position`` in a freshly fetched error list.
        return ray.error_info()[position][b"message"].decode("ascii")

    actor = FailedActor.remote()

    # Make sure that we get errors from a failed constructor.
    wait_for_errors(b"task", 1)
    self.assertEqual(len(ray.error_info()), 1)
    self.assertIn(error_message1, reported_message(0))

    # Make sure that we get errors from a failed method.
    actor.fail_method.remote()
    wait_for_errors(b"task", 2)
    self.assertEqual(len(ray.error_info()), 2)
    self.assertIn(error_message2, reported_message(1))
def run(args, parser):
    """Restore an agent from ``args.checkpoint`` and roll it out.

    The saved ``params.pkl`` is looked up next to the checkpoint and then in
    its parent directory. If found, it seeds the config; ``args.config`` is
    merged on top. When no file is found and ``args.config`` is empty, a
    ``ValueError`` is raised. ``num_workers`` is capped at 2, and the
    environment falls back to the config's ``env`` entry.
    """
    config = {}

    # Locate the saved parameters: checkpoint directory first, then parent.
    checkpoint_dir = os.path.dirname(args.checkpoint)
    params_path = os.path.join(checkpoint_dir, "params.pkl")
    if not os.path.exists(params_path):
        params_path = os.path.join(checkpoint_dir, "../params.pkl")

    if os.path.exists(params_path):
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only point this at checkpoints from a trusted source.
        with open(params_path, 'rb') as params_file:
            config = pickle.load(params_file)
    elif not args.config:
        raise ValueError(
            "Could not find params.pkl in either the checkpoint dir or "
            "its parent directory.")

    # Cap the worker count taken from the saved config.
    if "num_workers" in config:
        config["num_workers"] = min(2, config["num_workers"])
    config = merge_dicts(config, args.config)

    # Fall back to the environment named in the config.
    if not args.env:
        if not config.get("env"):
            parser.error("the following arguments are required: --env")
        args.env = config.get("env")

    ray.init()

    agent_class = get_agent_class(args.run)
    agent = agent_class(env=args.env, config=config)
    agent.restore(args.checkpoint)
    rollout(agent, args.env, int(args.steps), args.out, args.no_render)
def test_dying_worker_wait(shutdown_only):
    """Kill a worker blocked in ray.wait and check every process survives."""
    ray.init(num_cpus=2)

    # Short pause used between steps to let the previous action take effect.
    settle_seconds = 0.1

    @ray.remote
    def sleep_forever():
        time.sleep(10**6)

    @ray.remote
    def get_pid():
        return os.getpid()

    pending_id = sleep_forever.remote()
    # Get the PID of the worker that block_in_wait will run on (sleep a little
    # to make sure that sleep_forever has already started).
    time.sleep(settle_seconds)
    worker_pid = ray.get(get_pid.remote())

    @ray.remote
    def block_in_wait(object_id_in_list):
        ray.wait(object_id_in_list)

    # Have the worker wait in a wait call.
    block_in_wait.remote([pending_id])
    time.sleep(settle_seconds)

    # Kill the worker.
    os.kill(worker_pid, signal.SIGKILL)
    time.sleep(settle_seconds)

    # Create the object.
    ray.worker.global_worker.put_object(pending_id, 1)
    time.sleep(settle_seconds)

    # Make sure that nothing has died.
    assert ray.services.all_processes_alive()
def testNetworksIndependent(self):
    """Check that two NetActor instances hold independent weights."""
    # Note we use only one worker to ensure that all of the remote
    # functions run on the same worker.
    ray.init(num_workers=1)
    first_net = NetActor()
    second_net = NetActor()

    # Make sure the two networks have different weights. TODO(rkn): Note
    # that equality comparisons of numpy arrays normally does not work.
    # This only works because at the moment they have size 1.
    first_weights = first_net.get_weights()
    second_weights = second_net.get_weights()
    self.assertNotEqual(first_weights, second_weights)

    # Set the weights and get the weights, and make sure they are
    # unchanged.
    echoed_first = first_net.set_and_get_weights(first_weights)
    echoed_second = second_net.set_and_get_weights(second_weights)
    self.assertEqual(first_weights, echoed_first)
    self.assertEqual(second_weights, echoed_second)

    # Swap the weights.
    echoed_first = second_net.set_and_get_weights(first_weights)
    echoed_second = first_net.set_and_get_weights(second_weights)
    self.assertEqual(first_weights, echoed_first)
    self.assertEqual(second_weights, echoed_second)
def ray_start_reconstruction(request):
    """Generator fixture: start a ``request.param``-node cluster.

    The total object store budget is split evenly across the nodes and every
    node uses a 200 ms initial reconstruction timeout.

    Yields:
        ``(plasma_store_memory, num_nodes, cluster)``.
    """
    num_nodes = request.param
    plasma_store_memory = int(0.5 * 10**9)
    # Each node gets an equal share of the total object store budget.
    per_node_memory = plasma_store_memory // num_nodes
    reconstruction_config = json.dumps({
        "initial_reconstruction_timeout_milliseconds": 200
    })

    cluster = Cluster(
        initialize_head=True,
        head_node_args={
            "num_cpus": 1,
            "object_store_memory": per_node_memory,
            "redis_max_memory": 10**7,
            "_internal_config": reconstruction_config,
        })
    # The head node counts as one of the requested nodes.
    for _ in range(num_nodes - 1):
        cluster.add_node(
            num_cpus=1,
            object_store_memory=per_node_memory,
            _internal_config=reconstruction_config)
    ray.init(redis_address=cluster.redis_address)

    yield plasma_store_memory, num_nodes, cluster

    # Clean up the Ray cluster.
    ray.shutdown()
    cluster.shutdown()
def driver_0(redis_address, driver_index):
    """The script for driver 0.

    This driver should create five actors that each use one GPU and some
    actors that use no GPUs. After a while, it should exit.
    """
    ray.init(redis_address=redis_address)

    # Wait for all the nodes to join the cluster.
    _wait_for_nodes_to_join(total_num_nodes)

    # Start some long running task. Driver 2 will make sure the worker running
    # this task has been killed.
    for task_index in range(num_long_running_tasks_per_driver):
        long_running_task.remote(driver_index, task_index, redis_address)

    # Create some actors that require one GPU (actor indices 0-4).
    actors_one_gpu = [
        Actor1.remote(driver_index, actor_index, redis_address)
        for actor_index in range(5)
    ]
    # Create some actors that don't require any GPUs (actor indices 5-9).
    actors_no_gpus = [
        Actor0.remote(driver_index, actor_index, redis_address)
        for actor_index in range(5, 10)
    ]

    # Repeatedly have every actor run its id check.
    for _ in range(1000):
        for actor_group in (actors_one_gpu, actors_no_gpus):
            ray.get([actor.check_ids.remote() for actor in actor_group])

    # Start a long-running method on one actor and make sure this doesn't
    # affect anything.
    actors_no_gpus[0].long_running_method.remote()

    _broadcast_event("DRIVER_0_DONE", redis_address)
def testFailImportingActor(self):
    """Check the errors reported when an actor class cannot be imported.

    The actor closes over a module that exists only as a temporary file on
    the driver's ``sys.path``. The test expects a "register_actor" error
    mentioning "No module named", followed by "task" errors for the
    constructor and for a later method call.
    """
    ray.init(num_workers=2, driver_mode=ray.SILENT_MODE)

    # Create the contents of a temporary Python file.
    temporary_python_file = """
def temporary_helper_function():
    return 1
"""

    # The context manager deletes the file even when an assertion fails.
    with tempfile.NamedTemporaryFile(suffix=".py") as f:
        f.write(temporary_python_file.encode("ascii"))
        f.flush()
        directory = os.path.dirname(f.name)
        # Get the module name and strip ".py" from the end.
        module_name = os.path.basename(f.name)[:-3]
        sys.path.append(directory)
        try:
            module = __import__(module_name)

            # Define an actor that closes over this temporary module. This
            # should fail when it is unpickled.
            @ray.remote
            class Foo(object):
                def __init__(self):
                    # Call the helper that the temporary module actually
                    # defines (the previous attribute name did not exist).
                    self.x = module.temporary_helper_function()

                def get_val(self):
                    return 1

            # There should be no errors yet.
            self.assertEqual(len(ray.error_info()), 0)

            # Create an actor.
            foo = Foo.remote()

            # Wait for the error to arrive.
            wait_for_errors(b"register_actor", 1)
            self.assertIn(b"No module named",
                          ray.error_info()[0][b"message"])

            # Wait for the error from when the __init__ tries to run.
            wait_for_errors(b"task", 1)
            self.assertIn(
                b"failed to be imported, and so cannot execute this method",
                ray.error_info()[1][b"message"])

            # Check that if we try to get the function it throws an
            # exception and does not hang.
            with self.assertRaises(Exception):
                ray.get(foo.get_val.remote())

            # Wait for the error from when the call to get_val.
            wait_for_errors(b"task", 2)
            self.assertIn(
                b"failed to be imported, and so cannot execute this method",
                ray.error_info()[2][b"message"])
        finally:
            # Clean up the junk we added to sys.path, even on failure.
            sys.path.pop(-1)
def testCustomModel(self):
    """Check that a registered custom model is what ModelCatalog returns."""
    ray.init()
    ModelCatalog.register_custom_model("foo", CustomModel)

    input_dict = {"obs": tf.constant([1, 2, 3])}
    obs_space = Box(0, 1, shape=(3, ), dtype=np.float32)
    action_space = Discrete(5)
    model_options = {"custom_model": "foo"}
    model = ModelCatalog.get_model(
        input_dict, obs_space, action_space, 5, model_options)

    self.assertEqual(str(type(model)), str(CustomModel))
def test_using_hostnames(ray_start_head_local):
    """Connect using hostnames instead of IP addresses and run one task."""
    ray.init(node_ip_address="localhost", redis_address="localhost:6379")

    @ray.remote
    def f():
        return 1

    result = ray.get(f.remote())
    assert result == 1
def testAdditionalVariablesWithLoss(self):
    """Check variable/placeholder counts of LossActor and a weight round-trip.

    Three variables and three placeholders are expected; the weights are then
    read and written straight back.
    """
    ray.init(num_workers=1)
    net = LossActor()

    self.assertEqual(len(net.values[0].variables.items()), 3)
    self.assertEqual(len(net.values[0].placeholders.items()), 3)

    # Writing back the weights just read must be accepted.
    current_weights = net.values[0].get_weights()
    net.values[0].set_weights(current_weights)
def initialize(self, env, policy, pool):
    """Initialise the sampler and create its remote environment.

    After the parent-class initialisation, Ray is started and ``env`` and
    ``policy`` are pickled and handed to a new ``_RemoteEnv``, stored on
    ``self._remote_environment``.
    """
    super(RemoteSampler, self).initialize(env, policy, pool)

    ray.init()

    # Pass env and policy to the remote side in pickled form.
    pickled_env = pickle.dumps(env)
    pickled_policy = pickle.dumps(policy)
    self._remote_environment = _RemoteEnv.remote(pickled_env, pickled_policy)
def testVariableNameCollision(self):
    """Check that one NetActor's weights can be loaded into another."""
    ray.init(num_workers=2)
    target_net = NetActor()
    source_net = NetActor()

    # This is checking that the variable names of the two nets are the
    # same, i.e., that the names in the weight dictionaries are the same.
    source_weights = source_net.values[0].get_weights()
    target_net.values[0].set_weights(source_weights)
def test_temp_plasma_store_socket():
    """Check that Ray creates the plasma store socket at the requested path.

    Ray is always shut down and the socket file removed afterwards, including
    when the existence check fails.
    """
    socket_path = "/tmp/i_am_a_temp_socket"
    ray.init(plasma_store_socket_name=socket_path)
    try:
        assert os.path.exists(
            socket_path), "Specified socket path not found."
    finally:
        ray.shutdown()
        try:
            os.remove(socket_path)
        except OSError:
            # Best-effort cleanup: the socket may already be gone. Only
            # filesystem errors are ignored; anything else propagates.
            pass
def testVersionMismatch(self):
    """Check that a driver with a different version triggers an error.

    ``ray.__version__`` is overwritten for the duration of the test and
    restored afterwards, including when init or the wait raises, so the fake
    version cannot leak into later tests.
    """
    ray_version = ray.__version__
    ray.__version__ = "fake ray version"
    try:
        ray.init(num_workers=1, driver_mode=ray.SILENT_MODE)
        wait_for_errors(b"version_mismatch", 1)
    finally:
        # Restore the real version no matter how the body exits.
        ray.__version__ = ray_version
def ray_start():
    """Generator fixture: run a single-CPU Ray instance around the consumer.

    Yields ``None``; Ray is shut down when the generator is resumed.
    """
    ray.init(num_cpus=1)
    # Hand control to the code using this fixture.
    yield
    ray.shutdown()
def test_capture_child_tasks(ray_start_cluster, connect_to_client):
    """Check when child tasks inherit their parent's placement group.

    Three cases are covered:
      * capture enabled: every child reports a placement group;
      * capture enabled but the child passes ``placement_group=None``: no
        child reports one;
      * capture not enabled: no child reports one.
    """
    cluster = ray_start_cluster
    total_num_tasks = 4
    for _ in range(2):
        cluster.add_node(num_cpus=total_num_tasks, num_gpus=total_num_tasks)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        pg = ray.util.placement_group(
            [{
                "CPU": 2,
                "GPU": 2,
            }, {
                "CPU": 2,
                "GPU": 2,
            }],
            strategy="STRICT_PACK")
        ray.get(pg.ready())

        # If get_current_placement_group is used when the current
        # worker/driver doesn't belong to any of placement group, it should
        # return None.
        assert get_current_placement_group() is None

        # Test if tasks capture child tasks.
        @ray.remote
        def task():
            return get_current_placement_group()

        @ray.remote
        def create_nested_task(child_cpu, child_gpu, set_none=False):
            assert get_current_placement_group() is not None
            kwargs = {
                "num_cpus": child_cpu,
                "num_gpus": child_gpu,
            }
            if set_none:
                # Explicitly opt the children out of the placement group.
                kwargs["placement_group"] = None
            return ray.get(
                [task.options(**kwargs).remote() for _ in range(3)])

        t = create_nested_task.options(
            num_cpus=1,
            num_gpus=0,
            placement_group=pg,
            placement_group_capture_child_tasks=True).remote(1, 0)
        pgs = ray.get(t)
        # Every task should have current placement group because they
        # should be implicitly captured by default.
        assert None not in pgs

        t1 = create_nested_task.options(
            num_cpus=1,
            num_gpus=0,
            placement_group=pg,
            placement_group_capture_child_tasks=True).remote(1, 0, True)
        pgs = ray.get(t1)
        # Every task should have no placement group since it's set to None.
        assert set(pgs) == {None}

        # Test if tasks don't capture child tasks when the option is off.
        t2 = create_nested_task.options(
            num_cpus=0, num_gpus=1, placement_group=pg).remote(0, 1)
        pgs = ray.get(t2)
        # All placement groups should be None since we don't capture child
        # tasks. `not all(pgs)` would pass as soon as one entry is None, so
        # use the same strict check as the opt-out case above.
        assert set(pgs) == {None}
def test_automatic_cleanup_job(ray_start_cluster):
    """Check that a finished job's placement groups release their CPUs.

    A second driver creates placement groups from the job itself, from a
    task and from an actor, then exits. Once a job is reported dead, the
    cluster's available CPUs must return to the full amount.
    """
    # Make sure the placement groups created by a
    # job, actor, and task are cleaned when the job is done.
    cluster = ray_start_cluster
    num_nodes = 3
    num_cpu_per_node = 4
    total_cpus = num_nodes * num_cpu_per_node

    # Create 3 nodes cluster.
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=num_cpu_per_node)
    cluster.wait_for_nodes()

    info = ray.init(address=cluster.address)
    assert ray.available_resources()["CPU"] == total_cpus

    driver_code = f"""
import ray

ray.init(address="{info["redis_address"]}")

def create_pg():
    pg = ray.util.placement_group(
            [{{"CPU": 1}} for _ in range(3)],
            strategy="STRICT_SPREAD")
    ray.get(pg.ready())
    return pg

@ray.remote(num_cpus=0)
def f():
    create_pg()

@ray.remote(num_cpus=0)
class A:
    def create_pg(self):
        create_pg()

ray.get(f.remote())
a = A.remote()
ray.get(a.create_pg.remote())
# Create 2 pgs to make sure multiple placement groups that belong
# to a single job will be properly cleaned.
create_pg()
create_pg()

ray.shutdown()
    """

    run_string_as_driver(driver_code)

    # Wait until the driver is reported as dead by GCS.
    def is_job_done():
        return any(job["IsDead"] for job in ray.state.jobs())

    def assert_num_cpus(expected_num_cpus):
        # The "CPU" key is absent from the report when no CPU is available.
        if expected_num_cpus == 0:
            return "CPU" not in ray.available_resources()
        return ray.available_resources()["CPU"] == expected_num_cpus

    wait_for_condition(is_job_done)
    # The previous unguarded, unused read of available_resources()["CPU"]
    # was dropped here: its value was never used and it could raise KeyError
    # in the case assert_num_cpus guards against.
    wait_for_condition(lambda: assert_num_cpus(total_cpus))
def test_int_dataframe():
    """Run the shared DataFrame checks against an all-integer frame.

    A five-column integer pandas frame is converted with
    ``rdf.from_pandas(pandas_df, 2)`` and both frames are passed through the
    module's check helpers in a fixed order.
    """
    ray.init()

    pandas_df = pd.DataFrame({
        'col1': [0, 1, 2, 3],
        'col2': [4, 5, 6, 7],
        'col3': [8, 9, 10, 11],
        'col4': [12, 13, 14, 15],
        'col5': [0, 0, 0, 0],
    })
    ray_df = rdf.from_pandas(pandas_df, 2)

    testfuncs = [
        lambda x: x + 1,
        lambda x: str(x),
        lambda x: x * x,
        lambda x: x,
        lambda x: False,
    ]
    keys = ['col1', 'col2', 'col3', 'col4']

    # Checks taking (ray_df, pandas_df), run before the applymap checks.
    structural_checks = (
        test_roundtrip, test_index, test_size, test_ndim, test_ftypes,
        test_values, test_axes, test_shape, test_add_prefix, test_add_suffix,
    )
    for check in structural_checks:
        check(ray_df, pandas_df)

    for testfunc in testfuncs:
        test_applymap(ray_df, pandas_df, testfunc)

    # test_copy only takes the ray frame.
    test_copy(ray_df)

    # Checks taking (ray_df, pandas_df), run before the per-key get checks.
    behaviour_checks = (
        test_sum, test_abs, test_keys, test_transpose, test_round, test_all,
        test_any, test___getitem__, test___delitem__, test___copy__,
        test___deepcopy__, test_bool, test_count, test_head, test_tail,
        test_idxmax, test_idxmin, test_pop,
    )
    for check in behaviour_checks:
        check(ray_df, pandas_df)

    for key in keys:
        test_get(ray_df, pandas_df, key)

    # Remaining checks taking (ray_df, pandas_df).
    summary_checks = (
        test_get_dtype_counts, test_get_ftype_counts, test_max, test_min,
        test_notna, test_notnull,
    )
    for check in summary_checks:
        check(ray_df, pandas_df)
def ray_start_single_node():
    """Generator fixture: start Ray with ``num_cpus=8``.

    Yields the value returned by ``ray.init``; Ray is shut down when the
    generator is resumed.
    """
    # NOTE(review): an earlier comment here asked for "a cluster with 2 cpus",
    # which does not match the 8 CPUs requested below; confirm the intent.
    yield ray.init(num_cpus=8)
    ray.shutdown()
def test_mini_integration(ray_start_cluster, connect_to_client):
    """Reserve GPUs through placement groups, use them, and free them again.

    Tasks are run in every bundle of every placement group, the groups are
    removed in a shuffled order, and finally one actor per node that needs a
    node's full CPUs and GPUs is scheduled to show everything was returned.
    """
    # Create bundles as many as number of gpus in the cluster.
    # Do some random work and make sure all resources are properly recovered.
    cluster = ray_start_cluster

    num_nodes = 5
    per_bundle_gpus = 2
    gpu_per_node = 4
    total_gpus = num_nodes * per_bundle_gpus * gpu_per_node
    per_node_gpus = per_bundle_gpus * gpu_per_node

    bundles_per_pg = 2
    total_num_pg = total_gpus // (bundles_per_pg * per_bundle_gpus)

    for _ in range(num_nodes):
        cluster.add_node(num_cpus=2, num_gpus=per_bundle_gpus * gpu_per_node)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):

        @ray.remote(num_cpus=0, num_gpus=1)
        def random_tasks():
            import time
            import random
            sleep_time = random.uniform(0.1, 0.2)
            time.sleep(sleep_time)
            return True

        # Across all groups, the bundles reserve
        # bundles_per_pg * total_num_pg * per_bundle_gpus GPUs.
        pgs = [
            ray.util.placement_group(
                name=f"name{index}",
                strategy="PACK",
                bundles=[{
                    "GPU": per_bundle_gpus
                } for _ in range(bundles_per_pg)])
            for index in range(total_num_pg)
        ]

        # Schedule tasks: one per bundle of every placement group.
        pg_tasks = [[
            random_tasks.options(
                placement_group=pg,
                placement_group_bundle_index=bundle_index).remote()
            for bundle_index in range(bundles_per_pg)
        ] for pg in pgs]

        # Make sure tasks are done and we remove placement groups, visiting
        # the groups in this fixed shuffled order.
        pg_indexes = [2, 3, 1, 7, 8, 9, 0, 6, 4, 5]
        for index in pg_indexes[:total_num_pg]:
            assert all(ray.get(pg_tasks[index]))
            ray.util.remove_placement_group(pgs[index])

        @ray.remote(num_cpus=2, num_gpus=per_node_gpus)
        class A:
            def ping(self):
                return True

        # Make sure all resources are properly returned by scheduling
        # actors that take up all existing resources.
        actors = [A.remote() for _ in range(num_nodes)]
        assert all(ray.get([a.ping.remote() for a in actors]))
self.register_variables(self.base_model.variables) @override(ModelV2) def forward(self, input_dict, state, seq_lens): out, self._value_out = self.base_model( [input_dict["obs"], input_dict["is_training"]]) return out, [] @override(ModelV2) def value_function(self): return tf.reshape(self._value_out, [-1]) if __name__ == "__main__": args = parser.parse_args() ray.init() ModelCatalog.register_custom_model("bn_model", BatchNormModel) config = { "env": "Pendulum-v0" if args.run == "DDPG" else "CartPole-v0", "model": { "custom_model": "bn_model", }, "num_workers": 0, } from ray.rllib.agents.ppo import PPOTrainer trainer = PPOTrainer(config=config) trainer.train()
import unittest

import ray
from ray.rllib.evaluation.rollout_worker import RolloutWorker
from ray.rllib.tests.test_rollout_worker import MockPolicy

# NOTE(review): `gym` and `time` are used below but are not imported in the
# visible part of this file; confirm they are imported elsewhere.


class TestPerf(unittest.TestCase):
    """Throughput smoke test for RolloutWorker sampling."""

    # Tested on Intel(R) Core(TM) i7-4600U CPU @ 2.10GHz
    # 11/23/18: Samples per second 8501.125113727468
    # 03/01/19: Samples per second 8610.164353268685
    def testBaselinePerformance(self):
        """Print samples/second for 20 fresh workers, sampling ~1s each."""
        for _ in range(20):
            worker = RolloutWorker(
                env_creator=lambda _: gym.make("CartPole-v0"),
                policy=MockPolicy,
                batch_steps=100)

            started_at = time.time()
            sample_count = 0
            # Keep sampling until roughly one second has elapsed.
            while time.time() - started_at < 1:
                sample_count += worker.sample().count

            print()
            print("Samples per second {}".format(
                sample_count / (time.time() - started_at)))
            print()


if __name__ == "__main__":
    ray.init(num_cpus=5)
    unittest.main(verbosity=2)
def __init__(self, game_name, config=None, split_resources_in=1):
    """Load a game and its config, start Ray, and prepare the initial state.

    Args:
        game_name: Name of a module inside the ``games`` package. It must
            provide ``Game`` and ``MuZeroConfig``.
        config: Optional overrides. A mapping (``dict`` or a subclass) is
            applied attribute by attribute on top of the game's default
            config; any other truthy value replaces the config object.
        split_resources_in: Divisor applied to the total GPU count to obtain
            ``self.num_gpus``.

    Raises:
        ModuleNotFoundError: If ``games.<game_name>`` cannot be imported.
        ValueError: If GPU use is requested while ``max_num_gpus`` is 0.
    """
    # Load the game and the config from the module with the game name
    try:
        game_module = importlib.import_module("games." + game_name)
        self.Game = game_module.Game
        self.config = game_module.MuZeroConfig()
    except ModuleNotFoundError:
        print(
            f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.'
        )
        # Bare raise re-raises the active exception with its traceback.
        raise

    # Overwrite the config
    if config:
        # isinstance also accepts dict subclasses (e.g. OrderedDict), which
        # the former exact-type check installed as the config object itself.
        if isinstance(config, dict):
            for param, value in config.items():
                setattr(self.config, param, value)
        else:
            self.config = config

    # Fix random generator seed
    numpy.random.seed(self.config.seed)
    torch.manual_seed(self.config.seed)

    # Manage GPUs
    gpu_requested = (self.config.selfplay_on_gpu or self.config.train_on_gpu
                     or self.config.reanalyse_on_gpu)
    if self.config.max_num_gpus == 0 and gpu_requested:
        raise ValueError(
            "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu."
        )
    if gpu_requested:
        # An unset max_num_gpus means "use every GPU torch can see".
        total_gpus = (self.config.max_num_gpus
                      if self.config.max_num_gpus is not None else
                      torch.cuda.device_count())
    else:
        total_gpus = 0
    self.num_gpus = total_gpus / split_resources_in
    # Shares above one GPU are rounded down; shares of at most one GPU are
    # kept as they are (possibly fractional).
    if 1 < self.num_gpus:
        self.num_gpus = math.floor(self.num_gpus)

    ray.init(num_gpus=total_gpus, ignore_reinit_error=True)

    # Checkpoint and replay buffer used to initialize workers
    self.checkpoint = {
        "weights": None,
        "optimizer_state": None,
        "total_reward": 0,
        "muzero_reward": 0,
        "opponent_reward": 0,
        "episode_length": 0,
        "mean_value": 0,
        "training_step": 0,
        "lr": 0,
        "total_loss": 0,
        "value_loss": 0,
        "reward_loss": 0,
        "policy_loss": 0,
        "num_played_games": 0,
        "num_played_steps": 0,
        "num_reanalysed_games": 0,
        "terminate": False,
    }
    self.replay_buffer = {}

    # Obtain the initial weights and summary from a CPUActor.
    cpu_actor = CPUActor.remote()
    cpu_weights = cpu_actor.get_initial_weights.remote(self.config)
    self.checkpoint["weights"], self.summary = copy.deepcopy(
        ray.get(cpu_weights))

    # Workers
    self.self_play_workers = None
    self.test_worker = None
    self.training_worker = None
    self.reanalyse_worker = None
    self.replay_buffer_worker = None
    self.shared_storage_worker = None
def ray_start_single_node_2_gpus():
    """Generator fixture: start Ray with ``num_gpus=2``.

    Yields the value returned by ``ray.init``; Ray is shut down when the
    generator is resumed.
    """
    # Please start this fixture in a cluster with 2 GPUs.
    yield ray.init(num_gpus=2)
    ray.shutdown()
def setUpClass(cls) -> None:
    """Start Ray with default settings before the tests of this class run."""
    # NOTE(review): presumably decorated with @classmethod on the line above,
    # which is not visible here; confirm.
    ray.init()
def main(num_actors, env_name="CartPole-v0", batch_size=16, update_iter=16,
         gamma=0.97, eta=0.9, alpha=0.9, burnin_length=4, unroll_length=4):
    """Run the distributed actor/learner training loop and plot test scores.

    Actors roll out segments into a prioritised replay buffer while a single
    learner consumes minibatches. Every fifth learner cycle the result of a
    test play is collected and recorded. TensorBoard summaries and the final
    score plot are written to the ``log`` directory next to this file.

    Args:
        num_actors: Number of rollout actors to start.
        env_name: Environment name handed to actors, learner and tester.
        batch_size: Size of each sampled minibatch.
        update_iter: Number of minibatches sent to the learner per update.
        gamma, eta, alpha, burnin_length, unroll_length: Passed through
            unchanged to the actors and the learner.
    """
    s = time.time()

    ray.init(local_mode=False)

    # Start from an empty log directory.
    logdir = Path(__file__).parent / "log"
    if logdir.exists():
        shutil.rmtree(logdir)
    summary_writer = tf.summary.create_file_writer(str(logdir))

    history = []

    # One exploration rate per actor, spread over [0.1, 0.8].
    epsilons = np.linspace(0.1, 0.8, num_actors) if num_actors > 1 else [0.3]
    actors = [
        Actor.remote(pid=i, env_name=env_name, epsilon=epsilons[i],
                     gamma=gamma, eta=eta, alpha=alpha,
                     burnin_length=burnin_length,
                     unroll_length=unroll_length)
        for i in range(num_actors)
    ]

    replay = SegmentReplayBuffer(buffer_size=2**12)

    learner = Learner.remote(env_name=env_name, gamma=gamma, eta=eta,
                             alpha=alpha, burnin_length=burnin_length,
                             unroll_length=unroll_length)
    current_weights = ray.get(learner.define_network.remote())
    # Store the weights once so every actor receives the same reference.
    current_weights = ray.put(current_weights)

    tester = Tester.remote(env_name=env_name)

    wip_actors = [
        actor.sync_weights_and_rollout.remote(current_weights)
        for actor in actors
    ]

    # Warm-up: collect ten rollouts before the learner starts.
    for _ in range(10):
        finished, wip_actors = ray.wait(wip_actors, num_returns=1)
        priorities, segments, pid = ray.get(finished[0])
        replay.add(priorities, segments)
        wip_actors.extend(
            [actors[pid].sync_weights_and_rollout.remote(current_weights)])

    # minibatchs: (indices, weights, segments)
    minibatchs = [
        replay.sample_minibatch(batch_size=batch_size)
        for _ in range(update_iter)
    ]
    wip_learner = learner.update_network.remote(minibatchs)
    wip_tester = tester.test_play.remote(current_weights, epsilon=0.05)

    # Prepare the next set of minibatches while the learner is busy.
    minibatchs = [
        replay.sample_minibatch(batch_size=batch_size)
        for _ in range(update_iter)
    ]

    learner_cycles = 1
    actor_cycles = 0
    n_segment_added = 0

    print("Start learning!!")
    while learner_cycles <= 50:
        actor_cycles += 1
        finished, wip_actors = ray.wait(wip_actors, num_returns=1)
        priorities, segments, pid = ray.get(finished[0])
        replay.add(priorities, segments)
        wip_actors.extend(
            [actors[pid].sync_weights_and_rollout.remote(current_weights)])
        n_segment_added += len(segments)

        # Non-blocking check whether the learner has finished its update.
        finished_learner, _ = ray.wait([wip_learner], timeout=0)
        if finished_learner:
            current_weights, indices, priorities, loss = ray.get(
                finished_learner[0])
            wip_learner = learner.update_network.remote(minibatchs)
            current_weights = ray.put(current_weights)

            # Assumption: updating priorities and building minibatches is
            # much faster than one learner update.
            replay.update_priority(indices, priorities)
            minibatchs = [
                replay.sample_minibatch(batch_size=batch_size)
                for _ in range(update_iter)
            ]

            print("Actor cycle:", actor_cycles, "Added:", n_segment_added)

            learner_cycles += 1
            actor_cycles = 0
            n_segment_added = 0

            with summary_writer.as_default():
                tf.summary.scalar("learner_loss", loss, step=learner_cycles)

            if learner_cycles % 5 == 0:
                test_rewards = ray.get(wip_tester)
                # The collected test play was started five cycles earlier.
                history.append((learner_cycles - 5, test_rewards))
                wip_tester = tester.test_play.remote(
                    current_weights, epsilon=0.05)
                print("Cycle:", learner_cycles, "Score:", test_rewards)

                with summary_writer.as_default():
                    tf.summary.scalar(
                        "test_rewards", test_rewards, step=learner_cycles)
                    tf.summary.scalar(
                        "buffer_size", len(replay), step=learner_cycles)

    wallclocktime = round(time.time() - s, 2)
    cycles, scores = zip(*history)
    plt.plot(cycles, scores)
    plt.title(f"total time: {wallclocktime} sec")
    plt.ylabel("test_score(epsilon=0.1)")
    # Save next to the TensorBoard logs. The former relative "log/..." path
    # depended on the current working directory being this file's directory.
    logdir.mkdir(parents=True, exist_ok=True)
    plt.savefig(str(logdir / "history.png"))
arr = np.ones(OBJECT_SIZE, dtype=np.uint8) ref = ray.put(arr) for actor in tqdm(actors, desc="Ensure all actors have started."): ray.get(actor.foo.remote()) result_refs = [] for actor in tqdm(actors, desc="Broadcasting objects"): result_refs.append(actor.sum.remote(ref)) results = ray.get(result_refs) for result in results: assert result == OBJECT_SIZE ray.init(address="auto") start = perf_counter() test_object_broadcast() end = perf_counter() print(f"Broadcast time: {end - start} ({OBJECT_SIZE} B x {NUM_NODES} nodes)") if "TEST_OUTPUT_JSON" in os.environ: out_file = open(os.environ["TEST_OUTPUT_JSON"], "w") results = { "broadcast_time": end - start, "object_size": OBJECT_SIZE, "num_nodes": NUM_NODES, "success": "1" } json.dump(results, out_file)
def test_capture_child_actors(ray_start_cluster, connect_to_client):
    """Check when nested actors inherit their parent's placement group.

    A two-node cluster hosts one STRICT_PACK placement group. Three
    scenarios are exercised in order:

    1. Parent created with ``placement_group_capture_child_tasks=True``:
       every alive actor is expected on a single node.
    2. Parent created in the group without the capture flag: alive actors
       are expected on two nodes.
    3. Children created with ``placement_group=None``: alive actors are
       expected on two nodes.

    Args:
        ray_start_cluster: Cluster object to which the nodes are added.
        connect_to_client: Forwarded to ``connect_to_client_or_not``.
    """
    cluster = ray_start_cluster
    total_num_actors = 4
    for _ in range(2):
        cluster.add_node(num_cpus=total_num_actors)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        bundles = [{"CPU": 2}, {"CPU": 2}]
        pg = ray.util.placement_group(bundles, strategy="STRICT_PACK")
        ray.get(pg.ready())

        # The driver does not belong to any placement group, so the lookup
        # is expected to return None here.
        assert get_current_placement_group() is None

        @ray.remote(num_cpus=1)
        class NestedActor:
            def ready(self):
                return True

        @ray.remote(num_cpus=1)
        class Actor:
            def __init__(self):
                # Keep handles to the children that were spawned.
                self.actors = []

            def ready(self):
                return True

            def schedule_nested_actor(self):
                # The parent must be able to see its own placement group.
                assert get_current_placement_group() is not None
                # No explicit options: the child is captured implicitly.
                child = NestedActor.remote()
                ray.get(child.ready.remote())
                self.actors.append(child)

            def schedule_nested_actor_outside_pg(self):
                # Explicitly opt the child out of any placement group.
                child = NestedActor.options(placement_group=None).remote()
                ray.get(child.ready.remote())
                self.actors.append(child)

        def alive_actor_node_ids():
            # Node IDs of every actor currently reported as ALIVE.
            return {
                info["Address"]["NodeID"]
                for info in ray.state.actors().values()
                if info["State"] == convert_actor_state(
                    gcs_utils.ActorTableData.ALIVE)
            }

        # --- Scenario 1: children are captured into the placement group.
        a = Actor.options(
            placement_group=pg,
            placement_group_capture_child_tasks=True).remote()
        ray.get(a.ready.remote())
        # 1 top level actor + 3 children.
        for _ in range(total_num_actors - 1):
            ray.get(a.schedule_nested_actor.remote())
        # STRICT_PACK puts the whole group on one node, so all alive actors
        # should report the same node ID.
        assert len(alive_actor_node_ids()) == 1

        # Kill the parent and wait until the failure is observable.
        kill_actor_and_wait_for_failure(a)
        with pytest.raises(ray.exceptions.RayActorError):
            ray.get(a.ready.remote())

        # --- Scenario 2: parent is in the group, children are not captured.
        a = Actor.options(placement_group=pg).remote()
        ray.get(a.ready.remote())
        # 1 top level actor + 3 children.
        for _ in range(total_num_actors - 1):
            ray.get(a.schedule_nested_actor.remote())
        # Children are not bound to the placement group, so the actors are
        # expected to end up on both nodes.
        assert len(alive_actor_node_ids()) == 2

        # Kill the parent and wait until the failure is observable.
        kill_actor_and_wait_for_failure(a)
        with pytest.raises(ray.exceptions.RayActorError):
            ray.get(a.ready.remote())

        # --- Scenario 3: children explicitly request placement_group=None.
        a = Actor.options(placement_group=pg).remote()
        ray.get(a.ready.remote())
        # 1 top level actor + 3 children.
        for _ in range(total_num_actors - 1):
            ray.get(a.schedule_nested_actor_outside_pg.remote())
        # Again the children live outside the group, so two nodes are used.
        assert len(alive_actor_node_ids()) == 2
assert (num_nodes * object_store_memory + num_redis_shards * redis_max_memory < ray.utils.get_system_memory() / 2) # Simulate a cluster on one machine. cluster = Cluster() for i in range(num_nodes): cluster.add_node( redis_port=6379 if i == 0 else None, num_redis_shards=num_redis_shards if i == 0 else None, num_cpus=4, num_gpus=0, resources={str(i): 5}, object_store_memory=object_store_memory, redis_max_memory=redis_max_memory) ray.init(address=cluster.address) # Run the workload. # Define a driver script that runs a few tasks and actors on each node in the # cluster. driver_script = """ import ray ray.init(address="{}") num_nodes = {} @ray.remote def f():
parse.add_argument("--batch_size", default=256, type=int, help="train batch size") parse.add_argument("--lr", default=0.625e-4, type=float, help="learning rate") parse.add_argument("--suffix", default="default", help="suffix for saving folders") parse.add_argument("--speed", default=8, type=int, help="training data consum speed. default 8x data generate speed") args = parse.parse_args() n_worker = args.num_agents n_loader = args.num_loaders env_name = "{}NoFrameskip-v4".format(args.env_name) buffer = "multi_workers_buffer" batch_size = args.batch_size lr = args.lr suffix = args.suffix speed = args.speed n_iter = 40*32//batch_size ray.init(num_cpus=1+2*n_worker+n_loader, object_store_memory=1*1024**3, memory=6*1024**3) buffer = lmdb_op.init(buffer) workers = [ray.remote(DQN_Worker).options(num_gpus=0.1).remote(env_name=env_name, db=buffer, db_write=lmdb_op.write) for _ in range(n_worker)] test_worker = ray.remote(DQN_Worker).options(num_gpus=0.1).remote(env_name=env_name, phase="test", suffix=suffix) worker_id = {worker: "worker_{}".format(i) for i, worker in enumerate(workers)} dataloader = Dataloader(buffer, lmdb_op, worker_num=n_loader, batch_size=batch_size, batch_num=n_iter) opt = ray.remote(Optimizer).options(num_gpus=0.3).remote(dataloader, env_name, suffix=suffix, iter_steps=n_iter, update_period=10000, lr=lr) sche = Sched() eps = 1 opt_start = False glog = SummaryWriter("./logdir/{}/{}/{}.lr{}.batch{}".format(env_name, suffix, Optimizer.__name__, lr, batch_size)) model_save_period = 100000 train_step = 0 model_idx = 0
register_env(gym_name, create_env) return alg_run, gym_name, config if __name__ == "__main__": parser = ArgumentParser() parser.add_argument("-s", "--seeds_file", dest="seeds_file", help="pickle file containing seeds", default=None) args = parser.parse_args() alg_run, gym_name, config = setup_exps(args.seeds_file) ray.init(num_cpus=N_CPUS + 1) trials = run_experiments( { flow_params["exp_tag"]: { "run": alg_run, "env": gym_name, "config": { **config }, "restore": '/home/flow/ray_results/central_i696_IDMJunction_horizon2000_Ignore/PPO_MergePOEnv_Ignore-v0_0_2020-04-25_13-49-44m7fk71uh/checkpoint_230', #"resume":True, "checkpoint_freq": 10, #20, "checkpoint_at_end": True, "max_failures": 999, "stop": {
def test_atomic_creation(ray_start_cluster, connect_to_client):
    """Check that placement group creation is all-or-nothing.

    Two 3-CPU tasks occupy the cluster first so the group's bundles cannot
    all be reserved. The group must stay unready until the tasks finish,
    an actor bound to it must only run once the group exists, and after the
    group is removed its resources must become available again.

    Args:
        ray_start_cluster: Cluster object to which the nodes are added.
        connect_to_client: Forwarded to ``connect_to_client_or_not``.
    """
    # Cluster layout: each node is sized to hold ``bundle_per_node`` bundles.
    cluster = ray_start_cluster
    bundle_cpu_size = 2
    bundle_per_node = 2
    num_nodes = 2
    num_bundles = num_nodes * bundle_per_node

    for _ in range(num_nodes):
        cluster.add_node(num_cpus=bundle_cpu_size * bundle_per_node)
    ray.init(address=cluster.address)

    @ray.remote(num_cpus=1)
    class NormalActor:
        def ping(self):
            pass

    @ray.remote(num_cpus=3)
    def bothering_task():
        time.sleep(6)
        return True

    with connect_to_client_or_not(connect_to_client):
        # Occupy CPUs so the first attempt to create the group cannot succeed.
        tasks = [bothering_task.remote() for _ in range(2)]

        # Both tasks are running once only 2 of the 8 CPUs remain free.
        wait_for_condition(
            lambda: ray.available_resources()["CPU"] == 2.0)

        # Request one bundle per free slot of the (empty) cluster.
        # NOTE(review): the earlier comment here said a pack strategy keeps
        # the test less flaky, but the code requests "SPREAD" -- confirm
        # which one is intended.
        bundle_specs = [{"CPU": bundle_cpu_size} for _ in range(num_bundles)]
        pg = ray.util.placement_group(
            name="name", strategy="SPREAD", bundles=bundle_specs)

        # Bind an actor to the last bundle. It must not be scheduled while
        # the group itself has not been created.
        pg_actor = NormalActor.options(
            placement_group=pg,
            placement_group_bundle_index=num_bundles - 1).remote()

        # The group cannot be ready yet: the running tasks hold CPUs that
        # one of the bundles needs.
        done, pending = ray.wait([pg.ready()], timeout=0.5)
        assert len(done) == 0
        assert len(pending) == 1

        # Let the blocking tasks finish.
        assert all(ray.get(tasks))

        # With the CPUs released, the group should become ready.
        done, pending = ray.wait([pg.ready()])
        assert len(done) == 1
        assert len(pending) == 0

        # The actor must be reachable now. This raises if the actor was
        # scheduled before the group existed, which is the atomicity check.
        ray.get(pg_actor.ping.remote(), timeout=3.0)
        ray.kill(pg_actor)

        # The failed first attempt must not have leaked or lost resources.
        @ray.remote(num_cpus=bundle_cpu_size)
        def resource_check():
            return True

        # Outside the group: expected to hang, the group owns every CPU.
        check_without_pg = [
            resource_check.remote() for _ in range(num_bundles)
        ]

        # Inside the group: one task per bundle, all expected to run.
        check_with_pg = [
            resource_check.options(
                placement_group=pg,
                placement_group_bundle_index=index).remote()
            for index in range(num_bundles)
        ]

        # None of the tasks outside the group may have finished.
        done, pending = ray.wait(check_without_pg, timeout=0)
        assert len(done) == 0
        assert len(pending) == num_bundles

        # Every task inside the group completes.
        assert all(ray.get(check_with_pg))

        ray.util.remove_placement_group(pg)
        wait_for_condition(
            lambda: ray.util.placement_group_table(pg)["state"] == "REMOVED")

        # After removal the CPUs are free again, so the tasks that were
        # hanging outside the group can finish.
        assert all(ray.get(check_without_pg))
def test_client_serialize_addon(call_ray_stop_only): import ray import pydantic ray.init(num_cpus=0) class User(pydantic.BaseModel): name: str with ray_start_client_server() as ray: assert ray.get(ray.put(User(name="ray"))).name == "ray" object_ref_cleanup_script = """ import ray ray.init("ray://localhost:50051") @ray.remote def f(): return 42 @ray.remote class SomeClass: pass obj_ref = f.remote() actor_ref = SomeClass.remote()
max_episode_steps=200, kwargs={}) def create_env(env_config): pass_params_to_gym(env_name) env = gym.envs.make(env_name) return env if __name__ == '__main__': register_env(env_name, lambda env_config: create_env(env_config)) config = ppo.DEFAULT_CONFIG.copy() horizon = 10 num_cpus = 4 ray.init(num_cpus=num_cpus, redirect_output=True) config["num_workers"] = num_cpus config["train_batch_size"] = 1000 config["num_sgd_iter"] = 10 config["gamma"] = 0.999 config["horizon"] = horizon config["use_gae"] = False config["model"].update({"fcnet_hiddens": [256, 256]}) options = { "multiagent_obs_shapes": [2, 2], "multiagent_act_shapes": [1, 1], "multiagent_shared_model": False, "multiagent_fcnet_hiddens": [[32, 32]] * 2 } config["model"].update({"custom_options": options}) alg = ppo.PPOAgent(env=env_name, config=config)
def test_automatic_cleanup_detached_actors(ray_start_cluster):
    """Check cleanup of placement groups created by a detached actor.

    A separate driver script creates a detached actor named "A" that
    reserves a 3-bundle placement group, then exits. This test then kills
    the actor (allowing restart) and expects the CPUs held by its placement
    group to be released each time.

    Args:
        ray_start_cluster: Cluster object to which the nodes are added.
    """
    # Make sure the placement groups created by a
    # detached actor are cleaned up properly.
    cluster = ray_start_cluster
    num_nodes = 3
    num_cpu_per_node = 2
    # Create a 3-node cluster.
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=num_cpu_per_node)
    cluster.wait_for_nodes()

    info = ray.init(
        address=cluster.address, namespace="default_test_namespace")
    available_cpus = ray.available_resources()["CPU"]
    assert available_cpus == num_nodes * num_cpu_per_node

    # Script executed as a separate driver. Doubled braces are literal
    # braces in the f-string; only the redis address is interpolated.
    driver_code = f"""
import ray

ray.init(address="{info["redis_address"]}", namespace="default_test_namespace")

def create_pg():
    pg = ray.util.placement_group(
            [{{"CPU": 1}} for _ in range(3)],
            strategy="STRICT_SPREAD")
    ray.get(pg.ready())
    return pg

# TODO(sang): Placement groups created by tasks launched by detached actor
# is not cleaned with the current protocol.
# @ray.remote(num_cpus=0)
# def f():
#     create_pg()

@ray.remote(num_cpus=0, max_restarts=1)
class A:
    def create_pg(self):
        create_pg()
    def create_child_pg(self):
        self.a = A.options(name="B").remote()
        ray.get(self.a.create_pg.remote())
    def kill_child_actor(self):
        ray.kill(self.a)
        try:
            ray.get(self.a.create_pg.remote())
        except Exception:
            pass

a = A.options(lifetime="detached", name="A").remote()
ray.get(a.create_pg.remote())
# TODO(sang): Currently, child tasks are cleaned when a detached actor
# is dead. We cannot test this scenario until it is fixed.
# ray.get(a.create_child_pg.remote())

ray.shutdown()
    """

    run_string_as_driver(driver_code)

    # Wait until the driver is reported as dead by GCS.
    def is_job_done():
        # True as soon as any job in the job table is marked dead.
        jobs = ray.state.jobs()
        for job in jobs:
            if job["IsDead"]:
                return True
        return False

    def assert_num_cpus(expected_num_cpus):
        # Despite the name this does not assert; it returns a bool so it
        # can be polled through wait_for_condition.
        if expected_num_cpus == 0:
            # With nothing free, the "CPU" key is expected to be absent.
            return "CPU" not in ray.available_resources()
        return ray.available_resources()["CPU"] == expected_num_cpus

    wait_for_condition(is_job_done)
    # The detached actor's placement group still holds 3 x 1 CPU out of the
    # 6 in the cluster, leaving 3 free -- which equals num_nodes here.
    wait_for_condition(lambda: assert_num_cpus(num_nodes))

    # Make sure when a child actor spawned by a detached actor
    # is killed, the placement group is removed.
    a = ray.get_actor("A")
    # TODO(sang): child of detached actors
    # seem to be killed when jobs are done. We should fix this before
    # testing this scenario.
    # ray.get(a.kill_child_actor.remote())
    # assert assert_num_cpus(num_nodes)

    # Make sure placement groups are cleaned when detached actors are killed.
    # no_restart=False lets the actor (max_restarts=1) come back afterwards.
    ray.kill(a, no_restart=False)
    wait_for_condition(lambda: assert_num_cpus(num_nodes * num_cpu_per_node))
    # The detached actor a should've been restarted.
    # Recreate a placement group.
    ray.get(a.create_pg.remote())
    wait_for_condition(lambda: assert_num_cpus(num_nodes))
    # Kill it again and make sure the placement group
    # that is created is deleted again.
    ray.kill(a, no_restart=False)
    wait_for_condition(lambda: assert_num_cpus(num_nodes * num_cpu_per_node))
import time

import ray


@ray.remote
def f():
    """Pause briefly, then return the IP address of the executing node."""
    # The short sleep keeps each task busy for a moment before it reports.
    time.sleep(0.01)
    return ray.services.get_node_ip_address()


ray.init(redis_address='master:6379')

# Launch many short tasks and collect the distinct node IP addresses they
# report, i.e. the nodes that have joined the cluster and ran a task.
ids = set(ray.get([f.remote() for _ in range(1000)]))
print(ids)
N_ROWS = args.num_rows # number of row of bidirectional lanes N_COLUMNS = args.num_cols # number of columns of bidirectional lanes flow_params = make_flow_params(N_ROWS, N_COLUMNS, EDGE_INFLOW) upload_dir = args.upload_dir RUN_MODE = args.run_mode ALGO = args.algo if ALGO == 'PPO': alg_run, env_name, config = setup_exps_PPO(flow_params) else: raise NotImplementedError if RUN_MODE == 'local': ray.init(num_cpus=N_CPUS + 1) N_ITER = 1 elif RUN_MODE == 'cluster': ray.init(redis_address="localhost:6379") N_ITER = 2000 exp_tag = { 'run': alg_run, 'env': env_name, 'checkpoint_freq': 25, "max_failures": 10, 'stop': { 'training_iteration': N_ITER }, 'config': config, "num_samples": 1,
def ray_start_2_cpus():
    """Start Ray with two CPUs, yield the init result, then shut Ray down.

    Written as a generator: everything after the ``yield`` runs as teardown
    when the consumer resumes it.

    Yields:
        The value returned by ``ray.init(num_cpus=2)``.
    """
    init_result = ray.init(num_cpus=2)
    yield init_result
    # Teardown.
    ray.shutdown()
import io import base64 from IPython.display import HTML from dqn_model import DQNModel from dqn_model import _DQNModel from memory_remote import ReplayBuffer_remote import matplotlib.pyplot as plt from custom_cartpole import CartPoleEnv # matplotlib inline ray.shutdown() ray.init(include_webui=False, ignore_reinit_error=True, redis_max_memory=500000000, object_store_memory=5000000000, temp_dir="../../tmp/") FloatTensor = torch.FloatTensor # # Set the Env name and action space for CartPole # ENV_NAME = 'CartPole-v0' # # Move left, Move right # ACTION_DICT = {"LEFT": 0, "RIGHT": 1} # # Register the environment # env_CartPole = gym.make(ENV_NAME) # # # Set result saveing floder # result_floder = ENV_NAME # result_file = ENV_NAME + "/results.txt"
def main(args):
    """Configure and launch a DQN training run through ``tune.run``.

    Unless ``args.no_debug`` is set, Ray is started in local mode, the
    warm-up / replay-buffer / per-iteration sizes are reduced and a single
    sample is run instead of four.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``no_debug``, ``headless``, ``num_workers``, ``horizon``,
            ``exper``, ``log_dir``, ``restore`` and ``resume``.
    """
    # ====================================
    # start ray and build the env config
    # ====================================
    ray_kwargs = {"webui_host": "127.0.0.1"}
    if not args.no_debug:
        # Debug runs execute everything in local mode.
        ray_kwargs["local_mode"] = True
    ray.init(**ray_kwargs)

    # To train on a Ray cluster instead, initialise with something like:
    # ray.init(
    #     address="auto" if args.address is None else args.address,
    #     redis_password="******",
    # )
    # print(
    #     "--------------- Ray startup ------------\n{}".format(
    #         ray.state.cluster_resources()
    #     )
    # )

    env_config = {
        "seed": 42,
        "scenarios": [scenario_paths],
        "headless": args.headless,
        "agent_specs": {"AGENT-007": agent_spec},
    }

    # ====================================
    # build the tune config
    # ====================================
    class MultiEnv(RLlibHiWayEnv):
        """Environment that gives each worker one scenario, chosen round-robin."""

        def __init__(self, env_config):
            # worker_index is shifted by one before wrapping around the
            # list of scenario paths.
            scenario_idx = (env_config.worker_index - 1) % len(scenario_paths)
            env_config["scenarios"] = [scenario_paths[scenario_idx]]
            super().__init__(config=env_config)

    tune_config = {
        "env": MultiEnv,
        "env_config": env_config,
        "multiagent": {
            "policies": {
                "default_policy": (
                    None,
                    OBSERVATION_SPACE,
                    ACTION_SPACE,
                    {},
                )
            },
            # Every agent id maps onto the single shared policy.
            "policy_mapping_fn": lambda agent_id: "default_policy",
        },
        "framework": "torch",
        "callbacks": {
            "on_episode_start": on_episode_start,
            "on_episode_step": on_episode_step,
            "on_episode_end": on_episode_end,
        },
        "lr": 1e-4,
        "log_level": "WARN",
        "num_workers": args.num_workers,
        "horizon": args.horizon,
        "learning_starts": 10000,
        "buffer_size": 50000,
        "rollout_fragment_length": 4,
        "train_batch_size": 32,
        "timesteps_per_iteration": 10000,
        "dueling": False,
        "prioritized_replay": False,
        # Optional model override, currently disabled:
        # "model": {
        #     "fcnet_hiddens": [256, 256],
        #     "fcnet_activation": "relu",
        # },
        "exploration_config": {
            # An Exploration sub-class may be selected by name or by full
            # module path (e.g.
            # "ray.rllib.utils.exploration.epsilon_greedy.EpsilonGreedy"):
            # "type": "zhr_train_rllib.multiobj_episilongreedy.EpsilonGreedy",
            # Parameters for the Exploration class' constructor:
            "initial_epsilon": 1.0,
            "final_epsilon": 0.02,
            # Timesteps over which to anneal epsilon.
            "epsilon_timesteps": 200000,
        },
        # "observation_filter": "MeanStdFilter",
        # "model": {"use_lstm": True},
        "gamma": 0.995,
        "decompose_num": 2,
        # "seed_global": tune.grid_search([10, 20, 30, 40])
    }

    num_samples = 4
    if not args.no_debug:
        # Smaller settings and a single sample for debug runs.
        tune_config["learning_starts"] = 1000
        tune_config["buffer_size"] = 5000
        tune_config["timesteps_per_iteration"] = 1000
        num_samples = 1

    # ====================================
    # log and checkpoint locations
    # ====================================
    experiment_name = EXPERIMENT_NAME.format(
        scenario=args.exper,
        algorithm="DQN",
        n_agent=1,
    )

    log_dir = Path(args.log_dir).expanduser().absolute() / RUN_NAME
    log_dir.mkdir(parents=True, exist_ok=True)
    print(f"Checkpointing at {log_dir}")

    restore_path = None
    if args.restore:
        restore_path = Path(args.restore).expanduser()
        print(f"Loading model from {restore_path}")

    # ====================================
    # run the experiment
    # ====================================
    analysis = tune.run(
        DQN,
        name=experiment_name,
        stop={"timesteps_total": 10000000 / 2},
        checkpoint_freq=10,
        checkpoint_at_end=True,
        local_dir=str(log_dir),
        resume=args.resume,
        restore=restore_path,
        max_failures=1000,
        export_formats=["model", "checkpoint"],
        config=tune_config,
        num_samples=num_samples,
    )

    print(analysis.dataframe().head())
def tune_random(name, exp, num_samples=2, seed_value=None, **digit_kwargs):
    """Tune hyperparameters of any bandit experiment.

    Runs ``num_samples`` trials of the experiment function ``exp`` as Ray
    remote tasks, then saves the best configuration and all configurations
    sorted by the ``'correct'`` metric.

    Args:
        name: Output path prefix. It is split into a directory and a base
            name; results go to ``<base>_best.pkl`` and ``<base>_sorted.pkl``
            in that directory.
        exp: Name of the experiment function, looked up on ``glia_digits``.
        num_samples: Number of random configurations to evaluate.
        seed_value: Seed for the ``numpy`` random state used for sampling.
        **digit_kwargs: Experiment parameters. ``str``, ``bool``, ``int`` and
            ``float`` values are passed through unchanged; any other value
            is unpacked as a ``(low, high)`` pair and sampled uniformly.
            ``debug`` (if present) sets Ray's ``local_mode``; a truthy
            ``use_gpu`` requests a quarter of a GPU per trial.

    Returns:
        A ``(best, trials)`` tuple: the best trial and the list of all trials.
    """
    # ------------------------------------------------------------------------
    # Init seed
    prng = np.random.RandomState(seed_value)

    # Init ray
    if not ray.is_initialized():
        if "debug" in digit_kwargs:
            ray.init(local_mode=digit_kwargs["debug"])
        else:
            ray.init()

    # ------------------------------------------------------------------------
    # Create the train function. It is defined in this scope so that the
    # decision to run it remotely on GPUs or not can be made per call.
    def train(name=None, exp_func=None, config=None):
        trial = exp_func(**config)
        trial.update({"config": config, "name": name})
        return trial

    # A missing or falsy "use_gpu" means a plain remote function.
    if digit_kwargs.get("use_gpu"):
        train = ray.remote(num_gpus=0.25)(train)
    else:
        train = ray.remote(train)

    # ------------------------------------------------------------------------
    # Init:
    # Separate name from path
    path, name = os.path.split(name)

    # Look up the bandit run function we're using in this tuning.
    exp_func = getattr(glia_digits, exp)

    # ------------------------------------------------------------------------
    # Run!
    # Setup the parallel workers
    runs = []
    for i in range(num_samples):
        # Make a new HP sample.
        # NOTE(review): every entry of digit_kwargs is forwarded, so control
        # flags such as "debug" and "use_gpu" also reach exp_func -- confirm
        # the experiment functions accept them.
        params = {}
        for k, v in digit_kwargs.items():
            if isinstance(v, (str, bool, int, float)):
                # Scalars are fixed settings, passed through as-is.
                params[k] = v
            else:
                # Anything else is treated as a (low, high) sampling range.
                low, high = v
                params[k] = prng.uniform(low=low, high=high)

        runs.append(train.remote(i, exp_func, params))

    # Block until every trial has finished; results keep submission order.
    trials = ray.get(runs)

    # ------------------------------------------------------------------------
    # Save configs and correct (full model data is dropped):
    best = get_best_trial(trials, 'correct')

    # Best trial config (updated in place on the trial's own config dict).
    best_config = best["config"]
    best_config.update(get_best_result(trials, 'correct'))
    save_checkpoint(best_config,
                    filename=os.path.join(path, name + "_best.pkl"))

    # Sort and save the configs of all trials
    sorted_configs = {}
    for i, trial in enumerate(get_sorted_trials(trials, 'correct')):
        sorted_configs[i] = trial["config"]
        sorted_configs[i].update({"correct": trial["correct"]})
    save_checkpoint(sorted_configs,
                    filename=os.path.join(path, name + "_sorted.pkl"))

    # kill ray
    ray.shutdown()

    # -
    return best, trials
def ray_init_with_task_retry_delay():
    """Start Ray with ``task_retry_delay_ms`` set to 100, then shut it down.

    Written as a generator: everything after the ``yield`` runs as teardown
    when the consumer resumes it.

    Yields:
        The value returned by ``ray.init``.
    """
    system_config = {"task_retry_delay_ms": 100}
    init_result = ray.init(_system_config=system_config)
    yield init_result
    # Teardown.
    ray.shutdown()
import torch from scipy.signal import savgol_filter from tqdm import tqdm import ffmpeg import ray from data_segments.get_data_segments import (get_flame_params_for_file, get_segments_v2) from misc.read_n_write import flame2glow from misc.shared import DATA_DIR from misc.utils import frames2ms, get_participant WIN_LEN = 9 ONLY_ODD = True ray.init(num_cpus=10) segments = get_segments_v2() sessions_dir = DATA_DIR / "Sessions_50fps_pytorch_data/sessions" def to_iterator(obj_ids): while obj_ids: done, obj_ids = ray.wait(obj_ids) yield ray.get(done[0]) def get_openface(file_): tar_file = TarFile(file_)
filename="smoketest.parquet", num_rows=args.num_workers * 500, num_features=4, num_classes=2, num_partitions=args.num_workers * 10) use_gpu = False else: path = args.file if not os.path.exists(path): raise ValueError( f"Benchmarking data not found: {path}." f"\nFIX THIS by running `python create_test_data.py` first.") init_start = time.time() if args.smoke_test: ray.init(num_cpus=num_workers) else: ray.init(address="auto") init_taken = time.time() - init_start full_start = time.time() bst, train_taken = train_ray(path=path, num_workers=num_workers, num_boost_rounds=num_boost_rounds, num_files=num_files, regression=args.regression, use_gpu=use_gpu, smoke_test=args.smoke_test) full_taken = time.time() - full_start print(f"TOTAL TIME TAKEN: {full_taken:.2f} seconds " f"({init_taken:.2f} for init)")
def setUpClass(cls):
    """Initialize Ray with its default arguments.

    Named after unittest's class-level setup hook; the enclosing class and
    any decorator are outside this excerpt.
    """
    ray.init()