def main():
    """Build a GB_game, store it with ``ray.put`` and sample fake trajectories.

    One ``Parallel_Actor`` is created per CPU (``num_cpus``); each actor runs
    ``sample_trajectories_fake`` against the stored environment and a shared
    ``Counter`` actor. The gathered results are printed.

    Relies on module-level ``computation_graph_args``,
    ``sample_trajectory_args``, ``estimate_return_args`` and ``num_cpus``.
    """
    game = GB_game(
        num_char=4,
        reward_circle=True,
        death_penalty=False,
        relative_positions=True,
        discrete=True,
        max_speed=10,
    )
    print('the size of env is' + str(game.size))

    # The original author notes that registering GB_game for pickle-based
    # serialization was required to make the ray.put() below work.
    ray.register_custom_serializer(GB_game, use_pickle=True)
    env = ray.put(game)
    print('\nthe put succeeded!!\n')

    actors = []
    for _ in range(num_cpus):
        actors.append(
            Parallel_Actor.remote(
                computation_graph_args,
                sample_trajectory_args,
                estimate_return_args,
            )
        )
    CA = Counter.remote()

    pending = [
        actor.sample_trajectories_fake.remote(10, env, CA) for actor in actors
    ]
    return_array = ray.get(pending)
    print(return_array)
def initialize_ray():
    """Initializes ray based on environment variables and internal defaults.

    Does nothing unless called from the thread named "MainThread".

    Environment variables read:
        MODIN_MEMORY: object store size in bytes (converted with ``int``).
        MODIN_OUT_OF_CORE: when it title-cases to "True", the plasma directory
            is set to the system temp dir and, if MODIN_MEMORY is unset, the
            object store defaults to 8x system memory rounded down to a GB.
    """
    if threading.current_thread().name != "MainThread":
        return

    plasma_dir = None
    store_memory = os.environ.get("MODIN_MEMORY", None)
    out_of_core = os.environ.get("MODIN_OUT_OF_CORE", "False").title() == "True"

    if out_of_core:
        from tempfile import gettempdir

        plasma_dir = gettempdir()
        # Only pick a default when the environment did not already set one.
        if store_memory is None:
            # Round down to the nearest Gigabyte, then use 8x for out of core.
            rounded_bytes = ray.utils.get_system_memory() // 10**9 * 10**9
            store_memory = 8 * rounded_bytes

    if store_memory is not None:
        store_memory = int(store_memory)
    else:
        # Fall back to 60% of system memory, rounded down to a Gigabyte.
        store_memory = int(0.6 * ray.utils.get_system_memory() // 10**9 * 10**9)
        # A result of zero means the pool is too small; use ray's default.
        if store_memory == 0:
            store_memory = None

    ray.init(
        include_webui=False,
        ignore_reinit_error=True,
        plasma_directory=plasma_dir,
        object_store_memory=store_memory,
    )
    # Register custom serializer for method objects to avoid warning message.
    # We serialize `MethodType` objects when we use AxisPartition operations.
    ray.register_custom_serializer(types.MethodType, use_pickle=True)
def custom_serializers():
    """Check custom serializer/deserializer pairs via ``ray.put`` and a task.

    ``Foo`` is round-tripped through the object store directly; ``Bar`` is
    returned from a remote function. Both must come back as the value the
    custom deserializer produces.
    """

    def custom_serializer(obj):
        return 3, "string1", type(obj).__name__

    def custom_deserializer(serialized_obj):
        return serialized_obj, "string2"

    def expected_for(cls):
        # What the serializer/deserializer pair yields for an instance of cls.
        return ((3, "string1", cls.__name__), "string2")

    class Foo:
        def __init__(self):
            self.x = 3

    ray.register_custom_serializer(
        Foo, serializer=custom_serializer, deserializer=custom_deserializer)
    round_tripped = ray.get(ray.put(Foo()))
    assert round_tripped == expected_for(Foo)

    class Bar:
        def __init__(self):
            self.x = 3

    ray.register_custom_serializer(
        Bar, serializer=custom_serializer, deserializer=custom_deserializer)

    @ray.remote
    def f():
        return Bar()

    returned = ray.get(f.remote())
    assert returned == expected_for(Bar)
def init_ray():
    """Start Ray and register numpy-based serializers for torch tensor types.

    Tensors are sent as numpy arrays; the deserializer returns the numpy
    array unchanged (it is not converted back to a tensor).
    """
    ray.init()

    def serializer(obj):
        # detach() first: Tensor.numpy() raises for tensors that require
        # grad. detach()/cpu() are no-ops for plain CPU tensors.
        return obj.detach().cpu().numpy()

    def deserializer(serialized_obj):
        return serialized_obj

    tensor_types = [
        torch.FloatTensor,
        torch.DoubleTensor,
        torch.HalfTensor,
        torch.ByteTensor,
        torch.CharTensor,
        torch.ShortTensor,
        torch.IntTensor,
        torch.LongTensor,
        torch.Tensor,
    ]
    for t in tensor_types:
        ray.register_custom_serializer(
            t, serializer=serializer, deserializer=deserializer)
def init(name=None,
         http_host=DEFAULT_HTTP_HOST,
         http_port=DEFAULT_HTTP_PORT,
         metric_exporter=InMemoryExporter):
    """Initialize or connect to a serve cluster.

    Returns immediately when the named master actor already exists. If
    `ray.init` has not been called in this process it is called with no
    arguments; call `ray.init` yourself beforehand to pass custom kwargs.

    Args:
        name (str): A unique name for this serve instance. This allows
            multiple serve instances to run on the same ray cluster. Must be
            specified in all subsequent serve.init() calls.
        http_host (str): Host for HTTP server. Default to "0.0.0.0".
        http_port (int): Port for HTTP server. Default to 8000.
        metric_exporter(ExporterInterface): The class aggregates metrics from
            all RayServe actors and optionally export them to external
            services. RayServe has two options built in: InMemoryExporter and
            PrometheusExporter

    Raises:
        TypeError: if `name` is neither None nor a string.
    """
    global master_actor

    if not (name is None or isinstance(name, str)):
        raise TypeError("name must be a string.")

    # Initialize ray if needed.
    if not ray.is_initialized():
        ray.init()

    # Reuse the serve master actor if one is already registered.
    actor_name = format_actor_name(SERVE_MASTER_NAME, name)
    try:
        master_actor = ray.get_actor(actor_name)
        return
    except ValueError:
        pass

    # Register serialization context once
    for cls in (Query, RequestMetadata):
        ray.register_custom_serializer(cls, cls.ray_serialize,
                                       cls.ray_deserialize)

    # TODO(edoakes): for now, always start the HTTP proxy on the node that
    # serve.init() was run on. We should consider making this configurable
    # in the future.
    http_node_id = ray.state.current_node_id()
    actor_cls = ServeMaster.options(
        name=actor_name,
        max_restarts=-1,
        max_task_retries=-1,
    )
    master_actor = actor_cls.remote(name, http_node_id, http_host, http_port,
                                    metric_exporter)

    routes_url = "http://{}:{}/-/routes".format(http_host, http_port)
    block_until_http_ready(routes_url, timeout=HTTP_PROXY_TIMEOUT)
def __init__(self,
             benchmark_database=None,
             evaluators=None,
             terminal_when=None,
             behaviors=None,
             num_scenarios=None,
             benchmark_configs=None,
             log_eval_avg_every=None,
             num_cpus=None,
             memory_total=None):
    """Start Ray and create one benchmark actor per CPU.

    `num_cpus` and `memory_total` are replaced by the detected logical CPU
    count / available memory when they are falsy or exceed what psutil
    reports. Benchmark configs are dealt round-robin to the actors.
    """
    super().__init__(benchmark_database=benchmark_database,
                     evaluators=evaluators,
                     terminal_when=terminal_when,
                     behaviors=behaviors,
                     num_scenarios=num_scenarios,
                     benchmark_configs=benchmark_configs)

    cpu_limit = psutil.cpu_count(logical=True)
    if not (num_cpus and num_cpus <= cpu_limit):
        num_cpus = cpu_limit

    memory_limit = psutil.virtual_memory().available
    if not (memory_total and memory_total <= memory_limit):
        memory_total = memory_limit

    # we split memory between workers (30%) and objects (70%)
    ray.init(num_cpus=num_cpus,
             memory=memory_total * 0.3,
             object_store_memory=memory_total * 0.7)

    ray.register_custom_serializer(
        BenchmarkConfig,
        serializer=serialize_benchmark_config,
        deserializer=deserialize_benchmark_config)
    ray.register_custom_serializer(Scenario,
                                   serializer=serialize_scenario,
                                   deserializer=deserialize_scenario)

    # Actor i receives every num_cpus-th config starting at offset i.
    self.benchmark_config_split = [
        self.benchmark_configs[offset::num_cpus]
        for offset in range(0, num_cpus)
    ]

    self.actors = []
    for idx in range(num_cpus):
        self.actors.append(
            _BenchmarkRunnerActor.remote(
                evaluators=evaluators,
                terminal_when=terminal_when,
                benchmark_configs=self.benchmark_config_split[idx],
                logger_name="BenchmarkingActor{}".format(idx),
                log_eval_avg_every=log_eval_avg_every))
def run_ray_many(tune_config, exp_config, experiments, fix_seed=False):
    """Run several experiments in parallel as Ray remote tasks.

    Args:
        tune_config (dict): Tune settings; mutated in place ("config" is set
            to `exp_config`, and CPU-only overrides are applied when CUDA is
            unavailable).
        exp_config (dict): Base experiment configuration.
        experiments (dict): Maps experiment name to the overrides passed to
            `new_experiment` together with `exp_config`.
        fix_seed (bool): When true, call `set_random_seed(32)`.
    """
    # update config
    tune_config["config"] = exp_config

    # override when running local for test
    if not torch.cuda.is_available():
        tune_config["config"]["device"] = "cpu"
        tune_config["resources_per_trial"] = {"cpu": 1}

    # MC code to fix for an unknown bug
    def serializer(obj):
        # detach() first: Tensor.numpy() raises for tensors that require
        # grad. detach()/cpu() are no-ops for plain CPU tensors.
        return obj.detach().cpu().numpy()

    def deserializer(serialized_obj):
        return serialized_obj

    # Connect before registering serializers, so registration happens on an
    # initialized Ray worker.
    ray.init()
    try:
        for t in [
                torch.FloatTensor,
                torch.DoubleTensor,
                torch.HalfTensor,
                torch.ByteTensor,
                torch.CharTensor,
                torch.ShortTensor,
                torch.IntTensor,
                torch.LongTensor,
                torch.Tensor,
        ]:
            ray.register_custom_serializer(
                t, serializer=serializer, deserializer=deserializer)

        # fix seed
        if fix_seed:
            set_random_seed(32)

        # multiple experiments
        exp_configs = [(name, new_experiment(exp_config, c))
                       for name, c in experiments.items()]

        results = [
            run_experiment.remote(name, RayTrainable, c, tune_config)
            for name, c in exp_configs
        ]
        ray.get(results)
    finally:
        # Always release Ray, even when an experiment raises.
        ray.shutdown()
def __init__(self, num_trees=0, rand_features=None, max_depth=15, header=""):
    """Create an untrained forest and make it serializable by Ray.

    Args:
        num_trees (int): Number of tree slots to allocate.
        rand_features: Feature-sampling setting; defaults to "sqrt" when
            None. (Previously the argument was ignored and "sqrt" was
            always used.)
        max_depth (int): Stored as `self.max_depth`.
        header: Stored as `self.header`.
    """
    super(RandomForest, self).__init__()
    self.trained_trees = [None] * num_trees
    self.num_trees = num_trees
    self.rand_features = "sqrt" if rand_features is None else rand_features
    self.max_depth = max_depth
    self.header = header
    # Only start Ray once: constructing a second forest in the same process
    # must not call ray.init() again.
    if not ray.is_initialized():
        ray.init()
    ray.register_custom_serializer(RandomForest, use_pickle=True)
def register_serializer(conn_str=None, apikey=None, profile=None):
    """
    Register serializer for BTrDB Object

    Parameters
    ----------
    conn_str: str, default=None
        The address and port of the cluster to connect to, e.g.
        `192.168.1.1:4411`. If set to None, will look in the environment
        variable `$BTRDB_ENDPOINTS` (recommended).
    apikey: str, default=None
        The API key used to authenticate requests (optional). If None, the
        key is looked up from the environment variable `$BTRDB_API_KEY`.
    profile: str, default=None
        The name of a profile containing the required connection information
        as found in the user's predictive grid credentials file
        `~/.predictivegrid/credentials.yaml`.

    Raises
    ------
    ImportError
        If `ray` or `semver` is not installed.
    AssertionError
        If `ray.init()` has not been called yet.
    Exception
        If the installed Ray version (1.0 or 1.1) has no custom
        serialization API.
    """
    try:
        import ray
    except ImportError as err:
        raise ImportError(
            "must pip install ray to register custom serializer") from err
    try:
        import semver
    except ImportError as err:
        raise ImportError(
            "must pip install semver to register custom serializer") from err

    # Raised explicitly (not via `assert`) so the check survives `python -O`.
    if not ray.is_initialized():
        raise AssertionError(
            "Need to call ray.init() before registering custom serializer")

    deserializer = partial(btrdb_deserializer,
                           conn_str=conn_str,
                           apikey=apikey,
                           profile=profile)

    ver = semver.VersionInfo.parse(ray.__version__)
    if ver.major == 0:
        ray.register_custom_serializer(BTrDB,
                                       serializer=btrdb_serializer,
                                       deserializer=deserializer)
    elif (ver.major, ver.minor) >= (1, 2):
        # Accept every release from 1.2.0 on, as the error message below
        # promises; the old check rejected 1.4+.
        ray.util.register_serializer(BTrDB,
                                     serializer=btrdb_serializer,
                                     deserializer=deserializer)
    else:
        raise Exception(
            "Ray version %s does not have custom serialization. Please upgrade to >= 1.2.0"
            % ray.__version__)
def _register_ray_serializer(op):
    """Register the Mars operand (de)serializers for ``type(op)`` with Ray.

    Tries the top-level ``ray.register_custom_serializer`` first; when that
    raises AttributeError, registers through the global worker's
    serialization context instead.
    """
    op_type = type(op)
    handlers = {
        "serializer": operand_serializer,
        "deserializer": operand_deserializer,
    }
    try:
        ray.register_custom_serializer(op_type, **handlers)
    except AttributeError:  # ray >= 1.0
        from ray.worker import global_worker

        global_worker.check_connected()
        serialization_ctx = global_worker.get_serialization_context()
        serialization_ctx.register_custom_serializer(op_type, **handlers)
def __init__(self,
             name,
             flags,
             auto_pilot,
             camera_setups=[],
             lidar_setups=[],
             log_file_name=None,
             csv_file_name=None):
    """Configure the Carla settings, sensors and control state.

    Each camera and lidar setup is added to the simulation and its transform
    is recorded under the setup's name. Message and WatermarkMessage are
    registered with Ray for pickle-based serialization.
    """
    super(CarlaLegacyOperator, self).__init__(name)
    self._flags = flags
    self._logger = setup_logging(self.name, log_file_name)
    self._csv_logger = setup_csv_logging(self.name + '-csv', csv_file_name)
    self._auto_pilot = auto_pilot

    quality = 'Epic' if self._flags.carla_high_quality else 'Low'

    self._settings = CarlaSettings()
    self._settings.set(
        SynchronousMode=self._flags.carla_synchronous_mode,
        SendNonPlayerAgentsInfo=True,
        NumberOfVehicles=self._flags.carla_num_vehicles,
        NumberOfPedestrians=self._flags.carla_num_pedestrians,
        WeatherId=self._flags.carla_weather,
        QualityLevel=quality)
    self._settings.randomize_seeds()

    self._transforms = {}
    # Add cameras to the simulation.
    for camera in camera_setups:
        self.__add_camera(camera)
        self._transforms[camera.name] = camera.get_transform()
    # Add lidars to the simulation.
    for lidar in lidar_setups:
        self.__add_lidar(lidar)
        self._transforms[lidar.name] = lidar.get_transform()

    self.agent_id_map = {}
    self.pedestrian_count = 0

    # Initialize the control state.
    self.control = {
        'steer': 0.0,
        'throttle': 0.0,
        'brake': 0.0,
        'hand_brake': False,
        'reverse': False
    }

    # Register custom serializers for Messages and WatermarkMessages
    for message_cls in (Message, WatermarkMessage):
        ray.register_custom_serializer(message_cls, use_pickle=True)
def init_ray(num_cpus=None, num_gpus=None, ray_redis_address=None):
    """Initialize ray.

    When `ray_redis_address` is given, connect to the existing ray cluster at
    that address. Otherwise start ray locally, using
    `torch.cuda.device_count()` as the GPU count when `num_gpus` is None.
    """
    if ray_redis_address is None:
        gpu_count = torch.cuda.device_count() if num_gpus is None else num_gpus
        ray.init(num_gpus=gpu_count, num_cpus=num_cpus)
    else:
        ray.init(redis_address=ray_redis_address)

    # XXX: Currently, ray (pyarrow) does not serialize `requires_grad`
    # attribute. As a workaround, use custom serializer.
    # See https://github.com/ray-project/ray/issues/4855
    ray.register_custom_serializer(torch.nn.Module, use_pickle=True)
def test_numpy_subclass_serialization_pickle(ray_start_regular):
    """A pickle-registered ndarray subclass keeps its repr through put/get."""

    class MyNumpyConstant(np.ndarray):
        def __init__(self, value):
            super().__init__()
            self.constant = value

        def __str__(self):
            # __str__ must return a string; the previous version printed the
            # value and returned None.
            return str(self.constant)

    constant = MyNumpyConstant(123)
    ray.register_custom_serializer(type(constant), use_pickle=True)

    repr_orig = repr(constant)
    repr_ser = repr(ray.get(ray.put(constant)))
    assert repr_orig == repr_ser
def __init__(self, experiment):
    """Register pickle serializers with Ray and keep the experiment.

    Also builds `self.config_space` from the experiment and logs the
    experiment's meta spec.
    """
    # Imported locally, as in the original code.
    from slm_lab.experiment.control import Experiment

    for picklable_cls in (Experiment, InfoSpace, pd.DataFrame, pd.Series):
        ray.register_custom_serializer(picklable_cls, use_pickle=True)

    self.experiment = experiment
    self.config_space = build_config_space(experiment)
    logger.info(f'Running {util.get_class_name(self)}, with meta spec:\n{self.experiment.spec["meta"]}')
def register_ray_serializer():
    '''Helper to register so objects can be serialized in Ray'''
    import pandas as pd
    from slm_lab.experiment.control import Experiment
    from slm_lab.experiment.monitor import InfoSpace

    # Every class below is registered for pickle-based serialization.
    for picklable_cls in (Experiment, InfoSpace, pd.DataFrame, pd.Series):
        ray.register_custom_serializer(picklable_cls, use_pickle=True)
def initialize_ray():
    """Initializes ray based on environment variables and internal defaults.

    Does nothing unless called from the thread named "MainThread".

    Environment variables read:
        MODIN_MEMORY: object store size in bytes. It arrives as a string and
            is converted with ``int`` before being handed to ``ray.init``.
        MODIN_OUT_OF_CORE: when it title-cases to "True", the plasma directory
            is set to the system temp dir and, if MODIN_MEMORY is unset, the
            object store defaults to 8x total memory rounded down to a GB.

    Raises:
        ImportError: out-of-core mode needs a default size but `psutil` is
            not installed.
        ValueError: MODIN_MEMORY is set but is not an integer string.
    """
    if threading.current_thread().name != "MainThread":
        return

    plasma_directory = None
    object_store_memory = os.environ.get("MODIN_MEMORY")

    if os.environ.get("MODIN_OUT_OF_CORE", "").title() == "True":
        from tempfile import gettempdir

        plasma_directory = gettempdir()
        # We may have already set the memory from the environment variable,
        # we don't want to overwrite that value if we have.
        if object_store_memory is None:
            try:
                from psutil import virtual_memory
            except ImportError:
                raise ImportError(
                    "To use Modin out of core, please install modin[out_of_core]: "
                    '`pip install "modin[out_of_core]"`')
            # Round down to the nearest Gigabyte.
            mem_bytes = virtual_memory().total // 10**9 * 10**9
            # Default to 8x memory for out of core
            object_store_memory = 8 * mem_bytes

    if object_store_memory is None:
        # In case anything failed above, we can still improve the memory for
        # Modin. Round down to the nearest Gigabyte.
        object_store_memory = int(0.6 * ray.utils.get_system_memory() //
                                  10**9 * 10**9)
        # A result of zero means the pool is too small; use ray's default.
        if object_store_memory == 0:
            object_store_memory = None
    else:
        # The environment value is a string; ray expects a byte count.
        object_store_memory = int(object_store_memory)

    ray.init(
        redirect_output=True,
        include_webui=False,
        redirect_worker_output=True,
        ignore_reinit_error=True,
        plasma_directory=plasma_directory,
        object_store_memory=object_store_memory,
    )
    # Register custom serializer for method objects to avoid warning message.
    # We serialize `MethodType` objects when we use AxisPartition operations.
    ray.register_custom_serializer(types.MethodType, use_pickle=True)
def __init__(self, experiment):
    """Register pickle serializers with Ray and keep the experiment.

    Also builds `self.config_space` from the experiment and logs the
    experiment's meta spec.
    """
    # Imported locally, as in the original code.
    from slm_lab.experiment.control import Experiment

    for picklable_cls in (Experiment, InfoSpace, pd.DataFrame, pd.Series):
        ray.register_custom_serializer(picklable_cls, use_pickle=True)

    self.experiment = experiment
    self.config_space = build_config_space(experiment)
    logger.info(
        f'Running {util.get_class_name(self)}, with meta spec:\n{self.experiment.spec["meta"]}')
def test_custom_serializers_with_pickle(shutdown_only):
    """Run the custom-serializer checks with ``ray.init(use_pickle=True)``.

    Afterwards, a class registered with only ``use_pickle=True`` must come
    back from a remote task with its original type.
    """
    ray.init(use_pickle=True)
    custom_serializers()

    class Foo:
        def __init__(self):
            self.x = 4

    # Test the pickle serialization backend without serializer.
    # NOTE: 'use_pickle' here is different from 'use_pickle' in
    # ray.init
    ray.register_custom_serializer(Foo, use_pickle=True)

    @ray.remote
    def f():
        return Foo()

    returned = ray.get(f.remote())
    assert type(returned) == Foo
def register_torch_serializers():
    """
    Registers ray custom serializer and deserializer for torch.tensor types.

    According to the ray documentation: "The serializer and deserializer are
    used when transferring objects of cls across processes and nodes."

    In particular, these are found handy when array-like logs (from a
    tune.Trainable) are transferred across nodes.

    Tensors are serialized as numpy arrays (detached and moved to the CPU if
    needed) and rebuilt by calling the registered tensor type on the array.

    Example:
    ```
    ray.init()
    register_torch_serializers()
    ```
    """

    # Register serializer and deserializer - needed when logging arrays and
    # tensors.
    def serializer(obj):
        if obj.requires_grad:
            obj = obj.detach()
        if obj.is_cuda:
            return obj.cpu().numpy()
        return obj.numpy()

    def make_deserializer(tensor_type):
        # Bind tensor_type per registration. A closure defined directly in
        # the loop would look the loop variable up at call time and always
        # see the last type in the list.
        def deserializer(serialized_obj):
            return tensor_type(serialized_obj)  # cast to tensor_type

        return deserializer

    for tensor_type in [
            torch.FloatTensor,
            torch.DoubleTensor,
            torch.HalfTensor,
            torch.ByteTensor,
            torch.CharTensor,
            torch.ShortTensor,
            torch.IntTensor,
            torch.LongTensor,
            torch.Tensor,
    ]:
        ray.register_custom_serializer(
            tensor_type,
            serializer=serializer,
            deserializer=make_deserializer(tensor_type))
def run_ray(tune_config, exp_config, fix_seed=False):
    """Run a single experiment with ``tune.run``.

    Args:
        tune_config (dict): Tune settings; mutated in place ("config" is set
            to `exp_config`, and CPU-only overrides are applied when CUDA is
            unavailable). Passed to ``tune.run`` as keyword arguments.
        exp_config (dict): Experiment configuration; also passed to
            `download_dataset`.
        fix_seed (bool): When true, call `set_random_seed(32)`.
    """
    # update config
    tune_config["config"] = exp_config
    download_dataset(exp_config)

    # override when running local for test
    if not torch.cuda.is_available():
        tune_config["config"]["device"] = "cpu"
        tune_config["resources_per_trial"] = {"cpu": 1}

    # init ray
    ray.init(load_code_from_local=True)

    # MC code to fix for an unknown bug
    def serializer(obj):
        # detach() first: Tensor.numpy() raises for tensors that require
        # grad. detach()/cpu() are no-ops for plain CPU tensors.
        return obj.detach().cpu().numpy()

    def deserializer(serialized_obj):
        return serialized_obj

    for t in [
            torch.FloatTensor,
            torch.DoubleTensor,
            torch.HalfTensor,
            torch.ByteTensor,
            torch.CharTensor,
            torch.ShortTensor,
            torch.IntTensor,
            torch.LongTensor,
            torch.Tensor,
    ]:
        ray.register_custom_serializer(
            t, serializer=serializer, deserializer=deserializer)

    # fix seed
    if fix_seed:
        set_random_seed(32)

    tune.run(Trainable, **tune_config)
def test_numpy_subclass_serialization(ray_start_regular):
    """A custom serializer registered for an ndarray subclass is invoked.

    The registered serializer always raises, so ``ray.put`` must fail.
    """

    class MyNumpyConstant(np.ndarray):
        def __init__(self, value):
            super().__init__()
            self.constant = value

        def __str__(self):
            # __str__ must return a string; the previous version printed the
            # value and returned None.
            return str(self.constant)

    constant = MyNumpyConstant(123)

    def explode(x):
        raise RuntimeError("Expected error.")

    ray.register_custom_serializer(
        type(constant), serializer=explode, deserializer=explode)

    try:
        ray.put(constant)
        assert False, "Should never get here!"
    except (RuntimeError, IndexError):
        print("Correct behavior, proof that customer serializer was used.")
def register_serializer(conn_str=None, apikey=None, profile=None):
    """
    Register serializer for BTrDB Object

    The connection arguments are bound into the deserializer with
    `functools.partial`.

    Parameters
    ----------
    conn_str: str, default=None
        The address and port of the cluster to connect to, e.g.
        `192.168.1.1:4411`. If set to None, will look in the environment
        variable `$BTRDB_ENDPOINTS` (recommended).
    apikey: str, default=None
        The API key used to authenticate requests (optional). If None, the
        key is looked up from the environment variable `$BTRDB_API_KEY`.
    profile: str, default=None
        The name of a profile containing the required connection information
        as found in the user's predictive grid credentials file
        `~/.predictivegrid/credentials.yaml`.
    """
    bound_deserializer = partial(
        btrdb_deserializer,
        conn_str=conn_str,
        apikey=apikey,
        profile=profile,
    )
    ray.register_custom_serializer(
        BTrDB,
        serializer=btrdb_serializer,
        deserializer=bound_deserializer,
    )
def run(self):
    """Run the experiment's trials through ray.tune and collect their data.

    Returns:
        dict: maps each trial index to the info dict reported by that trial
        (with the 'trial_index' key removed).
    """
    ray.init()
    # serialize here as ray is not thread safe outside
    for picklable_cls in (InfoSpace, pd.DataFrame, pd.Series):
        ray.register_custom_serializer(picklable_cls, use_pickle=True)

    def lab_trial(config, reporter):
        '''Trainable method to run a trial given ray config and reporter'''
        requested_index = config.pop('trial_index')
        trial_spec = self.spec_from_config(config)
        trial_info_space = deepcopy(self.experiment.info_space)
        trial_info_space.set('trial', requested_index)
        fitness_df = self.experiment.init_trial_and_run(
            trial_spec, trial_info_space)
        fitness_vec = fitness_df.iloc[0].to_dict()
        fitness = analysis.calc_fitness(fitness_df)
        trial_data = {
            **config,
            **fitness_vec,
            'fitness': fitness,
            # the reported index comes from the resulting dataframe
            'trial_index': fitness_df.index[0],
        }
        # TODO timesteps = episode len or total_t from space_clock
        # call reporter from inside trial/session loop
        reporter(timesteps_total=-1, done=True, info=trial_data)

    register_trainable('lab_trial', lab_trial)

    # TODO use hyperband
    # TODO parallelize on trial sessions
    # TODO use advanced conditional config space via lambda func
    config_space = self.build_config_space()
    spec = self.experiment.spec
    ray_trials = run_experiments({
        spec['name']: {
            'run': 'lab_trial',
            'stop': {
                'done': True
            },
            'config': config_space,
            'repeat': spec['meta']['max_trial'],
        }
    })
    logger.info('Ray.tune experiment.search.run() done.')

    # compose data format for experiment analysis
    trial_data_dict = {}
    for finished_trial in ray_trials:
        reported = finished_trial.last_result.info
        index = reported.pop('trial_index')
        trial_data_dict[index] = reported

    ray.disconnect()
    return trial_data_dict
def build_graph(self):
    """Build the channels and create the Ray actors for every operator.

    Operators are visited in topological order of the logical dataflow;
    the actor handles produced for each one are appended to
    `self.actor_handles`.
    """
    self.build_channels()

    # to support cyclic reference serialization
    try:
        for picklable_cls in (Environment, ExecutionGraph, OpType, PStrategy):
            ray.register_custom_serializer(picklable_cls, use_pickle=True)
    except Exception:
        # local mode can't use pickle
        pass

    # Each operator instance is implemented as a Ray actor
    # Actors are deployed in topological order, as we traverse the
    # logical dataflow from sources to sinks.
    for node in nx.topological_sort(self.env.logical_topo):
        op = self.env.operators[node]
        inputs = self.input_channels.get(node, [])
        outputs = self.output_channels.get(node, [])
        # Instantiate Ray actors
        handles = self.__generate_actors(op, inputs, outputs)
        if not handles:
            continue
        self.actor_handles.extend(handles)
def __init__(self,
             benchmark_database=None,
             evaluators=None,
             terminal_when=None,
             behaviors=None,
             behavior_configs=None,
             num_scenarios=None,
             benchmark_configs=None,
             log_eval_avg_every=None,
             glog_init_settings=None,
             checkpoint_dir=None,
             merge_existing=False,
             num_cpus=None,
             memory_total=None,
             ip_head=None,
             redis_password=None):
    """Connect to or start Ray and create one benchmark actor per CPU.

    When both `ip_head` and `redis_password` are given, an existing Ray
    cluster is joined. If `num_cpus` is not given in that mode, it is taken
    from the cluster's reported CPU resources, falling back to the local
    logical CPU count. Otherwise Ray is started locally, with `num_cpus`
    and `memory_total` replaced by the psutil-detected values when they are
    falsy or too large.

    The configs in `self.configs_to_run` are dealt round-robin to the
    actors; evaluators are pickled once and shipped to every actor.
    """
    super().__init__(benchmark_database=benchmark_database,
                     evaluators=evaluators,
                     terminal_when=terminal_when,
                     behaviors=behaviors,
                     behavior_configs=behavior_configs,
                     num_scenarios=num_scenarios,
                     benchmark_configs=benchmark_configs,
                     checkpoint_dir=checkpoint_dir,
                     merge_existing=merge_existing)
    num_cpus_available = psutil.cpu_count(logical=True)

    if ip_head and redis_password:
        ray.init(address=ip_head, redis_password=redis_password)
        if not num_cpus:
            # Without this, num_cpus stays None and the range() calls below
            # raise TypeError.
            cluster_cpus = ray.cluster_resources().get(
                "CPU", num_cpus_available)
            num_cpus = max(1, int(cluster_cpus))
    else:
        if not (num_cpus and num_cpus <= num_cpus_available):
            num_cpus = num_cpus_available

        memory_available = psutil.virtual_memory().available
        if not (memory_total and memory_total <= memory_available):
            memory_total = memory_available

        # we split memory between workers (30%) and objects (70%)
        ray.init(
            num_cpus=num_cpus,
            memory=memory_total * 0.3,
            object_store_memory=memory_total * 0.7,
            _internal_config='{"initial_reconstruction_timeout_milliseconds": 100000}')

    serialized_evaluators = pickle.dumps(evaluators)
    ray.register_custom_serializer(
        BenchmarkConfig,
        serializer=serialize_benchmark_config,
        deserializer=deserialize_benchmark_config)
    ray.register_custom_serializer(Scenario,
                                   serializer=serialize_scenario,
                                   deserializer=deserialize_scenario)

    # Actor i receives every num_cpus-th config starting at offset i.
    self.benchmark_config_split = [
        self.configs_to_run[i::num_cpus] for i in range(0, num_cpus)
    ]
    self.actors = [
        _BenchmarkRunnerActor.remote(
            serialized_evaluators=serialized_evaluators,
            terminal_when=terminal_when,
            benchmark_configs=self.benchmark_config_split[i],
            logger_name="BenchmarkingActor{}".format(i),
            log_eval_avg_every=log_eval_avg_every,
            checkpoint_dir=checkpoint_dir,
            actor_id=i,
            glog_init_settings=glog_init_settings) for i in range(num_cpus)
    ]
on_perc=ray.tune.grid_search([0.02, 0.04]), ), } exp_configs = ( [(name, new_experiment(base_exp_config, c)) for name, c in experiments.items()] if experiments else [(experiment_name, base_exp_config)] ) # Register serializers. ray.init() for t in [ torch.FloatTensor, torch.DoubleTensor, torch.HalfTensor, torch.ByteTensor, torch.CharTensor, torch.ShortTensor, torch.IntTensor, torch.LongTensor, torch.Tensor, ]: ray.register_custom_serializer(t, serializer=serializer, deserializer=deserializer) # run all experiments in parallel results = [ run_experiment.remote(name, Trainable, c, tune_config) for name, c in exp_configs ] ray.get(results) ray.shutdown()
async def test_router_use_max_concurrency(serve_instance):
    """Check the router's queue state with ``max_concurrent_queries`` set to 1.

    Two requests are enqueued against a single replica whose handler blocks
    on a signal. The router's counters and backend queue are inspected before
    and after each request is released.
    """
    # The VisibleRouter::get_queues method needs to pickle queries
    # so we register serializer here. In regular code path, query
    # serialization is done by Serve manually for performance.
    ray.register_custom_serializer(Query, Query.ray_serialize,
                                   Query.ray_deserialize)
    signal = SignalActor.remote()

    @ray.remote
    class MockWorker:
        async def handle_request(self, request):
            # Block until the test sends the signal.
            await signal.wait.remote()
            return "DONE"

        def ready(self):
            pass

    class VisibleRouter(Router):
        # Exposes the router's internal bookkeeping to the test.
        def get_queues(self):
            return self.queries_counter, self.backend_queues

    worker = MockWorker.remote()
    q = ray.remote(VisibleRouter).remote()
    await q.setup.remote("")
    backend_name = "max-concurrent-test"
    config = BackendConfig({"max_concurrent_queries": 1})
    await q.set_traffic.remote("svc", TrafficPolicy({backend_name: 1.0}))
    await q.add_new_worker.remote(backend_name, "replica-tag", worker)
    await q.set_backend_config.remote(backend_name, config)

    # We send over two queries
    first_query = q.enqueue_request.remote(RequestMetadata("svc", None), 1)
    second_query = q.enqueue_request.remote(RequestMetadata("svc", None), 1)

    # Neither queries should be available
    with pytest.raises(ray.exceptions.RayTimeoutError):
        ray.get([first_query, second_query], timeout=0.2)

    # Let's retrieve the router internal state
    queries_counter, backend_queues = await q.get_queues.remote()
    # There should be just one inflight request
    assert queries_counter[backend_name][
        "max-concurrent-test:replica-tag"] == 1
    # The second query is buffered
    assert len(backend_queues["max-concurrent-test"]) == 1

    # Let's unblock the first query
    await signal.send.remote(clear=True)
    assert await first_query == "DONE"

    # The internal state of router should have changed.
    queries_counter, backend_queues = await q.get_queues.remote()
    # There should still be one inflight request
    assert queries_counter[backend_name][
        "max-concurrent-test:replica-tag"] == 1
    # But there shouldn't be any queries in the queue
    assert len(backend_queues["max-concurrent-test"]) == 0

    # Unblocking the second query
    await signal.send.remote(clear=True)
    assert await second_query == "DONE"

    # Checking the internal state of the router one more time
    queries_counter, backend_queues = await q.get_queues.remote()
    assert queries_counter[backend_name][
        "max-concurrent-test:replica-tag"] == 0
    assert len(backend_queues["max-concurrent-test"]) == 0
def init(
        kv_store_connector=None,
        kv_store_path=None,
        blocking=False,
        start_server=True,
        http_host=DEFAULT_HTTP_HOST,
        http_port=DEFAULT_HTTP_PORT,
        ray_init_kwargs={
            "object_store_memory": int(1e8),
            "num_cpus": max(cpu_count(), 8)
        },
        gc_window_seconds=3600,
        queueing_policy=RoutePolicy.Random,
        policy_kwargs={},
):
    """Initialize a serve cluster.

    If serve cluster has already initialized, this function will just return.

    Calling `ray.init` before `serve.init` is optional. When there is not a
    ray cluster initialized, serve will call `ray.init` with
    `object_store_memory` requirement.

    Args:
        kv_store_connector (callable): Function of (namespace) => TableObject.
            When None, a SQLite connector backed by `kv_store_path` is used.
            (Previously a caller-supplied connector was silently replaced by
            the SQLite one.)
        kv_store_path (str, path): Path to the SQLite table. Only used by the
            default connector; a temporary file is created when None.
        blocking (bool): If true, the function will wait for the HTTP server
            to be healthy, and other components to be ready before returns.
        start_server (bool): If true, `serve.init` starts http server.
            (Default: True)
        http_host (str): Host for HTTP server. Default to "0.0.0.0".
        http_port (int): Port for HTTP server. Default to 8000.
        ray_init_kwargs (dict): Argument passed to ray.init, if there is no
            ray connection. Default to {"object_store_memory": int(1e8)} for
            performance stability reason
        gc_window_seconds(int): How long will we keep the metric data in
            memory. Data older than the gc_window will be deleted. The default
            is 3600 seconds, which is 1 hour.
        queueing_policy(RoutePolicy): Define the queueing policy for selecting
            the backend for a service. (Default: RoutePolicy.Random)
        policy_kwargs: Arguments required to instantiate a queueing policy
    """
    global master_actor
    if master_actor is not None:
        return

    # Initialize ray if needed.
    if not ray.is_initialized():
        ray.init(**ray_init_kwargs)

    # Register serialization context once. Query is registered before the
    # early return below, so it is also registered when an existing master
    # actor is found.
    ray.register_custom_serializer(Query, Query.ray_serialize,
                                   Query.ray_deserialize)

    # Try to get serve master actor if it exists
    try:
        master_actor = ray.util.get_actor(SERVE_MASTER_NAME)
        return
    except ValueError:
        pass

    ray.register_custom_serializer(RequestMetadata,
                                   RequestMetadata.ray_serialize,
                                   RequestMetadata.ray_deserialize)

    # Serve has not been initialized, perform init sequence
    # TODO move the db to session_dir.
    #    ray.worker._global_node.address_info["session_dir"]
    if kv_store_connector is None:
        if kv_store_path is None:
            import os

            # mkstemp returns an open descriptor; close it so it is not
            # leaked. Only the path is needed.
            fd, kv_store_path = mkstemp()
            os.close(fd)
        db_path = kv_store_path

        def kv_store_connector(namespace):
            return SQLiteKVStore(namespace, db_path=db_path)

    master_actor = ServeMaster.options(
        detached=True,
        name=SERVE_MASTER_NAME,
        max_reconstructions=ray.ray_constants.INFINITE_RECONSTRUCTION,
    ).remote(kv_store_connector, queueing_policy.value, policy_kwargs,
             start_server, http_host, http_port, gc_window_seconds)

    if start_server and blocking:
        block_until_http_ready("http://{}:{}/-/routes".format(
            http_host, http_port))
def init(cluster_name=None,
         blocking=False,
         start_server=True,
         http_host=DEFAULT_HTTP_HOST,
         http_port=DEFAULT_HTTP_PORT,
         ray_init_kwargs={
             "object_store_memory": int(1e8),
             "num_cpus": max(cpu_count(), 8)
         },
         metric_exporter=InMemoryExporter):
    """Initialize a serve cluster.

    Returns immediately when the named master actor already exists.

    Calling `ray.init` before `serve.init` is optional. When there is not a
    ray cluster initialized, serve will call `ray.init` with
    `ray_init_kwargs`.

    Args:
        cluster_name (str): A unique name for this serve cluster. This allows
            multiple serve clusters to run on the same ray cluster. Must be
            specified in all subsequent serve.init() calls.
        blocking (bool): If true, the function will wait for the HTTP server
            to be healthy, and other components to be ready before returns.
        start_server (bool): If true, `serve.init` starts http server.
            (Default: True)
        http_host (str): Host for HTTP server. Default to "0.0.0.0".
        http_port (int): Port for HTTP server. Default to 8000.
        ray_init_kwargs (dict): Argument passed to ray.init, if there is no
            ray connection. Default to {"object_store_memory": int(1e8)} for
            performance stability reason
        metric_exporter(ExporterInterface): The class aggregates metrics from
            all RayServe actors and optionally export them to external
            services. RayServe has two options built in: InMemoryExporter and
            PrometheusExporter

    Raises:
        TypeError: if `cluster_name` is neither None nor a string.
    """
    global master_actor

    if not (cluster_name is None or isinstance(cluster_name, str)):
        raise TypeError("cluster_name must be a string.")

    # Initialize ray if needed.
    if not ray.is_initialized():
        ray.init(**ray_init_kwargs)

    # Reuse the serve master actor if one is already registered.
    actor_name = format_actor_name(SERVE_MASTER_NAME, cluster_name)
    try:
        master_actor = ray.util.get_actor(actor_name)
        return
    except ValueError:
        pass

    # Register serialization context once
    for cls in (Query, RequestMetadata):
        ray.register_custom_serializer(cls, cls.ray_serialize,
                                       cls.ray_deserialize)

    # TODO(edoakes): for now, always start the HTTP proxy on the node that
    # serve.init() was run on. We should consider making this configurable
    # in the future.
    http_node_id = ray.state.current_node_id()
    actor_cls = ServeMaster.options(
        detached=True,
        name=actor_name,
        max_restarts=-1,
    )
    master_actor = actor_cls.remote(cluster_name, start_server, http_node_id,
                                    http_host, http_port, metric_exporter)

    if not (start_server and blocking):
        return
    block_until_http_ready("http://{}:{}/-/routes".format(
        http_host, http_port))
def init(kv_store_connector=None,
         kv_store_path=None,
         blocking=False,
         http_host=DEFAULT_HTTP_HOST,
         http_port=DEFAULT_HTTP_PORT,
         ray_init_kwargs={
             "object_store_memory": int(1e8),
             "num_cpus": max(cpu_count(), 8)
         },
         gc_window_seconds=3600,
         queueing_policy=RoutePolicy.Random,
         policy_kwargs=None):
    """Initialize a serve cluster.

    If the serve cluster has already been initialized (the module-level
    ``global_state`` is set, or the serve nursery actor can be found), this
    function just returns. Calling ``ray.init`` before ``serve.init`` is
    optional; when ray is not initialized, it is initialized here with
    ``ray_init_kwargs``.

    Args:
        kv_store_connector (callable): Function of (namespace) => TableObject.
            When omitted, a connector returning a ``SQLiteKVStore`` backed by
            ``kv_store_path`` is used. A connector supplied by the caller is
            used as-is, and ``kv_store_path`` is then ignored.
        kv_store_path (str, path): Path to the SQLite table used by the
            default connector. When omitted, a temporary file is created
            with ``mkstemp()``.
        blocking (bool): If true, wait for the HTTP server to be healthy
            before returning.
        http_host (str): Host for the HTTP server. Defaults to
            ``DEFAULT_HTTP_HOST``.
        http_port (int): Port for the HTTP server. Defaults to
            ``DEFAULT_HTTP_PORT``.
        ray_init_kwargs (dict): Keyword arguments forwarded to ``ray.init``
            when there is no ray connection yet. The default requests
            ``object_store_memory=int(1e8)`` and
            ``num_cpus=max(cpu_count(), 8)``.
        gc_window_seconds (int): How long metric data is kept in memory.
            Data older than the window is deleted. The default is 3600
            seconds, which is 1 hour.
        queueing_policy (RoutePolicy): The queueing policy for selecting the
            backend for a service. (Default: RoutePolicy.Random)
        policy_kwargs (dict): Arguments required to instantiate the queueing
            policy. ``None`` (the default) means no extra arguments.
    """
    # Imported locally so the block does not depend on a module-level import.
    import os

    global global_state

    # Noop if serve was already initialized from this process.
    if global_state is not None:
        return

    # A fresh dict per call: the value is handed to the router, so a shared
    # default instance must not be exposed to it.
    if policy_kwargs is None:
        policy_kwargs = {}

    # Initialize ray if needed. ``ray_init_kwargs`` is only ever unpacked
    # here, so its default dict is never mutated.
    if not ray.is_initialized():
        ray.init(**ray_init_kwargs)

    # If the serve nursery already exists, attach to it instead of starting
    # a second set of components.
    try:
        ray.experimental.get_actor(SERVE_NURSERY_NAME)
        global_state = GlobalState()
        return
    except ValueError:
        pass

    # Register serialization context once
    ray.register_custom_serializer(Query, Query.ray_serialize,
                                   Query.ray_deserialize)

    # Serve has not been initialized, perform init sequence.
    # Only fall back to the SQLite connector when the caller did not supply
    # one; previously the argument was unconditionally overwritten.
    if kv_store_connector is None:
        if kv_store_path is None:
            # Todo, move the db to session_dir
            #    ray.worker._global_node.address_info["session_dir"]
            # mkstemp() hands back an open descriptor; only the path is
            # needed, so close it right away rather than leaking it.
            temp_fd, kv_store_path = mkstemp()
            os.close(temp_fd)

        def _sqlite_connector(namespace):
            return SQLiteKVStore(namespace, db_path=kv_store_path)

        kv_store_connector = _sqlite_connector

    nursery = start_initial_state(kv_store_connector)

    global_state = GlobalState(nursery)
    global_state.init_or_get_http_server(host=http_host, port=http_port)
    global_state.init_or_get_router(queueing_policy=queueing_policy,
                                    policy_kwargs=policy_kwargs)
    global_state.init_or_get_metric_monitor(
        gc_window_seconds=gc_window_seconds)

    if blocking:
        block_until_http_ready("http://{}:{}".format(http_host, http_port))
features, speakers, TRAINING_CHUNK_SIZE) print("training length: {}".format(train_set)) return nspeakers, train_set if __name__ == "__main__": parser = argparse.ArgumentParser("Speech Verification") parser.add_argument("--ray", action='store_true', default=False) parser.add_argument("--data-parallel", action='store_true', default=False) parser.add_argument("--chunks", type=int, default=1) args = parser.parse_args() print(args) ray.init(num_gpus=1) ray.register_custom_serializer(torch.Tensor, serializer=serializer, deserializer=deserializer) # Load train set nspeakers, train_set = load_train_set(args) train_set_id = pin_in_object_store(train_set) print("Loaded train. pinned={}".format(True)) # Load dev set dev_set = load_dev_set(args) dev_set_id = pin_in_object_store(dev_set) print("Loaded dev. pinned={}".format(True)) tune.register_trainable('train_sc', train.Trainer) exp = Experiment(
records.append(Record((w, 1))) return records # Receives an object of type Record and returns the actual tuple def as_tuple(record): return record.record if __name__ == "__main__": # Get program parameters args = parser.parse_args() input_file = str(args.input_file) ray.init() ray.register_custom_serializer(Record, use_dict=True) ray.register_custom_serializer(BatchedQueue, use_pickle=True) ray.register_custom_serializer(OpType, use_pickle=True) ray.register_custom_serializer(PStrategy, use_pickle=True) # A Ray streaming environment with the default configuration env = Environment() env.set_parallelism(2) # Each operator will be executed by two actors # 'key_by("word")' physically partitions the stream of records # based on the hash value of the 'word' attribute (see Record class above) # 'map(as_tuple)' maps a record of type Record into a tuple # 'sum(1)' sums the 2nd element of the tuple, i.e. the word count stream = env.read_text_file(input_file) \ .round_robin() \ .flat_map(splitter) \