def __init__(self, queue_trials=False):
    super(RayTrialExecutor, self).__init__(queue_trials)
    self._running = {}
    # Resuming a paused trial should not call trial.train.remote() again,
    # so no new remote object ID is generated; paused trials are kept in
    # self._paused instead.
    self._paused = {}
    self._avail_resources = Resources(cpu=0, gpu=0)
    self._committed_resources = Resources(cpu=0, gpu=0)
    self._resources_initialized = False
    if ray.is_initialized():
        self._update_avail_resources()
def __init__(self, env_creator, policy_graph, policy_mapping_fn=None, policies_to_train=None, tf_session_creator=None, batch_steps=100, batch_mode="truncate_episodes", episode_horizon=None, preprocessor_pref="deepmind", sample_async=False, compress_observations=False, num_envs=1, observation_filter="NoFilter", clip_rewards=None, clip_actions=True, env_config=None, model_config=None, policy_config=None, worker_index=0, monitor_path=None, log_dir=None, log_level=None, callbacks=None, input_creator=lambda ioctx: ioctx.default_sampler_input(), input_evaluation=frozenset([]), output_creator=lambda ioctx: NoopOutput(), remote_worker_envs=False, async_remote_worker_envs=False): """Initialize a policy evaluator. Arguments: env_creator (func): Function that returns a gym.Env given an EnvContext wrapped configuration. policy_graph (class|dict): Either a class implementing PolicyGraph, or a dictionary of policy id strings to (PolicyGraph, obs_space, action_space, config) tuples. If a dict is specified, then we are in multi-agent mode and a policy_mapping_fn should also be set. policy_mapping_fn (func): A function that maps agent ids to policy ids in multi-agent mode. This function will be called each time a new agent appears in an episode, to bind that agent to a policy for the duration of the episode. policies_to_train (list): Optional whitelist of policies to train, or None for all policies. tf_session_creator (func): A function that returns a TF session. This is optional and only useful with TFPolicyGraph. batch_steps (int): The target number of env transitions to include in each sample batch returned from this evaluator. batch_mode (str): One of the following batch modes: "truncate_episodes": Each call to sample() will return a batch of at most `batch_steps * num_envs` in size. The batch will be exactly `batch_steps * num_envs` in size if postprocessing does not change batch sizes. Episodes may be truncated in order to meet this size requirement. "complete_episodes": Each call to sample() will return a batch of at least `batch_steps * num_envs` in size. Episodes will not be truncated, but multiple episodes may be packed within one batch to meet the batch size. Note that when `num_envs > 1`, episode steps will be buffered until the episode completes, and hence batches may contain significant amounts of off-policy data. episode_horizon (int): Whether to stop episodes at this horizon. preprocessor_pref (str): Whether to prefer RLlib preprocessors ("rllib") or deepmind ("deepmind") when applicable. sample_async (bool): Whether to compute samples asynchronously in the background, which improves throughput but can cause samples to be slightly off-policy. compress_observations (bool): If true, compress the observations. They can be decompressed with rllib/utils/compression. num_envs (int): If more than one, will create multiple envs and vectorize the computation of actions. This has no effect if if the env already implements VectorEnv. observation_filter (str): Name of observation filter to use. clip_rewards (bool): Whether to clip rewards to [-1, 1] prior to experience postprocessing. Setting to None means clip for Atari only. clip_actions (bool): Whether to clip action values to the range specified by the policy action space. env_config (dict): Config to pass to the env creator. model_config (dict): Config to use when creating the policy model. policy_config (dict): Config to pass to the policy. 
In the multi-agent case, this config will be merged with the per-policy configs specified by `policy_graph`. worker_index (int): For remote evaluators, this should be set to a non-zero and unique value. This index is passed to created envs through EnvContext so that envs can be configured per worker. monitor_path (str): Write out episode stats and videos to this directory if specified. log_dir (str): Directory where logs can be placed. log_level (str): Set the root log level on creation. callbacks (dict): Dict of custom debug callbacks. input_creator (func): Function that returns an InputReader object for loading previous generated experiences. input_evaluation (list): How to evaluate the policy performance. This only makes sense to set when the input is reading offline data. The possible values include: - "is": the step-wise importance sampling estimator. - "wis": the weighted step-wise is estimator. - "simulation": run the environment in the background, but use this data for evaluation only and never for learning. output_creator (func): Function that returns an OutputWriter object for saving generated experiences. remote_worker_envs (bool): If using num_envs > 1, whether to create those new envs in remote processes instead of in the current process. This adds overheads, but can make sense if your envs are very CPU intensive (e.g., for StarCraft). async_remote_worker_envs (bool): Similar to remote_worker_envs, but runs the envs asynchronously in the background. """ if log_level: logging.getLogger("ray.rllib").setLevel(log_level) env_context = EnvContext(env_config or {}, worker_index) policy_config = policy_config or {} self.policy_config = policy_config self.callbacks = callbacks or {} model_config = model_config or {} policy_mapping_fn = (policy_mapping_fn or (lambda agent_id: DEFAULT_POLICY_ID)) if not callable(policy_mapping_fn): raise ValueError( "Policy mapping function not callable. 
If you're using Tune, " "make sure to escape the function with tune.function() " "to prevent it from being evaluated as an expression.") self.env_creator = env_creator self.sample_batch_size = batch_steps * num_envs self.batch_mode = batch_mode self.compress_observations = compress_observations self.preprocessing_enabled = True self.env = _validate_env(env_creator(env_context)) if isinstance(self.env, MultiAgentEnv) or \ isinstance(self.env, BaseEnv): def wrap(env): return env # we can't auto-wrap these env types elif is_atari(self.env) and \ not model_config.get("custom_preprocessor") and \ preprocessor_pref == "deepmind": # Deepmind wrappers already handle all preprocessing self.preprocessing_enabled = False if clip_rewards is None: clip_rewards = True def wrap(env): env = wrap_deepmind( env, dim=model_config.get("dim"), framestack=model_config.get("framestack")) if monitor_path: env = _monitor(env, monitor_path) return env else: def wrap(env): if monitor_path: env = _monitor(env, monitor_path) return env self.env = wrap(self.env) def make_env(vector_index): return wrap( env_creator( env_context.copy_with_overrides( vector_index=vector_index, remote=remote_worker_envs))) self.tf_sess = None policy_dict = _validate_and_canonicalize(policy_graph, self.env) self.policies_to_train = policies_to_train or list(policy_dict.keys()) if _has_tensorflow_graph(policy_dict): if (ray.is_initialized() and ray.worker._mode() != ray.worker.LOCAL_MODE and not ray.get_gpu_ids()): logger.info("Creating policy evaluation worker {}".format( worker_index) + " on CPU (please ignore any CUDA init errors)") with tf.Graph().as_default(): if tf_session_creator: self.tf_sess = tf_session_creator() else: self.tf_sess = tf.Session( config=tf.ConfigProto( gpu_options=tf.GPUOptions(allow_growth=True))) with self.tf_sess.as_default(): self.policy_map, self.preprocessors = \ self._build_policy_map(policy_dict, policy_config) else: self.policy_map, self.preprocessors = self._build_policy_map( policy_dict, policy_config) self.multiagent = set(self.policy_map.keys()) != {DEFAULT_POLICY_ID} if self.multiagent: if not (isinstance(self.env, MultiAgentEnv) or isinstance(self.env, BaseEnv)): raise ValueError( "Have multiple policy graphs {}, but the env ".format( self.policy_map) + "{} is not a subclass of MultiAgentEnv?".format(self.env)) self.filters = { policy_id: get_filter(observation_filter, policy.observation_space.shape) for (policy_id, policy) in self.policy_map.items() } # Always use vector env for consistency even if num_envs = 1 self.async_env = BaseEnv.to_base_env( self.env, make_env=make_env, num_envs=num_envs, remote_envs=remote_worker_envs, async_remote_envs=async_remote_worker_envs) self.num_envs = num_envs if self.batch_mode == "truncate_episodes": unroll_length = batch_steps pack_episodes = True elif self.batch_mode == "complete_episodes": unroll_length = float("inf") # never cut episodes pack_episodes = False # sampler will return 1 episode per poll else: raise ValueError("Unsupported batch mode: {}".format( self.batch_mode)) self.io_context = IOContext(log_dir, policy_config, worker_index, self) self.reward_estimators = [] for method in input_evaluation: if method == "simulation": logger.warning( "Requested 'simulation' input evaluation method: " "will discard all sampler outputs and keep only metrics.") sample_async = True elif method == "is": ise = ImportanceSamplingEstimator.create(self.io_context) self.reward_estimators.append(ise) elif method == "wis": wise = WeightedImportanceSamplingEstimator.create( 
self.io_context) self.reward_estimators.append(wise) else: raise ValueError( "Unknown evaluation method: {}".format(method)) if sample_async: self.sampler = AsyncSampler( self.async_env, self.policy_map, policy_mapping_fn, self.preprocessors, self.filters, clip_rewards, unroll_length, self.callbacks, horizon=episode_horizon, pack=pack_episodes, tf_sess=self.tf_sess, clip_actions=clip_actions, blackhole_outputs="simulation" in input_evaluation) self.sampler.start() else: self.sampler = SyncSampler( self.async_env, self.policy_map, policy_mapping_fn, self.preprocessors, self.filters, clip_rewards, unroll_length, self.callbacks, horizon=episode_horizon, pack=pack_episodes, tf_sess=self.tf_sess, clip_actions=clip_actions) self.input_reader = input_creator(self.io_context) assert isinstance(self.input_reader, InputReader), self.input_reader self.output_writer = output_creator(self.io_context) assert isinstance(self.output_writer, OutputWriter), self.output_writer logger.debug("Created evaluator with env {} ({}), policies {}".format( self.async_env, self.env, self.policy_map))
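# A minimal single-agent usage sketch for the constructor above, based on the
# behavior its docstring describes. The PGPolicyGraph import path and the
# CartPole env are assumptions for illustration, not taken from this snippet.
import gym

from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph  # assumed path

evaluator = PolicyEvaluator(
    env_creator=lambda ctx: gym.make("CartPole-v0"),
    policy_graph=PGPolicyGraph,
    batch_steps=100,
    batch_mode="truncate_episodes",
)
batch = evaluator.sample()  # one SampleBatch of roughly batch_steps transitions
print(batch.count)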
def shutdown_only_with_initialization_check():
    yield None
    # The code after the yield will run as teardown code.
    ray.shutdown()
    assert not ray.is_initialized()
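# The generator above reads like a pytest teardown fixture; the tests below
# (test_initialized, test_initialized_local_mode) request it by argument name.
# A sketch of how it could be registered, assuming pytest is used:
import pytest
import ray


@pytest.fixture
def shutdown_only_with_initialization_check():
    yield None
    # Teardown: runs after the test body finishes.
    ray.shutdown()
    assert not ray.is_initialized()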
def _init_ray(self):
    if not ray.is_initialized():
        ray.init(num_cpus=4)
        return frame


def test(source=0):
    """Grab and show video frames without multithreading."""
    frame_process = Worker.remote()
    cap = cv2.VideoCapture(source)
    print("OUTSIDE no threading loop")
    while True:
        (grabbed, frame) = cap.read()
        if not grabbed or cv2.waitKey(1) == ord("q"):
            break
        frame2 = ray.put(frame)
        frame2 = frame_process.run.remote(frame2)
        frame3 = np.asarray(ray.get(frame2))
        print(frame3)
        cv2.imshow("Video", frame3)


def main():
    test(0)


if __name__ == "__main__":
    ray.init()
    print("Ray initialized: ", ray.is_initialized())
    main()
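# The script above calls frame_process.run.remote(...) on a Worker actor whose
# definition is truncated (only a trailing "return frame" survives). A
# hypothetical reconstruction, assuming run() simply transforms and returns
# the frame:
import numpy as np
import ray


@ray.remote
class Worker:
    def run(self, frame):
        # Placeholder processing; the real transformation is not shown in the
        # original snippet.
        frame = np.asarray(frame)
        return frame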
def generateEphemeris(self, orbits, observers, test_orbit=None, threads=NUM_THREADS, chunk_size=100): """ Generate ephemerides for each orbit in orbits as observed by each observer in observers. Parameters ---------- orbits : `~thor.orbits.orbits.Orbits` Orbits for which to generate ephemerides. observers : dict or `~pandas.DataFrame` A dictionary with observatory codes as keys and observation_times (`~astropy.time.core.Time`) as values. test_orbit : `~thor.orbits.orbits.Orbits` Test orbit to use to generate projected coordinates. threads : int, optional Number of processes to launch. chunk_size : int, optional Number of orbits to send to each process. Returns ------- ephemeris : `~pandas.DataFrame` Ephemerides with at least the following columns: orbit_id : Input orbit ID observatory_code : Observatory's MPC code. mjd_utc : Observation time in MJD UTC. RA : Right Ascension in decimal degrees. Dec : Declination in decimal degrees. """ shutdown = False if threads > 1: orbits_split = orbits.split(chunk_size) observers_duplicated = [copy.deepcopy(observers) for i in range(len(orbits_split))] backend_duplicated = [copy.deepcopy(self) for i in range(len(orbits_split))] if USE_RAY: shutdown = False if not ray.is_initialized(): ray.init(num_cpus=threads) shutdown = True p = [] for o, t, b in zip(orbits_split, observers_duplicated, backend_duplicated): p.append(ephemeris_worker.remote(o, t, b)) ephemeris_dfs = ray.get(p) else: p = mp.Pool( processes=threads, initializer=_init_worker, ) ephemeris_dfs = p.starmap( ephemeris_worker, zip( orbits_split, observers_duplicated, backend_duplicated, ) ) p.close() ephemeris = pd.concat(ephemeris_dfs) ephemeris.reset_index( drop=True, inplace=True ) else: ephemeris = self._generateEphemeris( orbits, observers ) if test_orbit is not None: test_orbit_ephemeris = self._generateEphemeris( test_orbit, observers ) ephemeris_grouped = ephemeris.groupby(by=["observatory_code", "mjd_utc"]) ephemeris_split = [ephemeris_grouped.get_group(g).copy() for g in ephemeris_grouped.groups] test_orbit_ephemeris_grouped = test_orbit_ephemeris.groupby(by=["observatory_code", "mjd_utc"]) test_orbit_ephemeris_split = [test_orbit_ephemeris_grouped.get_group(g) for g in test_orbit_ephemeris_grouped.groups] if threads > 1: if USE_RAY: p = [] for e, te in zip(ephemeris_split, test_orbit_ephemeris_split): p.append(projectEphemeris_worker.remote(e, te)) ephemeris_dfs = ray.get(p) else: p = mp.Pool( processes=threads, initializer=_init_worker, ) ephemeris_dfs = p.starmap( projectEphemeris_worker, zip( ephemeris_split, test_orbit_ephemeris_split ) ) p.close() else: ephemeris_dfs = [] for e, te in zip(ephemeris_split, test_orbit_ephemeris_split): ephemeris_df = projectEphemeris_worker(e, te) ephemeris_dfs.append(ephemeris_df) ephemeris = pd.concat(ephemeris_dfs) ephemeris.reset_index( drop=True, inplace=True ) if shutdown: ray.shutdown() return ephemeris
def test_initialized(shutdown_only_with_initialization_check):
    assert not ray.is_initialized()
    ray.init(num_cpus=0)
    assert ray.is_initialized()
def _init_ray(self):
    self.kwargs = {"resources_per_actor": {"actor_cpus": 4.0}}
    if not ray.is_initialized():
        ray.init(address="auto")
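# The "actor_cpus" entry above is a custom Ray resource, so the cluster (or a
# local ray.init) has to declare it before an actor requesting it can be
# scheduled. A minimal local sketch; the TrainingActor name is illustrative:
import ray

if not ray.is_initialized():
    ray.init(resources={"actor_cpus": 8.0})


@ray.remote(resources={"actor_cpus": 4.0})
class TrainingActor:
    def ping(self):
        return "ok"


actor = TrainingActor.remote()
print(ray.get(actor.ping.remote()))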
def train_model_on_task(self, task, task_viz, exp_dir, use_ray, use_ray_logging, grace_period, num_hp_samplings, local_mode, redis_address, lca_n, **training_params): logger.info("Training dashboard: {}".format(get_env_url(task_viz))) t_id = task['id'] trainable = self.get_trainable(use_ray_logging=use_ray_logging) past_tasks = training_params.pop('past_tasks') normalize = training_params.pop('normalize') augment_data = training_params.pop('augment_data') transformations = [] if augment_data: transformations.extend([ transforms.ToPILImage(), transforms.RandomHorizontalFlip(), transforms.RandomCrop(32, 4), transforms.ToTensor() ]) t_trans = [[] for _ in range(len(task['split_names']))] t_trans[0] = transformations datasets = trainable._load_datasets(task, task['loss_fn'], past_tasks, t_trans, normalize) train_loader, eval_loaders = get_classic_dataloaders( datasets, training_params.pop('batch_sizes')) model = self.get_model(task_id=t_id, x_dim=task['x_dim'], n_classes=task['n_classes'], descriptor=task['descriptor'], dataset=eval_loaders[:2]) if use_ray: if not ray.is_initialized(): ray.init(address=redis_address) scheduler = None training_params['loss_fn'] = tune.function( training_params['loss_fn']) training_params['optim_func'] = tune.function(self.optim_func) init_model_path = os.path.join(exp_dir, 'model_initializations') model_file_name = '{}_init.pth'.format(training_params['name']) model_path = os.path.join(init_model_path, model_file_name) torch.save(model, model_path) training_params['model_path'] = model_path config = { **self.get_search_space(), 'training-params': training_params } if use_ray_logging: stop_condition = { 'training_iteration': training_params['n_it_max'] } checkpoint_at_end = False keep_checkpoints_num = 1 checkpoint_score_attr = 'min-Val nll' else: stop_condition = None # loggers = [JsonLogger, MyCSVLogger] checkpoint_at_end = False keep_checkpoints_num = None checkpoint_score_attr = None trainable = rename_class(trainable, training_params['name']) experiment = Experiment( name=training_params['name'], run=trainable, stop=stop_condition, config=config, resources_per_trial=self.ray_resources, num_samples=num_hp_samplings, local_dir=exp_dir, loggers=(JsonLogger, CSVLogger), checkpoint_at_end=checkpoint_at_end, keep_checkpoints_num=keep_checkpoints_num, checkpoint_score_attr=checkpoint_score_attr) analysis = tune.run( experiment, scheduler=scheduler, verbose=1, raise_on_failed_trial=True, # max_failures=-1, # with_server=True, # server_port=4321 ) os.remove(model_path) logger.info("Training dashboard: {}".format(get_env_url(task_viz))) all_trials = {t.logdir: t for t in analysis.trials} best_logdir = analysis.get_best_logdir('Val nll', 'min') best_trial = all_trials[best_logdir] # picked_metric = 'accuracy_0' # metric_names = {s: '{} {}'.format(s, picked_metric) for s in # ['Train', 'Val', 'Test']} logger.info('Best trial: {}'.format(best_trial)) best_res = best_trial.checkpoint.result best_point = (best_res['training_iteration'], best_res['Val nll']) # y_keys = ['mean_loss' if use_ray_logging else 'Val nll', 'train_loss'] y_keys = ['Val nll', 'Train nll'] epoch_key = 'training_epoch' it_key = 'training_iteration' plot_res_dataframe(analysis, training_params['name'], best_point, task_viz, epoch_key, it_key, y_keys) if 'entropy' in next(iter(analysis.trial_dataframes.values())): plot_res_dataframe(analysis, training_params['name'], None, task_viz, epoch_key, it_key, ['entropy']) best_model = self.get_model(task_id=t_id) 
best_model.load_state_dict(torch.load(best_trial.checkpoint.value)) train_accs = analysis.trial_dataframes[best_logdir][ 'Train accuracy_0'] best_t = best_res['training_iteration'] t = best_trial.last_result['training_iteration'] else: search_space = self.get_search_space() rand_config = list(generate_variants(search_space))[0][1] learner_params = rand_config.pop('learner-params', {}) optim_params = rand_config.pop('optim') split_optims = training_params.pop('split_optims') if hasattr(model, 'set_h_params'): model.set_h_params(**learner_params) if hasattr(model, 'train_loader_wrapper'): train_loader = model.train_loader_wrapper(train_loader) loss_fn = task['loss_fn'] if hasattr(model, 'loss_wrapper'): loss_fn = model.loss_wrapper(task['loss_fn']) prepare_batch = _prepare_batch if hasattr(model, 'prepare_batch_wrapper'): prepare_batch = model.prepare_batch_wrapper( prepare_batch, t_id) optim_fact = partial(set_optim_params, optim_func=self.optim_func, optim_params=optim_params, split_optims=split_optims) if hasattr(model, 'train_func'): f = model.train_func t, metrics, b_state_dict = f(train_loader=train_loader, eval_loaders=eval_loaders, optim_fact=optim_fact, loss_fn=loss_fn, split_names=task['split_names'], viz=task_viz, prepare_batch=prepare_batch, **training_params) else: optim = optim_fact(model=model) t, metrics, b_state_dict = train( model=model, train_loader=train_loader, eval_loaders=eval_loaders, optimizer=optim, loss_fn=loss_fn, split_names=task['split_names'], viz=task_viz, prepare_batch=prepare_batch, **training_params) train_accs = metrics['Train accuracy_0'] best_t = b_state_dict['iter'] if 'training_archs' in metrics: plot_trajectory(model.ssn.graph, metrics['training_archs'], model.ssn.stochastic_node_ids, task_viz) weights = model.arch_sampler().squeeze() archs = model.ssn.get_top_archs(weights, 5) list_top_archs(archs, task_viz) list_arch_scores(self.arch_scores[t_id], task_viz) update_summary(self.arch_scores[t_id], task_viz, 'scores') if len(train_accs) > lca_n: lca_accs = [] for i in range(lca_n + 1): if i in train_accs: lca_accs.append(train_accs[i]) else: logger.warning( 'Missing step for {}/{} for lca computation'.format( i, lca_n)) lca = np.mean(lca_accs) else: lca = np.float('nan') stats = {} start = time.time() # train_idx = task['split_names'].index('Train') # train_path = task['data_path'][train_idx] # train_dataset = _load_datasets([train_path])[0] train_dataset = _load_datasets(task, 'Train')[0] stats.update( self.finish_task(train_dataset, t_id, task_viz, path='drawings')) stats['duration'] = { 'iterations': t, 'finish': time.time() - start, 'best_iterations': best_t } stats['params'] = { 'total': self.n_params(t_id), 'new': self.new_params(t_id) } stats['lca'] = lca return stats
def configure_training(self, trainer: Trainer):
    # Reproducibility
    if self.config.training.random_seed:
        set_random_seed(trainer.rank + self.config.training.random_seed)

    # Setup Distributed Training properties
    if self.config.training.num_gpus_per_node > 1:
        # must assume that trainer has rank property
        trainer.master = trainer.rank == 0
        torch.cuda.set_device(trainer.rank)
        trainer.device = torch.device("cuda", trainer.rank)
    else:
        assert trainer.rank == 0
        trainer.master = True
        trainer.device = torch.device("cuda")

    # Initialize Distributed Training
    if self.config.training.num_gpus_per_node > 1:
        # Init distributed
        # TODO: multi-node multi-gpu training
        torch.distributed.init_process_group(
            backend="nccl",
            rank=trainer.rank,
            world_size=self.config.training.num_gpus_per_node)

    # Initialize Dataloader
    # DataLoader
    if trainer.train_loader is None:
        if trainer.train_loader_fn is None:
            logger.error(
                "Please specify either `train_loader` or `train_loader_fn`!"
            )
            raise NotImplementedError
        trainer.train_loader = trainer.train_loader_fn(self.config)

    if self.config.training.total_num_epochs > 0:
        try:
            num_training_batches = len(trainer.train_loader)
            trainer.no_epoch_training = False
            trainer.num_training_batches = num_training_batches
        except TypeError:
            # cannot set the number of total_num_epochs
            # because it is impossible to know
            self.config.training.total_num_epochs = -1
            trainer.no_epoch_training = True
    else:
        trainer.no_epoch_training = True

    # Set Num of Epochs and Num of Iterations
    # if the num of epochs is set
    if self.config.training.total_num_epochs > 0:
        if trainer.no_epoch_training:
            logger.error("Cannot get the length of train dataloader!")
        trainer.total_num_iterations = (
            num_training_batches * self.config.training.total_num_epochs)
        trainer.total_num_epochs = self.config.training.total_num_epochs
    # num of epochs is not set
    else:
        if (self.config.training.total_num_iterations is None
                or self.config.training.total_num_iterations < 0):
            raise NotImplementedError(
                "Please specify the `total_num_epochs` or `total_num_iterations`!"
            )
        else:
            pass
            # trainer.total_num_epochs = trainer.total_num_iterations // num_training_batches

    # Setup validation interval
    if self.config.training.validation_iterations_interval is None or \
            self.config.training.validation_iterations_interval < 0:
        # validation for every epoch
        if not trainer.no_epoch_training:
            self.config.training.validation_iterations_interval = (
                num_training_batches - 1)

    # Ray Initialize
    if trainer.master:
        # close existing logging
        if not ray.is_initialized():
            logger.info(ray.init())

    trainer.model = move_to_device(trainer.model, trainer.device)

    # FP16
    if self.config.training.fp16:
        trainer.model, trainer.optimizer = amp.initialize(
            trainer.model,
            trainer.optimizer,
            opt_level=self.config.training.fp16_opt_level)

    if self.config.training.num_gpus_per_node > 1:
        # Distributed training (should be after apex fp16 initialization)
        trainer.model = DistributedDataParallel(trainer.model,
                                                delay_allreduce=True)
def search( self, search_config, max_time=None, n_iter=10, optimizer="RandomSearch", n_jobs=1, warm_start=False, scatter_init=False, ): """ run search Parameters ---------- search_config: dictionary Defines the search space and links it to the objective function. The objective function is the key of the dictionary, while the search space (which is also a dictionary) is the value. You can define multiple modeles/search-spaces in the search_config. The values within the search space (not search_config) must be lists or numpy arrays. Example: def model_function(para, X, y): model = GradientBoostingClassifier( n_estimators=para["n_estimators"], max_depth=para["max_depth"], ) scores = cross_val_score(model, X, y, cv=3) return scores.mean() search_config = { model_function: { "n_estimators": range(10, 200, 10), "max_depth": range(2, 12), } } max_time: float, optional (default: None) n_iter: int, optional (default: 10) optimizer: string or dict, optional (default: "RandomSearch") n_jobs: int, optional (default: 1) warm_start: dict, optional (default: False) scatter_init: int, optional (default: False) Returns ------- None """ start_time = time.time() self._main_args_.search_args(search_config, max_time, n_iter, optimizer, n_jobs, warm_start, scatter_init) self._opt_args_ = Arguments(self._main_args_.opt_para) optimizer_class = self.optimizer_dict[self._main_args_.optimizer] try: import ray if ray.is_initialized(): ray_ = True else: ray_ = False except ImportError: warnings.warn("failed to import ray", ImportWarning) ray_ = False if ray_: optimizer_class = ray.remote(optimizer_class) opts = [ optimizer_class.remote(self._main_args_, self._opt_args_) for job in range(self._main_args_.n_jobs) ] searches = [ opt.search.remote(job, ray_=ray_) for job, opt in enumerate(opts) ] ray.get(searches) else: self._optimizer_ = optimizer_class(self._main_args_, self._opt_args_) self._optimizer_.search() self.results_params = self._optimizer_.results_params self.results_models = self._optimizer_.results_models self.pos_list = self._optimizer_.pos_list self.score_list = self._optimizer_.score_list self.total_time = time.time() - start_time
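# A sketch of the calling pattern the try/except above enables: if the caller
# has initialized Ray, each of the n_jobs optimizers is wrapped with ray.remote
# and searched in parallel. `opt` stands for an instance of this class and
# `search_config` for the dictionary from the docstring; both are placeholders.
import ray

ray.init(num_cpus=4)
opt.search(search_config, n_iter=20, n_jobs=4)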
def testTuneRestore(self):
    self.assertFalse(ray.is_initialized())
    tune.run("__fake", name="TestAutoInit", stop={"training_iteration": 1})
    self.assertTrue(ray.is_initialized())
def initialize_ray( override_is_cluster=False, override_redis_address: str = None, override_redis_password: str = None, ): """ Initializes ray based on parameters, environment variables and internal defaults. Parameters ---------- override_is_cluster: bool, optional Whether to override the detection of Moding being run in a cluster and always assume this runs on cluster head node. This also overrides Ray worker detection and always runs the function, not only from main thread. If not specified, $MODIN_RAY_CLUSTER env variable is used. override_redis_address: str, optional What Redis address to connect to when running in Ray cluster. If not specified, $MODIN_REDIS_ADDRESS is used. override_redis_password: str, optional What password to use when connecting to Redis. If not specified, a new random one is generated. """ import ray if not ray.is_initialized() or override_is_cluster: import secrets cluster = override_is_cluster or IsRayCluster.get() redis_address = override_redis_address or RayRedisAddress.get() redis_password = override_redis_password or secrets.token_hex(32) if cluster: # We only start ray in a cluster setting for the head node. ray.init( address=redis_address or "auto", include_dashboard=False, ignore_reinit_error=True, _redis_password=redis_password, logging_level=100, ) else: from modin.error_message import ErrorMessage # This string is intentionally formatted this way. We want it indented in # the warning message. ErrorMessage.not_initialized( "Ray", """ import ray ray.init() """, ) object_store_memory = Memory.get() plasma_directory = RayPlasmaDir.get() if IsOutOfCore.get(): if plasma_directory is None: from tempfile import gettempdir plasma_directory = gettempdir() # We may have already set the memory from the environment variable, we don't # want to overwrite that value if we have. if object_store_memory is None: # Round down to the nearest Gigabyte. try: system_memory = ray._private.utils.get_system_memory() except AttributeError: # Compatibility with Ray <= 1.2 system_memory = ray.utils.get_system_memory() mem_bytes = system_memory // 10**9 * 10**9 # Default to 8x memory for out of core object_store_memory = 8 * mem_bytes # In case anything failed above, we can still improve the memory for Modin. if object_store_memory is None: # Round down to the nearest Gigabyte. try: system_memory = ray._private.utils.get_system_memory() except AttributeError: # Compatibility with Ray <= 1.2 system_memory = ray.utils.get_system_memory() object_store_memory = int(0.6 * system_memory // 10**9 * 10**9) # If the memory pool is smaller than 2GB, just use the default in ray. if object_store_memory == 0: object_store_memory = None else: object_store_memory = int(object_store_memory) ray.init( num_cpus=CpuCount.get(), include_dashboard=False, ignore_reinit_error=True, _plasma_directory=plasma_directory, object_store_memory=object_store_memory, address=redis_address, _redis_password=redis_password, logging_level=100, _memory=object_store_memory, _lru_evict=True, ) _move_stdlib_ahead_of_site_packages() ray.worker.global_worker.run_function_on_all_workers( _move_stdlib_ahead_of_site_packages) ray.worker.global_worker.run_function_on_all_workers(_import_pandas) num_cpus = int(ray.cluster_resources()["CPU"]) NPartitions.put_if_default(num_cpus)
def initialOrbitDetermination(observations, linkage_members, observation_selection_method='combinations', min_obs=6, rchi2_threshold=10**3, contamination_percentage=20.0, iterate=False, light_time=True, linkage_id_col="cluster_id", identify_subsets=True, threads=NUM_THREADS, backend="PYOORB", backend_kwargs={}, verbose=True): """ Run initial orbit determination on linkages found in observations. Parameters ---------- observations : `~pandas.DataFrame` Dataframe of observations with at least the following columns: "obs_id" : Observation IDs [str], "mjd_utc" : Observation time in MJD UTC [float], "RA_deg" : equatorial J2000 Right Ascension in degrees [float], "Dec_deg" : equatorial J2000 Declination in degrees [float], "RA_sigma_deg" : 1-sigma uncertainty in equatorial J2000 RA [float], "Dec_sigma_deg" : 1 sigma uncertainty in equatorial J2000 Dec [float], "observatory_code" : MPC recognized observatory code [str], "obs_x" : Observatory's heliocentric ecliptic J2000 x-position in au [float], "obs_y" : Observatory's heliocentric ecliptic J2000 y-position in au [float], "obs_z" : Observatory's heliocentric ecliptic J2000 z-position in au [float], "obs_vx" [Optional] : Observatory's heliocentric ecliptic J2000 x-velocity in au per day [float], "obs_vy" [Optional] : Observatory's heliocentric ecliptic J2000 y-velocity in au per day [float], "obs_vz" [Optional] : Observatory's heliocentric ecliptic J2000 z-velocity in au per day [float] linkage_members : `~pandas.DataFrame` Dataframe of linkages with at least two columns: "linkage_id" : Linkage ID [str], "obs_id" : Observation IDs [str], one ID per row. observation_selection_method : {'first+middle+last', 'thirds', 'combinations'}, optional Selects which three observations to use for IOD depending on the method. The avaliable methods are: 'first+middle+last' : Grab the first, middle and last observations in time. 'thirds' : Grab the middle observation in the first third, second third, and final third. 'combinations' : Return the observation IDs corresponding to every possible combination of three observations with non-coinciding observation times. min_obs : int, optional Minimum number of observations that must remain in the linkage. For example, if min_obs is set to 6 and a linkage has 8 observations, at most the two worst observations will be flagged as outliers. Only up t o the contamination percentage of observations of will be flagged as outliers, provided that at least min_obs observations remain in the linkage. rchi2_threshold : float, optional Minimum reduced chi2 for an initial orbit to be accepted. If an orbit contamination_percentage : float, optional Maximum percent of observations that can flagged as outliers. iterate : bool, optional Iterate the preliminary orbit solution using the state transition iterator. light_time : bool, optional Correct preliminary orbit for light travel time. linkage_id_col : str, optional Name of linkage_id column in the linkage_members dataframe. threads : int, optional Number of threads to use for multiprocessing. backend : {'MJOLNIR', 'PYOORB'}, optional Which backend to use for ephemeris generation. backend_kwargs : dict, optional Settings and additional parameters to pass to selected backend. Returns ------- iod_orbits : `~pandas.DataFrame` Dataframe with orbits found in linkages. 
"orbit_id" : Orbit ID, a uuid [str], "mjd_tdb" : Epoch at which orbit is defined in MJD TDB [float], "x" : Orbit's ecliptic J2000 x-position in au [float], "y" : Orbit's ecliptic J2000 y-position in au [float], "z" : Orbit's ecliptic J2000 z-position in au [float], "vx" : Orbit's ecliptic J2000 x-velocity in au per day [float], "vy" : Orbit's ecliptic J2000 y-velocity in au per day [float], "vz" : Orbit's ecliptic J2000 z-velocity in au per day [float], "arc_length" : Arc length in days [float], "num_obs" : Number of observations that were within the chi2 threshold of the orbit. "chi2" : Total chi2 of the orbit calculated using the predicted location of the orbit on the sky compared to the consituent observations. iod_orbit_members : `~pandas.DataFrame` Dataframe of orbit members with the following columns: "orbit_id" : Orbit ID, a uuid [str], "obs_id" : Observation IDs [str], one ID per row. "residual_ra_arcsec" : Residual (observed - expected) equatorial J2000 Right Ascension in arcseconds [float] "residual_dec_arcsec" : Residual (observed - expected) equatorial J2000 Declination in arcseconds [float] "chi2" : Observation's chi2 [float] "gauss_sol" : Flag to indicate which observations were used to calculate the Gauss soluton [int] "outlier" : Flag to indicate which observations are potential outliers (their chi2 is higher than the chi2 threshold) [float] """ time_start = time.time() if verbose: print("THOR: initialOrbitDetermination") print("-------------------------------") print("Running initial orbit determination...") print("Observation selection method: {}".format( observation_selection_method)) print("Using {} threads...".format(threads)) if len(observations) > 0 and len(linkage_members) > 0: linked_observations = linkage_members.merge(observations, on="obs_id").copy() linked_observations.sort_values(by=[linkage_id_col, "mjd_utc"], inplace=True) linked_observations.reset_index(drop=True, inplace=True) grouped_observations = linked_observations.groupby(by=[linkage_id_col]) observations_split = [ grouped_observations.get_group(g).copy() for g in grouped_observations.groups ] if threads > 1: if USE_RAY: shutdown = False if not ray.is_initialized(): ray.init(num_cpus=threads) shutdown = True p = [] for observations_i in observations_split: p.append( iod_worker.remote( observations_i, observation_selection_method= observation_selection_method, rchi2_threshold=rchi2_threshold, min_obs=min_obs, contamination_percentage=contamination_percentage, iterate=iterate, light_time=light_time, linkage_id_col=linkage_id_col, backend=backend, backend_kwargs=backend_kwargs)) iod_orbits_dfs, iod_orbit_members_dfs = ray.get(p) if shutdown: ray.shutdown() else: p = mp.Pool( processes=threads, initializer=_init_worker, ) results = p.starmap( partial(iod_worker, observation_selection_method= observation_selection_method, rchi2_threshold=rchi2_threshold, min_obs=min_obs, contamination_percentage=contamination_percentage, iterate=iterate, light_time=light_time, linkage_id_col=linkage_id_col, backend=backend, backend_kwargs=backend_kwargs), zip(observations_split, )) p.close() results = list(zip(*results)) iod_orbits_dfs = results[0] iod_orbit_members_dfs = results[1] else: iod_orbits_dfs = [] iod_orbit_members_dfs = [] for i, observations_i in enumerate(observations_split): iod_orbits_df, iod_orbit_members_df = iod_worker( observations_i, observation_selection_method=observation_selection_method, rchi2_threshold=rchi2_threshold, min_obs=min_obs, contamination_percentage=contamination_percentage, 
iterate=iterate, light_time=light_time, linkage_id_col=linkage_id_col, backend=backend, backend_kwargs=backend_kwargs) iod_orbits_dfs.append(iod_orbits_df) iod_orbit_members_dfs.append(iod_orbit_members_df) iod_orbits = pd.concat(iod_orbits_dfs) iod_orbits.reset_index(inplace=True, drop=True) iod_orbit_members = pd.concat(iod_orbit_members_dfs) iod_orbit_members.reset_index(inplace=True, drop=True) if verbose: print("Found {} initial orbits.".format(len(iod_orbits))) print() if identify_subsets: if verbose: print("Identifying subsets...") iod_orbits, iod_orbit_members = identifySubsetLinkages( iod_orbits, iod_orbit_members, linkage_id_col="orbit_id") if verbose: print("Done. {} subset orbits identified.".format( len(iod_orbits[~iod_orbits["subset_of"].isna()]))) print() else: iod_orbits = pd.DataFrame(columns=[ "orbit_id", "mjd_tdb", "x", "y", "z", "vx", "vy", "vz", "arc_length", "num_obs", "chi2", "rchi2" ]) iod_orbit_members = pd.DataFrame(columns=[ "orbit_id", "obs_id", "residual_ra_arcsec", "residual_dec_arcsec", "chi2", "gauss_sol", "outlier" ]) time_end = time.time() if verbose: print("Total time in seconds: {}".format(time_end - time_start)) print("-------------------------------") print("") return iod_orbits, iod_orbit_members
def train(self, filename, iterations, species=None, speciesfile=None, single_chr=None, opt="mo"): # ###################################################################################################### # Get Logger Info if hasattr(self.logger.handlers[0], "baseFilename"): name = self.logger.handlers[0].baseFilename else: name = None # ###################################################################################################### # Defining Ray Environment processors = multiprocessing.cpu_count() if processors > 22: if get_system_memory() < 80e9: memo = get_system_memory() self.logger.info("Recommended Memory > 80GB".format(memo)) else: memo = int(150e9) else: self.logger.info("Number of Recommended Processors is > 22".format(int(processors))) memo = get_system_memory() self.logger.info("Running with {0} processors. Size of Plasma Storage {1}".format(int(processors), memo)) if not ray.is_initialized(): ray.init(num_cpus=int(processors) - 1, object_store_memory=int(100e9)) # ray.init(num_cpus=int(processors) - 1) # ###################################################################################################### # Running Regions self.logger.info("Training on Regions") results = [] chromosome = [Trainer.remote(-1, filename, species, speciesfile, self.blacklisted, self.states, self.prior, self.top_states, logger=logging.getLogger().getEffectiveLevel(), log_file=name, datatype=self.datatype)] results.append(chromosome[0].train.remote(iterations=iterations, msg="Th17 Regions: ")) # Collect Results res, states = ray.get(results[0]) for l_ in np.arange(len(res[0])): self.annotations.append(res[0][l_]) self.annotations_chr.append(res[1][l_]) self.annotations_start.append(res[2][l_]) self.annotations_length.append(res[3][l_]) posterior = ray.get(chromosome[0].get_posterior.remote()) self.elbo = ray.get(chromosome[0].get_elbo.remote()) # Validate Results self.validate_regions() # ###################################################################################################### # Running Chromosomes if not self.compute_regions: # Prepare data structures and carry over states from Regions self.annotations = [] self.annotations_chr = [] self.annotations_start = [] for i_ in np.arange(len(self.states)): if isinstance(self.states[0], type(states[0])): self.states[i_] = copy.deepcopy(states[i_]) else: self.states[i_] = copy.deepcopy(states[0][i_]) self.states[i_].prior = self.states[i_].posterior # Prune chromosomes chr_list = data_handle.validate_chr(filename, species, speciesfile, chrom_list=single_chr, datatype=self.datatype) # Run Training in parallel while len(chr_list) > 0: results = [] chromosome = [] num_task = np.min([processors, len(chr_list)]) for i_ in np.arange(num_task): chr_ = chr_list[0] self.logger.info("{}: Submitting job to Queue".format(chr_)) chromosome.append(Trainer.remote(chr_, filename, species, speciesfile, self.blacklisted, self.states, self.prior, self.top_states, pi=posterior.pi, tmat=posterior.tmat, logger=logging.getLogger().getEffectiveLevel(), log_file=name, datatype=self.datatype)) results.append(chromosome[i_].train.remote(iterations=iterations, msg="{}: ".format(chr_))) chr_list.remove(chr_) """ unfinished = results while len(unfinished) > 0: finished, unfinished = ray.wait(unfinished) for r_ in finished: res, _ = ray.get(r_) for l_ in np.arange(len(res[0])): self.annotations.append(res[0][l_]) self.annotations_chr.append(res[1][l_]) self.annotations_start.append(res[2][l_]) self.annotations_length.append(res[3][l_]) """ for r_ in reversed(results): res, 
_ = ray.get(r_) for l_ in np.arange(len(res[0])): self.annotations.append(res[0][l_]) self.annotations_chr.append(res[1][l_]) self.annotations_start.append(res[2][l_]) self.annotations_length.append(res[3][l_]) # Clean Ray ray.shutdown()
def ppipe(
    records: Iterator[dict],
    *funcs,
    records_in_memory: int or None = None,
    processes: int or None = None,
) -> Iterator[dict]:
    """
    A multi-threaded parallel pipe (or ppipe) which pipes records through it in
    parallel. Note that each record is still handled by a chain of p functions
    in sequence. You can think of a ppipe as a set of multiple parallel pipes.

    Args:
        records: An iterator of dictionaries. We call these dictionaries
            "records" throughout chunkyp.
        *funcs: A list of p functions to be applied to the records.
        records_in_memory: The number of records to pass to each one of the
            parallel pipes.
        processes: The number of pipes to launch in parallel.
            (Default: #logical_cores - 1)

    Returns:
        A generator of the resulting records modified by the p functions.
    """
    ray_started_by_method = False

    # initialize ray if user hasn't already
    if not ray.is_initialized():
        logging.warning('Ray is not running. Starting ray!')
        ray.init()
        ray_started_by_method = True

    # set number of processes if user hasn't already
    if processes is None:
        # number of logical cores (threads) on system
        processes = psutil.cpu_count(logical=True) - 1

    # if records_in_memory is not set AND a record generator is passed - unwind it into a list
    if records_in_memory is None and (isinstance(records, types.GeneratorType)
                                      or isinstance(records, itertools.chain)):
        print('unwinding!')
        records = list(records)

    # prepare batches
    if records_in_memory is None:
        batches = [records]
    else:
        batches = chunk_iterator(n=records_in_memory, iterable=records)

    # define a ray_pipe which wraps a _ppipe function which actually handles the records
    def _ppipe(records: Iterator[dict], *funcs):
        result = records
        for f in funcs:
            # f[1] selects _pp: the parallel p function
            result = f[1](records=result)
        return result

    ray_pipe = ray.remote(_ppipe)

    for batch in batches:
        batch = list(batch)  # read into memory
        print('batch_len:', len(batch))

        # chunk so we can parallelize
        chunks = chunk_iterator(n=math.ceil(len(batch) / processes),
                                iterable=batch)

        futures = [ray_pipe.remote(list(chunk), *funcs) for chunk in chunks]
        results = ray.get(futures)

        for result in results:
            for record in result:
                yield record

    if ray_started_by_method is True:
        ray.shutdown()
def setUp(self):
    if not ray.is_initialized():
        ray.init(num_cpus=1)
def start(detached: bool = False,
          http_host: str = DEFAULT_HTTP_HOST,
          http_port: int = DEFAULT_HTTP_PORT,
          http_middlewares: List[Any] = []) -> Client:
    """Initialize a serve instance.

    By default, the instance will be scoped to the lifetime of the returned
    Client object (or when the script exits). If detached is set to True, the
    instance will instead persist until client.shutdown() is called and
    clients to it can be connected using serve.connect(). This is only
    relevant if connecting to a long-running Ray cluster (e.g., with
    address="auto").

    Args:
        detached (bool): Whether or not the instance should be detached from
            this script.
        http_host (str): Host for HTTP servers to listen on. Defaults to
            "127.0.0.1". To expose Serve publicly, you probably want to set
            this to "0.0.0.0". One HTTP server will be started on each node
            in the Ray cluster.
        http_port (int): Port for HTTP server. Defaults to 8000.
        http_middlewares (list): A list of Starlette middlewares that will be
            applied to the HTTP servers in the cluster.
    """
    # Initialize ray if needed.
    if not ray.is_initialized():
        ray.init()

    # Try to get serve controller if it exists
    if detached:
        controller_name = SERVE_CONTROLLER_NAME
        try:
            ray.get_actor(controller_name)
            raise RayServeException("Called serve.start(detached=True) but a "
                                    "detached instance is already running. "
                                    "Please use serve.connect() to connect to "
                                    "the running instance instead.")
        except ValueError:
            pass
    else:
        controller_name = format_actor_name(SERVE_CONTROLLER_NAME,
                                            get_random_letters())

    controller = ServeController.options(
        name=controller_name,
        lifetime="detached" if detached else None,
        max_restarts=-1,
        max_task_retries=-1,
    ).remote(controller_name,
             http_host,
             http_port,
             http_middlewares,
             detached=detached)

    futures = []
    for node_id in ray.state.node_ids():
        future = block_until_http_ready.options(
            num_cpus=0, resources={
                node_id: 0.01
            }).remote(
                "http://{}:{}/-/routes".format(http_host, http_port),
                timeout=HTTP_PROXY_TIMEOUT)
        futures.append(future)
    ray.get(futures)

    return Client(controller, controller_name, detached=detached)
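# A usage sketch based only on the docstring above: a detached instance
# persists beyond this script and can later be re-attached with
# serve.connect().
import ray
from ray import serve

ray.init(address="auto")  # connect to a long-running cluster
client = serve.start(detached=True, http_host="0.0.0.0", http_port=8000)
# ...later, from another driver on the same cluster:
# client = serve.connect()
# client.shutdown()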
def initialize_ray( override_is_cluster=False, override_redis_address: str = None, override_redis_password: str = None, ): """ Initialize Ray based on parameters, ``modin.config`` variables and internal defaults. Parameters ---------- override_is_cluster : bool, default: False Whether to override the detection of Modin being run in a cluster and always assume this runs on cluster head node. This also overrides Ray worker detection and always runs the initialization function (runs from main thread only by default). If not specified, ``modin.config.IsRayCluster`` variable is used. override_redis_address : str, optional What Redis address to connect to when running in Ray cluster. If not specified, ``modin.config.RayRedisAddress`` is used. override_redis_password : str, optional What password to use when connecting to Redis. If not specified, ``modin.config.RayRedisPassword`` is used. """ if not ray.is_initialized() or override_is_cluster: cluster = override_is_cluster or IsRayCluster.get() redis_address = override_redis_address or RayRedisAddress.get() redis_password = ( (ray.ray_constants.REDIS_DEFAULT_PASSWORD if cluster else RayRedisPassword.get()) if override_redis_password is None and RayRedisPassword.get_value_source() == ValueSource.DEFAULT else override_redis_password or RayRedisPassword.get()) if cluster: # We only start ray in a cluster setting for the head node. ray.init( address=redis_address or "auto", include_dashboard=False, ignore_reinit_error=True, _redis_password=redis_password, ) else: from modin.error_message import ErrorMessage # This string is intentionally formatted this way. We want it indented in # the warning message. ErrorMessage.not_initialized( "Ray", """ import ray ray.init() """, ) object_store_memory = Memory.get() # In case anything failed above, we can still improve the memory for Modin. if object_store_memory is None: virtual_memory = psutil.virtual_memory().total if sys.platform.startswith("linux"): shm_fd = os.open("/dev/shm", os.O_RDONLY) try: shm_stats = os.fstatvfs(shm_fd) system_memory = shm_stats.f_bsize * shm_stats.f_bavail if system_memory / (virtual_memory / 2) < 0.99: warnings.warn( f"The size of /dev/shm is too small ({system_memory} bytes). The required size " + f"at least half of RAM ({virtual_memory // 2} bytes). Please, delete files in /dev/shm or " + "increase size of /dev/shm with --shm-size in Docker. Also, you can set " + "the required memory size for each Ray worker in bytes to MODIN_MEMORY environment variable." ) finally: os.close(shm_fd) else: system_memory = virtual_memory object_store_memory = int(0.6 * system_memory // 1e9 * 1e9) # If the memory pool is smaller than 2GB, just use the default in ray. if object_store_memory == 0: object_store_memory = None else: object_store_memory = int(object_store_memory) mac_size_limit = getattr(ray.ray_constants, "MAC_DEGRADED_PERF_MMAP_SIZE_LIMIT", None) if (sys.platform == "darwin" and mac_size_limit is not None and object_store_memory > mac_size_limit): warnings.warn( "On Macs, Ray's performance is known to degrade with " + "object store size greater than " + f"{mac_size_limit / 2 ** 30:.4} GiB. Ray by default does " + "not allow setting an object store size greater than " + "that. Modin is overriding that default limit because " + "it would rather have a larger, slower object store " + "than spill to disk more often. 
To override Modin's " + "behavior, you can initialize Ray yourself.") os.environ["RAY_ENABLE_MAC_LARGE_OBJECT_STORE"] = "1" ray_init_kwargs = { "num_cpus": CpuCount.get(), "num_gpus": GpuCount.get(), "include_dashboard": False, "ignore_reinit_error": True, "object_store_memory": object_store_memory, "_redis_password": redis_password, "_memory": object_store_memory, } ray.init(**ray_init_kwargs) if StorageFormat.get() == "Cudf": from modin.core.execution.ray.implementations.cudf_on_ray.partitioning import ( GPUManager, GPU_MANAGERS, ) # Check that GPU_MANAGERS is empty because _update_engine can be called multiple times if not GPU_MANAGERS: for i in range(GpuCount.get()): GPU_MANAGERS.append(GPUManager.remote(i)) _move_stdlib_ahead_of_site_packages() ray.worker.global_worker.run_function_on_all_workers( _move_stdlib_ahead_of_site_packages) ray.worker.global_worker.run_function_on_all_workers(_import_pandas) num_cpus = int(ray.cluster_resources()["CPU"]) num_gpus = int(ray.cluster_resources().get("GPU", 0)) if StorageFormat.get() == "Cudf": NPartitions._put(num_gpus) else: NPartitions._put(num_cpus)
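# The user-facing flow around this initializer, following the ErrorMessage
# emitted above: either pre-start Ray yourself to control resources, or let
# Modin call initialize_ray() lazily with its defaults on first use.
import ray

ray.init()

import modin.pandas as pd  # noqa: E402

df = pd.DataFrame({"a": [1, 2, 3]})
print(df.sum())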
def start( detached: bool = False, http_host: Optional[str] = DEFAULT_HTTP_HOST, http_port: int = DEFAULT_HTTP_PORT, http_middlewares: List[Any] = [], http_options: Optional[Union[dict, HTTPOptions]] = None, ) -> Client: """Initialize a serve instance. By default, the instance will be scoped to the lifetime of the returned Client object (or when the script exits). If detached is set to True, the instance will instead persist until client.shutdown() is called and clients to it can be connected using serve.connect(). This is only relevant if connecting to a long-running Ray cluster (e.g., with address="auto"). Args: detached (bool): Whether not the instance should be detached from this script. http_host (Optional[str]): Deprecated, use http_options instead. http_port (int): Deprecated, use http_options instead. http_middlewares (list): Deprecated, use http_options instead. http_options (Optional[Dict, serve.HTTPOptions]): Configuration options for HTTP proxy. You can pass in a dictionary or HTTPOptions object with fields: - host(str, None): Host for HTTP servers to listen on. Defaults to "127.0.0.1". To expose Serve publicly, you probably want to set this to "0.0.0.0". - port(int): Port for HTTP server. Defaults to 8000. - middlewares(list): A list of Starlette middlewares that will be applied to the HTTP servers in the cluster. - location(str, serve.config.DeploymentMode): The deployment location of HTTP servers: - "HeadOnly": start one HTTP server on the head node. Serve assumes the head node is the node you executed serve.start on. This is the default. - "EveryNode": start one HTTP server per node. - "NoServer" or None: disable HTTP server. """ if ((http_host != DEFAULT_HTTP_HOST) or (http_port != DEFAULT_HTTP_PORT) or (len(http_middlewares) != 0)): if http_options is not None: raise ValueError( "You cannot specify both `http_options` and any of the " "`http_host`, `http_port`, and `http_middlewares` arguments. " "`http_options` is preferred.") else: warn( "`http_host`, `http_port`, `http_middlewares` are deprecated. " "Please use serve.start(http_options={'host': ..., " "'port': ..., middlewares': ...}) instead.", DeprecationWarning, ) # Initialize ray if needed. if not ray.is_initialized(): ray.init() # Try to get serve controller if it exists if detached: controller_name = SERVE_CONTROLLER_NAME try: ray.get_actor(controller_name) raise RayServeException("Called serve.start(detached=True) but a " "detached instance is already running. " "Please use serve.connect() to connect to " "the running instance instead.") except ValueError: pass else: controller_name = format_actor_name(SERVE_CONTROLLER_NAME, get_random_letters()) if isinstance(http_options, dict): http_options = HTTPOptions.parse_obj(http_options) if http_options is None: http_options = HTTPOptions(host=http_host, port=http_port, middlewares=http_middlewares) controller = ServeController.options( name=controller_name, lifetime="detached" if detached else None, max_restarts=-1, max_task_retries=-1, # Pin Serve controller on the head node. resources={ get_current_node_resource_key(): 0.01 }, ).remote( controller_name, http_options, detached=detached, ) proxy_handles = ray.get(controller.get_http_proxies.remote()) if len(proxy_handles) > 0: try: ray.get( [handle.ready.remote() for handle in proxy_handles.values()], timeout=HTTP_PROXY_TIMEOUT, ) except ray.exceptions.GetTimeoutError: raise TimeoutError( "HTTP proxies not available after {HTTP_PROXY_TIMEOUT}s.") return Client(controller, controller_name, detached=detached)
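# A sketch of the newer calling convention described above, passing
# http_options as a plain dict with the fields named in the docstring.
from ray import serve

client = serve.start(
    detached=True,
    http_options={"host": "0.0.0.0", "port": 8000, "location": "EveryNode"},
)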
def test_initialized_local_mode(shutdown_only_with_initialization_check):
    assert not ray.is_initialized()
    ray.init(num_cpus=0, local_mode=True)
    assert ray.is_initialized()
def setUp(self) -> None:
    if not ray.is_initialized():
        ray.init()
def propagateOrbits(self, orbits, t1, threads=NUM_THREADS, chunk_size=100):
    """
    Propagate each orbit in orbits to each time in t1.

    Parameters
    ----------
    orbits : `~thor.orbits.orbits.Orbits`
        Orbits to propagate.
    t1 : `~astropy.time.core.Time`
        Times to which to propagate each orbit.
    threads : int, optional
        Number of processes to launch.
    chunk_size : int, optional
        Number of orbits to send to each process.

    Returns
    -------
    propagated : `~pandas.DataFrame`
        Propagated orbits with at least the following columns:
            orbit_id : Input orbit ID.
            epoch_mjd_tdb : Time at which state is defined in MJD TDB.
            x, y, z, vx, vy, vz : Orbit as cartesian state vector with
                units of au and au per day.
    """
    if threads > 1:
        orbits_split = orbits.split(chunk_size)
        t1_duplicated = [copy.deepcopy(t1) for i in range(len(orbits_split))]
        backend_duplicated = [copy.deepcopy(self) for i in range(len(orbits_split))]

        if USE_RAY:
            shutdown = False
            if not ray.is_initialized():
                ray.init(num_cpus=threads)
                shutdown = True
            p = []
            for o, t, b in zip(orbits_split, t1_duplicated, backend_duplicated):
                p.append(propagation_worker.remote(o, t, b))
            propagated_dfs = ray.get(p)

            if shutdown:
                ray.shutdown()
        else:
            p = mp.Pool(
                processes=threads,
                initializer=_init_worker,
            )
            propagated_dfs = p.starmap(
                propagation_worker,
                zip(
                    orbits_split,
                    t1_duplicated,
                    backend_duplicated,
                )
            )
            p.close()

        propagated = pd.concat(propagated_dfs)
        propagated.reset_index(
            drop=True,
            inplace=True
        )
    else:
        propagated = self._propagateOrbits(
            orbits,
            t1
        )

    return propagated
import ray
import time

from crawler_distributed import run_distributed_crawler
from crawler import run_non_distributed_crawler

# pip install requests==2.22.0 beautifulsoup4==4.8.1 ray==0.8.4

if __name__ == "__main__":
    address = ray.init(webui_host="0.0.0.0", num_cpus=16)
    print(address["webui_url"])
    assert ray.is_initialized()

    # warmup
    @ray.remote
    def f():
        return 3

    ray.get([f.remote() for _ in range(1000)])

    non_distributed_start = time.perf_counter()
    total_visited_non_distributed = run_non_distributed_crawler()
    non_distributed_end = time.perf_counter()

    distributed_start = time.perf_counter()
    total_visited_distributed = run_distributed_crawler()
    distributed_end = time.perf_counter()

    # Make sure they return the same result.
    print(
        f"non distributed crawler takes {non_distributed_end - non_distributed_start}"
def tearDown(self) -> None:
    if ray.is_initialized():
        ray.shutdown()
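# The setUp/tearDown pair above is the usual unittest arrangement for Ray
# tests; a minimal sketch of the two hooks inside one test case (class and
# test names are illustrative):
import unittest

import ray


class RayTestCase(unittest.TestCase):
    def setUp(self) -> None:
        if not ray.is_initialized():
            ray.init()

    def tearDown(self) -> None:
        if ray.is_initialized():
            ray.shutdown()

    def test_cluster_is_up(self):
        # Any test in this class can rely on a running Ray instance.
        self.assertTrue(ray.is_initialized())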
def test_no_auto_init(shutdown_only):
    assert not ray.is_initialized()
    ray.get_runtime_context()
    assert not ray.is_initialized()
def ensure_ray_initialized():
    if not ray.is_initialized():
        ray.init()
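# Because of the is_initialized() guard, the helper is idempotent: only the
# first call actually starts Ray.
ensure_ray_initialized()
ensure_ray_initialized()  # no-op the second time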
model = get_keras_model(model_name, input_shape=args.input_shape)
tf.keras.utils.plot_model(model,
                          to_file=log_base / "plot_{}.png".format(model_name),
                          show_shapes=True,
                          show_layer_names=True)

platform_ram = platform_memory("p32xlarge")
bs_futures = defaultdict(list)  # type: Dict[int, List]
bs_param_ram_cost = {}  # type: Dict[int, int]
bs_fwd2xcost = {}  # type: Dict[int, int]
rg = list(
    range(args.batch_size_min, args.batch_size_max,
          args.batch_size_increment))
for bs in tqdm(rg, desc="Event dispatch"):
    while not ray.is_initialized():
        ray.init(
            temp_dir="/tmp/ray_checkpoint_" + str(str(uuid.uuid4())[:8]),
            redis_password=str(uuid.uuid1()),
            num_cpus=os.cpu_count() - 2,
        )
    futures = []

    # load model at batch size
    g = dfgraph_from_keras(model,
                           batch_size=bs,
                           cost_model=cost_model,
                           loss_cpu_cost=0,
                           loss_ram_cost=(4 * bs))
    bs_fwd2xcost[bs] = sum(g.cost_cpu_fwd.values()) + sum(
        g.cost_cpu.values())
def __init__(self,
             env_creator,
             policy,
             policy_mapping_fn=None,
             policies_to_train=None,
             tf_session_creator=None,
             rollout_fragment_length=100,
             batch_mode="truncate_episodes",
             episode_horizon=None,
             preprocessor_pref="deepmind",
             sample_async=False,
             compress_observations=False,
             num_envs=1,
             observation_fn=None,
             observation_filter="NoFilter",
             clip_rewards=None,
             clip_actions=True,
             env_config=None,
             model_config=None,
             policy_config=None,
             worker_index=0,
             num_workers=0,
             monitor_path=None,
             log_dir=None,
             log_level=None,
             callbacks=None,
             input_creator=lambda ioctx: ioctx.default_sampler_input(),
             input_evaluation=frozenset([]),
             output_creator=lambda ioctx: NoopOutput(),
             remote_worker_envs=False,
             remote_env_batch_wait_ms=0,
             soft_horizon=False,
             no_done_at_end=False,
             seed=None,
             extra_python_environs=None,
             fake_sampler=False):
    """Initialize a rollout worker.

    Arguments:
        env_creator (func): Function that returns a gym.Env given an
            EnvContext wrapped configuration.
        policy (class|dict): Either a class implementing Policy, or a
            dictionary of policy id strings to
            (Policy, obs_space, action_space, config) tuples. If a dict is
            specified, then we are in multi-agent mode and a
            policy_mapping_fn should also be set.
        policy_mapping_fn (func): A function that maps agent ids to
            policy ids in multi-agent mode. This function will be called
            each time a new agent appears in an episode, to bind that agent
            to a policy for the duration of the episode.
        policies_to_train (list): Optional list of policies to train, or
            None for all policies.
        tf_session_creator (func): A function that returns a TF session.
            This is optional and only useful with TFPolicy.
        rollout_fragment_length (int): The target number of env transitions
            to include in each sample batch returned from this worker.
        batch_mode (str): One of the following batch modes:
            "truncate_episodes": Each call to sample() will return a batch
                of at most `rollout_fragment_length * num_envs` in size.
                The batch will be exactly
                `rollout_fragment_length * num_envs` in size if
                postprocessing does not change batch sizes. Episodes may be
                truncated in order to meet this size requirement.
            "complete_episodes": Each call to sample() will return a batch
                of at least `rollout_fragment_length * num_envs` in size.
                Episodes will not be truncated, but multiple episodes may
                be packed within one batch to meet the batch size. Note
                that when `num_envs > 1`, episode steps will be buffered
                until the episode completes, and hence batches may contain
                significant amounts of off-policy data.
        episode_horizon (int): Horizon at which to stop episodes.
        preprocessor_pref (str): Whether to prefer RLlib preprocessors
            ("rllib") or deepmind ("deepmind") when applicable.
        sample_async (bool): Whether to compute samples asynchronously in
            the background, which improves throughput but can cause samples
            to be slightly off-policy.
        compress_observations (bool): If true, compress the observations.
            They can be decompressed with rllib/utils/compression.
        num_envs (int): If more than one, will create multiple envs and
            vectorize the computation of actions. This has no effect if the
            env already implements VectorEnv.
        observation_fn (ObservationFunction): Optional multi-agent
            observation function.
        observation_filter (str): Name of observation filter to use.
        clip_rewards (bool): Whether to clip rewards to [-1, 1] prior to
            experience postprocessing. Setting to None means clip for Atari
            only.
        clip_actions (bool): Whether to clip action values to the range
            specified by the policy action space.
        env_config (dict): Config to pass to the env creator.
        model_config (dict): Config to use when creating the policy model.
        policy_config (dict): Config to pass to the policy. In the
            multi-agent case, this config will be merged with the
            per-policy configs specified by `policy`.
        worker_index (int): For remote workers, this should be set to a
            non-zero and unique value. This index is passed to created envs
            through EnvContext so that envs can be configured per worker.
        num_workers (int): For remote workers, how many workers altogether
            have been created?
        monitor_path (str): Write out episode stats and videos to this
            directory if specified.
        log_dir (str): Directory where logs can be placed.
        log_level (str): Set the root log level on creation.
        callbacks (DefaultCallbacks): Custom training callbacks.
        input_creator (func): Function that returns an InputReader object
            for loading previously generated experiences.
        input_evaluation (list): How to evaluate the policy performance.
            This only makes sense to set when the input is reading offline
            data. The possible values include:
                - "is": the step-wise importance sampling estimator.
                - "wis": the weighted step-wise importance sampling
                    estimator.
                - "simulation": run the environment in the background, but
                    use this data for evaluation only and never for
                    learning.
        output_creator (func): Function that returns an OutputWriter object
            for saving generated experiences.
        remote_worker_envs (bool): If using num_envs > 1, whether to create
            those new envs in remote processes instead of in the current
            process. This adds overheads, but can make sense if your envs
            are slow to step or reset.
        remote_env_batch_wait_ms (float): Timeout that remote workers wait
            when polling environments. 0 (continue when at least one env is
            ready) is a reasonable default, but the optimal value can be
            obtained by measuring your environment step / reset and model
            inference perf.
        soft_horizon (bool): Calculate rewards but don't reset the
            environment when the horizon is hit.
        no_done_at_end (bool): Ignore the done=True at the end of the
            episode and instead record done=False.
        seed (int): Set the seed of both np and tf to this value to ensure
            each remote worker has unique exploration behavior.
        extra_python_environs (dict): Extra Python environment variables to
            set for this worker.
        fake_sampler (bool): Use a fake (inf speed) sampler for testing.
    """
    self._original_kwargs = locals().copy()
    del self._original_kwargs["self"]

    global _global_worker
    _global_worker = self

    # set extra environs first
    if extra_python_environs:
        for key, value in extra_python_environs.items():
            os.environ[key] = str(value)

    def gen_rollouts():
        while True:
            yield self.sample()

    ParallelIteratorWorker.__init__(self, gen_rollouts, False)

    policy_config = policy_config or {}
    if (tf and policy_config.get("framework") == "tfe"
            and not policy_config.get("no_eager_on_workers")
            # This eager check is necessary for certain all-framework tests
            # that use tf's eager_mode() context generator.
            and not tf.executing_eagerly()):
        tf.enable_eager_execution()

    if log_level:
        logging.getLogger("ray.rllib").setLevel(log_level)

    if worker_index > 1:
        disable_log_once_globally()  # only need 1 worker to log
    elif log_level == "DEBUG":
        enable_periodic_logging()

    env_context = EnvContext(env_config or {}, worker_index)
    self.policy_config = policy_config
    if callbacks:
        self.callbacks = callbacks()
    else:
        from ray.rllib.agents.callbacks import DefaultCallbacks
        self.callbacks = DefaultCallbacks()
    self.worker_index = worker_index
    self.num_workers = num_workers
    model_config = model_config or {}
    policy_mapping_fn = (policy_mapping_fn
                         or (lambda agent_id: DEFAULT_POLICY_ID))
    if not callable(policy_mapping_fn):
        raise ValueError("Policy mapping function not callable?")
    self.env_creator = env_creator
    self.rollout_fragment_length = rollout_fragment_length * num_envs
    self.batch_mode = batch_mode
    self.compress_observations = compress_observations
    self.preprocessing_enabled = True
    self.last_batch = None
    self.global_vars = None
    self.fake_sampler = fake_sampler

    self.env = _validate_env(env_creator(env_context))
    if isinstance(self.env, (BaseEnv, MultiAgentEnv)):

        def wrap(env):
            return env  # we can't auto-wrap these env types
    elif is_atari(self.env) and \
            not model_config.get("custom_preprocessor") and \
            preprocessor_pref == "deepmind":

        # Deepmind wrappers already handle all preprocessing
        self.preprocessing_enabled = False

        if clip_rewards is None:
            clip_rewards = True

        def wrap(env):
            env = wrap_deepmind(
                env,
                dim=model_config.get("dim"),
                framestack=model_config.get("framestack"))
            if monitor_path:
                from gym import wrappers
                env = wrappers.Monitor(env, monitor_path, resume=True)
            return env
    else:

        def wrap(env):
            if monitor_path:
                from gym import wrappers
                env = wrappers.Monitor(env, monitor_path, resume=True)
            return env

    self.env = wrap(self.env)

    def make_env(vector_index):
        return wrap(
            env_creator(
                env_context.copy_with_overrides(
                    vector_index=vector_index,
                    remote=remote_worker_envs)))

    self.tf_sess = None
    policy_dict = _validate_and_canonicalize(policy, self.env)
    self.policies_to_train = policies_to_train or list(policy_dict.keys())

    # set numpy and python seed
    if seed is not None:
        np.random.seed(seed)
        random.seed(seed)
        if not hasattr(self.env, "seed"):
            raise ValueError("Env doesn't support env.seed(): {}".format(
                self.env))
        self.env.seed(seed)
        try:
            assert torch is not None
            torch.manual_seed(seed)
        except AssertionError:
            logger.info("Could not seed torch")
    if _has_tensorflow_graph(policy_dict) and not (tf and
                                                   tf.executing_eagerly()):
        if not tf:
            raise ImportError("Could not import tensorflow")
        with tf.Graph().as_default():
            if tf_session_creator:
                self.tf_sess = tf_session_creator()
            else:
                self.tf_sess = tf.Session(config=tf.ConfigProto(
                    gpu_options=tf.GPUOptions(allow_growth=True)))
            with self.tf_sess.as_default():
                # set graph-level seed
                if seed is not None:
                    tf.set_random_seed(seed)
                self.policy_map, self.preprocessors = \
                    self._build_policy_map(policy_dict, policy_config)
        if (ray.is_initialized()
                and ray.worker._mode() != ray.worker.LOCAL_MODE):
            if not ray.get_gpu_ids():
                logger.debug(
                    "Creating policy evaluation worker {}".format(
                        worker_index) +
                    " on CPU (please ignore any CUDA init errors)")
            elif not tf.test.is_gpu_available():
                raise RuntimeError(
                    "GPUs were assigned to this worker by Ray, but "
                    "TensorFlow reports GPU acceleration is disabled. "
                    "This could be due to a bad CUDA or TF installation.")
    else:
        self.policy_map, self.preprocessors = self._build_policy_map(
            policy_dict, policy_config)

    self.multiagent = set(self.policy_map.keys()) != {DEFAULT_POLICY_ID}
    if self.multiagent:
        if not ((isinstance(self.env, MultiAgentEnv)
                 or isinstance(self.env, ExternalMultiAgentEnv))
                or isinstance(self.env, BaseEnv)):
            raise ValueError(
                "Have multiple policies {}, but the env ".format(
                    self.policy_map) +
                "{} is not a subclass of BaseEnv, MultiAgentEnv or "
                "ExternalMultiAgentEnv?".format(self.env))

    self.filters = {
        policy_id: get_filter(observation_filter,
                              policy.observation_space.shape)
        for (policy_id, policy) in self.policy_map.items()
    }
    if self.worker_index == 0:
        logger.info("Built filter map: {}".format(self.filters))

    # Always use vector env for consistency even if num_envs = 1.
    self.async_env = BaseEnv.to_base_env(
        self.env,
        make_env=make_env,
        num_envs=num_envs,
        remote_envs=remote_worker_envs,
        remote_env_batch_wait_ms=remote_env_batch_wait_ms)
    self.num_envs = num_envs

    # `truncate_episodes`: Allow a batch to contain more than one episode
    # (fragments) and always make the batch `rollout_fragment_length`
    # long.
    if self.batch_mode == "truncate_episodes":
        pack = True
    # `complete_episodes`: Never cut episodes and sampler will return
    # exactly one (complete) episode per poll.
    elif self.batch_mode == "complete_episodes":
        rollout_fragment_length = float("inf")
        pack = False
    else:
        raise ValueError("Unsupported batch mode: {}".format(
            self.batch_mode))

    self.io_context = IOContext(log_dir, policy_config, worker_index, self)
    self.reward_estimators = []
    for method in input_evaluation:
        if method == "simulation":
            logger.warning(
                "Requested 'simulation' input evaluation method: "
                "will discard all sampler outputs and keep only metrics.")
            sample_async = True
        elif method == "is":
            ise = ImportanceSamplingEstimator.create(self.io_context)
            self.reward_estimators.append(ise)
        elif method == "wis":
            wise = WeightedImportanceSamplingEstimator.create(
                self.io_context)
            self.reward_estimators.append(wise)
        else:
            raise ValueError(
                "Unknown evaluation method: {}".format(method))

    if sample_async:
        self.sampler = AsyncSampler(
            worker=self,
            env=self.async_env,
            policies=self.policy_map,
            policy_mapping_fn=policy_mapping_fn,
            preprocessors=self.preprocessors,
            obs_filters=self.filters,
            clip_rewards=clip_rewards,
            rollout_fragment_length=rollout_fragment_length,
            callbacks=self.callbacks,
            horizon=episode_horizon,
            pack_multiple_episodes_in_batch=pack,
            tf_sess=self.tf_sess,
            clip_actions=clip_actions,
            blackhole_outputs="simulation" in input_evaluation,
            soft_horizon=soft_horizon,
            no_done_at_end=no_done_at_end,
            observation_fn=observation_fn)
        # Start the Sampler thread.
        self.sampler.start()
    else:
        self.sampler = SyncSampler(
            worker=self,
            env=self.async_env,
            policies=self.policy_map,
            policy_mapping_fn=policy_mapping_fn,
            preprocessors=self.preprocessors,
            obs_filters=self.filters,
            clip_rewards=clip_rewards,
            rollout_fragment_length=rollout_fragment_length,
            callbacks=self.callbacks,
            horizon=episode_horizon,
            pack_multiple_episodes_in_batch=pack,
            tf_sess=self.tf_sess,
            clip_actions=clip_actions,
            soft_horizon=soft_horizon,
            no_done_at_end=no_done_at_end,
            observation_fn=observation_fn)

    self.input_reader = input_creator(self.io_context)
    assert isinstance(self.input_reader, InputReader), self.input_reader
    self.output_writer = output_creator(self.io_context)
    assert isinstance(self.output_writer, OutputWriter), self.output_writer

    logger.debug(
        "Created rollout worker with env {} ({}), policies {}".format(
            self.async_env, self.env, self.policy_map))
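# The constructor above is easiest to read next to a concrete instantiation.
# A minimal single-agent sketch, assuming gym's CartPole and RLlib's
# PGTFPolicy are available; the exact imports track the Ray 0.8-era layout.
import gym
from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy
from ray.rllib.evaluation.rollout_worker import RolloutWorker

worker = RolloutWorker(
    env_creator=lambda ctx: gym.make("CartPole-v0"),
    policy=PGTFPolicy,
    rollout_fragment_length=100,
    batch_mode="truncate_episodes")

batch = worker.sample()  # SampleBatch of roughly 100 transitions
print(batch.count)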
def shutdown_ray():
    if ray.is_initialized():
        ray.shutdown()
    yield
    if ray.is_initialized():
        ray.shutdown()
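# The bare-yield shape above is the usual pytest fixture pattern. A minimal
# sketch of wiring it up; the decorator and test are illustrative and not
# necessarily part of the original file.
import pytest
import ray

@pytest.fixture
def shutdown_ray():
    if ray.is_initialized():
        ray.shutdown()
    yield
    if ray.is_initialized():
        ray.shutdown()

def test_starts_fresh(shutdown_ray):
    ray.init(num_cpus=1)
    assert ray.is_initialized()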
def chkRay() -> bool:
    return ray.is_initialized()
def test_register_ray():
    register_ray()
    assert "ray" in joblib.parallel.BACKENDS
    assert not ray.is_initialized()
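# Note the final assert above: register_ray() only registers the backend, it
# does not start Ray. A minimal usage sketch; initialization happens lazily
# once work is dispatched through the backend.
import joblib
from ray.util.joblib import register_ray

def square(x):
    return x * x

register_ray()  # adds the "ray" joblib backend without calling ray.init()
with joblib.parallel_backend("ray"):
    squares = joblib.Parallel(n_jobs=4)(
        joblib.delayed(square)(i) for i in range(8))
print(squares)  # [0, 1, 4, 9, 16, 25, 36, 49]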