def _setup(self, config):
    env = self._env_id
    if env:
        config["env"] = env
        if _global_registry.contains(ENV_CREATOR, env):
            self.env_creator = _global_registry.get(ENV_CREATOR, env)
        else:
            import gym  # soft dependency
            self.env_creator = lambda env_config: gym.make(env)
    else:
        self.env_creator = lambda env_config: None

    # Merge the supplied config with the class default
    merged_config = copy.deepcopy(self._default_config)
    merged_config = deep_update(merged_config, config,
                                self._allow_unknown_configs,
                                self._allow_unknown_subkeys)
    self.raw_user_config = config
    self.config = merged_config

    if self.config["normalize_actions"]:
        inner = self.env_creator
        self.env_creator = (
            lambda env_config: NormalizeActionWrapper(inner(env_config)))

    Trainer._validate_config(self.config)
    log_level = self.config.get("log_level")
    if log_level in ["WARN", "ERROR"]:
        logger.info("Current log_level is {}. For more information, "
                    "set 'log_level': 'INFO' / 'DEBUG' or use the -v and "
                    "-vv flags.".format(log_level))
    if self.config.get("log_level"):
        logging.getLogger("ray.rllib").setLevel(self.config["log_level"])

    def get_scope():
        if tf and not tf.executing_eagerly():
            return tf.Graph().as_default()
        else:
            return open("/dev/null")  # fake a no-op scope

    with get_scope():
        self._init(self.config, self.env_creator)

    # Evaluation related
    if self.config.get("evaluation_interval"):
        # Update env_config with evaluation settings:
        extra_config = copy.deepcopy(self.config["evaluation_config"])
        extra_config.update({
            "batch_mode": "complete_episodes",
            "batch_steps": 1,
        })
        logger.debug(
            "using evaluation_config: {}".format(extra_config))
        self.evaluation_workers = self._make_workers(
            self.env_creator,
            self._policy,
            merge_dicts(self.config, extra_config),
            num_workers=self.config["evaluation_num_workers"])
        self.evaluation_metrics = {}
def loss_game_psro_ppo_params(env: MultiAgentEnv) -> Dict[str, Any]:
    return merge_dicts(
        GRL_DEFAULT_POKER_PPO_PARAMS,
        {
            "num_gpus": float(os.getenv("WORKER_GPU_NUM", 0.0)),
            "num_workers": 4,
            "num_gpus_per_worker": float(os.getenv("WORKER_GPU_NUM", 0.0)),
            "num_envs_per_worker": 1,
            "metrics_smoothing_episodes": 10000,
            "exploration_config": {
                # The Exploration class to use. In the simplest case, this is the name
                # (str) of any class present in the `rllib.utils.exploration` package.
                # You can also provide the python class directly or the full location
                # of your class (e.g. "ray.rllib.utils.exploration.epsilon_greedy.
                # EpsilonGreedy").
                "type": StochasticSamplingIgnoreKwargs,
                # Add constructor kwargs here (if any).
            },
            "model": merge_dicts(
                MODEL_DEFAULTS, {
                    "fcnet_hiddens": [32, 32],
                    "custom_action_dist": "TorchGaussianSquashedGaussian",
                }),
            "entropy_coeff": 0.01,
            "lambda": 1.0,
            "train_batch_size": 2048,
            "sgd_minibatch_size": 256,
            "num_sgd_iter": 30,
            "lr": 0.0005,
            "clip_param": 0.2,
            "kl_target": 0.01,
        })
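# Illustrative sketch (not part of the original code): the override dicts in
# these param functions rely on merge_dicts performing a recursive deep merge
# in which keys from the second argument take precedence over the base config.
# Assuming that behavior, a self-contained stand-in looks like this;
# `deep_merge` is a hypothetical helper used only for illustration.
def deep_merge(base: dict, overrides: dict) -> dict:
    merged = dict(base)
    for key, value in overrides.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_merge(merged[key], value)
        else:
            merged[key] = value
    return merged


# Example: only "fcnet_hiddens" is replaced; other model defaults are kept.
assert deep_merge(
    {"model": {"fcnet_hiddens": [256, 256], "fcnet_activation": "tanh"}},
    {"model": {"fcnet_hiddens": [32, 32]}},
) == {"model": {"fcnet_hiddens": [32, 32], "fcnet_activation": "tanh"}}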
def larger_psro_oshi_ppo_params(env: MultiAgentEnv) -> Dict[str, Any]:
    return merge_dicts(
        psro_oshi_ppo_params(env=env),
        {
            "model": merge_dicts(
                MODEL_DEFAULTS, {
                    "fcnet_activation": "relu",
                    "fcnet_hiddens": [128, 128],
                    "custom_model": None,
                    "custom_action_dist": "TorchGaussianSquashedGaussian",
                }),
            # Coefficient of the entropy regularizer.
            "entropy_coeff": 0.0,
            # Decay schedule for the entropy regularizer.
            "entropy_coeff_schedule": [(0, 0.01), (int(2000e3), 0.0)],
        })
def loss_game_nfsp_dqn_params(env: MultiAgentEnv) -> Dict[str, Any]:
    return merge_dicts(
        GRL_DEFAULT_OSHI_ZUMO_MEDIUM_DQN_PARAMS,
        {
            "metrics_smoothing_episodes": 10000,
            "exploration_config": {
                "epsilon_timesteps": int(500e6),
                "final_epsilon": 0.001,
                "initial_epsilon": 0.06,
                "type": ValidActionsEpsilonGreedy,
            },
            "model": merge_dicts(MODEL_DEFAULTS, {
                "fcnet_hiddens": [32, 32],
            }),
            "target_network_update_freq": 100000,
            "buffer_size": 100000,
            "lr": 0.007,
            "rollout_fragment_length": 16,
            "train_batch_size": 4096,
        })
def wrapped(*args, object_store_memory, tune_kwargs, **kwargs):
    import ray
    from ray import tune
    from ray.rllib.utils import merge_dicts

    trainable, config, tune_overrides = func(*args, **kwargs)
    tune_kwargs = merge_dicts(tune_kwargs, tune_overrides)
    ray.init(object_store_memory=object_store_memory)
    tune.run(trainable, config=config, **tune_kwargs)
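# Illustrative sketch (assumption, not from the original source): `wrapped`
# closes over a `func` from an enclosing scope, so it is presumably the inner
# function of a decorator factory roughly like the one below. The names
# `tune_experiment` and `cartpole_ppo` are hypothetical and only show the
# calling convention: the decorated function returns a
# (trainable, config, tune_overrides) triple for `wrapped` to consume.
def tune_experiment(func):
    def wrapped(*args, object_store_memory, tune_kwargs, **kwargs):
        ...  # body as defined above

    return wrapped


@tune_experiment
def cartpole_ppo():
    return "PPO", {"env": "CartPole-v0"}, {"stop": {"training_iteration": 10}}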
def make_local_evaluator(self, env_creator, policy_dict):
    return self._make_evaluator(
        self._policy_evaluator, env_creator, policy_dict, 0,
        merge_dicts(
            self.config, {
                "tf_session_args": {
                    "intra_op_parallelism_threads": 8,
                    "inter_op_parallelism_threads": 8
                }
            }))
def default_resource_request(cls, config):
    cf = merge_dicts(cls._default_config, config)
    if cf["use_gpu_for_workers"]:
        num_gpus_per_worker = 1
    else:
        num_gpus_per_worker = 0
    return Resources(
        cpu=1,
        gpu=cf["gpu"] and 1 or 0,
        extra_cpu=cf["num_workers"],
        extra_gpu=num_gpus_per_worker * cf["num_workers"])
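# Worked example (illustrative, not from the original source): with a merged
# config of {"gpu": True, "use_gpu_for_workers": False, "num_workers": 8}, the
# request above resolves to Resources(cpu=1, gpu=1, extra_cpu=8, extra_gpu=0):
# one CPU and one GPU for the driver, one extra CPU per rollout worker, and no
# worker GPUs. Note that `cf["gpu"] and 1 or 0` is the legacy spelling of
# `1 if cf["gpu"] else 0`.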
def make_local_evaluator(self, env_creator, policy_graph, extra_config=None):
    """Convenience method to return configured local evaluator."""
    return self._make_evaluator(
        PolicyEvaluator,
        env_creator,
        policy_graph,
        0,
        merge_dicts(
            # important: allow local tf to use more CPUs for optimization
            merge_dicts(
                self.config, {
                    "tf_session_args":
                    self.config["local_evaluator_tf_session_args"]
                }),
            extra_config or {}))
def generate_policies(
    policy_id: str,
    policy_constructor_tuple: Tuple["PolicyClass", "gym.Space", "gym.Space",
                                    dict],
    policies: Dict[str, TFPolicy],
    policies_to_train: List[str],
    policy_config: dict,
    preprocessors: Dict[str, Any],
    obs_filters: Dict[str, Any],
    observation_filter: str,
):
    """Get policies for each ``agent_id``, and instantiate new ones for newly
    created agents.
    """
    policy_cls, obs_space, act_space, conf = policy_constructor_tuple

    # Parenthesized to avoid comparison chaining; see the note after this
    # function.
    if (policy_id in preprocessors) != (policy_id in policies):
        raise ValueError("'preprocessors' and 'policies' do not agree.")
    if (policy_id in obs_filters) != (policy_id in policies):
        raise ValueError("'obs_filters' and 'policies' do not agree.")

    # If we haven't seen this id, we instantiate a new policy.
    if policy_id not in policies:
        merged_conf = merge_dicts(policy_config, conf)

        # We assume ``self.preprocessing_enabled == True`` in ``RolloutWorker``.
        preprocessor = ModelCatalog.get_preprocessor_for_space(
            obs_space, merged_conf.get("model"))
        preprocessors[policy_id] = preprocessor
        obs_space = preprocessor.observation_space

        if tf and tf.executing_eagerly():
            if hasattr(policy_cls, "as_eager"):
                policy_cls = policy_cls.as_eager()
                if policy_config["eager_tracing"]:
                    policy_cls = policy_cls.with_tracing()
            elif not issubclass(policy_cls, TFPolicy):
                pass  # could be some other type of policy
            else:
                raise ValueError("This policy does not support eager "
                                 "execution: {}".format(policy_cls))
        if tf:
            with tf.variable_scope(policy_id):
                policies[policy_id] = policy_cls(obs_space, act_space,
                                                 merged_conf)
                policies_to_train.append(policy_id)
        else:
            policies[policy_id] = policy_cls(obs_space, act_space, merged_conf)
            policies_to_train.append(policy_id)

        obs_filters[policy_id] = get_filter(observation_filter,
                                            obs_space.shape)

    return policies, preprocessors, obs_filters, policies_to_train
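# Note on the parenthesized checks above (illustrative, not original code):
# without the parentheses, Python chains the comparison operators, so
# `a in b != c in d` means `(a in b) and (b != c) and (c in d)` rather than
# comparing the two membership results.
chained = "x" in {"x"} != "x" in {}        # chained form: evaluates to False
explicit = ("x" in {"x"}) != ("x" in {})   # intended exclusive-or check: True
assert chained is False and explicit is True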
def nfsp_leduc_dqn_params_openspeil(env: MultiAgentEnv) -> Dict[str, Any]:
    return merge_dicts(GRL_DEFAULT_OPENSPIEL_POKER_DQN_PARAMS, {
        # === Exploration Settings ===
        "exploration_config": {
            # The Exploration class to use.
            "type": ValidActionsEpsilonGreedy,
            # Config for the Exploration class' constructor:
            "initial_epsilon": 0.06,
            "final_epsilon": 0.001,
            # Timesteps over which to anneal epsilon.
            "epsilon_timesteps": int(20e6) * 10,
        },
        # Update the target network every `target_network_update_freq` steps.
        "target_network_update_freq": 19200 * 10,
        "model": merge_dicts(MODEL_DEFAULTS, {
            "fcnet_activation": "relu",
            "fcnet_hiddens": [128],
            "custom_model": get_valid_action_fcn_class_for_env(env=env),
        }),
    })
def nfsp_kuhn_avg_policy_params_openspiel(env: MultiAgentEnv) -> Dict[str, Any]:
    return {
        "framework": "torch",
        "learning_starts": 2000,
        "train_batch_size": 128,
        "lr": 0.01,
        "model": merge_dicts(MODEL_DEFAULTS, {
            "fcnet_activation": "relu",
            "fcnet_hiddens": [128],
        }),
    }
def nfsp_leduc_avg_policy_params_openspiel(env: MultiAgentEnv) -> Dict[str, Any]:
    return {
        "framework": "torch",
        "learning_starts": 2000,
        "train_batch_size": 128,
        "lr": 0.01,
        "model": merge_dicts(MODEL_DEFAULTS, {
            "fcnet_activation": "relu",
            "fcnet_hiddens": [128],
            "custom_model": get_valid_action_fcn_class_for_env(env=env),
        }),
    }
def psro_leduc_ppo_params(env: MultiAgentEnv) -> Dict[str, Any]:
    return merge_dicts(
        GRL_DEFAULT_POKER_PPO_PARAMS,
        {
            "num_gpus": float(os.getenv("WORKER_GPU_NUM", 0.0)),
            "num_workers": 4,
            "num_gpus_per_worker": float(os.getenv("WORKER_GPU_NUM", 0.0)),
            "num_envs_per_worker": 1,
            "metrics_smoothing_episodes": 20000,
            "model": merge_dicts(
                MODEL_DEFAULTS, {
                    "fcnet_activation": "relu",
                    "fcnet_hiddens": [128, 128],
                    "custom_model": None,
                }),
        })
def __init__(self,
             env_creator,
             policy,
             trainer_config=None,
             num_workers=0,
             logdir=None,
             _setup=True,
             local_shared_policy_map=None,
             local_shared_preprocessors=None,
             local_shared_tf_sess=None):
    """Create a new WorkerSet and initialize its workers.

    Arguments:
        env_creator (func): Function that returns env given env config.
        policy (cls): rllib.policy.Policy class.
        trainer_config (dict): Optional dict that extends the common
            config of the Trainer class.
        num_workers (int): Number of remote rollout workers to create.
        logdir (str): Optional logging directory for workers.
        _setup (bool): Whether to setup workers. This is only for testing.
    """

    if not trainer_config:
        from ray.rllib.agents.trainer import COMMON_CONFIG
        trainer_config = COMMON_CONFIG

    self._env_creator = env_creator
    self._policy = policy
    self._remote_config = trainer_config
    self._num_workers = num_workers
    self._logdir = logdir

    if _setup:
        self._local_config = merge_dicts(
            trainer_config,
            {"tf_session_args": trainer_config["local_tf_session_args"]})

        # Always create a local worker
        self._local_worker = self._make_worker(
            SharedPolicyRolloutWorker, env_creator, policy, 0,
            self._local_config,
            local_shared_policy_map=local_shared_policy_map,
            local_shared_preprocessors=local_shared_preprocessors,
            local_shared_tf_sess=local_shared_tf_sess)  # jb changed this ^

        # Create a number of remote workers
        self._remote_workers = []
        self.add_workers(num_workers)
def get_config(): return merge_dicts( base_config(), { # === Environment === "env_config": { "deceleration_zones": { "center": [[0.0, 0.0]], "decay": [2.0] }, "random_walks": { "num_walks": tune.grid_search([8, 16]), "loc": 10.0, "scale": 2.0, }, }, # === MAPO model training === # Type of model-training to use. Possible types include # daml: policy gradient-aware model learning # mle: maximum likelihood estimation "model_loss": tune.grid_search(["DAML", "MLE"]), # Number of next states to sample from the model when calculating the # model-aware deterministic policy gradient "num_model_samples": 4, # Gradient estimator for model-aware dpg. Possible types include: # score_function, pathwise_derivative "grad_estimator": "PD", # === Replay Buffer === "buffer_size": int(5e4), # === Network === # Size and activation of the fully connected networks computing the logits # for the policy and action-value function. No layers means the component is # linear in states and/or actions. "module": { "actor": { "encoder": { "units": (64, ) } }, "critic": { "encoder": { "units": (64, ) } }, "model": { "encoder": { "units": (3, ) } }, # Bottleneck layer }, }, )
def psro_oshi_ppo_params(env: MultiAgentEnv) -> Dict[str, Any]:
    return merge_dicts(
        GRL_DEFAULT_POKER_PPO_PARAMS,
        {
            "num_gpus": float(os.getenv("WORKER_GPU_NUM", 0.0)),
            "num_workers": 4,
            "num_gpus_per_worker": float(os.getenv("WORKER_GPU_NUM", 0.0)),
            "num_envs_per_worker": 1,
            "metrics_smoothing_episodes": 5000,
            "model": merge_dicts(
                MODEL_DEFAULTS, {
                    "fcnet_activation": "relu",
                    "fcnet_hiddens": [64, 64],
                    "custom_model": None,
                    "custom_action_dist": "TorchGaussianSquashedGaussian",
                }),
        })
def __init__(self,
             *,
             env_creator: Optional[Callable[[EnvContext], EnvType]] = None,
             validate_env: Optional[Callable[[EnvType], None]] = None,
             policy_class: Optional[Type[Policy]] = None,
             trainer_config: Optional[TrainerConfigDict] = None,
             num_workers: int = 0,
             logdir: Optional[str] = None,
             _setup: bool = True):
    """Create a new WorkerSet and initialize its workers.

    Args:
        env_creator (Optional[Callable[[EnvContext], EnvType]]): Function
            that returns env given env config.
        validate_env (Optional[Callable[[EnvType], None]]): Optional
            callable to validate the generated environment (only on
            worker=0).
        policy_class (Optional[Type[Policy]]): An rllib.policy.Policy class.
        trainer_config (Optional[TrainerConfigDict]): Optional dict that
            extends the common config of the Trainer class.
        num_workers (int): Number of remote rollout workers to create.
        logdir (Optional[str]): Optional logging directory for workers.
        _setup (bool): Whether to setup workers. This is only for testing.
    """

    if not trainer_config:
        from ray.rllib.agents.trainer import COMMON_CONFIG
        trainer_config = COMMON_CONFIG

    self._env_creator = env_creator
    self._policy_class = policy_class
    self._remote_config = trainer_config
    self._logdir = logdir

    if _setup:
        self._local_config = merge_dicts(
            trainer_config,
            {"tf_session_args": trainer_config["local_tf_session_args"]})

        # Create a number of remote workers.
        self._remote_workers = []
        self.add_workers(num_workers)

        # Always create a local worker.
        self._local_worker = self._make_worker(
            cls=RolloutWorker,
            env_creator=env_creator,
            validate_env=validate_env,
            policy=self._policy_class,
            worker_index=0,
            config=self._local_config)
def get_config(): return merge_dicts( base_config(), { # === Environment === "env_config": { "setpoint": 50, "miscalibration": tune.grid_search([True, False]), "max_episode_steps": 1000, }, # === MAPO model training === "model_rollout_len": 1, # === Debugging === # Whether to use the environment's true model to sample states "true_model": True, # === Replay Buffer === "buffer_size": int(1e5), # === Network === # Size and activation of the fully connected networks computing the logits # for the policy and action-value function. No layers means the component is # linear in states and/or actions. "module": { "actor": { "encoder": { "units": (256, 256) } }, "critic": { "encoder": { "units": (256, 256) } }, "model": { "encoder": { "units": (256, 256) } }, }, # === Trainer === "train_batch_size": 256, "timesteps_per_iteration": 1000, # === Exploration Settings === "exploration_config": { "pure_exploration_steps": 2000 }, # === Evaluation === "evaluation_interval": 1, "evaluation_num_episodes": 5, }, )
def get_agent(agent_name, config_path, checkpoint, evaluate, script):
    from ray.rllib.utils import merge_dicts
    from raylab.utils.checkpoints import get_config_from_checkpoint, get_agent_cls
    from raylab.utils.dynamic_import import import_module_from_path

    msg = "Exactly one of config or checkpoint must be chosen."
    assert (config_path is None) != (checkpoint is None), msg

    if config_path is not None:
        config = import_module_from_path(config_path).get_config()
        if evaluate:
            if "evaluation_config" not in config:
                warnings.warn("Evaluation agent requested but none in config.")
            else:
                eval_conf = config["evaluation_config"]
                config = merge_dicts(config, eval_conf)
        config = merge_dicts(config, {
            "batch_mode": "complete_episodes",
            "rollout_fragment_length": 1
        })
    else:
        config = get_config_from_checkpoint(
            checkpoint,
            use_eval_config=evaluate,
            config_overrides={
                "batch_mode": "complete_episodes",
                "rollout_fragment_length": 1,
            },
        )

    config["num_workers"] = 0
    config["module"]["torch_script"] = script

    agent_cls = get_agent_cls(agent_name)
    agent = agent_cls(config)
    if checkpoint:
        agent.restore(checkpoint)
    return agent
def _setup(self, config):
    env = self._env_id
    if env:
        config["env"] = env
        if _global_registry.contains(ENV_CREATOR, env):
            self.env_creator = _global_registry.get(ENV_CREATOR, env)
        else:
            import gym  # soft dependency
            self.env_creator = lambda env_config: gym.make(env)
    else:
        self.env_creator = lambda env_config: None

    # Merge the supplied config with the class default
    merged_config = copy.deepcopy(self._default_config)
    merged_config = deep_update(merged_config, config,
                                self._allow_unknown_configs,
                                self._allow_unknown_subkeys)
    self.raw_user_config = config
    self.config = merged_config
    Trainer._validate_config(self.config)
    if self.config.get("log_level"):
        logging.getLogger("ray.rllib").setLevel(self.config["log_level"])

    def get_scope():
        if tf:
            return tf.Graph().as_default()
        else:
            return open("/dev/null")  # fake a no-op scope

    with get_scope():
        self._init(self.config, self.env_creator)

    # Evaluation related
    if self.config.get("evaluation_interval"):
        # Update env_config with evaluation settings:
        extra_config = copy.deepcopy(self.config["evaluation_config"])
        extra_config.update({
            "batch_mode": "complete_episodes",
            "batch_steps": 1,
        })
        logger.debug(
            "using evaluation_config: {}".format(extra_config))
        self.evaluation_workers = self._make_workers(
            self.env_creator,
            self._policy,
            merge_dicts(self.config, extra_config),
            num_workers=0)
        self.evaluation_metrics = self._evaluate()
def nfsp_kuhn_avg_policy_params(env: MultiAgentEnv) -> Dict[str, Any]:
    return {
        "framework": "torch",
        "num_gpus": float(os.getenv("WORKER_GPU_NUM", 0.0)),
        "num_workers": 0,
        "num_gpus_per_worker": float(os.getenv("WORKER_GPU_NUM", 0.0)),
        "num_envs_per_worker": 1,
        "learning_starts": 16000,
        "train_batch_size": 4096,
        "lr": 0.1,
        "model": merge_dicts(MODEL_DEFAULTS, {
            "fcnet_activation": "relu",
            "fcnet_hiddens": [128],
        }),
    }
def get_config(): return merge_dicts( base_config(), { # === Environment === "env": "IndustrialBenchmark-v0", "env_config": { "reward_type": "classic", "action_type": "continuous", "observation": tune.grid_search(["visible", "markovian"]), "max_episode_steps": 1000, "time_aware": True, }, }, )
def loss_game_nfsp_avg_policy_params(env: MultiAgentEnv) -> Dict[str, Any]:
    return {
        "metrics_smoothing_episodes": 10000,
        "framework": "torch",
        "num_gpus": float(os.getenv("WORKER_GPU_NUM", 0.0)),
        "num_workers": 0,
        "num_gpus_per_worker": float(os.getenv("WORKER_GPU_NUM", 0.0)),
        "num_envs_per_worker": 1,
        "learning_starts": 16000,
        "train_batch_size": 4096,
        "lr": 0.07,
        "model": merge_dicts(MODEL_DEFAULTS, {
            "fcnet_hiddens": [32, 32],
        }),
    }
def _init(self):
    self._validate_config()

    env = self.env_creator(env_config={})
    reset_args_shape = (env.reset_args_config["shape"][0], )
    self.reset_args_holder = ResetArgsHolder.remote(
        (self.config["num_workers"], ) + reset_args_shape)
    self.config["env_config"] = merge_dicts(
        self.config["env_config"],
        {"reset_args_holder": self.reset_args_holder})

    self.rng = np.random.RandomState(self.config["random_seed"])
    # print("sampling goals...")
    self.reset_args_train, self.reset_args_test_1, self.reset_args_test_2 \
        = env.sample_reset_args(self.rng, self.config["num_tasks"])
    # print("sampling finished")
    self.reset_args_test = {
        1: self.reset_args_test_1,
        2: self.reset_args_test_2
    }

    observation_space = env.observation_space
    action_space = env.action_space
    policy_dict_local = {
        DEFAULT_POLICY_ID: (self._policy_graph, observation_space,
                            action_space, {
                                "mode": "local"
                            })
    }
    policy_dict_remote = {
        DEFAULT_POLICY_ID: (self._policy_graph, observation_space,
                            action_space, {
                                "mode": "remote"
                            })
    }
    self.local_evaluator = self.make_local_evaluator(
        self.env_creator, policy_dict_local)
    self.remote_evaluators = self.make_remote_evaluators(
        self.env_creator, policy_dict_remote, self.config["num_workers"], {
            "num_cpus": self.config["num_cpus_per_worker"],
            "num_gpus": self.config["num_gpus_per_worker"]
        })
    self.optimizer = MAMLOptimizer(
        self.local_evaluator, self.remote_evaluators, {
            "num_inner_updates": self.config["num_inner_updates"],
            "num_sgd_iter": self.config["num_sgd_iter"]
        })
def medium_oshi_zumo_nfsp_avg_policy_params(env: MultiAgentEnv) -> Dict[str, Any]:
    return {
        "framework": "torch",
        "num_gpus": float(os.getenv("WORKER_GPU_NUM", 0.0)),
        "num_workers": 0,
        "num_gpus_per_worker": float(os.getenv("WORKER_GPU_NUM", 0.0)),
        "num_envs_per_worker": 1,
        "learning_starts": 16000,
        "train_batch_size": 2048,
        "lr": 0.1,
        "model": merge_dicts(MODEL_DEFAULTS, {
            "fcnet_activation": "relu",
            "fcnet_hiddens": [128, 128],
            "custom_model": get_valid_action_fcn_class_for_env(env=env),
        }),
    }
def make_local_evaluator(self, env_creator, policy_graph):
    """Convenience method to return configured local evaluator."""
    return self._make_evaluator(
        PolicyEvaluator,
        env_creator,
        policy_graph,
        0,
        # important: allow local tf to use multiple CPUs for optimization
        merge_dicts(
            self.config, {
                "tf_session_args": {
                    "intra_op_parallelism_threads": None,
                    "inter_op_parallelism_threads": None,
                }
            }))
def _build_policy_map(
        self, policy_dict: MultiAgentPolicyConfigDict,
        policy_config: TrainerConfigDict
) -> Tuple[Dict[PolicyID, Policy], Dict[PolicyID, Preprocessor]]:
    policy_map = {}
    preprocessors = {}
    for name, (cls, obs_space, act_space,
               conf) in sorted(policy_dict.items()):
        logger.debug("Creating policy for {}".format(name))
        merged_conf = merge_dicts(policy_config, conf)
        merged_conf["num_workers"] = self.num_workers
        merged_conf["worker_index"] = self.worker_index
        if self.preprocessing_enabled:
            preprocessor = ModelCatalog.get_preprocessor_for_space(
                obs_space, merged_conf.get("model"))
            preprocessors[name] = preprocessor
            obs_space = preprocessor.observation_space
        else:
            preprocessors[name] = NoPreprocessor(obs_space)
        if isinstance(obs_space, gym.spaces.Dict) or \
                isinstance(obs_space, gym.spaces.Tuple):
            raise ValueError(
                "Found raw Tuple|Dict space as input to policy. "
                "Please preprocess these observations with a "
                "Tuple|DictFlatteningPreprocessor.")
        if tf1 and tf1.executing_eagerly():
            if hasattr(cls, "as_eager"):
                cls = cls.as_eager()
                if policy_config.get("eager_tracing"):
                    cls = cls.with_tracing()
            elif not issubclass(cls, TFPolicy):
                pass  # could be some other type of policy
            else:
                raise ValueError("This policy does not support eager "
                                 "execution: {}".format(cls))
        if tf1:
            with tf1.variable_scope(name):
                policy_map[name] = cls(obs_space, act_space, merged_conf)
        else:
            policy_map[name] = cls(obs_space, act_space, merged_conf)
    if self.worker_index == 0:
        logger.info("Built policy map: {}".format(policy_map))
        logger.info("Built preprocessor map: {}".format(preprocessors))
    return policy_map, preprocessors
def nfsp_leduc_avg_policy_params_improved(env: MultiAgentEnv) -> Dict[str, Any]:
    # 09.23.29PM_Apr-30-2021/ orig dqn ant_prm FIXED 0.1 lr (0.3, 0.1) annealed
    # 50000000 steps/leduc_nfsp_dqn_hparam_search_nfsp_sparse_09.24.43PM_Apr-30-2021l2zf5w7z
    return {
        "framework": "torch",
        "num_gpus": float(os.getenv("WORKER_GPU_NUM", 0.0)),
        "num_workers": 0,
        "num_gpus_per_worker": float(os.getenv("WORKER_GPU_NUM", 0.0)),
        "num_envs_per_worker": 1,
        "learning_starts": 16000,
        "train_batch_size": 4096,
        # "lr": avg_pol_lr_start_end[0],
        "lr_schedule": [[0, 0.3], [50000000, 0.1]],
        "model": merge_dicts(MODEL_DEFAULTS, {
            "fcnet_activation": "relu",
            "fcnet_hiddens": [128],
            "custom_model": get_valid_action_fcn_class_for_env(env=env),
        }),
    }
def nfsp_kuhn_dqn_params(env: MultiAgentEnv) -> Dict[str, Any]:
    return merge_dicts(GRL_DEFAULT_OPENSPIEL_POKER_DQN_PARAMS, {
        "num_gpus": float(os.getenv("WORKER_GPU_NUM", 0.0)),
        "num_workers": 4,
        "num_gpus_per_worker": float(os.getenv("WORKER_GPU_NUM", 0.0)),
        "num_envs_per_worker": 32,
        # How many steps of the model to sample before learning starts.
        "learning_starts": 16000,
        # Update the replay buffer with this many samples at once. Note that
        # this setting applies per-worker if num_workers > 1.
        "rollout_fragment_length": 8,
        "batch_mode": "truncate_episodes",
        # Size of a batch sampled from replay buffer for training. Note that
        # if async_updates is set, then each worker returns gradients for a
        # batch of this size.
        "train_batch_size": 4096,
    })
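# Worked example (illustrative, not from the original source): with the values
# above and "truncate_episodes" batching, each sampling round collects roughly
# num_workers * num_envs_per_worker * rollout_fragment_length
# = 4 * 32 * 8 = 1024 environment steps into the replay buffer, while each
# optimization step then draws train_batch_size = 4096 transitions from it.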
def _build_policy_map(self, policy_dict, policy_config): policy_map = {} for name, (cls, obs_space, act_space, conf) in sorted(policy_dict.items()): merged_conf = merge_dicts(policy_config, conf) with tf.variable_scope(name): if isinstance(obs_space, gym.spaces.Dict): raise ValueError( "Found raw Dict space as input to policy graph. " "Please preprocess your environment observations " "with DictFlatteningPreprocessor and set the " "obs space to `preprocessor.observation_space`.") elif isinstance(obs_space, gym.spaces.Tuple): raise ValueError( "Found raw Tuple space as input to policy graph. " "Please preprocess your environment observations " "with TupleFlatteningPreprocessor and set the " "obs space to `preprocessor.observation_space`.") policy_map[name] = cls(obs_space, act_space, merged_conf) return policy_map
def _build_policy_map(self, policy_dict, policy_config):
    policy_map = {}
    preprocessors = {}
    for name, (cls, obs_space, act_space,
               conf) in sorted(policy_dict.items()):
        merged_conf = merge_dicts(policy_config, conf)
        if self.preprocessing_enabled:
            preprocessor = ModelCatalog.get_preprocessor_for_space(
                obs_space, merged_conf.get("model"))
            preprocessors[name] = preprocessor
            obs_space = preprocessor.observation_space
        else:
            preprocessors[name] = NoPreprocessor(obs_space)
        if isinstance(obs_space, gym.spaces.Dict) or \
                isinstance(obs_space, gym.spaces.Tuple):
            raise ValueError(
                "Found raw Tuple|Dict space as input to policy graph. "
                "Please preprocess these observations with a "
                "Tuple|DictFlatteningPreprocessor.")
        with tf.variable_scope(name):
            policy_map[name] = cls(obs_space, act_space, merged_conf)
    return policy_map, preprocessors
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from ray.rllib.agents.a3c.a3c import A3CAgent, DEFAULT_CONFIG as A3C_CONFIG
from ray.rllib.optimizers import SyncSamplesOptimizer
from ray.rllib.utils.annotations import override
from ray.rllib.utils import merge_dicts

A2C_DEFAULT_CONFIG = merge_dicts(
    A3C_CONFIG,
    {
        "sample_batch_size": 20,
        "min_iter_time_s": 10,
        "sample_async": False,
    },
)


class A2CAgent(A3CAgent):
    """Synchronous variant of the A3CAgent."""

    _agent_name = "A2C"
    _default_config = A2C_DEFAULT_CONFIG

    @override(A3CAgent)
    def _make_optimizer(self):
        return SyncSamplesOptimizer(self.local_evaluator,
                                    self.remote_evaluators,
                                    self.config["optimizer"])
from ray.rllib.utils.annotations import override
from ray.rllib.utils import merge_dicts

APEX_QMIX_DEFAULT_CONFIG = merge_dicts(
    QMIX_CONFIG,  # see also the options in qmix.py, which are also supported
    {
        "optimizer_class": "AsyncReplayOptimizer",
        "optimizer": merge_dicts(
            QMIX_CONFIG["optimizer"], {
                "max_weight_sync_delay": 400,
                "num_replay_buffer_shards": 4,
                "batch_replay": True,  # required for RNN. Disables prio.
                "debug": False
            }),
        "num_gpus": 0,
        "num_workers": 32,
        "buffer_size": 2000000,
        "learning_starts": 50000,
        "train_batch_size": 512,
        "sample_batch_size": 50,
        "max_weight_sync_delay": 400,
        "target_network_update_freq": 500000,
        "timesteps_per_iteration": 25000,
        "per_worker_exploration": True,
        "min_iter_time_s": 30,
    },
)


class ApexQMixAgent(QMixAgent):