Example #1
    def _setup(self, config):
        env = self._env_id
        if env:
            config["env"] = env
            if _global_registry.contains(ENV_CREATOR, env):
                self.env_creator = _global_registry.get(ENV_CREATOR, env)
            else:
                import gym  # soft dependency
                self.env_creator = lambda env_config: gym.make(env)
        else:
            self.env_creator = lambda env_config: None

        # Merge the supplied config with the class default
        merged_config = copy.deepcopy(self._default_config)
        merged_config = deep_update(merged_config, config,
                                    self._allow_unknown_configs,
                                    self._allow_unknown_subkeys)
        self.raw_user_config = config
        self.config = merged_config

        if self.config["normalize_actions"]:
            inner = self.env_creator
            self.env_creator = (
                lambda env_config: NormalizeActionWrapper(inner(env_config)))

        Trainer._validate_config(self.config)
        log_level = self.config.get("log_level")
        if log_level in ["WARN", "ERROR"]:
            logger.info("Current log_level is {}. For more information, "
                        "set 'log_level': 'INFO' / 'DEBUG' or use the -v and "
                        "-vv flags.".format(log_level))
        if self.config.get("log_level"):
            logging.getLogger("ray.rllib").setLevel(self.config["log_level"])

        def get_scope():
            if tf and not tf.executing_eagerly():
                return tf.Graph().as_default()
            else:
                return open("/dev/null")  # fake a no-op scope

        with get_scope():
            self._init(self.config, self.env_creator)

            # Evaluation related
            if self.config.get("evaluation_interval"):
                # Update env_config with evaluation settings:
                extra_config = copy.deepcopy(self.config["evaluation_config"])
                extra_config.update({
                    "batch_mode": "complete_episodes",
                    "batch_steps": 1,
                })
                logger.debug(
                    "using evaluation_config: {}".format(extra_config))
                self.evaluation_workers = self._make_workers(
                    self.env_creator,
                    self._policy,
                    merge_dicts(self.config, extra_config),
                    num_workers=self.config["evaluation_num_workers"])
                self.evaluation_metrics = {}
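
All of these examples follow the same pattern: start from a default config dict and deep-merge user overrides into it with merge_dicts (or deep_update, as in Example #1 above). As a quick illustration of the merge semantics the snippets rely on, here is a minimal, self-contained sketch. It assumes Ray is installed and that merge_dicts is the recursive deep merge exported by ray.rllib.utils, where override values win and keys present only in the defaults are preserved:

from ray.rllib.utils import merge_dicts

defaults = {"lr": 0.0005, "num_workers": 2,
            "model": {"fcnet_hiddens": [256, 256], "fcnet_activation": "tanh"}}
overrides = {"lr": 0.01, "model": {"fcnet_hiddens": [32, 32]}}

merged = merge_dicts(defaults, overrides)
# Nested dicts are merged key by key rather than replaced wholesale.
assert merged["lr"] == 0.01
assert merged["model"]["fcnet_hiddens"] == [32, 32]
assert merged["model"]["fcnet_activation"] == "tanh"
assert merged["num_workers"] == 2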
Example #2
def loss_game_psro_ppo_params(env: MultiAgentEnv) -> Dict[str, Any]:
    return merge_dicts(
        GRL_DEFAULT_POKER_PPO_PARAMS,
        {
            "num_gpus":
            float(os.getenv("WORKER_GPU_NUM", 0.0)),
            "num_workers":
            4,
            "num_gpus_per_worker":
            float(os.getenv("WORKER_GPU_NUM", 0.0)),
            "num_envs_per_worker":
            1,
            "metrics_smoothing_episodes":
            10000,
            "exploration_config": {
                # The Exploration class to use. In the simplest case, this is the name
                # (str) of any class present in the `rllib.utils.exploration` package.
                # You can also provide the python class directly or the full location
                # of your class (e.g. "ray.rllib.utils.exploration.epsilon_greedy.
                # EpsilonGreedy").
                "type": StochasticSamplingIgnoreKwargs,
                # Add constructor kwargs here (if any).
            },
            "model":
            merge_dicts(
                MODEL_DEFAULTS, {
                    "fcnet_hiddens": [32, 32],
                    "custom_action_dist": "TorchGaussianSquashedGaussian",
                }),
            "entropy_coeff":
            0.01,
            "lambda":
            1.0,
            "train_batch_size":
            2048,
            "sgd_minibatch_size":
            256,
            "num_sgd_iter":
            30,
            "lr":
            0.0005,
            "clip_param":
            0.2,
            "kl_target":
            0.01,
        })
Example #3
def larger_psro_oshi_ppo_params(env: MultiAgentEnv) -> Dict[str, Any]:
    return merge_dicts(
        psro_oshi_ppo_params(env=env),
        {
            "model":
            merge_dicts(
                MODEL_DEFAULTS, {
                    "fcnet_activation": "relu",
                    "fcnet_hiddens": [128, 128],
                    "custom_model": None,
                    "custom_action_dist": "TorchGaussianSquashedGaussian",
                }),
            # Coefficient of the entropy regularizer.
            "entropy_coeff":
            0.0,
            # Decay schedule for the entropy regularizer.
            "entropy_coeff_schedule": [(0, 0.01), (int(2000e3), 0.0)],
        })
Example #4
def loss_game_nfsp_dqn_params(env: MultiAgentEnv) -> Dict[str, Any]:
    return merge_dicts(
        GRL_DEFAULT_OSHI_ZUMO_MEDIUM_DQN_PARAMS, {
            "metrics_smoothing_episodes": 10000,
            "exploration_config": {
                "epsilon_timesteps": int(500e6),
                "final_epsilon": 0.001,
                "initial_epsilon": 0.06,
                "type": ValidActionsEpsilonGreedy
            },
            "model": merge_dicts(MODEL_DEFAULTS, {
                "fcnet_hiddens": [32, 32],
            }),
            "target_network_update_freq": 100000,
            "buffer_size": 100000,
            "lr": 0.007,
            "rollout_fragment_length": 16,
            "train_batch_size": 4096,
        })
Example #5
    def wrapped(*args, object_store_memory, tune_kwargs, **kwargs):
        import ray
        from ray import tune
        from ray.rllib.utils import merge_dicts

        trainable, config, tune_overrides = func(*args, **kwargs)
        tune_kwargs = merge_dicts(tune_kwargs, tune_overrides)

        ray.init(object_store_memory=object_store_memory)
        tune.run(trainable, config=config, **tune_kwargs)
Example #6
File: maml.py Project: hyyh28/tesp
    def make_local_evaluator(self, env_creator, policy_dict):
        return self._make_evaluator(
            self._policy_evaluator, env_creator, policy_dict, 0,
            merge_dicts(
                self.config, {
                    "tf_session_args": {
                        "intra_op_parallelism_threads": 8,
                        "inter_op_parallelism_threads": 8
                    }
                }))
Example #7
    def default_resource_request(cls, config):
        cf = merge_dicts(cls._default_config, config)
        if cf["use_gpu_for_workers"]:
            num_gpus_per_worker = 1
        else:
            num_gpus_per_worker = 0
        return Resources(cpu=1,
                         gpu=cf["gpu"] and 1 or 0,
                         extra_cpu=cf["num_workers"],
                         extra_gpu=num_gpus_per_worker * cf["num_workers"])
Example #8
File: agent.py Project: zhy52/ray
    def make_local_evaluator(self,
                             env_creator,
                             policy_graph,
                             extra_config=None):
        """Convenience method to return configured local evaluator."""

        return self._make_evaluator(
            PolicyEvaluator,
            env_creator,
            policy_graph,
            0,
            merge_dicts(
                # important: allow local tf to use more CPUs for optimization
                merge_dicts(
                    self.config, {
                        "tf_session_args":
                        self.config["local_evaluator_tf_session_args"]
                    }),
                extra_config or {}))
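
Note the nesting order in Example #8 above: the inner merge_dicts layers the local-evaluator tf_session_args on top of self.config, and the outer merge applies extra_config last, so caller-supplied overrides take precedence over both. A small sketch of that precedence, using hypothetical keys and the same merge_dicts as above:

from ray.rllib.utils import merge_dicts

base = {"gamma": 0.99, "tf_session_args": {"intra_op_parallelism_threads": 2}}
local = {"tf_session_args": {"intra_op_parallelism_threads": 8}}
extra = {"gamma": 0.95}

merged = merge_dicts(merge_dicts(base, local), extra)
# The outermost merge has the final say; earlier merges only fill in defaults.
assert merged["tf_session_args"]["intra_op_parallelism_threads"] == 8
assert merged["gamma"] == 0.95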
Example #9
    def make_local_evaluator(self,
                             env_creator,
                             policy_graph,
                             extra_config=None):
        """Convenience method to return configured local evaluator."""

        return self._make_evaluator(
            PolicyEvaluator,
            env_creator,
            policy_graph,
            0,
            merge_dicts(
                # important: allow local tf to use more CPUs for optimization
                merge_dicts(
                    self.config, {
                        "tf_session_args": self.
                        config["local_evaluator_tf_session_args"]
                    }),
                extra_config or {}))
Example #10
def generate_policies(
    policy_id: str,
    policy_constructor_tuple: Tuple["PolicyClass", "gym.Space", "gym.Space",
                                    dict],
    policies: Dict[str, TFPolicy],
    policies_to_train: List[str],
    policy_config: dict,
    preprocessors: Dict[str, Any],
    obs_filters: Dict[str, Any],
    observation_filter: str,
):
    """
    Get policies for each ``agent_id``, and instantiate new ones
    for newly created agents.
    """

    policy_cls, obs_space, act_space, conf = policy_constructor_tuple

    if (policy_id in preprocessors) != (policy_id in policies):
        raise ValueError("'preprocessors' and 'policies' do not agree.")
    if (policy_id in obs_filters) != (policy_id in policies):
        raise ValueError("'obs_filters' and 'policies' do not agree.")

    # If we haven't seen this id, we instantiate a new policy.
    if policy_id not in policies:
        merged_conf = merge_dicts(policy_config, conf)

        # We assume ``self.preprocessing_enabled == True`` in ``RolloutWorker``.
        preprocessor = ModelCatalog.get_preprocessor_for_space(
            obs_space, merged_conf.get("model"))
        preprocessors[policy_id] = preprocessor
        obs_space = preprocessor.observation_space

        if tf and tf.executing_eagerly():
            if hasattr(policy_cls, "as_eager"):
                policy_cls = policy_cls.as_eager()
                if policy_config["eager_tracing"]:
                    policy_cls = policy_cls.with_tracing()
            elif not issubclass(policy_cls, TFPolicy):
                pass  # could be some other type of policy
            else:
                raise ValueError("This policy does not support eager "
                                 "execution: {}".format(policy_cls))
        if tf:
            with tf.variable_scope(policy_id):
                policies[policy_id] = policy_cls(obs_space, act_space,
                                                 merged_conf)
                policies_to_train.append(policy_id)
        else:
            policies[policy_id] = policy_cls(obs_space, act_space, merged_conf)
            policies_to_train.append(policy_id)

        obs_filters[policy_id] = get_filter(observation_filter,
                                            obs_space.shape)
    return policies, preprocessors, obs_filters, policies_to_train
Example #11
def nfsp_leduc_dqn_params_openspeil(env: MultiAgentEnv) -> Dict[str, Any]:
    return merge_dicts(GRL_DEFAULT_OPENSPIEL_POKER_DQN_PARAMS, {
        # === Exploration Settings ===
        "exploration_config": {
            # The Exploration class to use.
            "type": ValidActionsEpsilonGreedy,
            # Config for the Exploration class' constructor:
            "initial_epsilon": 0.06,
            "final_epsilon": 0.001,
            "epsilon_timesteps": int(20e6) * 10,  # Timesteps over which to anneal epsilon.
        },
        # Update the target network every `target_network_update_freq` steps.
        "target_network_update_freq": 19200 * 10,

        "model": merge_dicts(MODEL_DEFAULTS, {
            "fcnet_activation": "relu",
            "fcnet_hiddens": [128],
            "custom_model": get_valid_action_fcn_class_for_env(env=env),
        }),
    })
Example #12
def nfsp_kuhn_avg_policy_params_openspiel(env: MultiAgentEnv) -> Dict[str, Any]:
    return {
        "framework": "torch",
        "learning_starts": 2000,
        "train_batch_size": 128,
        "lr": 0.01,
        "model": merge_dicts(MODEL_DEFAULTS, {
            "fcnet_activation": "relu",
            "fcnet_hiddens": [128],
        }),
    }
Example #13
def nfsp_leduc_avg_policy_params_openspiel(env: MultiAgentEnv) -> Dict[str, Any]:
    return {
        "framework": "torch",
        "learning_starts": 2000,
        "train_batch_size": 128,
        "lr": 0.01,
        "model": merge_dicts(MODEL_DEFAULTS, {
            "fcnet_activation": "relu",
            "fcnet_hiddens": [128],
            "custom_model": get_valid_action_fcn_class_for_env(env=env)
        }),
    }
Example #14
def psro_leduc_ppo_params(env: MultiAgentEnv) -> Dict[str, Any]:
    return merge_dicts(
        GRL_DEFAULT_POKER_PPO_PARAMS, {
            "num_gpus":
            float(os.getenv("WORKER_GPU_NUM", 0.0)),
            "num_workers":
            4,
            "num_gpus_per_worker":
            float(os.getenv("WORKER_GPU_NUM", 0.0)),
            "num_envs_per_worker":
            1,
            "metrics_smoothing_episodes":
            20000,
            "model":
            merge_dicts(
                MODEL_DEFAULTS, {
                    "fcnet_activation": "relu",
                    "fcnet_hiddens": [128, 128],
                    "custom_model": None,
                }),
        })
Example #15
    def __init__(self,
                 env_creator,
                 policy,
                 trainer_config=None,
                 num_workers=0,
                 logdir=None,
                 _setup=True,
                 local_shared_policy_map=None,
                 local_shared_preprocessors=None,
                 local_shared_tf_sess=None):
        """Create a new WorkerSet and initialize its workers.

        Arguments:
            env_creator (func): Function that returns env given env config.
            policy (cls): rllib.policy.Policy class.
            trainer_config (dict): Optional dict that extends the common
                config of the Trainer class.
            num_workers (int): Number of remote rollout workers to create.
            logdir (str): Optional logging directory for workers.
            _setup (bool): Whether to setup workers. This is only for testing.
            local_shared_policy_map (dict): Optional policy map to share with
                the local worker.
            local_shared_preprocessors (dict): Optional preprocessors to share
                with the local worker.
            local_shared_tf_sess: Optional TF session to share with the local
                worker.
        """

        if not trainer_config:
            from ray.rllib.agents.trainer import COMMON_CONFIG
            trainer_config = COMMON_CONFIG

        self._env_creator = env_creator
        self._policy = policy
        self._remote_config = trainer_config
        self._num_workers = num_workers
        self._logdir = logdir

        if _setup:
            self._local_config = merge_dicts(
                trainer_config,
                {"tf_session_args": trainer_config["local_tf_session_args"]})

            # Always create a local worker
            self._local_worker = self._make_worker(
                SharedPolicyRolloutWorker,
                env_creator,
                policy,
                0,
                self._local_config,
                local_shared_policy_map=local_shared_policy_map,
                local_shared_preprocessors=local_shared_preprocessors,
                local_shared_tf_sess=local_shared_tf_sess)
            # jb changed this ^

            # Create a number of remote workers
            self._remote_workers = []
            self.add_workers(num_workers)
Example #16
def get_config():
    return merge_dicts(
        base_config(),
        {
            # === Environment ===
            "env_config": {
                "deceleration_zones": {
                    "center": [[0.0, 0.0]],
                    "decay": [2.0]
                },
                "random_walks": {
                    "num_walks": tune.grid_search([8, 16]),
                    "loc": 10.0,
                    "scale": 2.0,
                },
            },
            # === MAPO model training ===
            # Type of model-training to use. Possible types include
            # daml: policy gradient-aware model learning
            # mle: maximum likelihood estimation
            "model_loss": tune.grid_search(["DAML", "MLE"]),
            # Number of next states to sample from the model when calculating the
            # model-aware deterministic policy gradient
            "num_model_samples": 4,
            # Gradient estimator for model-aware dpg. Possible types include:
            # score_function, pathwise_derivative
            "grad_estimator": "PD",
            # === Replay Buffer ===
            "buffer_size": int(5e4),
            # === Network ===
            # Size and activation of the fully connected networks computing the logits
            # for the policy and action-value function. No layers means the component is
            # linear in states and/or actions.
            "module": {
                "actor": {
                    "encoder": {
                        "units": (64, )
                    }
                },
                "critic": {
                    "encoder": {
                        "units": (64, )
                    }
                },
                "model": {
                    "encoder": {
                        "units": (3, )
                    }
                },  # Bottleneck layer
            },
        },
    )
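
Configs like the one in Example #16 above embed tune.grid_search entries, so they are meant to be handed to tune.run (as Example #5 does), which expands the grid_search values into separate trials. A minimal sketch of that hand-off, assuming Ray Tune and RLlib are installed; the trainable and environment names below are illustrative stand-ins rather than the custom MAPO agent the original config targets:

import ray
from ray import tune

ray.init()
tune.run(
    "PPO",  # illustrative built-in RLlib trainable
    config={"env": "CartPole-v0", "lr": tune.grid_search([1e-3, 1e-4])},
    stop={"training_iteration": 1},
)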
Example #17
def psro_oshi_ppo_params(env: MultiAgentEnv) -> Dict[str, Any]:
    return merge_dicts(
        GRL_DEFAULT_POKER_PPO_PARAMS, {
            "num_gpus":
            float(os.getenv("WORKER_GPU_NUM", 0.0)),
            "num_workers":
            4,
            "num_gpus_per_worker":
            float(os.getenv("WORKER_GPU_NUM", 0.0)),
            "num_envs_per_worker":
            1,
            "metrics_smoothing_episodes":
            5000,
            "model":
            merge_dicts(
                MODEL_DEFAULTS, {
                    "fcnet_activation": "relu",
                    "fcnet_hiddens": [64, 64],
                    "custom_model": None,
                    "custom_action_dist": "TorchGaussianSquashedGaussian",
                }),
        })
Example #18
    def __init__(self,
                 *,
                 env_creator: Optional[Callable[[EnvContext], EnvType]] = None,
                 validate_env: Optional[Callable[[EnvType], None]] = None,
                 policy_class: Optional[Type[Policy]] = None,
                 trainer_config: Optional[TrainerConfigDict] = None,
                 num_workers: int = 0,
                 logdir: Optional[str] = None,
                 _setup: bool = True):
        """Create a new WorkerSet and initialize its workers.

        Args:
            env_creator (Optional[Callable[[EnvContext], EnvType]]): Function
                that returns env given env config.
            validate_env (Optional[Callable[[EnvType], None]]): Optional
                callable to validate the generated environment (only on
                worker=0).
            policy_class (Optional[Type[Policy]]): An rllib.policy.Policy
                class.
            trainer_config (Optional[TrainerConfigDict]): Optional dict that
                extends the common config of the Trainer class.
            num_workers (int): Number of remote rollout workers to create.
            logdir (Optional[str]): Optional logging directory for workers.
            _setup (bool): Whether to setup workers. This is only for testing.
        """

        if not trainer_config:
            from ray.rllib.agents.trainer import COMMON_CONFIG
            trainer_config = COMMON_CONFIG

        self._env_creator = env_creator
        self._policy_class = policy_class
        self._remote_config = trainer_config
        self._logdir = logdir

        if _setup:
            self._local_config = merge_dicts(
                trainer_config,
                {"tf_session_args": trainer_config["local_tf_session_args"]})

            # Create a number of remote workers.
            self._remote_workers = []
            self.add_workers(num_workers)

            # Always create a local worker.
            self._local_worker = self._make_worker(cls=RolloutWorker,
                                                   env_creator=env_creator,
                                                   validate_env=validate_env,
                                                   policy=self._policy_class,
                                                   worker_index=0,
                                                   config=self._local_config)
Example #19
def get_config():
    return merge_dicts(
        base_config(),
        {
            # === Environment ===
            "env_config": {
                "setpoint": 50,
                "miscalibration": tune.grid_search([True, False]),
                "max_episode_steps": 1000,
            },
            # === MAPO model training ===
            "model_rollout_len": 1,
            # === Debugging ===
            # Whether to use the environment's true model to sample states
            "true_model": True,
            # === Replay Buffer ===
            "buffer_size": int(1e5),
            # === Network ===
            # Size and activation of the fully connected networks computing the logits
            # for the policy and action-value function. No layers means the component is
            # linear in states and/or actions.
            "module": {
                "actor": {
                    "encoder": {
                        "units": (256, 256)
                    }
                },
                "critic": {
                    "encoder": {
                        "units": (256, 256)
                    }
                },
                "model": {
                    "encoder": {
                        "units": (256, 256)
                    }
                },
            },
            # === Trainer ===
            "train_batch_size": 256,
            "timesteps_per_iteration": 1000,
            # === Exploration Settings ===
            "exploration_config": {
                "pure_exploration_steps": 2000
            },
            # === Evaluation ===
            "evaluation_interval": 1,
            "evaluation_num_episodes": 5,
        },
    )
Example #20
def get_agent(agent_name, config_path, checkpoint, evaluate, script):
    from ray.rllib.utils import merge_dicts
    from raylab.utils.checkpoints import get_config_from_checkpoint, get_agent_cls
    from raylab.utils.dynamic_import import import_module_from_path

    msg = "Either config or checkpoint can be chosen."
    assert (config_path is None) != (checkpoint is None), msg

    if config_path is not None:
        config = import_module_from_path(config_path).get_config()
        if evaluate:
            if "evaluation_config" not in config:
                warnings.warn("Evaluation agent requested but none in config.")
            else:
                eval_conf = config["evaluation_config"]
                config = merge_dicts(config, eval_conf)
        config = merge_dicts(config, {
            "batch_mode": "complete_episodes",
            "rollout_fragment_length": 1
        })
    else:
        config = get_config_from_checkpoint(
            checkpoint,
            use_eval_config=evaluate,
            config_overrides={
                "batch_mode": "complete_episodes",
                "rollout_fragment_length": 1,
            },
        )

    config["num_workers"] = 0
    config["module"]["torch_script"] = script
    agent_cls = get_agent_cls(agent_name)
    agent = agent_cls(config)
    if checkpoint:
        agent.restore(checkpoint)
    return agent
Example #21
    def _setup(self, config):
        env = self._env_id
        if env:
            config["env"] = env
            if _global_registry.contains(ENV_CREATOR, env):
                self.env_creator = _global_registry.get(ENV_CREATOR, env)
            else:
                import gym  # soft dependency
                self.env_creator = lambda env_config: gym.make(env)
        else:
            self.env_creator = lambda env_config: None

        # Merge the supplied config with the class default
        merged_config = copy.deepcopy(self._default_config)
        merged_config = deep_update(merged_config, config,
                                    self._allow_unknown_configs,
                                    self._allow_unknown_subkeys)
        self.raw_user_config = config
        self.config = merged_config
        Trainer._validate_config(self.config)
        if self.config.get("log_level"):
            logging.getLogger("ray.rllib").setLevel(self.config["log_level"])

        def get_scope():
            if tf:
                return tf.Graph().as_default()
            else:
                return open("/dev/null")  # fake a no-op scope

        with get_scope():
            self._init(self.config, self.env_creator)

            # Evaluation related
            if self.config.get("evaluation_interval"):
                # Update env_config with evaluation settings:
                extra_config = copy.deepcopy(self.config["evaluation_config"])
                extra_config.update({
                    "batch_mode": "complete_episodes",
                    "batch_steps": 1,
                })
                logger.debug(
                    "using evaluation_config: {}".format(extra_config))
                self.evaluation_workers = self._make_workers(self.env_creator,
                                                             self._policy,
                                                             merge_dicts(
                                                                 self.config,
                                                                 extra_config),
                                                             num_workers=0)
                self.evaluation_metrics = self._evaluate()
Example #22
def nfsp_kuhn_avg_policy_params(env: MultiAgentEnv) -> Dict[str, Any]:
    return {
        "framework": "torch",
        "num_gpus": float(os.getenv("WORKER_GPU_NUM", 0.0)),
        "num_workers": 0,
        "num_gpus_per_worker": float(os.getenv("WORKER_GPU_NUM", 0.0)),
        "num_envs_per_worker": 1,
        "learning_starts": 16000,
        "train_batch_size": 4096,
        "lr": 0.1,
        "model": merge_dicts(MODEL_DEFAULTS, {
            "fcnet_activation": "relu",
            "fcnet_hiddens": [128],
        }),
    }
Example #23
def get_config():
    return merge_dicts(
        base_config(),
        {
            # === Environment ===
            "env": "IndustrialBenchmark-v0",
            "env_config": {
                "reward_type": "classic",
                "action_type": "continuous",
                "observation": tune.grid_search(["visible", "markovian"]),
                "max_episode_steps": 1000,
                "time_aware": True,
            },
        },
    )
Example #24
def loss_game_nfsp_avg_policy_params(env: MultiAgentEnv) -> Dict[str, Any]:
    return {
        "metrics_smoothing_episodes": 10000,
        "framework": "torch",
        "num_gpus": float(os.getenv("WORKER_GPU_NUM", 0.0)),
        "num_workers": 0,
        "num_gpus_per_worker": float(os.getenv("WORKER_GPU_NUM", 0.0)),
        "num_envs_per_worker": 1,
        "learning_starts": 16000,
        "train_batch_size": 4096,
        "lr": 0.07,
        "model": merge_dicts(MODEL_DEFAULTS, {
            "fcnet_hiddens": [32, 32],
        }),
    }
Example #25
File: maml.py Project: hyyh28/tesp
    def _init(self):
        self._validate_config()
        env = self.env_creator(env_config={})

        reset_args_shape = (env.reset_args_config["shape"][0], )
        self.reset_args_holder = ResetArgsHolder.remote(
            (self.config["num_workers"], ) + reset_args_shape)
        self.config["env_config"] = merge_dicts(
            self.config["env_config"],
            {"reset_args_holder": self.reset_args_holder})

        self.rng = np.random.RandomState(self.config["random_seed"])
        # print("sampling goals...")
        self.reset_args_train, self.reset_args_test_1, self.reset_args_test_2 \
            = env.sample_reset_args(self.rng, self.config["num_tasks"])
        # print("sampling finished")
        self.reset_args_test = {
            1: self.reset_args_test_1,
            2: self.reset_args_test_2
        }

        observation_space = env.observation_space
        action_space = env.action_space
        policy_dict_local = {
            DEFAULT_POLICY_ID:
            (self._policy_graph, observation_space, action_space, {
                "mode": "local"
            })
        }
        policy_dict_remote = {
            DEFAULT_POLICY_ID:
            (self._policy_graph, observation_space, action_space, {
                "mode": "remote"
            })
        }

        self.local_evaluator = self.make_local_evaluator(
            self.env_creator, policy_dict_local)
        self.remote_evaluators = self.make_remote_evaluators(
            self.env_creator, policy_dict_remote, self.config["num_workers"], {
                "num_cpus": self.config["num_cpus_per_worker"],
                "num_gpus": self.config["num_gpus_per_worker"]
            })
        self.optimizer = MAMLOptimizer(
            self.local_evaluator, self.remote_evaluators, {
                "num_inner_updates": self.config["num_inner_updates"],
                "num_sgd_iter": self.config["num_sgd_iter"]
            })
Example #26
def medium_oshi_zumo_nfsp_avg_policy_params(env: MultiAgentEnv) -> Dict[str, Any]:
    return {
        "framework": "torch",
        "num_gpus": float(os.getenv("WORKER_GPU_NUM", 0.0)),
        "num_workers": 0,
        "num_gpus_per_worker": float(os.getenv("WORKER_GPU_NUM", 0.0)),
        "num_envs_per_worker": 1,
        "learning_starts": 16000,
        "train_batch_size": 2048,
        "lr": 0.1,
        "model": merge_dicts(MODEL_DEFAULTS, {
            "fcnet_activation": "relu",
            "fcnet_hiddens": [128, 128],
            "custom_model": get_valid_action_fcn_class_for_env(env=env),
        }),
    }
Example #27
    def make_local_evaluator(self, env_creator, policy_graph):
        """Convenience method to return configured local evaluator."""

        return self._make_evaluator(
            PolicyEvaluator,
            env_creator,
            policy_graph,
            0,
            # important: allow local tf to use multiple CPUs for optimization
            merge_dicts(
                self.config, {
                    "tf_session_args": {
                        "intra_op_parallelism_threads": None,
                        "inter_op_parallelism_threads": None,
                    }
                }))
Example #28
    def _build_policy_map(
            self, policy_dict: MultiAgentPolicyConfigDict,
            policy_config: TrainerConfigDict
    ) -> Tuple[Dict[PolicyID, Policy], Dict[PolicyID, Preprocessor]]:
        policy_map = {}
        preprocessors = {}
        for name, (cls, obs_space, act_space,
                   conf) in sorted(policy_dict.items()):
            logger.debug("Creating policy for {}".format(name))
            merged_conf = merge_dicts(policy_config, conf)
            merged_conf["num_workers"] = self.num_workers
            merged_conf["worker_index"] = self.worker_index
            if self.preprocessing_enabled:
                preprocessor = ModelCatalog.get_preprocessor_for_space(
                    obs_space, merged_conf.get("model"))
                preprocessors[name] = preprocessor
                obs_space = preprocessor.observation_space
            else:
                preprocessors[name] = NoPreprocessor(obs_space)
            if isinstance(obs_space, gym.spaces.Dict) or \
                    isinstance(obs_space, gym.spaces.Tuple):
                raise ValueError(
                    "Found raw Tuple|Dict space as input to policy. "
                    "Please preprocess these observations with a "
                    "Tuple|DictFlatteningPreprocessor.")
            if tf1 and tf1.executing_eagerly():
                if hasattr(cls, "as_eager"):
                    cls = cls.as_eager()
                    if policy_config.get("eager_tracing"):
                        cls = cls.with_tracing()
                elif not issubclass(cls, TFPolicy):
                    pass  # could be some other type of policy
                else:
                    raise ValueError("This policy does not support eager "
                                     "execution: {}".format(cls))
            if tf1:
                with tf1.variable_scope(name):
                    policy_map[name] = cls(obs_space, act_space, merged_conf)
            else:
                policy_map[name] = cls(obs_space, act_space, merged_conf)
        if self.worker_index == 0:
            logger.info("Built policy map: {}".format(policy_map))
            logger.info("Built preprocessor map: {}".format(preprocessors))
        return policy_map, preprocessors
Example #29
def nfsp_leduc_avg_policy_params_improved(env: MultiAgentEnv) -> Dict[str, Any]:
    # 09.23.29PM_Apr-30-2021/ orig dqn ant_prm FIXED 0.1 lr (0.3, 0.1) annealed 50000000 steps/leduc_nfsp_dqn_hparam_search_nfsp_sparse_09.24.43PM_Apr-30-2021l2zf5w7z

    return {
        "framework": "torch",
        "num_gpus": float(os.getenv("WORKER_GPU_NUM", 0.0)),
        "num_workers": 0,
        "num_gpus_per_worker": float(os.getenv("WORKER_GPU_NUM", 0.0)),
        "num_envs_per_worker": 1,
        "learning_starts": 16000,
        "train_batch_size": 4096,
        # "lr": avg_pol_lr_start_end[0],
        "lr_schedule": [[0, 0.3], [50000000, 0.1]],
        "model": merge_dicts(MODEL_DEFAULTS, {
            "fcnet_activation": "relu",
            "fcnet_hiddens": [128],
            "custom_model": get_valid_action_fcn_class_for_env(env=env),
        }),
    }
Example #30
def nfsp_kuhn_dqn_params(env: MultiAgentEnv) -> Dict[str, Any]:
    return merge_dicts(GRL_DEFAULT_OPENSPIEL_POKER_DQN_PARAMS, {
        "num_gpus": float(os.getenv("WORKER_GPU_NUM", 0.0)),
        "num_workers": 4,
        "num_gpus_per_worker": float(os.getenv("WORKER_GPU_NUM", 0.0)),
        "num_envs_per_worker": 32,

        # How many steps of the model to sample before learning starts.
        "learning_starts": 16000,
        # Update the replay buffer with this many samples at once. Note that
        # this setting applies per-worker if num_workers > 1.
        "rollout_fragment_length": 8,

        "batch_mode": "truncate_episodes",

        # Size of a batch sampled from replay buffer for training. Note that
        # if async_updates is set, then each worker returns gradients for a
        # batch of this size.
        "train_batch_size": 4096,
    })
Example #31
    def _build_policy_map(self, policy_dict, policy_config):
        policy_map = {}
        for name, (cls, obs_space, act_space,
                   conf) in sorted(policy_dict.items()):
            merged_conf = merge_dicts(policy_config, conf)
            with tf.variable_scope(name):
                if isinstance(obs_space, gym.spaces.Dict):
                    raise ValueError(
                        "Found raw Dict space as input to policy graph. "
                        "Please preprocess your environment observations "
                        "with DictFlatteningPreprocessor and set the "
                        "obs space to `preprocessor.observation_space`.")
                elif isinstance(obs_space, gym.spaces.Tuple):
                    raise ValueError(
                        "Found raw Tuple space as input to policy graph. "
                        "Please preprocess your environment observations "
                        "with TupleFlatteningPreprocessor and set the "
                        "obs space to `preprocessor.observation_space`.")
                policy_map[name] = cls(obs_space, act_space, merged_conf)
        return policy_map
Example #32
    def _build_policy_map(self, policy_dict, policy_config):
        policy_map = {}
        preprocessors = {}
        for name, (cls, obs_space, act_space,
                   conf) in sorted(policy_dict.items()):
            merged_conf = merge_dicts(policy_config, conf)
            if self.preprocessing_enabled:
                preprocessor = ModelCatalog.get_preprocessor_for_space(
                    obs_space, merged_conf.get("model"))
                preprocessors[name] = preprocessor
                obs_space = preprocessor.observation_space
            else:
                preprocessors[name] = NoPreprocessor(obs_space)
            if isinstance(obs_space, gym.spaces.Dict) or \
                    isinstance(obs_space, gym.spaces.Tuple):
                raise ValueError(
                    "Found raw Tuple|Dict space as input to policy graph. "
                    "Please preprocess these observations with a "
                    "Tuple|DictFlatteningPreprocessor.")
            with tf.variable_scope(name):
                policy_map[name] = cls(obs_space, act_space, merged_conf)
        return policy_map, preprocessors
Example #33
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from ray.rllib.agents.a3c.a3c import A3CAgent, DEFAULT_CONFIG as A3C_CONFIG
from ray.rllib.optimizers import SyncSamplesOptimizer
from ray.rllib.utils.annotations import override
from ray.rllib.utils import merge_dicts

A2C_DEFAULT_CONFIG = merge_dicts(
    A3C_CONFIG,
    {
        "sample_batch_size": 20,
        "min_iter_time_s": 10,
        "sample_async": False,
    },
)


class A2CAgent(A3CAgent):
    """Synchronous variant of the A3CAgent."""

    _agent_name = "A2C"
    _default_config = A2C_DEFAULT_CONFIG

    @override(A3CAgent)
    def _make_optimizer(self):
        return SyncSamplesOptimizer(self.local_evaluator,
                                    self.remote_evaluators,
                                    self.config["optimizer"])
Example #34
from ray.rllib.utils.annotations import override
from ray.rllib.utils import merge_dicts

APEX_QMIX_DEFAULT_CONFIG = merge_dicts(
    QMIX_CONFIG,  # see also the options in qmix.py, which are also supported
    {
        "optimizer_class": "AsyncReplayOptimizer",
        "optimizer": merge_dicts(
            QMIX_CONFIG["optimizer"],
            {
                "max_weight_sync_delay": 400,
                "num_replay_buffer_shards": 4,
                "batch_replay": True,  # required for RNN. Disables prio.
                "debug": False
            }),
        "num_gpus": 0,
        "num_workers": 32,
        "buffer_size": 2000000,
        "learning_starts": 50000,
        "train_batch_size": 512,
        "sample_batch_size": 50,
        "max_weight_sync_delay": 400,
        "target_network_update_freq": 500000,
        "timesteps_per_iteration": 25000,
        "per_worker_exploration": True,
        "min_iter_time_s": 30,
    },
)


class ApexQMixAgent(QMixAgent):