def _testWithOptimizer(self, optimizer_cls):
    n = 3
    env = gym.make("CartPole-v0")
    act_space = env.action_space
    obs_space = env.observation_space
    dqn_config = {"gamma": 0.95, "n_step": 3}
    if optimizer_cls == SyncReplayOptimizer:
        # TODO: support replay with non-DQN graphs. Currently this can't
        # happen since the replay buffer doesn't encode extra fields like
        # "advantages" that PG uses.
        policies = {
            "p1": (DQNTFPolicy, obs_space, act_space, dqn_config),
            "p2": (DQNTFPolicy, obs_space, act_space, dqn_config),
        }
    else:
        policies = {
            "p1": (PGTFPolicy, obs_space, act_space, {}),
            "p2": (DQNTFPolicy, obs_space, act_space, dqn_config),
        }
    ev = PolicyEvaluator(
        env_creator=lambda _: MultiCartpole(n),
        policy=policies,
        policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2],
        batch_steps=50)
    if optimizer_cls == AsyncGradientsOptimizer:

        def policy_mapper(agent_id):
            return ["p1", "p2"][agent_id % 2]

        remote_evs = [
            PolicyEvaluator.as_remote().remote(
                env_creator=lambda _: MultiCartpole(n),
                policy=policies,
                policy_mapping_fn=policy_mapper,
                batch_steps=50)
        ]
    else:
        remote_evs = []
    optimizer = optimizer_cls(ev, remote_evs)
    for i in range(200):
        ev.foreach_policy(lambda p, _: p.set_epsilon(
            max(0.02, 1 - i * .02)) if isinstance(p, DQNTFPolicy) else None)
        optimizer.step()
        result = collect_metrics(ev, remote_evs)
        if i % 20 == 0:

            def do_update(p):
                if isinstance(p, DQNTFPolicy):
                    p.update_target()

            ev.foreach_policy(lambda p, _: do_update(p))
            print("Iter {}, rew {}".format(i, result["policy_reward_mean"]))
            print("Total reward", result["episode_reward_mean"])
            if result["episode_reward_mean"] >= 25 * n:
                return
    print(result)
    raise Exception("failed to improve reward")
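# A minimal sketch of how the helper above might be driven from the same
# test class; the test method names below are assumptions, though
# SyncReplayOptimizer and AsyncGradientsOptimizer are the classes the
# helper special-cases.
def testSyncReplayOptimizer(self):
    self._testWithOptimizer(SyncReplayOptimizer)

def testAsyncGradientsOptimizer(self):
    self._testWithOptimizer(AsyncGradientsOptimizer)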
def make_remote_evaluators(self, env_creator, policy_graph, count,
                           remote_args):
    """Convenience method to return a number of remote evaluators."""
    cls = PolicyEvaluator.as_remote(**remote_args).remote
    return [
        self._make_evaluator(cls, env_creator, policy_graph, i + 1,
                             self.config) for i in range(count)
    ]
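# Hedged usage sketch for the variant above, where the caller supplies the
# Ray remote args directly; `agent` stands in for the enclosing class
# instance, and the env, policy graph, and argument values are assumptions.
remote_evaluators = agent.make_remote_evaluators(
    env_creator=lambda config: gym.make("CartPole-v0"),
    policy_graph=PGPolicyGraph,
    count=2,
    remote_args={"num_cpus": 1, "num_gpus": 0})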
def _testWithOptimizer(self, optimizer_cls):
    n = 3
    env = gym.make("CartPole-v0")
    act_space = env.action_space
    obs_space = env.observation_space
    dqn_config = {"gamma": 0.95, "n_step": 3}
    if optimizer_cls == SyncReplayOptimizer:
        # TODO: support replay with non-DQN graphs. Currently this can't
        # happen since the replay buffer doesn't encode extra fields like
        # "advantages" that PG uses.
        policies = {
            "p1": (DQNPolicyGraph, obs_space, act_space, dqn_config),
            "p2": (DQNPolicyGraph, obs_space, act_space, dqn_config),
        }
    else:
        policies = {
            "p1": (PGPolicyGraph, obs_space, act_space, {}),
            "p2": (DQNPolicyGraph, obs_space, act_space, dqn_config),
        }
    ev = PolicyEvaluator(
        env_creator=lambda _: MultiCartpole(n),
        policy_graph=policies,
        policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2],
        batch_steps=50)
    if optimizer_cls == AsyncGradientsOptimizer:

        def policy_mapper(agent_id):
            return ["p1", "p2"][agent_id % 2]

        remote_evs = [
            PolicyEvaluator.as_remote().remote(
                env_creator=lambda _: MultiCartpole(n),
                policy_graph=policies,
                policy_mapping_fn=policy_mapper,
                batch_steps=50)
        ]
    else:
        remote_evs = []
    optimizer = optimizer_cls(ev, remote_evs, {})
    for i in range(200):
        ev.foreach_policy(
            lambda p, _: p.set_epsilon(max(0.02, 1 - i * .02))
            if isinstance(p, DQNPolicyGraph) else None)
        optimizer.step()
        result = collect_metrics(ev, remote_evs)
        if i % 20 == 0:
            ev.foreach_policy(
                lambda p, _: p.update_target()
                if isinstance(p, DQNPolicyGraph) else None)
            print("Iter {}, rew {}".format(i, result["policy_reward_mean"]))
            print("Total reward", result["episode_reward_mean"])
            if result["episode_reward_mean"] >= 25 * n:
                return
    print(result)
    raise Exception("failed to improve reward")
def make(cls,
         env_creator,
         policy_graph,
         optimizer_batch_size=None,
         num_workers=0,
         num_envs_per_worker=None,
         optimizer_config=None,
         remote_num_cpus=None,
         remote_num_gpus=None,
         **eval_kwargs):
    """Creates an Optimizer with local and remote evaluators.

    Args:
        env_creator (func): Function that returns a gym.Env given an
            EnvContext wrapped configuration.
        policy_graph (class|dict): Either a class implementing PolicyGraph,
            or a dictionary of policy id strings to
            (PolicyGraph, obs_space, action_space, config) tuples.
            See PolicyEvaluator documentation.
        optimizer_batch_size (int): Batch size summed across all workers.
            Will override the worker `batch_steps`.
        num_workers (int): Number of remote evaluators.
        num_envs_per_worker (int): (Optional) Sets the number of
            environments per evaluator for vectorization. If set,
            overrides `num_envs` in kwargs for PolicyEvaluator.__init__.
        optimizer_config (dict): Config passed to the optimizer.
        remote_num_cpus (int): CPU specification for remote evaluators.
        remote_num_gpus (int): GPU specification for remote evaluators.
        **eval_kwargs: Additional keyword args passed to
            PolicyEvaluator.__init__.

    Returns:
        (Optimizer) Instance of `cls` with evaluators configured
            accordingly.
    """
    optimizer_config = optimizer_config or {}
    if num_envs_per_worker:
        assert num_envs_per_worker > 0, "Improper num_envs_per_worker!"
        eval_kwargs["num_envs"] = int(num_envs_per_worker)
    if optimizer_batch_size:
        assert optimizer_batch_size > 0
        if num_workers > 1:
            eval_kwargs["batch_steps"] = \
                optimizer_batch_size // num_workers
        else:
            eval_kwargs["batch_steps"] = optimizer_batch_size
    evaluator = PolicyEvaluator(env_creator, policy_graph, **eval_kwargs)
    remote_cls = PolicyEvaluator.as_remote(remote_num_cpus,
                                           remote_num_gpus)
    remote_evaluators = [
        remote_cls.remote(env_creator, policy_graph, **eval_kwargs)
        for i in range(num_workers)
    ]
    return cls(evaluator, remote_evaluators, optimizer_config)
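# Minimal usage sketch for make(); it assumes SyncSamplesOptimizer derives
# from the class defining make(), and the env and policy graph choices are
# arbitrary placeholders, not prescribed by the docstring above.
optimizer = SyncSamplesOptimizer.make(
    env_creator=lambda config: gym.make("CartPole-v0"),
    policy_graph=PGPolicyGraph,
    num_workers=2,
    optimizer_batch_size=200)
optimizer.step()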
def testMetrics(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: MockEnv(episode_length=10),
        policy_graph=MockPolicyGraph,
        batch_mode="complete_episodes")
    remote_ev = PolicyEvaluator.as_remote().remote(
        env_creator=lambda _: MockEnv(episode_length=10),
        policy_graph=MockPolicyGraph,
        batch_mode="complete_episodes")
    ev.sample()
    ray.get(remote_ev.sample.remote())
    result = collect_metrics(ev, [remote_ev])
    self.assertEqual(result.episodes_total, 20)
    self.assertEqual(result.episode_reward_mean, 10)
def _make_evs(self):
    def make_sess():
        return tf.Session(config=tf.ConfigProto(device_count={"CPU": 2}))

    local = PolicyEvaluator(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_graph=PPOPolicyGraph,
        tf_session_creator=make_sess)
    remotes = [
        PolicyEvaluator.as_remote().remote(
            env_creator=lambda _: gym.make("CartPole-v0"),
            policy_graph=PPOPolicyGraph,
            tf_session_creator=make_sess)
    ]
    return local, remotes
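# Sketch of how the evaluators returned above might be consumed; wiring them
# into AsyncGradientsOptimizer and collect_metrics is an assumption based on
# the other snippets here, not something this helper prescribes.
local, remotes = self._make_evs()
optimizer = AsyncGradientsOptimizer(local, remotes)
optimizer.step()
result = collect_metrics(local, remotes)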
def make_remote_evaluators(self, env_creator, policy_graph, count):
    """Convenience method to return a number of remote evaluators."""
    remote_args = {
        "num_cpus": self.config["num_cpus_per_worker"],
        "num_gpus": self.config["num_gpus_per_worker"],
        "resources": self.config["custom_resources_per_worker"],
    }
    cls = PolicyEvaluator.as_remote(**remote_args).remote
    return [
        self._make_evaluator(cls, env_creator, policy_graph, i + 1,
                             self.config) for i in range(count)
    ]
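# Usage sketch for the config-driven variant above; it requires
# num_cpus_per_worker, num_gpus_per_worker, and custom_resources_per_worker
# in self.config, and the num_workers key and _policy_graph attribute below
# are assumptions.
self.remote_evaluators = self.make_remote_evaluators(
    env_creator, self._policy_graph, self.config["num_workers"])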
def testMetrics(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: MockEnv(episode_length=10),
        policy_graph=MockPolicyGraph,
        batch_mode="complete_episodes")
    remote_ev = PolicyEvaluator.as_remote().remote(
        env_creator=lambda _: MockEnv(episode_length=10),
        policy_graph=MockPolicyGraph,
        batch_mode="complete_episodes")
    ev.sample()
    ray.get(remote_ev.sample.remote())
    result = collect_metrics(ev, [remote_ev])
    self.assertEqual(result["episodes_this_iter"], 20)
    self.assertEqual(result["episode_reward_mean"], 10)