def testServingEnvTruncateEpisodes(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: SimpleServing(MockEnv(25)),
        policy_graph=MockPolicyGraph,
        batch_steps=40,
        batch_mode="truncate_episodes")
    for _ in range(3):
        batch = ev.sample()
        self.assertEqual(batch.count, 40)

def testCompleteEpisodes(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: MockEnv(10),
        policy_graph=MockPolicyGraph,
        batch_steps=5,
        batch_mode="complete_episodes")
    batch = ev.sample()
    self.assertEqual(batch.count, 10)

def testAsync(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: gym.make("CartPole-v0"),
        sample_async=True,
        policy_graph=MockPolicyGraph)
    batch = ev.sample()
    for key in ["obs", "actions", "rewards", "dones", "advantages"]:
        self.assertIn(key, batch)
    self.assertGreater(batch["advantages"][0], 1)

def testBatchDivisibilityCheck(self):
    self.assertRaises(
        ValueError,
        lambda: PolicyEvaluator(
            env_creator=lambda _: MockEnv(episode_length=8),
            policy_graph=MockPolicyGraph,
            batch_mode="truncate_episodes",
            batch_steps=15,
            num_envs=4))

def testServingEnvHorizonNotSupported(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: SimpleServing(MockEnv(25)),
        policy_graph=MockPolicyGraph,
        episode_horizon=20,
        batch_steps=10,
        batch_mode="complete_episodes")
    ev.sample()
    self.assertRaises(Exception, lambda: ev.sample())

def testExternalEnvBadActions(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: SimpleServing(MockEnv(25)),
        policy_graph=BadPolicyGraph,
        sample_async=True,
        batch_steps=40,
        batch_mode="truncate_episodes")
    self.assertRaises(Exception, lambda: ev.sample())

def _testWithOptimizer(self, optimizer_cls):
    n = 3
    env = gym.make("CartPole-v0")
    act_space = env.action_space
    obs_space = env.observation_space
    dqn_config = {"gamma": 0.95, "n_step": 3}
    if optimizer_cls == SyncReplayOptimizer:
        # TODO: support replay with non-DQN graphs. Currently this can't
        # happen since the replay buffer doesn't encode extra fields like
        # "advantages" that PG uses.
        policies = {
            "p1": (DQNTFPolicy, obs_space, act_space, dqn_config),
            "p2": (DQNTFPolicy, obs_space, act_space, dqn_config),
        }
    else:
        policies = {
            "p1": (PGTFPolicy, obs_space, act_space, {}),
            "p2": (DQNTFPolicy, obs_space, act_space, dqn_config),
        }
    ev = PolicyEvaluator(
        env_creator=lambda _: MultiCartpole(n),
        policy_graph=policies,
        policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2],
        batch_steps=50)
    if optimizer_cls == AsyncGradientsOptimizer:

        def policy_mapper(agent_id):
            return ["p1", "p2"][agent_id % 2]

        remote_evs = [
            PolicyEvaluator.as_remote().remote(
                env_creator=lambda _: MultiCartpole(n),
                policy_graph=policies,
                policy_mapping_fn=policy_mapper,
                batch_steps=50)
        ]
    else:
        remote_evs = []
    optimizer = optimizer_cls(ev, remote_evs)
    for i in range(200):
        # Anneal exploration for the DQN policies only.
        ev.foreach_policy(lambda p, _: p.set_epsilon(
            max(0.02, 1 - i * .02)) if isinstance(p, DQNTFPolicy) else None)
        optimizer.step()
        result = collect_metrics(ev, remote_evs)
        if i % 20 == 0:

            def do_update(p):
                if isinstance(p, DQNTFPolicy):
                    p.update_target()

            ev.foreach_policy(lambda p, _: do_update(p))
            print("Iter {}, rew {}".format(i, result["policy_reward_mean"]))
            print("Total reward", result["episode_reward_mean"])
        if result["episode_reward_mean"] >= 25 * n:
            return
    print(result)
    raise Exception("failed to improve reward")

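# The helper above is parameterized over the optimizer class; the concrete
# tests driving it are assumed to look like the sketch below (the method
# names here are illustrative, not taken from this file):
#
#     def testMultiAgentSyncReplayOptimizer(self):
#         self._testWithOptimizer(SyncReplayOptimizer)
#
#     def testMultiAgentAsyncGradientsOptimizer(self):
#         self._testWithOptimizer(AsyncGradientsOptimizer)
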
def testBatchIds(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_graph=MockPolicyGraph)
    batch1 = ev.sample()
    batch2 = ev.sample()
    self.assertEqual(len(set(batch1["unroll_id"])), 1)
    self.assertEqual(len(set(batch2["unroll_id"])), 1)
    self.assertEqual(
        len(set(SampleBatch.concat(batch1, batch2)["unroll_id"])), 2)

def testAutoConcat(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: MockEnv(episode_length=40),
        policy_graph=MockPolicyGraph,
        sample_async=True,
        batch_steps=10,
        batch_mode="truncate_episodes",
        observation_filter="ConcurrentMeanStdFilter")
    time.sleep(2)
    batch = ev.sample()
    self.assertEqual(batch.count, 40)  # auto-concat up to 5 episodes

def testRewardClipping(self):
    # clipping on
    ev = PolicyEvaluator(
        env_creator=lambda _: MockEnv2(episode_length=10),
        policy_graph=MockPolicyGraph,
        clip_rewards=True,
        batch_mode="complete_episodes")
    self.assertEqual(max(ev.sample()["rewards"]), 1)
    result = collect_metrics(ev, [])
    self.assertEqual(result["episode_reward_mean"], 1000)

    # clipping off
    ev2 = PolicyEvaluator(
        env_creator=lambda _: MockEnv2(episode_length=10),
        policy_graph=MockPolicyGraph,
        clip_rewards=False,
        batch_mode="complete_episodes")
    self.assertEqual(max(ev2.sample()["rewards"]), 100)
    result2 = collect_metrics(ev2, [])
    self.assertEqual(result2["episode_reward_mean"], 1000)

def testCompleteEpisodesPacking(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: MockEnv(10),
        policy_graph=MockPolicyGraph,
        batch_steps=15,
        batch_mode="complete_episodes")
    batch = ev.sample()
    self.assertEqual(batch.count, 20)
    self.assertEqual(
        batch["t"].tolist(),
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

@classmethod
def make(cls,
         env_creator,
         policy_graph,
         optimizer_batch_size=None,
         num_workers=0,
         num_envs_per_worker=None,
         optimizer_config=None,
         remote_num_cpus=None,
         remote_num_gpus=None,
         **eval_kwargs):
    """Creates an Optimizer with local and remote evaluators.

    Args:
        env_creator (func): Function that returns a gym.Env given an
            EnvContext wrapped configuration.
        policy_graph (class|dict): Either a class implementing PolicyGraph,
            or a dictionary of policy id strings to
            (PolicyGraph, obs_space, action_space, config) tuples.
            See PolicyEvaluator documentation.
        optimizer_batch_size (int): Batch size summed across all workers.
            Will override the worker `batch_steps`.
        num_workers (int): Number of remote evaluators.
        num_envs_per_worker (int): (Optional) Sets the number of
            environments per evaluator for vectorization. If set,
            overrides `num_envs` in kwargs for PolicyEvaluator.__init__.
        optimizer_config (dict): Config passed to the optimizer.
        remote_num_cpus (int): CPU specification for remote evaluators.
        remote_num_gpus (int): GPU specification for remote evaluators.
        **eval_kwargs: Non-positional keyword args for PolicyEvaluator.

    Returns:
        (Optimizer) Instance of `cls` with evaluators configured
            accordingly.
    """
    optimizer_config = optimizer_config or {}
    if num_envs_per_worker:
        assert num_envs_per_worker > 0, "Improper num_envs_per_worker!"
        eval_kwargs["num_envs"] = int(num_envs_per_worker)
    if optimizer_batch_size:
        assert optimizer_batch_size > 0
        if num_workers > 1:
            eval_kwargs["batch_steps"] = \
                optimizer_batch_size // num_workers
        else:
            eval_kwargs["batch_steps"] = optimizer_batch_size
    evaluator = PolicyEvaluator(env_creator, policy_graph, **eval_kwargs)
    remote_cls = PolicyEvaluator.as_remote(remote_num_cpus,
                                           remote_num_gpus)
    remote_evaluators = [
        remote_cls.remote(env_creator, policy_graph, **eval_kwargs)
        for i in range(num_workers)
    ]
    return cls(evaluator, remote_evaluators, optimizer_config)

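# A minimal usage sketch for `make`, assuming it is invoked on a concrete
# optimizer subclass (the class name below is an assumption; any subclass
# whose constructor accepts (evaluator, remote_evaluators, config) fits):
#
#     optimizer = SyncSamplesOptimizer.make(
#         env_creator=lambda _: gym.make("CartPole-v0"),
#         policy_graph=MockPolicyGraph,
#         optimizer_batch_size=200,
#         num_workers=2)
#     optimizer.step()  # collect samples from the evaluators, apply updates
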
def testBaselinePerformance(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_graph=MockPolicyGraph,
        batch_steps=100)
    start = time.time()
    count = 0
    while time.time() - start < 1:
        count += ev.sample().count
    print()
    print("Samples per second {}".format(count / (time.time() - start)))
    print()

def testExternalEnvOffPolicy(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: SimpleOffPolicyServing(MockEnv(25), 42),
        policy_graph=MockPolicyGraph,
        batch_steps=40,
        batch_mode="complete_episodes")
    for _ in range(3):
        batch = ev.sample()
        self.assertEqual(batch.count, 50)
        self.assertEqual(batch["actions"][0], 42)
        self.assertEqual(batch["actions"][-1], 42)

def testFilterSync(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_graph=MockPolicyGraph,
        sample_async=True,
        observation_filter="ConcurrentMeanStdFilter")
    time.sleep(2)
    ev.sample()
    filters = ev.get_filters(flush_after=True)
    obs_f = filters["default"]
    self.assertNotEqual(obs_f.rs.n, 0)
    self.assertNotEqual(obs_f.buffer.n, 0)

def testExternalMultiAgentEnvTruncateEpisodes(self):
    agents = 4
    ev = PolicyEvaluator(
        env_creator=lambda _: SimpleMultiServing(BasicMultiAgent(agents)),
        policy_graph=MockPolicyGraph,
        batch_steps=40,
        batch_mode="truncate_episodes")
    for _ in range(3):
        batch = ev.sample()
        self.assertEqual(batch.count, 160)
        self.assertEqual(len(np.unique(batch["agent_index"])), agents)

def testSoftHorizon(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: MockEnv(episode_length=10),
        policy_graph=MockPolicyGraph,
        batch_mode="complete_episodes",
        batch_steps=10,
        episode_horizon=4,
        soft_horizon=True)
    samples = ev.sample()
    # three logical episodes
    self.assertEqual(len(set(samples["eps_id"])), 3)
    # only 1 hard done value
    self.assertEqual(sum(samples["dones"]), 1)

def testHardHorizon(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: MockEnv(episode_length=10),
        policy_graph=MockPolicyGraph,
        batch_mode="complete_episodes",
        batch_steps=10,
        episode_horizon=4,
        soft_horizon=False)
    samples = ev.sample()
    # three logical episodes
    self.assertEqual(len(set(samples["eps_id"])), 3)
    # 3 done values
    self.assertEqual(sum(samples["dones"]), 3)

def testBatchesLargerWhenVectorized(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: MockEnv(episode_length=8),
        policy_graph=MockPolicyGraph,
        batch_mode="truncate_episodes",
        batch_steps=4,
        num_envs=4)
    batch = ev.sample()
    self.assertEqual(batch.count, 16)
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 0)
    batch = ev.sample()
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 4)

def testGetFilters(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_graph=MockPolicyGraph,
        sample_async=True,
        observation_filter="ConcurrentMeanStdFilter")
    self.sample_and_flush(ev)
    filters = ev.get_filters(flush_after=False)
    time.sleep(2)
    filters2 = ev.get_filters(flush_after=False)
    obs_f = filters["default"]
    obs_f2 = filters2["default"]
    self.assertGreaterEqual(obs_f2.rs.n, obs_f.rs.n)
    self.assertGreaterEqual(obs_f2.buffer.n, obs_f.buffer.n)

def testMetrics(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: MockEnv(episode_length=10),
        policy_graph=MockPolicyGraph,
        batch_mode="complete_episodes")
    remote_ev = PolicyEvaluator.as_remote().remote(
        env_creator=lambda _: MockEnv(episode_length=10),
        policy_graph=MockPolicyGraph,
        batch_mode="complete_episodes")
    ev.sample()
    ray.get(remote_ev.sample.remote())
    result = collect_metrics(ev, [remote_ev])
    self.assertEqual(result.episodes_total, 20)
    self.assertEqual(result.episode_reward_mean, 10)

def testBatchesSmallerWhenVectorized(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: MockEnv(episode_length=8),
        policy_graph=MockPolicyGraph,
        batch_mode="truncate_episodes",
        batch_steps=16,
        num_envs=4)
    batch = ev.sample()
    self.assertEqual(batch.count, 16)
    result = collect_metrics(ev, [])
    self.assertEqual(result.episodes_total, 0)
    batch = ev.sample()
    result = collect_metrics(ev, [])
    self.assertEqual(result.episodes_total, 4)

def testMultiAgentSampleWithHorizon(self):
    act_space = gym.spaces.Discrete(2)
    obs_space = gym.spaces.Discrete(2)
    ev = PolicyEvaluator(
        env_creator=lambda _: BasicMultiAgent(5),
        policy_graph={
            "p0": (MockPolicyGraph, obs_space, act_space, {}),
            "p1": (MockPolicyGraph, obs_space, act_space, {}),
        },
        policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2),
        episode_horizon=10,  # test with episode horizon set
        batch_steps=50)
    batch = ev.sample()
    self.assertEqual(batch.count, 50)

def testExternalMultiAgentEnvSample(self):
    agents = 2
    act_space = gym.spaces.Discrete(2)
    obs_space = gym.spaces.Discrete(2)
    ev = PolicyEvaluator(
        env_creator=lambda _: SimpleMultiServing(BasicMultiAgent(agents)),
        policy_graph={
            "p0": (MockPolicyGraph, obs_space, act_space, {}),
            "p1": (MockPolicyGraph, obs_space, act_space, {}),
        },
        policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2),
        batch_steps=50)
    batch = ev.sample()
    self.assertEqual(batch.count, 50)

def _make_evs(self):
    def make_sess():
        return tf.Session(config=tf.ConfigProto(device_count={"CPU": 2}))

    local = PolicyEvaluator(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_graph=PPOPolicyGraph,
        tf_session_creator=make_sess)
    remotes = [
        PolicyEvaluator.as_remote().remote(
            env_creator=lambda _: gym.make("CartPole-v0"),
            policy_graph=PPOPolicyGraph,
            tf_session_creator=make_sess)
    ]
    return local, remotes

def testSampleFromEarlyDoneEnv(self):
    act_space = gym.spaces.Discrete(2)
    obs_space = gym.spaces.Discrete(2)
    ev = PolicyEvaluator(
        env_creator=lambda _: EarlyDoneMultiAgent(),
        policy_graph={
            "p0": (MockPolicyGraph, obs_space, act_space, {}),
            "p1": (MockPolicyGraph, obs_space, act_space, {}),
        },
        policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2),
        batch_mode="complete_episodes",
        batch_steps=1)
    self.assertRaisesRegexp(ValueError,
                            ".*don't have a last observation.*",
                            lambda: ev.sample())

def testMultiAgentSampleAsyncRemote(self):
    act_space = gym.spaces.Discrete(2)
    obs_space = gym.spaces.Discrete(2)
    ev = PolicyEvaluator(
        env_creator=lambda _: BasicMultiAgent(5),
        policy_graph={
            "p0": (MockPolicyGraph, obs_space, act_space, {}),
            "p1": (MockPolicyGraph, obs_space, act_space, {}),
        },
        policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2),
        batch_steps=50,
        num_envs=4,
        async_remote_worker_envs=True)
    batch = ev.sample()
    self.assertEqual(batch.count, 200)

def testVectorEnvSupport(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: MockVectorEnv(episode_length=20, num_envs=8),
        policy_graph=MockPolicyGraph,
        batch_mode="truncate_episodes",
        batch_steps=10)
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 10)
    result = collect_metrics(ev, [])
    self.assertEqual(result.episodes_total, 0)
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 10)
    result = collect_metrics(ev, [])
    self.assertEqual(result.episodes_total, 8)

def testReturningModelBasedRolloutsData(self):
    class ModelBasedPolicyGraph(PGPolicyGraph):
        def compute_actions(self,
                            obs_batch,
                            state_batches,
                            prev_action_batch=None,
                            prev_reward_batch=None,
                            is_training=False,
                            episodes=None):
            # Pretend we did a model-based rollout and want to return
            # the extra trajectory.
            builder = episodes[0].new_batch_builder()
            rollout_id = random.randint(0, 10000)
            for t in range(5):
                builder.add_values(
                    agent_id="extra_0",
                    policy_id="p1",  # use p1 so we can easily check it
                    t=t,
                    eps_id=rollout_id,  # new id for each rollout
                    obs=obs_batch[0],
                    actions=0,
                    rewards=0,
                    dones=t == 4,
                    infos={},
                    new_obs=obs_batch[0])
            batch = builder.build_and_reset()
            episodes[0].add_extra_batch(batch)

            # Just return zeros for actions
            return [0] * len(obs_batch), [], {}

    single_env = gym.make("CartPole-v0")
    obs_space = single_env.observation_space
    act_space = single_env.action_space
    ev = PolicyEvaluator(
        env_creator=lambda _: MultiCartpole(2),
        policy_graph={
            "p0": (ModelBasedPolicyGraph, obs_space, act_space, {}),
            "p1": (ModelBasedPolicyGraph, obs_space, act_space, {}),
        },
        policy_mapping_fn=lambda agent_id: "p0",
        batch_steps=5)
    batch = ev.sample()
    # 5 env steps; both agents map to "p0", so "p0" sees 2 x 5 = 10
    # transitions, and each of the 5 compute_actions calls appends a
    # 5-step model-based rollout under "p1" (5 x 5 = 25).
    self.assertEqual(batch.count, 5)
    self.assertEqual(batch.policy_batches["p0"].count, 10)
    self.assertEqual(batch.policy_batches["p1"].count, 25)

def testMultiAgentSample(self):
    act_space = gym.spaces.Discrete(2)
    obs_space = gym.spaces.Discrete(2)
    ev = PolicyEvaluator(
        env_creator=lambda _: BasicMultiAgent(5),
        policy_graph={
            "p0": (MockPolicyGraph, obs_space, act_space, {}),
            "p1": (MockPolicyGraph, obs_space, act_space, {}),
        },
        policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2),
        batch_steps=50)
    batch = ev.sample()
    self.assertEqual(batch.count, 50)
    self.assertEqual(batch.policy_batches["p0"].count, 150)
    self.assertEqual(batch.policy_batches["p1"].count, 100)
    self.assertEqual(batch.policy_batches["p0"]["t"].tolist(),
                     list(range(25)) * 6)