def testMultiAgentSampleRoundRobin(self):
    act_space = gym.spaces.Discrete(2)
    obs_space = gym.spaces.Discrete(10)
    ev = PolicyEvaluator(
        env_creator=lambda _: RoundRobinMultiAgent(5, increment_obs=True),
        policy_graph={
            "p0": (MockPolicyGraph, obs_space, act_space, {}),
        },
        policy_mapping_fn=lambda agent_id: "p0",
        batch_steps=50)
    batch = ev.sample()
    self.assertEqual(batch.count, 50)
    # since we round robin introduce agents into the env, some of the env
    # steps don't count as proper transitions
    self.assertEqual(batch.policy_batches["p0"].count, 42)
    self.assertEqual(batch.policy_batches["p0"]["obs"].tolist()[:10], [
        one_hot(0, 10),
        one_hot(1, 10),
        one_hot(2, 10),
        one_hot(3, 10),
        one_hot(4, 10),
    ] * 2)
    self.assertEqual(batch.policy_batches["p0"]["new_obs"].tolist()[:10], [
        one_hot(1, 10),
        one_hot(2, 10),
        one_hot(3, 10),
        one_hot(4, 10),
        one_hot(5, 10),
    ] * 2)
    self.assertEqual(batch.policy_batches["p0"]["rewards"].tolist()[:10],
                     [100, 100, 100, 100, 0] * 2)
    self.assertEqual(batch.policy_batches["p0"]["dones"].tolist()[:10],
                     [False, False, False, False, True] * 2)
    self.assertEqual(batch.policy_batches["p0"]["t"].tolist()[:10],
                     [4, 9, 14, 19, 24, 5, 10, 15, 20, 25])

def testCustomRNNStateValues(self):
    h = {"some": {"arbitrary": "structure", "here": [1, 2, 3]}}

    class StatefulPolicyGraph(PolicyGraph):
        def compute_actions(self,
                            obs_batch,
                            state_batches,
                            prev_action_batch=None,
                            prev_reward_batch=None,
                            episodes=None,
                            **kwargs):
            return [0] * len(obs_batch), [[h] * len(obs_batch)], {}

        def get_initial_state(self):
            return [{}]  # empty dict

    ev = PolicyEvaluator(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_graph=StatefulPolicyGraph,
        batch_steps=5)
    batch = ev.sample()
    self.assertEqual(batch.count, 5)
    self.assertEqual(batch["state_in_0"][0], {})
    self.assertEqual(batch["state_out_0"][0], h)
    self.assertEqual(batch["state_in_0"][1], h)
    self.assertEqual(batch["state_out_0"][1], h)

def testCompleteEpisodes(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: MockEnv(10),
        policy_graph=MockPolicyGraph,
        batch_steps=5,
        batch_mode="complete_episodes")
    batch = ev.sample()
    self.assertEqual(batch.count, 10)

def testExternalEnvHorizonNotSupported(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: SimpleServing(MockEnv(25)),
        policy_graph=MockPolicyGraph,
        episode_horizon=20,
        batch_steps=10,
        batch_mode="complete_episodes")
    self.assertRaises(ValueError, lambda: ev.sample())

def testExternalEnvBadActions(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: SimpleServing(MockEnv(25)),
        policy_graph=BadPolicyGraph,
        sample_async=True,
        batch_steps=40,
        batch_mode="truncate_episodes")
    self.assertRaises(Exception, lambda: ev.sample())

def testAsync(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: gym.make("CartPole-v0"),
        sample_async=True,
        policy_graph=MockPolicyGraph)
    batch = ev.sample()
    for key in ["obs", "actions", "rewards", "dones", "advantages"]:
        self.assertIn(key, batch)
    self.assertGreater(batch["advantages"][0], 1)

def testExternalEnvTruncateEpisodes(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: SimpleServing(MockEnv(25)),
        policy_graph=MockPolicyGraph,
        batch_steps=40,
        batch_mode="truncate_episodes")
    for _ in range(3):
        batch = ev.sample()
        self.assertEqual(batch.count, 40)

def testCompleteEpisodesPacking(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: MockEnv(10),
        policy_graph=MockPolicyGraph,
        batch_steps=15,
        batch_mode="complete_episodes")
    batch = ev.sample()
    self.assertEqual(batch.count, 20)
    self.assertEqual(
        batch["t"].tolist(),
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

def testExternalEnvOffPolicy(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: SimpleOffPolicyServing(MockEnv(25), 42),
        policy_graph=MockPolicyGraph,
        batch_steps=40,
        batch_mode="complete_episodes")
    for _ in range(3):
        batch = ev.sample()
        self.assertEqual(batch.count, 50)
        self.assertEqual(batch["actions"][0], 42)
        self.assertEqual(batch["actions"][-1], 42)

def testFilterSync(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_graph=MockPolicyGraph,
        sample_async=True,
        observation_filter="ConcurrentMeanStdFilter")
    time.sleep(2)
    ev.sample()
    filters = ev.get_filters(flush_after=True)
    obs_f = filters["default"]
    self.assertNotEqual(obs_f.rs.n, 0)
    self.assertNotEqual(obs_f.buffer.n, 0)

def testBaselinePerformance(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_graph=MockPolicyGraph,
        batch_steps=100)
    start = time.time()
    count = 0
    while time.time() - start < 1:
        count += ev.sample().count
    print()
    print("Samples per second {}".format(count / (time.time() - start)))
    print()

def testGetFilters(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_graph=MockPolicyGraph,
        sample_async=True,
        observation_filter="ConcurrentMeanStdFilter")
    self.sample_and_flush(ev)
    filters = ev.get_filters(flush_after=False)
    time.sleep(2)
    filters2 = ev.get_filters(flush_after=False)
    obs_f = filters["default"]
    obs_f2 = filters2["default"]
    self.assertGreaterEqual(obs_f2.rs.n, obs_f.rs.n)
    self.assertGreaterEqual(obs_f2.buffer.n, obs_f.buffer.n)

def testBatchesLargerWhenVectorized(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: MockEnv(episode_length=8),
        policy_graph=MockPolicyGraph,
        batch_mode="truncate_episodes",
        batch_steps=4,
        num_envs=4)
    batch = ev.sample()
    self.assertEqual(batch.count, 16)
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 0)
    batch = ev.sample()
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 4)

def testMultiAgentSampleWithHorizon(self):
    act_space = gym.spaces.Discrete(2)
    obs_space = gym.spaces.Discrete(2)
    ev = PolicyEvaluator(
        env_creator=lambda _: BasicMultiAgent(5),
        policy_graph={
            "p0": (MockPolicyGraph, obs_space, act_space, {}),
            "p1": (MockPolicyGraph, obs_space, act_space, {}),
        },
        policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2),
        episode_horizon=10,  # test with episode horizon set
        batch_steps=50)
    batch = ev.sample()
    self.assertEqual(batch.count, 50)

def testMetrics(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: MockEnv(episode_length=10),
        policy_graph=MockPolicyGraph,
        batch_mode="complete_episodes")
    remote_ev = PolicyEvaluator.as_remote().remote(
        env_creator=lambda _: MockEnv(episode_length=10),
        policy_graph=MockPolicyGraph,
        batch_mode="complete_episodes")
    ev.sample()
    ray.get(remote_ev.sample.remote())
    result = collect_metrics(ev, [remote_ev])
    self.assertEqual(result["episodes_this_iter"], 20)
    self.assertEqual(result["episode_reward_mean"], 10)

def testMultiAgentSampleAsyncRemote(self):
    act_space = gym.spaces.Discrete(2)
    obs_space = gym.spaces.Discrete(2)
    ev = PolicyEvaluator(
        env_creator=lambda _: BasicMultiAgent(5),
        policy_graph={
            "p0": (MockPolicyGraph, obs_space, act_space, {}),
            "p1": (MockPolicyGraph, obs_space, act_space, {}),
        },
        policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2),
        batch_steps=50,
        num_envs=4,
        async_remote_worker_envs=True)
    batch = ev.sample()
    self.assertEqual(batch.count, 200)

def testSampleFromEarlyDoneEnv(self):
    act_space = gym.spaces.Discrete(2)
    obs_space = gym.spaces.Discrete(2)
    ev = PolicyEvaluator(
        env_creator=lambda _: EarlyDoneMultiAgent(),
        policy_graph={
            "p0": (MockPolicyGraph, obs_space, act_space, {}),
            "p1": (MockPolicyGraph, obs_space, act_space, {}),
        },
        policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2),
        batch_mode="complete_episodes",
        batch_steps=1)
    self.assertRaisesRegexp(ValueError,
                            ".*don't have a last observation.*",
                            lambda: ev.sample())

def testVectorEnvSupport(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: MockVectorEnv(episode_length=20, num_envs=8),
        policy_graph=MockPolicyGraph,
        batch_mode="truncate_episodes",
        batch_steps=10)
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 10)
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 0)
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 10)
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 8)

def testReturningModelBasedRolloutsData(self):
    class ModelBasedPolicyGraph(PGPolicyGraph):
        def compute_actions(self,
                            obs_batch,
                            state_batches,
                            prev_action_batch=None,
                            prev_reward_batch=None,
                            episodes=None,
                            **kwargs):
            # Pretend we did a model-based rollout and want to return
            # the extra trajectory.
            builder = episodes[0].new_batch_builder()
            rollout_id = random.randint(0, 10000)
            for t in range(5):
                builder.add_values(
                    agent_id="extra_0",
                    policy_id="p1",  # use p1 so we can easily check it
                    t=t,
                    eps_id=rollout_id,  # new id for each rollout
                    obs=obs_batch[0],
                    actions=0,
                    rewards=0,
                    dones=t == 4,
                    infos={},
                    new_obs=obs_batch[0])
            batch = builder.build_and_reset(episode=None)
            episodes[0].add_extra_batch(batch)

            # Just return zeros for actions
            return [0] * len(obs_batch), [], {}

    single_env = gym.make("CartPole-v0")
    obs_space = single_env.observation_space
    act_space = single_env.action_space
    ev = PolicyEvaluator(
        env_creator=lambda _: MultiCartpole(2),
        policy_graph={
            "p0": (ModelBasedPolicyGraph, obs_space, act_space, {}),
            "p1": (ModelBasedPolicyGraph, obs_space, act_space, {}),
        },
        policy_mapping_fn=lambda agent_id: "p0",
        batch_steps=5)
    batch = ev.sample()
    self.assertEqual(batch.count, 5)
    self.assertEqual(batch.policy_batches["p0"].count, 10)
    self.assertEqual(batch.policy_batches["p1"].count, 25)

def testAutoVectorization(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: MockEnv(episode_length=20),
        policy_graph=MockPolicyGraph,
        batch_mode="truncate_episodes",
        batch_steps=16,
        num_envs=8)
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 16)
    result = collect_metrics(ev, [])
    self.assertEqual(result.episodes_total, 0)
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 16)
    result = collect_metrics(ev, [])
    self.assertEqual(result.episodes_total, 8)

def testBatchDivisibilityCheck(self):
    self.assertRaises(
        ValueError,
        lambda: PolicyEvaluator(
            env_creator=lambda _: MockEnv(episode_length=8),
            policy_graph=MockPolicyGraph,
            batch_mode="truncate_episodes",
            batch_steps=15,
            num_envs=4))

def testMultiAgentSample(self):
    act_space = gym.spaces.Discrete(2)
    obs_space = gym.spaces.Discrete(2)
    ev = PolicyEvaluator(
        env_creator=lambda _: BasicMultiAgent(5),
        policy_graph={
            "p0": (MockPolicyGraph, obs_space, act_space, {}),
            "p1": (MockPolicyGraph, obs_space, act_space, {}),
        },
        policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2),
        batch_steps=50)
    batch = ev.sample()
    self.assertEqual(batch.count, 50)
    self.assertEqual(batch.policy_batches["p0"].count, 150)
    self.assertEqual(batch.policy_batches["p1"].count, 100)
    self.assertEqual(batch.policy_batches["p0"]["t"].tolist(),
                     list(range(25)) * 6)

def testReturningModelBasedRolloutsData(self):
    class ModelBasedPolicyGraph(PGPolicyGraph):
        def compute_actions(self,
                            obs_batch,
                            state_batches,
                            prev_action_batch=None,
                            prev_reward_batch=None,
                            episodes=None):
            # Pretend we did a model-based rollout and want to return
            # the extra trajectory.
            builder = episodes[0].new_batch_builder()
            rollout_id = random.randint(0, 10000)
            for t in range(5):
                builder.add_values(
                    agent_id="extra_0",
                    policy_id="p1",  # use p1 so we can easily check it
                    t=t,
                    eps_id=rollout_id,  # new id for each rollout
                    obs=obs_batch[0],
                    actions=0,
                    rewards=0,
                    dones=t == 4,
                    infos={},
                    new_obs=obs_batch[0])
            batch = builder.build_and_reset(episode=None)
            episodes[0].add_extra_batch(batch)

            # Just return zeros for actions
            return [0] * len(obs_batch), [], {}

    single_env = gym.make("CartPole-v0")
    obs_space = single_env.observation_space
    act_space = single_env.action_space
    ev = PolicyEvaluator(
        env_creator=lambda _: MultiCartpole(2),
        policy_graph={
            "p0": (ModelBasedPolicyGraph, obs_space, act_space, {}),
            "p1": (ModelBasedPolicyGraph, obs_space, act_space, {}),
        },
        policy_mapping_fn=lambda agent_id: "p0",
        batch_steps=5)
    batch = ev.sample()
    self.assertEqual(batch.count, 5)
    self.assertEqual(batch.policy_batches["p0"].count, 10)
    self.assertEqual(batch.policy_batches["p1"].count, 25)

def make_remote_evaluators(self, env_creator, policy_graph, count,
                           remote_args):
    """Convenience method to return a number of remote evaluators."""

    cls = PolicyEvaluator.as_remote(**remote_args).remote
    return [
        self._make_evaluator(cls, env_creator, policy_graph, i + 1,
                             self.config) for i in range(count)
    ]

def _testWithOptimizer(self, optimizer_cls):
    n = 3
    env = gym.make("CartPole-v0")
    act_space = env.action_space
    obs_space = env.observation_space
    dqn_config = {"gamma": 0.95, "n_step": 3}
    if optimizer_cls == SyncReplayOptimizer:
        # TODO: support replay with non-DQN graphs. Currently this can't
        # happen since the replay buffer doesn't encode extra fields like
        # "advantages" that PG uses.
        policies = {
            "p1": (DQNTFPolicy, obs_space, act_space, dqn_config),
            "p2": (DQNTFPolicy, obs_space, act_space, dqn_config),
        }
    else:
        policies = {
            "p1": (PGTFPolicy, obs_space, act_space, {}),
            "p2": (DQNTFPolicy, obs_space, act_space, dqn_config),
        }
    ev = PolicyEvaluator(
        env_creator=lambda _: MultiCartpole(n),
        policy=policies,
        policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2],
        batch_steps=50)
    if optimizer_cls == AsyncGradientsOptimizer:

        def policy_mapper(agent_id):
            return ["p1", "p2"][agent_id % 2]

        remote_evs = [
            PolicyEvaluator.as_remote().remote(
                env_creator=lambda _: MultiCartpole(n),
                policy=policies,
                policy_mapping_fn=policy_mapper,
                batch_steps=50)
        ]
    else:
        remote_evs = []
    optimizer = optimizer_cls(ev, remote_evs)
    for i in range(200):
        ev.foreach_policy(
            lambda p, _: p.set_epsilon(max(0.02, 1 - i * .02))
            if isinstance(p, DQNTFPolicy) else None)
        optimizer.step()
        result = collect_metrics(ev, remote_evs)
        if i % 20 == 0:

            def do_update(p):
                if isinstance(p, DQNTFPolicy):
                    p.update_target()

            ev.foreach_policy(lambda p, _: do_update(p))
            print("Iter {}, rew {}".format(i, result["policy_reward_mean"]))
            print("Total reward", result["episode_reward_mean"])
        if result["episode_reward_mean"] >= 25 * n:
            return
    print(result)
    raise Exception("failed to improve reward")

def sample(self):
    self.reset_sample()
    samples = PolicyEvaluator.sample(self)
    policy = self.policy_map[DEFAULT_POLICY_ID]
    if policy.use_linear_baseline:
        samples = postprocess_trajectory(
            samples, policy.linear_baseline, self.policy_config["gamma"],
            self.policy_config["lambda"], self.policy_config["use_gae"])
    return samples

def make(cls,
         env_creator,
         policy_graph,
         optimizer_batch_size=None,
         num_workers=0,
         num_envs_per_worker=None,
         optimizer_config=None,
         remote_num_cpus=None,
         remote_num_gpus=None,
         **eval_kwargs):
    """Creates an Optimizer with local and remote evaluators.

    Args:
        env_creator (func): Function that returns a gym.Env given an
            EnvContext wrapped configuration.
        policy_graph (class|dict): Either a class implementing PolicyGraph,
            or a dictionary of policy id strings to
            (PolicyGraph, obs_space, action_space, config) tuples.
            See PolicyEvaluator documentation.
        optimizer_batch_size (int): Batch size summed across all workers.
            Will override worker `batch_steps`.
        num_workers (int): Number of remote evaluators.
        num_envs_per_worker (int): (Optional) Sets the number of
            environments per evaluator for vectorization. If set,
            overrides `num_envs` in kwargs for PolicyEvaluator.__init__.
        optimizer_config (dict): Config passed to the optimizer.
        remote_num_cpus (int): CPU specification for remote evaluators.
        remote_num_gpus (int): GPU specification for remote evaluators.
        **eval_kwargs: PolicyEvaluator class non-positional args.

    Returns:
        (Optimizer) Instance of `cls` with evaluators configured
            accordingly.
    """
    optimizer_config = optimizer_config or {}
    if num_envs_per_worker:
        assert num_envs_per_worker > 0, "Improper num_envs_per_worker!"
        eval_kwargs["num_envs"] = int(num_envs_per_worker)
    if optimizer_batch_size:
        assert optimizer_batch_size > 0
        if num_workers > 1:
            eval_kwargs["batch_steps"] = \
                optimizer_batch_size // num_workers
        else:
            eval_kwargs["batch_steps"] = optimizer_batch_size
    evaluator = PolicyEvaluator(env_creator, policy_graph, **eval_kwargs)
    remote_cls = PolicyEvaluator.as_remote(remote_num_cpus,
                                           remote_num_gpus)
    remote_evaluators = [
        remote_cls.remote(env_creator, policy_graph, **eval_kwargs)
        for i in range(num_workers)
    ]
    return cls(evaluator, remote_evaluators, optimizer_config)

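# A minimal usage sketch for the `make` factory above. It assumes an optimizer
# subclass such as SyncSamplesOptimizer inherits `make`, and that gym and
# PGPolicyGraph are importable; these names and the CartPole env creator are
# illustrative only and depend on the RLlib version in use.
def example_make_optimizer():
    optimizer = SyncSamplesOptimizer.make(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_graph=PGPolicyGraph,
        optimizer_batch_size=200,  # split across workers as `batch_steps`
        num_workers=2,
        num_envs_per_worker=1)
    # One step() samples from the configured evaluators and applies an update.
    optimizer.step()
    return optimizer
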
def test_rllib_policy_eval(init_done=False):
    if not init_done:
        init()
    assert (
        not configs["env"]["render"]), "Tests should be run with render=False"
    evaluator = PolicyEvaluator(
        env_creator=lambda _: MultiCarlaEnv(configs),
        # TODO: Remove the hardcoded spaces
        policy_graph={
            "def_policy": (PGPolicyGraph, Box(0.0, 255.0, shape=(84, 84, 3)),
                           Box(-1.0, 1.0, shape=(2, )), {
                               "gamma": 0.99
                           })
        },
        policy_mapping_fn=lambda agent_id: "def_policy",
        batch_steps=BATCH_COUNT,
        episode_horizon=EPISODE_HORIZON)
    samples, count = evaluator.sample_with_count()
    print("Collected {} samples".format(count))
    assert count == BATCH_COUNT

def testRewardClipping(self):
    # clipping on
    ev = PolicyEvaluator(
        env_creator=lambda _: MockEnv2(episode_length=10),
        policy_graph=MockPolicyGraph,
        clip_rewards=True,
        batch_mode="complete_episodes")
    self.assertEqual(max(ev.sample()["rewards"]), 1)
    result = collect_metrics(ev, [])
    self.assertEqual(result["episode_reward_mean"], 1000)

    # clipping off
    ev2 = PolicyEvaluator(
        env_creator=lambda _: MockEnv2(episode_length=10),
        policy_graph=MockPolicyGraph,
        clip_rewards=False,
        batch_mode="complete_episodes")
    self.assertEqual(max(ev2.sample()["rewards"]), 100)
    result2 = collect_metrics(ev2, [])
    self.assertEqual(result2["episode_reward_mean"], 1000)

def testSyncFilter(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_graph=MockPolicyGraph,
        sample_async=True,
        observation_filter="ConcurrentMeanStdFilter")
    obs_f = self.sample_and_flush(ev)

    # Current State
    filters = ev.get_filters(flush_after=False)
    obs_f = filters["default"]
    self.assertLessEqual(obs_f.buffer.n, 20)

    new_obsf = obs_f.copy()
    new_obsf.rs._n = 100
    ev.sync_filters({"default": new_obsf})
    filters = ev.get_filters(flush_after=False)
    obs_f = filters["default"]
    self.assertGreaterEqual(obs_f.rs.n, 100)
    self.assertLessEqual(obs_f.buffer.n, 20)

def testBasic(self):
    ev = PolicyEvaluator(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_graph=MockPolicyGraph)
    batch = ev.sample()
    for key in [
            "obs", "actions", "rewards", "dones", "advantages",
            "prev_rewards", "prev_actions"
    ]:
        self.assertIn(key, batch)

    def to_prev(vec):
        out = np.zeros_like(vec)
        for i, v in enumerate(vec):
            if i + 1 < len(out) and not batch["dones"][i]:
                out[i + 1] = v
        return out.tolist()

    self.assertEqual(batch["prev_rewards"].tolist(),
                     to_prev(batch["rewards"]))
    self.assertEqual(batch["prev_actions"].tolist(),
                     to_prev(batch["actions"]))
    self.assertGreater(batch["advantages"][0], 1)

def testAutoVectorization(self):
    ev = PolicyEvaluator(
        env_creator=lambda cfg: MockEnv(episode_length=20, config=cfg),
        policy_graph=MockPolicyGraph,
        batch_mode="truncate_episodes",
        batch_steps=2,
        num_envs=8)
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 16)
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 0)
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 16)
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 8)
    indices = []
    for env in ev.async_env.vector_env.envs:
        self.assertEqual(env.unwrapped.config.worker_index, 0)
        indices.append(env.unwrapped.config.vector_index)
    self.assertEqual(indices, [0, 1, 2, 3, 4, 5, 6, 7])

def make_remote_evaluators(self, env_creator, policy_graph, count):
    """Convenience method to return a number of remote evaluators."""

    remote_args = {
        "num_cpus": self.config["num_cpus_per_worker"],
        "num_gpus": self.config["num_gpus_per_worker"],
        "resources": self.config["custom_resources_per_worker"],
    }
    cls = PolicyEvaluator.as_remote(**remote_args).remote
    return [
        self._make_evaluator(cls, env_creator, policy_graph, i + 1,
                             self.config) for i in range(count)
    ]

def testCustomRNNStateValues(self):
    h = {"some": {"arbitrary": "structure", "here": [1, 2, 3]}}

    class StatefulPolicyGraph(PolicyGraph):
        def compute_actions(self,
                            obs_batch,
                            state_batches,
                            is_training=False,
                            episodes=None):
            return [0] * len(obs_batch), [[h] * len(obs_batch)], {}

        def get_initial_state(self):
            return [{}]  # empty dict

    ev = PolicyEvaluator(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_graph=StatefulPolicyGraph,
        batch_steps=5)
    batch = ev.sample()
    self.assertEqual(batch.count, 5)
    self.assertEqual(batch["state_in_0"][0], {})
    self.assertEqual(batch["state_out_0"][0], h)
    self.assertEqual(batch["state_in_0"][1], h)
    self.assertEqual(batch["state_out_0"][1], h)

def _make_evs(self):
    def make_sess():
        return tf.Session(config=tf.ConfigProto(device_count={"CPU": 2}))

    local = PolicyEvaluator(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_graph=PPOPolicyGraph,
        tf_session_creator=make_sess)
    remotes = [
        PolicyEvaluator.as_remote().remote(
            env_creator=lambda _: gym.make("CartPole-v0"),
            policy_graph=PPOPolicyGraph,
            tf_session_creator=make_sess)
    ]
    return local, remotes

def _testWithOptimizer(self, optimizer_cls):
    n = 3
    env = gym.make("CartPole-v0")
    act_space = env.action_space
    obs_space = env.observation_space
    dqn_config = {"gamma": 0.95, "n_step": 3}
    if optimizer_cls == SyncReplayOptimizer:
        # TODO: support replay with non-DQN graphs. Currently this can't
        # happen since the replay buffer doesn't encode extra fields like
        # "advantages" that PG uses.
        policies = {
            "p1": (DQNPolicyGraph, obs_space, act_space, dqn_config),
            "p2": (DQNPolicyGraph, obs_space, act_space, dqn_config),
        }
    else:
        policies = {
            "p1": (PGPolicyGraph, obs_space, act_space, {}),
            "p2": (DQNPolicyGraph, obs_space, act_space, dqn_config),
        }
    ev = PolicyEvaluator(
        env_creator=lambda _: MultiCartpole(n),
        policy_graph=policies,
        policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2],
        batch_steps=50)
    if optimizer_cls == AsyncGradientsOptimizer:

        def policy_mapper(agent_id):
            return ["p1", "p2"][agent_id % 2]

        remote_evs = [
            PolicyEvaluator.as_remote().remote(
                env_creator=lambda _: MultiCartpole(n),
                policy_graph=policies,
                policy_mapping_fn=policy_mapper,
                batch_steps=50)
        ]
    else:
        remote_evs = []
    optimizer = optimizer_cls(ev, remote_evs, {})
    for i in range(200):
        ev.foreach_policy(
            lambda p, _: p.set_epsilon(max(0.02, 1 - i * .02))
            if isinstance(p, DQNPolicyGraph) else None)
        optimizer.step()
        result = collect_metrics(ev, remote_evs)
        if i % 20 == 0:
            ev.foreach_policy(
                lambda p, _: p.update_target()
                if isinstance(p, DQNPolicyGraph) else None)
            print("Iter {}, rew {}".format(i, result["policy_reward_mean"]))
            print("Total reward", result["episode_reward_mean"])
        if result["episode_reward_mean"] >= 25 * n:
            return
    print(result)
    raise Exception("failed to improve reward")

def testTrainMultiCartpoleManyPolicies(self):
    n = 20
    env = gym.make("CartPole-v0")
    act_space = env.action_space
    obs_space = env.observation_space
    policies = {}
    for i in range(20):
        policies["pg_{}".format(i)] = (PGPolicyGraph, obs_space,
                                       act_space, {})
    policy_ids = list(policies.keys())
    ev = PolicyEvaluator(
        env_creator=lambda _: MultiCartpole(n),
        policy_graph=policies,
        policy_mapping_fn=lambda agent_id: random.choice(policy_ids),
        batch_steps=100)
    optimizer = SyncSamplesOptimizer(ev, [], {})
    for i in range(100):
        optimizer.step()
        result = collect_metrics(ev)
        print("Iteration {}, rew {}".format(i, result.policy_reward_mean))
        print("Total reward", result.episode_reward_mean)
        if result.episode_reward_mean >= 25 * n:
            return
    raise Exception("failed to improve reward")

def sample(self):
    self.reset_sample()
    return PolicyEvaluator.sample(self)

print(pretty_print(config))
sess = tf.InteractiveSession()

def env_creator(config):
    return gym.make("CartPole-v1")

evaluator = PolicyEvaluator(
    env_creator,
    MAMLPolicyGraph,
    batch_steps=config["sample_batch_size"],
    batch_mode=config["batch_mode"],
    episode_horizon=config["horizon"],
    preprocessor_pref=config["preprocessor_pref"],
    sample_async=config["sample_async"],
    compress_observations=config["compress_observations"],
    num_envs=config["num_envs_per_worker"],
    observation_filter=config["observation_filter"],
    clip_rewards=config["clip_rewards"],
    env_config=config["env_config"],
    model_config=config["model"],
    policy_config=config,
    worker_index=0,
    monitor_path=self.logdir if config["monitor"] else None,
    log_level=config["log_level"])

policy = evaluator.policy_map["default"]
batch = evaluator.sample()
grads, infos = policy.compute_inner_gradients(batch)

# observation_space = env.observation_space
# action_space = env.action_space
# policy_graph = MAMLPolicyGraph(observation_space, action_space, config)