Code Example #1
 def testMultiAgentSampleRoundRobin(self):
     act_space = gym.spaces.Discrete(2)
     obs_space = gym.spaces.Discrete(10)
     ev = PolicyEvaluator(
         env_creator=lambda _: RoundRobinMultiAgent(5, increment_obs=True),
         policy_graph={
             "p0": (MockPolicyGraph, obs_space, act_space, {}),
         },
         policy_mapping_fn=lambda agent_id: "p0",
         batch_steps=50)
     batch = ev.sample()
     self.assertEqual(batch.count, 50)
     # Since agents are introduced into the env in round-robin order, some of
     # the env steps don't count as proper transitions.
     self.assertEqual(batch.policy_batches["p0"].count, 42)
     self.assertEqual(batch.policy_batches["p0"]["obs"].tolist()[:10], [
         one_hot(0, 10),
         one_hot(1, 10),
         one_hot(2, 10),
         one_hot(3, 10),
         one_hot(4, 10),
     ] * 2)
     self.assertEqual(batch.policy_batches["p0"]["new_obs"].tolist()[:10], [
         one_hot(1, 10),
         one_hot(2, 10),
         one_hot(3, 10),
         one_hot(4, 10),
         one_hot(5, 10),
     ] * 2)
     self.assertEqual(batch.policy_batches["p0"]["rewards"].tolist()[:10],
                      [100, 100, 100, 100, 0] * 2)
     self.assertEqual(batch.policy_batches["p0"]["dones"].tolist()[:10],
                      [False, False, False, False, True] * 2)
     self.assertEqual(batch.policy_batches["p0"]["t"].tolist()[:10],
                      [4, 9, 14, 19, 24, 5, 10, 15, 20, 25])
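The assertions above compare observation rows against a one_hot helper that the test module imports. As a point of reference, here is a minimal sketch of such a helper, assuming it simply returns a length-n list with a 1.0 at the given index; the actual helper in the Ray test suite may differ in return type or dtype.

def one_hot(i, n):
    # Hypothetical helper (an assumption, not the verbatim Ray implementation):
    # build a length-n vector of zeros with a 1.0 at position i.
    out = [0.0] * n
    out[i] = 1.0
    return out

Under this sketch, one_hot(2, 10) yields [0.0, 0.0, 1.0, 0.0, ..., 0.0], which matches the shape of the obs and new_obs rows asserted above.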
Code Example #2
    def testCustomRNNStateValues(self):
        h = {"some": {"arbitrary": "structure", "here": [1, 2, 3]}}

        class StatefulPolicyGraph(PolicyGraph):
            def compute_actions(self,
                                obs_batch,
                                state_batches,
                                prev_action_batch=None,
                                prev_reward_batch=None,
                                episodes=None,
                                **kwargs):
                return [0] * len(obs_batch), [[h] * len(obs_batch)], {}

            def get_initial_state(self):
                return [{}]  # empty dict

        ev = PolicyEvaluator(
            env_creator=lambda _: gym.make("CartPole-v0"),
            policy_graph=StatefulPolicyGraph,
            batch_steps=5)
        batch = ev.sample()
        self.assertEqual(batch.count, 5)
        self.assertEqual(batch["state_in_0"][0], {})
        self.assertEqual(batch["state_out_0"][0], h)
        self.assertEqual(batch["state_in_0"][1], h)
        self.assertEqual(batch["state_out_0"][1], h)
Code Example #3
 def testCompleteEpisodes(self):
     ev = PolicyEvaluator(
         env_creator=lambda _: MockEnv(10),
         policy_graph=MockPolicyGraph,
         batch_steps=5,
         batch_mode="complete_episodes")
     batch = ev.sample()
     self.assertEqual(batch.count, 10)
Code Example #4
 def testExternalEnvHorizonNotSupported(self):
     ev = PolicyEvaluator(
         env_creator=lambda _: SimpleServing(MockEnv(25)),
         policy_graph=MockPolicyGraph,
         episode_horizon=20,
         batch_steps=10,
         batch_mode="complete_episodes")
     self.assertRaises(ValueError, lambda: ev.sample())
Code Example #5
 def testExternalEnvBadActions(self):
     ev = PolicyEvaluator(
         env_creator=lambda _: SimpleServing(MockEnv(25)),
         policy_graph=BadPolicyGraph,
         sample_async=True,
         batch_steps=40,
         batch_mode="truncate_episodes")
     self.assertRaises(Exception, lambda: ev.sample())
Code Example #6
 def testAsync(self):
     ev = PolicyEvaluator(
         env_creator=lambda _: gym.make("CartPole-v0"),
         sample_async=True,
         policy_graph=MockPolicyGraph)
     batch = ev.sample()
     for key in ["obs", "actions", "rewards", "dones", "advantages"]:
         self.assertIn(key, batch)
     self.assertGreater(batch["advantages"][0], 1)
Code Example #7
 def testExternalEnvTruncateEpisodes(self):
     ev = PolicyEvaluator(
         env_creator=lambda _: SimpleServing(MockEnv(25)),
         policy_graph=MockPolicyGraph,
         batch_steps=40,
         batch_mode="truncate_episodes")
     for _ in range(3):
         batch = ev.sample()
         self.assertEqual(batch.count, 40)
Code Example #8
 def testCompleteEpisodesPacking(self):
     ev = PolicyEvaluator(
         env_creator=lambda _: MockEnv(10),
         policy_graph=MockPolicyGraph,
         batch_steps=15,
         batch_mode="complete_episodes")
     batch = ev.sample()
     self.assertEqual(batch.count, 20)
     self.assertEqual(
         batch["t"].tolist(),
         [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
Code Example #9
 def testExternalEnvOffPolicy(self):
     ev = PolicyEvaluator(
         env_creator=lambda _: SimpleOffPolicyServing(MockEnv(25), 42),
         policy_graph=MockPolicyGraph,
         batch_steps=40,
         batch_mode="complete_episodes")
     for _ in range(3):
         batch = ev.sample()
         self.assertEqual(batch.count, 50)
         self.assertEqual(batch["actions"][0], 42)
         self.assertEqual(batch["actions"][-1], 42)
Code Example #10
 def testFilterSync(self):
     ev = PolicyEvaluator(
         env_creator=lambda _: gym.make("CartPole-v0"),
         policy_graph=MockPolicyGraph,
         sample_async=True,
         observation_filter="ConcurrentMeanStdFilter")
     time.sleep(2)
     ev.sample()
     filters = ev.get_filters(flush_after=True)
     obs_f = filters["default"]
     self.assertNotEqual(obs_f.rs.n, 0)
     self.assertNotEqual(obs_f.buffer.n, 0)
Code Example #11
 def testBaselinePerformance(self):
     ev = PolicyEvaluator(
         env_creator=lambda _: gym.make("CartPole-v0"),
         policy_graph=MockPolicyGraph,
         batch_steps=100)
     start = time.time()
     count = 0
     while time.time() - start < 1:
         count += ev.sample().count
     print()
     print("Samples per second {}".format(count / (time.time() - start)))
     print()
Code Example #12
 def testGetFilters(self):
     ev = PolicyEvaluator(
         env_creator=lambda _: gym.make("CartPole-v0"),
         policy_graph=MockPolicyGraph,
         sample_async=True,
         observation_filter="ConcurrentMeanStdFilter")
     self.sample_and_flush(ev)
     filters = ev.get_filters(flush_after=False)
     time.sleep(2)
     filters2 = ev.get_filters(flush_after=False)
     obs_f = filters["default"]
     obs_f2 = filters2["default"]
     self.assertGreaterEqual(obs_f2.rs.n, obs_f.rs.n)
     self.assertGreaterEqual(obs_f2.buffer.n, obs_f.buffer.n)
Code Example #13
 def testBatchesLargerWhenVectorized(self):
     ev = PolicyEvaluator(
         env_creator=lambda _: MockEnv(episode_length=8),
         policy_graph=MockPolicyGraph,
         batch_mode="truncate_episodes",
         batch_steps=4,
         num_envs=4)
     batch = ev.sample()
     self.assertEqual(batch.count, 16)
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 0)
     batch = ev.sample()
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 4)
Code Example #14
 def testMultiAgentSampleWithHorizon(self):
     act_space = gym.spaces.Discrete(2)
     obs_space = gym.spaces.Discrete(2)
     ev = PolicyEvaluator(
         env_creator=lambda _: BasicMultiAgent(5),
         policy_graph={
             "p0": (MockPolicyGraph, obs_space, act_space, {}),
             "p1": (MockPolicyGraph, obs_space, act_space, {}),
         },
         policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2),
         episode_horizon=10,  # test with episode horizon set
         batch_steps=50)
     batch = ev.sample()
     self.assertEqual(batch.count, 50)
Code Example #15
 def testMetrics(self):
     ev = PolicyEvaluator(
         env_creator=lambda _: MockEnv(episode_length=10),
         policy_graph=MockPolicyGraph,
         batch_mode="complete_episodes")
     remote_ev = PolicyEvaluator.as_remote().remote(
         env_creator=lambda _: MockEnv(episode_length=10),
         policy_graph=MockPolicyGraph,
         batch_mode="complete_episodes")
     ev.sample()
     ray.get(remote_ev.sample.remote())
     result = collect_metrics(ev, [remote_ev])
     self.assertEqual(result["episodes_this_iter"], 20)
     self.assertEqual(result["episode_reward_mean"], 10)
Code Example #16
 def testMultiAgentSampleAsyncRemote(self):
     act_space = gym.spaces.Discrete(2)
     obs_space = gym.spaces.Discrete(2)
     ev = PolicyEvaluator(
         env_creator=lambda _: BasicMultiAgent(5),
         policy_graph={
             "p0": (MockPolicyGraph, obs_space, act_space, {}),
             "p1": (MockPolicyGraph, obs_space, act_space, {}),
         },
         policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2),
         batch_steps=50,
         num_envs=4,
         async_remote_worker_envs=True)
     batch = ev.sample()
     self.assertEqual(batch.count, 200)
Code Example #17
 def testSampleFromEarlyDoneEnv(self):
     act_space = gym.spaces.Discrete(2)
     obs_space = gym.spaces.Discrete(2)
     ev = PolicyEvaluator(
         env_creator=lambda _: EarlyDoneMultiAgent(),
         policy_graph={
             "p0": (MockPolicyGraph, obs_space, act_space, {}),
             "p1": (MockPolicyGraph, obs_space, act_space, {}),
         },
         policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2),
         batch_mode="complete_episodes",
         batch_steps=1)
     self.assertRaisesRegexp(ValueError,
                             ".*don't have a last observation.*",
                             lambda: ev.sample())
Code Example #18
 def testVectorEnvSupport(self):
     ev = PolicyEvaluator(
         env_creator=lambda _: MockVectorEnv(episode_length=20, num_envs=8),
         policy_graph=MockPolicyGraph,
         batch_mode="truncate_episodes",
         batch_steps=10)
     for _ in range(8):
         batch = ev.sample()
         self.assertEqual(batch.count, 10)
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 0)
     for _ in range(8):
         batch = ev.sample()
         self.assertEqual(batch.count, 10)
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 8)
Code Example #19
    def testReturningModelBasedRolloutsData(self):
        class ModelBasedPolicyGraph(PGPolicyGraph):
            def compute_actions(self,
                                obs_batch,
                                state_batches,
                                prev_action_batch=None,
                                prev_reward_batch=None,
                                episodes=None,
                                **kwargs):
                # Pretend we did a model-based rollout and want to return
                # the extra trajectory.
                builder = episodes[0].new_batch_builder()
                rollout_id = random.randint(0, 10000)
                for t in range(5):
                    builder.add_values(
                        agent_id="extra_0",
                        policy_id="p1",  # use p1 so we can easily check it
                        t=t,
                        eps_id=rollout_id,  # new id for each rollout
                        obs=obs_batch[0],
                        actions=0,
                        rewards=0,
                        dones=t == 4,
                        infos={},
                        new_obs=obs_batch[0])
                batch = builder.build_and_reset(episode=None)
                episodes[0].add_extra_batch(batch)

                # Just return zeros for actions
                return [0] * len(obs_batch), [], {}

        single_env = gym.make("CartPole-v0")
        obs_space = single_env.observation_space
        act_space = single_env.action_space
        ev = PolicyEvaluator(
            env_creator=lambda _: MultiCartpole(2),
            policy_graph={
                "p0": (ModelBasedPolicyGraph, obs_space, act_space, {}),
                "p1": (ModelBasedPolicyGraph, obs_space, act_space, {}),
            },
            policy_mapping_fn=lambda agent_id: "p0",
            batch_steps=5)
        batch = ev.sample()
        self.assertEqual(batch.count, 5)
        self.assertEqual(batch.policy_batches["p0"].count, 10)
        self.assertEqual(batch.policy_batches["p1"].count, 25)
Code Example #20
 def testAutoVectorization(self):
     ev = PolicyEvaluator(
         env_creator=lambda _: MockEnv(episode_length=20),
         policy_graph=MockPolicyGraph,
         batch_mode="truncate_episodes",
         batch_steps=16,
         num_envs=8)
     for _ in range(8):
         batch = ev.sample()
         self.assertEqual(batch.count, 16)
     result = collect_metrics(ev, [])
     self.assertEqual(result.episodes_total, 0)
     for _ in range(8):
         batch = ev.sample()
         self.assertEqual(batch.count, 16)
     result = collect_metrics(ev, [])
     self.assertEqual(result.episodes_total, 8)
Code Example #21
 def testBatchDivisibilityCheck(self):
     self.assertRaises(
         ValueError,
         lambda: PolicyEvaluator(
             env_creator=lambda _: MockEnv(episode_length=8),
             policy_graph=MockPolicyGraph,
             batch_mode="truncate_episodes",
             batch_steps=15,
             num_envs=4))
Code Example #22
File: test_multi_agent_env.py Project: zhiyun/ray
 def testMultiAgentSample(self):
     act_space = gym.spaces.Discrete(2)
     obs_space = gym.spaces.Discrete(2)
     ev = PolicyEvaluator(
         env_creator=lambda _: BasicMultiAgent(5),
         policy_graph={
             "p0": (MockPolicyGraph, obs_space, act_space, {}),
             "p1": (MockPolicyGraph, obs_space, act_space, {}),
         },
         policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2),
         batch_steps=50)
     batch = ev.sample()
     self.assertEqual(batch.count, 50)
     self.assertEqual(batch.policy_batches["p0"].count, 150)
     self.assertEqual(batch.policy_batches["p1"].count, 100)
     self.assertEqual(batch.policy_batches["p0"]["t"].tolist(),
                      list(range(25)) * 6)
Code Example #23
    def testReturningModelBasedRolloutsData(self):
        class ModelBasedPolicyGraph(PGPolicyGraph):
            def compute_actions(self,
                                obs_batch,
                                state_batches,
                                prev_action_batch=None,
                                prev_reward_batch=None,
                                episodes=None):
                # Pretend we did a model-based rollout and want to return
                # the extra trajectory.
                builder = episodes[0].new_batch_builder()
                rollout_id = random.randint(0, 10000)
                for t in range(5):
                    builder.add_values(
                        agent_id="extra_0",
                        policy_id="p1",  # use p1 so we can easily check it
                        t=t,
                        eps_id=rollout_id,  # new id for each rollout
                        obs=obs_batch[0],
                        actions=0,
                        rewards=0,
                        dones=t == 4,
                        infos={},
                        new_obs=obs_batch[0])
                batch = builder.build_and_reset(episode=None)
                episodes[0].add_extra_batch(batch)

                # Just return zeros for actions
                return [0] * len(obs_batch), [], {}

        single_env = gym.make("CartPole-v0")
        obs_space = single_env.observation_space
        act_space = single_env.action_space
        ev = PolicyEvaluator(env_creator=lambda _: MultiCartpole(2),
                             policy_graph={
                                 "p0": (ModelBasedPolicyGraph, obs_space,
                                        act_space, {}),
                                 "p1": (ModelBasedPolicyGraph, obs_space,
                                        act_space, {}),
                             },
                             policy_mapping_fn=lambda agent_id: "p0",
                             batch_steps=5)
        batch = ev.sample()
        self.assertEqual(batch.count, 5)
        self.assertEqual(batch.policy_batches["p0"].count, 10)
        self.assertEqual(batch.policy_batches["p1"].count, 25)
Code Example #24
File: agent.py Project: gavinljj/ray
    def make_remote_evaluators(self, env_creator, policy_graph, count,
                               remote_args):
        """Convenience method to return a number of remote evaluators."""

        cls = PolicyEvaluator.as_remote(**remote_args).remote
        return [
            self._make_evaluator(cls, env_creator, policy_graph, i + 1,
                                 self.config) for i in range(count)
        ]
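For orientation, the snippet below sketches roughly what this convenience wrapper expands to, assuming _make_evaluator forwards its arguments to the remote PolicyEvaluator constructor. The environment, policy graph, and resource values are illustrative placeholders, and the imports are assumed to match the surrounding test snippets.

# Hypothetical expansion (an assumption, not the verbatim implementation):
# build a remote actor class for PolicyEvaluator, then create two remote
# evaluators that share the same env creator and policy graph.
remote_cls = PolicyEvaluator.as_remote(num_cpus=1).remote
remote_evaluators = [
    remote_cls(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_graph=MockPolicyGraph)
    for _ in range(2)
]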
Code Example #25
File: test_multi_agent_env.py Project: zzz622848/ray
    def _testWithOptimizer(self, optimizer_cls):
        n = 3
        env = gym.make("CartPole-v0")
        act_space = env.action_space
        obs_space = env.observation_space
        dqn_config = {"gamma": 0.95, "n_step": 3}
        if optimizer_cls == SyncReplayOptimizer:
            # TODO: support replay with non-DQN graphs. Currently this can't
            # happen since the replay buffer doesn't encode extra fields like
            # "advantages" that PG uses.
            policies = {
                "p1": (DQNTFPolicy, obs_space, act_space, dqn_config),
                "p2": (DQNTFPolicy, obs_space, act_space, dqn_config),
            }
        else:
            policies = {
                "p1": (PGTFPolicy, obs_space, act_space, {}),
                "p2": (DQNTFPolicy, obs_space, act_space, dqn_config),
            }
        ev = PolicyEvaluator(
            env_creator=lambda _: MultiCartpole(n),
            policy=policies,
            policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2],
            batch_steps=50)
        if optimizer_cls == AsyncGradientsOptimizer:

            def policy_mapper(agent_id):
                return ["p1", "p2"][agent_id % 2]

            remote_evs = [
                PolicyEvaluator.as_remote().remote(
                    env_creator=lambda _: MultiCartpole(n),
                    policy=policies,
                    policy_mapping_fn=policy_mapper,
                    batch_steps=50)
            ]
        else:
            remote_evs = []
        optimizer = optimizer_cls(ev, remote_evs)
        for i in range(200):
            ev.foreach_policy(lambda p, _: p.set_epsilon(max(
                0.02, 1 - i * .02)) if isinstance(p, DQNTFPolicy) else None)
            optimizer.step()
            result = collect_metrics(ev, remote_evs)
            if i % 20 == 0:

                def do_update(p):
                    if isinstance(p, DQNTFPolicy):
                        p.update_target()

                ev.foreach_policy(lambda p, _: do_update(p))
                print("Iter {}, rew {}".format(i,
                                               result["policy_reward_mean"]))
                print("Total reward", result["episode_reward_mean"])
            if result["episode_reward_mean"] >= 25 * n:
                return
        print(result)
        raise Exception("failed to improve reward")
Code Example #26
 def sample(self):
     self.reset_sample()
     samples = PolicyEvaluator.sample(self)
     policy = self.policy_map[DEFAULT_POLICY_ID]
     if policy.use_linear_baseline:
         samples = postprocess_trajectory(samples, policy.linear_baseline,
                                          self.policy_config["gamma"],
                                          self.policy_config["lambda"],
                                          self.policy_config["use_gae"])
     return samples
Code Example #27
    def make(cls,
             env_creator,
             policy_graph,
             optimizer_batch_size=None,
             num_workers=0,
             num_envs_per_worker=None,
             optimizer_config=None,
             remote_num_cpus=None,
             remote_num_gpus=None,
             **eval_kwargs):
        """Creates an Optimizer with local and remote evaluators.

        Args:
            env_creator(func): Function that returns a gym.Env given an
                EnvContext wrapped configuration.
            policy_graph (class|dict): Either a class implementing
                PolicyGraph, or a dictionary of policy id strings to
                (PolicyGraph, obs_space, action_space, config) tuples.
                See PolicyEvaluator documentation.
            optimizer_batch_size (int): Batch size summed across all workers.
                Will override worker `batch_steps`.
            num_workers (int): Number of remote evaluators.
            num_envs_per_worker (int): (Optional) Sets the number of
                environments per evaluator for vectorization.
                If set, overrides `num_envs` in kwargs
                for PolicyEvaluator.__init__.
            optimizer_config (dict): Config passed to the optimizer.
            remote_num_cpus (int): CPU specification for remote evaluator.
            remote_num_gpus (int): GPU specification for remote evaluator.
            **eval_kwargs: PolicyEvaluator Class non-positional args.

        Returns:
            (Optimizer) Instance of `cls` with evaluators configured
                accordingly.
        """
        optimizer_config = optimizer_config or {}
        if num_envs_per_worker:
            assert num_envs_per_worker > 0, "Improper num_envs_per_worker!"
            eval_kwargs["num_envs"] = int(num_envs_per_worker)
        if optimizer_batch_size:
            assert optimizer_batch_size > 0
            if num_workers > 1:
                eval_kwargs["batch_steps"] = \
                    optimizer_batch_size // num_workers
            else:
                eval_kwargs["batch_steps"] = optimizer_batch_size
        evaluator = PolicyEvaluator(env_creator, policy_graph, **eval_kwargs)
        remote_cls = PolicyEvaluator.as_remote(remote_num_cpus,
                                               remote_num_gpus)
        remote_evaluators = [
            remote_cls.remote(env_creator, policy_graph, **eval_kwargs)
            for i in range(num_workers)
        ]

        return cls(evaluator, remote_evaluators, optimizer_config)
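A hedged usage sketch of the make classmethod above, assuming it is available on a PolicyOptimizer subclass such as SyncSamplesOptimizer (which appears in a later example) and that the usual imports are in place. The environment, policy graph, and batch sizes are placeholders, not values from the snippet.

# Hypothetical call (an assumption): creates one local evaluator plus two
# remote evaluators, with batch_steps set to 200 // 2 = 100 for each
# evaluator by the batch_steps logic shown above.
optimizer = SyncSamplesOptimizer.make(
    env_creator=lambda _: gym.make("CartPole-v0"),
    policy_graph=MockPolicyGraph,
    optimizer_batch_size=200,
    num_workers=2,
    optimizer_config={})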
Code Example #28
def test_rllib_policy_eval(init_done=False):
    if not init_done:
        init()
    assert (
        not configs["env"]["render"]), "Tests should be run with render=False"
    evaluator = PolicyEvaluator(
        env_creator=lambda _: MultiCarlaEnv(configs),
        # TODO: Remove the hardcoded spaces
        policy_graph={
            "def_policy": (PGPolicyGraph, Box(0.0, 255.0, shape=(84, 84, 3)),
                           Box(-1.0, 1.0, shape=(2, )), {
                               "gamma": 0.99
                           })
        },
        policy_mapping_fn=lambda agent_id: "def_policy",
        batch_steps=BATCH_COUNT,
        episode_horizon=EPISODE_HORIZON)
    samples, count = evaluator.sample_with_count()
    print("Collected {} samples".format(count))
    assert count == BATCH_COUNT
Code Example #29
    def testRewardClipping(self):
        # clipping on
        ev = PolicyEvaluator(
            env_creator=lambda _: MockEnv2(episode_length=10),
            policy_graph=MockPolicyGraph,
            clip_rewards=True,
            batch_mode="complete_episodes")
        self.assertEqual(max(ev.sample()["rewards"]), 1)
        result = collect_metrics(ev, [])
        self.assertEqual(result["episode_reward_mean"], 1000)

        # clipping off
        ev2 = PolicyEvaluator(
            env_creator=lambda _: MockEnv2(episode_length=10),
            policy_graph=MockPolicyGraph,
            clip_rewards=False,
            batch_mode="complete_episodes")
        self.assertEqual(max(ev2.sample()["rewards"]), 100)
        result2 = collect_metrics(ev2, [])
        self.assertEqual(result2["episode_reward_mean"], 1000)
Code Example #30
    def testSyncFilter(self):
        ev = PolicyEvaluator(
            env_creator=lambda _: gym.make("CartPole-v0"),
            policy_graph=MockPolicyGraph,
            sample_async=True,
            observation_filter="ConcurrentMeanStdFilter")
        obs_f = self.sample_and_flush(ev)

        # Current State
        filters = ev.get_filters(flush_after=False)
        obs_f = filters["default"]

        self.assertLessEqual(obs_f.buffer.n, 20)

        new_obsf = obs_f.copy()
        new_obsf.rs._n = 100
        ev.sync_filters({"default": new_obsf})
        filters = ev.get_filters(flush_after=False)
        obs_f = filters["default"]
        self.assertGreaterEqual(obs_f.rs.n, 100)
        self.assertLessEqual(obs_f.buffer.n, 20)
Code Example #31
    def testBasic(self):
        ev = PolicyEvaluator(env_creator=lambda _: gym.make("CartPole-v0"),
                             policy_graph=MockPolicyGraph)
        batch = ev.sample()
        for key in [
                "obs", "actions", "rewards", "dones", "advantages",
                "prev_rewards", "prev_actions"
        ]:
            self.assertIn(key, batch)

        def to_prev(vec):
            out = np.zeros_like(vec)
            for i, v in enumerate(vec):
                if i + 1 < len(out) and not batch["dones"][i]:
                    out[i + 1] = v
            return out.tolist()

        self.assertEqual(batch["prev_rewards"].tolist(),
                         to_prev(batch["rewards"]))
        self.assertEqual(batch["prev_actions"].tolist(),
                         to_prev(batch["actions"]))
        self.assertGreater(batch["advantages"][0], 1)
Code Example #32
 def testAutoVectorization(self):
     ev = PolicyEvaluator(
         env_creator=lambda cfg: MockEnv(episode_length=20, config=cfg),
         policy_graph=MockPolicyGraph,
         batch_mode="truncate_episodes",
         batch_steps=2,
         num_envs=8)
     for _ in range(8):
         batch = ev.sample()
         self.assertEqual(batch.count, 16)
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 0)
     for _ in range(8):
         batch = ev.sample()
         self.assertEqual(batch.count, 16)
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 8)
     indices = []
     for env in ev.async_env.vector_env.envs:
         self.assertEqual(env.unwrapped.config.worker_index, 0)
         indices.append(env.unwrapped.config.vector_index)
     self.assertEqual(indices, [0, 1, 2, 3, 4, 5, 6, 7])
Code Example #33
File: agent.py Project: jamescasbon/ray
    def make_remote_evaluators(self, env_creator, policy_graph, count):
        """Convenience method to return a number of remote evaluators."""

        remote_args = {
            "num_cpus": self.config["num_cpus_per_worker"],
            "num_gpus": self.config["num_gpus_per_worker"],
            "resources": self.config["custom_resources_per_worker"],
        }

        cls = PolicyEvaluator.as_remote(**remote_args).remote
        return [
            self._make_evaluator(cls, env_creator, policy_graph, i + 1,
                                 self.config) for i in range(count)
        ]
Code Example #34
File: agent.py Project: leigeng2014/ray
    def make_remote_evaluators(self, env_creator, policy_graph, count):
        """Convenience method to return a number of remote evaluators."""

        remote_args = {
            "num_cpus": self.config["num_cpus_per_worker"],
            "num_gpus": self.config["num_gpus_per_worker"],
            "resources": self.config["custom_resources_per_worker"],
        }

        cls = PolicyEvaluator.as_remote(**remote_args).remote
        return [
            self._make_evaluator(cls, env_creator, policy_graph, i + 1,
                                 self.config) for i in range(count)
        ]
Code Example #35
    def testBasic(self):
        ev = PolicyEvaluator(
            env_creator=lambda _: gym.make("CartPole-v0"),
            policy_graph=MockPolicyGraph)
        batch = ev.sample()
        for key in [
                "obs", "actions", "rewards", "dones", "advantages",
                "prev_rewards", "prev_actions"
        ]:
            self.assertIn(key, batch)

        def to_prev(vec):
            out = np.zeros_like(vec)
            for i, v in enumerate(vec):
                if i + 1 < len(out) and not batch["dones"][i]:
                    out[i + 1] = v
            return out.tolist()

        self.assertEqual(batch["prev_rewards"].tolist(),
                         to_prev(batch["rewards"]))
        self.assertEqual(batch["prev_actions"].tolist(),
                         to_prev(batch["actions"]))
        self.assertGreater(batch["advantages"][0], 1)
Code Example #36
    def testCustomRNNStateValues(self):
        h = {"some": {"arbitrary": "structure", "here": [1, 2, 3]}}

        class StatefulPolicyGraph(PolicyGraph):
            def compute_actions(self,
                                obs_batch,
                                state_batches,
                                is_training=False,
                                episodes=None):
                return [0] * len(obs_batch), [[h] * len(obs_batch)], {}

            def get_initial_state(self):
                return [{}]  # empty dict

        ev = PolicyEvaluator(env_creator=lambda _: gym.make("CartPole-v0"),
                             policy_graph=StatefulPolicyGraph,
                             batch_steps=5)
        batch = ev.sample()
        self.assertEqual(batch.count, 5)
        self.assertEqual(batch["state_in_0"][0], {})
        self.assertEqual(batch["state_out_0"][0], h)
        self.assertEqual(batch["state_in_0"][1], h)
        self.assertEqual(batch["state_out_0"][1], h)
Code Example #37
File: test_optimizers.py Project: jamescasbon/ray
    def _make_evs(self):
        def make_sess():
            return tf.Session(config=tf.ConfigProto(device_count={"CPU": 2}))

        local = PolicyEvaluator(
            env_creator=lambda _: gym.make("CartPole-v0"),
            policy_graph=PPOPolicyGraph,
            tf_session_creator=make_sess)
        remotes = [
            PolicyEvaluator.as_remote().remote(
                env_creator=lambda _: gym.make("CartPole-v0"),
                policy_graph=PPOPolicyGraph,
                tf_session_creator=make_sess)
        ]
        return local, remotes
Code Example #38
    def _testWithOptimizer(self, optimizer_cls):
        n = 3
        env = gym.make("CartPole-v0")
        act_space = env.action_space
        obs_space = env.observation_space
        dqn_config = {"gamma": 0.95, "n_step": 3}
        if optimizer_cls == SyncReplayOptimizer:
            # TODO: support replay with non-DQN graphs. Currently this can't
            # happen since the replay buffer doesn't encode extra fields like
            # "advantages" that PG uses.
            policies = {
                "p1": (DQNPolicyGraph, obs_space, act_space, dqn_config),
                "p2": (DQNPolicyGraph, obs_space, act_space, dqn_config),
            }
        else:
            policies = {
                "p1": (PGPolicyGraph, obs_space, act_space, {}),
                "p2": (DQNPolicyGraph, obs_space, act_space, dqn_config),
            }
        ev = PolicyEvaluator(
            env_creator=lambda _: MultiCartpole(n),
            policy_graph=policies,
            policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2],
            batch_steps=50)
        if optimizer_cls == AsyncGradientsOptimizer:

            def policy_mapper(agent_id):
                return ["p1", "p2"][agent_id % 2]

            remote_evs = [
                PolicyEvaluator.as_remote().remote(
                    env_creator=lambda _: MultiCartpole(n),
                    policy_graph=policies,
                    policy_mapping_fn=policy_mapper,
                    batch_steps=50)
            ]
        else:
            remote_evs = []
        optimizer = optimizer_cls(ev, remote_evs, {})
        for i in range(200):
            ev.foreach_policy(
                lambda p, _: p.set_epsilon(max(0.02, 1 - i * .02))
                if isinstance(p, DQNPolicyGraph) else None)
            optimizer.step()
            result = collect_metrics(ev, remote_evs)
            if i % 20 == 0:
                ev.foreach_policy(
                    lambda p, _: p.update_target()
                    if isinstance(p, DQNPolicyGraph) else None)
                print("Iter {}, rew {}".format(i,
                                               result["policy_reward_mean"]))
                print("Total reward", result["episode_reward_mean"])
            if result["episode_reward_mean"] >= 25 * n:
                return
        print(result)
        raise Exception("failed to improve reward")
Code Example #39
File: test_multi_agent_env.py Project: zhiyun/ray
 def testTrainMultiCartpoleManyPolicies(self):
     n = 20
     env = gym.make("CartPole-v0")
     act_space = env.action_space
     obs_space = env.observation_space
     policies = {}
     for i in range(20):
         policies["pg_{}".format(i)] = (PGPolicyGraph, obs_space, act_space,
                                        {})
     policy_ids = list(policies.keys())
     ev = PolicyEvaluator(
         env_creator=lambda _: MultiCartpole(n),
         policy_graph=policies,
         policy_mapping_fn=lambda agent_id: random.choice(policy_ids),
         batch_steps=100)
     optimizer = SyncSamplesOptimizer(ev, [], {})
     for i in range(100):
         optimizer.step()
         result = collect_metrics(ev)
         print("Iteration {}, rew {}".format(i, result.policy_reward_mean))
         print("Total reward", result.episode_reward_mean)
         if result.episode_reward_mean >= 25 * n:
             return
     raise Exception("failed to improve reward")
Code Example #40
    def testRewardClipping(self):
        # clipping on
        ev = PolicyEvaluator(env_creator=lambda _: MockEnv2(episode_length=10),
                             policy_graph=MockPolicyGraph,
                             clip_rewards=True,
                             batch_mode="complete_episodes")
        self.assertEqual(max(ev.sample()["rewards"]), 1)
        result = collect_metrics(ev, [])
        self.assertEqual(result["episode_reward_mean"], 1000)

        # clipping off
        ev2 = PolicyEvaluator(
            env_creator=lambda _: MockEnv2(episode_length=10),
            policy_graph=MockPolicyGraph,
            clip_rewards=False,
            batch_mode="complete_episodes")
        self.assertEqual(max(ev2.sample()["rewards"]), 100)
        result2 = collect_metrics(ev2, [])
        self.assertEqual(result2["episode_reward_mean"], 1000)
Code Example #41
    def testSyncFilter(self):
        ev = PolicyEvaluator(env_creator=lambda _: gym.make("CartPole-v0"),
                             policy_graph=MockPolicyGraph,
                             sample_async=True,
                             observation_filter="ConcurrentMeanStdFilter")
        obs_f = self.sample_and_flush(ev)

        # Current State
        filters = ev.get_filters(flush_after=False)
        obs_f = filters["default"]

        self.assertLessEqual(obs_f.buffer.n, 20)

        new_obsf = obs_f.copy()
        new_obsf.rs._n = 100
        ev.sync_filters({"default": new_obsf})
        filters = ev.get_filters(flush_after=False)
        obs_f = filters["default"]
        self.assertGreaterEqual(obs_f.rs.n, 100)
        self.assertLessEqual(obs_f.buffer.n, 20)
Code Example #42
 def sample(self):
     self.reset_sample()
     return PolicyEvaluator.sample(self)
Code Example #43
    print(pretty_print(config))

    sess = tf.InteractiveSession()

    def env_creator(config):
        return gym.make("CartPole-v1")

    evaluator = PolicyEvaluator(
        env_creator,
        MAMLPolicyGraph,
        batch_steps=config["sample_batch_size"],
        batch_mode=config["batch_mode"],
        episode_horizon=config["horizon"],
        preprocessor_pref=config["preprocessor_pref"],
        sample_async=config["sample_async"],
        compress_observations=config["compress_observations"],
        num_envs=config["num_envs_per_worker"],
        observation_filter=config["observation_filter"],
        clip_rewards=config["clip_rewards"],
        env_config=config["env_config"],
        model_config=config["model"],
        policy_config=config,
        worker_index=0,
        monitor_path=self.logdir if config["monitor"] else None,
        log_level=config["log_level"])
    policy = evaluator.policy_map["default"]
    batch = evaluator.sample()
    grads, infos = policy.compute_inner_gradients(batch)

    # observation_space = env.observation_space
    # action_space = env.action_space
    # policy_graph = MAMLPolicyGraph(observation_space, action_space, config)