Example no. 1
    def _testWithOptimizer(self, optimizer_cls):
        n = 3
        env = gym.make("CartPole-v0")
        act_space = env.action_space
        obs_space = env.observation_space
        dqn_config = {"gamma": 0.95, "n_step": 3}
        if optimizer_cls == SyncReplayOptimizer:
            # TODO: support replay with non-DQN graphs. Currently this can't
            # happen since the replay buffer doesn't encode extra fields like
            # "advantages" that PG uses.
            policies = {
                "p1": (DQNTFPolicy, obs_space, act_space, dqn_config),
                "p2": (DQNTFPolicy, obs_space, act_space, dqn_config),
            }
        else:
            policies = {
                "p1": (PGTFPolicy, obs_space, act_space, {}),
                "p2": (DQNTFPolicy, obs_space, act_space, dqn_config),
            }
        ev = PolicyEvaluator(
            env_creator=lambda _: MultiCartpole(n),
            policy=policies,
            policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2],
            batch_steps=50)
        if optimizer_cls == AsyncGradientsOptimizer:

            def policy_mapper(agent_id):
                return ["p1", "p2"][agent_id % 2]

            remote_evs = [
                PolicyEvaluator.as_remote().remote(
                    env_creator=lambda _: MultiCartpole(n),
                    policy=policies,
                    policy_mapping_fn=policy_mapper,
                    batch_steps=50)
            ]
        else:
            remote_evs = []
        optimizer = optimizer_cls(ev, remote_evs)
        for i in range(200):
            ev.foreach_policy(
                lambda p, _: p.set_epsilon(max(0.02, 1 - i * .02))
                if isinstance(p, DQNTFPolicy) else None)
            optimizer.step()
            result = collect_metrics(ev, remote_evs)
            if i % 20 == 0:

                def do_update(p):
                    if isinstance(p, DQNTFPolicy):
                        p.update_target()

                ev.foreach_policy(lambda p, _: do_update(p))
                print("Iter {}, rew {}".format(i,
                                               result["policy_reward_mean"]))
                print("Total reward", result["episode_reward_mean"])
            if result["episode_reward_mean"] >= 25 * n:
                return
        print(result)
        raise Exception("failed to improve reward")
Example no. 2
    def make_remote_evaluators(self, env_creator, policy_graph, count,
                               remote_args):
        """Convenience method to return a number of remote evaluators."""

        cls = PolicyEvaluator.as_remote(**remote_args).remote
        return [
            self._make_evaluator(cls, env_creator, policy_graph, i + 1,
                                 self.config) for i in range(count)
        ]
Example no. 3
    def _testWithOptimizer(self, optimizer_cls):
        n = 3
        env = gym.make("CartPole-v0")
        act_space = env.action_space
        obs_space = env.observation_space
        dqn_config = {"gamma": 0.95, "n_step": 3}
        if optimizer_cls == SyncReplayOptimizer:
            # TODO: support replay with non-DQN graphs. Currently this can't
            # happen since the replay buffer doesn't encode extra fields like
            # "advantages" that PG uses.
            policies = {
                "p1": (DQNPolicyGraph, obs_space, act_space, dqn_config),
                "p2": (DQNPolicyGraph, obs_space, act_space, dqn_config),
            }
        else:
            policies = {
                "p1": (PGPolicyGraph, obs_space, act_space, {}),
                "p2": (DQNPolicyGraph, obs_space, act_space, dqn_config),
            }
        ev = PolicyEvaluator(
            env_creator=lambda _: MultiCartpole(n),
            policy_graph=policies,
            policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2],
            batch_steps=50)
        if optimizer_cls == AsyncGradientsOptimizer:

            def policy_mapper(agent_id):
                return ["p1", "p2"][agent_id % 2]

            remote_evs = [
                PolicyEvaluator.as_remote().remote(
                    env_creator=lambda _: MultiCartpole(n),
                    policy_graph=policies,
                    policy_mapping_fn=policy_mapper,
                    batch_steps=50)
            ]
        else:
            remote_evs = []
        optimizer = optimizer_cls(ev, remote_evs, {})
        for i in range(200):
            ev.foreach_policy(
                lambda p, _: p.set_epsilon(max(0.02, 1 - i * .02))
                if isinstance(p, DQNPolicyGraph) else None)
            optimizer.step()
            result = collect_metrics(ev, remote_evs)
            if i % 20 == 0:
                ev.foreach_policy(
                    lambda p, _: p.update_target()
                    if isinstance(p, DQNPolicyGraph) else None)
                print("Iter {}, rew {}".format(i,
                                               result["policy_reward_mean"]))
                print("Total reward", result["episode_reward_mean"])
            if result["episode_reward_mean"] >= 25 * n:
                return
        print(result)
        raise Exception("failed to improve reward")
Example no. 4
    def make(cls,
             env_creator,
             policy_graph,
             optimizer_batch_size=None,
             num_workers=0,
             num_envs_per_worker=None,
             optimizer_config=None,
             remote_num_cpus=None,
             remote_num_gpus=None,
             **eval_kwargs):
        """Creates an Optimizer with local and remote evaluators.

        Args:
            env_creator (func): Function that returns a gym.Env given an
                EnvContext wrapped configuration.
            policy_graph (class|dict): Either a class implementing
                PolicyGraph, or a dictionary of policy id strings to
                (PolicyGraph, obs_space, action_space, config) tuples.
                See PolicyEvaluator documentation.
            optimizer_batch_size (int): Batch size summed across all workers.
                Will override worker `batch_steps`.
            num_workers (int): Number of remote evaluators.
            num_envs_per_worker (int): (Optional) Sets the number of
                environments per evaluator for vectorization. If set,
                overrides `num_envs` in the kwargs for
                PolicyEvaluator.__init__.
            optimizer_config (dict): Config passed to the optimizer.
            remote_num_cpus (int): CPU specification for remote evaluators.
            remote_num_gpus (int): GPU specification for remote evaluators.
            **eval_kwargs: Keyword args for the PolicyEvaluator class.

        Returns:
            (Optimizer) Instance of `cls` with evaluators configured
                accordingly.
        """
        optimizer_config = optimizer_config or {}
        if num_envs_per_worker:
            assert num_envs_per_worker > 0, "Improper num_envs_per_worker!"
            eval_kwargs["num_envs"] = int(num_envs_per_worker)
        if optimizer_batch_size:
            assert optimizer_batch_size > 0
            if num_workers > 1:
                eval_kwargs["batch_steps"] = \
                    optimizer_batch_size // num_workers
            else:
                eval_kwargs["batch_steps"] = optimizer_batch_size
        evaluator = PolicyEvaluator(env_creator, policy_graph, **eval_kwargs)
        remote_cls = PolicyEvaluator.as_remote(remote_num_cpus,
                                               remote_num_gpus)
        remote_evaluators = [
            remote_cls.remote(env_creator, policy_graph, **eval_kwargs)
            for i in range(num_workers)
        ]

        return cls(evaluator, remote_evaluators, optimizer_config)
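
A hypothetical call to the `make` classmethod above, showing how its arguments map onto the local and remote evaluators it builds. The optimizer class and import paths used here (SyncSamplesOptimizer and PGPolicyGraph from a Ray 0.6-era install) are assumptions, not taken from the snippets in this listing; treat it as a sketch rather than a verified recipe.

    import gym
    import ray
    from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph
    from ray.rllib.optimizers import SyncSamplesOptimizer

    ray.init()  # needed because num_workers > 0 creates remote evaluator actors

    # With num_workers=2, optimizer_batch_size=200 is split into
    # batch_steps=100 per evaluator, as in the branch above.
    optimizer = SyncSamplesOptimizer.make(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_graph=PGPolicyGraph,
        optimizer_batch_size=200,
        num_workers=2,
        num_envs_per_worker=4,  # forwarded to PolicyEvaluator as num_envs
        optimizer_config={})
    optimizer.step()  # one round of sampling and gradient updates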
Example no. 6
    def testMetrics(self):
        ev = PolicyEvaluator(
            env_creator=lambda _: MockEnv(episode_length=10),
            policy_graph=MockPolicyGraph,
            batch_mode="complete_episodes")
        remote_ev = PolicyEvaluator.as_remote().remote(
            env_creator=lambda _: MockEnv(episode_length=10),
            policy_graph=MockPolicyGraph,
            batch_mode="complete_episodes")
        ev.sample()
        ray.get(remote_ev.sample.remote())
        result = collect_metrics(ev, [remote_ev])
        self.assertEqual(result.episodes_total, 20)
        self.assertEqual(result.episode_reward_mean, 10)
Example no. 7
    def _make_evs(self):
        def make_sess():
            return tf.Session(config=tf.ConfigProto(device_count={"CPU": 2}))

        local = PolicyEvaluator(env_creator=lambda _: gym.make("CartPole-v0"),
                                policy_graph=PPOPolicyGraph,
                                tf_session_creator=make_sess)
        remotes = [
            PolicyEvaluator.as_remote().remote(
                env_creator=lambda _: gym.make("CartPole-v0"),
                policy_graph=PPOPolicyGraph,
                tf_session_creator=make_sess)
        ]
        return local, remotes
Example no. 8
    def make_remote_evaluators(self, env_creator, policy_graph, count):
        """Convenience method to return a number of remote evaluators."""

        remote_args = {
            "num_cpus": self.config["num_cpus_per_worker"],
            "num_gpus": self.config["num_gpus_per_worker"],
            "resources": self.config["custom_resources_per_worker"],
        }

        cls = PolicyEvaluator.as_remote(**remote_args).remote
        return [
            self._make_evaluator(cls, env_creator, policy_graph, i + 1,
                                 self.config) for i in range(count)
        ]
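
The `remote_args` dict above maps directly onto the keyword arguments of `PolicyEvaluator.as_remote`, mirroring the num_cpus_per_worker, num_gpus_per_worker, and custom_resources_per_worker config keys. The following minimal sketch shows the same pattern used outside an agent class, along the lines of the testMetrics examples in this listing; the module paths and resource values are assumptions.

    import gym
    import ray
    from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph
    from ray.rllib.evaluation import PolicyEvaluator

    ray.init()

    # Reserve one CPU and no GPUs for each remote evaluator actor.
    remote_cls = PolicyEvaluator.as_remote(num_cpus=1, num_gpus=0).remote
    remote_ev = remote_cls(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_graph=PGPolicyGraph)

    # Sampling runs inside the Ray actor; ray.get fetches the sample batch.
    batch = ray.get(remote_ev.sample.remote())
    print(batch.count)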
Example no. 9
    def testMetrics(self):
        ev = PolicyEvaluator(
            env_creator=lambda _: MockEnv(episode_length=10),
            policy_graph=MockPolicyGraph,
            batch_mode="complete_episodes")
        remote_ev = PolicyEvaluator.as_remote().remote(
            env_creator=lambda _: MockEnv(episode_length=10),
            policy_graph=MockPolicyGraph,
            batch_mode="complete_episodes")
        ev.sample()
        ray.get(remote_ev.sample.remote())
        result = collect_metrics(ev, [remote_ev])
        self.assertEqual(result["episodes_this_iter"], 20)
        self.assertEqual(result["episode_reward_mean"], 10)
Example no. 11
    def _make_evs(self):
        def make_sess():
            return tf.Session(config=tf.ConfigProto(device_count={"CPU": 2}))

        local = PolicyEvaluator(
            env_creator=lambda _: gym.make("CartPole-v0"),
            policy_graph=PPOPolicyGraph,
            tf_session_creator=make_sess)
        remotes = [
            PolicyEvaluator.as_remote().remote(
                env_creator=lambda _: gym.make("CartPole-v0"),
                policy_graph=PPOPolicyGraph,
                tf_session_creator=make_sess)
        ]
        return local, remotes