Example #1
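Tests that episode metrics lag behind sampling when one evaluator drives several vectorized envs: with four 8-step MockEnvs and batch_steps=16, the first sample() advances each env only 4 steps, so collect_metrics reports no finished episodes; the second call finishes all four.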
def testBatchesSmallerWhenVectorized(self):
    ev = CommonPolicyEvaluator(
        env_creator=lambda _: MockEnv(episode_length=8),
        policy_graph=MockPolicyGraph,
        batch_mode="truncate_episodes",
        batch_steps=16, num_envs=4)
    batch = ev.sample()
    self.assertEqual(batch.count, 16)
    result = collect_metrics(ev, [])
    self.assertEqual(result.episodes_total, 0)
    batch = ev.sample()
    result = collect_metrics(ev, [])
    self.assertEqual(result.episodes_total, 4)
Example #2
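The same check for a natively vectorized env: each sample() returns 10 transitions drawn evenly across the eight 20-step sub-envs, so the first eight calls (10 steps per sub-env) complete no episodes, while sixteen calls (20 steps per sub-env) complete all eight.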
def testVectorEnvSupport(self):
    ev = CommonPolicyEvaluator(
        env_creator=lambda _: MockVectorEnv(episode_length=20, num_envs=8),
        policy_graph=MockPolicyGraph,
        batch_mode="truncate_episodes",
        batch_steps=10)
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 10)
    result = collect_metrics(ev, [])
    self.assertEqual(result.episodes_total, 0)
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 10)
    result = collect_metrics(ev, [])
    self.assertEqual(result.episodes_total, 8)
Example #3
File: a3c.py Project: velconia/ray
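A3C's training step: advance the optimizer, synchronize observation filters from the local evaluator to the remote ones, then collect metrics and attach the optimizer's stats as the result's info field.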
def _train(self):
    self.optimizer.step()
    FilterManager.synchronize(self.local_evaluator.filters,
                              self.remote_evaluators)
    result = collect_metrics(self.local_evaluator, self.remote_evaluators)
    result = result._replace(info=self.optimizer.stats())
    return result
Example #4
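Shows that collect_metrics merges episodes from the local evaluator and the remote evaluators passed as the second argument: after both evaluators sample complete 10-step episodes, the combined result reports 20 episodes with a mean reward of 10.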
def testMetrics(self):
    ev = CommonPolicyEvaluator(
        env_creator=lambda _: MockEnv(episode_length=10),
        policy_graph=MockPolicyGraph, batch_mode="complete_episodes")
    remote_ev = CommonPolicyEvaluator.as_remote().remote(
        env_creator=lambda _: MockEnv(episode_length=10),
        policy_graph=MockPolicyGraph, batch_mode="complete_episodes")
    ev.sample()
    ray.get(remote_ev.sample.remote())
    result = collect_metrics(ev, [remote_ev])
    self.assertEqual(result.episodes_total, 20)
    self.assertEqual(result.episode_reward_mean, 10)
Example #5
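A multi-agent training loop: two policies share a MultiCartpole env (two DQN graphs under the replay optimizer, a PG/DQN mix otherwise), and collect_metrics is polled every iteration for policy_reward_mean and episode_reward_mean until the reward target is reached.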
def _testWithOptimizer(self, optimizer_cls):
    n = 3
    env = gym.make("CartPole-v0")
    act_space = env.action_space
    obs_space = env.observation_space
    dqn_config = {"gamma": 0.95, "n_step": 3}
    if optimizer_cls == SyncReplayOptimizer:
        # TODO: support replay with non-DQN graphs. Currently this can't
        # happen since the replay buffer doesn't encode extra fields like
        # "advantages" that PG uses.
        policies = {
            "p1": (DQNPolicyGraph, obs_space, act_space, dqn_config),
            "p2": (DQNPolicyGraph, obs_space, act_space, dqn_config),
        }
    else:
        policies = {
            "p1": (PGPolicyGraph, obs_space, act_space, {}),
            "p2": (DQNPolicyGraph, obs_space, act_space, dqn_config),
        }
    ev = CommonPolicyEvaluator(
        env_creator=lambda _: MultiCartpole(n),
        policy_graph=policies,
        policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2],
        batch_steps=50)
    if optimizer_cls == AsyncGradientsOptimizer:
        remote_evs = [CommonPolicyEvaluator.as_remote().remote(
            env_creator=lambda _: MultiCartpole(n),
            policy_graph=policies,
            policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2],
            batch_steps=50)]
    else:
        remote_evs = []
    optimizer = optimizer_cls({}, ev, remote_evs)
    for i in range(200):
        # Anneal exploration on the DQN policies only.
        ev.foreach_policy(
            lambda p, _: p.set_epsilon(max(0.02, 1 - i * .02))
            if isinstance(p, DQNPolicyGraph) else None)
        optimizer.step()
        result = collect_metrics(ev, remote_evs)
        if i % 20 == 0:
            ev.foreach_policy(
                lambda p, _: p.update_target()
                if isinstance(p, DQNPolicyGraph) else None)
            print("Iter {}, rew {}".format(i, result.policy_reward_mean))
            print("Total reward", result.episode_reward_mean)
        if result.episode_reward_mean >= 25 * n:
            return
    print(result)
    raise Exception("failed to improve reward")
Example #6
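A DQN-style training step: run optimizer steps until the iteration's timestep budget is spent, push updated exploration values to every evaluator, then fold the exploration range and optimizer stats into the collected result's info.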
def _train(self):
    start_timestep = self.global_timestep

    while (self.global_timestep - start_timestep <
           self.config["timesteps_per_iteration"]):
        self.optimizer.step()
        self.update_target_if_needed()

    exp_vals = [self.exploration0.value(self.global_timestep)]
    self.local_evaluator.for_policy(lambda p: p.set_epsilon(exp_vals[0]))
    for i, e in enumerate(self.remote_evaluators):
        exp_val = self.explorations[i].value(self.global_timestep)
        e.for_policy.remote(lambda p: p.set_epsilon(exp_val))
        exp_vals.append(exp_val)

    result = collect_metrics(self.local_evaluator, self.remote_evaluators)
    return result._replace(info=dict(
        {
            "min_exploration": min(exp_vals),
            "max_exploration": max(exp_vals),
            "num_target_updates": self.num_target_updates,
        }, **self.optimizer.stats()))
Example #7
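A PPO-style training step: a postprocessing hook standardizes advantages before optimization, the loss fetches are reduced to scalars, the KL coefficient is updated, filters are synchronized, and the loss statistics are attached to the collected metrics.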
def _train(self):
    def postprocess_samples(batch):
        # Divide by the maximum of value.std() and 1e-4
        # to guard against the case where all values are equal
        value = batch["advantages"]
        standardized = (value - value.mean()) / max(1e-4, value.std())
        batch.data["advantages"] = standardized
        batch.shuffle()
        dummy = np.zeros_like(batch["advantages"])
        if not self.config["use_gae"]:
            batch.data["value_targets"] = dummy
            batch.data["vf_preds"] = dummy

    extra_fetches = self.optimizer.step(postprocess_fn=postprocess_samples)
    # Mean over the second axis, keeping only the last SGD iteration's value.
    kl = np.array(extra_fetches["kl"]).mean(axis=1)[-1]
    total_loss = np.array(extra_fetches["total_loss"]).mean(axis=1)[-1]
    policy_loss = np.array(extra_fetches["policy_loss"]).mean(axis=1)[-1]
    vf_loss = np.array(extra_fetches["vf_loss"]).mean(axis=1)[-1]
    entropy = np.array(extra_fetches["entropy"]).mean(axis=1)[-1]

    newkl = self.local_evaluator.for_policy(lambda pi: pi.update_kl(kl))

    info = {
        "kl_divergence": kl,
        "kl_coefficient": newkl,
        "total_loss": total_loss,
        "policy_loss": policy_loss,
        "vf_loss": vf_loss,
        "entropy": entropy,
    }

    FilterManager.synchronize(self.local_evaluator.filters,
                              self.remote_evaluators)
    res = collect_metrics(self.local_evaluator, self.remote_evaluators)
    res = res._replace(info=info)
    return res
Example #8
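Stress-tests multi-policy support by mapping 20 agents at random onto 20 independent PG policies; note that collect_metrics is called here with only the local evaluator (the remote-evaluator argument presumably defaults to an empty list).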
def testTrainMultiCartpoleManyPolicies(self):
    n = 20
    env = gym.make("CartPole-v0")
    act_space = env.action_space
    obs_space = env.observation_space
    policies = {}
    for i in range(20):
        policies["pg_{}".format(i)] = (
            PGPolicyGraph, obs_space, act_space, {})
    policy_ids = list(policies.keys())
    ev = CommonPolicyEvaluator(
        env_creator=lambda _: MultiCartpole(n),
        policy_graph=policies,
        policy_mapping_fn=lambda agent_id: random.choice(policy_ids),
        batch_steps=100)
    optimizer = SyncSamplesOptimizer({}, ev, [])
    for i in range(100):
        optimizer.step()
        result = collect_metrics(ev)
        print("Iteration {}, rew {}".format(i, result.policy_reward_mean))
        print("Total reward", result.episode_reward_mean)
        if result.episode_reward_mean >= 25 * n:
            return
    raise Exception("failed to improve reward")
Example #9
File: pg.py Project: zerocurve/ray
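The minimal pattern, from PG's trainer: one optimizer step, then return whatever collect_metrics gathers from the optimizer's local and remote evaluators.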
def _train(self):
    self.optimizer.step()
    return collect_metrics(
        self.optimizer.local_evaluator, self.optimizer.remote_evaluators)
Example #10
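The same minimal pattern with one addition: observation filters are synchronized across evaluators before the metrics are collected.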
def _train(self):
    self.optimizer.step()
    FilterManager.synchronize(
        self.local_evaluator.filters, self.remote_evaluators)
    return collect_metrics(self.local_evaluator, self.remote_evaluators)