Example #1
 def test_reject_bad_configs(self):
     local, remotes = self._make_envs()
     workers = WorkerSet._from_existing(local, remotes)
     self.assertRaises(
         ValueError, lambda: AsyncSamplesOptimizer(
             local, remotes,
             num_data_loader_buffers=2, minibatch_buffer_size=4))
     optimizer = AsyncSamplesOptimizer(
         workers,
         num_gpus=1,
         train_batch_size=100,
         rollout_fragment_length=50,
         _fake_gpus=True)
     self._wait_for(optimizer, 1000, 1000)
     optimizer = AsyncSamplesOptimizer(
         workers,
         num_gpus=1,
         train_batch_size=100,
         rollout_fragment_length=25,
         _fake_gpus=True)
     self._wait_for(optimizer, 1000, 1000)
     optimizer = AsyncSamplesOptimizer(
         workers,
         num_gpus=1,
         train_batch_size=100,
         rollout_fragment_length=74,
         _fake_gpus=True)
     self._wait_for(optimizer, 1000, 1000)
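Most of the optimizer tests on this page call helper methods of their test class that are not shown here, typically self._make_envs() (or self._make_evs() in older snippets) to build a local worker plus a few remote workers, and self._wait_for() to step the optimizer until it has sampled and trained on enough timesteps. The sketch below is only an assumption of what such helpers might look like; the RolloutWorker keyword names, import paths, and the 30-second budget vary across Ray versions and are not taken from the actual test file.

import time
import gym
from ray.rllib.agents.ppo import PPOTFPolicy
from ray.rllib.evaluation import RolloutWorker

# Hypothetical helpers, written as they would appear inside the test class.
def _make_envs(self, num_remote=2):
    def make_worker(remote=False):
        cls = RolloutWorker.as_remote().remote if remote else RolloutWorker
        return cls(
            env_creator=lambda _: gym.make("CartPole-v0"),
            policy=PPOTFPolicy,
            rollout_fragment_length=50)

    return make_worker(), [make_worker(remote=True) for _ in range(num_remote)]

def _wait_for(self, optimizer, num_steps_sampled, num_steps_trained):
    # Step until both counters exceed the targets, or fail after a timeout.
    deadline = time.time() + 30
    while time.time() < deadline:
        optimizer.step()
        if (optimizer.num_steps_sampled > num_steps_sampled
                and optimizer.num_steps_trained > num_steps_trained):
            return
    raise AssertionError("optimizer did not reach the requested step counts")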
Example #2
 def test_train_multi_cartpole_many_policies(self):
     n = 20
     env = gym.make("CartPole-v0")
     act_space = env.action_space
     obs_space = env.observation_space
     policies = {}
     for i in range(20):
         policies["pg_{}".format(i)] = (PGTFPolicy, obs_space, act_space,
                                        {})
     policy_ids = list(policies.keys())
     worker = RolloutWorker(
         env_creator=lambda _: MultiCartpole(n),
         policy=policies,
         policy_mapping_fn=lambda agent_id: random.choice(policy_ids),
         batch_steps=100)
     workers = WorkerSet._from_existing(worker, [])
     optimizer = SyncSamplesOptimizer(workers)
     for i in range(100):
         optimizer.step()
         result = collect_metrics(worker)
         print("Iteration {}, rew {}".format(i,
                                             result["policy_reward_mean"]))
         print("Total reward", result["episode_reward_mean"])
         if result["episode_reward_mean"] >= 25 * n:
             return
     raise Exception("failed to improve reward")
Example #3
 def test_train_external_multi_agent_cartpole_many_policies(self):
     n = 20
     single_env = gym.make("CartPole-v0")
     act_space = single_env.action_space
     obs_space = single_env.observation_space
     policies = {}
     for i in range(20):
         policies["pg_{}".format(i)] = (PGTFPolicy, obs_space, act_space,
                                        {})
     policy_ids = list(policies.keys())
     ev = RolloutWorker(
         env_creator=lambda _: MultiAgentCartPole({"num_agents": n}),
         policy=policies,
         policy_mapping_fn=lambda agent_id: random.choice(policy_ids),
         rollout_fragment_length=100)
     optimizer = SyncSamplesOptimizer(WorkerSet._from_existing(ev))
     for i in range(100):
         optimizer.step()
         result = collect_metrics(ev)
         print("Iteration {}, rew {}".format(i,
                                             result["policy_reward_mean"]))
         print("Total reward", result["episode_reward_mean"])
         if result["episode_reward_mean"] >= 25 * n:
             return
     raise Exception("failed to improve reward")
Example #4
 def testMultiTierAggregation(self):
     local, remotes = self._make_evs()
     workers = WorkerSet._from_existing(local, remotes)
     aggregators = TreeAggregator.precreate_aggregators(1)
     optimizer = AsyncSamplesOptimizer(workers, num_aggregation_workers=1)
     optimizer.aggregator.init(aggregators)
     self._wait_for(optimizer, 1000, 1000)
Example #5
    def _testWithOptimizer(self, optimizer_cls):
        n = 3
        env = gym.make("CartPole-v0")
        act_space = env.action_space
        obs_space = env.observation_space
        dqn_config = {"gamma": 0.95, "n_step": 3}
        if optimizer_cls == SyncReplayOptimizer:
            # TODO: support replay with non-DQN graphs. Currently this can't
            # happen since the replay buffer doesn't encode extra fields like
            # "advantages" that PG uses.
            policies = {
                "p1": (DQNTFPolicy, obs_space, act_space, dqn_config),
                "p2": (DQNTFPolicy, obs_space, act_space, dqn_config),
            }
        else:
            policies = {
                "p1": (PGTFPolicy, obs_space, act_space, {}),
                "p2": (DQNTFPolicy, obs_space, act_space, dqn_config),
            }
        worker = RolloutWorker(
            env_creator=lambda _: MultiCartpole(n),
            policy=policies,
            policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2],
            batch_steps=50)
        if optimizer_cls == AsyncGradientsOptimizer:

            def policy_mapper(agent_id):
                return ["p1", "p2"][agent_id % 2]

            remote_workers = [
                RolloutWorker.as_remote().remote(
                    env_creator=lambda _: MultiCartpole(n),
                    policy=policies,
                    policy_mapping_fn=policy_mapper,
                    batch_steps=50)
            ]
        else:
            remote_workers = []
        workers = WorkerSet._from_existing(worker, remote_workers)
        optimizer = optimizer_cls(workers)
        for i in range(200):
            worker.foreach_policy(
                lambda p, _: p.set_epsilon(max(0.02, 1 - i * .02))
                if isinstance(p, DQNTFPolicy) else None)
            optimizer.step()
            result = collect_metrics(worker, remote_workers)
            if i % 20 == 0:

                def do_update(p):
                    if isinstance(p, DQNTFPolicy):
                        p.update_target()

                worker.foreach_policy(lambda p, _: do_update(p))
                print("Iter {}, rew {}".format(i,
                                               result["policy_reward_mean"]))
                print("Total reward", result["episode_reward_mean"])
            if result["episode_reward_mean"] >= 25 * n:
                return
        print(result)
        raise Exception("failed to improve reward")
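_testWithOptimizer above is a shared helper rather than a test case by itself; the surrounding file would call it once per optimizer class. The entry points below are a plausible sketch only, with illustrative method names:

def test_multi_agent_sync_optimizer(self):
    self._testWithOptimizer(SyncSamplesOptimizer)

def test_multi_agent_sync_replay_optimizer(self):
    self._testWithOptimizer(SyncReplayOptimizer)

def test_multi_agent_async_gradients_optimizer(self):
    self._testWithOptimizer(AsyncGradientsOptimizer)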
Example #6
 def testRejectBadConfigs(self):
     local, remotes = self._make_evs()
     workers = WorkerSet._from_existing(local, remotes)
     self.assertRaises(
         ValueError,
         lambda: AsyncSamplesOptimizer(local,
                                       remotes,
                                       num_data_loader_buffers=2,
                                       minibatch_buffer_size=4))
     optimizer = AsyncSamplesOptimizer(workers,
                                       num_gpus=2,
                                       train_batch_size=100,
                                       sample_batch_size=50,
                                       _fake_gpus=True)
     self._wait_for(optimizer, 1000, 1000)
     optimizer = AsyncSamplesOptimizer(workers,
                                       num_gpus=2,
                                       train_batch_size=100,
                                       sample_batch_size=25,
                                       _fake_gpus=True)
     self._wait_for(optimizer, 1000, 1000)
     optimizer = AsyncSamplesOptimizer(workers,
                                       num_gpus=2,
                                       train_batch_size=100,
                                       sample_batch_size=74,
                                       _fake_gpus=True)
     self._wait_for(optimizer, 1000, 1000)
Example #7
 def testMultiTierAggregationBadConf(self):
     local, remotes = self._make_evs()
     workers = WorkerSet._from_existing(local, remotes)
     aggregators = TreeAggregator.precreate_aggregators(4)
     optimizer = AsyncSamplesOptimizer(workers, num_aggregation_workers=4)
     self.assertRaises(ValueError,
                       lambda: optimizer.aggregator.init(aggregators))
Example #8
 def testMultiGPUParallelLoad(self):
     local, remotes = self._make_evs()
     workers = WorkerSet._from_existing(local, remotes)
     optimizer = AsyncSamplesOptimizer(workers,
                                       num_gpus=2,
                                       num_data_loader_buffers=2,
                                       _fake_gpus=True)
     self._wait_for(optimizer, 1000, 1000)
Example #9
 def test_basic(self):
     local = _MockWorker()
     remotes = ray.remote(_MockWorker)
     remote_workers = [remotes.remote() for i in range(5)]
     workers = WorkerSet._from_existing(local, remote_workers)
     test_optimizer = AsyncGradientsOptimizer(workers, grads_per_step=10)
     test_optimizer.step()
     self.assertTrue(all(local.get_weights() == 0))
Example #10
 def testBasic(self):
     ray.init(num_cpus=4, object_store_memory=1000 * 1024 * 1024)
     local = _MockWorker()
     remotes = ray.remote(_MockWorker)
     remote_workers = [remotes.remote() for i in range(5)]
     workers = WorkerSet._from_existing(local, remote_workers)
     test_optimizer = AsyncGradientsOptimizer(workers, grads_per_step=10)
     test_optimizer.step()
     self.assertTrue(all(local.get_weights() == 0))
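The _MockWorker used in Examples #9 and #10 comes from RLlib's test utilities and is not reproduced on this page. For the final all(local.get_weights() == 0) assertion to hold, it only needs the small worker interface that AsyncGradientsOptimizer drives (sample, compute_gradients, apply_gradients, get/set weights) and weights that reach zero after grads_per_step updates. The stand-in below is purely illustrative and omits the extra plumbing (e.g. observation filters) the real helper carries; the SampleBatch import path is an assumption for roughly Ray 0.8-era code.

import numpy as np
from ray.rllib.policy.sample_batch import SampleBatch

class _MockWorker:
    # Illustrative: weights start at -10 and each applied gradient adds 1,
    # so ten gradient steps (grads_per_step=10) drive every weight to zero.
    def __init__(self):
        self._weights = np.array([-10.0, -10.0, -10.0, -10.0])
        self._grad = np.array([1.0, 1.0, 1.0, 1.0])

    def sample(self):
        return SampleBatch({"obs": np.zeros(10), "rewards": np.zeros(10)})

    def compute_gradients(self, samples):
        return self._grad, {}

    def apply_gradients(self, grads):
        self._weights += grads

    def get_weights(self):
        return self._weights

    def set_weights(self, weights):
        self._weights = weights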
Example #11
 def testLearnerQueueTimeout(self):
     local, remotes = self._make_envs()
     workers = WorkerSet._from_existing(local, remotes)
     optimizer = AsyncSamplesOptimizer(workers,
                                       sample_batch_size=1000,
                                       train_batch_size=1000,
                                       learner_queue_timeout=1)
     self.assertRaises(AssertionError,
                       lambda: self._wait_for(optimizer, 1000, 1000))
Example #12
 def testMultiplePasses(self):
     local, remotes = self._make_evs()
     workers = WorkerSet._from_existing(local, remotes)
     optimizer = AsyncSamplesOptimizer(workers,
                                       minibatch_buffer_size=10,
                                       num_sgd_iter=10,
                                       sample_batch_size=10,
                                       train_batch_size=50)
     self._wait_for(optimizer, 1000, 10000)
     self.assertLess(optimizer.stats()["num_steps_sampled"], 5000)
     self.assertGreater(optimizer.stats()["num_steps_trained"], 8000)
Example #13
def make_workers(n):
    local = RolloutWorker(env_creator=lambda _: gym.make("CartPole-v0"),
                          policy_spec=PPOTFPolicy,
                          rollout_fragment_length=100)
    remotes = [
        RolloutWorker.as_remote().remote(
            env_creator=lambda _: gym.make("CartPole-v0"),
            policy_spec=PPOTFPolicy,
            rollout_fragment_length=100) for _ in range(n)
    ]
    workers = WorkerSet._from_existing(local, remotes)
    return workers
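Unlike the other examples, make_workers builds brand-new CartPole workers and only then wraps them with WorkerSet._from_existing. The usage sketch below is an assumption; the optimizer choice, batch size, and loop length are illustrative rather than taken from the original snippet:

import ray

ray.init()

# One local worker plus two remote rollout workers.
workers = make_workers(2)
optimizer = SyncSamplesOptimizer(workers, train_batch_size=200)
for _ in range(10):
    optimizer.step()
print(optimizer.stats())
workers.stop()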
Example #14
 def testReplay(self):
     local, remotes = self._make_evs()
     workers = WorkerSet._from_existing(local, remotes)
     optimizer = AsyncSamplesOptimizer(
         workers,
         replay_buffer_num_slots=100,
         replay_proportion=10,
         sample_batch_size=10,
         train_batch_size=10,
     )
     self._wait_for(optimizer, 1000, 1000)
     stats = optimizer.stats()
     self.assertLess(stats["num_steps_sampled"], 5000)
     replay_ratio = stats["num_steps_replayed"] / stats["num_steps_sampled"]
     self.assertGreater(replay_ratio, 0.7)
     self.assertLess(stats["num_steps_trained"], stats["num_steps_sampled"])
Example #15
    def test_replay_and_multiple_passes(self):
        local, remotes = self._make_envs()
        workers = WorkerSet._from_existing(local, remotes)
        optimizer = AsyncSamplesOptimizer(workers,
                                          minibatch_buffer_size=10,
                                          num_sgd_iter=10,
                                          replay_buffer_num_slots=100,
                                          replay_proportion=10,
                                          rollout_fragment_length=10,
                                          train_batch_size=10)
        self._wait_for(optimizer, 1000, 1000)

        stats = optimizer.stats()
        print(stats)
        self.assertLess(stats["num_steps_sampled"], 5000)
        replay_ratio = stats["num_steps_replayed"] / stats["num_steps_sampled"]
        self.assertGreater(replay_ratio, 0.7)
Example #16
 def testSimple(self):
     local, remotes = self._make_evs()
     workers = WorkerSet._from_existing(local, remotes)
     optimizer = AsyncSamplesOptimizer(workers)
     self._wait_for(optimizer, 1000, 1000)
Example #17
 def testMultiGPU(self):
     local, remotes = self._make_evs()
     workers = WorkerSet._from_existing(local, remotes)
     optimizer = AsyncSamplesOptimizer(workers, num_gpus=1, _fake_gpus=True)
     self._wait_for(optimizer, 1000, 1000)