def test_reject_bad_configs(self):
    local, remotes = self._make_envs()
    workers = WorkerSet._from_existing(local, remotes)
    # Fewer data loader buffers than the minibatch buffer size is an
    # invalid combination and must be rejected up front.
    self.assertRaises(
        ValueError, lambda: AsyncSamplesOptimizer(
            workers,
            num_data_loader_buffers=2,
            minibatch_buffer_size=4))
    # Fragment lengths that divide the train batch size evenly (50, 25)
    # and one that does not (74) should all still train fine.
    optimizer = AsyncSamplesOptimizer(
        workers,
        num_gpus=1,
        train_batch_size=100,
        rollout_fragment_length=50,
        _fake_gpus=True)
    self._wait_for(optimizer, 1000, 1000)
    optimizer = AsyncSamplesOptimizer(
        workers,
        num_gpus=1,
        train_batch_size=100,
        rollout_fragment_length=25,
        _fake_gpus=True)
    self._wait_for(optimizer, 1000, 1000)
    optimizer = AsyncSamplesOptimizer(
        workers,
        num_gpus=1,
        train_batch_size=100,
        rollout_fragment_length=74,
        _fake_gpus=True)
    self._wait_for(optimizer, 1000, 1000)
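# NOTE: `self._make_envs()` / `self._make_evs()` and `self._wait_for()` are
# test-class helpers defined outside this excerpt. Below is a minimal sketch
# of `_wait_for`, assuming the optimizer exposes `num_steps_sampled` and
# `num_steps_trained` counters (an assumption based on the stats keys
# asserted elsewhere in these tests):

def _wait_for(self, optimizer, num_steps_sampled, num_steps_trained):
    import time  # local import keeps this sketch self-contained

    # Keep stepping until both counters pass their thresholds, or fail
    # after a 30s timeout.
    start = time.time()
    while time.time() - start < 30:
        optimizer.step()
        if (optimizer.num_steps_sampled > num_steps_sampled
                and optimizer.num_steps_trained > num_steps_trained):
            print("OK", optimizer.stats())
            return
    raise AssertionError("_wait_for timed out after 30s")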
def test_train_multi_cartpole_many_policies(self):
    n = 20
    env = gym.make("CartPole-v0")
    act_space = env.action_space
    obs_space = env.observation_space
    policies = {}
    for i in range(n):
        policies["pg_{}".format(i)] = (PGTFPolicy, obs_space, act_space, {})
    policy_ids = list(policies.keys())
    worker = RolloutWorker(
        env_creator=lambda _: MultiCartpole(n),
        policy=policies,
        policy_mapping_fn=lambda agent_id: random.choice(policy_ids),
        batch_steps=100)
    workers = WorkerSet._from_existing(worker, [])
    optimizer = SyncSamplesOptimizer(workers)
    for i in range(100):
        optimizer.step()
        result = collect_metrics(worker)
        print("Iteration {}, rew {}".format(i, result["policy_reward_mean"]))
        print("Total reward", result["episode_reward_mean"])
        if result["episode_reward_mean"] >= 25 * n:
            return
    raise Exception("failed to improve reward")
def test_train_external_multi_agent_cartpole_many_policies(self):
    n = 20
    single_env = gym.make("CartPole-v0")
    act_space = single_env.action_space
    obs_space = single_env.observation_space
    policies = {}
    for i in range(n):
        policies["pg_{}".format(i)] = (PGTFPolicy, obs_space, act_space, {})
    policy_ids = list(policies.keys())
    ev = RolloutWorker(
        env_creator=lambda _: MultiAgentCartPole({"num_agents": n}),
        policy=policies,
        policy_mapping_fn=lambda agent_id: random.choice(policy_ids),
        rollout_fragment_length=100)
    optimizer = SyncSamplesOptimizer(WorkerSet._from_existing(ev))
    for i in range(100):
        optimizer.step()
        result = collect_metrics(ev)
        print("Iteration {}, rew {}".format(i, result["policy_reward_mean"]))
        print("Total reward", result["episode_reward_mean"])
        if result["episode_reward_mean"] >= 25 * n:
            return
    raise Exception("failed to improve reward")
def testMultiTierAggregation(self):
    local, remotes = self._make_evs()
    workers = WorkerSet._from_existing(local, remotes)
    aggregators = TreeAggregator.precreate_aggregators(1)
    optimizer = AsyncSamplesOptimizer(workers, num_aggregation_workers=1)
    optimizer.aggregator.init(aggregators)
    self._wait_for(optimizer, 1000, 1000)
def _testWithOptimizer(self, optimizer_cls):
    n = 3
    env = gym.make("CartPole-v0")
    act_space = env.action_space
    obs_space = env.observation_space
    dqn_config = {"gamma": 0.95, "n_step": 3}
    if optimizer_cls == SyncReplayOptimizer:
        # TODO: support replay with non-DQN graphs. Currently this can't
        # happen since the replay buffer doesn't encode extra fields like
        # "advantages" that PG uses.
        policies = {
            "p1": (DQNTFPolicy, obs_space, act_space, dqn_config),
            "p2": (DQNTFPolicy, obs_space, act_space, dqn_config),
        }
    else:
        policies = {
            "p1": (PGTFPolicy, obs_space, act_space, {}),
            "p2": (DQNTFPolicy, obs_space, act_space, dqn_config),
        }
    worker = RolloutWorker(
        env_creator=lambda _: MultiCartpole(n),
        policy=policies,
        policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2],
        batch_steps=50)
    if optimizer_cls == AsyncGradientsOptimizer:

        def policy_mapper(agent_id):
            return ["p1", "p2"][agent_id % 2]

        remote_workers = [
            RolloutWorker.as_remote().remote(
                env_creator=lambda _: MultiCartpole(n),
                policy=policies,
                policy_mapping_fn=policy_mapper,
                batch_steps=50)
        ]
    else:
        remote_workers = []
    workers = WorkerSet._from_existing(worker, remote_workers)
    optimizer = optimizer_cls(workers)
    for i in range(200):
        worker.foreach_policy(
            lambda p, _: p.set_epsilon(max(0.02, 1 - i * .02))
            if isinstance(p, DQNTFPolicy) else None)
        optimizer.step()
        result = collect_metrics(worker, remote_workers)
        if i % 20 == 0:

            def do_update(p):
                if isinstance(p, DQNTFPolicy):
                    p.update_target()

            worker.foreach_policy(lambda p, _: do_update(p))
            print("Iter {}, rew {}".format(i, result["policy_reward_mean"]))
            print("Total reward", result["episode_reward_mean"])
        if result["episode_reward_mean"] >= 25 * n:
            return
    print(result)
    raise Exception("failed to improve reward")
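# Hypothetical driver methods showing how `_testWithOptimizer` above would
# be invoked for each optimizer class it special-cases (the method names
# here are illustrative, not from the original suite):

def test_multi_agent_sync_optimizer(self):
    self._testWithOptimizer(SyncSamplesOptimizer)

def test_multi_agent_async_gradients_optimizer(self):
    self._testWithOptimizer(AsyncGradientsOptimizer)

def test_multi_agent_replay_optimizer(self):
    self._testWithOptimizer(SyncReplayOptimizer)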
def testRejectBadConfigs(self):
    local, remotes = self._make_evs()
    workers = WorkerSet._from_existing(local, remotes)
    # Fewer data loader buffers than the minibatch buffer size must be
    # rejected.
    self.assertRaises(
        ValueError, lambda: AsyncSamplesOptimizer(
            workers,
            num_data_loader_buffers=2,
            minibatch_buffer_size=4))
    optimizer = AsyncSamplesOptimizer(
        workers,
        num_gpus=2,
        train_batch_size=100,
        sample_batch_size=50,
        _fake_gpus=True)
    self._wait_for(optimizer, 1000, 1000)
    optimizer = AsyncSamplesOptimizer(
        workers,
        num_gpus=2,
        train_batch_size=100,
        sample_batch_size=25,
        _fake_gpus=True)
    self._wait_for(optimizer, 1000, 1000)
    optimizer = AsyncSamplesOptimizer(
        workers,
        num_gpus=2,
        train_batch_size=100,
        sample_batch_size=74,
        _fake_gpus=True)
    self._wait_for(optimizer, 1000, 1000)
def testMultiTierAggregationBadConf(self):
    local, remotes = self._make_evs()
    workers = WorkerSet._from_existing(local, remotes)
    aggregators = TreeAggregator.precreate_aggregators(4)
    optimizer = AsyncSamplesOptimizer(workers, num_aggregation_workers=4)
    self.assertRaises(ValueError,
                      lambda: optimizer.aggregator.init(aggregators))
def testMultiGPUParallelLoad(self):
    local, remotes = self._make_evs()
    workers = WorkerSet._from_existing(local, remotes)
    optimizer = AsyncSamplesOptimizer(
        workers, num_gpus=2, num_data_loader_buffers=2, _fake_gpus=True)
    self._wait_for(optimizer, 1000, 1000)
def test_basic(self):
    local = _MockWorker()
    remotes = ray.remote(_MockWorker)
    remote_workers = [remotes.remote() for _ in range(5)]
    workers = WorkerSet._from_existing(local, remote_workers)
    test_optimizer = AsyncGradientsOptimizer(workers, grads_per_step=10)
    test_optimizer.step()
    self.assertTrue(all(local.get_weights() == 0))
def testBasic(self):
    ray.init(num_cpus=4, object_store_memory=1000 * 1024 * 1024)
    local = _MockWorker()
    remotes = ray.remote(_MockWorker)
    remote_workers = [remotes.remote() for _ in range(5)]
    workers = WorkerSet._from_existing(local, remote_workers)
    test_optimizer = AsyncGradientsOptimizer(workers, grads_per_step=10)
    test_optimizer.step()
    self.assertTrue(all(local.get_weights() == 0))
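# `_MockWorker` is defined outside this excerpt. Below is a minimal sketch
# of the surface the two basic tests above appear to exercise; the field
# names and the zero-gradient behavior (which keeps the weights at 0,
# matching the assertions) are assumptions:

import numpy as np

class _MockWorker:
    def __init__(self):
        self._weights = np.zeros(5)

    def sample(self):
        # Stand-in sample batch; the optimizer only needs something to
        # forward to compute_gradients().
        return {}

    def compute_gradients(self, samples):
        # Zero gradients plus an empty info dict.
        return np.zeros(5), {}

    def apply_gradients(self, grads):
        self._weights += grads

    def get_weights(self):
        return self._weights

    def set_weights(self, weights):
        self._weights = weights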
def testLearnerQueueTimeout(self):
    local, remotes = self._make_envs()
    workers = WorkerSet._from_existing(local, remotes)
    # With batches this large relative to the sampling rate, the learner
    # queue should hit its 1s timeout, which surfaces as an AssertionError.
    optimizer = AsyncSamplesOptimizer(
        workers,
        sample_batch_size=1000,
        train_batch_size=1000,
        learner_queue_timeout=1)
    self.assertRaises(AssertionError,
                      lambda: self._wait_for(optimizer, 1000, 1000))
def testMultiplePasses(self):
    local, remotes = self._make_evs()
    workers = WorkerSet._from_existing(local, remotes)
    optimizer = AsyncSamplesOptimizer(
        workers,
        minibatch_buffer_size=10,
        num_sgd_iter=10,
        sample_batch_size=10,
        train_batch_size=50)
    self._wait_for(optimizer, 1000, 10000)
    # With 10 SGD iterations per batch, the trained step count should far
    # exceed the sampled step count.
    self.assertLess(optimizer.stats()["num_steps_sampled"], 5000)
    self.assertGreater(optimizer.stats()["num_steps_trained"], 8000)
def make_workers(n):
    local = RolloutWorker(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_spec=PPOTFPolicy,
        rollout_fragment_length=100)
    remotes = [
        RolloutWorker.as_remote().remote(
            env_creator=lambda _: gym.make("CartPole-v0"),
            policy_spec=PPOTFPolicy,
            rollout_fragment_length=100) for _ in range(n)
    ]
    workers = WorkerSet._from_existing(local, remotes)
    return workers
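# Illustrative use of `make_workers` above (this driver function is an
# assumption for demonstration, not part of the original suite):

def example_sync_training():
    workers = make_workers(2)
    optimizer = SyncSamplesOptimizer(workers)
    for _ in range(5):
        optimizer.step()
    print(optimizer.stats())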
def testReplay(self):
    local, remotes = self._make_evs()
    workers = WorkerSet._from_existing(local, remotes)
    optimizer = AsyncSamplesOptimizer(
        workers,
        replay_buffer_num_slots=100,
        replay_proportion=10,
        sample_batch_size=10,
        train_batch_size=10,
    )
    self._wait_for(optimizer, 1000, 1000)
    stats = optimizer.stats()
    self.assertLess(stats["num_steps_sampled"], 5000)
    # With replay_proportion=10, the bulk of trained steps should have
    # been drawn from the replay buffer.
    replay_ratio = stats["num_steps_replayed"] / stats["num_steps_sampled"]
    self.assertGreater(replay_ratio, 0.7)
    self.assertLess(stats["num_steps_trained"], stats["num_steps_sampled"])
def test_replay_and_multiple_passes(self):
    local, remotes = self._make_envs()
    workers = WorkerSet._from_existing(local, remotes)
    optimizer = AsyncSamplesOptimizer(
        workers,
        minibatch_buffer_size=10,
        num_sgd_iter=10,
        replay_buffer_num_slots=100,
        replay_proportion=10,
        rollout_fragment_length=10,
        train_batch_size=10)
    self._wait_for(optimizer, 1000, 1000)
    stats = optimizer.stats()
    print(stats)
    self.assertLess(stats["num_steps_sampled"], 5000)
    replay_ratio = stats["num_steps_replayed"] / stats["num_steps_sampled"]
    self.assertGreater(replay_ratio, 0.7)
def testSimple(self):
    local, remotes = self._make_evs()
    workers = WorkerSet._from_existing(local, remotes)
    optimizer = AsyncSamplesOptimizer(workers)
    self._wait_for(optimizer, 1000, 1000)
def testMultiGPU(self):
    local, remotes = self._make_evs()
    workers = WorkerSet._from_existing(local, remotes)
    optimizer = AsyncSamplesOptimizer(workers, num_gpus=1, _fake_gpus=True)
    self._wait_for(optimizer, 1000, 1000)
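# Conventional RLlib test-file entry point (assumed here; the excerpt does
# not include one):

if __name__ == "__main__":
    import sys

    import pytest

    sys.exit(pytest.main(["-v", __file__]))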