Example #1
 def testBaselinePerformance(self):
     for _ in range(20):
         ev = RolloutWorker(
             env_creator=lambda _: gym.make("CartPole-v0"),
             policy=MockPolicy,
             batch_steps=100)
         start = time.time()
         count = 0
         while time.time() - start < 1:
             count += ev.sample().count
         print()
         print("Samples per second {}".format(
             count / (time.time() - start)))
         print()
Example #2
 def test_external_multi_agent_env_sample(self):
     agents = 2
     act_space = gym.spaces.Discrete(2)
     obs_space = gym.spaces.Discrete(2)
     ev = RolloutWorker(
         env_creator=lambda _: SimpleMultiServing(BasicMultiAgent(agents)),
         policy_spec={
             "p0": (MockPolicy, obs_space, act_space, {}),
             "p1": (MockPolicy, obs_space, act_space, {}),
         },
         policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2),
         rollout_fragment_length=50)
     batch = ev.sample()
     self.assertEqual(batch.count, 50)
Example #3
    def _make_envs(self):
        def make_sess():
            return tf.Session(config=tf.ConfigProto(device_count={"CPU": 2}))

        local = RolloutWorker(env_creator=lambda _: gym.make("CartPole-v0"),
                              policy=PPOTFPolicy,
                              tf_session_creator=make_sess)
        remotes = [
            RolloutWorker.as_remote().remote(
                env_creator=lambda _: gym.make("CartPole-v0"),
                policy=PPOTFPolicy,
                tf_session_creator=make_sess)
        ]
        return local, remotes
Example #4
 def test_get_filters(self):
     ev = RolloutWorker(
         env_creator=lambda _: gym.make("CartPole-v0"),
         policy=MockPolicy,
         sample_async=True,
         observation_filter="ConcurrentMeanStdFilter")
     self.sample_and_flush(ev)
     filters = ev.get_filters(flush_after=False)
     time.sleep(2)
     filters2 = ev.get_filters(flush_after=False)
     obs_f = filters[DEFAULT_POLICY_ID]
     obs_f2 = filters2[DEFAULT_POLICY_ID]
     self.assertGreaterEqual(obs_f2.rs.n, obs_f.rs.n)
     self.assertGreaterEqual(obs_f2.buffer.n, obs_f.buffer.n)
Example #5
 def test_batches_larger_when_vectorized(self):
     ev = RolloutWorker(
         env_creator=lambda _: MockEnv(episode_length=8),
         policy=MockPolicy,
         batch_mode="truncate_episodes",
         rollout_fragment_length=4,
         num_envs=4)
     batch = ev.sample()
     self.assertEqual(batch.count, 16)
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 0)
     batch = ev.sample()
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 4)
Example #6
    def test_extra_python_envs(self):
        extra_envs = {"env_key_1": "env_value_1", "env_key_2": "env_value_2"}
        self.assertFalse("env_key_1" in os.environ)
        self.assertFalse("env_key_2" in os.environ)
        ev = RolloutWorker(env_creator=lambda _: MockEnv(10),
                           policy=MockPolicy,
                           extra_python_environs=extra_envs)
        self.assertTrue("env_key_1" in os.environ)
        self.assertTrue("env_key_2" in os.environ)
        ev.stop()

        # reset to original
        del os.environ["env_key_1"]
        del os.environ["env_key_2"]
Example #7
 def test_soft_horizon(self):
     ev = RolloutWorker(
         env_creator=lambda _: MockEnv(episode_length=10),
         policy_spec=MockPolicy,
         batch_mode="complete_episodes",
         rollout_fragment_length=10,
         episode_horizon=4,
         soft_horizon=True)
     samples = ev.sample()
     # three logical episodes
     self.assertEqual(len(set(samples["eps_id"])), 3)
     # only 1 hard done value
     self.assertEqual(sum(samples["dones"]), 1)
     ev.stop()
Example #8
 def test_multi_agent_sample_with_horizon(self):
     act_space = gym.spaces.Discrete(2)
     obs_space = gym.spaces.Discrete(2)
     ev = RolloutWorker(
         env_creator=lambda _: BasicMultiAgent(5),
         policy={
             "p0": (MockPolicy, obs_space, act_space, {}),
             "p1": (MockPolicy, obs_space, act_space, {}),
         },
         policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2),
         episode_horizon=10,  # test with episode horizon set
         rollout_fragment_length=50)
     batch = ev.sample()
     self.assertEqual(batch.count, 50)
Example #9
 def test_complete_episodes_packing(self):
     ev = RolloutWorker(
         env_creator=lambda _: MockEnv(10),
         policy_spec=MockPolicy,
         rollout_fragment_length=15,
         batch_mode="complete_episodes",
     )
     batch = ev.sample()
     self.assertEqual(batch.count, 20)
     self.assertEqual(
         batch["t"].tolist(),
         [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
     )
     ev.stop()
Example #10
 def test_metrics(self):
     ev = RolloutWorker(
         env_creator=lambda _: MockEnv(episode_length=10),
         policy=MockPolicy,
         batch_mode="complete_episodes")
     remote_ev = RolloutWorker.as_remote().remote(
         env_creator=lambda _: MockEnv(episode_length=10),
         policy=MockPolicy,
         batch_mode="complete_episodes")
     ev.sample()
     ray.get(remote_ev.sample.remote())
     result = collect_metrics(ev, [remote_ev])
     self.assertEqual(result["episodes_this_iter"], 20)
     self.assertEqual(result["episode_reward_mean"], 10)
Example #11
    def test_returning_model_based_rollouts_data(self):
        class ModelBasedPolicy(DQNTFPolicy):
            def compute_actions(self,
                                obs_batch,
                                state_batches,
                                prev_action_batch=None,
                                prev_reward_batch=None,
                                episodes=None,
                                **kwargs):
                # In policy loss initialization phase, no episodes are passed
                # in.
                if episodes is not None:
                    # Pretend we did a model-based rollout and want to return
                    # the extra trajectory.
                    builder = episodes[0].new_batch_builder()
                    rollout_id = random.randint(0, 10000)
                    for t in range(5):
                        builder.add_values(
                            agent_id="extra_0",
                            policy_id="p1",  # use p1 so we can easily check it
                            t=t,
                            eps_id=rollout_id,  # new id for each rollout
                            obs=obs_batch[0],
                            actions=0,
                            rewards=0,
                            dones=t == 4,
                            infos={},
                            new_obs=obs_batch[0])
                    batch = builder.build_and_reset(episode=None)
                    episodes[0].add_extra_batch(batch)

                # Just return zeros for actions
                return [0] * len(obs_batch), [], {}

        single_env = gym.make("CartPole-v0")
        obs_space = single_env.observation_space
        act_space = single_env.action_space
        ev = RolloutWorker(
            env_creator=lambda _: MultiAgentCartPole({"num_agents": 2}),
            policy_spec={
                "p0": (ModelBasedPolicy, obs_space, act_space, {}),
                "p1": (ModelBasedPolicy, obs_space, act_space, {}),
            },
            policy_mapping_fn=lambda agent_id: "p0",
            rollout_fragment_length=5)
        batch = ev.sample()
        self.assertEqual(batch.count, 5)
        self.assertEqual(batch.policy_batches["p0"].count, 10)
        self.assertEqual(batch.policy_batches["p1"].count, 25)
Example #12
 def testMultiAgentSampleAsyncRemote(self):
     act_space = gym.spaces.Discrete(2)
     obs_space = gym.spaces.Discrete(2)
     ev = RolloutWorker(
         env_creator=lambda _: BasicMultiAgent(5),
         policy={
             "p0": (MockPolicy, obs_space, act_space, {}),
             "p1": (MockPolicy, obs_space, act_space, {}),
         },
         policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2),
         batch_steps=50,
         num_envs=4,
         remote_worker_envs=True)
     batch = ev.sample()
     self.assertEqual(batch.count, 200)
Example #13
 def test_baseline_performance(self):
     for _ in range(20):
         ev = RolloutWorker(
             env_creator=lambda _: gym.make("CartPole-v0"),
             policy_spec=MockPolicy,
             rollout_fragment_length=100,
         )
         start = time.time()
         count = 0
         while time.time() - start < 1:
             count += ev.sample().count
         print()
         print("Samples per second {}".format(count /
                                              (time.time() - start)))
         print()
Example #14
def make_workers(n):
    local = RolloutWorker(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_spec=PPOTFPolicy,
        rollout_fragment_length=100,
    )
    remotes = [
        RolloutWorker.as_remote().remote(
            env_creator=lambda _: gym.make("CartPole-v0"),
            policy_spec=PPOTFPolicy,
            rollout_fragment_length=100,
        ) for _ in range(n)
    ]
    workers = WorkerSet._from_existing(local, remotes)
    return workers
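A minimal usage sketch for the helper above (hypothetical, not part of the original example). It assumes Ray plus the names already imported for these snippets (gym, PPOTFPolicy, RolloutWorker, WorkerSet) are in scope, and only uses WorkerSet/RolloutWorker calls that appear elsewhere on this page.

    import ray

    ray.init(ignore_reinit_error=True)
    workers = make_workers(2)  # build one local + two remote workers (see above)

    # Sample once from the local worker...
    local_batch = workers.local_worker().sample()
    # ...and once from each remote worker (Ray actor calls).
    remote_batches = ray.get(
        [w.sample.remote() for w in workers.remote_workers()])
    print(local_batch.count, [b.count for b in remote_batches])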
Example #15
 def testSampleFromEarlyDoneEnv(self):
     act_space = gym.spaces.Discrete(2)
     obs_space = gym.spaces.Discrete(2)
     ev = RolloutWorker(
         env_creator=lambda _: EarlyDoneMultiAgent(),
         policy={
             "p0": (MockPolicy, obs_space, act_space, {}),
             "p1": (MockPolicy, obs_space, act_space, {}),
         },
         policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2),
         batch_mode="complete_episodes",
         batch_steps=1)
     self.assertRaisesRegexp(ValueError,
                             ".*don't have a last observation.*",
                             lambda: ev.sample())
Example #16
 def test_metrics(self):
     # Allow the test to be run via unittest.
     ray.init(num_cpus=5, ignore_reinit_error=True)
     ev = RolloutWorker(env_creator=lambda _: MockEnv(episode_length=10),
                        policy=MockPolicy,
                        batch_mode="complete_episodes")
     remote_ev = RolloutWorker.as_remote().remote(
         env_creator=lambda _: MockEnv(episode_length=10),
         policy=MockPolicy,
         batch_mode="complete_episodes")
     ev.sample()
     ray.get(remote_ev.sample.remote())
     result = collect_metrics(ev, [remote_ev])
     self.assertEqual(result["episodes_this_iter"], 20)
     self.assertEqual(result["episode_reward_mean"], 10)
Example #17
 def test_batch_ids(self):
     fragment_len = 100
     ev = RolloutWorker(env_creator=lambda _: gym.make("CartPole-v0"),
                        policy_spec=MockPolicy,
                        rollout_fragment_length=fragment_len)
     batch1 = ev.sample()
     batch2 = ev.sample()
     unroll_ids_1 = set(batch1["unroll_id"])
     unroll_ids_2 = set(batch2["unroll_id"])
     # Assert no overlap of unroll IDs between sample() calls.
     self.assertTrue(not any(uid in unroll_ids_2 for uid in unroll_ids_1))
     # CartPole episodes should be short initially: Expect more than one
     # unroll ID in each batch.
     self.assertTrue(len(unroll_ids_1) > 1)
     self.assertTrue(len(unroll_ids_2) > 1)
     ev.stop()
Example #18
    def add_workers(self, num_workers: int) -> None:
        """Creates and adds a number of remote workers to this worker set.

        Can be called several times on the same WorkerSet to add more
        RolloutWorkers to the set.

        Args:
            num_workers: The number of remote Workers to add to this
                WorkerSet.
        """
        remote_args = {
            "num_cpus": self._remote_config["num_cpus_per_worker"],
            "num_gpus": self._remote_config["num_gpus_per_worker"],
            "resources": self._remote_config["custom_resources_per_worker"],
        }
        cls = RolloutWorker.as_remote(**remote_args).remote
        self._remote_workers.extend(
            [
                self._make_worker(
                    cls=cls,
                    env_creator=self._env_creator,
                    validate_env=None,
                    policy_cls=self._policy_class,
                    worker_index=i + 1,
                    num_workers=num_workers,
                    config=self._remote_config,
                )
                for i in range(num_workers)
            ]
        )
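A short usage sketch for add_workers (an assumption, not from the original source): given an already constructed WorkerSet, e.g. the one held by a Trainer, the call below should simply extend the list returned by remote_workers().

    # Hypothetical: `workers` is an existing WorkerSet (e.g. trainer.workers).
    before = len(workers.remote_workers())
    workers.add_workers(num_workers=2)
    assert len(workers.remote_workers()) == before + 2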
Example #19
    def _sample_and_train_torch_distributed(worker: RolloutWorker):
        # This function is applied remotely on each rollout worker.
        config = worker.policy_config

        # Generate a sample.
        start = time.perf_counter()
        batch = worker.sample()
        sample_time = time.perf_counter() - start
        expected_batch_size = (config["rollout_fragment_length"] *
                               config["num_envs_per_worker"])
        assert batch.count == expected_batch_size, (
            "Batch size possibly out of sync between workers, expected:",
            expected_batch_size,
            "got:",
            batch.count,
        )

        # Perform n minibatch SGD update(s) on the worker itself.
        start = time.perf_counter()
        info = do_minibatch_sgd(
            batch,
            worker.policy_map,
            worker,
            config["num_sgd_iter"],
            config["sgd_minibatch_size"],
            [Postprocessing.ADVANTAGES],
        )
        learn_on_batch_time = time.perf_counter() - start
        return {
            "info": info,
            "env_steps": batch.env_steps(),
            "agent_steps": batch.agent_steps(),
            "sample_time": sample_time,
            "learn_on_batch_time": learn_on_batch_time,
        }
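For context, a rough fan-out sketch (assumed, not shown in the original snippet): a per-worker function like the one above is typically applied across a WorkerSet and the per-worker stats gathered afterwards. It assumes `workers` is a WorkerSet whose policies are already set up for distributed torch training.

    # foreach_worker() applies the given function to the local and all remote
    # workers and returns one result dict per worker.
    results = workers.foreach_worker(_sample_and_train_torch_distributed)
    print("mean sample_time:",
          sum(r["sample_time"] for r in results) / len(results))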
Example #20
 def test_train_external_multi_agent_cartpole_many_policies(self):
     n = 20
     single_env = gym.make("CartPole-v0")
     act_space = single_env.action_space
     obs_space = single_env.observation_space
     policies = {}
     for i in range(20):
         policies["pg_{}".format(i)] = (PGTFPolicy, obs_space, act_space,
                                        {})
     policy_ids = list(policies.keys())
     ev = RolloutWorker(
         env_creator=lambda _: MultiAgentCartPole({"num_agents": n}),
         policy=policies,
         policy_mapping_fn=lambda agent_id: random.choice(policy_ids),
         rollout_fragment_length=100)
     optimizer = SyncSamplesOptimizer(WorkerSet._from_existing(ev))
     for i in range(100):
         optimizer.step()
         result = collect_metrics(ev)
         print("Iteration {}, rew {}".format(i,
                                             result["policy_reward_mean"]))
         print("Total reward", result["episode_reward_mean"])
         if result["episode_reward_mean"] >= 25 * n:
             return
     raise Exception("failed to improve reward")
Example #21
    def add_workers(self, num_workers: int) -> None:
        """Creates and add a number of remote workers to this worker set.

        Args:
            num_workers (int): The number of remote Workers to add to this
                WorkerSet.
        """
        remote_args = {
            "num_cpus":
            self._remote_config["num_cpus_per_worker"],
            "num_gpus":
            self._remote_config["num_gpus_per_worker"],
            # memory=0 is an error, but memory=None means no limits.
            "memory":
            self._remote_config["memory_per_worker"] or None,
            "object_store_memory":
            self._remote_config["object_store_memory_per_worker"] or None,
            "resources":
            self._remote_config["custom_resources_per_worker"],
        }
        cls = RolloutWorker.as_remote(**remote_args).remote
        self._remote_workers.extend([
            self._make_worker(cls=cls,
                              env_creator=self._env_creator,
                              validate_env=None,
                              policy_cls=self._policy_class,
                              worker_index=i + 1,
                              num_workers=num_workers,
                              config=self._remote_config)
            for i in range(num_workers)
        ])
Example #22
 def test_train_multi_cartpole_many_policies(self):
     n = 20
     env = gym.make("CartPole-v0")
     act_space = env.action_space
     obs_space = env.observation_space
     policies = {}
     for i in range(20):
         policies["pg_{}".format(i)] = (PGTFPolicy, obs_space, act_space,
                                        {})
     policy_ids = list(policies.keys())
     worker = RolloutWorker(
         env_creator=lambda _: MultiCartpole(n),
         policy=policies,
         policy_mapping_fn=lambda agent_id: random.choice(policy_ids),
         batch_steps=100)
     workers = WorkerSet._from_existing(worker, [])
     optimizer = SyncSamplesOptimizer(workers)
     for i in range(100):
         optimizer.step()
         result = collect_metrics(worker)
         print("Iteration {}, rew {}".format(i,
                                             result["policy_reward_mean"]))
         print("Total reward", result["episode_reward_mean"])
         if result["episode_reward_mean"] >= 25 * n:
             return
     raise Exception("failed to improve reward")
Example #23
    def add_workers(self, num_workers: int) -> None:
        """Creates and add a number of remote workers to this worker set.

        Args:
            num_workers (int): The number of remote Workers to add to this
                WorkerSet.
        """
        remote_args = {
            "num_cpus":
            self._remote_config["num_cpus_per_worker"],
            "num_gpus":
            self._remote_config["num_gpus_per_worker"],
            "memory":
            self._remote_config["memory_per_worker"],
            "object_store_memory":
            self._remote_config["object_store_memory_per_worker"],
            "resources":
            self._remote_config["custom_resources_per_worker"],
        }
        cls = RolloutWorker.as_remote(**remote_args).remote
        self._remote_workers.extend([
            self._make_worker(cls, self._env_creator, self._policy_class,
                              i + 1, self._remote_config)
            for i in range(num_workers)
        ])
Example #24
 def testVectorEnvSupport(self):
     ev = RolloutWorker(
         env_creator=lambda _: MockVectorEnv(episode_length=20, num_envs=8),
         policy=MockPolicy,
         batch_mode="truncate_episodes",
         batch_steps=10)
     for _ in range(8):
         batch = ev.sample()
         self.assertEqual(batch.count, 10)
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 0)
     for _ in range(8):
         batch = ev.sample()
         self.assertEqual(batch.count, 10)
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 8)
Example #25
    def recreate_failed_workers(
        self, local_worker_for_synching: RolloutWorker
    ) -> Tuple[List[ActorHandle], List[ActorHandle]]:
        """Recreates any failed workers (after health check).

        Args:
            local_worker_for_synching: RolloutWorker to use to synchronize the weights
                after recreation.

        Returns:
            A tuple consisting of two items: The list of removed workers and the list of
            newly added ones.
        """
        faulty_indices = self._worker_health_check()
        removed_workers = []
        new_workers = []
        for worker_index in faulty_indices:
            worker = self.remote_workers()[worker_index - 1]
            removed_workers.append(worker)
            logger.info(f"Trying to recreate faulty worker {worker_index}")
            try:
                worker.__ray_terminate__.remote()
            except Exception:
                logger.exception("Error terminating faulty worker.")
            # Try to recreate the failed worker (start a new one).
            new_worker = self._make_worker(
                cls=self._cls,
                env_creator=self._env_creator,
                validate_env=None,
                policy_cls=self._policy_class,
                worker_index=worker_index,
                num_workers=len(self._remote_workers),
                recreated_worker=True,
                config=self._remote_config,
            )

            # Sync new worker from provided one (or local one).
            new_worker.set_weights.remote(
                weights=local_worker_for_synching.get_weights(),
                global_vars=local_worker_for_synching.get_global_vars(),
            )

            # Add new worker to list of remote workers.
            self._remote_workers[worker_index - 1] = new_worker
            new_workers.append(new_worker)

        return removed_workers, new_workers
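A brief sketch of the call pattern described by the docstring above (an assumption, not part of the original): the local worker serves as the weight source for any recreated workers, and the two returned lists can drive external bookkeeping.

    # Hypothetical: `workers` is a WorkerSet with possibly failed remote workers.
    removed, added = workers.recreate_failed_workers(
        local_worker_for_synching=workers.local_worker())
    print("removed {} worker(s), recreated {}".format(len(removed), len(added)))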
Example #26
 def test_multi_agent_sample_sync_remote(self):
     ev = RolloutWorker(
         env_creator=lambda _: BasicMultiAgent(5),
         policy_spec={
             "p0": PolicySpec(policy_class=MockPolicy),
             "p1": PolicySpec(policy_class=MockPolicy),
         },
         # This signature will raise a soft-deprecation warning due
         # to the new signature we are using (agent_id, episode, **kwargs),
         # but should not break this test.
         policy_mapping_fn=(lambda agent_id: "p{}".format(agent_id % 2)),
         rollout_fragment_length=50,
         num_envs=4,
         remote_worker_envs=True,
         remote_env_batch_wait_ms=99999999)
     batch = ev.sample()
     self.assertEqual(batch.count, 200)
Example #27
    def test_multi_agent_sample(self):
        def policy_mapping_fn(agent_id, episode, worker, **kwargs):
            return "p{}".format(agent_id % 2)

        ev = RolloutWorker(env_creator=lambda _: BasicMultiAgent(5),
                           policy_spec={
                               "p0": PolicySpec(policy_class=MockPolicy),
                               "p1": PolicySpec(policy_class=MockPolicy),
                           },
                           policy_mapping_fn=policy_mapping_fn,
                           rollout_fragment_length=50)
        batch = ev.sample()
        self.assertEqual(batch.count, 50)
        self.assertEqual(batch.policy_batches["p0"].count, 150)
        self.assertEqual(batch.policy_batches["p1"].count, 100)
        self.assertEqual(batch.policy_batches["p0"]["t"].tolist(),
                         list(range(25)) * 6)
Example #28
 def test_multiagent_env(self):
     temp_env = EpisodeEnv(NUM_STEPS, NUM_AGENTS)
     ev = RolloutWorker(
         env_creator=lambda _: EpisodeEnv(NUM_STEPS, NUM_AGENTS),
         policy_spec={
             str(agent_id): (
                 EchoPolicy,
                 temp_env.observation_space,
                 temp_env.action_space,
                 {},
             )
             for agent_id in range(NUM_AGENTS)
         },
         policy_mapping_fn=lambda aid, eps, **kwargs: str(aid),
         callbacks=LastInfoCallback,
     )
     ev.sample()
Example #29
 def testMultiAgentSample(self):
     act_space = gym.spaces.Discrete(2)
     obs_space = gym.spaces.Discrete(2)
     ev = RolloutWorker(
         env_creator=lambda _: BasicMultiAgent(5),
         policy={
             "p0": (MockPolicy, obs_space, act_space, {}),
             "p1": (MockPolicy, obs_space, act_space, {}),
         },
         policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2),
         batch_steps=50)
     batch = ev.sample()
     self.assertEqual(batch.count, 50)
     self.assertEqual(batch.policy_batches["p0"].count, 150)
     self.assertEqual(batch.policy_batches["p1"].count, 100)
     self.assertEqual(batch.policy_batches["p0"]["t"].tolist(),
                      list(range(25)) * 6)
Example #30
 def test_multi_agent_sample_async_remote(self):
      # Allow the test to be run via unittest.
     ray.init(num_cpus=4, ignore_reinit_error=True)
     act_space = gym.spaces.Discrete(2)
     obs_space = gym.spaces.Discrete(2)
     ev = RolloutWorker(
         env_creator=lambda _: BasicMultiAgent(5),
         policy={
             "p0": (MockPolicy, obs_space, act_space, {}),
             "p1": (MockPolicy, obs_space, act_space, {}),
         },
         policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2),
         rollout_fragment_length=50,
         num_envs=4,
         remote_worker_envs=True)
     batch = ev.sample()
     self.assertEqual(batch.count, 200)