Example #1
    def test_returning_model_based_rollouts_data(self):
        class ModelBasedPolicy(PGTFPolicy):
            def compute_actions(self,
                                obs_batch,
                                state_batches,
                                prev_action_batch=None,
                                prev_reward_batch=None,
                                episodes=None,
                                **kwargs):
                # Pretend we did a model-based rollout and want to return
                # the extra trajectory.
                builder = episodes[0].new_batch_builder()
                rollout_id = random.randint(0, 10000)
                for t in range(5):
                    builder.add_values(
                        agent_id="extra_0",
                        policy_id="p1",  # use p1 so we can easily check it
                        t=t,
                        eps_id=rollout_id,  # new id for each rollout
                        obs=obs_batch[0],
                        actions=0,
                        rewards=0,
                        dones=t == 4,
                        infos={},
                        new_obs=obs_batch[0])
                batch = builder.build_and_reset(episode=None)
                episodes[0].add_extra_batch(batch)

                # Just return zeros for actions
                return [0] * len(obs_batch), [], {}

        single_env = gym.make("CartPole-v0")
        obs_space = single_env.observation_space
        act_space = single_env.action_space
        ev = RolloutWorker(
            env_creator=lambda _: MultiAgentCartPole({"num_agents": 2}),
            policy={
                "p0": (ModelBasedPolicy, obs_space, act_space, {}),
                "p1": (ModelBasedPolicy, obs_space, act_space, {}),
            },
            policy_mapping_fn=lambda agent_id: "p0",
            rollout_fragment_length=5)
        batch = ev.sample()
        self.assertEqual(batch.count, 5)
        self.assertEqual(batch.policy_batches["p0"].count, 10)
        self.assertEqual(batch.policy_batches["p1"].count, 25)
Example #2
 def test_multi_agent_sample(self):
     act_space = gym.spaces.Discrete(2)
     obs_space = gym.spaces.Discrete(2)
     ev = RolloutWorker(
         env_creator=lambda _: BasicMultiAgent(5),
         policy_spec={
             "p0": (MockPolicy, obs_space, act_space, {}),
             "p1": (MockPolicy, obs_space, act_space, {}),
         },
         policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2),
         rollout_fragment_length=50)
     batch = ev.sample()
     self.assertEqual(batch.count, 50)
     self.assertEqual(batch.policy_batches["p0"].count, 150)
     self.assertEqual(batch.policy_batches["p1"].count, 100)
     self.assertEqual(batch.policy_batches["p0"]["t"].tolist(),
                      list(range(25)) * 6)
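BasicMultiAgent is a small helper env from RLlib's test utilities and is not shown here. As a rough sketch only (the class below and its fixed 25-step episodes are assumptions, not the real helper), it behaves like several copies of a short, fixed-length episode stepping in lockstep, which is what makes the counts above add up: agents 0, 2 and 4 map to "p0" (3 agents x 50 env steps = 150, i.e. six complete 25-step agent episodes, matching the "t" assertion), while agents 1 and 3 map to "p1" (2 x 50 = 100).

import gym
from ray.rllib.env.multi_agent_env import MultiAgentEnv


class LockstepMultiAgent(MultiAgentEnv):
    """Hypothetical stand-in for BasicMultiAgent: `num` agents stepping in
    lockstep, each episode lasting exactly 25 steps."""

    def __init__(self, num):
        self.num = num
        self.observation_space = gym.spaces.Discrete(2)
        self.action_space = gym.spaces.Discrete(2)

    def reset(self):
        self.t = 0
        return {i: 0 for i in range(self.num)}

    def step(self, action_dict):
        self.t += 1
        done = self.t >= 25
        obs = {i: 0 for i in range(self.num)}
        rewards = {i: 1.0 for i in range(self.num)}
        dones = {i: done for i in range(self.num)}
        dones["__all__"] = done
        infos = {i: {} for i in range(self.num)}
        return obs, rewards, dones, infos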
Example #3
    def test_multi_agent_sample(self):
        def policy_mapping_fn(agent_id, episode, **kwargs):
            return "p{}".format(agent_id % 2)

        ev = RolloutWorker(env_creator=lambda _: BasicMultiAgent(5),
                           policy_spec={
                               "p0": PolicySpec(policy_class=MockPolicy),
                               "p1": PolicySpec(policy_class=MockPolicy),
                           },
                           policy_mapping_fn=policy_mapping_fn,
                           rollout_fragment_length=50)
        batch = ev.sample()
        self.assertEqual(batch.count, 50)
        self.assertEqual(batch.policy_batches["p0"].count, 150)
        self.assertEqual(batch.policy_batches["p1"].count, 100)
        self.assertEqual(batch.policy_batches["p0"]["t"].tolist(),
                         list(range(25)) * 6)
Example #4
 def test_multi_agent_sample_sync_remote(self):
     ev = RolloutWorker(
         env_creator=lambda _: BasicMultiAgent(5),
         policy_spec={
             "p0": PolicySpec(policy_class=MockPolicy),
             "p1": PolicySpec(policy_class=MockPolicy),
         },
         # This old-style signature will raise a soft-deprecation warning
         # (the new signature is (agent_id, episode, **kwargs)), but it
         # should not break this test.
         policy_mapping_fn=(lambda agent_id: "p{}".format(agent_id % 2)),
         rollout_fragment_length=50,
         num_envs=4,
         remote_worker_envs=True,
         remote_env_batch_wait_ms=99999999)
     batch = ev.sample()
     self.assertEqual(batch.count, 200)
Example #5
 def test_metrics(self):
     ev = RolloutWorker(
         env_creator=lambda _: MockEnv(episode_length=10),
         policy_spec=MockPolicy,
         batch_mode="complete_episodes",
     )
     remote_ev = RolloutWorker.as_remote().remote(
         env_creator=lambda _: MockEnv(episode_length=10),
         policy_spec=MockPolicy,
         batch_mode="complete_episodes",
     )
     ev.sample()
     ray.get(remote_ev.sample.remote())
     result = collect_metrics(ev, [remote_ev])
     self.assertEqual(result["episodes_this_iter"], 20)
     self.assertEqual(result["episode_reward_mean"], 10)
     ev.stop()
Example #6
 def test_vector_env_support(self):
     ev = RolloutWorker(
         env_creator=lambda _: MockVectorEnv(episode_length=20, num_envs=8),
         policy_spec=MockPolicy,
         batch_mode="truncate_episodes",
         rollout_fragment_length=10)
     for _ in range(8):
         batch = ev.sample()
         self.assertEqual(batch.count, 10)
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 0)
     for _ in range(8):
         batch = ev.sample()
         self.assertEqual(batch.count, 10)
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 8)
     ev.stop()
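MockVectorEnv is another helper from RLlib's test utilities. A minimal sketch of a comparable VectorEnv subclass (the class below is an assumption; only the VectorEnv base class from ray.rllib.env.vector_env is real) illustrates the arithmetic in the test above: 8 sub-envs with 20-step episodes hold 160 env steps in total, i.e. exactly sixteen 10-step fragments, so no episode can finish within the first eight samples and all 8 finish by the sixteenth.

import gym
from ray.rllib.env.vector_env import VectorEnv


class FixedLengthVectorEnv(VectorEnv):
    """Hypothetical stand-in for MockVectorEnv: every sub-episode lasts
    exactly `episode_length` steps."""

    def __init__(self, episode_length, num_envs):
        self.episode_length = episode_length
        self.ts = [0] * num_envs
        super().__init__(
            observation_space=gym.spaces.Discrete(1),
            action_space=gym.spaces.Discrete(2),
            num_envs=num_envs)

    def vector_reset(self):
        self.ts = [0] * self.num_envs
        return [0] * self.num_envs

    def reset_at(self, index):
        self.ts[index] = 0
        return 0

    def vector_step(self, actions):
        obs, rewards, dones, infos = [], [], [], []
        for i in range(self.num_envs):
            self.ts[i] += 1
            obs.append(0)
            rewards.append(1.0)
            dones.append(self.ts[i] >= self.episode_length)
            infos.append({})
        return obs, rewards, dones, infos

    def get_unwrapped(self):
        return []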
Example #7
 def test_multiagent_env(self):
     temp_env = EpisodeEnv(NUM_STEPS, NUM_AGENTS)
     ev = RolloutWorker(
         env_creator=lambda _: EpisodeEnv(NUM_STEPS, NUM_AGENTS),
         policy_spec={
             str(agent_id): (
                 EchoPolicy,
                 temp_env.observation_space,
                 temp_env.action_space,
                 {},
             )
             for agent_id in range(NUM_AGENTS)
         },
         policy_mapping_fn=lambda aid, eps, **kwargs: str(aid),
         callbacks=LastInfoCallback,
     )
     ev.sample()
Example #8
 def test_multi_agent_sample_async_remote(self):
     # Allow this test to be run via unittest.
     ray.init(num_cpus=4, ignore_reinit_error=True)
     act_space = gym.spaces.Discrete(2)
     obs_space = gym.spaces.Discrete(2)
     ev = RolloutWorker(
         env_creator=lambda _: BasicMultiAgent(5),
         policy_spec={
             "p0": (MockPolicy, obs_space, act_space, {}),
             "p1": (MockPolicy, obs_space, act_space, {}),
         },
         policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2),
         rollout_fragment_length=50,
         num_envs=4,
         remote_worker_envs=True)
     batch = ev.sample()
     self.assertEqual(batch.count, 200)
Example #9
    def test_action_immutability(self):
        from ray.rllib.examples.env.random_env import RandomEnv

        action_space = gym.spaces.Box(0.0001, 0.0002, (5,))

        class ActionMutationEnv(RandomEnv):
            def __init__(self, config):
                self.test_case = config["test_case"]
                super().__init__(config=config)

            def step(self, action):
                # Detect whether this `step()` call comes from the env
                # pre-checker (check_gym_environments) rather than sampling.
                import inspect

                curframe = inspect.currentframe()
                called_from_check = any(
                    frame[3] == "check_gym_environments"
                    for frame in inspect.getouterframes(curframe, 2)
                )
                # Check whether the action is immutable.
                if action.flags.writeable and not called_from_check:
                    self.test_case.assertFalse(
                        action.flags.writeable, "Action is mutable"
                    )
                return super().step(action)

        ev = RolloutWorker(
            env_creator=lambda _: ActionMutationEnv(
                config=dict(
                    test_case=self,
                    action_space=action_space,
                    max_episode_len=10,
                    p_done=0.0,
                    check_action_bounds=True,
                )
            ),
            policy_spec=RandomPolicy,
            policy_config=dict(
                action_space=action_space,
                ignore_action_bounds=True,
            ),
            clip_actions=False,
            batch_mode="complete_episodes",
        )
        ev.sample()
        ev.stop()
Example #10
def _create_embedded_rollout_worker(kwargs, send_fn):
    """Create a local rollout worker and a thread that samples from it.

    Args:
        kwargs (dict): args for the RolloutWorker constructor.
        send_fn (fn): function to send a JSON request to the server.
    """

    # Since the server acts as an input datasource, we have to reset the
    # input config to the default, which runs env rollouts.
    kwargs = kwargs.copy()
    del kwargs["input_creator"]

    # Since the server also acts as an output writer, we might have to reset
    # the output config to the default, i.e. "output": None, otherwise a
    # local rollout worker might write to an unknown output directory.
    del kwargs["output_creator"]

    # If server has no env (which is the expected case):
    # Generate a dummy ExternalEnv here using RandomEnv and the
    # given observation/action spaces.
    if kwargs["policy_config"].get("env") is None:
        from ray.rllib.examples.env.random_env import RandomEnv, \
            RandomMultiAgentEnv
        config = {
            "action_space": kwargs["policy_config"]["action_space"],
            "observation_space": kwargs["policy_config"]["observation_space"],
        }
        _, is_ma = check_multi_agent(kwargs["policy_config"])
        kwargs["env_creator"] = _auto_wrap_external(
            lambda _: (RandomMultiAgentEnv if is_ma else RandomEnv)(config))
        kwargs["policy_config"]["env"] = True
    # Otherwise, use the env specified by the server args.
    else:
        real_env_creator = kwargs["env_creator"]
        kwargs["env_creator"] = _auto_wrap_external(real_env_creator)

    logger.info("Creating rollout worker with kwargs={}".format(kwargs))
    from ray.rllib.evaluation.rollout_worker import RolloutWorker
    rollout_worker = RolloutWorker(**kwargs)

    inference_thread = _LocalInferenceThread(rollout_worker, send_fn)
    inference_thread.start()

    return rollout_worker, inference_thread
Example #11
 def testMultiAgentSampleSyncRemote(self):
     # Allow this test to be run via unittest.
     ray.init(num_cpus=4, ignore_reinit_error=True)
     act_space = gym.spaces.Discrete(2)
     obs_space = gym.spaces.Discrete(2)
     ev = RolloutWorker(
         env_creator=lambda _: BasicMultiAgent(5),
         policy={
             "p0": (MockPolicy, obs_space, act_space, {}),
             "p1": (MockPolicy, obs_space, act_space, {}),
         },
         policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2),
         batch_steps=50,
         num_envs=4,
         remote_worker_envs=True,
         remote_env_batch_wait_ms=99999999)
     batch = ev.sample()
     self.assertEqual(batch.count, 200)
Example #12
 def test_batch_ids(self):
     fragment_len = 100
     ev = RolloutWorker(
         env_creator=lambda _: gym.make("CartPole-v0"),
         policy_spec=MockPolicy,
         rollout_fragment_length=fragment_len,
     )
     batch1 = ev.sample()
     batch2 = ev.sample()
     unroll_ids_1 = set(batch1["unroll_id"])
     unroll_ids_2 = set(batch2["unroll_id"])
     # Assert no overlap of unroll IDs between sample() calls.
     self.assertTrue(not any(uid in unroll_ids_2 for uid in unroll_ids_1))
     # CartPole episodes should be short initially: Expect more than one
     # unroll ID in each batch.
     self.assertTrue(len(unroll_ids_1) > 1)
     self.assertTrue(len(unroll_ids_2) > 1)
     ev.stop()
Example #13
    def test_traj_view_lstm_functionality(self):
        action_space = Box(float("-inf"), float("inf"), shape=(3, ))
        obs_space = Box(float("-inf"), float("inf"), (4, ))
        max_seq_len = 50
        rollout_fragment_length = 200
        assert rollout_fragment_length % max_seq_len == 0
        policies = {
            "pol0": (EpisodeEnvAwareLSTMPolicy, obs_space, action_space, {}),
        }

        def policy_fn(agent_id, episode, **kwargs):
            return "pol0"

        config = {
            "multiagent": {
                "policies": policies,
                "policy_mapping_fn": policy_fn,
            },
            "model": {
                "use_lstm": True,
                "max_seq_len": max_seq_len,
            },
        }

        rw = RolloutWorker(
            env_creator=lambda _: MultiAgentDebugCounterEnv({"num_agents": 4}),
            policy_config=config,
            rollout_fragment_length=rollout_fragment_length,
            policy_spec=policies,
            policy_mapping_fn=policy_fn,
            normalize_actions=False,
            num_envs=1,
        )

        for iteration in range(20):
            result = rw.sample()
            check(result.count, rollout_fragment_length)
            pol_batch_w = result.policy_batches["pol0"]
            assert pol_batch_w.count >= rollout_fragment_length
            analyze_rnn_batch(
                pol_batch_w,
                max_seq_len,
                view_requirements=rw.policy_map["pol0"].view_requirements,
            )
Example #14
    def testSyncFilter(self):
        ev = RolloutWorker(env_creator=lambda _: gym.make("CartPole-v0"),
                           policy=MockPolicy,
                           sample_async=True,
                           observation_filter="ConcurrentMeanStdFilter")
        obs_f = self.sample_and_flush(ev)

        # Current State
        filters = ev.get_filters(flush_after=False)
        obs_f = filters[DEFAULT_POLICY_ID]

        self.assertLessEqual(obs_f.buffer.n, 20)

        new_obsf = obs_f.copy()
        new_obsf.rs._n = 100
        ev.sync_filters({DEFAULT_POLICY_ID: new_obsf})
        filters = ev.get_filters(flush_after=False)
        obs_f = filters[DEFAULT_POLICY_ID]
        self.assertGreaterEqual(obs_f.rs.n, 100)
        self.assertLessEqual(obs_f.buffer.n, 20)
Example #15
def _create_embedded_rollout_worker(kwargs, send_fn):
    """Create a local rollout worker and a thread that samples from it.

    Arguments:
        kwargs (dict): args for the RolloutWorker constructor.
        send_fn (fn): function to send a JSON request to the server.
    """

    # Since the server acts as an input datasource, we have to reset the
    # input config to the default, which runs env rollouts.
    kwargs = kwargs.copy()
    del kwargs["input_creator"]
    logger.info("Creating rollout worker with kwargs={}".format(kwargs))
    real_env_creator = kwargs["env_creator"]
    kwargs["env_creator"] = _auto_wrap_external(real_env_creator)

    rollout_worker = RolloutWorker(**kwargs)
    inference_thread = _LocalInferenceThread(rollout_worker, send_fn)
    inference_thread.start()
    return rollout_worker, inference_thread
Example #16
 def test_multiagent_env(self):
     temp_env = EpisodeEnv(NUM_STEPS, NUM_AGENTS)
     ev = RolloutWorker(
         env_creator=lambda _: temp_env,
         policy_spec={
             str(agent_id): (
                 EchoPolicy,
                 temp_env.observation_space,
                 temp_env.action_space,
                 {},
             )
             for agent_id in range(NUM_AGENTS)
         },
         policy_mapping_fn=lambda aid, eps, **kwargs: str(aid),
     )
     sample_batches = ev.sample()
     self.assertEqual(len(sample_batches.policy_batches), 4)
     for agent_id, sample_batch in sample_batches.policy_batches.items():
         self.assertEqual(sample_batch.count, 100)
         # A batch of 100 agent steps: 4 episodes of 25 steps each.
         self.assertEqual(len(set(sample_batch["eps_id"])), 4)
Example #17
 def test_multi_agent_sample_sync_remote(self):
     # Allow this test to be run via unittest.
     ray.init(num_cpus=4, ignore_reinit_error=True)
     act_space = gym.spaces.Discrete(2)
     obs_space = gym.spaces.Discrete(2)
     ev = RolloutWorker(
         env_creator=lambda _: BasicMultiAgent(5),
         policy_spec={
             "p0": (MockPolicy, obs_space, act_space, {}),
             "p1": (MockPolicy, obs_space, act_space, {}),
         },
         # This old-style signature will raise a soft-deprecation warning
         # (the new signature is (agent_id, episode, **kwargs)), but it
         # should not break this test.
         policy_mapping_fn=(lambda agent_id: "p{}".format(agent_id % 2)),
         rollout_fragment_length=50,
         num_envs=4,
         remote_worker_envs=True,
         remote_env_batch_wait_ms=99999999)
     batch = ev.sample()
     self.assertEqual(batch.count, 200)
Example #18
 def testAutoVectorization(self):
     ev = RolloutWorker(
         env_creator=lambda cfg: MockEnv(episode_length=20, config=cfg),
         policy=MockPolicy,
         batch_mode="truncate_episodes",
         batch_steps=2,
         num_envs=8)
     for _ in range(8):
         batch = ev.sample()
         self.assertEqual(batch.count, 16)
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 0)
     for _ in range(8):
         batch = ev.sample()
         self.assertEqual(batch.count, 16)
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 8)
     indices = []
     for env in ev.async_env.vector_env.envs:
         self.assertEqual(env.unwrapped.config.worker_index, 0)
         indices.append(env.unwrapped.config.vector_index)
     self.assertEqual(indices, [0, 1, 2, 3, 4, 5, 6, 7])
Example #19
 def test_multi_agent_sample_round_robin(self):
     ev = RolloutWorker(
         env_creator=lambda _: RoundRobinMultiAgent(5, increment_obs=True),
         policy_spec={
             "p0": PolicySpec(policy_class=MockPolicy),
         },
         policy_mapping_fn=lambda agent_id, episode, **kwargs: "p0",
         rollout_fragment_length=50)
     batch = ev.sample()
     self.assertEqual(batch.count, 50)
     # Since agents are introduced into the env in round-robin fashion,
     # some env steps don't count as proper transitions.
     self.assertEqual(batch.policy_batches["p0"].count, 42)
     check(batch.policy_batches["p0"]["obs"][:10],
           one_hot(np.array([0, 1, 2, 3, 4] * 2), 10))
     check(batch.policy_batches["p0"]["new_obs"][:10],
           one_hot(np.array([1, 2, 3, 4, 5] * 2), 10))
     self.assertEqual(batch.policy_batches["p0"]["rewards"].tolist()[:10],
                      [100, 100, 100, 100, 0] * 2)
     self.assertEqual(batch.policy_batches["p0"]["dones"].tolist()[:10],
                      [False, False, False, False, True] * 2)
     self.assertEqual(batch.policy_batches["p0"]["t"].tolist()[:10],
                      [4, 9, 14, 19, 24, 5, 10, 15, 20, 25])
Example #20
    def testBasic(self):
        ev = RolloutWorker(env_creator=lambda _: gym.make("CartPole-v0"),
                           policy=MockPolicy)
        batch = ev.sample()
        for key in [
                "obs", "actions", "rewards", "dones", "advantages",
                "prev_rewards", "prev_actions"
        ]:
            self.assertIn(key, batch)
            self.assertGreater(np.abs(np.mean(batch[key])), 0)

        def to_prev(vec):
            out = np.zeros_like(vec)
            for i, v in enumerate(vec):
                if i + 1 < len(out) and not batch["dones"][i]:
                    out[i + 1] = v
            return out.tolist()

        self.assertEqual(batch["prev_rewards"].tolist(),
                         to_prev(batch["rewards"]))
        self.assertEqual(batch["prev_actions"].tolist(),
                         to_prev(batch["actions"]))
        self.assertGreater(batch["advantages"][0], 1)
Example #21
 def test_sample_from_early_done_env(self):
     ev = RolloutWorker(
         env_creator=lambda _: EarlyDoneMultiAgent(),
         policy_spec={
             "p0": PolicySpec(policy_class=MockPolicy),
             "p1": PolicySpec(policy_class=MockPolicy),
         },
         policy_mapping_fn=(lambda aid, **kwargs: "p{}".format(aid % 2)),
         batch_mode="complete_episodes",
         rollout_fragment_length=1)
     # This used to raise an error because EarlyDoneMultiAgent could
     # terminate e.g. agent0 without publishing a final observation for
     # agent1. That limitation has been fixed: an env may now terminate
     # at any time (and may return rewards for any agent at any time,
     # even when that agent does not get an obs returned in the same
     # call to `step()`).
     ma_batch = ev.sample()
     # Make sure the agents took the correct (alternating-timestep)
     # path, except for the last timestep, where both agents got
     # terminated.
     ag0_ts = ma_batch.policy_batches["p0"]["t"]
     ag1_ts = ma_batch.policy_batches["p1"]["t"]
     self.assertTrue(np.all(np.abs(ag0_ts[:-1] - ag1_ts[:-1]) == 1.0))
     self.assertTrue(ag0_ts[-1] == ag1_ts[-1])
Example #22
    def test_action_clipping(self):
        from ray.rllib.examples.env.random_env import RandomEnv
        action_space = gym.spaces.Box(-2.0, 1.0, (3, ))

        # Clipping: True (clip actions to the Policy's action_space.low/high).
        ev = RolloutWorker(
            env_creator=lambda _: RandomEnv(config=dict(
                action_space=action_space,
                max_episode_len=10,
                p_done=0.0,
                check_action_bounds=True,
            )),
            policy_spec=RandomPolicy,
            policy_config=dict(
                action_space=action_space,
                ignore_action_bounds=True,
            ),
            clip_actions=True,
            batch_mode="complete_episodes")
        sample = ev.sample()
        # Check whether the action bounds have been breached (expected).
        # We still arrived here because we clipped according to the Env's
        # action space.
        self.assertGreater(np.max(sample["actions"]), action_space.high[0])
        self.assertLess(np.min(sample["actions"]), action_space.low[0])
        ev.stop()

        # Clipping: False and RandomPolicy produces invalid actions.
        # Expect Env to complain.
        ev2 = RolloutWorker(
            env_creator=lambda _: RandomEnv(config=dict(
                action_space=action_space,
                max_episode_len=10,
                p_done=0.0,
                check_action_bounds=True,
            )),
            policy_spec=RandomPolicy,
            policy_config=dict(
                action_space=action_space,
                ignore_action_bounds=True,
            ),
            clip_actions=False,  # <- should lead to Env complaining
            batch_mode="complete_episodes")
        self.assertRaisesRegex(ValueError, r"Illegal action", ev2.sample)
        ev2.stop()

        # Clipping: False and RandomPolicy produces valid (bounded) actions.
        # Expect "actions" in SampleBatch to be unclipped.
        ev3 = RolloutWorker(
            env_creator=lambda _: RandomEnv(config=dict(
                action_space=action_space,
                max_episode_len=10,
                p_done=0.0,
                check_action_bounds=True,
            )),
            policy_spec=RandomPolicy,
            policy_config=dict(action_space=action_space),
            # Should not be a problem, as RandomPolicy abides by the bounds.
            clip_actions=False,
            batch_mode="complete_episodes")
        sample = ev3.sample()
        self.assertGreater(np.min(sample["actions"]), action_space.low[0])
        self.assertLess(np.max(sample["actions"]), action_space.high[0])
        ev3.stop()
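RandomPolicy lives in ray.rllib.examples.policy.random_policy and samples random actions; with ignore_action_bounds=True it does not respect the space's low/high. The class below is a hypothetical, simplified stand-in (its name and the Gaussian sampling are assumptions) showing why unclipped actions can violate a Box(-2.0, 1.0) space in the first place:

import numpy as np
from ray.rllib.policy.policy import Policy


class UnboundedRandomPolicy(Policy):
    """Hypothetical policy that samples actions without respecting bounds."""

    def compute_actions(self, obs_batch, state_batches=None, **kwargs):
        # Gaussian samples regularly fall outside a Box(-2.0, 1.0) space, so
        # the worker must either clip them (clip_actions=True) or the env's
        # bounds check will complain (clip_actions=False).
        actions = np.random.normal(
            size=(len(obs_batch),) + self.action_space.shape)
        return actions, [], {}

    def learn_on_batch(self, samples):
        return {}

    def get_weights(self):
        return {}

    def set_weights(self, weights):
        pass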
Example #23
    def test_returning_model_based_rollouts_data(self):
        class ModelBasedPolicy(DQNTFPolicy):
            def compute_actions_from_input_dict(self,
                                                input_dict,
                                                explore=None,
                                                timestep=None,
                                                episodes=None,
                                                **kwargs):
                obs_batch = input_dict["obs"]
                # In policy loss initialization phase, no episodes are passed
                # in.
                if episodes is not None:
                    # Pretend we did a model-based rollout and want to return
                    # the extra trajectory.
                    env_id = episodes[0].env_id
                    fake_eps = Episode(
                        episodes[0].policy_map,
                        episodes[0].policy_mapping_fn,
                        lambda: None,
                        lambda x: None,
                        env_id,
                    )
                    builder = get_global_worker().sampler.sample_collector
                    agent_id = "extra_0"
                    policy_id = "p1"  # use p1 so we can easily check it
                    builder.add_init_obs(fake_eps, agent_id, env_id, policy_id,
                                         -1, obs_batch[0])
                    for t in range(4):
                        builder.add_action_reward_next_obs(
                            episode_id=fake_eps.episode_id,
                            agent_id=agent_id,
                            env_id=env_id,
                            policy_id=policy_id,
                            agent_done=t == 3,
                            values=dict(
                                t=t,
                                actions=0,
                                rewards=0,
                                dones=t == 3,
                                infos={},
                                new_obs=obs_batch[0],
                            ),
                        )
                    batch = builder.postprocess_episode(episode=fake_eps,
                                                        build=True)
                    episodes[0].add_extra_batch(batch)

                # Just return zeros for actions
                return [0] * len(obs_batch), [], {}

        ev = RolloutWorker(
            env_creator=lambda _: MultiAgentCartPole({"num_agents": 2}),
            policy_spec={
                "p0": PolicySpec(policy_class=ModelBasedPolicy),
                "p1": PolicySpec(policy_class=ModelBasedPolicy),
            },
            policy_mapping_fn=lambda agent_id, episode, **kwargs: "p0",
            rollout_fragment_length=5,
        )
        batch = ev.sample()
        # 5 environment steps (rollout_fragment_length).
        self.assertEqual(batch.count, 5)
        # 10 agent steps for p0: 2 agents, both using p0 as their policy.
        self.assertEqual(batch.policy_batches["p0"].count, 10)
        # 20 agent steps for p1: each time both(!) agents take 1 step,
        # p1 takes 4 steps: 5 (rollout fragment length) * 4 = 20.
        self.assertEqual(batch.policy_batches["p1"].count, 20)
Example #24
def run(args, parser):

    # create exps from configs
    if args.config_file:
        # load configs from yaml
        with open(args.config_file) as f:
            exps = yaml.safe_load(f)

    else:
        exps = create_exps(args=args, )

    arena_exps = create_arena_exps(
        exps=exps,
        args=args,
        parser=parser,
    )

    # config ray cluster
    if args.ray_num_nodes:
        cluster = Cluster()
        for ray_node in range(args.ray_num_nodes):
            cluster.add_node(
                num_cpus=args.ray_num_cpus or 1,
                num_gpus=args.ray_num_gpus or 0,
                object_store_memory=args.ray_object_store_memory,
                memory=args.ray_memory,
                redis_max_memory=args.ray_redis_max_memory,
            )
        ray.init(address=cluster.redis_address, )
    else:
        ray.init(
            address=args.ray_address,
            object_store_memory=args.ray_object_store_memory,
            memory=args.ray_memory,
            redis_max_memory=args.ray_redis_max_memory,
            num_cpus=args.ray_num_cpus,
            num_gpus=args.ray_num_gpus,
        )

    if len(arena_exps.keys()) > 1:
        logger.warning(
            "There are multiple experiments scheduled; ray==0.7.4 will run "
            "them one by one instead of concurrently. Recent ray versions "
            "can run them concurrently, but they have failed our tests "
            "(their rllib is broken), mainly because the configs use grid "
            "search, which the original rllib does not support."
        )

    if args.eval:

        # evaluate policies

        if len(arena_exps.keys()) < 1:
            raise ValueError

        elif len(arena_exps.keys()) >= 1:

            if len(arena_exps.keys()) > 1:

                arena_exp_key = inquire_select(
                    choices=list(arena_exps.keys()),
                    key="arena_exp_key",
                )

            else:
                # if there is just one arena_exps
                arena_exp_key = list(arena_exps.keys())[0]

        logger.info("Evaluating arena_exp_key: {}".format(arena_exp_key, ))

        arena_exp = arena_exps[arena_exp_key]

        answers = prompt(
            [{
                'type': 'input',
                'name': 'eval_log_path',
                'message':
                'Where do you want to log the results of this evaluation?',
                'default': '../eval_log_path/'
            }],
            style=custom_style_2,
        )

        prepare_path(answers['eval_log_path'])

        from ray.rllib.evaluation.rollout_worker import RolloutWorker

        # worker = ArenaRolloutWorker(
        # TODO: RolloutWorker does not support monitor for multi-agent envs
        worker = RolloutWorker(
            env_creator=lambda _: ArenaRllibEnv(
                env=arena_exp["env"],
                env_config=arena_exp["config"]["env_config"],
            ),
            policy=arena_exp["config"]["multiagent"]["policies"],
            policy_mapping_fn=arena_exp["config"]["multiagent"]
            ["policy_mapping_fn"],
            batch_mode="complete_episodes",
            batch_steps=500,
            num_envs=1,
            monitor_path=answers['eval_log_path'],
        )

        logger.info("Testing worker...")
        sample_start = time.time()
        worker.sample()
        sample_time = time.time() - sample_start
        logger.info("Finish testing worker.")

        policy_ids = list(worker.policy_map.keys())

        checkpoints = inquire_checkpoints(
            local_dir=arena_exp["local_dir"],
            policy_ids=policy_ids,
        )

        checkpoint_paths = checkpoints_2_checkpoint_paths(checkpoints)

        num_checkpoint_paths = {}
        for policy_id, checkpoint_paths_per_policy_id in checkpoint_paths.items(
        ):
            num_checkpoint_paths[policy_id] = len(
                checkpoint_paths_per_policy_id)

        num_sampling = np.prod(list(num_checkpoint_paths.values()))

        confirm = inquire_confirm(
            "You have scheduled {} sampling runs; each run takes about {} "
            "minutes, i.e. {} hours in total.".format(
                num_sampling,
                sample_time / 60.0,
                num_sampling * sample_time / 60.0 / 60.0,
            ))
        if not confirm:
            sys.exit()

        result_matrix = run_result_matrix(
            checkpoint_paths=checkpoint_paths,
            worker=worker,
        )

        result_matrix = np.asarray(result_matrix)

        vis_result_matrix(
            result_matrix=result_matrix,
            log_path=answers['eval_log_path'],
        )

    else:

        run_experiments(
            arena_exps,
            scheduler=_make_scheduler(args),
            queue_trials=args.queue_trials,
            resume=args.resume,
        )
Example #25
 def test_singleagent_env(self):
     ev = RolloutWorker(env_creator=lambda _: MockEnv3(NUM_STEPS),
                        policy_spec=EchoPolicy,
                        callbacks=LastInfoCallback)
     ev.sample()
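LastInfoCallback is defined alongside these tests and is not shown here. A rough sketch of a DefaultCallbacks subclass in the same spirit (the class name and what exactly it records are assumptions; the hooks themselves are RLlib's 1.x callback API used throughout these examples):

from ray.rllib.agents.callbacks import DefaultCallbacks


class RecordLastInfoCallback(DefaultCallbacks):
    """Hypothetical callback that stores the most recent info dict on the
    episode after every env step."""

    def on_episode_step(self, *, worker, base_env, episode, env_index=None,
                        **kwargs):
        # `last_info_for()` returns the info dict emitted by the last step
        # (for the default agent when no agent id is given).
        episode.user_data["last_info"] = episode.last_info_for()

    def on_episode_end(self, *, worker, base_env, policies, episode,
                       env_index=None, **kwargs):
        # Record whether any info was seen, as a numeric custom metric.
        episode.custom_metrics["had_last_info"] = float(
            episode.user_data.get("last_info") is not None)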
Example #26
    def test_traj_view_lstm_functionality(self):
        action_space = Box(-float("inf"), float("inf"), shape=(2, ))
        obs_space = Box(float("-inf"), float("inf"), (4, ))
        max_seq_len = 50
        policies = {
            "pol0": (EpisodeEnvAwarePolicy, obs_space, action_space, {}),
        }

        def policy_fn(agent_id):
            return "pol0"

        rollout_worker = RolloutWorker(
            env_creator=lambda _: MultiAgentDebugCounterEnv({"num_agents": 4}),
            policy_config={
                "multiagent": {
                    "policies": policies,
                    "policy_mapping_fn": policy_fn,
                },
                "_use_trajectory_view_api": True,
                "model": {
                    "use_lstm": True,
                    "_time_major": True,
                    "max_seq_len": max_seq_len,
                },
            },
            policy=policies,
            policy_mapping_fn=policy_fn,
            num_envs=1,
        )
        for i in range(100):
            pc = rollout_worker.sampler.sample_collector. \
                policy_sample_collectors["pol0"]
            sample_batch_offset_before = pc.sample_batch_offset
            buffers = pc.buffers
            result = rollout_worker.sample()
            pol_batch = result.policy_batches["pol0"]

            self.assertTrue(result.count == 100)
            self.assertTrue(pol_batch.count >= 100)
            self.assertFalse(0 in pol_batch.seq_lens)
            # Check prev_reward/action, next_obs consistency.
            for t in range(max_seq_len):
                obs_t = pol_batch["obs"][t]
                r_t = pol_batch["rewards"][t]
                if t > 0:
                    next_obs_t_m_1 = pol_batch["new_obs"][t - 1]
                    self.assertTrue((obs_t == next_obs_t_m_1).all())
                if t < max_seq_len - 1:
                    prev_rewards_t_p_1 = pol_batch["prev_rewards"][t + 1]
                    self.assertTrue((r_t == prev_rewards_t_p_1).all())

            # Check the sanity of all the buffers in the underlying
            # per-policy collector.
            for sample_batch_slot, agent_slot in enumerate(
                    range(sample_batch_offset_before, pc.sample_batch_offset)):
                t_buf = buffers["t"][:, agent_slot]
                obs_buf = buffers["obs"][:, agent_slot]
                # Skip empty seqs at end (these won't be part of the batch
                # and have been copied to new agent-slots (even if seq-len=0)).
                if sample_batch_slot < len(pol_batch.seq_lens):
                    seq_len = pol_batch.seq_lens[sample_batch_slot]
                    # Make sure timesteps are always increasing within the seq.
                    assert all(t_buf[1] + j == n + 1
                               for j, n in enumerate(t_buf)
                               if j < seq_len and j != 0)
                    # Make sure all obs within seq are non-0.0.
                    assert all(
                        any(obs_buf[j] != 0.0) for j in range(1, seq_len + 1))

            # Check seq-lens.
            for agent_slot, seq_len in enumerate(pol_batch.seq_lens):
                if seq_len < max_seq_len - 1:
                    # At least in the beginning, the next slots should always
                    # be empty (once all agent slots have been used once, these
                    # may be filled with "old" values (from longer sequences)).
                    if i < 10:
                        self.assertTrue(
                            (pol_batch["obs"][seq_len +
                                              1][agent_slot] == 0.0).all())
                    print(end="")
                    self.assertFalse(
                        (pol_batch["obs"][seq_len][agent_slot] == 0.0).all())