Example #1
 def testBatchesLargerWhenVectorized(self):
     ev = PolicyEvaluator(env_creator=lambda _: MockEnv(episode_length=8),
                          policy_graph=MockPolicyGraph,
                          batch_mode="truncate_episodes",
                          batch_steps=4,
                          num_envs=4)
     batch = ev.sample()
     self.assertEqual(batch.count, 16)
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 0)
     batch = ev.sample()
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 4)
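A quick sketch of the arithmetic behind the assertions above (plain Python, not RLlib code; the numbers come straight from the test's own arguments):

    # With batch_mode="truncate_episodes", each sample() collects batch_steps
    # timesteps from every one of the num_envs sub-environments.
    batch_steps, num_envs, episode_length = 4, 4, 8
    assert batch_steps * num_envs == 16        # batch.count after one sample()
    # After the second sample() each env has stepped 2 * 4 = 8 times and so
    # finishes exactly one episode, hence episodes_this_iter == num_envs == 4.
    assert 2 * batch_steps == episode_length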
Example #2
 def test_batches_larger_when_vectorized(self):
     ev = RolloutWorker(env_creator=lambda _: MockEnv(episode_length=8),
                        policy=MockPolicy,
                        batch_mode="truncate_episodes",
                        batch_steps=4,
                        num_envs=4)
     batch = ev.sample()
     self.assertEqual(batch.count, 16)
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 0)
     batch = ev.sample()
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 4)
Example #3
 def testBatchesLargerWhenVectorized(self):
     ev = PolicyEvaluator(
         env_creator=lambda _: MockEnv(episode_length=8),
         policy_graph=MockPolicyGraph,
         batch_mode="truncate_episodes",
         batch_steps=4,
         num_envs=4)
     batch = ev.sample()
     self.assertEqual(batch.count, 16)
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 0)
     batch = ev.sample()
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 4)
Example #4
 def testTrainMultiCartpoleManyPolicies(self):
     n = 20
     env = gym.make("CartPole-v0")
     act_space = env.action_space
     obs_space = env.observation_space
     policies = {}
     for i in range(20):
         policies["pg_{}".format(i)] = (PGPolicyGraph, obs_space, act_space,
                                        {})
     policy_ids = list(policies.keys())
     ev = PolicyEvaluator(
         env_creator=lambda _: MultiCartpole(n),
         policy_graph=policies,
         policy_mapping_fn=lambda agent_id: random.choice(policy_ids),
         batch_steps=100)
     optimizer = SyncSamplesOptimizer(ev, [], {})
     for i in range(100):
         optimizer.step()
         result = collect_metrics(ev)
         print("Iteration {}, rew {}".format(i,
                                             result["policy_reward_mean"]))
         print("Total reward", result["episode_reward_mean"])
         if result["episode_reward_mean"] >= 25 * n:
             return
     raise Exception("failed to improve reward")
Example #5
 def testVectorEnvSupport(self):
     ev = PolicyEvaluator(
         env_creator=lambda _: MockVectorEnv(episode_length=20, num_envs=8),
         policy_graph=MockPolicyGraph,
         batch_mode="truncate_episodes",
         batch_steps=10)
     for _ in range(8):
         batch = ev.sample()
         self.assertEqual(batch.count, 10)
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 0)
     for _ in range(8):
         batch = ev.sample()
         self.assertEqual(batch.count, 10)
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 8)
Example #6
 def testVectorEnvSupport(self):
     ev = PolicyEvaluator(
         env_creator=lambda _: MockVectorEnv(episode_length=20, num_envs=8),
         policy_graph=MockPolicyGraph,
         batch_mode="truncate_episodes",
         batch_steps=10)
     for _ in range(8):
         batch = ev.sample()
         self.assertEqual(batch.count, 10)
     result = collect_metrics(ev, [])
     self.assertEqual(result.episodes_total, 0)
     for _ in range(8):
         batch = ev.sample()
         self.assertEqual(batch.count, 10)
     result = collect_metrics(ev, [])
     self.assertEqual(result.episodes_total, 8)
Example #7
    def _testWithOptimizer(self, optimizer_cls):
        n = 3
        env = gym.make("CartPole-v0")
        act_space = env.action_space
        obs_space = env.observation_space
        dqn_config = {"gamma": 0.95, "n_step": 3}
        if optimizer_cls == SyncReplayOptimizer:
            # TODO: support replay with non-DQN graphs. Currently this can't
            # happen since the replay buffer doesn't encode extra fields like
            # "advantages" that PG uses.
            policies = {
                "p1": (DQNTFPolicy, obs_space, act_space, dqn_config),
                "p2": (DQNTFPolicy, obs_space, act_space, dqn_config),
            }
        else:
            policies = {
                "p1": (PGTFPolicy, obs_space, act_space, {}),
                "p2": (DQNTFPolicy, obs_space, act_space, dqn_config),
            }
        worker = RolloutWorker(
            env_creator=lambda _: MultiCartpole(n),
            policy=policies,
            policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2],
            batch_steps=50)
        if optimizer_cls == AsyncGradientsOptimizer:

            def policy_mapper(agent_id):
                return ["p1", "p2"][agent_id % 2]

            remote_workers = [
                RolloutWorker.as_remote().remote(
                    env_creator=lambda _: MultiCartpole(n),
                    policy=policies,
                    policy_mapping_fn=policy_mapper,
                    batch_steps=50)
            ]
        else:
            remote_workers = []
        workers = WorkerSet._from_existing(worker, remote_workers)
        optimizer = optimizer_cls(workers)
        for i in range(200):
            worker.foreach_policy(
                lambda p, _: p.set_epsilon(max(0.02, 1 - i * .02))
                if isinstance(p, DQNTFPolicy) else None)
            optimizer.step()
            result = collect_metrics(worker, remote_workers)
            if i % 20 == 0:

                def do_update(p):
                    if isinstance(p, DQNTFPolicy):
                        p.update_target()

                worker.foreach_policy(lambda p, _: do_update(p))
                print("Iter {}, rew {}".format(i,
                                               result["policy_reward_mean"]))
                print("Total reward", result["episode_reward_mean"])
            if result["episode_reward_mean"] >= 25 * n:
                return
        print(result)
        raise Exception("failed to improve reward")
Example #8
 def test_train_multi_cartpole_many_policies(self):
     n = 20
     env = gym.make("CartPole-v0")
     act_space = env.action_space
     obs_space = env.observation_space
     policies = {}
     for i in range(20):
         policies["pg_{}".format(i)] = (PGTFPolicy, obs_space, act_space,
                                        {})
     policy_ids = list(policies.keys())
     worker = RolloutWorker(
         env_creator=lambda _: MultiCartpole(n),
         policy=policies,
         policy_mapping_fn=lambda agent_id: random.choice(policy_ids),
         batch_steps=100)
     workers = WorkerSet._from_existing(worker, [])
     optimizer = SyncSamplesOptimizer(workers)
     for i in range(100):
         optimizer.step()
         result = collect_metrics(worker)
         print("Iteration {}, rew {}".format(i,
                                             result["policy_reward_mean"]))
         print("Total reward", result["episode_reward_mean"])
         if result["episode_reward_mean"] >= 25 * n:
             return
     raise Exception("failed to improve reward")
Example #9
    def __call__(self, samples):
        # Dreamer training loop.
        for n in range(self.dreamer_train_iters):
            print(f"sub-iteration={n}/{self.dreamer_train_iters}")
            batch = self.episode_buffer.sample(self.batch_size)
            # if n == self.dreamer_train_iters - 1:
            #     batch["log_gif"] = True
            fetches = self.worker.learn_on_batch(batch)

        # Custom Logging
        policy_fetches = fetches[DEFAULT_POLICY_ID]["learner_stats"]
        if "log_gif" in policy_fetches:
            gif = policy_fetches["log_gif"]
            policy_fetches["log_gif"] = self.postprocess_gif(gif)

        # Metrics Calculation
        metrics = _get_shared_metrics()
        metrics.info[LEARNER_INFO] = fetches
        metrics.counters[STEPS_SAMPLED_COUNTER] = self.episode_buffer.timesteps
        metrics.counters[STEPS_SAMPLED_COUNTER] *= self.repeat
        res = collect_metrics(local_worker=self.worker)
        res["info"] = metrics.info
        res["info"].update(metrics.counters)
        res["timesteps_total"] = metrics.counters[STEPS_SAMPLED_COUNTER]

        self.episode_buffer.add(samples)
        return res
Example #10
 def _train(self):
     self.optimizer.step()
     FilterManager.synchronize(self.local_evaluator.filters,
                               self.remote_evaluators)
     result = collect_metrics(self.local_evaluator, self.remote_evaluators)
     result = result._replace(info=self.optimizer.stats())
     return result
Example #11
def eval_func(trainer, workers):
    logger.info("Evaluating current policy for {} episodes.".format(
        trainer.config["evaluation_num_episodes"]))
    if trainer.config["evaluation_num_workers"] == 0:
        for _ in range(trainer.config["evaluation_num_episodes"]):
            trainer.evaluation_workers.local_worker().sample()
    else:
        num_rounds = int(
            math.ceil(trainer.config["evaluation_num_episodes"] /
                      trainer.config["evaluation_num_workers"]))
        num_workers = len(trainer.evaluation_workers.remote_workers())
        num_episodes = num_rounds * num_workers
        for i in range(num_rounds):
            logger.info("Running round {} of parallel evaluation "
                        "({}/{} episodes)".format(i, (i + 1) * num_workers,
                                                  num_episodes))
            ray.get([
                w.sample.remote()
                for w in trainer.evaluation_workers.remote_workers()
            ])

    metrics = collect_metrics(
        trainer.evaluation_workers.local_worker(),
        trainer.evaluation_workers.remote_workers(),
    )

    return metrics
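A hedged sketch of how a function with this signature could be plugged in as the custom evaluation hook that the _evaluate() implementation in Example #26 invokes via config["custom_eval_function"]; the surrounding Trainer config below is an assumption for illustration, not taken from the source:

    # Assumed wiring: RLlib calls the hook as
    # custom_eval_function(trainer, eval_workers), matching eval_func above.
    config = {
        "evaluation_interval": 1,        # evaluate every training iteration
        "evaluation_num_workers": 2,
        "evaluation_num_episodes": 10,   # eval_func rounds this up across workers
        "custom_eval_function": eval_func,
    }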
Example #12
 def test_train_external_multi_agent_cartpole_many_policies(self):
     n = 20
     single_env = gym.make("CartPole-v0")
     act_space = single_env.action_space
     obs_space = single_env.observation_space
     policies = {}
     for i in range(20):
         policies["pg_{}".format(i)] = (PGTFPolicy, obs_space, act_space,
                                        {})
     policy_ids = list(policies.keys())
     ev = RolloutWorker(
         env_creator=lambda _: MultiAgentCartPole({"num_agents": n}),
         policy=policies,
         policy_mapping_fn=lambda agent_id: random.choice(policy_ids),
         rollout_fragment_length=100)
     optimizer = SyncSamplesOptimizer(WorkerSet._from_existing(ev))
     for i in range(100):
         optimizer.step()
         result = collect_metrics(ev)
         print("Iteration {}, rew {}".format(i,
                                             result["policy_reward_mean"]))
         print("Total reward", result["episode_reward_mean"])
         if result["episode_reward_mean"] >= 25 * n:
             return
     raise Exception("failed to improve reward")
Example #13
def training_workflow(config, reporter):
    # Setup policy and policy evaluation actors
    env = gym.make("CartPole-v0")
    policy = CustomPolicy(env.observation_space, env.action_space, {})
    workers = [
        PolicyEvaluator.as_remote().remote(lambda c: gym.make("CartPole-v0"),
                                           CustomPolicy)
        for _ in range(config["num_workers"])
    ]

    for _ in range(config["num_iters"]):
        # Broadcast weights to the policy evaluation workers
        weights = ray.put({"default_policy": policy.get_weights()})
        for w in workers:
            w.set_weights.remote(weights)

        # Gather a batch of samples
        T1 = SampleBatch.concat_samples(
            ray.get([w.sample.remote() for w in workers]))
        print("DEBUG* BATCH ************************")
        print(T1)
        print("DEBUG*************************")




        # Improve the policy using the T1 batch
        policy.learn_on_batch(T1)

        reporter(**collect_metrics(remote_evaluators=workers))
Example #14
 def testAutoVectorization(self):
     ev = CommonPolicyEvaluator(
         env_creator=lambda _: MockEnv(episode_length=20),
         policy_graph=MockPolicyGraph,
         batch_mode="truncate_episodes",
         batch_steps=16, num_envs=8)
     for _ in range(8):
         batch = ev.sample()
         self.assertEqual(batch.count, 16)
     result = collect_metrics(ev, [])
     self.assertEqual(result.episodes_total, 0)
     for _ in range(8):
         batch = ev.sample()
         self.assertEqual(batch.count, 16)
     result = collect_metrics(ev, [])
     self.assertEqual(result.episodes_total, 8)
Example #15
    def _train(self):
        def postprocess_samples(batch):
            # Divide by the maximum of value.std() and 1e-4
            # to guard against the case where all values are equal
            value = batch["advantages"]
            standardized = (value - value.mean()) / max(1e-4, value.std())
            batch.data["advantages"] = standardized
            batch.shuffle()
            dummy = np.zeros_like(batch["advantages"])
            if not self.config["use_gae"]:
                batch.data["value_targets"] = dummy
                batch.data["vf_preds"] = dummy
        extra_fetches = self.optimizer.step(postprocess_fn=postprocess_samples)
        kl = np.array(extra_fetches["kl"]).mean(axis=1)[-1]
        total_loss = np.array(extra_fetches["total_loss"]).mean(axis=1)[-1]
        policy_loss = np.array(extra_fetches["policy_loss"]).mean(axis=1)[-1]
        vf_loss = np.array(extra_fetches["vf_loss"]).mean(axis=1)[-1]
        entropy = np.array(extra_fetches["entropy"]).mean(axis=1)[-1]

        newkl = self.local_evaluator.for_policy(lambda pi: pi.update_kl(kl))

        info = {
            "kl_divergence": kl,
            "kl_coefficient": newkl,
            "total_loss": total_loss,
            "policy_loss": policy_loss,
            "vf_loss": vf_loss,
            "entropy": entropy,
        }

        FilterManager.synchronize(
            self.local_evaluator.filters, self.remote_evaluators)
        res = collect_metrics(self.local_evaluator, self.remote_evaluators)
        res = res._replace(info=info)
        return res
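The in-place advantage standardization from postprocess_samples() above, isolated as a small runnable sketch; the 1e-4 floor is what keeps the division well-defined when all advantages are equal:

    import numpy as np

    adv = np.array([2.0, 2.0, 2.0, 2.0])               # all equal -> adv.std() == 0
    standardized = (adv - adv.mean()) / max(1e-4, adv.std())
    print(standardized)                                 # [0. 0. 0. 0.], no NaNs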
Example #16
 def testTrainMultiCartpoleManyPolicies(self):
     n = 20
     env = gym.make("CartPole-v0")
     act_space = env.action_space
     obs_space = env.observation_space
     policies = {}
     for i in range(20):
         policies["pg_{}".format(i)] = (PGPolicyGraph, obs_space, act_space,
                                        {})
     policy_ids = list(policies.keys())
     ev = PolicyEvaluator(
         env_creator=lambda _: MultiCartpole(n),
         policy_graph=policies,
         policy_mapping_fn=lambda agent_id: random.choice(policy_ids),
         batch_steps=100)
     optimizer = SyncSamplesOptimizer(ev, [], {})
     for i in range(100):
         optimizer.step()
         result = collect_metrics(ev)
         print("Iteration {}, rew {}".format(i,
                                             result["policy_reward_mean"]))
         print("Total reward", result["episode_reward_mean"])
         if result["episode_reward_mean"] >= 25 * n:
             return
     raise Exception("failed to improve reward")
Example #17
def test_rllib_batch_policy_eval(init_done=False):
    if not init_done:
        init()
    evaluator = PolicyEvaluator(
        env_creator=lambda _: MultiCarlaEnv(configs),
        # TODO: Remove the hardcoded spaces
        policy_graph={
            "def_policy": (PGPolicyGraph, Box(0.0, 255.0, shape=(84, 84, 3)),
                           Box(-1.0, 1.0, shape=(2, )), {
                               "gamma": 0.99
                           })
        },
        policy_mapping_fn=lambda agent_id: "def_policy",
        batch_mode=BATCH_MODE,
        batch_steps=BATCH_STEPS,
        num_envs=NUM_ENVS)
    for _ in range(NUM_ENVS):
        samples, count = evaluator.sample_with_count()
        # print("sample:", samples.policy_batches["def_policy"]["actions"])
        # count >= BATCH_STEPS for complete_episodes
        # == for truncate_episodes
        if BATCH_MODE == "complete_episodes":
            assert count >= BATCH_STEPS, "Expected count:{}. actual:{}".format(
                BATCH_STEPS, count)
        elif BATCH_MODE == "truncate_episodes":
            assert count == BATCH_STEPS, "Expected count:{}. actual:{}".format(
                BATCH_STEPS, count)
        print("Successfully sampled {} items".format(count))
    results = collect_metrics(evaluator, [])
    print("results: \n", results)
    if BATCH_MODE == "complete_episodes":
        assert (results["episodes"] >= NUM_ENVS), "Expected num episodes:{}," \
         "actual:{}".format(NUM_ENVS, results["episodes"])
Example #18
 def testVectorEnvSupport(self):
     ev = RolloutWorker(
         env_creator=lambda _: MockVectorEnv(episode_length=20, num_envs=8),
         policy=MockPolicy,
         batch_mode="truncate_episodes",
         batch_steps=10)
     for _ in range(8):
         batch = ev.sample()
         self.assertEqual(batch.count, 10)
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 0)
     for _ in range(8):
         batch = ev.sample()
         self.assertEqual(batch.count, 10)
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 8)
Example #19
    def test_reward_clipping(self):
        # Clipping: True (clip between -1.0 and 1.0).
        ev = RolloutWorker(
            env_creator=lambda _: MockEnv2(episode_length=10),
            policy_spec=MockPolicy,
            clip_rewards=True,
            batch_mode="complete_episodes",
        )
        self.assertEqual(max(ev.sample()["rewards"]), 1)
        result = collect_metrics(ev, [])
        self.assertEqual(result["episode_reward_mean"], 1000)
        ev.stop()

        from ray.rllib.examples.env.random_env import RandomEnv

        # Clipping to a custom range (-2.0, 2.0).
        ev2 = RolloutWorker(
            env_creator=lambda _: RandomEnv(
                dict(
                    reward_space=gym.spaces.Box(low=-10, high=10, shape=()),
                    p_done=0.0,
                    max_episode_len=10,
                )
            ),
            policy_spec=MockPolicy,
            clip_rewards=2.0,
            batch_mode="complete_episodes",
        )
        sample = ev2.sample()
        self.assertEqual(max(sample["rewards"]), 2.0)
        self.assertEqual(min(sample["rewards"]), -2.0)
        self.assertLess(np.mean(sample["rewards"]), 0.5)
        self.assertGreater(np.mean(sample["rewards"]), -0.5)
        ev2.stop()

        # Clipping: Off.
        ev2 = RolloutWorker(
            env_creator=lambda _: MockEnv2(episode_length=10),
            policy_spec=MockPolicy,
            clip_rewards=False,
            batch_mode="complete_episodes",
        )
        self.assertEqual(max(ev2.sample()["rewards"]), 100)
        result2 = collect_metrics(ev2, [])
        self.assertEqual(result2["episode_reward_mean"], 1000)
        ev2.stop()
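A rough illustration (not RLlib's implementation) of the clip_rewards semantics this test exercises: True clips to [-1, 1], a float c clips to [-c, c], and False leaves rewards untouched:

    import numpy as np

    def clip_reward_sketch(reward, clip_rewards):
        # Sketch only; mirrors the behavior asserted in test_reward_clipping().
        if clip_rewards is True:
            return np.clip(reward, -1.0, 1.0)
        if clip_rewards is False:
            return reward
        return np.clip(reward, -clip_rewards, clip_rewards)

    assert clip_reward_sketch(100.0, True) == 1.0    # MockEnv2 emits +100 per step
    assert clip_reward_sketch(100.0, False) == 100.0
    assert clip_reward_sketch(7.5, 2.0) == 2.0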
Example #20
    def testRewardClipping(self):
        # clipping on
        ev = RolloutWorker(env_creator=lambda _: MockEnv2(episode_length=10),
                           policy=MockPolicy,
                           clip_rewards=True,
                           batch_mode="complete_episodes")
        self.assertEqual(max(ev.sample()["rewards"]), 1)
        result = collect_metrics(ev, [])
        self.assertEqual(result["episode_reward_mean"], 1000)

        # clipping off
        ev2 = RolloutWorker(env_creator=lambda _: MockEnv2(episode_length=10),
                            policy=MockPolicy,
                            clip_rewards=False,
                            batch_mode="complete_episodes")
        self.assertEqual(max(ev2.sample()["rewards"]), 100)
        result2 = collect_metrics(ev2, [])
        self.assertEqual(result2["episode_reward_mean"], 1000)
Example #21
 def _evaluate(self):
     steps = 0
     self.evaluation_ev.restore(self.local_evaluator.save())
     self.evaluation_ev.foreach_policy(lambda p, _: p.set_epsilon(0))
     while steps < self.evaluation_steps:
         batch = self.evaluation_ev.sample()
         steps += batch.count
     metrics = collect_metrics(self.evaluation_ev)
     return {"evaluation": metrics}
Example #22
 def _evaluate(self):
     logger.info("Evaluating current policy for {} episodes".format(
         self.config["evaluation_num_episodes"]))
     self.evaluation_ev.restore(self.local_evaluator.save())
     self.evaluation_ev.foreach_policy(lambda p, _: p.set_epsilon(0))
     for _ in range(self.config["evaluation_num_episodes"]):
         self.evaluation_ev.sample()
     metrics = collect_metrics(self.evaluation_ev)
     return {"evaluation": metrics}
Example #23
File: dqn.py Project: ywq111520/ray
 def _evaluate(self):
     logger.info("Evaluating current policy for {} episodes".format(
         self.config["evaluation_num_episodes"]))
     self.evaluation_ev.restore(self.local_evaluator.save())
     self.evaluation_ev.foreach_policy(lambda p, _: p.set_epsilon(0))
     for _ in range(self.config["evaluation_num_episodes"]):
         self.evaluation_ev.sample()
     metrics = collect_metrics(self.evaluation_ev)
     return {"evaluation": metrics}
Example #24
    def _testWithOptimizer(self, optimizer_cls):
        n = 3
        env = gym.make("CartPole-v0")
        act_space = env.action_space
        obs_space = env.observation_space
        dqn_config = {"gamma": 0.95, "n_step": 3}
        if optimizer_cls == SyncReplayOptimizer:
            # TODO: support replay with non-DQN graphs. Currently this can't
            # happen since the replay buffer doesn't encode extra fields like
            # "advantages" that PG uses.
            policies = {
                "p1": (DQNPolicyGraph, obs_space, act_space, dqn_config),
                "p2": (DQNPolicyGraph, obs_space, act_space, dqn_config),
            }
        else:
            policies = {
                "p1": (PGPolicyGraph, obs_space, act_space, {}),
                "p2": (DQNPolicyGraph, obs_space, act_space, dqn_config),
            }
        ev = PolicyEvaluator(
            env_creator=lambda _: MultiCartpole(n),
            policy_graph=policies,
            policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2],
            batch_steps=50)
        if optimizer_cls == AsyncGradientsOptimizer:

            def policy_mapper(agent_id):
                return ["p1", "p2"][agent_id % 2]

            remote_evs = [
                PolicyEvaluator.as_remote().remote(
                    env_creator=lambda _: MultiCartpole(n),
                    policy_graph=policies,
                    policy_mapping_fn=policy_mapper,
                    batch_steps=50)
            ]
        else:
            remote_evs = []
        optimizer = optimizer_cls(ev, remote_evs, {})
        for i in range(200):
            ev.foreach_policy(
                lambda p, _: p.set_epsilon(max(0.02, 1 - i * .02))
                if isinstance(p, DQNPolicyGraph) else None)
            optimizer.step()
            result = collect_metrics(ev, remote_evs)
            if i % 20 == 0:
                ev.foreach_policy(
                    lambda p, _: p.update_target()
                    if isinstance(p, DQNPolicyGraph) else None)
                print("Iter {}, rew {}".format(i,
                                               result["policy_reward_mean"]))
                print("Total reward", result["episode_reward_mean"])
            if result["episode_reward_mean"] >= 25 * n:
                return
        print(result)
        raise Exception("failed to improve reward")
Example #25
    def collect_metrics(self):
        """Returns evaluator and optimizer stats.

        Returns:
            res (TrainingResult): TrainingResult from evaluator metrics with
                `info` replaced with stats from self.
        """
        res = collect_metrics(self.local_evaluator, self.remote_evaluators)
        res = res._replace(info=self.stats())
        return res
Example #26
    def _evaluate(self):
        """Evaluates current policy under `evaluation_config` settings.

        Note that this default implementation does not do anything beyond
        merging evaluation_config with the normal trainer config.
        """

        if not self.config["evaluation_config"]:
            raise ValueError(
                "No evaluation_config specified. It doesn't make sense "
                "to enable evaluation without specifying any config "
                "overrides, since the results will be the "
                "same as reported during normal policy evaluation.")

        self._before_evaluate()

        # Broadcast the new policy weights to all evaluation workers.
        logger.info("Synchronizing weights to evaluation workers.")
        weights = ray.put(self.workers.local_worker().save())
        self.evaluation_workers.foreach_worker(
            lambda w: w.restore(ray.get(weights)))
        self._sync_filters_if_needed(self.evaluation_workers)

        if self.config["custom_eval_function"]:
            logger.info("Running custom eval function {}".format(
                self.config["custom_eval_function"]))
            metrics = self.config["custom_eval_function"](
                self, self.evaluation_workers)
            if not metrics or not isinstance(metrics, dict):
                raise ValueError("Custom eval function must return "
                                 "dict of metrics, got {}.".format(metrics))
        else:
            logger.info("Evaluating current policy for {} episodes.".format(
                self.config["evaluation_num_episodes"]))
            if self.config["evaluation_num_workers"] == 0:
                for _ in range(self.config["evaluation_num_episodes"]):
                    self.evaluation_workers.local_worker().sample()
            else:
                num_rounds = int(
                    math.ceil(self.config["evaluation_num_episodes"] /
                              self.config["evaluation_num_workers"]))
                num_workers = len(self.evaluation_workers.remote_workers())
                num_episodes = num_rounds * num_workers
                for i in range(num_rounds):
                    logger.info("Running round {} of parallel evaluation "
                                "({}/{} episodes)".format(
                                    i, (i + 1) * num_workers, num_episodes))
                    ray.get([
                        w.sample.remote()
                        for w in self.evaluation_workers.remote_workers()
                    ])

            metrics = collect_metrics(self.evaluation_workers.local_worker(),
                                      self.evaluation_workers.remote_workers())
        return {"evaluation": metrics}
Example #27
    def _train(self):
        start_timestep = self.global_timestep

        # Update worker explorations
        exp_vals = [self.exploration0.value(self.global_timestep)]
        self.local_evaluator.foreach_trainable_policy(
            lambda p, _: p.set_epsilon(exp_vals[0]))
        for i, e in enumerate(self.remote_evaluators):
            exp_val = self.explorations[i].value(self.global_timestep)
            e.foreach_trainable_policy.remote(
                lambda p, _: p.set_epsilon(exp_val))
            exp_vals.append(exp_val)

        # Do optimization steps
        start = time.time()
        while (self.global_timestep - start_timestep <
               self.config["timesteps_per_iteration"]
               ) or time.time() - start < self.config["min_iter_time_s"]:
            self.optimizer.step()
            self.update_target_if_needed()

        if self.config["per_worker_exploration"]:
            # Only collect metrics from the third of workers with lowest eps
            result = collect_metrics(
                self.local_evaluator,
                self.remote_evaluators[-len(self.remote_evaluators) // 3:],
                timeout_seconds=self.config["collect_metrics_timeout"])
        else:
            result = collect_metrics(
                self.local_evaluator,
                self.remote_evaluators,
                timeout_seconds=self.config["collect_metrics_timeout"])

        result.update(timesteps_this_iter=self.global_timestep -
                      start_timestep,
                      info=dict(
                          {
                              "min_exploration": min(exp_vals),
                              "max_exploration": max(exp_vals),
                              "num_target_updates": self.num_target_updates,
                          }, **self.optimizer.stats()))
        return result
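A quick check of the slice used in the per-worker-exploration branch above, assuming the remote evaluators are ordered so that the last ones carry the lowest epsilon (which is what the comment implies); [-len(...) // 3:] keeps roughly the final third:

    remote_evaluators = ["worker_{}".format(i) for i in range(9)]
    lowest_eps_third = remote_evaluators[-len(remote_evaluators) // 3:]
    print(lowest_eps_third)    # ['worker_6', 'worker_7', 'worker_8']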
Example #28
    def testRewardClipping(self):
        # clipping on
        ev = PolicyEvaluator(
            env_creator=lambda _: MockEnv2(episode_length=10),
            policy_graph=MockPolicyGraph,
            clip_rewards=True,
            batch_mode="complete_episodes")
        self.assertEqual(max(ev.sample()["rewards"]), 1)
        result = collect_metrics(ev, [])
        self.assertEqual(result["episode_reward_mean"], 1000)

        # clipping off
        ev2 = PolicyEvaluator(
            env_creator=lambda _: MockEnv2(episode_length=10),
            policy_graph=MockPolicyGraph,
            clip_rewards=False,
            batch_mode="complete_episodes")
        self.assertEqual(max(ev2.sample()["rewards"]), 100)
        result2 = collect_metrics(ev2, [])
        self.assertEqual(result2["episode_reward_mean"], 1000)
Example #29
    def test_vector_env_support(self):
        # Test a vector env that contains 8 actual envs
        # (MockEnv instances).
        ev = RolloutWorker(
            env_creator=(
                lambda _: VectorizedMockEnv(episode_length=20, num_envs=8)),
            policy_spec=MockPolicy,
            batch_mode="truncate_episodes",
            rollout_fragment_length=10,
        )
        for _ in range(8):
            batch = ev.sample()
            self.assertEqual(batch.count, 10)
        result = collect_metrics(ev, [])
        self.assertEqual(result["episodes_this_iter"], 0)
        for _ in range(8):
            batch = ev.sample()
            self.assertEqual(batch.count, 10)
        result = collect_metrics(ev, [])
        self.assertEqual(result["episodes_this_iter"], 8)
        ev.stop()

        # Test a vector env that pretends(!) to contain 4 envs, but actually
        # only has 1 (CartPole).
        ev = RolloutWorker(
            env_creator=(lambda _: MockVectorEnv(20, mocked_num_envs=4)),
            policy_spec=MockPolicy,
            batch_mode="truncate_episodes",
            rollout_fragment_length=10,
        )
        for _ in range(8):
            batch = ev.sample()
            self.assertEqual(batch.count, 10)
        result = collect_metrics(ev, [])
        self.assertGreater(result["episodes_this_iter"], 3)
        for _ in range(8):
            batch = ev.sample()
            self.assertEqual(batch.count, 10)
        result = collect_metrics(ev, [])
        self.assertGreater(result["episodes_this_iter"], 6)
        ev.stop()
Example #30
def post_process_metrics(adapt_iter, workers, metrics):
    # Obtain current dataset metrics and filter out specific keys.
    name = "_adapt_" + str(adapt_iter) if adapt_iter > 0 else ""

    # Only the remote workers are collecting data.
    res = collect_metrics(remote_workers=workers.remote_workers())

    metrics["episode_reward_max" + str(name)] = res["episode_reward_max"]
    metrics["episode_reward_mean" + str(name)] = res["episode_reward_mean"]
    metrics["episode_reward_min" + str(name)] = res["episode_reward_min"]

    return metrics
Example #31
 def testMetrics(self):
     ev = CommonPolicyEvaluator(
         env_creator=lambda _: MockEnv(episode_length=10),
         policy_graph=MockPolicyGraph, batch_mode="complete_episodes")
     remote_ev = CommonPolicyEvaluator.as_remote().remote(
         env_creator=lambda _: MockEnv(episode_length=10),
         policy_graph=MockPolicyGraph, batch_mode="complete_episodes")
     ev.sample()
     ray.get(remote_ev.sample.remote())
     result = collect_metrics(ev, [remote_ev])
     self.assertEqual(result.episodes_total, 20)
     self.assertEqual(result.episode_reward_mean, 10)
Example #32
def post_process_metrics(prefix, workers, metrics):
    """Update current dataset metrics and filter out specific keys.

    Args:
        prefix: Prefix string to be appended
        workers: Set of workers
        metrics: Current metrics dictionary
    """
    res = collect_metrics(remote_workers=workers.remote_workers())
    for key in METRICS_KEYS:
        metrics[prefix + "_" + key] = res[key]
    return metrics
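A hypothetical usage sketch; METRICS_KEYS and workers are assumed to come from the caller's module and are not defined in the snippet above:

    # Assumed constant, for illustration only; presumably the real list lives
    # alongside the helper in its source module.
    METRICS_KEYS = ["episode_reward_mean", "episode_reward_min", "episode_reward_max"]

    metrics = post_process_metrics("pre_adapt", workers, {})
    # metrics now holds e.g. metrics["pre_adapt_episode_reward_mean"]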
Example #33
 def testAutoVectorization(self):
     ev = PolicyEvaluator(
         env_creator=lambda cfg: MockEnv(episode_length=20, config=cfg),
         policy_graph=MockPolicyGraph,
         batch_mode="truncate_episodes",
         batch_steps=2,
         num_envs=8)
     for _ in range(8):
         batch = ev.sample()
         self.assertEqual(batch.count, 16)
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 0)
     for _ in range(8):
         batch = ev.sample()
         self.assertEqual(batch.count, 16)
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 8)
     indices = []
     for env in ev.async_env.vector_env.envs:
         self.assertEqual(env.unwrapped.config.worker_index, 0)
         indices.append(env.unwrapped.config.vector_index)
     self.assertEqual(indices, [0, 1, 2, 3, 4, 5, 6, 7])
Example #34
 def testAutoVectorization(self):
     ev = RolloutWorker(
         env_creator=lambda cfg: MockEnv(episode_length=20, config=cfg),
         policy=MockPolicy,
         batch_mode="truncate_episodes",
         batch_steps=2,
         num_envs=8)
     for _ in range(8):
         batch = ev.sample()
         self.assertEqual(batch.count, 16)
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 0)
     for _ in range(8):
         batch = ev.sample()
         self.assertEqual(batch.count, 16)
     result = collect_metrics(ev, [])
     self.assertEqual(result["episodes_this_iter"], 8)
     indices = []
     for env in ev.async_env.vector_env.envs:
         self.assertEqual(env.unwrapped.config.worker_index, 0)
         indices.append(env.unwrapped.config.vector_index)
     self.assertEqual(indices, [0, 1, 2, 3, 4, 5, 6, 7])
Example #35
 def testMetrics(self):
     ev = RolloutWorker(env_creator=lambda _: MockEnv(episode_length=10),
                        policy=MockPolicy,
                        batch_mode="complete_episodes")
     remote_ev = RolloutWorker.as_remote().remote(
         env_creator=lambda _: MockEnv(episode_length=10),
         policy=MockPolicy,
         batch_mode="complete_episodes")
     ev.sample()
     ray.get(remote_ev.sample.remote())
     result = collect_metrics(ev, [remote_ev])
     self.assertEqual(result["episodes_this_iter"], 20)
     self.assertEqual(result["episode_reward_mean"], 10)
Example #36
 def testMetrics(self):
     ev = PolicyEvaluator(
         env_creator=lambda _: MockEnv(episode_length=10),
         policy_graph=MockPolicyGraph,
         batch_mode="complete_episodes")
     remote_ev = PolicyEvaluator.as_remote().remote(
         env_creator=lambda _: MockEnv(episode_length=10),
         policy_graph=MockPolicyGraph,
         batch_mode="complete_episodes")
     ev.sample()
     ray.get(remote_ev.sample.remote())
     result = collect_metrics(ev, [remote_ev])
     self.assertEqual(result["episodes_this_iter"], 20)
     self.assertEqual(result["episode_reward_mean"], 10)
Example #37
 def _testWithOptimizer(self, optimizer_cls):
     n = 3
     env = gym.make("CartPole-v0")
     act_space = env.action_space
     obs_space = env.observation_space
     dqn_config = {"gamma": 0.95, "n_step": 3}
     if optimizer_cls == SyncReplayOptimizer:
         # TODO: support replay with non-DQN graphs. Currently this can't
         # happen since the replay buffer doesn't encode extra fields like
         # "advantages" that PG uses.
         policies = {
             "p1": (DQNPolicyGraph, obs_space, act_space, dqn_config),
             "p2": (DQNPolicyGraph, obs_space, act_space, dqn_config),
         }
     else:
         policies = {
             "p1": (PGPolicyGraph, obs_space, act_space, {}),
             "p2": (DQNPolicyGraph, obs_space, act_space, dqn_config),
         }
     ev = PolicyEvaluator(
         env_creator=lambda _: MultiCartpole(n),
         policy_graph=policies,
         policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2],
         batch_steps=50)
     if optimizer_cls == AsyncGradientsOptimizer:
         remote_evs = [PolicyEvaluator.as_remote().remote(
             env_creator=lambda _: MultiCartpole(n),
             policy_graph=policies,
             policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2],
             batch_steps=50)]
     else:
         remote_evs = []
     optimizer = optimizer_cls(ev, remote_evs, {})
     for i in range(200):
         ev.foreach_policy(
             lambda p, _: p.set_epsilon(max(0.02, 1 - i * .02))
             if isinstance(p, DQNPolicyGraph) else None)
         optimizer.step()
         result = collect_metrics(ev, remote_evs)
         if i % 20 == 0:
             ev.foreach_policy(
                 lambda p, _: p.update_target()
                 if isinstance(p, DQNPolicyGraph) else None)
             print("Iter {}, rew {}".format(i, result.policy_reward_mean))
             print("Total reward", result.episode_reward_mean)
         if result.episode_reward_mean >= 25 * n:
             return
     print(result)
     raise Exception("failed to improve reward")
Example #38
 def test_metrics(self):
     # Initialize Ray here so the test can also run standalone under unittest.
     ray.init(num_cpus=5, ignore_reinit_error=True)
     ev = RolloutWorker(env_creator=lambda _: MockEnv(episode_length=10),
                        policy=MockPolicy,
                        batch_mode="complete_episodes")
     remote_ev = RolloutWorker.as_remote().remote(
         env_creator=lambda _: MockEnv(episode_length=10),
         policy=MockPolicy,
         batch_mode="complete_episodes")
     ev.sample()
     ray.get(remote_ev.sample.remote())
     result = collect_metrics(ev, [remote_ev])
     self.assertEqual(result["episodes_this_iter"], 20)
     self.assertEqual(result["episode_reward_mean"], 10)
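The expected values in these metrics tests follow from the setup, assuming the worker's default of roughly 100 timesteps per sample() call (an assumption, since neither worker is given an explicit batch size):

    # MockEnv(episode_length=10) runs 10-step episodes; the asserted
    # episode_reward_mean of 10 implies a reward of 1 per step.
    steps_per_sample, episode_length = 100, 10
    episodes_per_worker = steps_per_sample // episode_length    # 10 complete episodes
    print(episodes_per_worker * 2)    # 20 episodes_this_iter (local + remote worker)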