Example #1
    def test_polynomial_schedule(self):
        ts = [0, 5, 10, 100, 90, 2, 1, 99, 23, 1000]
        expected = [
            0.5 + (2.0 - 0.5) * (1.0 - min(t, 100) / 100)**2 for t in ts
        ]
        config = dict(
            type=
            "ray.rllib.utils.schedules.polynomial_schedule.PolynomialSchedule",
            schedule_timesteps=100,
            initial_p=2.0,
            final_p=0.5,
            power=2.0,
        )

        for fw in framework_iterator(
                frameworks=["tf2", "tf", "tfe", "torch", None]):
            polynomial = from_config(config, framework=fw)
            for t, e in zip(ts, expected):
                out = polynomial(t)
                check(out, e, decimals=4)

            ts_as_tensors = self._get_framework_tensors(ts, fw)
            for t, e in zip(ts_as_tensors, expected):
                out = polynomial(t)
                assert fw != "tf" or isinstance(out, tf.Tensor)
                check(out, e, decimals=4)
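For reference, the `expected` list built above is just the polynomial-decay formula: interpolate from initial_p down to final_p over schedule_timesteps, then hold final_p. A minimal, framework-free sketch of that math (only the formula the test asserts against, not RLlib's PolynomialSchedule code; to my understanding the LinearSchedule examples further down are the same interpolation with power=1.0):

# Standalone sketch of the decay encoded in the test's `expected` list.
def polynomial_value(t, schedule_timesteps=100, initial_p=2.0, final_p=0.5, power=2.0):
    t = min(t, schedule_timesteps)  # After the schedule ends, stay at final_p.
    frac = 1.0 - t / schedule_timesteps
    return final_p + (initial_p - final_p) * frac**power

assert abs(polynomial_value(0) - 2.0) < 1e-6    # Starts at initial_p.
assert abs(polynomial_value(100) - 0.5) < 1e-6  # Ends (and stays) at final_p.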
Example #2
    def test_pg_fake_multi_gpu_learning(self):
        """Test whether PGTrainer can learn CartPole w/ faked multi-GPU."""
        config = copy.deepcopy(pg.DEFAULT_CONFIG)

        # Fake GPU setup.
        config["num_gpus"] = 2
        config["_fake_gpus"] = True

        # Mimic tuned_example for PG CartPole.
        config["model"]["fcnet_hiddens"] = [64]
        config["model"]["fcnet_activation"] = "linear"

        for _ in framework_iterator(config, frameworks=("tf", "torch")):
            trainer = pg.PGTrainer(config=config, env="CartPole-v0")
            num_iterations = 300
            learnt = False
            for i in range(num_iterations):
                results = trainer.train()
                print("reward={}".format(results["episode_reward_mean"]))
                # Keep this test short: stop as soon as the mean reward exceeds 65.0.
                if results["episode_reward_mean"] > 65.0:
                    learnt = True
                    break
            assert learnt,\
                "PG multi-GPU (with fake-GPUs) did not learn CartPole!"
            trainer.stop()
Example #3
def do_test_explorations(
    run, env, config, dummy_obs, prev_a=None, expected_mean_action=None
):
    """Calls an Agent's `compute_actions` with different `explore` options."""

    core_config = config.copy()
    if run not in [a3c.A3CTrainer]:
        core_config["num_workers"] = 0

    # Test all frameworks.
    for _ in framework_iterator(core_config):
        print("Agent={}".format(run))

        # Test for both the default Agent's exploration AND the `Random`
        # exploration class.
        for exploration in [None, "Random"]:
            local_config = core_config.copy()
            if exploration == "Random":
                # TODO(sven): Random doesn't work for IMPALA yet.
                if run is impala.ImpalaTrainer:
                    continue
                local_config["exploration_config"] = {"type": "Random"}
            print("exploration={}".format(exploration or "default"))

            trainer = run(config=local_config, env=env)

            # Make sure all actions drawn are the same, given same
            # observations.
            actions = []
            for _ in range(25):
                actions.append(
                    trainer.compute_single_action(
                        observation=dummy_obs,
                        explore=False,
                        prev_action=prev_a,
                        prev_reward=1.0 if prev_a is not None else None,
                    )
                )
                check(actions[-1], actions[0])

            # Make sure actions drawn are different
            # (around some mean value), given constant observations.
            actions = []
            for _ in range(500):
                actions.append(
                    trainer.compute_single_action(
                        observation=dummy_obs,
                        explore=True,
                        prev_action=prev_a,
                        prev_reward=1.0 if prev_a is not None else None,
                    )
                )
            check(
                np.mean(actions),
                expected_mean_action if expected_mean_action is not None else 0.5,
                atol=0.4,
            )
            # Check that the stddev is not 0.0 (values differ).
            check(np.std(actions), 0.0, false=True)
Example #4
    def test_linear_schedule(self):
        ts = [0, 50, 10, 100, 90, 2, 1, 99, 23, 1000]
        config = {"schedule_timesteps": 100, "initial_p": 2.1, "final_p": 0.6}

        for fw in framework_iterator(frameworks=["tf", "tfe", "torch", None]):
            fw_ = fw if fw != "tfe" else "tf"
            linear = from_config(LinearSchedule, config, framework=fw_)
            for t in ts:
                out = linear(t)
                check(out, 2.1 - (min(t, 100) / 100) * (2.1 - 0.6), decimals=4)
Example #5
    def test_pg_compilation(self):
        """Test whether a PGTrainer can be built with both frameworks."""
        config = pg.DEFAULT_CONFIG.copy()
        config["num_workers"] = 0  # Run locally.
        num_iterations = 2

        for _ in framework_iterator(config):
            trainer = pg.PGTrainer(config=config, env="CartPole-v0")
            for i in range(num_iterations):
                trainer.train()
Example #6
    def test_exponential_schedule(self):
        ts = [0, 5, 10, 100, 90, 2, 1, 99, 23]
        config = dict(initial_p=2.0, decay_rate=0.99, schedule_timesteps=100)

        for fw in framework_iterator(frameworks=["tf", "tfe", "torch", None]):
            fw_ = fw if fw != "tfe" else "tf"
            exponential = from_config(
                ExponentialSchedule, config, framework=fw_)
            for t in ts:
                out = exponential(t)
                check(out, 2.0 * 0.99**(t / 100), decimals=4)
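The asserted values follow a plain exponential decay: initial_p times decay_rate raised to the fraction of the schedule elapsed. A standalone sketch of just that formula (not the ExponentialSchedule class; note the test's expected values apply no clipping of t):

def exponential_value(t, initial_p=2.0, decay_rate=0.99, schedule_timesteps=100):
    # initial_p * decay_rate^(t / schedule_timesteps)
    return initial_p * decay_rate ** (t / schedule_timesteps)

print(exponential_value(0))    # 2.0
print(exponential_value(100))  # 2.0 * 0.99 = 1.98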
Example #7
    def test_constant_schedule(self):
        value = 2.3
        ts = [100, 0, 10, 2, 3, 4, 99, 56, 10000, 23, 234, 56]

        config = {"value": value}

        for fw in framework_iterator(frameworks=["tf", "tfe", "torch", None]):
            fw_ = fw if fw != "tfe" else "tf"
            constant = from_config(ConstantSchedule, config, framework=fw_)
            for t in ts:
                out = constant(t)
                check(out, value)
Example #8
    def test_pg_compilation(self):
        """Test whether a PGTrainer can be built with both frameworks."""
        config = pg.DEFAULT_CONFIG.copy()
        config["num_workers"] = 0  # Run locally.
        num_iterations = 2

        for _ in framework_iterator(config):
            trainer = pg.PGTrainer(config=config, env="CartPole-v0")
            for i in range(num_iterations):
                print(trainer.train())
            check_compute_single_action(
                trainer, include_prev_action_reward=True)
Example #9
    def test_piecewise_schedule(self):
        ts = [0, 5, 10, 100, 90, 2, 1, 99, 27]
        expected = [50.0, 60.0, 70.0, 14.5, 14.5, 54.0, 52.0, 14.5, 140.0]
        config = dict(endpoints=[(0, 50.0), (25, 100.0), (30, 200.0)],
                      outside_value=14.5)

        for fw in framework_iterator(frameworks=["tf", "tfe", "torch", None]):
            fw_ = fw if fw != "tfe" else "tf"
            piecewise = from_config(PiecewiseSchedule, config, framework=fw_)
            for t, e in zip(ts, expected):
                out = piecewise(t)
                check(out, e, decimals=4)
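The `expected` list above comes from linear interpolation between consecutive endpoints, with outside_value returned once t passes the last endpoint. A small plain-Python sketch of that interpolation (an approximation of what PiecewiseSchedule computes, to show where e.g. 60.0 and 140.0 come from):

def piecewise_value(t, endpoints, outside_value):
    # Linearly interpolate between consecutive (timestep, value) endpoints.
    for (l_t, l_v), (r_t, r_v) in zip(endpoints[:-1], endpoints[1:]):
        if l_t <= t < r_t:
            alpha = (t - l_t) / (r_t - l_t)
            return l_v + alpha * (r_v - l_v)
    return outside_value  # t lies outside all segments.

endpoints = [(0, 50.0), (25, 100.0), (30, 200.0)]
print(piecewise_value(5, endpoints, 14.5))    # 60.0  = 50 + (5/25) * (100 - 50)
print(piecewise_value(27, endpoints, 14.5))   # 140.0 = 100 + (2/5) * (200 - 100)
print(piecewise_value(100, endpoints, 14.5))  # 14.5  (past the last endpoint)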
Example #10
    def test_pg_compilation(self):
        """Test whether a PGTrainer can be built with both frameworks."""
        config = pg.DEFAULT_CONFIG.copy()
        config["num_workers"] = 1
        config["rollout_fragment_length"] = 500
        num_iterations = 1

        for _ in framework_iterator(config):
            for env in ["FrozenLake-v0", "CartPole-v0"]:
                trainer = pg.PGTrainer(config=config, env=env)
                for i in range(num_iterations):
                    print(trainer.train())
                check_compute_single_action(
                    trainer, include_prev_action_reward=True)
Example #11
    def test_pg_compilation(self):
        """Test whether a PGTrainer can be built with both frameworks."""
        config = pg.DEFAULT_CONFIG.copy()
        config["num_workers"] = 0
        num_iterations = 2

        for fw in framework_iterator(config):
            # For tf, build with fake-GPUs.
            config["_fake_gpus"] = fw == "tf"
            config["num_gpus"] = 2 if fw == "tf" else 0
            trainer = pg.PGTrainer(config=config, env="CartPole-v0")
            for i in range(num_iterations):
                print(trainer.train())
            check_compute_single_action(trainer,
                                        include_prev_action_reward=True)
Example #12
    def test_polynomial_schedule(self):
        ts = [0, 5, 10, 100, 90, 2, 1, 99, 23, 1000]
        config = dict(type="ray.rllib.utils.schedules.polynomial_schedule."
                      "PolynomialSchedule",
                      schedule_timesteps=100,
                      initial_p=2.0,
                      final_p=0.5,
                      power=2.0)

        for fw in framework_iterator(frameworks=["tf", "tfe", "torch", None]):
            fw_ = fw if fw != "tfe" else "tf"
            polynomial = from_config(config, framework=fw_)
            for t in ts:
                out = polynomial(t)
                t = min(t, 100)
                check(out, 0.5 + (2.0 - 0.5) * (1.0 - t / 100)**2, decimals=4)
Example #13
    def test_linear_schedule(self):
        ts = [0, 50, 10, 100, 90, 2, 1, 99, 23, 1000]
        expected = [2.1 - (min(t, 100) / 100) * (2.1 - 0.6) for t in ts]
        config = {"schedule_timesteps": 100, "initial_p": 2.1, "final_p": 0.6}

        for fw in framework_iterator(
                frameworks=["tf2", "tf", "tfe", "torch", None]):
            linear = from_config(LinearSchedule, config, framework=fw)
            for t, e in zip(ts, expected):
                out = linear(t)
                check(out, e, decimals=4)

            ts_as_tensors = self._get_framework_tensors(ts, fw)
            for t, e in zip(ts_as_tensors, expected):
                out = linear(t)
                assert fw != "tf" or isinstance(out, tf.Tensor)
                check(out, e, decimals=4)
Example #14
    def test_piecewise_schedule(self):
        ts = [0, 5, 10, 100, 90, 2, 1, 99, 27]
        expected = [50.0, 60.0, 70.0, 14.5, 14.5, 54.0, 52.0, 14.5, 140.0]
        config = dict(endpoints=[(0, 50.0), (25, 100.0), (30, 200.0)],
                      outside_value=14.5)

        for fw in framework_iterator(
                frameworks=["tf2", "tf", "tfe", "torch", None]):
            piecewise = from_config(PiecewiseSchedule, config, framework=fw)
            for t, e in zip(ts, expected):
                out = piecewise(t)
                check(out, e, decimals=4)

            ts_as_tensors = self._get_framework_tensors(ts, fw)
            for t, e in zip(ts_as_tensors, expected):
                out = piecewise(t)
                assert fw != "tf" or isinstance(out, tf.Tensor)
                check(out, e, decimals=4)
Example #15
    def test_constant_schedule(self):
        value = 2.3
        ts = [100, 0, 10, 2, 3, 4, 99, 56, 10000, 23, 234, 56]

        config = {"value": value}

        for fw in framework_iterator(
                frameworks=["tf2", "tf", "tfe", "torch", None]):
            constant = from_config(ConstantSchedule, config, framework=fw)
            for t in ts:
                out = constant(t)
                check(out, value)

            ts_as_tensors = self._get_framework_tensors(ts, fw)
            for t in ts_as_tensors:
                out = constant(t)
                assert fw != "tf" or isinstance(out, tf.Tensor)
                check(out, value, decimals=4)
Example #16
    def test_exponential_schedule(self):
        decay_rate = 0.2
        ts = [0, 5, 10, 100, 90, 2, 1, 99, 23]
        expected = [2.0 * decay_rate**(t / 100) for t in ts]
        config = dict(initial_p=2.0,
                      decay_rate=decay_rate,
                      schedule_timesteps=100)

        for fw in framework_iterator(
                frameworks=["tf2", "tf", "tfe", "torch", None]):
            exponential = from_config(ExponentialSchedule,
                                      config,
                                      framework=fw)
            for t, e in zip(ts, expected):
                out = exponential(t)
                check(out, e, decimals=4)

            ts_as_tensors = self._get_framework_tensors(ts, fw)
            for t, e in zip(ts_as_tensors, expected):
                out = exponential(t)
                assert fw != "tf" or isinstance(out, tf.Tensor)
                check(out, e, decimals=4)
Example #17
    def test_pg_loss_functions(self):
        """Tests the PG loss function math."""
        config = pg.DEFAULT_CONFIG.copy()
        config["num_workers"] = 0  # Run locally.
        config["gamma"] = 0.99
        config["model"]["fcnet_hiddens"] = [10]
        config["model"]["fcnet_activation"] = "linear"

        # Fake CartPole episode of n time steps.
        train_batch = SampleBatch({
            SampleBatch.OBS:
            np.array([[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
                      [0.9, 1.0, 1.1, 1.2]]),
            SampleBatch.ACTIONS:
            np.array([0, 1, 1]),
            SampleBatch.REWARDS:
            np.array([1.0, 1.0, 1.0]),
            SampleBatch.DONES:
            np.array([False, False, True]),
            SampleBatch.EPS_ID:
            np.array([1234, 1234, 1234]),
            SampleBatch.AGENT_INDEX:
            np.array([0, 0, 0]),
        })

        for fw, sess in framework_iterator(config, session=True):
            dist_cls = (Categorical if fw != "torch" else TorchCategorical)
            trainer = pg.PGTrainer(config=config, env="CartPole-v0")
            policy = trainer.get_policy()
            vars = policy.model.trainable_variables()
            if sess:
                vars = policy.get_session().run(vars)

            # Post-process (calculate simple (non-GAE) advantages) and attach
            # to train_batch dict.
            # A = [0.99^2 * 1.0 + 0.99 * 1.0 + 1.0, 0.99 * 1.0 + 1.0, 1.0] =
            # [2.9701, 1.99, 1.0]
            train_batch_ = pg.post_process_advantages(policy,
                                                      train_batch.copy())
            if fw == "torch":
                train_batch_ = policy._lazy_tensor_dict(train_batch_)

            # Check Advantage values.
            check(train_batch_[Postprocessing.ADVANTAGES], [2.9701, 1.99, 1.0])

            # Actual loss results.
            if sess:
                results = policy.get_session().run(
                    policy._loss,
                    feed_dict=policy._get_loss_inputs_dict(train_batch_,
                                                           shuffle=False))
            else:
                results = (pg.pg_tf_loss if fw in ["tf2", "tfe"] else
                           pg.pg_torch_loss)(policy,
                                             policy.model,
                                             dist_class=dist_cls,
                                             train_batch=train_batch_)

            # Calculate expected results.
            if fw != "torch":
                expected_logits = fc(fc(train_batch_[SampleBatch.OBS],
                                        vars[0],
                                        vars[1],
                                        framework=fw),
                                     vars[2],
                                     vars[3],
                                     framework=fw)
            else:
                expected_logits = fc(fc(train_batch_[SampleBatch.OBS],
                                        vars[2],
                                        vars[3],
                                        framework=fw),
                                     vars[0],
                                     vars[1],
                                     framework=fw)
            expected_logp = dist_cls(expected_logits, policy.model).logp(
                train_batch_[SampleBatch.ACTIONS])
            adv = train_batch_[Postprocessing.ADVANTAGES]
            if sess:
                expected_logp = sess.run(expected_logp)
            elif fw == "torch":
                expected_logp = expected_logp.detach().cpu().numpy()
                adv = adv.detach().cpu().numpy()
            else:
                expected_logp = expected_logp.numpy()
            expected_loss = -np.mean(expected_logp * adv)
            check(results, expected_loss, decimals=4)
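To make the advantage comment above concrete: with no value-function baseline, post_process_advantages fills ADVANTAGES with the discounted reward-to-go at each step (gamma = 0.99), and the vanilla PG loss is the negative mean of log-prob times advantage. A framework-free sketch of those two steps for this 3-step episode (illustrative only, not the RLlib implementation):

import numpy as np

gamma = 0.99
rewards = np.array([1.0, 1.0, 1.0])

# Discounted reward-to-go: R_t = sum_k gamma^k * r_{t+k}.
advantages = np.array([
    sum(gamma**k * r for k, r in enumerate(rewards[t:]))
    for t in range(len(rewards))
])
print(advantages)  # approx. [2.9701, 1.99, 1.0], matching the check above.

# Vanilla policy-gradient loss: -mean(log pi(a|s) * advantage).
def pg_loss(logp, advantages):
    return -np.mean(logp * advantages)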