Code example #1
class TestSampleAverage:
    act_spec = core.Spec([core.Space(shape=(10, ))])
    obs_spec = core.Spec([core.Space(shape=(10, ))])

    def test_explores_with_epsilon_probability(self):
        with patch('rlbox.agent.tab.epsilongreedy.rnd') as sampler:
            sampler.rand.return_value = 0.05
            sampler.rint.return_value = 3

            agent = SampleAverage(self.act_spec, self.obs_spec, 0.1)
            assert agent(None, 0) == 3

    def test_takes_best_action_with_one_less_epsilon_probability(self):
        with patch('rlbox.agent.tab.epsilongreedy.rnd') as sampler:
            sampler.rand.side_effect = [0.05, 0.2]
            sampler.rint.return_value = 3

            agent = SampleAverage(self.act_spec, self.obs_spec, epsilon=0.1)
            action = agent(None, 0)

            assert agent(action, 2.0) == 3

    def test_updates_action_value_with_sample_average_reward(self):
        with patch('rlbox.agent.tab.epsilongreedy.rnd') as sampler:
            sampler.rand.return_value = 0.2
            agent = SampleAverage(self.act_spec, self.obs_spec, epsilon=0.1)

            action = agent(None, 0)
            action = agent(action, -2.0)
            action = agent(action, 3.0)
            agent(action, 1.0)

            assert agent.qtable()[0] == -2.0
            assert agent.qtable()[1] == 2.0
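
The asserted values follow from the incremental sample-average rule Q <- Q + (R - Q) / n. A standalone sketch of that arithmetic (assuming greedy ties break toward the lowest index; none of the rlbox identifiers above are used):

q = [0.0] * 10   # action-value estimates
n = [0] * 10     # per-action visit counts

def update(a, r):
    n[a] += 1
    q[a] += (r - q[a]) / n[a]   # incremental sample average

update(0, -2.0)   # all-zero table: greedy picks arm 0 first
update(1, 3.0)    # arm 0 is now negative, so greedy moves to arm 1
update(1, 1.0)    # arm 1 stays greedy; q[1] = (3.0 + 1.0) / 2
assert q[0] == -2.0 and q[1] == 2.0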
Code example #2
    def test_updates_q_with_whole_quintuple_of_events(self):
        with patch('rlbox.env.racetrack.rnd') as rnd:
            rnd.rand.return_value = 1
            agent = OnPolicySarsa(
                act_spec=core.Spec([core.Space(shape=(2, ))]),
                obs_spec=core.Spec([core.Space(shape=(2, ))])
            )
            agent(0, -1)
            agent(1, -1)

            assert np.all(agent.q == [[-0.5, 0], [0, 0]])
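
The expected -0.5 is a single Sarsa backup, Q(s, a) <- Q(s, a) + alpha * (r + gamma * Q(s', a') - Q(s, a)). A worked sketch, assuming alpha = 0.5 and gamma = 1 (values inferred from the assert, not read from OnPolicySarsa's actual defaults):

alpha, gamma = 0.5, 1.0              # assumed, to match the assert
q = [[0.0, 0.0], [0.0, 0.0]]
s, a, r, s2, a2 = 0, 0, -1.0, 1, 0   # first transition; q[s2][a2] is still 0
q[s][a] += alpha * (r + gamma * q[s2][a2] - q[s][a])
assert q == [[-0.5, 0.0], [0.0, 0.0]]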
Code example #3
File: narmedbandit.py Project: ocraft/rl-sandbox
    def __init__(self, steps, arms, stationary=True, mean=0.0):
        core.Environment.__init__(self,
                                  core.Spec([core.Space(shape=(arms, ))]),
                                  core.Spec([core.Space(shape=(arms, ))]))
        self.bandit = (Bandit(arms, mean)
                       if stationary else NonstationaryBandit(arms, mean))
        self.steps = steps
        self._step = 0
        self.all_rewards = [0.0] * steps
        self.optimal_actions = [0] * steps
        self.last_action = None
        self.reward = 0
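
A hypothetical usage of this constructor (the import path is assumed from the file and project names above, and the step/arm counts are arbitrary):

from rlbox.env.narmedbandit import NArmedBanditEnv  # assumed path

env = NArmedBanditEnv(1000, 10)                     # stationary 10-armed bandit
env = NArmedBanditEnv(1000, 10, stationary=False)   # nonstationary variant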
Code example #4
File: test_ucb.py Project: ocraft/rl-sandbox
    def test_updates_action_value_with_sample_average_reward(self):
        agent = Ucb1(core.Spec([core.Space(shape=(10, ))]),
                     core.Spec([core.Space(shape=(10, ))]),
                     c=2)

        action = agent(None, 0)
        action = agent(action, -2.0)
        action = agent(action, 3.0)
        agent(action, 1.0)

        assert agent.qtable()[0] == -2.0
        assert agent.qtable()[1] == 3.0
        assert agent.qtable()[2] == 1.0
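
Unlike the epsilon-greedy test above, the three rewards here land on three different arms: UCB1 gives an untried arm an unbounded confidence bonus, so arms 0, 1, and 2 are each pulled once before any averaging happens, and every q entry still equals its single observed reward. A minimal sketch of that selection rule (textbook UCB1, not necessarily rlbox's exact code):

import math

def ucb1(q, n, t, c=2.0):
    # Untried arms win outright (infinite confidence bonus).
    for a, pulls in enumerate(n):
        if pulls == 0:
            return a
    return max(range(len(q)),
               key=lambda a: q[a] + c * math.sqrt(math.log(t) / n[a]))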
Code example #5
File: test_ucb.py Project: ocraft/rl-sandbox
    def test_takes_best_action_using_action_value_and_confidence(self):
        agent = Ucb1(core.Spec([core.Space(shape=(2, ))]),
                     core.Spec([core.Space(shape=(2, ))]),
                     c=2)

        action = agent(None, 0)
        assert action == 0
        action = agent(action, 2.0)
        assert action == 1
        action = agent(action, 3.0)
        assert action == 1
        action = agent(action, 0.5)
        assert action == 0
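
The final switch back to arm 0 is plain arithmetic on the UCB1 score q[a] + c * sqrt(ln t / n[a]). A worked check at t = 3 with c = 2 (assuming that standard score):

import math

score = lambda q, n, t, c=2.0: q + c * math.sqrt(math.log(t) / n)
# arm 0: 2.0  + 2 * sqrt(ln 3 / 1) ~ 4.10
# arm 1: 1.75 + 2 * sqrt(ln 3 / 2) ~ 3.23   (q[1] = (3.0 + 0.5) / 2)
assert score(2.0, 1, 3) > score(1.75, 2, 3)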
Code example #6
File: test_core.py Project: ocraft/rl-sandbox
    def test_collects_reward_from_each_step(self):
        arms = 10
        eps = 0.1
        environment = NArmedBanditEnv(10, arms)
        agent = core.Agent(
            epsilongreedy.SampleAverage(
                act_spec=core.Spec([core.Space(shape=(10, ))]),
                obs_spec=core.Spec([core.Space(shape=(10, ))]),
                epsilon=eps),
            lambda env: (env.last_action, env.reward),
            lambda action, env: env.step(action))

        core.Run(agent, environment).start()
        assert len(environment.all_rewards) == 10
        assert len(environment.optimal_actions) == 10
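
core.Agent is wired with a policy plus two adapter lambdas: the first reads (last_action, reward) out of the environment, the second feeds the chosen action back in. A stub sketch of the loop core.Run presumably drives (StubEnv and run are hypothetical, for illustration only):

class StubEnv:
    def __init__(self, steps):
        self.steps = steps
        self.last_action, self.reward = None, 0.0

    def step(self, action):
        self.last_action, self.reward = action, 1.0

def run(policy, env, observe, act):
    for _ in range(env.steps):          # one policy call per step
        act(policy(*observe(env)), env)

run(lambda a, r: 0, StubEnv(10),
    lambda env: (env.last_action, env.reward),
    lambda action, env: env.step(action))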
Code example #7
class TestWeightedAverage:
    act_spec = core.Spec([core.Space(shape=(10, ))])
    obs_spec = core.Spec([core.Space(shape=(10, ))])

    def test_updates_action_value_with_weighted_average_reward(self):
        with patch('rlbox.agent.tab.epsilongreedy.rnd') as sampler:
            sampler.rand.return_value = 0.2
            agent = WeightedAverage(self.act_spec,
                                    self.obs_spec,
                                    epsilon=0.1,
                                    alpha=0.2)

            action = agent(None, 0)
            action = agent(action, -2.0)
            action = agent(action, 3.0)
            agent(action, 1.0)

            assert agent.qtable()[0] == -0.4
            assert agent.qtable()[1] == 0.68
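
Here the update is the constant step-size (recency-weighted) rule Q <- Q + alpha * (R - Q), so older rewards decay instead of being averaged. The asserted values, worked out with alpha = 0.2:

alpha = 0.2
q0 = 0.0 + alpha * (-2.0 - 0.0)   # -0.4
q1 = 0.0 + alpha * (3.0 - 0.0)    # 0.6
q1 += alpha * (1.0 - q1)          # 0.6 + 0.2 * 0.4 = 0.68
assert q0 == -0.4 and round(q1, 2) == 0.68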
Code example #8
class TestGradientBandit:
    act_spec = core.Spec([core.Space(shape=(10, ))])
    obs_spec = core.Spec([core.Space(shape=(10, ))])

    def test_distributes_policy_with_softmax(self):
        agent = GradientBandit(self.act_spec, self.obs_spec, alpha=0.2)

        assert np.all(agent.policy() == np.full(10, 0.1))

    def test_updates_baseline(self):
        agent = GradientBandit(self.act_spec,
                               self.obs_spec,
                               alpha=0.2,
                               baseline=True)

        agent(1, 2.0)
        assert agent.rbase() != 0

    def test_does_not_update_baseline_if_disabled(self):
        agent = GradientBandit(self.act_spec,
                               self.obs_spec,
                               alpha=0.2,
                               baseline=False)

        agent(1, 2.0)
        assert not agent.rbase()

    def test_updates_policy(self):
        agent = GradientBandit(self.act_spec, self.obs_spec, alpha=0.2)

        agent(1, 2.0)
        old_policy = agent.policy()
        agent(4, 8.0)

        assert np.sum(old_policy) == 1
        assert np.sum(agent.policy()) == 1
        assert np.any(old_policy != agent.policy())
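
All four tests are consequences of the gradient-bandit scheme: the policy is a softmax over per-arm preferences H, and each reward nudges H by alpha * (R - baseline) * (1{a taken} - pi). A self-contained sketch of that update (textbook form, not necessarily rlbox's exact code):

import numpy as np

def softmax(h):
    e = np.exp(h - h.max())                        # shift for numerical stability
    return e / e.sum()

h = np.zeros(10)                                   # zero preferences...
assert np.all(softmax(h) == np.full(10, 0.1))      # ...give a uniform policy

alpha, baseline, a, r = 0.2, 0.0, 1, 2.0
pi = softmax(h)
h -= alpha * (r - baseline) * pi                   # push every arm down a little...
h[a] += alpha * (r - baseline)                     # ...net effect: the taken arm rises
assert abs(softmax(h).sum() - 1.0) < 1e-12         # still a probability distribution

With baseline=True a running average of past rewards replaces the 0.0 here, which is what test_updates_baseline and test_does_not_update_baseline_if_disabled observe.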