class TestSampleAverage:
    """Tests for the epsilon-greedy agent with sample-average Q updates."""

    # Shared specs: a single 10-armed discrete action/observation space.
    act_spec = core.Spec([core.Space(shape=(10, ))])
    obs_spec = core.Spec([core.Space(shape=(10, ))])

    def test_explores_with_epsilon_probability(self):
        """A uniform draw below epsilon triggers a random action."""
        with patch('rlbox.agent.tab.epsilongreedy.rnd') as rng:
            rng.rand.return_value = 0.05  # below epsilon=0.1 -> explore
            rng.rint.return_value = 3     # the arm the "random" draw selects
            agent = SampleAverage(self.act_spec, self.obs_spec, 0.1)
            assert agent(None, 0) == 3

    def test_takes_best_action_with_one_less_epsilon_probability(self):
        """A draw above epsilon picks the greedy (highest-valued) arm."""
        with patch('rlbox.agent.tab.epsilongreedy.rnd') as rng:
            rng.rand.side_effect = [0.05, 0.2]  # explore first, exploit second
            rng.rint.return_value = 3
            agent = SampleAverage(self.act_spec, self.obs_spec, epsilon=0.1)
            first = agent(None, 0)
            # Arm 3 just earned reward 2.0, so the greedy step returns it.
            assert agent(first, 2.0) == 3

    def test_updates_action_value_with_sample_average_reward(self):
        """Each arm's Q estimate is the running mean of its rewards."""
        with patch('rlbox.agent.tab.epsilongreedy.rnd') as rng:
            rng.rand.return_value = 0.2  # always above epsilon -> pure greedy
            agent = SampleAverage(self.act_spec, self.obs_spec, epsilon=0.1)
            action = agent(None, 0)
            for reward in (-2.0, 3.0, 1.0):
                action = agent(action, reward)
            table = agent.qtable()
            assert table[0] == -2.0
            assert table[1] == 2.0
def test_updates_q_with_whole_quintuple_of_events(self):
    """After two steps the Sarsa agent has a full (s, a, r, s', a')
    quintuple and applies exactly one Q-table update."""
    with patch('rlbox.env.racetrack.rnd') as rng:
        rng.rand.return_value = 1  # pin the mocked draw to a constant
        # NOTE(review): shape=(1, 0) looks odd for a 2x2 Q table — confirm
        # against core.Space's expected shape convention.
        agent = OnPolicySarsa(
            act_spec=core.Spec([core.Space(shape=(1, 0))]),
            obs_spec=core.Spec([core.Space(shape=(1, 0))]))
        agent(0, -1)
        agent(1, -1)
        assert np.all(agent.q == [[-0.5, 0], [0, 0]])
def __init__(self, steps, arms, stationary=True, mean=0.0):
    """Build an n-armed bandit environment.

    steps      -- number of interaction steps to record statistics for
    arms       -- number of bandit arms (also the spec shape)
    stationary -- if False, use a NonstationaryBandit instead
    mean       -- mean passed through to the underlying bandit
    """
    act_spec = core.Spec([core.Space(shape=(arms, ))])
    obs_spec = core.Spec([core.Space(shape=(arms, ))])
    core.Environment.__init__(self, act_spec, obs_spec)
    if stationary:
        self.bandit = Bandit(arms, mean)
    else:
        self.bandit = NonstationaryBandit(arms, mean)
    self.steps = steps
    self._step = 0
    # Per-step bookkeeping, pre-sized to the episode length.
    self.all_rewards = [0.0] * steps
    self.optimal_actions = [0] * steps
    self.last_action = None
    self.reward = 0
def test_updates_action_value_with_sample_average_reward(self):
    """Each UCB1 arm's Q estimate equals the average reward it received."""
    arm_shape = (10, )
    agent = Ucb1(core.Spec([core.Space(shape=arm_shape)]),
                 core.Spec([core.Space(shape=arm_shape)]),
                 c=2)
    action = agent(None, 0)
    for reward in (-2.0, 3.0, 1.0):
        action = agent(action, reward)
    table = agent.qtable()
    assert table[0] == -2.0
    assert table[1] == 3.0
    assert table[2] == 1.0
def test_takes_best_action_using_action_value_and_confidence(self):
    """UCB1 trades the value estimate against the confidence bonus."""
    agent = Ucb1(core.Spec([core.Space(shape=(2, ))]),
                 core.Spec([core.Space(shape=(2, ))]),
                 c=2)
    action = agent(None, 0)
    assert action == 0
    # Feed rewards one by one and check the arm chosen after each.
    for reward, expected in ((2.0, 1), (3.0, 1), (0.5, 0)):
        action = agent(action, reward)
        assert action == expected
def test_collects_reward_from_each_step(self):
    """A full run records one reward and one optimal-action flag per step.

    Fix: the original used the magic number 10 for two distinct things
    (step count and arm count) — `NArmedBanditEnv(10, arms)` and the
    hard-coded `shape=(10, )` / `== 10` assertions obscured which was
    which. Name `steps` and use `arms` consistently; behavior unchanged.
    """
    steps = 10
    arms = 10
    eps = 0.1
    environment = NArmedBanditEnv(steps, arms)
    agent = core.Agent(
        epsilongreedy.SampleAverage(
            act_spec=core.Spec([core.Space(shape=(arms, ))]),
            obs_spec=core.Spec([core.Space(shape=(arms, ))]),
            epsilon=eps),
        # Observation: the env's last action and the reward it produced.
        lambda env: (env.last_action, env.reward),
        # Effect: apply the chosen action to the environment.
        lambda action, env: env.step(action))
    core.Run(agent, environment).start()
    assert len(environment.all_rewards) == steps
    assert len(environment.optimal_actions) == steps
class TestWeightedAverage:
    """Tests for the epsilon-greedy agent with constant-alpha Q updates."""

    # Shared specs: a single 10-armed discrete action/observation space.
    act_spec = core.Spec([core.Space(shape=(10, ))])
    obs_spec = core.Spec([core.Space(shape=(10, ))])

    def test_updates_action_value_with_weighted_average_reward(self):
        """Q estimates move by a constant step size toward each reward."""
        with patch('rlbox.agent.tab.epsilongreedy.rnd') as rng:
            rng.rand.return_value = 0.2  # above epsilon -> greedy path only
            agent = WeightedAverage(self.act_spec, self.obs_spec,
                                    epsilon=0.1, alpha=0.2)
            action = agent(None, 0)
            for reward in (-2.0, 3.0, 1.0):
                action = agent(action, reward)
            table = agent.qtable()
            assert table[0] == -0.4
            assert table[1] == 0.68
class TestGradientBandit:
    """Tests for the gradient bandit (preference/softmax) agent."""

    # Shared specs: a single 10-armed discrete action/observation space.
    act_spec = core.Spec([core.Space(shape=(10, ))])
    obs_spec = core.Spec([core.Space(shape=(10, ))])

    def test_distributes_policy_with_softmax(self):
        """With no updates yet, the softmax policy is uniform over 10 arms."""
        bandit = GradientBandit(self.act_spec, self.obs_spec, alpha=0.2)
        assert np.all(bandit.policy() == np.full(10, 0.1))

    def test_updates_baseline(self):
        """A rewarded step moves the baseline away from zero when enabled."""
        bandit = GradientBandit(self.act_spec, self.obs_spec,
                                alpha=0.2, baseline=True)
        bandit(1, 2.0)
        assert bandit.rbase() != 0

    def test_does_not_update_baseline_if_disabled(self):
        """With baseline=False the baseline stays falsy after a reward."""
        bandit = GradientBandit(self.act_spec, self.obs_spec,
                                alpha=0.2, baseline=False)
        bandit(1, 2.0)
        assert not bandit.rbase()

    def test_updates_policy(self):
        """Rewards reshape the policy while it stays a valid distribution."""
        bandit = GradientBandit(self.act_spec, self.obs_spec, alpha=0.2)
        bandit(1, 2.0)
        before = bandit.policy()
        bandit(4, 8.0)
        after = bandit.policy()
        assert np.sum(before) == 1
        assert np.sum(after) == 1
        assert np.any(before != after)