Exemple #1
0
def test_egreedy_update_params():
    policy = EpsilonGreedy(n_actions=2, epsilon=1.0)
    policy.action_counts_temp = np.array([4, 3])
    policy.action_counts = np.copy(policy.action_counts_temp)
    policy.reward_counts_temp = np.array([2.0, 0.0])
    policy.reward_counts = np.copy(policy.reward_counts_temp)
    action = 0
    reward = 1.0
    policy.update_params(action, reward)
    assert np.array_equal(policy.action_counts, np.array([5, 3]))
    assert np.allclose(policy.reward_counts, np.array([2.0 + reward, 0.0]))
Exemple #2
0
def test_egreedy_update_params():
    policy = EpsilonGreedy(n_actions=2, epsilon=1.0)
    policy.action_counts_temp = np.array([4, 3])
    policy.action_counts = np.copy(policy.action_counts_temp)
    policy.reward_counts_temp = np.array([2.0, 0.0])
    policy.reward_counts = np.copy(policy.reward_counts_temp)
    action = 0
    reward = 1.0
    policy.update_params(action, reward)
    assert np.array_equal(policy.action_counts, np.array([5, 3]))
    # in epsilon greedy, reward_counts is defined as the mean of observed rewards for each action
    next_reward = (2.0 * (5 - 1) / 5) + (reward / 5)
    assert np.allclose(policy.reward_counts, np.array([next_reward, 0.0]))