def main():
    env = Blackjack()
    target_policy = init_policy(env)                        # policy being evaluated
    behavior_policy = init_equiprobable_random_policy(env)  # policy generating episodes
    q = init_state_action_map(env)                          # action-value estimates Q(s, a)
    c = init_state_action_map(env)                          # cumulative importance-sampling weights
    for _ in range(20000):
        # sample under the behavior policy, then update the estimates for
        # the target policy via importance sampling
        episode = generate_episode(env, behavior_policy)
        off_policy_evaluation(episode, q, c, target_policy, behavior_policy)
    env.visualize_action_value(q)
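
The helper functions are not shown on this page. A minimal sketch of what off_policy_evaluation could look like, assuming the incremental weighted-importance-sampling update from Sutton and Barto (section 5.6), episodes as lists of (state, action, reward) tuples, policies as dicts mapping a state to an {action: probability} dict, and undiscounted returns (gamma = 1); none of these representations are confirmed by the snippet itself:

def off_policy_evaluation(episode, q, c, target_policy, behavior_policy):
    # Assumed shapes: q and c are dicts keyed by (state, action) tuples.
    g = 0.0  # return accumulated from the tail of the episode
    w = 1.0  # importance-sampling ratio, product of pi(a|s) / b(a|s)
    # walk the episode backwards so g and w can be updated incrementally
    for state, action, reward in reversed(episode):
        g += reward  # gamma = 1, so no discounting
        c[(state, action)] += w
        # incremental weighted-importance-sampling update of Q(s, a)
        q[(state, action)] += (w / c[(state, action)]) * (g - q[(state, action)])
        w *= target_policy[state][action] / behavior_policy[state][action]
        if w == 0.0:
            break  # target policy never takes this action; earlier steps get zero weight
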
def main():
    env = Blackjack()
    # start from an arbitrary deterministic policy and improve it with
    # Monte Carlo policy iteration
    policy = init_deterministic_policy(env)
    q = policy_iteration(env, policy)
    env.visualize_action_value(q)
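
For this deterministic-policy variant, policy_iteration plausibly runs Monte Carlo control with exploring starts (Sutton and Barto, section 5.3), since a deterministic policy cannot explore on its own. A sketch under those assumptions; the exploring_starts keyword, the (0, 1) action encoding, the state-to-action policy dict, and the episode count are all guesses, not code from this repository:

from collections import defaultdict

def policy_iteration(env, policy, num_episodes=500000):
    actions = (0, 1)        # assumed encoding: 0 = stick, 1 = hit
    q = defaultdict(float)  # Q(s, a) estimates
    n = defaultdict(int)    # visit counts for incremental means
    for _ in range(num_episodes):
        # exploring starts: begin each episode from a random (state, action)
        # pair so every pair keeps being visited under a deterministic policy
        episode = generate_episode(env, policy, exploring_starts=True)
        g = 0.0
        for state, action, reward in reversed(episode):
            g += reward
            n[(state, action)] += 1
            q[(state, action)] += (g - q[(state, action)]) / n[(state, action)]
            # policy improvement: act greedily w.r.t. the current estimates
            policy[state] = max(actions, key=lambda a: q[(state, a)])
    return q
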
def main():
    env = Blackjack()
    epsilon = 0.4
    # epsilon-soft starting policy: every action keeps probability of at
    # least epsilon / |A|, so exploration never dies out
    policy = init_epsilon_greedy_policy(env, epsilon)
    q = policy_iteration(env, policy, epsilon)
    env.visualize_action_value(q)
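
Passing epsilon into policy_iteration suggests on-policy Monte Carlo control for epsilon-soft policies (Sutton and Barto, section 5.4), where the improvement step keeps the policy epsilon-greedy rather than fully greedy. A hedged sketch of that step; the function name and the dict-of-dicts policy representation are assumptions:

def make_epsilon_greedy(policy, state, q, actions, epsilon):
    # greedy action under the current action-value estimates
    best = max(actions, key=lambda a: q[(state, a)])
    for a in actions:
        # every action keeps the exploration floor epsilon / |A|
        policy[state][a] = epsilon / len(actions)
    # the greedy action receives the remaining probability mass
    policy[state][best] += 1.0 - epsilon
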
Example #4
def main():
    env = Blackjack()
    # the target policy is learned; the equiprobable behavior policy
    # generates the episodes and guarantees exploration
    target_policy = init_policy(env)
    behavior_policy = init_equiprobable_random_policy(env)
    q = policy_iteration(env, target_policy, behavior_policy)
    env.visualize_action_value(q)
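
With both policies passed in, policy_iteration is presumably off-policy Monte Carlo control with weighted importance sampling (Sutton and Barto, section 5.7): the same backward update as the off_policy_evaluation sketch above, plus a greedy improvement of the target policy. A sketch under the same assumed episode format, here additionally representing the learned target policy as a state-to-action dict, which is also an assumption:

from collections import defaultdict

def policy_iteration(env, target_policy, behavior_policy, num_episodes=500000):
    actions = (0, 1)        # assumed encoding: 0 = stick, 1 = hit
    q = defaultdict(float)  # Q(s, a) estimates
    c = defaultdict(float)  # cumulative importance-sampling weights
    for _ in range(num_episodes):
        episode = generate_episode(env, behavior_policy)
        g, w = 0.0, 1.0
        for state, action, reward in reversed(episode):
            g += reward
            c[(state, action)] += w
            q[(state, action)] += (w / c[(state, action)]) * (g - q[(state, action)])
            # improve the target policy greedily at this state
            target_policy[state] = max(actions, key=lambda a: q[(state, a)])
            if action != target_policy[state]:
                break  # importance weight would be zero under the greedy target
            w /= behavior_policy[state][action]  # pi(a|s) = 1 for the greedy action
    return q
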