def mc_control_importance_sampling(
        env: BlackjackEnv,
        num_episodes: float,
        behavior_policy: Callable,
        discount_factor: float = 1.0) -> Tuple[Dict, Callable]:
    """Off-policy Monte Carlo control using importance-sampling weights.

    Episodes are generated by sampling actions from ``behavior_policy`` and
    the action values ``Q`` are then updated by walking each episode
    backwards with a cumulative-weight (``C``) normalised update.

    Args:
        env: Environment providing ``reset()``, ``step(action)`` (returning a
            4-tuple ``(next_state, reward, done, info)``) and
            ``action_space.n``.
        num_episodes: Number of episodes to sample; converted with ``int()``
            so values such as ``1e5`` are accepted.
        behavior_policy: Called with no arguments; must return a sequence of
            action probabilities that can be indexed by action.
        discount_factor: Gamma discount factor applied to the return.

    Returns:
        A tuple ``(Q, target_policy)``. ``Q`` maps a state to an array of
        per-action values; ``target_policy`` is whatever
        ``create_greedy_policy(Q)`` returned (presumably a policy that tracks
        ``Q`` as it is updated -- confirm against that helper).
    """

    def _zero_row():
        # One slot per available action.
        return np.zeros(env.action_space.n)

    Q = defaultdict(_zero_row)
    C = defaultdict(_zero_row)  # cumulative importance weights per (s, a)
    target_policy = create_greedy_policy(Q)

    for _ in tqdm(range(int(num_episodes))):
        # --- generate one episode with the behaviour policy ---------------
        trajectory = []
        current = env.reset()
        while True:
            action_probs = behavior_policy()
            chosen = np.random.choice(
                np.arange(len(action_probs)), p=action_probs)
            successor, reward, finished, _ = env.step(chosen)
            trajectory.append(Step(current, chosen, reward))
            current = successor
            if finished:
                break

        # --- backward pass: accumulate the return and update Q ------------
        ret = 0.0
        weight = 1.0
        for step in reversed(trajectory):
            ret = discount_factor * ret + step.reward

            cum_row = C[step.state]
            q_row = Q[step.state]
            cum_row[step.action] += weight
            q_row[step.action] += (weight / cum_row[step.action]) * (
                ret - q_row[step.action])

            # Once the taken action differs from the target policy's argmax
            # the earlier steps get no further updates from this episode.
            greedy_action = np.argmax(target_policy(step.state))
            if step.action != greedy_action:
                break
            weight = weight / behavior_policy()[step.action]

    return Q, target_policy
def mc_prediction(policy: np.array,
                  env: BlackjackEnv,
                  num_episodes: Union[int, float],
                  discount_factor: float = 1.0) -> Dict[Tuple, float]:
    """First-visit Monte Carlo prediction of a policy's state values.

    Args:
        policy: Called as ``policy(state)``; its return value is passed
            straight to ``env.step``.
            NOTE(review): the signature annotates this as ``np.array`` but the
            code only ever calls it -- the annotation looks stale.
        env: Environment providing ``reset()`` and ``step(action)`` (returning
            a 4-tuple ``(next_state, reward, done, info)``).
        num_episodes: Number of episodes to sample; converted with ``int()``
            so values such as ``1e5`` are accepted.
        discount_factor: Gamma discount factor applied to the return.

    Returns:
        A ``defaultdict(float)`` mapping ``tuple(state)`` to the average
        discounted return observed from the first visit of that state.
    """
    rewards_sum = defaultdict(float)
    rewards_count = defaultdict(float)
    v = defaultdict(float)

    for _ in tqdm(range(int(num_episodes))):
        # Roll out one complete episode under the given policy.
        episode = []
        state = env.reset()
        is_over = False
        while not is_over:
            action = policy(state)
            next_state, reward, is_over, _ = env.step(action)
            episode.append(Step(state, action, reward))
            state = next_state

        # Record where each state first occurs, in a single pass.  States are
        # normalised with tuple() for BOTH the key and the lookup, so
        # non-tuple sequence states (lists, arrays) are matched correctly
        # instead of failing the first-visit search.
        first_visit = {}
        for idx, step in enumerate(episode):
            first_visit.setdefault(tuple(step.state), idx)

        for state_key, start in first_visit.items():
            # Discounted return from the first visit to the end of the episode.
            g = sum(step.reward * (discount_factor ** i)
                    for i, step in enumerate(episode[start:]))
            rewards_sum[state_key] += g
            rewards_count[state_key] += 1.0
            v[state_key] = rewards_sum[state_key] / rewards_count[state_key]

    return v
def mc_control_epsilon_greedy(
        env: BlackjackEnv,
        num_episodes: float,
        discount_factor: float = 1.0,
        epsilon: float = 0.1) -> Tuple[Dict, Callable[[int], np.array]]:
    """First-visit Monte Carlo control with an epsilon-greedy policy.

    Args:
        env: Environment providing ``reset()``, ``step(action)`` (returning a
            4-tuple ``(next_state, reward, done, info)``) and
            ``action_space.n``.
        num_episodes: Number of episodes to sample; converted with ``int()``
            so values such as ``1e5`` are accepted.
        discount_factor: Gamma discount factor applied to the return.
        epsilon: Forwarded to ``make_epsilon_greedy_policy``.

    Returns:
        A tuple ``(Q, policy)``. ``Q`` maps ``tuple(state)`` to an array of
        per-action values (the average first-visit return of each
        state-action pair); ``policy`` is the callable returned by
        ``make_epsilon_greedy_policy`` and yields action probabilities.

    NOTE(review): ``policy`` is called with the raw state while ``Q`` is
    written under ``tuple(state)``; the two agree only when the environment's
    states are already tuples -- confirm for environments other than Blackjack.
    """
    rewards_sum = defaultdict(float)
    rewards_count = defaultdict(float)
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    for _ in tqdm(range(int(num_episodes))):
        # Roll out one complete episode, sampling from the current policy.
        episode = []
        state = env.reset()
        is_over = False
        while not is_over:
            probs = policy(state)
            action = np.random.choice(np.arange(len(probs)), p=probs)
            next_state, reward, is_over, _ = env.step(action)
            episode.append(Step(state, action, reward))
            state = next_state

        # Record where each (state, action) pair first occurs, in one pass.
        # States are normalised with tuple() for BOTH the key and the lookup,
        # so non-tuple sequence states are matched correctly instead of
        # failing the first-visit search.
        first_visit = {}
        for idx, step in enumerate(episode):
            first_visit.setdefault((tuple(step.state), step.action), idx)

        for sa_pair, start in first_visit.items():
            state_key, action = sa_pair
            # Discounted return from the first visit to the end of the episode.
            g = sum(step.reward * (discount_factor ** i)
                    for i, step in enumerate(episode[start:]))
            rewards_sum[sa_pair] += g
            rewards_count[sa_pair] += 1.0
            Q[state_key][action] = rewards_sum[sa_pair] / rewards_count[sa_pair]

    return Q, policy
# -*- coding: utf-8 -*- import gym import matplotlib import numpy as np import sys import matplotlib.pyplot as pl from collections import defaultdict from envs.blackjack import BlackjackEnv from lib import plotting import envs #matplotlib.style.use('ggplot') env = BlackjackEnv() def mc_prediction(policy, env, num_episodes, discount_factor=1.0): """ Monte Carlo prediction algorithm. Calculates the value function for a given policy using sampling. Args: policy: A function that maps an observation to action probabilities. env: OpenAI gym environment. num_episodes: Number of episodes to sample. discount_factor: Gamma discount factor. Returns: A dictionary that maps from state -> value. The state is a tuple and the value is a float. """
import numpy as np
import sys

if "../" not in sys.path:
    sys.path.append("../")
from envs.blackjack import BlackjackEnv

env = BlackjackEnv()


def print_observation(observation):
    """Print the player score, usable-ace flag and dealer score.

    Args:
        observation: A 3-item sequence unpacked as
            ``(score, dealer_score, usable_ace)``.
    """
    score, dealer_score, usable_ace = observation
    print("Player Score: {} (Usable Ace: {}), Dealer Score: {}".format(
        score, usable_ace, dealer_score))


def strategy(observation):
    """Return the action for a fixed threshold strategy.

    Args:
        observation: A 3-item sequence unpacked as
            ``(score, dealer_score, usable_ace)``; only ``score`` is used.

    Returns:
        0 (stick) when the score is at least 20, otherwise 1 (hit).
    """
    score, dealer_score, usable_ace = observation
    # Stick (action 0) if the score is >= 20, hit (action 1) otherwise
    return 0 if score >= 20 else 1


# Play 20 episodes with the threshold strategy, printing every step.
for i_episode in range(20):
    observation = env.reset()
    for t in range(100):
        print_observation(observation)
        action = strategy(observation)
        print("Taking action: {}".format(["Stick", "Hit"][action]))
        observation, reward, done, _ = env.step(action)
        if done:
            print_observation(observation)
            print("Game end. Reward: {}\n".format(float(reward)))
            # The episode is finished: stop stepping this environment instead
            # of continuing to act on a terminated game for the rest of the
            # 100 iterations.
            break