# Shared imports assumed by every listing in this section.
import heapq
from itertools import product, tee

import numpy as np


def least_squares_td(env, policy, epsilon, alpha, gamma, n_episodes, tile_coder):
    # Initialization.
    n = tile_coder.total_n_tiles
    A = (1 / epsilon) * np.eye(n)
    b = np.zeros((n, 1))
    d = np.zeros((n, 1))
    # Inverse of the initial A, maintained incrementally below.
    inv_A = epsilon * np.eye(n)
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        while not done:
            feature_vectors = tile_coder.get_feature_vectors_for_actions(obs, \
                env.action_space_size)
            a = policy.greedy_action(feature_vectors)
            obs_prime, reward, done = env.step(a)
            x = np.array(tile_coder.get_tile_code(obs)).reshape(-1, 1)
            x_prime = np.array(tile_coder.get_tile_code(obs_prime)).reshape(-1, 1)
            b = b + reward * x
            d = x - gamma * x_prime
            A = A + x @ d.T
            if env.steps == 2:
                inv_A = np.linalg.inv(A)
            else:
                # Sherman-Morrison update of the inverse after the rank-one
                # increment A + x d^T.
                t = np.eye(n) - ((x @ d.T) / (1 + (d.T @ inv_A) @ x)) @ inv_A
                inv_A = inv_A @ t
            theta = inv_A @ b
            obs = obs_prime
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    v = LinearValueFunction(tile_coder.total_n_tiles)
    v.weights = theta.flatten()
    return v

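# The listings in this section call several helpers that are not shown here
# (LinearValueFunction, print_episode, the encoders, the policies, the tile
# coder). Where a sketch is given below, it is a minimal implementation
# inferred from the call sites, not the original code. First, a linear value
# function v(s) = w . x(s) and the progress printer:

class LinearValueFunction:
    '''State-value function v(s) = w . x(s) over a fixed-length feature vector.'''

    def __init__(self, n_features):
        self.weights = np.zeros(n_features)

    def evaluate(self, x):
        # x may be a plain list (e.g. a tile code), so coerce it to an array.
        return np.dot(self.weights, np.asarray(x, dtype=float))


def print_episode(episode, n_episodes):
    '''Overwrite a single console line with training progress.'''
    print(f'\rEpisode {episode}/{n_episodes}', end='', flush=True)
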
def actor_critic_eligibility_traces(env, eta, alpha_th, alpha_w, lambda_th, \
        lambda_w, gamma, n_episodes):
    # Continuing (average-reward) actor-critic: gamma is unused here because
    # discounting is replaced by the average-reward baseline R_bar.
    policy = ExponentialSoftmax(env.observation_space_size * env.action_space_size)
    v = LinearValueFunction(env.observation_space_size)
    z_th = np.zeros(env.observation_space_size * env.action_space_size)
    z_w = np.zeros(env.observation_space_size)
    R_bar = 0
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        obs_vec = encode_state(obs, env.observation_space_size)
        while not done:
            sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size, \
                env.action_space_size) for a in range(env.action_space_size)]
            a = policy.sample_action(sa_pairs)
            obs_prime, reward, done = env.step(a)
            obs_prime_vec = encode_state(obs_prime, env.observation_space_size)
            delta = reward - R_bar + v.evaluate(obs_prime_vec) - v.evaluate(obs_vec)
            R_bar += eta * delta
            z_w = lambda_w * z_w + obs_vec
            z_th = lambda_th * z_th + policy.eligibility_vector(a, sa_pairs)
            v.weights += alpha_w * delta * z_w
            policy.weights += alpha_th * delta * z_th
            obs_vec = obs_prime_vec
            obs = obs_prime
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy

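# The policy-gradient methods above rely on a linear softmax policy and on
# one-hot encoders for states and state-action pairs, none of which appear in
# this section. The following is a minimal sketch consistent with the call
# sites (sample_action over a list of per-action feature vectors;
# eligibility_vector(a, sa_pairs) returning grad log pi). Treat it as an
# assumed implementation.

def encode_state(obs, n_states):
    '''One-hot encoding of a discrete state.'''
    x = np.zeros(n_states)
    x[obs] = 1.0
    return x


def encode_sa_pair(obs, a, n_states, n_actions):
    '''One-hot encoding of a discrete (state, action) pair.'''
    x = np.zeros(n_states * n_actions)
    x[obs * n_actions + a] = 1.0
    return x


class ExponentialSoftmax:
    '''Linear softmax policy: pi(a|s) proportional to exp(theta . x(s, a)).'''

    def __init__(self, n_features):
        self.weights = np.zeros(n_features)

    def _probabilities(self, sa_pairs):
        prefs = np.array([np.dot(self.weights, x) for x in sa_pairs])
        prefs -= prefs.max()  # Subtract the max for numerical stability.
        exp = np.exp(prefs)
        return exp / exp.sum()

    def sample_action(self, sa_pairs):
        return np.random.choice(len(sa_pairs), p=self._probabilities(sa_pairs))

    def eligibility_vector(self, a, sa_pairs):
        # grad log pi(a|s) = x(s, a) - sum_b pi(b|s) x(s, b) for linear softmax.
        probs = self._probabilities(sa_pairs)
        expected = np.sum([p * x for p, x in zip(probs, sa_pairs)], axis=0)
        return sa_pairs[a] - expected
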
def one_step_actor_critic(env, alpha_th, alpha_w, gamma, n_episodes):
    policy = ExponentialSoftmax(env.observation_space_size * env.action_space_size)
    v = LinearValueFunction(env.observation_space_size)
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        obs_vec = encode_state(obs, env.observation_space_size)
        I = 1
        while not done:
            sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size, \
                env.action_space_size) for a in range(env.action_space_size)]
            a = policy.sample_action(sa_pairs)
            obs_prime, reward, done = env.step(a)
            obs_prime_vec = encode_state(obs_prime, env.observation_space_size)
            delta = reward + gamma * v.evaluate(obs_prime_vec) - v.evaluate(obs_vec)
            v.weights += alpha_w * I * delta * obs_vec
            policy.weights += alpha_th * I * delta * \
                policy.eligibility_vector(a, sa_pairs)
            # Accumulate the discount factor: I tracks gamma^t.
            I *= gamma
            obs_vec = obs_prime_vec
            obs = obs_prime
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy

def gradient_mc_prediction(env, policy, alpha, n_episodes, tile_coder):
    # Initialization.
    v = LinearValueFunction(tile_coder.total_n_tiles)
    for episode in range(n_episodes):
        # Reset the trajectory each episode; accumulating states and rewards
        # across episodes would corrupt the returns.
        states = []
        rewards = [None]
        done = False
        obs = env.reset()
        # Store the feature vector representation of the state.
        states.append(tile_coder.get_tile_code(obs))
        feature_vectors = tile_coder.get_feature_vectors_for_actions(obs, \
            env.action_space_size)
        a = policy.greedy_action(feature_vectors)
        while not done:
            obs, reward, done = env.step(a)
            feature_vectors = tile_coder.get_feature_vectors_for_actions(obs, \
                env.action_space_size)
            a = policy.greedy_action(feature_vectors)
            rewards.append(reward)
            states.append(tile_coder.get_tile_code(obs))
        # Loop over every non-terminal step of the episode.
        for i in range(len(states) - 1):
            G = np.sum(rewards[i + 1:])
            # Update weights.
            v.weights += alpha * np.dot((G - v.evaluate(states[i])), states[i])
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return v

def sarsa(env, gamma, alpha, epsilon, n_episodes):
    # Create iterators (hardcoded for a 70-state, 4-action gridworld).
    sa_pairs = product(range(70), range(4))
    # Initialize state-action value function.
    Q = dict.fromkeys(sa_pairs, 0.0)
    epsilon_start = epsilon
    decay = lambda x: x - (10/n_episodes)*epsilon_start if \
        x - (10/n_episodes)*epsilon_start > 1e-4 else 1e-4
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        action = eps_greedy_policy(Q, obs, epsilon, env.action_space_size)
        while not done:
            obs_prime, reward, done = env.step(action)
            action_prime = eps_greedy_policy(Q, obs_prime, epsilon, \
                env.action_space_size)
            # Update state-action value estimate.
            Q[obs, action] += alpha * (reward + gamma * \
                Q[obs_prime, action_prime] - Q[obs, action])
            obs = obs_prime
            action = action_prime
        # Decay epsilon.
        epsilon = decay(epsilon)
        if episode % 100 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return Q

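# eps_greedy_policy is used by all of the tabular methods here but is not
# defined in this section. A minimal sketch, assuming Q is a dict keyed by
# (state, action) tuples as initialized above:

def eps_greedy_policy(Q, obs, epsilon, n_actions):
    '''Pick a uniformly random action with probability epsilon, else greedy.'''
    if np.random.uniform() < epsilon:
        return np.random.randint(n_actions)
    action_values = [Q[obs, a] for a in range(n_actions)]
    return int(np.argmax(action_values))
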
def off_policy_n_step_sarsa(env, n, alpha, gamma, epsilon, n_episodes):
    # Initialize target policy and state-action value function.
    sa_pairs = product(range(env.observation_space_size), \
        range(env.action_space_size))
    Q = dict.fromkeys(sa_pairs, 0.0)
    policy = dict.fromkeys(range(env.observation_space_size), 0)
    # Buffers hold n + 1 entries so S_tau and S_tau+n never collide
    # (indices are taken mod n + 1, as in Sutton & Barto).
    states = np.zeros(n + 1)
    actions = np.zeros(n + 1)
    rewards = np.zeros(n + 1)
    decay = lambda x: x - 2 / n_episodes if x - 2 / n_episodes > 0.1 else 0.1
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        # Behaviour policy acts uniformly at random over the four actions.
        action = np.random.randint(4)
        states[0] = obs
        actions[0] = action
        t = 0
        tau = -1
        T = np.inf
        while not done or tau != T - 1:
            if t < T:
                obs_prime, reward, done = env.step(action)
                states[(t + 1) % (n + 1)] = obs_prime
                rewards[(t + 1) % (n + 1)] = reward
                if done:
                    T = t + 1
                else:
                    action = np.random.randint(4)
                    actions[(t + 1) % (n + 1)] = action
            tau = t - n + 1
            if tau > -1:
                # Importance sampling ratio over steps tau+1 to
                # min(tau+n-1, T-1) inclusive.
                p = 1
                for i in range(tau + 1, min(tau + n, T)):
                    s = states[i % (n + 1)]
                    a = actions[i % (n + 1)]
                    policy_proba = eps_greedy_proba(policy, s, a, epsilon)
                    # 0.25 constant used as behaviour policy acts randomly.
                    p *= policy_proba / 0.25
                # n-step return over rewards tau+1 to min(tau+n, T) inclusive.
                G = np.sum([gamma**(i-tau-1)*rewards[i%(n+1)] for i in \
                    range(tau+1, min(tau+n, T)+1)])
                if tau + n < T:
                    s = states[(tau + n) % (n + 1)]
                    a = actions[(tau + n) % (n + 1)]
                    G += gamma**n * Q[s, a]
                s = states[tau % (n + 1)]
                a = actions[tau % (n + 1)]
                # Update state-action value estimate of the target policy.
                Q[s, a] += alpha * p * (G - Q[s, a])
                # Make target policy greedy w.r.t. Q.
                action_values = [Q[s, i] for i in range(4)]
                policy[s] = np.argmax(action_values)
            t += 1
        epsilon = decay(epsilon)
        if episode % 1000 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy

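# eps_greedy_proba(policy, s, a, epsilon) above should return pi(a|s) for an
# epsilon-greedy policy. A sketch, assuming `policy` maps each state to its
# greedy action and that there are four actions, as in the surrounding code:

def eps_greedy_proba(policy, s, a, epsilon, n_actions=4):
    '''Probability of taking action a in state s under an eps-greedy policy.'''
    if a == policy[s]:
        return 1 - epsilon + epsilon / n_actions
    return epsilon / n_actions
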
def REINFORCE(env, alpha, gamma, n_episodes):
    policy = ExponentialSoftmax(env.observation_space_size * env.action_space_size)
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        all_sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size, \
            env.action_space_size) for a in range(env.action_space_size)]
        a = policy.sample_action(all_sa_pairs)
        states = [obs]
        actions = [a]
        rewards = [None]
        while not done:
            obs, reward, done = env.step(a)
            all_sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size, \
                env.action_space_size) for a in range(env.action_space_size)]
            a = policy.sample_action(all_sa_pairs)
            states.append(obs)
            actions.append(a)
            rewards.append(reward)
        # Loop over every non-terminal step of the episode.
        for t in range(len(states) - 1):
            # Discounted return following time t.
            G_t = sum(gamma ** (k - t - 1) * rewards[k] \
                for k in range(t + 1, len(rewards)))
            all_sa_pairs = [encode_sa_pair(states[t], a, env.observation_space_size, \
                env.action_space_size) for a in range(env.action_space_size)]
            policy.weights += alpha * (gamma ** t) * G_t * \
                policy.eligibility_vector(actions[t], all_sa_pairs)
        if episode % 100 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy

def semi_gradient_n_step_sarsa(env, n, alpha, gamma, epsilon, n_episodes, \
        tile_coder, action_len, stop_threshold):
    # Initialization.
    q = LinearPolicy(tile_coder.total_n_tiles, action_len, env.action_space_size)
    # Buffers hold n + 1 entries so S_tau and S_tau+n never collide
    # (indices are taken mod n + 1).
    states = [None] * (n + 1)
    actions = np.zeros(n + 1)
    rewards = np.zeros(n + 1)
    all_steps = []
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        states[0] = obs
        a = eps_greedy_func_policy(q, obs, epsilon, tile_coder, \
            env.action_space_size)
        actions[0] = a
        t = 0
        tau = -1
        T = np.inf
        while not done or tau != T - 1:
            if t < T:
                obs_prime, reward, done = env.step(a)
                rewards[(t+1) % (n+1)] = reward
                states[(t+1) % (n+1)] = obs_prime
                if done:
                    T = t + 1
                else:
                    a = eps_greedy_func_policy(q, obs_prime, epsilon, \
                        tile_coder, env.action_space_size)
                    actions[(t+1) % (n+1)] = a
            tau = t - n + 1
            if tau > -1:
                # Calculate n-step return.
                G = np.sum([gamma**(i-tau-1)*rewards[i%(n+1)] \
                    for i in range(tau+1, min(tau+n, T)+1)])
                if tau + n < T:
                    s_n = states[(tau+n) % (n+1)]
                    a_n = actions[(tau+n) % (n+1)]
                    x = tile_coder.get_feature_vector(s_n, a_n)
                    G += gamma**n * q.evaluate(x)
                # Use local names here so the current action `a` is not
                # clobbered before the next environment step.
                s_tau = states[tau % (n+1)]
                a_tau = actions[tau % (n+1)]
                x = tile_coder.get_feature_vector(s_tau, a_tau)
                # Update weights.
                q.weights += alpha * np.dot((G - q.evaluate(x)), x)
            t += 1
        print_episode(episode, n_episodes)
        # Stop training if the state-action value function has converged.
        if len(all_steps) > 10 and sum(all_steps[-10:]) < stop_threshold:
            break
        # Store steps for plotting.
        all_steps.append(env.steps)
    # Plot agent performance during training.
    create_line_plot(range(len(all_steps)), all_steps, 'Episode number:', \
        'Number of steps:', 'Number of steps required to reach goal during training:')
    print_episode(n_episodes, n_episodes)
    return q

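# The semi-gradient control methods use LinearPolicy and
# eps_greedy_func_policy, neither of which appears in this section. The sketch
# below matches the call sites (q.evaluate(x) on a tile-coded feature vector;
# epsilon-greedy selection via the tile coder); it is an assumed
# implementation, not the original one.

class LinearPolicy:
    '''Action-value function q(s, a) = w . x(s, a) over tile-coded features.'''

    def __init__(self, n_tiles, action_len, n_actions):
        # action_len extra slots allow an action vector to be appended to the
        # state features, matching the constructors used above.
        self.weights = np.zeros(n_tiles + action_len)
        self.n_actions = n_actions

    def evaluate(self, x):
        return np.dot(self.weights, np.asarray(x, dtype=float))


def eps_greedy_func_policy(q, obs, epsilon, tile_coder, n_actions):
    '''Epsilon-greedy action selection under function approximation.'''
    if np.random.uniform() < epsilon:
        return np.random.randint(n_actions)
    values = [q.evaluate(tile_coder.get_feature_vector(obs, a))
              for a in range(n_actions)]
    return int(np.argmax(values))
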
def n_step_sarsa(env, n, alpha, gamma, epsilon, n_episodes):
    # Initialize state-action value function.
    sa_pairs = product(range(env.observation_space_size), \
        range(env.action_space_size))
    Q = dict.fromkeys(sa_pairs, 0.0)
    # Buffers hold n + 1 entries so S_tau and S_tau+n never collide
    # (indices are taken mod n + 1).
    states = np.zeros(n + 1)
    actions = np.zeros(n + 1)
    rewards = np.zeros(n + 1)
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        t = 0
        tau = -1
        T = np.inf
        states[0] = obs
        a = eps_greedy_policy(Q, obs, epsilon, env.action_space_size)
        actions[0] = a
        while not done or tau != T - 1:
            if t < T:
                obs_prime, reward, done = env.step(a)
                rewards[(t + 1) % (n + 1)] = reward
                states[(t + 1) % (n + 1)] = obs_prime
                if done:
                    T = t + 1
                else:
                    a = eps_greedy_policy(Q, obs_prime, epsilon, \
                        env.action_space_size)
                    actions[(t + 1) % (n + 1)] = a
            tau = t - n + 1
            if tau > -1:
                # n-step return over rewards tau+1 to min(tau+n, T) inclusive.
                G = np.sum([
                    gamma**(i - tau - 1) * rewards[i % (n + 1)]
                    for i in range(tau + 1, min(tau + n, T) + 1)
                ])
                if tau + n < T:
                    state = states[(tau + n) % (n + 1)]
                    action = actions[(tau + n) % (n + 1)]
                    G += gamma**n * Q[state, action]
                # Use a local name so the current action `a` is not clobbered.
                s = states[tau % (n + 1)]
                a_tau = actions[tau % (n + 1)]
                # Update state-action value estimate.
                Q[s, a_tau] += alpha * (G - Q[s, a_tau])
            t += 1
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return Q

def differential_semi_gradient_n_step_sarsa(env, n, alpha, beta, epsilon, \
        n_episodes, tile_coder, action_vec_dim, stop_threshold):
    # Initialization.
    q = LinearPolicy(tile_coder.total_n_tiles, action_vec_dim, env.action_space_size)
    r_bar = 0
    # Buffers hold n + 1 entries so S_tau and S_tau+n never collide
    # (indices are taken mod n + 1).
    states = [None] * (n + 1)
    actions = np.zeros(n + 1)
    rewards = np.zeros(n + 1)
    all_steps = []
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        states[0] = obs
        a = eps_greedy_func_policy(q, obs, epsilon, tile_coder, \
            env.action_space_size)
        actions[0] = a
        t = 0
        tau = -1
        while not done:
            obs, reward, done = env.step(a)
            states[(t + 1) % (n + 1)] = obs
            rewards[(t + 1) % (n + 1)] = reward
            a = eps_greedy_func_policy(q, obs, epsilon, tile_coder, \
                env.action_space_size)
            actions[(t + 1) % (n + 1)] = a
            tau = t - n + 1
            if tau > -1:
                x = tile_coder.get_feature_vector(states[tau % (n + 1)], \
                    actions[tau % (n + 1)])
                x_n = tile_coder.get_feature_vector(states[(tau+n) % (n+1)], \
                    actions[(tau+n) % (n+1)])
                # Differential n-step error: rewards tau+1 to tau+n inclusive,
                # each relative to the average-reward estimate.
                summ = np.sum(
                    [rewards[i % (n + 1)] - r_bar for i in range(tau + 1, tau + n + 1)])
                delta = summ + q.evaluate(x_n) - q.evaluate(x)
                r_bar += beta * delta
                q.weights += alpha * delta * x
            t += 1
        # Stop training if the state-action value function has converged.
        if len(all_steps) > 10 and sum(all_steps[-10:]) < stop_threshold:
            break
        # Store steps for plotting.
        all_steps.append(env.steps)
        print_episode(episode, n_episodes)
    # Plot agent performance during training.
    create_line_plot(range(len(all_steps)), all_steps, 'Episode number:', \
        'Number of steps:', 'Number of steps required to reach goal during training:')
    print_episode(n_episodes, n_episodes)
    return q

def off_policy_mc(env, gamma, b_policy, n_episodes):
    # Create required iterators.
    n_hands, n_dealer, usable = tuple(
        [env.observation_space[i].n for i in range(3)])
    state_space = product(range(n_hands), range(n_dealer), [True, False])
    it_states1, it_states2 = tee(state_space)
    action_space = range(2)
    sa_pairs = product(it_states1, action_space)
    it_pairs1, it_pairs2 = tee(sa_pairs)
    # Initialization.
    Q = dict.fromkeys(it_pairs1, 0.0)
    C = dict.fromkeys(it_pairs2, 0.0)
    target = dict.fromkeys(it_states2, 0)
    # Solving for optimal policy.
    for episode in range(n_episodes):
        if episode % 10000 == 0:
            print_episode(episode, n_episodes)
        done = False
        obs = env.reset()
        states = []
        actions = []
        rewards = []
        # Generate an episode.
        while not done:
            action = b_policy(obs)
            states.append(obs)
            obs, reward, done, info = env.step(action)
            actions.append(action)
            rewards.append(reward)
        G = 0
        W = 1
        # Update action-value function, iterating backwards over the episode.
        for t in range(len(states) - 1, -1, -1):
            G = gamma * G + rewards[t]
            s, a = states[t], actions[t]
            C[(s, a)] += W
            Q[(s, a)] += (W / C[(s, a)]) * (G - Q[(s, a)])
            # The target policy is greedy w.r.t. Q and is keyed by state.
            action_values = [Q[(s, i)] for i in range(env.action_space.n)]
            target[s] = np.argmax(action_values)
            if a != target[s]:
                break
            # Behaviour policy is assumed uniform random over the two actions,
            # so b(a|s) = 0.5.
            W *= 1 / 0.5
    print_episode(n_episodes, n_episodes)
    return target

def op_mc_control(env, n_episodes):
    '''On-policy first-visit Monte Carlo control algorithm.'''
    obs_space = product(range(1, 32), range(1, 11), [True, False])
    states = list(obs_space)
    sa_pairs = product(states, range(2))
    keys = list(sa_pairs)
    # Initialization.
    Q = {s: np.zeros(2) for s in states}
    returns = {pair: [] for pair in keys}
    policy = {s: 1 for s in states}
    epsilon = 1.0
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        pairs = []
        # Generate an episode, recording each (state, action) pair before
        # the action is taken.
        while not done:
            action = policy[obs]
            pairs.append((obs, action))
            obs, reward, done, info = env.step(action)
        # Store returns for each state-action pair visited. Blackjack only
        # rewards at the end of an episode, so the final reward is the return
        # from every pair in the episode.
        for s, a in pairs:
            returns[s, a].append(reward)
        # Average returns for each state-action pair.
        for (s, a), G in returns.items():
            if len(G) > 0:
                Q[s][a] = np.mean(G)
        # Update policy (epsilon-greedy w.r.t. action-value function).
        for s, _ in pairs:
            opt_a = np.argmax(Q[s])
            if np.random.uniform() < epsilon:
                policy[s] = env.action_space.sample()
            else:
                policy[s] = opt_a
        # Decay epsilon, flooring it at 0.1.
        epsilon = epsilon - 3 / n_episodes if epsilon - 3 / n_episodes > 0.1 else 0.1
        if episode % 1000 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy

def td_pred(env, policy, alpha, gamma, n_episodes):
    # Initialize state-value function (hardcoded for a 70-state gridworld).
    V = np.zeros(70)
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        while not done:
            action = policy[obs]
            obs_prime, reward, done = env.step(action)
            # Update state-value estimate.
            V[obs] += alpha * (reward + gamma * V[obs_prime] - V[obs])
            obs = obs_prime
        if episode % 1000 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return V

def mc_pred(env, policy, n_episodes):
    '''First-visit Monte Carlo prediction algorithm.'''
    hands = range(12, 22)
    dealer = range(1, 11)
    usable = [True, False]
    obs_space = product(hands, dealer, usable)
    # Initialization.
    keys = list(obs_space)
    V = dict.fromkeys(keys, 0)
    returns = {key: [] for key in keys}
    # For all hands less than 12 the player will hit to attain a hand in the
    # interval [12, 21]. This check prevents those states from being tracked,
    # as the optimal action there is already known (hit).
    is_valid = lambda x: 11 < x[0] < 22
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        states = []
        if is_valid(obs):
            states.append(obs)
        # Generate an episode using the given policy.
        while not done:
            action = policy[obs]
            obs, reward, done, info = env.step(action)
            if obs not in states and is_valid(obs):
                states.append(obs)
        # Append the return that follows the first occurrence of each state
        # visited; in blackjack all intermediate rewards are zero, so the
        # final reward is the return from every state in the episode.
        for state in states:
            returns[state].append(reward)
        # Update state-value function.
        for state, G in returns.items():
            if len(G) > 0:
                V[state] = np.mean(G)
        if episode % 1000 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return V

def Q_learning(env, alpha, gamma, epsilon, n_episodes):
    # Initialize state-action value function.
    Q = {}
    curr_row = 0
    for row, col in env.state_space:
        for i in range(curr_row, curr_row + row):
            positions = product([i], range(col))
            velocities = product(range(-3, 1), range(-2, 3))
            states = product(positions, velocities)
            sa_pairs = product(states, range(9))
            # Key: (((pos_x, pos_y), (dy, dx)), action)
            for pair in sa_pairs:
                Q[pair] = 0
        curr_row += row
    # Store rewards for plot.
    rewards = []
    # Decay epsilon linearly, flooring it at 0.1.
    decay = lambda x: x - 2 / n_episodes if x - 2 / n_episodes > 0.1 else 0.1
    for episode in range(n_episodes):
        done = False
        val = 0
        obs = env.reset()
        while not done:
            action = eps_greedy_policy(Q, obs, epsilon, env.action_space_size)
            obs_prime, reward, done = env.step(action)
            val += reward
            action_values = [Q[obs_prime, i] for i in range(9)]
            opt_a = np.argmax(action_values)
            # Update state-action value estimate.
            Q[obs, action] += alpha * (reward + gamma * Q[obs_prime, opt_a] \
                - Q[obs, action])
            obs = obs_prime
        epsilon = decay(epsilon)
        rewards.append(val)
        if episode % 10 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    # Plot rewards over training process.
    create_line_plot(range(len(rewards)), rewards, 'Episode number:', \
        'Return:', 'Agent returns over training:')
    return Q

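# create_line_plot is called by several training loops to visualize learning
# curves but is not shown in this section. A minimal matplotlib sketch
# matching the call signature used above:
import matplotlib.pyplot as plt


def create_line_plot(x, y, x_label, y_label, title):
    '''Draw and display a simple line plot of y against x.'''
    plt.plot(list(x), list(y))
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    plt.show()
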
def sarsa_lambda(env, lamda, alpha, gamma, epsilon, n_episodes):
    # Initialize state-action value function.
    q = LinearPolicy(env.observation_space_size * env.action_space_size, 0, \
        env.action_space_size)
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        action = eps_greedy_policy_bin_features(q, obs, epsilon, \
            env.observation_space_size, env.action_space_size)
        z = np.zeros(env.observation_space_size * env.action_space_size)
        while not done:
            obs_prime, reward, done = env.step(action)
            delta = reward
            sa_vec = encode_sa_pair(obs, action, env.observation_space_size, \
                env.action_space_size)
            idx_active = np.argwhere(sa_vec == 1)
            delta -= np.sum(q.weights[idx_active])
            # Accumulating traces.
            z[idx_active] += 1
            if done:
                # Terminal transition: update without a bootstrap term.
                q.weights += alpha * delta * z
                break
            action_prime = eps_greedy_policy_bin_features(q, obs_prime, epsilon, \
                env.observation_space_size, env.action_space_size)
            sa_prime_vec = encode_sa_pair(obs_prime, action_prime, \
                env.observation_space_size, env.action_space_size)
            idx_active = np.argwhere(sa_prime_vec == 1)
            delta += gamma * np.sum(q.weights[idx_active])
            # Update weights.
            q.weights += alpha * delta * z
            # Decay accumulating traces.
            z = gamma * lamda * z
            obs = obs_prime
            action = action_prime
        if episode % 100 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return q

def double_Q(env, alpha, gamma, epsilon, n_episodes):
    # Initialize state-action value functions.
    Q_1 = {}
    Q_2 = {}
    curr_row = 0
    for row, col in env.state_space:
        for i in range(curr_row, curr_row + row):
            positions = product([i], range(col))
            velocities = product(range(-3, 1), range(-2, 3))
            states = product(positions, velocities)
            # Key: (((pos_x, pos_y), (dy, dx)), action)
            for pair in product(states, range(9)):
                Q_1[pair] = 0
                Q_2[pair] = 0
        curr_row += row
    # Decay epsilon harmonically from its starting value.
    epsilon_start = epsilon
    decay = lambda i: epsilon_start / (i + 1)
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        while not done:
            a = double_Q_eps_greedy_policy(obs, Q_1, Q_2, epsilon)
            obs_prime, reward, done = env.step(a)
            # Update one estimate, chosen at random; each bootstraps from the
            # other to decouple action selection from action evaluation.
            if np.random.uniform() < 0.5:
                action_vals = [Q_1[obs_prime, i] for i in range(9)]
                a_prime = np.argmax(action_vals)
                Q_1[obs, a] += alpha * (reward + gamma*Q_2[obs_prime, a_prime] \
                    - Q_1[obs, a])
            else:
                action_vals = [Q_2[obs_prime, i] for i in range(9)]
                a_prime = np.argmax(action_vals)
                Q_2[obs, a] += alpha * (reward + gamma*Q_1[obs_prime, a_prime] \
                    - Q_2[obs, a])
            obs = obs_prime
        epsilon = decay(episode)
        if episode % 100 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    # Combine the two estimates for the returned value function.
    Q = {key: Q_1[key] + Q_2[key] for key in Q_1}
    return Q

def semi_gradient_td_zero(env, policy, alpha, gamma, n_episodes, tile_coder):
    # Initialization.
    v = LinearValueFunction(tile_coder.total_n_tiles)
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        while not done:
            feature_vectors = tile_coder.get_feature_vectors_for_actions(obs, \
                env.action_space_size)
            a = policy.greedy_action(feature_vectors)
            obs_prime, reward, done = env.step(a)
            s = tile_coder.get_tile_code(obs)
            s_prime = tile_coder.get_tile_code(obs_prime)
            # Terminal states have value zero by definition, so do not
            # bootstrap from them.
            target = reward if done else reward + gamma * v.evaluate(s_prime)
            # Update weights.
            v.weights += alpha * np.dot((target - v.evaluate(s)), s)
            obs = obs_prime
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return v

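# The function-approximation methods above all depend on a tile coder exposing
# total_n_tiles, get_tile_code(obs), get_feature_vector(obs, a) and
# get_feature_vectors_for_actions(obs, n_actions). The original tile coder is
# not shown; below is a simplified coder over a bounded 1-D observation with a
# one-hot action vector appended for state-action features. Real tile coding
# would hash several offset tilings over multi-dimensional inputs, so treat
# this purely as an interface sketch.

class SimpleTileCoder:
    def __init__(self, low, high, n_tilings=8, tiles_per_tiling=8):
        self.low, self.high = low, high
        self.n_tilings = n_tilings
        self.tiles_per_tiling = tiles_per_tiling
        self.total_n_tiles = n_tilings * tiles_per_tiling

    def get_tile_code(self, obs):
        '''Binary vector with one active tile per tiling.'''
        code = np.zeros(self.total_n_tiles)
        scaled = (obs - self.low) / (self.high - self.low)
        for tiling in range(self.n_tilings):
            # Offset each tiling by a fraction of a tile width.
            offset = tiling / (self.n_tilings * self.tiles_per_tiling)
            pos = min(max(scaled + offset, 0.0), 1.0)
            idx = int(pos * (self.tiles_per_tiling - 1))
            code[tiling * self.tiles_per_tiling + idx] = 1.0
        return code

    def get_feature_vector(self, obs, a, n_actions=3):
        '''State tile code with a one-hot action vector appended.'''
        one_hot = np.zeros(n_actions)
        one_hot[int(a)] = 1.0
        return np.concatenate([self.get_tile_code(obs), one_hot])

    def get_feature_vectors_for_actions(self, obs, n_actions):
        return [self.get_feature_vector(obs, a, n_actions) \
            for a in range(n_actions)]
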
def REINFORCE_baseline(env, alpha_th, alpha_w, gamma, n_episodes):
    policy = ExponentialSoftmax(env.observation_space_size * env.action_space_size)
    v = LinearValueFunction(env.observation_space_size)
    returns = []
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        all_sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size, \
            env.action_space_size) for a in range(env.action_space_size)]
        a = policy.sample_action(all_sa_pairs)
        states = [obs]
        actions = [a]
        rewards = [None]
        while not done:
            obs, reward, done = env.step(a)
            all_sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size, \
                env.action_space_size) for a in range(env.action_space_size)]
            a = policy.sample_action(all_sa_pairs)
            states.append(obs)
            actions.append(a)
            rewards.append(reward)
        # Loop over every non-terminal step of the episode.
        for t in range(len(states) - 1):
            # Discounted return following time t.
            G_t = sum(gamma ** (k - t - 1) * rewards[k] \
                for k in range(t + 1, len(rewards)))
            x_t = encode_state(states[t], env.observation_space_size)
            delta = G_t - v.evaluate(x_t)
            v.weights += alpha_w * (gamma**t) * delta * x_t
            all_sa_pairs = [encode_sa_pair(states[t], a, env.observation_space_size, \
                env.action_space_size) for a in range(env.action_space_size)]
            # The policy update uses the baselined error delta, not the raw
            # return.
            policy.weights += alpha_th * (gamma ** t) * delta * \
                policy.eligibility_vector(actions[t], all_sa_pairs)
        returns.append(sum(rewards[1:]))
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return (policy, np.array(returns))

def semi_gradient_n_step_td(env, policy, n, alpha, gamma, n_episodes, tile_coder):
    # Initialization.
    v = LinearValueFunction(tile_coder.total_n_tiles)
    # Buffers hold n + 1 entries so S_tau and S_tau+n never collide
    # (indices are taken mod n + 1).
    states = [None] * (n + 1)
    rewards = np.zeros(n + 1)
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        states[0] = tile_coder.get_tile_code(obs)
        t = 0
        tau = -1
        T = np.inf
        while not done or tau != T - 1:
            if t < T:
                feature_vectors = tile_coder.get_feature_vectors_for_actions(obs, \
                    env.action_space_size)
                a = policy.greedy_action(feature_vectors)
                obs, reward, done = env.step(a)
                states[(t + 1) % (n + 1)] = tile_coder.get_tile_code(obs)
                rewards[(t + 1) % (n + 1)] = reward
                if done:
                    T = t + 1
            tau = t - n + 1
            if tau > -1:
                # n-step return over rewards tau+1 to min(tau+n, T) inclusive.
                G = np.sum([gamma**(i-tau-1)*rewards[i%(n+1)] for i in \
                    range(tau+1, min(tau+n, T)+1)])
                if tau + n < T:
                    G += gamma**n * v.evaluate(states[(tau + n) % (n + 1)])
                # Update weights.
                v.weights += alpha * np.dot((G - v.evaluate(states[tau%(n+1)])), \
                    states[tau%(n+1)])
            t += 1
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return v

def n_step_td_pred(env, policy, n, alpha, gamma, n_episodes):
    # Initialize state-value function.
    V = np.zeros(env.observation_space_size)
    # Buffers hold n + 1 entries so S_tau and S_tau+n never collide
    # (indices are taken mod n + 1).
    states = np.zeros(n + 1)
    rewards = np.zeros(n + 1)
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        states[0] = obs
        tau = -1
        t = 0
        T = np.inf
        while not done or tau != T - 1:
            if t < T:
                action = policy(obs)
                obs_prime, reward, done = env.step(action)
                states[(t + 1) % (n + 1)] = obs_prime
                rewards[(t + 1) % (n + 1)] = reward
                obs = obs_prime
                if done:
                    T = t + 1
            tau = t - n + 1
            if tau > -1:
                # n-step return over rewards tau+1 to min(tau+n, T) inclusive.
                G = np.sum([gamma ** (i-tau-1) * rewards[i % (n + 1)] for i in \
                    range(tau + 1, min(tau + n, T) + 1)])
                if tau + n < T:
                    state = int(states[(tau + n) % (n + 1)])
                    G += gamma**n * V[state]
                state = int(states[tau % (n + 1)])
                # Update state-value estimate.
                V[state] += alpha * (G - V[state])
            t += 1
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return V

def online_sarsa_lambda(env, lamda, alpha, gamma, epsilon, n_episodes):
    # Initialize state-action value function.
    q = LinearPolicy(env.observation_space_size * env.action_space_size, 0, \
        env.action_space_size)
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        a = eps_greedy_policy_bin_features(q, obs, epsilon, \
            env.observation_space_size, env.action_space_size)
        x = encode_sa_pair(obs, a, env.observation_space_size, env.action_space_size)
        z = np.zeros(env.observation_space_size * env.action_space_size)
        Q_old = 0
        while not done:
            obs_prime, reward, done = env.step(a)
            a_prime = eps_greedy_policy_bin_features(q, obs_prime, epsilon, \
                env.observation_space_size, env.action_space_size)
            x_prime = encode_sa_pair(obs_prime, a_prime, env.observation_space_size, \
                env.action_space_size)
            Q = q.evaluate(x)
            # Terminal states have value zero by definition.
            Q_prime = 0 if done else q.evaluate(x_prime)
            delta = reward + gamma * Q_prime - Q
            # Update dutch-style eligibility traces.
            z = gamma * lamda * z + (1 - alpha * gamma * lamda * np.dot(z, x)) * x
            # Update weights.
            q.weights += alpha * (delta + Q - Q_old) * z - alpha * (Q - Q_old) * x
            Q_old = Q_prime
            x = x_prime
            a = a_prime
        if episode % 100 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return q

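# For reference, the loop above implements the true online Sarsa(lambda)
# updates with dutch traces (Sutton & Barto, 2nd ed., ch. 12):
#   z     <- gamma*lambda*z + (1 - alpha*gamma*lambda*(z . x)) * x
#   w     <- w + alpha*(delta + Q - Q_old)*z - alpha*(Q - Q_old)*x
#   Q_old <- Q'
# The extra (Q - Q_old) terms make this backward view match the online
# lambda-return (forward view) exactly under linear function approximation.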
def semi_gradient_sarsa(env, alpha, gamma, epsilon, n_episodes, tile_coder, \
        action_len):
    # Initialization.
    q = LinearPolicy(tile_coder.total_n_tiles, action_len, env.action_space_size)
    all_steps = []
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        a = eps_greedy_func_policy(q, obs, epsilon, tile_coder, \
            env.action_space_size)
        while not done:
            obs_prime, reward, done = env.step(a)
            x = tile_coder.get_feature_vector(obs, a)
            if done:
                # Terminal transition: update without a bootstrap term.
                q.weights += alpha * np.dot((reward - q.evaluate(x)), x)
                break
            a_prime = eps_greedy_func_policy(q, obs_prime, epsilon, \
                tile_coder, env.action_space_size)
            x_prime = tile_coder.get_feature_vector(obs_prime, a_prime)
            # Update weights.
            q.weights += alpha * np.dot((reward + \
                gamma * q.evaluate(x_prime) - q.evaluate(x)), x)
            obs = obs_prime
            a = a_prime
        # Store steps for plotting.
        all_steps.append(env.steps)
        print_episode(episode, n_episodes)
    # Plot agent performance over training.
    create_line_plot(range(len(all_steps)), all_steps, 'Episode number:', \
        'Number of steps:', 'Number of steps required to reach goal during training:')
    print_episode(n_episodes, n_episodes)
    return q

def tabular_dyna_Q(env, alpha, gamma, epsilon, n_episodes, n):
    # Create iterators.
    sa_pairs = product(range(env.observation_space_size), \
        range(env.action_space_size))
    pairs_one, pairs_two = tee(sa_pairs)
    # Initialize state-action value function and model.
    Q = dict.fromkeys(pairs_one, 0)
    model = {pair: (-1, -1) for pair in pairs_two}
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        while not done:
            # Acting, model-learning and direct RL.
            action = eps_greedy_policy(Q, obs, epsilon, env.action_space_size)
            obs_prime, reward, done = env.step(action)
            opt_a = np.argmax([Q[obs_prime, i] for i in range(4)])
            Q[obs, action] += alpha * (reward + gamma * Q[obs_prime, opt_a] \
                - Q[obs, action])
            model[obs, action] = (reward, obs_prime)
            obs = obs_prime
            # Q-planning: n simulated updates from previously visited pairs.
            for i in range(n):
                possible_pairs = [(s, a) for s, a in model.keys() if \
                    model[s, a] != (-1, -1)]
                idx = np.random.choice(len(possible_pairs))
                s, a = possible_pairs[idx]
                r, s_prime = model[s, a]
                # Bootstrap from the greedy action in the successor state.
                opt_a = np.argmax([Q[s_prime, x] for x in range(4)])
                Q[s, a] += alpha * (r + gamma * Q[s_prime, opt_a] - Q[s, a])
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return Q

def differential_semi_gradient_sarsa(env, alpha, beta, epsilon, n_episodes, \
        tile_coder, action_vec_dim, stop_threshold):
    # Initialization.
    q = LinearPolicy(tile_coder.total_n_tiles, action_vec_dim, env.action_space_size)
    r_bar = 0
    all_steps = []
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        a = eps_greedy_func_policy(q, obs, epsilon, tile_coder, \
            env.action_space_size)
        while not done:
            obs_prime, reward, done = env.step(a)
            a_prime = eps_greedy_func_policy(q, obs_prime, epsilon, tile_coder, \
                env.action_space_size)
            x = tile_coder.get_feature_vector(obs, a)
            x_prime = tile_coder.get_feature_vector(obs_prime, a_prime)
            delta = reward - r_bar + q.evaluate(x_prime) - q.evaluate(x)
            r_bar += beta * delta
            # Update weights.
            q.weights += alpha * delta * x
            obs = obs_prime
            a = a_prime
        # Stop training if the state-action value function has converged.
        if len(all_steps) > 10 and sum(all_steps[-10:]) < stop_threshold:
            break
        # Store steps for plotting.
        all_steps.append(env.steps)
        print_episode(episode, n_episodes)
    # Plot agent performance during training.
    create_line_plot(range(len(all_steps)), all_steps, 'Episode number:', \
        'Number of steps:', 'Number of steps required to reach goal during training:')
    print_episode(n_episodes, n_episodes)
    return q

def n_step_Q_sigma(env, n, alpha, gamma, epsilon, sigma, n_episodes):
    # Initialize policy and state-action value function.
    sa_pairs = product(range(env.observation_space_size), \
        range(env.action_space_size))
    Q = dict.fromkeys(sa_pairs, 0)
    policy = dict.fromkeys(range(env.observation_space_size), 0)
    # Buffers hold n + 1 entries so indices tau and tau+n never collide
    # (indices are taken mod n + 1).
    states = np.zeros(n + 1)
    actions = np.zeros(n + 1)
    Qs = np.zeros(n + 1)
    deltas = np.zeros(n + 1)
    pis = np.zeros(n + 1)
    ratios = np.zeros(n + 1)
    decay = lambda x: x - 2 / n_episodes if x - 2 / n_episodes > 0.1 else 0.1
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        # Behaviour policy acts uniformly at random over the four actions.
        action = np.random.randint(4)
        states[0] = obs
        actions[0] = action
        Qs[0] = Q[obs, action]
        t = 0
        tau = -1
        T = np.inf
        while not done or tau != T - 1:
            if t < T:
                obs_prime, reward, done = env.step(action)
                states[(t + 1) % (n + 1)] = obs_prime
                if done:
                    T = t + 1
                    deltas[t % (n + 1)] = reward - Qs[t % (n + 1)]
                else:
                    action = np.random.randint(4)
                    actions[(t + 1) % (n + 1)] = action
                    Qs[(t + 1) % (n + 1)] = Q[obs_prime, action]
                    sample = gamma * Qs[(t + 1) % (n + 1)]
                    expectation = gamma * np.sum([eps_greedy_proba(policy, \
                        obs_prime, i, epsilon) * Q[obs_prime, i] for i in range(4)])
                    # sigma interpolates between Sarsa (sigma = 1) and tree
                    # backup (sigma = 0).
                    deltas[t % (n + 1)] = reward + sigma * sample + \
                        (1 - sigma) * expectation - Qs[t % (n + 1)]
                    pis[(t + 1) % (n + 1)] = eps_greedy_proba(policy, obs_prime, \
                        action, epsilon)
                    ratios[(t + 1) % (n + 1)] = pis[(t + 1) % (n + 1)] / 0.25
            tau = t - n + 1
            if tau > -1:
                p = 1
                Z = 1
                G = Qs[tau % (n + 1)]
                for k in range(tau, min(tau + n, T)):
                    G += Z * deltas[k % (n + 1)]
                    Z = gamma * Z * ((1 - sigma) * pis[(k + 1) % (n + 1)] + sigma)
                    p = p * (1 - sigma + sigma * ratios[k % (n + 1)])
                s = states[tau % (n + 1)]
                a = actions[tau % (n + 1)]
                # Update state-action value function.
                Q[s, a] += alpha * p * (G - Q[s, a])
                # Make target policy greedy w.r.t. Q.
                action_values = [Q[s, i] for i in range(4)]
                policy[s] = np.argmax(action_values)
            t += 1
        epsilon = decay(epsilon)
        if episode % 100 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy

def mc_control(env, n_episodes):
    '''Monte Carlo control with Exploring Starts.'''
    # Create required iterators and lists.
    obs_space = product(range(12, 22), range(1, 11), [True, False])
    states = list(obs_space)
    sa_pairs = product(states, range(2))
    keys = list(sa_pairs)
    # Initialization.
    Q = {s: np.zeros(2) for s in states}
    returns = {pair: [] for pair in keys}
    starting_sa_pairs = list(returns.keys())
    policy = get_init_policy()
    # Don't track hands where the optimal action is known (hit if hand < 12).
    is_valid = lambda x: 11 < x[0] < 22
    player = lambda x: [x, 0]
    for episode in range(n_episodes):
        env.reset()
        # Select random starting state and action.
        rand = np.random.randint(len(starting_sa_pairs))
        (x, y, usable), a = starting_sa_pairs[rand]
        # Configure the environment to use exploring starts.
        env.player = player(x)
        env.dealer = player(y)
        # Used to store the result of the episode.
        done = a == 0
        episode_data = [starting_sa_pairs[rand]] if not done else []
        obs = starting_sa_pairs[rand][0]
        if done:
            # Hold chosen as starting action: query the environment once.
            obs, reward, done, info = env.step(a)
            episode_data.append((obs, a))
        else:
            # Hit chosen as starting action.
            while not done:
                a = policy[obs]
                obs, reward, done, info = env.step(a)
                if (obs, a) not in episode_data and is_valid(obs):
                    episode_data.append((obs, a))
        # Append the return that follows the first occurrence of each
        # state-action pair.
        for obs, a in episode_data:
            returns[obs, a].append(reward)
        # Update action-value function.
        for pair, G in returns.items():
            if len(G) > 0:
                s, a = pair
                Q[s][a] = np.mean(G)
        # Update policy, making it greedy w.r.t. the action-value function.
        for s, ls in Q.items():
            policy[s] = np.argmax(ls)
        if episode % 1000 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy

def n_step_tree_backup(env, n, alpha, gamma, epsilon, n_episodes):
    # Initialize policy and state-action value function.
    sa_pairs = product(range(env.observation_space_size), \
        range(env.action_space_size))
    Q = dict.fromkeys(sa_pairs, 0.0)
    policy = dict.fromkeys(range(env.observation_space_size), 0)
    # Buffers hold n + 1 entries so indices tau and tau+n never collide
    # (indices are taken mod n + 1).
    states = np.zeros(n + 1)
    actions = np.zeros(n + 1)
    Qs = np.zeros(n + 1)
    deltas = np.zeros(n + 1)
    pis = np.zeros(n + 1)
    decay = lambda x: x - 2 / n_episodes if x - 2 / n_episodes > 0.1 else 0.1
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        action = eps_greedy_policy(Q, obs, epsilon, env.action_space_size)
        states[0] = obs
        actions[0] = action
        Qs[0] = Q[obs, action]
        t = -1
        tau = -1
        T = np.inf
        while not done or tau != T - 1:
            t += 1
            if t < T:
                obs_prime, reward, done = env.step(action)
                states[(t + 1) % (n + 1)] = obs_prime
                if done:
                    T = t + 1
                    deltas[t % (n + 1)] = reward - Qs[t % (n + 1)]
                else:
                    # policy_proba(policy, s, a, epsilon) returns pi(a|s) for
                    # the epsilon-greedy target policy.
                    deltas[t % (n + 1)] = reward + gamma * \
                        np.sum([policy_proba(policy, obs_prime, i, epsilon) * \
                        Q[obs_prime, i] for i in range(4)]) - Qs[t % (n + 1)]
                    action = eps_greedy_policy(Q, obs_prime, epsilon, \
                        env.action_space_size)
                    Qs[(t + 1) % (n + 1)] = Q[obs_prime, action]
                    pis[(t + 1) % (n + 1)] = policy_proba(policy, obs_prime, \
                        action, epsilon)
            tau = t - n + 1
            if tau > -1:
                Z = 1
                G = Qs[tau % (n + 1)]
                for k in range(tau, min(tau + n, T)):
                    G += Z * deltas[k % (n + 1)]
                    Z *= gamma * pis[(k + 1) % (n + 1)]
                s = states[tau % (n + 1)]
                a = actions[tau % (n + 1)]
                # Update state-action value function.
                Q[s, a] += alpha * (G - Q[s, a])
                # Make policy greedy w.r.t. Q.
                action_values = [Q[s, i] for i in range(4)]
                policy[s] = np.argmax(action_values)
        epsilon = decay(epsilon)
        if episode % 100 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy

def prioritized_sweeping(env, alpha, gamma, epsilon, theta, n, n_episodes):
    # n is the number of planning updates performed after each real step.
    # Create iterators.
    sa_pairs = product(range(env.observation_space_size), \
        range(env.action_space_size))
    it_one, it_two = tee(sa_pairs)
    # Initialize state-action value function and model.
    Q = dict.fromkeys(it_one, 0)
    model = {pair: (0, 0) for pair in it_two}
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        action = eps_greedy_policy(Q, obs, epsilon, env.action_space_size)
        q = []
        while not done:
            obs_prime, reward, done = env.step(action)
            model[obs, action] = (reward, obs_prime)
            opt_a = np.argmax([Q[obs_prime, i] for i in range(4)])
            P = abs(reward + gamma * Q[obs_prime, opt_a] - Q[obs, action])
            # Maintain a priority queue of each state-action pair whose
            # estimated value changes nontrivially, prioritized by the size
            # of the change.
            if P > theta:
                # Negative P used so a min binary heap can serve as a
                # max-priority queue.
                q.append((-P, (obs, action)))
            obs = obs_prime
            action = eps_greedy_policy(Q, obs, epsilon, env.action_space_size)
            counter = 0
            heapq.heapify(q)
            while len(q) > 0 and counter < n:
                counter += 1
                _, (s, a) = heapq.heappop(q)
                r, s_prime = model[s, a]
                opt_a = np.argmax([Q[s_prime, i] for i in range(4)])
                Q[s, a] += alpha * (r + gamma * Q[s_prime, opt_a] - Q[s, a])
                # Determine the effect the change of value has on predecessor
                # state-action pairs' values.
                for s_, a_ in env.get_predecessor_states(s):
                    r_, _ = model[s_, a_]
                    opt_a = np.argmax([Q[s, i] for i in range(4)])
                    P = abs(r_ + gamma * Q[s, opt_a] - Q[s_, a_])
                    # Add predecessor state-action pairs to the priority queue
                    # if the change alters their value nontrivially.
                    if P > theta:
                        # If the pair is already in the queue, keep only the
                        # higher-priority entry.
                        ls = [i for i in q if i[1] == (s_, a_)]
                        if len(ls) > 0:
                            if ls[0][0] > -P:
                                q.remove(ls[0])
                                heapq.heapify(q)
                                heapq.heappush(q, (-P, (s_, a_)))
                        else:
                            heapq.heappush(q, (-P, (s_, a_)))
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return Q
