import random

import numpy as np

# NOTE: 'c' and 'u' below refer to this project's components and utilities
# modules (providing ActionValue, EGreedyPolicy, NstepMemory, extract_actions
# and display_episode_log); their import aliases are assumed from usage here.


def sarsa(env, alpha=0.5, gamma=1, epsilon=.1, num_episodes=200):
    """
    Returns the Q-value estimates for an environment by using the
    SARSA algorithm (State-Action-Reward-State-Action).

    Parameters
    ----------
    env : gym.core.Env
        OpenAI Gym Environment instance
    alpha : float
        Algorithm's learning rate
    gamma : float
        Discount for future rewards
    epsilon : float
        Probability of choosing an action randomly
    num_episodes : int
        Number of episodes for the policy iteration process

    Returns
    -------
    ActionValue
        Estimated Q (state-action) values
    list
        List of summed rewards of each episode
    """
    # Stats tracking
    sum_rewards = []

    # Create Q and an e-greedy policy derived from it
    Q = c.ActionValue(u.extract_actions(env))
    policy = c.EGreedyPolicy(epsilon, Q)

    # Run for the given number of episodes
    for t in range(num_episodes):
        sum_rewards.append(0)

        # Obtain initial state
        state = env.reset()

        # Choose the first action from the e-greedy policy w.r.t. Q
        action = policy.sample(state)

        # Run one episode
        while True:
            # Take action, observe next state & reward
            next_state, reward, done, _ = env.step(action)

            # Choose next action
            next_action = policy.sample(next_state)

            # SARSA update towards the one-step TD target
            Q[state, action] += alpha * \
                (reward + gamma * Q[next_state, next_action] - Q[state, action])

            # Update state variables
            state = next_state
            action = next_action
            sum_rewards[t] += reward

            # Finish episode when the environment signals termination
            if done:
                break

    return Q, sum_rewards
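
# Usage sketch (assumes a tabular Gym environment with discrete states, e.g.
# 'CliffWalking-v0', and the classic 4-tuple env.step API used above):
#
#   import gym
#   env = gym.make('CliffWalking-v0')
#   Q, rewards = sarsa(env, alpha=0.5, gamma=1, epsilon=0.1, num_episodes=500)
#   greedy_action = Q.argmax(env.reset())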


def n_step_sarsa(num_steps, env, num_episodes, policy=None, step_size=.5,
                 discount_rate=.9, epsilon=.3):
    """
    Returns the Q-value estimates for an environment by using the
    On-Policy N-Step SARSA algorithm.

    Parameters
    ----------
    num_steps : int
        Number of steps used in value function bootstrapping
    env : gym.core.Env
        OpenAI Gym Environment instance
    num_episodes : int
        Number of episodes to be run using the environment
    policy : Policy instance
        Optional behaviour policy for which the Q-values will be estimated.
        If None, an e-greedy policy based on the estimated Q-values is used
    step_size : float
        Algorithm's learning rate
    discount_rate : float
        Discount rate used when estimating values with future rewards
    epsilon : float
        Probability of choosing an action randomly in the e-greedy policy
        created when the 'policy' parameter is None

    Returns
    -------
    ActionValue
        Estimated Q (state-action) values
    list
        List of summed rewards of each episode
    """
    episode_rewards = []

    # Init Q(s, a) arbitrarily, for all s in S, a in A(s)
    action_values = c.ActionValue(u.extract_actions(env))

    # Init pi to be e-greedy w.r.t. Q, or to a fixed given policy
    if policy is None:
        policy = c.EGreedyPolicy(epsilon, action_values)

    # Circular memories for storing and accessing the last n steps
    states = c.NstepMemory(num_steps)
    actions = c.NstepMemory(num_steps)
    rewards = c.NstepMemory(num_steps)

    for ep in range(num_episodes):
        # Init and store S_0
        states[0], rewards[0] = env.reset(), 0

        # Select and store action A_0
        actions[0] = policy.sample(states[0])

        # T (terminal_step) starts at infinity
        step, terminal_step = 0, float('inf')
        episode_rewards.append(0)

        while True:
            if step < terminal_step:
                # Take action A_t; observe and store R_{t+1}, S_{t+1}
                states[step + 1], rewards[step + 1], done, _ = env.step(
                    actions[step])
                episode_rewards[ep] += rewards[step + 1]
                if done:
                    terminal_step = step + 1
                else:
                    actions[step + 1] = policy.sample(states[step + 1])

            # tau: the time whose state-action estimate is being updated
            update_step = step - num_steps + 1
            if update_step >= 0:
                # Calculate the n-step return G, summing rewards up to and
                # including min(tau + n, T)
                G = 0.
                for i in range(update_step + 1,
                               min(update_step + num_steps,
                                   terminal_step) + 1):
                    G += discount_rate**(i - update_step - 1) * rewards[i]
                # Bootstrap with Q(S_{tau+n}, A_{tau+n}) if the window ends
                # before the terminal step
                if update_step + num_steps < terminal_step:
                    G += discount_rate**num_steps * action_values[
                        states[update_step + num_steps],
                        actions[update_step + num_steps]]
                action_values[states[update_step],
                              actions[update_step]] += step_size * (
                    G - action_values[states[update_step],
                                      actions[update_step]])

            if update_step == terminal_step - 1:
                break
            else:
                step += 1

    return action_values, episode_rewards
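
# Usage sketch (same tabular-Gym assumptions as for sarsa above). With
# num_steps=3 each update moves Q(S_t, A_t) towards the 3-step target
#   G = R_{t+1} + g*R_{t+2} + g^2*R_{t+3} + g^3*Q(S_{t+3}, A_{t+3}),
# dropping the bootstrap term once the window crosses the terminal step:
#
#   Q, rewards = n_step_sarsa(3, env, num_episodes=300, epsilon=0.1)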


def double_qlearning(env, alpha=0.5, gamma=1, epsilon=0.1, num_episodes=100):
    """
    Returns the Q-value estimates for an environment by using the
    Double Q-Learning algorithm.

    Parameters
    ----------
    env : gym.core.Env
        OpenAI Gym Environment instance
    alpha : float
        Algorithm's learning rate
    gamma : float
        Discount for future rewards
    epsilon : float
        Probability of choosing an action randomly
    num_episodes : int
        Number of episodes for the policy iteration process

    Returns
    -------
    ActionValue
        Estimated Q (state-action) values, averaged over both estimators
    list
        List of summed rewards of each episode
    """
    # Stats tracking
    sum_rewards = []

    # Create the two independent estimators Q1 and Q2
    Q1 = c.ActionValue(u.extract_actions(env))
    Q2 = c.ActionValue(u.extract_actions(env))

    # Run for the given number of episodes
    for t in range(num_episodes):
        sum_rewards.append(0)

        # Obtain initial state
        state = env.reset()

        # Run one episode
        while True:
            # Behave e-greedily w.r.t. the combined estimate Q1 + Q2
            policy = c.EGreedyPolicy(epsilon, Q1 + Q2)
            action = policy.sample(state)

            # Take action, observe next state & reward
            next_state, reward, done, _ = env.step(action)

            # Randomly choose which estimator to update; one selects the
            # greedy action, the other evaluates it
            if np.random.rand() < .5:
                # Select next action as argmax_a Q1(S', a)
                next_action = Q1.argmax(next_state)
                # Update Q1 towards Q2's evaluation of that action
                Q1[state, action] += alpha * \
                    (reward + gamma * Q2[next_state, next_action]
                     - Q1[state, action])
            else:
                # Select next action as argmax_a Q2(S', a)
                next_action = Q2.argmax(next_state)
                # Update Q2 towards Q1's evaluation of that action
                Q2[state, action] += alpha * \
                    (reward + gamma * Q1[next_state, next_action]
                     - Q2[state, action])

            # Update state variable
            state = next_state
            sum_rewards[t] += reward

            # Finish episode when the environment signals termination
            if done:
                break

    return (Q1 + Q2) / 2, sum_rewards
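
# Usage sketch (same assumptions as above). Decoupling action selection from
# action evaluation across the two estimators reduces the maximization bias
# that plain Q-learning suffers from:
#
#   Q, rewards = double_qlearning(env, alpha=0.5, epsilon=0.1,
#                                 num_episodes=300)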


def n_step_backup_tree(num_steps, env, num_episodes, policy=None,
                       step_size=.5, discount_rate=.9, epsilon=.3):
    """
    Returns the Q-value estimates for an environment by using the
    Off-Policy N-Step Tree Backup algorithm.

    Parameters
    ----------
    num_steps : int
        Number of steps used in value function bootstrapping
    env : gym.core.Env
        OpenAI Gym Environment instance
    num_episodes : int
        Number of episodes to be run using the environment
    policy : Policy instance
        Optional target policy for which the Q-values will be estimated.
        If None, an e-greedy policy based on the estimated Q-values is used
    step_size : float
        Algorithm's learning rate
    discount_rate : float
        Discount rate used when estimating values with future rewards
    epsilon : float
        Probability of choosing an action randomly in the e-greedy policy
        created when the 'policy' parameter is None

    Returns
    -------
    ActionValue
        Estimated Q (state-action) values
    list
        List of summed rewards of each episode
    """
    episode_rewards = []

    # Init Q(s, a) arbitrarily, for all s in S, a in A(s)
    action_values = c.ActionValue(u.extract_actions(env))

    # Init pi to be e-greedy w.r.t. Q, or to a fixed given policy
    if policy is None:
        policy = c.EGreedyPolicy(epsilon, action_values)

    # Circular memories for storing and accessing the last n steps
    states = c.NstepMemory(num_steps)
    actions = c.NstepMemory(num_steps)
    old_values = c.NstepMemory(num_steps)
    td_errors = c.NstepMemory(num_steps)
    taken_policy = c.NstepMemory(num_steps)

    for ep in range(num_episodes):
        u.display_episode_log(ep + 1, num_episodes)

        # Init default values
        old_values[0], taken_policy[0] = 0, 0

        # Init and store S_0
        states[0] = env.reset()
        # Select and store action A_0
        actions[0] = policy.sample(states[0])
        # Store Q(S_0, A_0) as Q_0
        old_values[0] = action_values[states[0], actions[0]]

        # T (terminal_step) starts at infinity
        step, terminal_step = 0, float('inf')
        episode_rewards.append(0)

        while True:
            if step < terminal_step:
                # Take action A_t; observe R_{t+1} and store S_{t+1}
                states[step + 1], reward, done, _ = env.step(actions[step])
                episode_rewards[ep] += reward
                if done:
                    terminal_step = step + 1
                    td_errors[step] = reward - old_values[step]
                else:
                    # TD error bootstraps on the expected value of S_{t+1}
                    # under the target policy
                    expected_values = [
                        policy[states[step + 1], action] *
                        action_values[states[step + 1], action]
                        for action in action_values.actions
                    ]
                    td_errors[step] = reward - old_values[step] + \
                        discount_rate * sum(expected_values)

                    # Behaviour is arbitrary: select and store an action
                    # uniformly at random as A_{t+1}
                    actions[step + 1] = random.choice(action_values.actions)
                    # Store Q(S_{t+1}, A_{t+1}) as Q_{t+1}
                    old_values[step + 1] = action_values[
                        states[step + 1], actions[step + 1]]
                    # Store pi(A_{t+1} | S_{t+1}) as pi_{t+1}
                    taken_policy[step + 1] = policy[states[step + 1],
                                                    actions[step + 1]]

            # tau: the time whose state-action estimate is being updated
            update_step = step - num_steps + 1
            if update_step >= 0:
                # Accumulate the tree-backup return G from the stored TD
                # errors, weighted by the target policy's probabilities
                Z = 1.
                G = old_values[update_step]
                for k in range(update_step,
                               min(update_step + num_steps, terminal_step)):
                    G += Z * td_errors[k]
                    if k != (min(update_step + num_steps, terminal_step) - 1):
                        Z *= discount_rate * taken_policy[k + 1]
                action_values[states[update_step],
                              actions[update_step]] += step_size * (
                    G - action_values[states[update_step],
                                      actions[update_step]])

            if update_step == terminal_step - 1:
                break
            else:
                step += 1

    return action_values, episode_rewards
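

if __name__ == '__main__':
    # Minimal smoke-test sketch: runs each method on a tabular Gym task.
    # 'CliffWalking-v0' is an assumed example environment; the module is
    # written against the classic 4-tuple env.step API.
    import gym

    env = gym.make('CliffWalking-v0')
    for name, (Q, rewards) in [
            ('SARSA', sarsa(env)),
            ('Double Q-learning', double_qlearning(env)),
            ('3-step SARSA', n_step_sarsa(3, env, num_episodes=200)),
            ('3-step tree backup',
             n_step_backup_tree(3, env, num_episodes=200))]:
        print('%s: mean reward over last 10 episodes = %.1f'
              % (name, sum(rewards[-10:]) / 10))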