import numpy as np


def extract_policy(env, v, gamma):
    """
    Extract the optimal policy given the optimal value function.

    Inputs:
        env: OpenAI Gym environment.
            env.P: dictionary
                P[state][action] is a list of tuples (probability, nextstate, reward, terminal)
                    probability: float
                    nextstate: int
                    reward: float
                    terminal: boolean
            env.nS: int
                number of states
            env.nA: int
                number of actions
        v: numpy.ndarray
            value function
        gamma: float
            Discount factor. Number in range [0, 1)

    Outputs:
        policy: numpy.ndarray
    """
    policy = np.zeros(env.nS, dtype=np.int32)
    ############################
    # Greedy policy: pick the argmax action of the one-step Q-values in every state.
    q = action_evaluation(env, gamma, v)
    policy = np.apply_along_axis(np.argmax, 1, q)
    ############################
    return policy
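# Every function in this file relies on an action_evaluation(env, gamma, v)
# helper that is not defined here. The sketch below is an assumption inferred
# from how it is called: it returns an (nS, nA) array q with the one-step
# lookahead values q[s, a] = sum over s' of P(s'|s, a) * (r + gamma * v[s']),
# where terminal successors contribute no future value. The original helper
# may handle terminal states differently.
def action_evaluation(env, gamma, v):
    q = np.zeros((env.nS, env.nA))
    for s in range(env.nS):
        for a in range(env.nA):
            for probability, nextstate, reward, terminal in env.P[s][a]:
                # Expected immediate reward plus discounted value of the successor.
                q[s, a] += probability * (reward + gamma * v[nextstate] * (not terminal))
    return q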
def value_iteration(env, gamma, max_iteration, theta):
    """
    Implement the value iteration algorithm.

    Inputs:
        env: OpenAI Gym environment.
            env.P: dictionary
                the transition probabilities of the environment
                P[state][action] is a list of tuples. Each tuple contains
                (probability, nextstate, reward, terminal)
            env.nS: int
                number of states
            env.nA: int
                number of actions
        gamma: float
            Discount factor.
        max_iteration: int
            The maximum number of iterations to run before stopping.
        theta: float
            The threshold of convergence.

    Outputs:
        V: numpy.ndarray
        policy: numpy.ndarray
        numIterations: int
            Number of iterations
    """
    V = np.zeros(env.nS)
    numIterations = 0

    # Implement the loop part here
    ############################
    # YOUR CODE STARTS HERE
    while True:
        delta = 0
        q = action_evaluation(env, gamma, V)
        for i in range(len(V)):
            old_v = V[i]
            # Bellman optimality backup: take the value of the best action.
            V[i] = np.max(q[i])
            diff = np.abs(old_v - V[i])
            if diff > delta:
                delta = diff
        numIterations += 1
        if delta < theta or numIterations >= max_iteration:
            break
    # YOUR CODE ENDS HERE
    ############################

    # Extract the "optimal" policy from the value function
    policy = extract_policy(env, V, gamma)

    return V, policy, numIterations
def value_iteration(env, gamma, max_iteration, theta):
    """
    Implement the value iteration algorithm (alternative implementation).

    Inputs:
        env: OpenAI Gym environment.
            env.P: dictionary
                the transition probabilities of the environment
                P[state][action] is a list of tuples. Each tuple contains
                (probability, nextstate, reward, terminal)
            env.nS: int
                number of states
            env.nA: int
                number of actions
        gamma: float
            Discount factor.
        max_iteration: int
            The maximum number of iterations to run before stopping.
        theta: float
            The threshold of convergence.

    Outputs:
        V: numpy.ndarray
        policy: numpy.ndarray
        numIterations: int
            Number of iterations
    """
    V = np.zeros(env.nS)
    numIterations = 0

    # Implement the loop part here
    ############################
    # YOUR CODE STARTS HERE
    iterating = True
    while iterating and numIterations < max_iteration:
        numIterations += 1
        diff = 0
        q = action_evaluation(env, gamma, V)
        for i, old_v in enumerate(V):
            # Bellman optimality backup for state i.
            temp = np.max(q[i])
            diff = max(diff, np.abs(temp - old_v))
            V[i] = temp
        if diff < theta:
            iterating = False
    # YOUR CODE ENDS HERE
    ############################

    # Extract the "optimal" policy from the value function
    policy = extract_policy(env, V, gamma)

    return V, policy, numIterations
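# For comparison with the two loop-based versions above, the same Bellman
# optimality backup can be written in vectorized form. This is a sketch added
# for illustration (it is not part of the original assignment code) and reuses
# the assumed action_evaluation helper.
def value_iteration_vectorized(env, gamma, max_iteration, theta):
    V = np.zeros(env.nS)
    numIterations = 0
    for numIterations in range(1, max_iteration + 1):
        q = action_evaluation(env, gamma, V)   # (nS, nA) one-step Q-values
        V_new = q.max(axis=1)                  # greedy backup for every state at once
        delta = np.max(np.abs(V_new - V))
        V = V_new
        if delta < theta:
            break
    policy = extract_policy(env, V, gamma)
    return V, policy, numIterations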
def policy_improvement(env, value_from_policy, policy, gamma):
    """
    Given the value function from a policy, improve the policy.

    Inputs:
        env: OpenAI Gym environment
            env.P: dictionary
                P[state][action] is a list of tuples (probability, nextstate, reward, terminal)
                    probability: float
                    nextstate: int
                    reward: float
                    terminal: boolean
            env.nS: int
                number of states
            env.nA: int
                number of actions
        value_from_policy: numpy.ndarray
            The value calculated from the policy
        policy: numpy.ndarray
            The previous policy.
        gamma: float
            Discount factor.

    Outputs:
        new policy: numpy.ndarray
            An array of integers. Each integer is the optimal action to take
            in that state according to the environment dynamics and the given
            value function.
        policy_stable: boolean
            True if the "optimal" policy is found, otherwise False
    """
    ############################
    # YOUR CODE STARTS HERE
    policy_stable = True
    q = action_evaluation(env, gamma, value_from_policy)
    for i in range(len(policy)):
        old_action = policy[i]
        # Act greedily with respect to the current value function.
        new_action = np.argmax(q[i])
        if old_action != new_action:
            policy_stable = False
        policy[i] = new_action
    # YOUR CODE ENDS HERE
    ############################
    return policy, policy_stable
def policy_evaluation(env, policy, gamma, theta):
    """
    Evaluate the value function of a given policy.

    Inputs:
        env: OpenAI Gym environment.
            env.P: dictionary
            env.nS: int
                number of states
            env.nA: int
                number of actions
        gamma: float
            Discount factor.
        policy: numpy.ndarray
            The policy to evaluate. Maps states to actions.
        theta: float
            The threshold of convergence.

    Outputs:
        V: numpy.ndarray
            The value function of the given policy.
    """
    ############################
    # YOUR CODE STARTS HERE
    V = np.zeros(len(policy))
    iterating = True
    while iterating:
        q = action_evaluation(env, gamma, V)
        diff = 0
        for s in range(len(V)):
            # Expected return of following the policy's action in state s.
            temp = q[s, policy[s]]
            diff = max(diff, np.abs(temp - V[s]))
            V[s] = temp
        if diff < theta:
            iterating = False
    # YOUR CODE ENDS HERE
    ############################
    return V
def policy_evaluation(env, policy, gamma, theta):
    """
    Evaluate the value function of a given policy (alternative implementation).

    Inputs:
        env: OpenAI Gym environment.
            env.P: dictionary
            env.nS: int
                number of states
            env.nA: int
                number of actions
        gamma: float
            Discount factor.
        policy: numpy.ndarray
            The policy to evaluate. Maps states to actions.
        theta: float
            The threshold of convergence.

    Outputs:
        V: numpy.ndarray
            The value function of the given policy.
    """
    ############################
    # YOUR CODE STARTS HERE
    V = np.zeros(env.nS)
    while True:
        delta = 0
        q = action_evaluation(env, gamma, V)
        for i in range(len(policy)):
            old_v = V[i]
            # Back up the value of the action prescribed by the policy.
            V[i] = q[i][policy[i]]
            diff = np.abs(old_v - V[i])
            if diff > delta:
                delta = diff
        if delta < theta:
            break
    # YOUR CODE ENDS HERE
    ############################
    return V
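# policy_evaluation and policy_improvement are meant to be alternated until the
# policy stops changing. The driver below is a hedged sketch of that loop; the
# original assignment may name or structure it differently.
def policy_iteration(env, gamma, max_iteration, theta):
    policy = np.zeros(env.nS, dtype=np.int32)   # start from the all-zeros policy
    V = np.zeros(env.nS)
    numIterations = 0
    while numIterations < max_iteration:
        numIterations += 1
        V = policy_evaluation(env, policy, gamma, theta)                    # evaluate
        policy, policy_stable = policy_improvement(env, V, policy, gamma)   # improve greedily
        if policy_stable:
            break
    return V, policy, numIterations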
def extract_policy(env, v, gamma):
    """
    Extract the optimal policy given the optimal value function (alternative implementation).

    Inputs:
        env: OpenAI Gym environment.
            env.P: dictionary
                P[state][action] is a list of tuples (probability, nextstate, reward, terminal)
                    probability: float
                    nextstate: int
                    reward: float
                    terminal: boolean
            env.nS: int
                number of states
            env.nA: int
                number of actions
        v: numpy.ndarray
            value function
        gamma: float
            Discount factor. Number in range [0, 1)

    Outputs:
        policy: numpy.ndarray
    """
    policy = np.zeros(env.nS, dtype=np.int32)
    ############################
    # YOUR CODE STARTS HERE
    q = action_evaluation(env, gamma, v)
    for i in range(len(policy)):
        # Greedy action for state i under the one-step Q-values.
        policy[i] = np.argmax(q[i])
    # YOUR CODE ENDS HERE
    ############################
    return policy
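# A hedged usage sketch. It assumes a discrete Gym environment such as
# FrozenLake; the environment id and the way nS, nA, and P are exposed depend
# on the installed Gym version (on recent releases they live on env.unwrapped,
# and nS/nA may have to be read from observation_space.n / action_space.n).
if __name__ == "__main__":
    import gym

    env = gym.make("FrozenLake-v1").unwrapped   # assumed environment id
    V, policy, n = value_iteration(env, gamma=0.9, max_iteration=1000, theta=1e-8)
    print("converged after %d sweeps" % n)
    print("state values:", V)
    print("greedy policy:", policy)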