import sys
from typing import Iterable, Tuple

import numpy as np

# EnvSpec, EnvWithModel and Policy used below are the spec/environment/policy classes
# provided by the assignment framework; their imports are omitted in this extract.


def off_policy_mc_prediction_weighted_importance_sampling(
        env_spec: EnvSpec,
        trajs: Iterable[Iterable[Tuple[int, int, int, int]]],
        bpi: Policy,
        pi: Policy,
        initQ: np.array) -> np.array:
    nA = env_spec.nA        # Number of actions
    nS = env_spec.nS        # Number of states
    gamma = env_spec.gamma  # Discount factor

    Q = initQ.copy()              # Start from the supplied initial estimates
    C = np.zeros(shape=(nS, nA))  # Cumulative sum of importance weights per (s, a)

    for episode in range(len(trajs)):
        G = 0.0
        W = 1.0
        for t in range(len(trajs[episode]))[::-1]:
            state, action, reward, next_state = trajs[episode][t]
            G = gamma * G + reward
            C[state, action] += W
            Q[state, action] += (W / C[state, action]) * (G - Q[state, action])
            # Update weight with the per-step importance sampling ratio
            W = W * (pi.action_prob(state, action) / bpi.action_prob(state, action))
            if W == 0:
                break

    return Q
def off_policy_mc_prediction_weighted_importance_sampling(
        env_spec: EnvSpec,
        trajs: Iterable[Iterable[Tuple[int, int, int, int]]],
        bpi: Policy,
        pi: Policy,
        initQ: np.array) -> np.array:
    """
    input:
        env_spec: environment spec
        trajs: N trajectories generated using behavior policy bpi
            list in which each element is a tuple representing (s_t,a_t,r_{t+1},s_{t+1})
        bpi: behavior policy used to generate trajectories
        pi: evaluation target policy
        initQ: initial Q values; np array shape of [nS,nA]
    ret:
        Q: $q_pi$ function; numpy array shape of [nS,nA]
    """
    #####################
    # TODO: Implement Off Policy Monte-Carlo prediction algorithm using weighted importance
    # sampling (Hint: Sutton Book p. 110, every-visit implementation is fine)
    #####################
    c = np.zeros([env_spec.nS, env_spec.nA])

    for episode in trajs:
        g = 0
        w = 1
        for t in range(len(episode) - 1, -1, -1):
            if w != 0:
                st, at, rt1, st1 = episode[t]
                g = env_spec.gamma * g + rt1
                c[st, at] += w
                initQ[st, at] += w / c[st, at] * (g - initQ[st, at])
                w *= pi.action_prob(st, at) / bpi.action_prob(st, at)
            else:
                break

    return initQ
def value_prediction(env: EnvWithModel, pi: Policy, initV: np.array,
                     theta: float) -> Tuple[np.array, np.array]:
    """
    inp:
        env: environment with model information, i.e. you know transition dynamics and reward function
        pi: policy
        initV: initial V(s); numpy array shape of [nS,]
        theta: exit criteria
    return:
        V: $v_\pi$ function; numpy array shape of [nS]
        Q: $q_\pi$ function; numpy array shape of [nS,nA]
    """
    TD = env.TD
    R = env.R
    Q = np.zeros((env.spec.nS, env.spec.nA))

    while True:
        delta = 0
        for state in range(env.spec.nS):
            v = initV[state]
            sumV = 0
            for action in range(env.spec.nA):
                sumQ = sum(TD[state, action, next_state] *
                           (R[state, action, next_state] +
                            env.spec.gamma * initV[next_state])
                           for next_state in range(env.spec.nS))
                Q[state, action] = sumQ
                sumV += pi.action_prob(state, action) * sumQ
            initV[state] = sumV
            delta = max(delta, abs(v - initV[state]))
        if delta < theta:
            break

    return initV, Q
def off_policy_mc_prediction_weighted_importance_sampling(
    env_spec: EnvSpec,
    trajs: Iterable[Iterable[Tuple[int, int, int, int]]],
    bpi: Policy,
    pi: Policy,
    initQ: np.array
) -> np.array:
    """
    input:
        env_spec: environment spec
        trajs: N trajectories generated using behavior policy bpi
            list in which each element is a tuple representing (s_t,a_t,r_{t+1},s_{t+1})
        bpi: behavior policy used to generate trajectories
        pi: evaluation target policy
        initQ: initial Q values; np array shape of [nS,nA]
    ret:
        Q: $q_pi$ function; numpy array shape of [nS,nA]
    """
    #####################
    # TODO: Implement Off Policy Monte-Carlo prediction algorithm using weighted importance
    # sampling (Hint: Sutton Book p. 110, every-visit implementation is fine)
    #####################
    Q = np.array(initQ)
    C = np.zeros_like(Q)

    for traj in trajs:
        G = 0.0
        W = 1.0
        for step in reversed(traj):
            s = step[0]
            a = step[1]
            r = step[2]
            G = env_spec.gamma * G + r
            if W == 0:
                break
            C[s][a] += W
            Q[s][a] += (W / C[s][a]) * (G - Q[s][a])
            W = W * pi.action_prob(s, a) / bpi.action_prob(s, a)

    return Q
def Glearning(env: EnvWithModel, rho: Policy, alpha: float, initG: np.array,
              beta: float, num_episodes: int):
    env_spec = env.spec
    nS, nA, gamma = env_spec.nS, env_spec.nA, env_spec.gamma
    G = initG.copy()

    # Build the initial Boltzmann-like policy from the prior rho and the current G values
    pol = np.zeros((nS, nA))
    for state in range(nS):
        elts = [0 for i in range(nA)]
        for action in range(nA):
            elts[action] = rho.action_prob(state, action) * np.exp(-beta * G[state, action])
        pol[state] = np.array(elts) / np.sum(np.array(elts))
    pi = ArbitraryPolicy(nS, nA, pol)

    rewards = []
    for _ in range(num_episodes):
        state = env.reset()
        done = False
        totalReward = 0
        t = 0
        while not done:
            action = pi.action(state)
            new_state, reward, done = env.step(action)
            cost = -reward

            # G-learning update: soft-minimum of next-state costs under the prior rho
            new_amount = 0
            for i in range(nA):
                new_amount += rho.action_prob(new_state, i) * np.exp(-beta * G[new_state][i])
            update = cost - (gamma / beta) * np.log(new_amount)
            G[state][action] = (1 - alpha) * G[state][action] + alpha * update

            # Refresh the policy at the updated state
            elts = [0 for action in range(nA)]
            for i in range(nA):
                elts[i] = rho.action_prob(state, i) * np.exp(-beta * G[state][i])
            pol[state] = np.array(elts) / np.sum(np.array(elts))
            pi = ArbitraryPolicy(nS, nA, pol)

            state = new_state
            totalReward += reward * (gamma**t)
            t += 1
        rewards.append(totalReward)

    return G, pi, rewards
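# The ArbitraryPolicy class used above is not defined in this extract. Below is a minimal
# sketch of a tabular stochastic policy with the interface inferred from usage (constructor
# taking nS, nA and an [nS, nA] probability table; action_prob and action methods); the
# class body is an assumption, not part of the original assignment code.
class ArbitraryPolicy(Policy):
    def __init__(self, nS: int, nA: int, prob: np.array):
        self.nS = nS
        self.nA = nA
        self.prob = prob  # prob[s, a] = probability of taking action a in state s

    def action_prob(self, state: int, action: int) -> float:
        return self.prob[state, action]

    def action(self, state: int) -> int:
        # Sample an action according to the stored distribution for this state
        return int(np.random.choice(self.nA, p=self.prob[state]))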
def off_policy_mc_prediction_ordinary_importance_sampling(
        env_spec: EnvSpec,
        trajs: Iterable[Iterable[Tuple[int, int, int, int]]],
        bpi: Policy,
        pi: Policy,
        initQ: np.array) -> np.array:
    """
    input:
        env_spec: environment spec
        trajs: N trajectories generated using behavior policy bpi
            list in which each element is a tuple representing (s_t,a_t,r_{t+1},s_{t+1})
        bpi: behavior policy used to generate trajectories
        pi: evaluation target policy
        initQ: initial Q values; np array shape of [nS,nA]
    ret:
        Q: $q_pi$ function; numpy array shape of [nS,nA]
    """
    #####################
    # TODO: Implement Off Policy Monte-Carlo prediction algorithm using ordinary importance
    # sampling (Hint: Sutton Book p. 109, every-visit implementation is fine)
    #####################
    Q = initQ.copy()
    N = np.zeros((env_spec.nS, env_spec.nA))  # visit counts per (s, a)

    for e in trajs:
        rho = 1  # importance ratio for the steps after t
        G = 0
        # This loop moves backwards from the final step (s_{T-1}, a_{T-1}, r_T, s_T) to step 0
        for t in range(len(e) - 1, -1, -1):
            e_t = e[t]  # current time step t of current episode e
            s, a, r, s_prime = e_t
            G = r + env_spec.gamma * G
            N[s, a] += 1
            # No need to handle a zero denominator: N[s, a] is always >= 1 here
            Q[s, a] += (rho * G - Q[s, a]) / N[s, a]
            rho = rho * pi.action_prob(s, a) / bpi.action_prob(s, a)

    return Q
def off_policy_mc_prediction_ordinary_importance_sampling(
        env_spec: EnvSpec,
        trajs: Iterable[Iterable[Tuple[int, int, int, int]]],
        bpi: Policy,
        pi: Policy,
        initQ: np.array) -> np.array:
    """
    input:
        env_spec: environment spec
        trajs: N trajectories generated using behavior policy bpi
            list in which each element is a tuple representing (s_t,a_t,r_{t+1},s_{t+1})
        bpi: behavior policy used to generate trajectories
        pi: evaluation target policy
        initQ: initial Q values; np array shape of [nS,nA]
    ret:
        Q: $q_pi$ function; numpy array shape of [nS,nA]
    """
    #####################
    # TODO: Implement Off Policy Monte-Carlo prediction algorithm using ordinary importance
    # sampling (Hint: Sutton Book p. 109, every-visit implementation is fine)
    #####################
    nS = env_spec.nS
    nA = env_spec.nA
    gamma = env_spec.gamma

    Q = initQ
    tau = np.zeros((nS, nA))  # visit counts per (s, a)

    for eps_tr in trajs:
        G = 0
        W = 1
        for step in reversed(eps_tr):
            G = gamma * G + step[2]
            s = step[0]
            a = step[1]
            tau[s, a] = tau[s, a] + 1
            # Ordinary importance sampling: average the weighted returns W * G
            Q[s, a] = Q[s, a] + (W * G - Q[s, a]) / tau[s, a]
            W = W * pi.action_prob(s, a) / bpi.action_prob(s, a)

    return Q
def off_policy_mc_prediction_weighted_importance_sampling(
        env_spec: EnvSpec,
        trajs: Iterable[Iterable[Tuple[int, int, int, int]]],
        bpi: Policy,
        pi: Policy,
        initQ: np.array) -> np.array:
    """
    input:
        env_spec: environment spec
        trajs: N trajectories generated using behavior policy bpi
            list in which each element is a tuple representing (s_t,a_t,r_{t+1},s_{t+1})
        bpi: behavior policy used to generate trajectories
        pi: evaluation target policy
        initQ: initial Q values; np array shape of [nS,nA]
    ret:
        Q: $q_pi$ function; numpy array shape of [nS,nA]
    """
    #####################
    # TODO: Implement Off Policy Monte-Carlo prediction algorithm using weighted importance
    # sampling (Hint: Sutton Book p. 110, every-visit implementation is fine)
    #####################
    Q = initQ
    C = np.zeros((env_spec.nS, env_spec.nA))

    for epi in trajs:
        G = 0
        W = 1
        for (state, action, reward, next_state) in reversed(epi):
            if W != 0:
                G = (env_spec.gamma * G) + reward
                C[state][action] += W
                Q[state][action] += ((W / C[state][action]) * (G - Q[state][action]))
                W = W * (pi.action_prob(state, action) / bpi.action_prob(state, action))

    return Q
def off_policy_mc_prediction_weighted_importance_sampling(
        env_spec: EnvSpec,
        trajs: Iterable[Iterable[Tuple[int, int, int, int]]],
        bpi: Policy,
        pi: Policy,
        initQ: np.array) -> np.array:
    """
    input:
        env_spec: environment spec
        trajs: N trajectories generated using behavior policy bpi
            list in which each element is a tuple representing (s_t,a_t,r_{t+1},s_{t+1})
        bpi: behavior policy used to generate trajectories
        pi: evaluation target policy
        initQ: initial Q values; np array shape of [nS,nA]
    ret:
        Q: $q_pi$ function; numpy array shape of [nS,nA]
    """
    #####################
    # TODO: Implement Off Policy Monte-Carlo prediction algorithm using weighted importance
    # sampling (Hint: Sutton Book p. 110, every-visit implementation is fine)
    #####################
    C = np.zeros((env_spec.nS, env_spec.nA))
    Q = initQ.copy()

    for e in trajs:
        G = 0
        W = 1
        # This loop moves backwards from the final step to step 0
        for t in range(len(e) - 1, -1, -1):
            if W == 0:  # If W is zero then Q is no longer updated
                break
            e_t = e[t]  # current time step t of current episode e
            s, a, r, s_prime = e_t
            G = r + env_spec.gamma * G
            C[s, a] += W  # C sums the weights used to update Q[s,a]
            Q[s, a] += (W / C[s, a]) * (G - Q[s, a])
            W = W * pi.action_prob(s, a) / bpi.action_prob(s, a)

    return Q
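# A minimal smoke test for the MC prediction routines above, using duck-typed stand-ins for
# EnvSpec and Policy (only the attributes and methods the functions actually access are
# provided). The one-state MDP, the hand-written trajectories, and the stub classes are
# illustrative assumptions, not part of the original assignment code; it exercises whichever
# definitions of the two prediction functions are in scope at this point.
if __name__ == "__main__":
    from types import SimpleNamespace

    spec = SimpleNamespace(nS=1, nA=2, gamma=1.0)  # single state, two actions, no discounting

    class UniformPolicy:
        """Behavior policy: picks each of the two actions with probability 0.5."""
        def action_prob(self, state, action):
            return 0.5

        def action(self, state):
            return int(np.random.choice(2))

    class Action0Policy:
        """Target policy: always takes action 0."""
        def action_prob(self, state, action):
            return 1.0 if action == 0 else 0.0

        def action(self, state):
            return 0

    # Two one-step trajectories from state 0: action 0 yields reward 1, action 1 yields reward 0
    trajs = [[(0, 0, 1, 0)], [(0, 1, 0, 0)]]

    Q_wis = off_policy_mc_prediction_weighted_importance_sampling(
        spec, trajs, UniformPolicy(), Action0Policy(), np.zeros((1, 2)))
    Q_ois = off_policy_mc_prediction_ordinary_importance_sampling(
        spec, trajs, UniformPolicy(), Action0Policy(), np.zeros((1, 2)))
    # Under the target policy only action 0 is ever taken, so both estimates of Q[0, 0]
    # should equal 1 and Q[0, 1] should stay at 0.
    print(Q_wis, Q_ois)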
def off_policy_n_step_sarsa(env_spec: EnvSpec,
                            trajs: Iterable[Iterable[Tuple[int, int, int, int]]],
                            bpi: Policy, n: int, alpha: float,
                            initQ: np.array) -> Tuple[np.array, Policy]:
    """
    input:
        env_spec: environment spec
        trajs: N trajectories generated using behavior policy bpi
            list in which each element is a tuple representing (s_t,a_t,r_{t+1},s_{t+1})
        bpi: behavior policy used to generate trajectories
        n: how many steps?
        alpha: learning rate
        initQ: initial Q values; np array shape of [nS,nA]
    ret:
        Q: $q_star$ function; numpy array shape of [nS,nA]
        policy: $pi_star$; instance of policy class
    """
    #####################
    # TODO: Implement Off Policy n-Step SARSA algorithm
    # sampling (Hint: Sutton Book p. 149)
    #####################
    Q = np.array(initQ)
    eps = 0.1
    pi = egreedy_policy(Q, eps)

    for traj in trajs:
        T = len(traj)
        for t in range(T + n - 1):
            tau = t - n + 1
            if tau >= 0:
                # Importance sampling ratio over the actions taken after tau
                rho = 1
                for i in range(tau + 1, min(tau + n, T - 1) + 1):
                    rho *= pi.action_prob(traj[i][0], traj[i][1]) / bpi.action_prob(
                        traj[i][0], traj[i][1])
                # n-step return: discounted rewards, plus a bootstrap term if we stop early
                G = 0
                for i in range(tau + 1, min(tau + n, T) + 1):
                    G += env_spec.gamma**(i - tau - 1) * traj[i - 1][2]
                if tau + n < T:
                    G += env_spec.gamma**n * Q[traj[tau + n][0]][traj[tau + n][1]]
                Q[traj[tau][0]][traj[tau][1]] += alpha * rho * (
                    G - Q[traj[tau][0]][traj[tau][1]])
                pi = egreedy_policy(Q, eps)

    # Extract the greedy policy from the learned Q values
    assignment = np.zeros(env_spec.nS)
    for i in range(env_spec.nS):
        assignment[i] = np.argmax(Q[i])

    return Q, optimal_policy(assignment)
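# The helpers egreedy_policy and optimal_policy referenced above are not defined in this
# extract. Minimal sketches consistent with how they are called (egreedy_policy(Q, eps)
# builds an epsilon-greedy Policy from a Q table; optimal_policy(assignment) wraps a
# per-state greedy-action array) follow; the names and bodies are inferred from usage and
# are assumptions, not the original helpers.
def egreedy_policy(Q: np.array, eps: float) -> Policy:
    nS, nA = Q.shape
    probs = np.full((nS, nA), eps / nA)
    probs[np.arange(nS), Q.argmax(axis=1)] += 1.0 - eps  # extra mass on the greedy action

    class EGreedy(Policy):
        def action_prob(self, state, action):
            return probs[state, action]

        def action(self, state):
            return int(np.random.choice(nA, p=probs[state]))

    return EGreedy()


def optimal_policy(assignment: np.array) -> Policy:
    class Deterministic(Policy):
        def action_prob(self, state, action):
            return 1.0 if action == int(assignment[state]) else 0.0

        def action(self, state):
            return int(assignment[state])

    return Deterministic()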
def value_prediction(env: EnvWithModel, pi: Policy, initV: np.array,
                     theta: float) -> Tuple[np.array, np.array]:
    """
    inp:
        env: environment with model information, i.e. you know transition dynamics and reward function
        pi: policy
        initV: initial V(s); numpy array shape of [nS,]
        theta: exit criteria
    return:
        V: $v_\pi$ function; numpy array shape of [nS]
        Q: $q_\pi$ function; numpy array shape of [nS,nA]
    """
    n_s = env.spec.nS
    n_a = env.spec.nA
    trans_mat = env.TD
    reward_matrix = env.R

    delta = theta
    v = initV
    q = np.zeros((n_s, n_a))

    while delta >= theta:
        delta = 0
        for s in range(n_s):
            current_state_val = v[s]
            result = 0
            for a in range(n_a):
                trans = trans_mat[s][a]
                sum_val = 0
                for i in range(len(trans)):
                    next_state = i
                    prob = trans[i]
                    sum_val += (prob * (reward_matrix[s][a][next_state] +
                                        (env.spec.gamma * v[next_state])))
                q[s][a] = sum_val
                result += pi.action_prob(s, a) * sum_val
            v[s] = result
            delta = max(delta, abs(v[s] - current_state_val))

    V = v
    Q = q
    return V, Q
def value_prediction(env: EnvWithModel, pi: Policy, initV: np.array,
                     theta: float) -> Tuple[np.array, np.array]:
    nA = env.spec.nA        # Number of actions
    nS = env.spec.nS        # Number of states
    V = np.zeros(nS)
    R = env.R               # Reward function
    T = env.TD              # State transition function
    gamma = env.spec.gamma  # Discount factor

    # Iteratively sweep through all states
    while True:
        delta = 0
        for s in range(nS):
            value = 0
            for a in range(nA):
                prob = pi.action_prob(s, a)
                for sp in range(nS):
                    value += (prob * (T[s, a, sp] * (R[s, a, sp] + gamma * initV[sp])))
            V[s] = value
            delta = max(delta, abs(initV[s] - V[s]))
        # Check for convergence
        if delta < theta:
            break
        initV = V.copy()  # Must make an explicit copy

    # With the value function having converged, compute the action values Q(s, a)
    # under pi by a one-step backup of V.
    Q = np.zeros(shape=(nS, nA))
    for s in range(nS):
        for a in range(nA):
            for sp in range(nS):
                Q[s, a] += T[s, a, sp] * (R[s, a, sp] + gamma * V[sp])

    return V, Q
def value_prediction(env: EnvWithModel, pi: Policy, initV: np.array,
                     theta: float) -> Tuple[np.array, np.array]:
    """
    inp:
        env: environment with model information, i.e. you know transition dynamics and reward function
        pi: policy
        initV: initial V(s); numpy array shape of [nS,]
        theta: exit criteria
    return:
        V: $v_\pi$ function; numpy array shape of [nS]
        Q: $q_\pi$ function; numpy array shape of [nS,nA]
    """
    #####################
    # TODO: Implement Value Prediction Algorithm (Hint: Sutton Book p.75)
    #####################
    nS = env.spec.nS
    nA = env.spec.nA
    gamma = env.spec.gamma
    TD = env.TD
    R = env.R

    V = initV
    Q = np.zeros((nS, nA))

    while True:
        delta = 0
        for i in range(nS):
            prevVal = V[i]
            action_sum_temp = 0
            for j in range(nA):
                act_pr = pi.action_prob(i, j)
                action_sum_temp = action_sum_temp + act_pr * sum(TD[i, j, :] * (R[i, j, :] + gamma * V))
            V[i] = action_sum_temp
            delta = max(delta, abs(V[i] - prevVal))
        if delta < theta:
            break

    for s in range(nS):
        for a in range(nA):
            Q[s, a] = sum(TD[s, a, :] * (R[s, a, :] + gamma * V))

    return V, Q
def value_prediction(env: EnvWithModel, pi: Policy, initV: np.array,
                     theta: float) -> Tuple[np.array, np.array]:
    """
    inputs:
        env: environment with model information, i.e. you know transition dynamics and reward function
        pi: policy
        initV: initial V(s); numpy array shape of [nS,]
        theta: exit criteria
    return:
        V: $v_\pi$ function; numpy array shape of [nS]
        Q: $q_\pi$ function; numpy array shape of [nS,nA]
    """
    #####################
    # TODO: Implement Value Prediction Algorithm (Hint: Sutton Book p.75)
    #####################
    V = initV.copy()  # Arbitrary except terminal states must be 0
    delta = theta

    while delta >= theta:
        delta = 0
        for s in range(env.spec.nS):
            v = V[s]
            # action probabilities: pi.action_prob(s, a)
            # transition dynamics: env.TD[state, action, state_t+1] is an array of probabilities (size nS)
            # rewards: env.R[state, action, state_t+1]
            update = 0
            for a in range(env.spec.nA):
                update += pi.action_prob(s, a) * np.sum(
                    env.TD[s, a, :] * (env.R[s, a, :] + env.spec.gamma * V))
            V[s] = update
            delta = max(delta, np.abs(v - V[s]))

    Q = np.zeros((env.spec.nS, env.spec.nA))
    for s in range(env.spec.nS):
        for a in range(env.spec.nA):
            Q[s, a] = np.sum(env.TD[s, a, :] * (env.R[s, a, :] + env.spec.gamma * V))
    # Note: summing over the actions in pi(a|s)*Q(s,a) gives V(s)

    return V, Q
def value_prediction(env: EnvWithModel, pi: Policy, initV: np.array,
                     theta: float) -> Tuple[np.array, np.array]:
    """
    inp:
        env: environment with model information, i.e. you know transition dynamics and reward function
        pi: policy
        initV: initial V(s); numpy array shape of [nS,]
        theta: exit criteria
    return:
        V: $v_\pi$ function; numpy array shape of [nS]
        Q: $q_\pi$ function; numpy array shape of [nS,nA]
    """
    #####################
    # TODO: Implement Value Prediction Algorithm (Hint: Sutton Book p.75)
    #####################
    num_states = env.spec.nS
    num_actions = env.spec.nA
    V = np.array(initV)
    Q = np.zeros((num_states, num_actions))
    R = env.R
    TD = env.TD

    change = theta + 1
    while change > theta:
        change = 0
        for i in range(num_states):
            old_v = V[i]
            new_v = 0
            for j in range(num_actions):
                sum_a = 0
                for k in range(num_states):
                    sum_a += TD[i, j, k] * (R[i, j, k] + env.spec.gamma * V[k])
                new_v += pi.action_prob(i, j) * sum_a
            change = max(change, abs(new_v - old_v))
            V[i] = new_v

    for i in range(num_states):
        for j in range(num_actions):
            for k in range(num_states):
                Q[i][j] += TD[i, j, k] * (R[i, j, k] + env.spec.gamma * V[k])

    return V, Q
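# A minimal sanity check for value_prediction, using a duck-typed stand-in for EnvWithModel:
# only .spec (nS, nA, gamma), .TD and .R are provided, since those are the only members the
# routines above touch. The two-state chain, the single-action policy, and the stub names
# are illustrative assumptions, not part of the original assignment code.
if __name__ == "__main__":
    from types import SimpleNamespace

    nS, nA, gamma = 2, 1, 0.9
    TD = np.zeros((nS, nA, nS))
    R = np.zeros((nS, nA, nS))
    TD[0, 0, 1] = 1.0   # state 0 always moves to state 1 ...
    R[0, 0, 1] = 1.0    # ... collecting reward 1
    TD[1, 0, 1] = 1.0   # state 1 is absorbing with zero reward
    env = SimpleNamespace(spec=SimpleNamespace(nS=nS, nA=nA, gamma=gamma), TD=TD, R=R)

    class SingleActionPolicy:
        def action_prob(self, state, action):
            return 1.0

        def action(self, state):
            return 0

    V, Q = value_prediction(env, SingleActionPolicy(), np.zeros(nS), 1e-8)
    # Expected: V[1] = 0 (absorbing, zero reward) and V[0] = 1 + gamma * V[1] = 1
    print(V, Q)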
def off_policy_n_step_sarsa(env_spec: EnvSpec,
                            trajs: Iterable[Iterable[Tuple[int, int, int, int]]],
                            bpi: Policy, n: int, alpha: float,
                            initQ: np.array) -> Tuple[np.array, Policy]:
    """
    input:
        env_spec: environment spec
        trajs: N trajectories generated using behavior policy bpi
            list in which each element is a tuple representing (s_t,a_t,r_{t+1},s_{t+1})
        bpi: behavior policy used to generate trajectories
        n: how many steps?
        alpha: learning rate
        initQ: initial Q values; np array shape of [nS,nA]
    ret:
        Q: $q_star$ function; numpy array shape of [nS,nA]
        policy: $pi_star$; instance of policy class
    """
    #####################
    # TODO: Implement Off Policy n-Step SARSA algorithm
    # sampling (Hint: Sutton Book p. 149)
    #####################
    class PiStar(Policy):
        def __init__(self, optActionProb, optPolicy):
            self.optActionProb = optActionProb
            self.optPolicy = optPolicy

        def action_prob(self, state, action):
            return self.optActionProb[state, action]

        def action(self, state):
            return self.optPolicy[state]

    Q = initQ.copy()

    for e in trajs:
        # Now we're inside the episode, a list of transitions (list of tuples)
        T = np.inf
        tau = -1
        R = np.zeros(len(e) + 1)           # zero added since R_0 doesn't exist
        A = np.zeros(len(e), dtype=int)    # There is exactly one action per tuple
        S = np.zeros(len(e) + 1, dtype=int)
        t = 0
        while tau < (T - 1):
            # Now we iterate each transition in this episode
            if t < T:
                # Extract transition information
                s, a, r, s_prime = e[t]
                S[t] = int(s)
                A[t] = int(a)
                R[t + 1] = r
                S[t + 1] = int(s_prime)
                # Check if S_{t+1} is terminal
                if t == (len(e) - 1):
                    T = t + 1
                else:
                    A[t + 1] = e[t + 1][1]  # Store next action
            tau = int(t - n + 1)
            if tau >= 0:
                # Calculate rho (importance ratio) and G (return estimate)
                i = tau + 1
                rho = 1
                while i <= min(tau + n, T - 1):
                    # Target policy pi probability (greedy policy: 1 if Q[S_i, A_i] is max)
                    piprob = int(Q[S[i], A[i]] == np.max(Q[S[i], :]))
                    rho = rho * piprob / bpi.action_prob(S[i], A[i])
                    i += 1
                i = tau + 1
                G = 0
                while i <= min(tau + n, T):
                    # Sum of discounted rewards up to n steps ahead
                    G += env_spec.gamma**(i - tau - 1) * R[i]
                    i += 1
                # Add estimated Q value n steps ahead if we don't hit termination after n steps
                if tau + n < T:
                    G += (env_spec.gamma**n) * Q[S[tau + n], A[tau + n]]
                Q[S[tau], A[tau]] += alpha * rho * (G - Q[S[tau], A[tau]])
            t += 1

    optActionProb = np.zeros((env_spec.nS, env_spec.nA))
    optPolicy = np.zeros(env_spec.nS)
    for s in range(env_spec.nS):
        a = np.argmax(Q[s, :])
        optActionProb[s, a] = 1
        optPolicy[s] = a
    pi = PiStar(optActionProb, optPolicy)

    return Q, pi
def value_prediction(env: EnvWithModel, pi: Policy, initV: np.array,
                     theta: float) -> Tuple[np.array, np.array]:
    """
    inp:
        env: environment with model information, i.e. you know transition dynamics and reward function
        pi: policy
        initV: initial V(s); numpy array shape of [nS,]
        theta: exit criteria
    return:
        V: $v_\pi$ function; numpy array shape of [nS]
        Q: $q_\pi$ function; numpy array shape of [nS,nA]
    """
    Q = np.zeros((env.spec.nS, env.spec.nA))

    delta2 = float('inf')
    while delta2 >= theta:
        delta = 0.0
        for state in range(env.spec.nS):
            v = initV[state]
            piActionState = 0
            for action in range(env.spec.nA):
                probActionState = pi.action_prob(state, action)
                for sPrime in range(env.spec.nS):
                    # current reward for state
                    r = env.R[state, action, sPrime]
                    # bellman equation
                    piActionState += probActionState * env.TD[state, action, sPrime] * (
                        r + env.spec.gamma * initV[sPrime])
            # update value prediction
            initV[state] = piActionState
            # update delta
            delta = max(delta, abs(v - initV[state]))
        delta2 = delta

    # update Q values
    delta2 = float('inf')
    while delta2 >= theta:
        delta = 0.0
        for state in range(env.spec.nS):
            for action in range(env.spec.nA):
                # old q value
                q = Q[state][action]
                stateActionValue = 0
                for sPrime in range(env.spec.nS):
                    r = env.R[state, action, sPrime]
                    stateActionValue += env.TD[state, action, sPrime] * (
                        r + env.spec.gamma * initV[sPrime])
                Q[state][action] = stateActionValue
                delta = max(delta, abs(q - Q[state][action]))
        delta2 = delta

    V = initV
    return V, Q
def off_policy_n_step_sarsa(env_spec: EnvSpec,
                            trajs: Iterable[Iterable[Tuple[int, int, int, int]]],
                            bpi: Policy, n: int, alpha: float,
                            initQ: np.array) -> Tuple[np.array, Policy]:
    """
    input:
        env_spec: environment spec
        trajs: N trajectories generated using behavior policy bpi
            list in which each element is a tuple representing (s_t,a_t,r_{t+1},s_{t+1})
        bpi: behavior policy used to generate trajectories
        n: how many steps?
        alpha: learning rate
        initQ: initial Q values; np array shape of [nS,nA]
    ret:
        Q: $q_star$ function; numpy array shape of [nS,nA]
        policy: $pi_star$; instance of policy class
    """
    #####################
    # TODO: Implement Off Policy n-Step SARSA algorithm
    # sampling (Hint: Sutton Book p. 149)
    #####################
    class OptimalPolicy(Policy):
        def __init__(self, OptActionProb, OptAction):
            self.OptActionProb = OptActionProb
            self.OptAction = OptAction

        def action_prob(self, state, action):
            return self.OptActionProb[state, action]

        def action(self, state):
            return self.OptAction[state]

    nS = env_spec.nS
    nA = env_spec.nA
    gamma = env_spec.gamma

    Q = initQ

    # Precompute powers of gamma: gamma_vec[i] = gamma**i
    gamma_vec = np.zeros(n + 1)
    gamma_vec[0] = 1
    for i in range(n):
        gamma_vec[i + 1] = gamma_vec[i] * gamma

    OptActionProb = (1 / nA) * np.ones((nS, nA))
    OptAction = np.zeros(nS)

    for eps_tr in trajs:
        T = len(eps_tr) - 1
        for tau, step_tr in enumerate(eps_tr):
            rho = 1
            if tau + n <= T:
                # Full n-step return with a bootstrap from Q at step tau + n
                rewards = np.asarray(eps_tr[tau:tau + n])[:, 2]
                Qs_prime = Q[eps_tr[tau + n][0], eps_tr[tau + n][1]]
                G = sum(gamma_vec * np.append(rewards, Qs_prime))
                for i in range(n):
                    s = eps_tr[tau + i + 1][0]
                    a = eps_tr[tau + i + 1][1]
                    rho = rho * OptActionProb[s, a] / bpi.action_prob(s, a)
            else:
                # Tail of the episode: fewer than n rewards remain, no bootstrap term
                rewards = np.asarray(eps_tr[tau:tau + n])[:, 2]
                rewlen = len(rewards)
                gamma_vec_modf = gamma_vec[0:rewlen]
                G = sum(gamma_vec_modf * rewards)
                for i in range(rewlen - 1):
                    s = eps_tr[tau + i + 1][0]
                    a = eps_tr[tau + i + 1][1]
                    rho = rho * OptActionProb[s, a] / bpi.action_prob(s, a)

            Q[step_tr[0], step_tr[1]] = Q[step_tr[0], step_tr[1]] + alpha * rho * (
                G - Q[step_tr[0], step_tr[1]])

            # Make the target policy greedy with respect to the updated Q values
            s_a_vals = Q[step_tr[0], :]
            OptAction[step_tr[0]] = s_a_vals.argmax()
            OptActionProb[step_tr[0], :] = 0
            OptActionProb[step_tr[0], OptAction[step_tr[0]].astype(int)] = 1

    pi = OptimalPolicy(OptActionProb, OptAction)
    return Q, pi
def off_policy_n_step_sarsa(env_spec: EnvSpec,
                            trajs: Iterable[Iterable[Tuple[int, int, int, int]]],
                            bpi: Policy, n: int, alpha: float,
                            initQ: np.array) -> Tuple[np.array, Policy]:
    """
    input:
        env_spec: environment spec
        trajs: N trajectories generated using behavior policy bpi
            list in which each element is a tuple representing (s_t,a_t,r_{t+1},s_{t+1})
        bpi: behavior policy used to generate trajectories
        n: how many steps?
        alpha: learning rate
        initQ: initial Q values; np array shape of [nS,nA]
    ret:
        Q: $q_star$ function; numpy array shape of [nS,nA]
        policy: $pi_star$; instance of policy class
    """
    #####################
    # TODO: Implement Off Policy n-Step SARSA algorithm
    # sampling (Hint: Sutton Book p. 149)
    #####################
    Q = initQ
    policy = QPolicy(Q)

    for epi in trajs:
        T = float('inf')
        tau = 0
        t = 0
        reward = []
        state = []
        action = []
        state.append(epi[t][0])
        action.append(epi[t][1])

        while tau != T - 1:
            if t < T:
                reward.append(epi[t][2])
                state.append(epi[t][3])
                if t == len(epi) - 1:
                    T = t + 1
                else:
                    action.append(epi[t + 1][1])
            tau = t - n + 1
            if tau >= 0:
                G = 0
                rho = 1
                for i in range(tau + 1, min(tau + n, T) + 1):
                    G += (env_spec.gamma**(i - tau - 1)) * reward[i - 1]
                for j in range(tau + 1, min(tau + n, T - 1) + 1):
                    rho = rho * (policy.action_prob(state[j], action[j]) /
                                 bpi.action_prob(state[j], action[j]))
                if tau + n < T:
                    G += ((env_spec.gamma**n) * Q[state[tau + n]][action[tau + n]])
                Q[state[tau]][action[tau]] += (alpha * rho * (G - Q[state[tau]][action[tau]]))
            t += 1

    pi = policy
    return Q, pi
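# QPolicy is referenced above but not defined in this extract. A minimal greedy-with-respect-
# to-Q sketch matching how it is used (constructed from a Q table, queried via action_prob)
# is given below; note it reads the live Q array, so its probabilities change as Q is
# updated. The class body is an assumption inferred from usage.
class QPolicy(Policy):
    def __init__(self, Q: np.array):
        self.Q = Q

    def action_prob(self, state: int, action: int) -> float:
        return 1.0 if action == int(np.argmax(self.Q[state])) else 0.0

    def action(self, state: int) -> int:
        return int(np.argmax(self.Q[state]))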
def off_policy_n_step_sarsa(
    env_spec: EnvSpec,
    trajs: Iterable[Iterable[Tuple[int, int, int, int]]],
    bpi: Policy,
    n: int,
    alpha: float,
    initQ: np.array
) -> Tuple[np.array, Policy]:
    """
    input:
        env_spec: environment spec
        trajs: N trajectories generated using behavior policy bpi
            list in which each element is a tuple representing (s_t,a_t,r_{t+1},s_{t+1})
        bpi: behavior policy used to generate trajectories
        n: how many steps?
        alpha: learning rate
        initQ: initial Q values; np array shape of [nS,nA]
    ret:
        Q: $q_star$ function; numpy array shape of [nS,nA]
        policy: $pi_star$; instance of policy class
    """
    # greedy policy (learned target policy)
    pi = GreedyPolicy(env_spec.nS, env_spec.nA)

    # loop for each episode
    for episode in trajs:
        T = sys.maxsize
        tau = 0             # time of the estimate being updated
        t = 0
        while tau != T - 1:
            # reached last state in episode
            if t < T and t == len(episode) - 1:
                T = t + 1
            tau = t - n + 1
            # reached n-step
            if tau >= 0:
                # importance sampling ratio over the actions taken after tau
                rho = np.array([pi.action_prob(episode[i][0], episode[i][1]) /
                                bpi.action_prob(episode[i][0], episode[i][1])
                                for i in range(tau + 1, min(tau + n, T - 1) + 1)]).prod()
                # n-step return; episode[i][2] stores R_{i+1}
                g = np.array([pow(env_spec.gamma, i - tau) * episode[i][2]
                              for i in range(tau, min(tau + n, T))]).sum()
                if tau + n < T:
                    g = g + pow(env_spec.gamma, n) * initQ[episode[tau + n][0],
                                                           episode[tau + n][1]]
                tauState = episode[tau][0]
                tauAction = episode[tau][1]
                initQ[tauState, tauAction] = initQ[tauState, tauAction] + \
                    (alpha * rho) * (g - initQ[tauState, tauAction])
                # keep the target policy greedy with respect to the updated Q values
                pi.p[tauState, :] = 0.0
                pi.p[tauState, np.argmax(initQ[tauState, :])] = 1.0
            t += 1

    Q = initQ
    return Q, pi
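# GreedyPolicy is used above but not defined in this extract. The code relies on a mutable
# probability table pi.p that it overwrites as Q improves, so a plausible minimal sketch is
# a tabular policy that starts uniform; the class body and the uniform initialization are
# assumptions inferred from usage, not the original class.
class GreedyPolicy(Policy):
    def __init__(self, nS: int, nA: int):
        self.p = np.full((nS, nA), 1.0 / nA)  # start uniform; updated in place by the caller

    def action_prob(self, state: int, action: int) -> float:
        return self.p[state, action]

    def action(self, state: int) -> int:
        return int(np.argmax(self.p[state]))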