def J_N(x, mu, N, discount_factor=0.99):
    """ computes J(mu, N, x) with the policy mu given as a callable (state -> action) """
    if N < 0:
        print("N cannot be negative!")
        return None
    elif N == 0:
        return 0
    else:
        new_state = env.f(x, mu(x))
        r = env.rewards[new_state[0]][new_state[1]]
        return r + discount_factor * J_N(new_state, mu, N - 1)
def optimal_policy(N):
    """ computes the optimal policy for the environment """
    # exact transition probabilities
    p = {}
    # exact rewards
    r = {}
    for x in env.state_space:
        for u in env.action_space:
            for next_state in env.state_space:
                p[(x, u, next_state)] = 0
            new_state = env.f(x, u)
            p[(x, u, new_state)] = 1
            r[(x, u)] = env.rewards[new_state[0]][new_state[1]]
    # compute the exact state-action value function
    Q = {}
    for x in env.state_space:
        for u in env.action_space:
            Q[(x, u)] = Q_N(p, r, x, u, N)
    # return determine_optimal_policy_from_Q(Q)
    return Q
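
# Minimal sketch of the helper referenced in the commented-out return above;
# it assumes the greedy policy simply keeps, for every state, an action with
# the highest Q-value (ties broken arbitrarily). The real helper may differ.
def determine_optimal_policy_from_Q(Q):
    """ derives a greedy policy (dict state -> action) from a Q table """
    mu = {}
    for x in env.state_space:
        # keep the action maximizing Q(x, u)
        mu[x] = max(env.action_space, key=lambda u: Q[(x, u)])
    return mu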
def influence_of_T_on_Q(T, N):
    """ displays the difference between the Q computed with the approximated MDP structure
        (estimated from a trajectory of length T) and the Q computed with the exact p and r """
    # exact transition probabilities
    p = {}
    # exact rewards
    r = {}
    # instantiate the exact probability and reward functions
    for x in env.state_space:
        for u in env.action_space:
            for next_state in env.state_space:
                p[(x, u, next_state)] = 0
            new_state = env.f(x, u)
            p[(x, u, new_state)] = 1
            r[(x, u)] = env.rewards[new_state[0]][new_state[1]]
    # approximate p and r once, from a single trajectory of length T
    history, p_appr, r_appr = tj.create_trajectory((3, 0), T)
    Q = {}
    for x in env.state_space:
        for u in env.action_space:
            Q_optimal = round(fn.Q_N(p, r, x, u, N), 2)
            Q_learned = round(fn.Q_N(p_appr, r_appr, x, u, N), 2)
            Q[(x, u)] = (Q_optimal, Q_learned)
    for key in Q:
        diff = abs(Q[key][0] - Q[key][1])
        str_x = ("(x,u) = " + str(key) + " | Q_exact = " + str(Q[key][0])
                 + " | Q_appr = " + str(Q[key][1]) + " | diff = " + str(diff))
        print(str_x)
    return Q
def J_N(x, mu, N, discount_factor=0.99):
    """ computes the J state-value recurrence with policy µ given as a dict (state -> action) """
    if N < 0:
        print("N cannot be negative!")
        return None
    elif N == 0:
        return 0
    else:
        new_state = env.f(x, mu[x])
        return env.rewards[new_state[0]][new_state[1]] + discount_factor * J_N(new_state, mu, N - 1)
def J_N(x, mu, r, N, discount_factor=0.99):
    """ computes the J state-value recurrence using an estimated reward function r """
    if N < 0:
        print("N cannot be negative!")
        return None
    elif N == 0:
        return 0
    else:
        return r[(x, mu[x])] + discount_factor * J_N(env.f(x, mu[x]), mu, r, N - 1)
def protocol_1(discount_factor=0.99, alpha=0.05, epsilon=0.25):
    """ first experimental protocol: Q-learning with a constant learning rate """
    error = []
    Q = {}
    # initialize Q to 0 everywhere
    for x in env.state_space:
        for u in env.action_space:
            Q[(x, u)] = 0
    for episode in range(100):
        # initial state
        state = (3, 0)
        for transition in range(1000):
            p = np.random.default_rng().random()
            # epsilon-greedy policy
            if p < 1 - epsilon:
                # exploitation
                action = get_max_action(Q, state)
            else:
                # exploration
                action = tj.policy()
            next_state = env.f(state, action)
            # reward associated with reaching next_state
            reward = env.rewards[next_state[0]][next_state[1]]
            # max value of Q over the actions available in next_state
            maxQ = get_max_value(Q, next_state)
            # Q-learning update
            Q[(state, action)] = (1 - alpha) * Q[(state, action)] + alpha * (reward + discount_factor * maxQ)
            state = next_state
        print("episode : " + str(episode + 1))
        error.append(display(Q))
    return error
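
# Minimal sketches of the get_max_action / get_max_value helpers used by the
# protocols; they assume Q is a dict keyed by (state, action) tuples and that
# actions are enumerated by env.action_space. The actual helpers may differ.
def get_max_action(Q, state):
    """ returns an action maximizing Q(state, .) """
    return max(env.action_space, key=lambda u: Q[(state, u)])


def get_max_value(Q, state):
    """ returns the maximum of Q(state, .) over the actions """
    return max(Q[(state, u)] for u in env.action_space)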
def protocol_3(discount_factor=0.99, alpha=0.05, epsilon=0.25):
    """ third experimental protocol: Q-learning with an experience-replay buffer """
    error = []
    Q = {}
    for x in env.state_space:
        for u in env.action_space:
            Q[(x, u)] = 0
    for episode in range(100):
        state = (3, 0)
        buffer = []
        for transition in range(1000):
            p = np.random.default_rng().random()
            # epsilon-greedy policy
            if p < 1 - epsilon:
                action = get_max_action(Q, state)
            else:
                action = tj.policy()
            next_state = env.f(state, action)
            reward = env.rewards[next_state[0]][next_state[1]]
            # add the transition to the replay buffer
            buffer.append((state, action, reward, next_state))
            # update Q ten times using transitions drawn at random from the buffer
            for count in range(10):
                index = np.random.randint(0, len(buffer))
                x, u, r, next_x = buffer[index]
                maxQ = get_max_value(Q, next_x)
                Q[(x, u)] = (1 - alpha) * Q[(x, u)] + alpha * (r + discount_factor * maxQ)
            state = next_state
        print("episode : " + str(episode + 1))
        error.append(display(Q))
    return error
def create_trajectory(initial_x, T):
    """ creates the history h_t (with the resource-limitation algorithm) """
    trajectory = []
    N = {}
    R = {}
    Nx = {}
    p = {}
    r = {}
    # initializations for the r and p estimates
    for x in env.state_space:
        for u in env.action_space:
            N[(x, u)] = 0
            R[(x, u)] = 0
            r[(x, u)] = -1000
            for x0 in env.state_space:
                Nx[(x, u, x0)] = 0
                p[(x, u, x0)] = 0
    x = initial_x
    # random trajectory of length T
    for i in range(T):
        u = policy()
        new_x = env.f(x, u)
        rew = env.rewards[new_x[0]][new_x[1]]
        # add the current transition to the trajectory history
        trajectory.append([x, u, rew, new_x])
        N[(x, u)] += 1
        Nx[(x, u, new_x)] += 1
        R[(x, u)] += rew
        # mean of all the rewards observed for (x, u)
        r[(x, u)] = R[(x, u)] / N[(x, u)]
        # empirical probability of reaching new_x from (x, u)
        p[(x, u, new_x)] = Nx[(x, u, new_x)] / N[(x, u)]
        x = new_x
    return trajectory, p, r
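
# A minimal sketch of the random exploration policy called above (and used as
# tj.policy() in the protocols). It assumes env.action_space is a list and
# simply draws an action uniformly at random; the original may differ.
import random


def policy():
    """ picks an action uniformly at random from env.action_space """
    return random.choice(env.action_space)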
def protocol_2(discount_factor=0.99, epsilon=0.25):
    """ second experimental protocol: Q-learning with a learning rate that decays within each episode """
    error = []
    Q = {}
    for x in env.state_space:
        for u in env.action_space:
            Q[(x, u)] = 0
    for episode in range(100):
        # learning rate reset at the start of each episode
        alpha = 0.05
        state = (3, 0)
        for transition in range(1000):
            p = np.random.default_rng().random()
            # epsilon-greedy policy
            if p < 1 - epsilon:
                action = get_max_action(Q, state)
            else:
                action = tj.policy()
            next_state = env.f(state, action)
            reward = env.rewards[next_state[0]][next_state[1]]
            maxQ = get_max_value(Q, next_state)
            # Q-learning update
            Q[(state, action)] = (1 - alpha) * Q[(state, action)] + alpha * (reward + discount_factor * maxQ)
            state = next_state
            # decay the learning rate after each transition
            alpha *= 0.8
        print("episode : " + str(episode + 1))
        error.append(display(Q))
    return error
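
# A rough sketch of what the display helper used by the three protocols could
# look like: it is assumed here to compare the learned Q against the exact Q
# table (fn.optimal_policy(N) from above) and to return a summed absolute
# error, which the protocols append to their error curves. The horizon N=7 is
# only an illustrative choice; the real helper may also print or draw the
# table and greedy policy.
def display(Q, N=7):
    """ returns the summed |Q_exact - Q_learned| over all state-action pairs """
    Q_exact = fn.optimal_policy(N)
    return sum(abs(Q_exact[key] - Q[key]) for key in Q)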
def convergence_speed():
    """ computes and displays the convergence of p and r """
    p_error = []
    r_error = []
    T = [i for i in range(100, 1200, 50)]
    # compute the error of the approximations for different trajectory lengths
    for t in T:
        history, p, r = tj.create_trajectory((3, 0), t)
        p_sum = 0
        r_sum = 0
        # accumulate the error over every state-action pair
        for x in env.state_space:
            for u in env.action_space:
                new_state = env.f(x, u)
                p_sum += (1 - p[(x, u, new_state)])
                r_sum += abs(r[(x, u)] - env.rewards[new_state[0]][new_state[1]])
        p_error.append(p_sum)
        r_error.append(r_sum)
    # plot the convergence of p and r along T
    fig, axs = plt.subplots(2, 1, figsize=(10, 10), constrained_layout=True)
    axs[0].plot(T, p_error)
    axs[0].set_ylabel('$p_{error}$')
    axs[0].set_xlabel('T')
    axs[0].set_title(r'Convergence speed of $\hat{p}$')
    axs[1].plot(T, r_error)
    axs[1].set_ylabel('$r_{error}$')
    axs[1].set_xlabel('T')
    axs[1].set_title(r'Convergence speed of $\hat{r}$')
    plt.show()
    return T, p_error, r_error
def Q_N(p, r, state, action, N, discount_factor=0.99):
    """ computes the Q state-action value recurrence """
    if N < 0:
        print("N can't be negative")
        return None
    elif N == 0:
        return 0
    else:
        sum_Q = 0
        # only iterate over the distinct states that are 'reachable' from state,
        # since every other state x' has p(x'|state, action) = 0
        reachable_states = {env.f(state, u) for u in env.action_space}
        for x in reachable_states:
            # best value obtainable from x with one fewer step
            max_Q = max(Q_N(p, r, x, u1, N - 1) for u1 in env.action_space)
            # accumulate the sum term of the Q_N recurrence formula
            sum_Q += p[(state, action, x)] * max_Q
        return r[(state, action)] + discount_factor * sum_Q
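
# For reference, the recurrence implemented by Q_N above (gamma is the
# discount factor); with the exact p built in optimal_policy the sum over x'
# collapses to the single successor state env.f(x, u):
#   Q_0(x, u) = 0
#   Q_N(x, u) = r(x, u) + gamma * sum_{x'} p(x' | x, u) * max_{u'} Q_{N-1}(x', u')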
def draw(position, str_x):
    """ draws the grid of rewards and highlights the current position in red """
    fig, ax = plt.subplots(figsize=(4, 4))
    ax.axis('off')
    cell_text = np.asarray(env.rewards, dtype=str)
    colors = [["w" for i in range(env.m)] for j in range(env.n)]
    # highlight the current cell
    colors[position[0]][position[1]] = "r"
    ax.table(cellText=cell_text, cellColours=colors, cellLoc='center', loc='center',
             colWidths=[0.07, 0.07, 0.07, 0.07, 0.07])
    plt.title(str_x, fontdict={'fontsize': 8})
    plt.show()


if __name__ == '__main__':
    s = (3, 0)  # initial state
    t = 0  # time
    str_x = "state = " + str(s) + " | t = " + str(t)
    print(str_x)
    draw(s, str_x)
    while True:
        u = policy(s)  # compute the action from the current state
        x = env.f(s, u, 0.1)  # compute the new state
        r = round((0.99 ** t) * env.rewards[x[0]][x[1]], 4)  # compute the discounted reward
        s = x  # update state
        t += 1
        str_x = "state = " + str(x) + " | action = " + str(u) + " | reward = " + str(r) + " | t = " + str(t)
        print(str_x)
        draw(s, str_x)
        time.sleep(2)  # leave time to visualise the environment