def grid_sampling(theta, cm, K, Ke, N, epsilon):
    # One cross-entropy-style update: sample K candidate parameter vectors
    # from a Gaussian around theta, evaluate each over N episodes, and refit
    # the mean and covariance to the Ke elite candidates.
    theta_list = np.random.multivariate_normal(theta, cm, K)
    result_list = []
    for x in range(K):
        avg_reward = 0
        for i in range(N):
            grid = Grid()
            grid.pi_params = theta_list[x].reshape(23, 4)
            grid.softmax()
            epi = GridEpisode(grid)
            avg_reward += epi.run_all_steps()
        result_list.append((theta_list[x], avg_reward / N))
    # Keep the Ke candidates with the highest average return.
    elite_list = sorted(result_list, key=lambda n: n[-1], reverse=True)[:Ke]
    theta_final = np.zeros(92)
    cm_final = epsilon * np.identity(92)
    J_final = 0
    for t in elite_list:
        theta_final += t[0]
        cm_final += np.array([t[0] - theta]).T.dot(np.array([t[0] - theta]))
        J_final += t[1]
    theta_final /= Ke
    cm_final /= (epsilon + Ke)
    J_final /= Ke
    return theta_final, cm_final, J_final
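
# Illustrative sketch, not part of the original code: one way the single-step
# grid_sampling update could be iterated into a full cross-entropy-method
# search. The function name cem_grid_search and all hyperparameter values
# below are assumptions chosen for illustration.
def cem_grid_search(iterations=20, K=30, Ke=6, N=10, epsilon=0.1):
    theta = np.zeros(92)        # flat (23, 4) policy parameters
    cm = np.identity(92)        # initial sampling covariance
    best_J = -np.inf
    for it in range(iterations):
        theta, cm, J = grid_sampling(theta, cm, K, Ke, N, epsilon)
        best_J = max(best_J, J)
        print('CEM iteration: ', it, ', elite mean return: ', J)
    return theta, best_J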
def qlearning_grid(lr, eps, epoch=100, searchbound=400):
    # Tabular Q-learning on the 23-state, 4-action grid world. eps is a
    # callable mapping the episode index to an exploration parameter.
    grid = Grid()
    grid.pi_params = np.zeros((23, 4))
    grid.softmax()
    actions = grid.action
    estimated_rewards = np.zeros(epoch)
    q = np.zeros((23, 4))
    for x in range(epoch):
        s = grid.d_zero()
        while s != [5, 5]:
            # Choose a from s using the softmax policy derived from q.
            pi_temp = pe.softmax(q[grid.get_index(s)], actions, eps(x))
            a = np.random.choice(actions, 1, p=pi_temp)[0]
            # Take action a, observe r and s', and apply the Q-learning update.
            new_s, r = grid.P_and_R(s, a)
            q[grid.get_index(s), actions.index(a)] += lr * (
                r + grid.gamma * np.max(q[grid.get_index(new_s)])
                - q[grid.get_index(s), actions.index(a)])
            s = new_s
        # Evaluate the policy induced by the current q to estimate the return.
        grid.pi_params = pe.softmax(q, actions, eps(x))
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        estimated_rewards[x] = grid_epi.run_all_steps()
        print('episode: ', x, ', reward: ', estimated_rewards[x], 'epsilon: ', eps(x))
    return estimated_rewards
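
# Usage sketch (assumption): eps is treated by qlearning_grid as a callable
# from the episode index to an exploration parameter. The decay schedule and
# learning rate below are placeholders, not values from the original experiments.
def example_qlearning_run():
    eps_schedule = lambda x: max(0.01, 0.9 * (0.95 ** x))
    return qlearning_grid(lr=0.1, eps=eps_schedule, epoch=100, searchbound=400)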
def grid_evaluate_table(table, N):
    # Average return over N episodes of a policy given directly as a (23, 4)
    # table; grid_evaluate below accepts the flat 92-dimensional form.
    avg_reward = 0
    for i in range(N):
        g = Grid()
        g.pi_params = table
        g.softmax()
        epi = GridEpisode(g)
        avg_reward += epi.run_all_steps()
    return avg_reward / N
def multi_grid_episode(table, l):
    # Worker routine: run one episode per element of l and push each return
    # onto the shared queue grid_q so a parent process can collect them.
    for i in l:
        grid = Grid()
        grid.pi_params = table
        grid.softmax()
        epi = GridEpisode(grid)
        grid_q.put(epi.run_all_steps())
    return 0
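
# Illustrative sketch (assumption): multi_grid_episode looks like a worker
# routine that reports returns through the module-level queue grid_q. The
# driver below assumes grid_q is a multiprocessing.Queue visible to the
# children (fork start method); the worker count and chunking are placeholders.
def example_parallel_evaluate(table, N=100, workers=4):
    import multiprocessing as mp
    chunks = np.array_split(np.arange(N), workers)
    procs = [mp.Process(target=multi_grid_episode, args=(table, chunk))
             for chunk in chunks]
    for p in procs:
        p.start()
    rewards = [grid_q.get() for _ in range(N)]  # drain the queue before joining
    for p in procs:
        p.join()
    return sum(rewards) / N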
def grid_evaluate(t, N):
    reward_l = []
    table = t.reshape(23, 4)
    for i in range(N):
        grid = Grid()
        grid.pi_params = table
        grid.softmax()
        epi = GridEpisode(grid)
        reward_l.append(epi.run_all_steps())
    return sum(reward_l) / N
def sarsa_lambda_grid(lr, l, eps, epoch=100, searchbound=400):
    grid = Grid()
    grid.pi_params = np.zeros((23, 4))
    grid.softmax()
    actions = grid.action
    estimated_rewards = np.zeros(epoch)
    # Initialize tabular-q arbitrarily
    q = np.zeros((23, 4))
    # for each episode:
    for x in range(epoch):
        # s ∼ d0
        s = grid.d_zero()
        # e ← 0
        e = np.zeros((23, 4))
        # choose a from s using a policy derived from q (e.g., ε-greedy or softmax);
        pi_s = estimation.epsilon_greedy(q[grid.get_index(s)], actions, eps(x))
        a = np.random.choice(actions, 1, p=pi_s)[0]
        # for each time step, until s is the terminal absorbing state do
        while s != [5, 5]:
            # Take action a and observe r and s′;
            new_s, r = grid.P_and_R(s, a)
            # choose new_a from new_s using policy derived from q
            pi_temp = estimation.epsilon_greedy(q[grid.get_index(new_s)], actions, eps(x))
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]
            # e ← γλe, then accumulate the trace for (s, a)
            e = l * grid.gamma * e
            e[grid.get_index(s), actions.index(a)] += 1
            # δ ← r + γq(s′,a′) − q(s,a);
            delta = (r + grid.gamma * q[grid.get_index(new_s), actions.index(new_a)]
                     - q[grid.get_index(s), actions.index(a)])
            # q ← q + αδe;
            q += lr * delta * e
            s = new_s
            a = new_a
        # Evaluate the policy induced by the current q to estimate the return.
        grid.pi_params = estimation.epsilon_greedy(q, actions, eps(x))
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        estimated_rewards[x] = grid_epi.run_all_steps()
        print('episode: ', x, ', reward: ', estimated_rewards[x], 'epsilon: ', eps(x))
    return estimated_rewards
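
# Usage sketch (assumption): sarsa_lambda_grid takes the step size, the
# trace-decay parameter λ (argument l), and a callable epsilon schedule.
# The values below are placeholders.
def example_sarsa_run():
    eps_schedule = lambda x: max(0.01, 0.9 * (0.95 ** x))
    return sarsa_lambda_grid(lr=0.1, l=0.8, eps=eps_schedule, epoch=100, searchbound=400)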
def reinforce_grid(lr, eps, epoch=100, searchbound=400):
    estimated_rewards = np.zeros(epoch)
    # theta is a tabular representation of the policy parameters
    theta = np.zeros((23, 4))
    grid = Grid()
    actions = grid.action
    # for each episode:
    for x in range(epoch):
        # s ∼ d0; roll out one episode under the current softmax policy,
        # recording the visited states, actions, and rewards.
        s = grid.d_zero()
        count = 0
        hist_s = []
        hist_a = []
        hist_r = []
        grid.pi_params = estimation.softmax(theta, eps(x))
        # for each time step, until s is the terminal absorbing state
        # (capped at 1000 steps):
        while s != [5, 5] and count < 1000:
            hist_s.append(s)
            a = grid.pi(s)
            hist_a.append(a)
            new_s, r = grid.P_and_R(s, a)
            hist_r.append(r)
            s = new_s
            count += 1
        # Monte Carlo policy-gradient update: for each visited (s_t, a_t),
        # compute the discounted return G_t from step t onward and move the
        # corresponding parameter by lr * gamma^t * G_t.
        decay = 1
        for i in range(len(hist_s)):
            g = 0
            gd = 1
            for j in range(i, len(hist_s)):
                g += gd * hist_r[j]
                gd *= grid.gamma
            theta[grid.get_index(hist_s[i]), actions.index(hist_a[i])] += lr * decay * g
            decay *= grid.gamma
        # Evaluate the updated policy to estimate the return for this episode.
        grid.pi_params = estimation.softmax(theta, eps(x))
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        estimated_rewards[x] = grid_epi.run_all_steps()
        if x == epoch - 1:
            print('episode: ', x, ', reward: ', estimated_rewards[x])
    return estimated_rewards
def actor_critic_grid(lr, eps, epoch=100, searchbound=400):
    estimated_rewards = np.zeros(epoch)
    # Initialize tabular-v arbitrarily
    v = np.zeros(23)
    # theta is a tabular representation of the policy parameters
    theta = np.zeros((23, 4))
    grid = Grid()
    actions = grid.action
    # for each episode:
    for x in range(epoch):
        # s ∼ d0
        s = grid.d_zero()
        count = 0
        # for each time step, until s is the terminal absorbing state
        # (capped at 1000 steps):
        while s != [5, 5] and count < 1000:
            # a ∼ π(s, ·);
            grid.pi_params = estimation.softmax(theta, eps(x))
            a = grid.pi(s)
            # Take action a and observe r and s′;
            new_s, r = grid.P_and_R(s, a)
            # Critic update using the one-step TD error:
            # δ ← r + γv(s′) − v(s)
            delta = r + grid.gamma * v[grid.get_index(new_s)] - v[grid.get_index(s)]
            v[grid.get_index(s)] += lr * delta
            # Actor update: move the parameter of the chosen action by lr * δ
            theta[grid.get_index(s), actions.index(a)] += lr * delta
            s = new_s
            count += 1
        # Evaluate the updated policy to estimate the return for this episode.
        grid.pi_params = estimation.softmax(theta, eps(x))
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        estimated_rewards[x] = grid_epi.run_all_steps()
        if x == epoch - 1:
            print('episode: ', x, ', reward: ', estimated_rewards[x])
    return estimated_rewards
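
# Illustrative sketch (assumption): a small driver that runs the four grid
# learners with a shared epsilon schedule and averages their learning curves
# over a few trials. The schedule, step sizes, and trial count are placeholders.
def example_compare_learners(trials=5, epoch=100):
    eps_schedule = lambda x: max(0.01, 0.9 * (0.95 ** x))
    learners = {
        'q-learning': lambda: qlearning_grid(0.1, eps_schedule, epoch),
        'sarsa(lambda)': lambda: sarsa_lambda_grid(0.1, 0.8, eps_schedule, epoch),
        'reinforce': lambda: reinforce_grid(0.1, eps_schedule, epoch),
        'actor-critic': lambda: actor_critic_grid(0.1, eps_schedule, epoch),
    }
    curves = {name: sum(run() for _ in range(trials)) / trials
              for name, run in learners.items()}
    return curves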
def cartpole_evaluate(table, N):
    # Average return of a CartPole policy table over N episodes.
    avg_reward = 0
    for i in range(N):
        cartpole = CartPole()
        cartpole.pi_params = table
        epi = CartPoleEpisode(cartpole)
        avg_reward += epi.run_all_steps()
    return avg_reward / N


tic = time.time()
theta = np.ones(92) * 0.25
theta_f = grid_param_sampling(theta, 0.5, 200)
grid = Grid()
grid.pi_params = theta_f.reshape(23, 4)
grid.softmax()
episode = GridEpisode(grid)
print('optimized reward: ', episode.run_all_steps())
print('optimized theta: ', theta_f.reshape(23, 4))
# theta = np.ones(8) * 0.25
# theta_f = cartpole_sampling(theta, 0.5, 500)
# cartpole = CartPole()
# cartpole.pi_params = theta_f.reshape(4, 2)
# episode = CartPoleEpisode(cartpole)
# print('optimized reward: ', episode.run_all_steps())
# print('optimized theta: ', theta_f.reshape(4, 2))
toc = time.time()
print('running time: ', (toc - tic) / 60, ' mins')