def sarsa_cartpole(lr, baseparams, epoch=100, eps=1e-2, base='fourier'):
    cartpole = CartPole()
    estimated_rewards = np.zeros(epoch)
    actions = cartpole.actions
    w = None
    if base == 'fourier':
        order = baseparams['order']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * (order + 1) ** len(s)))
    elif base == 'tile':
        num_tilings, tiles_per_tiling = baseparams['num_tilings'], baseparams['tiles_per_tiling']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * num_tilings * (tiles_per_tiling ** len(s))))
    elif base == 'rbf':
        order = baseparams['order']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * order ** len(s)))
    for x in range(epoch):
        s = cartpole.d_zero()
        # Choose a from s using a policy derived from q (e.g., epsilon-greedy or softmax).
        pi_s = pe.epsilon_greedy(pe.qw(w, s, actions, base, baseparams), actions, eps)
        a = np.random.choice(actions, 1, p=pi_s)[0]
        count = 0
        while np.abs(s[0]) < cartpole.edge and np.abs(s[1]) < cartpole.fail_angle and count < 1010:
            # Take action a and observe r and s'.
            new_s, r = cartpole.P_and_R(s, a)
            # Choose a' from s' using a policy derived from q.
            pi_temp = pe.epsilon_greedy(pe.qw(w, new_s, actions, base, baseparams), actions, eps)
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]
            # SARSA update: w += alpha * (r + q(s',a') - q(s,a)) * dq/dw, with gamma = 1.
            new_q = pe.qw_ele(w, new_s, new_a, actions, base, baseparams)[0]
            q, dqdw = pe.qw_ele(w, s, a, actions, base, baseparams)
            w += lr * (r + new_q - q) * dqdw
            s = new_s
            a = new_a
            count += 1
        # Evaluate the epsilon-greedy policy induced by the current w.
        epi = CartPoleEpisode(cartpole)
        estimated_rewards[x] = epi.run_with_w(w, eps, base, baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
    return estimated_rewards
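
# Usage sketch (hypothetical hyperparameters, assuming this module's imports):
#
#   rewards = sarsa_cartpole(lr=1e-3, baseparams={'order': 3}, epoch=100,
#                            eps=5e-2, base='fourier')
#
# With base='tile', baseparams instead carries 'num_tilings' and
# 'tiles_per_tiling'; with base='rbf', it again carries 'order'.
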
def qlearning_grid(lr, eps, epoch=100, searchbound=400):
    q = np.zeros((23, 4))
    grid = Grid()
    grid.pi_params = np.zeros((23, 4))
    grid.softmax()
    actions = grid.action
    estimated_rewards = np.zeros(epoch)
    for x in range(epoch):
        s = grid.d_zero()
        while s != [5, 5]:
            # Choose a from s using a policy derived from q.
            pi_temp = pe.epsilon_greedy(q[grid.get_index(s)], actions, eps(x))
            a = np.random.choice(actions, 1, p=pi_temp)[0]
            # Take action a and observe r and s'.
            new_s, r = grid.P_and_R(s, a)
            # Q-learning update: bootstrap from the greedy action in s'.
            q[grid.get_index(s), actions.index(a)] += lr * (
                r + grid.gamma * np.max(q[grid.get_index(new_s)])
                - q[grid.get_index(s), actions.index(a)])
            s = new_s
        # Evaluate the epsilon-greedy policy induced by the current q.
        grid.pi_params = pe.epsilon_greedy(q, actions, eps(x))
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        estimated_rewards[x] = grid_epi.run_all_steps()
        print('episode: ', x, ', reward: ', estimated_rewards[x], ', epsilon: ', eps(x))
    return estimated_rewards
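
# Usage sketch (hypothetical schedule): eps is called with the episode index,
# so a decaying exploration rate can be passed as a lambda:
#
#   rewards = qlearning_grid(lr=0.1, eps=lambda x: 0.5 * 0.95 ** x,
#                            epoch=100, searchbound=400)
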
def sarsa_lambda_mc(lr, l, baseparams, eps, epoch=100, base='fourier'):
    mc = MountainCar()
    estimated_rewards = np.zeros(epoch)
    actions = mc.actions
    w = None
    if base == 'fourier':
        order = baseparams['order']
        s = mc.d_zero()
        w = np.zeros((1, len(actions) * (order + 1) ** len(s)))
    elif base == 'tile':
        num_tilings, tiles_per_tiling = baseparams['num_tilings'], baseparams['tiles_per_tiling']
        s = mc.d_zero()
        w = np.zeros((1, len(actions) * num_tilings))
    for x in range(epoch):
        s = mc.d_zero()
        # e <- 0
        e = np.zeros(w.shape)
        # Choose a from s using a policy derived from q (e.g., epsilon-greedy or softmax).
        pi_s = estimation.epsilon_greedy(fa.qw(w, s, actions, base, baseparams), actions, eps(x))
        a = np.random.choice(actions, 1, p=pi_s)[0]
        count = 0
        while s[0] < mc.right_bound and count < 1e3:
            # Take action a and observe r and s'.
            new_s, r = mc.P_and_R(s, a)
            # Choose a' from s' using a policy derived from q.
            pi_temp = estimation.epsilon_greedy(fa.qw(w, new_s, actions, base, baseparams), actions, eps(x))
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]
            # q(terminal, .) = 0 by definition.
            if new_s == [0.5, 0]:
                new_q = 0
            else:
                new_q = fa.qw_ele(w, new_s, new_a, actions, base, baseparams)[0]
            q, dqdw = fa.qw_ele(w, s, a, actions, base, baseparams)
            # e <- gamma * lambda * e + dq(s,a)/dw, with gamma = 1 here.
            e = l * 1 * e + dqdw
            # delta <- r + gamma * q(s',a') - q(s,a)
            delta = r + 1 * new_q - q
            # w <- w + alpha * delta * e
            w += lr * delta * e
            s = new_s
            a = new_a
            count += 1
        epi = MountainCarEpisode(mc)
        estimated_rewards[x] = epi.run_with_w(w, eps(x), base, baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
    return estimated_rewards
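
# Usage sketch (hypothetical hyperparameters): l is the trace-decay rate
# (the lambda in SARSA(lambda)) and eps an episode-indexed schedule:
#
#   rewards = sarsa_lambda_mc(lr=1e-3, l=0.9, baseparams={'order': 3},
#                             eps=lambda x: max(0.01, 0.5 * 0.9 ** x),
#                             epoch=100, base='fourier')
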
def sarsa_mountaincar(lr, baseparams, eps, epoch=100, base='fourier'):
    mc = MountainCar()
    estimated_rewards = np.zeros(epoch)
    actions = mc.actions
    w = None
    if base == 'fourier':
        order = baseparams['order']
        s = mc.d_zero()
        w = np.zeros((1, len(actions) * (order + 1) ** len(s)))
    elif base == 'tile':
        num_tilings, tiles_per_tiling = baseparams['num_tilings'], baseparams['tiles_per_tiling']
        s = mc.d_zero()
        w = np.zeros((1, len(actions) * num_tilings))
    for x in range(epoch):
        s = mc.d_zero()
        # Choose a from s using a policy derived from q (e.g., epsilon-greedy or softmax).
        pi_s = estimation.epsilon_greedy(fa.qw(w, s, actions, base, baseparams), actions, eps(x))
        a = np.random.choice(actions, 1, p=pi_s)[0]
        count = 0
        # Cap the episode length so a poor early policy cannot stall training
        # (matches the 1e3 bound used by sarsa_lambda_mc and MountainCarEpisode).
        while s[0] < mc.right_bound and count < 1e3:
            # Take action a and observe r and s'.
            new_s, r = mc.P_and_R(s, a)
            # Choose a' from s' using a policy derived from q.
            pi_temp = estimation.epsilon_greedy(fa.qw(w, new_s, actions, base, baseparams), actions, eps(x))
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]
            # SARSA update: w += alpha * (r + q(s',a') - q(s,a)) * dq/dw, with gamma = 1.
            new_q = fa.qw_ele(w, new_s, new_a, actions, base, baseparams)[0]
            q, dqdw = fa.qw_ele(w, s, a, actions, base, baseparams)
            w += lr * (r + new_q - q) * dqdw
            s = new_s
            a = new_a
            count += 1
        epi = MountainCarEpisode(mc)
        estimated_rewards[x] = epi.run_with_w(w, eps(x), base, baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
    return estimated_rewards
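
# Usage sketch (hypothetical hyperparameters). Note that with base='tile' the
# weight vector here is sized by num_tilings alone, unlike sarsa_cartpole:
#
#   rewards = sarsa_mountaincar(lr=1e-3,
#                               baseparams={'num_tilings': 8, 'tiles_per_tiling': 8},
#                               eps=lambda x: 0.05, epoch=100, base='tile')
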
def sarsa_lambda_grid(lr, l, eps, epoch=100, searchbound=400):
    grid = Grid()
    grid.pi_params = np.zeros((23, 4))
    grid.softmax()
    actions = grid.action
    estimated_rewards = np.zeros(epoch)
    # Initialize tabular q arbitrarily.
    q = np.zeros((23, 4))
    # For each episode:
    for x in range(epoch):
        # s ~ d0
        s = grid.d_zero()
        # e <- 0
        e = np.zeros((23, 4))
        # Choose a from s using a policy derived from q (e.g., epsilon-greedy or softmax).
        pi_s = estimation.epsilon_greedy(q[grid.get_index(s)], actions, eps(x))
        a = np.random.choice(actions, 1, p=pi_s)[0]
        # For each time step, until s is the terminal absorbing state:
        while s != [5, 5]:
            # Take action a and observe r and s'.
            new_s, r = grid.P_and_R(s, a)
            # Choose a' from s' using a policy derived from q.
            pi_temp = estimation.epsilon_greedy(q[grid.get_index(new_s)], actions, eps(x))
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]
            # e <- gamma * lambda * e, then accumulate on the visited pair.
            e = l * grid.gamma * e
            e[grid.get_index(s), actions.index(a)] += 1
            # delta <- r + gamma * q(s',a') - q(s,a)
            delta = r + grid.gamma * q[grid.get_index(new_s), actions.index(new_a)] \
                - q[grid.get_index(s), actions.index(a)]
            # q <- q + alpha * delta * e
            q += lr * delta * e
            s = new_s
            a = new_a
        # Use the q function to estimate the episode reward.
        grid.pi_params = estimation.epsilon_greedy(q, actions, eps(x))
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        estimated_rewards[x] = grid_epi.run_all_steps()
        print('episode: ', x, ', reward: ', estimated_rewards[x], ', epsilon: ', eps(x))
    return estimated_rewards
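
# One concrete step of the trace update above (hypothetical numbers, with
# gamma = 0.9 and l = 0.8): an entry e[s, a] = 0.5 first decays to
# 0.8 * 0.9 * 0.5 = 0.36; the pair just visited then gets e[s, a] += 1 -> 1.36;
# finally every entry of q moves by lr * delta * e, so the TD error delta is
# credited to the whole recently visited trajectory, not just the last pair.
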
def run_with_w(self, w, eps, base, baseparams):
    # MountainCarEpisode method: roll out one episode under the epsilon-greedy
    # policy induced by the weight vector w and return the accumulated reward.
    reward = 0
    s = self.mountaincar.d_zero()
    count = 0
    while self.active == 1:
        q = fa.qw(w, s, self.mountaincar.actions, base, baseparams)
        pi = estimation.epsilon_greedy(q, self.mountaincar.actions, eps)
        a = np.random.choice(self.mountaincar.actions, 1, p=pi)[0]
        s, r = self.mountaincar.P_and_R(s, a)
        count += 1
        if s[0] == self.mountaincar.right_bound:
            self.active = 0
        else:
            reward += r
        # Cap the rollout at 1e3 steps.
        if count >= 1e3:
            self.active = 0
    return reward
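
# Usage sketch (hypothetical values): evaluating a learned weight vector w
# outside the training loop:
#
#   epi = MountainCarEpisode(MountainCar())
#   total = epi.run_with_w(w, eps=0.01, base='fourier', baseparams={'order': 3})
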
def sarsa_grid(lr, eps, epoch=100, searchbound=400):
    grid = Grid()
    grid.pi_params = np.zeros((23, 4))
    grid.softmax()
    actions = grid.action
    estimated_rewards = np.zeros(epoch)
    q = np.zeros((23, 4))
    for x in range(epoch):
        s = grid.d_zero()
        # Choose a from s using a policy derived from q (e.g., epsilon-greedy or softmax).
        pi_s = pe.epsilon_greedy(q[grid.get_index(s)], actions, eps(x))
        a = np.random.choice(actions, 1, p=pi_s)[0]
        while s != [5, 5]:
            # Take action a and observe r and s'.
            new_s, r = grid.P_and_R(s, a)
            # Choose a' from s' using a policy derived from q.
            pi_temp = pe.epsilon_greedy(q[grid.get_index(new_s)], actions, eps(x))
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]
            # SARSA update: bootstrap from the action actually chosen in s'.
            q[grid.get_index(s), actions.index(a)] += lr * (
                r + grid.gamma * q[grid.get_index(new_s), actions.index(new_a)]
                - q[grid.get_index(s), actions.index(a)])
            s = new_s
            a = new_a
        # Use the q function to estimate the episode reward.
        grid.pi_params = pe.epsilon_greedy(q, actions, eps(x))
        grid_epi = GridEpisode(grid, step_bound=searchbound)
        estimated_rewards[x] = grid_epi.run_all_steps()
        print('episode: ', x, ', reward: ', estimated_rewards[x], ', epsilon: ', eps(x))
    return estimated_rewards
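
# Usage sketch (hypothetical schedule): the call shape matches qlearning_grid,
# so comparing on-policy SARSA with off-policy Q-learning is a one-line swap:
#
#   sarsa_rewards = sarsa_grid(lr=0.1, eps=lambda x: 0.5 * 0.95 ** x, epoch=100)
#   q_rewards = qlearning_grid(lr=0.1, eps=lambda x: 0.5 * 0.95 ** x, epoch=100)
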
def qlearning_cartpole(lr, baseparams, decaylambda, epoch=100, base='fourier'):
    cartpole = CartPole()
    estimated_rewards = np.zeros(epoch)
    actions = cartpole.actions
    w = None
    if base == 'fourier':
        order = baseparams['order']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * (order + 1) ** len(s)))
    elif base == 'tile':
        num_tilings, tiles_per_tiling = baseparams['num_tilings'], baseparams['tiles_per_tiling']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * num_tilings))
    for x in range(epoch):
        s = cartpole.d_zero()
        count = 0
        while np.abs(s[0]) < cartpole.edge and np.abs(s[1]) < cartpole.fail_angle and count < 1010:
            # Choose a from s using a policy derived from q.
            pi_temp = pe.epsilon_greedy(pe.qw(w, s, actions, base, baseparams), actions, decaylambda(x))
            a = np.random.choice(actions, 1, p=pi_temp)[0]
            # Take action a and observe r and s'.
            new_s, r = cartpole.P_and_R(s, a)
            # Q-learning update: bootstrap from the greedy action in s', with gamma = 1.
            new_q = np.max(pe.qw(w, new_s, actions, base, baseparams))
            q, dqdw = pe.qw_ele(w, s, a, actions, base, baseparams)
            w += lr * (r + new_q - q) * dqdw
            s = new_s
            count += 1
        epi = CartPoleEpisode(cartpole)
        estimated_rewards[x] = epi.run_with_w_softmax(w, decaylambda(x), base, baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
    return estimated_rewards
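
# Usage sketch (hypothetical schedule): decaylambda is the exploration
# schedule, feeding both the behavior policy and the softmax evaluation:
#
#   rewards = qlearning_cartpole(lr=1e-3, baseparams={'order': 3},
#                                decaylambda=lambda x: max(0.01, 1.0 / (x + 1)),
#                                epoch=100, base='fourier')
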
def run_with_w(self, w, eps, base, baseparams):
    # CartPoleEpisode method: roll out one episode under the epsilon-greedy
    # policy induced by w; the score is the number of steps the pole stayed up.
    reward = 0
    s = self.cartpole.d_zero()
    while self.active == 1:
        q = pe.qw(w, s, self.cartpole.actions, base, baseparams)
        pi = pe.epsilon_greedy(q, self.cartpole.actions, eps)
        a = np.random.choice(self.cartpole.actions, 1, p=pi)[0]
        s, r = self.cartpole.P_and_R(s, a)
        reward += 1
        self.step_count += 1
        if self.step_count > self.maxturn:
            self.active = 0
        if np.abs(s[0]) > self.cartpole.edge:
            self.active = 0
        if np.abs(s[1]) > self.cartpole.fail_angle:
            self.active = 0
    # Drop the final step, on which the episode terminated.
    return reward - 1
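
# Usage sketch (hypothetical values; eps=0.0 makes the rollout fully greedy):
#
#   epi = CartPoleEpisode(CartPole())
#   steps = epi.run_with_w(w, eps=0.0, base='fourier', baseparams={'order': 3})
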