import numpy as np

# CartPole, CartPoleEpisode, vw, dvwdw, and the policy-evaluation helpers in
# `pe` are assumed to be defined elsewhere in this repository.


def td_cp_single(f_order, alpha):
    """TD(0) policy evaluation on CartPole with a single step size `alpha`,
    using an f_order-th order Fourier basis over the 4-dimensional state."""
    d = 4
    cartpole = CartPole()
    print('cartpole ', f_order, ' td')
    weight = np.zeros((1, (f_order + 1) ** d))
    print('alpha = ', alpha)
    # Update the weights over 100 episodes.
    for x in range(100):
        s = cartpole.d_zero()
        count = 0
        while np.abs(s[0]) < cartpole.edge and np.abs(s[1]) < cartpole.fail_angle and count < 1010:
            a = cartpole.pi(s)
            new_s, r = cartpole.P_and_R(s, a)
            # TD(0) update: w += alpha * (r + v(s') - v(s)) * dv/dw.
            weight += alpha * (r + vw(weight, new_s, f_order) - vw(weight, s, f_order)) * dvwdw(weight, s, f_order).T
            s = new_s
            # print(weight)  # noisy debug output inside the inner loop
            count += 1
    # Estimate the mean squared TD error over another 100 episodes.
    td_list = []
    for x in range(100):
        s = cartpole.d_zero()
        count = 0
        while np.abs(s[0]) < cartpole.edge and np.abs(s[1]) < cartpole.fail_angle and count < 1010:
            a = cartpole.pi(s)
            new_s, r = cartpole.P_and_R(s, a)
            td_list.append((r + vw(weight, new_s, f_order) - vw(weight, s, f_order)) ** 2)
            s = new_s
            count += 1
        # The terminal transition contributes zero TD error.
        td_list.append(0)
    print('square td = ', np.mean(np.array(td_list)))
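# `vw` and `dvwdw` above are assumed to be defined elsewhere in this module.
# A minimal sketch consistent with the (f_order + 1) ** d weight shape, using
# the Fourier basis phi_c(s) = cos(pi * c . s) and assuming the state has been
# normalized to [0, 1]^4:
import itertools


def fourier_features(s, k):
    # All integer coefficient vectors c in {0, ..., k}^d, d = len(s).
    coeffs = np.array(list(itertools.product(range(k + 1), repeat=len(s))))
    return np.cos(np.pi * coeffs.dot(np.asarray(s)))  # shape ((k + 1) ** d,)


def vw(weight, s, k):
    # Linear value estimate: v(s) = w . phi(s).
    return weight.dot(fourier_features(s, k))[0]


def dvwdw(weight, s, k):
    # For a linear approximator the gradient w.r.t. w is just phi(s),
    # returned as a column so that .T matches the (1, n) weight row.
    return fourier_features(s, k).reshape(-1, 1)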
def sarsa_cartpole(lr, baseparams, epoch=100, eps=1e-2, base='fourier'):
    """Episodic SARSA on CartPole with linear function approximation.

    `base` selects the feature construction: 'fourier', 'tile', or 'rbf'.
    """
    cartpole = CartPole()
    estimated_rewards = np.zeros(epoch)
    actions = cartpole.actions
    w = None
    if base == 'fourier':
        order = baseparams['order']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * (order + 1) ** len(s)))
    elif base == 'tile':
        num_tilings, tiles_per_tiling = baseparams['num_tilings'], baseparams['tiles_per_tiling']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * num_tilings * (tiles_per_tiling ** len(s))))
    elif base == 'rbf':
        order = baseparams['order']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * order ** len(s)))

    for x in range(epoch):
        s = cartpole.d_zero()
        # Choose a from s using an epsilon-greedy policy derived from q.
        first_q = pe.epsilon_greedy(pe.qw(w, s, actions, base, baseparams), actions, eps)
        a = np.random.choice(actions, 1, p=first_q)[0]
        count = 0
        while np.abs(s[0]) < cartpole.edge and np.abs(s[1]) < cartpole.fail_angle and count < 1010:
            # Take action a and observe r and s'.
            new_s, r = cartpole.P_and_R(s, a)
            # Choose a' from s' using the same epsilon-greedy policy.
            pi_temp = pe.epsilon_greedy(pe.qw(w, new_s, actions, base, baseparams), actions, eps)
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]
            # SARSA update: w += lr * (r + q(s', a') - q(s, a)) * dq/dw.
            new_q = pe.qw_ele(w, new_s, new_a, actions, base, baseparams)[0]
            q, dqdw = pe.qw_ele(w, s, a, actions, base, baseparams)
            w += lr * (r + new_q - q) * dqdw
            s = new_s
            a = new_a
            count += 1
        # Evaluate the policy induced by the current weights.
        epi = CartPoleEpisode(cartpole)
        estimated_rewards[x] = epi.run_with_w(w, eps, base, baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
    return estimated_rewards
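# sarsa_cartpole assumes pe.epsilon_greedy returns action *probabilities*
# (they are passed to np.random.choice as p=...), not a sampled action.
# A minimal sketch of such a helper (the name below is hypothetical):
def epsilon_greedy_probs(q_values, actions, eps):
    # Uniform eps mass over all actions, remaining mass on the greedy action.
    probs = np.full(len(actions), eps / len(actions))
    probs[int(np.argmax(q_values))] += 1.0 - eps
    return probs


def demo_sarsa():
    # Example invocation with hypothetical hyperparameters: a 3rd-order
    # Fourier basis, small learning rate, and eps = 0.01.
    return sarsa_cartpole(lr=1e-3, baseparams={'order': 3}, epoch=100,
                          eps=1e-2, base='fourier')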
def td_cp(lrs, f_order):
    """Sweep TD(0) policy evaluation on CartPole over the step sizes in `lrs`,
    returning the mean squared TD error for each.

    The kth-order Fourier basis over d state variables is phi_c(s) = cos(pi * c . s)
    for every integer coefficient vector c in {0, ..., k}^d, giving (k + 1) ** d features.
    """
    d = 4
    alpha_result = []
    cartpole = CartPole()
    print('cartpole ', f_order, ' td')
    for alpha in lrs:
        weight = np.zeros((1, (f_order + 1) ** d))
        print('alpha = ', alpha)
        # Update the weights over 100 episodes.
        for x in range(100):
            s = cartpole.d_zero()
            count = 0
            while np.abs(s[0]) < cartpole.edge and np.abs(s[1]) < cartpole.fail_angle and count < 1010:
                a = cartpole.pi(s)
                new_s, r = cartpole.P_and_R(s, a)
                weight += alpha * (r + vw(weight, new_s, f_order) - vw(weight, s, f_order)) * dvwdw(weight, s, f_order).T
                s = new_s
                count += 1
        # Estimate the mean squared TD error over another 100 episodes.
        td_list = []
        for x in range(100):
            s = cartpole.d_zero()
            count = 0
            while np.abs(s[0]) < cartpole.edge and np.abs(s[1]) < cartpole.fail_angle and count < 1010:
                a = cartpole.pi(s)
                new_s, r = cartpole.P_and_R(s, a)
                td_list.append((r + vw(weight, new_s, f_order) - vw(weight, s, f_order)) ** 2)
                s = new_s
                count += 1
            # The terminal transition contributes zero TD error.
            td_list.append(0)
        msv = np.mean(np.array(td_list))
        print('square td = ', msv)
        # A diverged run produces NaN; record a huge value so it is never selected.
        if np.isnan(msv):
            alpha_result.append(1e100)
        else:
            alpha_result.append(msv)
        print('##########################')
    return alpha_result
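def demo_td_sweep():
    # Example sweep with hypothetical step sizes: pick the alpha with the
    # lowest mean squared TD error under a 3rd-order Fourier basis.
    lrs = [1e-4, 1e-3, 1e-2]
    results = td_cp(lrs, f_order=3)
    return lrs[int(np.argmin(results))]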
def qlearning_cartpole(lr, baseparams, decaylambda, epoch=100, base='fourier'):
    """Episodic Q-learning on CartPole with linear function approximation.

    `decaylambda` maps the episode index to the exploration parameter, so
    exploration can be annealed over training.
    """
    cartpole = CartPole()
    estimated_rewards = np.zeros(epoch)
    actions = cartpole.actions
    w = None
    if base == 'fourier':
        order = baseparams['order']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * (order + 1) ** len(s)))
    elif base == 'tile':
        num_tilings, tiles_per_tiling = baseparams['num_tilings'], baseparams['tiles_per_tiling']
        s = cartpole.d_zero()
        # Match the tile-coding weight dimension used in sarsa_cartpole; the
        # original allocated only len(actions) * num_tilings entries here,
        # which is inconsistent with pe.qw's feature layout.
        w = np.zeros((1, len(actions) * num_tilings * (tiles_per_tiling ** len(s))))

    for x in range(epoch):
        s = cartpole.d_zero()
        count = 0
        while np.abs(s[0]) < cartpole.edge and np.abs(s[1]) < cartpole.fail_angle and count < 1010:
            # Choose a from s using a policy derived from q, with a decaying
            # exploration parameter.
            pi_temp = pe.epsilon_greedy(pe.qw(w, s, actions, base, baseparams), actions, decaylambda(x))
            a = np.random.choice(actions, 1, p=pi_temp)[0]
            # Take action a and observe r and s'.
            new_s, r = cartpole.P_and_R(s, a)
            # Q-learning update: w += lr * (r + max_a' q(s', a') - q(s, a)) * dq/dw.
            new_q = np.max(pe.qw(w, new_s, actions, base, baseparams))
            q, dqdw = pe.qw_ele(w, s, a, actions, base, baseparams)
            w += lr * (r + new_q - q) * dqdw
            s = new_s
            count += 1
        # Evaluate a softmax policy induced by the current weights.
        epi = CartPoleEpisode(cartpole)
        estimated_rewards[x] = epi.run_with_w_softmax(w, decaylambda(x), base, baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
    return estimated_rewards
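def demo_qlearning():
    # Example invocation with a hypothetical exploration schedule: anneal the
    # exploration parameter with the episode index, floored at 0.01.
    def decaylambda(x):
        return max(0.01, 0.5 / (x + 1))

    return qlearning_cartpole(lr=1e-3, baseparams={'order': 3},
                              decaylambda=decaylambda, epoch=100, base='fourier')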