def test_AwithOptimalL():
    import matplotlib.pyplot as plt
    A_values = list(np.arange(0.1, 1.0, 0.1))
    L = []
    for a in A_values:
        env = LQREnv(A=np.matrix(a))
        L.append(env.optimal_policy_L(0.99).item())
    print('L: {}'.format(L))
    plt.plot(A_values, L)
    plt.xlabel('value A')
    plt.ylabel('optimal L')
    plt.show()

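# Illustrative only: a minimal sketch of what env.optimal_policy_L(gamma)
# presumably solves for the scalar case, assuming the standard discounted LQR
# model x' = A*x + B*u + noise with stage cost Z1*x**2 + Z2*u**2.
# The default LQREnv coefficients and its actual solver are not shown in this
# file, so treat the function below as a hypothetical reference, not the
# implementation used by the test above.
def scalar_discounted_lqr_gain(A, B, Z1, Z2, gamma, n_iter=1000):
    # fixed-point iteration on the discounted scalar Riccati equation
    P = Z1
    L = 0.0
    for _ in range(n_iter):
        L = gamma * A * B * P / (Z2 + gamma * B**2 * P)
        P = Z1 + gamma * A**2 * P - gamma * A * B * P * L
    return L  # greedy policy is u = -L * x
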
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--weight_discount', default=0.99, type=float)  # note: 1.0 only for finite horizon
    parser.add_argument('--exploration', default=0.1, type=float)  # 0.0 means no random action
    parser.add_argument('--basis_function_dim', default=40, type=int)
    parser.add_argument('--stop_criterion', default=10**-3, type=float)
    parser.add_argument('--sample_max_steps', default="2000", choices=["2000", "5000", "10000", "20000"])
    parser.add_argument('--max_steps', default=20, type=int)
    parser.add_argument('--reg_opt', default="l2", choices=["l1", "l2", "wl1", "none"])
    parser.add_argument('--reg_param', default=0.001, type=float)
    parser.add_argument('--rbf_sigma', default=0.01, type=float)
    # parser.add_argument('--batch_size', default=2000, type=int)
    parser.add_argument('--L', default=0.1, type=float)  # policy gain L (action = -L * state)
    args = parser.parse_args()
    params = vars(args)

    # env
    env = LQREnv()
    params['n_actions'] = env.action_space.shape[0]
    params['state_dim'] = env.observation_space.shape[0]
    params['sample_max_steps'] = int(params['sample_max_steps'])
    # print(params['state_dim'])

    # basis function
    n_features = params['basis_function_dim']
    gamma = params['weight_discount']
    # params['basis_func'] = ExactBasis4LQR()
    params['basis_func'] = RBF_LQR([params['state_dim'], params['n_actions']],
                                   n_features, params['rbf_sigma'])
    params['policy'] = RBFPolicy4LQR(params['basis_func'])

    # set the parameters for the agent
    batch_size = params['sample_max_steps']
    max_steps = params['max_steps']
    agent = LSPIAgent(params)

    sample_filename = LQR_samples_filename[params['sample_max_steps']]
    # sample_filename = LQR_samples_filename["-22-10000"]
    with open(sample_filename, 'rb') as f:
        replay_buffer = pickle.load(f)
    sample = replay_buffer.sample(batch_size)
    print("length of sample: {}".format(len(sample[0])))

    error_list, new_weights = agent.train(sample)
    test_agent4LQR(agent, env, gamma, ifshow=False)

    # clean
    env.close()

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--weight_discount', default=0.99, type=float)  # note: 1.0 only for finite horizon
    parser.add_argument('--exploration', default=0.1, type=float)  # 0.0 means no random action
    parser.add_argument('--basis_function_dim', default=30, type=int)
    parser.add_argument('--stop_criterion', default=10**-3, type=float)
    parser.add_argument('--sample_max_steps', default="2000", choices=["2000", "5000", "10000", "20000"])
    parser.add_argument('--max_steps', default=20, type=int)
    parser.add_argument('--reg_opt', default="wl1", choices=["l1", "l2", "wl1", "none"])
    parser.add_argument('--reg_param', default=0.001, type=float)  # for l1 and l2
    parser.add_argument('--rbf_sigma', default=0.01, type=float)  # for RBFs
    # parser.add_argument('--batch_size', default=2000, type=int)
    parser.add_argument('--L', default=0.1, type=float)  # policy gain L (action = -L * state)
    args = parser.parse_args()
    params = vars(args)

    # env
    env = LQREnv()
    params['n_actions'] = env.action_space.shape[0]
    params['state_dim'] = env.observation_space.shape[0]
    params['sample_max_steps'] = int(params['sample_max_steps'])
    # print(params['state_dim'])

    # basis function
    # actual number of features = n_features**(state dim + action dim)
    n_features = params['basis_function_dim']
    gamma = params['weight_discount']
    # params['basis_func'] = ExactBasis4LQR()

    sample_filename = LQR_samples_filename[params['sample_max_steps']]
    # sample_filename = LQR_samples_filename["-22-10000"]
    with open(sample_filename, 'rb') as f:
        replay_buffer = pickle.load(f)
    batch_size = params['sample_max_steps']
    max_steps = params['max_steps']
    sample = replay_buffer.sample(batch_size)
    print("length of sample: {}".format(len(sample[0])))

    # bound the Laplace basis domain slightly above the largest |state| / |action| in the sample
    L_vec = 1.1 * np.concatenate((np.max(np.abs(sample[0]), axis=0).flatten(),
                                  [np.max(np.abs(sample[1]))]))
    params['basis_func'] = Laplace_LQR(n_features, L_vec)
    params['policy'] = RBFPolicy4LQR(params['basis_func'])

    agent = BellmanAgent(params, n_iter_max=15)
    error_list, new_weights = agent.train(sample)
    test_agent4LQR(agent, env, gamma, ifshow=False)

    # clean
    env.close()

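# Illustrative only: how the L_vec domain bound for Laplace_LQR above is
# constructed. Assumption (not confirmed by this excerpt): Laplace_LQR expects
# one half-width per state dimension plus one for the action, chosen slightly
# larger than any value seen in the sample so every (s, a) pair lies inside
# the box. The numbers below are hypothetical.
def _demo_laplace_domain_bound():
    import numpy as np
    states = np.array([[0.5], [-2.0], [1.3]])   # hypothetical sampled states, shape (N, state_dim)
    actions = np.array([0.2, -0.7, 0.4])        # hypothetical sampled actions, shape (N,)
    L_vec = 1.1 * np.concatenate((np.max(np.abs(states), axis=0).flatten(),
                                  [np.max(np.abs(actions))]))
    print(L_vec)  # -> approx [2.2, 0.77]
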
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_name', default="LQR",
                        choices=["cliff-v0", "CartPole-v0", "inverted_pedulum", "LQR", "chain"])  # gym env to train
    parser.add_argument('--weight_discount', default=0.99, type=float)  # note: 1.0 only for finite horizon
    parser.add_argument('--exploration', default=0.1, type=float)  # 0.0 means no random action
    parser.add_argument('--basis_function_dim', default=50, type=int)
    parser.add_argument('--stop_criterion', default=10**-5, type=float)
    parser.add_argument('--sample_max_steps', default="2000", choices=["2000", "5000", "10000", "20000"])
    parser.add_argument('--max_steps', default=500, type=int)
    # parser.add_argument('--batch_size', default=2000, type=int)
    parser.add_argument('--L', default=0.1, type=float)  # policy gain L (action = -L * state)
    parser.add_argument('--reg_opt', default="l2", choices=["l1", "l2", "wl1"])
    parser.add_argument('--reg_param', default=0.001, type=float)
    args = parser.parse_args()
    params = vars(args)

    env = LQREnv()
    params['n_actions'] = env.action_space.shape[0]
    params['state_dim'] = env.observation_space.shape[0]
    params['sample_max_steps'] = int(params['sample_max_steps'])
    # print(params['state_dim'])

    n_features = params['basis_function_dim']
    gamma = params['weight_discount']
    # params['basis_func'] = ExactBasis4LQR()
    # basis_function = ExactBasis4LQR()
    basis_function = RBF_LQR([params['state_dim'], params['n_actions']], n_features, 0.001)
    params['basis_func'] = basis_function

    # estimate Q for a specific L
    L = np.matrix(params['L'])

    # set the parameters for the agent
    # use all the samples in the buffer
    batch_size = params['sample_max_steps']
    max_steps = params['max_steps']
    sample_filename = LQR_samples_filename[params['sample_max_steps']]
    # sample_filename = LQR_samples_filename[-22]
    with open(sample_filename, 'rb') as f:
        replay_buffer = pickle.load(f)
    samples = replay_buffer.sample(batch_size)

    # regress the true Q values of the samples onto the basis features
    phi_list = []
    qTrue_list = []
    states = samples[0]
    actions = samples[1]
    rewards = samples[2]
    next_states = samples[3]
    dones = samples[4]
    phi_list = basis_function.evaluate(states, actions)
    for i in range(len(states)):
        print("i: {}".format(i))
        s = states[i]
        # print("state: {}".format(state))
        # print("action: {}".format(action))
        qTrue = env.true_Qvalue(L, gamma, states[i], actions[i])
        # print("qTrue: {}".format(qTrue))
        qTrue_list.append(qTrue)
    phi_list = np.array(phi_list)
    # print("phi_list shape: {}".format(phi_list.shape))
    # print("phi_list: {}".format(phi_list[:10]))
    qTrue_list = np.array(qTrue_list)
    # print("qTrue_list: {}".format(qTrue_list[:10]))
    # print("qTrue_list shape: {}".format(qTrue_list.shape))
    reg = LinearRegression().fit(phi_list, qTrue_list)
    # print("reg.get_params(): {}".format(reg.get_params()))

    # sweep over a range of states
    state_low = -10.0
    state_high = 10.0
    states = np.linspace(state_low, state_high, 100)
    actions = []
    # true_weights_his = []
    true_estimate_error_history = []
    q_true_his = []
    q_estimate_his = reg.predict(basis_function.evaluate(states, -L.item() * states))
    for i in range(len(states)):
        state = np.matrix(states[i])
        action = -L * state
        actions.append(action.item())
        q_true = env.true_Qvalue(L, gamma, state, action)
        # q_state = env.true_Qvalue_state(L, gamma, state)
        q_true_his.append(q_true)

    now = time.strftime("%Y-%m-%d-%H_%M_%S", time.localtime(time.time()))
    # save estimated data to file
    dirname = "data/Regression/states[" + str(state_low) + "," + str(state_high) + "]/"
    try:
        os.mkdir(dirname)
    except OSError as error:
        print(error)
    # # q_true
    # filename = dirname + "q_true.pickle"
    # f = open(filename, 'wb')
    # pickle.dump(q_true_his, f)
    # f.close()
    # estimate
    # if params['basis_func'].name()[:3] == 'RBF':
    #     filename = dirname + params['basis_func'].name()+"-"+str(params['basis_function_dim'])+"-"+params['reg_opt']+"-"+str(params['reg_param'])+".pickle"
    # else:
    #     filename = dirname + params['basis_func'].name()+".pickle"
    # f1 = open(filename, 'wb')
    # pickle.dump(q_estimate_his, f1)
    # f1.close()

    # plot
    plt.figure(figsize=(10, 10))
    plt.title('state from {} to {} and action (-L*state)'.format(state_low, state_high))
    plt.subplot(411)
    plt.plot(states)
    plt.title('state')
    plt.subplot(412)
    plt.plot(actions)
    plt.title('actions')
    # plt.show()
    plt.subplot(413)
    plt.plot(states, q_true_his)
    plt.title('true Q')
    # plt.show()
    plt.subplot(414)
    plt.plot(states, q_estimate_his)
    plt.title('estimate Q')
    # plt.savefig(now+"q_true&estimate-state(-2,2)")
    plt.show()

    # for a specific state, sweep over a range of actions
    state = np.matrix(-1.)
    actions = np.linspace(-6, 6, 100)
    q_true_his = []
    q_estimate_his = reg.predict(basis_function.evaluate(np.full(len(actions), state), actions))
    for i in range(len(actions)):
        action = np.matrix(actions[i])
        # print("q_estimate: {}".format(q_estimate))
        q_true = env.true_Qvalue(L, gamma, state, action)
        # print("q_true: {}".format(q_true))
        q_true_his.append(q_true)
    # true_weights_scala = env.true_weights_scala(L, gamma)
    # print("true_weights_scala: {}".format(true_weights_scala))
    # estimate_weights = agent.policy.weights
    # print("estimate_weights: {}".format(estimate_weights))
    # true_estimate_error = np.linalg.norm(true_weights_scala-estimate_weights)
    # print("true_estimate_error: {}".format(true_estimate_error))

    now = time.strftime("%Y-%m-%d-%H_%M_%S", time.localtime(time.time()))
    # save data to file
    # note: .item() only works for a single-element matrix
    dirname = "data/Estimation/state=" + str(state.item()) + "/"
    try:
        os.mkdir(dirname)
    except OSError as error:
        print(error)
    # # save q_true
    # filename = dirname + "q_true.pickle"
    # f = open(filename, 'wb')
    # pickle.dump(q_true_his, f)
    # f.close()
    # save q_estimate
    # if params['basis_func'].name()[:3] == 'RBF':
    #     filename = dirname + params['basis_func'].name()+"-"+str(params['basis_function_dim'])+"-"+params['reg_opt']+"-"+str(params['reg_param'])+".pickle"
    # else:
    #     filename = dirname + params['basis_func'].name()+".pickle"
    # f1 = open(filename, 'wb')
    # pickle.dump(q_estimate_his, f1)
    # f1.close()

    print("q_estimate_his: {}".format(q_estimate_his))
    plt.figure(figsize=(8, 6))
    plt.subplot(211)
    plt.plot(actions, q_estimate_his)
    plt.title('q estimate')
    plt.subplot(212)
    plt.plot(actions, q_true_his)
    plt.title('q true')
    # plt.savefig("images/rbf-lqr/"+str(n_features)+"-"+now+"q_true&estimate-action(-1,1)")
    plt.show()

    env.close()
    replay_buffer.reset()

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--weight_discount', default=0.99, type=float)  # note: 1.0 only for finite horizon
    parser.add_argument('--exploration', default=0.1, type=float)  # 0.0 means no random action
    parser.add_argument('--basis_function_dim', default=40, type=int)
    parser.add_argument('--stop_criterion', default=10**-5, type=float)
    parser.add_argument('--sample_max_steps', default="5000", choices=["2000", "5000"])
    parser.add_argument('--max_steps', default=500, type=int)
    parser.add_argument('--reg_opt', default="l2", choices=["l1", "l2", "wl1", "none"])
    parser.add_argument('--reg_param', default=0.001, type=float)
    parser.add_argument('--rbf_sigma', default=0.01, type=float)
    # parser.add_argument('--batch_size', default=2000, type=int)
    parser.add_argument('--L', default=0.1, type=float)  # policy gain L (action = -L * state)
    args = parser.parse_args()
    params = vars(args)

    # env
    env = LQREnv()
    params['n_actions'] = env.action_space.shape[0]
    params['state_dim'] = env.observation_space.shape[0]
    params['sample_max_steps'] = int(params['sample_max_steps'])
    # print(params['state_dim'])

    # basis function
    n_features = params['basis_function_dim']
    gamma = params['weight_discount']
    # params['basis_func'] = ExactBasis4LQR()
    params['basis_func'] = RBF_LQR([params['state_dim'], params['n_actions']],
                                   n_features, params['rbf_sigma'])

    # estimate Q for a specific L
    L = np.matrix(params['L'])
    # params['policy'] = ExactPolicy4LQR(params['basis_func'], L)
    params['policy'] = RBFPolicy4LQR(params['basis_func'], L)

    # set the parameters for the agent
    batch_size = params['sample_max_steps']
    max_steps = params['max_steps']
    agent = LSPIAgent(params)

    sample_filename = LQR_samples_filename[params['sample_max_steps']]
    with open(sample_filename, 'rb') as f:
        replay_buffer = pickle.load(f)
    samples = replay_buffer.sample(batch_size)
    print("length of sample: {}".format(len(samples[0])))
    error_list, new_weights = agent.train(samples)

    # for a specific state, sweep over a range of actions
    for si in range(-10, 10, 5):
        si = -1.0  # NOTE: overrides the loop value, so every iteration uses state -1.0
        true_estimate_error_history = []
        q_true_his = []
        q_estimate_his = []
        state = np.matrix(si)
        actions = np.linspace(-6, 6, 100)
        q_estimate_his = agent.policy.q_state_action_func(np.full(len(actions), state), actions)
        for i in range(len(actions)):
            action = np.matrix(actions[i])
            # q_estimate = agent.policy.q_state_action_func(state, action)[0]
            # q_estimate_his.append(q_estimate)
            # print("q_estimate: {}".format(q_estimate))
            q_true = env.true_Qvalue(L, gamma, state, action)
            # print("q_true: {}".format(q_true))
            q_true_his.append(q_true)

        true_weights_scala = env.true_weights_scala(L, gamma)
        print("true_weights_scala: {}".format(true_weights_scala))
        estimate_weights = agent.policy.weights
        print("estimate_weights: {}".format(estimate_weights))
        true_estimate_error = np.linalg.norm(true_weights_scala - estimate_weights)
        print("true_estimate_error: {}".format(true_estimate_error))

        # now = time.strftime("%Y-%m-%d-%H_%M_%S", time.localtime(time.time()))
        # save data to file
        # note: .item() only works for a single-element matrix
        # dirname = "data/Estimation/state=" + str(state.item())+"/"
        # try:
        #     os.mkdir(dirname)
        # except OSError as error:
        #     print(error)
        # # save q_true
        # filename = dirname + "q_true.pickle"
        # f = open(filename, 'wb')
        # pickle.dump(q_true_his, f)
        # f.close()
        # save q_estimate
        # if params['basis_func'].name()[:3] == 'RBF':
        #     filename = dirname + params['basis_func'].name()+"-"+str(params['basis_function_dim'])+"-"+params['reg_opt']+"-"+str(params['reg_param'])+".pickle"
        # else:
        #     filename = dirname + params['basis_func'].name()+".pickle"
        # f1 = open(filename, 'wb')
        # pickle.dump(q_estimate_his, f1)
        # f1.close()

        qe_index = np.argmax(q_estimate_his)
        qt_index = np.argmax(q_true_his)
        plt.figure(figsize=(10, 8))
        plt.subplot(211)
        ax = plt.gca()
        plt.plot(actions, q_estimate_his)
        plt.scatter(actions[qe_index], q_estimate_his[qe_index], c='r')
        plt.xlabel('actions')
        plt.ylabel('q value')
        plt.title('estimate q value')
        ax.xaxis.set_label_coords(1.02, -0.035)
        plt.subplot(212)
        plt.plot(actions, q_true_his)
        plt.scatter(actions[qt_index], q_true_his[qt_index], c='r')
        plt.title('true q value')
        # plt.savefig("images/rbf-lqr/"+str(n_features)+"-"+now+"q_true&estimate-action(-1,1)")
        plt.show()

    env.close()
    replay_buffer.reset()

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--weight_discount', default=0.99, type=float)  # note: 1.0 only for finite horizon
    parser.add_argument('--exploration', default=0.1, type=float)  # 0.0 means no random action
    parser.add_argument('--basis_function_dim', default=100, type=int)
    parser.add_argument('--stop_criterion', default=10**-3, type=float)
    parser.add_argument('--sample_max_steps', default="10000", choices=["2000", "5000", "10000", "20000"])
    parser.add_argument('--max_steps', default=20, type=int)
    parser.add_argument('--reg_opt', default="l2", choices=["l1", "l2", "wl1", "none"])
    parser.add_argument('--reg_param', default=0.001, type=float)
    parser.add_argument('--rbf_sigma', default=0.01, type=float)
    # parser.add_argument('--batch_size', default=2000, type=int)
    parser.add_argument('--L', default=0.1, type=float)  # policy gain L (action = -L * state)
    args = parser.parse_args()
    params = vars(args)

    # env
    # env = LQREnv()
    A = np.matrix([[0.9, 0.], [0.08, 0.9]])
    B = np.matrix([[0.1], [0.6]])
    Z1 = np.matrix([[1, 0], [0, 0]])
    Z2 = 0.1
    noise_cov = np.matrix([[1, 0], [0, 1]])
    env = LQREnv(A=A, B=B, Z1=Z1, Z2=Z2, noise_cov=noise_cov)
    params['n_actions'] = env.action_space.shape[0]
    params['state_dim'] = env.observation_space.shape[0]
    params['sample_max_steps'] = int(params['sample_max_steps'])
    # print(params['state_dim'])

    # basis function
    n_features = params['basis_function_dim']
    gamma = params['weight_discount']
    # params['basis_func'] = ExactBasis4LQR()
    params['basis_func'] = RBF_LQR([params['state_dim'], params['n_actions']],
                                   n_features, params['rbf_sigma'])
    params['policy'] = RBFPolicy4LQR(params['basis_func'])

    # set the parameters for the agent
    batch_size = params['sample_max_steps']
    max_steps = params['max_steps']
    agent = LSPIAgent(params)

    sample_filename = LQR2D_samples_filename[params['sample_max_steps']]
    # sample_filename = LQR_samples_filename["-22-10000"]
    with open(sample_filename, 'rb') as f:
        replay_buffer = pickle.load(f)
    sample = replay_buffer.sample(batch_size)
    print("length of sample: {}".format(len(sample[0])))
    error_list, new_weights = agent.train(sample)

    # compare the learned policy with the optimal gain on a grid of states
    # states = np.linspace(-10, 10, 500)
    states = np.linspace([-10] * env.m, [10] * env.m, 500)
    trueL = env.optimal_policy_L(gamma)
    actions_true = []
    for i in range(len(states)):
        state = np.matrix(states[i].reshape(env.m, 1))
        # action = agent.policy.get_best_action(state)
        # actions_estimate.append(action)
        actions_true.append((-trueL * state).item())
    # print(actions_true)
    actions_estimate = agent.policy.get_best_action(states)

    # save the estimated actions
    now = time.strftime("%Y-%m-%d", time.localtime(time.time()))
    now2 = time.strftime("%H_%M_%S", time.localtime(time.time()))
    path = "data/LQR2D/" + now
    folder = os.path.exists(path)
    if not folder:
        os.makedirs(path)
    fn = path + "/data-" + str(params['reg_opt']) + "-" + str(params['reg_param']) \
        + "-BF" + str(n_features) + "-" + params['basis_func'].name() + "-" + now2 + ".pkl"
    with open(fn, 'wb') as f:
        pickle.dump(actions_estimate, f)

    # plot
    plt.plot(states, actions_estimate, label='estimate')
    # print(actions_true)
    plt.plot(states, actions_true, label='true')
    plt.legend(loc='upper right')
    pltfn = path + "/data-" + str(params['reg_opt']) + "-" + str(params['reg_param']) \
        + "-BF" + str(n_features) + "-" + params['basis_func'].name() + "-" + now2 + ".png"
    # f = open(fn, 'wb')
    plt.savefig(pltfn, dpi=300)
    # plt.show()

    # clean
    env.close()

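# Illustrative only: the role the A, B, Z1, Z2, noise_cov matrices above
# presumably play, assuming LQREnv implements the standard discounted LQR model
# (not confirmed by this excerpt):
#   x_{t+1} = A x_t + B u_t + w_t,   w_t ~ N(0, noise_cov)
#   cost_t  = x_t^T Z1 x_t + Z2 * u_t^2
# The step below is a hypothetical hand simulation, not a call into LQREnv.
def _demo_lqr2d_step():
    import numpy as np
    A = np.matrix([[0.9, 0.], [0.08, 0.9]])
    B = np.matrix([[0.1], [0.6]])
    Z1 = np.matrix([[1, 0], [0, 0]])
    Z2 = 0.1
    noise_cov = np.matrix([[1, 0], [0, 1]])

    x = np.matrix([[1.0], [0.5]])      # current state
    u = -0.1 * x[0, 0]                 # some scalar action (hypothetical gain on the first component)
    w = np.matrix(np.random.multivariate_normal([0, 0], noise_cov)).T
    x_next = A @ x + B * u + w         # linear dynamics plus Gaussian noise
    cost = float(x.T @ Z1 @ x) + Z2 * u**2
    return x_next, cost
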
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_name', default="LQR",
                        choices=["cliff-v0", "CartPole-v0", "inverted_pedulum", "LQR", "chain"])  # gym env to train
    parser.add_argument('--episode_num', default=10, type=int)
    parser.add_argument('--weight_discount', default=0.99, type=float)  # note: 1.0 only for finite horizon
    parser.add_argument('--exploration', default=0.1, type=float)  # 0.0 means no random action
    parser.add_argument('--basis_function_dim', default=10, type=int)
    parser.add_argument('--stop_criterion', default=10**-5, type=float)
    parser.add_argument('--sample_max_steps', default="5000", choices=["2000", "5000", "10000", "20000"])
    parser.add_argument('--max_steps', default=500, type=int)
    parser.add_argument('--batch_size', default=2000, type=int)
    parser.add_argument('--update_freq', default=10000000, type=int)
    parser.add_argument('--L', default=0.1, type=float)  # initial policy gain L (action = -L * state)
    parser.add_argument('--reg_opt', default="none", choices=["l1", "l2", "wl1", "none"])
    parser.add_argument('--reg_param', default=0.01, type=float)
    args = parser.parse_args()
    params = vars(args)

    # env
    env = LQREnv()
    params['n_actions'] = env.action_space.shape[0]
    params['state_dim'] = env.observation_space.shape[0]
    params['basis_func'] = ExactBasis4LQR()
    params['sample_max_steps'] = int(params['sample_max_steps'])
    gamma = params['weight_discount']

    # Note: the policy is initialized with a specific L,
    # so its actions depend on this initial L. Remember to update L!
    L = np.matrix(params['L'])
    params['policy'] = ExactPolicy4LQR(params['basis_func'], L)

    # set the parameters for the agent
    batch_size = params['batch_size']
    update_freq = params['update_freq']
    n_episode = params['episode_num']
    max_steps = params['max_steps']
    agent = LSPIAgent(params)

    sample_filename = LQR_samples_filename[params['sample_max_steps']]
    with open(sample_filename, 'rb') as f:
        replay_buffer = pickle.load(f)

    # training to get weights -> best L
    sample = replay_buffer.sample(batch_size)
    error_list, new_weights = agent.train(sample)

    # log
    reward_his = []
    estimateL_his = []
    i_update = 0
    for i_episode in range(n_episode):
        state = env.reset()
        i_episode_steps = 0
        accu_reward = 0
        # LQR never returns done
        # print("i_episode: {}".format(i_episode))
        while True:
            i_episode_steps += 1
            action = agent.get_action(state)
            state_, reward, done, info = env.step(action[0])
            # print("state: {}".format(state))
            # print("action: {}".format(action))
            # print("reward: {}".format(reward))
            # print("state_: {}\n".format(state_))
            # replay_buffer.store(state, action, reward, state_, done)
            accu_reward += reward
            state = state_
            if i_episode_steps > 20:
                # done
                # print("accu_reward {}\n".format(accu_reward))
                reward_his.append(accu_reward)
                time.sleep(0.1)
                break
        # estimateL = agent.policy.estimate_policy_L().item()
        # use the true Q/weights under the current L to check whether it converges to the optimal one
        true_weights = env.true_weights_scala(agent.policy.L, gamma)
        w3 = true_weights[2].item()
        w4 = true_weights[3].item()
        estimateL = np.matrix(w4 / (2 * w3))
        estimateL_his.append(estimateL.item())
        agent.policy.L = estimateL
        print("estimateL: {}".format(estimateL))
        agent.train(sample)

    # now = time.strftime("%Y-%m-%d-%H_%M_%S", time.localtime(time.time()))
    trueL = env.optimal_policy_L(gamma).item()
    print("trueL: {}".format(trueL))
    print("estimateL_his: {}".format(estimateL_his))
    env.close()
    replay_buffer.reset()

    # plot
    # plt.plot(reward_his)
    # plt.show()
    plt.plot(np.arange(n_episode), estimateL_his, label='estimate L')
    plt.plot(np.arange(n_episode), [trueL] * n_episode, label='optimal L')
    plt.ylabel('L')
    plt.xlabel('iteration')
    plt.legend(loc='upper right')
    plt.show()

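# Illustrative only: where "estimateL = w4 / (2 * w3)" in the loop above
# presumably comes from. Assumption (not confirmed by this excerpt): the exact
# basis orders its quadratic features so that weights[2] multiplies a^2 and
# weights[3] multiplies s*a, i.e. Q(s, a) = w1 + w2*s^2 + w3*a^2 + w4*s*a.
# Setting dQ/da = 2*w3*a + w4*s = 0 gives the greedy action a = -(w4/(2*w3))*s,
# so with the convention a = -L*s the induced gain is L = w4 / (2*w3).
def greedy_gain_from_weights(weights):
    w3 = float(weights[2])  # assumed coefficient on a^2
    w4 = float(weights[3])  # assumed coefficient on s*a
    return w4 / (2.0 * w3)
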
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--weight_discount', default=0.99, type=float)  # note: 1.0 only for finite horizon
    parser.add_argument('--exploration', default=0.1, type=float)  # 0.0 means no random action
    parser.add_argument('--basis_function_dim', default=20, type=int)
    parser.add_argument('--stop_criterion', default=10**-3, type=float)
    parser.add_argument('--sample_max_steps', default="5000", choices=["2000", "5000", "10000", "20000"])
    parser.add_argument('--reg_opt', default="wl1", choices=["l1", "l2", "wl1", "none"])
    parser.add_argument('--reg_param', default=0.001, type=float)
    parser.add_argument('--rbf_sigma', default=0.01, type=float)
    # parser.add_argument('--batch_size', default=2000, type=int)
    parser.add_argument('--L', default=0.1, type=float)  # policy gain L (action = -L * state)
    args = parser.parse_args()
    params = vars(args)

    # env
    # env = LQREnv()
    # only state[1] is penalized in the cost (Z1)
    A = np.matrix([[0.9, 0.], [0.1, 0.9]])
    B = np.matrix([[1], [0.]])
    Z1 = np.matrix([[0, 0], [0, 1]])
    Z2 = 0.1
    noise_cov = np.matrix([[0.01, 0], [0, 0.01]])
    env = LQREnv(A=A, B=B, Z1=Z1, Z2=Z2, noise_cov=noise_cov)
    params['n_actions'] = env.action_space.shape[0]
    params['state_dim'] = env.observation_space.shape[0]
    params['sample_max_steps'] = int(params['sample_max_steps'])
    # print(params['state_dim'])

    # basis function
    n_features = params['basis_function_dim']
    gamma = params['weight_discount']
    # params['basis_func'] = ExactBasis4LQR()

    batch_size = params['sample_max_steps']
    sample_filename = LQR2D_samples_filename[params['sample_max_steps']]
    # sample_filename = LQR_samples_filename["-22-10000"]
    with open(sample_filename, 'rb') as f:
        replay_buffer = pickle.load(f)
    sample = replay_buffer.sample(batch_size)
    print("length of sample: {}".format(len(sample[0])))

    # bound the Laplace basis domain at twice the largest |state| / |action| in the sample
    L_vec = 2 * np.concatenate((np.max(np.abs(sample[0]), axis=0).flatten(),
                                [np.max(np.abs(sample[1]))]))
    params['basis_func'] = Laplace_LQR(n_features, L_vec)
    params['policy'] = RBFPolicy4LQR(params['basis_func'])

    agent = BellmanAgent(params)
    error_list, new_weights = agent.train(sample)
    test_agent4LQR2D(agent, env, gamma, ifshow=False)

    env.close()