def perfect_features_test():
    """Run SGD on an Amatrix task using the task's own matrix as features.

    Creates ``Amatrix(3, 2)``, takes ``env.Amatrix`` as the feature matrix and
    performs 100000 SGD updates, each on a noisy target sampled for a
    uniformly random row.  The sample count and the latest prediction error
    are printed every 10000 samples.  When training ends, ``env.theta_star``,
    the learned weights and the L2 norm of their difference are printed.
    """
    from src.env.Amatrix_task import Amatrix

    num_rows, num_cols = 3, 2
    env = Amatrix(num_rows, num_cols)
    feature_matrix = env.Amatrix  # perfect features
    theta = np.zeros(num_rows)

    config = Config()
    config.parameter_size = num_rows
    config.alpha = 0.001
    sgd = SGD(config)

    total_samples = 100000
    for step in range(1, total_samples + 1):
        row = np.random.randint(num_rows)
        target = env.sample_target(row, noisy=True)
        row_features = feature_matrix[row, :]
        error = target - np.dot(row_features, theta)

        # Only the third returned value (the new weight vector) is used here.
        _, _, theta = sgd.update_weight_vector(error, row_features, theta)

        if step % 10000 != 0:
            continue
        print("Sample number: {0}".format(step))
        print("\tPrediction error:{0}".format(error))

    print("Theta star:\n{0}".format(env.theta_star))
    print("Estimated theta:\n{0}".format(theta))
    residual = env.theta_star - theta
    difference = np.sqrt(np.sum(np.square(residual)))
    print("L2 norm of difference:\n{0}".format(difference))
def imperfect_features_test():
    """Run SGD on an Amatrix task using only the approximate feature matrix.

    Creates ``Amatrix(10, 2)``, takes ``env.get_approx_A()`` as the feature
    matrix and performs 50000 SGD updates on noisy targets sampled for
    uniformly random rows.  After the last update it prints the sample count,
    the final prediction error, ``env.theta_star`` and the learned weights.
    """
    from src.env.Amatrix_task import Amatrix

    num_rows, num_cols = 10, 2
    env = Amatrix(num_rows, num_cols)
    feature_matrix = env.get_approx_A()  # first m features
    theta = np.zeros(num_cols)

    config = Config()
    config.parameter_size = num_cols
    config.alpha = 0.001
    sgd = SGD(config)

    total_samples = 50000
    for step in range(1, total_samples + 1):
        row = np.random.randint(num_rows)
        target = env.sample_target(row, noisy=True)
        row_features = feature_matrix[row, :]
        error = target - np.dot(row_features, theta)

        # Only the third returned value (the new weight vector) is used here.
        _, _, theta = sgd.update_weight_vector(error, row_features, theta)

    # Final report: `step` and `error` hold the values from the last update.
    print("Sample number: {0}".format(step))
    print("\tPrediction error:{0}".format(error))
    print("Theta star:\n{0}".format(env.theta_star))
    print("Estimated theta:\n{0}".format(theta))
def adding_bad_features_test():
    """Train with SGD while repeatedly appending features to the representation.

    Creates ``Amatrix(10, 5)`` and starts from ``env.get_approx_A()``.  The
    experiment runs 31 training phases of 50000 SGD updates each.  Between
    consecutive phases one column obtained from
    ``env.get_new_bad_features(1)`` is appended to the feature matrix, the
    optimizer is grown by one parameter, and the weight vector is extended
    with a zero entry.  Each phase prints the optimizer's parameter size, the
    last prediction error, ``env.theta_star`` and the current weights.
    """
    from src.env.Amatrix_task import Amatrix

    num_rows = 10
    num_features = 5
    env = Amatrix(num_rows, num_features)
    feature_matrix = env.get_approx_A()  # first m features
    theta = np.zeros(num_features)

    config = Config()
    config.parameter_size = num_features
    config.alpha = 0.001
    sgd = SGD(config)

    samples_per_phase = 50000
    additional_features = 30

    for phase in range(additional_features + 1):
        print("Number of features in the representation: {0}".format(
            sgd.parameter_size))

        for step in range(1, samples_per_phase + 1):
            row = np.random.randint(num_rows)
            target = env.sample_target(row, noisy=True)
            row_features = feature_matrix[row, :]
            error = target - np.dot(row_features, theta)

            # Only the third returned value (the new weights) is used here.
            _, _, theta = sgd.update_weight_vector(error, row_features, theta)

            if (step % 50000) == 0:
                print("\tSample number: {0}".format(step))
                print("\t\tPrediction error: {0}".format(error))

        print("Theta star:\n{0}".format(env.theta_star))
        print("Estimated theta:\n{0}".format(theta))

        # The final phase only trains; nothing is appended after it.
        if phase == additional_features:
            break

        print("Adding new feature...")
        extra_column = env.get_new_bad_features(1)
        feature_matrix = np.hstack((feature_matrix, extra_column))
        sgd.increase_size(1)

        # Carry the learned weights over; the new feature starts at zero.
        grown_theta = np.zeros(num_features + 1)
        grown_theta[:num_features] = theta
        theta = grown_theta
        num_features += 1
def test_function_approximator(num_features=20, initial_features=20,
                               num_iterations=10000, chkpt=100, plot_mse=True,
                               noisy=True, add_features=False,
                               add_true_features=True,
                               feature_add_interval=100, mixed_features=False):
    """Train a linear function approximator with SGD on a RandomFeatures task.

    Args:
        num_features: stored as ``config.num_true_features``.
        initial_features: stored as ``config.num_obs_features`` and used as
            the optimizer's initial ``parameter_size``.
        num_iterations: number of sampled observations / SGD updates.
        chkpt: number of iterations per reporting window; the squared error is
            averaged over each window, printed, and stored for plotting.
        plot_mse: when true, plot the per-window mean squared error with
            matplotlib once training is done.
        noisy: forwarded to ``task.sample_observation``.
        add_features: when true, one feature is added to the task, the
            approximator and the optimizer every ``feature_add_interval``
            iterations.
        add_true_features: forwarded as ``true_feature`` when adding a feature.
        feature_add_interval: iterations between feature additions.
        mixed_features: when true, ``add_true_features`` is flipped after
            every feature addition.
    """
    from src.step_size_methods import SGD

    config = Config()

    # task setup
    config.num_true_features = num_features
    config.num_obs_features = initial_features  # same as function approximator
    config.max_num_features = 20000             # same as function approximator
    task = RandomFeatures(config)

    # function approximator setup
    approximator = LinearFunctionApproximator(config)

    # optimizer setup
    config.parameter_size = initial_features
    config.alpha = 0.001
    optimizer = SGD(config)

    # for plotting
    num_windows = num_iterations // chkpt
    mse_per_chpt = np.zeros(num_windows, dtype=np.float64)
    mse = 0
    window_index = 0

    # training loop
    for iteration in range(1, num_iterations + 1):
        # The third returned value is not used by this test.
        target, observable_features, _ = task.sample_observation(noisy=noisy)
        prediction = approximator.get_prediction(observable_features)
        error = target - prediction

        current_weights = approximator.get_weight_vector()
        _, _, updated_weights = optimizer.update_weight_vector(
            error, observable_features, current_weights)
        approximator.update_weight_vector(updated_weights)

        # Running mean of the squared error over the current window.
        mse += np.square(error) / chkpt

        if iteration % chkpt == 0:
            # reporting and saving
            print("Iteration number: {0}".format(iteration))
            print("\tTarget: {0:.4f}".format(target))
            print("\tPrediction: {0:.4f}".format(prediction))
            print("\tMean Squared Error: {0:.4f}".format(mse))
            mse_per_chpt[window_index] += mse
            mse *= 0
            window_index += 1

        if add_features and iteration % feature_add_interval == 0:
            task.add_new_feature(k=1, true_feature=add_true_features)
            approximator.increase_num_features(k=1)
            optimizer.increase_size(k=1)
            if mixed_features:
                add_true_features = not add_true_features

    if not plot_mse:
        return

    # plots
    import matplotlib.pyplot as plt
    plt.plot(np.arange(num_windows), mse_per_chpt)
    plt.show()
    plt.close()
def sarsa_zero_test(steps=10000, add_new_centers=False,
                    number_of_irrelevant_features=0):
    """Run Sarsa(0) with an epsilon-greedy policy on MountainCar and plot returns.

    One linear function approximator and one SGD optimizer are kept per
    action; state features come from a ``RadialBasisFunction``.  Halfway
    through training (at step ``steps // 2``) the representation can be
    grown, after which the features of the current state are recomputed so
    they match the new representation size.

    Args:
        steps: total number of environment steps.
        add_new_centers: when true, five extra centers are added to the
            feature function at the midpoint and every approximator and
            optimizer is grown accordingly.
        number_of_irrelevant_features: when greater than zero, that many
            features are added at the midpoint through
            ``feature_function.add_feature(..., fake_feature=True)``.

    Prints the number of completed episodes and the return of each episode,
    then plots return per episode with a vertical line at the number of
    episodes that had been completed at the midpoint.
    """
    import matplotlib.pyplot as plt
    from src.env.RandomFeatures_task import LinearFunctionApproximator
    from src.step_size_methods.sgd import SGD

    # epsilon greedy policy
    def choose_action(av_array: np.ndarray, epsilon):
        p = np.random.rand()
        if p > epsilon:
            # greedy action, ties broken uniformly at random
            return np.random.choice(
                np.flatnonzero(av_array == av_array.max()))
        return np.random.randint(av_array.size)

    # for computing action values
    def get_action_values(n, features, approximator_list):
        action_values = np.zeros(n, dtype=np.float64)
        for k in range(n):
            action_values[k] += approximator_list[k].get_prediction(features)
        return action_values

    completed_episodes_per_run = []
    for _ in range(1):
        print("==== Results for Sarsa(0) with Epsilon Greedy Policy ====")
        config = Config()

        # setting up feature function
        config.state_dims = 2
        config.state_lims = np.array(((-1, 1), (-1, 1)), dtype=np.float64)
        # config.initial_centers = np.array(((0.0,0.0), (-1.8,0), (1.8,0), (0.0,-1.8), (0.0,1.8)), dtype=np.float64)
        config.initial_centers = np.array(
            ((0.0, 0.0), (0.25, 0.25), (0.25, -0.25), (-0.25, -0.25),
             (-0.25, 0.25)),
            dtype=np.float64)
        config.sigma = 0.5
        config.init_noise_mean = 0.0
        config.init_noise_var = 0.01
        feature_function = RadialBasisFunction(config)

        # setting up environment
        config.norm_state = True
        env = MountainCar(config)

        # function approximator and optimizer parameters
        num_actions = 3
        random_action_prob = 0.1
        gamma = 0.99
        config.num_obs_features = feature_function.num_features
        config.max_num_features = 200  # as long as this is more than 12
        config.num_actions = num_actions
        config.alpha = 0.005
        config.rescale = False
        config.parameter_size = feature_function.num_features

        # one instance for each action
        function_approximator = []
        optimizer = []
        for _action in range(num_actions):
            function_approximator.append(LinearFunctionApproximator(config))
            optimizer.append(SGD(config))

        # setting up summaries
        all_episodes_return = []
        episode_return = 0

        # setting up initial state, action, features, and action values
        curr_s = env.get_current_state()
        curr_features = feature_function.get_observable_features(curr_s)
        curr_avs = get_action_values(num_actions, curr_features,
                                     function_approximator)
        curr_a = choose_action(curr_avs, random_action_prob)
        midpoint_episode = 0

        for i in range(steps):
            # get current action values
            curr_avs = get_action_values(num_actions, curr_features,
                                         function_approximator)

            # execute current action
            next_s, r, terminal = env.step(curr_a)
            next_features = feature_function.get_observable_features(next_s)

            # get next action values and action
            next_action_values = get_action_values(num_actions, next_features,
                                                   function_approximator)
            next_action = choose_action(next_action_values,
                                        random_action_prob)

            # compute TD error for Sarsa(0); no bootstrapping on terminal steps
            td_error = (r
                        + gamma * (1 - terminal)
                        * next_action_values[next_action]
                        - curr_avs[curr_a])

            # update weight vector of the action that was taken
            _, _, new_weights = optimizer[curr_a].update_weight_vector(
                td_error, curr_features,
                function_approximator[curr_a].get_weight_vector())
            function_approximator[curr_a].update_weight_vector(new_weights)

            # set current state, features and action
            # Keeping curr_s in step with the environment matters: the
            # midpoint block below recomputes curr_features from curr_s, and a
            # stale state would yield features for the wrong state.
            curr_s = next_s
            curr_features = next_features
            curr_a = next_action

            # keep track of sum of rewards
            episode_return += r

            # if terminal state
            if terminal:
                env.reset()
                all_episodes_return.append(episode_return)
                episode_return *= 0
                curr_s = env.get_current_state()
                curr_features = feature_function.get_observable_features(
                    curr_s)
                curr_avs = get_action_values(num_actions, curr_features,
                                             function_approximator)
                curr_a = choose_action(curr_avs, random_action_prob)

            # if midpoint of training
            if (i + 1) == (steps // 2):
                if add_new_centers:
                    new_centers = np.array(
                        ((0, 0), (0.25, 0.25), (0.25, -0.25), (-0.25, -0.25),
                         (-0.25, 0.25)),
                        dtype=np.float64)
                    feature_function.add_centers(new_centers, noise_var=0,
                                                 noise_mean=0)
                    for k in range(num_actions):
                        function_approximator[k].increase_num_features(
                            new_centers.shape[0])
                        optimizer[k].increase_size(new_centers.shape[0],
                                                   init_stepsize=0.25)

                if number_of_irrelevant_features > 0:
                    new_feature_mean = 0.0
                    new_feature_var = 0.05
                    fake_features = True
                    feature_function.add_feature(
                        number_of_irrelevant_features,
                        noise_mean=new_feature_mean,
                        noise_var=new_feature_var,
                        fake_feature=fake_features)
                    for k in range(num_actions):
                        function_approximator[k].increase_num_features(
                            number_of_irrelevant_features)
                        optimizer[k].increase_size(
                            number_of_irrelevant_features)

                # Recompute the features of the *current* state so they match
                # the (possibly enlarged) representation.
                curr_features = feature_function.get_observable_features(
                    curr_s)
                midpoint_episode = len(all_episodes_return)

        completed_episodes_per_run.append(len(all_episodes_return))
        print("Number of episodes completed: {0}".format(
            len(all_episodes_return)))

    print("Average episodes completed: {0:0.4f}".format(
        np.average(completed_episodes_per_run)))
    print("Return per episode:\n", all_episodes_return)

    plt.plot(np.arange(len(all_episodes_return)) + 1, all_episodes_return)
    plt.vlines(x=midpoint_episode, ymin=-800, ymax=0)
    plt.ylim((-800, 0))
    plt.show()
    plt.close()