def test_random_features_generator(sample_size=10, compute_sample_statistics=True):
    """Samples observations from the RandomFeatures task and optionally reports the
    sample mean and variance of the target."""
    config = Config()
    config.num_true_features = 2
    config.num_obs_features = 2
    config.max_num_features = 20000

    task = RandomFeatures(config)
    print("The value of theta is:\n{0}".format(task.theta))
    print("The norm of theta is: {0}".format(np.linalg.norm(task.theta)))

    for i in range(sample_size):
        target, observable_features, best_approximation = task.sample_observation(noisy=False)
        print("The features are: {0}\tThe target is: {1}".format(observable_features, target))

    if compute_sample_statistics:
        num_samples = 100000
        samples = np.zeros(num_samples)
        for i in range(num_samples):
            target, _, _ = task.sample_observation(noisy=False)
            samples[i] += target
        # The sample average and sample variance of the target should be 0 and 1, respectively.
        print("The sample average of the target is: {:.2f}".format(np.average(samples)))
        print("The sample variance of the target is: {:.2f}".format(np.var(samples)))

def test_function_approximator(num_features=20, initial_features=20, num_iterations=10000, chkpt=100,
                               plot_mse=True, noisy=True, add_features=False, add_true_features=True,
                               feature_add_interval=100, mixed_features=False):
    """Trains a LinearFunctionApproximator with SGD on the RandomFeatures task, optionally
    adding new features during training, and plots the MSE per checkpoint."""
    from src.step_size_methods import SGD
    config = Config()
    # task setup
    config.num_true_features = num_features
    config.num_obs_features = initial_features  # same as function approximator
    config.max_num_features = 20000             # same as function approximator
    task = RandomFeatures(config)
    # function approximator setup
    approximator = LinearFunctionApproximator(config)
    # optimizer setup
    config.parameter_size = initial_features
    config.alpha = 0.001
    optimizer = SGD(config)
    # for plotting
    mse_per_chpt = np.zeros(num_iterations // chkpt, dtype=np.float64)
    mse = 0
    current_chpt = 0

    # training loop
    for i in range(num_iterations):
        target, observable_features, best_approximation = task.sample_observation(noisy=noisy)
        prediction = approximator.get_prediction(observable_features)
        error = target - prediction
        _, _, new_weights = optimizer.update_weight_vector(error, observable_features,
                                                           approximator.get_weight_vector())
        approximator.update_weight_vector(new_weights)
        squared_loss = np.square(error)
        mse += squared_loss / chkpt

        if (i + 1) % chkpt == 0:
            # reporting and saving
            print("Iteration number: {0}".format(i + 1))
            print("\tTarget: {0:.4f}".format(target))
            print("\tPrediction: {0:.4f}".format(prediction))
            print("\tMean Squared Error: {0:.4f}".format(mse))
            mse_per_chpt[current_chpt] += mse
            mse *= 0
            current_chpt += 1

        if add_features and (i + 1) % feature_add_interval == 0:
            task.add_new_feature(k=1, true_feature=add_true_features)
            approximator.increase_num_features(k=1)
            optimizer.increase_size(k=1)
            if mixed_features:
                add_true_features = not add_true_features

    if plot_mse:
        # plots
        import matplotlib.pyplot as plt
        x_axis = np.arange(num_iterations // chkpt)
        plt.plot(x_axis, mse_per_chpt)
        plt.show()
        plt.close()

def boyan_chain_test(steps=50000):
    """Runs semi-gradient TD(0) with AutoTIDBD step sizes on the Boyan chain, adds 4 new
    features halfway through training, and plots the running MSVE."""
    from src.env.BoyanChain import BoyanChain
    from src.env.RandomFeatures_task import LinearFunctionApproximator
    from src.util import Config
    import matplotlib.pyplot as plt

    config = Config()
    checkpoint = 100
    """ Environment Setup """
    config.init_noise_var = 0.1
    config.num_obs_features = 4
    config.max_num_features = 9
    """ AutoTIDBD Setup """
    config.parameter_size = 4
    config.theta = 0.001
    config.tau = 10000
    config.init_stepsize = 0.001
    # to keep track of learning progress
    run_avg_msve = np.zeros(steps // checkpoint, dtype=np.float64)
    current_checkpoint = 0
    avg_msve = 0

    env = BoyanChain(config)
    approximator = LinearFunctionApproximator(config)
    optimizer = AutoTIDBD(config)

    """ Start of Learning """
    curr_obs_feats = env.get_observable_features()
    for s in range(steps):
        state_value = approximator.get_prediction(curr_obs_feats)
        optimal_value = env.compute_true_value()
        # step in the environment
        _, r, next_obs_feats, term = env.step()
        next_state_value = approximator.get_prediction(next_obs_feats)
        # compute td error
        td_error = r + (1 - term) * next_state_value - state_value
        # update weights
        _, _, new_weights = optimizer.update_weight_vector(td_error,
                                                           features=curr_obs_feats,
                                                           weights=approximator.get_weight_vector(),
                                                           discounted_next_features=next_obs_feats)
        approximator.update_weight_vector(new_weights)
        # update features
        curr_obs_feats = next_obs_feats
        # keep track of progress
        avg_msve += np.square(state_value - optimal_value) / checkpoint
        # check if terminal state
        if term:
            env.reset()
            curr_obs_feats = env.get_observable_features()
        # store learning progress so far
        if (s + 1) % checkpoint == 0:
            run_avg_msve[current_checkpoint] += avg_msve
            avg_msve *= 0
            current_checkpoint += 1
        # add features at the midpoint of training
        if (s + 1) == (steps // 2):
            env.add_feature(k=4, noise=0.0, fake_feature=False)
            approximator.increase_num_features(4)
            optimizer.increase_size(4)
            curr_obs_feats = env.get_observable_features()

    print("The average MSVE is: {0:0.4f}".format(np.average(run_avg_msve)))
    xaxis = np.arange(run_avg_msve.size) + 1
    plt.plot(xaxis, run_avg_msve)
    plt.show()
    plt.close()

def sarsa_zero_test(steps=10000, add_new_centers=False, number_of_irrelevant_features=0):
    """Runs Sarsa(0) with an epsilon-greedy policy and radial basis function features on
    mountain car, optionally adding new centers or irrelevant features at the midpoint
    of training."""
    import matplotlib.pyplot as plt
    from src.env.RandomFeatures_task import LinearFunctionApproximator
    from src.step_size_methods.sgd import SGD

    # epsilon greedy policy
    def choose_action(av_array: np.ndarray, epsilon):
        p = np.random.rand()
        if p > epsilon:
            argmax_av = np.random.choice(np.flatnonzero(av_array == av_array.max()))
            return argmax_av
        else:
            return np.random.randint(av_array.size)

    # for computing action values
    def get_action_values(n, features, approximator_list):
        action_values = np.zeros(n, dtype=np.float64)
        for k in range(n):
            action_values[k] += approximator_list[k].get_prediction(features)
        return action_values

    completed_episodes_per_run = []
    for _ in range(1):
        print("==== Results for Sarsa(0) with Epsilon Greedy Policy ====")
        config = Config()
        # setting up feature function
        config.state_dims = 2
        config.state_lims = np.array(((-1, 1), (-1, 1)), dtype=np.float64)
        # config.initial_centers = np.array(((0.0, 0.0), (-1.8, 0), (1.8, 0), (0.0, -1.8), (0.0, 1.8)), dtype=np.float64)
        config.initial_centers = np.array(((0.0, 0.0), (0.25, 0.25), (0.25, -0.25), (-0.25, -0.25), (-0.25, 0.25)),
                                          dtype=np.float64)
        config.sigma = 0.5
        config.init_noise_mean = 0.0
        config.init_noise_var = 0.01
        feature_function = RadialBasisFunction(config)
        # setting up environment
        config.norm_state = True
        env = MountainCar(config)
        # function approximator and optimizer parameters
        num_actions = 3
        random_action_prob = 0.1
        gamma = 0.99
        config.num_obs_features = feature_function.num_features
        config.max_num_features = 200  # as long as this is more than 12
        config.num_actions = num_actions
        config.alpha = 0.005
        config.rescale = False
        config.parameter_size = feature_function.num_features
        function_approximator = []
        optimizer = []
        # one instance for each action
        for i in range(num_actions):
            function_approximator.append(LinearFunctionApproximator(config))
            optimizer.append(SGD(config))
        # setting up summaries
        all_episodes_return = []
        episode_return = 0
        # setting up initial state, action, features, and action values
        curr_s = env.get_current_state()
        curr_features = feature_function.get_observable_features(curr_s)
        curr_avs = get_action_values(num_actions, curr_features, function_approximator)
        curr_a = choose_action(curr_avs, random_action_prob)
        midpoint_episode = 0

        for i in range(steps):
            # get current action values
            curr_avs = get_action_values(num_actions, curr_features, function_approximator)
            # execute current action
            next_s, r, terminal = env.step(curr_a)
            next_features = feature_function.get_observable_features(next_s)
            # get next action values and action
            next_action_values = get_action_values(num_actions, next_features, function_approximator)
            next_action = choose_action(next_action_values, random_action_prob)
            # compute TD error for Sarsa(0)
            td_error = r + gamma * (1 - terminal) * next_action_values[next_action] - curr_avs[curr_a]
            # update weight vector
            _, ss, new_weights = optimizer[curr_a].update_weight_vector(
                td_error, curr_features, function_approximator[curr_a].get_weight_vector())
            function_approximator[curr_a].update_weight_vector(new_weights)
            # set current state, features, and action
            curr_features = next_features
            curr_s = next_s  # keep curr_s in sync with curr_features
            curr_a = next_action
            # keep track of sum of rewards
            episode_return += r
            # if terminal state
            if terminal:
                env.reset()
                all_episodes_return.append(episode_return)
                episode_return *= 0
                curr_s = env.get_current_state()
                curr_features = feature_function.get_observable_features(curr_s)
                curr_avs = get_action_values(num_actions, curr_features, function_approximator)
                curr_a = choose_action(curr_avs, random_action_prob)
            # if midpoint of training
            if (i + 1) == (steps // 2):
                if add_new_centers:
                    new_centers = np.array(((0, 0), (0.25, 0.25), (0.25, -0.25), (-0.25, -0.25), (-0.25, 0.25)),
                                           dtype=np.float64)
                    feature_function.add_centers(new_centers, noise_var=0, noise_mean=0)
                    for k in range(num_actions):
                        function_approximator[k].increase_num_features(new_centers.shape[0])
                        optimizer[k].increase_size(new_centers.shape[0], init_stepsize=0.25)
                if number_of_irrelevant_features > 0:
                    new_feature_mean = 0.0
                    new_feature_var = 0.05
                    fake_features = True
                    feature_function.add_feature(number_of_irrelevant_features, noise_mean=new_feature_mean,
                                                 noise_var=new_feature_var, fake_feature=fake_features)
                    for k in range(num_actions):
                        function_approximator[k].increase_num_features(number_of_irrelevant_features)
                        optimizer[k].increase_size(number_of_irrelevant_features)
                curr_features = feature_function.get_observable_features(curr_s)
                midpoint_episode = len(all_episodes_return)

        completed_episodes_per_run.append(len(all_episodes_return))
        print("Number of episodes completed: {0}".format(len(all_episodes_return)))

    print("Average episodes completed: {0:0.4f}".format(np.average(completed_episodes_per_run)))
    print("Return per episode:\n", all_episodes_return)
    plt.plot(np.arange(len(all_episodes_return)) + 1, all_episodes_return)
    plt.vlines(x=midpoint_episode, ymin=-800, ymax=0)
    plt.ylim((-800, 0))
    plt.show()
    plt.close()
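

# A minimal sketch of a command-line entry point for running these tests. This block is an
# assumption added for convenience: the original module may be driven by a different runner,
# and the argument values below are illustrative defaults, not the settings of any reported
# experiment. Comment out the tests you do not want to run.
if __name__ == "__main__":
    test_random_features_generator(sample_size=10, compute_sample_statistics=True)
    test_function_approximator(num_iterations=10000, add_features=True, feature_add_interval=100)
    boyan_chain_test(steps=50000)
    sarsa_zero_test(steps=10000, add_new_centers=True)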