def evaluate_policies(policies_memmap, performance_memmap, evaluation_run_num, ace_run_num, config_num, policy_num, random_seed):
    if evaluation_run_num == 0:
        performance_memmap[config_num]['parameters'] = policies_memmap[config_num]['parameters']

    # Load the policy to evaluate:
    configuration = policies_memmap[config_num]['parameters']
    weights = policies_memmap[config_num]['policies'][ace_run_num, policy_num]['weights']
    actor = BinaryACE(weights.shape[0], weights.shape[1], 0.)
    actor.theta = weights

    # Handle situations where the learning process diverged:
    if np.any(np.isnan(weights)):
        # If the weights overflowed, assign NaN as the return:
        performance_memmap[config_num]['results'][ace_run_num, policy_num, evaluation_run_num] = np.nan
    else:
        # Set up the environment:
        import gym_puddle  # Re-import the puddleworld env in each subprocess or it sometimes isn't found during creation.
        env = gym.make(args.environment).unwrapped
        env.seed(random_seed)
        rng = env.np_random
        if args.objective == 'episodic':
            # Use the environment's start state:
            s_t = env.reset()
        else:
            raise NotImplementedError

        # Configure the tile coder:
        num_tiles_per_dim = configuration['num_tiles_per_dim']
        num_tilings = configuration['num_tilings']
        bias_unit = configuration['bias_unit']
        tc = TileCoder(np.array([env.observation_space.low, env.observation_space.high]).T,
                       num_tiles_per_dim, num_tilings, bias_unit)

        # Write the total reward received to file:
        performance_memmap[config_num]['results'][ace_run_num, policy_num, evaluation_run_num] = evaluate_policy(actor, tc, env, rng)
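The `evaluate_policy` helper called at the end of `evaluate_policies` is defined elsewhere in the repository and is not shown in this excerpt. A minimal sketch of a helper with this call signature, assuming it performs one capped-length rollout of the learned policy and returns the total reward (the `max_timesteps` cap and the action-sampling scheme are assumptions, not the repository's actual implementation):

def evaluate_policy(actor, tc, env, rng, max_timesteps=1000):
    # Hypothetical sketch: roll out the learned policy once and return the (undiscounted) total reward.
    g = 0.
    indices = tc.encode(env.reset())
    for _ in range(max_timesteps):
        a = rng.choice(env.action_space.n, p=actor.pi(indices))  # Sample an action from the learned policy.
        s, r, terminal, _ = env.step(a)
        indices = tc.encode(s)
        g += r
        if terminal:
            break
    return g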
experience_memmap_test = np.lib.format.open_memmap(args.experience_file_test, mode='r')
num_runs, num_test_eval = experience_memmap_test.shape

# Sample a random seed for each run without replacement, so that no two runs share a seed
# (with replacement, the birthday paradox would make collisions likely):
random.seed(args.random_seed)
random_seeds = random.sample(range(2**32), num_runs)

# Create the tile coder to be used for all parameter settings:
if args.environment == 'pw':
    dummy_env = puddleworld()
else:
    dummy_env = gym.make(args.environment).unwrapped  # Make a dummy env to get shape info.
tc = TileCoder(np.array([dummy_env.observation_space.low, dummy_env.observation_space.high]).T,
               args.num_tiles_per_dim, args.num_tilings, args.bias_unit)

# Create the memmapped array of learned policies that will be populated in parallel:
parameters_dtype = np.dtype([
    ('alpha_a', float),
    ('alpha_w', float),
    ('alpha_v', float),
    ('lambda', float),
    ('eta', float),
    ('gamma', float),
    ('num_tiles_per_dim', int, (len(args.num_tiles_per_dim),)),
    ('num_tilings', int),
    ('bias_unit', bool)
])
policy_dtype = np.dtype([
    ('timesteps', int),
    ('weights', float, (dummy_env.action_space.n, tc.total_num_tiles))
])
# num_timesteps is assumed to be defined earlier in the script (e.g. the number of training timesteps per run):
num_policies = num_timesteps // args.checkpoint_interval + 1
configuration_dtype = np.dtype([
    ('parameters', parameters_dtype),
    # The per-configuration policies are assumed to be indexed by (ACE run, checkpoint),
    # matching the policies_memmap[config_num]['policies'][ace_run_num, policy_num] access above:
    ('policies', policy_dtype, (num_runs, num_policies))
])
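The remainder of the script is not shown in this excerpt. As a rough continuation sketch only, the two memmapped arrays could be allocated from these dtypes and `evaluate_policies` dispatched over every index combination with joblib (which the repository already uses in its tests). The file names, the number of configurations, the number of evaluation runs, and the seed-per-run scheme below are assumptions:

from itertools import product
from joblib import Parallel, delayed

num_configurations = 10   # Assumed: one entry per parameter setting being swept.
num_evaluation_runs = 5   # Assumed: number of independent evaluation rollouts per checkpointed policy.
performance_dtype = np.dtype([
    ('parameters', parameters_dtype),
    ('results', float, (num_runs, num_policies, num_evaluation_runs))
])

# Allocate the shared arrays that evaluate_policies reads from and writes into
# (the .npy file names are illustrative; in the real script the policies array
# would be populated by the training phase before evaluation starts):
policies_memmap = np.lib.format.open_memmap(
    'policies.npy', mode='w+', dtype=configuration_dtype, shape=(num_configurations,))
performance_memmap = np.lib.format.open_memmap(
    'performance.npy', mode='w+', dtype=performance_dtype, shape=(num_configurations,))

# One task per (evaluation run, ACE run, configuration, checkpointed policy) combination:
Parallel(n_jobs=-1, verbose=10)(
    delayed(evaluate_policies)(policies_memmap, performance_memmap,
                               evaluation_run_num, ace_run_num, config_num, policy_num,
                               random_seeds[ace_run_num])
    for evaluation_run_num, ace_run_num, config_num, policy_num in product(
        range(num_evaluation_runs), range(num_runs),
        range(num_configurations), range(num_policies)))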
def test_aa_binary_ace_on_policy(self):
    env = gym.make('MountainCar-v0').unwrapped  # Get the underlying environment object to bypass the built-in timestep limit.
    env.seed(317850564)  # Seed generated by: np.random.randint(2**31 - 1)
    rng = env.np_random
    alpha_a = .1
    alpha_c = .2
    alpha_c2 = .001
    lambda_c = .9
    gamma = 1.
    num_runs = 10
    num_episodes = 10
    rewards = np.full((num_runs, num_episodes), np.nan)
    for run_num in tqdm(range(num_runs)):
        tc = TileCoder(np.array([env.observation_space.low, env.observation_space.high]).T, [5, 5], 8, True)
        actor = BinaryACE(env.action_space.n, tc.total_num_tiles, alpha_a / tc.num_active_features)
        critic = BinaryGQ(env.action_space.n, tc.total_num_tiles,
                          alpha_c / tc.num_active_features, alpha_c2 / tc.num_active_features, lambda_c)
        for episode_num in range(num_episodes):
            g = 0.
            f_t = 1.
            critic.z *= 0.  # Reset the eligibility traces.
            indices_t = tc.encode(env.reset())
            for t in range(1000):
                a_t = rng.choice(env.action_space.n, p=actor.pi(indices_t))  # Select an action.
                s_tp1, r_tp1, terminal, _ = env.step(a_t)  # Interact with the environment.
                indices_tp1 = tc.encode(s_tp1)
                critic.learn(indices_t, a_t, 1, gamma, r_tp1, indices_tp1, actor.pi(indices_tp1), 0 if terminal else gamma)
                q_t = critic.estimate(indices_t)
                actor.all_actions_learn(indices_t, q_t, f_t)
                indices_t = indices_tp1
                f_t *= gamma
                g += r_tp1
                if terminal:
                    break
            rewards[run_num, episode_num] = g

    mean_rewards = np.mean(rewards, axis=0)
    sem_rewards = st.sem(rewards, axis=0)
    ci = sem_rewards * st.t.ppf((1 + .95) / 2, num_runs - 1)  # 95% confidence interval half-width.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.errorbar(np.arange(num_episodes), mean_rewards, yerr=[ci, ci], label='All Actions AC')
    plt.legend(loc='lower right')
    plt.title('Mountain Car (on-policy)')
    plt.xlabel('Episode Number')
    plt.ylabel('Total Reward')
    plt.ylim(-1000, 0)
    plt.savefig('all_actions_ac_on_policy.png')
    self.assertGreater(mean_rewards[-1], -250)
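The test above samples actions from `actor.pi(indices_t)`, but `BinaryACE` itself is not shown in this excerpt. For binary tile-coded features, a readout like this is typically a softmax over linear action preferences; a minimal sketch under that assumption (not the repository's actual class):

def softmax_policy(theta, indices):
    # theta has shape (num_actions, num_features); the active binary features select columns,
    # so each action's preference is just the sum of its weights at the active indices.
    preferences = theta[:, indices].sum(axis=1)
    preferences -= preferences.max()  # Subtract the max for numerical stability.
    exp_prefs = np.exp(preferences)
    return exp_prefs / exp_prefs.sum()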
def test_tile_coder(self):
    np.random.seed(2734287609)

    # Define a function to approximate:
    def target_function(x1, x2):
        return np.sin(np.sqrt(x1**2 + x2**2)) / np.sqrt(x1**2 + x2**2)

    # A function to add noise:
    def noise(y):
        return y + .1 * np.random.randn()

    # Create a tile coder and weight vector:
    tc = TileCoder(space=[[-10, 10], [-10, 10]], num_tiles_per_dim=[11, 11], num_tilings=8, bias_unit=True)
    weights = np.zeros(tc.total_num_tiles)
    step_size = .2 / tc.num_active_features

    # Use the tile coder to train the weight vector from noisy samples of the target function:
    num_examples = 1000
    for i in range(num_examples):
        x1, x2 = np.random.random_sample(2) * 20 - 10  # Sample the input space uniformly.
        indices = tc.encode((x1, x2))  # Compute indices for the input point.
        y_hat = weights[indices].sum()  # Generate an estimate from the weights.
        y = noise(target_function(x1, x2))  # Get a noisy sample output from the function.
        weights[indices] += step_size * (y - y_hat)  # Update the weight vector.

    # Check the function and the learned approximation:
    resolution = 100
    x1 = np.arange(-10, 10, 20 / resolution)
    x2 = np.arange(-10, 10, 20 / resolution)
    y = np.zeros((resolution, resolution))
    y_hat = np.zeros((resolution, resolution))
    for j in range(len(x1)):
        for k in range(len(x2)):
            y[j, k] = target_function(x1[j], x2[k])  # True value of the function.
            y_hat[j, k] = weights[tc.encode((x1[j], x2[k]))].sum()  # Learned estimate.

    # Visualize the function and the learned approximation:
    x1, x2 = np.meshgrid(x1, x2)
    fig = plt.figure()
    ax = fig.add_subplot(1, 2, 1, projection='3d')
    ax.plot_surface(x1, x2, y_hat, cmap='hot')
    ax.set_zlim(-.25, 1.)
    ax.set_title('Learned estimate after {} examples'.format(num_examples))
    ax = fig.add_subplot(1, 2, 2, projection='3d')
    ax.plot_surface(x1, x2, y, cmap='hot')
    ax.set_zlim(-.25, 1.)
    ax.set_title('True function')
    plt.savefig('tile_coder_test.png')

    tolerance = .01
    self.assertLess(np.mean(y - y_hat), tolerance)
def test_low_variance_binary_ace_off_policy(self):
    env = gym.make('MountainCar-v0').unwrapped
    env.seed(278422227)
    rng = env.np_random

    # ACE parameters:
    alpha_a = .0005
    alpha_c = .1
    alpha_c2 = .0005
    lambda_c = 0.
    alpha_f = .1
    eta = 1.
    i = lambda g=1: 1.  # Uniform interest.

    # OffPAC parameters:
    # alpha_a = .01
    # alpha_c = .01
    # alpha_c2 = .00005
    # lambda_c = .4
    # alpha_f = .01
    # eta = 0.
    # i = lambda g=1: 1.  # Uniform interest.

    gamma = 1.
    num_runs = 1
    num_timesteps = 20000
    evaluation_interval = 1000
    num_evaluation_runs = 10
    rewards = np.zeros((num_runs, num_timesteps // evaluation_interval + 1, num_evaluation_runs))
    for run_num in range(num_runs):
        tc = TileCoder(np.array([env.observation_space.low, env.observation_space.high]).T, [5, 5], 8, True)
        actor = BinaryACE(env.action_space.n, tc.total_num_tiles, alpha_a / tc.num_active_features)
        critic = BinaryTDC(tc.total_num_tiles, alpha_c / tc.num_active_features,
                           alpha_c2 / tc.num_active_features, lambda_c)
        fhat = BinaryFHat(tc.total_num_tiles, alpha_f)
        mu = np.ones(env.action_space.n) / env.action_space.n  # Uniform random behaviour policy.
        gamma_t = 0.
        indices_t = tc.encode(env.reset())
        for t in tqdm(range(num_timesteps)):
            if t % evaluation_interval == 0:
                rewards[run_num, t // evaluation_interval] = Parallel(n_jobs=-1)(
                    delayed(evaluate_policy)(actor, tc, num_timesteps=1000)
                    for _ in range(num_evaluation_runs))
            a_t = rng.choice(env.action_space.n, p=mu)
            s_tp1, r_tp1, terminal, _ = env.step(a_t)
            gamma_tp1 = 0. if terminal else gamma
            indices_tp1 = tc.encode(s_tp1)
            rho_t = actor.pi(indices_t)[a_t] / mu[a_t]  # Importance-sampling ratio.
            delta_t = r_tp1 + gamma_tp1 * critic.estimate(indices_tp1) - critic.estimate(indices_t)
            critic.learn(delta_t, indices_t, gamma_t, indices_tp1, gamma_tp1, rho_t)
            i_t = i(gamma_t)
            f_t = fhat.estimate(indices_t)
            m_t = (1 - eta) * i_t + eta * f_t  # Emphatic weighting.
            actor.learn(indices_t, a_t, delta_t, m_t, rho_t)
            fhat.learn(indices_tp1, gamma_tp1, indices_t, rho_t, i_t)
            gamma_t = gamma_tp1
            indices_t = indices_tp1
            if terminal:
                gamma_t = 0.
                indices_t = tc.encode(env.reset())  # Start the next episode from the environment's start state.
        rewards[run_num, -1] = Parallel(n_jobs=-1)(
            delayed(evaluate_policy)(actor, tc, num_timesteps=1000)
            for _ in range(num_evaluation_runs))

    # Plot results:
    mean_eval_rewards = np.mean(rewards, axis=2)
    var_eval_rewards = np.var(rewards, axis=2)
    mean_rewards = np.mean(mean_eval_rewards, axis=0)
    sem_rewards = np.sqrt(np.sum(var_eval_rewards / num_evaluation_runs, axis=0)) / num_runs
    fig = plt.figure()
    ax = fig.add_subplot(111)
    x = np.array([evaluation_interval * i for i in range(num_timesteps // evaluation_interval + 1)])
    confs = sem_rewards * st.t.ppf((1.0 + 0.95) / 2, num_evaluation_runs - 1)
    label = f'$\\alpha_a$:{alpha_a}, $\\alpha_c$:{alpha_c}, $\\alpha_c2$:{alpha_c2}, $\\lambda_c$:{lambda_c}, $\\eta$:{eta}{" (OffPAC)" if eta == 0. else ""}'
    ax.errorbar(x, mean_rewards, yerr=[confs, confs], label=label)
    plt.legend(loc='lower right')
    plt.title('Mountain Car (off-policy)')
    plt.xlabel('Timesteps')
    plt.ylabel('Total Reward')
    plt.ylim(-1000, 0)
    plt.savefig('low_variance_binary_ace_off_policy.png')
    self.assertGreater(mean_rewards[-1], -200)
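The off-policy test updates the actor with `actor.learn(indices_t, a_t, delta_t, m_t, rho_t)`. Under the ACE update, the policy parameters move along the log-policy gradient scaled by the TD error, the emphatic weighting `m_t`, and the importance-sampling ratio `rho_t`. A minimal sketch of such an update for a linear softmax policy over binary features, reusing the `softmax_policy` sketch above (again an assumption about what `BinaryACE` computes, not the repository's code):

def ace_learn(theta, alpha, indices, a_t, delta_t, m_t, rho_t):
    # Gradient of log pi(a_t | s_t) for a linear softmax policy over binary features:
    # for each action b, the gradient w.r.t. theta_b is x(s_t) * (1{b == a_t} - pi(b | s_t)).
    pi_t = softmax_policy(theta, indices)
    grad = -pi_t          # Start from -pi(b | s_t) for every action...
    grad[a_t] += 1.       # ...and add 1 for the action actually taken.
    # Scale the update by the TD error, the emphatic weighting, and the importance-sampling ratio,
    # applying it only to the columns of the active binary features:
    theta[:, indices] += alpha * rho_t * m_t * delta_t * grad[:, None]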