def generate_experience_test(experience, run_num, random_seed):
    # Check if this run of experience has already been generated:
    if np.count_nonzero(experience[run_num]) != 0:
        return

    # Initialize the environment:
    if args.environment == 'pw':
        env = puddleworld()
    else:
        import gym_puddle  # Re-import the puddleworld env in each subprocess or it sometimes isn't found during creation.
        env = gym.make(args.environment).unwrapped
    env.seed(random_seed)
    rng = env.np_random

    # Create the behaviour policy:
    mu = eval(args.behaviour_policy, {'np': np, 'env': env})  # Give the eval'd function access to some objects.

    # Generate the required timesteps of experience:
    s_t = env.reset()
    a_t = rng.choice(env.action_space.n, p=mu(s_t))
    t = 0
    step = 0
    while t != args.num_timesteps:
        # Take action a_t, observe next state s_tp1 and reward r_tp1:
        s_tp1, r_tp1, terminal, _ = env.step(a_t)
        # The agent is reset to a starting state after a terminal transition:
        if terminal:
            s_tp1 = env.reset()
        a_tp1 = rng.choice(env.action_space.n, p=mu(s_tp1))  # Sample the next action from the behaviour policy in the next state.
        step += 1
        # Add every 1000th state visited as an evaluation start state:
        if step % 1000 == 0:
            experience[run_num, t] = (s_t,)
            step = 0
            t += 1
        # Update temporary variables:
        s_t = s_tp1
        a_t = a_tp1
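# Illustration only (the helper name below is made up): generate_experience_test stores only the
# state visited at every 1000th step, so the resulting memmap is a pool of evaluation start states
# rather than full transitions. During learning (see run_ace below) each stored state is read back
# as experience_memmap_test[run_num][sample][0] and assigned to env.state before evaluate_policy is
# called, giving "excursion" evaluations that start from states drawn under the behaviour policy.
def _example_load_evaluation_start_state(experience_memmap_test, run_num, sample, env):
    """Sketch of how a stored evaluation start state seeds an excursion evaluation."""
    start_state = experience_memmap_test[run_num][sample][0]  # The 's_t' field of the stored record.
    env.state = start_state  # Same pattern used in run_ace before calling evaluate_policy.
    return start_state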
def generate_experience(experience, run_num, random_seed):
    # Check if this run of experience has already been generated:
    if np.count_nonzero(experience[run_num]) != 0:
        return

    # Initialize the environment:
    if args.environment == 'pw':
        env = puddleworld()
    else:
        import gym_puddle
        env = gym.make(args.environment).unwrapped
    env.seed(random_seed)
    rng = env.np_random

    # Create the behaviour policy:
    mu = eval(args.behaviour_policy, {'np': np, 'env': env})  # Give the eval'd function access to some objects.

    # Generate the required timesteps of experience:
    s_t = env.reset()
    a_t = rng.choice(env.action_space.n, p=mu(s_t))
    for t in range(args.num_timesteps):
        # Take action a_t, observe next state s_tp1 and reward r_tp1:
        s_tp1, r_tp1, terminal, _ = env.step(a_t)
        # The agent is reset to a starting state after a terminal transition:
        if terminal:
            s_tp1 = env.reset()
        a_tp1 = rng.choice(env.action_space.n, p=mu(s_tp1))  # Sample the next action from the behaviour policy in the next state.
        # Add the transition:
        experience[run_num, t] = (s_t, a_t, r_tp1, s_tp1, a_tp1, terminal)
        # Update temporary variables:
        s_t = s_tp1
        a_t = a_tp1
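# Illustration only (not part of the experiment scripts; the helper name below is made up):
# the --behaviour_policy argument is eval'd with access to `np` and `env`, so it is expected
# to be a string that evaluates to a function mapping a state to a probability distribution
# over the discrete actions, e.g. a uniform-random policy passed on the command line as
#   --behaviour_policy "lambda s: np.ones(env.action_space.n) / env.action_space.n"
def _example_behaviour_policy_string(env):
    """Sketch of how an eval'd behaviour-policy string becomes the callable `mu` used above."""
    behaviour_policy = 'lambda s: np.ones(env.action_space.n) / env.action_space.n'
    mu = eval(behaviour_policy, {'np': np, 'env': env})  # Same pattern as in generate_experience.
    p = mu(env.reset())
    assert np.isclose(p.sum(), 1.)  # mu(s) must be a valid distribution over env.action_space.n actions.
    return p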
def run_ace(experience_memmap, policies_memmap, performance_memmap, run_num, config_num, parameters, random_seed, experience_memmap_test, num_test_eval):
    # If this run and configuration has already been done (i.e., a previous run timed out), exit early:
    if np.count_nonzero(policies_memmap[config_num]['policies'][run_num]) != 0:
        return

    alpha_a, alpha_w, alpha_v, lambda_c, eta = parameters
    # If this is the first run with a set of parameters, save the parameters:
    if run_num == 0:
        policies_memmap[config_num]['parameters'] = (alpha_a, alpha_w, alpha_v, lambda_c, eta, args.gamma, args.num_tiles_per_dim, args.num_tilings, args.bias_unit)
        performance_memmap[config_num]['parameters'] = (alpha_a, alpha_w, alpha_v, lambda_c, eta, args.gamma, args.num_tiles_per_dim, args.num_tilings, args.bias_unit)

    # Create the environment to evaluate the learned policy in:
    if args.environment == 'pw':
        env = puddleworld()
    else:
        import gym_puddle
        env = gym.make(args.environment).unwrapped
    env.seed(random_seed)
    rng = env.np_random

    actor = BinaryACE(env.action_space.n, tc.total_num_tiles, alpha_a / tc.num_active_features)
    if args.all_actions:
        critic = BinaryGQ(env.action_space.n, tc.total_num_tiles, alpha_w / tc.num_active_features, alpha_v / tc.num_active_features, lambda_c)
    elif args.critic == 'ETD':
        critic = BinaryETD(tc.total_num_tiles, alpha_w / tc.num_active_features, lambda_c)
    else:
        critic = BinaryTDRC(tc.total_num_tiles, alpha_w / tc.num_active_features, lambda_c)

    if args.direct_f:
        # Initialize the function approximator being used to estimate the emphatic weightings:
        fhat = BinaryFHat(tc.total_num_tiles, alpha_v / tc.num_active_features, args.normalize)

    i = eval(args.interest_function)  # Create the interest function to use.
    mu = eval(args.behaviour_policy, {'np': np, 'env': env})  # Create the behaviour policy and give it access to numpy and the env.

    policies = np.zeros(num_policies, dtype=policy_dtype)
    performance = np.zeros((num_policies, args.num_evaluation_runs), dtype=float)
    performance_excursions = np.zeros((num_policies, num_test_eval), dtype=float)
    np.seterr(divide='raise', over='raise', invalid='raise')
    try:
        transitions = experience_memmap[run_num]
        gamma_t = 0.
        f_t = 0.
        rho_tm1 = 1.
        indices_t = tc.encode(transitions[0][0])
        for t, transition in enumerate(transitions):
            # Save and evaluate the learned policy if it's a checkpoint timestep:
            if t % args.checkpoint_interval == 0:
                performance[t // args.checkpoint_interval] = [evaluate_policy(actor, tc, env, rng, args.max_timesteps) for _ in range(args.num_evaluation_runs)]
                perf_excursions = []
                for sample in range(num_test_eval):
                    env.state = experience_memmap_test[run_num][sample][0]
                    perf_excursions.append(evaluate_policy(actor, tc, env, rng, args.max_timesteps, state=experience_memmap_test[run_num][sample][0]))
                performance_excursions[t // args.checkpoint_interval] = perf_excursions
                policies[t // args.checkpoint_interval] = (t, np.copy(actor.theta))

            # Unpack the stored transition:
            s_t, a_t, r_tp1, s_tp1, a_tp1, terminal = transition
            gamma_tp1 = args.gamma if not terminal else 0  # Transition-dependent discounting.
            indices_tp1 = tc.encode(s_tp1)
            i_t = i(s_t, gamma_t)
            i_tp1 = i(s_tp1, gamma_tp1)

            # Compute the importance sampling ratio for the policy:
            pi_t = actor.pi(indices_t)
            mu_t = mu(s_t)
            rho_t = pi_t[a_t] / mu_t[a_t]

            if args.direct_f:
                # Estimate the emphatic weightings with the function approximator:
                f_t = fhat.estimate(indices_t)
                # Update the function approximator:
                fhat.learn(indices_tp1, gamma_tp1, indices_t, rho_t, i_tp1)
            else:
                # Estimate the emphatic weightings with the follow-on trace:
                f_t = ((1 - gamma_t) * i_t + rho_tm1 * gamma_t * f_t) if args.normalize else (i_t + rho_tm1 * gamma_t * f_t)
            m_t = (1 - eta) * i_t + eta * f_t

            if args.all_actions:
                critic.learn(indices_t, a_t, rho_t, gamma_t, r_tp1, indices_tp1, actor.pi(indices_tp1), gamma_tp1)
                q_t = critic.estimate(indices_t)
                actor.all_actions_learn(indices_t, q_t, m_t)
            elif args.critic == 'ETD':
                delta_t = r_tp1 + gamma_tp1 * critic.estimate(indices_tp1) - critic.estimate(indices_t)
                critic.learn(delta_t, indices_t, gamma_t, i_t, rho_t, f_t)
                actor.learn(indices_t, a_t, delta_t, m_t, rho_t)
            else:
                delta_t = r_tp1 + gamma_tp1 * critic.estimate(indices_tp1) - critic.estimate(indices_t)
                critic.learn(delta_t, indices_t, gamma_t, indices_tp1, gamma_tp1, rho_t)
                actor.learn(indices_t, a_t, delta_t, m_t, rho_t)

            gamma_t = gamma_tp1
            indices_t = indices_tp1
            rho_tm1 = rho_t

        # Save and evaluate the policy after the final timestep:
        policies[-1] = (t + 1, np.copy(actor.theta))
        performance[-1] = [evaluate_policy(actor, tc, env, rng, args.max_timesteps) for _ in range(args.num_evaluation_runs)]
        perf_excursions = []
        for sample in range(num_test_eval):
            env.state = experience_memmap_test[run_num][sample][0]
            perf_excursions.append(evaluate_policy(actor, tc, env, rng, args.max_timesteps, state=experience_memmap_test[run_num][sample][0]))
        performance_excursions[-1] = perf_excursions

        # Save the learned policies and their performance to the memmap:
        performance_memmap[config_num]['results'][run_num] = performance
        performance_memmap[config_num]['results_excursions'][run_num] = performance_excursions
        policies_memmap[config_num]['policies'][run_num] = policies
    except (FloatingPointError, ValueError):
        # Save NaN to indicate the weights overflowed and exit early:
        performance_memmap[config_num]['results'][run_num] = np.full_like(performance, np.NaN)
        performance_memmap[config_num]['results_excursions'][run_num] = np.full_like(performance_excursions, np.NaN)
        policies_memmap[config_num]['policies'][run_num] = np.full_like(policies, np.NaN)
        return
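# Illustration only (the helper name below is made up): a standalone version of the emphasis
# computation used in run_ace when args.direct_f is False and args.normalize is False. The
# follow-on trace is F_t = i_t + rho_{t-1} * gamma_t * F_{t-1}, and the emphasis weighting the
# actor update is M_t = (1 - eta) * i_t + eta * F_t; eta = 0 weights updates by the interest
# alone, while eta = 1 uses the full emphatic weighting.
def _example_emphasis_trace(interests, rhos_tm1, gammas, eta):
    """Compute the emphasis M_t for toy sequences of i_t, rho_{t-1}, and gamma_t."""
    f_t = 0.
    emphasis = []
    for i_t, rho_tm1, gamma_t in zip(interests, rhos_tm1, gammas):
        f_t = i_t + rho_tm1 * gamma_t * f_t  # Follow-on trace update, as in run_ace above.
        emphasis.append((1 - eta) * i_t + eta * f_t)
    return emphasis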
# Load the input data as memmaps to prevent a copy being loaded into memory in each sub-process:
experience_memmap = np.lib.format.open_memmap(args.experience_file, mode='r')
num_runs, num_timesteps = experience_memmap.shape
experience_memmap_test = np.lib.format.open_memmap(args.experience_file_test, mode='r')
num_runs, num_test_eval = experience_memmap_test.shape

# Sample the random seed for each run without replacement to avoid duplicate seeds (the birthday paradox):
random.seed(args.random_seed)
random_seeds = random.sample(range(2**32), num_runs)

# Create the tile coder to be used for all parameter settings:
if args.environment == 'pw':
    dummy_env = puddleworld()
else:
    dummy_env = gym.make(args.environment).unwrapped  # Make a dummy env to get shape info.
tc = TileCoder(np.array([dummy_env.observation_space.low, dummy_env.observation_space.high]).T, args.num_tiles_per_dim, args.num_tilings, args.bias_unit)

# Create the memmapped array of learned policies that will be populated in parallel:
parameters_dtype = np.dtype([
    ('alpha_a', float),
    ('alpha_w', float),
    ('alpha_v', float),
    ('lambda', float),
    ('eta', float),
    ('gamma', float),
    ('num_tiles_per_dim', int, (len(args.num_tiles_per_dim),)),
    ('num_tilings', int),
    ('bias_unit', bool)
])
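# Illustration only (the helper name below is made up; assumes the repository's TileCoder class
# constructed above): encode a single observation into the active tile indices that the binary
# actor and critics consume, and show the quantities the step sizes are scaled by in run_ace.
def _example_tile_coding(dummy_env, tc):
    """Sketch of how states are featurized before being passed to BinaryACE/BinaryETD/BinaryTDRC."""
    s = dummy_env.reset()
    indices = tc.encode(s)  # Indices of the active tiles for state s, as used by actor.pi(indices) etc.
    # In run_ace, step sizes are divided by tc.num_active_features (per-active-feature step size),
    # and tc.total_num_tiles gives the length of the learners' weight vectors.
    return indices, tc.num_active_features, tc.total_num_tiles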
# Generate the random seed for each run without replacement to avoid duplicate seeds (the birthday problem):
random.seed(args.random_seed)
random_seeds = random.sample(range(2**32), args.num_runs)

# Save the command line arguments in a format interpretable by argparse:
output_dir = Path(args.output_dir)
if args.test_data:
    # We consider 50 different start states for evaluation:
    args.num_timesteps = 50
utils.save_args_to_file(args, output_dir / 'experience_test.args')

# Create the memmapped structured array of experience to be populated in parallel:
if args.environment == 'pw':
    env = puddleworld()
else:
    env = gym.make(args.environment).unwrapped  # Make a dummy env to get shape info for observations.
transition_dtype = np.dtype([('s_t', float, env.observation_space.shape)])
experience_memmap_path = str(output_dir / 'experience_test.npy')
if os.path.isfile(experience_memmap_path):
    experience_memmap = np.lib.format.open_memmap(experience_memmap_path, mode='r+')
else:
    experience_memmap = np.lib.format.open_memmap(experience_memmap_path, shape=(args.num_runs, args.num_timesteps),