def run(seed, episodes, evaluation_episodes, parameter_rollouts, scale, initialise_params, save_dir, title):
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    alpha_param = 1.0
    variances = [0.1, 0.1, 0.01]
    initial_params = [3., 10., 400.]

    env = gym.make('Platform-v0')
    dir = os.path.join(save_dir, title)
    if scale:
        env = ScaledStateWrapper(env)
        variances = [0.0001, 0.0001, 0.0001]
        for a in range(env.action_space.spaces[0].n):
            initial_params[a] = 2. * (initial_params[a] - env.action_space.spaces[1].spaces[a].low) / (
                    env.action_space.spaces[1].spaces[a].high - env.action_space.spaces[1].spaces[a].low) - 1.
        env = QPAMDPScaledParameterisedActionWrapper(env)
        alpha_param = 0.1

    env = Monitor(env, directory=os.path.join(dir, str(seed)), video_callable=False, write_upon_reset=False,
                  force=True)
    env.seed(seed)
    np.random.seed(seed)

    act_obs_index = [0, 1, 2, 3]
    param_obs_index = None
    discrete_agent = SarsaLambdaAgent(env.observation_space.spaces[0], env.action_space.spaces[0], alpha=1.0,
                                      gamma=0.999, temperature=1.0, cooling=0.995, lmbda=0.5, order=6,
                                      scale_alpha=True, use_softmax=True, seed=seed,
                                      observation_index=act_obs_index, gamma_step_adjust=True)
    agent = QPAMDPAgent(env.observation_space.spaces[0], env.action_space, alpha=alpha_param,
                        initial_action_learning_episodes=10000, seed=seed, action_obs_index=act_obs_index,
                        parameter_obs_index=param_obs_index, action_relearn_episodes=1000, variances=variances,
                        parameter_updates=180, parameter_rollouts=parameter_rollouts, norm_grad=False,
                        discrete_agent=discrete_agent, print_freq=100)
    agent.discrete_agent.gamma_step_adjust = True

    if initialise_params:
        for a in range(env.action_space.spaces[0].n):
            agent.parameter_weights[a][0, 0] = initial_params[a]

    max_steps = 201
    start_time = time.time()
    agent.learn(env, episodes, max_steps)
    end_time = time.time()
    print("Training took %.2f seconds" % (end_time - start_time))
    env.close()

    returns = env.get_episode_rewards()
    print("Ave. return =", sum(returns) / len(returns))
    print("Ave. last 100 episode return =", sum(returns[-100:]) / 100.)
    np.save(os.path.join(dir, title + "{}".format(str(seed))), returns)

    if evaluation_episodes > 0:
        print("Evaluating agent over {} episodes".format(evaluation_episodes))
        agent.variances = 0
        agent.discrete_agent.epsilon = 0.
        agent.discrete_agent.temperature = 0.
        evaluation_returns = evaluate(env, agent, evaluation_episodes)
        print("Ave. evaluation return =", sum(evaluation_returns) / len(evaluation_returns))
        print("Ave. evaluation prob. =", sum(evaluation_returns == 50.) / len(evaluation_returns))
        np.save(os.path.join(dir, title + "{}e".format(str(seed))), evaluation_returns)
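# Hypothetical entry point for the Platform experiment above. The original script's
# command-line handling is not shown here, so the call below is only an illustrative
# sketch; every argument value is an assumed placeholder rather than a reported setting.
if __name__ == '__main__':
    run(seed=0,
        episodes=20000,
        evaluation_episodes=1000,
        parameter_rollouts=50,
        scale=True,
        initialise_params=True,
        save_dir="results/platform",
        title="QPAMDP")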
def run(seed, episodes, evaluation_episodes, scale, initialise_params, save_dir, title):
    alpha_param = 0.1

    env = gym.make('Goal-v0')
    env = GoalObservationWrapper(env)
    if scale:
        variances[0] = 0.0001
        variances[1] = 0.0001
        variances[2] = 0.0001
        alpha_param = 0.06
        initial_parameter_weights[0] = np.array(
            [[-0.375, 0.5, 0, 0.0625, 0],
             [0, 0, 0.8333333333333333333, 0, 0.111111111111111111111111]])
        initial_parameter_weights[1] = np.array([0.857346647646219686, 0])
        initial_parameter_weights[2] = np.array([-0.857346647646219686, 0])
        env = ScaledStateWrapper(env)
        env = QPAMDPScaledParameterisedActionWrapper(env)

    dir = os.path.join(save_dir, title)
    env = Monitor(env, directory=os.path.join(dir, str(seed)), video_callable=False, write_upon_reset=False,
                  force=True)
    env.seed(seed)
    np.random.seed(seed)

    action_obs_index = np.arange(14)
    param_obs_index = np.array([
        np.array([10, 11, 14, 15]),  # ball_features
        np.array([16]),              # keeper_features
        np.array([16]),              # keeper_features
    ])
    basis = CustomFourierBasis(14, env.observation_space.spaces[0].low[:14],
                               env.observation_space.spaces[0].high[:14])
    discrete_agent = SarsaLambdaAgent(env.observation_space.spaces[0], env.action_space.spaces[0], basis=basis,
                                      seed=seed, alpha=0.01, lmbda=0.1, gamma=0.9, temperature=1.0, cooling=1.0,
                                      scale_alpha=False, use_softmax=True, observation_index=action_obs_index,
                                      gamma_step_adjust=False)
    agent = QPAMDPAgent(env.observation_space.spaces[0], env.action_space, alpha=alpha_param,
                        initial_action_learning_episodes=4000, seed=seed, action_obs_index=action_obs_index,
                        parameter_obs_index=param_obs_index, variances=variances, discrete_agent=discrete_agent,
                        action_relearn_episodes=2000, parameter_updates=1000, parameter_rollouts=50,
                        norm_grad=True, print_freq=100,
                        phi0_func=lambda state: np.array([1, state[1], state[1] ** 2]),
                        phi0_size=3)
    # Alternating learning periods from the original paper:
    #   QPAMDP(1)        : init(2000), parameter_updates(50),   relearn(50)
    #   QPAMDP(infinity) : init(2000), parameter_updates(1000), relearn(2000)
    # needed to increase initial action learning episodes to 4000

    if initialise_params:
        for a in range(3):
            agent.parameter_weights[a] = initial_parameter_weights[a]

    max_steps = 150
    start_time = time.time()
    agent.learn(env, episodes, max_steps)
    end_time = time.time()
    agent.plot_reward()
    agent.plot_p()
    print("Training took %.2f seconds" % (end_time - start_time))
    env.close()

    returns = np.array(env.get_episode_rewards())
    print("Saving training results to:", os.path.join(dir, title + "{}".format(str(seed))))
    np.save(os.path.join(dir, title + "{}".format(str(seed))), returns)
    print("Ave. return =", sum(returns) / len(returns))
    print("Ave. last 100 episode return =", sum(returns[-100:]) / 100.)
    print('Total P(S):{0:.4f}'.format((returns == 50.).sum() / len(returns)))
    print('Ave. last 100 episode P(S):{0:.4f}'.format((returns[-100:] == 50.).sum() / 100.))

    if evaluation_episodes > 0:
        print("Evaluating agent over {} episodes".format(evaluation_episodes))
        agent.variances = 0
        agent.discrete_agent.epsilon = 0.
        agent.discrete_agent.temperature = 0.
        evaluation_returns = evaluate(env, agent, evaluation_episodes)
        print("Ave. evaluation return =", sum(evaluation_returns) / len(evaluation_returns))
        print("Ave. evaluation prob. =", sum(evaluation_returns == 50.) / len(evaluation_returns))
        np.save(os.path.join(dir, title + "{}e".format(str(seed))), evaluation_returns)
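# Minimal sketch of the phi0_func contract assumed by QPAMDPAgent above: phi0_func
# maps the initial state of a rollout to a feature vector of length phi0_size,
# which _enac_gradient() appends to each episode's log-policy gradient as extra
# regressors (the eNAC baseline/value term). The helper below is hypothetical,
# added only to illustrate the expected shapes; it is not part of the original code.
def _check_phi0(phi0_func, phi0_size, example_state):
    features = np.asarray(phi0_func(example_state))
    assert features.shape == (phi0_size,), "phi0_func must return exactly phi0_size features"
    return features

# e.g. _check_phi0(lambda s: np.array([1, s[1], s[1] ** 2]), 3, np.zeros(17))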
class QPAMDPAgent(Agent):
    """
    Defines an agent to optimize H(theta) using the episodic natural actor critic (eNAC)
    algorithm for continuous action spaces.

    Uses a Gaussian policy for continuous actions.

    N.B. assumes same state variables used for all actions, and separately same for all parameters
    """

    name = "Q-PAMDP"

    def __init__(self,
                 observation_space,
                 action_space,
                 alpha=0.01,
                 initial_action_learning_episodes=10000,
                 action_relearn_episodes=1000,
                 parameter_updates=180,
                 parameter_rollouts=50,
                 action_obs_index=None,
                 parameter_obs_index=None,
                 discrete_agent=None,
                 norm_grad=False,
                 variances=None,  # list of variances per continuous action parameter (one entry per action)
                 seed=None,
                 phi0_func=None,
                 phi0_size=None,
                 poly_basis=False,
                 print_freq=1):
        super().__init__(observation_space, action_space)

        # split the action space into the discrete actions and continuous parameters
        self.discrete_action_space = action_space.spaces[0]
        self.parameter_space = action_space.spaces[1]
        self.num_actions = self.discrete_action_space.n
        nvars = self.observation_space.shape[0]

        self.alpha = alpha
        if isinstance(variances, (list, np.ndarray)):
            assert len(variances) == self.num_actions
        else:
            variances = variances * np.ones((self.num_actions,))
        self.variances = variances
        self.initial_action_learning_episodes = initial_action_learning_episodes
        self.action_relearn_episodes = action_relearn_episodes
        self.parameter_updates = parameter_updates
        self.parameter_rollouts = parameter_rollouts
        self.episodes_per_cycle = self.action_relearn_episodes + self.parameter_updates * self.parameter_rollouts
        self.parameter_obs_index = parameter_obs_index
        self.norm_grad = norm_grad
        self.phi0_func = phi0_func
        self.phi0_size = phi0_size
        if self.phi0_size is None:
            assert self.phi0_func is None  # raise error? Need to specify size of custom phi0_func
        self.print_freq = print_freq
        self.R = 0.
        self._total_episodes = 0

        # initialise discrete action learner
        self.discrete_agent = discrete_agent
        if self.discrete_agent is None:
            self.discrete_agent = SarsaLambdaAgent(self.observation_space, self.discrete_action_space, alpha=1.0,
                                                   gamma=0.999, temperature=1.0, cooling=0.995, lmbda=0.5, order=6,
                                                   scale_alpha=True, use_softmax=True, seed=seed,
                                                   observation_index=action_obs_index)

        self.np_random = None
        self.__seed = 0
        self._seed(seed)

        # initialise basis for each action-parameter (one per action)
        if self.parameter_obs_index is not None:
            self.basis = []
            if isinstance(self.parameter_obs_index[0], (list, np.ndarray)):
                if len(self.parameter_obs_index) == 1:
                    self.parameter_obs_index = np.tile(self.parameter_obs_index, (self.num_actions, 1))
                else:
                    # different observation variables for each action-parameter
                    assert len(self.parameter_obs_index) == self.num_actions
            else:
                assert isinstance(self.parameter_obs_index[0], int)
                # same observation variables for all action-parameters, duplicate them for convenience
                self.parameter_obs_index = np.tile(self.parameter_obs_index, (self.num_actions, 1))
            for a in range(self.num_actions):
                nvars = len(self.parameter_obs_index[a])
                low = self.observation_space.low[self.parameter_obs_index[a]]
                high = self.observation_space.high[self.parameter_obs_index[a]]
                # self.basis.append(ScaledBasis(nvars, low, high, bias_unit=True))
                if poly_basis is True:
                    self.basis.append(PolynomialBasis(nvars, order=2, bias_unit=True))
                else:
                    self.basis.append(SimpleBasis(nvars, bias_unit=True))
        else:
            # use simple basis with bias unit (for parameter initialisation)
            # self.basis = [ScaledBasis(nvars, low, high, bias_unit=True) for _ in range(self.num_actions)]
            # if poly_basis is True:
            #     self.basis = [PolynomialBasis(nvars, order=2, bias_unit=True) for _ in range(self.num_actions)]
            # else:
            #     self.basis = [SimpleBasis(nvars, bias_unit=True) for _ in range(self.num_actions)]
            self.basis = [SimpleBasis(nvars, bias_unit=True) for _ in range(self.num_actions)]
        self.num_basis_functions = [self.basis[a].get_num_basis_functions() for a in range(self.num_actions)]

        # self.poly_basis = poly_basis
        # self.parameter_weights = np.zeros((self.num_actions, self.num_basis_functions))  # TODO: randomly init weights?
        # for multidimensional parameters
        self.parameter_weights = []
        for a in range(self.num_actions):
            shape = (self.num_basis_functions[a],)
            param_shape = self.parameter_space.spaces[a].shape
            assert len(param_shape) <= 1
            if len(param_shape) == 1 and param_shape[0] > 0:
                shape = (param_shape[0], self.num_basis_functions[a])
            self.parameter_weights.append(np.zeros(shape))
            # self.parameter_weights.append(self.np_random.normal(loc=0., scale=0.0001, size=shape))
        # self.parameter_weights = self.np_random.random_sample((self.num_actions, self.num_basis_functions))

    def act(self, state):
        act = self._action_policy(state)
        param = self._parameter_policy(state, act)
        return self._pad_action(act, param)

    def learn(self, env, max_episodes=100000, max_steps_per_episode=None):
        """ Learn for a given number of episodes. """
        self.e = 0
        if max_episodes < self.initial_action_learning_episodes:
            warnings.warn("Too few episodes to initialise agent!", UserWarning)
        print("Initial discrete action learning for %d episodes..."
              % self.initial_action_learning_episodes)
        for _ in range(self.initial_action_learning_episodes):
            self._rollout(env, update_actions=True, max_steps=max_steps_per_episode)
            self.e += 1
            if self.e > max_episodes:
                break

        while True:
            self.discrete_agent.temperature = 0.0
            self.discrete_agent.epsilon = 0.0

            # update parameter policy
            print(self.e, "Updating parameter selection...")
            for _ in range(self.parameter_updates):
                self._parameter_update(env, max_steps_per_episode)
                self.e += self.parameter_rollouts
                if self.e > max_episodes:
                    break
            if self.e > max_episodes:
                break

            self.discrete_agent.temperature = 1.0
            self.discrete_agent.epsilon = 1.0

            # update discrete action policy
            print(self.e, "Updating action selection...")
            for _ in range(self.action_relearn_episodes):
                self._rollout(env, update_actions=True, max_steps=max_steps_per_episode)
                self.e += 1
                if self.e > max_episodes:
                    break
            if self.e > max_episodes:
                break

        # no stochastic actions for evaluation?
        self.discrete_agent.temperature = 0.0
        self.discrete_agent.epsilon = 0.0

    def start_episode(self):
        self.discrete_agent.start_episode()

    def end_episode(self):
        self.discrete_agent.end_episode()

    def _seed(self, seed=None):
        """
        NOTE: this will not reset the randomly initialised weights; use the seed parameter in the constructor instead.

        :param seed:
        :return:
        """
        self.np_random = np.random.RandomState(seed=seed)

    def _get_parameters(self):
        """ Returns all the parameters in a vector. """
        # parameters = []
        # # for non-uniform parameter weights shapes (ragged array)
        # for a in range(self.num_actions):
        #     parameters.append(self.parameter_weights[a])
        # return np.ravel(self.parameter_weights)  # np.array(parameters)
        return np.concatenate([self.parameter_weights[i].flat for i in range(len(self.parameter_weights))])

    def _set_parameters(self, parameters):
        """ Set the parameters using a vector. """
        index = 0
        for action in range(self.num_actions):
            rows = self.parameter_weights[action].size
            self.parameter_weights[action] = parameters[index: index + rows].reshape(
                self.parameter_weights[action].shape)
            index += rows

    def _log_parameter_gradient(self, state, act, param):
        """ Returns the log gradient for the parameter, given the state and the value. """
        features = self._compute_features(state, act)
        mean = self.parameter_weights[act].dot(features)
        grad = np.outer((param - mean), features / self.variances[act])
        return grad.ravel()

    def log_gradient(self, state, action, param):
        """ Returns the log gradient for the entire policy. """
        grad = np.zeros((0,))
        for i in range(self.num_actions):
            elems = self.parameter_weights[i].size
            if i == action:
                parameter_grad = self._log_parameter_gradient(state, i, param)
                grad = np.append(grad, parameter_grad)
            else:
                grad = np.append(grad, np.zeros((elems,)))
        return grad

    def _pad_action(self, act, param):
        # Box for each parameter wrapped in a Compound
        action = [np.zeros(self.parameter_space.spaces[a].shape) for a in range(self.num_actions)]
        action[act] = param
        action = (act, action)
        return action

    def _rollout(self, env, update_actions=False, max_steps=None):
        """
        Run a single episode for a maximum number of steps.
""" state, _ = env.reset() states = [state] rewards = [] actions = [] terminal = False act = self._action_policy(state) acts = [act] steps = 0 if update_actions: self.discrete_agent.start_episode() while not terminal and not (max_steps is not None and steps > max_steps): param = self._parameter_policy(state, act) # print (act,param) (new_state, time_steps), reward, terminal, _ = env.step(self._pad_action(act, param)) new_act = self._action_policy(new_state) if update_actions: self.discrete_agent.step(state, act, reward, new_state, new_act, terminal, time_steps) state = new_state states.append(state) actions.append((act, param)) rewards.append(reward) act = new_act acts.append(act) steps += 1 if update_actions: self.discrete_agent.end_episode() self.R += sum(rewards) self._total_episodes += 1 if self.print_freq > 0 and self._total_episodes % self.print_freq == 0: if self.print_freq == 1: print("{0:5s} R: {1:.4f} r: {2:.4f}".format(str(self._total_episodes), self.R/self._total_episodes,sum(rewards))) else: # print("{0:5s} R: {1:.4f}".format(str(self._total_episodes), self.R/self._total_episodes)) returns = np.array(env.get_episode_rewards()) print('{0:5s} R:{1:.5f} P(S):{2:.4f}'.format(str(self._total_episodes), sum(returns) / (self._total_episodes), (np.array(returns) == 50.).sum() / len(returns))) return states, actions, rewards, acts def _enac_gradient(self, env, max_steps=None): #, phi0_func=None, phi0_size=None): """ Compute the episodic NAC gradient. phi0_func : lambda function giving the state features of s_0, the initial state in a trajectory defaults to [1.] if None phi0_size : number of features returned by phi0_func """ if self.phi0_size is None: assert self.phi0_func is None # raise error? Need to specify size of custom phi0_fun if self.phi0_func is None: self.phi0_func = lambda state: np.array([1,]) self.phi0_size = 1 returns = np.zeros((self.parameter_rollouts, 1)) param_size = self._get_parameters().size psi = np.zeros((self.parameter_rollouts, param_size + self.phi0_size)) for run in range(self.parameter_rollouts): states, actions, rewards, acts = self._rollout(env, False, max_steps) returns[run, 0] = sum(rewards) log_grad = np.zeros((param_size,)) for state, act, action in zip(states, acts, actions): log_grad += self.log_gradient(state, act, action[1]) psi[run, :] = np.append(log_grad, self.phi0_func(states[0])) grad = np.linalg.pinv(psi).dot(returns)[0:param_size, 0] return grad def _parameter_update(self, env, max_steps=None): """ Perform a single gradient update. """ grad = self._enac_gradient(env, max_steps) if np.linalg.norm(grad) > 0 and self.norm_grad: grad /= np.linalg.norm(grad) self._set_parameters(self._get_parameters() + self.alpha * grad) def _action_update(self, state, action, reward, next_state, next_action, terminal, time_steps=1): self.discrete_agent.step(state, action[0], reward, next_state, next_action[0], terminal, time_steps) def _action_policy(self, state): return self.discrete_agent.act(state) def _parameter_policy(self, state, act): return self._gaussian_policy(state, act) def _gaussian_policy(self, state, act): """ Gaussian action policy for continuous actions. """ mean = np.dot(self.parameter_weights[act], self._compute_features(state, act)) variance = 0. if self.variances is not None: if isinstance(self.variances, (list, np.ndarray)): variance = self.variances[act] else: variance = self.variances if variance == 0.: return mean else: # TODO: multivariate_normal expects variance, normal expects stdev? may be important... 
            # this may be incorrect / unnecessary but trying to be consistent with Warwick's source code for now
            if isinstance(mean, np.ndarray) and len(mean) > 1:
                return self.np_random.multivariate_normal(mean, variance * np.eye(len(mean)))
            return self.np_random.normal(mean, variance)

    def _compute_features(self, state, act):
        """ Returns phi: the features after the function approximation basis has been applied. """
        if self.parameter_obs_index is not None:
            state = state[self.parameter_obs_index[act]]
        return self.basis[act].compute_features(state)

    def __str__(self):
        desc = ("Q-PAMDP Agent\n" +
                "Alpha: {}\n".format(self.alpha) +
                "Initial Action Episodes: {}\n".format(self.initial_action_learning_episodes) +
                "Action Relearn Episodes: {}\n".format(self.action_relearn_episodes) +
                "Parameter Updates: {}\n".format(self.parameter_updates) +
                "Parameter Rollouts: {}\n".format(self.parameter_rollouts) +
                "Observation Index: {}\n".format(self.parameter_obs_index) +
                "Variances: {}\n".format(self.variances) +
                "Norm Grad: {}\n".format(self.norm_grad) +
                "Phi0 func.: {}\n".format(self.phi0_func) +
                "Phi0 size: {}\n".format(self.phi0_size) +
                "Discrete Agent: {}\n".format(self.discrete_agent) +
                "Seed: {}\n".format(self.__seed))
        return desc
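# Minimal usage sketch for QPAMDPAgent (illustrative only, not taken from the original
# experiments). It assumes a gym environment whose action space is
# Tuple(Discrete(n), Tuple(Box, ..., Box)) and whose reset()/step() return
# (state, steps) observation tuples, as _rollout() expects; all hyperparameter
# values below are arbitrary placeholders.
def _example_qpamdp_usage():
    env = gym.make('Platform-v0')
    num_actions = env.action_space.spaces[0].n
    agent = QPAMDPAgent(env.observation_space.spaces[0], env.action_space,
                        alpha=0.1,                       # eNAC step size
                        variances=[0.01] * num_actions,  # Gaussian exploration variance per action-parameter
                        initial_action_learning_episodes=1000,
                        action_relearn_episodes=500,
                        parameter_updates=50,
                        parameter_rollouts=25,
                        seed=0)
    agent.learn(env, max_episodes=5000, max_steps_per_episode=200)
    state, _ = env.reset()
    return agent.act(state)  # (discrete action index, list of parameter vectors)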
def run(seed, episodes, evaluation_episodes, parameter_updates, gamma, scale_actions, learning_rate_actor,
        learning_rate_actor_param, variance, title):
    env = gym.make('SoccerScoreGoal-v0')
    if scale_actions:
        env = SoccerScaledParameterisedActionWrapper(env)
    env = SoccerParameterisedActionWrapper(env)
    env = TimestepWrapper(env)
    # env = ScaledStateWrapper(env)

    dir = os.path.join(*("results", "soccer", title))
    env = Monitor(env, directory=os.path.join(dir, str(seed)), video_callable=False, write_upon_reset=False,
                  force=True)
    # env.seed(seed)
    np.random.seed(seed)

    action_obs_index = [5, 6, 7, 12, 13, 14, 15, 51, 52, 53]
    parameter_obs_index = action_obs_index
    print(env.action_space.spaces[0])
    print(env.observation_space)

    discrete_agent = SarsaLambdaAgent(env.observation_space, env.action_space.spaces[0], seed=seed,
                                      alpha=learning_rate_actor, lmbda=0.5, gamma=gamma, epsilon=1.0,
                                      temperature=1.0, observation_index=action_obs_index, cooling=0.995,
                                      scale_alpha=True, use_softmax=False, gamma_step_adjust=False, order=2)
    agent = QPAMDPAgent(env.observation_space, env.action_space, alpha=learning_rate_actor_param,
                        initial_action_learning_episodes=1000, seed=seed, variances=variance,
                        discrete_agent=discrete_agent, action_relearn_episodes=1000,
                        parameter_updates=parameter_updates, parameter_rollouts=25, norm_grad=False,
                        action_obs_index=action_obs_index, parameter_obs_index=parameter_obs_index,
                        print_freq=100, poly_basis=False,
                        # phi0_func=lambda state: np.array([1, state[1], state[1] ** 2]),
                        # phi0_size=3,
                        )
    agent.parameter_weights[0][0, 0] = 0.5
    agent.parameter_weights[2][0, 0] = 0.5
    print(agent)

    agent.learn(env, max_episodes=episodes)

    returns = env.get_episode_rewards()
    print("Ave. return =", sum(returns) / len(returns))
    print("Ave. last 100 episode return =", sum(returns[-100:]) / 100.)
    np.save(os.path.join(dir, title + "{}".format(str(seed))), returns)

    if evaluation_episodes > 0:
        print("Evaluating agent over {} episodes".format(evaluation_episodes))
        agent.variances = 0
        agent.discrete_agent.epsilon = 0.
        agent.discrete_agent.temperature = 0.
        start_time_eval = time.time()
        evaluation_results = evaluate(env, agent, evaluation_episodes)  # returns, timesteps, goals
        end_time_eval = time.time()
        print("Ave. evaluation return =", sum(evaluation_results[:, 0]) / evaluation_results.shape[0])
        print("Ave. timesteps =", sum(evaluation_results[:, 1]) / evaluation_results.shape[0])
        goal_timesteps = evaluation_results[:, 1][evaluation_results[:, 2] == 1]
        print("Ave. timesteps per goal =", sum(goal_timesteps) / evaluation_results.shape[0])
        print("Ave. goal prob. =", sum(evaluation_results[:, 2]) / evaluation_results.shape[0])
        np.save(os.path.join(dir, title + "{}e".format(str(seed))), evaluation_results)
        print("Evaluation time: %.2f seconds" % (end_time_eval - start_time_eval))
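# Hypothetical entry point for the soccer experiment above; as with the other scripts,
# the original command-line handling is not shown, and the values below are illustrative
# placeholders rather than tuned settings.
if __name__ == '__main__':
    run(seed=0,
        episodes=50000,
        evaluation_episodes=1000,
        parameter_updates=180,
        gamma=0.99,
        scale_actions=True,
        learning_rate_actor=0.1,
        learning_rate_actor_param=0.001,
        variance=0.01,
        title="QPAMDP")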