def run(self, record_J=False):
    self.thetas.append(self.theta)
    for _ in tqdm(np.arange(0, self.n_steps)):
        self.policy.set_theta(self.theta)
        trajectories = tls.collect_episodes(self.env, policy=self.policy,
                                            horizon=self.T, n_episodes=self.N)
        if record_J:
            J = tls.estimate_performance(paths=trajectories)
            self.Js.append(J)
        # Compute gradient ascent update
        grad = self.policy.compute_Jgrad(trajectories, gamma=self.gamma)
        ascent = self.update_rate.update(grad)
        self.theta = self.theta + ascent
        self.thetas.append(self.theta)
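# The gradient ascent step above delegates the step size to
# `self.update_rate.update(grad)`. Below is a minimal sketch of the stepper
# interface this assumes (an illustration, not part of the original code):
# the stepper receives the estimated gradient and returns the ascent
# increment added to theta. A constant learning rate is the simplest choice;
# adaptive schemes would expose the same `update` method.
class ConstantStep:
    """Hypothetical constant-learning-rate stepper."""

    def __init__(self, learning_rate):
        self.learning_rate = learning_rate

    def update(self, gradient):
        # Scale the gradient estimate by a fixed learning rate
        return self.learning_rate * gradient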
def compute_optimal_policy(self, performance=True):
    """Compute the optimal parameter for the parametrized policy with a
    gradient-based update rule.

    Parameters
    ----------
    performance : bool, optional
        If True, estimate the performance (average return) during the
        optimization using the function utils.estimate_performance.
    """
    if performance:
        self.avg_returns = []
    if self.policy_type == "gaussian":
        # History of the different values of theta
        self.theta_history = []
        self.theta_history.append(self.theta)
        for i in tqdm(range(self.n_itr)):
            self.policy = GaussianPolicy(self.theta, self.sigma)
            # Simulate N trajectories with T time steps each
            paths = collect_episodes(self.env, policy=self.policy,
                                     horizon=self.T, n_episodes=self.N)
            # Average performance per iteration
            if performance:
                avg_return = estimate_performance(paths=paths)
                self.avg_returns.append(avg_return)
            # Gradient update
            self.theta += self.stepper.update(
                self.policy.compute_J_estimated_gradient(paths, self.discounts,
                                                         N=self.N, T=self.T))
            # Add the new theta to the history
            self.theta_history.append(self.theta)
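# For intuition, here is a sketch of the REINFORCE estimator that a method
# like compute_J_estimated_gradient is expected to return, assuming a 1-D
# Gaussian policy with linear mean, pi_theta(a|s) = N(theta * s, sigma^2),
# and `paths` given as dicts with 'states', 'actions' and 'rewards' arrays.
# These assumptions (policy form, path layout, function name) are mine and
# may differ from the actual implementation.
import numpy as np

def reinforce_gradient(paths, theta, sigma, discounts):
    grad = 0.0
    for path in paths:
        states = np.asarray(path['states']).ravel()
        actions = np.asarray(path['actions']).ravel()
        rewards = np.asarray(path['rewards']).ravel()
        # Score function of the Gaussian policy, summed over the trajectory
        score = np.sum((actions - theta * states) * states) / sigma ** 2
        # Discounted return of the trajectory
        ret = np.sum(discounts[:len(rewards)] * rewards)
        grad += score * ret
    # Average over the N collected trajectories
    return grad / len(paths)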
def iterate(self, K, performance=False, alpha=0):
    """Linear Fitted Q-iteration with K iterations.

    Parameters
    ----------
    K : int
        Number of iterations.
    performance : bool, optional
        If True, evaluate the performance of the greedy policy at each
        iteration.
    alpha : float, optional
        Regularization parameter for the regression.
    """
    # Initialize Q to the zero function
    self.Q = lambda state, action: 0
    self.Q = np.vectorize(self.Q)
    # Build the observations matrix
    self.Phi = self.build_observations_matrix()
    # Store the average return at each iteration
    if performance:
        self.avg_returns = []
    # Iterate
    for k in tqdm(range(K)):
        # Recompute y at each iteration (with the new value of Q)
        self.y = self.build_response_variable()
        # Fit a linear model to the data
        self.fit_linear_model(alpha=alpha)
        # Average performance of the greedy policy per iteration
        if performance:
            policy = self.GreedyPolicy(self)
            avg_return = estimate_performance(self.env, policy=policy,
                                              horizon=50, n_episodes=50,
                                              gamma=self.discount_factor)
            self.avg_returns.append(avg_return)
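# The helper build_response_variable is where the fitted Q-iteration backup
# happens: for every transition (s, a, r, s') the regression target is
# y = r + gamma * max_a Q(s', a), which fit_linear_model then regresses onto
# the feature matrix Phi. The sketch below illustrates this step with
# hypothetical names (`transitions`, `action_space`); the real class builds
# these from the collected episodes.
import numpy as np

def build_fqi_targets(transitions, Q, action_space, gamma):
    y = np.empty(len(transitions))
    for i, (s, a, r, s_next, done) in enumerate(transitions):
        if done:
            # Terminal transitions have no bootstrap term
            y[i] = r
        else:
            # Greedy backup: best attainable Q-value from the next state
            y[i] = r + gamma * max(Q(s_next, b) for b in action_space)
    return y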
def compute_optimal_policy(self, estimate_performance=True):
    if estimate_performance:
        self.average_returns = []
    self.theta = 0.
    self.theta_history = []
    self.theta_history.append(self.theta)
    self.counts = {}
    for _ in tqdm(range(self.n_itr)):
        policy = create_policy(self.policy_type, {
            **self.policy_params,
            'theta': self.theta
        })
        paths = utils.collect_episodes(self.env, policy=policy,
                                       horizon=self.T, n_episodes=self.N)
        if self.exploration_bonus:
            self._update_rewards(paths)
        # Performance of the current iteration
        if estimate_performance:
            self.average_returns.append(
                utils.estimate_performance(paths=paths))
        # Gradient update
        self.theta += self.stepper.update(
            policy.compute_J_estimated_gradient(
                paths,
                self.discounts,
                N=self.N,
                T=self.T,
            ))
        self.theta_history.append(self.theta[0])
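# The `self.counts` dictionary suggests a count-based exploration bonus. The
# sketch below shows only one plausible form of what _update_rewards might
# do (the actual implementation is not shown here): discretise each visited
# state, accumulate visit counts, and add a bonus that decays as
# 1/sqrt(count). `discretise` and `bonus_scale` are hypothetical names.
import numpy as np

def add_count_based_bonus(paths, counts, discretise, bonus_scale=1.0):
    for path in paths:
        for t, state in enumerate(path['states']):
            key = discretise(state)
            counts[key] = counts.get(key, 0) + 1
            # Optimistic bonus that shrinks as a state is revisited
            path['rewards'][t] += bonus_scale / np.sqrt(counts[key])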
Q_fun_ = np.vectorize(lambda s, a: env.computeQFunction(s, a, K, cov, discount, 1))
Q_fun = lambda X: Q_fun_(X[:, 0], X[:, 1])
Q_opt = Q_fun(SA)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(S, A, Q_opt)
plt.show()

#################################################################
# Collect the samples using the behavioural policy
#################################################################

# You should use discrete actions
beh_policy =

dataset = collect_episodes(env, n_episodes=100, policy=beh_policy,
                           horizon=horizon)

# Define FQI

# To evaluate the policy you can use estimate_performance

# Plot obtained Q-function against the true one
J = estimate_performance(env, policy=fqi, horizon=100, n_episodes=500,
                         gamma=discount)
print('Policy performance: {}'.format(J))
plt.show()
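# One possible way to approach the behavioural-policy placeholder above is a
# uniform random policy over a discretised action set, sketched below. This
# is an illustration, not the intended solution, and the `draw_action`
# method name is an assumption about the interface collect_episodes expects
# from a policy object.
import numpy as np

class UniformDiscretePolicy:
    def __init__(self, actions):
        self.actions = np.asarray(actions)

    def draw_action(self, state):
        # Ignore the state and pick one of the discrete actions at random
        return np.random.choice(self.actions)

# Example usage (hypothetical action range):
# beh_policy = UniformDiscretePolicy(np.linspace(-8, 8, 10))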