def REINFORCE(hasBonus, stepper='constant'):
    parameters_simu = []
    returns_simu = []
    low_parameters = []
    up_parameters = []
    for n_simu in tqdm(range(N_simu), desc="Simulating REINFORCE algorithm"):
        # Count table over the discretized state-action space, used by compute_bonus
        N_tot = np.array([1 for i in range((bins + 2)**2)])
        policy = Policy(-0.4)
        mean_parameters = []
        avg_return = []
        for ni in range(n_itr):
            paths = utils.collect_episodes(env, policy=policy, horizon=T,
                                           n_episodes=N)
            grad = 0.
            Gsquare = 0
            R = 0
            for path in paths:
                if hasBonus:
                    bonus = compute_bonus(path['states'], path['actions'], N_tot)
                for t in range(0, T):
                    # Discounted return from time t, with or without the
                    # exploration bonus added to the rewards
                    if hasBonus:
                        vt = np.sum(
                            np.array([discount**(k - 1)
                                      for k in range(1, T - t + 1)])
                            * (path['rewards'][t:] + bonus[t:]))
                    else:
                        vt = np.sum(
                            np.array([discount**(k - 1)
                                      for k in range(1, T - t + 1)])
                            * path['rewards'][t:])
                    G = grad_log_policy(path['states'][t][0],
                                        path['actions'][t], policy.theta)
                    Gsquare += np.square(G)
                    grad += G * vt
                    R += vt
            # Choose the stepper
            if stepper == 'constant':
                update = constantStepper.update(grad / N)
            elif stepper == 'adagrad':
                update = adagradStepper.update(grad / N, Gsquare / N**2)
            else:
                update = stochasticStepper.update(grad / N, ni)
            # Gradient ascent step on the policy parameter
            policy.theta = policy.theta + update
            avg_return.append(R / N)
            mean_parameters.append(policy.theta)
        parameters_simu.append(np.array(mean_parameters))
        returns_simu.append(np.array(avg_return))
    return np.array(parameters_simu), np.array(returns_simu)
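# The steppers referenced above (constantStepper, adagradStepper,
# stochasticStepper) are not defined in this snippet. Below is a minimal
# sketch of what their interface could look like, assuming each simply maps a
# gradient estimate to a parameter increment; the class names, learning rate
# alpha and epsilon term are assumptions, not taken from the original code.
import numpy as np


class ConstantStepper:
    def __init__(self, alpha=1e-5):
        self.alpha = alpha

    def update(self, grad):
        # Plain gradient ascent step with a fixed learning rate
        return self.alpha * grad


class AdagradStepper:
    def __init__(self, alpha=1e-2, eps=1e-8):
        self.alpha = alpha
        self.eps = eps
        self.accum = 0.0  # running sum of squared gradients

    def update(self, grad, grad_square):
        # AdaGrad-style step: scale by the inverse square root of the
        # accumulated squared gradients
        self.accum += grad_square
        return self.alpha * grad / (np.sqrt(self.accum) + self.eps)


class StochasticStepper:
    def __init__(self, alpha=1e-3):
        self.alpha = alpha

    def update(self, grad, n_iter):
        # Learning rate annealed as O(1 / (n_iter + 1)), matching the
        # 'stochastic' branch above
        return self.alpha * grad / (n_iter + 1)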
def run(self, record_J=False):
    self.thetas.append(self.theta)
    for _ in tqdm(np.arange(0, self.n_steps)):
        self.policy.set_theta(self.theta)
        trajectories = tls.collect_episodes(self.env, policy=self.policy,
                                            horizon=self.T, n_episodes=self.N)
        if record_J:
            J = tls.estimate_performance(paths=trajectories)
            self.Js.append(J)
        # Compute gradient ascent update
        grad = self.policy.compute_Jgrad(trajectories, gamma=self.gamma)
        ascent = self.update_rate.update(grad)
        self.theta = self.theta + ascent
        self.thetas.append(self.theta)
def compute_optimal_policy(self, performance=True):
    """Compute the optimal parameter of the parametrized policy with a
    gradient-based update rule.

    Parameters
    ----------
    performance : bool, optional
        If True, estimate the performance (average return) during the
        optimization using the function utils.estimate_performance.
    """
    if performance:
        self.avg_returns = []
    if self.policy_type == "gaussian":
        # History of the successive values of theta
        self.theta_history = []
        self.theta_history.append(self.theta)
        for i in tqdm(range(self.n_itr)):
            self.policy = GaussianPolicy(self.theta, self.sigma)
            # Simulate N trajectories of T time steps each
            paths = collect_episodes(self.env, policy=self.policy,
                                     horizon=self.T, n_episodes=self.N)
            # Average performance for this iteration
            if performance:
                avg_return = estimate_performance(paths=paths)
                self.avg_returns.append(avg_return)
            # Gradient update
            self.theta += self.stepper.update(
                self.policy.compute_J_estimated_gradient(paths, self.discounts,
                                                         N=self.N, T=self.T))
            # Add the new theta to the history
            self.theta_history.append(self.theta)
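# GaussianPolicy is referenced above but defined elsewhere. A minimal sketch
# of a linear-Gaussian policy pi_theta(a|s) = N(theta * s, sigma^2) is given
# below, assuming that discounts[k] = gamma**k and that each path is a dict
# with 'states', 'actions' and 'rewards' arrays as used in the rest of this
# file; the exact attribute and method names of the original class may differ.
import numpy as np


class GaussianPolicySketch:
    def __init__(self, theta, sigma):
        self.theta = theta
        self.sigma = sigma

    def draw_action(self, state):
        # Sample a ~ N(theta * s, sigma^2)
        return self.theta * state + self.sigma * np.random.randn()

    def grad_log(self, state, action):
        # Score function of the Gaussian policy:
        # d/dtheta log pi(a|s) = (a - theta * s) * s / sigma^2
        return (action - self.theta * state) * state / self.sigma**2

    def compute_J_estimated_gradient(self, paths, discounts, N, T):
        # REINFORCE estimate: average over trajectories of
        # sum_t grad_log(s_t, a_t) * (discounted return from time t onwards)
        grad = 0.0
        for path in paths:
            rewards = path['rewards']
            for t in range(T):
                future_return = np.sum(discounts[:T - t] * rewards[t:])
                grad += self.grad_log(path['states'][t][0],
                                      path['actions'][t]) * future_return
        return grad / N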
def __init__(self, env, actions, n_episodes, horizon, discount_factor,
             beh_policy_type="uniform"):
    """
    Parameters
    ----------
    env : object
        Environment (lqg1d for instance).
    actions : array, shape = [n,]
        Discrete actions.
    n_episodes : int
        Number of episodes when generating the dataset.
    horizon : int
        Time horizon when generating the dataset.
    discount_factor : float
        Discount factor.
    beh_policy_type : str, optional
        Available values: "uniform".
    """
    self.env = env
    self.actions = actions
    self.n_episodes = n_episodes
    self.horizon = horizon
    self.discount_factor = discount_factor
    if beh_policy_type == "uniform":
        beh_policy = UniformPolicy(actions)
    self.dataset = collect_episodes(env, n_episodes=n_episodes,
                                    policy=beh_policy, horizon=horizon)
    # Q-function estimate, initialized to zero everywhere
    self.Q = lambda state, action: 0
    self.Q = np.vectorize(self.Q)
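# UniformPolicy is used as the behavioural policy but not defined in this
# snippet. A minimal sketch, assuming the only interface collect_episodes
# needs is draw_action(state); the class and method names are assumptions.
import numpy as np


class UniformPolicySketch:
    def __init__(self, actions):
        self.actions = np.asarray(actions)

    def draw_action(self, state):
        # Pick one of the discrete actions uniformly at random,
        # independently of the current state
        return np.random.choice(self.actions)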
def compute_optimal_policy(self, estimate_performance=True):
    if estimate_performance:
        self.average_returns = []
    self.theta = 0.
    self.theta_history = []
    self.theta_history.append(self.theta)
    self.counts = {}
    for _ in tqdm(range(self.n_itr)):
        policy = create_policy(self.policy_type,
                               {**self.policy_params, 'theta': self.theta})
        paths = utils.collect_episodes(self.env, policy=policy,
                                       horizon=self.T, n_episodes=self.N)
        if self.exploration_bonus:
            # Apply the exploration bonus (see _update_rewards)
            self._update_rewards(paths)
        # Performance for this iteration
        if estimate_performance:
            self.average_returns.append(
                utils.estimate_performance(paths=paths))
        # Gradient ascent step on theta
        self.theta += self.stepper.update(
            policy.compute_J_estimated_gradient(paths, self.discounts,
                                                N=self.N, T=self.T))
        self.theta_history.append(self.theta[0])
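# _update_rewards is not shown here. A minimal sketch of a count-based
# exploration bonus consistent with the self.counts dictionary initialized
# above: each visited (state, action) cell has its count incremented and the
# reward is augmented by beta / sqrt(count). The discretization step, the
# value of beta and the assumption of one-dimensional states and actions are
# all assumptions, not taken from the original code.
import numpy as np


def _update_rewards(self, paths, beta=0.1, precision=1):
    for path in paths:
        for t in range(len(path['rewards'])):
            # Discretize the (state, action) pair to index the count table
            key = (round(float(path['states'][t][0]), precision),
                   round(float(path['actions'][t]), precision))
            self.counts[key] = self.counts.get(key, 0) + 1
            # Optimism bonus that decays with the visit count
            path['rewards'][t] += beta / np.sqrt(self.counts[key])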
# Set the discount factor for the problem
discount = 0.9
# Learning rate for the gradient update
learning_rate = 0.00001

## Ex 1 :
stepper = Adam()  # we choose the Adam stepper
list_mean_parameters, list_avg_returns = [], []
for e in tqdm(range(epochs), desc="Simulating {}".format('random')):
    stepper.reset()
    theta = 0  # initialize theta
    avg_return, mean_parameters = [], [theta]
    for _ in range(n_itr):
        pi = policy(theta)
        paths = utils.collect_episodes(env, policy=pi, horizon=T, n_episodes=N)
        l = len(paths)
        # Average (undiscounted) return over the collected episodes
        avg = np.sum([paths[n]["rewards"] for n in range(l)]) / l
        # Monte-Carlo estimate of the policy gradient
        grad_J = np.sum([
            gradient_J(pi, paths[n]["states"][:, 0], paths[n]["actions"][:, 0],
                       paths[n]["rewards"], discount) for n in range(N)
        ]) / N
        theta += stepper.update(grad_J)
        avg_return.append(avg)
        mean_parameters.append(theta)
    list_avg_returns.append(avg_return)
    list_mean_parameters.append(mean_parameters)

list_avg_returns = np.array(list_avg_returns)
list_mean_parameters = np.array(list_mean_parameters)
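# The Adam stepper used above is defined elsewhere. A minimal sketch of an
# Adam update rule with the same reset()/update() interface; the default
# hyperparameter values are the usual Adam choices and are assumptions here.
import numpy as np


class AdamSketch:
    def __init__(self, alpha=1e-2, beta1=0.9, beta2=0.999, eps=1e-8):
        self.alpha, self.beta1, self.beta2, self.eps = alpha, beta1, beta2, eps
        self.reset()

    def reset(self):
        # Reset the first/second moment estimates and the time step
        self.m, self.v, self.t = 0.0, 0.0, 0

    def update(self, grad):
        # Standard Adam step with bias-corrected moment estimates
        self.t += 1
        self.m = self.beta1 * self.m + (1 - self.beta1) * grad
        self.v = self.beta2 * self.v + (1 - self.beta2) * grad**2
        m_hat = self.m / (1 - self.beta1**self.t)
        v_hat = self.v / (1 - self.beta2**self.t)
        return self.alpha * m_hat / (np.sqrt(v_hat) + self.eps)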
# Evaluate the true Q-function on the (state, action) grid and plot it
Q_fun_ = np.vectorize(lambda s, a: env.computeQFunction(s, a, K, cov, discount, 1))
Q_fun = lambda X: Q_fun_(X[:, 0], X[:, 1])
Q_opt = Q_fun(SA)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(S, A, Q_opt)
plt.show()

#################################################################
# Collect the samples using the behavioural policy
#################################################################

# You should use discrete actions
beh_policy = UniformPolicy(actions)  # e.g. uniform over the discrete actions
dataset = collect_episodes(env, n_episodes=100, policy=beh_policy, horizon=horizon)

# define FQI

# to evaluate the policy you can use estimate_performance

# plot obtained Q-function against the true one

J = estimate_performance(env, policy=fqi, horizon=100, n_episodes=500,
                         gamma=discount)
print('Policy performance: {}'.format(J))
plt.show()
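# The FQI part above is left as a template. Below is a minimal sketch of
# fitted Q-iteration on the collected dataset, assuming each path exposes
# 'states', 'actions', 'rewards' and 'next_states' arrays; the regressor
# choice (ExtraTrees), the field names and the function name are assumptions,
# not the original solution.
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor


def fitted_q_iteration(dataset, actions, discount, n_iterations=50):
    s = np.concatenate([p['states'][:, 0] for p in dataset])
    a = np.concatenate([p['actions'][:, 0] for p in dataset])
    r = np.concatenate([p['rewards'] for p in dataset])
    s_next = np.concatenate([p['next_states'][:, 0] for p in dataset])
    X = np.column_stack([s, a])

    regressor = None
    for _ in range(n_iterations):
        if regressor is None:
            y = r  # first iteration: Q_1 is the immediate reward
        else:
            # Bellman target: r + gamma * max_a' Q(s', a')
            q_next = np.column_stack([
                regressor.predict(np.column_stack([s_next,
                                                   np.full_like(s_next, act)]))
                for act in actions
            ])
            y = r + discount * q_next.max(axis=1)
        regressor = ExtraTreesRegressor(n_estimators=50).fit(X, y)

    # Greedy policy induced by the final Q estimate
    def greedy_action(state):
        q_values = [regressor.predict([[state, act]])[0] for act in actions]
        return actions[int(np.argmax(q_values))]

    return regressor, greedy_action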
def reinforce(env, N, T, n_itr, discount, learning_rate, init_policy,
              update_fun="fixed", from_scratch=True, returns=None,
              parameters=None, bonus=False):
    # Initialize the update function
    assert update_fun in ["annealing", "fixed"]
    if update_fun == "annealing":
        update = lambda x, t: x * learning_rate / (t + 1)
    else:
        update = lambda x, t: x * learning_rate

    # Initialize the output DataFrames or use the ones provided
    if not from_scratch:
        assert (returns is not None) & (parameters is not None)
        returns_dt = returns
        parameters_dt = parameters
    else:
        returns_dt = pd.DataFrame(
            columns=["Learning Rate", "Stepper", "N", "Iteration", "Returns"])
        parameters_dt = pd.DataFrame(
            columns=["Learning Rate", "Stepper", "N", "Iteration", "Params"])

    policy = init_policy

    # Main loop
    theta = policy.theta
    for j in tqdm.tqdm(range(n_itr)):
        paths = utils.collect_episodes(env, policy=policy, horizon=T,
                                       n_episodes=N, bonus=bonus)
        # REINFORCE estimates
        returns_tab = []
        grad_J_tab = []
        for p in paths:
            grad_J_episode = 0
            discounted_returns = sum(
                [discount**i * p["rewards"][i] for i in range(T)])
            returns_tab.append(discounted_returns)
            returns_dt = pd.concat(
                [returns_dt,
                 pd.DataFrame([{
                     'Learning Rate': "10^{}".format(int(log10(learning_rate))),
                     'Stepper': update_fun,
                     'N': ". {}".format(str(N)),
                     'Iteration': j,
                     'Returns': discounted_returns
                 }])],
                ignore_index=True)
            for i in range(T):
                grad_J_episode += policy.grad_log(
                    p["actions"][i], p["states"][i]) * sum(
                        [discount**k * p["rewards"][k] for k in range(i, T)])
            grad_J_tab.append(grad_J_episode)
            parameters_dt = pd.concat(
                [parameters_dt,
                 pd.DataFrame([{
                     'Learning Rate': "10^{}".format(int(log10(learning_rate))),
                     'Stepper': update_fun,
                     'N': ". {}".format(str(N)),
                     'Iteration': j,
                     'Params': policy.theta + update(grad_J_episode[0], j)
                 }])],
                ignore_index=True)
        grad_J = np.mean(grad_J_tab)

        # Update the policy parameter
        theta = theta + update(grad_J, j)
        policy.set_theta(theta)

    return returns_dt, parameters_dt
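# A possible way to inspect the output of reinforce(): since it returns tidy
# DataFrames, the learning curves can be drawn directly with seaborn. The
# hyperparameter values below are illustrative, and `env` and `policy` are
# assumed to come from the surrounding experiment.
import seaborn as sns
import matplotlib.pyplot as plt

returns_dt, parameters_dt = reinforce(env, N=50, T=100, n_itr=100,
                                      discount=0.9, learning_rate=1e-5,
                                      init_policy=policy,
                                      update_fun="annealing")

# Average discounted return per iteration (one curve per stepper type)
sns.lineplot(data=returns_dt, x="Iteration", y="Returns", hue="Stepper")
plt.show()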