Example 1
    def run(self, record_J=False):
        """Run n_steps gradient-ascent updates of the policy parameter theta,
        optionally recording the estimated performance J at each step."""
        self.thetas.append(self.theta)
        for _ in tqdm(range(self.n_steps)):
            self.policy.set_theta(self.theta)
            # Collect N episodes of horizon T with the current policy
            trajectories = tls.collect_episodes(self.env,
                                                policy=self.policy,
                                                horizon=self.T,
                                                n_episodes=self.N)

            if record_J:
                J = tls.estimate_performance(paths=trajectories)
                self.Js.append(J)

            # Compute gradient ascent update
            grad = self.policy.compute_Jgrad(trajectories, gamma=self.gamma)
            ascent = self.update_rate.update(grad)
            self.theta = self.theta + ascent
            self.thetas.append(self.theta)
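The ascent step above is delegated to self.update_rate (the later examples call it self.stepper), an object exposing an update(grad) method that returns the increment to add to theta. A minimal constant-step-size version of that interface could look like the sketch below; the actual rule used in these snippets (possibly adaptive or annealing) is not shown, so this is only an assumption about the interface.

class ConstantStep:
    """Constant learning-rate rule: the ascent step is simply alpha * grad."""

    def __init__(self, learning_rate):
        self.learning_rate = learning_rate

    def update(self, grad):
        # Return the increment to add to the current parameter vector
        return self.learning_rate * grad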
Example 2

    def compute_optimal_policy(self, performance=True):
        """Compute the optimal parameter for the parametrized policy with a 
        gradient based update rule.
        
        Parameters
        ----------
        performance : bool, optional
            If True, estimate the perfomance (average return) during the 
            optimization using the function utils.estimate_performance.
        """

        if performance:
            self.avg_returns = []

        if self.policy_type == "gaussian":

            # History of the different values of theta
            self.theta_history = []
            self.theta_history.append(self.theta)

            for i in tqdm(range(self.n_itr)):

                self.policy = GaussianPolicy(self.theta, self.sigma)

                # Simulate N trajectories with T time steps each
                paths = collect_episodes(self.env,
                                         policy=self.policy,
                                         horizon=self.T,
                                         n_episodes=self.N)

                # Average performance per iteration
                if performance:
                    avg_return = estimate_performance(paths=paths)
                    self.avg_returns.append(avg_return)

                # Gradient update
                self.theta += self.stepper.update(
                    self.policy.compute_J_estimated_gradient(paths,
                                                             self.discounts,
                                                             N=self.N,
                                                             T=self.T))

                # Add the new theta to the history
                self.theta_history.append(self.theta)
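Both gradient-based examples rely on compute_J_estimated_gradient, whose implementation is not shown. For a one-dimensional linear-Gaussian policy, a basic REINFORCE estimator (without a baseline) could look like the following sketch; the (N, T) array layout, the discounts argument holding gamma ** t, and the absence of a baseline are assumptions, not the actual course implementation.

import numpy as np

def reinforce_gradient(states, actions, rewards, theta, sigma, discounts):
    """Estimate dJ/dtheta for a policy a ~ N(theta * s, sigma^2).

    states, actions, rewards have shape (N, T); discounts has shape (T,)
    and holds gamma ** t.
    """
    # Score function of the Gaussian policy: d/dtheta log pi(a | s)
    scores = (actions - theta * states) * states / sigma ** 2  # (N, T)
    # Per-trajectory sum of scores and discounted return
    sum_scores = scores.sum(axis=1)                            # (N,)
    returns = (rewards * discounts).sum(axis=1)                # (N,)
    # Monte Carlo average over the N trajectories
    return np.mean(sum_scores * returns)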
Example 3
    def iterate(self, K, performance=False, alpha=0):
        """Linear Fitted Q-iteration with K iterations.
        
        Parameters
        ----------
        K : int
            Number of iterations.
        performance : bool, optional
            If True, evaluate the performance of the greedy policy at each 
            iteration.
        alpha : float, optional
            Regularization parameter for the regression.
        """
        # Initialize Q
        self.Q = lambda state, action: 0
        self.Q = np.vectorize(self.Q)

        # Build the observations matrix
        self.Phi = self.build_observations_matrix()

        # Store the average return at each iteration
        if performance:
            self.avg_returns = []

        # Iterate
        for k in tqdm(range(K)):

            # Recompute y at each iteration (with the new value of Q)
            self.y = self.build_response_variable()

            # Fit a linear model to the data
            self.fit_linear_model(alpha=alpha)

            # Average performance of the greedy policy per iteration
            if performance:
                policy = self.GreedyPolicy(self)
                avg_return = estimate_performance(self.env,
                                                  policy=policy,
                                                  horizon=50,
                                                  n_episodes=50,
                                                  gamma=self.discount_factor)
                self.avg_returns.append(avg_return)
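The regression target rebuilt at every iteration by build_response_variable is not shown. In standard fitted Q-iteration it is y_i = r_i + gamma * max_a Q(s'_i, a); the sketch below assumes a finite set of candidate actions and a per-sample absorbing flag, which are assumptions about data the class keeps internally (gamma corresponds to self.discount_factor).

import numpy as np

def build_response_variable_sketch(rewards, next_states, absorbing, actions, Q, gamma):
    """Compute y_i = r_i + gamma * max_a Q(s'_i, a), without bootstrapping on absorbing states."""
    next_q = np.array([
        0.0 if done else max(Q(s_next, a) for a in actions)
        for s_next, done in zip(next_states, absorbing)
    ])
    return np.asarray(rewards) + gamma * next_q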
Example 4
    def compute_optimal_policy(self, estimate_performance=True):
        """Optimize the policy parameter theta with gradient-based updates,
        optionally recording the average return at each iteration."""
        if estimate_performance:
            self.average_returns = []

        self.theta = 0.
        self.theta_history = []
        self.theta_history.append(self.theta)

        self.counts = {}

        for _ in tqdm(range(self.n_itr)):
            # Build the policy with the current value of theta
            policy = create_policy(self.policy_type, {
                **self.policy_params, 'theta': self.theta
            })

            # Simulate N trajectories with T time steps each
            paths = utils.collect_episodes(self.env,
                                           policy=policy,
                                           horizon=self.T,
                                           n_episodes=self.N)

            # Optionally reshape the collected rewards with an exploration bonus
            if self.exploration_bonus:
                self._update_rewards(paths)

            # Average performance per iteration
            if estimate_performance:
                self.average_returns.append(
                    utils.estimate_performance(paths=paths))

            # Gradient ascent step on theta
            self.theta += self.stepper.update(
                policy.compute_J_estimated_gradient(
                    paths,
                    self.discounts,
                    N=self.N,
                    T=self.T,
                ))
            self.theta_history.append(self.theta[0])
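The helper _update_rewards and the self.counts dictionary suggest a count-based exploration bonus, but its exact form is not shown. A common variant adds beta / sqrt(N(s)) to the reward of every visited state after discretizing it; the sketch below also assumes that paths is a list of dictionaries with 'states' and 'rewards' entries, and the values of beta and bin_width are purely illustrative.

import numpy as np

def _update_rewards(self, paths, beta=0.1, bin_width=0.5):
    """Add a count-based bonus beta / sqrt(N(s)) to the collected rewards, in place."""
    for path in paths:
        for t, state in enumerate(path['states']):
            # Discretize the state to index the visit-count table
            key = tuple(np.floor(np.atleast_1d(state) / bin_width).astype(int))
            self.counts[key] = self.counts.get(key, 0) + 1
            path['rewards'][t] += beta / np.sqrt(self.counts[key])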
# Evaluate the true (analytical) Q-function of the environment on the sampled
# state-action pairs and plot it; env, K, cov, discount and the grids S, A, SA
# are assumed to be defined earlier in the script.
Q_fun_ = np.vectorize(lambda s, a: env.computeQFunction(s, a, K, cov, discount, 1))
Q_fun = lambda X: Q_fun_(X[:, 0], X[:, 1])

Q_opt = Q_fun(SA)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(S, A, Q_opt)

plt.show()

#################################################################
# Collect the samples using the behavioural policy
#################################################################
# You should use discrete actions
beh_policy = None  # to be defined (one possibility is sketched below)
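# A minimal behavioural policy, as a sketch only: it assumes that collect_episodes
# queries the policy through a draw_action(state) method and that the action grid
# below fits the environment (both are assumptions, not part of the template).
class UniformDiscretePolicy:
    def __init__(self, actions):
        self.actions = np.asarray(actions)

    def draw_action(self, state):
        # Pick one of the discrete actions uniformly at random
        return np.random.choice(self.actions)

beh_policy = UniformDiscretePolicy(np.linspace(-8, 8, 9))  # hypothetical action range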

dataset = collect_episodes(env, n_episodes=100,
                           policy=beh_policy, horizon=horizon)

# Define FQI (e.g. a linear fitted Q-iteration as in Example 3)
# To evaluate the resulting policy you can use estimate_performance

# Plot the obtained Q-function against the true one (a sketch follows)
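# One way to compare the fitted Q-function with the true one, assuming the FQI
# object exposes its fitted value function as fqi.Q(states, actions) (an assumption,
# by analogy with self.Q in Example 3); the plt.show() at the end displays the figure.
Q_fqi = fqi.Q(S, A)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(S, A, Q_opt, label='true Q')
ax.scatter(S, A, Q_fqi, label='FQI Q')
ax.set_xlabel('state')
ax.set_ylabel('action')
ax.set_zlabel('Q(s, a)')
ax.legend()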


J = estimate_performance(env, policy=fqi, horizon=100, n_episodes=500, gamma=discount)
print('Policy performance: {}'.format(J))
plt.show()