Example #1
def REINFORCE(hasBonus, stepper='constant'):
    parameters_simu = []
    returns_simu = []

    low_parameters = []
    up_parameters = []

    for n_simu in tqdm(range(N_simu), desc="Simulating REINFORCE algorithm"):
        # Visit counts for the exploration bonus, one per discretised (state, action) cell
        N_tot = np.ones((bins + 2)**2, dtype=int)
        policy = Policy(-0.4)
        mean_parameters = []
        avg_return = []
        for ni in range(n_itr):
            paths = utils.collect_episodes(env,
                                           policy=policy,
                                           horizon=T,
                                           n_episodes=N)
            # Accumulators over the batch of N episodes
            grad = 0.
            Gsquare = 0.
            R = 0.
            for path in paths:
                if hasBonus:
                    bonus = compute_bonus(path['states'], path['actions'],
                                          N_tot)
                for t in range(T):
                    # Discounted reward-to-go from time t (plus the exploration bonus if enabled)
                    if hasBonus:
                        vt = np.sum(
                            np.array([
                                discount**(k - 1) for k in range(1, T - t + 1)
                            ]) * (path['rewards'][t:] + bonus[t:]))
                    else:
                        vt = np.sum(
                            np.array([
                                discount**(k - 1) for k in range(1, T - t + 1)
                            ]) * path['rewards'][t:])
                    # Score function (gradient of the log-policy) at (s_t, a_t)
                    G = grad_log_policy(path['states'][t][0],
                                        path['actions'][t], policy.theta)
                    Gsquare += np.square(G)
                    grad += G * vt
                    R += vt

            # Choosing stepper
            if stepper == 'constant':
                update = constantStepper.update(grad / N)
            elif stepper == 'adagrad':
                update = adagradStepper.update(grad / N, Gsquare / N**2)
            else:
                update = stochasticStepper.update(grad / N, ni)
            # Performing iteration update on parameters
            policy.theta = policy.theta + update
            # print(policy.theta)
            avg_return.append(R / N)
            mean_parameters.append(policy.theta)
        parameters_simu.append(np.array(mean_parameters))
        returns_simu.append(np.array(avg_return))

    return np.array(parameters_simu), np.array(returns_simu)
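The snippet above relies on three stepper objects defined elsewhere (constantStepper, adagradStepper and stochasticStepper), each exposing an update method matching the calls in the "Choosing stepper" block. They are not shown in this example; the following is only a minimal sketch of what such steppers could look like, assuming update returns the increment added to policy.theta. The class names mirror the globals above and the learning-rate values are placeholders.

import numpy as np


class ConstantStepper:
    """Fixed learning rate: the update is simply alpha * grad."""

    def __init__(self, alpha=1e-5):
        self.alpha = alpha

    def update(self, grad):
        return self.alpha * grad


class AdagradStepper:
    """AdaGrad-style step: the rate shrinks with the accumulated squared gradient."""

    def __init__(self, alpha=1e-2, eps=1e-8):
        self.alpha = alpha
        self.eps = eps
        self.accum = 0.0

    def update(self, grad, grad_square):
        self.accum += grad_square
        return self.alpha * grad / (np.sqrt(self.accum) + self.eps)


class StochasticStepper:
    """Annealed rate alpha / (t + 1), matching the call update(grad, ni)."""

    def __init__(self, alpha=1e-4):
        self.alpha = alpha

    def update(self, grad, t):
        return self.alpha * grad / (t + 1)


constantStepper = ConstantStepper()
adagradStepper = AdagradStepper()
stochasticStepper = StochasticStepper()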
Example #2
    def run(self, record_J=False):
        self.thetas.append(self.theta)
        for _ in tqdm(np.arange(0, self.n_steps)):
            self.policy.set_theta(self.theta)
            trajectories = tls.collect_episodes(self.env,
                                                policy=self.policy,
                                                horizon=self.T,
                                                n_episodes=self.N)

            if record_J:
                J = tls.estimate_performance(paths=trajectories)
                self.Js.append(J)

            # Compute gradient ascent update
            grad = self.policy.compute_Jgrad(trajectories, gamma=self.gamma)
            ascent = self.update_rate.update(grad)
            self.theta = self.theta + ascent
            self.thetas.append(self.theta)
Example #3
    def compute_optimal_policy(self, performance=True):
        """Compute the optimal parameter for the parametrized policy with a
        gradient-based update rule.
        
        Parameters
        ----------
        performance : bool, optional
            If True, estimate the performance (average return) during the
            optimization using the function utils.estimate_performance.
        """

        if performance:
            self.avg_returns = []

        if self.policy_type == "gaussian":

            # History of the different values of theta
            self.theta_history = []
            self.theta_history.append(self.theta)

            for i in tqdm(range(self.n_itr)):

                self.policy = GaussianPolicy(self.theta, self.sigma)

                # Simulate N trajectories with T times steps each
                paths = collect_episodes(self.env,
                                         policy=self.policy,
                                         horizon=self.T,
                                         n_episodes=self.N)

                # Average performance per iteration
                if performance:
                    avg_return = estimate_performance(paths=paths)
                    self.avg_returns.append(avg_return)

                # Gradient update
                self.theta += self.stepper.update(
                    self.policy.compute_J_estimated_gradient(paths,
                                                             self.discounts,
                                                             N=self.N,
                                                             T=self.T))

                # Add the new theta to the history
                self.theta_history.append(self.theta)
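Example #3 depends on a GaussianPolicy class whose compute_J_estimated_gradient method is not reproduced here. As a point of reference only, a minimal sketch for a one-dimensional linear-Gaussian policy a ~ N(theta * s, sigma^2) is given below; the constructor and the method signature mirror the calls above, but the body (and the draw_action interface expected by collect_episodes) is an assumption.

import numpy as np


class GaussianPolicy:
    """One-dimensional linear-Gaussian policy: a ~ N(theta * s, sigma^2)."""

    def __init__(self, theta, sigma):
        self.theta = theta
        self.sigma = sigma

    def draw_action(self, state, *_):
        return self.theta * state + self.sigma * np.random.randn()

    def grad_log(self, state, action):
        # d/dtheta log N(a; theta * s, sigma^2) = (a - theta * s) * s / sigma^2
        return (action - self.theta * state) * state / self.sigma**2

    def compute_J_estimated_gradient(self, paths, discounts, N, T):
        # REINFORCE estimate: average over the N episodes of
        #   (sum_t grad_log pi(a_t | s_t)) * (sum_t gamma^t r_t)
        grad = 0.0
        for path in paths:
            rewards = np.ravel(path["rewards"])[:T]
            discounted_return = np.sum(discounts[:T] * rewards)
            score = sum(
                self.grad_log(np.ravel(path["states"])[t],
                              np.ravel(path["actions"])[t])
                for t in range(T))
            grad += score * discounted_return
        return grad / N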
Example #4
    def __init__(self,
                 env,
                 actions,
                 n_episodes,
                 horizon,
                 discount_factor,
                 beh_policy_type="uniform"):
        """        
        Parameters
        ----------
        env : object
            Environment (lqg1d for instance).
        actions : array, shape = [n,]
            Discrete actions.
        n_episodes : int
            Number of episodes when generating the dataset.
        horizon : int
            Time horizon when generating the dataset.
        discount_factor : float
            Discount factor.
        beh_policy_type : str, optional
            Available values: "uniform".
        """
        self.env = env
        self.actions = actions
        self.n_episodes = n_episodes
        self.horizon = horizon
        self.discount_factor = discount_factor

        if beh_policy_type == "uniform":
            beh_policy = UniformPolicy(actions)
        else:
            raise ValueError(
                "Unknown behavioural policy type: {}".format(beh_policy_type))

        self.dataset = collect_episodes(env,
                                        n_episodes=n_episodes,
                                        policy=beh_policy,
                                        horizon=horizon)
        # Q-function initialized to zero everywhere, vectorized so it accepts arrays
        self.Q = np.vectorize(lambda state, action: 0)
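The UniformPolicy used as the behavioural policy above is another helper that is not shown on this page. A minimal sketch, assuming collect_episodes only needs a draw_action method, could be:

import numpy as np


class UniformPolicy:
    """Behavioural policy that picks one of the discrete actions uniformly at random."""

    def __init__(self, actions):
        self.actions = np.asarray(actions)

    def draw_action(self, state, *_):
        # The state is ignored: every discrete action is equally likely.
        return np.random.choice(self.actions)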
Example #5
    def compute_optimal_policy(self, estimate_performance=True):
        if estimate_performance:
            self.average_returns = []

        self.theta = 0.
        self.theta_history = []
        self.theta_history.append(self.theta)

        self.counts = {}

        for _ in tqdm(range(self.n_itr)):
            policy = create_policy(self.policy_type, {
                **self.policy_params, 'theta': self.theta
            })

            paths = utils.collect_episodes(self.env,
                                           policy=policy,
                                           horizon=self.T,
                                           n_episodes=self.N)

            if self.exploration_bonus:
                self._update_rewards(paths)

            # performances for iteration
            if estimate_performance:
                self.average_returns.append(
                    utils.estimate_performance(paths=paths))

            self.theta += self.stepper.update(
                policy.compute_J_estimated_gradient(
                    paths,
                    self.discounts,
                    N=self.N,
                    T=self.T,
                ))
            self.theta_history.append(self.theta[0])
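Neither compute_bonus from Example #1 nor the _update_rewards hook used in Example #5 is reproduced on this page. Both appear to implement a count-based exploration bonus over a discretised state-action grid of (bins + 2)**2 cells. The sketch below is only an illustration of that idea: the 1/sqrt(count) form, the value of beta and the discretisation ranges are assumptions.

import numpy as np


def compute_bonus(states, actions, N_tot, bins=10, beta=1.0,
                  state_range=(-10., 10.), action_range=(-10., 10.)):
    """Count-based bonus beta / sqrt(N(s, a)) on a (bins + 2) x (bins + 2) grid.

    N_tot is the flat array of visit counts used in Example #1; it is
    updated in place. The default bins, beta and ranges are placeholders.
    """
    s_idx = np.digitize(np.ravel(states), np.linspace(*state_range, bins + 1))
    a_idx = np.digitize(np.ravel(actions), np.linspace(*action_range, bins + 1))
    cells = s_idx * (bins + 2) + a_idx      # flat index into the (bins + 2)**2 table
    np.add.at(N_tot, cells, 1)              # increment the visit counts in place
    return beta / np.sqrt(N_tot[cells])     # one bonus value per time step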
# Set the discount factor for the problem
discount = 0.9
# Learning rate for the gradient update
learning_rate = 0.00001

## Ex 1 :
stepper = Adam()  # we choose the Adam step
list_mean_parameters, list_avg_returns = [], []

for e in tqdm(range(epochs), desc="Simulating {}".format('random')):
    stepper.reset()
    theta = 0  # we initialize theta
    avg_return, mean_parameters = [], [theta]
    for _ in range(n_itr):
        pi = policy(theta)
        paths = utils.collect_episodes(env, policy=pi, horizon=T, n_episodes=N)
        n_paths = len(paths)
        avg = np.sum([paths[n]["rewards"] for n in range(n_paths)]) / n_paths
        grad_J = np.sum([
            gradient_J(pi, paths[n]["states"][:, 0], paths[n]["actions"][:, 0],
                       paths[n]["rewards"], discount) for n in range(n_paths)
        ]) / n_paths
        theta += stepper.update(grad_J)
        avg_return.append(avg)
        mean_parameters.append(theta)
    list_avg_returns.append(avg_return)
    list_mean_parameters.append(mean_parameters)

list_avg_returns = np.array(list_avg_returns)
list_mean_parameters = np.array(list_mean_parameters)
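The Adam() stepper used above, with its reset() and update(grad) interface, is not included in the excerpt. A minimal sketch matching that interface is the standard Adam update; the hyper-parameter values below are assumptions.

import numpy as np


class Adam:
    """Adam stepper: update(grad) returns the increment to add to theta."""

    def __init__(self, alpha=0.01, beta1=0.9, beta2=0.999, eps=1e-8):
        self.alpha, self.beta1, self.beta2, self.eps = alpha, beta1, beta2, eps
        self.reset()

    def reset(self):
        # Clear the moment estimates between independent runs (the epochs above).
        self.m, self.v, self.t = 0.0, 0.0, 0

    def update(self, grad):
        self.t += 1
        self.m = self.beta1 * self.m + (1 - self.beta1) * grad
        self.v = self.beta2 * self.v + (1 - self.beta2) * grad**2
        m_hat = self.m / (1 - self.beta1**self.t)
        v_hat = self.v / (1 - self.beta2**self.t)
        return self.alpha * m_hat / (np.sqrt(v_hat) + self.eps)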
Q_fun_ = np.vectorize(lambda s, a: env.computeQFunction(s, a, K, cov, discount, 1))
Q_fun = lambda X: Q_fun_(X[:, 0], X[:, 1])

Q_opt = Q_fun(SA)

# 3D scatter of the exact Q-function over the sampled state-action pairs
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(S, A, Q_opt)

plt.show()

#################################################################
# Collect the samples using the behavioural policy
#################################################################
# You should use discrete actions
beh_policy = UniformPolicy(actions)  # e.g. a uniform policy over the discrete action set

dataset = collect_episodes(env, n_episodes=100,
                           policy=beh_policy, horizon=horizon)

# define FQI
# to evaluate the policy you can use estimate_performance

# plot obtained Q-function against the true one


J = estimate_performance(env, policy=fqi, horizon=100, n_episodes=500, gamma=discount)
print('Policy performance: {}'.format(J))
plt.show()
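The template above leaves the FQI step open ("# define FQI"), so the fqi object passed to estimate_performance is undefined in the excerpt. The sketch below is one possible way to fill that gap with fitted Q-iteration over the collected dataset. It assumes each episode dictionary exposes states, actions, rewards and next_states arrays, that the greedy policy only needs a draw_action method, and it uses scikit-learn's ExtraTreesRegressor as the function approximator.

import numpy as np
from sklearn.ensemble import ExtraTreesRegressor


class FQI:
    """Fitted Q-iteration on a fixed dataset of collected episodes."""

    def __init__(self, actions, gamma, n_iterations=50):
        self.actions = np.asarray(actions)
        self.gamma = gamma
        self.n_iterations = n_iterations
        self.regressor = ExtraTreesRegressor(n_estimators=50)

    def fit(self, dataset):
        # Flatten the episodes into (s, a, r, s') transitions.
        s = np.concatenate([np.ravel(p["states"]) for p in dataset])
        a = np.concatenate([np.ravel(p["actions"]) for p in dataset])
        r = np.concatenate([np.ravel(p["rewards"]) for p in dataset])
        s_next = np.concatenate([np.ravel(p["next_states"]) for p in dataset])
        X = np.column_stack([s, a])
        y = r.copy()  # first iteration: Q_1(s, a) = immediate reward
        for _ in range(self.n_iterations):
            self.regressor.fit(X, y)
            # Bootstrapped target: r + gamma * max_a' Q(s', a')
            q_next = np.column_stack([
                self.regressor.predict(
                    np.column_stack([s_next, np.full_like(s_next, act)]))
                for act in self.actions
            ])
            y = r + self.gamma * q_next.max(axis=1)

    def Q(self, state, action):
        return self.regressor.predict(
            np.column_stack([np.ravel(state), np.ravel(action)]))

    def draw_action(self, state, *_):
        # Greedy action with respect to the fitted Q-function.
        q_values = [self.Q(state, act)[0] for act in self.actions]
        return self.actions[int(np.argmax(q_values))]


fqi = FQI(actions, gamma=discount)  # actions: the discrete action set assumed above
fqi.fit(dataset)

Plotting fqi.Q over the same state-action grid used for Q_opt then gives the comparison requested by the last comment of the template.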
def reinforce(env,
              N,
              T,
              n_itr,
              discount,
              learning_rate,
              init_policy,
              update_fun="fixed",
              from_scratch=True,
              returns=None,
              parameters=None,
              bonus=False):

    # Initialize the update function
    assert update_fun in ["annealing", "fixed"]
    if update_fun == "annealing":
        update = lambda x, t: x * learning_rate / (t + 1)
    else:
        update = lambda x, t: x * learning_rate

    # Initialize the output or use the ones provided
    if not from_scratch:
        assert (returns is not None) and (parameters is not None)
        returns_dt = returns
        parameters_dt = parameters
    else:
        returns_dt = pd.DataFrame(
            columns=["Learning Rate", "Stepper", "N", "Iteration", "Returns"])
        parameters_dt = pd.DataFrame(
            columns=["Learning Rate", "Stepper", "N", "Iteration", "Params"])

    policy = init_policy

    # Main loop
    theta = policy.theta
    for j in tqdm.tqdm(range(n_itr)):
        paths = utils.collect_episodes(env,
                                       policy=policy,
                                       horizon=T,
                                       n_episodes=N,
                                       bonus=bonus)

        # REINFORCE estimates
        returns_tab = []
        grad_J_tab = []
        for p in paths:
            grad_J_episode = 0
            discounted_returns = sum(
                [discount**i * p["rewards"][i] for i in range(T)])
            returns_tab.append(discounted_returns)
            returns_dt = returns_dt.append(
                {
                    'Learning Rate': str("10^{}".format(
                        int(log10(learning_rate)))),
                    'Stepper': update_fun,
                    'N': ". {}".format(str(N)),
                    'Iteration': j,
                    "Returns": discounted_returns
                },
                ignore_index=True)
            for i in range(T):
                grad_J_episode += policy.grad_log(
                    p["actions"][i], p["states"][i]) * sum(
                        [discount**k * p["rewards"][k] for k in range(i, T)])
            grad_J_tab.append(grad_J_episode)
            parameters_dt = parameters_dt.append(
                {
                    'Learning Rate': str("10^{}".format(
                        int(log10(learning_rate)))),
                    'Stepper': update_fun,
                    'N': ". {}".format(str(N)),
                    'Iteration': j,
                    "Params": policy.theta + update(grad_J_episode[0], j)
                },
                ignore_index=True)
        grad_J = np.mean(grad_J_tab)

        # Update policy parameter
        theta = theta + update(grad_J, j)
        policy.set_theta(theta)

    return returns_dt, parameters_dt
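One portability note: DataFrame.append, used above to accumulate the returns_dt and parameters_dt rows, was removed in pandas 2.0. With a recent pandas the same row-wise accumulation can be written with pd.concat, e.g.:

import pandas as pd

returns_dt = pd.DataFrame(columns=["Iteration", "Returns"])
row = {"Iteration": 0, "Returns": 0.0}
# pandas >= 2.0 replacement for returns_dt.append(row, ignore_index=True)
returns_dt = pd.concat([returns_dt, pd.DataFrame([row])], ignore_index=True)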