Example #1
class SARSALambdaContinuous(TD):
    """
    Continuous version of the SARSA(lambda) algorithm.

    """
    def __init__(self, approximator, policy, mdp_info, params, features):
        self.Q = Regressor(approximator, **params['approximator_params'])
        self.e = np.zeros(self.Q.weights_size)
        self._lambda = params['algorithm_params']['lambda']

        super(SARSALambdaContinuous, self).__init__(self.Q, policy, mdp_info,
                                                    params, features)

    def _update(self, state, action, reward, next_state, absorbing):
        phi_state = self.phi(state)
        q_current = self.Q.predict(phi_state, action)

        alpha = self.alpha(state, action)

        self.e = self.mdp_info.gamma * self._lambda * self.e + self.Q.diff(
            phi_state, action)

        self._next_action = self.draw_action(next_state)
        phi_next_state = self.phi(next_state)
        q_next = self.Q.predict(phi_next_state,
                                self._next_action) if not absorbing else 0.

        delta = reward + self.mdp_info.gamma * q_next - q_current

        theta = self.Q.get_weights()
        theta += alpha * delta * self.e
        self.Q.set_weights(theta)

    def episode_start(self):
        self.e = np.zeros(self.Q.weights_size)
Example #2
def learn(alg, alg_params):
    mdp = LQR.generate(dimensions=1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    approximator_params = dict(input_dim=mdp.info.observation_space.shape)
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape,
                      params=approximator_params)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    agent = alg(policy, mdp.info, **alg_params)

    core = Core(agent, mdp)

    core.learn(n_episodes=10, n_episodes_per_fit=5)

    return policy
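The helper above takes the algorithm class and its keyword arguments and returns the trained policy. A minimal, hypothetical call, assuming the REINFORCE and AdaptiveParameter objects used further down this page are imported in the same test module; the learning-rate value is arbitrary:

# Hypothetical usage of learn(); mirrors the REINFORCE setup shown later on
# this page, with an arbitrary learning-rate value.
algorithm_params = dict(learning_rate=AdaptiveParameter(value=.01))
trained_policy = learn(REINFORCE, algorithm_params)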
Example #3
class TrueOnlineSARSALambda(TD):
    """
    True Online SARSA(lambda) with linear function approximation.
    "True Online TD(lambda)". Seijen H. V. et al.. 2014.

    """
    def __init__(self, policy, mdp_info, learning_rate, lambda_coeff,
                 features, approximator_params=None):
        """
        Constructor.

        Args:
            lambda_coeff (float): eligibility trace coefficient.

        """
        self._approximator_params = dict() if approximator_params is None else \
            approximator_params

        self.Q = Regressor(LinearApproximator, **self._approximator_params)
        self.e = np.zeros(self.Q.weights_size)
        self._lambda = lambda_coeff
        self._q_old = None

        super(TrueOnlineSARSALambda, self).__init__(self.Q, policy, mdp_info,
                                                    learning_rate, features)

    def _update(self, state, action, reward, next_state, absorbing):
        phi_state = self.phi(state)
        phi_state_action = get_action_features(phi_state, action,
                                               self.mdp_info.action_space.n)
        q_current = self.Q.predict(phi_state, action)

        if self._q_old is None:
            self._q_old = q_current

        alpha = self.alpha(state, action)

        e_phi = self.e.dot(phi_state_action)
        self.e = self.mdp_info.gamma * self._lambda * self.e + alpha * (
            1. - self.mdp_info.gamma * self._lambda * e_phi) * phi_state_action

        self.next_action = self.draw_action(next_state)
        phi_next_state = self.phi(next_state)
        q_next = self.Q.predict(phi_next_state,
                                self.next_action) if not absorbing else 0.

        delta = reward + self.mdp_info.gamma * q_next - self._q_old

        theta = self.Q.get_weights()
        theta += delta * self.e + alpha * (
            self._q_old - q_current) * phi_state_action
        self.Q.set_weights(theta)

        self._q_old = q_next

    def episode_start(self):
        self._q_old = None
        self.e = np.zeros(self.Q.weights_size)
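What sets this update apart from the plain SARSA(lambda) above is the dutch eligibility trace and the stored q_old correction term. A standalone NumPy sketch of one such update with made-up feature vectors and arbitrary gamma, lambda and alpha, purely to show the arithmetic in isolation:

import numpy as np

# Illustrative values only; phi_sa stands in for the action features built by
# get_action_features in _update above.
gamma, lam, alpha = 0.99, 0.9, 0.1
phi_sa = np.array([1., 0., 1.])        # features of the current state-action pair
e = np.zeros_like(phi_sa)              # eligibility trace
theta = np.zeros_like(phi_sa)          # linear Q weights
q_old = 0.

# One step of the true online update, mirroring _update line by line.
q_current = theta.dot(phi_sa)
e = gamma * lam * e + alpha * (1. - gamma * lam * e.dot(phi_sa)) * phi_sa
reward, q_next = 1., 0.                # placeholder transition outcome
delta = reward + gamma * q_next - q_old
theta += delta * e + alpha * (q_old - q_current) * phi_sa
q_old = q_next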
Example #4
class SARSALambdaContinuous(TD):
    """
    Continuous version of the SARSA(lambda) algorithm.

    """
    def __init__(self,
                 approximator,
                 policy,
                 mdp_info,
                 learning_rate,
                 lambda_coeff,
                 features,
                 approximator_params=None):
        """
        Constructor.

        Args:
            lambda_coeff (float): eligibility trace coefficient.

        """
        self._approximator_params = dict() if approximator_params is None else \
            approximator_params

        self.Q = Regressor(approximator, **self._approximator_params)
        self.e = np.zeros(self.Q.weights_size)
        self._lambda = lambda_coeff

        super().__init__(self.Q, policy, mdp_info, learning_rate, features)

    def _update(self, state, action, reward, next_state, absorbing):
        phi_state = self.phi(state)
        q_current = self.Q.predict(phi_state, action)

        alpha = self.alpha(state, action)

        self.e = self.mdp_info.gamma * self._lambda * self.e + self.Q.diff(
            phi_state, action)

        self.next_action = self.draw_action(next_state)
        phi_next_state = self.phi(next_state)
        q_next = self.Q.predict(phi_next_state,
                                self.next_action) if not absorbing else 0.

        delta = reward + self.mdp_info.gamma * q_next - q_current

        theta = self.Q.get_weights()
        theta += alpha * delta * self.e
        self.Q.set_weights(theta)

    def episode_start(self):
        self.e = np.zeros(self.Q.weights_size)

        super().episode_start()
Example #5
class TrueOnlineSARSALambda(TD):
    """
    True Online SARSA(lambda) with linear function approximation.
    "True Online TD(lambda)". Seijen H. V. et al.. 2014.

    """
    def __init__(self, policy, mdp_info, params, features):
        self.Q = Regressor(LinearApproximator, **params['approximator_params'])
        self.e = np.zeros(self.Q.weights_size)
        self._lambda = params['algorithm_params']['lambda']
        self._q_old = None

        super(TrueOnlineSARSALambda, self).__init__(self.Q, policy, mdp_info,
                                                    params, features)

    def _update(self, state, action, reward, next_state, absorbing):
        phi_state = self.phi(state)
        phi_state_action = get_action_features(phi_state, action,
                                               self.mdp_info.action_space.n)
        q_current = self.Q.predict(phi_state, action)

        if self._q_old is None:
            self._q_old = q_current

        alpha = self.alpha(state, action)

        e_phi = self.e.dot(phi_state_action)
        self.e = self.mdp_info.gamma * self._lambda * self.e + alpha * (
            1. - self.mdp_info.gamma * self._lambda * e_phi) * phi_state_action

        self._next_action = self.draw_action(next_state)
        phi_next_state = self.phi(next_state)
        q_next = self.Q.predict(phi_next_state,
                                self._next_action) if not absorbing else 0.

        delta = reward + self.mdp_info.gamma * q_next - self._q_old

        theta = self.Q.get_weights()
        theta += delta * self.e + alpha * (self._q_old -
                                           q_current) * phi_state_action
        self.Q.set_weights(theta)

        self._q_old = q_next

    def episode_start(self):
        self._q_old = None
        self.e = np.zeros(self.Q.weights_size)
Example #6
class SARSALambdaContinuous(TD):
    """
    Continuous version of the SARSA(lambda) algorithm.

    """
    def __init__(self, approximator, policy, mdp_info, learning_rate,
                 lambda_coeff, features, approximator_params=None):
        """
        Constructor.

        Args:
            lambda_coeff (float): eligibility trace coefficient.

        """
        self._approximator_params = dict() if approximator_params is None else \
            approximator_params

        self.Q = Regressor(approximator, **self._approximator_params)
        self.e = np.zeros(self.Q.weights_size)
        self._lambda = lambda_coeff

        super(SARSALambdaContinuous, self).__init__(self.Q, policy, mdp_info,
                                                    learning_rate, features)

    def _update(self, state, action, reward, next_state, absorbing):
        phi_state = self.phi(state)
        q_current = self.Q.predict(phi_state, action)

        alpha = self.alpha(state, action)

        self.e = self.mdp_info.gamma * self._lambda * self.e + self.Q.diff(
            phi_state, action)

        self.next_action = self.draw_action(next_state)
        phi_next_state = self.phi(next_state)
        q_next = self.Q.predict(phi_next_state,
                                self.next_action) if not absorbing else 0.

        delta = reward + self.mdp_info.gamma * q_next - q_current

        theta = self.Q.get_weights()
        theta += alpha * delta * self.e
        self.Q.set_weights(theta)

    def episode_start(self):
        self.e = np.zeros(self.Q.weights_size)
Example #7
class SAC_AVG(Agent):
    """
    Stochastic actor-critic in the average-reward setting, as presented in:
    "Model-Free Reinforcement Learning with Continuous Action in Practice".
    Degris T. et al., 2012.

    """
    def __init__(self, policy, mdp_info, alpha_theta, alpha_v, alpha_r,
                 lambda_par=.9, value_function_features=None,
                 policy_features=None):
        """
        Constructor.

        Args:
            policy (ParametricPolicy): a differentiable stochastic policy;
            mdp_info: information about the MDP;
            alpha_theta (Parameter): learning rate for policy update;
            alpha_v (Parameter): learning rate for the value function;
            alpha_r (Parameter): learning rate for the reward trace;
            lambda_par (float, 0.9): trace decay parameter;
            value_function_features (Features, None): features used by the
                value function approximator;
            policy_features (Features, None): features used by the policy.

        """
        self._psi = value_function_features

        self._alpha_theta = alpha_theta
        self._alpha_v = alpha_v
        self._alpha_r = alpha_r

        self._lambda = lambda_par

        super().__init__(policy, mdp_info, policy_features)

        if self._psi is not None:
            input_shape = (self._psi.size,)
        else:
            input_shape = mdp_info.observation_space.shape

        self._V = Regressor(LinearApproximator, input_shape=input_shape,
                            output_shape=(1,))

        self._e_v = np.zeros(self._V.weights_size)
        self._e_theta = np.zeros(self.policy.weights_size)
        self._r_bar = 0

    def episode_start(self):
        self._e_v = np.zeros(self._V.weights_size)
        self._e_theta = np.zeros(self.policy.weights_size)

    def fit(self, dataset):
        for step in dataset:
            s, a, r, ss, absorbing, _ = step

            s_phi = self.phi(s) if self.phi is not None else s
            s_psi = self._psi(s) if self._psi is not None else s
            ss_psi = self._psi(ss) if self._psi is not None else ss

            v_next = self._V(ss_psi) if not absorbing else 0

            # Compute TD error
            delta = r - self._r_bar + v_next - self._V(s_psi)

            # Update traces
            self._r_bar += self._alpha_r() * delta
            self._e_v = self._lambda * self._e_v + s_psi
            self._e_theta = self._lambda * self._e_theta + \
                self.policy.diff_log(s_phi, a)

            # Update value function
            delta_v = self._alpha_v(s, a) * delta * self._e_v
            v_new = self._V.get_weights() + delta_v
            self._V.set_weights(v_new)

            # Update policy
            delta_theta = self._alpha_theta(s, a) * delta * self._e_theta
            theta_new = self.policy.get_weights() + delta_theta
            self.policy.set_weights(theta_new)
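Unlike the discounted critics above, this average-reward variant replaces the discount factor with a running reward estimate: the TD error subtracts r_bar, which is itself updated from the same error. A tiny NumPy sketch of those lines of fit() with scalar placeholders and arbitrary numbers:

import numpy as np

# Placeholder quantities standing in for the regressor outputs used in fit().
lam, alpha_r = 0.9, 1e-4
r_bar, v_s, v_ss, r = 0.0, 0.2, 0.3, 1.0
psi_s = np.array([1., 0., 0.])
e_v = np.zeros_like(psi_s)

delta = r - r_bar + v_ss - v_s     # TD error with no discount factor
r_bar += alpha_r * delta           # update the average-reward estimate
e_v = lam * e_v + psi_s            # value-function eligibility trace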
Example #8
def learn(alg):
    n_steps = 50
    mdp = InvertedPendulum(horizon=n_steps)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Agent
    n_tilings = 2
    alpha_r = Parameter(.0001)
    alpha_theta = Parameter(.001 / n_tilings)
    alpha_v = Parameter(.1 / n_tilings)
    tilings = Tiles.generate(n_tilings - 1, [1, 1],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    tilings_v = tilings + Tiles.generate(
        1, [1, 1], mdp.info.observation_space.low,
        mdp.info.observation_space.high + 1e-3)
    psi = Features(tilings=tilings_v)

    input_shape = (phi.size, )

    mu = Regressor(LinearApproximator,
                   input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    std = Regressor(LinearApproximator,
                    input_shape=input_shape,
                    output_shape=mdp.info.action_space.shape)

    std_0 = np.sqrt(1.)
    std.set_weights(np.log(std_0) / n_tilings * np.ones(std.weights_size))

    policy = StateLogStdGaussianPolicy(mu, std)

    if alg is StochasticAC:
        agent = alg(policy,
                    mdp.info,
                    alpha_theta,
                    alpha_v,
                    lambda_par=.5,
                    value_function_features=psi,
                    policy_features=phi)
    elif alg is StochasticAC_AVG:
        agent = alg(policy,
                    mdp.info,
                    alpha_theta,
                    alpha_v,
                    alpha_r,
                    lambda_par=.5,
                    value_function_features=psi,
                    policy_features=phi)

    core = Core(agent, mdp)

    core.learn(n_episodes=2, n_episodes_per_fit=1)

    return policy
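A hypothetical call of this helper, assuming StochasticAC_AVG is one of the agent classes imported by the surrounding test module:

# The helper dispatches on the class itself, so it is called with the class,
# not an instance.
trained_policy = learn(StochasticAC_AVG)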
Example #9
class DDPG(Agent):
    """
    Deep Deterministic Policy Gradient algorithm.
    "Continuous Control with Deep Reinforcement Learning".
    Lillicrap T. P. et al., 2016.

    """
    def __init__(self, actor_approximator, critic_approximator, policy_class,
                 mdp_info, batch_size, initial_replay_size, max_replay_size,
                 tau, actor_params, critic_params, policy_params,
                 actor_fit_params=None, critic_fit_params=None):
        """
        Constructor.

        Args:
            actor_approximator (object): the approximator to use for the actor;
            critic_approximator (object): the approximator to use for the
                critic;
            policy_class (Policy): class of the policy;
            batch_size (int): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            tau (float): value of the coefficient for soft updates;
            actor_params (dict): parameters of the actor approximator to
                build;
            critic_params (dict): parameters of the critic approximator to
                build;
            policy_params (dict): parameters of the policy to build;
            actor_fit_params (dict, None): parameters of the fitting algorithm
                of the actor approximator;
            critic_fit_params (dict, None): parameters of the fitting algorithm
                of the critic approximator;

        """
        self._actor_fit_params = dict() if actor_fit_params is None else actor_fit_params
        self._critic_fit_params = dict() if critic_fit_params is None else critic_fit_params

        self._batch_size = batch_size
        self._tau = tau

        self._replay_memory = ReplayMemory(initial_replay_size, max_replay_size)

        target_critic_params = deepcopy(critic_params)
        self._critic_approximator = Regressor(critic_approximator,
                                              **critic_params)
        self._target_critic_approximator = Regressor(critic_approximator,
                                                     **target_critic_params)

        if 'loss' not in actor_params:
            actor_params['loss'] = ActorLoss(self._critic_approximator)

        target_actor_params = deepcopy(actor_params)
        self._actor_approximator = Regressor(actor_approximator,
                                             **actor_params)
        self._target_actor_approximator = Regressor(actor_approximator,
                                                    **target_actor_params)

        self._target_actor_approximator.model.set_weights(
            self._actor_approximator.model.get_weights())
        self._target_critic_approximator.model.set_weights(
            self._critic_approximator.model.get_weights())

        policy = policy_class(self._actor_approximator, **policy_params)
        super().__init__(policy, mdp_info)

    def fit(self, dataset):
        self._replay_memory.add(dataset)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _ =\
                self._replay_memory.get(self._batch_size)

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            self._critic_approximator.fit(state, action, q,
                                          **self._critic_fit_params)
            self._actor_approximator.fit(state, state,
                                         **self._actor_fit_params)

            self._update_target()

    def _update_target(self):
        """
        Update the target networks.

        """
        critic_weights = self._tau * self._critic_approximator.model.get_weights()
        critic_weights += (1 - self._tau) * self._target_critic_approximator.get_weights()
        self._target_critic_approximator.set_weights(critic_weights)

        actor_weights = self._tau * self._actor_approximator.model.get_weights()
        actor_weights += (1 - self._tau) * self._target_actor_approximator.get_weights()
        self._target_actor_approximator.set_weights(actor_weights)

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where the next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Action-values returned by the critic for ``next_state`` and the
            action returned by the actor.

        """
        a = self._target_actor_approximator(next_state)
        q = self._target_critic_approximator.predict(next_state, a)
        q *= 1 - absorbing

        return q
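_update_target above performs a soft (Polyak) update of the target networks. The same blending with plain arrays and an arbitrary tau, just to make the arithmetic explicit:

import numpy as np

tau = 1e-3                                  # arbitrary soft-update coefficient
online_weights = np.array([1.0, 2.0, 3.0])  # stand-in for the online network weights
target_weights = np.zeros(3)                # stand-in for the target network weights

# Each call moves the target a small step towards the online weights.
target_weights = tau * online_weights + (1 - tau) * target_weights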
Example #10
def experiment(n_epochs, n_episodes):
    np.random.seed()

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 11
    alpha_r = Parameter(.0001)
    alpha_theta = Parameter(.001 / n_tilings)
    alpha_v = Parameter(.1 / n_tilings)
    tilings = Tiles.generate(n_tilings-1, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    tilings_v = tilings + Tiles.generate(1, [1, 1],
                                         mdp.info.observation_space.low,
                                         mdp.info.observation_space.high + 1e-3)
    psi = Features(tilings=tilings_v)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    std = Regressor(LinearApproximator, input_shape=input_shape,
                    output_shape=mdp.info.action_space.shape)

    std_0 = np.sqrt(1.)
    std.set_weights(np.log(std_0) / n_tilings * np.ones(std.weights_size))

    policy = StateLogStdGaussianPolicy(mu, std)

    agent = SAC_AVG(policy, mdp.info,
                    alpha_theta, alpha_v, alpha_r,
                    lambda_par=.5,
                    value_function_features=psi,
                    policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    display_callback = Display(agent._V, mu, std,
                               mdp.info.observation_space.low,
                               mdp.info.observation_space.high,
                               phi, psi)
    core = Core(agent, mdp, callbacks=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes,
                   n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.)
        dataset_callback.clean()
        display_callback()
        print('Mean Reward at iteration ' + str(i) + ': ' +
              str(np.sum(J) / n_steps/n_episodes))

    print('Press a button to visualize the pendulum...')
    input()
    core.evaluate(n_steps=n_steps, render=True)
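A hypothetical entry point for the script above; the epoch and episode counts are arbitrary and only illustrate the call signature:

if __name__ == '__main__':
    experiment(n_epochs=10, n_episodes=100)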
Example #11
class DDPG(ReparametrizationAC):
    """
    Deep Deterministic Policy Gradient algorithm.
    "Continuous Control with Deep Reinforcement Learning".
    Lillicrap T. P. et al., 2016.

    """
    def __init__(self, mdp_info, policy_class, policy_params,
                 batch_size, initial_replay_size, max_replay_size,
                 tau, critic_params, actor_params, actor_optimizer,
                 policy_delay=1, critic_fit_params=None):
        """
        Constructor.

        Args:
            policy_class (Policy): class of the policy;
            policy_params (dict): parameters of the policy to build;
            batch_size (int): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            tau (float): value of the coefficient for soft updates;
            actor_params (dict): parameters of the actor approximator to
                build;
            critic_params (dict): parameters of the critic approximator to
                build;
            actor_optimizer (dict): parameters to specify the actor optimizer
                algorithm;
            policy_delay (int, 1): the number of updates of the critic after
                which an actor update is implemented;
            critic_fit_params (dict, None): parameters of the fitting algorithm
                of the critic approximator;

        """

        self._critic_fit_params = dict() if critic_fit_params is None else critic_fit_params

        self._batch_size = batch_size
        self._tau = tau
        self._policy_delay = policy_delay
        self._fit_count = 0

        self._replay_memory = ReplayMemory(initial_replay_size, max_replay_size)

        target_critic_params = deepcopy(critic_params)
        self._critic_approximator = Regressor(TorchApproximator,
                                              **critic_params)
        self._target_critic_approximator = Regressor(TorchApproximator,
                                                     **target_critic_params)

        target_actor_params = deepcopy(actor_params)
        self._actor_approximator = Regressor(TorchApproximator,
                                             **actor_params)
        self._target_actor_approximator = Regressor(TorchApproximator,
                                                    **target_actor_params)

        self._init_target()

        policy = policy_class(self._actor_approximator, **policy_params)

        policy_parameters = self._actor_approximator.model.network.parameters()
        super().__init__(policy, mdp_info, actor_optimizer, policy_parameters)

    def fit(self, dataset):
        self._replay_memory.add(dataset)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _ =\
                self._replay_memory.get(self._batch_size)

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            self._critic_approximator.fit(state, action, q,
                                          **self._critic_fit_params)

            if self._fit_count % self._policy_delay == 0:
                loss = self._loss(state)
                self._optimize_actor_parameters(loss)

            self._update_target()

            self._fit_count += 1

    def _loss(self, state):
        action = self._actor_approximator(state, output_tensor=True)
        q = self._critic_approximator(state, action, output_tensor=True)

        return -q.mean()

    def _init_target(self):
        """
        Initialize the weights of the target approximators.

        """
        self._target_actor_approximator.set_weights(
            self._actor_approximator.get_weights())
        self._target_critic_approximator.set_weights(
            self._critic_approximator.get_weights())

    def _update_target(self):
        """
        Update the target networks.

        """
        critic_weights = self._tau * self._critic_approximator.get_weights()
        critic_weights += (1 - self._tau) * self._target_critic_approximator.get_weights()
        self._target_critic_approximator.set_weights(critic_weights)

        actor_weights = self._tau * self._actor_approximator.get_weights()
        actor_weights += (1 - self._tau) * self._target_actor_approximator.get_weights()
        self._target_actor_approximator.set_weights(actor_weights)

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where the next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Action-values returned by the critic for ``next_state`` and the
            action returned by the actor.

        """
        a = self._target_actor_approximator(next_state)

        q = self._target_critic_approximator.predict(next_state, a)
        q *= 1 - absorbing

        return q
Example #12
class COPDAC_Q(Agent):
    """
    Compatible off-policy deterministic actor-critic algorithm.
    "Deterministic Policy Gradient Algorithms".
    Silver D. et al., 2014.
    
    """
    def __init__(self,
                 policy,
                 mu,
                 mdp_info,
                 alpha_theta,
                 alpha_omega,
                 alpha_v,
                 value_function_features=None,
                 policy_features=None):
        self._mu = mu
        self._psi = value_function_features

        self._alpha_theta = alpha_theta
        self._alpha_omega = alpha_omega
        self._alpha_v = alpha_v

        if self._psi is not None:
            input_shape = (self._psi.size, )
        else:
            input_shape = mdp_info.observation_space.shape

        self._V = Regressor(LinearApproximator,
                            input_shape=input_shape,
                            output_shape=(1, ))

        self._A = Regressor(LinearApproximator,
                            input_shape=(self._mu.weights_size, ),
                            output_shape=(1, ))

        super().__init__(policy, mdp_info, policy_features)

    def fit(self, dataset):
        for step in dataset:
            s, a, r, ss, absorbing, _ = step

            s_phi = self.phi(s) if self.phi is not None else s
            s_psi = self._psi(s) if self._psi is not None else s
            ss_psi = self._psi(ss) if self._psi is not None else ss

            q_next = self._V(ss_psi).item() if not absorbing else 0

            grad_mu_s = np.atleast_2d(self._mu.diff(s_phi))
            omega = self._A.get_weights()

            delta = r + self.mdp_info.gamma * q_next - self._Q(s, a)
            delta_theta = self._alpha_theta(s, a) * \
                omega.dot(grad_mu_s.T).dot(grad_mu_s)
            delta_omega = self._alpha_omega(s, a) * delta * self._nu(s, a)
            delta_v = self._alpha_v(s, a) * delta * s_psi

            theta_new = self._mu.get_weights() + delta_theta
            self._mu.set_weights(theta_new)

            omega_new = omega + delta_omega
            self._A.set_weights(omega_new)

            v_new = self._V.get_weights() + delta_v
            self._V.set_weights(v_new)

    def _Q(self, state, action):
        state_psi = self._psi(state) if self._psi is not None else state

        return self._V(state_psi).item() + self._A(self._nu(state,
                                                            action)).item()

    def _nu(self, state, action):
        state_phi = self.phi(state) if self.phi is not None else state
        grad_mu = np.atleast_2d(self._mu.diff(state_phi))
        delta = action - self._mu(state_phi)

        return delta.dot(grad_mu)
Example #13
class COPDAC_Q(Agent):
    def __init__(self,
                 policy,
                 mu,
                 mdp_info,
                 alpha_theta,
                 alpha_omega,
                 alpha_v,
                 value_function_features=None,
                 policy_features=None):

        self._mu = mu
        self._psi = value_function_features

        self._alpha_theta = alpha_theta
        self._alpha_omega = alpha_omega
        self._alpha_v = alpha_v

        if value_function_features is not None:
            input_shape = (self._psi.size, )
        else:
            input_shape = mdp_info.observation_space.shape

        self._V = Regressor(LinearApproximator,
                            input_shape=input_shape,
                            output_shape=(1, ))

        self._A = Regressor(LinearApproximator,
                            input_shape=(self._mu.weights_size, ),
                            output_shape=(1, ))

        super().__init__(policy, mdp_info, policy_features)

    def fit(self, dataset):
        for step in dataset:
            s, a, r, ss, absorbing, _ = step

            s_phi = self.phi(s) if self.phi is not None else s
            s_psi = self._psi(s) if self._psi is not None else s
            ss_phi = self.phi(ss) if self.phi is not None else ss

            q_next = self._Q(ss, self._mu(ss_phi)) if not absorbing else 0

            grad_mu_s = np.atleast_2d(self._mu.diff(s_phi))
            omega = self._A.get_weights()

            delta = r + self.mdp_info.gamma * q_next - self._Q(s, a)
            delta_theta = self._alpha_theta(s, a) * grad_mu_s.T.dot(
                grad_mu_s.dot(omega))
            delta_omega = self._alpha_omega(s, a) * delta * self._nu(s, a)
            delta_v = self._alpha_v(s, a) * delta * s_psi

            theta_new = self._mu.get_weights() + delta_theta
            self._mu.set_weights(theta_new)

            omega_new = omega + delta_omega
            self._A.set_weights(omega_new)

            v_new = self._V.get_weights() + delta_v
            self._V.set_weights(v_new)

            # print('V max:', np.max(v_new))
            # print('V min:', np.min(v_new))
            # print('A max:', np.max(omega_new))
            # print('A min:', np.min(omega_new))

    def _Q(self, state, action):
        state_psi = self._psi(state) if self._psi is not None else state
        return self._V(state_psi) + self._A(self._nu(state, action))

    def _nu(self, state, action):
        state_phi = self.phi(state) if self.phi is not None else state
        grad_mu = np.atleast_2d(self._mu.diff(state_phi))
        delta = action - self._mu(state_phi)

        return delta.dot(grad_mu)
Example #14
class COPDAC_Q(Agent):
    def __init__(self, policy, mu, mdp_info, alpha_theta, alpha_omega, alpha_v,
                 value_function_features=None, policy_features=None):
        self._mu = mu
        self._psi = value_function_features

        self._alpha_theta = alpha_theta
        self._alpha_omega = alpha_omega
        self._alpha_v = alpha_v

        if self._psi is not None:
            input_shape = (self._psi.size,)
        else:
            input_shape = mdp_info.observation_space.shape

        self._V = Regressor(LinearApproximator, input_shape=input_shape,
                            output_shape=(1,))

        self._A = Regressor(LinearApproximator,
                            input_shape=(self._mu.weights_size,),
                            output_shape=(1,))

        super().__init__(policy, mdp_info, policy_features)

    def fit(self, dataset):
        for step in dataset:
            s, a, r, ss, absorbing, _ = step

            s_phi = self.phi(s) if self.phi is not None else s
            s_psi = self._psi(s) if self._psi is not None else s
            ss_psi = self._psi(ss) if self._psi is not None else ss

            q_next = self._V(ss_psi).item() if not absorbing else 0

            grad_mu_s = np.atleast_2d(self._mu.diff(s_phi))
            omega = self._A.get_weights()

            delta = r + self.mdp_info.gamma * q_next - self._Q(s, a)
            delta_theta = self._alpha_theta(s, a) * \
                omega.dot(grad_mu_s.T).dot(grad_mu_s)
            delta_omega = self._alpha_omega(s, a) * delta * self._nu(s, a)
            delta_v = self._alpha_v(s, a) * delta * s_psi

            theta_new = self._mu.get_weights() + delta_theta
            self._mu.set_weights(theta_new)

            omega_new = omega + delta_omega
            self._A.set_weights(omega_new)

            v_new = self._V.get_weights() + delta_v
            self._V.set_weights(v_new)

    def _Q(self, state, action):
        state_psi = self._psi(state) if self._psi is not None else state

        return self._V(state_psi).item() + \
            self._A(self._nu(state, action)).item()

    def _nu(self, state, action):
        state_phi = self.phi(state) if self.phi is not None else state
        grad_mu = np.atleast_2d(self._mu.diff(state_phi))
        delta = action - self._mu(state_phi)

        return delta.dot(grad_mu)
Example #15
class COPDAC_Q(Agent):
    """
    Compatible off-policy deterministic actor-critic algorithm.
    "Deterministic Policy Gradient Algorithms".
    Silver D. et al., 2014.

    """
    def __init__(self,
                 policy,
                 mu,
                 mdp_info,
                 alpha_theta,
                 alpha_omega,
                 alpha_v,
                 value_function_features=None,
                 policy_features=None):
        """
        Constructor.

        Args:
            policy (Policy): any exploration policy, possibly using the deterministic
                policy as mean regressor;
            mu (Regressor): regressor that describes the deterministic policy to be
                learned, i.e., the deterministic mapping between state and action;
            alpha_theta (Parameter): learning rate for policy update;
            alpha_omega (Parameter): learning rate for the advantage function;
            alpha_v (Parameter): learning rate for the value function;
            value_function_features (Features, None): features used by the value
                function approximator;
            policy_features (Features, None): features used by the policy.

        """
        self._mu = mu
        self._psi = value_function_features

        self._alpha_theta = alpha_theta
        self._alpha_omega = alpha_omega
        self._alpha_v = alpha_v

        if self._psi is not None:
            input_shape = (self._psi.size, )
        else:
            input_shape = mdp_info.observation_space.shape

        self._V = Regressor(LinearApproximator,
                            input_shape=input_shape,
                            output_shape=(1, ))

        self._A = Regressor(LinearApproximator,
                            input_shape=(self._mu.weights_size, ),
                            output_shape=(1, ))

        super().__init__(policy, mdp_info, policy_features)

    def fit(self, dataset):
        for step in dataset:
            s, a, r, ss, absorbing, _ = step

            s_phi = self.phi(s) if self.phi is not None else s
            s_psi = self._psi(s) if self._psi is not None else s
            ss_psi = self._psi(ss) if self._psi is not None else ss

            q_next = self._V(ss_psi).item() if not absorbing else 0

            grad_mu_s = np.atleast_2d(self._mu.diff(s_phi))
            omega = self._A.get_weights()

            delta = r + self.mdp_info.gamma * q_next - self._Q(s, a)
            delta_theta = self._alpha_theta(s, a) * \
                          omega.dot(grad_mu_s.T).dot(grad_mu_s)
            delta_omega = self._alpha_omega(s, a) * delta * self._nu(s, a)
            delta_v = self._alpha_v(s, a) * delta * s_psi

            theta_new = self._mu.get_weights() + delta_theta
            self._mu.set_weights(theta_new)

            omega_new = omega + delta_omega
            self._A.set_weights(omega_new)

            v_new = self._V.get_weights() + delta_v
            self._V.set_weights(v_new)

    def _Q(self, state, action):
        state_psi = self._psi(state) if self._psi is not None else state

        return self._V(state_psi).item() + self._A(self._nu(state,
                                                            action)).item()

    def _nu(self, state, action):
        state_phi = self.phi(state) if self.phi is not None else state
        grad_mu = np.atleast_2d(self._mu.diff(state_phi))
        delta = action - self._mu(state_phi)

        return delta.dot(grad_mu)
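The compatible advantage features built by _nu combine the gradient of the deterministic policy with the deviation of the executed action from mu(s). A self-contained NumPy sketch for a linear policy mu(s) = W s, with made-up numbers:

import numpy as np

W = np.array([[0.5, -0.2]])        # policy weights: 1 action, 2 state features
s_phi = np.array([1.0, 2.0])       # state features phi(s)
a = np.array([0.3])                # executed (exploratory) action

mu_s = W.dot(s_phi)                # deterministic action mu(s)
grad_mu = np.atleast_2d(s_phi)     # d mu / d weights for a linear policy
nu = (a - mu_s).dot(grad_mu)       # compatible features, one entry per policy weight

# The advantage is then modelled as A(s, a) = omega.dot(nu), as in the _A regressor.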
Example #16
class TrueOnlineSARSALambda(TD):
    """
    True Online SARSA(lambda) with linear function approximation.
    "True Online TD(lambda)". Seijen H. V. et al.. 2014.

    """
    def __init__(self,
                 policy,
                 mdp_info,
                 learning_rate,
                 lambda_coeff,
                 features,
                 approximator_params=None):
        """
        Constructor.

        Args:
            lambda_coeff (float): eligibility trace coefficient.

        """
        self._approximator_params = dict() if approximator_params is None else \
            approximator_params

        self.Q = Regressor(LinearApproximator, **self._approximator_params)
        self.e = np.zeros(self.Q.weights_size)
        self._lambda = lambda_coeff
        self._q_old = None

        super().__init__(self.Q, policy, mdp_info, learning_rate, features)

    def _update(self, state, action, reward, next_state, absorbing):
        phi_state = self.phi(state)
        phi_state_action = get_action_features(phi_state, action,
                                               self.mdp_info.action_space.n)
        q_current = self.Q.predict(phi_state, action)

        if self._q_old is None:
            self._q_old = q_current

        alpha = self.alpha(state, action)

        e_phi = self.e.dot(phi_state_action)
        self.e = self.mdp_info.gamma * self._lambda * self.e + alpha * (
            1. - self.mdp_info.gamma * self._lambda * e_phi) * phi_state_action

        self.next_action = self.draw_action(next_state)
        phi_next_state = self.phi(next_state)
        q_next = self.Q.predict(phi_next_state,
                                self.next_action) if not absorbing else 0.

        delta = reward + self.mdp_info.gamma * q_next - self._q_old

        theta = self.Q.get_weights()
        theta += delta * self.e + alpha * (self._q_old -
                                           q_current) * phi_state_action
        self.Q.set_weights(theta)

        self._q_old = q_next

    def episode_start(self):
        self._q_old = None
        self.e = np.zeros(self.Q.weights_size)

        super().episode_start()
Example #17
mdp = LQR.generate(dimensions=1)

approximator_params = dict(input_dim=mdp.info.observation_space.shape)
approximator = Regressor(LinearApproximator,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=mdp.info.action_space.shape,
                         params=approximator_params)

sigma = Regressor(LinearApproximator,
                  input_shape=mdp.info.observation_space.shape,
                  output_shape=mdp.info.action_space.shape,
                  params=approximator_params)

sigma_weights = 2 * np.ones(sigma.weights_size)
sigma.set_weights(sigma_weights)

policy = StateStdGaussianPolicy(approximator, sigma)

# Agent
learning_rate = AdaptiveParameter(value=.01)
algorithm_params = dict(learning_rate=learning_rate)

agent_test = REINFORCE(policy, mdp.info, **algorithm_params)
core = Core(agent_test, mdp)

s = np.arange(10)
a = np.arange(10)
r = np.arange(10)
ss = s + 5
ab = np.array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0])
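The arrays above look like the raw ingredients of a hand-built dataset. A hedged sketch of how they could be packed into the six-field step format that the fit() methods on this page unpack (state, action, reward, next state, absorbing, last); treating the last flag as equal to the absorbing flag here is an assumption made purely for illustration:

# Hypothetical packing of s, a, r, ss, ab into the step format used by the
# fit() methods above; the 'last' flag is assumed for illustration only.
dataset = [
    (np.atleast_1d(si), np.atleast_1d(ai), float(ri), np.atleast_1d(ssi),
     bool(abi), bool(abi))
    for si, ai, ri, ssi, abi in zip(s, a, r, ss, ab)
]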