def learn(alg, alg_params):
    mdp = LQR.generate(dimensions=1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    approximator_params = dict(input_dim=mdp.info.observation_space.shape)
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape,
                      params=approximator_params)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    agent = alg(policy, mdp.info, **alg_params)

    core = Core(agent, mdp)

    core.learn(n_episodes=10, n_episodes_per_fit=5)

    return policy
def __init__(self, approximator, policy, mdp_info, params, features):
    self.Q = Regressor(approximator, **params['approximator_params'])
    self.e = np.zeros(self.Q.weights_size)
    self._lambda = params['algorithm_params']['lambda']

    super(SARSALambdaContinuous, self).__init__(self.Q, policy, mdp_info,
                                                params, features)
def __init__(self, policy, mu, mdp_info, alpha_theta, alpha_omega, alpha_v,
             value_function_features=None, policy_features=None):
    self._mu = mu
    self._psi = value_function_features

    self._alpha_theta = alpha_theta
    self._alpha_omega = alpha_omega
    self._alpha_v = alpha_v

    if self._psi is not None:
        input_shape = (self._psi.size, )
    else:
        input_shape = mdp_info.observation_space.shape

    self._V = Regressor(LinearApproximator, input_shape=input_shape,
                        output_shape=(1, ))

    self._A = Regressor(LinearApproximator,
                        input_shape=(self._mu.weights_size, ),
                        output_shape=(1, ))

    super().__init__(policy, mdp_info, policy_features)
def __init__(self, policy, mdp_info, params, features):
    self.Q = Regressor(LinearApproximator, **params['approximator_params'])
    self.e = np.zeros(self.Q.weights_size)
    self._lambda = params['algorithm_params']['lambda']
    self._q_old = None

    super(TrueOnlineSARSALambda, self).__init__(self.Q, policy, mdp_info,
                                                params, features)
class TrueOnlineSARSALambda(TD):
    """
    True Online SARSA(lambda) with linear function approximation.
    "True Online TD(lambda)". Seijen H. V. et al.. 2014.

    """
    def __init__(self, policy, mdp_info, learning_rate, lambda_coeff,
                 features, approximator_params=None):
        """
        Constructor.

        Args:
            lambda_coeff (float): eligibility trace coefficient.

        """
        self._approximator_params = dict() if approximator_params is None else \
            approximator_params

        self.Q = Regressor(LinearApproximator, **self._approximator_params)
        self.e = np.zeros(self.Q.weights_size)
        self._lambda = lambda_coeff
        self._q_old = None

        super(TrueOnlineSARSALambda, self).__init__(self.Q, policy, mdp_info,
                                                    learning_rate, features)

    def _update(self, state, action, reward, next_state, absorbing):
        phi_state = self.phi(state)
        phi_state_action = get_action_features(phi_state, action,
                                               self.mdp_info.action_space.n)
        q_current = self.Q.predict(phi_state, action)

        if self._q_old is None:
            self._q_old = q_current

        alpha = self.alpha(state, action)

        e_phi = self.e.dot(phi_state_action)
        self.e = self.mdp_info.gamma * self._lambda * self.e + alpha * (
            1. - self.mdp_info.gamma * self._lambda * e_phi) * phi_state_action

        self.next_action = self.draw_action(next_state)
        phi_next_state = self.phi(next_state)
        q_next = self.Q.predict(phi_next_state,
                                self.next_action) if not absorbing else 0.

        delta = reward + self.mdp_info.gamma * q_next - self._q_old

        theta = self.Q.get_weights()
        theta += delta * self.e + alpha * (
            self._q_old - q_current) * phi_state_action
        self.Q.set_weights(theta)

        self._q_old = q_next

    def episode_start(self):
        self._q_old = None
        self.e = np.zeros(self.Q.weights_size)
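# ---------------------------------------------------------------------------
# Minimal usage sketch for the TrueOnlineSARSALambda constructor above. It is
# an illustration, not library source: the `Gym` wrapper, `EpsGreedy` policy
# and the module paths are assumed from the other snippets in this section
# and may differ in your installed version of the package.
import numpy as np

from mushroom.algorithms.value import TrueOnlineSARSALambda
from mushroom.core import Core
from mushroom.environments import Gym
from mushroom.features import Features
from mushroom.features.tiles import Tiles
from mushroom.policy import EpsGreedy
from mushroom.utils.parameters import Parameter

# Discrete-action control task (assumed Gym wrapper).
mdp = Gym(name='MountainCar-v0', horizon=1000, gamma=1.)

# Tile-coding features over the 2D observation space.
n_tilings = 10
tilings = Tiles.generate(n_tilings, [10, 10],
                         mdp.info.observation_space.low,
                         mdp.info.observation_space.high)
features = Features(tilings=tilings)

# One set of linear weights per discrete action.
approximator_params = dict(input_shape=(features.size,),
                           output_shape=(mdp.info.action_space.n,),
                           n_actions=mdp.info.action_space.n)

pi = EpsGreedy(epsilon=Parameter(value=0.))
agent = TrueOnlineSARSALambda(pi, mdp.info,
                              learning_rate=Parameter(.1 / n_tilings),
                              lambda_coeff=.9, features=features,
                              approximator_params=approximator_params)

core = Core(agent, mdp)
core.learn(n_episodes=40, n_steps_per_fit=1)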
def __init__(self, actor_approximator, critic_approximator, policy_class,
             mdp_info, batch_size, initial_replay_size, max_replay_size,
             tau, actor_params, critic_params, policy_params,
             actor_fit_params=None, critic_fit_params=None):
    """
    Constructor.

    Args:
        actor_approximator (object): the approximator to use for the actor;
        critic_approximator (object): the approximator to use for the
            critic;
        policy_class (Policy): class of the policy;
        batch_size (int): the number of samples in a batch;
        initial_replay_size (int): the number of samples to collect before
            starting the learning;
        max_replay_size (int): the maximum number of samples in the replay
            memory;
        tau (float): value of coefficient for soft updates;
        actor_params (dict): parameters of the actor approximator to build;
        critic_params (dict): parameters of the critic approximator to
            build;
        policy_params (dict): parameters of the policy to build;
        actor_fit_params (dict, None): parameters of the fitting algorithm
            of the actor approximator;
        critic_fit_params (dict, None): parameters of the fitting algorithm
            of the critic approximator;

    """
    self._actor_fit_params = dict() if actor_fit_params is None else actor_fit_params
    self._critic_fit_params = dict() if critic_fit_params is None else critic_fit_params

    self._batch_size = batch_size
    self._tau = tau

    self._replay_memory = ReplayMemory(initial_replay_size, max_replay_size)

    target_critic_params = deepcopy(critic_params)
    self._critic_approximator = Regressor(critic_approximator,
                                          **critic_params)
    self._target_critic_approximator = Regressor(critic_approximator,
                                                 **target_critic_params)

    if 'loss' not in actor_params:
        actor_params['loss'] = ActorLoss(self._critic_approximator)

    target_actor_params = deepcopy(actor_params)
    self._actor_approximator = Regressor(actor_approximator,
                                         **actor_params)
    self._target_actor_approximator = Regressor(actor_approximator,
                                                **target_actor_params)

    self._target_actor_approximator.model.set_weights(
        self._actor_approximator.model.get_weights())
    self._target_critic_approximator.model.set_weights(
        self._critic_approximator.model.get_weights())

    policy = policy_class(self._actor_approximator, **policy_params)
    super().__init__(policy, mdp_info)
def __init__(self, policy, mdp_info, alpha_theta, alpha_v, lambda_par=.9,
             value_function_features=None, policy_features=None):
    """
    Constructor.

    Args:
        policy (ParametricPolicy): a differentiable stochastic policy;
        mdp_info: information about the MDP;
        alpha_theta (Parameter): learning rate for policy update;
        alpha_v (Parameter): learning rate for the value function;
        lambda_par (float, 0.9): trace decay parameter;
        value_function_features (Features, None): features used by the
            value function approximator;
        policy_features (Features, None): features used by the policy.

    """
    self._psi = value_function_features

    self._alpha_theta = alpha_theta
    self._alpha_v = alpha_v

    self._lambda = lambda_par

    super().__init__(policy, mdp_info, policy_features)

    if self._psi is not None:
        input_shape = (self._psi.size,)
    else:
        input_shape = mdp_info.observation_space.shape

    self._V = Regressor(LinearApproximator, input_shape=input_shape,
                        output_shape=(1,))

    self._e_v = np.zeros(self._V.weights_size)
    self._e_theta = np.zeros(self.policy.weights_size)
def __init__(self, approximator, policy, mdp_info, fit_params=None,
             approximator_params=None, features=None):
    """
    Constructor.

    Args:
        approximator (object): approximator used by the algorithm and the
            policy.
        fit_params (dict, None): parameters of the fitting algorithm of the
            approximator;
        approximator_params (dict, None): parameters of the approximator to
            build;

    """
    self._fit_params = dict() if fit_params is None else fit_params
    self._approximator_params = dict() if approximator_params is None else \
        approximator_params

    self.approximator = Regressor(approximator, **self._approximator_params)
    policy.set_q(self.approximator)

    super().__init__(policy, mdp_info, features)
def test_ornstein_uhlenbeck_policy():
    np.random.seed(88)

    mu = Regressor(LinearApproximator, input_shape=(5, ), output_shape=(2, ))

    pi = OrnsteinUhlenbeckPolicy(mu, sigma=np.ones(1) * .2, theta=.15, dt=1e-2)

    w = np.random.randn(pi.weights_size)
    pi.set_weights(w)
    assert np.array_equal(pi.get_weights(), w)

    state = np.random.randn(5)

    action = pi.draw_action(state)
    action_test = np.array([-1.95896171, 1.91292747])
    assert np.allclose(action, action_test)

    pi.reset()
    action = pi.draw_action(state)
    action_test = np.array([-1.94161061, 1.92233358])
    assert np.allclose(action, action_test)

    try:
        pi(state, action)
    except NotImplementedError:
        pass
    else:
        assert False
def __init__(self, approximator, policy, mdp_info, n_iterations,
             fit_params=None, approximator_params=None, features=None,
             quiet=False):
    """
    Constructor.

    Args:
        approximator (object): approximator used by the algorithm and the
            policy.
        n_iterations (int): number of iterations to perform for training;
        fit_params (dict, None): parameters of the fitting algorithm of the
            approximator;
        approximator_params (dict, None): parameters of the approximator to
            build;
        quiet (bool, False): whether to show the progress bar or not.

    """
    self._n_iterations = n_iterations
    self._fit_params = dict() if fit_params is None else fit_params
    self._approximator_params = dict() if approximator_params is None else \
        approximator_params
    self._quiet = quiet

    self.approximator = Regressor(approximator, **self._approximator_params)
    policy.set_q(self.approximator)

    super(BatchTD, self).__init__(policy, mdp_info, features)
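# ---------------------------------------------------------------------------
# Illustrative batch-TD usage matching the constructor above. This is a
# sketch, not library source: it assumes an `FQI` subclass of BatchTD with
# this signature, a `CarOnHill` environment and scikit-learn's
# ExtraTreesRegressor; adapt the names to your installation.
from sklearn.ensemble import ExtraTreesRegressor

from mushroom.algorithms.value import FQI
from mushroom.core import Core
from mushroom.environments import CarOnHill
from mushroom.policy import EpsGreedy
from mushroom.utils.parameters import Parameter

mdp = CarOnHill()

# Fully random behaviour policy used to collect the batch of transitions.
pi = EpsGreedy(epsilon=Parameter(value=1.))

# One extra-trees model per discrete action.
approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                           n_actions=mdp.info.action_space.n,
                           n_estimators=50,
                           min_samples_split=5,
                           min_samples_leaf=2)

agent = FQI(ExtraTreesRegressor, pi, mdp.info, n_iterations=20,
            approximator_params=approximator_params)

core = Core(agent, mdp)
core.learn(n_episodes=100, n_episodes_per_fit=100)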
class TrueOnlineSARSALambda(TD):
    """
    True Online SARSA(lambda) with linear function approximation.
    "True Online TD(lambda)". Seijen H. V. et al.. 2014.

    """
    def __init__(self, policy, mdp_info, params, features):
        self.Q = Regressor(LinearApproximator, **params['approximator_params'])
        self.e = np.zeros(self.Q.weights_size)
        self._lambda = params['algorithm_params']['lambda']
        self._q_old = None

        super(TrueOnlineSARSALambda, self).__init__(self.Q, policy, mdp_info,
                                                    params, features)

    def _update(self, state, action, reward, next_state, absorbing):
        phi_state = self.phi(state)
        phi_state_action = get_action_features(phi_state, action,
                                               self.mdp_info.action_space.n)
        q_current = self.Q.predict(phi_state, action)

        if self._q_old is None:
            self._q_old = q_current

        alpha = self.alpha(state, action)

        e_phi = self.e.dot(phi_state_action)
        self.e = self.mdp_info.gamma * self._lambda * self.e + alpha * (
            1. - self.mdp_info.gamma * self._lambda * e_phi) * phi_state_action

        self._next_action = self.draw_action(next_state)
        phi_next_state = self.phi(next_state)
        q_next = self.Q.predict(phi_next_state,
                                self._next_action) if not absorbing else 0.

        delta = reward + self.mdp_info.gamma * q_next - self._q_old

        theta = self.Q.get_weights()
        theta += delta * self.e + alpha * (
            self._q_old - q_current) * phi_state_action
        self.Q.set_weights(theta)

        self._q_old = q_next

    def episode_start(self):
        self._q_old = None
        self.e = np.zeros(self.Q.weights_size)
def __init__(self, policy, mdp_info, alpha_theta, alpha_v, alpha_r,
             lambda_par=.9, value_function_features=None,
             policy_features=None):
    """
    Constructor.

    Args:
        policy (ParametricPolicy): a differentiable stochastic policy;
        mdp_info: information about the MDP;
        alpha_theta (Parameter): learning rate for policy update;
        alpha_v (Parameter): learning rate for the value function;
        alpha_r (Parameter): learning rate for the reward trace;
        lambda_par (float, 0.9): trace decay parameter;
        value_function_features (Features, None): features used by the
            value function approximator;
        policy_features (Features, None): features used by the policy.

    """
    self._psi = value_function_features

    self._alpha_theta = alpha_theta
    self._alpha_v = alpha_v
    self._alpha_r = alpha_r

    self._lambda = lambda_par

    super().__init__(policy, mdp_info, policy_features)

    if self._psi is not None:
        input_shape = (self._psi.size, )
    else:
        input_shape = mdp_info.observation_space.shape

    self._V = Regressor(LinearApproximator, input_shape=input_shape,
                        output_shape=(1, ))

    self._e_v = np.zeros(self._V.weights_size)
    self._e_theta = np.zeros(self.policy.weights_size)

    self._r_bar = 0
def __init__(self, policy, mu, mdp_info, alpha_theta, alpha_omega, alpha_v,
             value_function_features=None, policy_features=None):
    """
    Constructor.

    Args:
        policy (Policy): any exploration policy, possibly using the
            deterministic policy as mean regressor;
        mu (Regressor): regressor that describes the deterministic policy to
            be learned, i.e. the deterministic mapping between state and
            action;
        alpha_theta (Parameter): learning rate for policy update;
        alpha_omega (Parameter): learning rate for the advantage function;
        alpha_v (Parameter): learning rate for the value function;
        value_function_features (Features, None): features used by the
            value function approximator;
        policy_features (Features, None): features used by the policy.

    """
    self._mu = mu
    self._psi = value_function_features

    self._alpha_theta = alpha_theta
    self._alpha_omega = alpha_omega
    self._alpha_v = alpha_v

    if self._psi is not None:
        input_shape = (self._psi.size, )
    else:
        input_shape = mdp_info.observation_space.shape

    self._V = Regressor(LinearApproximator, input_shape=input_shape,
                        output_shape=(1, ))

    self._A = Regressor(LinearApproximator,
                        input_shape=(self._mu.weights_size, ),
                        output_shape=(1, ))

    super().__init__(policy, mdp_info, policy_features)
def __init__(self, mdp_info, policy_class, policy_params, batch_size,
             initial_replay_size, max_replay_size, tau, critic_params,
             actor_params, actor_optimizer, policy_delay=1,
             critic_fit_params=None):
    """
    Constructor.

    Args:
        policy_class (Policy): class of the policy;
        policy_params (dict): parameters of the policy to build;
        batch_size (int): the number of samples in a batch;
        initial_replay_size (int): the number of samples to collect before
            starting the learning;
        max_replay_size (int): the maximum number of samples in the replay
            memory;
        tau (float): value of coefficient for soft updates;
        actor_params (dict): parameters of the actor approximator to build;
        critic_params (dict): parameters of the critic approximator to
            build;
        actor_optimizer (dict): parameters to specify the actor optimizer
            algorithm;
        policy_delay (int, 1): the number of updates of the critic after
            which an actor update is implemented;
        critic_fit_params (dict, None): parameters of the fitting algorithm
            of the critic approximator;

    """
    self._critic_fit_params = dict() if critic_fit_params is None else critic_fit_params

    self._batch_size = batch_size
    self._tau = tau
    self._policy_delay = policy_delay
    self._fit_count = 0

    self._replay_memory = ReplayMemory(initial_replay_size, max_replay_size)

    target_critic_params = deepcopy(critic_params)
    self._critic_approximator = Regressor(TorchApproximator,
                                          **critic_params)
    self._target_critic_approximator = Regressor(TorchApproximator,
                                                 **target_critic_params)

    target_actor_params = deepcopy(actor_params)
    self._actor_approximator = Regressor(TorchApproximator,
                                         **actor_params)
    self._target_actor_approximator = Regressor(TorchApproximator,
                                                **target_actor_params)

    self._init_target()

    policy = policy_class(self._actor_approximator, **policy_params)

    policy_parameters = self._actor_approximator.model.network.parameters()
    super().__init__(policy, mdp_info, actor_optimizer, policy_parameters)
def __init__(self, mdp_info, policy, critic_params, actor_optimizer,
             n_epochs_policy, batch_size, eps_ppo, lam, quiet=True,
             critic_fit_params=None):
    self._critic_fit_params = dict(n_epochs=10) if critic_fit_params is None else critic_fit_params

    self._n_epochs_policy = n_epochs_policy
    self._batch_size = batch_size
    self._eps_ppo = eps_ppo

    self._optimizer = actor_optimizer['class'](policy.parameters(),
                                               **actor_optimizer['params'])

    self._lambda = lam

    self._V = Regressor(TorchApproximator, **critic_params)

    self._quiet = quiet
    self._iter = 1

    super().__init__(policy, mdp_info, None)
def experiment(n_epochs, n_episodes):
    np.random.seed()

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 10
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(policy, mu, mdp.info,
                     alpha_theta, alpha_omega, alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    visualization_callback = Display(agent._V, mu,
                                     mdp.info.observation_space.low,
                                     mdp.info.observation_space.high,
                                     phi, phi)
    core = Core(agent, mdp, callbacks=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes,
                   n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.0)
        dataset_callback.clean()
        visualization_callback()
        print('Mean Reward at iteration ' + str(i) + ': ' +
              str(np.sum(J) / n_steps / n_episodes))

    print('Press a button to visualize the pendulum...')
    input()
    sigma = 1e-8 * np.eye(1)
    policy.set_sigma(sigma)
    core.evaluate(n_steps=n_steps, render=True)
class SARSALambdaContinuous(TD):
    """
    Continuous version of SARSA(lambda) algorithm.

    """
    def __init__(self, approximator, policy, mdp_info, params, features):
        self.Q = Regressor(approximator, **params['approximator_params'])
        self.e = np.zeros(self.Q.weights_size)
        self._lambda = params['algorithm_params']['lambda']

        super(SARSALambdaContinuous, self).__init__(self.Q, policy, mdp_info,
                                                    params, features)

    def _update(self, state, action, reward, next_state, absorbing):
        phi_state = self.phi(state)
        q_current = self.Q.predict(phi_state, action)

        alpha = self.alpha(state, action)

        self.e = self.mdp_info.gamma * self._lambda * self.e + self.Q.diff(
            phi_state, action)

        self._next_action = self.draw_action(next_state)
        phi_next_state = self.phi(next_state)
        q_next = self.Q.predict(phi_next_state,
                                self._next_action) if not absorbing else 0.

        delta = reward + self.mdp_info.gamma * q_next - q_current

        theta = self.Q.get_weights()
        theta += alpha * delta * self.e
        self.Q.set_weights(theta)

    def episode_start(self):
        self.e = np.zeros(self.Q.weights_size)
def __init__(self, mdp_info, policy, critic_params, actor_optimizer,
             ent_coeff, max_grad_norm=None, critic_fit_params=None):
    """
    Constructor.

    Args:
        policy (TorchPolicy): torch policy to be learned by the algorithm;
        critic_params (dict): parameters of the critic approximator to
            build;
        actor_optimizer (dict): parameters to specify the actor optimizer
            algorithm;
        ent_coeff (float, 0): coefficient for the entropy penalty;
        max_grad_norm (float, None): maximum norm for gradient clipping.
            If None, no clipping will be performed, unless specified
            otherwise in actor_optimizer;
        critic_fit_params (dict, None): parameters of the fitting algorithm
            of the critic approximator.

    """
    self._critic_fit_params = dict() if critic_fit_params is None else critic_fit_params

    self._entropy_coeff = ent_coeff

    self._V = Regressor(TorchApproximator, **critic_params)

    if 'clipping' not in actor_optimizer and max_grad_norm is not None:
        actor_optimizer = deepcopy(actor_optimizer)
        clipping_params = dict(max_norm=max_grad_norm, norm_type=2)
        actor_optimizer['clipping'] = dict(
            method=torch.nn.utils.clip_grad_norm_,
            params=clipping_params)

    super().__init__(policy, mdp_info, actor_optimizer, policy.parameters())
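# ---------------------------------------------------------------------------
# For reference, a hypothetical `actor_optimizer` dict in the
# 'class'/'params' layout the constructors in these snippets consume; the
# optimizer choice and learning-rate values are placeholders. When
# `max_grad_norm` is passed, the constructor above injects the 'clipping'
# entry itself, wrapping torch.nn.utils.clip_grad_norm_.
import torch

actor_optimizer = {'class': torch.optim.RMSprop,
                   'params': {'lr': 7e-4, 'eps': 1e-5}}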
def __init__(self, policy, mu, mdp_info, alpha_theta, alpha_omega, alpha_v,
             value_function_features=None, policy_features=None):
    self._mu = mu
    self._psi = value_function_features

    self._alpha_theta = alpha_theta
    self._alpha_omega = alpha_omega
    self._alpha_v = alpha_v

    if self._psi is not None:
        input_shape = (self._psi.size,)
    else:
        input_shape = mdp_info.observation_space.shape

    self._V = Regressor(LinearApproximator, input_shape=input_shape,
                        output_shape=(1,))

    self._A = Regressor(LinearApproximator,
                        input_shape=(self._mu.weights_size,),
                        output_shape=(1,))

    super().__init__(policy, mdp_info, policy_features)
def __init__(self, mdp_info, policy, critic_params, actor_optimizer,
             n_epochs_policy, batch_size, eps_ppo, lam, quiet=True,
             critic_fit_params=None):
    """
    Constructor.

    Args:
        policy (TorchPolicy): torch policy to be learned by the algorithm;
        critic_params (dict): parameters of the critic approximator to
            build;
        actor_optimizer (dict): parameters to specify the actor optimizer
            algorithm;
        n_epochs_policy (int): number of policy updates for every dataset;
        batch_size (int): size of minibatches for every optimization step;
        eps_ppo (float): value for probability ratio clipping;
        lam (float, 1.): lambda coefficient used by generalized advantage
            estimation;
        quiet (bool, True): if true, the algorithm will print debug
            information;
        critic_fit_params (dict, None): parameters of the fitting algorithm
            of the critic approximator.

    """
    self._critic_fit_params = dict(n_epochs=10) if critic_fit_params is None else critic_fit_params

    self._n_epochs_policy = n_epochs_policy
    self._batch_size = batch_size
    self._eps_ppo = eps_ppo

    self._optimizer = actor_optimizer['class'](policy.parameters(),
                                               **actor_optimizer['params'])

    self._lambda = lam

    self._V = Regressor(TorchApproximator, **critic_params)

    self._quiet = quiet
    self._iter = 1

    super().__init__(policy, mdp_info, None)
def __init__(self, policy, mdp_info, learning_rate, lambda_coeff,
             features, approximator_params=None):
    """
    Constructor.

    Args:
        lambda_coeff (float): eligibility trace coefficient.

    """
    self._approximator_params = dict() if approximator_params is None else \
        approximator_params

    self.Q = Regressor(LinearApproximator, **self._approximator_params)
    self.e = np.zeros(self.Q.weights_size)
    self._lambda = lambda_coeff
    self._q_old = None

    super().__init__(self.Q, policy, mdp_info, learning_rate, features)
def __init__(self, approximator, policy, mdp_info, params, features=None):
    """
    Constructor.

    Args:
        approximator (object): approximator used by the algorithm and the
            policy.

    """
    self._n_iterations = params['algorithm_params']['n_iterations']
    self._quiet = params['algorithm_params'].get('quiet', False)

    self.approximator = Regressor(approximator,
                                  **params['approximator_params'])
    policy.set_q(self.approximator)

    super(BatchTD, self).__init__(policy, mdp_info, params, features)
def experiment(n_epochs, n_steps, n_eval_episodes):
    np.random.seed()

    # MDP
    mdp = InvertedPendulum()

    # Agent
    n_tilings = 10
    alpha_theta = ExponentialDecayParameter(1, decay_exp=1.0)
    alpha_omega = ExponentialDecayParameter(1.5 / n_tilings, decay_exp=2 / 3)
    alpha_v = ExponentialDecayParameter(1 / n_tilings, decay_exp=2 / 3)
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    phi = Features(tilings=tilings)

    input_shape = (phi.size, )

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-3 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(policy, mu, mdp.info,
                     alpha_theta, alpha_omega, alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=n_eval_episodes)
    J = compute_J(dataset_eval, gamma=1.0)
    print('Total Reward per episode at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset_eval = core.evaluate(n_episodes=n_eval_episodes,
                                     render=False)
        J = compute_J(dataset_eval, gamma=1.0)
        print('Total Reward per episode at iteration ' + str(i) + ': ' +
              str(np.mean(J)))
class SARSALambdaContinuous(TD):
    """
    Continuous version of SARSA(lambda) algorithm.

    """
    def __init__(self, approximator, policy, mdp_info, learning_rate,
                 lambda_coeff, features, approximator_params=None):
        """
        Constructor.

        Args:
            lambda_coeff (float): eligibility trace coefficient.

        """
        self._approximator_params = dict() if approximator_params is None else \
            approximator_params

        self.Q = Regressor(approximator, **self._approximator_params)
        self.e = np.zeros(self.Q.weights_size)
        self._lambda = lambda_coeff

        super().__init__(self.Q, policy, mdp_info, learning_rate, features)

    def _update(self, state, action, reward, next_state, absorbing):
        phi_state = self.phi(state)
        q_current = self.Q.predict(phi_state, action)

        alpha = self.alpha(state, action)

        self.e = self.mdp_info.gamma * self._lambda * self.e + self.Q.diff(
            phi_state, action)

        self.next_action = self.draw_action(next_state)
        phi_next_state = self.phi(next_state)
        q_next = self.Q.predict(phi_next_state,
                                self.next_action) if not absorbing else 0.

        delta = reward + self.mdp_info.gamma * q_next - q_current

        theta = self.Q.get_weights()
        theta += alpha * delta * self.e
        self.Q.set_weights(theta)

    def episode_start(self):
        self.e = np.zeros(self.Q.weights_size)

        super().episode_start()
def __init__(self, approximator, policy, mdp_info, learning_rate,
             lambda_coeff, features, approximator_params=None):
    """
    Constructor.

    Args:
        lambda_coeff (float): eligibility trace coefficient.

    """
    self._approximator_params = dict() if approximator_params is None else \
        approximator_params

    self.Q = Regressor(approximator, **self._approximator_params)
    self.e = np.zeros(self.Q.weights_size)
    self._lambda = lambda_coeff

    super(SARSALambdaContinuous, self).__init__(self.Q, policy, mdp_info,
                                                learning_rate, features)
def experiment(n_epochs, ep_per_epoch_train, ep_per_epoch_eval, n_iterations):
    np.random.seed()

    # MDP
    mdp = PreyPredator()

    # Features
    basis = PolynomialBasis.generate(1, mdp.info.observation_space.shape[0])
    phi = Features(basis_list=basis[1:])

    approximator = Regressor(LinearApproximator, input_shape=(phi.size, ),
                             output_shape=mdp.info.action_space.shape)

    sigma = 1e-2 * np.eye(mdp.info.action_space.shape[0])
    policy = GaussianPolicy(approximator, sigma)

    lr = Parameter(1e-5)
    # agent = GPOMDP(policy, mdp.info, lr, phi)
    agent = KeyboardAgent()

    # Train
    core = Core(agent, mdp)
    dataset = core.evaluate(n_episodes=ep_per_epoch_eval, render=True)
    J = compute_J(dataset, gamma=mdp.info.gamma)
    print('Reward at start: ', np.mean(J))

    for i in range(n_epochs):
        core.learn(n_episodes=ep_per_epoch_train,
                   n_episodes_per_fit=ep_per_epoch_train // n_iterations,
                   render=False)
        dataset = core.evaluate(n_episodes=ep_per_epoch_eval, render=True)
        J = compute_J(dataset, gamma=mdp.info.gamma)

        p = policy.get_weights()

        print('mu: ', p)
        print('Reward at iteration ', i, ': ', np.mean(J))

    print('Press a button to visualize the prey-predator environment...')
    input()
    core.evaluate(n_episodes=3, render=True)
def experiment(alg, params, n_epochs, n_episodes, n_ep_per_fit):
    np.random.seed()

    # MDP
    mdp = Segway()

    # Policy
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    n_weights = approximator.weights_size
    mu = np.zeros(n_weights)
    sigma = 2e-0 * np.ones(n_weights)
    policy = DeterministicPolicy(approximator)
    dist = GaussianDiagonalDistribution(mu, sigma)

    agent = alg(dist, policy, mdp.info, **params)

    # Train
    print(alg.__name__)
    dataset_callback = CollectDataset()
    core = Core(agent, mdp, callbacks=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes,
                   n_episodes_per_fit=n_ep_per_fit, render=False)
        J = compute_J(dataset_callback.get(), gamma=mdp.info.gamma)
        dataset_callback.clean()

        p = dist.get_parameters()

        print('mu: ', p[:n_weights])
        print('sigma: ', p[n_weights:])
        print('Reward at iteration ' + str(i) + ': ' + str(np.mean(J)))

    print('Press a button to visualize the segway...')
    input()
    core.evaluate(n_episodes=3, render=True)
def test_copdac_q():
    n_steps = 50
    mdp = InvertedPendulum(horizon=n_steps)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Agent
    n_tilings = 1
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(policy, mu, mdp.info,
                     alpha_theta, alpha_omega, alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    core = Core(agent, mdp)
    core.learn(n_episodes=2, n_episodes_per_fit=1)

    w = agent.policy.get_weights()
    w_test = np.array([0, -6.62180045e-7, 0, -4.23972882e-2])

    assert np.allclose(w, w_test)
class SARSALambdaContinuous(TD):
    """
    Continuous version of SARSA(lambda) algorithm.

    """
    def __init__(self, approximator, policy, mdp_info, learning_rate,
                 lambda_coeff, features, approximator_params=None):
        """
        Constructor.

        Args:
            lambda_coeff (float): eligibility trace coefficient.

        """
        self._approximator_params = dict() if approximator_params is None else \
            approximator_params

        self.Q = Regressor(approximator, **self._approximator_params)
        self.e = np.zeros(self.Q.weights_size)
        self._lambda = lambda_coeff

        super(SARSALambdaContinuous, self).__init__(self.Q, policy, mdp_info,
                                                    learning_rate, features)

    def _update(self, state, action, reward, next_state, absorbing):
        phi_state = self.phi(state)
        q_current = self.Q.predict(phi_state, action)

        alpha = self.alpha(state, action)

        self.e = self.mdp_info.gamma * self._lambda * self.e + self.Q.diff(
            phi_state, action)

        self.next_action = self.draw_action(next_state)
        phi_next_state = self.phi(next_state)
        q_next = self.Q.predict(phi_next_state,
                                self.next_action) if not absorbing else 0.

        delta = reward + self.mdp_info.gamma * q_next - q_current

        theta = self.Q.get_weights()
        theta += alpha * delta * self.e
        self.Q.set_weights(theta)

    def episode_start(self):
        self.e = np.zeros(self.Q.weights_size)
def learn(alg, **alg_params):
    np.random.seed(1)
    torch.manual_seed(1)

    # MDP
    mdp = LQR.generate(dimensions=2)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(mu=approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 1e-3 * np.ones(policy.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    agent_test = alg(distribution, policy, mdp.info, **alg_params)
    core = Core(agent_test, mdp)

    core.learn(n_episodes=5, n_episodes_per_fit=5)

    return distribution
mdp = ShipSteering()

high = [150, 150, np.pi]
low = [0, 0, -np.pi]
n_tiles = [5, 5, 6]
low = np.array(low, dtype=float)
high = np.array(high, dtype=float)
n_tilings = 1

tilings = Tiles.generate(n_tilings=n_tilings, n_tiles=n_tiles, low=low,
                         high=high)

phi = Features(tilings=tilings)
input_shape = (phi.size,)

approximator = Regressor(LinearApproximator, input_shape=input_shape,
                         output_shape=mdp.info.action_space.shape)

policy = DeterministicPolicy(approximator)

mu = np.zeros(policy.weights_size)
sigma = 4e-1 * np.ones(policy.weights_size)
distribution_test = GaussianDiagonalDistribution(mu, sigma)
agent_test = RWR(distribution_test, policy, mdp.info, beta=1.)
core = Core(agent_test, mdp)

s = np.arange(10)
a = np.arange(10)
r = np.arange(10)
ss = s + 5
ab = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1])
last = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1])
class TrueOnlineSARSALambda(TD):
    """
    True Online SARSA(lambda) with linear function approximation.
    "True Online TD(lambda)". Seijen H. V. et al.. 2014.

    """
    def __init__(self, policy, mdp_info, learning_rate, lambda_coeff,
                 features, approximator_params=None):
        """
        Constructor.

        Args:
            lambda_coeff (float): eligibility trace coefficient.

        """
        self._approximator_params = dict() if approximator_params is None else \
            approximator_params

        self.Q = Regressor(LinearApproximator, **self._approximator_params)
        self.e = np.zeros(self.Q.weights_size)
        self._lambda = lambda_coeff
        self._q_old = None

        super().__init__(self.Q, policy, mdp_info, learning_rate, features)

    def _update(self, state, action, reward, next_state, absorbing):
        phi_state = self.phi(state)
        phi_state_action = get_action_features(phi_state, action,
                                               self.mdp_info.action_space.n)
        q_current = self.Q.predict(phi_state, action)

        if self._q_old is None:
            self._q_old = q_current

        alpha = self.alpha(state, action)

        e_phi = self.e.dot(phi_state_action)
        self.e = self.mdp_info.gamma * self._lambda * self.e + alpha * (
            1. - self.mdp_info.gamma * self._lambda * e_phi) * phi_state_action

        self.next_action = self.draw_action(next_state)
        phi_next_state = self.phi(next_state)
        q_next = self.Q.predict(phi_next_state,
                                self.next_action) if not absorbing else 0.

        delta = reward + self.mdp_info.gamma * q_next - self._q_old

        theta = self.Q.get_weights()
        theta += delta * self.e + alpha * (
            self._q_old - q_current) * phi_state_action
        self.Q.set_weights(theta)

        self._q_old = q_next

    def episode_start(self):
        self._q_old = None
        self.e = np.zeros(self.Q.weights_size)

        super().episode_start()
def experiment(n_epochs, n_episodes):
    np.random.seed()

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 11
    alpha_r = Parameter(.0001)
    alpha_theta = Parameter(.001 / n_tilings)
    alpha_v = Parameter(.1 / n_tilings)
    tilings = Tiles.generate(n_tilings - 1, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    tilings_v = tilings + Tiles.generate(1, [1, 1],
                                         mdp.info.observation_space.low,
                                         mdp.info.observation_space.high + 1e-3)
    psi = Features(tilings=tilings_v)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    std = Regressor(LinearApproximator, input_shape=input_shape,
                    output_shape=mdp.info.action_space.shape)

    std_0 = np.sqrt(1.)
    std.set_weights(np.log(std_0) / n_tilings * np.ones(std.weights_size))

    policy = StateLogStdGaussianPolicy(mu, std)

    agent = SAC_AVG(policy, mdp.info,
                    alpha_theta, alpha_v, alpha_r,
                    lambda_par=.5,
                    value_function_features=psi,
                    policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    display_callback = Display(agent._V, mu, std,
                               mdp.info.observation_space.low,
                               mdp.info.observation_space.high,
                               phi, psi)
    core = Core(agent, mdp, callbacks=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes,
                   n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.)
        dataset_callback.clean()
        display_callback()
        print('Mean Reward at iteration ' + str(i) + ': ' +
              str(np.sum(J) / n_steps / n_episodes))

    print('Press a button to visualize the pendulum...')
    input()
    core.evaluate(n_steps=n_steps, render=True)
class DDPG(Agent):
    """
    Deep Deterministic Policy Gradient algorithm.
    "Continuous Control with Deep Reinforcement Learning".
    Lillicrap T. P. et al.. 2016.

    """
    def __init__(self, actor_approximator, critic_approximator, policy_class,
                 mdp_info, batch_size, initial_replay_size, max_replay_size,
                 tau, actor_params, critic_params, policy_params,
                 actor_fit_params=None, critic_fit_params=None):
        """
        Constructor.

        Args:
            actor_approximator (object): the approximator to use for the
                actor;
            critic_approximator (object): the approximator to use for the
                critic;
            policy_class (Policy): class of the policy;
            batch_size (int): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect
                before starting the learning;
            max_replay_size (int): the maximum number of samples in the
                replay memory;
            tau (float): value of coefficient for soft updates;
            actor_params (dict): parameters of the actor approximator to
                build;
            critic_params (dict): parameters of the critic approximator to
                build;
            policy_params (dict): parameters of the policy to build;
            actor_fit_params (dict, None): parameters of the fitting
                algorithm of the actor approximator;
            critic_fit_params (dict, None): parameters of the fitting
                algorithm of the critic approximator;

        """
        self._actor_fit_params = dict() if actor_fit_params is None else actor_fit_params
        self._critic_fit_params = dict() if critic_fit_params is None else critic_fit_params

        self._batch_size = batch_size
        self._tau = tau

        self._replay_memory = ReplayMemory(initial_replay_size,
                                           max_replay_size)

        target_critic_params = deepcopy(critic_params)
        self._critic_approximator = Regressor(critic_approximator,
                                              **critic_params)
        self._target_critic_approximator = Regressor(critic_approximator,
                                                     **target_critic_params)

        if 'loss' not in actor_params:
            actor_params['loss'] = ActorLoss(self._critic_approximator)

        target_actor_params = deepcopy(actor_params)
        self._actor_approximator = Regressor(actor_approximator,
                                             **actor_params)
        self._target_actor_approximator = Regressor(actor_approximator,
                                                    **target_actor_params)

        self._target_actor_approximator.model.set_weights(
            self._actor_approximator.model.get_weights())
        self._target_critic_approximator.model.set_weights(
            self._critic_approximator.model.get_weights())

        policy = policy_class(self._actor_approximator, **policy_params)
        super().__init__(policy, mdp_info)

    def fit(self, dataset):
        self._replay_memory.add(dataset)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _ = \
                self._replay_memory.get(self._batch_size)

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            self._critic_approximator.fit(state, action, q,
                                          **self._critic_fit_params)
            self._actor_approximator.fit(state, state,
                                         **self._actor_fit_params)

            self._update_target()

    def _update_target(self):
        """
        Update the target networks.

        """
        critic_weights = self._tau * self._critic_approximator.model.get_weights()
        critic_weights += (1 - self._tau) * self._target_critic_approximator.get_weights()
        self._target_critic_approximator.set_weights(critic_weights)

        actor_weights = self._tau * self._actor_approximator.model.get_weights()
        actor_weights += (1 - self._tau) * self._target_actor_approximator.get_weights()
        self._target_actor_approximator.set_weights(actor_weights)

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Action-values returned by the critic for ``next_state`` and the
            action returned by the actor.

        """
        a = self._target_actor_approximator(next_state)

        q = self._target_critic_approximator.predict(next_state, a)
        q *= 1 - absorbing

        return q
class SAC_AVG(Agent):
    """
    Stochastic Actor critic in the average reward setting as presented in:
    "Model-Free Reinforcement Learning with Continuous Action in Practice".
    Degris T. et al.. 2012.

    """
    def __init__(self, policy, mdp_info, alpha_theta, alpha_v, alpha_r,
                 lambda_par=.9, value_function_features=None,
                 policy_features=None):
        """
        Constructor.

        Args:
            policy (ParametricPolicy): a differentiable stochastic policy;
            mdp_info: information about the MDP;
            alpha_theta (Parameter): learning rate for policy update;
            alpha_v (Parameter): learning rate for the value function;
            alpha_r (Parameter): learning rate for the reward trace;
            lambda_par (float, 0.9): trace decay parameter;
            value_function_features (Features, None): features used by the
                value function approximator;
            policy_features (Features, None): features used by the policy.

        """
        self._psi = value_function_features

        self._alpha_theta = alpha_theta
        self._alpha_v = alpha_v
        self._alpha_r = alpha_r

        self._lambda = lambda_par

        super().__init__(policy, mdp_info, policy_features)

        if self._psi is not None:
            input_shape = (self._psi.size,)
        else:
            input_shape = mdp_info.observation_space.shape

        self._V = Regressor(LinearApproximator, input_shape=input_shape,
                            output_shape=(1,))

        self._e_v = np.zeros(self._V.weights_size)
        self._e_theta = np.zeros(self.policy.weights_size)
        self._r_bar = 0

    def episode_start(self):
        self._e_v = np.zeros(self._V.weights_size)
        self._e_theta = np.zeros(self.policy.weights_size)

    def fit(self, dataset):
        for step in dataset:
            s, a, r, ss, absorbing, _ = step

            s_phi = self.phi(s) if self.phi is not None else s
            s_psi = self._psi(s) if self._psi is not None else s
            ss_psi = self._psi(ss) if self._psi is not None else ss

            v_next = self._V(ss_psi) if not absorbing else 0

            # Compute TD error
            delta = r - self._r_bar + v_next - self._V(s_psi)

            # Update traces
            self._r_bar += self._alpha_r() * delta
            self._e_v = self._lambda * self._e_v + s_psi
            self._e_theta = self._lambda * self._e_theta + \
                self.policy.diff_log(s_phi, a)

            # Update value function
            delta_v = self._alpha_v(s, a) * delta * self._e_v
            v_new = self._V.get_weights() + delta_v
            self._V.set_weights(v_new)

            # Update policy
            delta_theta = self._alpha_theta(s, a) * delta * self._e_theta
            theta_new = self.policy.get_weights() + delta_theta
            self.policy.set_weights(theta_new)
class COPDAC_Q(Agent):
    def __init__(self, policy, mu, mdp_info, alpha_theta, alpha_omega,
                 alpha_v, value_function_features=None,
                 policy_features=None):
        self._mu = mu
        self._psi = value_function_features

        self._alpha_theta = alpha_theta
        self._alpha_omega = alpha_omega
        self._alpha_v = alpha_v

        if self._psi is not None:
            input_shape = (self._psi.size,)
        else:
            input_shape = mdp_info.observation_space.shape

        self._V = Regressor(LinearApproximator, input_shape=input_shape,
                            output_shape=(1,))

        self._A = Regressor(LinearApproximator,
                            input_shape=(self._mu.weights_size,),
                            output_shape=(1,))

        super().__init__(policy, mdp_info, policy_features)

    def fit(self, dataset):
        for step in dataset:
            s, a, r, ss, absorbing, _ = step

            s_phi = self.phi(s) if self.phi is not None else s
            s_psi = self._psi(s) if self._psi is not None else s
            ss_psi = self._psi(ss) if self._psi is not None else ss

            q_next = np.asscalar(self._V(ss_psi)) if not absorbing else 0

            grad_mu_s = np.atleast_2d(self._mu.diff(s_phi))
            omega = self._A.get_weights()

            delta = r + self.mdp_info.gamma * q_next - self._Q(s, a)

            delta_theta = self._alpha_theta(s, a) * \
                omega.dot(grad_mu_s.T).dot(grad_mu_s)
            delta_omega = self._alpha_omega(s, a) * delta * self._nu(s, a)
            delta_v = self._alpha_v(s, a) * delta * s_psi

            theta_new = self._mu.get_weights() + delta_theta
            self._mu.set_weights(theta_new)

            omega_new = omega + delta_omega
            self._A.set_weights(omega_new)

            v_new = self._V.get_weights() + delta_v
            self._V.set_weights(v_new)

    def _Q(self, state, action):
        state_psi = self._psi(state) if self._psi is not None else state

        return np.asscalar(self._V(state_psi)) + \
            np.asscalar(self._A(self._nu(state, action)))

    def _nu(self, state, action):
        state_phi = self.phi(state) if self.phi is not None else state
        grad_mu = np.atleast_2d(self._mu.diff(state_phi))
        delta = action - self._mu(state_phi)

        return delta.dot(grad_mu)
def learn(alg):
    n_steps = 50
    mdp = InvertedPendulum(horizon=n_steps)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Agent
    n_tilings = 2
    alpha_r = Parameter(.0001)
    alpha_theta = Parameter(.001 / n_tilings)
    alpha_v = Parameter(.1 / n_tilings)
    tilings = Tiles.generate(n_tilings - 1, [1, 1],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    tilings_v = tilings + Tiles.generate(1, [1, 1],
                                         mdp.info.observation_space.low,
                                         mdp.info.observation_space.high + 1e-3)
    psi = Features(tilings=tilings_v)

    input_shape = (phi.size, )

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    std = Regressor(LinearApproximator, input_shape=input_shape,
                    output_shape=mdp.info.action_space.shape)

    std_0 = np.sqrt(1.)
    std.set_weights(np.log(std_0) / n_tilings * np.ones(std.weights_size))

    policy = StateLogStdGaussianPolicy(mu, std)

    if alg is StochasticAC:
        agent = alg(policy, mdp.info,
                    alpha_theta, alpha_v,
                    lambda_par=.5,
                    value_function_features=psi,
                    policy_features=phi)
    elif alg is StochasticAC_AVG:
        agent = alg(policy, mdp.info,
                    alpha_theta, alpha_v, alpha_r,
                    lambda_par=.5,
                    value_function_features=psi,
                    policy_features=phi)

    core = Core(agent, mdp)

    core.learn(n_episodes=2, n_episodes_per_fit=1)

    return policy
import numpy as np
from matplotlib import pyplot as plt

from mushroom.approximators import Regressor
from mushroom.approximators.parametric import LinearApproximator


x = np.arange(10).reshape(-1, 1)

intercept = 10
noise = np.random.randn(10, 1) * 1
y = 2 * x + intercept + noise

phi = np.concatenate((np.ones(10).reshape(-1, 1), x), axis=1)

regressor = Regressor(LinearApproximator,
                      input_shape=(2,),
                      output_shape=(1,))

regressor.fit(phi, y)

print('Weights: ' + str(regressor.get_weights()))
print('Gradient: ' + str(regressor.diff(np.array([[5.]]))))

plt.scatter(x, y)
plt.plot(x, regressor.predict(phi))
plt.show()