def __init__(self, mdp_info, distribution, policy, eps, kappa, features=None):
    """
    Constructor.

    Args:
        eps ([float, Parameter]): the maximum admissible value for the
            Kullback-Leibler divergence between the new distribution and the
            previous one at each update step;
        kappa ([float, Parameter]): the maximum admissible value for the
            entropy decrease between the new distribution and the previous one
            at each update step.

    """
    self._eps = to_parameter(eps)
    self._kappa = to_parameter(kappa)

    self._add_save_attr(_eps='mushroom', _kappa='mushroom')

    super().__init__(mdp_info, distribution, policy, features)
def __init__(self, mdp_info, policy, actor_optimizer, critic_params,
             n_epochs_policy, batch_size, eps_ppo, lam, ent_coeff=0.0,
             critic_fit_params=None):
    """
    Constructor.

    Args:
        policy (TorchPolicy): torch policy to be learned by the algorithm;
        actor_optimizer (dict): parameters to specify the actor optimizer
            algorithm;
        critic_params (dict): parameters of the critic approximator to build;
        n_epochs_policy ([int, Parameter]): number of policy updates for every
            dataset;
        batch_size ([int, Parameter]): size of minibatches for every
            optimization step;
        eps_ppo ([float, Parameter]): value for probability ratio clipping;
        lam ([float, Parameter], 1.): lambda coefficient used by generalized
            advantage estimation;
        ent_coeff ([float, Parameter], 0.): coefficient for the entropy
            regularization term;
        critic_fit_params (dict, None): parameters of the fitting algorithm of
            the critic approximator.

    """
    self._critic_fit_params = dict(n_epochs=10) if critic_fit_params is None \
        else critic_fit_params

    self._n_epochs_policy = to_parameter(n_epochs_policy)
    self._batch_size = to_parameter(batch_size)
    self._eps_ppo = to_parameter(eps_ppo)

    self._optimizer = actor_optimizer['class'](policy.parameters(),
                                               **actor_optimizer['params'])

    self._lambda = to_parameter(lam)
    self._ent_coeff = to_parameter(ent_coeff)

    self._V = Regressor(TorchApproximator, **critic_params)

    self._iter = 1

    self._add_save_attr(_critic_fit_params='pickle',
                        _n_epochs_policy='mushroom',
                        _batch_size='mushroom',
                        _eps_ppo='mushroom',
                        _ent_coeff='mushroom',
                        _optimizer='torch',
                        _lambda='mushroom',
                        _V='mushroom',
                        _iter='primitive')

    super().__init__(mdp_info, policy, None)
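# Usage sketch (illustrative, not part of the library source): building the
# clipped-surrogate actor-critic above, assuming it is MushroomRL's PPO class.
# Import paths, the Gym wrapper, the environment name, the hyperparameters and
# the `Network` torso below are assumptions, not taken from this file.
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from mushroom_rl.algorithms.actor_critic import PPO   # assumed import path
from mushroom_rl.policy import GaussianTorchPolicy    # assumed import path
from mushroom_rl.environments import Gym              # assumed import path


class Network(nn.Module):
    # MushroomRL-style network: built from input_shape/output_shape kwargs.
    def __init__(self, input_shape, output_shape, n_features=32, **kwargs):
        super().__init__()
        self._h1 = nn.Linear(input_shape[-1], n_features)
        self._out = nn.Linear(n_features, output_shape[0])

    def forward(self, state, **kwargs):
        return self._out(torch.relu(self._h1(state.float())))


mdp = Gym('Pendulum-v1', horizon=200, gamma=.99)

policy = GaussianTorchPolicy(Network, mdp.info.observation_space.shape,
                             mdp.info.action_space.shape, std_0=1.)

actor_optimizer = {'class': optim.Adam, 'params': {'lr': 3e-4}}
critic_params = dict(network=Network, loss=F.mse_loss,
                     optimizer={'class': optim.Adam, 'params': {'lr': 3e-4}},
                     input_shape=mdp.info.observation_space.shape,
                     output_shape=(1,))

agent = PPO(mdp.info, policy, actor_optimizer, critic_params,
            n_epochs_policy=4, batch_size=64, eps_ppo=.2, lam=.95)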
def __init__(self, mdp_info, policy_class, policy_params, actor_params,
             actor_optimizer, critic_params, batch_size, initial_replay_size,
             max_replay_size, tau, policy_delay=2, noise_std=.2, noise_clip=.5,
             critic_fit_params=None):
    """
    Constructor.

    Args:
        policy_class (Policy): class of the policy;
        policy_params (dict): parameters of the policy to build;
        actor_params (dict): parameters of the actor approximator to build;
        actor_optimizer (dict): parameters to specify the actor optimizer
            algorithm;
        critic_params (dict): parameters of the critic approximator to build;
        batch_size ((int, Parameter)): the number of samples in a batch;
        initial_replay_size (int): the number of samples to collect before
            starting the learning;
        max_replay_size (int): the maximum number of samples in the replay
            memory;
        tau ((float, Parameter)): value of coefficient for soft updates;
        policy_delay ((int, Parameter), 2): the number of updates of the
            critic after which an actor update is implemented;
        noise_std ((float, Parameter), .2): standard deviation of the noise
            used for policy smoothing;
        noise_clip ((float, Parameter), .5): maximum absolute value for the
            policy smoothing noise;
        critic_fit_params (dict, None): parameters of the fitting algorithm of
            the critic approximator.

    """
    self._noise_std = to_parameter(noise_std)
    self._noise_clip = to_parameter(noise_clip)

    if 'n_models' in critic_params.keys():
        assert critic_params['n_models'] >= 2
    else:
        critic_params['n_models'] = 2

    self._add_save_attr(_noise_std='mushroom', _noise_clip='mushroom')

    super().__init__(mdp_info, policy_class, policy_params, actor_params,
                     actor_optimizer, critic_params, batch_size,
                     initial_replay_size, max_replay_size, tau, policy_delay,
                     critic_fit_params)
def __init__(self, mdp_info, policy, mu, alpha_theta, alpha_omega, alpha_v,
             value_function_features=None, policy_features=None):
    """
    Constructor.

    Args:
        mu (Regressor): regressor that describes the deterministic policy to
            be learned, i.e. the deterministic mapping between state and
            action;
        alpha_theta ([float, Parameter]): learning rate for policy update;
        alpha_omega ([float, Parameter]): learning rate for the advantage
            function;
        alpha_v ([float, Parameter]): learning rate for the value function;
        value_function_features (Features, None): features used by the value
            function approximator;
        policy_features (Features, None): features used by the policy.

    """
    self._mu = mu
    self._psi = value_function_features

    self._alpha_theta = to_parameter(alpha_theta)
    self._alpha_omega = to_parameter(alpha_omega)
    self._alpha_v = to_parameter(alpha_v)

    if self._psi is not None:
        input_shape = (self._psi.size,)
    else:
        input_shape = mdp_info.observation_space.shape

    self._V = Regressor(LinearApproximator, input_shape=input_shape,
                        output_shape=(1,))

    self._A = Regressor(LinearApproximator,
                        input_shape=(self._mu.weights_size,),
                        output_shape=(1,))

    self._add_save_attr(_mu='mushroom',
                        _psi='pickle',
                        _alpha_theta='mushroom',
                        _alpha_omega='mushroom',
                        _alpha_v='mushroom',
                        _V='mushroom',
                        _A='mushroom')

    super().__init__(mdp_info, policy, policy_features)
def __init__(self, mdp_info, policy, alpha_theta, alpha_v, lambda_par=.9,
             value_function_features=None, policy_features=None):
    """
    Constructor.

    Args:
        alpha_theta ([float, Parameter]): learning rate for policy update;
        alpha_v ([float, Parameter]): learning rate for the value function;
        lambda_par ([float, Parameter], .9): trace decay parameter;
        value_function_features (Features, None): features used by the value
            function approximator;
        policy_features (Features, None): features used by the policy.

    """
    self._psi = value_function_features

    self._alpha_theta = to_parameter(alpha_theta)
    self._alpha_v = to_parameter(alpha_v)
    self._lambda = to_parameter(lambda_par)

    super().__init__(mdp_info, policy, policy_features)

    if self._psi is not None:
        input_shape = (self._psi.size,)
    else:
        input_shape = mdp_info.observation_space.shape

    self._V = Regressor(LinearApproximator, input_shape=input_shape,
                        output_shape=(1,))

    self._e_v = np.zeros(self._V.weights_size)
    self._e_theta = np.zeros(self.policy.weights_size)

    self._add_save_attr(_psi='pickle',
                        _alpha_theta='mushroom',
                        _alpha_v='mushroom',
                        _lambda='mushroom',
                        _V='mushroom',
                        _e_v='numpy',
                        _e_theta='numpy')
def __init__(self, initial_size, max_size, alpha, beta, epsilon=.01):
    """
    Constructor.

    Args:
        initial_size (int): initial number of elements in the replay memory;
        max_size (int): maximum number of elements that the replay memory can
            contain;
        alpha (float): prioritization coefficient;
        beta ([float, Parameter]): importance sampling coefficient;
        epsilon (float, .01): small value to avoid zero probabilities.

    """
    self._initial_size = initial_size
    self._max_size = max_size
    self._alpha = alpha
    self._beta = to_parameter(beta)
    self._epsilon = epsilon

    self._tree = SumTree(max_size)

    self._add_save_attr(_initial_size='primitive',
                        _max_size='primitive',
                        _alpha='primitive',
                        _beta='primitive',
                        _epsilon='primitive',
                        _tree='pickle!')
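# Usage sketch (illustrative, not part of the library source): building a
# prioritized replay memory with an annealed importance-sampling coefficient.
# Import paths and the chosen values are assumptions.
from mushroom_rl.utils.parameters import LinearParameter              # assumed path
from mushroom_rl.utils.replay_memory import PrioritizedReplayMemory   # assumed path

# beta is usually annealed from a small initial value towards 1 over training.
beta = LinearParameter(value=.4, threshold_value=1., n=1000000)
replay_memory = PrioritizedReplayMemory(initial_size=500, max_size=50000,
                                        alpha=.6, beta=beta)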
def __init__(self, mdp_info, policy, approximator, learning_rate, lambda_coeff,
             features, approximator_params=None):
    """
    Constructor.

    Args:
        lambda_coeff ([float, Parameter]): eligibility trace coefficient.

    """
    approximator_params = dict() if approximator_params is None else \
        approximator_params

    Q = Regressor(approximator, **approximator_params)
    self.e = np.zeros(Q.weights_size)
    self._lambda = to_parameter(lambda_coeff)

    self._add_save_attr(_lambda='primitive', e='numpy')

    super().__init__(mdp_info, policy, Q, learning_rate, features)
def __init__(self, network, input_shape, output_shape, beta, use_cuda=False,
             **params):
    """
    Constructor.

    Args:
        network (object): the network class used to implement the mean
            regressor;
        input_shape (tuple): the shape of the state space;
        output_shape (tuple): the shape of the action space;
        beta ((float, Parameter)): the inverse of the temperature of the
            distribution. As the temperature approaches infinity, the policy
            becomes more and more random. As the temperature approaches 0.0,
            the policy becomes more and more greedy;
        params (dict): parameters used by the network constructor.

    """
    super().__init__(use_cuda)

    self._action_dim = output_shape[0]

    self._logits = Regressor(TorchApproximator, input_shape, output_shape,
                             network=network, use_cuda=use_cuda, **params)
    self._beta = to_parameter(beta)

    self._add_save_attr(
        _action_dim='primitive',
        _beta='mushroom',
        _logits='mushroom'
    )
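# Usage sketch (illustrative, not part of the library source): a Boltzmann
# torch policy over 2 discrete actions for a 4-dimensional state.
# `LogitsNetwork` is a hypothetical network; import paths are assumptions.
import torch
import torch.nn as nn

from mushroom_rl.policy import BoltzmannTorchPolicy   # assumed import path
from mushroom_rl.utils.parameters import Parameter    # assumed import path


class LogitsNetwork(nn.Module):
    def __init__(self, input_shape, output_shape, n_features=32, **kwargs):
        super().__init__()
        self._h1 = nn.Linear(input_shape[-1], n_features)
        self._out = nn.Linear(n_features, output_shape[0])

    def forward(self, state, **kwargs):
        return self._out(torch.relu(self._h1(state.float())))


# beta can be a plain float or a Parameter that is annealed during training.
pi = BoltzmannTorchPolicy(LogitsNetwork, (4,), (2,), beta=Parameter(1.))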
def __init__(self, mdp_info, policy, approximator, n_iterations,
             approximator_params=None, fit_params=None, quiet=False):
    """
    Constructor.

    Args:
        n_iterations ((int, Parameter)): number of iterations to perform for
            training;
        quiet (bool, False): whether to suppress the progress bar or not.

    """
    self._n_iterations = to_parameter(n_iterations)
    self._quiet = quiet
    self._target = None

    self._add_save_attr(_n_iterations='mushroom',
                        _quiet='primitive',
                        _target='pickle')

    super().__init__(mdp_info, policy, approximator, approximator_params,
                     fit_params)
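# Usage sketch (illustrative, not part of the library source): fitted
# Q-iteration with an extra-trees Q-function approximator, assuming this is
# MushroomRL's FQI class. Import paths, the environment and the
# hyperparameters are assumptions.
from sklearn.ensemble import ExtraTreesRegressor

from mushroom_rl.algorithms.value import FQI         # assumed import path
from mushroom_rl.environments import CarOnHill       # assumed import path
from mushroom_rl.policy import EpsGreedy             # assumed import path
from mushroom_rl.utils.parameters import Parameter   # assumed import path

mdp = CarOnHill()
pi = EpsGreedy(epsilon=Parameter(1.))  # fully random data-collection policy

approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                           n_actions=mdp.info.action_space.n,
                           n_estimators=50, min_samples_split=5,
                           min_samples_leaf=2)

agent = FQI(mdp.info, pi, ExtraTreesRegressor, n_iterations=20,
            approximator_params=approximator_params)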
def __init__(self, mdp_info, policy, approximator_params=None, epsilon=1e-2,
             fit_params=None, features=None):
    """
    Constructor.

    Args:
        epsilon ((float, Parameter), 1e-2): termination coefficient.

    """
    self._epsilon = to_parameter(epsilon)

    k = features.size * mdp_info.action_space.n
    self._A = np.zeros((k, k))
    self._b = np.zeros((k, 1))

    self._add_save_attr(_epsilon='mushroom', _A='primitive', _b='primitive')

    super().__init__(mdp_info, policy, LinearApproximator,
                     approximator_params, fit_params, features)
def set_beta(self, beta):
    """
    Setter.

    Args:
        beta ((float, Parameter)): the inverse of the temperature of the
            distribution.

    """
    self._beta = to_parameter(beta)
def set_epsilon(self, epsilon):
    """
    Setter.

    Args:
        epsilon ([float, Parameter]): the exploration coefficient. It indicates
            the probability of performing a random action in the current step.

    """
    self._epsilon = to_parameter(epsilon)
def __init__(self, mu_approximator, sigma_approximator, min_a, max_a,
             log_std_min, log_std_max):
    """
    Constructor.

    Args:
        mu_approximator (Regressor): a regressor computing the mean given a
            state;
        sigma_approximator (Regressor): a regressor computing the variance
            given a state;
        min_a (np.ndarray): a vector specifying the minimum action value for
            each component;
        max_a (np.ndarray): a vector specifying the maximum action value for
            each component;
        log_std_min ([float, Parameter]): min value for the policy log std;
        log_std_max ([float, Parameter]): max value for the policy log std.

    """
    self._mu_approximator = mu_approximator
    self._sigma_approximator = sigma_approximator

    self._delta_a = to_float_tensor(.5 * (max_a - min_a), self.use_cuda)
    self._central_a = to_float_tensor(.5 * (max_a + min_a), self.use_cuda)

    self._log_std_min = to_parameter(log_std_min)
    self._log_std_max = to_parameter(log_std_max)

    self._eps_log_prob = 1e-6

    use_cuda = self._mu_approximator.model.use_cuda

    if use_cuda:
        self._delta_a = self._delta_a.cuda()
        self._central_a = self._central_a.cuda()

    self._add_save_attr(_mu_approximator='mushroom',
                        _sigma_approximator='mushroom',
                        _delta_a='torch',
                        _central_a='torch',
                        _log_std_min='mushroom',
                        _log_std_max='mushroom',
                        _eps_log_prob='primitive')
def __init__(self, mdp_info, policy, learning_rate, off_policy=False,
             beta=None, delta=None):
    """
    Constructor.

    Args:
        off_policy (bool, False): whether to use the off-policy setting or
            the on-policy one;
        beta ([float, Parameter], None): beta coefficient;
        delta ([float, Parameter], None): delta coefficient. Exactly one of
            beta and delta must be provided.

    """
    self.off_policy = off_policy

    if delta is not None and beta is None:
        self.delta = to_parameter(delta)
        self.beta = None
    elif delta is None and beta is not None:
        self.delta = None
        self.beta = to_parameter(beta)
    else:
        raise ValueError('Exactly one of beta and delta must be provided.')

    Q = Table(mdp_info.size)
    self.Q_tilde = Table(mdp_info.size)
    self.R_tilde = Table(mdp_info.size)

    self._add_save_attr(off_policy='primitive',
                        delta='mushroom',
                        beta='mushroom',
                        Q_tilde='mushroom',
                        R_tilde='mushroom')

    super().__init__(mdp_info, policy, Q, learning_rate)
def __init__(self, mdp_info, distribution, policy, beta, features=None):
    """
    Constructor.

    Args:
        beta ([float, Parameter]): the temperature for the exponential reward
            transformation.

    """
    self._beta = to_parameter(beta)

    self._add_save_attr(_beta='mushroom')

    super().__init__(mdp_info, distribution, policy, features)
def __init__(self, mdp_info, policy, learning_rate, beta):
    """
    Constructor.

    Args:
        beta ((float, Parameter)): learning rate for the average reward
            estimate.

    """
    Q = Table(mdp_info.size)
    self._rho = 0.
    self._beta = to_parameter(beta)

    self._add_save_attr(_rho='primitive', _beta='mushroom')

    super().__init__(mdp_info, policy, Q, learning_rate)
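# Usage sketch (illustrative, not part of the library source): tabular
# R-learning on a small grid world, assuming this is MushroomRL's RLearning
# class. Import paths, the environment and the step sizes are assumptions.
from mushroom_rl.algorithms.value import RLearning    # assumed import path
from mushroom_rl.environments import GridWorld        # assumed import path
from mushroom_rl.policy import EpsGreedy              # assumed import path
from mushroom_rl.utils.parameters import Parameter    # assumed import path

mdp = GridWorld(width=3, height=3, goal=(2, 2))
pi = EpsGreedy(epsilon=Parameter(.1))

# learning_rate updates the Q table; beta updates the average-reward estimate.
agent = RLearning(mdp.info, pi, learning_rate=Parameter(.1),
                  beta=Parameter(.05))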
def __init__(self, epsilon):
    """
    Constructor.

    Args:
        epsilon ([float, Parameter]): the exploration coefficient. It indicates
            the probability of performing a random action in the current step.

    """
    super().__init__()

    self._epsilon = to_parameter(epsilon)

    self._add_save_attr(_epsilon='mushroom')
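# Usage sketch (illustrative, not part of the library source): epsilon can be
# a fixed float or a Parameter that is annealed during training, e.g. linearly
# from 1.0 down to 0.1. Import paths are assumptions.
from mushroom_rl.policy import EpsGreedy                   # assumed path
from mushroom_rl.utils.parameters import LinearParameter   # assumed path

epsilon = LinearParameter(value=1., threshold_value=.1, n=100000)
pi = EpsGreedy(epsilon=epsilon)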
def __init__(self, beta):
    """
    Constructor.

    Args:
        beta ([float, Parameter]): the inverse of the temperature of the
            distribution. As the temperature approaches infinity, the policy
            becomes more and more random. As the temperature approaches 0.0,
            the policy becomes more and more greedy.

    """
    super().__init__()

    self._beta = to_parameter(beta)

    self._add_save_attr(_beta='mushroom')
def __init__(self, mdp_info, policy, alpha_theta, alpha_v, alpha_r,
             lambda_par=.9, value_function_features=None,
             policy_features=None):
    """
    Constructor.

    Args:
        alpha_r ([float, Parameter]): learning rate for the reward trace.

    """
    super().__init__(mdp_info, policy, alpha_theta, alpha_v, lambda_par,
                     value_function_features, policy_features)

    self._alpha_r = to_parameter(alpha_r)
    self._r_bar = 0

    self._add_save_attr(_alpha_r='mushroom', _r_bar='primitive')
def __init__(self, mdp_info, policy, actor_optimizer, critic_params, ent_coeff,
             max_grad_norm=None, critic_fit_params=None):
    """
    Constructor.

    Args:
        policy (TorchPolicy): torch policy to be learned by the algorithm;
        actor_optimizer (dict): parameters to specify the actor optimizer
            algorithm;
        critic_params (dict): parameters of the critic approximator to build;
        ent_coeff ([float, Parameter]): coefficient for the entropy penalty;
        max_grad_norm (float, None): maximum norm for gradient clipping.
            If None, no clipping will be performed, unless specified otherwise
            in actor_optimizer;
        critic_fit_params (dict, None): parameters of the fitting algorithm of
            the critic approximator.

    """
    self._critic_fit_params = dict() if critic_fit_params is None \
        else critic_fit_params
    self._entropy_coeff = to_parameter(ent_coeff)

    self._V = Regressor(TorchApproximator, **critic_params)

    if 'clipping' not in actor_optimizer and max_grad_norm is not None:
        actor_optimizer = deepcopy(actor_optimizer)
        clipping_params = dict(max_norm=max_grad_norm, norm_type=2)
        actor_optimizer['clipping'] = dict(
            method=torch.nn.utils.clip_grad_norm_, params=clipping_params)

    self._add_save_attr(_critic_fit_params='pickle',
                        _entropy_coeff='mushroom',
                        _V='mushroom')

    super().__init__(mdp_info, policy, actor_optimizer, policy.parameters())
def __init__(self, mdp_info, policy, learning_rate, lambda_coeff,
             trace='replacing'):
    """
    Constructor.

    Args:
        lambda_coeff ((float, Parameter)): eligibility trace coefficient;
        trace (str, 'replacing'): type of eligibility trace to use.

    """
    Q = Table(mdp_info.size)
    self._lambda = to_parameter(lambda_coeff)

    self.e = EligibilityTrace(Q.shape, trace)
    self._add_save_attr(
        _lambda='mushroom',
        e='mushroom'
    )

    super().__init__(mdp_info, policy, Q, learning_rate)
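# Usage sketch (illustrative, not part of the library source): tabular
# SARSA(lambda) on a grid world, assuming this is MushroomRL's SARSALambda
# class. Import paths, the environment and the hyperparameters are assumptions.
from mushroom_rl.algorithms.value import SARSALambda   # assumed import path
from mushroom_rl.core import Core                      # assumed import path
from mushroom_rl.environments import GridWorld         # assumed import path
from mushroom_rl.policy import EpsGreedy               # assumed import path
from mushroom_rl.utils.parameters import Parameter     # assumed import path

mdp = GridWorld(width=5, height=5, goal=(4, 4))
pi = EpsGreedy(epsilon=Parameter(.1))

agent = SARSALambda(mdp.info, pi, learning_rate=Parameter(.1),
                    lambda_coeff=.9, trace='accumulating')

core = Core(agent, mdp)
core.learn(n_steps=20000, n_steps_per_fit=1)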
def __init__(self, mdp_info, policy, approximator_params=None, epsilon=1e-2,
             fit_params=None, features=None):
    """
    Constructor.

    Args:
        epsilon ([float, Parameter], 1e-2): termination coefficient.

    """
    self._epsilon = to_parameter(epsilon)

    self._add_save_attr(_epsilon='mushroom')

    super().__init__(mdp_info, policy, LinearApproximator,
                     approximator_params, fit_params, features)
def set_beta(self, beta):
    """
    Setter.

    Args:
        beta ([float, Parameter]): the new value of the beta coefficient.

    """
    self._beta = to_parameter(beta)
def __init__(self, mdp_info, policy_class, policy_params, actor_params,
             actor_optimizer, critic_params, batch_size, initial_replay_size,
             max_replay_size, tau, policy_delay=1, critic_fit_params=None):
    """
    Constructor.

    Args:
        policy_class (Policy): class of the policy;
        policy_params (dict): parameters of the policy to build;
        actor_params (dict): parameters of the actor approximator to build;
        actor_optimizer (dict): parameters to specify the actor optimizer
            algorithm;
        critic_params (dict): parameters of the critic approximator to build;
        batch_size ((int, Parameter)): the number of samples in a batch;
        initial_replay_size (int): the number of samples to collect before
            starting the learning;
        max_replay_size (int): the maximum number of samples in the replay
            memory;
        tau ((float, Parameter)): value of coefficient for soft updates;
        policy_delay ((int, Parameter), 1): the number of updates of the
            critic after which an actor update is implemented;
        critic_fit_params (dict, None): parameters of the fitting algorithm of
            the critic approximator.

    """
    self._critic_fit_params = dict() if critic_fit_params is None \
        else critic_fit_params

    self._batch_size = to_parameter(batch_size)
    self._tau = to_parameter(tau)
    self._policy_delay = to_parameter(policy_delay)
    self._fit_count = 0

    self._replay_memory = ReplayMemory(initial_replay_size, max_replay_size)

    target_critic_params = deepcopy(critic_params)
    self._critic_approximator = Regressor(TorchApproximator, **critic_params)
    self._target_critic_approximator = Regressor(TorchApproximator,
                                                 **target_critic_params)

    target_actor_params = deepcopy(actor_params)
    self._actor_approximator = Regressor(TorchApproximator, **actor_params)
    self._target_actor_approximator = Regressor(TorchApproximator,
                                                **target_actor_params)

    self._init_target(self._critic_approximator,
                      self._target_critic_approximator)
    self._init_target(self._actor_approximator,
                      self._target_actor_approximator)

    policy = policy_class(self._actor_approximator, **policy_params)

    policy_parameters = self._actor_approximator.model.network.parameters()

    self._add_save_attr(_critic_fit_params='pickle',
                        _batch_size='mushroom',
                        _tau='mushroom',
                        _policy_delay='mushroom',
                        _fit_count='primitive',
                        _replay_memory='mushroom',
                        _critic_approximator='mushroom',
                        _target_critic_approximator='mushroom',
                        _target_actor_approximator='mushroom')

    super().__init__(mdp_info, policy, actor_optimizer, policy_parameters)
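# Usage sketch (illustrative, not part of the library source): building the
# deterministic actor-critic above, assuming it is MushroomRL's DDPG class
# with an Ornstein-Uhlenbeck exploration policy. Import paths, networks and
# hyperparameters are assumptions.
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from mushroom_rl.algorithms.actor_critic import DDPG     # assumed import path
from mushroom_rl.policy import OrnsteinUhlenbeckPolicy   # assumed import path
from mushroom_rl.environments import Gym                 # assumed import path


class ActorNetwork(nn.Module):
    def __init__(self, input_shape, output_shape, n_features=64, **kwargs):
        super().__init__()
        self._h = nn.Linear(input_shape[-1], n_features)
        self._out = nn.Linear(n_features, output_shape[0])

    def forward(self, state, **kwargs):
        return self._out(torch.relu(self._h(state.float())))


class CriticNetwork(nn.Module):
    def __init__(self, input_shape, output_shape, n_features=64, **kwargs):
        super().__init__()
        self._h = nn.Linear(input_shape[-1], n_features)
        self._out = nn.Linear(n_features, output_shape[0])

    def forward(self, state, action, **kwargs):
        state_action = torch.cat((state.float(), action.float()), dim=1)
        return torch.squeeze(self._out(torch.relu(self._h(state_action))))


mdp = Gym('Pendulum-v1', horizon=200, gamma=.99)
obs_dim = mdp.info.observation_space.shape[0]
act_dim = mdp.info.action_space.shape[0]

policy_params = dict(sigma=np.ones(1) * .2, theta=.15, dt=1e-2)
actor_params = dict(network=ActorNetwork, input_shape=(obs_dim,),
                    output_shape=(act_dim,))
actor_optimizer = {'class': optim.Adam, 'params': {'lr': 1e-4}}
critic_params = dict(network=CriticNetwork, input_shape=(obs_dim + act_dim,),
                     output_shape=(1,), loss=F.mse_loss,
                     optimizer={'class': optim.Adam, 'params': {'lr': 1e-3}})

agent = DDPG(mdp.info, OrnsteinUhlenbeckPolicy, policy_params, actor_params,
             actor_optimizer, critic_params, batch_size=64,
             initial_replay_size=500, max_replay_size=50000, tau=.001)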
def __init__(self, mdp_info, actor_mu_params, actor_sigma_params,
             actor_optimizer, critic_params, batch_size, initial_replay_size,
             max_replay_size, warmup_transitions, tau, lr_alpha,
             log_std_min=-20, log_std_max=2, target_entropy=None,
             critic_fit_params=None):
    """
    Constructor.

    Args:
        actor_mu_params (dict): parameters of the actor mean approximator to
            build;
        actor_sigma_params (dict): parameters of the actor sigma approximator
            to build;
        actor_optimizer (dict): parameters to specify the actor optimizer
            algorithm;
        critic_params (dict): parameters of the critic approximator to build;
        batch_size ((int, Parameter)): the number of samples in a batch;
        initial_replay_size (int): the number of samples to collect before
            starting the learning;
        max_replay_size (int): the maximum number of samples in the replay
            memory;
        warmup_transitions ([int, Parameter]): number of samples to accumulate
            in the replay memory to start the policy fitting;
        tau ([float, Parameter]): value of coefficient for soft updates;
        lr_alpha ([float, Parameter]): learning rate for the entropy
            coefficient;
        log_std_min ([float, Parameter]): min value for the policy log std;
        log_std_max ([float, Parameter]): max value for the policy log std;
        target_entropy (float, None): target entropy for the policy; if None,
            a default value is computed;
        critic_fit_params (dict, None): parameters of the fitting algorithm of
            the critic approximator.

    """
    self._critic_fit_params = dict() if critic_fit_params is None \
        else critic_fit_params

    self._batch_size = to_parameter(batch_size)
    self._warmup_transitions = to_parameter(warmup_transitions)
    self._tau = to_parameter(tau)

    if target_entropy is None:
        self._target_entropy = -np.prod(
            mdp_info.action_space.shape).astype(np.float32)
    else:
        self._target_entropy = target_entropy

    self._replay_memory = ReplayMemory(initial_replay_size, max_replay_size)

    if 'n_models' in critic_params.keys():
        assert critic_params['n_models'] == 2
    else:
        critic_params['n_models'] = 2

    target_critic_params = deepcopy(critic_params)
    self._critic_approximator = Regressor(TorchApproximator, **critic_params)
    self._target_critic_approximator = Regressor(TorchApproximator,
                                                 **target_critic_params)

    actor_mu_approximator = Regressor(TorchApproximator, **actor_mu_params)
    actor_sigma_approximator = Regressor(TorchApproximator,
                                         **actor_sigma_params)

    policy = SACPolicy(actor_mu_approximator, actor_sigma_approximator,
                       mdp_info.action_space.low, mdp_info.action_space.high,
                       log_std_min, log_std_max)

    self._init_target(self._critic_approximator,
                      self._target_critic_approximator)

    self._log_alpha = torch.tensor(0., dtype=torch.float32)

    if policy.use_cuda:
        self._log_alpha = self._log_alpha.cuda().requires_grad_()
    else:
        self._log_alpha.requires_grad_()

    self._alpha_optim = optim.Adam([self._log_alpha], lr=lr_alpha)

    policy_parameters = chain(
        actor_mu_approximator.model.network.parameters(),
        actor_sigma_approximator.model.network.parameters())

    self._add_save_attr(_critic_fit_params='pickle',
                        _batch_size='mushroom',
                        _warmup_transitions='mushroom',
                        _tau='mushroom',
                        _target_entropy='primitive',
                        _replay_memory='mushroom',
                        _critic_approximator='mushroom',
                        _target_critic_approximator='mushroom',
                        _log_alpha='torch',
                        _alpha_optim='torch')

    super().__init__(mdp_info, policy, actor_optimizer, policy_parameters)
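# Usage sketch (illustrative, not part of the library source): building the
# soft actor-critic above, assuming it is MushroomRL's SAC class. Import
# paths, networks and hyperparameters are assumptions.
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from mushroom_rl.algorithms.actor_critic import SAC   # assumed import path
from mushroom_rl.environments import Gym              # assumed import path


class ActorNetwork(nn.Module):
    # used for both the mean and the log-std heads of the policy
    def __init__(self, input_shape, output_shape, n_features=64, **kwargs):
        super().__init__()
        self._h = nn.Linear(input_shape[-1], n_features)
        self._out = nn.Linear(n_features, output_shape[0])

    def forward(self, state, **kwargs):
        return self._out(torch.relu(self._h(state.float())))


class CriticNetwork(nn.Module):
    def __init__(self, input_shape, output_shape, n_features=64, **kwargs):
        super().__init__()
        self._h = nn.Linear(input_shape[-1], n_features)
        self._out = nn.Linear(n_features, output_shape[0])

    def forward(self, state, action, **kwargs):
        state_action = torch.cat((state.float(), action.float()), dim=1)
        return torch.squeeze(self._out(torch.relu(self._h(state_action))))


mdp = Gym('Pendulum-v1', horizon=200, gamma=.99)
obs_dim = mdp.info.observation_space.shape[0]
act_dim = mdp.info.action_space.shape[0]

actor_mu_params = dict(network=ActorNetwork, input_shape=(obs_dim,),
                       output_shape=(act_dim,))
actor_sigma_params = dict(network=ActorNetwork, input_shape=(obs_dim,),
                          output_shape=(act_dim,))
actor_optimizer = {'class': optim.Adam, 'params': {'lr': 3e-4}}
critic_params = dict(network=CriticNetwork, input_shape=(obs_dim + act_dim,),
                     output_shape=(1,), loss=F.mse_loss,
                     optimizer={'class': optim.Adam, 'params': {'lr': 3e-4}})

agent = SAC(mdp.info, actor_mu_params, actor_sigma_params, actor_optimizer,
            critic_params, batch_size=64, initial_replay_size=500,
            max_replay_size=50000, warmup_transitions=1000, tau=.005,
            lr_alpha=3e-4)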
def __init__(self, mdp_info, policy, approximator, approximator_params,
             batch_size, target_update_frequency, replay_memory=None,
             initial_replay_size=500, max_replay_size=5000, fit_params=None,
             clip_reward=False):
    """
    Constructor.

    Args:
        approximator (object): the approximator to use to fit the Q-function;
        approximator_params (dict): parameters of the approximator to build;
        batch_size ((int, Parameter)): the number of samples in a batch;
        target_update_frequency (int): the number of samples collected
            between each update of the target network;
        replay_memory ([ReplayMemory, PrioritizedReplayMemory], None): the
            object of the replay memory to use; if None, a default replay
            memory is created;
        initial_replay_size (int): the number of samples to collect before
            starting the learning;
        max_replay_size (int): the maximum number of samples in the replay
            memory;
        fit_params (dict, None): parameters of the fitting algorithm of the
            approximator;
        clip_reward (bool, False): whether to clip the reward or not.

    """
    self._fit_params = dict() if fit_params is None else fit_params
    self._batch_size = to_parameter(batch_size)
    self._clip_reward = clip_reward
    self._target_update_frequency = target_update_frequency

    if replay_memory is not None:
        self._replay_memory = replay_memory
        if isinstance(replay_memory, PrioritizedReplayMemory):
            self._fit = self._fit_prioritized
        else:
            self._fit = self._fit_standard
    else:
        self._replay_memory = ReplayMemory(initial_replay_size,
                                           max_replay_size)
        self._fit = self._fit_standard

    self._n_updates = 0

    apprx_params_train = deepcopy(approximator_params)
    apprx_params_target = deepcopy(approximator_params)

    self._initialize_regressors(approximator, apprx_params_train,
                                apprx_params_target)
    policy.set_q(self.approximator)

    self._add_save_attr(
        _fit_params='pickle',
        _batch_size='mushroom',
        _n_approximators='primitive',
        _clip_reward='primitive',
        _target_update_frequency='primitive',
        _replay_memory='mushroom',
        _n_updates='primitive',
        approximator='mushroom',
        target_approximator='mushroom'
    )

    super().__init__(mdp_info, policy)
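# Usage sketch (illustrative, not part of the library source): a small DQN on
# a discrete-action control task, assuming the constructor above is shared by
# MushroomRL's DQN variants. Import paths, the environment name, the network
# and the hyperparameters are assumptions.
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from mushroom_rl.algorithms.value import DQN                         # assumed path
from mushroom_rl.approximators.parametric import TorchApproximator   # assumed path
from mushroom_rl.environments import Gym                             # assumed path
from mushroom_rl.policy import EpsGreedy                             # assumed path
from mushroom_rl.utils.parameters import LinearParameter             # assumed path


class QNetwork(nn.Module):
    def __init__(self, input_shape, output_shape, n_features=64, **kwargs):
        super().__init__()
        self._h = nn.Linear(input_shape[-1], n_features)
        self._out = nn.Linear(n_features, output_shape[0])

    def forward(self, state, action=None):
        q = self._out(torch.relu(self._h(state.float())))
        if action is None:
            return q
        return torch.squeeze(q.gather(1, action.long()))


mdp = Gym('CartPole-v1', horizon=500, gamma=.99)
pi = EpsGreedy(epsilon=LinearParameter(1., .1, n=10000))

approximator_params = dict(network=QNetwork,
                           input_shape=mdp.info.observation_space.shape,
                           output_shape=(mdp.info.action_space.n,),
                           n_actions=mdp.info.action_space.n,
                           optimizer={'class': optim.Adam,
                                      'params': {'lr': 1e-3}},
                           loss=F.smooth_l1_loss)

agent = DQN(mdp.info, pi, TorchApproximator, approximator_params,
            batch_size=32, target_update_frequency=250,
            initial_replay_size=500, max_replay_size=5000)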
def __init__(self, mdp_info, policy, critic_params, ent_coeff=0., max_kl=.001,
             lam=1., n_epochs_line_search=10, n_epochs_cg=10, cg_damping=1e-2,
             cg_residual_tol=1e-10, critic_fit_params=None):
    """
    Constructor.

    Args:
        policy (TorchPolicy): torch policy to be learned by the algorithm;
        critic_params (dict): parameters of the critic approximator to build;
        ent_coeff ([float, Parameter], 0): coefficient for the entropy penalty;
        max_kl ([float, Parameter], .001): maximum KL divergence allowed for
            each policy update;
        lam ([float, Parameter], 1.): lambda coefficient used by generalized
            advantage estimation;
        n_epochs_line_search ([int, Parameter], 10): maximum number of
            iterations of the line search algorithm;
        n_epochs_cg ([int, Parameter], 10): maximum number of iterations of
            the conjugate gradient algorithm;
        cg_damping ([float, Parameter], 1e-2): damping factor for the
            conjugate gradient algorithm;
        cg_residual_tol ([float, Parameter], 1e-10): conjugate gradient
            residual tolerance;
        critic_fit_params (dict, None): parameters of the fitting algorithm of
            the critic approximator.

    """
    self._critic_fit_params = dict(n_epochs=5) if critic_fit_params is None \
        else critic_fit_params

    self._n_epochs_line_search = to_parameter(n_epochs_line_search)
    self._n_epochs_cg = to_parameter(n_epochs_cg)
    self._cg_damping = to_parameter(cg_damping)
    self._cg_residual_tol = to_parameter(cg_residual_tol)

    self._max_kl = to_parameter(max_kl)
    self._ent_coeff = to_parameter(ent_coeff)

    self._lambda = to_parameter(lam)

    self._V = Regressor(TorchApproximator, **critic_params)

    self._iter = 1
    self._old_policy = None

    self._add_save_attr(_critic_fit_params='pickle',
                        _n_epochs_line_search='mushroom',
                        _n_epochs_cg='mushroom',
                        _cg_damping='mushroom',
                        _cg_residual_tol='mushroom',
                        _max_kl='mushroom',
                        _ent_coeff='mushroom',
                        _lambda='mushroom',
                        _V='mushroom',
                        _old_policy='mushroom',
                        _iter='primitive')

    super().__init__(mdp_info, policy, None)
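# Usage sketch (illustrative, not part of the library source): building the
# trust-region policy optimization algorithm above, assuming it is MushroomRL's
# TRPO class. Import paths, the environment name, the `Network` torso and the
# hyperparameters are assumptions.
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from mushroom_rl.algorithms.actor_critic import TRPO   # assumed import path
from mushroom_rl.policy import GaussianTorchPolicy     # assumed import path
from mushroom_rl.environments import Gym               # assumed import path


class Network(nn.Module):
    def __init__(self, input_shape, output_shape, n_features=32, **kwargs):
        super().__init__()
        self._h = nn.Linear(input_shape[-1], n_features)
        self._out = nn.Linear(n_features, output_shape[0])

    def forward(self, state, **kwargs):
        return self._out(torch.relu(self._h(state.float())))


mdp = Gym('Pendulum-v1', horizon=200, gamma=.99)

policy = GaussianTorchPolicy(Network, mdp.info.observation_space.shape,
                             mdp.info.action_space.shape, std_0=1.)
critic_params = dict(network=Network, loss=F.mse_loss,
                     optimizer={'class': optim.Adam, 'params': {'lr': 3e-4}},
                     input_shape=mdp.info.observation_space.shape,
                     output_shape=(1,))

agent = TRPO(mdp.info, policy, critic_params, max_kl=1e-2, lam=.95)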