Example #1
    def __init__(self, *args, **kwargs):
        """
        Constructor

        :param expl_std_init: initial standard deviation for the exploration strategy
        :param args: forwarded to the superclass constructor
        :param kwargs: forwarded to the superclass constructor
        """
        # Preprocess inputs and call HC's constructor
        expl_std_init = kwargs.pop('expl_std_init')
        if 'expl_r_init' in kwargs:
            # Ignored; this makes it possible to use one common hyper-parameter list for HCNormal and HCHyper
            kwargs.pop('expl_r_init')

        # Get from kwargs with default values
        expl_std_min = kwargs.pop('expl_std_min', 0.01)

        # Call HC's constructor
        super().__init__(*args, **kwargs)

        self._expl_strat = NormalParamNoise(
            param_dim=self._policy.num_param,
            std_init=expl_std_init,
            std_min=expl_std_min,
        )
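The snippet above is the body of `HCNormal.__init__` (Example #2 shows the full class). The following self-contained sketch just illustrates the kwargs-popping pattern; `Base` and `Derived` are hypothetical stand-ins for `HC` and `HCNormal`:

class Base:
    def __init__(self, max_iter: int):
        self.max_iter = max_iter

class Derived(Base):
    def __init__(self, *args, **kwargs):
        self.expl_std_init = kwargs.pop('expl_std_init')      # required keyword
        kwargs.pop('expl_r_init', None)                       # ignored if present
        self.expl_std_min = kwargs.pop('expl_std_min', 0.01)  # optional with default
        super().__init__(*args, **kwargs)                     # forward the rest

algo = Derived(max_iter=50, expl_std_init=1.0, expl_r_init=0.5)
assert algo.expl_std_init == 1.0 and algo.expl_std_min == 0.01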
Example #2
class HCNormal(HC):
    """ Hill Climbing variant using an exploration strategy with normally distributed noise on the policy parameters """
    def __init__(self, *args, **kwargs):
        """
        Constructor

        :param expl_std_init: initial standard deviation for the exploration strategy
        :param args: forwarded to the superclass constructor
        :param kwargs: forwarded to the superclass constructor
        """
        # Preprocess inputs and call HC's constructor
        expl_std_init = kwargs.pop('expl_std_init')
        if 'expl_r_init' in kwargs:
            # Ignored; this makes it possible to use one common hyper-parameter list for HCNormal and HCHyper
            kwargs.pop('expl_r_init')

        # Get from kwargs with default values
        expl_std_min = kwargs.pop('expl_std_min', 0.01)

        # Call HC's constructor
        super().__init__(*args, **kwargs)

        self._expl_strat = NormalParamNoise(
            param_dim=self._policy.num_param,
            std_init=expl_std_init,
            std_min=expl_std_min,
        )

    def update_expl_strat(self, rets_avg_ros: np.ndarray, ret_avg_curr: float):
        # Update the exploration distribution
        if np.max(rets_avg_ros) > ret_avg_curr:
            new_std = self._expl_strat.std / self.expl_factor
        else:
            new_std = self._expl_strat.std * self.expl_factor

        self._expl_strat.adapt(std=new_std)

        # self.logger.add_value('expl strat std', self.expl_strat.std.data.detach().numpy())
        self.logger.add_value('min expl strat std', to.min(self._expl_strat.std))
        self.logger.add_value('avg expl strat std', to.mean(self._expl_strat.std.data).numpy())
        self.logger.add_value('max expl strat std', to.max(self._expl_strat.std))
        self.logger.add_value('expl strat entropy', np.mean(self._expl_strat.get_entropy().detach().numpy()))
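The update rule in `update_expl_strat` is purely multiplicative: the noise shrinks when a perturbed candidate beats the current policy and grows otherwise. A self-contained sketch with made-up numbers, assuming `expl_factor` is set in `HC`'s constructor:

import numpy as np

expl_factor = 1.1
std = 1.0
rets_avg_ros = np.array([0.8, 1.2, 0.9])  # average returns of the perturbed candidates
ret_avg_curr = 1.0                        # average return of the current policy

if np.max(rets_avg_ros) > ret_avg_curr:
    std /= expl_factor  # a candidate was better: exploit, reduce exploration
else:
    std *= expl_factor  # no candidate was better: increase exploration
print(std)  # 0.909..., since 1.2 > 1.0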
Example #3
def test_noise_on_param(env: SimEnv, policy: Policy):
    for _ in range(5):
        # Init the exploration strategy
        param_noise_strat = NormalParamNoise(
            policy.num_param,
            full_cov=True,
            std_init=1.0,
            std_min=0.01,
            train_mean=True,
            use_cuda=policy.device != "cpu",
        )

        # Set new parameters for the exploration noise
        mean = to.rand(policy.num_param)
        cov = to.eye(policy.num_param)
        param_noise_strat.adapt(mean, cov)
        to.testing.assert_allclose(mean, param_noise_strat.noise.mean)

        # Reset exploration strategy
        param_noise_strat.reset_expl_params()

        # Sample a random observation from the environment
        obs = to.from_numpy(env.obs_space.sample_uniform()).to(dtype=to.get_default_dtype())

        # Get a clean and a noisy action
        act = policy(obs)  # policy expects Tensors
        sampled_param = param_noise_strat.sample_param_set(policy.param_values)
        new_policy = deepcopy(policy)
        new_policy.param_values = sampled_param
        act_noisy = new_policy(obs)  # action from the policy with noise-perturbed parameters

        assert isinstance(act, to.Tensor)
        assert not to.equal(act, act_noisy)
Example #4
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 max_iter: int,
                 pop_size: Optional[int],
                 num_rollouts: int,
                 num_is_samples: int,
                 expl_std_init: float,
                 expl_std_min: float = 0.01,
                 symm_sampling: bool = False,
                 num_workers: int = 4,
                 logger: Optional[StepLogger] = None):
        r"""
        Constructor

        :param save_dir: directory to save the snapshots, i.e. the results, in
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param pop_size: number of solutions in the population
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param num_rollouts: number of rollouts per policy sample
        :param num_is_samples: number of samples (policy parameter sets & returns) for importance sampling
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not isinstance(policy, LinearPolicy):
            print_cbt_once('PoWER was designed for linear policies.', 'y')

        # Call ParameterExploring's constructor
        super().__init__(
            save_dir,
            env,
            policy,
            max_iter,
            num_rollouts,
            pop_size=pop_size,
            num_workers=num_workers,
            logger=logger,
        )

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            full_cov=True,
            std_init=expl_std_init,
            std_min=expl_std_min,
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            if self.pop_size % 2 != 0:
                # Symmetric buffer needs to have an even number of samples
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        # Initialize memory for importance sampling
        self.num_is_samples = min(pop_size, num_is_samples)
        # The returns memory has to be initialized > 0 due to the first covariance update
        self.is_mem_ret = 1e-6 * to.ones(self.num_is_samples)
        self.is_mem_params = to.zeros(self.num_is_samples, self._policy.num_param)
        self.is_mem_W = to.zeros(self.num_is_samples, self._policy.num_param, self._policy.num_param)
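The three buffers above make up the importance-sampling memory: returns, the associated parameter sets, and per-sample weight matrices. A hypothetical sketch of how such a memory could be refreshed, keeping only the `num_is_samples` parameter sets with the highest returns (PoWER's actual update may differ):

import torch as to

num_is_samples, num_param = 5, 3
rets = to.rand(20)                # returns of the current population
params = to.randn(20, num_param)  # corresponding policy parameter sets

idcs = to.argsort(rets, descending=True)[:num_is_samples]  # best first
is_mem_ret = rets[idcs].clamp(min=1e-6)  # keep the returns strictly positive
is_mem_params = params[idcs]
print(is_mem_ret.shape, is_mem_params.shape)  # torch.Size([5]) torch.Size([5, 3])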
Example #5
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 max_iter: int,
                 num_rollouts: int,
                 expl_std_init: float,
                 expl_std_min: float = 0.01,
                 pop_size: Optional[int] = None,
                 clip_ratio_std: float = 0.05,
                 normalize_update: bool = False,
                 transform_returns: bool = True,
                 lr: float = 5e-4,
                 num_workers: int = 4,
                 logger: Optional[StepLogger] = None):
        r"""
        Constructor

        :param save_dir: directory to save the snapshots, i.e. the results, in
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param pop_size: number of solutions in the population
        :param num_rollouts: number of rollouts per policy sample
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param clip_ratio_std: maximal ratio for the change of the exploration strategy's standard deviation
        :param normalize_update: pass `True` to normalize the update of the policy parameters
        :param transform_returns: use a rank-transformation of the returns to update the policy
        :param lr: learning rate
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        # Call ParameterExploring's constructor
        super().__init__(save_dir,
                         env,
                         policy,
                         max_iter,
                         num_rollouts,
                         pop_size=pop_size,
                         num_workers=num_workers,
                         logger=logger)

        # Store the inputs
        self.clip_ratio_std = clip_ratio_std
        self.normalize_update = normalize_update
        self.transform_returns = transform_returns
        self.lr = lr

        # Exploration strategy based on symmetrical normally distributed noise
        if self.pop_size % 2 != 0:
            # Symmetric buffer needs to have an even number of samples
            self.pop_size += 1
        self._expl_strat = SymmParamExplStrat(
            NormalParamNoise(
                self._policy.num_param,
                std_init=expl_std_init,
                std_min=expl_std_min,
            ))

        self.optim = to.optim.SGD([{'params': self._policy.parameters()}], lr=lr, momentum=0.8, dampening=0.1)
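The even-population check exists because symmetric sampling mirrors every perturbation. A plain-torch illustration of the idea behind `SymmParamExplStrat` (not the library's implementation): for every noise vector +eps the population also contains -eps, so the noise cancels in expectation:

import torch as to

num_param, half_pop = 4, 3  # pop_size = 2 * half_pop, hence the even-size requirement
nominal = to.zeros(num_param)
eps = to.randn(half_pop, num_param)
population = nominal + to.cat([eps, -eps], dim=0)  # mirrored perturbations

assert population.shape == (2 * half_pop, num_param)
assert population.mean(0).abs().max() < 1e-5  # mirrored noise cancels out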
Example #6
    def __init__(
        self,
        save_dir: pyrado.PathLike,
        env: Env,
        policy: Policy,
        max_iter: int,
        pop_size: Optional[int],
        num_init_states_per_domain: int,
        num_is_samples: int,
        expl_std_init: float,
        expl_std_min: float = 0.01,
        num_domains: int = 1,
        symm_sampling: bool = False,
        num_workers: int = 4,
        logger: Optional[StepLogger] = None,
    ):
        r"""
        Constructor

        :param save_dir: directory to save the snapshots, i.e. the results, in
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param pop_size: number of solutions in the population
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param num_init_states_per_domain: number of rollouts to cover the variance over initial states
        :param num_domains: number of rollouts due to the variance over domain parameters
        :param num_is_samples: number of samples (policy parameter sets & returns) for importance sampling
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        # Call ParameterExploring's constructor
        super().__init__(
            save_dir=save_dir,
            env=env,
            policy=policy,
            max_iter=max_iter,
            num_init_states_per_domain=num_init_states_per_domain,
            num_domains=num_domains,
            pop_size=pop_size,
            num_workers=num_workers,
            logger=logger,
        )

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            full_cov=True,
            std_init=expl_std_init,
            std_min=expl_std_min,
            use_cuda=policy.device != "cpu",
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            if self.pop_size % 2 != 0:
                # Symmetric buffer needs to have an even number of samples
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        # Initialize memory for importance sampling
        self._bound_lo_ret = 1e-3  # the returns must not be negative, clip them to this value if so
        self.num_is_samples = min(pop_size, num_is_samples)
        # The returns memory has to be initialized > 0 due to the first covariance update
        self.is_mem_ret = self._bound_lo_ret * to.ones(self.num_is_samples)
        self.is_mem_params = to.zeros(self.num_is_samples, self._policy.num_param)
        self.is_mem_W = to.zeros(self.num_is_samples, self._policy.num_param, self._policy.num_param)
Example #7
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 distribution,
                 max_iter: int,
                 num_rollouts: int,
                 expl_std_init: float,
                 expl_std_min: float = 0.01,
                 pop_size: int = None,
                 clip_ratio_std: float = 0.05,
                 normalize_update: bool = False,
                 transform_returns: bool = True,
                 num_sampler_envs: int = 4,
                 n_mc_samples_gradient: int = 1,
                 coupling: bool = True,
                 real_env: bool = False,
                 lr: float = 5e-4,
                 optim: str = 'SGD',
                 base_seed: Optional[int] = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots, i.e. the results, in
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param pop_size: number of solutions in the population
        :param num_rollouts: number of rollouts per policy sample
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param clip_ratio_std: maximal ratio for the change of the exploration strategy's standard deviation
        :param transform_returns: use a rank-transformation of the returns to update the policy
        :param num_sampler_envs: number of environments for parallel sampling
        :param lr: learning rate
        :param optim: name of the optimizer, either 'SGD' or 'Adam'
        :param base_seed: seed added to all other seeds in order to make the experiments distinct but repeatable
        """
        # Call ParameterExploring's constructor
        super().__init__(
            save_dir,
            env,
            policy,
            max_iter,
            num_rollouts,
            pop_size=pop_size,
            base_seed=base_seed,
            num_sampler_envs=num_sampler_envs,
        )

        self._distribution = distribution
        self._dims = distribution.get_number_of_dims()

        self._n_mc_samples_gradient = n_mc_samples_gradient
        self._coupling = coupling

        self._real_env = real_env

        # Store the inputs
        self.clip_ratio_std = clip_ratio_std
        self.normalize_update = normalize_update
        self.transform_returns = transform_returns
        self.lr = lr

        # Exploration strategy based on symmetrical normally distributed noise
        if self.pop_size % 2 != 0:
            # Symmetric buffer needs to have an even number of samples
            self.pop_size += 1
        self._expl_strat = SymmParamExplStrat(
            NormalParamNoise(
                self._policy.num_param,
                std_init=expl_std_init,
                std_min=expl_std_min,
            ))

        if optim == 'SGD':
            self.optim = to.optim.SGD([{'params': self._policy.parameters()}], lr=lr, momentum=0.8, dampening=0.1)
        elif optim == 'Adam':
            # self.optim = to.optim.Adam([{'params': self._policy.parameters()}], lr=lr)
            self.optim = to.optim.Adam([{'params': self._distribution.get_params()}], lr=lr)
        else:
            raise NotImplementedError

        self._iter = 0
Example #8
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 max_iter: int,
                 pop_size: Optional[int],
                 num_rollouts: int,
                 num_is_samples: int,
                 expl_std_init: float,
                 expl_std_min: float = 0.01,
                 extra_expl_std_init: float = 0.,
                 extra_expl_decay_iter: int = 10,
                 full_cov: bool = False,
                 symm_sampling: bool = False,
                 num_workers: int = 4,
                 logger: Optional[StepLogger] = None):
        r"""
        Constructor

        :param save_dir: directory to save the snapshots, i.e. the results, in
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param pop_size: number of solutions in the population
        :param num_rollouts: number of rollouts per policy sample
        :param num_is_samples: number of samples (policy parameter sets & returns) for importance sampling,
                               indirectly specifies the performance quantile $1 - \rho$ [1]
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param extra_expl_std_init: additional standard deviation for the parameter exploration added to the diagonal
                                    entries of the covariance matrix, set to 0 to disable this functionality
        :param extra_expl_decay_iter: limit for the linear decay of the additional standard deviation, i.e. last
                                      iteration in which the additional exploration noise is applied
        :param full_cov: pass `True` to compute a full covariance matrix for sampling the next policy parameter values,
                         else a diagonal covariance is used
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not extra_expl_std_init >= 0:
            raise pyrado.ValueErr(given=extra_expl_std_init, ge_constraint='0')
        if not extra_expl_decay_iter > 0:
            raise pyrado.ValueErr(given=extra_expl_decay_iter,
                                  g_constraint='0')

        # Call ParameterExploring's constructor
        super().__init__(
            save_dir,
            env,
            policy,
            max_iter,
            num_rollouts,
            pop_size=pop_size,
            num_workers=num_workers,
            logger=logger,
        )

        if not num_is_samples <= pop_size:
            raise pyrado.ValueErr(given=num_is_samples, le_constraint=pop_size)
        self.num_is_samples = int(num_is_samples)

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            full_cov=full_cov,
            std_init=expl_std_init,
            std_min=expl_std_min,
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            if self.pop_size % 2 != 0:
                # Symmetric buffer needs to have an even number of samples
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        # Optionally add additional entropy
        self.extra_expl_decay_iter = extra_expl_decay_iter
        if isinstance(self._expl_strat.noise, DiagNormalNoise):
            self.extra_expl_std_init = to.ones_like(self._policy.param_values) * extra_expl_std_init
        elif isinstance(self._expl_strat.noise, FullNormalNoise):
            self.extra_expl_std_init = to.eye(self._policy.num_param) * extra_expl_std_init
        else:
            raise pyrado.TypeErr(msg='Additional exploration entropy is only implemented for Gaussian distributions, '
                                     'i.e. DiagNormalNoise and FullNormalNoise')
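Per the docstring, the additional exploration noise decays linearly and is switched off after `extra_expl_decay_iter` iterations. A short sketch of such a schedule; the exact decay used in the algorithm's update step is an assumption here:

import torch as to

extra_expl_std_init = 0.5 * to.ones(4)  # diagonal case, as constructed above
extra_expl_decay_iter = 10

for curr_iter in range(12):
    decay = max(1.0 - curr_iter / extra_expl_decay_iter, 0.0)
    extra_std = extra_expl_std_init * decay  # zero from iteration 10 onwards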
Example #9
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 max_iter: int,
                 num_rollouts: int,
                 expl_std_init: float,
                 expl_std_min: float = 0.01,
                 pop_size: Optional[int] = None,
                 eta_mean: float = 1.,
                 eta_std: Optional[float] = None,
                 symm_sampling: bool = False,
                 transform_returns: bool = True,
                 num_workers: int = 4,
                 logger: Optional[StepLogger] = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots, i.e. the results, in
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param num_rollouts: number of rollouts per policy sample
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param pop_size: number of solutions in the population
        :param eta_mean: step size factor for the mean
        :param eta_std: step size factor for the standard deviation
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param transform_returns: use a rank-transformation of the returns to update the policy
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        # Call ParameterExploring's constructor
        super().__init__(save_dir,
                         env,
                         policy,
                         max_iter,
                         num_rollouts,
                         pop_size=pop_size,
                         num_workers=num_workers,
                         logger=logger)

        # Store the inputs
        self.transform_returns = transform_returns

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            std_init=expl_std_init,
            std_min=expl_std_min,
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            # Symmetric buffer needs to have an even number of samples
            if self.pop_size % 2 != 0:
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        # Utility coefficients (ignored for transform_returns = False)
        # Use pop_size + 1 since we are also considering the current policy
        eta_std = eta_std if eta_std is not None else (3 + np.log(policy.num_param)) / np.sqrt(self.pop_size + 1) / 5.
        self.eta_mean_util, self.eta_std_util = self.compute_utilities(self.pop_size + 1, eta_mean, eta_std)

        # Learning rates [2]
        # Use pop_size + 1 since we are also considering the current policy
        self.lr_mean = 1. if transform_returns else 1e-2
        self.lr_std = 0.6 * (3 + np.log(self.pop_size + 1)) / 3. / np.sqrt(self.pop_size + 1)
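`compute_utilities` itself is not shown here. A plausible rank-based utility computation in the style of the NES paper [2] looks as follows; the library's exact formula is an assumption. Better-ranked solutions receive positive weight and the utilities sum to zero:

import numpy as np

def rank_utilities(n: int) -> np.ndarray:
    """Zero-sum utilities for ranks 1 (best) to n (worst), NES-style."""
    ranks = np.arange(1, n + 1)
    raw = np.maximum(0.0, np.log(n / 2 + 1) - np.log(ranks))
    return raw / raw.sum() - 1.0 / n

u = rank_utilities(8 + 1)  # pop_size + 1, since the current policy is also evaluated
print(np.round(u, 3), round(u.sum(), 10))  # sums to (numerically) zero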
Example #10
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 max_iter: int,
                 eps: float,
                 gamma: float,
                 num_rollouts: int,
                 pop_size: int,
                 expl_std_init: float,
                 expl_std_min: float = 0.01,
                 symm_sampling: bool = False,
                 num_sampler_envs: int = 4,
                 num_epoch_dual: int = 1000,
                 use_map: bool = False,
                 grad_free_optim: bool = False,
                 lr_dual: float = 5e-4,
                 base_seed: Optional[int] = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots, i.e. the results, in
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param eps: bound on the KL divergence between policy updates, e.g. 0.1
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param gamma: temporal discount factor; equal to 1 - reset probability
        :param pop_size: number of solutions in the population
        :param num_rollouts: number of rollouts per policy sample
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param num_epoch_dual: number of epochs for the minimization of the dual function
        :param use_map: use maximum a-posteriori likelihood (`True`) or maximum likelihood (`False`) update rule
        :param grad_free_optim: use a derivative free optimizer (e.g. golden section search) or a SGD-based optimizer
        :param lr_dual: learning rate for the dual's optimizer (ignored if `grad_free_optim = True`)
        :param base_seed: seed added to all other seeds in order to make the experiments distinct but repeatable
        """
        if not isinstance(policy, LinearPolicy):
            warn('REPS is designed for linear policies only!', UserWarning)

        # Call ParameterExploring's constructor
        super().__init__(
            save_dir,
            env,
            policy,
            max_iter,
            num_rollouts,
            pop_size=pop_size,
            base_seed=base_seed,
            num_sampler_envs=num_sampler_envs,
        )

        # Store the inputs
        self.eps = eps
        self.gamma = gamma
        self.base_seed = base_seed
        self.use_map = use_map

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            full_cov=True,
            std_init=expl_std_init,
            std_min=expl_std_min,
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            if self.pop_size % 2 != 0:
                # Symmetric buffer needs to have an even number of samples
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        self.kappa = to.tensor([0.], requires_grad=True)  # eta = exp(kappa)
        self._exp_min = -700.
        self._exp_max = 700.

        # Dual specific
        if grad_free_optim:
            self.optim_dual = GSS(
                [{'params': self.kappa}], param_min=to.log(to.tensor([1e-4])), param_max=to.log(to.tensor([1e4]))
            )
        else:
            self.optim_dual = to.optim.Adam([{'params': self.kappa}], lr=lr_dual, eps=1e-5)
            # self.optim_dual = to.optim.SGD([{'params': self.kappa}], lr=lr_dual, momentum=0.7, weight_decay=1e-4)
        self.num_epoch_dual = num_epoch_dual
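`kappa` parameterizes the dual variable in log-space (eta = exp(kappa)), which keeps eta positive without an explicit constraint, and `_exp_min`/`_exp_max` bound the exponent at the float64 overflow limit of exp(). A sketch of the standard episodic REPS dual under these conventions (the library's exact dual may differ):

import torch as to

def reps_dual(kappa: to.Tensor, rets: to.Tensor, eps: float) -> to.Tensor:
    eta = to.exp(kappa)  # log-space parameterization keeps eta > 0
    exponent = to.clamp(rets / eta, -700., 700.)  # avoid overflow in exp()
    return eta * eps + eta * to.log(to.mean(to.exp(exponent)))

kappa = to.tensor([0.], requires_grad=True)
loss = reps_dual(kappa, rets=to.randn(16), eps=0.1)
loss.backward()  # gradient w.r.t. kappa, as used by the dual optimizer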
Example #11
    def __init__(
        self,
        save_dir: pyrado.PathLike,
        env: Env,
        policy: Policy,
        max_iter: int,
        eps: float,
        num_init_states_per_domain: int,
        pop_size: Optional[int],
        expl_std_init: float,
        expl_std_min: float = 0.01,
        num_domains: int = 1,
        symm_sampling: bool = False,
        softmax_transform: bool = False,
        use_map: bool = True,
        optim_mode: Optional[str] = "scipy",
        num_epoch_dual: int = 1000,
        lr_dual: float = 5e-4,
        num_workers: int = 4,
        logger: Optional[StepLogger] = None,
    ):
        r"""
        Constructor

        :param save_dir: directory to save the snapshots, i.e. the results, in
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param eps: bound on the KL divergence between policy updates, e.g. 0.1
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param pop_size: number of solutions in the population
        :param num_init_states_per_domain: number of rollouts to cover the variance over initial states
        :param num_domains: number of rollouts due to the variance over domain parameters
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param softmax_transform: pass `True` to use a softmax to transform the returns, else use a shifted exponential
        :param use_map: use maximum a-posteriori likelihood (`True`) or maximum likelihood (`False`) update rule
        :param optim_mode: choose the type of optimizer: 'torch' for a SGD-based optimizer or 'scipy' for the SLSQP
                           optimizer from scipy (recommended)
        :param num_epoch_dual: number of epochs for the minimization of the dual functions, ignored if
                               `optim_mode = 'scipy'`
        :param lr_dual: learning rate for the dual's optimizer, ignored if `optim_mode = 'scipy'`
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not isinstance(policy, (LinearPolicy, DomainDistrParamPolicy)):
            print_cbt_once("REPS was designed for linear policies.", "y")

        # Call ParameterExploring's constructor
        super().__init__(
            save_dir=save_dir,
            env=env,
            policy=policy,
            max_iter=max_iter,
            num_init_states_per_domain=num_init_states_per_domain,
            num_domains=num_domains,
            pop_size=pop_size,
            num_workers=num_workers,
            logger=logger,
        )

        # Store the inputs
        self.eps = eps
        self.softmax_transform = softmax_transform
        self.use_map = use_map

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            full_cov=True,
            std_init=expl_std_init,
            std_min=expl_std_min,
            use_cuda=self._policy.device != "cpu",
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            if self.pop_size % 2 != 0:
                # Symmetric buffer needs to have an even number of samples
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        # Dual optimization
        self.num_epoch_dual = num_epoch_dual
        self._log_eta = to.tensor([0.0], requires_grad=True)
        self.optim_mode = optim_mode.lower()
        if self.optim_mode == "scipy":
            pass
        elif self.optim_mode == "torch":
            self.optim_dual = to.optim.SGD([{"params": self._log_eta}], lr=lr_dual, momentum=0.8, weight_decay=1e-4)
            # self.optim_dual = to.optim.Adam([{'params': self._log_eta}], lr=lr_dual, eps=1e-5)  # used in [2], but unstable here
        else:
            raise pyrado.ValueErr(given=optim_mode, eq_constraint=["scipy", "torch"])
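In the recommended 'scipy' mode no torch optimizer is created; the dual is minimized with SciPy's SLSQP at update time instead. A hedged sketch of what that could look like, reusing the standard episodic-REPS dual from above (hypothetical, not the library's exact code):

import numpy as np
from scipy import optimize

def dual(log_eta: np.ndarray, rets: np.ndarray, eps: float) -> float:
    eta = np.exp(log_eta[0])  # optimize in log-space, matching _log_eta above
    return eta * eps + eta * np.log(np.mean(np.exp(rets / eta)))

rets = np.random.randn(16)  # made-up returns for illustration
res = optimize.minimize(dual, x0=np.array([0.]), args=(rets, 0.1), method='SLSQP')
eta_opt = float(np.exp(res.x[0]))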