Example #1
    def argmax_posterior_mean(cands: to.Tensor, cands_values: to.Tensor,
                              ddp_space: BoxSpace, num_restarts: int,
                              num_samples: int) -> to.Tensor:
        """
        Compute the GP input with the maximal posterior mean.

        :param cands: candidates a.k.a. x
        :param cands_values: observed values a.k.a. y
        :param ddp_space: space of the domain distribution parameters, indicates the lower and upper bound
        :param num_restarts: number of restarts for the optimization of the acquisition function
        :param num_samples: number of samples for the optimization of the acquisition function
        :return: un-normalized candidate with maximum posterior value a.k.a. x
        """
        if not isinstance(cands, to.Tensor):
            raise pyrado.TypeErr(given=cands, expected_type=to.Tensor)
        if not isinstance(cands_values, to.Tensor):
            raise pyrado.TypeErr(given=cands_values, expected_type=to.Tensor)
        if not isinstance(ddp_space, BoxSpace):
            raise pyrado.TypeErr(given=ddp_space, expected_type=BoxSpace)

        # Normalize the input data and standardize the output data
        uc_projector = UnitCubeProjector(
            to.from_numpy(ddp_space.bound_lo).to(dtype=to.get_default_dtype()),
            to.from_numpy(ddp_space.bound_up).to(dtype=to.get_default_dtype()),
        )
        cands_norm = uc_projector.project_to(cands)
        cands_values_stdized = standardize(cands_values)

        if cands_norm.shape[0] > cands_values.shape[0]:
            print_cbt(
                f"There are {cands.shape[0]} candidates but only {cands_values.shape[0]} evaluations. Ignoring "
                f"the candidates without evaluation for computing the argmax.",
                "y",
            )
            cands_norm = cands_norm[:cands_values.shape[0], :]

        # Create and fit the GP model
        gp = SingleTaskGP(cands_norm, cands_values_stdized)
        gp.likelihood.noise_covar.register_constraint("raw_noise",
                                                      GreaterThan(1e-5))
        mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
        fit_gpytorch_model(mll)

        # Find position with maximal posterior mean
        cand_norm, _ = optimize_acqf(
            acq_function=PosteriorMean(gp),
            bounds=to.stack(
                [to.zeros(ddp_space.flat_dim),
                 to.ones(ddp_space.flat_dim)]).to(dtype=to.float32),
            q=1,
            num_restarts=num_restarts,
            raw_samples=num_samples,
        )

        cand_norm = cand_norm.to(dtype=to.get_default_dtype())
        cand = uc_projector.project_back(cand_norm.detach())
        print_cbt(f"Converged to argmax of the posterior mean: {cand.numpy()}",
                  "g",
                  bright=True)
        return cand
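The function above wraps a standard BoTorch pattern: fit a SingleTaskGP on unit-cube inputs and then maximize its posterior mean. Below is a minimal, self-contained sketch of that pattern with made-up toy data, stripped of the pyrado-specific projection and standardization; it only uses library calls that already appear in the example (and assumes a BoTorch version that still ships `fit_gpytorch_model`).

    import torch
    from botorch.models import SingleTaskGP
    from botorch.fit import fit_gpytorch_model
    from botorch.acquisition import PosteriorMean
    from botorch.optim import optimize_acqf
    from gpytorch.constraints import GreaterThan
    from gpytorch.mlls import ExactMarginalLogLikelihood

    # Toy data: 8 candidates in the 2-dim unit cube with noisy observed values (column vector)
    train_x = torch.rand(8, 2, dtype=torch.double)
    train_y = train_x.sum(dim=1, keepdim=True) + 0.01 * torch.randn(8, 1, dtype=torch.double)

    # Fit the GP, bounding the likelihood noise from below as in the example above
    gp = SingleTaskGP(train_x, train_y)
    gp.likelihood.noise_covar.register_constraint('raw_noise', GreaterThan(1e-5))
    mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
    fit_gpytorch_model(mll)

    # Maximize the posterior mean over the unit cube; best_x has shape (1, 2)
    best_x, _ = optimize_acqf(
        acq_function=PosteriorMean(gp),
        bounds=torch.stack([torch.zeros(2, dtype=torch.double), torch.ones(2, dtype=torch.double)]),
        q=1,
        num_restarts=5,
        raw_samples=50,
    )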
Example #2
    def argmax_posterior_mean(cands: to.Tensor, cands_values: to.Tensor,
                              uc_normalizer: UnitCubeProjector,
                              num_restarts: int,
                              num_samples: int) -> to.Tensor:
        """
        Compute the GP input with the maximal posterior mean.

        :param cands: candidates a.k.a. x
        :param cands_values: observed values a.k.a. y
        :param uc_normalizer: unit cube normalizer used during the experiments (can be recovered from the bounds)
        :param num_restarts: number of restarts for the optimization of the acquisition function
        :param num_samples: number of samples for the optimization of the acquisition function
        :return: un-normalized candidate with maximum posterior value a.k.a. x
        """
        # Normalize the input data and standardize the output data
        cands_norm = uc_normalizer.project_to(cands)
        cands_values_stdized = standardize(cands_values)

        # Create and fit the GP model
        gp = SingleTaskGP(cands_norm, cands_values_stdized)
        gp.likelihood.noise_covar.register_constraint('raw_noise',
                                                      GreaterThan(1e-5))
        mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
        fit_gpytorch_model(mll)

        # Find position with maximal posterior mean
        cand_norm, acq_value = optimize_acqf(
            acq_function=PosteriorMean(gp),
            bounds=to.stack([
                to.zeros_like(uc_normalizer.bound_lo),
                to.ones_like(uc_normalizer.bound_up)
            ]),
            q=1,
            num_restarts=num_restarts,
            raw_samples=num_samples)

        cand = uc_normalizer.project_back(cand_norm.detach())
        print_cbt(f'Converged to argmax of the posterior mean\n{cand.numpy()}',
                  'g',
                  bright=True)
        return cand
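As the docstring notes, the unit cube normalizer can be recovered from the bounds saved with the experiment. A short sketch, assuming a `bounds.pt` file in the `[lower, upper]` row format used by `train_argmax_policy` in the next example; the import path of `UnitCubeProjector` and the directory name are assumptions.

    import os.path as osp
    import torch as to
    from pyrado.utils.normalizing import UnitCubeProjector  # assumed import path

    load_dir = './experiments/bayrn_qq-su'  # hypothetical experiment directory
    bounds = to.load(osp.join(load_dir, 'bounds.pt'))  # shape (2, d): row 0 = lower, row 1 = upper
    uc_normalizer = UnitCubeProjector(bounds[0, :], bounds[1, :])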
Example #3
    def train_argmax_policy(load_dir: str,
                            env_sim: MetaDomainRandWrapper,
                            subroutine: Algorithm,
                            num_restarts: int,
                            num_samples: int,
                            policy_param_init: to.Tensor = None,
                            valuefcn_param_init: to.Tensor = None) -> Policy:
        """
        Train a policy based on the maximizer of the posterior mean.

        :param load_dir: directory to load from
        :param env_sim: simulation environment
        :param subroutine: algorithm which performs the policy / value-function optimization
        :param num_restarts: number of restarts for the optimization of the acquisition function
        :param num_samples: number of samples for the optimization of the acquisition function
        :param policy_param_init: initial policy parameter values for the subroutine, set `None` to be random
        :param valuefcn_param_init: initial value function parameter values for the subroutine, set `None` to be random
        :return: the final BayRn policy
        """
        # Load the required data
        cands = to.load(osp.join(load_dir, 'candidates.pt'))
        cands_values = to.load(osp.join(load_dir,
                                        'candidates_values.pt')).unsqueeze(1)
        bounds = to.load(osp.join(load_dir, 'bounds.pt'))
        uc_normalizer = UnitCubeProjector(bounds[0, :], bounds[1, :])

        # Find the maximizer
        argmax_cand = BayRn.argmax_posterior_mean(cands, cands_values,
                                                  uc_normalizer, num_restarts,
                                                  num_samples)

        # Set the domain randomizer given the hyper-parameters
        env_sim.adapt_randomizer(argmax_cand.numpy())

        # Reset the subroutine's algorithm which includes resetting the exploration
        subroutine.reset()

        # Reset the subroutine's policy (and value function)
        subroutine.policy.init_param(policy_param_init)
        if isinstance(subroutine, ActorCritic):
            subroutine.critic.value_fcn.init_param(valuefcn_param_init)
        if policy_param_init is None:
            print_cbt('Learning the argmax solution from scratch', 'y')
        else:
            print_cbt('Learning the argmax solution given an initialization',
                      'y')

        subroutine.train(
            snapshot_mode='best')  # meta_info=dict(prefix='final')
        return subroutine.policy
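Note the `.unsqueeze(1)` above: the observed returns are stored as a 1-D tensor, while the GP inside `argmax_posterior_mean` expects a column vector. A small sketch of that loading step, with a placeholder experiment directory:

    import os.path as osp
    import torch as to

    load_dir = './experiments/bayrn_qq-su'  # hypothetical experiment directory
    cands = to.load(osp.join(load_dir, 'candidates.pt'))                # shape (n, d)
    cands_values = to.load(osp.join(load_dir, 'candidates_values.pt'))  # shape (n,)
    cands_values = cands_values.unsqueeze(1)                            # shape (n, 1), as the GP expects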
Example #4
    def __init__(self,
                 save_dir: str,
                 env_sim: MetaDomainRandWrapper,
                 env_real: [RealEnv, EnvWrapper],
                 subrtn: Algorithm,
                 ddp_space: BoxSpace,
                 max_iter: int,
                 acq_fc: str,
                 acq_restarts: int,
                 acq_samples: int,
                 acq_param: dict = None,
                 num_init_cand: int = 5,
                 mc_estimator: bool = True,
                 num_eval_rollouts_real: int = 5,
                 num_eval_rollouts_sim: int = 50,
                 thold_succ: float = pyrado.inf,
                 thold_succ_subrtn: float = -pyrado.inf,
                 warmstart: bool = True,
                 policy_param_init: Optional[to.Tensor] = None,
                 valuefcn_param_init: Optional[to.Tensor] = None,
                 subrtn_snapshot_mode: str = 'best',
                 logger: Optional[StepLogger] = None):
        """
        Constructor

        .. note::
            If you want to continue an experiment, use the `load_dir` argument for the `train` call. If you want to
            initialize each of the policies with pre-trained policy parameters, use `policy_param_init`.

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env_sim: randomized simulation environment a.k.a. source domain
        :param env_real: real-world environment a.k.a. target domain
        :param subrtn: algorithm which performs the policy / value-function optimization
        :param ddp_space: space holding the boundaries for the domain distribution parameters
        :param max_iter: maximum number of iterations
        :param acq_fc: Acquisition Function
                       'UCB': Upper Confidence Bound (default $\beta = 0.1$)
                       'EI': Expected Improvement
                       'PI': Probability of Improvement
        :param acq_restarts: number of restarts for optimizing the acquisition function
        :param acq_samples: number of initial samples for optimizing the acquisition function
        :param acq_param: hyper-parameter for the acquisition function, e.g. $\beta$ for UCB
        :param num_init_cand: number of initial policies to train, ignored if `init_dir` is provided
        :param mc_estimator: estimate the return with a sample average (`True`) or a lower confidence
                                     bound (`False`) obtained from bootstrapping
        :param num_eval_rollouts_real: number of rollouts in the target domain to estimate the return
        :param num_eval_rollouts_sim: number of rollouts in simulation to estimate the return after training
        :param thold_succ: success threshold on the real system's return for BayRn, stop the algorithm if exceeded
        :param thold_succ_subrtn: success threshold on the simulated system's return for the subroutine, repeat the
                                  subroutine until the threshold is exceeded or for a given number of iterations
        :param warmstart: initialize the policy parameters with the ones of the previous iteration. This option has no
                          effect for the initial policies and can be overruled by passing initial policy params explicitly.
        :param policy_param_init: initial policy parameter values for the subroutine, set `None` to be random
        :param valuefcn_param_init: initial value function parameter values for the subroutine, set `None` to be random
        :param subrtn_snapshot_mode: snapshot mode for saving during training of the subroutine
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if typed_env(env_sim, MetaDomainRandWrapper) is None:
            raise pyrado.TypeErr(given=env_sim, expected_type=MetaDomainRandWrapper)
        if not isinstance(subrtn, Algorithm):
            raise pyrado.TypeErr(given=subrtn, expected_type=Algorithm)
        if not isinstance(ddp_space, BoxSpace):
            raise pyrado.TypeErr(given=ddp_space, expected_type=BoxSpace)
        if num_init_cand < 1:
            raise pyrado.ValueErr(given=num_init_cand, ge_constraint='1')

        # Call InterruptableAlgorithm's constructor, re-using the subroutine's policy
        super().__init__(num_checkpoints=2, init_checkpoint=-2, save_dir=save_dir, max_iter=max_iter,
                         policy=subrtn.policy, logger=logger)

        self._env_sim = env_sim
        self._env_real = env_real
        self._subrtn = subrtn
        self._subrtn.save_name = 'subrtn'
        self.ddp_space = ddp_space
        self.ddp_projector = UnitCubeProjector(to.from_numpy(self.ddp_space.bound_lo),
                                               to.from_numpy(self.ddp_space.bound_up))
        self.cands = None  # called x in the context of GPs
        self.cands_values = None  # called y in the context of GPs
        self.argmax_cand = to.Tensor()
        self.acq_fcn_type = acq_fc.upper()
        self.acq_restarts = acq_restarts
        self.acq_samples = acq_samples
        self.acq_param = acq_param
        self.num_init_cand = num_init_cand
        self.mc_estimator = mc_estimator
        self.policy_param_init = policy_param_init
        self.valuefcn_param_init = valuefcn_param_init.detach() if valuefcn_param_init is not None else None
        self.warmstart = warmstart
        self.num_eval_rollouts_real = num_eval_rollouts_real
        self.num_eval_rollouts_sim = num_eval_rollouts_sim
        self.subrtn_snapshot_mode = subrtn_snapshot_mode
        self.thold_succ = to.tensor([thold_succ])
        self.thold_succ_subrtn = to.tensor([thold_succ_subrtn])
        self.max_subrtn_rep = 3  # number of tries to exceed thold_succ_subrtn during training in simulation
        self.curr_cand_value = -pyrado.inf  # for the stopping criterion

        if self.policy_param_init is not None:
            if to.is_tensor(self.policy_param_init):
                self.policy_param_init = self.policy_param_init.detach()
            else:
                self.policy_param_init = to.tensor(self.policy_param_init)

        # Save initial environments and the domain distribution parameter space
        self.save_snapshot(meta_info=None)
        pyrado.save(self.ddp_space, 'ddp_space', 'pkl', self.save_dir)
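A hedged sketch of the `ddp_space` this constructor expects and of the unit cube projection it sets up, using made-up bounds for two domain distribution parameters; the `BoxSpace` and `UnitCubeProjector` import paths are assumptions.

    import numpy as np
    import torch as to
    from pyrado.spaces import BoxSpace                       # assumed import path
    from pyrado.utils.normalizing import UnitCubeProjector   # assumed import path

    # Made-up bounds, e.g. for the mean and the std of a randomized mass
    ddp_space = BoxSpace(np.array([0.1, 0.01]), np.array([2.0, 0.5]))

    # The constructor builds this projector to map candidates into the unit cube for the GP
    ddp_projector = UnitCubeProjector(to.from_numpy(ddp_space.bound_lo), to.from_numpy(ddp_space.bound_up))
    x = to.from_numpy(ddp_space.sample_uniform())   # one candidate in the original scale
    x_norm = ddp_projector.project_to(x)            # mapped into [0, 1]^d
    x_back = ddp_projector.project_back(x_norm)     # mapped back to the original scale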
Example #5
class BayRn(InterruptableAlgorithm):
    """
    Bayesian Domain Randomization (BayRn)

    .. note::
        A candidate is a set of parameter values for the domain parameter distribution and its value is the
        (estimated) real-world return.

    .. seealso::
        F. Muratore, C. Eilers, M. Gienger, J. Peters, "Bayesian Domain Randomization for Sim-to-Real Transfer",
        arXiv, 2020
    """

    name: str = 'bayrn'
    iteration_key: str = 'bayrn_iteration'  # logger's iteration key

    def __init__(self,
                 save_dir: str,
                 env_sim: MetaDomainRandWrapper,
                 env_real: [RealEnv, EnvWrapper],
                 subrtn: Algorithm,
                 ddp_space: BoxSpace,
                 max_iter: int,
                 acq_fc: str,
                 acq_restarts: int,
                 acq_samples: int,
                 acq_param: dict = None,
                 num_init_cand: int = 5,
                 mc_estimator: bool = True,
                 num_eval_rollouts_real: int = 5,
                 num_eval_rollouts_sim: int = 50,
                 thold_succ: float = pyrado.inf,
                 thold_succ_subrtn: float = -pyrado.inf,
                 warmstart: bool = True,
                 policy_param_init: Optional[to.Tensor] = None,
                 valuefcn_param_init: Optional[to.Tensor] = None,
                 subrtn_snapshot_mode: str = 'best',
                 logger: Optional[StepLogger] = None):
        """
        Constructor

        .. note::
            If you want to continue an experiment, use the `load_dir` argument for the `train` call. If you want to
            initialize each of the policies with pre-trained policy parameters, use `policy_param_init`.

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env_sim: randomized simulation environment a.k.a. source domain
        :param env_real: real-world environment a.k.a. target domain
        :param subrtn: algorithm which performs the policy / value-function optimization
        :param ddp_space: space holding the boundaries for the domain distribution parameters
        :param max_iter: maximum number of iterations
        :param acq_fc: Acquisition Function
                       'UCB': Upper Confidence Bound (default $\beta = 0.1$)
                       'EI': Expected Improvement
                       'PI': Probability of Improvement
        :param acq_restarts: number of restarts for optimizing the acquisition function
        :param acq_samples: number of initial samples for optimizing the acquisition function
        :param acq_param: hyper-parameter for the acquisition function, e.g. $\beta$ for UCB
        :param num_init_cand: number of initial policies to train, ignored if `init_dir` is provided
        :param mc_estimator: estimate the return with a sample average (`True`) or a lower confidence
                                     bound (`False`) obtained from bootstrapping
        :param num_eval_rollouts_real: number of rollouts in the target domain to estimate the return
        :param num_eval_rollouts_sim: number of rollouts in simulation to estimate the return after training
        :param thold_succ: success threshold on the real system's return for BayRn, stop the algorithm if exceeded
        :param thold_succ_subrtn: success threshold on the simulated system's return for the subroutine, repeat the
                                  subroutine until the threshold is exceeded or for a given number of iterations
        :param warmstart: initialize the policy parameters with the ones of the previous iteration. This option has no
                          effect for the initial policies and can be overruled by passing initial policy params explicitly.
        :param policy_param_init: initial policy parameter values for the subroutine, set `None` to be random
        :param valuefcn_param_init: initial value function parameter values for the subroutine, set `None` to be random
        :param subrtn_snapshot_mode: snapshot mode for saving during training of the subroutine
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if typed_env(env_sim, MetaDomainRandWrapper) is None:
            raise pyrado.TypeErr(given=env_sim, expected_type=MetaDomainRandWrapper)
        if not isinstance(subrtn, Algorithm):
            raise pyrado.TypeErr(given=subrtn, expected_type=Algorithm)
        if not isinstance(ddp_space, BoxSpace):
            raise pyrado.TypeErr(given=ddp_space, expected_type=BoxSpace)
        if num_init_cand < 1:
            raise pyrado.ValueErr(given=num_init_cand, ge_constraint='1')

        # Call InterruptableAlgorithm's constructor, re-using the subroutine's policy
        super().__init__(num_checkpoints=2, init_checkpoint=-2, save_dir=save_dir, max_iter=max_iter,
                         policy=subrtn.policy, logger=logger)

        self._env_sim = env_sim
        self._env_real = env_real
        self._subrtn = subrtn
        self._subrtn.save_name = 'subrtn'
        self.ddp_space = ddp_space
        self.ddp_projector = UnitCubeProjector(to.from_numpy(self.ddp_space.bound_lo),
                                               to.from_numpy(self.ddp_space.bound_up))
        self.cands = None  # called x in the context of GPs
        self.cands_values = None  # called y in the context of GPs
        self.argmax_cand = to.Tensor()
        self.acq_fcn_type = acq_fc.upper()
        self.acq_restarts = acq_restarts
        self.acq_samples = acq_samples
        self.acq_param = acq_param
        self.num_init_cand = num_init_cand
        self.mc_estimator = mc_estimator
        self.policy_param_init = policy_param_init
        self.valuefcn_param_init = valuefcn_param_init.detach() if valuefcn_param_init is not None else None
        self.warmstart = warmstart
        self.num_eval_rollouts_real = num_eval_rollouts_real
        self.num_eval_rollouts_sim = num_eval_rollouts_sim
        self.subrtn_snapshot_mode = subrtn_snapshot_mode
        self.thold_succ = to.tensor([thold_succ])
        self.thold_succ_subrtn = to.tensor([thold_succ_subrtn])
        self.max_subrtn_rep = 3  # number of tries to exceed thold_succ_subrtn during training in simulation
        self.curr_cand_value = -pyrado.inf  # for the stopping criterion

        if self.policy_param_init is not None:
            if to.is_tensor(self.policy_param_init):
                self.policy_param_init = self.policy_param_init.detach()
            else:
                self.policy_param_init = to.tensor(self.policy_param_init)

        # Save initial environments and the domain distribution parameter space
        self.save_snapshot(meta_info=None)
        pyrado.save(self.ddp_space, 'ddp_space', 'pkl', self.save_dir)

    @property
    def subroutine(self) -> Algorithm:
        """ Get the policy optimization subroutine. """
        return self._subrtn

    @property
    def sample_count(self) -> int:
        return self._cnt_samples + self._subrtn.sample_count

    def stopping_criterion_met(self) -> bool:
        return self.curr_cand_value > self.thold_succ

    def train_policy_sim(self, cand: to.Tensor, prefix: str) -> float:
        """
        Train a policy in simulation for given hyper-parameters from the domain randomizer.

        :param cand: hyper-parameters for the domain parameter distribution (need to be compatible with the randomizer)
        :param prefix: set a prefix to the saved file name by passing it to `meta_info`
        :return: estimated return of the trained policy in the target domain
        """
        # Save the current candidate
        to.save(cand.view(-1), osp.join(self.save_dir, f'{prefix}_candidate.pt'))

        # Set the domain randomizer
        self._env_sim.adapt_randomizer(cand.detach().cpu().numpy())

        # Reset the subroutine's algorithm which includes resetting the exploration
        self._cnt_samples += self._subrtn.sample_count
        self._subrtn.reset()

        # Do a warm start if desired
        self._subrtn.init_modules(
            self.warmstart, policy_param_init=self.policy_param_init, valuefcn_param_init=self.valuefcn_param_init
        )

        # Train a policy in simulation using the subroutine
        self._subrtn.train(snapshot_mode=self.subrtn_snapshot_mode, meta_info=dict(prefix=prefix))

        # Return the estimated return of the trained policy in simulation
        avg_ret_sim = self.eval_policy(
            None, self._env_sim, self._subrtn.policy, self.mc_estimator, prefix, self.num_eval_rollouts_sim
        )
        return float(avg_ret_sim)

    def train_init_policies(self):
        """
        Initialize the algorithm with a number of random distribution parameter sets a.k.a. candidates specified by
        the user. Train a policy for every candidate. Finally, store the policies and candidates.
        """
        cands = to.empty(self.num_init_cand, self.ddp_space.shape[0])
        for i in range(self.num_init_cand):
            print_cbt(f'Generating initial domain instance and policy {i + 1} of {self.num_init_cand} ...',
                      'g', bright=True)
            # Sample random domain distribution parameters
            cands[i, :] = to.from_numpy(self.ddp_space.sample_uniform())

            # Train a policy for each candidate, repeat if the resulting policy did not exceed the success threshold
            print_cbt(f'Randomly sampled the next candidate: {cands[i].numpy()}', 'g')
            wrapped_trn_fcn = until_thold_exceeded(
                self.thold_succ_subrtn.item(), self.max_subrtn_rep
            )(self.train_policy_sim)
            wrapped_trn_fcn(cands[i], prefix=f'init_{i}')

        # Save candidates into a single tensor (policy is saved during training or exists already)
        pyrado.save(cands, 'candidates', 'pt', self.save_dir, meta_info=None)
        self.cands = cands

    def eval_init_policies(self):
        """
        Execute the trained initial policies on the target device and store the estimated return per candidate.
        The number of initial policies to evaluate is the number of found policies.
        """
        # Crawl through the experiment's directory
        for root, dirs, files in os.walk(self.save_dir):
            dirs.clear()  # prevents walk() from going into subdirectories
            found_policies = [p for p in files if p.startswith('init_') and p.endswith('_policy.pt')]
            found_cands = [c for c in files if c.startswith('init_') and c.endswith('_candidate.pt')]
        if not len(found_policies) == len(found_cands):
            raise pyrado.ValueErr(msg='Found a different number of initial policies than candidates!')
        elif len(found_policies) == 0:
            raise pyrado.ValueErr(msg='No policies or candidates found!')

        num_init_cand = len(found_cands)
        cands_values = to.empty(num_init_cand)

        # Load all found candidates to save them into a single tensor
        found_cands = natural_sort(found_cands)  # the order is important since it determines the rows of the tensor
        cands = to.stack([to.load(osp.join(self.save_dir, c)) for c in found_cands])

        # Evaluate learned policies from random candidates on the target environment (real-world) system
        for i in range(num_init_cand):
            policy = pyrado.load(self.policy, 'policy', 'pt', self.save_dir, meta_info=dict(prefix=f'init_{i}'))
            cands_values[i] = self.eval_policy(self.save_dir, self._env_real, policy, self.mc_estimator,
                                               prefix=f'init_{i}', num_rollouts=self.num_eval_rollouts_real)

        # Save the candidates and their returns into tensors (policy is saved during training or exists already)
        # pyrado.save(cands, 'candidates', 'pt', self._save_dir, meta_info)
        pyrado.save(cands_values, 'candidates_values', 'pt', self.save_dir, meta_info=None)
        self.cands, self.cands_values = cands, cands_values

    @staticmethod
    def eval_policy(save_dir: [str, None],
                    env: [RealEnv, SimEnv, MetaDomainRandWrapper],
                    policy: Policy,
                    mc_estimator: bool,
                    prefix: str,
                    num_rollouts: int,
                    num_parallel_envs: int = 1) -> to.Tensor:
        """
        Evaluate a policy on the target system (real-world platform).
        This method is static to facilitate evaluation of specific policies in hindsight.

        :param save_dir: directory to save the snapshots i.e. the results in, if `None` nothing is saved
        :param env: target environment for evaluation, in the sim-2-sim case this is another simulation instance
        :param policy: policy to evaluate
        :param mc_estimator: estimate the return with a sample average (`True`) or a lower confidence
                             bound (`False`) obtained from bootstrapping
        :param prefix: to control the saving for the evaluation of an initial policy, `None` to deactivate
        :param num_rollouts: number of rollouts to collect on the target system
        :param num_parallel_envs: number of environments for the parallel sampler (only used for SimEnv)
        :return: estimated return in the target domain
        """
        if save_dir is not None:
            print_cbt(f'Executing {prefix}_policy ...', 'c', bright=True)

        rets_real = to.zeros(num_rollouts)
        if isinstance(inner_env(env), RealEnv):
            # Evaluate sequentially when conducting a sim-to-real experiment
            for i in range(num_rollouts):
                rets_real[i] = rollout(env, policy, eval=True).undiscounted_return()
                # If a reward of -1 is given, skip evaluation ahead and set all returns to zero
                if rets_real[i] == -1:
                    print_cbt('Set all returns for this policy to zero.', color='c')
                    rets_real = to.zeros(num_rollouts)
                    break
        elif isinstance(inner_env(env), SimEnv):
            # Create a parallel sampler when conducting a sim-to-sim experiment
            sampler = ParallelRolloutSampler(env, policy, num_workers=num_parallel_envs, min_rollouts=num_rollouts)
            ros = sampler.sample()
            for i in range(num_rollouts):
                rets_real[i] = ros[i].undiscounted_return()
        else:
            raise pyrado.TypeErr(given=inner_env(env), expected_type=[RealEnv, SimEnv])

        if save_dir is not None:
            # Save the evaluation results
            to.save(rets_real, osp.join(save_dir, f'{prefix}_returns_real.pt'))

            print_cbt('Target domain performance', bright=True)
            print(tabulate([['mean return', to.mean(rets_real).item()],
                            ['std return', to.std(rets_real)],
                            ['min return', to.min(rets_real)],
                            ['max return', to.max(rets_real)]]))

        if mc_estimator:
            return to.mean(rets_real)
        else:
            return to.from_numpy(bootstrap_ci(rets_real.numpy(), np.mean,
                                              num_reps=1000, alpha=0.05, ci_sides=1, studentized=False)[1])

    def step(self, snapshot_mode: str = 'latest', meta_info: dict = None):
        # Save snapshot to save the correct iteration count
        self.save_snapshot()

        if self.curr_checkpoint == -2:
            # Train the initial policies in the source domain
            self.train_init_policies()
            self.reached_checkpoint()  # setting counter to -1

        if self.curr_checkpoint == -1:
            # Evaluate the initial policies in the target domain
            self.eval_init_policies()
            self.reached_checkpoint()  # setting counter to 0

        if self.curr_checkpoint == 0:
            # Normalize the input data and standardize the output data
            cands_norm = self.ddp_projector.project_to(self.cands)
            cands_values_stdized = standardize(self.cands_values).unsqueeze(1)

            # Create and fit the GP model
            gp = SingleTaskGP(cands_norm, cands_values_stdized)
            gp.likelihood.noise_covar.register_constraint('raw_noise', GreaterThan(1e-5))
            mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
            fit_gpytorch_model(mll)
            print_cbt('Fitted the GP.', 'g')

            # Acquisition functions
            if self.acq_fcn_type == 'UCB':
                acq_fcn = UpperConfidenceBound(gp, beta=self.acq_param.get('beta', 0.1), maximize=True)
            elif self.acq_fcn_type == 'EI':
                acq_fcn = ExpectedImprovement(gp, best_f=cands_values_stdized.max().item(), maximize=True)
            elif self.acq_fcn_type == 'PI':
                acq_fcn = ProbabilityOfImprovement(gp, best_f=cands_values_stdized.max().item(), maximize=True)
            else:
                raise pyrado.ValueErr(given=self.acq_fcn_type, eq_constraint="'UCB', 'EI', 'PI'")

            # Optimize acquisition function and get new candidate point
            cand_norm, acq_value = optimize_acqf(
                acq_function=acq_fcn,
                bounds=to.stack([to.zeros(self.ddp_space.flat_dim), to.ones(self.ddp_space.flat_dim)]),
                q=1,
                num_restarts=self.acq_restarts,
                raw_samples=self.acq_samples
            )
            next_cand = self.ddp_projector.project_back(cand_norm)
            print_cbt(f'Found the next candidate: {next_cand.numpy()}', 'g')
            self.cands = to.cat([self.cands, next_cand], dim=0)
            pyrado.save(self.cands, 'candidates', 'pt', self.save_dir, meta_info)
            self.reached_checkpoint()  # setting counter to 1

        if self.curr_checkpoint == 1:
            # Train and evaluate a new policy, repeat if the resulting policy did not exceed the success threshold
            wrapped_trn_fcn = until_thold_exceeded(
                self.thold_succ_subrtn.item(), self.max_subrtn_rep
            )(self.train_policy_sim)
            wrapped_trn_fcn(self.cands[-1, :], prefix=f'iter_{self._curr_iter}')
            self.reached_checkpoint()  # setting counter to 2

        if self.curr_checkpoint == 2:
            # Evaluate the current policy in the target domain
            policy = pyrado.load(self.policy, 'policy', 'pt', self.save_dir,
                                 meta_info=dict(prefix=f'iter_{self._curr_iter}'))
            self.curr_cand_value = self.eval_policy(
                self.save_dir, self._env_real, policy, self.mc_estimator, f'iter_{self._curr_iter}',
                self.num_eval_rollouts_real
            )
            self.cands_values = to.cat([self.cands_values, self.curr_cand_value.view(1)], dim=0)
            pyrado.save(self.cands_values, 'candidates_values', 'pt', self.save_dir, meta_info)

            # Store the argmax after training and evaluating
            curr_argmax_cand = BayRn.argmax_posterior_mean(
                self.cands, self.cands_values.unsqueeze(1), self.ddp_space, self.acq_restarts, self.acq_samples
            )
            self.argmax_cand = to.cat([self.argmax_cand, curr_argmax_cand], dim=0)
            pyrado.save(self.argmax_cand, 'candidates_argmax', 'pt', self.save_dir, meta_info)
            self.reached_checkpoint()  # setting counter to 0

    def save_snapshot(self, meta_info: dict = None):
        super().save_snapshot(meta_info)

        # Policies of every iteration are saved by the subroutine in train_policy_sim()
        if meta_info is None:
            # This algorithm instance is not a subroutine of another algorithm
            joblib.dump(self._env_sim, osp.join(self.save_dir, 'env_sim.pkl'))
            joblib.dump(self._env_real, osp.join(self.save_dir, 'env_real.pkl'))
            pyrado.save(self.policy, 'policy', 'pt', self.save_dir, None)
        else:
            raise pyrado.ValueErr(msg=f'{self.name} is not supposed be run as a subroutine!')

    @staticmethod
    def argmax_posterior_mean(cands: to.Tensor,
                              cands_values: to.Tensor,
                              ddp_space: BoxSpace,
                              num_restarts: int,
                              num_samples: int) -> to.Tensor:
        """
        Compute the GP input with the maximal posterior mean.

        :param cands: candidates a.k.a. x
        :param cands_values: observed values a.k.a. y
        :param ddp_space: space of the domain distribution parameters, indicates the lower and upper bound
        :param num_restarts: number of restarts for the optimization of the acquisition function
        :param num_samples: number of samples for the optimization of the acquisition function
        :return: un-normalized candidate with maximum posterior value a.k.a. x
        """
        if not isinstance(cands, to.Tensor):
            raise pyrado.TypeErr(given=cands, expected_type=to.Tensor)
        if not isinstance(cands_values, to.Tensor):
            raise pyrado.TypeErr(given=cands_values, expected_type=to.Tensor)
        if not isinstance(ddp_space, BoxSpace):
            raise pyrado.TypeErr(given=ddp_space, expected_type=BoxSpace)

        # Normalize the input data and standardize the output data
        uc_projector = UnitCubeProjector(to.from_numpy(ddp_space.bound_lo), to.from_numpy(ddp_space.bound_up))
        cands_norm = uc_projector.project_to(cands)
        cands_values_stdized = standardize(cands_values)

        if cands_norm.shape[0] > cands_values.shape[0]:
            print_cbt(f'There are {cands.shape[0]} candidates but only {cands_values.shape[0]} evaluations. Ignoring '
                      f'the candidates without evaluation for computing the argmax.', 'y')
            cands_norm = cands_norm[:cands_values.shape[0], :]

        # Create and fit the GP model
        gp = SingleTaskGP(cands_norm, cands_values_stdized)
        gp.likelihood.noise_covar.register_constraint('raw_noise', GreaterThan(1e-5))
        mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
        fit_gpytorch_model(mll)

        # Find position with maximal posterior mean
        cand_norm, acq_value = optimize_acqf(
            acq_function=PosteriorMean(gp),
            bounds=to.stack([to.zeros(ddp_space.flat_dim), to.ones(ddp_space.flat_dim)]),
            q=1,
            num_restarts=num_restarts,
            raw_samples=num_samples
        )

        cand = uc_projector.project_back(cand_norm.detach())
        print_cbt(f'Converged to argmax of the posterior mean: {cand.numpy()}', 'g', bright=True)
        return cand

    @staticmethod
    def train_argmax_policy(load_dir: str,
                            env_sim: MetaDomainRandWrapper,
                            subrtn: Algorithm,
                            num_restarts: int,
                            num_samples: int,
                            policy_param_init: to.Tensor = None,
                            valuefcn_param_init: to.Tensor = None,
                            subrtn_snapshot_mode: str = 'best') -> Policy:
        """
        Train a policy based on the maximizer of the posterior mean.

        :param load_dir: directory to load from
        :param env_sim: simulation environment
        :param subrtn: algorithm which performs the policy / value-function optimization
        :param num_restarts: number of restarts for the optimization of the acquisition function
        :param num_samples: number of samples for the optimization of the acquisition function
        :param policy_param_init: initial policy parameter values for the subroutine, set `None` to be random
        :param valuefcn_param_init: initial value function parameter values for the subroutine, set `None` to be random
        :param subrtn_snapshot_mode: snapshot mode for saving during training of the subroutine
        :return: the final BayRn policy
        """
        # Load the required data
        cands = pyrado.load(None, 'candidates', 'pt', load_dir)
        cands_values = pyrado.load(None, 'candidates_values', 'pt', load_dir).unsqueeze(1)
        ddp_space = pyrado.load(None, 'ddp_space', 'pkl', load_dir)

        if cands.shape[0] > cands_values.shape[0]:
            print_cbt(
                f'There are {cands.shape[0]} candidates but only {cands_values.shape[0]} evaluations. Ignoring the '
                f'candidates without evaluation for computing the argmax.', 'y')
            cands = cands[:cands_values.shape[0], :]

        # Find the maximizer
        argmax_cand = BayRn.argmax_posterior_mean(cands, cands_values, ddp_space, num_restarts, num_samples)

        # Set the domain randomizer
        env_sim.adapt_randomizer(argmax_cand.numpy())

        # Reset the subroutine's algorithm which includes resetting the exploration
        subrtn.reset()

        # Do a warm start
        subrtn.init_modules(
            warmstart=True, policy_param_init=policy_param_init, valuefcn_param_init=valuefcn_param_init
        )

        subrtn.train(snapshot_mode=subrtn_snapshot_mode, meta_info=dict(suffix='argmax'))
        return subrtn.policy
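The checkpoint-0 branch of `step()` boils down to a BoTorch acquisition-function switch followed by `optimize_acqf`. A self-contained sketch with toy standardized data in the unit cube, restricted to library calls that appear in the class above ('UCB' shown; 'EI' and 'PI' work analogously):

    import torch
    from botorch.models import SingleTaskGP
    from botorch.fit import fit_gpytorch_model
    from botorch.acquisition import ExpectedImprovement, ProbabilityOfImprovement, UpperConfidenceBound
    from botorch.optim import optimize_acqf
    from gpytorch.mlls import ExactMarginalLogLikelihood

    # Toy data: 10 normalized candidates with standardized values
    train_x = torch.rand(10, 3, dtype=torch.double)
    train_y = train_x.norm(dim=1, keepdim=True)

    gp = SingleTaskGP(train_x, train_y)
    fit_gpytorch_model(ExactMarginalLogLikelihood(gp.likelihood, gp))

    acq_fcn_type = 'UCB'
    if acq_fcn_type == 'UCB':
        acq_fcn = UpperConfidenceBound(gp, beta=0.1, maximize=True)
    elif acq_fcn_type == 'EI':
        acq_fcn = ExpectedImprovement(gp, best_f=train_y.max().item(), maximize=True)
    else:
        acq_fcn = ProbabilityOfImprovement(gp, best_f=train_y.max().item(), maximize=True)

    # Optimize the acquisition function over the unit cube to get the next candidate
    cand_norm, acq_value = optimize_acqf(
        acq_function=acq_fcn,
        bounds=torch.stack([torch.zeros(3, dtype=torch.double), torch.ones(3, dtype=torch.double)]),
        q=1,
        num_restarts=5,
        raw_samples=50,
    )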
Example #6
    def __init__(self,
                 save_dir: str,
                 env_sim: MetaDomainRandWrapper,
                 env_real: [RealEnv, EnvWrapper],
                 subroutine: Algorithm,
                 bounds: to.Tensor,
                 max_iter: int,
                 acq_fc: str,
                 acq_restarts: int,
                 acq_samples: int,
                 acq_param: dict = None,
                 montecarlo_estimator: bool = True,
                 num_eval_rollouts_real: int = 5,
                 num_eval_rollouts_sim: int = 50,
                 num_init_cand: int = 5,
                 thold_succ: float = pyrado.inf,
                 thold_succ_subroutine: float = -pyrado.inf,
                 warmstart: bool = True,
                 policy_param_init: to.Tensor = None,
                 valuefcn_param_init: to.Tensor = None):
        """
        Constructor

        .. note::
            If you want to continue an experiment, use the `load_dir` argument for the `train` call. If you want to
            initialize each of the policies with pre-trained policy parameters, use `policy_param_init`.

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env_sim: randomized simulation environment a.k.a. source domain
        :param env_real: real-world environment a.k.a. target domain
        :param subroutine: algorithm which performs the policy / value-function optimization
        :param bounds: boundaries for inputs of randomization function, format: [lower, upper]
        :param max_iter: maximum number of iterations
        :param acq_fc: Acquisition Function
                       'UCB': Upper Confidence Bound (default $\beta = 0.1$)
                       'EI': Expected Improvement
                       'PI': Probability of Improvement
        :param acq_restarts: number of restarts for optimizing the acquisition function
        :param acq_samples: number of initial samples for optimizing the acquisition function
        :param acq_param: hyper-parameter for the acquisition function, e.g. $\beta$ for UCB
        :param montecarlo_estimator: estimate the return with a sample average (`True`) or a lower confidence
                                     bound (`False`) obtained from bootstrapping
        :param num_eval_rollouts_real: number of rollouts in the target domain to estimate the return
        :param num_eval_rollouts_sim: number of rollouts in simulation to estimate the return after training
        :param num_init_cand: number of initial policies to train, ignored if `init_dir` is provided
        :param thold_succ: success threshold on the real system's return for BayRn, stop the algorithm if exceeded
        :param thold_succ_subroutine: success threshold on the simulated system's return for the subroutine, repeat the
                                      subroutine until the threshold is exceeded or for a given number of iterations
        :param warmstart: initialize the policy parameters with the ones of the previous iteration. This option has no
                          effect for the initial policies and can be overruled by passing initial policy params explicitly.
        :param policy_param_init: initial policy parameter values for the subroutine, set `None` to be random
        :param valuefcn_param_init: initial value function parameter values for the subroutine, set `None` to be random
        """
        assert isinstance(env_sim, MetaDomainRandWrapper)
        assert isinstance(subroutine, Algorithm)
        assert bounds.shape[0] == 2
        assert all(bounds[1] > bounds[0])

        # Call Algorithm's constructor, re-using the subroutine's policy
        super().__init__(save_dir, max_iter, subroutine.policy, logger=None)

        # Store the inputs and initialize
        self._env_sim = env_sim
        self._env_real = env_real
        self._subroutine = subroutine
        self.bounds = bounds
        self.cand_dim = bounds.shape[1]
        self.cands = None  # called x in the context of GPs
        self.cands_values = None  # called y in the context of GPs
        self.argmax_cand = to.Tensor()
        self.montecarlo_estimator = montecarlo_estimator
        self.acq_fcn_type = acq_fc.upper()
        self.acq_restarts = acq_restarts
        self.acq_samples = acq_samples
        self.acq_param = acq_param
        self.policy_param_init = policy_param_init.detach() if policy_param_init is not None else None
        self.valuefcn_param_init = valuefcn_param_init.detach() if valuefcn_param_init is not None else None
        self.warmstart = warmstart
        self.num_eval_rollouts_real = num_eval_rollouts_real
        self.num_eval_rollouts_sim = num_eval_rollouts_sim
        self.thold_succ = to.tensor([thold_succ])
        self.thold_succ_subroutine = to.tensor([thold_succ_subroutine])
        self.max_subroutine_rep = 3  # number of tries to exceed thold_succ_subroutine during training in simulation
        self.curr_cand_value = -pyrado.inf  # for the stopping criterion
        self.uc_normalizer = UnitCubeProjector(bounds[0, :], bounds[1, :])

        # Set the flag to run the initialization phase. This is overruled if load_snapshot is called.
        self.initialized = False
        if num_init_cand > 0:
            self.num_init_cand = num_init_cand
        else:
            raise pyrado.ValueErr(given=num_init_cand, g_constraint='0')

        # Save initial environments
        self.save_snapshot()
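This older constructor takes a raw bounds tensor instead of a `BoxSpace`. A minimal sketch of the expected format, chosen to satisfy the asserts above (the values are made up):

    import torch as to

    bounds = to.tensor([[0.1, 0.01],    # row 0: lower bounds of the domain distribution parameters
                        [2.0, 0.50]])   # row 1: upper bounds
    assert bounds.shape[0] == 2
    assert all(bounds[1] > bounds[0])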
Example #7
class BayRn(Algorithm, ABC):
    """
    Bayesian Domain Randomization (BayRn)

    .. note::
        A candidate is a set of parameter values for the domain parameter distribution

    .. seealso::
        F. Muratore, C. Eilers, M. Gienger, J. Peters, "Bayesian Domain Randomization for Sim-to-Real Transfer",
        arXiv, 2020
    """

    name: str = 'bayrn'
    iteration_key: str = 'bayrn_iteration'  # logger's iteration key

    def __init__(self,
                 save_dir: str,
                 env_sim: MetaDomainRandWrapper,
                 env_real: [RealEnv, EnvWrapper],
                 subroutine: Algorithm,
                 bounds: to.Tensor,
                 max_iter: int,
                 acq_fc: str,
                 acq_restarts: int,
                 acq_samples: int,
                 acq_param: dict = None,
                 montecarlo_estimator: bool = True,
                 num_eval_rollouts_real: int = 5,
                 num_eval_rollouts_sim: int = 50,
                 num_init_cand: int = 5,
                 thold_succ: float = pyrado.inf,
                 thold_succ_subroutine: float = -pyrado.inf,
                 warmstart: bool = True,
                 policy_param_init: to.Tensor = None,
                 valuefcn_param_init: to.Tensor = None):
        """
        Constructor

        .. note::
            If you want to continue an experiment, use the `load_dir` argument for the `train` call. If you want to
            initialize each of the policies with pre-trained policy parameters, use `policy_param_init`.

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env_sim: randomized simulation environment a.k.a. source domain
        :param env_real: real-world environment a.k.a. target domain
        :param subroutine: algorithm which performs the policy / value-function optimization
        :param bounds: boundaries for inputs of randomization function, format: [lower, upper]
        :param max_iter: maximum number of iterations
        :param acq_fc: Acquisition Function
                       'UCB': Upper Confidence Bound (default $\beta = 0.1$)
                       'EI': Expected Improvement
                       'PI': Probability of Improvement
        :param acq_restarts: number of restarts for optimizing the acquisition function
        :param acq_samples: number of initial samples for optimizing the acquisition function
        :param acq_param: hyper-parameter for the acquisition function, e.g. $\beta$ for UCB
        :param montecarlo_estimator: estimate the return with a sample average (`True`) or a lower confidence
                                     bound (`False`) obtained from bootstrapping
        :param num_eval_rollouts_real: number of rollouts in the target domain to estimate the return
        :param num_eval_rollouts_sim: number of rollouts in simulation to estimate the return after training
        :param num_init_cand: number of initial policies to train, ignored if `init_dir` is provided
        :param thold_succ: success threshold on the real system's return for BayRn, stop the algorithm if exceeded
        :param thold_succ_subroutine: success threshold on the simulated system's return for the subroutine, repeat the
                                      subroutine until the threshold is exceeded or for a given number of iterations
        :param warmstart: initialize the policy parameters with the ones of the previous iteration. This option has no
                          effect for the initial policies and can be overruled by passing initial policy params explicitly.
        :param policy_param_init: initial policy parameter values for the subroutine, set `None` to be random
        :param valuefcn_param_init: initial value function parameter values for the subroutine, set `None` to be random
        """
        assert isinstance(env_sim, MetaDomainRandWrapper)
        assert isinstance(subroutine, Algorithm)
        assert bounds.shape[0] == 2
        assert all(bounds[1] > bounds[0])

        # Call Algorithm's constructor, re-using the subroutine's policy
        super().__init__(save_dir, max_iter, subroutine.policy, logger=None)

        # Store the inputs and initialize
        self._env_sim = env_sim
        self._env_real = env_real
        self._subroutine = subroutine
        self.bounds = bounds
        self.cand_dim = bounds.shape[1]
        self.cands = None  # called x in the context of GPs
        self.cands_values = None  # called y in the context of GPs
        self.argmax_cand = to.Tensor()
        self.montecarlo_estimator = montecarlo_estimator
        self.acq_fcn_type = acq_fc.upper()
        self.acq_restarts = acq_restarts
        self.acq_samples = acq_samples
        self.acq_param = acq_param
        self.policy_param_init = policy_param_init.detach() if policy_param_init is not None else None
        self.valuefcn_param_init = valuefcn_param_init.detach() if valuefcn_param_init is not None else None
        self.warmstart = warmstart
        self.num_eval_rollouts_real = num_eval_rollouts_real
        self.num_eval_rollouts_sim = num_eval_rollouts_sim
        self.thold_succ = to.tensor([thold_succ])
        self.thold_succ_subroutine = to.tensor([thold_succ_subroutine])
        self.max_subroutine_rep = 3  # number of tries to exceed thold_succ_subroutine during training in simulation
        self.curr_cand_value = -pyrado.inf  # for the stopping criterion
        self.uc_normalizer = UnitCubeProjector(bounds[0, :], bounds[1, :])

        # Set the flag to run the initialization phase. This is overruled if load_snapshot is called.
        self.initialized = False
        if num_init_cand > 0:
            self.num_init_cand = num_init_cand
        else:
            raise pyrado.ValueErr(given=num_init_cand, g_constraint='0')

        # Save initial environments
        self.save_snapshot()

    def stopping_criterion_met(self) -> bool:
        return self.curr_cand_value > self.thold_succ

    def train_policy_sim(self, cand: to.Tensor, prefix: str) -> float:
        """
        Train a policy in simulation for given hyper-parameters from the domain randomizer.

        :param cand: hyper-parameters for the domain parameter distribution coming from the domain randomizer
        :param prefix: set a prefix to the saved file name by passing it to `meta_info`
        :return: estimated return of the trained policy in the target domain
        """
        # Save the individual candidate
        to.save(cand.view(-1),
                osp.join(self._save_dir, f'{prefix}_candidate.pt'))

        # Set the domain randomizer given the hyper-parameters
        self._env_sim.adapt_randomizer(cand.numpy())

        # Reset the subroutine's algorithm which includes resetting the exploration
        self._subroutine.reset()

        if not self.warmstart or self._curr_iter == 0:
            # Reset the subroutine's policy (and value function)
            self._subroutine.policy.init_param(self.policy_param_init)
            if isinstance(self._subroutine, ActorCritic):
                self._subroutine.critic.value_fcn.init_param(
                    self.valuefcn_param_init)
            if self.policy_param_init is None:
                print_cbt('Learning the new solution from scratch', 'y')
            else:
                print_cbt('Learning the new solution given an initialization',
                          'y')

        elif self.warmstart and self._curr_iter > 0:
            # Continue from the previous policy (and value function)
            self._subroutine.policy.load_state_dict(
                to.load(osp.join(self._save_dir, f'iter_{self._curr_iter - 1}_policy.pt')).state_dict())
            if isinstance(self._subroutine, ActorCritic):
                self._subroutine.critic.value_fcn.load_state_dict(
                    to.load(osp.join(self._save_dir, f'iter_{self._curr_iter - 1}_valuefcn.pt')).state_dict())
            print_cbt(
                f'Initialized the new solution with the results from iteration {self._curr_iter - 1}',
                'y')

        # Train a policy in simulation using the subroutine
        self._subroutine.train(snapshot_mode='best',
                               meta_info=dict(prefix=prefix))

        # Return the estimated return of the trained policy in simulation
        avg_ret_sim = self.eval_policy(None, self._env_sim,
                                       self._subroutine.policy,
                                       self.montecarlo_estimator, prefix,
                                       self.num_eval_rollouts_sim)
        return float(avg_ret_sim)

    def train_init_policies(self):
        """
        Initialize the algorithm with a number of random distribution parameter sets a.k.a. candidates specified by
        the user. Train a policy for every candidate. Finally, store the policies and candidates.
        """
        cands = to.empty(self.num_init_cand, self.cand_dim)
        for i in range(self.num_init_cand):
            print_cbt(
                f'Generating initial domain instance and policy {i + 1} of {self.num_init_cand} ...',
                'g',
                bright=True)
            # Generate random samples within bounds
            cands[i, :] = (self.bounds[1, :] - self.bounds[0, :]) * to.rand(
                self.bounds.shape[1]) + self.bounds[0, :]
            # Train a policy for each candidate, repeat if the resulting policy did not exceed the success thold
            print_cbt(
                f'Randomly sampled the next candidate: {cands[i].numpy()}',
                'g')
            wrapped_trn_fcn = until_thold_exceeded(
                self.thold_succ_subroutine.item(),
                max_iter=self.max_subroutine_rep)(self.train_policy_sim)
            wrapped_trn_fcn(cands[i], prefix=f'init_{i}')

        # Save candidates into a single tensor (policy is saved during training or exists already)
        to.save(cands, osp.join(self._save_dir, 'candidates.pt'))
        self.cands = cands

    def eval_init_policies(self):
        """
        Execute the trained initial policies on the target device and store the estimated return per candidate.
        The number of initial policies to evaluate is the number of found policies.
        """
        # Crawl through the experiment's directory
        for root, dirs, files in os.walk(self._save_dir):
            found_policies = [
                p for p in files
                if p.startswith('init_') and p.endswith('_policy.pt')
            ]
            found_cands = [
                c for c in files
                if c.startswith('init_') and c.endswith('_candidate.pt')
            ]
        if not len(found_policies) == len(found_cands):
            raise pyrado.ValueErr(msg='Found a different number of initial policies than candidates!')
        elif len(found_policies) == 0:
            raise pyrado.ValueErr(msg='No policies or candidates found!')

        num_init_cand = len(found_cands)
        cands_values = to.empty(num_init_cand)

        # Load all found candidates to save them into a single tensor
        found_cands.sort()  # the order is important since it determines the rows of the tensor
        cands = to.stack([to.load(osp.join(self._save_dir, c)) for c in found_cands])

        # Evaluate learned policies from random candidates on the target environment (real-world) system
        for i in range(num_init_cand):
            policy = to.load(osp.join(self._save_dir, f'init_{i}_policy.pt'))
            cands_values[i] = self.eval_policy(
                self._save_dir,
                self._env_real,
                policy,
                self.montecarlo_estimator,
                prefix=f'init_{i}',
                num_rollouts=self.num_eval_rollouts_real)

        # Save the candidates and their returns into tensors (policy is saved during training or exists already)
        to.save(cands, osp.join(self._save_dir, 'candidates.pt'))
        to.save(cands_values, osp.join(self._save_dir, 'candidates_values.pt'))
        self.cands, self.cands_values = cands, cands_values

        if isinstance(self._env_real, RealEnv):
            input('Evaluated in the target domain. Hit any key to continue.')

    @staticmethod
    def eval_policy(save_dir: [str, None],
                    env_real: [RealEnv, SimEnv, MetaDomainRandWrapper],
                    policy: Policy, montecarlo_estimator: bool, prefix: str,
                    num_rollouts: int) -> to.Tensor:
        """
        Evaluate a policy on the target system (real-world platform).
        This method is static to facilitate evaluation of specific policies in hindsight.

        :param save_dir: directory to save the snapshots i.e. the results in, if `None` nothing is saved
        :param env_real: target environment for evaluation, in the sim-2-sim case this is another simulation instance
        :param policy: policy to evaluate
        :param montecarlo_estimator: estimate the return with a sample average (`True`) or a lower confidence
                                     bound (`False`) obtained from bootstrapping
        :param prefix: to control the saving for the evaluation of an initial policy, `None` to deactivate
        :param num_rollouts: number of rollouts to collect on the target system
        :return: estimated return in the target domain
        """
        if isinstance(env_real, RealEnv):
            input('Evaluating in the target domain. Hit any key to continue.')
        if save_dir is not None:
            print_cbt(f'Evaluating {prefix}_policy on the target system ...',
                      'c',
                      bright=True)

        rets_real = to.zeros(num_rollouts)
        if isinstance(env_real, RealEnv):
            # Evaluate sequentially when conducting a sim-to-real experiment
            for i in range(num_rollouts):
                rets_real[i] = rollout(env_real,
                                       policy,
                                       eval=True,
                                       no_close=False).undiscounted_return()
        elif isinstance(env_real, (SimEnv, MetaDomainRandWrapper)):
            # Create a parallel sampler when conducting a sim-to-sim experiment
            sampler = ParallelSampler(env_real,
                                      policy,
                                      num_envs=1,
                                      min_rollouts=num_rollouts)
            ros = sampler.sample()
            for i in range(num_rollouts):
                rets_real[i] = ros[i].undiscounted_return()
        else:
            raise pyrado.TypeErr(
                given=env_real,
                expected_type=[RealEnv, SimEnv, MetaDomainRandWrapper])

        if save_dir is not None:
            # Save the evaluation results
            to.save(rets_real, osp.join(save_dir, f'{prefix}_returns_real.pt'))

            print_cbt('target domain performance', bright=True)
            print(
                tabulate([['mean return', to.mean(rets_real).item()],
                          ['std return', to.std(rets_real).item()],
                          ['min return', to.min(rets_real).item()],
                          ['max return', to.max(rets_real).item()]]))

        if montecarlo_estimator:
            return to.mean(rets_real)
        else:
            return to.from_numpy(
                bootstrap_ci(rets_real.numpy(),
                             np.mean,
                             num_reps=1000,
                             alpha=0.05,
                             ci_sides=1,
                             studentized=False)[1])
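
# A minimal, self-contained numpy sketch of the one-sided bootstrap lower confidence bound that
# eval_policy() above falls back to when montecarlo_estimator=False (1000 resamples, alpha=0.05).
# It is an illustrative percentile bootstrap, not pyrado's bootstrap_ci implementation.
import numpy as np

def bootstrap_lower_bound(returns: np.ndarray, num_reps: int = 1000, alpha: float = 0.05) -> float:
    rng = np.random.default_rng(0)
    # Resample the returns with replacement and compute the mean of every bootstrap sample
    idcs = rng.integers(0, len(returns), size=(num_reps, len(returns)))
    boot_means = returns[idcs].mean(axis=1)
    # One-sided lower bound: the alpha-quantile of the bootstrap distribution of the mean
    return float(np.quantile(boot_means, alpha))

# Example: bootstrap_lower_bound(np.array([95.0, 102.0, 88.0, 110.0, 97.0])) yields a conservative return estimate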

    def step(self, snapshot_mode: str, meta_info: dict = None):
        if not self.initialized:
            # Start initialization phase
            self.train_init_policies()
            self.eval_init_policies()
            self.initialized = True

        # Normalize the input data and standardize the output data
        cands_norm = self.uc_normalizer.project_to(self.cands)
        cands_values_stdized = standardize(self.cands_values).unsqueeze(1)

        # Create and fit the GP model
        gp = SingleTaskGP(cands_norm, cands_values_stdized)
        gp.likelihood.noise_covar.register_constraint('raw_noise',
                                                      GreaterThan(1e-5))
        mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
        fit_gpytorch_model(mll)
        print_cbt('Fitted the GP.', 'g')

        # Acquisition functions
        if self.acq_fcn_type == 'UCB':
            acq_fcn = UpperConfidenceBound(gp,
                                           beta=self.acq_param.get(
                                               'beta', 0.1),
                                           maximize=True)
        elif self.acq_fcn_type == 'EI':
            acq_fcn = ExpectedImprovement(
                gp, best_f=cands_values_stdized.max().item(), maximize=True)
        elif self.acq_fcn_type == 'PI':
            acq_fcn = ProbabilityOfImprovement(
                gp, best_f=cands_values_stdized.max().item(), maximize=True)
        else:
            raise pyrado.ValueErr(given=self.acq_fcn_type,
                                  eq_constraint="'UCB', 'EI', 'PI'")

        # Optimize acquisition function and get new candidate point
        cand, acq_value = optimize_acqf(
            acq_function=acq_fcn,
            bounds=to.stack([to.zeros(self.cand_dim),
                             to.ones(self.cand_dim)]),
            q=1,
            num_restarts=self.acq_restarts,
            raw_samples=self.acq_samples)
        next_cand = self.uc_normalizer.project_back(cand)
        print_cbt(f'Found the next candidate: {next_cand.numpy()}', 'g')
        self.cands = to.cat([self.cands, next_cand], dim=0)
        to.save(self.cands, osp.join(self._save_dir, 'candidates.pt'))

        # Train and evaluate the new candidate (saves to iter_{self._curr_iter}_policy.pt)
        prefix = f'iter_{self._curr_iter}'
        wrapped_trn_fcn = until_thold_exceeded(
            self.thold_succ_subroutine.item(),
            max_iter=self.max_subroutine_rep)(self.train_policy_sim)
        wrapped_trn_fcn(cand, prefix)

        # Evaluate the current policy on the target domain
        policy = to.load(osp.join(self._save_dir, f'{prefix}_policy.pt'))
        self.curr_cand_value = self.eval_policy(self._save_dir, self._env_real,
                                                policy,
                                                self.montecarlo_estimator,
                                                prefix,
                                                self.num_eval_rollouts_real)

        self.cands_values = to.cat(
            [self.cands_values,
             self.curr_cand_value.view(1)], dim=0)
        to.save(self.cands_values,
                osp.join(self._save_dir, 'candidates_values.pt'))

        # Store the argmax after training and evaluating
        curr_argmax_cand = BayRn.argmax_posterior_mean(
            self.cands, self.cands_values.unsqueeze(1), self.uc_normalizer,
            self.acq_restarts, self.acq_samples)
        self.argmax_cand = to.cat([self.argmax_cand, curr_argmax_cand], dim=0)
        to.save(self.argmax_cand,
                osp.join(self._save_dir, 'candidates_argmax.pt'))

        self.make_snapshot(snapshot_mode, float(to.mean(self.cands_values)),
                           meta_info)

    def save_snapshot(self, meta_info: dict = None):
        # Policies (and value functions) are saved by the subroutine in train_policy_sim()
        if meta_info is None:
            # This instance is not a subroutine of a meta-algorithm
            joblib.dump(self._env_sim, osp.join(self._save_dir, 'env_sim.pkl'))
            joblib.dump(self._env_real, osp.join(self._save_dir,
                                                 'env_real.pkl'))
            to.save(self.bounds, osp.join(self._save_dir, 'bounds.pt'))
            to.save(self._subroutine.policy,
                    osp.join(self._save_dir, 'policy.pt'))
            if isinstance(self._subroutine, ActorCritic):
                to.save(self._subroutine.critic.value_fcn,
                        osp.join(self._save_dir, 'valuefcn.pt'))
        else:
            raise pyrado.ValueErr(
                msg=f'{self.name} is not supposed to be run as a subroutine!')

    def load_snapshot(self, load_dir: str = None, meta_info: dict = None):
        # Get the directory to load from
        ld = load_dir if load_dir is not None else self._save_dir
        if not osp.isdir(ld):
            raise pyrado.ValueErr(msg='Given path is not a directory!')

        if meta_info is None:
            # This algorithm instance is not a subroutine of a meta-algorithm
            self._env_sim = joblib.load(osp.join(ld, 'env_sim.pkl'))
            self._env_real = joblib.load(osp.join(ld, 'env_real.pkl'))

            # Crawl through the given directory and check how many policies and candidates there are
            found_policies, found_cands = None, None
            for root, dirs, files in os.walk(ld):
                found_policies = [
                    p for p in files if p.endswith('_policy.pt')
                ]  # 'policy.pt' file should not be found
                found_cands = [c for c in files if c.endswith('_candidate.pt')]

            # Copy to the current experiment's directory. Not necessary if we are continuing in that directory.
            if ld != self._save_dir:
                for p in found_policies:
                    copyfile(osp.join(ld, p), osp.join(self._save_dir, p))
                for c in found_cands:
                    copyfile(osp.join(ld, c), osp.join(self._save_dir, c))

            if len(found_policies) > 0:
                # Load all found candidates to save them into a single tensor
                found_cands.sort(
                )  # the order is important since it determines the rows of the tensor
                self.cands = to.stack(
                    [to.load(osp.join(ld, c)) for c in found_cands])
                to.save(self.cands, osp.join(self._save_dir, 'candidates.pt'))

                # Catch the case that the algorithm stopped before evaluating a sampled candidate
                if not len(found_policies) == len(found_cands):
                    print_cbt(
                        f'Found {len(found_policies)} policies, but {len(found_cands)} candidates!',
                        'r')
                    n = len(found_cands) - len(found_policies)
                    delete = input(
                        'Delete the superfluous candidates? [y / any other]'
                    ).lower() == 'y'
                    if n > 0 and delete:
                        # Delete the superfluous candidates
                        print_cbt(f'Candidates before:\n{self.cands.numpy()}',
                                  'w')
                        self.cands = self.cands[:-n, :]
                        found_cands = found_cands[:-n]
                        to.save(self.cands,
                                osp.join(self._save_dir, 'candidates.pt'))
                        print_cbt(f'Candidates after:\n{self.cands.numpy()}',
                                  'c')
                    else:
                        raise pyrado.ShapeErr(
                            msg=f'Found {len(found_policies)} policies, '
                            f'but {len(found_cands)} candidates!')

            else:
                # Assuming that not even the training of the initial policies has been finished. Redo it all.
                print_cbt(
                    'No policies have been found. Basically starting from scratch.',
                    'c')
                self.train_init_policies()
                self.eval_init_policies()
                self.initialized = True

            try:
                # Crawl through the load_dir and copy all done evaluations.
                # Not necessary if we are continuing in that directory.
                if ld != self._save_dir:
                    for root, dirs, files in os.walk(load_dir):
                        [
                            copyfile(osp.join(load_dir, c),
                                     osp.join(self._save_dir, c))
                            for c in files if c.endswith('_returns_real.pt')
                        ]

                # Get all previously done evaluations. If we don't find any, the exception is caught.
                found_evals = None
                for root, dirs, files in os.walk(ld):
                    found_evals = [
                        v for v in files if v.endswith('_returns_real.pt')
                    ]
                found_evals.sort(
                )  # the order is important since it determines the rows of the tensor

                # Reconstruct candidates_values.pt
                self.cands_values = to.empty(self.cands.shape[0])
                for i, fe in enumerate(found_evals):
                    # Get the return estimate from the raw evaluations as in eval_policy()
                    if self.montecarlo_estimator:
                        self.cands_values[i] = to.mean(
                            to.load(osp.join(ld, fe)))
                    else:
                        self.cands_values[i] = to.from_numpy(
                            bootstrap_ci(to.load(osp.join(ld, fe)).numpy(),
                                         np.mean,
                                         num_reps=1000,
                                         alpha=0.05,
                                         ci_sides=1,
                                         studentized=False)[1])

                if len(found_evals) < len(found_cands):
                    print_cbt(
                        f'Found {len(found_evals)} real-world evaluation files but {len(found_cands)} candidates.'
                        f' Now evaluating the remaining ones.',
                        'c',
                        bright=True)
                for i in range(len(found_cands) - len(found_evals)):
                    # Evaluate the current policy on the target domain
                    if len(found_evals) < self.num_init_cand:
                        prefix = f'init_{i + len(found_evals)}'
                    else:
                        prefix = f'iter_{i + len(found_evals) - self.num_init_cand}'
                    policy = to.load(
                        osp.join(self._save_dir, f'{prefix}_policy.pt'))
                    self.cands_values[i + len(found_evals)] = self.eval_policy(
                        self._save_dir, self._env_real, policy,
                        self.montecarlo_estimator, prefix,
                        self.num_eval_rollouts_real)
                to.save(self.cands_values,
                        osp.join(self._save_dir, 'candidates_values.pt'))

                if len(found_cands) < self.num_init_cand:
                    print_cbt(
                        'Found fewer candidates than the number of initial candidates.',
                        'y')
                else:
                    self.initialized = True

            except (FileNotFoundError, RuntimeError):
                # If there are no *_returns_real.pt files but len(found_policies) > 0 (was checked earlier),
                # then the initial policies have not been evaluated yet
                self.eval_init_policies()
                self.initialized = True

            # Get current iteration count
            found_iter_policies = None
            for root, dirs, files in os.walk(ld):
                found_iter_policies = [
                    p for p in files
                    if p.startswith('iter_') and p.endswith('_policy.pt')
                ]

            if not found_iter_policies:
                self._curr_iter = 0
                # We don't need to init the subroutine since it will be reset for iteration 0 anyway
            else:
                self._curr_iter = len(
                    found_iter_policies)  # continue with next

                # Initialize subroutine with previous iteration
                self._subroutine.load_snapshot(
                    ld, meta_info=dict(prefix=f'iter_{self._curr_iter - 1}'))

                # Evaluate and save the latest candidate on the target system.
                # This is the case if we found iter_i_candidate.pt but not iter_i_returns_real.pt
                if self.cands.shape[0] == self.cands_values.shape[0] + 1:
                    curr_cand_value = self.eval_policy(
                        self._save_dir,
                        self._env_real,
                        self._subroutine.policy,
                        self.montecarlo_estimator,
                        prefix=f'iter_{self._curr_iter - 1}',
                        num_rollouts=self.num_eval_rollouts_real)
                    self.cands_values = to.cat(
                        [self.cands_values,
                         curr_cand_value.view(1)], dim=0)
                    to.save(self.cands_values,
                            osp.join(self._save_dir, 'candidates_values.pt'))

                    if isinstance(self._env_real, RealEnv):
                        input(
                            'Evaluated in the target domain. Hit any key to continue.'
                        )

        else:
            raise pyrado.ValueErr(
                msg=f'{self.name} is not supposed to be run as a subroutine!')

    @staticmethod
    def argmax_posterior_mean(cands: to.Tensor, cands_values: to.Tensor,
                              uc_normalizer: UnitCubeProjector,
                              num_restarts: int,
                              num_samples: int) -> to.Tensor:
        """
        Compute the GP input with the maximal posterior mean.

        :param cands: candidates a.k.a. x
        :param cands_values: observed values a.k.a. y
        :param uc_normalizer: unit cube normalizer used during the experiments (can be recovered from the bounds)
        :param num_restarts: number of restarts for the optimization of the acquisition function
        :param num_samples: number of samples for the optimization of the acquisition function
        :return: un-normalized candidate with maximum posterior value a.k.a. x
        """
        # Normalize the input data and standardize the output data
        cands_norm = uc_normalizer.project_to(cands)
        cands_values_stdized = standardize(cands_values)

        # Create and fit the GP model
        gp = SingleTaskGP(cands_norm, cands_values_stdized)
        gp.likelihood.noise_covar.register_constraint('raw_noise',
                                                      GreaterThan(1e-5))
        mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
        fit_gpytorch_model(mll)

        # Find position with maximal posterior mean
        cand_norm, acq_value = optimize_acqf(
            acq_function=PosteriorMean(gp),
            bounds=to.stack([
                to.zeros_like(uc_normalizer.bound_lo),
                to.ones_like(uc_normalizer.bound_up)
            ]),
            q=1,
            num_restarts=num_restarts,
            raw_samples=num_samples)

        cand = uc_normalizer.project_back(cand_norm.detach())
        print_cbt(f'Converged to argmax of the posterior mean\n{cand.numpy()}',
                  'g',
                  bright=True)
        return cand

    @staticmethod
    def train_argmax_policy(load_dir: str,
                            env_sim: MetaDomainRandWrapper,
                            subroutine: Algorithm,
                            num_restarts: int,
                            num_samples: int,
                            policy_param_init: to.Tensor = None,
                            valuefcn_param_init: to.Tensor = None) -> Policy:
        """
        Train a policy based on the maximizer of the posterior mean.

        :param load_dir: directory to load from
        :param env_sim: simulation environment
        :param subroutine: algorithm which performs the policy / value-function optimization
        :param num_restarts: number of restarts for the optimization of the acquisition function
        :param num_samples: number of samples for the optimization of the acquisition function
        :param policy_param_init: initial policy parameter values for the subroutine, set `None` to be random
        :param valuefcn_param_init: initial value function parameter values for the subroutine, set `None` to be random
        :return: the final BayRn policy
        """
        # Load the required data
        cands = to.load(osp.join(load_dir, 'candidates.pt'))
        cands_values = to.load(osp.join(load_dir,
                                        'candidates_values.pt')).unsqueeze(1)
        bounds = to.load(osp.join(load_dir, 'bounds.pt'))
        uc_normalizer = UnitCubeProjector(bounds[0, :], bounds[1, :])

        # Find the maximizer
        argmax_cand = BayRn.argmax_posterior_mean(cands, cands_values,
                                                  uc_normalizer, num_restarts,
                                                  num_samples)

        # Set the domain randomizer given the hyper-parameters
        env_sim.adapt_randomizer(argmax_cand.numpy())

        # Reset the subroutine's algorithm which includes resetting the exploration
        subroutine.reset()

        # Reset the subroutine's policy (and value function)
        subroutine.policy.init_param(policy_param_init)
        if isinstance(subroutine, ActorCritic):
            subroutine.critic.value_fcn.init_param(valuefcn_param_init)
        if policy_param_init is None:
            print_cbt('Learning the argmax solution from scratch', 'y')
        else:
            print_cbt('Learning the argmax solution given an initialization',
                      'y')

        subroutine.train(
            snapshot_mode='best')  # meta_info=dict(prefix='final')
        return subroutine.policy
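
# Hedged usage sketch of BayRn.train_argmax_policy() after the BO phase has finished. The names
# ex_dir, env_sim, and subroutine are assumptions (experiment directory, randomized simulation
# environment, and policy-optimization subroutine, set up as in the loading script at the end of
# this page); the restart and sample counts are illustrative.
policy_argmax = BayRn.train_argmax_policy(load_dir=ex_dir,
                                          env_sim=env_sim,
                                          subroutine=subroutine,
                                          num_restarts=100,
                                          num_samples=500)
to.save(policy_argmax, osp.join(ex_dir, 'policy_argmax.pt'))
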
Example No. 8
0
    def __init__(
        self,
        subrtn: ParameterExploring,
        behavior_policy: Policy,
        num_rollouts_per_distr: int,
        metric: Union[Callable[[np.ndarray], np.ndarray], None],
        obs_dim_weight: Union[list, np.ndarray],
        std_obs_filt: int = 5,
        w_abs: float = 0.5,
        w_sq: float = 1.0,
        num_workers: int = 4,
        base_seed: int = 1001,
    ):
        """
        Constructor

        :param subrtn: wrapped algorithm to fit the domain parameter distribution
        :param behavior_policy: lower level policy used to generate the rollouts
        :param num_rollouts_per_distr: number of rollouts per domain distribution parameter set
        :param metric: functional mapping from differences in observations to value
        :param obs_dim_weight: (diagonal) weight matrix for the different observation dimensions for the default metric
        :param std_obs_filt: number of standard deviations for the Gaussian filter applied to the observations
        :param w_abs: weight for the mean absolute errors for the default metric
        :param w_sq: weight for the mean squared errors for the default metric
        :param num_workers: number of environments for parallel sampling
        :param base_seed: seed to set for the parallel sampler in every iteration
        """
        if not isinstance(subrtn, ParameterExploring):
            raise pyrado.TypeErr(given=subrtn,
                                 expected_type=ParameterExploring)
        if not isinstance(subrtn.env, MetaDomainRandWrapper):
            raise pyrado.TypeErr(given=subrtn.env,
                                 expected_type=MetaDomainRandWrapper)
        if not isinstance(subrtn.policy, DomainDistrParamPolicy):
            raise pyrado.TypeErr(given=subrtn.policy,
                                 expected_type=DomainDistrParamPolicy)
        if not isinstance(behavior_policy, Policy):
            raise pyrado.TypeErr(given=behavior_policy, expected_type=Policy)
        if subrtn.policy.num_param != len(subrtn.env.dp_mapping):
            raise pyrado.ShapeErr(
                msg=
                f"Number of policy parameters {subrtn.policy.num_param} does not match the"
                f"number of domain distribution parameters {len(subrtn.env.dp_mapping)}!"
            )
        if subrtn.sampler.num_init_states_per_domain != 1:
            # Only sample one rollout in every domain. This is possible since we are synchronizing the init state.
            raise pyrado.ValueErr(
                given=subrtn.sampler.num_init_states_per_domain,
                eq_constraint="1")
        if num_rollouts_per_distr < 2:
            raise pyrado.ValueErr(given=num_rollouts_per_distr,
                                  g_constraint="1")
        if len(obs_dim_weight) != subrtn.env.obs_space.flat_dim:
            raise pyrado.ShapeErr(given=obs_dim_weight,
                                  expected_match=subrtn.env.obs_space)

        # Call Algorithm's constructor
        super().__init__(subrtn.save_dir, subrtn.max_iter, subrtn.policy,
                         subrtn.logger)

        self._subrtn = subrtn
        self._subrtn.save_name = "subrtn"
        self._behavior_policy = behavior_policy
        self.obs_dim_weight = np.diag(
            obs_dim_weight
        )  # weighting factor between the different observations
        self.std_obs_filt = std_obs_filt
        if metric is None or metric == "None":
            self.metric = partial(self.weighted_l1_l2_metric,
                                  w_abs=w_abs,
                                  w_sq=w_sq,
                                  obs_dim_weight=self.obs_dim_weight)
        else:
            self.metric = metric

        # Get and optionally clip the observation bounds of the environment
        elb, eub = subrtn.env.obs_space.bound_lo, subrtn.env.obs_space.bound_up
        elb, eub = self.override_obs_bounds(elb, eub,
                                            subrtn.env.obs_space.labels)
        self.obs_normalizer = UnitCubeProjector(bound_lo=elb, bound_up=eub)

        # Create the sampler used to execute the same policy as on the real system in the meta-randomized env
        self.base_seed = base_seed
        self.behavior_sampler = ParallelRolloutSampler(self._subrtn.env,
                                                       self._behavior_policy,
                                                       num_workers=num_workers,
                                                       min_rollouts=1,
                                                       seed=base_seed)
        self.num_rollouts_per_distr = num_rollouts_per_distr
Example No. 9
0
class SysIdViaEpisodicRL(Algorithm):
    """
    Wrapper to frame black-box system identification as an episodic reinforcement learning problem

    .. note::
        This algorithm was designed as a subroutine of SimOpt. However, it could also be used independently.
    """

    name: str = "sysiderl"
    iteration_key: str = "sysiderl_iteration"  # logger's iteration key

    def __init__(
        self,
        subrtn: ParameterExploring,
        behavior_policy: Policy,
        num_rollouts_per_distr: int,
        metric: Union[Callable[[np.ndarray], np.ndarray], None],
        obs_dim_weight: Union[list, np.ndarray],
        std_obs_filt: int = 5,
        w_abs: float = 0.5,
        w_sq: float = 1.0,
        num_workers: int = 4,
        base_seed: int = 1001,
    ):
        """
        Constructor

        :param subrtn: wrapped algorithm to fit the domain parameter distribution
        :param behavior_policy: lower level policy used to generate the rollouts
        :param num_rollouts_per_distr: number of rollouts per domain distribution parameter set
        :param metric: functional mapping from differences in observations to value
        :param obs_dim_weight: (diagonal) weight matrix for the different observation dimensions for the default metric
        :param std_obs_filt: number of standard deviations for the Gaussian filter applied to the observations
        :param w_abs: weight for the mean absolute errors for the default metric
        :param w_sq: weight for the mean squared errors for the default metric
        :param num_workers: number of environments for parallel sampling
        :param base_seed: seed to set for the parallel sampler in every iteration
        """
        if not isinstance(subrtn, ParameterExploring):
            raise pyrado.TypeErr(given=subrtn,
                                 expected_type=ParameterExploring)
        if not isinstance(subrtn.env, MetaDomainRandWrapper):
            raise pyrado.TypeErr(given=subrtn.env,
                                 expected_type=MetaDomainRandWrapper)
        if not isinstance(subrtn.policy, DomainDistrParamPolicy):
            raise pyrado.TypeErr(given=subrtn.policy,
                                 expected_type=DomainDistrParamPolicy)
        if not isinstance(behavior_policy, Policy):
            raise pyrado.TypeErr(given=behavior_policy, expected_type=Policy)
        if subrtn.policy.num_param != len(subrtn.env.dp_mapping):
            raise pyrado.ShapeErr(
                msg=
                f"Number of policy parameters {subrtn.policy.num_param} does not match the"
                f"number of domain distribution parameters {len(subrtn.env.dp_mapping)}!"
            )
        if subrtn.sampler.num_init_states_per_domain != 1:
            # Only sample one rollout in every domain. This is possible since we are synchronizing the init state.
            raise pyrado.ValueErr(
                given=subrtn.sampler.num_init_states_per_domain,
                eq_constraint="1")
        if num_rollouts_per_distr < 2:
            raise pyrado.ValueErr(given=num_rollouts_per_distr,
                                  g_constraint="1")
        if len(obs_dim_weight) != subrtn.env.obs_space.flat_dim:
            raise pyrado.ShapeErr(given=obs_dim_weight,
                                  expected_match=subrtn.env.obs_space)

        # Call Algorithm's constructor
        super().__init__(subrtn.save_dir, subrtn.max_iter, subrtn.policy,
                         subrtn.logger)

        self._subrtn = subrtn
        self._subrtn.save_name = "subrtn"
        self._behavior_policy = behavior_policy
        self.obs_dim_weight = np.diag(
            obs_dim_weight
        )  # weighting factor between the different observations
        self.std_obs_filt = std_obs_filt
        if metric is None or metric == "None":
            self.metric = partial(self.weighted_l1_l2_metric,
                                  w_abs=w_abs,
                                  w_sq=w_sq,
                                  obs_dim_weight=self.obs_dim_weight)
        else:
            self.metric = metric

        # Get and optionally clip the observation bounds of the environment
        elb, eub = subrtn.env.obs_space.bound_lo, subrtn.env.obs_space.bound_up
        elb, eub = self.override_obs_bounds(elb, eub,
                                            subrtn.env.obs_space.labels)
        self.obs_normalizer = UnitCubeProjector(bound_lo=elb, bound_up=eub)

        # Create the sampler used to execute the same policy as on the real system in the meta-randomized env
        self.base_seed = base_seed
        self.behavior_sampler = ParallelRolloutSampler(self._subrtn.env,
                                                       self._behavior_policy,
                                                       num_workers=num_workers,
                                                       min_rollouts=1,
                                                       seed=base_seed)
        self.num_rollouts_per_distr = num_rollouts_per_distr

    @property
    def subrtn(self) -> ParameterExploring:
        """Get the subroutine used for updating the domain parameter distribution."""
        return self._subrtn

    def reset(self, seed: int = None):
        # Reset internal variables inherited from Algorithm
        self._curr_iter = 0
        self._cnt_samples = 0
        self._highest_avg_ret = -pyrado.inf

        # Forward to subroutine
        self._subrtn.reset(seed)

    def step(self, snapshot_mode: str, meta_info: dict = None):
        if "rollouts_real" not in meta_info:
            raise pyrado.KeyErr(keys="rollouts_real", container=meta_info)

        # Extract the initial states from the real rollouts
        rollouts_real = meta_info["rollouts_real"]
        init_states_real = [ro.states[0, :] for ro in rollouts_real]

        # Sample new policy parameters a.k.a. domain distribution parameters
        param_sets = self._subrtn.expl_strat.sample_param_sets(
            nominal_params=self._subrtn.policy.param_values,
            num_samples=self._subrtn.pop_size,
            include_nominal_params=True,
        )

        # Iterate over every domain parameter distribution. We basically mimic the ParameterExplorationSampler here,
        # but we need to adapt the randomizer (and not just the domain parameters) for every policy parameter set
        param_samples = []
        loss_hist = []
        for idx_ps, ps in enumerate(param_sets):
            # Update the randomizer to use the new domain distribution parameter values
            new_ddp_vals = self._subrtn.policy.transform_to_ddp_space(ps)
            self._subrtn.env.adapt_randomizer(
                domain_distr_param_values=new_ddp_vals.detach().cpu().numpy())
            self._subrtn.env.randomizer.randomize(
                num_samples=self.num_rollouts_per_distr)
            sampled_domain_params = self._subrtn.env.randomizer.get_params()

            # Sample the rollouts
            rollouts_sim = self.behavior_sampler.sample(init_states_real,
                                                        sampled_domain_params,
                                                        eval=True)

            # Iterate over the simulated rollouts that share the same initial state
            for idx_real, idcs_sim in enumerate(
                    gen_ordered_batch_idcs(self.num_rollouts_per_distr,
                                           len(rollouts_sim),
                                           sorted=True)):
                # Clip the rollouts, yielding two lists of pairwise equally long rollouts
                ros_real_tr, ros_sim_tr = self.truncate_rollouts(
                    [rollouts_real[idx_real]],
                    rollouts_sim[slice(idcs_sim[0], idcs_sim[-1] + 1)])

                # Check the validity of the initial states. The domain parameters will be different.
                assert len(ros_real_tr) == len(ros_sim_tr) == len(idcs_sim)
                assert check_all_equal([ro.states[0, :] for ro in ros_real_tr])
                assert check_all_equal([ro.states[0, :] for ro in ros_sim_tr])
                assert all([
                    np.allclose(r.states[0, :], s.states[0, :])
                    for r, s in zip(ros_real_tr, ros_sim_tr)
                ])

                # Compute the losses
                losses = np.asarray([
                    self.loss_fcn(ro_r, ro_s)
                    for ro_r, ro_s in zip(ros_real_tr, ros_sim_tr)
                ])

                if np.all(losses == 0.0):
                    raise pyrado.ValueErr(
                        msg=
                        "All SysIdViaEpisodicRL losses are equal to zero! Most likely the domain"
                        "randomization is too extreme, such that every trajectory is done after"
                        "one step. Check the exploration strategy.")

                # Handle zero losses by setting them to the maximum current loss
                losses[losses == 0] = np.max(losses)
                loss_hist.extend(losses)

                # We need to assign the loss value to the simulated rollout, but this one can be of a different
                # length than the real-world rollouts as well as of different length than the original
                # (non-truncated) simulated rollout. Thus, we simply write the loss value into the first step.
                for i, l in zip(range(idcs_sim[0], idcs_sim[-1] + 1), losses):
                    rollouts_sim[i].rewards[:] = 0.0
                    rollouts_sim[i].rewards[0] = -l

            # Collect the results
            param_samples.append(
                ParameterSample(params=ps, rollouts=rollouts_sim))

        # Bind the parameter samples and their rollouts in the usual container
        param_samp_res = ParameterSamplingResult(param_samples)
        self._cnt_samples += sum(
            [len(ro) for pss in param_samp_res for ro in pss.rollouts])

        # Log metrics computed from the old policy (before the update)
        loss_hist = np.asarray(loss_hist)
        self.logger.add_value("min sysid loss", np.min(loss_hist), 6)
        self.logger.add_value("median sysid loss", np.median(loss_hist), 6)
        self.logger.add_value("avg sysid loss", np.mean(loss_hist), 6)
        self.logger.add_value("max sysid loss", np.max(loss_hist), 6)
        self.logger.add_value("std sysid loss", np.std(loss_hist), 6)

        # Extract the best policy parameter sample for saving it later
        self._subrtn.best_policy_param = param_samp_res.parameters[np.argmax(
            param_samp_res.mean_returns)].clone()

        # Save snapshot data
        self.make_snapshot(snapshot_mode,
                           float(np.max(param_samp_res.mean_returns)),
                           meta_info)

        # Call the wrapped algorithm's update method
        self._subrtn.update(
            param_samp_res,
            ret_avg_curr=param_samp_res[0].mean_undiscounted_return)

    @staticmethod
    def override_obs_bounds(bound_lo: np.ndarray, bound_up: np.ndarray,
                            labels: np.ndarray) -> (np.ndarray, np.ndarray):
        """
        Default overriding method for the bounds of an observation space. This is necessary when the observations
        are scaled with their range, e.g. to compare a deviation over different kinds of observations like position and
        angular velocity. Thus, infinite bounds are not feasible.

        :param bound_lo: lower bound of the observation space
        :param bound_up: upper bound of the observation space
        :param labels: label for each dimension of the observation space to override
        :return: clipped lower and upper bound
        """
        bound_lo = ObsNormWrapper.override_bounds(bound_lo, {
            "theta_dot": -20.0,
            "alpha_dot": -20.0
        }, labels)
        bound_up = ObsNormWrapper.override_bounds(bound_up, {
            "theta_dot": 20.0,
            "alpha_dot": 20.0
        }, labels)
        return bound_lo, bound_up
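
# Standalone numpy sketch of what the bound override above amounts to (it mimics, but is not,
# pyrado's ObsNormWrapper.override_bounds): replace non-finite bound entries by finite values
# keyed by the observation label, so that the unit-cube normalization stays well-defined.
import numpy as np

def override_bounds_sketch(bounds: np.ndarray, overrides: dict, labels: np.ndarray) -> np.ndarray:
    bounds = bounds.copy()
    for i, lab in enumerate(labels):
        if not np.isfinite(bounds[i]) and lab in overrides:
            bounds[i] = overrides[lab]
    return bounds

# Example: override_bounds_sketch(np.array([-1.0, -np.inf]), {'alpha_dot': -20.0}, np.array(['theta', 'alpha_dot']))
# returns array([ -1., -20.])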

    @staticmethod
    def weighted_l1_l2_metric(err: np.ndarray, w_abs: float, w_sq: float,
                              obs_dim_weight: np.ndarray):
        """
        Compute the weighted linear combination of the observation error's MAE and MSE, averaged over time

        .. note::
            In contrast to [1], we are using the mean absolute error and the mean squared error instead of the L1 and
            the L2 norm. The reason for this is that longer time series would be punished otherwise.

        :param err: error signal with time steps along the first dimension
        :param w_abs: weight for the mean absolute errors
        :param w_sq: weight for the mean squared errors
        :param obs_dim_weight: (diagonal) weight matrix for the different observation dimensions
        :return: weighted linear combination of the error's MAE and MSE, averaged over time
        """
        err_w = np.matmul(err, obs_dim_weight)
        return w_abs * np.mean(np.abs(err_w), axis=0) + w_sq * np.mean(
            np.power(err_w, 2), axis=0)
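
# Tiny usage sketch of the metric above on a fabricated error signal (all values are illustrative):
# 3 time steps, 2 observation dimensions, and a diagonal weight matrix as built in the constructor.
import numpy as np

err = np.array([[0.1, -0.2], [0.0, 0.3], [-0.1, 0.1]])
obs_dim_weight = np.diag([1.0, 2.0])
loss_per_obs_dim = SysIdViaEpisodicRL.weighted_l1_l2_metric(err, w_abs=0.5, w_sq=1.0,
                                                            obs_dim_weight=obs_dim_weight)
# loss_per_obs_dim holds one non-negative value per observation dimension; loss_fcn() below sums them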

    def loss_fcn(self, rollout_real: StepSequence,
                 rollout_sim: StepSequence) -> float:
        """
        Compute the discrepancy between two time sequences of observations given the metric.
        Be sure to align and truncate the rollouts beforehand.

        :param rollout_real: (concatenated) real-world rollout containing the observations
        :param rollout_sim: (concatenated) simulated rollout containing the observations
        :return: discrepancy cost summed over the observation dimensions
        """
        if len(rollout_real) != len(rollout_sim):
            raise pyrado.ShapeErr(given=rollout_real,
                                  expected_match=rollout_sim)

        # Extract the observations
        real_obs = rollout_real.get_data_values("observations",
                                                truncate_last=True)
        sim_obs = rollout_sim.get_data_values("observations",
                                              truncate_last=True)

        # Filter the observations
        real_obs = gaussian_filter1d(real_obs, self.std_obs_filt, axis=0)
        sim_obs = gaussian_filter1d(sim_obs, self.std_obs_filt, axis=0)

        # Normalize the signals
        real_obs_norm = self.obs_normalizer.project_to(real_obs)
        sim_obs_norm = self.obs_normalizer.project_to(sim_obs)

        # Compute loss based on the error
        loss_per_obs_dim = self.metric(real_obs_norm - sim_obs_norm)
        assert len(loss_per_obs_dim) == real_obs.shape[1]
        assert all(loss_per_obs_dim >= 0)
        return sum(loss_per_obs_dim)

    @staticmethod
    def truncate_rollouts(
        rollouts_real: Sequence[StepSequence],
        rollouts_sim: Sequence[StepSequence],
        replicate: bool = True
    ) -> Tuple[Sequence[StepSequence], Sequence[StepSequence]]:
        """
        In case (some of the) rollouts failed or succeeded in one domain, but not in the other, we truncate the longer
        observation sequence. When truncating, we compare each of the M real rollouts to each of the N simulated
        rollouts, thus replicating every real rollout N times and every simulated rollout M times.

        :param rollouts_real: M real-world rollouts of different length if `replicate = True`, else K real-world
                              rollouts of different length
        :param rollouts_sim: N simulated rollouts of different length if `replicate = True`, else K simulated
                              rollouts of different length
        :param replicate: if `False`, the i-th rollout from `rollouts_real` is (only) compared with the i-th rollout
                          from `rollouts_sim`; in this case the number of rollouts and the initial states have to match
        :return: MxN real-world rollouts and MxN simulated rollouts of equal length if `replicate = True`, else
                 K real-world rollouts and K simulated rollouts of equal length
        """
        if not isinstance(rollouts_real[0], Iterable):
            raise pyrado.TypeErr(given=rollouts_real[0],
                                 expected_type=Iterable)
        if not isinstance(rollouts_sim[0], Iterable):
            raise pyrado.TypeErr(given=rollouts_sim[0], expected_type=Iterable)
        if not replicate and len(rollouts_real) != len(rollouts_sim):
            raise pyrado.ShapeErr(
                msg=
                "In case of a one on one comparison, the number of rollouts needs to be equal!"
            )

        # Choose the function for pairing the rollouts for comparison
        comp_fcn = product if replicate else zip

        # Go over all rollout combinations individually
        rollouts_real_tr = []
        rollouts_sim_tr = []
        for ro_r, ro_s in comp_fcn(rollouts_real, rollouts_sim):
            # Handle rollouts of different length, assuming that they are starting at the same state
            if ro_r.length < ro_s.length:
                rollouts_real_tr.append(ro_r)
                rollouts_sim_tr.append(ro_s[:ro_r.length])
            elif ro_r.length > ro_s.length:
                rollouts_real_tr.append(ro_r[:ro_s.length])
                rollouts_sim_tr.append(ro_s)
            else:
                rollouts_real_tr.append(ro_r)
                rollouts_sim_tr.append(ro_s)

        return rollouts_real_tr, rollouts_sim_tr
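
# Pure-Python sketch of the pairing and truncation idea above; plain lists stand in for pyrado's
# StepSequence objects, so this only illustrates the replicate=True branch and is not the method itself.
from itertools import product

ros_real = [[0, 1, 2, 3], [0, 1]]        # M = 2 "real" rollouts of length 4 and 2
ros_sim = [[0, 1, 2], [0, 1, 2, 3, 4]]   # N = 2 "simulated" rollouts of length 3 and 5
pairs = [(r[:min(len(r), len(s))], s[:min(len(r), len(s))]) for r, s in product(ros_real, ros_sim)]
# pairs contains M*N = 4 tuples of pairwise equally long sequences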

    def save_snapshot(self, meta_info: dict = None):
        super().save_snapshot(meta_info)

        # ParameterExploring subroutine saves the best policy (in this case a DomainDistrParamPolicy)
        prefix = meta_info.get("prefix", "")
        if prefix != "":
            self._subrtn.save_snapshot(meta_info=dict(
                prefix=f"{prefix}_ddp"))  # save iter_X_ddp_policy.pt
        self._subrtn.save_snapshot(
            meta_info=dict(prefix="ddp"))  # override ddp_policy.pt

        joblib.dump(self._subrtn.env, osp.join(self.save_dir, "env_sim.pkl"))

        # Print the current search distribution's mean
        cpp = self._subrtn.policy.transform_to_ddp_space(
            self._subrtn.policy.param_values)
        self._subrtn.env.adapt_randomizer(
            domain_distr_param_values=cpp.detach().cpu().numpy())
        print_cbt(
            f"Current policy domain parameter distribution\n{self._subrtn.env.randomizer}",
            "g")

        # Set the randomizer to best fitted domain distribution
        cbp = self._subrtn.policy.transform_to_ddp_space(
            self._subrtn.best_policy_param)
        self._subrtn.env.adapt_randomizer(
            domain_distr_param_values=cbp.detach().cpu().numpy())
        print_cbt(
            f"Best fitted domain parameter distribution\n{self._subrtn.env.randomizer}",
            "g")

        if "rollouts_real" not in meta_info:
            raise pyrado.KeyErr(keys="rollouts_real", container=meta_info)
        pyrado.save(meta_info["rollouts_real"],
                    "rollouts_real.pkl",
                    self.save_dir,
                    prefix=prefix)
Example No. 10
0
    # Adjustable experiment parameters
    set_seed(1001)
    num_init_samples = 4  # number of initial random points
    num_iter = 6  # number of BO updates
    noise_std = 0.0  # noise level
    acq_fcn = "EI"  # acquisition function (UCB / EI / PI)
    num_acq_restarts = 100  # number of restarts for optimizing the acquisition function
    num_acq_samples = 500  # number of samples used for optimizing the acquisition function
    ucb_beta = 0.1  # UCB coefficient (only necessary if UCB is used)

    # Function boundaries
    x_min_raw, x_max_raw = (-2.0, 5.0)
    x_min, x_max = (0.0, 1.0)
    bounds_raw = to.tensor([[x_min_raw], [x_max_raw]])
    bounds = to.tensor([[x_min], [x_max]])
    uc = UnitCubeProjector(bounds_raw[0, :], bounds_raw[1, :])

    # Generate initial data
    X_train_raw = (x_max_raw - x_min_raw) * to.rand(num_init_samples,
                                                    1) + x_min_raw
    y_train_raw = to.from_numpy(
        noisy_nonlin_fcn(X_train_raw.numpy(), noise_std=noise_std))
    X_train = uc.project_to(X_train_raw)
    y_mean, y_std = y_train_raw.mean(), y_train_raw.std()
    y_train = (y_train_raw - y_mean) / y_std

    # Get best observed value from dataset
    best_observed = [y_train_raw.max().item()]

    # Initialize model
    gp = SingleTaskGP(X_train, y_train)
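
    # Hedged continuation sketch of one BO update for the snippet above; it reuses the names defined
    # here (gp, uc, bounds, y_train, noisy_nonlin_fcn, ...) and assumes the standard BoTorch/GPyTorch
    # import locations, so treat it as an illustration rather than the original script.
    from botorch.acquisition import ExpectedImprovement
    from botorch.fit import fit_gpytorch_model
    from botorch.optim import optimize_acqf
    from gpytorch.mlls import ExactMarginalLogLikelihood

    # Fit the GP hyper-parameters by maximizing the marginal log-likelihood
    mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
    fit_gpytorch_model(mll)

    # Maximize the acquisition function over the unit interval (the EI branch of acq_fcn)
    ei = ExpectedImprovement(gp, best_f=y_train.max().item(), maximize=True)
    x_new, _ = optimize_acqf(acq_function=ei, bounds=bounds, q=1,
                             num_restarts=num_acq_restarts, raw_samples=num_acq_samples)

    # Evaluate the raw (un-normalized) candidate and track the best observed value
    x_new_raw = uc.project_back(x_new)
    y_new_raw = to.from_numpy(noisy_nonlin_fcn(x_new_raw.numpy(), noise_std=noise_std))
    best_observed.append(max(best_observed[-1], y_new_raw.max().item()))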
Example No. 11
0
if __name__ == '__main__':
    # Parse command line arguments
    args = get_argparser().parse_args()

    # Get the experiment's directory to load from
    ex_dir = ask_for_experiment() if args.ex_dir is None else args.ex_dir

    # Load the environment and the policy
    env_sim, policy, kwout = load_experiment(ex_dir, args)

    # Load the required data
    cands = to.load(osp.join(ex_dir, 'candidates.pt'))
    cands_values = to.load(osp.join(ex_dir, 'candidates_values.pt')).unsqueeze(1)
    bounds = to.load(osp.join(ex_dir, 'bounds.pt'))
    uc_normalizer = UnitCubeProjector(bounds[0, :], bounds[1, :])

    # Decide on which algorithm to use via the mode argument
    if args.mode == PPO.name:
        critic = GAE(kwout['value_fcn'], **kwout['hparams']['critic'])
        subroutine = PPO(ex_dir, env_sim, policy, critic, **kwout['hparams']['subroutine'])
    elif args.mode == PPO2.name:
        critic = GAE(kwout['value_fcn'], **kwout['hparams']['critic'])
        subroutine = PPO2(ex_dir, env_sim, policy, critic, **kwout['hparams']['subroutine'])
    elif args.mode == CEM.name:
        subroutine = CEM(ex_dir, env_sim, policy, **kwout['hparams']['subroutine'])
    elif args.mode == NES.name:
        subroutine = NES(ex_dir, env_sim, policy, **kwout['hparams']['subroutine'])
    elif args.mode == PoWER.name:
        subroutine = PoWER(ex_dir, env_sim, policy, **kwout['hparams']['subroutine'])
    else: