Example #1
def test_magic_function_implementation_or():
    a = CustomStoppingCriterion(None, "A")
    b = CustomStoppingCriterion(None, "B")
    for criterion, expected_str in [(a | a, "(A or A)"), (b | b, "(B or B)"),
                                    (a | b, "(A or B)"), (b | a, "(B or A)")]:
        assert isinstance(criterion, _OrStoppingCriterion)
        assert str(criterion) == expected_str
Example #2
def test_magic_function_implementation_and():
    a = CustomStoppingCriterion(None, "A")
    b = CustomStoppingCriterion(None, "B")
    for criterion, expected_str in [
        (a & a, "(A and A)"),
        (b & b, "(B and B)"),
        (a & b, "(A and B)"),
        (b & a, "(B and A)"),
    ]:
        assert isinstance(criterion, _AndStoppingCriterion)
        assert str(criterion) == expected_str
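Both tests exercise the `|` and `&` operator overloads of the stopping criteria. A minimal sketch of how such overloads are commonly wired up; the names `_OrStoppingCriterion` and `_AndStoppingCriterion` are taken from the tests above, everything else is an assumption rather than the actual Pyrado implementation:

# Minimal sketch (assumption), only to illustrate the composition the tests assert
class StoppingCriterion:
    def is_met(self, algo) -> bool:
        raise NotImplementedError

    def __or__(self, other: "StoppingCriterion") -> "StoppingCriterion":
        return _OrStoppingCriterion(self, other)

    def __and__(self, other: "StoppingCriterion") -> "StoppingCriterion":
        return _AndStoppingCriterion(self, other)


class _OrStoppingCriterion(StoppingCriterion):
    def __init__(self, lhs: StoppingCriterion, rhs: StoppingCriterion):
        self.lhs, self.rhs = lhs, rhs

    def is_met(self, algo) -> bool:
        return self.lhs.is_met(algo) or self.rhs.is_met(algo)

    def __str__(self) -> str:
        return f"({self.lhs} or {self.rhs})"


class _AndStoppingCriterion(StoppingCriterion):
    def __init__(self, lhs: StoppingCriterion, rhs: StoppingCriterion):
        self.lhs, self.rhs = lhs, rhs

    def is_met(self, algo) -> bool:
        return self.lhs.is_met(algo) and self.rhs.is_met(algo)

    def __str__(self) -> str:
        return f"({self.lhs} and {self.rhs})"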
Example #3
def test_criterion_custom(is_met_expected):
    # Assigning to a variable inside the closure would rebind it as a local name, so use a mutable list as a holder instead.
    was_called = [False]
    algo_expected = "ABC"

    def criterion_fn(algo):
        was_called[0] = True
        assert algo == algo_expected
        return is_met_expected

    criterion = CustomStoppingCriterion(criterion_fn, "Name")

    assert str(criterion) == "Name"
    assert criterion.is_met(algo_expected) == is_met_expected
    assert was_called[0]
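The `is_met_expected` argument indicates a parametrized test; the decorator itself is not part of the snippet. A plausible sketch (the parameter values are an assumption):

import pytest

# Hypothetical parametrization covering both outcomes of the criterion function
@pytest.mark.parametrize("is_met_expected", [True, False])
def test_criterion_custom(is_met_expected):
    ...  # body as in Example #3 above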
Example #4
    def __init__(
        self,
        save_dir: pyrado.PathLike,
        env_sim: MetaDomainRandWrapper,
        env_real: Union[RealEnv, EnvWrapper],
        subrtn: Algorithm,
        ddp_space: BoxSpace,
        max_iter: int,
        acq_fc: str,
        acq_restarts: int,
        acq_samples: int,
        acq_param: dict = None,
        num_init_cand: int = 5,
        mc_estimator: bool = True,
        num_eval_rollouts_real: int = 5,
        num_eval_rollouts_sim: int = 50,
        thold_succ: float = pyrado.inf,
        thold_succ_subrtn: float = -pyrado.inf,
        warmstart: bool = True,
        policy_param_init: Optional[to.Tensor] = None,
        valuefcn_param_init: Optional[to.Tensor] = None,
        subrtn_snapshot_mode: str = "best",
        num_workers: int = 4,
        logger: Optional[StepLogger] = None,
    ):
        """
        Constructor

        .. note::
            If you want to continue an experiment, use the `load_dir` argument for the `train` call. If you want to
            initialize every policy with pre-trained policy parameters, use `policy_param_init`.

        :param save_dir: directory to save the snapshots, i.e. the results, in
        :param env_sim: randomized simulation environment a.k.a. source domain
        :param env_real: real-world environment a.k.a. target domain
        :param subrtn: algorithm which performs the policy / value-function optimization
        :param ddp_space: space holding the boundaries for the domain distribution parameters
        :param max_iter: maximum number of iterations
        :param acq_fc: Acquisition Function
                       'UCB': Upper Confidence Bound (default $\beta = 0.1$)
                       'EI': Expected Improvement
                       'PI': Probability of Improvement
        :param acq_restarts: number of restarts for optimizing the acquisition function
        :param acq_samples: number of initial samples for optimizing the acquisition function
        :param acq_param: hyper-parameter for the acquisition function, e.g. $\beta$ for UCB
        :param num_init_cand: number of initial policies to train, ignored if `init_dir` is provided
        :param mc_estimator: estimate the return with a sample average (`True`) or a lower confidence
                             bound (`False`) obtained from bootstrapping
        :param num_eval_rollouts_real: number of rollouts in the target domain to estimate the return
        :param num_eval_rollouts_sim: number of rollouts in simulation to estimate the return after training
        :param thold_succ: success threshold on the real system's return for BayRn, stop the algorithm if exceeded
        :param thold_succ_subrtn: success threshold on the simulated system's return for the subroutine, repeat the
                                  subroutine until the threshold is exceeded or for a given number of iterations
        :param warmstart: initialize the policy (and value function) parameters with the ones of the previous iteration.
                          This behavior can also be overruled by passing `policy_param_init` (and
                          `valuefcn_param_init`) explicitly.
        :param policy_param_init: initial policy parameter values for the subroutine, set `None` to be random
        :param valuefcn_param_init: initial value function parameter values for the subroutine, set `None` to be random
        :param subrtn_snapshot_mode: snapshot mode for saving during training of the subroutine
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if typed_env(env_sim, MetaDomainRandWrapper) is None:
            raise pyrado.TypeErr(given=env_sim,
                                 expected_type=MetaDomainRandWrapper)
        if not isinstance(subrtn, Algorithm):
            raise pyrado.TypeErr(given=subrtn, expected_type=Algorithm)
        if not isinstance(ddp_space, BoxSpace):
            raise pyrado.TypeErr(given=ddp_space, expected_type=BoxSpace)
        if num_init_cand < 1:
            raise pyrado.ValueErr(given=num_init_cand, ge_constraint="1")

        # Call InterruptableAlgorithm's constructor, re-using the subroutine's policy
        super().__init__(
            num_checkpoints=2,
            init_checkpoint=-2,
            save_dir=save_dir,
            max_iter=max_iter,
            policy=subrtn.policy,
            logger=logger,
        )

        self._env_sim = env_sim
        self._env_real = env_real
        self._subrtn = subrtn
        self._subrtn.save_name = "subrtn"
        self.ddp_space = ddp_space
        self.ddp_projector = UnitCubeProjector(
            to.from_numpy(self.ddp_space.bound_lo).to(dtype=to.get_default_dtype()),
            to.from_numpy(self.ddp_space.bound_up).to(dtype=to.get_default_dtype()),
        )
        self.cands = None  # called x in the context of GPs
        self.cands_values = None  # called y in the context of GPs
        self.argmax_cand = to.Tensor()
        self.acq_fcn_type = acq_fc.upper()
        self.acq_restarts = acq_restarts
        self.acq_samples = acq_samples
        self.acq_param = acq_param
        self.num_init_cand = num_init_cand
        self.mc_estimator = mc_estimator
        self.policy_param_init = policy_param_init
        self.valuefcn_param_init = valuefcn_param_init.detach() if valuefcn_param_init is not None else None
        self.warmstart = warmstart
        self.num_eval_rollouts_real = num_eval_rollouts_real
        self.num_eval_rollouts_sim = num_eval_rollouts_sim
        self.subrtn_snapshot_mode = subrtn_snapshot_mode
        self.thold_succ = to.tensor([thold_succ], dtype=to.get_default_dtype())
        self.thold_succ_subrtn = to.tensor([thold_succ_subrtn],
                                           dtype=to.get_default_dtype())
        self.max_subrtn_rep = 3  # number of tries to exceed thold_succ_subrtn during training in simulation
        self.curr_cand_value = -pyrado.inf  # for the stopping criterion
        self.num_workers = int(num_workers)

        if self.policy_param_init is not None:
            if to.is_tensor(self.policy_param_init):
                self.policy_param_init = self.policy_param_init.detach()  # detach() returns a new tensor
            else:
                self.policy_param_init = to.tensor(self.policy_param_init)

        # Save initial environments and the domain distribution parameter space
        self.save_snapshot(meta_info=None)
        pyrado.save(self.ddp_space, "ddp_space.pkl", self.save_dir)

        self.stopping_criterion = self.stopping_criterion | CustomStoppingCriterion(
            self._custom_stopping_criterion)
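The last two lines OR a custom criterion into the one inherited from the base class. Since `curr_cand_value` and `thold_succ` are maintained for exactly this purpose, a plausible sketch of `_custom_stopping_criterion` (the body is an assumption, not taken from the Pyrado source) is:

    # Sketch (assumption): stop once the latest candidate's real-world return exceeds the success threshold
    @staticmethod
    def _custom_stopping_criterion(algo) -> bool:
        return bool(algo.curr_cand_value >= algo.thold_succ)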
Example #5
    def __init__(
        self,
        save_dir: pyrado.PathLike,
        env: Env,
        policy: Policy,
        max_iter: int,
        num_init_states_per_domain: int,
        num_domains: int,
        pop_size: Optional[int] = None,
        num_workers: int = 4,
        logger: Optional[StepLogger] = None,
    ):
        """
        Constructor

        :param save_dir: directory to save the snapshots, i.e. the results, in
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param num_init_states_per_domain: number of rollouts to cover the variance over initial states
        :param num_domains: number of rollouts due to the variance over domain parameters
        :param pop_size: number of solutions in the population, pass `None` to use a default that scales logarithmically
                         with the number of policy parameters
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not isinstance(env, Env):
            raise pyrado.TypeErr(given=env, expected_type=Env)
        if not (isinstance(pop_size, int) or pop_size is None):
            raise pyrado.TypeErr(given=pop_size, expected_type=int)
        if isinstance(pop_size, int) and pop_size <= 0:
            raise pyrado.ValueErr(given=pop_size, g_constraint="0")

        # Call Algorithm's constructor
        super().__init__(save_dir, max_iter, policy, logger)

        self._env = env

        # Auto-select population size if needed
        if pop_size is None:
            pop_size = 4 + int(3 * np.log(policy.num_param))
            print_cbt(f"Initialized population size to {pop_size}.", "y")
        self.pop_size = pop_size

        # Create sampler
        self._sampler = ParameterExplorationSampler(
            env,
            policy,
            num_init_states_per_domain=num_init_states_per_domain,
            num_domains=num_domains,
            num_workers=num_workers,
        )

        # Stopping criterion
        self.ret_avg_stack = 1e3 * np.random.randn(20)  # stack size = 20
        self.thold_ret_std = 1e-1  # algorithm terminates if below for multiple iterations

        # Saving the best policy (this is not the mean for policy parameter exploration)
        self.best_policy_param = policy.param_values.clone()

        # Set this in subclasses
        self._expl_strat = None

        self.stopping_criterion = self.stopping_criterion | CustomStoppingCriterion(
            self._custom_stopping_criterion)
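Here `ret_avg_stack` and `thold_ret_std` exist solely to feed the custom criterion composed in the last line. A sketch of the check they suggest (an assumption, not the verified implementation):

    # Sketch (assumption): stop once the average returns of the last 20 iterations barely fluctuate anymore
    @staticmethod
    def _custom_stopping_criterion(algo) -> bool:
        return algo.ret_avg_stack.std() < algo.thold_ret_std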
Example #6
    def __init__(
        self,
        save_dir: pyrado.PathLike,
        inputs: to.Tensor,
        targets: to.Tensor,
        policy: Policy,
        max_iter: int,
        max_iter_no_improvement: int = 30,
        optim_class=optim.Adam,
        optim_hparam: dict = None,
        loss_fcn=nn.MSELoss(),
        batch_size: int = 256,
        ratio_train: float = 0.8,
        max_grad_norm: Optional[float] = None,
        lr_scheduler=None,
        lr_scheduler_hparam: Optional[dict] = None,
        logger: StepLogger = None,
    ):
        """
        Constructor

        :param save_dir: directory to save the snapshots, i.e. the results, in
        :param inputs: input data set, where the samples are along the first dimension
        :param targets: target data set, where the samples are along the first dimension
        :param policy: Pyrado policy (subclass of PyTorch's Module) to train
        :param max_iter: maximum number of iterations
        :param max_iter_no_improvement: if the performance on the validation set did not improve for this many
                                        iterations, the policy is considered to have converged, i.e. training stops
        :param optim_class: PyTorch optimizer class
        :param optim_hparam: hyper-parameters for the PyTorch optimizer
        :param loss_fcn: loss function for training, by default `torch.nn.MSELoss()`
        :param batch_size: number of samples per policy update batch
        :param ratio_train: ratio of the training samples w.r.t. the total sample count
        :param max_grad_norm: maximum L2 norm of the gradients for clipping, set to `None` to disable gradient clipping
        :param lr_scheduler: learning rate scheduler that does one step per epoch (pass through the whole data set)
        :param lr_scheduler_hparam: hyper-parameters for the learning rate scheduler
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not isinstance(inputs, to.Tensor):
            raise pyrado.TypeErr(given=inputs, expected_type=to.Tensor)
        if not isinstance(targets, to.Tensor):
            raise pyrado.TypeErr(given=targets, expected_type=to.Tensor)
        if not isinstance(ratio_train, float):
            raise pyrado.TypeErr(given=ratio_train, expected_type=float)
        if not (0 < ratio_train < 1):
            raise pyrado.ValueErr(given=ratio_train,
                                  g_constraint="0",
                                  l_constraint="1")

        # Call Algorithm's constructor
        super().__init__(save_dir, max_iter, policy, logger)

        # Construct the dataset (samples along rows)
        inputs = to.atleast_2d(inputs).T if inputs.ndimension() == 1 else inputs
        targets = to.atleast_2d(targets).T if targets.ndimension() == 1 else targets
        if inputs.shape[0] != targets.shape[0]:
            raise pyrado.ShapeErr(given=targets, expected_match=inputs)
        num_samples_all = inputs.shape[0]
        dataset = TensorDataset(
            inputs, targets)  # shared for training and validation loaders

        # Create training and validation loader
        idcs_all = to.randperm(num_samples_all)
        num_samples_trn = int(ratio_train * num_samples_all)
        num_samples_val = num_samples_all - num_samples_trn
        idcs_trn, idcs_val = idcs_all[:num_samples_trn], idcs_all[num_samples_trn:]
        self.loader_trn = DataLoader(
            dataset,
            batch_size=min(batch_size, num_samples_trn),
            drop_last=True,
            sampler=SubsetRandomSampler(idcs_trn),
        )
        self.loader_val = DataLoader(
            dataset,
            batch_size=min(batch_size, num_samples_val),
            drop_last=True,
            sampler=SubsetRandomSampler(idcs_val),
        )

        # Set defaults which can be overwritten by passing optim_hparam, and create the optimizer
        optim_hparam = merge_dicts([dict(lr=5e-3, eps=1e-8, weight_decay=1e-4), optim_hparam])
        self.optim = optim_class([{"params": self._policy.parameters()}], **optim_hparam)

        self.batch_size = batch_size
        self.ratio_train = ratio_train
        self.loss_fcn = loss_fcn
        self.max_grad_norm = max_grad_norm
        self._lr_scheduler = lr_scheduler
        self._lr_scheduler_hparam = lr_scheduler_hparam
        if lr_scheduler is not None and lr_scheduler_hparam is not None:
            self._lr_scheduler = lr_scheduler(self.optim,
                                              **lr_scheduler_hparam)

        # Stopping criterion
        self._curr_loss_val = pyrado.inf
        self._best_loss_val = pyrado.inf
        self._cnt_iter_no_improvement = 0
        self._max_iter_no_improvement = max_iter_no_improvement

        self.stopping_criterion = self.stopping_criterion | CustomStoppingCriterion(
            self._custom_stopping_criterion)
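The counters initialized under "# Stopping criterion" implement classic early stopping on the validation loss. A sketch of the corresponding check (an assumption, not taken verbatim from the source):

    # Sketch (assumption): stop once the validation loss has not improved for `max_iter_no_improvement` iterations
    @staticmethod
    def _custom_stopping_criterion(algo) -> bool:
        return algo._cnt_iter_no_improvement >= algo._max_iter_no_improvement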
Example #7
    def step(self, snapshot_mode: str, meta_info: dict = None):

        if isinstance(inner_env(self._env), BallOnPlate5DSim):
            ctrl_gains = to.tensor(
                [
                    [0.1401, 0, 0, 0, -0.09819, -0.1359, 0, 0.545, 0, 0, 0, -0.01417, -0.04427, 0],
                    [0, 0.1381, 0, 0.2518, 0, 0, -0.2142, 0, 0.5371, 0, 0.03336, 0, 0, -0.1262],
                    [0, 0, 0.1414, 0.0002534, 0, 0, -0.0002152, 0, 0, 0.5318, 0, 0, 0, -0.0001269],
                    [0, -0.479, -0.0004812, 39.24, 0, 0, -15.44, 0, -1.988, -0.001934, 9.466, 0, 0, -13.14],
                    [0.3039, 0, 0, 0, 25.13, 15.66, 0, 1.284, 0, 0, 0, 7.609, 6.296, 0],
                ]
            )

            # Compensate for the mismatch in the state definition
            if self.ball_z_dim_mismatch:
                ctrl_gains = insert_tensor_col(ctrl_gains, 7, to.zeros((5, 1)))  # ball z position
                ctrl_gains = insert_tensor_col(ctrl_gains, -1, to.zeros((5, 1)))  # ball z velocity

        elif isinstance(inner_env(self._env), QBallBalancerSim):
            # Since the control module can be tricky to install (using Anaconda is recommended), we only load it if needed
            import control

            # System modeling
            dp = self._env.domain_param
            dp["J_eq"] = self._env._J_eq
            dp["B_eq_v"] = self._env._B_eq_v
            dp["c_kin"] = self._env._c_kin
            dp["zeta"] = self._env._zeta
            dp["A_m"] = self._env._A_m

            A = np.zeros((self._env.obs_space.flat_dim, self._env.obs_space.flat_dim))
            A[: self._env.obs_space.flat_dim // 2, self._env.obs_space.flat_dim // 2 :] = np.eye(
                self._env.obs_space.flat_dim // 2
            )
            A[4, 4] = -dp["B_eq_v"] / dp["J_eq"]
            A[5, 5] = -dp["B_eq_v"] / dp["J_eq"]
            A[6, 0] = dp["c_kin"] * dp["ball_mass"] * dp["gravity_const"] * dp["ball_radius"] ** 2 / dp["zeta"]
            A[6, 6] = -dp["c_kin"] * dp["ball_radius"] ** 2 / dp["zeta"]
            A[7, 1] = dp["c_kin"] * dp["ball_mass"] * dp["gravity_const"] * dp["ball_radius"] ** 2 / dp["zeta"]
            A[7, 7] = -dp["c_kin"] * dp["ball_radius"] ** 2 / dp["zeta"]
            B = np.zeros((self._env.obs_space.flat_dim, self._env.act_space.flat_dim))
            B[4, 0] = dp["A_m"] / dp["J_eq"]
            B[5, 1] = dp["A_m"] / dp["J_eq"]
            # C = np.zeros((self._env.obs_space.flat_dim // 2, self._env.obs_space.flat_dim))
            # C[:self._env.obs_space.flat_dim // 2, :self._env.obs_space.flat_dim // 2] =
            # np.eye(self._env.obs_space.flat_dim // 2)
            # D = np.zeros((self._env.obs_space.flat_dim // 2, self._env.act_space.flat_dim))

            # Get the weighting matrices from the environment
            if isinstance(self._env.task.rew_fcn, QuadrErrRewFcn):
                # The environment uses a reward function compatible with the LQR
                Q = self._env.task.rew_fcn.Q
                R = self._env.task.rew_fcn.R
            else:
                # The environment does not use a reward function compatible with the LQR, apply some fine tuning
                Q = np.diag([1e2, 1e2, 5e2, 5e2, 1e-2, 1e-2, 5e0, 5e0])
                R = np.diag([1e-2, 1e-2])

            # Solve the continuous time Riccati eq
            K, _, self.eigvals = control.lqr(A, B, Q, R)  # for discrete system pass dt
            ctrl_gains = to.from_numpy(K).to(to.get_default_dtype())

        else:
            raise pyrado.TypeErr(given=inner_env(self._env), expected_type=[BallOnPlate5DSim, QBallBalancerSim])

        # Assign the controller gains
        self._policy.init_param(-1 * ctrl_gains)  # in classical control it is u = -K*x; here a = psi(s)*s

        # Sample rollouts to evaluate the LQR
        ros = self.sampler.sample()

        # Logging
        rets = [ro.undiscounted_return() for ro in ros]
        self.logger.add_value("max return", np.max(rets), 4)
        self.logger.add_value("median return", np.median(rets), 4)
        self.logger.add_value("min return", np.min(rets), 4)
        self.logger.add_value("avg return", np.mean(rets), 4)
        self.logger.add_value("std return", np.std(rets), 4)
        self.logger.add_value("avg rollout len", np.mean([ro.length for ro in ros]), 4)
        self.logger.add_value("num total samples", self._cnt_samples)
        self.logger.add_value(
            "min mag policy param", self._policy.param_values[to.argmin(abs(self._policy.param_values))]
        )
        self.logger.add_value(
            "max mag policy param", self._policy.param_values[to.argmax(abs(self._policy.param_values))]
        )

        # Save snapshot data
        self.make_snapshot(snapshot_mode, float(np.mean(rets)), meta_info)

        self.stopping_criterion = self.stopping_criterion | CustomStoppingCriterion(self._custom_stopping_criterion)
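The Riccati step above relies on the optional `control` package (python-control) and the classical convention u = -K*x, which is why `-1 * ctrl_gains` is assigned to the policy. A minimal standalone sketch of the same `control.lqr` call on a made-up double integrator, only to illustrate the interface:

import numpy as np
import control  # python-control, imported lazily in the step above

# Double integrator: state x = [position, velocity], input u = acceleration
A = np.array([[0.0, 1.0], [0.0, 0.0]])
B = np.array([[0.0], [1.0]])
Q = np.diag([1.0, 0.1])  # state weighting
R = np.array([[0.01]])  # actuation weighting

# Continuous-time LQR: gain K (for u = -K @ x), Riccati solution S, closed-loop eigenvalues
K, S, eigvals = control.lqr(A, B, Q, R)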
Example #8
    def __init__(
        self,
        save_dir: pyrado.PathLike,
        env: DomainRandWrapperBuffer,
        subrtn_cand: Algorithm,
        subrtn_refs: Algorithm,
        max_iter: int,
        alpha: float,
        beta: float,
        nG: int,
        nJ: int,
        ntau: int,
        nc_init: int,
        nr_init: int,
        sequence_cand: callable,
        sequence_refs: callable,
        warmstart_cand: bool = False,
        warmstart_refs: bool = True,
        cand_policy_param_init: Optional[to.Tensor] = None,
        cand_critic_param_init: Optional[to.Tensor] = None,
        num_bs_reps: int = 1000,
        studentized_ci: bool = False,
        base_seed: int = None,
        logger: Optional[StepLogger] = None,
    ):
        """
        Constructor

        :param save_dir: directory to save the snapshots, i.e. the results, in
        :param env: the environment in which the policy operates
        :param subrtn_cand: the algorithm that is called at every iteration of SPOTA to yield a candidate policy
        :param subrtn_refs: the algorithm that is called at every iteration of SPOTA to yield reference policies
        :param max_iter: maximum number of iterations that the SPOTA algorithm runs.
                         Each of these iterations includes multiple iterations of the subroutine.
        :param alpha: confidence level for the upper confidence bound (UCBOG)
        :param beta: optimality gap threshold for training
        :param nG: number of reference solutions
        :param nJ: number of samples for Monte-Carlo approximation of the optimality gap
        :param ntau: number of rollouts per domain parameter set
        :param nc_init: initial number of domains for training the candidate solution
        :param nr_init: initial number of domains for training the reference solutions
        :param sequence_cand: mathematical sequence for the number of domains for training the candidate solution
        :param sequence_refs: mathematical sequence for the number of domains for training the reference solutions
        :param warmstart_cand: flag if the next candidate solution should be initialized with the previous one
        :param warmstart_refs: flag if the reference solutions should be initialized with the current candidate
        :param cand_policy_param_init: initial policy parameter values for the candidate, set None to be random
        :param cand_critic_param_init: initial critic parameter values for the candidate, set None to be random
        :param num_bs_reps: number of replications for the statistical bootstrap
        :param studentized_ci: flag if a Student's t-distribution should be used for the confidence interval
        :param base_seed: seed added to all other seeds in order to make the experiments distinct but repeatable
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not typed_env(env, DomainRandWrapperBuffer):  # there is a domain randomization wrapper
            raise pyrado.TypeErr(msg="There must be a DomainRandWrapperBuffer in the environment chain.")
        if not isinstance(subrtn_cand, Algorithm):
            raise pyrado.TypeErr(given=subrtn_cand, expected_type=Algorithm)
        if not isinstance(subrtn_refs, Algorithm):
            raise pyrado.TypeErr(given=subrtn_refs, expected_type=Algorithm)

        # Call InterruptableAlgorithm's constructor without specifying the policy
        super().__init__(num_checkpoints=2,
                         save_dir=save_dir,
                         max_iter=max_iter,
                         policy=None,
                         logger=logger)

        # Get the randomized environment (recommended to make it the most outer one in the chain)
        self.env_dr = typed_env(env, DomainRandWrapperBuffer)

        # Candidate and reference solutions, and optimality gap
        self.Gn_diffs = None
        self.ucbog = pyrado.inf  # upper confidence bound on the optimality gap
        self._subrtn_cand = subrtn_cand
        self._subrtn_refs = subrtn_refs
        assert id(self._subrtn_cand) != id(self._subrtn_refs)
        assert id(self._subrtn_cand.policy) != id(self._subrtn_refs.policy)
        assert id(self._subrtn_cand.expl_strat) != id(
            self._subrtn_refs.expl_strat)
        self._subrtn_cand.save_name = "subrtn_cand"
        self._subrtn_refs.save_name = "subrtn_refs"

        self.alpha = alpha
        self.beta = beta
        self.warmstart_cand = warmstart_cand
        self.warmstart_refs = warmstart_refs
        self.cand_policy_param_init = cand_policy_param_init.detach() if cand_policy_param_init is not None else None
        self.cand_critic_param_init = cand_critic_param_init.detach() if cand_critic_param_init is not None else None
        self.nG = nG
        self.nJ = nJ
        self.ntau = ntau
        self.nc_init = nc_init
        self.nr_init = nr_init
        self.seq_cand = sequence_cand
        self.seq_ref = sequence_refs
        self.num_bs_reps = num_bs_reps
        self.studentized_ci = studentized_ci
        self.base_seed = np.random.randint(
            low=10000) if base_seed is None else base_seed

        # Save initial environment and randomizer
        self.save_snapshot(meta_info=None)

        self.stopping_criterion = self.stopping_criterion | CustomStoppingCriterion(
            self._custom_stopping_criterion)
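As in the previous examples, the last line composes a custom criterion with the inherited one. Given that SPOTA tracks the upper confidence bound on the optimality gap (`ucbog`) and the threshold `beta`, a plausible sketch of the check (an assumption, not the verified source) is:

    # Sketch (assumption): stop once the UCBOG has dropped below the optimality gap threshold `beta`
    @staticmethod
    def _custom_stopping_criterion(algo) -> bool:
        return algo.ucbog < algo.beta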