Example #1
    def argmax_posterior_mean(cands: to.Tensor, cands_values: to.Tensor,
                              ddp_space: BoxSpace, num_restarts: int,
                              num_samples: int) -> to.Tensor:
        """
        Compute the GP input with the maximal posterior mean.

        :param cands: candidates a.k.a. x
        :param cands_values: observed values a.k.a. y
        :param ddp_space: space of the domain distribution parameters, indicates the lower and upper bound
        :param num_restarts: number of restarts for the optimization of the acquisition function
        :param num_samples: number of samples for the optimization of the acquisition function
        :return: un-normalized candidate with the maximal posterior mean a.k.a. x
        """
        if not isinstance(cands, to.Tensor):
            raise pyrado.TypeErr(given=cands, expected_type=to.Tensor)
        if not isinstance(cands_values, to.Tensor):
            raise pyrado.TypeErr(given=cands_values, expected_type=to.Tensor)
        if not isinstance(ddp_space, BoxSpace):
            raise pyrado.TypeErr(given=ddp_space, expected_type=BoxSpace)

        # Normalize the input data and standardize the output data
        uc_projector = UnitCubeProjector(
            to.from_numpy(ddp_space.bound_lo).to(dtype=to.get_default_dtype()),
            to.from_numpy(ddp_space.bound_up).to(dtype=to.get_default_dtype()),
        )
        cands_norm = uc_projector.project_to(cands)
        cands_values_stdized = standardize(cands_values)

        if cands_norm.shape[0] > cands_values.shape[0]:
            print_cbt(
                f"There are {cands.shape[0]} candidates but only {cands_values.shape[0]} evaluations. Ignoring "
                f"the candidates without evaluation for computing the argmax.",
                "y",
            )
            cands_norm = cands_norm[:cands_values.shape[0], :]

        # Create and fit the GP model
        gp = SingleTaskGP(cands_norm, cands_values_stdized)
        gp.likelihood.noise_covar.register_constraint("raw_noise",
                                                      GreaterThan(1e-5))
        mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
        fit_gpytorch_model(mll)

        # Find position with maximal posterior mean
        cand_norm, _ = optimize_acqf(
            acq_function=PosteriorMean(gp),
            bounds=to.stack(
                [to.zeros(ddp_space.flat_dim),
                 to.ones(ddp_space.flat_dim)]).to(dtype=to.float32),
            q=1,
            num_restarts=num_restarts,
            raw_samples=num_samples,
        )

        cand_norm = cand_norm.to(dtype=to.get_default_dtype())
        cand = uc_projector.project_back(cand_norm.detach())
        print_cbt(f"Converged to argmax of the posterior mean: {cand.numpy()}",
                  "g",
                  bright=True)
        return cand
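A minimal, self-contained sketch of the same pattern in plain BoTorch, without pyrado's projector and error types; the two-dimensional toy data and all hyper-parameter values are made up for illustration, and on recent BoTorch versions `fit_gpytorch_model` has been superseded by `fit_gpytorch_mll`:

    import torch
    from botorch.acquisition import PosteriorMean
    from botorch.fit import fit_gpytorch_model
    from botorch.models import SingleTaskGP
    from botorch.optim import optimize_acqf
    from gpytorch.mlls import ExactMarginalLogLikelihood

    # Toy data: candidates already normalized to the unit cube, values already standardized
    train_x = torch.rand(10, 2, dtype=torch.double)
    train_y = -(train_x - 0.5).pow(2).sum(dim=1, keepdim=True)

    # Fit the GP (same model class as above)
    gp = SingleTaskGP(train_x, train_y)
    mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
    fit_gpytorch_model(mll)

    # Maximize the posterior mean over the unit cube
    bounds = torch.stack([torch.zeros(2), torch.ones(2)]).to(dtype=torch.double)
    cand_norm, _ = optimize_acqf(
        acq_function=PosteriorMean(gp), bounds=bounds, q=1, num_restarts=5, raw_samples=64
    )
    print(cand_norm)  # still in unit-cube coordinates; project back to the original bounds afterwards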
Example #2
    def gae(self, concat_ros: StepSequence, v_pred: to.Tensor = None, requires_grad: bool = False) -> to.Tensor:
        """
        Compute the generalized advantage estimation as described in [1].

        :param concat_ros: concatenated rollouts (sequence of steps from potentially different rollouts)
        :param v_pred: state-value predictions if already computed, else pass None
        :param requires_grad: is the gradient required
        :return adv: tensor of advantages
        """
        with ExitStack() as stack:
            if not requires_grad:
                stack.enter_context(to.no_grad())
            if v_pred is None:
                # Get the predictions from the value function
                v_pred = self.values(concat_ros)

            # Compute the advantages
            adv = to.empty_like(v_pred)
            for k in reversed(range(concat_ros.length)):
                if concat_ros[k].done:
                    adv[k] = concat_ros[k].reward - v_pred[k]
                else:
                    adv[k] = concat_ros[k].reward + self.gamma*v_pred[k + 1] - v_pred[k] + \
                             self.gamma*self.lamda*adv[k + 1]

            if self.standardize_adv:
                if isinstance(self.standardizer, RunningStandardizer):
                    adv = self.standardizer(adv, axis=0)
                else:
                    adv = standardize(adv)

            return adv
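For reference, the same GAE recursion on a tiny hand-made rollout in plain PyTorch; the rewards, value predictions, and the discount/trace parameters are all made up, and the last step is marked done so the backward recursion never indexes past the end:

    import torch as to

    gamma, lamda = 0.99, 0.95
    rewards = to.tensor([1.0, 1.0, 1.0, 0.0])
    v_pred = to.tensor([0.5, 0.6, 0.7, 0.2])
    done = [False, False, False, True]

    adv = to.empty_like(v_pred)
    for k in reversed(range(len(rewards))):
        if done[k]:
            adv[k] = rewards[k] - v_pred[k]
        else:
            adv[k] = rewards[k] + gamma*v_pred[k + 1] - v_pred[k] + gamma*lamda*adv[k + 1]

    adv = (adv - adv.mean())/adv.std()  # optional standardization, as in the branch above
    print(adv)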
Example #3
    def update(self,
               param_results: ParameterSamplingResult,
               ret_avg_curr: float = None):
        # Average the return values over the rollouts
        rets_avg_ros = param_results.mean_returns

        # Get the perturbations (deltas from the current policy parameters)
        s = param_results.parameters - self._policy.param_values
        # Also divide by the standard deviation to fully standardize
        s /= self._expl_strat.std

        if self.transform_returns:
            # Sort in descending order according to the return values (best candidates first)
            idcs_acs = np.argsort(rets_avg_ros)[::-1]
            s_asc = s[list(idcs_acs), :]

            # Update the mean (see [1, 2])
            delta_mean = self._expl_strat.std * (self.eta_mean_util @ s_asc)
            self._policy.param_values += self.lr_mean * delta_mean

            # Update the std (see [1, 2])
            grad_std = self.eta_std_util @ (s_asc**2 - 1.)
            new_std = self._expl_strat.std * to.exp(
                self.lr_std * grad_std / 2.)
            self._expl_strat.adapt(std=new_std)

        else:
            # Standardize averaged returns over all pop_size rollouts
            rets_stdized = standardize(rets_avg_ros)
            rets_stdized = to.from_numpy(rets_stdized).to(
                to.get_default_dtype())

            # delta_mean = 1./len(param_results) * (rets_stdized @ s)
            delta_mean = 1. / (self._expl_strat.std *
                               len(param_results)) * (rets_stdized @ s)
            self._policy.param_values += self.lr_mean * delta_mean

            # Update the std (monotonic exponential decay)
            new_std = self._expl_strat.std * 0.999**self._curr_iter
            self._expl_strat.adapt(std=new_std)

        self.logger.add_value('min expl strat std',
                              to.min(self._expl_strat.std), 4)
        self.logger.add_value('avg expl strat std',
                              to.mean(self._expl_strat.std), 4)
        self.logger.add_value('max expl strat std',
                              to.max(self._expl_strat.std), 4)
        self.logger.add_value('expl strat entropy',
                              self._expl_strat.get_entropy(), 4)
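A toy, self-contained sketch of the non-transformed branch above (standardized returns times standardized perturbations); the population size, exploration std, and learning rate are invented for illustration:

    import torch as to

    pop_size, param_dim = 8, 3
    std, lr_mean = 0.5, 1e-2
    policy_params = to.zeros(param_dim)

    # Perturbed parameter sets and their (made-up) average returns over the rollouts
    s = std*to.randn(pop_size, param_dim)  # deltas from the current policy parameters
    rets_avg_ros = to.randn(pop_size)

    # Standardize the returns and the perturbations, then take the gradient-ascent step on the mean
    s /= std
    rets_stdized = (rets_avg_ros - rets_avg_ros.mean())/rets_avg_ros.std()
    delta_mean = 1./(std*pop_size)*(rets_stdized @ s)
    policy_params = policy_params + lr_mean*delta_mean
    print(policy_params)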
Example #4
    def step(self, snapshot_mode: str = 'latest', meta_info: dict = None):
        # Save snapshot to save the correct iteration count
        self.save_snapshot()

        if self.curr_checkpoint == -2:
            # Train the initial policies in the source domain
            self.train_init_policies()
            self.reached_checkpoint()  # setting counter to -1

        if self.curr_checkpoint == -1:
            # Evaluate the initial policies in the target domain
            self.eval_init_policies()
            self.reached_checkpoint()  # setting counter to 0

        if self.curr_checkpoint == 0:
            # Normalize the input data and standardize the output data
            cands_norm = self.ddp_projector.project_to(self.cands)
            cands_values_stdized = standardize(self.cands_values).unsqueeze(1)

            # Create and fit the GP model
            gp = SingleTaskGP(cands_norm, cands_values_stdized)
            gp.likelihood.noise_covar.register_constraint('raw_noise', GreaterThan(1e-5))
            mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
            fit_gpytorch_model(mll)
            print_cbt('Fitted the GP.', 'g')

            # Acquisition functions
            if self.acq_fcn_type == 'UCB':
                acq_fcn = UpperConfidenceBound(gp, beta=self.acq_param.get('beta', 0.1), maximize=True)
            elif self.acq_fcn_type == 'EI':
                acq_fcn = ExpectedImprovement(gp, best_f=cands_values_stdized.max().item(), maximize=True)
            elif self.acq_fcn_type == 'PI':
                acq_fcn = ProbabilityOfImprovement(gp, best_f=cands_values_stdized.max().item(), maximize=True)
            else:
                raise pyrado.ValueErr(given=self.acq_fcn_type, eq_constraint="'UCB', 'EI', 'PI'")

            # Optimize acquisition function and get new candidate point
            cand_norm, acq_value = optimize_acqf(
                acq_function=acq_fcn,
                bounds=to.stack([to.zeros(self.ddp_space.flat_dim), to.ones(self.ddp_space.flat_dim)]),
                q=1,
                num_restarts=self.acq_restarts,
                raw_samples=self.acq_samples
            )
            next_cand = self.ddp_projector.project_back(cand_norm)
            print_cbt(f'Found the next candidate: {next_cand.numpy()}', 'g')
            self.cands = to.cat([self.cands, next_cand], dim=0)
            pyrado.save(self.cands, 'candidates', 'pt', self.save_dir, meta_info)
            self.reached_checkpoint()  # setting counter to 1

        if self.curr_checkpoint == 1:
            # Train and evaluate a new policy, repeat if the resulting policy did not exceed the success threshold
            wrapped_trn_fcn = until_thold_exceeded(
                self.thold_succ_subrtn.item(), self.max_subrtn_rep
            )(self.train_policy_sim)
            wrapped_trn_fcn(self.cands[-1, :], prefix=f'iter_{self._curr_iter}')
            self.reached_checkpoint()  # setting counter to 2

        if self.curr_checkpoint == 2:
            # Evaluate the current policy in the target domain
            policy = pyrado.load(self.policy, 'policy', 'pt', self.save_dir,
                                 meta_info=dict(prefix=f'iter_{self._curr_iter}'))
            self.curr_cand_value = self.eval_policy(
                self.save_dir, self._env_real, policy, self.mc_estimator, f'iter_{self._curr_iter}',
                self.num_eval_rollouts_real
            )
            self.cands_values = to.cat([self.cands_values, self.curr_cand_value.view(1)], dim=0)
            pyrado.save(self.cands_values, 'candidates_values', 'pt', self.save_dir, meta_info)

            # Store the argmax after training and evaluating
            curr_argmax_cand = BayRn.argmax_posterior_mean(
                self.cands, self.cands_values.unsqueeze(1), self.ddp_space, self.acq_restarts, self.acq_samples
            )
            self.argmax_cand = to.cat([self.argmax_cand, curr_argmax_cand], dim=0)
            pyrado.save(self.argmax_cand, 'candidates_argmax', 'pt', self.save_dir, meta_info)
            self.reached_checkpoint()  # setting counter to 0
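The acquisition-function branch in checkpoint 0 maps directly onto BoTorch's analytic acquisition classes; below is a hedged sketch with toy, already-normalized data, assuming the same 'UCB'/'EI'/'PI' options as above (afterwards `optimize_acqf` would be called on `acq_fcn` exactly as in Example #1):

    import torch
    from botorch.acquisition import (ExpectedImprovement, ProbabilityOfImprovement,
                                     UpperConfidenceBound)
    from botorch.fit import fit_gpytorch_model
    from botorch.models import SingleTaskGP
    from gpytorch.mlls import ExactMarginalLogLikelihood

    train_x = torch.rand(12, 2, dtype=torch.double)              # normalized candidates
    train_y = -(train_x - 0.5).pow(2).sum(dim=1, keepdim=True)   # toy values (standardize in practice)

    gp = SingleTaskGP(train_x, train_y)
    fit_gpytorch_model(ExactMarginalLogLikelihood(gp.likelihood, gp))

    acq_fcn_type = 'EI'  # one of 'UCB', 'EI', 'PI'
    if acq_fcn_type == 'UCB':
        acq_fcn = UpperConfidenceBound(gp, beta=0.1, maximize=True)
    elif acq_fcn_type == 'EI':
        acq_fcn = ExpectedImprovement(gp, best_f=train_y.max().item(), maximize=True)
    elif acq_fcn_type == 'PI':
        acq_fcn = ProbabilityOfImprovement(gp, best_f=train_y.max().item(), maximize=True)
    else:
        raise ValueError(f"Unknown acquisition function type: {acq_fcn_type}")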
Example #5
    def __init__(
        self,
        data: to.Tensor,
        window_size: int,
        ratio_train: float,
        standardize_data: bool = False,
        scale_min_max_data: bool = False,
        name: str = "UnnamedDataSet",
    ):
        r"""
        Constructor

        :param data: complete raw data set, where the samples are along the first dimension
        :param window_size: length of the sequences fed to the policy for predicting the next value
        :param ratio_train: ratio of the training samples w.r.t. the total sample count
        :param standardize_data: if `True`, the data is standardized to be $\sim N(0,1)$
        :param scale_min_max_data: if `True`, the data is scaled to be $\in [-1, 1]$
        :param name: descriptive name for the data set
        """
        if not isinstance(data, to.Tensor):
            raise pyrado.TypeErr(given=data, expected_type=to.Tensor)
        if not isinstance(window_size, int):
            raise pyrado.TypeErr(given=window_size, expected_type=int)
        if window_size < 1:
            raise pyrado.ValueErr(given=window_size, ge_constraint="1")
        if not isinstance(ratio_train, float):
            raise pyrado.TypeErr(given=ratio_train, expected_type=float)
        if not (0 < ratio_train < 1):
            raise pyrado.ValueErr(given=ratio_train,
                                  g_constraint="0",
                                  l_constraint="1")
        if standardize_data and scale_min_max_data:
            raise pyrado.ValueErr(
                msg="Standardizing and min-max scaling the data at the same time is not supported!"
            )

        self.data_all_raw = to.atleast_2d(
            data).T if data.ndim == 1 else data  # samples along rows
        self._ratio_train = ratio_train
        self._window_size = window_size
        self.name = name

        # Process the data
        self.is_standardized, self.is_scaled = False, False
        if standardize_data:
            self.data_all = standardize(self.data_all_raw)  # ~ N(0,1)
            self.is_standardized = True
        elif scale_min_max_data:
            self.data_all = scale_min_max(self.data_all_raw, -1,
                                          1)  # in [-1, 1]
            self.is_scaled = True
        else:
            self.data_all = self.data_all_raw

        # Split the data into training and testing data
        self.data_trn = self.data_all[:self.num_samples_trn]
        self.data_tst = self.data_all[self.num_samples_trn:]

        # Targets are the next time steps
        self.data_all_inp = self.data_all[:-1]
        self.data_trn_inp = self.data_trn[:-1]
        self.data_tst_inp = self.data_tst[:-1]
        self.data_all_targ = self.data_all[1:]
        self.data_trn_targ = self.data_trn[1:]
        self.data_tst_targ = self.data_tst[1:]

        # Create sequences
        self.data_trn_ws = self.cut_to_window_size(self.data_trn,
                                                   self._window_size)
        self.data_tst_ws = self.cut_to_window_size(self.data_tst,
                                                   self._window_size)
        self.data_trn_seqs = create_sequences(self.data_trn_ws,
                                              len_seq=self._window_size + 1)
        self.data_tst_seqs = create_sequences(self.data_tst_ws,
                                              len_seq=self._window_size + 1)

        print_cbt(f"Created {str(self)}", "w")
Example #6
    def update(self):
        """ Update the policy's and Q-functions' parameters on transitions sampled from the replay memory. """
        # Containers for logging
        expl_strat_stds = to.zeros(self.num_batch_updates)
        qfcn_1_losses = to.zeros(self.num_batch_updates)
        qfcn_2_losses = to.zeros(self.num_batch_updates)
        qfcn_1_grad_norm = to.zeros(self.num_batch_updates)
        qfcn_2_grad_norm = to.zeros(self.num_batch_updates)
        policy_losses = to.zeros(self.num_batch_updates)
        policy_grad_norm = to.zeros(self.num_batch_updates)

        for b in tqdm(range(self.num_batch_updates), total=self.num_batch_updates,
                      desc=f'Updating', unit='batches', file=sys.stdout, leave=False):
            # Sample steps and the associated next step from the replay memory
            steps, next_steps = self._memory.sample(self.batch_size)
            steps.torch(data_type=to.get_default_dtype())
            next_steps.torch(data_type=to.get_default_dtype())

            # Standardize and optionally scale the rewards
            if self.standardize_rew:
                rewards = standardize(steps.rewards).unsqueeze(1)
            else:
                rewards = steps.rewards.unsqueeze(1)
            rewards *= self.rew_scale

            # Explore and compute the current log probs (later used for policy update)
            if self.policy.is_recurrent:
                act_expl, log_probs_expl, _ = self._expl_strat(steps.observations, steps.hidden_states)
            else:
                act_expl, log_probs_expl = self._expl_strat(steps.observations)
            expl_strat_stds[b] = to.mean(self._expl_strat.std.data)

            # Update the entropy coefficient
            if self.learn_ent_coeff:
                # Compute entropy coefficient loss
                ent_coeff_loss = -to.mean(self._log_ent_coeff*(log_probs_expl.detach() + self.target_entropy))
                self._ent_coeff_optim.zero_grad()
                ent_coeff_loss.backward()
                self._ent_coeff_optim.step()

            with to.no_grad():
                # Create masks for the non-final observations
                not_done = to.tensor(1. - steps.done, dtype=to.get_default_dtype()).unsqueeze(1)

                # Compute the (next)state-(next)action values Q(s',a') from the target networks
                if self.policy.is_recurrent:
                    next_act_expl, next_log_probs, _ = self._expl_strat(next_steps.observations,
                                                                        next_steps.hidden_states)
                else:
                    next_act_expl, next_log_probs = self._expl_strat(next_steps.observations)
                next_q_val_target_1 = self.qfcn_targ_1(to.cat([next_steps.observations, next_act_expl], dim=1))
                next_q_val_target_2 = self.qfcn_targ_2(to.cat([next_steps.observations, next_act_expl], dim=1))
                next_q_val_target_min = to.min(next_q_val_target_1, next_q_val_target_2)
                next_q_val_target_min -= self.ent_coeff*next_log_probs  # add entropy term
                # TD target (including the entropy term)
                next_q_val = rewards + not_done*self.gamma*next_q_val_target_min  # [4] does not use the reward here

            # Compute the (current)state-(current)action values Q(s,a) from the two Q-networks
            # E_{(s_t, a_t) ~ D} [1/2 * (Q_i(s_t, a_t) - r_t - gamma * E_{s_{t+1} ~ p} [V(s_{t+1})] )^2]
            q_val_1 = self.qfcn_1(to.cat([steps.observations, steps.actions], dim=1))
            q_val_2 = self.qfcn_2(to.cat([steps.observations, steps.actions], dim=1))
            q_1_loss = nn.functional.mse_loss(q_val_1, next_q_val)
            q_2_loss = nn.functional.mse_loss(q_val_2, next_q_val)
            q_loss = (q_1_loss + q_2_loss)/2.  # averaging the two Q-function losses is taken from [3]
            qfcn_1_losses[b] = q_1_loss.data
            qfcn_2_losses[b] = q_2_loss.data

            # Update the Q-fcns
            self._optim_qfcns.zero_grad()
            q_loss.backward()
            qfcn_1_grad_norm[b] = self.clip_grad(self.qfcn_1, None)
            qfcn_2_grad_norm[b] = self.clip_grad(self.qfcn_2, None)
            self._optim_qfcns.step()

            # Compute the policy loss
            # E_{s_t ~ D, eps_t ~ N} [log( pi( f(eps_t; s_t) ) ) - Q(s_t, f(eps_t; s_t))]
            q_1_val_expl = self.qfcn_1(to.cat([steps.observations, act_expl], dim=1))
            q_2_val_expl = self.qfcn_2(to.cat([steps.observations, act_expl], dim=1))
            min_q_val_expl = to.min(q_1_val_expl, q_2_val_expl)
            policy_loss = to.mean(self.ent_coeff*log_probs_expl - min_q_val_expl)  # self.ent_coeff is detached
            policy_losses[b] = policy_loss.data

            # Update the policy
            self._optim_policy.zero_grad()
            policy_loss.backward()
            policy_grad_norm[b] = self.clip_grad(self._expl_strat.policy, self.max_grad_norm)
            self._optim_policy.step()

            # Soft-update the target networks
            if (self._curr_iter*self.num_batch_updates + b)%self.target_update_intvl == 0:
                SAC.soft_update(self.qfcn_targ_1, self.qfcn_1, self.tau)
                SAC.soft_update(self.qfcn_targ_2, self.qfcn_2, self.tau)

        # Update the learning rate if the schedulers have been specified
        if self._lr_scheduler_policy is not None:
            self._lr_scheduler_policy.step()
            self._lr_scheduler_qfcns.step()

        # Logging
        self.logger.add_value('Q1 loss', to.mean(qfcn_1_losses))
        self.logger.add_value('Q2 loss', to.mean(qfcn_2_losses))
        self.logger.add_value('policy loss', to.mean(policy_losses))
        self.logger.add_value('avg grad norm policy', to.mean(policy_grad_norm))
        self.logger.add_value('avg expl strat std', to.mean(expl_strat_stds))
        self.logger.add_value('ent_coeff', self.ent_coeff)
        if self._lr_scheduler_policy is not None:
            self.logger.add_value('avg lr policy', to.mean(self._lr_scheduler_policy.get_last_lr()), 6)
            self.logger.add_value('avg lr critic', to.mean(self._lr_scheduler_qfcns.get_last_lr()), 6)
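`SAC.soft_update` is a pyrado helper that is not shown above; the following is a generic Polyak-averaging sketch of what such a soft target update typically does (the exact meaning of `tau`, i.e. retain fraction vs. mix-in fraction, differs between implementations, so treat this as an assumption rather than pyrado's definition):

    import torch as to
    from torch import nn

    def soft_update(target: nn.Module, source: nn.Module, tau: float = 0.995):
        """Blend the source parameters into the target network in place (Polyak averaging)."""
        with to.no_grad():
            for p_targ, p_src in zip(target.parameters(), source.parameters()):
                p_targ.data.mul_(tau)
                p_targ.data.add_((1. - tau)*p_src.data)

    # Usage with two identically-structured Q-networks
    qfcn = nn.Linear(4, 1)
    qfcn_targ = nn.Linear(4, 1)
    qfcn_targ.load_state_dict(qfcn.state_dict())
    soft_update(qfcn_targ, qfcn, tau=0.995)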