def argmax_posterior_mean(cands: to.Tensor, cands_values: to.Tensor, ddp_space: BoxSpace,
                          num_restarts: int, num_samples: int) -> to.Tensor:
    """
    Compute the GP input with the maximal posterior mean.

    :param cands: candidates a.k.a. x
    :param cands_values: observed values a.k.a. y
    :param ddp_space: space of the domain distribution parameters, indicates the lower and upper bound
    :param num_restarts: number of restarts for the optimization of the acquisition function
    :param num_samples: number of samples for the optimization of the acquisition function
    :return: un-normalized candidate with maximum posterior value a.k.a. x
    """
    if not isinstance(cands, to.Tensor):
        raise pyrado.TypeErr(given=cands, expected_type=to.Tensor)
    if not isinstance(cands_values, to.Tensor):
        raise pyrado.TypeErr(given=cands_values, expected_type=to.Tensor)
    if not isinstance(ddp_space, BoxSpace):
        raise pyrado.TypeErr(given=ddp_space, expected_type=BoxSpace)

    # Normalize the input data and standardize the output data
    uc_projector = UnitCubeProjector(
        to.from_numpy(ddp_space.bound_lo).to(dtype=to.get_default_dtype()),
        to.from_numpy(ddp_space.bound_up).to(dtype=to.get_default_dtype()),
    )
    cands_norm = uc_projector.project_to(cands)
    cands_values_stdized = standardize(cands_values)

    if cands_norm.shape[0] > cands_values.shape[0]:
        print_cbt(
            f"There are {cands.shape[0]} candidates but only {cands_values.shape[0]} evaluations. Ignoring "
            f"the candidates without evaluation for computing the argmax.",
            "y",
        )
        cands_norm = cands_norm[:cands_values.shape[0], :]

    # Create and fit the GP model
    gp = SingleTaskGP(cands_norm, cands_values_stdized)
    gp.likelihood.noise_covar.register_constraint("raw_noise", GreaterThan(1e-5))
    mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
    fit_gpytorch_model(mll)

    # Find the position with the maximal posterior mean
    cand_norm, _ = optimize_acqf(
        acq_function=PosteriorMean(gp),
        bounds=to.stack([to.zeros(ddp_space.flat_dim), to.ones(ddp_space.flat_dim)]).to(dtype=to.float32),
        q=1,
        num_restarts=num_restarts,
        raw_samples=num_samples,
    )
    cand_norm = cand_norm.to(dtype=to.get_default_dtype())
    cand = uc_projector.project_back(cand_norm.detach())
    print_cbt(f"Converged to argmax of the posterior mean: {cand.numpy()}", "g", bright=True)
    return cand
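# Illustrative sketch (not part of the function above): the snippet below reproduces the same BoTorch
# pattern on toy data without the Pyrado helpers (UnitCubeProjector, print_cbt) -- fit a SingleTaskGP on
# inputs living in the unit cube and standardized targets, then maximize the posterior mean with
# optimize_acqf. The toy objective, tensor shapes, and optimizer settings are assumptions for the example.
import torch
from botorch.models import SingleTaskGP
from botorch.fit import fit_gpytorch_model
from botorch.acquisition import PosteriorMean
from botorch.optim import optimize_acqf
from gpytorch.mlls import ExactMarginalLogLikelihood


def _demo_argmax_posterior_mean() -> torch.Tensor:
    # Fake evaluations of a 2-dim objective, already normalized to the unit cube
    train_x = torch.rand(20, 2, dtype=torch.double)
    train_y = -(train_x - 0.3).pow(2).sum(dim=-1, keepdim=True)  # maximum near (0.3, 0.3)
    train_y = (train_y - train_y.mean()) / train_y.std()  # standardize the targets

    # Fit the GP by maximizing the marginal log-likelihood
    gp = SingleTaskGP(train_x, train_y)
    fit_gpytorch_model(ExactMarginalLogLikelihood(gp.likelihood, gp))

    # Maximize the posterior mean over the unit cube
    bounds = torch.stack([torch.zeros(2, dtype=torch.double), torch.ones(2, dtype=torch.double)])
    cand, _ = optimize_acqf(PosteriorMean(gp), bounds=bounds, q=1, num_restarts=5, raw_samples=64)
    return cand  # still normalized; a real use case would project it back to the original bounds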
def gae(self, concat_ros: StepSequence, v_pred: to.Tensor = None, requires_grad: bool = False) -> to.Tensor:
    """
    Compute the generalized advantage estimation (GAE) as described in [1].

    :param concat_ros: concatenated rollouts (sequence of steps from potentially different rollouts)
    :param v_pred: state-value predictions if already computed, else pass None
    :param requires_grad: is the gradient required
    :return: tensor of advantages
    """
    with ExitStack() as stack:
        if not requires_grad:
            stack.enter_context(to.no_grad())

        if v_pred is None:
            # Get the predictions from the value function
            v_pred = self.values(concat_ros)

        # Compute the advantages
        adv = to.empty_like(v_pred)
        for k in reversed(range(concat_ros.length)):
            if concat_ros[k].done:
                adv[k] = concat_ros[k].reward - v_pred[k]
            else:
                adv[k] = concat_ros[k].reward + self.gamma * v_pred[k + 1] - v_pred[k] + \
                         self.gamma * self.lamda * adv[k + 1]

        if self.standardize_adv:
            if isinstance(self.standardizer, RunningStandardizer):
                adv = self.standardizer(adv, axis=0)
            else:
                adv = standardize(adv)

        return adv
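# A standalone sketch of the same GAE recursion on plain tensors (no Pyrado StepSequence), added for
# illustration: delta_t = r_t + gamma*V(s_{t+1}) - V(s_t) and A_t = delta_t + gamma*lambda*A_{t+1},
# where the recursion is cut at terminal steps. Function name and signature are assumptions.
import torch


def gae_from_tensors(rewards: torch.Tensor, values: torch.Tensor, dones: torch.Tensor,
                     gamma: float = 0.99, lamda: float = 0.95) -> torch.Tensor:
    """`values` holds V(s_0), ..., V(s_T), i.e. one bootstrap value more than there are rewards."""
    num_steps = rewards.numel()
    adv = torch.zeros(num_steps)
    running = torch.tensor(0.)
    for k in reversed(range(num_steps)):
        if dones[k]:
            running = rewards[k] - values[k]  # no bootstrapping across episode boundaries
        else:
            delta = rewards[k] + gamma * values[k + 1] - values[k]
            running = delta + gamma * lamda * running
        adv[k] = running
    return adv


# Usage sketch: 3 steps, the last one terminal
# rewards = torch.tensor([1., 0., 1.]); values = torch.tensor([0.5, 0.4, 0.3, 0.]); dones = torch.tensor([0., 0., 1.])
# print(gae_from_tensors(rewards, values, dones))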
def update(self, param_results: ParameterSamplingResult, ret_avg_curr: float = None):
    """
    Update the policy parameters and the exploration strategy's standard deviation from the sampled
    parameter perturbations and the returns they achieved.

    :param param_results: result of the parameter exploration, i.e. the perturbed parameter sets and their returns
    :param ret_avg_curr: average return of the current policy (not used by this update rule)
    """
    # Average the return values over the rollouts
    rets_avg_ros = param_results.mean_returns

    # Get the perturbations (deltas from the current policy parameters)
    s = param_results.parameters - self._policy.param_values
    # also divide by the standard deviation to fully standardize
    s /= self._expl_strat.std

    if self.transform_returns:
        # Sort the perturbations descending by return value (reversed ascending argsort), best candidate first
        idcs_acs = np.argsort(rets_avg_ros)[::-1]
        s_asc = s[list(idcs_acs), :]

        # Update the mean (see [1, 2])
        delta_mean = self._expl_strat.std * (self.eta_mean_util @ s_asc)
        self._policy.param_values += self.lr_mean * delta_mean

        # Update the std (see [1, 2])
        grad_std = self.eta_std_util @ (s_asc**2 - 1.)
        new_std = self._expl_strat.std * to.exp(self.lr_std * grad_std / 2.)
        self._expl_strat.adapt(std=new_std)

    else:
        # Standardize averaged returns over all pop_size rollouts
        rets_stdized = standardize(rets_avg_ros)
        rets_stdized = to.from_numpy(rets_stdized).to(to.get_default_dtype())

        # delta_mean = 1./len(param_results) * (rets_stdized @ s)
        delta_mean = 1. / (self._expl_strat.std * len(param_results)) * (rets_stdized @ s)
        self._policy.param_values += self.lr_mean * delta_mean

        # Update the std (monotonous exponential decay)
        new_std = self._expl_strat.std * 0.999**self._curr_iter
        self._expl_strat.adapt(std=new_std)

    self.logger.add_value('min expl strat std', to.min(self._expl_strat.std), 4)
    self.logger.add_value('avg expl strat std', to.mean(self._expl_strat.std), 4)
    self.logger.add_value('max expl strat std', to.max(self._expl_strat.std), 4)
    self.logger.add_value('expl strat entropy', self._expl_strat.get_entropy(), 4)
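# A minimal sketch (plain torch, hypothetical names) of the update rule in the `else` branch above:
# standardize the per-candidate returns and move the policy parameters along the return-weighted average
# of the standardized perturbations; the std decay is left to the caller. Only meant as an illustration
# of the math, not as a drop-in replacement for the method.
import torch


def es_mean_update(params: torch.Tensor, candidates: torch.Tensor, returns: torch.Tensor,
                   std: torch.Tensor, lr_mean: float = 1e-2) -> torch.Tensor:
    """`candidates` has shape [pop_size, num_params], `returns` has shape [pop_size]."""
    s = (candidates - params) / std  # standardized parameter perturbations
    rets_stdized = (returns - returns.mean()) / returns.std()
    delta_mean = (rets_stdized @ s) / (std * len(returns))
    return params + lr_mean * delta_mean


# Usage sketch on a toy quadratic objective (maximization pulls the parameters toward zero)
# params, std = torch.ones(4), torch.ones(4)
# candidates = params + std * torch.randn(32, 4)
# returns = -candidates.pow(2).sum(dim=1)
# params = es_mean_update(params, candidates, returns, std)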
def step(self, snapshot_mode: str = 'latest', meta_info: dict = None):
    # Save snapshot to save the correct iteration count
    self.save_snapshot()

    if self.curr_checkpoint == -2:
        # Train the initial policies in the source domain
        self.train_init_policies()
        self.reached_checkpoint()  # setting counter to -1

    if self.curr_checkpoint == -1:
        # Evaluate the initial policies in the target domain
        self.eval_init_policies()
        self.reached_checkpoint()  # setting counter to 0

    if self.curr_checkpoint == 0:
        # Normalize the input data and standardize the output data
        cands_norm = self.ddp_projector.project_to(self.cands)
        cands_values_stdized = standardize(self.cands_values).unsqueeze(1)

        # Create and fit the GP model
        gp = SingleTaskGP(cands_norm, cands_values_stdized)
        gp.likelihood.noise_covar.register_constraint('raw_noise', GreaterThan(1e-5))
        mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
        fit_gpytorch_model(mll)
        print_cbt('Fitted the GP.', 'g')

        # Acquisition functions
        if self.acq_fcn_type == 'UCB':
            acq_fcn = UpperConfidenceBound(gp, beta=self.acq_param.get('beta', 0.1), maximize=True)
        elif self.acq_fcn_type == 'EI':
            acq_fcn = ExpectedImprovement(gp, best_f=cands_values_stdized.max().item(), maximize=True)
        elif self.acq_fcn_type == 'PI':
            acq_fcn = ProbabilityOfImprovement(gp, best_f=cands_values_stdized.max().item(), maximize=True)
        else:
            raise pyrado.ValueErr(given=self.acq_fcn_type, eq_constraint="'UCB', 'EI', 'PI'")

        # Optimize acquisition function and get new candidate point
        cand_norm, acq_value = optimize_acqf(
            acq_function=acq_fcn,
            bounds=to.stack([to.zeros(self.ddp_space.flat_dim), to.ones(self.ddp_space.flat_dim)]),
            q=1,
            num_restarts=self.acq_restarts,
            raw_samples=self.acq_samples,
        )
        next_cand = self.ddp_projector.project_back(cand_norm)
        print_cbt(f'Found the next candidate: {next_cand.numpy()}', 'g')
        self.cands = to.cat([self.cands, next_cand], dim=0)
        pyrado.save(self.cands, 'candidates', 'pt', self.save_dir, meta_info)
        self.reached_checkpoint()  # setting counter to 1

    if self.curr_checkpoint == 1:
        # Train and evaluate a new policy, repeat if the resulting policy did not exceed the success threshold
        wrapped_trn_fcn = until_thold_exceeded(
            self.thold_succ_subrtn.item(), self.max_subrtn_rep
        )(self.train_policy_sim)
        wrapped_trn_fcn(self.cands[-1, :], prefix=f'iter_{self._curr_iter}')
        self.reached_checkpoint()  # setting counter to 2

    if self.curr_checkpoint == 2:
        # Evaluate the current policy in the target domain
        policy = pyrado.load(self.policy, 'policy', 'pt', self.save_dir,
                             meta_info=dict(prefix=f'iter_{self._curr_iter}'))
        self.curr_cand_value = self.eval_policy(
            self.save_dir, self._env_real, policy, self.mc_estimator,
            f'iter_{self._curr_iter}', self.num_eval_rollouts_real
        )
        self.cands_values = to.cat([self.cands_values, self.curr_cand_value.view(1)], dim=0)
        pyrado.save(self.cands_values, 'candidates_values', 'pt', self.save_dir, meta_info)

        # Store the argmax after training and evaluating
        curr_argmax_cand = BayRn.argmax_posterior_mean(
            self.cands, self.cands_values.unsqueeze(1), self.ddp_space, self.acq_restarts, self.acq_samples
        )
        self.argmax_cand = to.cat([self.argmax_cand, curr_argmax_cand], dim=0)
        pyrado.save(self.argmax_cand, 'candidates_argmax', 'pt', self.save_dir, meta_info)
        self.reached_checkpoint()  # setting counter to 0
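# A self-contained sketch of the acquisition-function dispatch used in the checkpoint-0 branch above,
# on assumed toy inputs and without the Pyrado/BayRn plumbing: fit a GP on candidates normalized to the
# unit cube, build UCB/EI/PI by name, and optimize it to propose the next (still normalized) candidate.
import torch
from botorch.models import SingleTaskGP
from botorch.fit import fit_gpytorch_model
from botorch.acquisition import UpperConfidenceBound, ExpectedImprovement, ProbabilityOfImprovement
from botorch.optim import optimize_acqf
from gpytorch.mlls import ExactMarginalLogLikelihood


def propose_next_candidate(cands_norm: torch.Tensor, values_stdized: torch.Tensor, acq_fcn_type: str = 'EI',
                           num_restarts: int = 5, raw_samples: int = 64) -> torch.Tensor:
    gp = SingleTaskGP(cands_norm, values_stdized)
    fit_gpytorch_model(ExactMarginalLogLikelihood(gp.likelihood, gp))

    if acq_fcn_type == 'UCB':
        acq_fcn = UpperConfidenceBound(gp, beta=0.1, maximize=True)
    elif acq_fcn_type == 'EI':
        acq_fcn = ExpectedImprovement(gp, best_f=values_stdized.max().item(), maximize=True)
    elif acq_fcn_type == 'PI':
        acq_fcn = ProbabilityOfImprovement(gp, best_f=values_stdized.max().item(), maximize=True)
    else:
        raise ValueError("acq_fcn_type must be 'UCB', 'EI', or 'PI'")

    dim = cands_norm.shape[1]
    bounds = torch.stack([torch.zeros(dim, dtype=cands_norm.dtype), torch.ones(dim, dtype=cands_norm.dtype)])
    cand_norm, _ = optimize_acqf(acq_fcn, bounds=bounds, q=1, num_restarts=num_restarts, raw_samples=raw_samples)
    return cand_norm  # project back to the domain parameter bounds before using it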
def __init__(
    self,
    data: to.Tensor,
    window_size: int,
    ratio_train: float,
    standardize_data: bool = False,
    scale_min_max_data: bool = False,
    name: str = "UnnamedDataSet",
):
    r"""
    Constructor

    :param data: complete raw data set, where the samples are along the first dimension
    :param window_size: length of the sequences fed to the policy for predicting the next value
    :param ratio_train: ratio of the training samples w.r.t. the total sample count
    :param standardize_data: if `True`, the data is standardized to be $\sim N(0, 1)$
    :param scale_min_max_data: if `True`, the data is scaled to be $\in [-1, 1]$
    :param name: descriptive name for the data set
    """
    if not isinstance(data, to.Tensor):
        raise pyrado.TypeErr(given=data, expected_type=to.Tensor)
    if not isinstance(window_size, int):
        raise pyrado.TypeErr(given=window_size, expected_type=int)
    if window_size < 1:
        raise pyrado.ValueErr(given=window_size, ge_constraint="1")
    if not isinstance(ratio_train, float):
        raise pyrado.TypeErr(given=ratio_train, expected_type=float)
    if not (0 < ratio_train < 1):
        raise pyrado.ValueErr(given=ratio_train, g_constraint="0", l_constraint="1")
    if standardize_data and scale_min_max_data:
        raise pyrado.ValueErr(msg="Standardizing and min-max scaling the data at the same time is not supported!")

    self.data_all_raw = to.atleast_2d(data).T if data.ndim == 1 else data  # samples along rows
    self._ratio_train = ratio_train
    self._window_size = window_size
    self.name = name

    # Process the data
    self.is_standardized, self.is_scaled = False, False
    if standardize_data:
        self.data_all = standardize(self.data_all_raw)  # ~ N(0, 1)
        self.is_standardized = True
    elif scale_min_max_data:
        self.data_all = scale_min_max(self.data_all_raw, -1, 1)  # in [-1, 1]
        self.is_scaled = True
    else:
        self.data_all = self.data_all_raw

    # Split the data into training and testing data
    self.data_trn = self.data_all[:self.num_samples_trn]
    self.data_tst = self.data_all[self.num_samples_trn:]

    # Targets are the next time steps
    self.data_all_inp = self.data_all[:-1]
    self.data_trn_inp = self.data_trn[:-1]
    self.data_tst_inp = self.data_tst[:-1]
    self.data_all_targ = self.data_all[1:]
    self.data_trn_targ = self.data_trn[1:]
    self.data_tst_targ = self.data_tst[1:]

    # Create sequences
    self.data_trn_ws = self.cut_to_window_size(self.data_trn, self._window_size)
    self.data_tst_ws = self.cut_to_window_size(self.data_tst, self._window_size)
    self.data_trn_seqs = create_sequences(self.data_trn_ws, len_seq=self._window_size + 1)
    self.data_tst_seqs = create_sequences(self.data_tst_ws, len_seq=self._window_size + 1)

    print_cbt(f"Created {str(self)}", "w")
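# An illustrative sliding-window helper (a sketch of the idea behind the windowing above, not Pyrado's
# `create_sequences`): cut a series into overlapping chunks of length window_size + 1, where the first
# window_size entries form the input sequence and the final entry is the prediction target.
import torch


def sliding_windows(series: torch.Tensor, window_size: int):
    """`series` has shape [num_samples, dim_data]; returns a tuple (inputs, targets)."""
    inputs, targets = [], []
    for start in range(series.shape[0] - window_size):
        inputs.append(series[start:start + window_size])
        targets.append(series[start + window_size])
    return torch.stack(inputs), torch.stack(targets)


# Usage sketch on a toy sine wave
# data = torch.sin(torch.linspace(0, 6.28, 100)).unsqueeze(1)
# inp, targ = sliding_windows(data, window_size=10)  # inp: [90, 10, 1], targ: [90, 1]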
def update(self):
    """ Update the policy's and Q-functions' parameters on transitions sampled from the replay memory. """
    # Containers for logging
    expl_strat_stds = to.zeros(self.num_batch_updates)
    qfcn_1_losses = to.zeros(self.num_batch_updates)
    qfcn_2_losses = to.zeros(self.num_batch_updates)
    qfcn_1_grad_norm = to.zeros(self.num_batch_updates)
    qfcn_2_grad_norm = to.zeros(self.num_batch_updates)
    policy_losses = to.zeros(self.num_batch_updates)
    policy_grad_norm = to.zeros(self.num_batch_updates)

    for b in tqdm(range(self.num_batch_updates), total=self.num_batch_updates, desc='Updating',
                  unit='batches', file=sys.stdout, leave=False):

        # Sample steps and the associated next step from the replay memory
        steps, next_steps = self._memory.sample(self.batch_size)
        steps.torch(data_type=to.get_default_dtype())
        next_steps.torch(data_type=to.get_default_dtype())

        # Standardize and optionally scale the rewards
        if self.standardize_rew:
            rewards = standardize(steps.rewards).unsqueeze(1)
        else:
            rewards = steps.rewards.unsqueeze(1)
        rewards *= self.rew_scale

        # Explore and compute the current log probs (later used for the policy update)
        if self.policy.is_recurrent:
            act_expl, log_probs_expl, _ = self._expl_strat(steps.observations, steps.hidden_states)
        else:
            act_expl, log_probs_expl = self._expl_strat(steps.observations)
        expl_strat_stds[b] = to.mean(self._expl_strat.std.data)

        # Update the entropy coefficient
        if self.learn_ent_coeff:
            # Compute the entropy coefficient loss
            ent_coeff_loss = -to.mean(self._log_ent_coeff * (log_probs_expl.detach() + self.target_entropy))
            self._ent_coeff_optim.zero_grad()
            ent_coeff_loss.backward()
            self._ent_coeff_optim.step()

        with to.no_grad():
            # Create masks for the non-final observations
            not_done = to.tensor(1. - steps.done, dtype=to.get_default_dtype()).unsqueeze(1)

            # Compute the (next)state-(next)action values Q(s',a') from the target networks
            if self.policy.is_recurrent:
                next_act_expl, next_log_probs, _ = self._expl_strat(next_steps.observations, next_steps.hidden_states)
            else:
                next_act_expl, next_log_probs = self._expl_strat(next_steps.observations)
            next_q_val_target_1 = self.qfcn_targ_1(to.cat([next_steps.observations, next_act_expl], dim=1))
            next_q_val_target_2 = self.qfcn_targ_2(to.cat([next_steps.observations, next_act_expl], dim=1))
            next_q_val_target_min = to.min(next_q_val_target_1, next_q_val_target_2)
            next_q_val_target_min -= self.ent_coeff * next_log_probs  # add the entropy term
            # TD target (including the entropy term)
            next_q_val = rewards + not_done * self.gamma * next_q_val_target_min  # [4] does not use the reward here

        # Compute the (current)state-(current)action values Q(s,a) from the two Q-networks
        # E_{(s_t, a_t) ~ D} [1/2 * (Q_i(s_t, a_t) - r_t - gamma * E_{s_{t+1} ~ p} [V(s_{t+1})] )^2]
        q_val_1 = self.qfcn_1(to.cat([steps.observations, steps.actions], dim=1))
        q_val_2 = self.qfcn_2(to.cat([steps.observations, steps.actions], dim=1))
        q_1_loss = nn.functional.mse_loss(q_val_1, next_q_val)
        q_2_loss = nn.functional.mse_loss(q_val_2, next_q_val)
        q_loss = (q_1_loss + q_2_loss) / 2.  # averaging the Q-functions is taken from [3]
        qfcn_1_losses[b] = q_1_loss.data
        qfcn_2_losses[b] = q_2_loss.data

        # Update the Q-fcns
        self._optim_qfcns.zero_grad()
        q_loss.backward()
        qfcn_1_grad_norm[b] = self.clip_grad(self.qfcn_1, None)
        qfcn_2_grad_norm[b] = self.clip_grad(self.qfcn_2, None)
        self._optim_qfcns.step()

        # Compute the policy loss
        # E_{s_t ~ D, eps_t ~ N} [log( pi( f(eps_t; s_t) ) ) - Q(s_t, f(eps_t; s_t))]
        q_1_val_expl = self.qfcn_1(to.cat([steps.observations, act_expl], dim=1))
        q_2_val_expl = self.qfcn_2(to.cat([steps.observations, act_expl], dim=1))
        min_q_val_expl = to.min(q_1_val_expl, q_2_val_expl)
        policy_loss = to.mean(self.ent_coeff * log_probs_expl - min_q_val_expl)  # self.ent_coeff is detached
        policy_losses[b] = policy_loss.data

        # Update the policy
        self._optim_policy.zero_grad()
        policy_loss.backward()
        policy_grad_norm[b] = self.clip_grad(self._expl_strat.policy, self.max_grad_norm)
        self._optim_policy.step()

        # Soft-update the target networks
        if (self._curr_iter * self.num_batch_updates + b) % self.target_update_intvl == 0:
            SAC.soft_update(self.qfcn_targ_1, self.qfcn_1, self.tau)
            SAC.soft_update(self.qfcn_targ_2, self.qfcn_2, self.tau)

    # Update the learning rate if the schedulers have been specified
    if self._lr_scheduler_policy is not None:
        self._lr_scheduler_policy.step()
        self._lr_scheduler_qfcns.step()

    # Logging
    self.logger.add_value('Q1 loss', to.mean(qfcn_1_losses))
    self.logger.add_value('Q2 loss', to.mean(qfcn_2_losses))
    self.logger.add_value('policy loss', to.mean(policy_losses))
    self.logger.add_value('avg grad norm policy', to.mean(policy_grad_norm))
    self.logger.add_value('avg expl strat std', to.mean(expl_strat_stds))
    self.logger.add_value('ent_coeff', self.ent_coeff)
    if self._lr_scheduler_policy is not None:
        self.logger.add_value('avg lr policy', to.mean(to.tensor(self._lr_scheduler_policy.get_last_lr())), 6)
        self.logger.add_value('avg lr critic', to.mean(to.tensor(self._lr_scheduler_qfcns.get_last_lr())), 6)
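# A minimal sketch of the Polyak (moving-average) target update that `SAC.soft_update` above is assumed
# to perform; the convention chosen here keeps a fraction tau of the old target parameters, so a tau
# close to 1 means the targets change slowly. Function name and default value are assumptions.
import torch.nn as nn


def soft_update_(target: nn.Module, source: nn.Module, tau: float = 0.995):
    """In-place update: theta_targ <- tau * theta_targ + (1 - tau) * theta_src."""
    if not 0. <= tau <= 1.:
        raise ValueError("tau must be in [0, 1]")
    for p_targ, p_src in zip(target.parameters(), source.parameters()):
        p_targ.data.mul_(tau).add_((1. - tau) * p_src.data)


# Usage sketch
# qfcn_targ = copy.deepcopy(qfcn)           # initialize the target as a copy of the live Q-network
# soft_update_(qfcn_targ, qfcn, tau=0.995)  # call after each (or every k-th) gradient step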