def test_cov(x, data_along_rows):
    """
    Compare the project's `cov` function against `np.cov` on the same data.

    :param x: numpy array holding the data; it is indexed via `x.shape[0]` and `x.shape[1]`
    :param data_along_rows: if truthy, the covariance matrix size is checked against `x.shape[1]`,
                            otherwise against `x.shape[0]`
    """
    # numpy's `rowvar` flag is the logical opposite of `data_along_rows`
    expected = np.cov(x, rowvar=not data_along_rows)
    actual = cov(to.from_numpy(x), data_along_rows=data_along_rows).numpy()

    # The result must be a square matrix
    assert actual.shape[0] == actual.shape[1]

    # The matrix size has to match the axis of x that is not the data axis
    num_vars = x.shape[1] if data_along_rows else x.shape[0]
    assert expected.shape[0] == num_vars
    assert actual.shape[0] == num_vars

    # Both implementations must agree numerically
    assert np.allclose(expected, actual)
def update(self, param_results: ParameterSamplingResult, ret_avg_curr: float = None):
    """
    Update the policy parameters and the exploration noise from the best-performing samples.

    The samples are ranked by `param_results.mean_returns`, the top `self.num_is_samples` of them are kept,
    and their mean becomes the new policy parameter vector. The exploration noise is then adapted to the
    spread of these samples plus a linearly decaying extra exploration term.

    :param param_results: sampling results; this method reads `mean_returns` and `parameters` from it
                          (presumably one return and one parameter row per sample; verify against caller)
    :param ret_avg_curr: not used in this method body
    :raises NotImplementedError: if the exploration noise is neither `DiagNormalNoise` nor `FullNormalNoise`
    """
    # Average the return values over the rollouts
    rets_avg_ros = to.tensor(param_results.mean_returns)

    # Descending sort according to return values and the importance samples a.k.a. elites (see [1, p.12])
    idcs_dcs = to.argsort(rets_avg_ros, descending=True)
    idcs_dcs = idcs_dcs[:self.num_is_samples]
    rets_avg_is = rets_avg_ros[idcs_dcs]
    params_is = param_results.parameters[idcs_dcs, :]

    # Update the policy parameters from the mean importance samples
    self._policy.param_values = to.mean(params_is, dim=0)

    # Update the exploration covariance from the empirical variance of the importance samples
    if isinstance(self._expl_strat.noise, DiagNormalNoise):
        std_is = to.std(params_is, dim=0)
        # The extra exploration shrinks linearly with the iteration count and is clipped at 0, see [2, p.4]
        extra_expl_std = self.extra_expl_std_init * max(
            1. - self._curr_iter / self.extra_expl_decay_iter, 0
        )
        self._expl_strat.noise.adapt(std=std_is + extra_expl_std)
    elif isinstance(self._expl_strat.noise, FullNormalNoise):
        cov_is = cov(params_is, data_along_rows=True)
        # Same decay as above, applied to the squared initial extra standard deviation, see [2, p.4]
        extra_expl_cov = to.pow(self.extra_expl_std_init, 2) * max(
            1. - self._curr_iter / self.extra_expl_decay_iter, 0
        )
        self._expl_strat.noise.adapt(cov=cov_is + extra_expl_cov)
    else:
        raise NotImplementedError  # CEM could also sample using different distributions

    # NOTE: a leftover debug print of the complete policy parameter vector was removed here; the
    # quantities of interest are reported through the logger below.

    # Logging
    self.logger.add_value('median imp samp return', to.median(rets_avg_is))
    self.logger.add_value('min imp samp return', to.min(rets_avg_is))
    self.logger.add_value('min expl strat std', to.min(self._expl_strat.std))
    self.logger.add_value('avg expl strat std', to.mean(self._expl_strat.std.data).detach().numpy())
    self.logger.add_value('max expl strat std', to.max(self._expl_strat.std))
    self.logger.add_value('expl strat entropy', self._expl_strat.get_entropy().item())
def update(self, param_results: ParameterSamplingResult, ret_avg_curr: float = None):
    """
    Refit the policy parameters and the exploration noise to the highest-return samples.

    The `self.num_is_samples` samples with the highest entries in `param_results.mean_returns` are selected.
    Their mean is written to the policy's parameter values, and the exploration noise is adapted to their
    empirical spread plus an extra exploration term that decays linearly with `self._curr_iter`.
    If the noise is neither `DiagNormalNoise` nor `FullNormalNoise`, it is left untouched.

    :param param_results: sampling results; this method reads `mean_returns` (passed to `from_numpy`) and
                          `parameters` (indexed with the selected rows) from it
    :param ret_avg_curr: not used in this method body
    """

    def _extra_expl_factor():
        # Linearly decaying factor, clipped at 0 from below, see [2, p.4]
        return max(1.0 - self._curr_iter / self.extra_expl_decay_iter, 0)

    # Returns averaged over the rollouts, converted to the default torch dtype
    mean_rets = to.from_numpy(param_results.mean_returns).to(to.get_default_dtype())

    # Rank by return (best first) and keep the importance samples a.k.a. elites (see [1, p.12])
    elite_idcs = to.argsort(mean_rets, descending=True)[: self.num_is_samples]
    elite_rets = mean_rets[elite_idcs]
    elite_params = param_results.parameters[elite_idcs, :]

    # The new policy parameters are the mean over the elites
    self._policy.param_values = to.mean(elite_params, dim=0)

    # Adapt the exploration noise to the empirical spread of the elites
    if isinstance(self._expl_strat.noise, DiagNormalNoise):
        elite_std = to.std(elite_params, dim=0)
        extra_std = self.extra_expl_std_init * _extra_expl_factor()
        self._expl_strat.noise.adapt(std=elite_std + extra_std)
    elif isinstance(self._expl_strat.noise, FullNormalNoise):
        elite_cov = cov(elite_params, data_along_rows=True)
        extra_cov = to.pow(self.extra_expl_std_init, 2) * _extra_expl_factor()
        self._expl_strat.noise.adapt(cov=elite_cov + extra_cov)

    # Logging
    self.logger.add_value("median imp samp return", to.median(elite_rets), 4)
    self.logger.add_value("min imp samp return", to.min(elite_rets), 4)
    self.logger.add_value("min expl strat std", to.min(self._expl_strat.std), 4)
    self.logger.add_value("avg expl strat std", to.mean(self._expl_strat.std), 4)
    self.logger.add_value("max expl strat std", to.max(self._expl_strat.std), 4)
    self.logger.add_value("expl strat entropy", self._expl_strat.get_entropy(), 4)