def step_update(self, state, action, reward, next_state, done):
    self.step_counter += 1
    self.discounted_sum_reward += self.current_disc * reward
    self.current_disc *= self.discount_factor

    if self.step_counter >= self.update_offset and self.step_counter % self.update_frequency == 0:
        if len(self.discounted_sum_rewards) > 0 and len(self.context_buffer) > 0:
            self.algorithm_iterations += 1
            avg_performance = np.mean(self.discounted_sum_rewards)
            self.discounted_sum_rewards = []

            ins, cons = self.context_buffer.read_buffer()
            initial_states, contexts = np.array(ins), np.array(cons)
            values = self.value_estimator(initial_states)
            if values is None:
                raise RuntimeError("The value estimator returned None; please provide a valid value estimator.")

            old_context_dist = deepcopy(self.context_dist)
            contexts_t = to_float_tensor(contexts, use_cuda=False)
            old_c_log_prob_t = old_context_dist.log_pdf_t(contexts_t).detach()

            # Estimate the value of the state after the policy update
            c_val_t = to_float_tensor(values, use_cuda=False)

            # Compute the KL penalty coefficient for this iteration
            cur_kl_t = self.target_context_kl(numpy=False)
            if self.use_avg_performance:
                alpha_cur_t = self.alpha_function(self.algorithm_iterations, avg_performance, cur_kl_t)
            else:
                alpha_cur_t = self.alpha_function(self.algorithm_iterations, torch.mean(c_val_t).detach(), cur_kl_t)

            # Trust-region update of the context distribution via conjugate gradient,
            # constrained by the maximum KL divergence self.max_kl
            cg_step(partial(self._compute_context_loss, contexts_t, old_c_log_prob_t, c_val_t, alpha_cur_t),
                    partial(self._compute_context_kl, old_context_dist), self.max_kl,
                    self.context_dist.parameters, self.context_dist.set_weights,
                    self.context_dist.get_weights, **self.cg_parameters, use_cuda=False)

            # Enforce a lower bound on the standard deviations while the context
            # distribution is still far (in KL) from the target distribution
            cov = self.context_dist._chol_flat.detach().numpy()
            if self.std_lower_bound is not None and self.target_context_kl() > self.kl_threshold:
                cov[0:self.context_dim] = np.log(np.maximum(np.exp(cov[0:self.context_dim]), self.std_lower_bound))
                self.context_dist.set_weights(np.concatenate((self.context_dist.mean(), cov)))

            # Bookkeeping
            self.bk["mean"].append(self.context_dist.mean())
            self.bk["covariance"].append(self.context_dist.covariance_matrix())
            self.bk["steps"].append(self.step_counter)
            self.bk["algo_iterations"].append(self.algorithm_iterations)
            self.bk["kl"].append(self.target_context_kl())
        else:
            print("Skipping iteration at step {} because buffers are empty.".format(self.step_counter))
def log_pdf(self, x):
    x = to_float_tensor(x, self._use_cuda)
    return self.log_pdf_t(x).detach().cpu().numpy()
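
# Usage sketch (illustrative only; `dist` stands in for an already-constructed context
# distribution exposing the interface above, and the shapes are assumptions, e.g. a
# 2-dimensional context space):
#
#     cs = np.random.uniform(-1.0, 1.0, size=(32, 2))  # a batch of 32 contexts
#     log_probs = dist.log_pdf(cs)                      # NumPy array, one log-density per context
#
# The torch counterpart, log_pdf_t(), is what step_update() uses when it needs
# tensors rather than NumPy arrays.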