def policy_learn(self):
    """A learning iteration for the policy."""
    # Compute the discounted return for every step gathered this round
    all_discounted_returns = self.calculate_all_discounted_returns()
    if self.hyperparameters["normalise_rewards"]:
        all_discounted_returns = normalise_rewards(all_discounted_returns)
    # Take several optimisation steps over the same batch of experience
    for _ in range(self.hyperparameters["learning_iterations_per_round"]):
        # Ratio of new-policy to old-policy action probabilities
        all_ratio_of_policy_probabilities = self.calculate_all_ratio_of_policy_probabilities()
        loss = self.calculate_loss([all_ratio_of_policy_probabilities], all_discounted_returns)
        self.take_policy_new_optimisation_step(loss)
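For context, the two helpers called at the top could be implemented roughly as below. This is a minimal sketch assuming rewards arrive as a flat per-episode sequence; the names mirror the calls above, but the bodies are illustrative, not the original implementations.

import numpy as np

def calculate_discounted_returns(rewards, gamma=0.99):
    # Walk the episode backwards, accumulating the gamma-discounted sum
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

def normalise_rewards(returns):
    # Standardise to zero mean and unit variance to stabilise gradient updates
    returns = np.asarray(returns, dtype=np.float64)
    return (returns - returns.mean()) / (returns.std() + 1e-8)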
Example 2
import torch

def policy_learn(self):
    """A learning iteration for the policy."""
    all_discounted_returns = self.calculate_all_discounted_returns()
    if self.config.hyperparameters["normalise_rewards"]:
        all_discounted_returns = normalise_rewards(all_discounted_returns)
    # Run several optimisation epochs over the collected batch
    for _ in range(self.config.hyperparameters["learning_iterations_per_round"]):
        all_ratio_of_policy_probabilities = self.calculate_all_ratio_of_policy_probabilities()
        loss = self.calculate_loss([all_ratio_of_policy_probabilities],
                                   all_discounted_returns)
        self.take_policy_new_optimisation_step(loss)
    # Optionally checkpoint the updated policy network
    if self.config.save_model:
        torch.save(self.policy_new.state_dict(), self.config.model_path)
    # Clear the experience buffers ready for the next round of rollouts
    self.init_batch_lists()
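Both examples delegate the objective itself to calculate_loss. Given that it receives policy-probability ratios, this is most likely a PPO-style clipped surrogate; the function below is a sketch of that objective under that assumption, not the source's implementation, and clip_epsilon is an assumed hyperparameter.

import torch

def clipped_surrogate_loss(ratios, discounted_returns, clip_epsilon=0.2):
    # PPO clipped objective: take the pessimistic minimum of the raw and
    # clipped ratio terms, then negate so the optimiser can minimise it
    unclipped = ratios * discounted_returns
    clipped = torch.clamp(ratios, 1.0 - clip_epsilon, 1.0 + clip_epsilon) * discounted_returns
    return -torch.min(unclipped, clipped).mean()

When the ratios are all 1 (new policy identical to the old one), the clipping is inactive and the expression reduces to a plain policy-gradient objective; the clamp only bites once an update moves the policy more than clip_epsilon away from the data-collecting policy.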