Code example #1
A smoke test: sample an action from the behaviour distribution mu, compute the policy-gradient loss, and check that it contains no NaNs.
def test_policy_gradient(self):
    # Sample an action from the behaviour policy mu.
    action = self.mu.sample().array
    # Positional arguments: action, advantage, action_distrib,
    # action_distrib_mu, action_value, v, truncation_threshold
    # (advantage=1 and v=0 keep the test simple).
    pg = acer.compute_policy_gradient_loss(action, 1, self.pi, self.mu,
                                           self.action_value, 0,
                                           self.truncation_threshold)
    print('pg', pg.array)
    # Summing propagates any NaN, so this asserts the loss is NaN-free.
    self.assertFalse(np.isnan(np.sum(pg.array)))
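For orientation, here is a minimal self-contained sketch of calling the same function outside a test fixture. It assumes the acer module is chainerrl.agents.acer and uses ChainerRL's SoftmaxDistribution and DiscreteActionValue wrappers; the toy logits, Q-values, and the threshold of 10 are made-up values, not from the original.

import chainer
import numpy as np
from chainerrl import action_value, distribution
from chainerrl.agents import acer

# Toy single-sample batch over 3 discrete actions (values are arbitrary).
pi = distribution.SoftmaxDistribution(
    chainer.Variable(np.asarray([[1.0, 0.5, 0.1]], dtype=np.float32)))
mu = distribution.SoftmaxDistribution(
    chainer.Variable(np.zeros((1, 3), dtype=np.float32)))
q = action_value.DiscreteActionValue(
    chainer.Variable(np.asarray([[1.0, 2.0, 3.0]], dtype=np.float32)))

action = mu.sample().array
# advantage=1 and v=0 mirror the test above; the threshold is arbitrary.
loss = acer.compute_policy_gradient_loss(
    action, 1, pi, mu, q, 0, truncation_threshold=10)
print(loss.array)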
Code example #2
A helper that, for each action sampled from mu, backpropagates the policy-gradient loss through base_policy and collects the resulting gradient as a flat vector.
def bias_correction_policy_gradients(truncation_threshold):
    gs = []
    for sample in mu_samples:
        # Reset gradients accumulated by the previous backward pass.
        base_policy.cleargrads()
        loss = acer.compute_policy_gradient_loss(
            action=sample,
            advantage=evaluate_action(sample),
            action_distrib=pi,
            action_distrib_mu=mu,
            action_value=action_value,
            v=0,
            truncation_threshold=truncation_threshold)
        # Squeeze the size-1 dimensions so backward() runs on a scalar.
        F.squeeze(loss).backward()
        # Record the policy's gradient as one flat vector per sample.
        gs.append(extract_gradients_as_single_vector(base_policy))
    return gs
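One plausible use of this helper, sketched below rather than taken from the original: compare the Monte Carlo mean of the gradients under a finite truncation threshold with the mean at truncation_threshold=np.inf, where min(threshold, rho) never truncates and the estimator reduces to plain importance sampling. If the bias-correction term does its job, the two means should agree given many mu samples. The thresholds below are arbitrary.

import numpy as np

# Hypothetical usage; assumes mu_samples, base_policy, pi, mu,
# action_value, and evaluate_action are set up as above.
gs_truncated = bias_correction_policy_gradients(truncation_threshold=2)
gs_untruncated = bias_correction_policy_gradients(truncation_threshold=np.inf)

# Each list holds one flat gradient vector per sampled action.
mean_truncated = np.mean(gs_truncated, axis=0)
mean_untruncated = np.mean(gs_untruncated, axis=0)

# Both estimate the same policy gradient, so the gap should shrink
# as len(mu_samples) grows.
print(np.linalg.norm(mean_truncated - mean_untruncated))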