def _compare_two_policies(policy1: TorchPolicy, policy2: TorchPolicy) -> None:
    """
    Make sure two policies have the same output for the same input.
    """
    # Put both actors on the same device before sampling.
    policy1.actor = policy1.actor.to(default_device())
    policy2.actor = policy2.actor.to(default_device())
    decision_step, _ = mb.create_steps_from_behavior_spec(
        policy1.behavior_spec, num_agents=1
    )
    # Build the shared inputs once: observations, action masks, and memories.
    obs_tensors = [ModelUtils.list_to_tensor(raw_obs) for raw_obs in decision_step.obs]
    action_masks = policy1._extract_masks(decision_step)
    agent_ids = list(decision_step.agent_id)
    mem = torch.as_tensor(policy1.retrieve_memories(agent_ids)).unsqueeze(0)
    # Sample from both policies under no_grad and compare discrete log-probs.
    with torch.no_grad():
        _, lp_first, _, _ = policy1.sample_actions(
            obs_tensors, masks=action_masks, memories=mem
        )
        _, lp_second, _, _ = policy2.sample_actions(
            obs_tensors, masks=action_masks, memories=mem
        )
    np.testing.assert_array_equal(
        ModelUtils.to_numpy(lp_first.all_discrete_tensor),
        ModelUtils.to_numpy(lp_second.all_discrete_tensor),
    )
def _compare_two_policies(policy1: TorchPolicy, policy2: TorchPolicy) -> None:
    """
    Make sure two policies have the same output for the same input.
    """
    decision_step, _ = mb.create_steps_from_behavior_spec(
        policy1.behavior_spec, num_agents=1
    )
    # Split the step into vector/visual observations plus action masks.
    split_obs, action_masks = policy1._split_decision_step(decision_step)
    vector_in = [torch.as_tensor(split_obs.vector_observations)]
    visual_in = [torch.as_tensor(v) for v in split_obs.visual_observations]
    agent_ids = list(decision_step.agent_id)
    mem = torch.as_tensor(policy1.retrieve_memories(agent_ids)).unsqueeze(0)
    # Sample from both policies with identical inputs; all_log_probs=True so
    # the full per-action log-prob arrays are comparable.
    with torch.no_grad():
        _, lp_first, _, _, _ = policy1.sample_actions(
            vector_in, visual_in, masks=action_masks, memories=mem, all_log_probs=True
        )
        _, lp_second, _, _, _ = policy2.sample_actions(
            vector_in, visual_in, masks=action_masks, memories=mem, all_log_probs=True
        )
    np.testing.assert_array_equal(lp_first, lp_second)
def _compare_two_policies(policy1: TFPolicy, policy2: TFPolicy) -> None:
    """
    Make sure two policies have the same output for the same input.
    """
    decision_step, _ = mb.create_steps_from_behavior_spec(
        policy1.behavior_spec, num_agents=1
    )
    agent_ids = list(decision_step.agent_id)
    # Evaluate both policies on the identical decision step and require
    # element-wise equal log-probs (assert_array_equal is symmetric).
    first_out = policy1.evaluate(decision_step, agent_ids)
    second_out = policy2.evaluate(decision_step, agent_ids)
    np.testing.assert_array_equal(second_out["log_probs"], first_out["log_probs"])
def test_policy_evaluate(rnn, visual, discrete):
    """
    Evaluate a mock policy and check the sampled action batch has the
    expected shape for both discrete and continuous action spaces.
    """
    # Test evaluate
    policy = create_policy_mock(
        TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    decision_step, terminal_step = mb.create_steps_from_behavior_spec(
        policy.behavior_spec, num_agents=NUM_AGENTS
    )
    run_out = policy.evaluate(decision_step, list(decision_step.agent_id))
    if discrete:
        # BUG FIX: the original discarded this comparison — the `assert`
        # keyword was missing, so the discrete branch never checked anything.
        assert run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))
    else:
        assert run_out["action"].shape == (NUM_AGENTS, VECTOR_ACTION_SPACE)