コード例 #1
0
def test_reward_decreases(demo_to_buffer: Any, use_actions: bool,
                          behavior_spec: BehaviorSpec, seed: int) -> None:
    """GAIL: training should raise the expert reward and lower the policy reward.

    :param demo_to_buffer: Mocked demo loader; patched to return the expert buffer.
    :param use_actions: Whether the discriminator also conditions on actions.
    :param behavior_spec: Spec used to build the synthetic agent buffers.
    :param seed: Seed applied to numpy and torch for determinism.
    """
    np.random.seed(seed)
    torch.manual_seed(seed)
    expert_buffer = create_agent_buffer(behavior_spec, 1000)
    policy_buffer = create_agent_buffer(behavior_spec, 1000)
    demo_to_buffer.return_value = None, expert_buffer
    settings = GAILSettings(demo_path="",
                            learning_rate=0.005,
                            use_vail=False,
                            use_actions=use_actions)
    provider = create_reward_provider(RewardSignalType.GAIL, behavior_spec,
                                      settings)

    expert_reward_before = provider.evaluate(expert_buffer)[0]
    policy_reward_before = provider.evaluate(policy_buffer)[0]

    for _ in range(10):
        provider.update(policy_buffer)
        # GAIL / VAIL rewards must stay non-negative throughout training.
        assert provider.evaluate(expert_buffer)[0] >= 0
        assert provider.evaluate(policy_buffer)[0] >= 0
    expert_reward_after = provider.evaluate(expert_buffer)[0]
    policy_reward_after = provider.evaluate(policy_buffer)[0]
    # After updates the discriminator should favour expert trajectories ...
    assert expert_reward_after > policy_reward_after
    # ... with the expert reward improving and the policy reward degrading.
    assert expert_reward_after > expert_reward_before
    assert policy_reward_after < policy_reward_before
コード例 #2
0
def test_reward_decreases_vail(demo_to_buffer: Any, use_actions: bool,
                               behavior_spec: BehaviorSpec, seed: int) -> None:
    """VAIL: after training, the expert reward should exceed the policy reward.

    :param demo_to_buffer: Mocked demo loader; patched to return the expert buffer.
    :param use_actions: Whether the discriminator also conditions on actions.
    :param behavior_spec: Spec used to build the synthetic agent buffers.
    :param seed: Seed applied to numpy and torch for determinism.
    """
    np.random.seed(seed)
    torch.manual_seed(seed)
    buffer_expert = create_agent_buffer(behavior_spec, 1000)
    buffer_policy = create_agent_buffer(behavior_spec, 1000)
    demo_to_buffer.return_value = None, buffer_expert
    gail_settings = GAILSettings(demo_path="",
                                 learning_rate=0.005,
                                 use_vail=True,
                                 use_actions=use_actions)
    # We must set the initial value of beta to 0 for testing: otherwise the
    # KL-loss dominates early and blocks the estimator. Save and restore the
    # class attribute so the override cannot leak into other tests that
    # construct a DiscriminatorNetwork.
    saved_beta = DiscriminatorNetwork.initial_beta
    DiscriminatorNetwork.initial_beta = 0.0
    try:
        gail_rp = create_reward_provider(RewardSignalType.GAIL, behavior_spec,
                                         gail_settings)

        for _ in range(300):
            gail_rp.update(buffer_policy)
            reward_expert = gail_rp.evaluate(buffer_expert)[0]
            reward_policy = gail_rp.evaluate(buffer_policy)[0]
            assert reward_expert >= 0  # GAIL / VAIL reward always positive
            assert reward_policy >= 0
        reward_expert = gail_rp.evaluate(buffer_expert)[0]
        reward_policy = gail_rp.evaluate(buffer_policy)[0]
        assert reward_expert > reward_policy  # Expert reward greater than non-expert reward
    finally:
        DiscriminatorNetwork.initial_beta = saved_beta
コード例 #3
0
 def create_reward_signals(self, reward_signal_configs):
     """
     Build and register a reward provider for each configured reward signal.

     :param reward_signal_configs: Mapping of reward signal type to its settings.
     """
     for signal_type, signal_settings in reward_signal_configs.items():
         # Key by the enum's string value in case we have duplicates later.
         provider = create_reward_provider(signal_type,
                                           self.policy.behavior_spec,
                                           signal_settings)
         self.reward_signals[signal_type.value] = provider
コード例 #4
0
def test_factory(behavior_spec: BehaviorSpec) -> None:
    """The factory should build an extrinsic provider named "Extrinsic"."""
    rp = create_reward_provider(RewardSignalType.EXTRINSIC, behavior_spec,
                                RewardSignalSettings())
    assert rp.name == "Extrinsic"
コード例 #5
0
def test_factory(behavior_spec: BehaviorSpec) -> None:
    """The factory should build a GAIL provider named "GAIL"."""
    settings = GAILSettings(demo_path=CONTINUOUS_PATH)
    provider = create_reward_provider(RewardSignalType.GAIL, behavior_spec,
                                      settings)
    assert provider.name == "GAIL"
コード例 #6
0
ファイル: test_rnd.py プロジェクト: zereyak13/ml-agents
def test_factory(behavior_spec: BehaviorSpec) -> None:
    """The factory should build an RND provider named "RND"."""
    # Locals named rnd_* (the originals said curiosity_*, a copy-paste leftover).
    rnd_settings = RNDSettings(32, 0.01)
    rnd_rp = create_reward_provider(RewardSignalType.RND, behavior_spec,
                                    rnd_settings)
    assert rnd_rp.name == "RND"