def test_reward_decreases(demo_to_buffer: Any, use_actions: bool, behavior_spec: BehaviorSpec, seed: int) -> None:
    """
    Train a (non-VAIL) GAIL discriminator against a "policy" buffer and check that:
    - rewards stay non-negative,
    - the expert buffer is rewarded more than the policy buffer,
    - expert reward rises and policy reward falls relative to the untrained network.

    :param demo_to_buffer: mocked demo loader; its return value is patched to the expert buffer.
    :param use_actions: whether the discriminator conditions on actions as well as observations.
    :param behavior_spec: spec used to build both agent buffers.
    :param seed: RNG seed for reproducibility across numpy and torch.
    """
    np.random.seed(seed)
    torch.manual_seed(seed)
    buffer_expert = create_agent_buffer(behavior_spec, 1000)
    buffer_policy = create_agent_buffer(behavior_spec, 1000)
    # The mocked loader hands the expert buffer to the GAIL provider's constructor.
    demo_to_buffer.return_value = None, buffer_expert
    gail_settings = GAILSettings(
        demo_path="", learning_rate=0.005, use_vail=False, use_actions=use_actions
    )
    gail_rp = create_reward_provider(
        RewardSignalType.GAIL, behavior_spec, gail_settings
    )
    # Baseline rewards from the untrained discriminator.
    init_reward_expert = gail_rp.evaluate(buffer_expert)[0]
    init_reward_policy = gail_rp.evaluate(buffer_policy)[0]
    for _ in range(10):
        gail_rp.update(buffer_policy)
    # With use_vail=False evaluation is deterministic, so a single evaluate()
    # per buffer suffices (the original evaluated each buffer twice in a row).
    reward_expert = gail_rp.evaluate(buffer_expert)[0]
    reward_policy = gail_rp.evaluate(buffer_policy)[0]
    assert reward_expert >= 0  # GAIL / VAIL reward always positive
    assert reward_policy >= 0
    assert reward_expert > reward_policy  # Expert reward greater than non-expert reward
    assert reward_expert > init_reward_expert  # Expert reward getting better as network trains
    assert reward_policy < init_reward_policy  # Non-expert reward getting worse as network trains
def test_reward_decreases_vail(demo_to_buffer: Any, use_actions: bool, behavior_spec: BehaviorSpec, seed: int) -> None:
    """
    Train a VAIL discriminator against a "policy" buffer and check that rewards
    stay non-negative and the expert buffer out-scores the policy buffer.

    :param demo_to_buffer: mocked demo loader; its return value is patched to the expert buffer.
    :param use_actions: whether the discriminator conditions on actions as well as observations.
    :param behavior_spec: spec used to build both agent buffers.
    :param seed: RNG seed for reproducibility across numpy and torch.
    """
    np.random.seed(seed)
    torch.manual_seed(seed)
    buffer_expert = create_agent_buffer(behavior_spec, 1000)
    buffer_policy = create_agent_buffer(behavior_spec, 1000)
    demo_to_buffer.return_value = None, buffer_expert
    gail_settings = GAILSettings(
        demo_path="", learning_rate=0.005, use_vail=True, use_actions=use_actions
    )
    # We must set the initial value of beta to 0 for testing; if we do not,
    # the kl-loss will dominate early and will block the estimator.
    # NOTE(review): this mutates a class attribute and is never restored, so it
    # leaks into any later test that builds a DiscriminatorNetwork — consider a
    # monkeypatch fixture instead.
    DiscriminatorNetwork.initial_beta = 0.0
    gail_rp = create_reward_provider(
        RewardSignalType.GAIL, behavior_spec, gail_settings
    )
    for _ in range(300):
        gail_rp.update(buffer_policy)
    # Evaluate each buffer once; the original evaluated each buffer twice
    # back-to-back with no update in between.
    reward_expert = gail_rp.evaluate(buffer_expert)[0]
    reward_policy = gail_rp.evaluate(buffer_policy)[0]
    assert reward_expert >= 0  # GAIL / VAIL reward always positive
    assert reward_policy >= 0
    assert reward_expert > reward_policy  # Expert reward greater than non-expert reward
def create_reward_signals(self, reward_signal_configs):
    """
    Build one reward provider per configured signal and register it on this object.

    :param reward_signal_configs: Mapping from reward signal type to its settings.
    """
    for signal_type, signal_settings in reward_signal_configs.items():
        provider = create_reward_provider(
            signal_type, self.policy.behavior_spec, signal_settings
        )
        # Key by the enum's string value in case we have duplicates later.
        self.reward_signals[signal_type.value] = provider
def test_factory(behavior_spec: BehaviorSpec) -> None:
    """The factory builds an extrinsic provider that reports the expected name."""
    extrinsic_settings = RewardSignalSettings()
    provider = create_reward_provider(
        RewardSignalType.EXTRINSIC, behavior_spec, extrinsic_settings
    )
    assert provider.name == "Extrinsic"
def test_factory(behavior_spec: BehaviorSpec) -> None:
    """The factory builds a GAIL provider that reports the expected name."""
    settings = GAILSettings(demo_path=CONTINUOUS_PATH)
    provider = create_reward_provider(RewardSignalType.GAIL, behavior_spec, settings)
    assert provider.name == "GAIL"
def test_factory(behavior_spec: BehaviorSpec) -> None:
    """The factory builds an RND provider that reports the expected name."""
    rnd_settings = RNDSettings(32, 0.01)
    provider = create_reward_provider(RewardSignalType.RND, behavior_spec, rnd_settings)
    assert provider.name == "RND"