Example #1: hierarchical SiblingRivalryLearner. Builds high-level Policy/Critic pairs with target copies, plus a low-level StochasticPolicy and Value module.
class SiblingRivalryLearner(BaseSiblingRivalryLearner, DistanceLearner):
    AGENT_TYPE = 'HierarchicalSiblingRivalry'

    def _make_agent_modules(self):
        self.policy = Policy(self._dummy_env, 128, a_range=5, action_size=2)
        self.p_target = Policy(self._dummy_env, 128, a_range=5, action_size=2)
        self.p_target.load_state_dict(self.policy.state_dict())

        self.q_module = Critic(self._dummy_env,
                               128,
                               a_range=5,
                               action_size=2,
                               use_antigoal=self.use_antigoal)
        self.q_target = Critic(self._dummy_env,
                               128,
                               a_range=5,
                               action_size=2,
                               use_antigoal=self.use_antigoal)
        self.q_target.load_state_dict(self.q_module.state_dict())

        self.policy_lo = StochasticPolicy(self._dummy_env, 256, goal_size=2)
        self.v_module_lo = Value(self._dummy_env,
                                 256,
                                 goal_size=2,
                                 use_antigoal=False)
Example #2: the minimal flat _make_agent_modules. One policy/critic pair, each with a target network initialized from the online weights.
    def _make_agent_modules(self):
        self.policy = Policy(self._dummy_env, 128)
        self.p_target = Policy(self._dummy_env, 128)
        self.p_target.load_state_dict(self.policy.state_dict())

        self.q_module = Critic(self._dummy_env, 128)
        self.q_target = Critic(self._dummy_env, 128)
        self.q_target.load_state_dict(self.q_module.state_dict())
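
The recurring load_state_dict(policy.state_dict()) call is what makes each target network start as an exact copy of its online counterpart. A minimal stand-alone sketch of that initialization, with toy nn.Linear modules standing in for the repo's Policy class (an assumption made purely for illustration):

import torch
import torch.nn as nn

policy = nn.Linear(4, 2)    # stand-in for Policy(self._dummy_env, 128)
p_target = nn.Linear(4, 2)  # starts with its own random weights

# Copy the online weights so both networks begin identical.
p_target.load_state_dict(policy.state_dict())

for p, p_targ in zip(policy.parameters(), p_target.parameters()):
    assert torch.equal(p, p_targ)

From this point the copies drift apart: the optimizer updates the online network, while the target only moves through soft_update (see Example #5).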
Example #3: flat SiblingRivalryLearner. The same pattern as Example #2, with the critic additionally conditioned on an antigoal via use_antigoal.
class SiblingRivalryLearner(BaseSiblingRivalryLearner, DistanceLearner):
    def _make_agent_modules(self):
        self.policy = Policy(self._dummy_env, 128)
        self.p_target = Policy(self._dummy_env, 128)
        self.p_target.load_state_dict(self.policy.state_dict())

        self.q_module = Critic(self._dummy_env,
                               128,
                               use_antigoal=self.use_antigoal)
        self.q_target = Critic(self._dummy_env,
                               128,
                               use_antigoal=self.use_antigoal)
        self.q_target.load_state_dict(self.q_module.state_dict())
Example #4: hierarchical module setup, as in Example #1 but without the antigoal-conditioned critic.
    def _make_agent_modules(self):
        self.policy = Policy(self._dummy_env, 128, a_range=5, action_size=2)
        self.p_target = Policy(self._dummy_env, 128, a_range=5, action_size=2)
        self.p_target.load_state_dict(self.policy.state_dict())

        self.q_module = Critic(self._dummy_env, 128, a_range=5, action_size=2)
        self.q_target = Critic(self._dummy_env, 128, a_range=5, action_size=2)
        self.q_target.load_state_dict(self.q_module.state_dict())

        self.policy_lo = StochasticPolicy(self._dummy_env, 256, goal_size=2)
        self.v_module_lo = Value(self._dummy_env,
                                 256,
                                 goal_size=2,
                                 use_antigoal=False)
Example #5: a complete DistanceLearner, covering module construction, agent creation, Polyak soft updates, and the Q-value and policy-loss helpers.
class DistanceLearner(BaseDistanceLearner):
    def create_env(self):
        return Env(**self.env_params)

    def _make_agent_modules(self):
        self.policy = Policy(self._dummy_env, 128)
        self.p_target = Policy(self._dummy_env, 128)
        self.p_target.load_state_dict(self.policy.state_dict())

        self.q_module = Critic(self._dummy_env, 128)
        self.q_target = Critic(self._dummy_env, 128)
        self.q_target.load_state_dict(self.q_module.state_dict())

    def _make_agent(self):
        return Agent(noise=self.noise,
                     epsilon=self.epsilon,
                     env=self.create_env(),
                     policy=self.policy)

    def soft_update(self):
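        # Polyak averaging: target <- polyak * target + (1 - polyak) * online.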
        module_pairs = [
            dict(source=self.q_module, target=self.q_target),
            dict(source=self.policy, target=self.p_target),
        ]
        for pair in module_pairs:
            for p, p_targ in zip(pair['source'].parameters(),
                                 pair['target'].parameters()):
                p_targ.data *= self.polyak
                p_targ.data += (1 - self.polyak) * p.data

    def get_icm_loss(self, batch):
        return self.icm(batch['state'], batch['next_state'], batch['action'])

    def get_next_qs(self, batch):
        next_policy_actions = self.p_target(batch['next_state'], batch['goal'])
        return self.q_target(batch['next_state'], next_policy_actions,
                             batch['goal'], batch.get('antigoal', None))

    def get_action_qs(self, batch):
        return self.q_module(batch['state'], batch['action'], batch['goal'],
                             batch.get('antigoal', None))

    def get_policy_loss_and_actions(self, batch):
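        # Deterministic-actor loss: maximize Q of the policy's own actions.
        # q_no_grad, per its name, keeps the critic's parameters out of the
        # gradient so only the policy is updated by this loss.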
        policy_actions = self.policy(batch['state'], batch['goal'])
        p_losses = -self.q_target.q_no_grad(batch['state'], policy_actions,
                                            batch['goal'],
                                            batch.get('antigoal', None))
        p_loss = p_losses.mean()
        return p_loss, policy_actions
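
The soft_update method above is standard Polyak averaging: each call moves the target parameters a small step toward the online ones, so the bootstrap targets computed in get_next_qs change slowly and training stays stable. A self-contained sketch of the same in-place update; the polyak value here is illustrative, not taken from the source:

import torch
import torch.nn as nn

polyak = 0.995  # illustrative; values near 1.0 mean slower target drift

source = nn.Linear(4, 2)
target = nn.Linear(4, 2)
target.load_state_dict(source.state_dict())

with torch.no_grad():
    for p in source.parameters():
        p.add_(0.1)  # stand-in for an optimizer step on the online network
    # The update from soft_update: targ <- polyak * targ + (1 - polyak) * p.
    for p, p_targ in zip(source.parameters(), target.parameters()):
        p_targ.mul_(polyak).add_((1 - polyak) * p)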
Example #6: hierarchical DistanceLearner. Extends the flat learner with low-level policy and value modules and hierarchical agent settings (hi_skip, entropy_lambda_lo, n_lo_epochs).
class DistanceLearner(BaseDistanceLearner):
    AGENT_TYPE = 'HierarchicalDistance'

    def __init__(self,
                 *args,
                 hi_skip=10,
                 entropy_lambda_lo=0.02,
                 n_lo_epochs=1,
                 **kwargs):
        self._hierarchical_agent_kwargs = dict(
            hi_skip=hi_skip,
            entropy_lambda=entropy_lambda_lo,
            n_lo_epochs=n_lo_epochs)
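        # Stashed before super().__init__, which presumably builds the agent
        # via _make_agent and therefore needs these kwargs already set.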
        super().__init__(*args, **kwargs)

        self._lo_parameters = self.agent._lo_parameters

    def create_env(self):
        return Env(**self.env_params)

    def _make_agent_modules(self):
        self.policy = Policy(self._dummy_env, 128, a_range=5, action_size=2)
        self.p_target = Policy(self._dummy_env, 128, a_range=5, action_size=2)
        self.p_target.load_state_dict(self.policy.state_dict())

        self.q_module = Critic(self._dummy_env, 128, a_range=5, action_size=2)
        self.q_target = Critic(self._dummy_env, 128, a_range=5, action_size=2)
        self.q_target.load_state_dict(self.q_module.state_dict())

        self.policy_lo = StochasticPolicy(self._dummy_env, 256, goal_size=2)
        self.v_module_lo = Value(self._dummy_env,
                                 256,
                                 goal_size=2,
                                 use_antigoal=False)

    def _make_agent(self):
        return Agent(env=self.create_env(),
                     policy_lo=self.policy_lo,
                     value_lo=self.v_module_lo,
                     noise=self.noise,
                     epsilon=self.epsilon,
                     policy=self.policy,
                     **self._hierarchical_agent_kwargs)

    def soft_update(self):
        module_pairs = [
            dict(source=self.q_module, target=self.q_target),
            dict(source=self.policy, target=self.p_target),
        ]
        for pair in module_pairs:
            for p, p_targ in zip(pair['source'].parameters(),
                                 pair['target'].parameters()):
                p_targ.data *= self.polyak
                p_targ.data += (1 - self.polyak) * p.data

    def get_next_qs(self, batch):
        next_policy_actions = self.p_target(batch['next_state'], batch['goal'])
        return self.q_target(batch['next_state'], next_policy_actions,
                             batch['goal'], batch.get('antigoal', None))

    def get_action_qs(self, batch):
        return self.q_module(batch['state'], batch['action'], batch['goal'],
                             batch.get('antigoal', None))

    def get_policy_loss_and_actions(self, batch):
        policy_actions = self.policy(batch['state'], batch['goal'])
        p_losses = -self.q_target.q_no_grad(batch['state'], policy_actions,
                                            batch['goal'],
                                            batch.get('antigoal', None))
        p_loss = p_losses.mean()
        return p_loss, policy_actions
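
In both learners, get_policy_loss_and_actions is the usual deterministic-actor objective: the policy is trained to maximize the critic's valuation of its own actions, so the loss is the negated mean Q. A minimal sketch with generic modules; the source's q_no_grad helper presumably stops gradients at the critic's parameters, which this sketch approximates by freezing them:

import torch
import torch.nn as nn

actor = nn.Linear(4, 2)   # state -> action
critic = nn.Linear(6, 1)  # (state, action) -> Q

state = torch.randn(8, 4)
action = actor(state)

# Freeze the critic so this loss produces gradients only for the actor;
# gradients still flow through action back into the actor's weights.
for p in critic.parameters():
    p.requires_grad_(False)

p_loss = -critic(torch.cat([state, action], dim=-1)).mean()
p_loss.backward()  # populates actor.weight.grad and actor.bias.grad

for p in critic.parameters():
    p.requires_grad_(True)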