# Note: these snippets assume Policy, Critic, StochasticPolicy, Value,
# ReparamTrickPolicy, Env, and Agent (plus the Base* learner classes) are
# imported from the corresponding module definitions in the repository.
class SiblingRivalryLearner(BaseSiblingRivalryLearner, DistanceLearner):
    AGENT_TYPE = 'HierarchicalSiblingRivalry'

    def _make_agent_modules(self):
        # High-level policy/critic pair, each with a target copy.
        self.policy = Policy(self._dummy_env, 128, a_range=5, action_size=2)
        self.p_target = Policy(self._dummy_env, 128, a_range=5, action_size=2)
        self.p_target.load_state_dict(self.policy.state_dict())

        self.q_module = Critic(self._dummy_env, 128, a_range=5, action_size=2,
                               use_antigoal=self.use_antigoal)
        self.q_target = Critic(self._dummy_env, 128, a_range=5, action_size=2,
                               use_antigoal=self.use_antigoal)
        self.q_target.load_state_dict(self.q_module.state_dict())

        # Low-level (goal-conditioned) policy and value modules.
        self.policy_lo = StochasticPolicy(self._dummy_env, 256, goal_size=2)
        self.v_module_lo = Value(self._dummy_env, 256, goal_size=2, use_antigoal=False)
def _make_agent_modules(self):
    self.policy = Policy(self._dummy_env, 128)
    self.p_target = Policy(self._dummy_env, 128)
    self.p_target.load_state_dict(self.policy.state_dict())

    self.q_module = Critic(self._dummy_env, 128)
    self.q_target = Critic(self._dummy_env, 128)
    self.q_target.load_state_dict(self.q_module.state_dict())
def _make_agent_modules(self):
    self._make_skill_embedding()
    kwargs = dict(env=self._dummy_env,
                  hidden_size=self.hidden_size,
                  num_layers=self.num_layers,
                  goal_size=self.skill_n,
                  normalize_inputs=self.normalize_inputs)
    self.policy = ReparamTrickPolicy(**kwargs)
    self.v_module = Value(use_antigoal=False, **kwargs)
    self.v_target = Value(use_antigoal=False, **kwargs)
    self.q1 = Critic(use_antigoal=False, **kwargs)
    self.q2 = Critic(use_antigoal=False, **kwargs)
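The twin critics (q1, q2) and the reparameterized policy above suggest a SAC-style update in the base learner. As a point of reference only, here is a minimal sketch (not repository code) of a clipped double-Q soft value target, assuming the policy returns an action together with its log-probability and the critics accept (state, action, goal):

import torch

def soft_value_target(q1, q2, policy, batch, entropy_lambda):
    # Reparameterized action sample and its log-probability (assumed interface).
    actions, log_prob = policy(batch['state'], batch['goal'])
    # Clipped double-Q: element-wise minimum of the two critics.
    q_min = torch.min(q1(batch['state'], actions, batch['goal']),
                      q2(batch['state'], actions, batch['goal']))
    # Soft value target: Q minus the entropy-weighted log-probability.
    return q_min - entropy_lambda * log_prob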
class SiblingRivalryLearner(BaseSiblingRivalryLearner, DistanceLearner):
    def _make_agent_modules(self):
        self.policy = Policy(self._dummy_env, 128)
        self.p_target = Policy(self._dummy_env, 128)
        self.p_target.load_state_dict(self.policy.state_dict())

        self.q_module = Critic(self._dummy_env, 128, use_antigoal=self.use_antigoal)
        self.q_target = Critic(self._dummy_env, 128, use_antigoal=self.use_antigoal)
        self.q_target.load_state_dict(self.q_module.state_dict())
def _make_agent_modules(self):
    self.policy = Policy(self._dummy_env, 128, a_range=5, action_size=2)
    self.p_target = Policy(self._dummy_env, 128, a_range=5, action_size=2)
    self.p_target.load_state_dict(self.policy.state_dict())

    self.q_module = Critic(self._dummy_env, 128, a_range=5, action_size=2)
    self.q_target = Critic(self._dummy_env, 128, a_range=5, action_size=2)
    self.q_target.load_state_dict(self.q_module.state_dict())

    self.policy_lo = StochasticPolicy(self._dummy_env, 256, goal_size=2)
    self.v_module_lo = Value(self._dummy_env, 256, goal_size=2, use_antigoal=False)
class DistanceLearner(BaseDistanceLearner):
    def create_env(self):
        return Env(**self.env_params)

    def _make_agent_modules(self):
        # Online and target copies of the goal-conditioned policy and critic.
        self.policy = Policy(self._dummy_env, 128)
        self.p_target = Policy(self._dummy_env, 128)
        self.p_target.load_state_dict(self.policy.state_dict())

        self.q_module = Critic(self._dummy_env, 128)
        self.q_target = Critic(self._dummy_env, 128)
        self.q_target.load_state_dict(self.q_module.state_dict())

    def _make_agent(self):
        return Agent(noise=self.noise, epsilon=self.epsilon,
                     env=self.create_env(), policy=self.policy)

    def soft_update(self):
        # Polyak-average the target networks toward the online networks.
        module_pairs = [
            dict(source=self.q_module, target=self.q_target),
            dict(source=self.policy, target=self.p_target),
        ]
        for pair in module_pairs:
            for p, p_targ in zip(pair['source'].parameters(), pair['target'].parameters()):
                p_targ.data *= self.polyak
                p_targ.data += (1 - self.polyak) * p.data

    def get_icm_loss(self, batch):
        # self.icm is expected to be set up by the base learner when ICM is enabled.
        return self.icm(batch['state'], batch['next_state'], batch['action'])

    def get_next_qs(self, batch):
        # Target Q-values at the next state under the target policy, for TD bootstrapping.
        next_policy_actions = self.p_target(batch['next_state'], batch['goal'])
        return self.q_target(batch['next_state'], next_policy_actions,
                             batch['goal'], batch.get('antigoal', None))

    def get_action_qs(self, batch):
        # Q-values of the actions actually taken in the batch.
        return self.q_module(batch['state'], batch['action'],
                             batch['goal'], batch.get('antigoal', None))

    def get_policy_loss_and_actions(self, batch):
        # Deterministic-policy-gradient-style loss: negative Q of the policy's actions.
        policy_actions = self.policy(batch['state'], batch['goal'])
        p_losses = -self.q_target.q_no_grad(batch['state'], policy_actions,
                                            batch['goal'], batch.get('antigoal', None))
        p_loss = p_losses.mean()
        return p_loss, policy_actions
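For reference, soft_update above is standard polyak averaging of the target networks toward the online ones. A self-contained sketch with toy torch modules (not repository code):

import torch.nn as nn

source = nn.Linear(4, 2)   # stands in for self.q_module / self.policy
target = nn.Linear(4, 2)   # stands in for self.q_target / self.p_target
target.load_state_dict(source.state_dict())

polyak = 0.995  # illustrative value; the learner reads it from self.polyak
for p, p_targ in zip(source.parameters(), target.parameters()):
    # Exponential moving average: target <- polyak * target + (1 - polyak) * source
    p_targ.data *= polyak
    p_targ.data += (1 - polyak) * p.data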
class DistanceLearner(BaseDistanceLearner):
    AGENT_TYPE = 'HierarchicalDistance'

    def __init__(self, *args, hi_skip=10, entropy_lambda_lo=0.02, n_lo_epochs=1, **kwargs):
        # Keyword arguments forwarded to the hierarchical agent in _make_agent().
        self._hierarchical_agent_kwargs = dict(
            hi_skip=hi_skip,
            entropy_lambda=entropy_lambda_lo,
            n_lo_epochs=n_lo_epochs)
        super().__init__(*args, **kwargs)
        self._lo_parameters = self.agent._lo_parameters

    def create_env(self):
        return Env(**self.env_params)

    def _make_agent_modules(self):
        # High-level policy/critic pair, each with a target copy.
        self.policy = Policy(self._dummy_env, 128, a_range=5, action_size=2)
        self.p_target = Policy(self._dummy_env, 128, a_range=5, action_size=2)
        self.p_target.load_state_dict(self.policy.state_dict())

        self.q_module = Critic(self._dummy_env, 128, a_range=5, action_size=2)
        self.q_target = Critic(self._dummy_env, 128, a_range=5, action_size=2)
        self.q_target.load_state_dict(self.q_module.state_dict())

        # Low-level (goal-conditioned) policy and value modules.
        self.policy_lo = StochasticPolicy(self._dummy_env, 256, goal_size=2)
        self.v_module_lo = Value(self._dummy_env, 256, goal_size=2, use_antigoal=False)

    def _make_agent(self):
        return Agent(env=self.create_env(),
                     policy_lo=self.policy_lo,
                     value_lo=self.v_module_lo,
                     noise=self.noise,
                     epsilon=self.epsilon,
                     policy=self.policy,
                     **self._hierarchical_agent_kwargs)

    def soft_update(self):
        # Polyak-average the high-level target networks toward the online networks.
        module_pairs = [
            dict(source=self.q_module, target=self.q_target),
            dict(source=self.policy, target=self.p_target),
        ]
        for pair in module_pairs:
            for p, p_targ in zip(pair['source'].parameters(), pair['target'].parameters()):
                p_targ.data *= self.polyak
                p_targ.data += (1 - self.polyak) * p.data

    def get_next_qs(self, batch):
        next_policy_actions = self.p_target(batch['next_state'], batch['goal'])
        return self.q_target(batch['next_state'], next_policy_actions,
                             batch['goal'], batch.get('antigoal', None))

    def get_action_qs(self, batch):
        return self.q_module(batch['state'], batch['action'],
                             batch['goal'], batch.get('antigoal', None))

    def get_policy_loss_and_actions(self, batch):
        policy_actions = self.policy(batch['state'], batch['goal'])
        p_losses = -self.q_target.q_no_grad(batch['state'], policy_actions,
                                            batch['goal'], batch.get('antigoal', None))
        p_loss = p_losses.mean()
        return p_loss, policy_actions
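The hooks get_action_qs, get_next_qs, and get_policy_loss_and_actions are presumably consumed by the base learner's update loop. As an illustration only (not the repository's actual training code), a DDPG-style pairing of critic and actor losses, assuming the batch also carries 'reward' and 'terminal' fields and a discount factor gamma:

import torch
import torch.nn.functional as F

def ddpg_style_losses(learner, batch, gamma=0.99):
    # Critic loss: regress Q(s, a, g) toward
    # r + gamma * (1 - done) * Q_target(s', pi_target(s'), g).
    with torch.no_grad():
        target_q = batch['reward'] + gamma * (1 - batch['terminal']) * learner.get_next_qs(batch)
    q_loss = F.mse_loss(learner.get_action_qs(batch), target_q)

    # Actor loss: the learner already returns the negated Q of on-policy actions.
    p_loss, _ = learner.get_policy_loss_and_actions(batch)
    return q_loss, p_loss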