Code example #1
File: mlp.py  Project: Xingyu-Lin/softagent
    def __init__(
            self,
            observation_shape,
            hidden_sizes,
            action_size,
            n_tile=20,
    ):
        super().__init__()
        self._obs_ndim = 1
        self._n_tile = n_tile
        input_dim = int(np.sum(observation_shape))

        self._action_size = action_size
        self.mlp_loc = MlpModel(
            input_size=input_dim,
            hidden_sizes=hidden_sizes,
            output_size=4
        )
        self.mlp_delta = MlpModel(
            input_size=input_dim + 4 * n_tile,
            hidden_sizes=hidden_sizes,
            output_size=3 * 2,
        )

        self.delta_distribution = Gaussian(
            dim=3,
            squash=True,
            min_std=np.exp(MIN_LOG_STD),
            max_std=np.exp(MAX_LOG_STD),
        )
        self.cat_distribution = Categorical(4)

        self._counter = 0
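
Note: the constructor above sizes mlp_delta for an input of input_dim + 4 * n_tile, which suggests the 4-way location output is tiled n_tile times before being concatenated with the observation. A minimal shape-only sketch of that assumption (the forward pass itself is not shown in this excerpt):

import torch

batch, obs_dim, n_tile = 8, 10, 20                     # hypothetical sizes
obs = torch.randn(batch, obs_dim)
loc_one_hot = torch.nn.functional.one_hot(
    torch.randint(0, 4, (batch,)), num_classes=4).float()
delta_input = torch.cat([obs, loc_one_hot.repeat(1, n_tile)], dim=-1)
print(delta_input.shape)                               # torch.Size([8, 90]) == obs_dim + 4 * n_tile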
Code example #2
    def initialize(self,
                   env_spaces,
                   share_memory=False,
                   global_B=1,
                   env_ranks=None):
        _initial_model_state_dict = self.initial_model_state_dict
        self.initial_model_state_dict = None  #! Don't let base agent try to load.
        super().initialize(env_spaces,
                           share_memory,
                           global_B=global_B,
                           env_ranks=env_ranks)
        self.initial_model_state_dict = _initial_model_state_dict

        self.q_model = self.QModelCls(**self.env_model_kwargs,
                                      **self.q_model_kwargs)
        self.target_q_model = self.QModelCls(**self.env_model_kwargs,
                                             **self.q_model_kwargs)
        self.target_q_model.load_state_dict(self.q_model.state_dict())

        if self.initial_model_state_dict is not None and not self.load_model_after_min_steps:
            self.load_state_dict(self.initial_model_state_dict)

        assert len(env_spaces.action.shape) == 1
        self.distribution = Gaussian(
            dim=env_spaces.action.shape[0],
            squash=self.action_squash,
            min_std=np.exp(MIN_LOG_STD),
            max_std=np.exp(MAX_LOG_STD),
        )
        # Tie weights (need to make sure False if not using encoder)
        if self.tie_weights:
            self.model.encoder.copy_conv_weights_from(self.q_model.encoder)
Code example #3
File: ddpg_agent.py  Project: kevinghst/rl_ul
 def initialize(self,
                env_spaces,
                share_memory=False,
                global_B=1,
                env_ranks=None):
     """Instantiates mu and q, and target_mu and target_q models."""
     super().initialize(env_spaces,
                        share_memory,
                        global_B=global_B,
                        env_ranks=env_ranks)
     self.q_model = self.QModelCls(**self.env_model_kwargs,
                                   **self.q_model_kwargs)
     if self.initial_q_model_state_dict is not None:
         self.q_model.load_state_dict(self.initial_q_model_state_dict)
     self.target_model = self.ModelCls(**self.env_model_kwargs,
                                       **self.model_kwargs)
     self.target_q_model = self.QModelCls(**self.env_model_kwargs,
                                          **self.q_model_kwargs)
     self.target_q_model.load_state_dict(self.q_model.state_dict())
     assert len(env_spaces.action.shape) == 1
     self.distribution = Gaussian(
         dim=env_spaces.action.shape[0],
         std=self.action_std,
         noise_clip=self.action_noise_clip,
         clip=env_spaces.action.high[0],  # Assume symmetric low=-high.
     )
Code example #4
File: sac_agent.py  Project: keirp/glamor
 def initialize(self, env_spaces, share_memory=False,
                global_B=1, env_ranks=None):
     _initial_model_state_dict = self.initial_model_state_dict
     # Don't let base agent try to load.
     self.initial_model_state_dict = None
     super().initialize(env_spaces, share_memory,
                        global_B=global_B, env_ranks=env_ranks)
     self.initial_model_state_dict = _initial_model_state_dict
     self.q1_model = self.QModelCls(**self.env_model_kwargs, **self.q_model_kwargs)
     self.q2_model = self.QModelCls(**self.env_model_kwargs, **self.q_model_kwargs)
     self.target_q1_model = self.QModelCls(**self.env_model_kwargs,
                                           **self.q_model_kwargs)
     self.target_q2_model = self.QModelCls(**self.env_model_kwargs,
                                           **self.q_model_kwargs)
     self.target_q1_model.load_state_dict(self.q1_model.state_dict())
     self.target_q2_model.load_state_dict(self.q2_model.state_dict())
     if self.initial_model_state_dict is not None:
         self.load_state_dict(self.initial_model_state_dict)
     assert len(env_spaces.action.shape) == 1
     self.distribution = Gaussian(
         dim=env_spaces.action.shape[0],
         squash=self.action_squash,
         min_std=np.exp(MIN_LOG_STD),
         max_std=np.exp(MAX_LOG_STD),
     )
Code example #5
File: ddpg_agent.py  Project: wwxFromTju/rlpyt
 def initialize(self, env_spaces, share_memory=False):
     env_model_kwargs = self.make_env_to_model_kwargs(env_spaces)
     self.mu_model = self.MuModelCls(**env_model_kwargs,
                                     **self.mu_model_kwargs)
     self.q_model = self.QModelCls(**env_model_kwargs,
                                   **self.q_model_kwargs)
     if share_memory:
         self.mu_model.share_memory()
         # self.q_model.share_memory()  # Not needed for sampling.
         self.shared_mu_model = self.mu_model
         # self.shared_q_model = self.q_model
     if self.initial_mu_model_state_dict is not None:
         self.mu_model.load_state_dict(self.initial_mu_model_state_dict)
     if self.initial_q_model_state_dict is not None:
         self.q_model.load_state_dict(self.initial_q_model_state_dict)
     self.target_mu_model = self.MuModelCls(**env_model_kwargs,
                                            **self.mu_model_kwargs)
     self.target_mu_model.load_state_dict(self.mu_model.state_dict())
     self.target_q_model = self.QModelCls(**env_model_kwargs,
                                          **self.q_model_kwargs)
     self.target_q_model.load_state_dict(self.q_model.state_dict())
     assert len(env_spaces.action.shape) == 1
     self.distribution = Gaussian(
         dim=env_spaces.action.shape[0],
         std=self.action_std,
         noise_clip=self.action_noise_clip,
         clip=env_spaces.action.high[0],  # Assume symmetric low=-high.
     )
     self.env_spaces = env_spaces
     self.env_model_kwargs = env_model_kwargs
Code example #6
 def initialize(self, env_spaces, share_memory=False):
     env_model_kwargs = self.make_env_to_model_kwargs(env_spaces)
     self.q1_model = self.QModelCls(**env_model_kwargs,
                                    **self.q_model_kwargs)
     self.q2_model = self.QModelCls(**env_model_kwargs,
                                    **self.q_model_kwargs)
     self.v_model = self.VModelCls(**env_model_kwargs,
                                   **self.v_model_kwargs)
     self.pi_model = self.PiModelCls(**env_model_kwargs,
                                     **self.pi_model_kwargs)
     if share_memory:
         self.pi_model.share_memory()  # Only one needed for sampling.
         self.shared_pi_model = self.pi_model
     if self.initial_q1_model_state_dict is not None:
         self.q1_model.load_state_dict(self.initial_q1_model_state_dict)
     if self.initial_q2_model_state_dict is not None:
         self.q2_model.load_state_dict(self.initial_q2_model_state_dict)
     if self.initial_v_model_state_dict is not None:
         self.v_model.load_state_dict(self.initial_v_model_state_dict)
     if self.initial_pi_model_state_dict is not None:
         self.pi_model.load_state_dict(self.initial_pi_model_state_dict)
     self.target_v_model = self.VModelCls(**env_model_kwargs,
                                          **self.v_model_kwargs)
     self.target_v_model.load_state_dict(self.v_model.state_dict())
     assert len(env_spaces.action.shape) == 1
     self.distribution = Gaussian(
         dim=env_spaces.action.shape[0],
         squash=self.action_squash,
         min_std=np.exp(MIN_LOG_STD),
         max_std=np.exp(MAX_LOG_STD),
     )
     self.env_spaces = env_spaces
     self.env_model_kwargs = env_model_kwargs
Code example #7
    def initialize(self, env_spaces, share_memory=False,
                   global_B=1, env_ranks=None):
        _initial_model_state_dict = self.initial_model_state_dict
        self.initial_model_state_dict = None
        super().initialize(env_spaces, share_memory,
                           global_B=global_B, env_ranks=env_ranks)
        self.initial_model_state_dict = _initial_model_state_dict
        self.q_models = [self.QModelCls(**self.env_model_kwargs, **self.q_model_kwargs)
                         for _ in range(self.n_qs)]

        self.target_q_models = [self.QModelCls(**self.env_model_kwargs, **self.q_model_kwargs)
                                for _ in range(self.n_qs)]
        for target_q, q in zip(self.target_q_models, self.q_models):
            target_q.load_state_dict(q.state_dict())

        self.log_alpha = nn.Parameter(torch.tensor(0.0, dtype=torch.float32))

        if self.initial_model_state_dict is not None:
            self.load_state_dict(self.initial_model_state_dict)
        assert len(env_spaces.action.shape) == 1
        self.distribution = Gaussian(
            dim=env_spaces.action.shape[0],
            squash=self.action_squash,
            min_std=np.exp(MIN_LOG_STD),
            max_std=np.exp(MAX_LOG_STD),
        )
Code example #8
 def optim_initialize(self, rank=0):
     """Called by async runner."""
     self.rank = rank
     self.pi_optimizer = self.OptimCls(self.agent.pi_parameters(),
                                       lr=self.learning_rate,
                                       **self.optim_kwargs)
     self.q1_optimizer = self.OptimCls(self.agent.q1_parameters(),
                                       lr=self.learning_rate,
                                       **self.optim_kwargs)
     self.q2_optimizer = self.OptimCls(self.agent.q2_parameters(),
                                       lr=self.learning_rate,
                                       **self.optim_kwargs)
     self._log_alpha = torch.zeros(1, requires_grad=True)
     self._alpha = torch.exp(self._log_alpha.detach())
     self.alpha_optimizer = self.OptimCls((self._log_alpha, ),
                                          lr=self.learning_rate,
                                          **self.optim_kwargs)
     if self.target_entropy == "auto":
         self.target_entropy = -np.prod(self.agent.env_spaces.action.shape)
     if self.initial_optim_state_dict is not None:
         self.load_optim_state_dict(self.initial_optim_state_dict)
     if self.action_prior == "gaussian":
          self.action_prior_distribution = Gaussian(
              dim=np.prod(self.agent.env_spaces.action.shape), std=1.)
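
The log-alpha parameterization above keeps the entropy temperature strictly positive without a constrained optimizer: log(alpha) is the trainable leaf, and alpha is recomputed from its detached value after each step. A minimal sketch with illustrative values:

import torch

log_alpha = torch.zeros(1, requires_grad=True)    # trainable leaf, starts at alpha = 1.0
alpha = log_alpha.detach().exp()                  # used as a constant inside the losses
alpha_optimizer = torch.optim.Adam([log_alpha], lr=3e-4)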
Code example #9
File: sac_discrete.py  Project: alishobeiri/gfootball
    def optim_initialize(self, rank=0):
        """Called in initilize or by async runner after forking sampler."""
        self.rank = rank
        self.pi_optimizer = self.OptimCls(self.agent.pi_parameters(),
                                          lr=self.learning_rate,
                                          **self.optim_kwargs)
        self.q1_optimizer = self.OptimCls(self.agent.q1_parameters(),
                                          lr=self.learning_rate,
                                          **self.optim_kwargs)
        self.q2_optimizer = self.OptimCls(self.agent.q2_parameters(),
                                          lr=self.learning_rate,
                                          **self.optim_kwargs)
        if self.fixed_alpha is None:
            self.target_entropy = -np.log(
                (1.0 / self.agent.env_spaces.action.n)) * 0.98
            self._log_alpha = torch.zeros(1, requires_grad=True)
            self._alpha = self._log_alpha.exp()
            self.alpha_optimizer = self.OptimCls((self._log_alpha, ),
                                                 lr=self.learning_rate,
                                                 **self.optim_kwargs)
        else:
            self._log_alpha = torch.tensor([np.log(self.fixed_alpha)])
            self._alpha = torch.tensor([self.fixed_alpha])
            self.alpha_optimizer = None
        if self.target_entropy == "auto":
            self.target_entropy = -np.prod(self.agent.env_spaces.action.n)

        if self.initial_optim_state_dict is not None:
            self.load_optim_state_dict(self.initial_optim_state_dict)
        if self.action_prior == "gaussian":
            self.action_prior_distribution = Gaussian(
                dim=np.prod(self.agent.env_spaces.action.shape), std=1.)
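
The two optim_initialize variants above use different heuristics when target_entropy is "auto": the continuous-action case takes the negative of the action dimensionality, while the discrete case takes 98% of the entropy of a uniform policy. A small numeric illustration (the action sizes are assumptions for the example only):

import numpy as np

continuous_action_shape = (6,)                                       # assumed 6-dim continuous action
discrete_n_actions = 19                                              # assumed discrete action count
target_entropy_continuous = -np.prod(continuous_action_shape)        # -6
target_entropy_discrete = -np.log(1.0 / discrete_n_actions) * 0.98   # ~2.89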
Code example #10
File: mlp.py  Project: Xingyu-Lin/softagent
    def __init__(
            self,
            observation_shape,
            hidden_sizes,
            action_size,
            all_corners=False
            ):
        super().__init__()
        self._obs_ndim = 1
        self._all_corners = all_corners
        input_dim = int(np.sum(observation_shape))

        print('all corners', self._all_corners)
        delta_dim = 12 if all_corners else 3
        self._delta_dim = delta_dim
        self.mlp = MlpModel(
            input_size=input_dim,
            hidden_sizes=hidden_sizes,
            output_size=2 * delta_dim + 4,  # delta_dim values each for mean and std, plus 4 category probs
        )

        self.delta_distribution = Gaussian(
            dim=delta_dim,
            squash=True,
            min_std=np.exp(MIN_LOG_STD),
            max_std=np.exp(MAX_LOG_STD),
        )
        self.cat_distribution = Categorical(4)
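
The single MLP head above outputs 2 * delta_dim + 4 values, presumably split downstream into delta means, delta log-stds, and the 4 categorical logits. A hypothetical split (the forward pass is not part of this excerpt):

import torch

delta_dim = 3
out = torch.randn(8, 2 * delta_dim + 4)                              # batch of raw MLP outputs
mu, log_std, cat_logits = out.split([delta_dim, delta_dim, 4], dim=-1)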
Code example #11
File: gaussian.py  Project: wwxFromTju/rlpyt
 def initialize(self, env_spaces, share_memory=False):
     super().initialize(env_spaces, share_memory)
     assert len(env_spaces.action.shape) == 1
     self.distribution = Gaussian(
         dim=env_spaces.action.shape[0],
         # min_std=MIN_STD,
         # clip=env_spaces.action.high[0],  # Probably +1?
     )
Code example #12
File: gaussian.py  Project: e-271/rlpyt
 def initialize(self,
                env_spaces,
                share_memory=False,
                global_B=1,
                env_ranks=None):
     super().initialize(env_spaces, share_memory)
     assert len(env_spaces.action.shape) == 1
     self.distribution = Gaussian(
         dim=env_spaces.action.shape[0],
         # min_std=MIN_STD,
         clip=env_spaces.action.high[0],  # Probably +1?
     )
Code example #13
 def initialize(self, env_spaces, share_memory=False,
         global_B=1, env_ranks=None):
     super().initialize(env_spaces, share_memory,
         global_B=global_B, env_ranks=env_ranks)
     assert len(env_spaces.action.shape) == 1
     # assert len(np.unique(env_spaces.action.high)) == 1
     # assert np.all(env_spaces.action.low == -env_spaces.action.high)
     self.distribution = Gaussian(
         dim=env_spaces.action.shape[0],
         # min_std=MIN_STD,
         # clip=env_spaces.action.high[0],  # Probably +1?
     )
Code example #14
File: oc.py  Project: DavidSlayback/rlpyt
 def initialize(self,
                env_spaces,
                share_memory=False,
                global_B=1,
                env_ranks=None):
     super().initialize(env_spaces, share_memory)
     assert len(env_spaces.action.shape) == 1
     self.distribution = Gaussian(
         dim=env_spaces.action.shape[0],
         # min_std=MIN_STD,
         # clip=env_spaces.action.high[0],  # Probably +1?
     )
     self.distribution_omega = Categorical(
         dim=self.model_kwargs["option_size"])
Code example #15
 def optim_initialize(self, rank=0):
     """Called by async runner."""
     self.rank = rank
     self.pi_optimizer = self.OptimCls(self.agent.pi_parameters(),
         lr=self.learning_rate, **self.optim_kwargs)
     self.q_optimizers = [self.OptimCls(q_param)
                          for q_param in self.agent.q_parameters()]
     self.alpha_optimizer = self.OptimCls([self.agent.log_alpha],
         lr=self.learning_rate, **self.optim_kwargs)
     if self.initial_optim_state_dict is not None:
         self.pi_optimizer.load_state_dict(self.initial_optim_state_dict)
     if self.action_prior == "gaussian":
         self.action_prior_distribution = Gaussian(
             dim=self.agent.env_spaces.action.size, std=1.)
Code example #16
File: td3_agent.py  Project: nirbhayjm/rlpyt
 def initialize(self, env_spaces, share_memory=False, global_B=1, env_ranks=None):
     super().initialize(env_spaces, share_memory, global_B, env_ranks)
     self.q2_model = self.QModelCls(**self.env_model_kwargs, **self.q_model_kwargs)
     if self.initial_q2_model_state_dict is not None:
         self.q2_model.load_state_dict(self.initial_q2_model_state_dict)
     self.target_q2_model = self.QModelCls(
         **self.env_model_kwargs, **self.q_model_kwargs
     )
     self.target_q2_model.load_state_dict(self.q2_model.state_dict())
     self.target_distribution = Gaussian(
         dim=env_spaces.action.shape[0],
         std=self.target_noise_std,
         noise_clip=self.target_noise_clip,
         clip=env_spaces.action.high[0],  # Assume symmetric low=-high.
     )
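
The target_distribution above (a Gaussian with std, noise_clip, and clip) corresponds to TD3-style target policy smoothing. A generic sketch of that idea in plain torch, not rlpyt's exact Gaussian internals:

import torch

target_noise_std, target_noise_clip, action_clip = 0.2, 0.5, 1.0     # illustrative values
mu = torch.zeros(8, 6)                                                # hypothetical target-policy means
noise = (torch.randn_like(mu) * target_noise_std).clamp(-target_noise_clip, target_noise_clip)
smoothed_action = (mu + noise).clamp(-action_clip, action_clip)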
Code example #17
File: gaussian.py  Project: DavidSlayback/rlpyt
class GaussianPgAgent(BaseAgent):
    """
    Agent for policy gradient algorithm using Gaussian action distribution.
    """
    def __call__(self, observation, prev_action, prev_reward, device='cpu'):
        """Performs forward pass on training data, for algorithm."""
        model_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        mu, log_std, value = self.model(*model_inputs)
        return buffer_to((DistInfoStd(mean=mu, log_std=log_std), value),
                         device=device)

    def initialize(self,
                   env_spaces,
                   share_memory=False,
                   global_B=1,
                   env_ranks=None):
        """Extends base method to build Gaussian distribution."""
        super().initialize(env_spaces,
                           share_memory,
                           global_B=global_B,
                           env_ranks=env_ranks)
        assert len(env_spaces.action.shape) == 1
        assert len(np.unique(env_spaces.action.high)) == 1
        assert np.all(env_spaces.action.low == -env_spaces.action.high)
        self.distribution = Gaussian(
            dim=env_spaces.action.shape[0],
            # min_std=MIN_STD,
            # clip=env_spaces.action.high[0],  # Probably +1?
        )

    @torch.no_grad()
    def step(self, observation, prev_action, prev_reward, device="cpu"):
        """
        Compute policy's action distribution from inputs, and sample an
        action. Calls the model to produce mean, log_std, and value estimate.
        Moves inputs to device and returns outputs back to CPU, for the
        sampler.  (no grad)
        """
        model_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        mu, log_std, value = self.model(*model_inputs)
        dist_info = DistInfoStd(mean=mu, log_std=log_std)
        action = self.distribution.sample(dist_info)
        agent_info = AgentInfo(dist_info=dist_info, value=value)
        action, agent_info = buffer_to((action, agent_info), device=device)
        return AgentStep(action=action, agent_info=agent_info)

    @torch.no_grad()
    def value(self, observation, prev_action, prev_reward, device="cpu"):
        """
        Compute the value estimate for the environment state, e.g. for the
        bootstrap value, V(s_{T+1}), in the sampler.  (no grad)
        """
        model_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        _mu, _log_std, value = self.model(*model_inputs)
        return value.to(device)
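
In step() above, distribution.sample(DistInfoStd(mean=mu, log_std=log_std)) draws from a diagonal Gaussian parameterized by mean and log standard deviation. Conceptually (ignoring optional squashing, clipping, and std bounds), the draw amounts to:

import torch

mu = torch.zeros(8, 6)                      # hypothetical action means
log_std = torch.full((8, 6), -1.0)          # hypothetical log standard deviations
action = mu + torch.randn_like(mu) * log_std.exp()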
Code example #18
    def initialize(self, agent, n_itr, batch_spec, mid_batch_reset, examples):
        if agent.recurrent:
            raise NotImplementedError
        self.agent = agent
        self.n_itr = n_itr
        self.mid_batch_reset = mid_batch_reset
        self.optimizer = self.OptimCls(agent.parameters(),
                                       lr=self.learning_rate,
                                       **self.optim_kwargs)
        if self.initial_optim_state_dict is not None:
            self.optimizer.load_state_dict(self.initial_optim_state_dict)

        sample_bs = batch_spec.size
        train_bs = self.batch_size
        assert (self.training_ratio * sample_bs) % train_bs == 0
        self.updates_per_optimize = int(
            (self.training_ratio * sample_bs) // train_bs)
        logger.log(
            f"From sampler batch size {sample_bs}, training "
            f"batch size {train_bs}, and training ratio "
            f"{self.training_ratio}, computed {self.updates_per_optimize} "
            f"updates per iteration.")
        self.min_itr_learn = self.min_steps_learn // sample_bs
        self.agent.give_min_itr_learn(self.min_itr_learn)

        example_to_buffer = SamplesToBuffer(
            observation=examples["observation"],
            action=examples["action"],
            reward=examples["reward"],
            done=examples["done"],
        )
        replay_kwargs = dict(
            example=example_to_buffer,
            size=self.replay_size,
            B=batch_spec.B,
            n_step_return=self.n_step_return,
        )
        self.replay_buffer = UniformReplayBuffer(**replay_kwargs)

        if self.action_prior == "gaussian":
            self.action_prior_distribution = Gaussian(
                dim=agent.env_spaces.action.size, std=1.)
Code example #19
File: oc.py  Project: DavidSlayback/rlpyt
 def initialize(self,
                env_spaces,
                share_memory=False,
                global_B=1,
                env_ranks=None):
     """Extends base method to build Gaussian distribution."""
     super().initialize(env_spaces,
                        share_memory,
                        global_B=global_B,
                        env_ranks=env_ranks)
     assert len(env_spaces.action.shape) == 1
     assert len(np.unique(env_spaces.action.high)) == 1
     assert np.all(env_spaces.action.low == -env_spaces.action.high)
     self.distribution = Gaussian(
         dim=env_spaces.action.shape[0],
         # min_std=MIN_STD,
         # clip=env_spaces.action.high[0],  # Probably +1?
     )
     self.distribution_omega = Categorical(
         dim=self.model_kwargs["option_size"])
Code example #20
class RecurrentGaussianPgAgentBase(BaseAgent):
    def __call__(self, observation, prev_action, prev_reward, init_rnn_state):
        # Assume init_rnn_state already shaped: [N,B,H]
        model_inputs = buffer_to(
            (observation, prev_action, prev_reward, init_rnn_state),
            device=self.device)
        mu, log_std, value, next_rnn_state = self.model(*model_inputs)
        dist_info, value = buffer_to(
            (DistInfoStd(mean=mu, log_std=log_std), value), device="cpu")
        return dist_info, value, next_rnn_state  # Leave rnn_state on device.

    def initialize(self,
                   env_spaces,
                   share_memory=False,
                   global_B=1,
                   env_ranks=None):
        super().initialize(env_spaces, share_memory)
        assert len(env_spaces.action.shape) == 1
        self.distribution = Gaussian(
            dim=env_spaces.action.shape[0],
            # min_std=MIN_STD,
            # clip=env_spaces.action.high[0],  # Probably +1?
        )

    @torch.no_grad()
    def step(self, observation, prev_action, prev_reward):
        agent_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        mu, log_std, value, rnn_state = self.model(*agent_inputs,
                                                   self.prev_rnn_state)
        dist_info = DistInfoStd(mean=mu, log_std=log_std)
        action = self.distribution.sample(dist_info)
        # Model handles None, but Buffer does not, make zeros if needed:
        prev_rnn_state = self.prev_rnn_state or buffer_func(
            rnn_state, torch.zeros_like)
        # Transpose the rnn_state from [N,B,H] --> [B,N,H] for storage.
        # (Special case: model should always leave B dimension in.)
        prev_rnn_state = buffer_method(prev_rnn_state, "transpose", 0, 1)
        agent_info = AgentInfoRnn(dist_info=dist_info,
                                  value=value,
                                  prev_rnn_state=prev_rnn_state)
        action, agent_info = buffer_to((action, agent_info), device="cpu")
        self.advance_rnn_state(rnn_state)  # Keep on device.
        return AgentStep(action=action, agent_info=agent_info)

    @torch.no_grad()
    def value(self, observation, prev_action, prev_reward):
        agent_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        _mu, _log_std, value, _rnn_state = self.model(*agent_inputs,
                                                      self.prev_rnn_state)
        return value.to("cpu")
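
The transpose comment above refers to storage layout: the model keeps recurrent state as [N,B,H] (layers, batch, hidden), while the sample buffer stores it batch-leading. A shape-only illustration:

import torch

rnn_state = torch.zeros(1, 4, 64)           # [N, B, H] as returned by the model
stored = rnn_state.transpose(0, 1)          # [B, N, H] for buffer storage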
Code example #21
    def initialize(self,
                   env_spaces,
                   share_memory=False,
                   global_B=1,
                   env_ranks=None):
        super(SacAgent, self).initialize(env_spaces,
                                         share_memory,
                                         global_B=global_B,
                                         env_ranks=env_ranks)

        self.target_model = self.ModelCls(**self.env_model_kwargs,
                                          **self.model_kwargs)
        self.target_model.load_state_dict(self.model.state_dict())
        if self.initial_model_state_dict is not None:
            self.load_state_dict(self.initial_model_state_dict)
        assert len(env_spaces.action.shape) == 1
        self.distribution = Gaussian(
            dim=env_spaces.action.shape[0],
            squash=self.action_squash,
            min_std=np.exp(MIN_LOG_STD),
            max_std=np.exp(MAX_LOG_STD),
        )
Code example #22
    def optim_initialize(self, rank=0):
        """Called in initilize or by async runner after forking sampler."""
        self.rank = rank

        # Be very explicit about which parameters are optimized where.
        self.pi_optimizer = self.OptimCls(
            chain(
                self.agent.pi_fc1.parameters(),  # No conv.
                self.agent.pi_mlp.parameters(),
            ),
            lr=self.pi_lr,
            betas=(self.pi_beta, 0.999),
        )
        self.q_optimizer = self.OptimCls(
            chain(
                () if self.stop_conv_grad else self.agent.conv.parameters(),
                self.agent.q_fc1.parameters(),
                self.agent.q_mlps.parameters(),
            ),
            lr=self.q_lr,
            betas=(self.q_beta, 0.999),
        )

        self._log_alpha = torch.tensor(np.log(self.alpha_init), requires_grad=True)
        self._alpha = torch.exp(self._log_alpha.detach())
        self.alpha_optimizer = self.OptimCls(
            (self._log_alpha,), lr=self.alpha_lr, betas=(self.alpha_beta, 0.999)
        )

        if self.target_entropy == "auto":
            self.target_entropy = -np.prod(self.agent.env_spaces.action.shape)
        if self.initial_optim_state_dict is not None:
            self.load_optim_state_dict(self.initial_optim_state_dict)
        if self.action_prior == "gaussian":
            self.action_prior_distribution = Gaussian(
                dim=np.prod(self.agent.env_spaces.action.shape), std=1.0
            )
Code example #23
class MultiAgentGaussianPgAgent(BaseAgent):
    def __call__(self, observation, prev_action, prev_reward):
        model_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        mu, log_std, value = self.model(*model_inputs)

        samples = (DistInfoStd(mean=mu, log_std=log_std), value)
        return buffer_to(samples, device="cpu")

    def initialize(self,
                   env_spaces,
                   share_memory=False,
                   global_B=1,
                   env_ranks=None):
        super().initialize(env_spaces,
                           share_memory,
                           global_B=global_B,
                           env_ranks=env_ranks)

        for _a_space in env_spaces.action.space:
            assert len(_a_space.shape) == 1
            # assert len(np.unique(_a_space.high)) == 1
            assert np.all(_a_space.low == -_a_space.high)

        self.distribution = Gaussian(
            dim=env_spaces.action.shape[-1],
            # min_std=MIN_STD,
            # clip=env_spaces.action.high[0],  # Probably +1?
        )

    @torch.no_grad()
    def step(self, observation, prev_action, prev_reward):
        model_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        mu, log_std, value = self.model(*model_inputs)
        # import pdb; pdb.set_trace()

        dist_info = DistInfoStd(mean=mu, log_std=log_std)
        action = self.distribution.sample(dist_info)
        agent_info = AgentInfo(dist_info=dist_info, value=value)
        action, agent_info = buffer_to((action, agent_info), device="cpu")
        return AgentStep(action=action, agent_info=agent_info)

    @torch.no_grad()
    def value(self, observation, prev_action, prev_reward):
        model_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        _mu, _log_std, value = self.model(*model_inputs)
        return value.to("cpu")
Code example #24
 def initialize(self,
                env_spaces,
                share_memory=False,
                global_B=1,
                env_ranks=None):
     """Extends base method to build Gaussian distribution."""
     if not ((env_spaces.action.high == 1).all()
             and (env_spaces.action.low == -1).all()):
         raise ValueError("The space for all actions should be [-1, 1].")
     super().initialize(env_spaces,
                        share_memory,
                        global_B=global_B,
                        env_ranks=env_ranks)
     self.distribution = Gaussian(dim=env_spaces.action.shape[0],
                                  min_std=1e-6,
                                  max_std=1)
Code example #25
File: gaussian.py  Project: wwxFromTju/rlpyt
class GaussianPgAgent(BasePgAgent):
    def __call__(self, observation, prev_action, prev_reward):
        model_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        mu, log_std, value = self.model(*model_inputs)
        return buffer_to((DistInfoStd(mean=mu, log_std=log_std), value),
                         device="cpu")

    def initialize(self, env_spaces, share_memory=False):
        super().initialize(env_spaces, share_memory)
        assert len(env_spaces.action.shape) == 1
        assert len(np.unique(env_spaces.action.high)) == 1
        assert np.all(env_spaces.action.low == -env_spaces.action.high)
        self.distribution = Gaussian(
            dim=env_spaces.action.shape[0],
            # min_std=MIN_STD,
            # clip=env_spaces.action.high[0],  # Probably +1?
        )

    @torch.no_grad()
    def step(self, observation, prev_action, prev_reward):
        model_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        mu, log_std, value = self.model(*model_inputs)
        dist_info = DistInfoStd(mean=mu, log_std=log_std)
        action = self.distribution.sample(dist_info)
        agent_info = AgentInfo(dist_info=dist_info, value=value)
        action, agent_info = buffer_to((action, agent_info), device="cpu")
        return AgentStep(action=action, agent_info=agent_info)

    @torch.no_grad()
    def value(self, observation, prev_action, prev_reward):
        model_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        _mu, _log_std, value = self.model(*model_inputs)
        return value.to("cpu")
Code example #26
File: sac_discrete.py  Project: alishobeiri/gfootball
class SACDiscrete(RlAlgorithm):
    """Soft actor critic algorithm, training from a replay buffer."""

    opt_info_fields = tuple(f for f in OptInfo._fields)  # copy

    def __init__(
            self,
            discount=0.99,
            batch_size=256,
            min_steps_learn=int(1e4),
            replay_size=int(1e6),
            replay_ratio=256,  # data_consumption / data_generation
            target_update_tau=0.005,  # tau=1 for hard update.
            target_update_interval=1,  # 1000 for hard update, 1 for soft.
            learning_rate=3e-4,
            fixed_alpha=None,  # None for adaptive alpha, float for any fixed value
            OptimCls=torch.optim.Adam,
            optim_kwargs=None,
            initial_optim_state_dict=None,  # for all of them.
            action_prior="uniform",  # or "gaussian"
            reward_scale=1,
            target_entropy="auto",  # "auto", float, or None
            reparameterize=True,
            clip_grad_norm=1e9,
            # policy_output_regularization=0.001,
            n_step_return=1,
            updates_per_sync=1,  # For async mode only.
            bootstrap_timelimit=False,
            ReplayBufferCls=None,  # Leave None to select by above options.
    ):
        """Save input arguments."""
        if optim_kwargs is None:
            optim_kwargs = dict()
        assert action_prior in ["uniform", "gaussian"]
        self._batch_size = batch_size
        del batch_size  # Property.
        save__init__args(locals())

    def initialize(self,
                   agent,
                   n_itr,
                   batch_spec,
                   mid_batch_reset,
                   examples,
                   world_size=1,
                   rank=0):
        """Stores input arguments and initializes replay buffer and optimizer.
        Use in non-async runners.  Computes number of gradient updates per
        optimization iteration as `(replay_ratio * sampler-batch-size /
        training-batch_size)`."""
        self.agent = agent
        self.n_itr = n_itr
        self.mid_batch_reset = mid_batch_reset
        self.sampler_bs = sampler_bs = batch_spec.size
        self.updates_per_optimize = int(self.replay_ratio * sampler_bs /
                                        self.batch_size)
        logger.log(
            f"From sampler batch size {sampler_bs}, training "
            f"batch size {self.batch_size}, and replay ratio "
            f"{self.replay_ratio}, computed {self.updates_per_optimize} "
            f"updates per iteration.")
        self.min_itr_learn = self.min_steps_learn // sampler_bs
        agent.give_min_itr_learn(self.min_itr_learn)
        self.initialize_replay_buffer(examples, batch_spec)
        self.optim_initialize(rank)

    def async_initialize(self,
                         agent,
                         sampler_n_itr,
                         batch_spec,
                         mid_batch_reset,
                         examples,
                         world_size=1):
        """Used in async runner only; returns replay buffer allocated in shared
        memory, does not instantiate optimizer. """
        self.agent = agent
        self.n_itr = sampler_n_itr
        self.initialize_replay_buffer(examples, batch_spec, async_=True)
        self.mid_batch_reset = mid_batch_reset
        self.sampler_bs = sampler_bs = batch_spec.size
        self.updates_per_optimize = self.updates_per_sync
        self.min_itr_learn = int(self.min_steps_learn // sampler_bs)
        agent.give_min_itr_learn(self.min_itr_learn)
        return self.replay_buffer

    def optim_initialize(self, rank=0):
        """Called in initilize or by async runner after forking sampler."""
        self.rank = rank
        self.pi_optimizer = self.OptimCls(self.agent.pi_parameters(),
                                          lr=self.learning_rate,
                                          **self.optim_kwargs)
        self.q1_optimizer = self.OptimCls(self.agent.q1_parameters(),
                                          lr=self.learning_rate,
                                          **self.optim_kwargs)
        self.q2_optimizer = self.OptimCls(self.agent.q2_parameters(),
                                          lr=self.learning_rate,
                                          **self.optim_kwargs)
        if self.fixed_alpha is None:
            self.target_entropy = -np.log(
                (1.0 / self.agent.env_spaces.action.n)) * 0.98
            self._log_alpha = torch.zeros(1, requires_grad=True)
            self._alpha = self._log_alpha.exp()
            self.alpha_optimizer = self.OptimCls((self._log_alpha, ),
                                                 lr=self.learning_rate,
                                                 **self.optim_kwargs)
        else:
            self._log_alpha = torch.tensor([np.log(self.fixed_alpha)])
            self._alpha = torch.tensor([self.fixed_alpha])
            self.alpha_optimizer = None
        if self.target_entropy == "auto":
            self.target_entropy = -np.prod(self.agent.env_spaces.action.n)

        if self.initial_optim_state_dict is not None:
            self.load_optim_state_dict(self.initial_optim_state_dict)
        if self.action_prior == "gaussian":
            self.action_prior_distribution = Gaussian(
                dim=np.prod(self.agent.env_spaces.action.shape), std=1.)

    def initialize_replay_buffer(self, examples, batch_spec, async_=False):
        """
        Allocates replay buffer using examples and with the fields in `SamplesToBuffer`
        namedarraytuple.
        """
        example_to_buffer = SamplesToBuffer(
            observation=examples["observation"],
            action=examples["action"],
            reward=examples["reward"],
            done=examples["done"],
        )
        if not self.bootstrap_timelimit:
            ReplayCls = AsyncUniformReplayBuffer if async_ else UniformReplayBuffer
        else:
            example_to_buffer = SamplesToBufferTl(
                *example_to_buffer, timeout=examples["env_info"].timeout)
            ReplayCls = AsyncTlUniformReplayBuffer if async_ else TlUniformReplayBuffer
        replay_kwargs = dict(
            example=example_to_buffer,
            size=self.replay_size,
            B=batch_spec.B,
            n_step_return=self.n_step_return,
        )
        if self.ReplayBufferCls is not None:
            ReplayCls = self.ReplayBufferCls
            logger.log(
                f"WARNING: ignoring internal selection logic and using"
                f" input replay buffer class: {ReplayCls} -- compatibility not"
                " guaranteed.")
        self.replay_buffer = ReplayCls(**replay_kwargs)

    def optimize_agent(self, itr, samples=None, sampler_itr=None):
        """
        Extracts the needed fields from input samples and stores them in the 
        replay buffer.  Then samples from the replay buffer to train the agent
        by gradient updates (with the number of updates determined by replay
        ratio, sampler batch size, and training batch size).
        """
        itr = itr if sampler_itr is None else sampler_itr  # Async uses sampler_itr.
        if samples is not None:
            samples_to_buffer = self.samples_to_buffer(samples)
            self.replay_buffer.append_samples(samples_to_buffer)
        opt_info = OptInfo(*([] for _ in range(len(OptInfo._fields))))
        if itr < self.min_itr_learn:
            return opt_info
        for _ in range(self.updates_per_optimize):
            samples_from_replay = self.replay_buffer.sample_batch(
                self.batch_size)
            losses, values = self.loss(samples_from_replay)
            q1_loss, q2_loss, pi_loss, alpha_loss = losses

            if alpha_loss is not None:
                self.alpha_optimizer.zero_grad()
                alpha_loss.backward()
                self.alpha_optimizer.step()
                self._alpha = torch.exp(self._log_alpha.detach())

            self.pi_optimizer.zero_grad()
            pi_loss.backward()
            pi_grad_norm = torch.nn.utils.clip_grad_norm_(
                self.agent.pi_parameters(), self.clip_grad_norm)
            self.pi_optimizer.step()

            # Step Q's last because pi_loss.backward() uses them?
            self.q1_optimizer.zero_grad()
            q1_loss.backward()
            q1_grad_norm = torch.nn.utils.clip_grad_norm_(
                self.agent.q1_parameters(), self.clip_grad_norm)
            self.q1_optimizer.step()

            self.q2_optimizer.zero_grad()
            q2_loss.backward()
            q2_grad_norm = torch.nn.utils.clip_grad_norm_(
                self.agent.q2_parameters(), self.clip_grad_norm)
            self.q2_optimizer.step()

            grad_norms = (q1_grad_norm, q2_grad_norm, pi_grad_norm)

            self.append_opt_info_(opt_info, losses, grad_norms, values)
            self.update_counter += 1
            if self.update_counter % self.target_update_interval == 0:
                self.agent.update_target(self.target_update_tau)

        return opt_info

    def samples_to_buffer(self, samples):
        """Defines how to add data from sampler into the replay buffer. Called
        in optimize_agent() if samples are provided to that method."""
        samples_to_buffer = SamplesToBuffer(
            observation=samples.env.observation,
            action=samples.agent.action,
            reward=samples.env.reward,
            done=samples.env.done,
        )
        if self.bootstrap_timelimit:
            samples_to_buffer = SamplesToBufferTl(
                *samples_to_buffer, timeout=samples.env.env_info.timeout)
        return samples_to_buffer

    def loss(self, samples):
        """
        Computes losses for twin Q-values against the min of twin target Q-values
        and an entropy term.  Computes reparameterized policy loss, and loss for
        tuning entropy weighting, alpha.  
        
        Input samples have leading batch dimension [B,..] (but not time).
        """
        agent_inputs, target_inputs, action = buffer_to(
            (samples.agent_inputs, samples.target_inputs, samples.action))

        if self.mid_batch_reset and not self.agent.recurrent:
            valid = torch.ones_like(samples.done, dtype=torch.float)  # or None
        else:
            valid = valid_from_done(samples.done)
        if self.bootstrap_timelimit:
            # To avoid non-use of bootstrap when environment is 'done' due to
            # time-limit, turn off training on these samples.
            valid *= (1 - samples.timeout_n.float())

        with torch.no_grad():
            target_action, target_action_probs, target_log_pi, _ = self.agent.pi(
                *target_inputs)
            target_q1, target_q2 = self.agent.target_q(*target_inputs,
                                                       target_action)
            min_target_q = torch.min(target_q1, target_q2)
            target_value = target_action_probs * (min_target_q -
                                                  self._alpha * target_log_pi)
            target_value = target_value.sum(dim=1).unsqueeze(-1)
            disc = self.discount**self.n_step_return
            y = self.reward_scale * samples.return_ + (
                1 - samples.done_n.float()) * disc * target_value

        q1, q2 = self.agent.q(*agent_inputs, action)
        q1 = torch.gather(q1, 1, action.unsqueeze(1).long())
        q2 = torch.gather(q2, 1, action.unsqueeze(1).long())

        q1_loss = 0.5 * valid_mean((y - q1)**2, valid)
        q2_loss = 0.5 * valid_mean((y - q2)**2, valid)

        action, action_probs, log_pi, _ = self.agent.pi(*agent_inputs)
        q1_pi, q2_pi = self.agent.q(*agent_inputs, action)
        min_pi_target = torch.min(q1_pi, q2_pi)
        inside_term = self._alpha * log_pi - min_pi_target
        policy_loss = (action_probs * inside_term).sum(dim=1).mean()
        log_pi = torch.sum(log_pi * action_probs, dim=1)

        # if self.policy_output_regularization > 0:
        #     pi_losses += self.policy_output_regularization * torch.mean(
        #         0.5 * pi_mean ** 2 + 0.5 * pi_log_std ** 2, dim=-1)
        pi_loss = valid_mean(policy_loss, valid)

        if self.target_entropy is not None and self.fixed_alpha is None:
            alpha_losses = -self._log_alpha * (log_pi.detach() +
                                               self.target_entropy)
            alpha_loss = valid_mean(alpha_losses, valid)
        else:
            alpha_loss = None

        losses = (q1_loss, q2_loss, pi_loss, alpha_loss)
        values = tuple(val.detach() for val in (q1, q2, action_probs))
        return losses, values

    def get_action_prior(self, action):
        if self.action_prior == "uniform":
            prior_log_pi = 0.0
        elif self.action_prior == "gaussian":
            prior_log_pi = self.action_prior_distribution.log_likelihood(
                action, GaussianDistInfo(mean=torch.zeros_like(action)))
        return prior_log_pi

    def append_opt_info_(self, opt_info, losses, grad_norms, values):
        """In-place."""
        q1_loss, q2_loss, pi_loss, alpha_loss = losses
        q1_grad_norm, q2_grad_norm, pi_grad_norm = grad_norms
        q1, q2, action_probs = values
        opt_info.q1Loss.append(q1_loss.item())
        opt_info.q2Loss.append(q2_loss.item())
        opt_info.piLoss.append(pi_loss.item())
        opt_info.q1GradNorm.append(
            torch.tensor(q1_grad_norm).item())  # backwards compatible
        opt_info.q2GradNorm.append(
            torch.tensor(q2_grad_norm).item())  # backwards compatible
        opt_info.piGradNorm.append(
            torch.tensor(pi_grad_norm).item())  # backwards compatible
        opt_info.q1.extend(q1[::10].numpy())  # Downsample for stats.
        opt_info.q2.extend(q2[::10].numpy())
        opt_info.qMeanDiff.append(torch.mean(abs(q1 - q2)).item())
        opt_info.alpha.append(self._alpha.item())

    def optim_state_dict(self):
        return dict(
            pi_optimizer=self.pi_optimizer.state_dict(),
            q1_optimizer=self.q1_optimizer.state_dict(),
            q2_optimizer=self.q2_optimizer.state_dict(),
            alpha_optimizer=self.alpha_optimizer.state_dict()
            if self.alpha_optimizer else None,
            log_alpha=self._log_alpha.detach().item(),
        )

    def load_optim_state_dict(self, state_dict):
        self.pi_optimizer.load_state_dict(state_dict["pi_optimizer"])
        self.q1_optimizer.load_state_dict(state_dict["q1_optimizer"])
        self.q2_optimizer.load_state_dict(state_dict["q2_optimizer"])
        if self.alpha_optimizer is not None and state_dict[
                "alpha_optimizer"] is not None:
            self.alpha_optimizer.load_state_dict(state_dict["alpha_optimizer"])
        with torch.no_grad():
            self._log_alpha[:] = state_dict["log_alpha"]
            self._alpha = torch.exp(self._log_alpha.detach())
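
The initialize() docstring above gives the update count as (replay_ratio * sampler batch size / training batch size). A quick numeric check with illustrative values:

replay_ratio, sampler_bs, batch_size = 256, 128, 256
updates_per_optimize = int(replay_ratio * sampler_bs / batch_size)   # 128 gradient updates per iteration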
Code example #27
File: gaussian.py  Project: DavidSlayback/rlpyt
class RecurrentGaussianPgAgentBase(BaseAgent):
    def __call__(self,
                 observation,
                 prev_action,
                 prev_reward,
                 init_rnn_state,
                 device="cpu"):
        """Performs forward pass on training data, for algorithm (requires
        recurrent state input)."""
        # Assume init_rnn_state already shaped: [N,B,H]
        model_inputs = buffer_to(
            (observation, prev_action, prev_reward, init_rnn_state),
            device=self.device)
        mu, log_std, value, next_rnn_state = self.model(*model_inputs)
        dist_info, value = buffer_to(
            (DistInfoStd(mean=mu, log_std=log_std), value), device=device)
        return dist_info, value, next_rnn_state  # Leave rnn_state on device.

    def initialize(self,
                   env_spaces,
                   share_memory=False,
                   global_B=1,
                   env_ranks=None):
        super().initialize(env_spaces, share_memory)
        assert len(env_spaces.action.shape) == 1
        self.distribution = Gaussian(
            dim=env_spaces.action.shape[0],
            # min_std=MIN_STD,
            # clip=env_spaces.action.high[0],  # Probably +1?
        )

    @torch.no_grad()
    def step(self, observation, prev_action, prev_reward, device="cpu"):
        """
        Compute policy's action distribution from inputs, and sample an
        action. Calls the model to produce mean, log_std, value estimate, and
        next recurrent state.  Moves inputs to device and returns outputs back
        to CPU, for the sampler.  Advances the recurrent state of the agent.
        (no grad)
        """
        agent_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        mu, log_std, value, rnn_state = self.model(*agent_inputs,
                                                   self.prev_rnn_state)
        dist_info = DistInfoStd(mean=mu, log_std=log_std)
        action = self.distribution.sample(dist_info)
        # Model handles None, but Buffer does not, make zeros if needed:
        prev_rnn_state = self.prev_rnn_state if self.prev_rnn_state is not None else buffer_func(
            rnn_state, torch.zeros_like)
        # Transpose the rnn_state from [N,B,H] --> [B,N,H] for storage.
        # (Special case: model should always leave B dimension in.)
        prev_rnn_state = buffer_method(prev_rnn_state, "transpose", 0, 1)
        agent_info = AgentInfoRnn(dist_info=dist_info,
                                  value=value,
                                  prev_rnn_state=prev_rnn_state)
        action, agent_info = buffer_to((action, agent_info), device=device)
        self.advance_rnn_state(rnn_state)  # Keep on device.
        return AgentStep(action=action, agent_info=agent_info)

    @torch.no_grad()
    def value(self, observation, prev_action, prev_reward, device="cpu"):
        """
        Compute the value estimate for the environment state using the
        currently held recurrent state, without advancing the recurrent state,
        e.g. for the bootstrap value V(s_{T+1}), in the sampler.  (no grad)
        """
        agent_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        _mu, _log_std, value, _rnn_state = self.model(*agent_inputs,
                                                      self.prev_rnn_state)
        return value.to(device)
Code example #28
File: sac_v.py  Project: afansi/rlpyt
class SAC_V(RlAlgorithm):
    """TO BE DEPRECATED."""

    opt_info_fields = tuple(f for f in OptInfo._fields)  # copy

    def __init__(
            self,
            discount=0.99,
            batch_size=256,
            min_steps_learn=int(1e4),
            replay_size=int(1e6),
            replay_ratio=256,  # data_consumption / data_generation
            target_update_tau=0.005,  # tau=1 for hard update.
            target_update_interval=1,  # 1000 for hard update, 1 for soft.
            learning_rate=3e-4,
            OptimCls=torch.optim.Adam,
            optim_kwargs=None,
            initial_optim_state_dict=None,  # for all of them.
            action_prior="uniform",  # or "gaussian"
            reward_scale=1,
            reparameterize=True,
            clip_grad_norm=1e9,
            policy_output_regularization=0.001,
            n_step_return=1,
            updates_per_sync=1,  # For async mode only.
            bootstrap_timelimit=True,
            ReplayBufferCls=None,  #  Leave None to select by above options.
    ):
        if optim_kwargs is None:
            optim_kwargs = dict()
        assert action_prior in ["uniform", "gaussian"]
        self._batch_size = batch_size
        del batch_size  # Property.
        save__init__args(locals())

    def initialize(self,
                   agent,
                   n_itr,
                   batch_spec,
                   mid_batch_reset,
                   examples,
                   world_size=1,
                   rank=0):
        """Used in basic or synchronous multi-GPU runners, not async."""
        self.agent = agent
        self.n_itr = n_itr
        self.mid_batch_reset = mid_batch_reset
        self.sampler_bs = sampler_bs = batch_spec.size
        self.updates_per_optimize = int(self.replay_ratio * sampler_bs /
                                        self.batch_size)
        logger.log(
            f"From sampler batch size {sampler_bs}, training "
            f"batch size {self.batch_size}, and replay ratio "
            f"{self.replay_ratio}, computed {self.updates_per_optimize} "
            f"updates per iteration.")
        self.min_itr_learn = self.min_steps_learn // sampler_bs
        agent.give_min_itr_learn(self.min_itr_learn)
        self.initialize_replay_buffer(examples, batch_spec)
        self.optim_initialize(rank)

    def async_initialize(self,
                         agent,
                         sampler_n_itr,
                         batch_spec,
                         mid_batch_reset,
                         examples,
                         world_size=1):
        """Used in async runner only."""
        self.agent = agent
        self.n_itr = sampler_n_itr
        self.initialize_replay_buffer(examples, batch_spec, async_=True)
        self.mid_batch_reset = mid_batch_reset
        self.sampler_bs = sampler_bs = batch_spec.size
        self.updates_per_optimize = self.updates_per_sync
        self.min_itr_learn = int(self.min_steps_learn // sampler_bs)
        agent.give_min_itr_learn(self.min_itr_learn)
        return self.replay_buffer

    def optim_initialize(self, rank=0):
        """Called by async runner."""
        self.rank = rank
        self.pi_optimizer = self.OptimCls(self.agent.pi_parameters(),
                                          lr=self.learning_rate,
                                          **self.optim_kwargs)
        self.q1_optimizer = self.OptimCls(self.agent.q1_parameters(),
                                          lr=self.learning_rate,
                                          **self.optim_kwargs)
        self.q2_optimizer = self.OptimCls(self.agent.q2_parameters(),
                                          lr=self.learning_rate,
                                          **self.optim_kwargs)
        self.v_optimizer = self.OptimCls(self.agent.v_parameters(),
                                         lr=self.learning_rate,
                                         **self.optim_kwargs)
        if self.initial_optim_state_dict is not None:
            self.load_optim_state_dict(self.initial_optim_state_dict)
        if self.action_prior == "gaussian":
            self.action_prior_distribution = Gaussian(
                dim=self.agent.env_spaces.action.size, std=1.)

    def initialize_replay_buffer(self, examples, batch_spec, async_=False):
        example_to_buffer = self.examples_to_buffer(examples)
        replay_kwargs = dict(
            example=example_to_buffer,
            size=self.replay_size,
            B=batch_spec.B,
            n_step_return=self.n_step_return,
        )
        if not self.bootstrap_timelimit:
            ReplayCls = AsyncUniformReplayBuffer if async_ else UniformReplayBuffer
        else:
            ReplayCls = AsyncTlUniformReplayBuffer if async_ else TlUniformReplayBuffer
        if self.ReplayBufferCls is not None:
            ReplayCls = self.ReplayBufferCls
            logger.log(
                f"WARNING: ignoring internal selection logic and using"
                f" input replay buffer class: {ReplayCls} -- compatibility not"
                " guaranteed.")
        self.replay_buffer = ReplayCls(**replay_kwargs)

    def optimize_agent(self, itr, samples=None, sampler_itr=None):
        itr = itr if sampler_itr is None else sampler_itr  # Async uses sampler_itr.
        if samples is not None:
            samples_to_buffer = self.samples_to_buffer(samples)
            self.replay_buffer.append_samples(samples_to_buffer)
        opt_info = OptInfo(*([] for _ in range(len(OptInfo._fields))))
        if itr < self.min_itr_learn:
            return opt_info
        for _ in range(self.updates_per_optimize):
            samples_from_replay = self.replay_buffer.sample_batch(
                self.batch_size)
            losses, values = self.loss(samples_from_replay)
            q1_loss, q2_loss, v_loss, pi_loss = losses

            self.v_optimizer.zero_grad()
            v_loss.backward()
            v_grad_norm = torch.nn.utils.clip_grad_norm_(
                self.agent.v_parameters(), self.clip_grad_norm)
            self.v_optimizer.step()

            self.pi_optimizer.zero_grad()
            pi_loss.backward()
            pi_grad_norm = torch.nn.utils.clip_grad_norm_(
                self.agent.pi_parameters(), self.clip_grad_norm)
            self.pi_optimizer.step()

            # Step the Q optimizers last: pi_loss.backward() also puts gradients
            # into the Q networks, which are zeroed below before the Q losses
            # are backpropagated.
            self.q1_optimizer.zero_grad()
            q1_loss.backward()
            q1_grad_norm = torch.nn.utils.clip_grad_norm_(
                self.agent.q1_parameters(), self.clip_grad_norm)
            self.q1_optimizer.step()

            self.q2_optimizer.zero_grad()
            q2_loss.backward()
            q2_grad_norm = torch.nn.utils.clip_grad_norm_(
                self.agent.q2_parameters(), self.clip_grad_norm)
            self.q2_optimizer.step()

            grad_norms = (q1_grad_norm, q2_grad_norm, v_grad_norm,
                          pi_grad_norm)

            self.append_opt_info_(opt_info, losses, grad_norms, values)
            self.update_counter += 1
            if self.update_counter % self.target_update_interval == 0:
                self.agent.update_target(self.target_update_tau)
        return opt_info
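    # Update schedule: each optimize_agent() call performs updates_per_optimize
    # gradient steps, each on a fresh uniform replay batch of size batch_size,
    # with per-network gradient clipping at clip_grad_norm.  The target value
    # network is soft-updated with target_update_tau every target_update_interval
    # updates, and no learning happens before min_itr_learn iterations.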

    def samples_to_buffer(self, samples):
        return SamplesToBuffer(
            observation=samples.env.observation,
            action=samples.agent.action,
            reward=samples.env.reward,
            done=samples.env.done,
            timeout=getattr(samples.env.env_info, "timeout", None),
        )

    def examples_to_buffer(self, examples):
        """Defines how to initialize the replay buffer from examples. Called
        in initialize_replay_buffer().
        """
        return SamplesToBuffer(
            observation=examples["observation"],
            action=examples["action"],
            reward=examples["reward"],
            done=examples["done"],
            timeout=getattr(examples["env_info"], "timeout", None),
        )

    def loss(self, samples):
        """Samples have leading batch dimension [B,..] (but not time)."""
        agent_inputs, target_inputs, action = buffer_to(
            (samples.agent_inputs, samples.target_inputs, samples.action))
        q1, q2 = self.agent.q(*agent_inputs, action)
        with torch.no_grad():
            target_v = self.agent.target_v(*target_inputs)
        disc = self.discount**self.n_step_return
        y = (self.reward_scale * samples.return_ +
             (1 - samples.done_n.float()) * disc * target_v)
        if self.mid_batch_reset and not self.agent.recurrent:
            valid = torch.ones_like(samples.done, dtype=torch.float)
        else:
            valid = valid_from_done(samples.done)

        if self.bootstrap_timelimit:
            # A 'done' caused only by the time limit should still be bootstrapped;
            # since the n-step return above does not do that, mask these samples
            # out of training instead.
            valid *= (1 - samples.timeout_n.float())

        q1_loss = 0.5 * valid_mean((y - q1)**2, valid)
        q2_loss = 0.5 * valid_mean((y - q2)**2, valid)

        v = self.agent.v(*agent_inputs)
        new_action, log_pi, (pi_mean,
                             pi_log_std) = self.agent.pi(*agent_inputs)
        if not self.reparameterize:
            new_action = new_action.detach()  # No grad.
        log_target1, log_target2 = self.agent.q(*agent_inputs, new_action)
        min_log_target = torch.min(log_target1, log_target2)
        prior_log_pi = self.get_action_prior(new_action.cpu())
        v_target = (min_log_target - log_pi +
                    prior_log_pi).detach()  # No grad.

        v_loss = 0.5 * valid_mean((v - v_target)**2, valid)

        if self.reparameterize:
            pi_losses = log_pi - min_log_target
        else:
            pi_factor = (v - v_target).detach()
            pi_losses = log_pi * pi_factor
        if self.policy_output_regularization > 0:
            pi_losses += self.policy_output_regularization * torch.mean(
                0.5 * pi_mean**2 + 0.5 * pi_log_std**2, dim=-1)
        pi_loss = valid_mean(pi_losses, valid)

        losses = (q1_loss, q2_loss, v_loss, pi_loss)
        values = tuple(val.detach()
                       for val in (q1, q2, v, pi_mean, pi_log_std))
        return losses, values
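    # Recap of the quantities computed above (pseudo-math, matching the code):
    #   y        = reward_scale * return_n + (1 - done_n) * discount**n_step * V_target(s')
    #   Qi_loss  = 0.5 * valid_mean((y - Qi(s, a))**2),                 i = 1, 2
    #   v_target = min_i Qi(s, a~pi) - log pi(a~pi|s) + log p_prior(a~pi)   [no grad]
    #   v_loss   = 0.5 * valid_mean((V(s) - v_target)**2)
    #   pi_loss  = valid_mean(log pi - min_i Qi)            [reparameterized case]
    # plus an optional policy_output_regularization term on pi_mean and pi_log_std.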

    # def q_loss(self, samples):
    #     """Samples have leading batch dimension [B,..] (but not time)."""
    #     agent_inputs, target_inputs, action = buffer_to(
    #         (samples.agent_inputs, samples.target_inputs, samples.action),
    #         device=self.agent.device)  # Move to device once, re-use.
    #     q1, q2 = self.agent.q(*agent_inputs, action)
    #     with torch.no_grad():
    #         target_v = self.agent.target_v(*target_inputs)
    #     disc = self.discount ** self.n_step_return
    #     y = (self.reward_scale * samples.return_ +
    #         (1 - samples.done_n.float()) * disc * target_v)
    #     if self.mid_batch_reset and not self.agent.recurrent:
    #         valid = None  # OR: torch.ones_like(samples.done, dtype=torch.float)
    #     else:
    #         valid = valid_from_done(samples.done)

    #     q1_loss = 0.5 * valid_mean((y - q1) ** 2, valid)
    #     q2_loss = 0.5 * valid_mean((y - q2) ** 2, valid)

    #     losses = (q1_loss, q2_loss)
    #     values = tuple(val.detach() for val in (q1, q2))
    #     return losses, values, agent_inputs, valid

    # def pi_v_loss(self, agent_inputs, valid):
    #     v = self.agent.v(*agent_inputs)
    #     new_action, log_pi, (pi_mean, pi_log_std) = self.agent.pi(*agent_inputs)
    #     if not self.reparameterize:
    #         new_action = new_action.detach()  # No grad.
    #     log_target1, log_target2 = self.agent.q(*agent_inputs, new_action)
    #     min_log_target = torch.min(log_target1, log_target2)
    #     prior_log_pi = self.get_action_prior(new_action.cpu())
    #     v_target = (min_log_target - log_pi + prior_log_pi).detach()  # No grad.
    #     v_loss = 0.5 * valid_mean((v - v_target) ** 2, valid)

    #     if self.reparameterize:
    #         pi_losses = log_pi - min_log_target  # log_target1  # min_log_target
    #     else:
    #         pi_factor = (v - v_target).detach()  # No grad.
    #         pi_losses = log_pi * pi_factor
    #     if self.policy_output_regularization > 0:
    #         pi_losses += self.policy_output_regularization * torch.sum(
    #             0.5 * pi_mean ** 2 + 0.5 * pi_log_std ** 2, dim=-1)
    #     pi_loss = valid_mean(pi_losses, valid)

    #     losses = (v_loss, pi_loss)
    #     values = tuple(val.detach() for val in (v, pi_mean, pi_log_std))
    #     return losses, values

    # def loss(self, samples):
    #     """Samples have leading batch dimension [B,..] (but not time)."""
    #     agent_inputs, target_inputs, action = buffer_to(
    #         (samples.agent_inputs, samples.target_inputs, samples.action),
    #         device=self.agent.device)  # Move to device once, re-use.
    #     q1, q2 = self.agent.q(*agent_inputs, action)
    #     with torch.no_grad():
    #         target_v = self.agent.target_v(*target_inputs)
    #     disc = self.discount ** self.n_step_return
    #     y = (self.reward_scale * samples.return_ +
    #         (1 - samples.done_n.float()) * disc * target_v)
    #     if self.mid_batch_reset and not self.agent.recurrent:
    #         valid = None  # OR: torch.ones_like(samples.done, dtype=torch.float)
    #     else:
    #         valid = valid_from_done(samples.done)

    #     q1_loss = 0.5 * valid_mean((y - q1) ** 2, valid)
    #     q2_loss = 0.5 * valid_mean((y - q2) ** 2, valid)

    #     v = self.agent.v(*agent_inputs)
    #     new_action, log_pi, (pi_mean, pi_log_std) = self.agent.pi(*agent_inputs)
    #     if not self.reparameterize:
    #         new_action = new_action.detach()  # No grad.
    #     log_target1, log_target2 = self.agent.q(*agent_inputs, new_action)
    #     min_log_target = torch.min(log_target1, log_target2)
    #     prior_log_pi = self.get_action_prior(new_action.cpu())
    #     v_target = (min_log_target - log_pi + prior_log_pi).detach()  # No grad.
    #     v_loss = 0.5 * valid_mean((v - v_target) ** 2, valid)

    #     if self.reparameterize:
    #         pi_losses = log_pi - min_log_target  # log_target1
    #     else:
    #         pi_factor = (v - v_target).detach()  # No grad.
    #         pi_losses = log_pi * pi_factor
    #     if self.policy_output_regularization > 0:
    #         pi_losses += torch.sum(self.policy_output_regularization * 0.5 *
    #             pi_mean ** 2 + pi_log_std ** 2, dim=-1)
    #     pi_loss = valid_mean(pi_losses, valid)

    #     losses = (q1_loss, q2_loss, v_loss, pi_loss)
    #     values = tuple(val.detach() for val in (q1, q2, v, pi_mean, pi_log_std))
    #     return losses, values

    def get_action_prior(self, action):
        if self.action_prior == "uniform":
            prior_log_pi = 0.0
        elif self.action_prior == "gaussian":
            prior_log_pi = self.action_prior_distribution.log_likelihood(
                action, GaussianDistInfo(mean=torch.zeros_like(action)))
        return prior_log_pi
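    # With the "uniform" prior this term contributes a constant 0 to v_target;
    # with the "gaussian" prior it adds the log-likelihood of the sampled action
    # under the fixed unit Gaussian built in optim_initialize().  Any other
    # action_prior setting would leave prior_log_pi undefined here.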

    def append_opt_info_(self, opt_info, losses, grad_norms, values):
        """In-place."""
        q1_loss, q2_loss, v_loss, pi_loss = losses
        q1_grad_norm, q2_grad_norm, v_grad_norm, pi_grad_norm = grad_norms
        q1, q2, v, pi_mean, pi_log_std = values
        opt_info.q1Loss.append(q1_loss.item())
        opt_info.q2Loss.append(q2_loss.item())
        opt_info.vLoss.append(v_loss.item())
        opt_info.piLoss.append(pi_loss.item())
        opt_info.q1GradNorm.append(
            torch.tensor(q1_grad_norm).item())  # backwards compatible
        opt_info.q2GradNorm.append(
            torch.tensor(q2_grad_norm).item())  # backwards compatible
        opt_info.vGradNorm.append(
            torch.tensor(v_grad_norm).item())  # backwards compatible
        opt_info.piGradNorm.append(
            torch.tensor(pi_grad_norm).item())  # backwards compatible
        opt_info.q1.extend(q1[::10].numpy())  # Downsample for stats.
        opt_info.q2.extend(q2[::10].numpy())
        opt_info.v.extend(v[::10].numpy())
        opt_info.piMu.extend(pi_mean[::10].numpy())
        opt_info.piLogStd.extend(pi_log_std[::10].numpy())
        opt_info.qMeanDiff.append(torch.mean(abs(q1 - q2)).item())
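    # Logging notes: q1/q2/v/pi statistics are downsampled ([::10]) to keep the
    # per-iteration logs small, and the grad norms are wrapped in torch.tensor()
    # because clip_grad_norm_ appears to have returned a plain float in older
    # PyTorch versions (hence the "backwards compatible" remarks above).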

    def optim_state_dict(self):
        return dict(
            pi_optimizer=self.pi_optimizer.state_dict(),
            q1_optimizer=self.q1_optimizer.state_dict(),
            q2_optimizer=self.q2_optimizer.state_dict(),
            v_optimizer=self.v_optimizer.state_dict(),
        )

    def load_optim_state_dict(self, state_dict):
        self.pi_optimizer.load_state_dict(state_dict["pi_optimizer"])
        self.q1_optimizer.load_state_dict(state_dict["q1_optimizer"])
        self.q2_optimizer.load_state_dict(state_dict["q2_optimizer"])
        self.v_optimizer.load_state_dict(state_dict["v_optimizer"])
Code example #29
class SacAgent(BaseAgent):
    """TO BE DEPRECATED."""

    def __init__(
            self,
            ModelCls=PiMlpModel,  # Pi model.
            QModelCls=QofMuMlpModel,
            VModelCls=VMlpModel,
            model_kwargs=None,  # Pi model.
            q_model_kwargs=None,
            v_model_kwargs=None,
            initial_model_state_dict=None,  # All models.
            action_squash=1.,  # Max magnitude (or None).
            pretrain_std=0.75,  # With squash 0.75 is near uniform.
            ):
        if model_kwargs is None:
            model_kwargs = dict(hidden_sizes=[256, 256])
        if q_model_kwargs is None:
            q_model_kwargs = dict(hidden_sizes=[256, 256])
        if v_model_kwargs is None:
            v_model_kwargs = dict(hidden_sizes=[256, 256])
        super().__init__(ModelCls=ModelCls, model_kwargs=model_kwargs,
            initial_model_state_dict=initial_model_state_dict)
        save__init__args(locals())
        self.min_itr_learn = 0  # Get from algo.

    def initialize(self, env_spaces, share_memory=False,
            global_B=1, env_ranks=None):
        _initial_model_state_dict = self.initial_model_state_dict
        self.initial_model_state_dict = None  # Don't let base agent try to load.
        super().initialize(env_spaces, share_memory,
            global_B=global_B, env_ranks=env_ranks)
        self.initial_model_state_dict = _initial_model_state_dict
        self.q1_model = self.QModelCls(**self.env_model_kwargs, **self.q_model_kwargs)
        self.q2_model = self.QModelCls(**self.env_model_kwargs, **self.q_model_kwargs)
        self.v_model = self.VModelCls(**self.env_model_kwargs, **self.v_model_kwargs)
        self.target_v_model = self.VModelCls(**self.env_model_kwargs,
            **self.v_model_kwargs)
        self.target_v_model.load_state_dict(self.v_model.state_dict())
        if self.initial_model_state_dict is not None:
            self.load_state_dict(self.initial_model_state_dict)
        assert len(env_spaces.action.shape) == 1
        self.distribution = Gaussian(
            dim=env_spaces.action.shape[0],
            squash=self.action_squash,
            min_std=np.exp(MIN_LOG_STD),
            max_std=np.exp(MAX_LOG_STD),
        )

    def to_device(self, cuda_idx=None):
        super().to_device(cuda_idx)
        self.q1_model.to(self.device)
        self.q2_model.to(self.device)
        self.v_model.to(self.device)
        self.target_v_model.to(self.device)

    def data_parallel(self):
        super().data_parallel()
        DDP_WRAP = DDPC if self.device.type == "cpu" else DDP
        self.q1_model = DDP_WRAP(self.q1_model)
        self.q2_model = DDP_WRAP(self.q2_model)
        self.v_model = DDP_WRAP(self.v_model)

    def give_min_itr_learn(self, min_itr_learn):
        self.min_itr_learn = min_itr_learn  # From algo.

    def make_env_to_model_kwargs(self, env_spaces):
        assert len(env_spaces.action.shape) == 1
        return dict(
            observation_shape=env_spaces.observation.shape,
            action_size=env_spaces.action.shape[0],
        )

    def q(self, observation, prev_action, prev_reward, action):
        model_inputs = buffer_to((observation, prev_action, prev_reward,
            action), device=self.device)
        q1 = self.q1_model(*model_inputs)
        q2 = self.q2_model(*model_inputs)
        return q1.cpu(), q2.cpu()

    def v(self, observation, prev_action, prev_reward):
        model_inputs = buffer_to((observation, prev_action, prev_reward),
            device=self.device)
        v = self.v_model(*model_inputs)
        return v.cpu()

    def pi(self, observation, prev_action, prev_reward):
        model_inputs = buffer_to((observation, prev_action, prev_reward),
            device=self.device)
        mean, log_std = self.model(*model_inputs)
        dist_info = DistInfoStd(mean=mean, log_std=log_std)
        action, log_pi = self.distribution.sample_loglikelihood(dist_info)
        # action = self.distribution.sample(dist_info)
        # log_pi = self.distribution.log_likelihood(action, dist_info)
        log_pi, dist_info = buffer_to((log_pi, dist_info), device="cpu")
        return action, log_pi, dist_info  # Action stays on device for q models.

    def target_v(self, observation, prev_action, prev_reward):
        model_inputs = buffer_to((observation, prev_action, prev_reward),
            device=self.device)
        target_v = self.target_v_model(*model_inputs)
        return target_v.cpu()

    @torch.no_grad()
    def step(self, observation, prev_action, prev_reward):
        model_inputs = buffer_to((observation, prev_action, prev_reward),
            device=self.device)
        mean, log_std = self.model(*model_inputs)
        dist_info = DistInfoStd(mean=mean, log_std=log_std)
        action = self.distribution.sample(dist_info)
        agent_info = AgentInfo(dist_info=dist_info)
        action, agent_info = buffer_to((action, agent_info), device="cpu")
        return AgentStep(action=action, agent_info=agent_info)

    def update_target(self, tau=1):
        update_state_dict(self.target_v_model, self.v_model.state_dict(), tau)

    @property
    def models(self):
        return Models(pi=self.model, q1=self.q1_model, q2=self.q2_model,
            v=self.v_model)

    def pi_parameters(self):
        return self.model.parameters()

    def q1_parameters(self):
        return self.q1_model.parameters()

    def q2_parameters(self):
        return self.q2_model.parameters()

    def v_parameters(self):
        return self.v_model.parameters()

    def train_mode(self, itr):
        super().train_mode(itr)
        self.q1_model.train()
        self.q2_model.train()
        self.v_model.train()

    def sample_mode(self, itr):
        super().sample_mode(itr)
        self.q1_model.eval()
        self.q2_model.eval()
        self.v_model.eval()
        if itr == 0:
            logger.log(f"Agent at itr {itr}, sample std: {self.pretrain_std}")
        if itr == self.min_itr_learn:
            logger.log(f"Agent at itr {itr}, sample std: learned.")
        std = None if itr >= self.min_itr_learn else self.pretrain_std
        self.distribution.set_std(std)  # If None: std from policy dist_info.
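        # Exploration schedule: before min_itr_learn the sampler uses the fixed
        # pretrain_std (with the default 0.75, near-uniform after squashing, per
        # the __init__ comment); afterwards set_std(None) defers to the std
        # produced by the policy network itself.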

    def eval_mode(self, itr):
        super().eval_mode(itr)
        self.q1_model.eval()
        self.q2_model.eval()
        self.v_model.eval()
        self.distribution.set_std(0.)  # Deterministic (dist_info std ignored).

    def state_dict(self):
        return dict(
            model=self.model.state_dict(),  # Pi model.
            q1_model=self.q1_model.state_dict(),
            q2_model=self.q2_model.state_dict(),
            v_model=self.v_model.state_dict(),
            target_v_model=self.target_v_model.state_dict(),
        )

    def load_state_dict(self, state_dict):
        self.model.load_state_dict(state_dict["model"])
        self.q1_model.load_state_dict(state_dict["q1_model"])
        self.q2_model.load_state_dict(state_dict["q2_model"])
        self.v_model.load_state_dict(state_dict["v_model"])
        self.target_v_model.load_state_dict(state_dict["target_v_model"])
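
# A minimal standalone sketch of the tanh-squashed, reparameterized Gaussian
# sampling that SacAgent.pi() obtains from Gaussian(squash=...).sample_loglikelihood().
# Illustrative only: the helper name and the 1e-6 stabilizer are choices made
# here, not taken from the library, and a single scalar squash scale is assumed.
import math

import torch


def squashed_gaussian_sample(mean, log_std, squash=1.0):
    """Sample a = squash * tanh(mean + std * eps) and return it with its
    log-likelihood, corrected for the tanh change of variables."""
    std = log_std.exp()
    noise = torch.randn_like(mean)
    pre_squash = mean + std * noise  # reparameterization trick
    action = squash * torch.tanh(pre_squash)
    # Log-density of the underlying Gaussian, summed over action dimensions.
    log_pi = (-0.5 * noise ** 2 - log_std - 0.5 * math.log(2 * math.pi)).sum(dim=-1)
    # Subtract log|da/du| = log(squash * (1 - tanh(u)**2)) for each dimension.
    log_pi = log_pi - torch.log(
        squash * (1.0 - torch.tanh(pre_squash) ** 2) + 1e-6).sum(dim=-1)
    return action, log_pi


# Example usage: a batch of 5 two-dimensional actions with std = exp(-1).
# actions, log_pis = squashed_gaussian_sample(torch.zeros(5, 2), torch.full((5, 2), -1.0))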
Code example #30
File: ddpg_agent.py  Project: kevinghst/rl_ul
class DdpgAgent(BaseAgent):
    """Agent for deep deterministic policy gradient algorithm."""

    shared_mu_model = None

    def __init__(
        self,
        ModelCls=MuMlpModel,  # Mu model.
        QModelCls=QofMuMlpModel,
        model_kwargs=None,  # Mu model.
        q_model_kwargs=None,
        initial_model_state_dict=None,  # Mu model.
        initial_q_model_state_dict=None,
        action_std=0.1,
        action_noise_clip=None,
    ):
        """Saves input arguments; default network sizes saved here."""
        if model_kwargs is None:
            model_kwargs = dict(hidden_sizes=[400, 300])
        if q_model_kwargs is None:
            q_model_kwargs = dict(hidden_sizes=[400, 300])
        save__init__args(locals())
        super().__init__()  # For async setup.

    def initialize(self,
                   env_spaces,
                   share_memory=False,
                   global_B=1,
                   env_ranks=None):
        """Instantiates mu and q, and target_mu and target_q models."""
        super().initialize(env_spaces,
                           share_memory,
                           global_B=global_B,
                           env_ranks=env_ranks)
        self.q_model = self.QModelCls(**self.env_model_kwargs,
                                      **self.q_model_kwargs)
        if self.initial_q_model_state_dict is not None:
            self.q_model.load_state_dict(self.initial_q_model_state_dict)
        self.target_model = self.ModelCls(**self.env_model_kwargs,
                                          **self.model_kwargs)
        self.target_q_model = self.QModelCls(**self.env_model_kwargs,
                                             **self.q_model_kwargs)
        self.target_q_model.load_state_dict(self.q_model.state_dict())
        assert len(env_spaces.action.shape) == 1
        self.distribution = Gaussian(
            dim=env_spaces.action.shape[0],
            std=self.action_std,
            noise_clip=self.action_noise_clip,
            clip=env_spaces.action.high[0],  # Assume symmetric low=-high.
        )

    def to_device(self, cuda_idx=None):
        super().to_device(cuda_idx)  # Takes care of self.model.
        self.target_model.to(self.device)
        self.q_model.to(self.device)
        self.target_q_model.to(self.device)

    def data_parallel(self):
        device_id = super().data_parallel()  # Takes care of self.model.
        self.q_model = DDP(
            self.q_model,
            device_ids=None if device_id is None else [device_id],  # 1 GPU.
            output_device=device_id,
        )
        return device_id

    def make_env_to_model_kwargs(self, env_spaces):
        assert len(env_spaces.action.shape) == 1
        return dict(
            observation_shape=env_spaces.observation.shape,
            action_size=env_spaces.action.shape[0],
        )

    def q(self, observation, prev_action, prev_reward, action):
        """Compute Q-value for input state/observation and action (with grad)."""
        model_inputs = buffer_to(
            (observation, prev_action, prev_reward, action),
            device=self.device)
        q = self.q_model(*model_inputs)
        return q.cpu()

    def q_at_mu(self, observation, prev_action, prev_reward):
        """Compute Q-value for input state/observation, through the mu_model
        (with grad)."""
        model_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        mu = self.model(*model_inputs)
        q = self.q_model(*model_inputs, mu)
        return q.cpu()

    def target_q_at_mu(self, observation, prev_action, prev_reward):
        """Compute target Q-value for input state/observation, through the
        target mu_model."""
        model_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        target_mu = self.target_model(*model_inputs)
        target_q_at_mu = self.target_q_model(*model_inputs, target_mu)
        return target_q_at_mu.cpu()

    @torch.no_grad()
    def step(self, observation, prev_action, prev_reward):
        """Computes distribution parameters (mu) for state/observation,
        returns (gaussian) sampled action."""
        model_inputs = buffer_to((observation, prev_action, prev_reward),
                                 device=self.device)
        mu = self.model(*model_inputs)
        action = self.distribution.sample(DistInfo(mean=mu))
        agent_info = AgentInfo(mu=mu)
        action, agent_info = buffer_to((action, agent_info), device="cpu")
        return AgentStep(action=action, agent_info=agent_info)

    def update_target(self, tau=1):
        update_state_dict(self.target_model, self.model.state_dict(), tau)
        update_state_dict(self.target_q_model, self.q_model.state_dict(), tau)

    def q_parameters(self):
        return self.q_model.parameters()

    def mu_parameters(self):
        return self.model.parameters()

    def train_mode(self, itr):
        super().train_mode(itr)
        self.q_model.train()

    def sample_mode(self, itr):
        super().sample_mode(itr)
        self.q_model.eval()
        self.distribution.set_std(self.action_std)

    def eval_mode(self, itr):
        super().eval_mode(itr)
        self.q_model.eval()
        self.distribution.set_std(0.)  # Deterministic.
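        # Noise schedule: sampling adds fixed Gaussian noise of std action_std
        # (optionally clipped by action_noise_clip), while evaluation sets the
        # std to 0 so actions are the deterministic mu outputs.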

    def state_dict(self):
        return dict(
            model=self.model.state_dict(),
            q_model=self.q_model.state_dict(),
            target_model=self.target_model.state_dict(),
            target_q_model=self.target_q_model.state_dict(),
        )

    def load_state_dict(self, state_dict):
        self.model.load_state_dict(state_dict["model"])
        self.q_model.load_state_dict(state_dict["q_model"])
        self.target_model.load_state_dict(state_dict["target_model"])
        self.target_q_model.load_state_dict(state_dict["target_q_model"])
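
# A minimal standalone sketch of the soft target update performed by
# update_target(tau) / update_state_dict(...) in the agents above: Polyak
# averaging of parameters.  The function name is illustrative, and all
# state-dict entries are assumed to be floating-point tensors.
import torch


def soft_update_(target_module, source_module, tau=0.01):
    """In-place blend: target <- tau * source + (1 - tau) * target."""
    target_sd = target_module.state_dict()
    source_sd = source_module.state_dict()
    blended = {
        key: tau * source_sd[key] + (1.0 - tau) * value
        for key, value in target_sd.items()
    }
    target_module.load_state_dict(blended)


# With tau=1 this reduces to a hard copy, which matches update_target's default
# and the initialization target_q_model.load_state_dict(q_model.state_dict()).
# Example usage:
# net, target_net = torch.nn.Linear(4, 2), torch.nn.Linear(4, 2)
# target_net.load_state_dict(net.state_dict())  # hard copy at initialization
# soft_update_(target_net, net, tau=0.005)      # then track slowly each update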