Example 1
    def compute_episodic_return(
        batch: Batch,
        v_s_: Optional[Union[np.ndarray, torch.Tensor]] = None,
        gamma: float = 0.99,
        gae_lambda: float = 0.95,
        time_trunc: Optional[int] = None,
        rew_norm: bool = False,
    ) -> Batch:
        """Compute returns over given full-length episodes.

        Implementation of Generalized Advantage Estimator (arXiv:1506.02438).

        :param batch: a data batch which contains several full episodes of data
            in chronological order.
        :type batch: :class:`~tianshou.data.Batch`
        :param v_s_: the value function of all next states :math:`V(s')`.
        :type v_s_: numpy.ndarray
        :param float gamma: the discount factor, should be in [0, 1], defaults
            to 0.99.
        :param float gae_lambda: the parameter for Generalized Advantage
            Estimation, should be in [0, 1], defaults to 0.95.
        :param bool rew_norm: normalize the reward to Normal(0, 1), defaults
            to False.

        :return: a Batch. The result will be stored in batch.returns as a numpy
            array with shape (bsz, ).
        """
        rew = batch.rew
        v_s_ = np.zeros_like(rew) if v_s_ is None else to_numpy(v_s_.flatten())
        returns = _episodic_return(
            v_s_, rew, batch.done, gamma, gae_lambda, time_trunc)
        if rew_norm and not np.isclose(returns.std(), 0.0, 1e-2):
            returns = (returns - returns.mean()) / returns.std()
        batch.returns = returns
        return batch
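A minimal usage sketch for the function above (illustrative only; `SomePolicy` is a placeholder for whatever class defines this static method, and the expected numbers assume `_episodic_return` follows the standard tianshou GAE recursion). With `v_s_` left as None and `gae_lambda=1.0`, the computation reduces to a plain discounted return:

    import numpy as np
    from tianshou.data import Batch

    # one three-step episode with constant reward 1 and no bootstrap value
    batch = Batch(rew=np.array([1.0, 1.0, 1.0]),
                  done=np.array([False, False, True]))
    batch = SomePolicy.compute_episodic_return(batch, gamma=0.5, gae_lambda=1.0)
    # batch.returns should be approximately [1.75, 1.5, 1.0]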
Example 2
    def forward(self,
                batch: Batch,
                state: Optional[Union[dict, Batch, np.ndarray]] = None,
                model: str = 'model',
                input: str = 'obs',
                eps: Optional[float] = None,
                **kwargs) -> Batch:
        """Compute action over the given batch data.

        :param float eps: in [0, 1], for epsilon-greedy exploration method.

        :return: A :class:`~tianshou.data.Batch` which has 3 keys:

            * ``act`` the action.
            * ``logits`` the network's raw output.
            * ``state`` the hidden state.

        .. seealso::

            Please refer to :meth:`~tianshou.policy.BasePolicy.forward` for
            more detailed explanation.
        """
        model = getattr(self, model)
        obs = getattr(batch, input)
        q, h = model(obs, state=state, info=batch.info)
        act = to_numpy(q.max(dim=1)[1])
        # add eps to act
        if eps is None:
            eps = self.eps
        if not np.isclose(eps, 0):
            for i in range(len(q)):
                if np.random.rand() < eps:
                    act[i] = np.random.randint(q.shape[1])
        return Batch(logits=q, act=act, state=h)
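The per-sample exploration loop above can also be written as a small vectorized helper; this is only an illustrative sketch of the same epsilon-greedy rule, not tianshou code:

    import numpy as np

    def epsilon_greedy(q_values: np.ndarray, eps: float) -> np.ndarray:
        # q_values: (batch, n_actions); each greedy action is replaced by a
        # uniformly random action with probability eps
        act = q_values.argmax(axis=1)
        explore = np.random.rand(len(q_values)) < eps
        act[explore] = np.random.randint(q_values.shape[1], size=explore.sum())
        return act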
Example 3
 def process_fn(self, batch: Batch, buffer: ReplayBuffer,
                indice: np.ndarray) -> Batch:
     if self._rew_norm:
         mean, std = batch.rew.mean(), batch.rew.std()
         if not np.isclose(std, 0.0, 1e-2):
             batch.rew = (batch.rew - mean) / std
     v, v_, old_log_prob = [], [], []
     with torch.no_grad():
         for b in batch.split(self._batch, shuffle=False, merge_last=True):
             v_.append(self.critic(b.obs_next))
             v.append(self.critic(b.obs))
             old_log_prob.append(
                 self(b).dist.log_prob(to_torch_as(b.act, v[0])))
     v_ = to_numpy(torch.cat(v_, dim=0))
     batch = self.compute_episodic_return(batch,
                                          buffer,
                                          indice,
                                          v_,
                                          gamma=self._gamma,
                                          gae_lambda=self._lambda,
                                          rew_norm=self._rew_norm)
     batch.v = torch.cat(v, dim=0).flatten()  # old value
     batch.act = to_torch_as(batch.act, v[0])
     batch.logp_old = torch.cat(old_log_prob, dim=0)
     batch.returns = to_torch_as(batch.returns, v[0])
     batch.adv = batch.returns - batch.v
     if self._rew_norm:
         mean, std = batch.adv.mean(), batch.adv.std()
         if not np.isclose(std.item(), 0.0, 1e-2):
             batch.adv = (batch.adv - mean) / std
     return batch
Example 4
    def map_action_inverse(
            self, act: Union[Batch, List,
                             np.ndarray]) -> Union[Batch, List, np.ndarray]:
        """Inverse operation to :meth:`~tianshou.policy.BasePolicy.map_action`.

        This function is called in :meth:`~tianshou.data.Collector.collect` for
        random initial steps. It scales [action_space.low, action_space.high] to
        the value ranges of policy.forward.

        :param act: a data batch, list or numpy.ndarray which is the action taken
            by gym.spaces.Box.sample().

        :return: action remapped.
        """
        if isinstance(self.action_space, gym.spaces.Box):
            act = to_numpy(act)
            if isinstance(act, np.ndarray):
                if self.action_scaling:
                    low, high = self.action_space.low, self.action_space.high
                    scale = high - low
                    eps = np.finfo(np.float32).eps.item()
                    scale[scale < eps] += eps
                    act = (act - low) * 2.0 / scale - 1.0
                if self.action_bound_method == "tanh":
                    act = (np.log(1.0 + act) -
                           np.log(1.0 - act)) / 2.0  # type: ignore
        return act
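A quick numeric check of the inverse mapping above (illustrative values only: a Box space with low=-2 and high=2, action_scaling enabled, and action_bound_method set to "tanh"):

    import numpy as np

    low, high = -2.0, 2.0
    act = np.array([1.0])                                       # Box sample
    scaled = (act - low) * 2.0 / (high - low) - 1.0             # -> [-1, 1], here 0.5
    raw = (np.log(1.0 + scaled) - np.log(1.0 - scaled)) / 2.0   # arctanh, ~0.5493
    print(scaled, raw)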
Example 5
 def forward(
     self,
     batch: Batch,
     state: Optional[Union[dict, Batch, np.ndarray]] = None,
     model: str = "model",
     input: str = "obs",
     **kwargs: Any,
 ) -> Batch:
     if model == "model_old":
         sample_size = self._target_sample_size
     elif self.training:
         sample_size = self._online_sample_size
     else:
         sample_size = self._sample_size
     model = getattr(self, model)
     obs = batch[input]
     obs_ = obs.obs if hasattr(obs, "obs") else obs
     (logits, taus), h = model(
         obs_, sample_size=sample_size, state=state, info=batch.info
     )
     q = self.compute_q_value(logits, getattr(obs, "mask", None))
     if not hasattr(self, "max_action_num"):
         self.max_action_num = q.shape[1]
     act = to_numpy(q.max(dim=1)[1])
     return Batch(logits=logits, act=act, state=h, taus=taus)
Example 6
    def compute_nstep_return(
        batch: Batch,
        buffer: ReplayBuffer,
        indice: np.ndarray,
        target_q_fn: Callable[[ReplayBuffer, np.ndarray], torch.Tensor],
        gamma: float = 0.99,
        n_step: int = 1,
        rew_norm: bool = False,
    ) -> Batch:
        r"""Compute n-step return for Q-learning targets.

        .. math::
            G_t = \sum_{i = t}^{t + n - 1} \gamma^{i - t}(1 - d_i)r_i +
            \gamma^n (1 - d_{t + n}) Q_{\mathrm{target}}(s_{t + n})

        where :math:`\gamma` is the discount factor,
        :math:`\gamma \in [0, 1]`, :math:`d_t` is the done flag of step
        :math:`t`.

        :param batch: a data batch, which is equal to buffer[indice].
        :type batch: :class:`~tianshou.data.Batch`
        :param buffer: a data buffer which contains several full episodes of data
            in chronological order.
        :type buffer: :class:`~tianshou.data.ReplayBuffer`
        :param indice: sampled timestep.
        :type indice: numpy.ndarray
        :param function target_q_fn: a function that receives the data of step
            :math:`t+n-1` and computes the target Q value.
        :param float gamma: the discount factor, should be in [0, 1], defaults
            to 0.99.
        :param int n_step: the number of estimation steps, should be an int
            greater than 0, defaults to 1.
        :param bool rew_norm: normalize the reward to Normal(0, 1), defaults
            to False.

        :return: a Batch. The result will be stored in batch.returns as a
            torch.Tensor with shape (bsz, ).
        """
        rew = buffer.rew
        if rew_norm:
            bfr = rew[:min(len(buffer), 1000)]  # avoid large buffer
            mean, std = bfr.mean(), bfr.std()
            if np.isclose(std, 0, 1e-2):
                mean, std = 0.0, 1.0
        else:
            mean, std = 0.0, 1.0
        buf_len = len(buffer)
        terminal = (indice + n_step - 1) % buf_len
        target_q_torch = target_q_fn(buffer, terminal).flatten()  # (bsz, )
        target_q = to_numpy(target_q_torch)

        target_q = _nstep_return(rew, buffer.done, target_q, indice, gamma,
                                 n_step, len(buffer), mean, std)

        batch.returns = to_torch_as(target_q, target_q_torch)
        # prio buffer update
        if isinstance(buffer, PrioritizedReplayBuffer):
            batch.weight = to_torch_as(batch.weight, target_q_torch)
        return batch
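A worked check of the n-step formula in the docstring above, with illustrative numbers (n = 2, gamma = 0.99, r_t = r_{t+1} = 1, no episode end, and a target Q value of 5 at step t+2):

    gamma, q_target = 0.99, 5.0
    g_t = 1.0 + gamma * 1.0 + gamma ** 2 * q_target
    print(g_t)  # ~6.8905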
Example 7
def test_utils_to_torch_numpy():
    batch = Batch(a=np.float64(1.0),
                  b=Batch(c=np.ones((1, ), dtype=np.float32),
                          d=torch.ones((1, ), dtype=torch.float64)))
    a_torch_float = to_torch(batch.a, dtype=torch.float32)
    assert a_torch_float.dtype == torch.float32
    a_torch_double = to_torch(batch.a, dtype=torch.float64)
    assert a_torch_double.dtype == torch.float64
    batch_torch_float = to_torch(batch, dtype=torch.float32)
    assert batch_torch_float.a.dtype == torch.float32
    assert batch_torch_float.b.c.dtype == torch.float32
    assert batch_torch_float.b.d.dtype == torch.float32
    data_list = [float('nan'), 1]
    data_list_torch = to_torch(data_list)
    assert data_list_torch.dtype == torch.float64
    data_list_2 = [np.random.rand(3, 3), np.random.rand(3, 3)]
    data_list_2_torch = to_torch(data_list_2)
    assert data_list_2_torch.shape == (2, 3, 3)
    assert np.allclose(to_numpy(to_torch(data_list_2)), data_list_2)
    data_list_3 = [np.zeros((3, 2)), np.zeros((3, 3))]
    data_list_3_torch = to_torch(data_list_3)
    assert isinstance(data_list_3_torch, list)
    assert all(isinstance(e, torch.Tensor) for e in data_list_3_torch)
    assert all(
        starmap(np.allclose, zip(to_numpy(to_torch(data_list_3)),
                                 data_list_3)))
    data_list_4 = [np.zeros((2, 3)), np.zeros((3, 3))]
    data_list_4_torch = to_torch(data_list_4)
    assert isinstance(data_list_4_torch, list)
    assert all(isinstance(e, torch.Tensor) for e in data_list_4_torch)
    assert all(
        starmap(np.allclose, zip(to_numpy(to_torch(data_list_4)),
                                 data_list_4)))
    data_list_5 = [np.zeros(2), np.zeros((3, 3))]
    data_list_5_torch = to_torch(data_list_5)
    assert isinstance(data_list_5_torch, list)
    assert all(isinstance(e, torch.Tensor) for e in data_list_5_torch)
    data_array = np.random.rand(3, 2, 2)
    data_empty_tensor = to_torch(data_array[[]])
    assert isinstance(data_empty_tensor, torch.Tensor)
    assert data_empty_tensor.shape == (0, 2, 2)
    data_empty_array = to_numpy(data_empty_tensor)
    assert isinstance(data_empty_array, np.ndarray)
    assert data_empty_array.shape == (0, 2, 2)
    assert np.allclose(to_numpy(to_torch(data_array)), data_array)
Example 8
    def compute_nstep_return(
        batch: Batch,
        buffer: ReplayBuffer,
        indice: np.ndarray,
        target_q_fn: Callable[[ReplayBuffer, np.ndarray], torch.Tensor],
        gamma: float = 0.99,
        n_step: int = 1,
        rew_norm: bool = False,
        use_mixed: bool = False,
    ) -> Batch:
        r"""Compute n-step return for Q-learning targets.

        .. math::
            G_t = \sum_{i = t}^{t + n - 1} \gamma^{i - t}(1 - d_i)r_i +
            \gamma^n (1 - d_{t + n}) Q_{\mathrm{target}}(s_{t + n})

        where :math:`\gamma` is the discount factor, :math:`\gamma \in [0, 1]`,
        :math:`d_t` is the done flag of step :math:`t`.

        :param Batch batch: a data batch, which is equal to buffer[indice].
        :param ReplayBuffer buffer: the data buffer.
        :param function target_q_fn: a function which computes the target Q value
            of "obs_next" given the data buffer and the wanted indices.
        :param float gamma: the discount factor, should be in [0, 1]. Default to 0.99.
        :param int n_step: the number of estimation steps, should be an int greater
            than 0. Default to 1.
        :param bool rew_norm: normalize the reward to Normal(0, 1), Default to False.

        :return: a Batch. The result will be stored in batch.returns as a
            torch.Tensor with the same shape as target_q_fn's return tensor.
        """
        assert not rew_norm, \
            "Reward normalization in computing n-step returns is unsupported now."
        rew = buffer.rew
        bsz = len(indice)
        indices = [indice]
        for _ in range(n_step - 1):
            indices.append(buffer.next(indices[-1]))
        indices = np.stack(indices)
        # terminal indicates the buffer indices n_step steps after 'indice',
        # truncated at the end of each episode
        terminal = indices[-1]
        with autocast(enabled=use_mixed):
            with torch.no_grad():
                target_q_torch = target_q_fn(buffer, terminal)  # (bsz, ?)
        target_q = to_numpy(target_q_torch.float().reshape(bsz, -1))
        target_q = target_q * BasePolicy.value_mask(buffer, terminal).reshape(
            -1, 1)
        end_flag = buffer.done.copy()
        end_flag[buffer.unfinished_index()] = True
        target_q = _nstep_return(rew, end_flag, target_q, indices, gamma,
                                 n_step)

        batch.returns = to_torch_as(target_q, target_q_torch)
        if hasattr(batch, "weight"):  # prio buffer update
            batch.weight = to_torch_as(batch.weight, target_q_torch)
        return batch
Example 9
    def process_fn(self, batch: Batch, buffer: ReplayBuffer,
                   indices: np.ndarray) -> Batch:
        """Pre-process the data from the provided replay buffer.

        Used in :meth:`update`. Check out :ref:`process_fn` for more information.
        """
        # update reward
        with torch.no_grad():
            batch.rew = to_numpy(-F.logsigmoid(-self.disc(batch)).flatten())
        return super().process_fn(batch, buffer, indices)
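The reward rewrite above is the softplus of the discriminator output, since -logsigmoid(-x) = log(1 + exp(x)). A quick numeric check (illustrative only):

    import torch
    import torch.nn.functional as F

    x = torch.tensor([-1.0, 0.0, 2.0])
    print(-F.logsigmoid(-x))  # tensor([0.3133, 0.6931, 2.1269])
    print(F.softplus(x))      # same values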
Example 10
    def compute_episodic_return(
        batch: Batch,
        buffer: ReplayBuffer,
        indice: np.ndarray,
        v_s_: Optional[Union[np.ndarray, torch.Tensor]] = None,
        v_s: Optional[Union[np.ndarray, torch.Tensor]] = None,
        gamma: float = 0.99,
        gae_lambda: float = 0.95,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Compute returns over given batch.

        Use Implementation of Generalized Advantage Estimator (arXiv:1506.02438)
        to calculate q/advantage value of given batch.

        :param Batch batch: a data batch which contains several episodes of data in
            sequential order. Mind that the end of each finished episode should be
            marked by a done flag; unfinished (or still-collecting) episodes will be
            recognized by buffer.unfinished_index().
        :param numpy.ndarray indice: the indices of batch in buffer, i.e. batch is
            equal to buffer[indice].
        :param np.ndarray v_s_: the value function of all next states :math:`V(s')`.
        :param float gamma: the discount factor, should be in [0, 1]. Default to 0.99.
        :param float gae_lambda: the parameter for Generalized Advantage Estimation,
            should be in [0, 1]. Default to 0.95.

        :return: two numpy arrays (returns, advantage) with each shape (bsz, ).
        """
        rew = batch.rew
        if v_s_ is None:
            assert np.isclose(gae_lambda, 1.0)
            v_s_ = np.zeros_like(rew)
        else:
            v_s_ = to_numpy(v_s_.flatten())  # type: ignore
            v_s_ = v_s_ * BasePolicy.value_mask(buffer, indice)
        v_s = np.roll(v_s_, 1) if v_s is None else to_numpy(v_s.flatten())

        end_flag = batch.done.copy()
        end_flag[np.isin(indice, buffer.unfinished_index())] = True
        advantage = _gae_return(v_s, v_s_, rew, end_flag, gamma, gae_lambda)
        returns = advantage + v_s
        # normalization varies from each policy, so we don't do it here
        return returns, advantage
Example 11
    def update_weight(self, index: np.ndarray,
                      new_weight: Union[np.ndarray, torch.Tensor]) -> None:
        """Update priority weight by index in this buffer.

        :param np.ndarray index: the indices whose priority weights should be updated.
        :param np.ndarray new_weight: the new priority weights to assign.
        """
        weight = np.abs(to_numpy(new_weight)) + self.__eps
        self.weight[index] = weight**self._alpha
        self._max_prio = max(self._max_prio, weight.max())
        self._min_prio = min(self._min_prio, weight.min())
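Raising the absolute TD error to the power alpha controls how strongly prioritization skews sampling: alpha=0 gives uniform sampling, alpha=1 uses the raw priorities. An illustrative sketch (not tianshou code):

    import numpy as np

    td_abs = np.array([0.1, 1.0, 10.0])
    for alpha in (0.0, 0.6, 1.0):
        prio = td_abs ** alpha
        print(alpha, prio / prio.sum())  # normalized sampling probabilities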
Example 12
 def predict_next_action(self):
     """
     Predicts the next action given observation 'self.data.obs' and policy 'self.policy',
     and stores it in 'self.data.act'
     :return: outcome of policy forward pass
     """
     with torch.no_grad():
         self.data.obs = np.expand_dims(self.data.obs, axis=0)
         result = self.policy(self.data, last_state=None)
     self.data.act = to_numpy(result.act)
     return result
Example 13
    def process_fn(self, batch: Batch, buffer: ReplayBuffer,
                   indices: np.ndarray) -> Batch:
        """Pre-process the data from the provided replay buffer.

        Used in :meth:`update`. Check out :ref:`process_fn` for more information.
        """
        mse_loss, act_hat = self.model(batch.obs, batch.act, batch.obs_next)
        batch.policy = Batch(orig_rew=batch.rew,
                             act_hat=act_hat,
                             mse_loss=mse_loss)
        batch.rew += to_numpy(mse_loss * self.reward_scale)
        return self.policy.process_fn(batch, buffer, indices)
Example 14
 def process_fn(self, batch: Batch, buffer: ReplayBuffer,
                indice: np.ndarray) -> Batch:
     v_s, v_s_, old_log_prob = [], [], []
     with torch.no_grad():
         for b in batch.split(self._batch, shuffle=False, merge_last=True):
             v_s.append(self.critic(b.obs))
             v_s_.append(self.critic(b.obs_next))
             old_log_prob.append(
                 self(b).dist.log_prob(to_torch_as(b.act, v_s[0])))
     batch.v_s = torch.cat(v_s, dim=0).flatten()  # old value
     v_s = to_numpy(batch.v_s)
     v_s_ = to_numpy(torch.cat(v_s_, dim=0).flatten())
     # when normalizing values, we do not subtract self.ret_rms.mean in order to
     # stay numerically consistent with OpenAI baselines' value normalization
     # pipeline. Empirical study also shows that subtracting the mean slightly
     # harms performance for unknown reasons (on MuJoCo envs; not conclusive).
     if self._rew_norm:  # unnormalize v_s & v_s_
         v_s = v_s * np.sqrt(self.ret_rms.var + self._eps)
         v_s_ = v_s_ * np.sqrt(self.ret_rms.var + self._eps)
     unnormalized_returns, advantages = self.compute_episodic_return(
         batch,
         buffer,
         indice,
         v_s_,
         v_s,
         gamma=self._gamma,
         gae_lambda=self._lambda)
     if self._rew_norm:
         batch.returns = unnormalized_returns / \
             np.sqrt(self.ret_rms.var + self._eps)
         self.ret_rms.update(unnormalized_returns)
         mean, std = np.mean(advantages), np.std(advantages)
         advantages = (advantages - mean) / std  # per-batch norm
     else:
         batch.returns = unnormalized_returns
     batch.act = to_torch_as(batch.act, batch.v_s)
     batch.logp_old = torch.cat(old_log_prob, dim=0)
     batch.returns = to_torch_as(batch.returns, batch.v_s)
     batch.adv = to_torch_as(advantages, batch.v_s)
     return batch
Example 15
    def forward(
        self,
        batch: Batch,
        state: Optional[Union[dict, Batch, np.ndarray]] = None,
        model: str = "model",
        input: str = "obs",
        **kwargs: Any,
    ) -> Batch:
        """Compute action over the given batch data.

        :return: A :class:`~tianshou.data.Batch` which has 2 keys:

            * ``act`` the action.
            * ``state`` the hidden state.

        .. seealso::

            Please refer to :meth:`~tianshou.policy.DQNPolicy.forward` for
            more detailed explanation.
        """
        model = getattr(self, model)
        obs = batch[input]
        obs_ = obs.obs if hasattr(obs, "obs") else obs
        dist, h = model(obs_, state=state, info=batch.info)
        q = (dist * self.support).sum(2)
        act: np.ndarray = to_numpy(q.max(dim=1)[1])
        if hasattr(obs, "mask"):
            # some actions are masked and cannot be selected
            q_: np.ndarray = to_numpy(q)
            q_[~obs.mask] = -np.inf
            act = q_.argmax(axis=1)
        # add eps to act in training or testing phase
        if not self.updating and not np.isclose(self.eps, 0.0):
            for i in range(len(q)):
                if np.random.rand() < self.eps:
                    q_ = np.random.rand(*q[i].shape)
                    if hasattr(obs, "mask"):
                        q_[~obs.mask[i]] = -np.inf
                    act[i] = q_.argmax()
        return Batch(logits=dist, act=act, state=h)
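The line `q = (dist * self.support).sum(2)` takes the expectation of the categorical value distribution over its fixed atoms, i.e. Q(s, a) = sum_i p_i(s, a) * z_i. A tiny sketch with made-up numbers:

    import torch

    support = torch.tensor([-1.0, 0.0, 1.0])    # atom values z_i
    dist = torch.tensor([[[0.1, 0.2, 0.7]]])    # (batch=1, n_actions=1, n_atoms=3)
    q = (dist * support).sum(2)
    print(q)  # tensor([[0.6000]])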
Example 16
 def process_fn(self, batch: Batch, buffer: ReplayBuffer,
                indice: np.ndarray) -> Batch:
     if self._lambda in [0, 1]:
         return self.compute_episodic_return(
             batch, None, gamma=self._gamma, gae_lambda=self._lambda)
     v_ = []
     with torch.no_grad():
         for b in batch.split(self._batch, shuffle=False, merge_last=True):
             v_.append(to_numpy(self.critic(b.obs_next)))
     v_ = np.concatenate(v_, axis=0)
     return self.compute_episodic_return(
         batch, v_, gamma=self._gamma, gae_lambda=self._lambda,
         rew_norm=self._rew_norm)
Example 17
 def forward(
     self,
     s: Union[np.ndarray, torch.Tensor],
     a: Optional[Union[np.ndarray, torch.Tensor]] = None,
     info: Dict[str, Any] = {},
 ) -> torch.Tensor:
     """Mapping: (s, a) -> logits -> Q(s, a)."""
     if a is not None:
         if s.dtype == object:
             a = to_numpy(a)
             s_0 = np.stack(s[:, 0], axis=0)
             s_1 = np.vstack(s[:, 1])
             s_1 = np.hstack((s_1, a))
             s = (s_0, s_1)
         else:
             s = s.reshape(s.shape[0], -1)
             a = to_numpy(a)
             a = a.reshape(a.shape[0], -1)
             s = np.concatenate((s, a), axis=1)
     logits, h = self.preprocess(s)
     logits = self.last(logits)
     return logits
Example 18
 def forward(
     self,
     batch: Batch,
     state: Optional[Union[Dict, Batch, np.ndarray]] = None,
     model: str = "model",
     input: str = "obs",
     **kwargs: Any,
 ) -> Batch:
     model = getattr(self, model)
     obs = batch[input]
     obs_next = obs.obs if hasattr(obs, "obs") else obs
     logits, hidden = model(obs_next, state=state, info=batch.info)
     act = to_numpy(logits.max(dim=-1)[1])
     return Batch(logits=logits, act=act, state=hidden)
Example 19
def test_nstep_returns(size=10000):
    buf = ReplayBuffer(10)
    for i in range(12):
        buf.add(obs=0, act=0, rew=i + 1, done=i % 4 == 3)
    batch, indice = buf.sample(0)
    assert np.allclose(indice, [2, 3, 4, 5, 6, 7, 8, 9, 0, 1])
    # rew:  [11, 12, 3, 4, 5, 6, 7, 8, 9, 10]
    # done: [ 0,  1, 0, 1, 0, 0, 0, 1, 0, 0]
    # test nstep = 1
    returns = to_numpy(BasePolicy.compute_nstep_return(
        batch, buf, indice, target_q_fn, gamma=.1, n_step=1).pop('returns'))
    assert np.allclose(returns, [2.6, 4, 4.4, 5.3, 6.2, 8, 8, 8.9, 9.8, 12])
    r_ = compute_nstep_return_base(1, .1, buf, indice)
    assert np.allclose(returns, r_), (r_, returns)
    returns_multidim = to_numpy(BasePolicy.compute_nstep_return(
        batch, buf, indice, target_q_fn_multidim, gamma=.1, n_step=1
    ).pop('returns'))
    assert np.allclose(returns_multidim, returns[:, np.newaxis])
    # test nstep = 2
    returns = to_numpy(BasePolicy.compute_nstep_return(
        batch, buf, indice, target_q_fn, gamma=.1, n_step=2).pop('returns'))
    assert np.allclose(returns, [
        3.4, 4, 5.53, 6.62, 7.8, 8, 9.89, 10.98, 12.2, 12])
    r_ = compute_nstep_return_base(2, .1, buf, indice)
    assert np.allclose(returns, r_)
    returns_multidim = to_numpy(BasePolicy.compute_nstep_return(
        batch, buf, indice, target_q_fn_multidim, gamma=.1, n_step=2
    ).pop('returns'))
    assert np.allclose(returns_multidim, returns[:, np.newaxis])
    # test nstep = 10
    returns = to_numpy(BasePolicy.compute_nstep_return(
        batch, buf, indice, target_q_fn, gamma=.1, n_step=10).pop('returns'))
    assert np.allclose(returns, [
        3.4, 4, 5.678, 6.78, 7.8, 8, 10.122, 11.22, 12.2, 12])
    r_ = compute_nstep_return_base(10, .1, buf, indice)
    assert np.allclose(returns, r_)
    returns_multidim = to_numpy(BasePolicy.compute_nstep_return(
        batch, buf, indice, target_q_fn_multidim, gamma=.1, n_step=10
    ).pop('returns'))
    assert np.allclose(returns_multidim, returns[:, np.newaxis])

    if __name__ == '__main__':
        buf = ReplayBuffer(size)
        for i in range(int(size * 1.5)):
            buf.add(obs=0, act=0, rew=i + 1, done=np.random.randint(3) == 0)
        batch, indice = buf.sample(256)

        def vanilla():
            return compute_nstep_return_base(3, .1, buf, indice)

        def optimized():
            return BasePolicy.compute_nstep_return(
                batch, buf, indice, target_q_fn, gamma=.1, n_step=3)

        cnt = 3000
        print('nstep vanilla', timeit(vanilla, setup=vanilla, number=cnt))
        print('nstep optim  ', timeit(optimized, setup=optimized, number=cnt))
Example 20
 def add(self, x: Union[float, list, np.ndarray, torch.Tensor]) -> float:
     """Add a scalar into :class:`MovAvg`. You can add ``torch.Tensor`` with
     only one element, a python scalar, or a list of python scalar.
     """
     if isinstance(x, torch.Tensor):
         x = to_numpy(x.flatten())
     if isinstance(x, list) or isinstance(x, np.ndarray):
         for _ in x:
             if _ not in self.banned:
                 self.cache.append(_)
     elif x not in self.banned:
         self.cache.append(x)
     if self.size > 0 and len(self.cache) > self.size:
         self.cache = self.cache[-self.size:]
     return self.get()
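A minimal usage sketch, assuming `MovAvg` is the moving-average helper from `tianshou.utils` and that `get()` returns the mean of the cached values:

    from tianshou.utils import MovAvg

    stat = MovAvg(100)       # keep at most the last 100 values
    stat.add(1.0)
    stat.add([2.0, 3.0])     # lists (and flattened tensors) are added element-wise
    print(stat.get())        # mean of the cached values, here 2.0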
Example 21
 def process_fn(self, batch: Batch, buffer: ReplayBuffer,
                indice: np.ndarray) -> Batch:
     if self._rew_norm:
         mean, std = batch.rew.mean(), batch.rew.std()
         if not np.isclose(std, 0):
             batch.rew = (batch.rew - mean) / std
     if self._lambda in [0, 1]:
         return self.compute_episodic_return(
             batch, None, gamma=self._gamma, gae_lambda=self._lambda)
     v_ = []
     with torch.no_grad():
         for b in batch.split(self._batch, shuffle=False):
             v_.append(self.critic(b.obs_next))
     v_ = to_numpy(torch.cat(v_, dim=0))
     return self.compute_episodic_return(
         batch, v_, gamma=self._gamma, gae_lambda=self._lambda)
Example 22
def compute_nstep_return_base(nstep, gamma, buffer, indice):
    returns = np.zeros_like(indice, dtype=float)
    buf_len = len(buffer)
    for i in range(len(indice)):
        flag, r = False, 0.
        for n in range(nstep):
            idx = (indice[i] + n) % buf_len
            r += buffer.rew[idx] * gamma**n
            if buffer.done[idx]:
                flag = True
                break
        if not flag:
            idx = (indice[i] + nstep - 1) % buf_len
            r += to_numpy(target_q_fn(buffer, idx)) * gamma**nstep
        returns[i] = r
    return returns
Example 23
 def obs_attacks(self, data, target_action: List[int]):
     """
     Performs an image adversarial attack on the observation stored in 'obs' respect to
     the action 'target_action' using the method defined in 'self.obs_adv_atk'
     """
     data = deepcopy(data)
     obs = torch.FloatTensor(data.obs).to(
         self.device)  # convert observation to tensor
     act = torch.tensor(target_action).to(
         self.device)  # convert action to tensor
     adv_obs = self.obs_adv_atk.perturb(
         obs, act)  # create adversarial observation
     with torch.no_grad():
         adv_obs = adv_obs.cpu().detach().numpy()
         data.obs = adv_obs
         result = self.policy(data, last_state=None)
     return to_numpy(result.act), adv_obs
Example 24
    def forward(
        self,
        batch: Batch,
        state: Optional[Union[dict, Batch, np.ndarray]] = None,
        model: str = "model",
        input: str = "obs",
        **kwargs: Any,
    ) -> Batch:
        """Compute action over the given batch data.

        If you need to mask the action, please add a "mask" into batch.obs, for
        example, if we have an environment that has three actions "0/1/2":
        ::

            batch == Batch(
                obs=Batch(
                    obs="original obs, with batch_size=1 for demonstration",
                    mask=np.array([[False, True, False]]),
                    # action 1 is available
                    # actions 0 and 2 are unavailable
                ),
                ...
            )

        :param float eps: in [0, 1], for epsilon-greedy exploration method.

        :return: A :class:`~tianshou.data.Batch` which has 3 keys:

            * ``act`` the action.
            * ``logits`` the network's raw output.
            * ``state`` the hidden state.

        .. seealso::

            Please refer to :meth:`~tianshou.policy.BasePolicy.forward` for
            more detailed explanation.
        """
        model = getattr(self, model)
        obs = batch[input]
        obs_ = obs.obs if hasattr(obs, "obs") else obs
        logits, h = model(obs_, state=state, info=batch.info)
        q = self.compute_q_value(logits, getattr(obs, "mask", None))
        if not hasattr(self, "max_action_num"):
            self.max_action_num = q.shape[1]
        act = to_numpy(q.max(dim=1)[1])
        return Batch(logits=logits, act=act, state=h)
Example 25
def test_nstep_returns(size=10000):
    buf = ReplayBuffer(10)
    for i in range(12):
        buf.add(Batch(obs=0, act=0, rew=i + 1, done=i % 4 == 3))
    batch, indices = buf.sample(0)
    assert np.allclose(indices, [2, 3, 4, 5, 6, 7, 8, 9, 0, 1])
    # rew:  [11, 12, 3, 4, 5, 6, 7, 8, 9, 10]
    # done: [ 0,  1, 0, 1, 0, 0, 0, 1, 0, 0]
    # test nstep = 1
    returns = to_numpy(
        BasePolicy.compute_nstep_return(
            batch, buf, indices, target_q_fn, gamma=.1, n_step=1
        ).pop('returns').reshape(-1)
    )
    assert np.allclose(returns, [2.6, 4, 4.4, 5.3, 6.2, 8, 8, 8.9, 9.8, 12])
    r_ = compute_nstep_return_base(1, .1, buf, indices)
    assert np.allclose(returns, r_), (r_, returns)
    returns_multidim = to_numpy(
        BasePolicy.compute_nstep_return(
            batch, buf, indices, target_q_fn_multidim, gamma=.1, n_step=1
        ).pop('returns')
    )
    assert np.allclose(returns_multidim, returns[:, np.newaxis])
    # test nstep = 2
    returns = to_numpy(
        BasePolicy.compute_nstep_return(
            batch, buf, indices, target_q_fn, gamma=.1, n_step=2
        ).pop('returns').reshape(-1)
    )
    assert np.allclose(returns, [3.4, 4, 5.53, 6.62, 7.8, 8, 9.89, 10.98, 12.2, 12])
    r_ = compute_nstep_return_base(2, .1, buf, indices)
    assert np.allclose(returns, r_)
    returns_multidim = to_numpy(
        BasePolicy.compute_nstep_return(
            batch, buf, indices, target_q_fn_multidim, gamma=.1, n_step=2
        ).pop('returns')
    )
    assert np.allclose(returns_multidim, returns[:, np.newaxis])
    # test nstep = 10
    returns = to_numpy(
        BasePolicy.compute_nstep_return(
            batch, buf, indices, target_q_fn, gamma=.1, n_step=10
        ).pop('returns').reshape(-1)
    )
    assert np.allclose(returns, [3.4, 4, 5.678, 6.78, 7.8, 8, 10.122, 11.22, 12.2, 12])
    r_ = compute_nstep_return_base(10, .1, buf, indices)
    assert np.allclose(returns, r_)
    returns_multidim = to_numpy(
        BasePolicy.compute_nstep_return(
            batch, buf, indices, target_q_fn_multidim, gamma=.1, n_step=10
        ).pop('returns')
    )
    assert np.allclose(returns_multidim, returns[:, np.newaxis])
Example 26
 def learn(self, batch: Batch, **kwargs) -> Dict[str, float]:
     if self._target and self._cnt % self._freq == 0:
         self.sync_weight()
     self.optim.zero_grad()
     q = self(batch).logits
     q = q[np.arange(len(q)), batch.act]
     r = to_torch_as(batch.returns, q)
     if hasattr(batch, 'update_weight'):
         td = r - q
         batch.update_weight(batch.indice, to_numpy(td))
         impt_weight = to_torch_as(batch.impt_weight, q)
         loss = (td.pow(2) * impt_weight).mean()
     else:
         loss = F.mse_loss(q, r)
     loss.backward()
     self.optim.step()
     self._cnt += 1
     return {'loss': loss.item()}
Example 27
    def compute_episodic_return(
        batch: Batch,
        buffer: ReplayBuffer,
        indice: np.ndarray,
        v_s_: Optional[Union[np.ndarray, torch.Tensor]] = None,
        gamma: float = 0.99,
        gae_lambda: float = 0.95,
        rew_norm: bool = False,
    ) -> Batch:
        """Compute returns over given batch.

        Use Implementation of Generalized Advantage Estimator (arXiv:1506.02438)
        to calculate q function/reward to go of given batch.

        :param Batch batch: a data batch which contains several episodes of data in
            sequential order. Mind that the end of each finished episode should be
            marked by a done flag; unfinished (or still-collecting) episodes will be
            recognized by buffer.unfinished_index().
        :param numpy.ndarray indice: the indices of batch in buffer, i.e. batch is
            equal to buffer[indice].
        :param np.ndarray v_s_: the value function of all next states :math:`V(s')`.
        :param float gamma: the discount factor, should be in [0, 1]. Default to 0.99.
        :param float gae_lambda: the parameter for Generalized Advantage Estimation,
            should be in [0, 1]. Default to 0.95.
        :param bool rew_norm: normalize the reward to Normal(0, 1). Default to False.

        :return: a Batch. The result will be stored in batch.returns as a numpy
            array with shape (bsz, ).
        """
        rew = batch.rew
        if v_s_ is None:
            assert np.isclose(gae_lambda, 1.0)
            v_s_ = np.zeros_like(rew)
        else:
            v_s_ = to_numpy(v_s_.flatten()) * BasePolicy.value_mask(
                buffer, indice)

        end_flag = batch.done.copy()
        end_flag[np.isin(indice, buffer.unfinished_index())] = True
        returns = _episodic_return(v_s_, rew, end_flag, gamma, gae_lambda)
        if rew_norm and not np.isclose(returns.std(), 0.0, 1e-2):
            returns = (returns - returns.mean()) / returns.std()
        batch.returns = returns
        return batch
Example 28
 def process_fn(
     self, batch: Batch, buffer: ReplayBuffer, indice: np.ndarray
 ) -> Batch:
     v_s_ = []
     with torch.no_grad():
         for b in batch.split(self._batch, shuffle=False, merge_last=True):
             v_s_.append(to_numpy(self.critic(b.obs_next)))
     v_s_ = np.concatenate(v_s_, axis=0)
     if self._rew_norm:  # unnormalize v_s_
         v_s_ = v_s_ * np.sqrt(self.ret_rms.var + self._eps) + self.ret_rms.mean
     unnormalized_returns, _ = self.compute_episodic_return(
         batch, buffer, indice, v_s_=v_s_,
         gamma=self._gamma, gae_lambda=self._lambda)
     if self._rew_norm:
         batch.returns = (unnormalized_returns - self.ret_rms.mean) / \
             np.sqrt(self.ret_rms.var + self._eps)
         self.ret_rms.update(unnormalized_returns)
     else:
         batch.returns = unnormalized_returns
     return batch
Example 29
def compute_nstep_return_base(nstep, gamma, buffer, indices):
    returns = np.zeros_like(indices, dtype=float)
    buf_len = len(buffer)
    for i in range(len(indices)):
        flag, rew = False, 0.
        real_step_n = nstep
        for n in range(nstep):
            idx = (indices[i] + n) % buf_len
            rew += buffer.rew[idx] * gamma**n
            if buffer.done[idx]:
                if not (hasattr(buffer, 'info')
                        and buffer.info['TimeLimit.truncated'][idx]):
                    flag = True
                real_step_n = n + 1
                break
        if not flag:
            idx = (indices[i] + real_step_n - 1) % buf_len
            rew += to_numpy(target_q_fn(buffer, idx)) * gamma**real_step_n
        returns[i] = rew
    return returns
Example 30
    def forward(self, obs, eps=False):  # eps can be False (greedy) or a number in [0, 1]

        q, _ = self.model(obs)

        # something like this, on random exploration?
        # q=q+torch.rand_like(q)*0.1

        # epsilon greedy
        # boltzman

        # torch.softmax(q) #[0.249999999,0.250000001,0.24999999,0.249999999] #[0,0,0,0]
        # q =to_numpy(torch.softmax(q[0],dim=-1))
        # action = np.random.choice(np.arange(q.shape[-1]),p=q)

        if eps is not False and np.random.rand() < eps:
            action = np.random.choice(np.arange(q.shape[-1]))
        else:
            action = to_numpy(q.max(dim=1)[1])  # choose max q

        return action
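The commented-out Boltzmann idea above could be sketched like this (illustrative only; `tau` is a free temperature parameter that is not part of the snippet):

    import numpy as np
    import torch

    def boltzmann_action(q: torch.Tensor, tau: float = 1.0) -> int:
        # q: 1-D tensor of action values; higher tau means more exploration
        probs = torch.softmax(q / tau, dim=-1).detach().cpu().numpy().astype(np.float64)
        probs /= probs.sum()  # guard against float32 rounding
        return int(np.random.choice(len(probs), p=probs))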