def compute_episodic_return(
    batch: Batch,
    v_s_: Optional[Union[np.ndarray, torch.Tensor]] = None,
    gamma: float = 0.99,
    gae_lambda: float = 0.95,
    time_trunc: Optional[int] = None,
    rew_norm: bool = False,
) -> Batch:
    """Compute returns over given full-length episodes.

    Implementation of Generalized Advantage Estimator (arXiv:1506.02438).

    :param batch: a data batch that contains several full episodes of data
        in chronological order.
    :type batch: :class:`~tianshou.data.Batch`
    :param v_s_: the value function of all next states :math:`V(s')`.
    :type v_s_: numpy.ndarray
    :param float gamma: the discount factor, should be in [0, 1], defaults
        to 0.99.
    :param float gae_lambda: the parameter for Generalized Advantage
        Estimation, should be in [0, 1], defaults to 0.95.
    :param int time_trunc: optional time-limit truncation length, passed
        through to the underlying return computation, defaults to None.
    :param bool rew_norm: normalize the reward to Normal(0, 1), defaults
        to False.

    :return: a Batch. The result will be stored in batch.returns as a numpy
        array with shape (bsz, ).
    """
    rew = batch.rew
    v_s_ = np.zeros_like(rew) if v_s_ is None else to_numpy(v_s_.flatten())
    returns = _episodic_return(
        v_s_, rew, batch.done, gamma, gae_lambda, time_trunc)
    # atol (not the positional rtol) is needed when comparing against 0.0,
    # since rtol scales with the second argument and is a no-op here
    if rew_norm and not np.isclose(returns.std(), 0.0, atol=1e-2):
        returns = (returns - returns.mean()) / returns.std()
    batch.returns = returns
    return batch
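# A minimal, hypothetical usage sketch of compute_episodic_return above on a toy
# two-episode batch (not part of the library). With v_s_ left as None the
# bootstrap value is zero, so gae_lambda=1.0 reduces to plain discounted Monte
# Carlo returns per episode.
import numpy as np
from tianshou.data import Batch

toy_batch = Batch(
    rew=np.array([1.0, 1.0, 1.0, 1.0], dtype=np.float32),
    done=np.array([False, True, False, True]),
)
toy_batch = compute_episodic_return(
    toy_batch, v_s_=None, gamma=0.99, gae_lambda=1.0)
print(toy_batch.returns)  # approx [1.99, 1.0, 1.99, 1.0]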
def forward(self, batch: Batch,
            state: Optional[Union[dict, Batch, np.ndarray]] = None,
            model: str = 'model',
            input: str = 'obs',
            eps: Optional[float] = None,
            **kwargs) -> Batch:
    """Compute action over the given batch data.

    :param float eps: in [0, 1], for epsilon-greedy exploration method.

    :return: A :class:`~tianshou.data.Batch` which has 3 keys:

        * ``act`` the action.
        * ``logits`` the network's raw output.
        * ``state`` the hidden state.

    .. seealso::

        Please refer to :meth:`~tianshou.policy.BasePolicy.forward` for
        more detailed explanation.
    """
    model = getattr(self, model)
    obs = getattr(batch, input)
    q, h = model(obs, state=state, info=batch.info)
    act = to_numpy(q.max(dim=1)[1])
    # epsilon-greedy exploration: with probability eps, replace the greedy
    # action with a uniformly random one
    if eps is None:
        eps = self.eps
    if not np.isclose(eps, 0):
        for i in range(len(q)):
            if np.random.rand() < eps:
                act[i] = np.random.randint(q.shape[1])
    return Batch(logits=q, act=act, state=h)
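# A standalone, vectorized sketch of the epsilon-greedy step used in forward
# above; `epsilon_greedy` is a hypothetical helper, not a library function.
import numpy as np

def epsilon_greedy(q_values: np.ndarray, eps: float) -> np.ndarray:
    """Argmax actions, each replaced by a uniform random action with prob. eps."""
    act = q_values.argmax(axis=1)
    explore = np.random.rand(len(q_values)) < eps
    act[explore] = np.random.randint(q_values.shape[1], size=explore.sum())
    return act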
def process_fn(self, batch: Batch, buffer: ReplayBuffer,
               indice: np.ndarray) -> Batch:
    if self._rew_norm:
        mean, std = batch.rew.mean(), batch.rew.std()
        # atol (not the positional rtol) is needed when comparing against 0.0
        if not np.isclose(std, 0.0, atol=1e-2):
            batch.rew = (batch.rew - mean) / std
    v, v_, old_log_prob = [], [], []
    with torch.no_grad():
        for b in batch.split(self._batch, shuffle=False, merge_last=True):
            v_.append(self.critic(b.obs_next))
            v.append(self.critic(b.obs))
            old_log_prob.append(
                self(b).dist.log_prob(to_torch_as(b.act, v[0])))
    v_ = to_numpy(torch.cat(v_, dim=0))
    batch = self.compute_episodic_return(
        batch, buffer, indice, v_, gamma=self._gamma,
        gae_lambda=self._lambda, rew_norm=self._rew_norm)
    batch.v = torch.cat(v, dim=0).flatten()  # old value
    batch.act = to_torch_as(batch.act, v[0])
    batch.logp_old = torch.cat(old_log_prob, dim=0)
    batch.returns = to_torch_as(batch.returns, v[0])
    batch.adv = batch.returns - batch.v
    if self._rew_norm:
        mean, std = batch.adv.mean(), batch.adv.std()
        if not np.isclose(std.item(), 0.0, atol=1e-2):
            batch.adv = (batch.adv - mean) / std
    return batch
def map_action_inverse(
    self, act: Union[Batch, List, np.ndarray]
) -> Union[Batch, List, np.ndarray]:
    """Inverse operation to :meth:`~tianshou.policy.BasePolicy.map_action`.

    This function is called in :meth:`~tianshou.data.Collector.collect` for
    random initial steps. It maps actions from the range
    [action_space.low, action_space.high] back to the value range produced
    by policy.forward.

    :param act: a data batch, list or numpy.ndarray which is the action
        taken by gym.spaces.Box.sample().

    :return: the remapped action.
    """
    if isinstance(self.action_space, gym.spaces.Box):
        act = to_numpy(act)
        if isinstance(act, np.ndarray):
            if self.action_scaling:
                low, high = self.action_space.low, self.action_space.high
                scale = high - low
                eps = np.finfo(np.float32).eps.item()
                scale[scale < eps] += eps
                # scale [low, high] back to [-1, 1]
                act = (act - low) * 2.0 / scale - 1.0
            if self.action_bound_method == "tanh":
                # inverse of the tanh bound, i.e. atanh(act)
                act = (np.log(1.0 + act) - np.log(1.0 - act)) / 2.0  # type: ignore
    return act
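# A numeric round-trip sketch of map_action_inverse above: scale [low, high]
# to [-1, 1], then invert the tanh bound with atanh. All values are made up.
import numpy as np

low, high = np.array([-2.0]), np.array([2.0])
sampled = np.array([1.0])                            # e.g. gym.spaces.Box.sample()
scaled = (sampled - low) * 2.0 / (high - low) - 1.0  # -> 0.5, in [-1, 1]
unbounded = np.arctanh(scaled)                       # == (log(1+x) - log(1-x)) / 2
assert np.allclose(np.tanh(unbounded), scaled)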
def forward(
    self,
    batch: Batch,
    state: Optional[Union[dict, Batch, np.ndarray]] = None,
    model: str = "model",
    input: str = "obs",
    **kwargs: Any,
) -> Batch:
    # choose how many quantile fractions to sample, depending on whether this
    # is the target network, an online training pass, or evaluation
    if model == "model_old":
        sample_size = self._target_sample_size
    elif self.training:
        sample_size = self._online_sample_size
    else:
        sample_size = self._sample_size
    model = getattr(self, model)
    obs = batch[input]
    obs_ = obs.obs if hasattr(obs, "obs") else obs  # unwrap dict-style observations
    (logits, taus), h = model(
        obs_, sample_size=sample_size, state=state, info=batch.info
    )
    q = self.compute_q_value(logits, getattr(obs, "mask", None))
    if not hasattr(self, "max_action_num"):
        self.max_action_num = q.shape[1]
    act = to_numpy(q.max(dim=1)[1])
    return Batch(logits=logits, act=act, state=h, taus=taus)
def compute_nstep_return(
    batch: Batch,
    buffer: ReplayBuffer,
    indice: np.ndarray,
    target_q_fn: Callable[[ReplayBuffer, np.ndarray], torch.Tensor],
    gamma: float = 0.99,
    n_step: int = 1,
    rew_norm: bool = False,
) -> Batch:
    r"""Compute n-step return for Q-learning targets.

    .. math::
        G_t = \sum_{i = t}^{t + n - 1} \gamma^{i - t}(1 - d_i)r_i
        + \gamma^n (1 - d_{t + n}) Q_{\mathrm{target}}(s_{t + n})

    where :math:`\gamma` is the discount factor, :math:`\gamma \in [0, 1]`,
    :math:`d_t` is the done flag of step :math:`t`.

    :param batch: a data batch, which is equal to buffer[indice].
    :type batch: :class:`~tianshou.data.Batch`
    :param buffer: a data buffer which contains several full-episode data
        chronologically.
    :type buffer: :class:`~tianshou.data.ReplayBuffer`
    :param indice: sampled timestep.
    :type indice: numpy.ndarray
    :param function target_q_fn: a function that receives the
        :math:`t+n-1` step's data and computes the target Q value.
    :param float gamma: the discount factor, should be in [0, 1], defaults
        to 0.99.
    :param int n_step: the number of estimation steps, should be an int
        greater than 0, defaults to 1.
    :param bool rew_norm: normalize the reward to Normal(0, 1), defaults
        to False.

    :return: a Batch. The result will be stored in batch.returns as a
        torch.Tensor with shape (bsz, ).
    """
    rew = buffer.rew
    if rew_norm:
        bfr = rew[:min(len(buffer), 1000)]  # avoid large buffer
        mean, std = bfr.mean(), bfr.std()
        # atol (not the positional rtol) is needed when comparing against 0
        if np.isclose(std, 0, atol=1e-2):
            mean, std = 0.0, 1.0
    else:
        mean, std = 0.0, 1.0
    buf_len = len(buffer)
    terminal = (indice + n_step - 1) % buf_len
    target_q_torch = target_q_fn(buffer, terminal).flatten()  # (bsz, )
    target_q = to_numpy(target_q_torch)

    target_q = _nstep_return(rew, buffer.done, target_q, indice,
                             gamma, n_step, len(buffer), mean, std)

    batch.returns = to_torch_as(target_q, target_q_torch)
    # prio buffer update
    if isinstance(buffer, PrioritizedReplayBuffer):
        batch.weight = to_torch_as(batch.weight, target_q_torch)
    return batch
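# A worked numeric check of the n-step target defined above (a sketch, not
# library code): with n = 2, gamma = 0.9, no done flags, and Q_target = 5,
# G_t = r_t + gamma * r_{t+1} + gamma^2 * Q_target.
import numpy as np

gamma, n = 0.9, 2
rew = np.array([1.0, 2.0])
q_target = 5.0
g = rew[0] + gamma * rew[1] + gamma ** n * q_target
print(g)  # 1.0 + 1.8 + 4.05 = 6.85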
def test_utils_to_torch_numpy(): batch = Batch(a=np.float64(1.0), b=Batch(c=np.ones((1, ), dtype=np.float32), d=torch.ones((1, ), dtype=torch.float64))) a_torch_float = to_torch(batch.a, dtype=torch.float32) assert a_torch_float.dtype == torch.float32 a_torch_double = to_torch(batch.a, dtype=torch.float64) assert a_torch_double.dtype == torch.float64 batch_torch_float = to_torch(batch, dtype=torch.float32) assert batch_torch_float.a.dtype == torch.float32 assert batch_torch_float.b.c.dtype == torch.float32 assert batch_torch_float.b.d.dtype == torch.float32 data_list = [float('nan'), 1] data_list_torch = to_torch(data_list) assert data_list_torch.dtype == torch.float64 data_list_2 = [np.random.rand(3, 3), np.random.rand(3, 3)] data_list_2_torch = to_torch(data_list_2) assert data_list_2_torch.shape == (2, 3, 3) assert np.allclose(to_numpy(to_torch(data_list_2)), data_list_2) data_list_3 = [np.zeros((3, 2)), np.zeros((3, 3))] data_list_3_torch = to_torch(data_list_3) assert isinstance(data_list_3_torch, list) assert all(isinstance(e, torch.Tensor) for e in data_list_3_torch) assert all( starmap(np.allclose, zip(to_numpy(to_torch(data_list_3)), data_list_3))) data_list_4 = [np.zeros((2, 3)), np.zeros((3, 3))] data_list_4_torch = to_torch(data_list_4) assert isinstance(data_list_4_torch, list) assert all(isinstance(e, torch.Tensor) for e in data_list_4_torch) assert all( starmap(np.allclose, zip(to_numpy(to_torch(data_list_4)), data_list_4))) data_list_5 = [np.zeros(2), np.zeros((3, 3))] data_list_5_torch = to_torch(data_list_5) assert isinstance(data_list_5_torch, list) assert all(isinstance(e, torch.Tensor) for e in data_list_5_torch) data_array = np.random.rand(3, 2, 2) data_empty_tensor = to_torch(data_array[[]]) assert isinstance(data_empty_tensor, torch.Tensor) assert data_empty_tensor.shape == (0, 2, 2) data_empty_array = to_numpy(data_empty_tensor) assert isinstance(data_empty_array, np.ndarray) assert data_empty_array.shape == (0, 2, 2) assert np.allclose(to_numpy(to_torch(data_array)), data_array)
def compute_nstep_return(
    batch: Batch,
    buffer: ReplayBuffer,
    indice: np.ndarray,
    target_q_fn: Callable[[ReplayBuffer, np.ndarray], torch.Tensor],
    gamma: float = 0.99,
    n_step: int = 1,
    rew_norm: bool = False,
    use_mixed: bool = False,
) -> Batch:
    r"""Compute n-step return for Q-learning targets.

    .. math::
        G_t = \sum_{i = t}^{t + n - 1} \gamma^{i - t}(1 - d_i)r_i
        + \gamma^n (1 - d_{t + n}) Q_{\mathrm{target}}(s_{t + n})

    where :math:`\gamma` is the discount factor, :math:`\gamma \in [0, 1]`,
    :math:`d_t` is the done flag of step :math:`t`.

    :param Batch batch: a data batch, which is equal to buffer[indice].
    :param ReplayBuffer buffer: the data buffer.
    :param function target_q_fn: a function which computes the target Q
        value of "obs_next" given the data buffer and wanted indices.
    :param float gamma: the discount factor, should be in [0, 1]. Default
        to 0.99.
    :param int n_step: the number of estimation steps, should be an int
        greater than 0. Default to 1.
    :param bool rew_norm: normalize the reward to Normal(0, 1). Default to
        False.
    :param bool use_mixed: whether to compute the target Q value under
        ``autocast`` (mixed precision). Default to False.

    :return: a Batch. The result will be stored in batch.returns as a
        torch.Tensor with the same shape as target_q_fn's return tensor.
    """
    assert not rew_norm, \
        "Reward normalization in computing n-step returns is unsupported now."
    rew = buffer.rew
    bsz = len(indice)
    indices = [indice]
    for _ in range(n_step - 1):
        indices.append(buffer.next(indices[-1]))
    indices = np.stack(indices)
    # terminal indicates buffer indexes n_step after "indice",
    # and is truncated at the end of each episode
    terminal = indices[-1]
    with autocast(enabled=use_mixed):
        with torch.no_grad():
            target_q_torch = target_q_fn(buffer, terminal)  # (bsz, ?)
    target_q = to_numpy(target_q_torch.float().reshape(bsz, -1))
    target_q = target_q * BasePolicy.value_mask(buffer, terminal).reshape(-1, 1)
    end_flag = buffer.done.copy()
    end_flag[buffer.unfinished_index()] = True
    target_q = _nstep_return(rew, end_flag, target_q, indices, gamma, n_step)

    batch.returns = to_torch_as(target_q, target_q_torch)
    if hasattr(batch, "weight"):  # prio buffer update
        batch.weight = to_torch_as(batch.weight, target_q_torch)
    return batch
def process_fn(self, batch: Batch, buffer: ReplayBuffer, indices: np.ndarray) -> Batch: """Pre-process the data from the provided replay buffer. Used in :meth:`update`. Check out :ref:`process_fn` for more information. """ # update reward with torch.no_grad(): batch.rew = to_numpy(-F.logsigmoid(-self.disc(batch)).flatten()) return super().process_fn(batch, buffer, indices)
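# A numeric sketch of the discriminator-based reward above:
# -logsigmoid(-x) == log(1 + e^x) == softplus(x), so larger discriminator
# logits translate into larger rewards. The logits here are made up.
import torch
import torch.nn.functional as F

logits = torch.tensor([-2.0, 0.0, 2.0])
rew = -F.logsigmoid(-logits)
assert torch.allclose(rew, F.softplus(logits))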
def compute_episodic_return(
    batch: Batch,
    buffer: ReplayBuffer,
    indice: np.ndarray,
    v_s_: Optional[Union[np.ndarray, torch.Tensor]] = None,
    v_s: Optional[Union[np.ndarray, torch.Tensor]] = None,
    gamma: float = 0.99,
    gae_lambda: float = 0.95,
) -> Tuple[np.ndarray, np.ndarray]:
    """Compute returns over the given batch.

    Uses the Generalized Advantage Estimator (arXiv:1506.02438) to calculate
    the returns and advantage of the given batch.

    :param Batch batch: a data batch which contains several episodes of data
        in sequential order. Note that the end of each finished episode of
        batch should be marked by a done flag; unfinished (or collecting)
        episodes will be recognized by buffer.unfinished_index().
    :param ReplayBuffer buffer: the data buffer.
    :param numpy.ndarray indice: the batch's location in the buffer, i.e.
        batch is equal to buffer[indice].
    :param np.ndarray v_s_: the value function of all next states
        :math:`V(s')`.
    :param np.ndarray v_s: the value function of all current states
        :math:`V(s)`; if None, it is obtained by shifting v_s_ one step.
    :param float gamma: the discount factor, should be in [0, 1]. Default
        to 0.99.
    :param float gae_lambda: the parameter for Generalized Advantage
        Estimation, should be in [0, 1]. Default to 0.95.

    :return: two numpy arrays (returns, advantage) with each shape (bsz, ).
    """
    rew = batch.rew
    if v_s_ is None:
        assert np.isclose(gae_lambda, 1.0)
        v_s_ = np.zeros_like(rew)
    else:
        v_s_ = to_numpy(v_s_.flatten())  # type: ignore
        v_s_ = v_s_ * BasePolicy.value_mask(buffer, indice)
    v_s = np.roll(v_s_, 1) if v_s is None else to_numpy(v_s.flatten())

    end_flag = batch.done.copy()
    end_flag[np.isin(indice, buffer.unfinished_index())] = True
    advantage = _gae_return(v_s, v_s_, rew, end_flag, gamma, gae_lambda)
    returns = advantage + v_s
    # normalization varies from each policy, so we don't do it here
    return returns, advantage
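# A slow reference recurrence for GAE that the vectorized _gae_return used
# above should agree with; shown for intuition only. Note the library
# pre-masks v_s_ via value_mask, while this sketch folds the (1 - end_flag)
# factor into delta directly.
import numpy as np

def gae_reference(v_s, v_s_, rew, end_flag, gamma, gae_lambda):
    adv = np.zeros_like(rew)
    gae = 0.0
    for t in reversed(range(len(rew))):
        nonterminal = 1.0 - end_flag[t]
        delta = rew[t] + gamma * v_s_[t] * nonterminal - v_s[t]
        gae = delta + gamma * gae_lambda * nonterminal * gae
        adv[t] = gae
    return adv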
def update_weight(self, index: np.ndarray,
                  new_weight: Union[np.ndarray, torch.Tensor]) -> None:
    """Update priority weight by index in this buffer.

    :param np.ndarray index: the index of the transitions whose priority
        weight should be updated.
    :param np.ndarray new_weight: the new priority weight.
    """
    weight = np.abs(to_numpy(new_weight)) + self.__eps
    self.weight[index] = weight ** self._alpha
    self._max_prio = max(self._max_prio, weight.max())
    self._min_prio = min(self._min_prio, weight.min())
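# A numeric illustration of the priority update above: priorities are
# (|new_weight| + eps) ** alpha, so larger TD errors are sampled more often.
# alpha = 0.6 is just an example value.
import numpy as np

alpha, eps = 0.6, np.finfo(np.float32).eps.item()
td_error = np.array([0.5, -2.0, 0.0])
priority = (np.abs(td_error) + eps) ** alpha
print(priority)  # roughly [0.66, 1.52, 7e-5]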
def predict_next_action(self):
    """Predict the next action for the observation ``self.data.obs`` using
    ``self.policy``, and store it in ``self.data.act``.

    :return: outcome of the policy forward pass
    """
    with torch.no_grad():
        # add a batch dimension before the forward pass
        self.data.obs = np.expand_dims(self.data.obs, axis=0)
        result = self.policy(self.data, state=None)
        self.data.act = to_numpy(result.act)
    return result
def process_fn(self, batch: Batch, buffer: ReplayBuffer, indices: np.ndarray) -> Batch: """Pre-process the data from the provided replay buffer. Used in :meth:`update`. Check out :ref:`process_fn` for more information. """ mse_loss, act_hat = self.model(batch.obs, batch.act, batch.obs_next) batch.policy = Batch(orig_rew=batch.rew, act_hat=act_hat, mse_loss=mse_loss) batch.rew += to_numpy(mse_loss * self.reward_scale) return self.policy.process_fn(batch, buffer, indices)
def process_fn(self, batch: Batch, buffer: ReplayBuffer,
               indice: np.ndarray) -> Batch:
    v_s, v_s_, old_log_prob = [], [], []
    with torch.no_grad():
        for b in batch.split(self._batch, shuffle=False, merge_last=True):
            v_s.append(self.critic(b.obs))
            v_s_.append(self.critic(b.obs_next))
            old_log_prob.append(
                self(b).dist.log_prob(to_torch_as(b.act, v_s[0])))
    batch.v_s = torch.cat(v_s, dim=0).flatten()  # old value
    v_s = to_numpy(batch.v_s)
    v_s_ = to_numpy(torch.cat(v_s_, dim=0).flatten())
    # When normalizing values, we do not subtract self.ret_rms.mean, to stay
    # numerically consistent with OpenAI Baselines' value-normalization
    # pipeline. Empirical study also shows that subtracting the mean slightly
    # hurts performance, for unknown reasons (on MuJoCo envs; not confident,
    # though).
    if self._rew_norm:  # unnormalize v_s & v_s_
        v_s = v_s * np.sqrt(self.ret_rms.var + self._eps)
        v_s_ = v_s_ * np.sqrt(self.ret_rms.var + self._eps)
    unnormalized_returns, advantages = self.compute_episodic_return(
        batch, buffer, indice, v_s_, v_s,
        gamma=self._gamma, gae_lambda=self._lambda)
    if self._rew_norm:
        batch.returns = unnormalized_returns / \
            np.sqrt(self.ret_rms.var + self._eps)
        self.ret_rms.update(unnormalized_returns)
        mean, std = np.mean(advantages), np.std(advantages)
        advantages = (advantages - mean) / std  # per-batch norm
    else:
        batch.returns = unnormalized_returns
    batch.act = to_torch_as(batch.act, batch.v_s)
    batch.logp_old = torch.cat(old_log_prob, dim=0)
    batch.returns = to_torch_as(batch.returns, batch.v_s)
    batch.adv = to_torch_as(advantages, batch.v_s)
    return batch
def forward(
    self,
    batch: Batch,
    state: Optional[Union[dict, Batch, np.ndarray]] = None,
    model: str = "model",
    input: str = "obs",
    **kwargs: Any,
) -> Batch:
    """Compute action over the given batch data.

    :return: A :class:`~tianshou.data.Batch` which has 2 keys:

        * ``act`` the action.
        * ``state`` the hidden state.

    .. seealso::

        Please refer to :meth:`~tianshou.policy.DQNPolicy.forward` for
        more detailed explanation.
    """
    model = getattr(self, model)
    obs = batch[input]
    obs_ = obs.obs if hasattr(obs, "obs") else obs
    dist, h = model(obs_, state=state, info=batch.info)
    q = (dist * self.support).sum(2)  # expected return over the atom support
    act: np.ndarray = to_numpy(q.max(dim=1)[1])
    if hasattr(obs, "mask"):
        # some actions are masked and cannot be selected
        q_: np.ndarray = to_numpy(q)
        q_[~obs.mask] = -np.inf
        act = q_.argmax(axis=1)
    # epsilon-greedy exploration (skipped while updating the policy)
    if not self.updating and not np.isclose(self.eps, 0.0):
        for i in range(len(q)):
            if np.random.rand() < self.eps:
                q_ = np.random.rand(*q[i].shape)
                if hasattr(obs, "mask"):
                    q_[~obs.mask[i]] = -np.inf
                act[i] = q_.argmax()
    return Batch(logits=dist, act=act, state=h)
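# A shape-level sketch of the distributional-to-scalar step in forward above:
# Q(s, a) is the expectation of the fixed support under the predicted atom
# probabilities. The sizes (batch=2, actions=3, atoms=51) are arbitrary.
import torch

support = torch.linspace(-10.0, 10.0, 51)           # fixed atom locations
dist = torch.softmax(torch.randn(2, 3, 51), dim=2)  # per-action atom probabilities
q = (dist * support).sum(2)                         # (2, 3) expected returns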
def process_fn(self, batch: Batch, buffer: ReplayBuffer, indice: np.ndarray) -> Batch: if self._lambda in [0, 1]: return self.compute_episodic_return( batch, None, gamma=self._gamma, gae_lambda=self._lambda) v_ = [] with torch.no_grad(): for b in batch.split(self._batch, shuffle=False, merge_last=True): v_.append(to_numpy(self.critic(b.obs_next))) v_ = np.concatenate(v_, axis=0) return self.compute_episodic_return( batch, v_, gamma=self._gamma, gae_lambda=self._lambda, rew_norm=self._rew_norm)
def forward(
    self,
    s: Union[np.ndarray, torch.Tensor],
    a: Optional[Union[np.ndarray, torch.Tensor]] = None,
    info: Dict[str, Any] = {},
) -> torch.Tensor:
    """Mapping: (s, a) -> logits -> Q(s, a)."""
    if a is not None:
        if s.dtype == object:  # np.object is deprecated (removed in numpy >= 1.24)
            # object-dtype observations carry two components per row; the
            # action is appended to the second (vector) component
            a = to_numpy(a)
            s_0 = np.stack(s[:, 0], axis=0)
            s_1 = np.vstack(s[:, 1])
            s_1 = np.hstack((s_1, a))
            s = (s_0, s_1)
        else:
            # flatten both and concatenate along the feature axis
            s = s.reshape(s.shape[0], -1)
            a = to_numpy(a)
            a = a.reshape(a.shape[0], -1)
            s = np.concatenate((s, a), axis=1)
    logits, h = self.preprocess(s)
    logits = self.last(logits)
    return logits
def forward(
    self,
    batch: Batch,
    state: Optional[Union[Dict, Batch, np.ndarray]] = None,
    model: str = "model",
    input: str = "obs",
    **kwargs: Any,
) -> Batch:
    model = getattr(self, model)
    obs = batch[input]
    # the variable holds the current observation (possibly unwrapped from a
    # dict-style obs), not the next one
    obs_ = obs.obs if hasattr(obs, "obs") else obs
    logits, hidden = model(obs_, state=state, info=batch.info)
    act = to_numpy(logits.max(dim=-1)[1])
    return Batch(logits=logits, act=act, state=hidden)
def test_nstep_returns(size=10000):
    buf = ReplayBuffer(10)
    for i in range(12):
        buf.add(obs=0, act=0, rew=i + 1, done=i % 4 == 3)
    batch, indice = buf.sample(0)
    assert np.allclose(indice, [2, 3, 4, 5, 6, 7, 8, 9, 0, 1])
    # after wrapping around the size-10 buffer, indices 0 and 1 hold the two
    # most recent rewards (11 and 12):
    # rew:  [11, 12, 3, 4, 5, 6, 7, 8, 9, 10]
    # done: [ 0,  1, 0, 1, 0, 0, 0, 1, 0,  0]
    # test nstep = 1
    returns = to_numpy(BasePolicy.compute_nstep_return(
        batch, buf, indice, target_q_fn, gamma=.1, n_step=1).pop('returns'))
    assert np.allclose(returns, [2.6, 4, 4.4, 5.3, 6.2, 8, 8, 8.9, 9.8, 12])
    r_ = compute_nstep_return_base(1, .1, buf, indice)
    assert np.allclose(returns, r_), (r_, returns)
    returns_multidim = to_numpy(BasePolicy.compute_nstep_return(
        batch, buf, indice, target_q_fn_multidim, gamma=.1, n_step=1
    ).pop('returns'))
    assert np.allclose(returns_multidim, returns[:, np.newaxis])
    # test nstep = 2
    returns = to_numpy(BasePolicy.compute_nstep_return(
        batch, buf, indice, target_q_fn, gamma=.1, n_step=2).pop('returns'))
    assert np.allclose(returns, [
        3.4, 4, 5.53, 6.62, 7.8, 8, 9.89, 10.98, 12.2, 12])
    r_ = compute_nstep_return_base(2, .1, buf, indice)
    assert np.allclose(returns, r_)
    returns_multidim = to_numpy(BasePolicy.compute_nstep_return(
        batch, buf, indice, target_q_fn_multidim, gamma=.1, n_step=2
    ).pop('returns'))
    assert np.allclose(returns_multidim, returns[:, np.newaxis])
    # test nstep = 10
    returns = to_numpy(BasePolicy.compute_nstep_return(
        batch, buf, indice, target_q_fn, gamma=.1, n_step=10).pop('returns'))
    assert np.allclose(returns, [
        3.4, 4, 5.678, 6.78, 7.8, 8, 10.122, 11.22, 12.2, 12])
    r_ = compute_nstep_return_base(10, .1, buf, indice)
    assert np.allclose(returns, r_)
    returns_multidim = to_numpy(BasePolicy.compute_nstep_return(
        batch, buf, indice, target_q_fn_multidim, gamma=.1, n_step=10
    ).pop('returns'))
    assert np.allclose(returns_multidim, returns[:, np.newaxis])

    if __name__ == '__main__':
        buf = ReplayBuffer(size)
        for i in range(int(size * 1.5)):
            buf.add(obs=0, act=0, rew=i + 1, done=np.random.randint(3) == 0)
        batch, indice = buf.sample(256)

        def vanilla():
            return compute_nstep_return_base(3, .1, buf, indice)

        def optimized():
            return BasePolicy.compute_nstep_return(
                batch, buf, indice, target_q_fn, gamma=.1, n_step=3)

        cnt = 3000
        print('nstep vanilla', timeit(vanilla, setup=vanilla, number=cnt))
        print('nstep optim  ', timeit(optimized, setup=optimized, number=cnt))
def add(self, x: Union[float, list, np.ndarray, torch.Tensor]) -> float:
    """Add a scalar into :class:`MovAvg`.

    You can add ``torch.Tensor`` with only one element, a python scalar, or
    a list of python scalars.
    """
    if isinstance(x, torch.Tensor):
        x = to_numpy(x.flatten())
    if isinstance(x, (list, np.ndarray)):
        for v in x:
            if v not in self.banned:
                self.cache.append(v)
    elif x not in self.banned:
        self.cache.append(x)
    if self.size > 0 and len(self.cache) > self.size:
        self.cache = self.cache[-self.size:]
    return self.get()
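# A hypothetical usage sketch of MovAvg.add above, assuming a constructor like
# MovAvg(size) and a get() that averages the cached values (as the return
# statement hints).
stat = MovAvg(size=3)
stat.add(1.0)
stat.add([2.0, 3.0, 4.0])
print(stat.get())  # mean over the last `size` values: (2 + 3 + 4) / 3 = 3.0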
def process_fn(self, batch: Batch, buffer: ReplayBuffer, indice: np.ndarray) -> Batch: if self._rew_norm: mean, std = batch.rew.mean(), batch.rew.std() if not np.isclose(std, 0): batch.rew = (batch.rew - mean) / std if self._lambda in [0, 1]: return self.compute_episodic_return( batch, None, gamma=self._gamma, gae_lambda=self._lambda) v_ = [] with torch.no_grad(): for b in batch.split(self._batch, shuffle=False): v_.append(self.critic(b.obs_next)) v_ = to_numpy(torch.cat(v_, dim=0)) return self.compute_episodic_return( batch, v_, gamma=self._gamma, gae_lambda=self._lambda)
def compute_nstep_return_base(nstep, gamma, buffer, indice):
    returns = np.zeros_like(indice, dtype=float)  # np.float is deprecated
    buf_len = len(buffer)
    for i in range(len(indice)):
        flag, r = False, 0.
        for n in range(nstep):
            idx = (indice[i] + n) % buf_len
            r += buffer.rew[idx] * gamma**n
            if buffer.done[idx]:
                flag = True
                break
        if not flag:
            idx = (indice[i] + nstep - 1) % buf_len
            r += to_numpy(target_q_fn(buffer, idx)) * gamma**nstep
        returns[i] = r
    return returns
def obs_attacks(self, data, target_action: List[int]):
    """Perform an adversarial image attack on the observation stored in
    ``data.obs`` with respect to the action ``target_action``, using the
    method defined in ``self.obs_adv_atk``.
    """
    data = deepcopy(data)
    obs = torch.FloatTensor(data.obs).to(self.device)  # observation to tensor
    act = torch.tensor(target_action).to(self.device)  # action to tensor
    adv_obs = self.obs_adv_atk.perturb(obs, act)  # adversarial observation
    with torch.no_grad():
        adv_obs = adv_obs.cpu().detach().numpy()
    data.obs = adv_obs
    result = self.policy(data, state=None)
    return to_numpy(result.act), adv_obs
def forward(
    self,
    batch: Batch,
    state: Optional[Union[dict, Batch, np.ndarray]] = None,
    model: str = "model",
    input: str = "obs",
    **kwargs: Any,
) -> Batch:
    """Compute action over the given batch data.

    If you need to mask the action, please add a "mask" into batch.obs, for
    example, in an environment with three actions (0/1/2):
    ::

        batch == Batch(
            obs=Batch(
                obs="original obs, with batch_size=1 for demonstration",
                mask=np.array([[False, True, False]]),
                # action 1 is available
                # action 0 and 2 are unavailable
            ),
            ...
        )

    :return: A :class:`~tianshou.data.Batch` which has 3 keys:

        * ``act`` the action.
        * ``logits`` the network's raw output.
        * ``state`` the hidden state.

    .. seealso::

        Please refer to :meth:`~tianshou.policy.BasePolicy.forward` for
        more detailed explanation.
    """
    model = getattr(self, model)
    obs = batch[input]
    obs_ = obs.obs if hasattr(obs, "obs") else obs
    logits, h = model(obs_, state=state, info=batch.info)
    q = self.compute_q_value(logits, getattr(obs, "mask", None))
    if not hasattr(self, "max_action_num"):
        self.max_action_num = q.shape[1]
    act = to_numpy(q.max(dim=1)[1])
    return Batch(logits=logits, act=act, state=h)
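# A standalone sketch of the masking convention documented above: illegal
# actions receive -inf before the argmax (mirroring what compute_q_value is
# expected to do with a mask).
import numpy as np

q = np.array([[1.0, 3.0, 2.0]])
mask = np.array([[False, True, False]])  # only action 1 is legal
act = np.where(mask, q, -np.inf).argmax(axis=1)
print(act)  # [1]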
def test_nstep_returns(size=10000): buf = ReplayBuffer(10) for i in range(12): buf.add(Batch(obs=0, act=0, rew=i + 1, done=i % 4 == 3)) batch, indices = buf.sample(0) assert np.allclose(indices, [2, 3, 4, 5, 6, 7, 8, 9, 0, 1]) # rew: [11, 12, 3, 4, 5, 6, 7, 8, 9, 10] # done: [ 0, 1, 0, 1, 0, 0, 0, 1, 0, 0] # test nstep = 1 returns = to_numpy( BasePolicy.compute_nstep_return( batch, buf, indices, target_q_fn, gamma=.1, n_step=1 ).pop('returns').reshape(-1) ) assert np.allclose(returns, [2.6, 4, 4.4, 5.3, 6.2, 8, 8, 8.9, 9.8, 12]) r_ = compute_nstep_return_base(1, .1, buf, indices) assert np.allclose(returns, r_), (r_, returns) returns_multidim = to_numpy( BasePolicy.compute_nstep_return( batch, buf, indices, target_q_fn_multidim, gamma=.1, n_step=1 ).pop('returns') ) assert np.allclose(returns_multidim, returns[:, np.newaxis]) # test nstep = 2 returns = to_numpy( BasePolicy.compute_nstep_return( batch, buf, indices, target_q_fn, gamma=.1, n_step=2 ).pop('returns').reshape(-1) ) assert np.allclose(returns, [3.4, 4, 5.53, 6.62, 7.8, 8, 9.89, 10.98, 12.2, 12]) r_ = compute_nstep_return_base(2, .1, buf, indices) assert np.allclose(returns, r_) returns_multidim = to_numpy( BasePolicy.compute_nstep_return( batch, buf, indices, target_q_fn_multidim, gamma=.1, n_step=2 ).pop('returns') ) assert np.allclose(returns_multidim, returns[:, np.newaxis]) # test nstep = 10 returns = to_numpy( BasePolicy.compute_nstep_return( batch, buf, indices, target_q_fn, gamma=.1, n_step=10 ).pop('returns').reshape(-1) ) assert np.allclose(returns, [3.4, 4, 5.678, 6.78, 7.8, 8, 10.122, 11.22, 12.2, 12]) r_ = compute_nstep_return_base(10, .1, buf, indices) assert np.allclose(returns, r_) returns_multidim = to_numpy( BasePolicy.compute_nstep_return( batch, buf, indices, target_q_fn_multidim, gamma=.1, n_step=10 ).pop('returns') ) assert np.allclose(returns_multidim, returns[:, np.newaxis])
def learn(self, batch: Batch, **kwargs) -> Dict[str, float]:
    if self._target and self._cnt % self._freq == 0:
        self.sync_weight()
    self.optim.zero_grad()
    q = self(batch).logits
    q = q[np.arange(len(q)), batch.act]
    r = to_torch_as(batch.returns, q)
    if hasattr(batch, 'update_weight'):
        # prioritized replay: update priorities with the TD error and weight
        # the squared error by the importance-sampling correction
        td = r - q
        batch.update_weight(batch.indice, to_numpy(td))
        impt_weight = to_torch_as(batch.impt_weight, q)
        loss = (td.pow(2) * impt_weight).mean()
    else:
        loss = F.mse_loss(q, r)
    loss.backward()
    self.optim.step()
    self._cnt += 1
    return {'loss': loss.item()}
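# A numeric sketch of the prioritized-replay loss branch in learn above:
# element-wise squared TD errors scaled by importance weights, then averaged.
import torch

td = torch.tensor([0.5, -1.0, 2.0])
impt_weight = torch.tensor([1.0, 0.8, 0.3])  # importance-sampling corrections
loss = (td.pow(2) * impt_weight).mean()
print(loss)  # (0.25 * 1.0 + 1.0 * 0.8 + 4.0 * 0.3) / 3 = 0.75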
def compute_episodic_return(
    batch: Batch,
    buffer: ReplayBuffer,
    indice: np.ndarray,
    v_s_: Optional[Union[np.ndarray, torch.Tensor]] = None,
    gamma: float = 0.99,
    gae_lambda: float = 0.95,
    rew_norm: bool = False,
) -> Batch:
    """Compute returns over the given batch.

    Uses the Generalized Advantage Estimator (arXiv:1506.02438) to calculate
    the Q function / reward-to-go of the given batch.

    :param Batch batch: a data batch which contains several episodes of data
        in sequential order. Note that the end of each finished episode of
        batch should be marked by a done flag; unfinished (or collecting)
        episodes will be recognized by buffer.unfinished_index().
    :param ReplayBuffer buffer: the data buffer.
    :param numpy.ndarray indice: the batch's location in the buffer, i.e.
        batch is equal to buffer[indice].
    :param np.ndarray v_s_: the value function of all next states
        :math:`V(s')`.
    :param float gamma: the discount factor, should be in [0, 1]. Default
        to 0.99.
    :param float gae_lambda: the parameter for Generalized Advantage
        Estimation, should be in [0, 1]. Default to 0.95.
    :param bool rew_norm: normalize the reward to Normal(0, 1). Default to
        False.

    :return: a Batch. The result will be stored in batch.returns as a numpy
        array with shape (bsz, ).
    """
    rew = batch.rew
    if v_s_ is None:
        assert np.isclose(gae_lambda, 1.0)
        v_s_ = np.zeros_like(rew)
    else:
        v_s_ = to_numpy(v_s_.flatten()) * BasePolicy.value_mask(
            buffer, indice)
    end_flag = batch.done.copy()
    end_flag[np.isin(indice, buffer.unfinished_index())] = True
    returns = _episodic_return(v_s_, rew, end_flag, gamma, gae_lambda)
    # atol (not the positional rtol) is needed when comparing against 0.0
    if rew_norm and not np.isclose(returns.std(), 0.0, atol=1e-2):
        returns = (returns - returns.mean()) / returns.std()
    batch.returns = returns
    return batch
def process_fn( self, batch: Batch, buffer: ReplayBuffer, indice: np.ndarray ) -> Batch: v_s_ = [] with torch.no_grad(): for b in batch.split(self._batch, shuffle=False, merge_last=True): v_s_.append(to_numpy(self.critic(b.obs_next))) v_s_ = np.concatenate(v_s_, axis=0) if self._rew_norm: # unnormalize v_s_ v_s_ = v_s_ * np.sqrt(self.ret_rms.var + self._eps) + self.ret_rms.mean unnormalized_returns, _ = self.compute_episodic_return( batch, buffer, indice, v_s_=v_s_, gamma=self._gamma, gae_lambda=self._lambda) if self._rew_norm: batch.returns = (unnormalized_returns - self.ret_rms.mean) / \ np.sqrt(self.ret_rms.var + self._eps) self.ret_rms.update(unnormalized_returns) else: batch.returns = unnormalized_returns return batch
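# A minimal stand-in for the ret_rms object used above, assuming a
# RunningMeanStd-style interface with `mean`, `var`, and `update` (as in
# OpenAI Baselines); the update uses the standard parallel-variance formula
# (Chan et al.). This is a sketch, not the library's implementation.
import numpy as np

class RunningMeanStd:
    def __init__(self):
        self.mean, self.var, self.count = 0.0, 1.0, 1e-4

    def update(self, x: np.ndarray) -> None:
        b_mean, b_var, b_count = x.mean(), x.var(), len(x)
        delta = b_mean - self.mean
        total = self.count + b_count
        self.mean += delta * b_count / total
        m_a, m_b = self.var * self.count, b_var * b_count
        self.var = (m_a + m_b + delta ** 2 * self.count * b_count / total) / total
        self.count = total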
def compute_nstep_return_base(nstep, gamma, buffer, indices): returns = np.zeros_like(indices, dtype=float) buf_len = len(buffer) for i in range(len(indices)): flag, rew = False, 0. real_step_n = nstep for n in range(nstep): idx = (indices[i] + n) % buf_len rew += buffer.rew[idx] * gamma**n if buffer.done[idx]: if not (hasattr(buffer, 'info') and buffer.info['TimeLimit.truncated'][idx]): flag = True real_step_n = n + 1 break if not flag: idx = (indices[i] + real_step_n - 1) % buf_len rew += to_numpy(target_q_fn(buffer, idx)) * gamma**real_step_n returns[i] = rew return returns
def forward(self, obs, eps=False):
    """Greedy action selection; ``eps`` is either False (pure greedy) or a
    number in [0, 1] for epsilon-greedy exploration."""
    q, _ = self.model(obs)
    # a Boltzmann (softmax) alternative to epsilon-greedy is sketched below
    if eps and np.random.rand() < eps:
        # explore: uniformly random action
        action = np.random.choice(np.arange(q.shape[-1]))
    else:
        # exploit: greedy argmax over Q values
        action = to_numpy(q.max(dim=1)[1])
    return action
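# The Boltzmann (softmax) exploration alluded to in the comment above; a
# hedged sketch, not what this forward implements. `temperature` is a
# made-up knob controlling how peaked the sampling distribution is.
import torch

def boltzmann_action(q_row: torch.Tensor, temperature: float = 1.0) -> int:
    probs = torch.softmax(q_row / temperature, dim=-1)
    return int(torch.multinomial(probs, num_samples=1).item())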