def optimize(self, obs: Arrayable, action: Arrayable, max_action: Tensor,
             next_obs: Arrayable, max_next_action: Tensor, reward: Arrayable,
             done: Arrayable, time_limit: Arrayable, weights: Arrayable) -> Tensor:
    action = arr_to_th(action, self._device)
    reward = arr_to_th(reward, self._device)
    weights = arr_to_th(check_array(weights), self._device)
    done = arr_to_th(check_array(done).astype('float'), self._device)
    obs = check_array(obs)
    next_obs = check_array(next_obs)

    # Standard Q-learning target: bootstrap with the target critic on the
    # next state, zeroing the bootstrap term on terminal transitions.
    q = self.critic(obs, action)
    q_next = self.critic(next_obs, max_next_action, target=True) * (1 - done)
    expected_q = (reward * self._dt + self._gamma ** self._dt * q_next).detach()

    critic_loss = (q - expected_q) ** 2

    self._q_optimizer.zero_grad()
    critic_loss.mean().backward(retain_graph=True)
    self._q_optimizer.step()

    # Polyak averaging of the target network towards the online network.
    soft_update(self._q_function, self._target_q_function, self._tau)

    return critic_loss
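
# soft_update is used above but not defined in this section. Below is a minimal
# sketch of what it is assumed to do (Polyak averaging of the target network
# towards the online network); soft_update_sketch and its tau convention are
# assumptions for illustration, not the repository's actual helper.
import torch
from torch import nn

def soft_update_sketch(net: nn.Module, target_net: nn.Module, tau: float) -> None:
    # Assumed convention: target <- tau * net + (1 - tau) * target, in place.
    with torch.no_grad():
        for param, target_param in zip(net.parameters(), target_net.parameters()):
            target_param.data.mul_(1 - tau)
            target_param.data.add_(tau * param.data)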
def observe(self, next_obs: Arrayable, reward: Arrayable, done: Arrayable,
            time_limit: Optional[Arrayable] = None) -> None:
    if self._mode != "train":
        return None

    self._count += 1
    reward = check_array(reward)
    done = check_array(done)
    if time_limit is None:
        time_limit = np.zeros(done.shape)
    time_limit = check_array(time_limit)

    if not self._current_trajectories:
        self._nb_train_env = done.shape[0]
        self._current_trajectories = \
            [Trajectory(boundlength=self._T) for _ in range(self._nb_train_env)]

    for k, traj in enumerate(self._current_trajectories):
        traj.push(self._current_obs[k], self._current_action[k],
                  reward[k], float(done[k]), float(time_limit[k]))

    self.learn()
def step(self, obs: Arrayable) -> np.ndarray:
    if self._mode != "eval":
        action = th_to_arr(self._actor.act_noisy(obs))
    else:
        action = th_to_arr(self._actor.act(obs))

    self._current_obs = check_array(obs)
    self._current_action = check_array(action)

    if isinstance(self._actor, OnlineActorContinuous):
        action = np.clip(action, -1, 1)
    return action
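
# act and act_noisy are called above but not shown in this section. A minimal
# sketch of a plausible continuous actor, assuming a deterministic tanh policy
# with additive Gaussian exploration noise; the class name, network shape, noise
# scheme and sigma are assumptions used purely for illustration.
import numpy as np
import torch
from torch import nn

class ContinuousActorSketch(nn.Module):
    def __init__(self, obs_dim: int, action_dim: int, sigma: float = 0.1) -> None:
        super().__init__()
        self._net = nn.Sequential(
            nn.Linear(obs_dim, 64), nn.ReLU(),
            nn.Linear(64, action_dim), nn.Tanh())
        self._sigma = sigma

    def act(self, obs: np.ndarray) -> torch.Tensor:
        # Deterministic action in [-1, 1].
        with torch.no_grad():
            return self._net(torch.as_tensor(obs, dtype=torch.float32))

    def act_noisy(self, obs: np.ndarray) -> torch.Tensor:
        # Exploration: Gaussian noise added to the deterministic action.
        action = self.act(obs)
        return action + self._sigma * torch.randn_like(action)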
def compute_return(rewards: Arrayable, dones: Arrayable) -> float:
    """Compute return from rewards and termination signals.

    :args rewards: (seq_len, batch_size) reward array
    :args dones: (seq_len, batch_size) termination signal array

    :return: averaged undiscounted return
    """
    R = 0
    rewards = check_array(rewards)
    dones = check_array(dones)
    # Accumulate backwards in time, resetting the running return whenever an
    # episode terminates.
    for r, d in zip(rewards[::-1], dones[::-1]):
        R = r + R * (1 - d)
    return np.mean(R)
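
# A small usage check for compute_return: two environments, three steps, with the
# second environment terminating at its second step. The numbers are illustrative
# only.
import numpy as np

rewards = np.array([[1.0, 1.0],
                    [1.0, 1.0],
                    [1.0, 1.0]])   # (seq_len, batch_size)
dones = np.array([[0.0, 0.0],
                  [0.0, 1.0],
                  [0.0, 0.0]])
# Env 0 never terminates: its return is 1 + 1 + 1 = 3.
# Env 1 terminates at step 1, so the reward collected after termination (step 2)
# is dropped by the backward reset and its return is 1 + 1 = 2.
# compute_return(rewards, dones) therefore returns (3 + 2) / 2 = 2.5.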
def push(self, obs: Arrayable, action: Arrayable, reward: float,
         done: float, time_limit: float) -> None:
    """Push a single transition on a trajectory (before seeing the next observation)."""
    obs = check_array(obs)
    action = check_array(action)
    self._obs.append(obs)
    self._actions.append(action)
    self._rewards.append(reward)
    self._done.append(done)
    self._time_limit.append(time_limit)
    self.boundlength()
def sample(self, to_observe: bool = True) -> Tuple[Arrayable, ...]:
    if to_observe:
        assert self._idxs is None, "No observe after sample ..."

    # Sample transition indices proportionally to their priority.
    idxs, priorities = zip(
        *[self._sum_tree.sample() for _ in range(self._batch_size)])
    idxs, priorities = check_array(idxs), check_array(priorities)
    obs, action, next_obs, reward, done, _, time_limit = self._memory.sample(idxs)

    # Importance-sampling weights, normalized by their maximum.
    weights = (self._sum_tree.total / self._memory.size / priorities) ** self._beta
    weights = weights / weights.max()

    if to_observe:
        self._idxs = idxs
    return obs, action, next_obs, reward, done, weights, time_limit
def push(self, obs: Arrayable, action: Arrayable, next_obs: Arrayable,
         reward: Arrayable, done: Arrayable,
         time_limit: Optional[Arrayable]) -> None:
    self._memory.push(obs, action, next_obs, reward, done, time_limit)
    assert self._sum_tree.size == self._memory.size
    # New transitions enter the sum tree with maximal priority, so they are
    # sampled at least once before their priority is updated.
    for _ in check_array(obs):
        self._sum_tree.add(self._max_priority ** self._alpha)
def observe(self, priorities: Arrayable):
    assert self._idxs is not None, "No sample before observe ..."
    priorities = check_array(priorities)
    self._max_priority = max(self._max_priority, priorities.max())
    for idx, prio in zip(self._idxs, priorities):
        self._sum_tree.modify(idx, prio ** self._alpha)
    self._idxs = None
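
# The sum tree used by the three prioritized-replay methods above exposes add,
# modify, sample, size and total. A minimal array-based sketch of that interface,
# assuming sample() draws a slot with probability proportional to its priority and
# returns (index, priority); this is an illustration, not the repository's
# implementation.
import random

class SumTreeSketch:
    def __init__(self, capacity: int) -> None:
        self._capacity = capacity
        self._tree = [0.0] * (2 * capacity)  # node 1 is the root; leaves start at capacity
        self._cursor = 0
        self.size = 0

    @property
    def total(self) -> float:
        # Sum of all stored priorities.
        return self._tree[1]

    def _set(self, data_idx: int, priority: float) -> None:
        # Write a leaf and propagate the new sums up to the root.
        pos = data_idx + self._capacity
        self._tree[pos] = priority
        pos //= 2
        while pos >= 1:
            self._tree[pos] = self._tree[2 * pos] + self._tree[2 * pos + 1]
            pos //= 2

    def add(self, priority: float) -> None:
        # Insert at the cursor, overwriting the oldest entry when full.
        self._set(self._cursor, priority)
        self._cursor = (self._cursor + 1) % self._capacity
        self.size = min(self.size + 1, self._capacity)

    def modify(self, data_idx: int, priority: float) -> None:
        self._set(int(data_idx), priority)

    def sample(self):
        # Descend the tree with a uniform draw in [0, total).
        u = random.random() * self.total
        pos = 1
        while pos < self._capacity:
            left = 2 * pos
            if u < self._tree[left]:
                pos = left
            else:
                u -= self._tree[left]
                pos = left + 1
        data_idx = pos - self._capacity
        return data_idx, self._tree[pos]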
def optimize(self, obs: Arrayable, action: Arrayable, max_action: Tensor,
             next_obs: Arrayable, max_next_action: Tensor, reward: Arrayable,
             done: Arrayable, time_limit: Arrayable, weights: Arrayable) -> Tensor:
    """Optimizes using the DAU variant of advantage updating.

    Note that this variant uses max_action, and not max_next_action, as is
    more common with standard Q-Learning. It relies on the set of equations
        V^*(s) + dt A^*(s, a) = r(s, a) dt + gamma^dt V^*(s')
        A^*(s, a) = adv_function(s, a) - adv_function(s, max_action)
    """
    obs = check_array(obs)
    batch_size = obs.shape[0]
    action = arr_to_th(action, self._device).type_as(max_action)
    reward = arr_to_th(reward, self._device)
    weights = arr_to_th(check_array(weights), self._device)
    done = arr_to_th(check_array(done).astype('float'), self._device)

    v = self._val_function(obs).squeeze()
    next_v = (1 - done) * self._target_val_function(next_obs).squeeze()

    # Evaluate the advantage network on (obs, action) and (obs, max_action) in a
    # single batched forward pass, then center the advantage at the greedy action.
    pre_advs = self.critic(np.concatenate([obs, obs], axis=0),
                           torch.cat([action, max_action], dim=0))
    pre_adv, pre_max_adv = pre_advs[:batch_size], pre_advs[batch_size:]
    adv = pre_adv - pre_max_adv

    q = v + self._dt * adv
    # next_adv = 0 by definition
    expected_q = (reward * self._dt + self._gamma ** self._dt * next_v).detach()
    critic_loss = (q - expected_q) ** 2

    self._val_optimizer.zero_grad()
    self._adv_optimizer.zero_grad()
    critic_loss.mean().backward(retain_graph=True)
    self._val_optimizer.step()
    self._adv_optimizer.step()

    soft_update(self._adv_function, self._target_adv_function, self._tau)
    soft_update(self._val_function, self._target_val_function, self._tau)

    return critic_loss
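
# A small standalone restatement of the DAU target computation above, as pure
# tensor functions, to make the equations in the docstring concrete;
# dau_q_sketch and dau_target_sketch are illustrative names, not part of the
# repository.
import torch
from torch import Tensor

def dau_q_sketch(v: Tensor, pre_adv: Tensor, pre_max_adv: Tensor, dt: float) -> Tensor:
    # Q(s, a) = V(s) + dt * (A(s, a) - A(s, max_action)); the centering makes
    # the advantage of the greedy action exactly zero.
    return v + dt * (pre_adv - pre_max_adv)

def dau_target_sketch(reward: Tensor, next_v: Tensor, gamma: float, dt: float) -> Tensor:
    # r(s, a) * dt + gamma**dt * V(s'); the next-state advantage does not
    # appear because it is zero at the greedy action.
    return reward * dt + gamma ** dt * next_v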
def push(self, obs: Arrayable, action: Arrayable, next_obs: Arrayable,
         reward: Arrayable, done: Arrayable,
         time_limit: Optional[Arrayable]) -> None:
    """Push a transition on the buffer."""
    obs = check_array(obs)
    action = check_array(action)
    next_obs = check_array(next_obs)
    reward = check_array(reward)
    done = check_array(done)
    if time_limit is not None:
        time_limit = check_array(time_limit)
    nb_envs = obs.shape[0]

    # If empty, initialize the buffer with a capacity that is a multiple of the
    # number of environments.
    if self._true_size == -1:
        self._true_size = (self._size // nb_envs) * nb_envs
        self._obs = np.zeros((self._true_size, *obs.shape[1:]))
        self._action = np.zeros((self._true_size, *action.shape[1:]))
        self._next_obs = np.zeros((self._true_size, *next_obs.shape[1:]))
        self._reward = np.zeros((self._true_size, *reward.shape[1:]))
        self._done = np.zeros((self._true_size, *done.shape[1:]))
        if time_limit is not None:
            self._time_limit = np.zeros(
                (self._true_size, *time_limit.shape[1:]))

        # initialize reference point
        self._ref_obs = obs.copy()

    # Write the incoming transitions at the current cursor position.
    self._obs[self._cur:self._cur + nb_envs] = obs
    self._action[self._cur:self._cur + nb_envs] = action
    self._next_obs[self._cur:self._cur + nb_envs] = next_obs
    self._reward[self._cur:self._cur + nb_envs] = reward
    self._done[self._cur:self._cur + nb_envs] = done
    if self._time_limit is not None:
        self._time_limit[self._cur:self._cur + nb_envs] = time_limit

    # Advance the circular cursor, marking the buffer as full once the end is
    # reached.
    if self._cur + nb_envs == self._true_size:
        self._full = True
    self._cur = (self._cur + nb_envs) % (self._true_size)
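
# A small illustration of the cursor arithmetic used above: with a capacity of 6
# and 2 environments, pushes land at slots [0:2], [2:4], [4:6], and then wrap
# back to [0:2], at which point the buffer has been marked full. The numbers are
# illustrative only.
true_size, nb_envs, cur, full = 6, 2, 0, False
for _ in range(4):
    print(f"writing slots [{cur}:{cur + nb_envs}]")
    if cur + nb_envs == true_size:
        full = True
    cur = (cur + nb_envs) % true_size
print(full)  # True: the buffer has wrapped around at least once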