Example #1
    def backward_gae(self, grids, cells, vals, chs, rewards, next_grid,
                     next_cell) -> float:
        """Generalized Advantage Estimation"""
        # Estimated value after trajectory, V(S_t+n)
        bootstrap_val = self.sess.run(
            self.value,
            feed_dict={
                self.grid: nutils.prep_data_grids(next_grid),
                self.cell: nutils.prep_data_cells(next_cell)
            })
        rewards_plus = np.asarray(rewards + [bootstrap_val])
        discounted_rewards = nutils.discount(rewards_plus, self.gamma)[:-1]
        value_plus = np.asarray(vals + [bootstrap_val])
        advantages = nutils.discount(
            rewards + self.gamma * value_plus[1:] - value_plus[:-1], self.gamma)

        data = {
            self.grid: nutils.prep_data_grids(np.array(grids)),
            self.cell: nutils.prep_data_cells(cells),
            self.value_target: discounted_rewards,
            self.action: chs,
            self.psi: advantages
        }
        return self._backward(data)
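
All three examples lean on a `discount` helper (`nutils.discount` here) that is not shown. Below is a minimal sketch of what it presumably computes, the discounted cumulative sums over a trajectory, using the common scipy.signal.lfilter trick (an assumption; the project's actual helper may be implemented differently):

import numpy as np
import scipy.signal


def discount(x, gamma):
    """Return y with y[t] = sum over k >= t of gamma**(k - t) * x[k].

    Running the IIR filter y[n] = x[n] + gamma * y[n-1] over the reversed
    sequence yields all discounted suffix sums in one vectorized call.
    """
    return scipy.signal.lfilter([1], [1, -gamma], np.asarray(x)[::-1], axis=0)[::-1]

With such a helper, discounted_rewards above are the n-step returns r_t + gamma*r_(t+1) + ... + gamma**n * V(s_(t+n)), and advantages are the discounted sums of the one-step TD errors.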
Example #2
 def backward_multi_nstep(self,
                          grids,
                          cells,
                          chs,
                          rewards,
                          next_grid,
                          next_cell,
                          gamma,
                          next_ch=None) -> (float, float):
     """
     Multi n-step. Train on n-step, (n-1)-step, (n-2)-step, ..., 1-step returns
     """
     next_qvals = self._double_q_target(next_grid, next_cell, next_ch)
     rewards_plus = np.asarray(rewards + [next_qvals])
     # q_targets:
     # [(r0 + g*r1 + g**2*r2 +..+ g**n*q_n), (r1 + g*r2 +..+ g**(n-1)*q_n), ..,
     # (r(n-1) + g*q_n)] where g: gamma, q_n: next_qval (bootstrap val)
     q_targets = discount(rewards_plus, gamma)[:-1]
     return self.backward_supervised(grids, cells, chs, q_targets)
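
To make the target structure in the comment concrete, here is a small numeric check using a plain-loop discount (a hypothetical stand-in for the helper used above):

import numpy as np


def discount(x, gamma):
    # Backward recursion: out[t] = x[t] + gamma * out[t + 1]
    out = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + gamma * running
        out[t] = running
    return out


rewards = [1.0, 2.0, 3.0]  # r0, r1, r2
next_qval = 10.0           # bootstrap value q_n
gamma = 0.9

rewards_plus = np.asarray(rewards + [next_qval])
q_targets = discount(rewards_plus, gamma)[:-1]
# q_targets[0] = r0 + g*r1 + g**2*r2 + g**3*q_n = 12.52
# q_targets[1] = r1 + g*r2 + g**2*q_n           = 12.8
# q_targets[2] = r2 + g*q_n                     = 12.0
print(q_targets)  # -> approximately [12.52 12.8  12.  ]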
Example #3
 def backward_gae(self,
                  grids,
                  cells,
                  chs,
                  rewards,
                  next_grid,
                  next_cell,
                  gamma,
                  next_ch=None) -> (float, float):
     """Generalized Advantage Estimation"""
     next_qvals = self._double_q_target(next_grid, next_cell, next_ch)
     vals = self.sess.run(
         self.target_q_max, {
             self.grids: prep_data_grids(grids, self.grid_split),
             self.oh_cells: prep_data_cells(cells),
             self.chs: chs
         })
     # value_plus[t] = V(s_t) for each step, with the bootstrap value V(s_t+n)
     # appended so that value_plus[t+1] is defined for every t
     value_plus = np.zeros((len(vals) + 1))
     value_plus[:len(vals)] = vals
     value_plus[-1] = next_qvals
     # Discounted sum of TD errors r_t + gamma*V(s_t+1) - V(s_t)
     advantages = discount(
         rewards + gamma * value_plus[1:] - value_plus[:-1], gamma)
     return self.backward_supervised(grids, cells, chs, q_targets=advantages)
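
For reference, the vectorized advantage computation above matches the textbook GAE backward recursion sketched below (a standalone illustrative function, not part of the class; note that both examples discount the TD errors with gamma alone, which corresponds to lambda = 1):

import numpy as np


def gae_advantages(rewards, values, bootstrap_val, gamma, lam=1.0):
    """Generalized Advantage Estimation via the backward recursion
    delta_t = r_t + gamma * V(s_t+1) - V(s_t)
    A_t     = delta_t + gamma * lam * A_(t+1)
    """
    values_plus = np.append(np.asarray(values, dtype=np.float64), bootstrap_val)
    advantages = np.zeros(len(rewards))
    gae = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values_plus[t + 1] - values_plus[t]
        gae = delta + gamma * lam * gae
        advantages[t] = gae
    return advantages

With lam=1.0 this reproduces discount(rewards + gamma * value_plus[1:] - value_plus[:-1], gamma).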