def backward_gae(self, grids, cells, vals, chs, rewards, next_grid,
                 next_cell) -> float:
    """Generalized Advantage Estimation"""
    # Estimated value after trajectory, V(S_t+n)
    bootstrap_val = self.sess.run(
        self.value,
        feed_dict={
            self.grid: nutils.prep_data_grids(next_grid),
            self.cell: nutils.prep_data_cells(next_cell)
        })
    rewards_plus = np.asarray(rewards + [bootstrap_val])
    discounted_rewards = nutils.discount(rewards_plus, self.gamma)[:-1]
    value_plus = np.asarray(vals + [bootstrap_val])
    advantages = nutils.discount(
        rewards + self.gamma * value_plus[1:] - value_plus[:-1], self.gamma)
    data = {
        self.grid: nutils.prep_data_grids(np.array(grids)),
        self.cell: nutils.prep_data_cells(cells),
        self.value_target: discounted_rewards,
        self.action: chs,
        self.psi: advantages
    }
    return self._backward(data)
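
# Illustrative sketch (standalone, outside the class): `discount` below is an
# assumed reimplementation of nutils.discount as a reverse discounted
# cumulative sum, and `gae_advantages` is a hypothetical helper mirroring the
# advantage computation above. Note that the TD errors are discounted by gamma
# alone, which corresponds to GAE with lambda = 1.
import numpy as np

def discount(x, gamma):
    """discount([x0, x1, x2], g) -> [x0 + g*x1 + g^2*x2, x1 + g*x2, x2]"""
    out = np.zeros(len(x))
    acc = 0.0
    for i in reversed(range(len(x))):
        acc = x[i] + gamma * acc
        out[i] = acc
    return out

def gae_advantages(rewards, values, bootstrap_val, gamma):
    """Advantages from discounted TD errors, as in backward_gae above."""
    value_plus = np.append(np.asarray(values), bootstrap_val)
    # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    deltas = np.asarray(rewards) + gamma * value_plus[1:] - value_plus[:-1]
    return discount(deltas, gamma)

# Example: gae_advantages([1.0, 0.0], [0.5, 0.4], 0.3, 0.9)
# deltas     = [1.0 + 0.9*0.4 - 0.5, 0.0 + 0.9*0.3 - 0.4] = [0.86, -0.13]
# advantages = [0.86 + 0.9*(-0.13), -0.13]                = [0.743, -0.13]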
def backward_multi_nstep(self, grids, cells, chs, rewards, next_grid,
                         next_cell, gamma, next_ch=None) -> (float, float):
    """ Multi n-step. Train on n-step, (n-1)-step, (n-2)-step, ..., 1-step
    returns """
    next_qvals = self._double_q_target(next_grid, next_cell, next_ch)
    rewards_plus = np.asarray(rewards + [next_qvals])
    # q_targets:
    # [(r0 + g*r1 + g**2*r2 + .. + g**n*q_n), (r1 + g*r2 + .. + g**(n-1)*q_n),
    #  .., (r(n-1) + g*q_n)]
    # where g: gamma, q_n: next_qval (bootstrap val)
    q_targets = discount(rewards_plus, gamma)[:-1]
    return self.backward_supervised(grids, cells, chs, q_targets)
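
# Illustrative sketch (standalone, hypothetical helper name): reusing the
# `discount` sketch above, appending the bootstrap value to the rewards and
# discounting yields exactly the n-step, (n-1)-step, ..., 1-step targets
# described in the comment, with the trailing bootstrap entry dropped.
def multi_nstep_targets(rewards, bootstrap_val, gamma):
    rewards_plus = np.asarray(list(rewards) + [bootstrap_val])
    return discount(rewards_plus, gamma)[:-1]

# Example: multi_nstep_targets([1.0, 0.0, 2.0], 0.5, 0.9)
# -> [1.0 + 0.9*0.0 + 0.81*2.0 + 0.729*0.5,   # 3-step return
#     0.0 + 0.9*2.0 + 0.81*0.5,               # 2-step return
#     2.0 + 0.9*0.5]                          # 1-step return
# == [2.9845, 2.205, 2.45]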
def backward_gae(self, grids, cells, chs, rewards, next_grid, next_cell,
                 gamma, next_ch=None) -> (float, float):
    """Generalized Advantage Estimation"""
    next_qvals = self._double_q_target(next_grid, next_cell, next_ch)
    vals = self.sess.run(
        self.target_q_max, {
            self.grids: prep_data_grids(grids, self.grid_split),
            self.oh_cells: prep_data_cells(cells),
            self.chs: chs
        })
    value_plus = np.zeros(len(vals) + 1)
    value_plus[:len(vals)] = vals
    value_plus[-1] = next_qvals
    advantages = discount(
        rewards + gamma * value_plus[1:] - value_plus[:-1], gamma)
    return self.backward_supervised(grids, cells, chs, q_targets=advantages)
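
# Design note: this variant mirrors the actor-critic backward_gae above, but
# the state values come from the target network's max Q-value and the
# bootstrap value from a double-Q target; the resulting advantages are then
# regressed onto via backward_supervised. The sketch below shows the usual
# double Q-learning bootstrap (online net selects the action, target net
# evaluates it); the actual _double_q_target may differ in detail.
def double_q_bootstrap(online_qvals_next, target_qvals_next):
    """Both arguments: per-action Q-value arrays for the next state."""
    a = np.argmax(online_qvals_next)  # action chosen by the online network
    return target_qvals_next[a]       # value estimated by the target network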