def backward(self, grid, cell, ch, reward, next_grid, next_cell) -> float:
    # TODO Save and pass 'val' from earlier forward pass
    val = self.sess.run(
        self.value,
        feed_dict={
            self.grid: nutils.prep_data_grids(grid),
            self.cell: nutils.prep_data_cells(cell)
        })[0]
    next_val = self.sess.run(
        self.value,
        feed_dict={
            self.grid: nutils.prep_data_grids(next_grid),
            self.cell: nutils.prep_data_cells(next_cell)
        })[0]
    target_val = reward + self.gamma * next_val
    advantage = target_val - val
    data = {
        self.grid: nutils.prep_data_grids(np.array(grid)),
        self.cell: nutils.prep_data_cells(cell),
        self.value_target: target_val,
        self.action: [ch],
        self.psi: advantage
    }
    return self._backward(data)

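# A minimal standalone sketch of the one-step advantage computed above,
# assuming scalar state values: 'val' and 'next_val' stand in for the two
# forward passes, 'gamma' for self.gamma. Illustrative only.
def one_step_advantage(reward, val, next_val, gamma):
    # TD(0) target: r + gamma * V(s'); advantage: target - V(s)
    target_val = reward + gamma * next_val
    return target_val - val

# e.g. one_step_advantage(1.0, 0.5, 0.6, 0.99) -> 1.094
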
def backward(self,
             *,
             freps,
             rewards,
             next_freps,
             discount,
             weights,
             avg_reward=None,
             grids=None,
             next_grids=None,
             **kwargs):
    # NOTE Could take 'val' and 'next_val' as arguments here, as they are
    # already known from the forward pass
    assert len(freps) == 1  # Hard coded for one-step
    data1 = {self.frep: freps}
    data2 = {self.frep: next_freps}
    if self.grid_inp:
        pgrids = prep_data_grids(grids, self.grid_split)
        pnext_grids = prep_data_grids(next_grids, self.grid_split)
        data1[self.grid] = pgrids
        data2[self.grid] = pnext_grids
    value = self.sess.run(self.value, feed_dict=data1)[0, 0]
    next_value = self.sess.run(self.value, feed_dict=data2)[0, 0]
    if avg_reward is None:
        td_err = rewards[0] + discount * next_value - value
    else:
        td_err = rewards[0] - avg_reward + next_value - value
    if self.grid_inp:
        # print(pgrids[0].shape, freps[0].shape)
        inp = np.dstack((pgrids[0], freps[0]))
        next_inp = np.dstack((pnext_grids[0], next_freps[0]))
        inp_colvec = np.reshape(inp, [-1, 1])
        next_inp_colvec = np.reshape(next_inp, [-1, 1])
    else:
        inp_colvec = np.reshape(freps[0], [-1, 1])
        next_inp_colvec = np.reshape(next_freps[0], [-1, 1])
    td_inp = td_err * inp_colvec
    dot = np.dot(inp_colvec.T, np.dot(self.weights, td_inp))
    nextv = np.dot(next_inp_colvec, dot)
    if avg_reward is None:
        grad = -2 * weights[0] * (td_inp - discount * nextv)
    else:
        grad = -2 * weights[0] * (td_inp + avg_reward - nextv)
    lr, _ = self.sess.run([self.lr, self.do_train],
                          feed_dict={self.grads[0][0]: grad},
                          options=self.options,
                          run_metadata=self.run_metadata)
    # up = np.dot(np.dot(self.weights, np.dot(inp_colvec, inp_colvec.T)), self.weights)
    # lo = 1 + np.dot(dot, inp_colvec)
    # self.weights -= up / lo
    # Closed-form rank-one update of the inverse matrix kept in self.weights:
    v = np.dot(self.weights.T, next_inp_colvec)
    lo = 1 + np.dot(v.T, inp_colvec)
    self.weights -= np.dot(np.dot(self.weights, inp_colvec), v.T) / lo
    return td_err**2, lr, td_err

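# The in-place update of self.weights above appears to be the Sherman-Morrison
# identity for a rank-one update A <- A + x @ x_next.T, with self.weights kept
# as A^{-1} (as in recursive least-squares / LSTD). A small numerical check of
# that identity, with made-up shapes and data:
import numpy as np

rng = np.random.default_rng(0)
n = 4
A_inv = np.linalg.inv(np.eye(n) + 0.1 * rng.standard_normal((n, n)))
x = rng.standard_normal((n, 1))       # plays the role of inp_colvec
x_next = rng.standard_normal((n, 1))  # plays the role of next_inp_colvec

v = A_inv.T @ x_next  # matches: v = np.dot(self.weights.T, next_inp_colvec)
updated = A_inv - (A_inv @ x) @ v.T / (1 + v.T @ x)
direct = np.linalg.inv(np.linalg.inv(A_inv) + x @ x_next.T)
assert np.allclose(updated, direct)
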
def _double_q_target(self, grids, cells, freps=None, target_chs=None) -> [float]:
    """Find bootstrap value, i.e. Q(Stn, A; Wt), where
    Stn: state at time t+n
    A: target_chs, if specified, else argmax(Q(Stn, a; Wo))
    n: usually 1, unless n-step Q-learning
    Wo/Wt: online/target network"""
    data = {
        self.grids: prep_data_grids(grids, self.grid_split),
    }
    if self.pp['bighead']:
        if type(cells) == tuple:
            cells = [cells]
        data[self.cells] = cells
    else:
        data[self.cells] = prep_data_cells(cells)
    if self.pp['dueling_qnet']:
        target_q = self.target_value
    elif target_chs is None:
        # Greedy Q-Learning
        target_q = self.target_q_max
    else:
        # SARSA or Eligible Q-learning
        target_q = self.target_q_selected
        data[self.chs] = target_chs
    if freps is not None:
        data[self.freps] = freps
    qvals = self.sess.run(target_q, data)
    if self.pp['dueling_qnet']:
        qvals = qvals[0]
    return qvals

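# A small NumPy sketch of the double-Q bootstrap this method implements when
# target_chs is None: choose the action with the online network's Q-values,
# but score it with the target network's. The two arrays are made up.
import numpy as np

q_online = np.array([[0.2, 0.9, 0.1]])  # Q(Stn, a; Wo), batch of 1
q_target = np.array([[0.3, 0.5, 0.8]])  # Q(Stn, a; Wt)
a_star = np.argmax(q_online, axis=1)    # greedy action under the online net
bootstrap = q_target[np.arange(len(a_star)), a_star]  # -> array([0.5])
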
def backward(self, *, freps, value_targets, grids=None, weights=[1], **kwargs):
    data = {
        self.frep: freps,
        self.value_target: value_targets,
        self.weight: weights
    }
    if self.grid_inp:
        data[self.grid] = prep_data_grids(grids, self.grid_split)
    _, loss, lr, errs = self.sess.run(
        [self.do_train, self.loss, self.lr, self.err],
        feed_dict=data,
        options=self.options,
        run_metadata=self.run_metadata)
    # if len(freps) > 1:
    #     print(loss, loss.shape, errs, errs.shape, freps.shape, value_targets,
    #           value_targets.shape, weights, weights.shape)
    errs = np.squeeze(errs, 1)
    assert errs.shape == np.array(value_targets).shape
    return loss, lr, errs

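# 'weights' above looks like per-sample importance weights (e.g. from
# prioritized experience replay). A minimal sketch of the weighted squared
# error such a setup typically minimizes; this is an assumption about what
# self.loss computes, not taken from the source.
import numpy as np

def weighted_mse(values, value_targets, weights):
    errs = np.asarray(value_targets) - np.asarray(values)
    return np.mean(np.asarray(weights) * errs**2), errs

# e.g. weighted_mse([0.4, 1.1], [0.5, 1.0], [1.0, 0.5]) -> (0.0075, ...)
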
def _double_q_target(self, grids, cells, freps=None, target_chs=None) -> [float]:
    """Find bootstrap value, i.e. Q(Stn, A; Wt), where
    Stn: state at time t+n
    A: target_chs, if specified, else argmax(Q(Stn, a; Wo))
    n: usually 1, unless n-step Q-learning
    Wo/Wt: online/target network"""
    data = {
        self.grids: prep_data_grids(grids, self.grid_split),
        self.oh_cells: prep_data_cells(cells)
    }
    data[self.online_state_in] = self.online_state
    if target_chs is None:
        # Greedy Q-Learning
        target_q = self.online_q_max
    else:
        # SARSA or Eligible Q-learning
        target_q = self.online_q_selected
        data[self.chs] = target_chs
    if freps is not None:
        data[self.freps] = freps
    qvals, self.online_state = self.sess.run(
        [target_q, self.online_state_out], data)
    return qvals

def backward(self, grid, reward, next_grid):
    next_value = self.sess.run(
        self.value,
        feed_dict={
            self.grid: prep_data_grids(next_grid, self.pp['empty_neg'])
        })
    value_target = reward + self.gamma * next_value
    data = {
        self.grid: prep_data_grids(grid, self.pp['empty_neg']),
        self.value_target: value_target,
    }
    _, loss = self.sess.run([self.do_train, self.loss],
                            feed_dict=data,
                            options=self.options,
                            run_metadata=self.run_metadata)
    return loss

def forward(self, freps, grids):
    data = {self.frep: freps}
    if self.grid_inp:
        data[self.grid] = prep_data_grids(grids, self.grid_split)
    values = self.sess.run(
        self.value,
        feed_dict=data,
        options=self.options,
        run_metadata=self.run_metadata)
    vals = np.reshape(values, [-1])
    return vals

def forward(self, grids):
    values = self.sess.run(
        self.value,
        feed_dict={
            self.grid: prep_data_grids(grids, empty_neg=self.pp['empty_neg']),
        },
        options=self.options,
        run_metadata=self.run_metadata)
    vals = np.reshape(values, [-1])
    return vals

def forward(self, grid, cell) -> Tuple[List[float], float]:
    a_dist, val = self.sess.run(
        [self.policy, self.value],
        feed_dict={
            self.grid: nutils.prep_data_grids(grid),
            self.cell: nutils.prep_data_cells(cell)
        },
        options=self.options,
        run_metadata=self.run_metadata)
    assert val.shape == (1, 1)
    assert a_dist.shape == (1, self.n_channels)
    return a_dist[0], val[0, 0]

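# A possible caller-side sketch: sample a channel from the returned policy
# distribution. 'net' and the sampling step are illustrative assumptions;
# the source only shows that forward() returns (distribution, value).
import numpy as np

def sample_action(net, grid, cell):
    a_dist, val = net.forward(grid, cell)
    ch = np.random.choice(len(a_dist), p=a_dist)  # assumes a_dist sums to 1
    return ch, val
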
def _get_vals_inps(self, freps, next_freps, grids, next_grids):
    data = {self.frep: freps, self.next_frep: next_freps}
    if self.grid_inp:
        pgrids = prep_data_grids(grids, self.grid_split)
        pnext_grids = prep_data_grids(next_grids, self.grid_split)
        data[self.grid] = pgrids
        data[self.next_grid] = pnext_grids
    val, next_val, lr = self.sess.run(
        [self.value, self.next_value, self.lr], feed_dict=data)
    value, next_value = val[0, 0], next_val[0, 0]
    if self.grid_inp:
        # print(pgrids[0].shape, freps[0].shape)
        inp = np.dstack((pgrids[0], freps[0]))
        next_inp = np.dstack((pnext_grids[0], next_freps[0]))
        inp_colvec = np.reshape(inp, [-1, 1])
        next_inp_colvec = np.reshape(next_inp, [-1, 1])
    else:
        inp_colvec = np.reshape(freps[0], [-1, 1])
        next_inp_colvec = np.reshape(next_freps[0], [-1, 1])
    return value, next_value, inp_colvec, next_inp_colvec, lr

def forward(self, freps, grids):
    data = {self.frep: freps}
    if self.grid_inp:
        data[self.grid] = prep_data_grids(grids, self.grid_split)
    p, a = self.sess.run(
        [self.prob, self.act],
        feed_dict=data,
        options=self.options,
        run_metadata=self.run_metadata)
    p, a = p[0, 0], a[0, 0]
    # print(a, p)
    return a, p

def backward(self,
             *,
             freps,
             rewards,
             next_freps,
             discount=None,
             weights=[1],
             avg_reward=None,
             grids=None,
             next_grids=None,
             **kwargs):
    assert len(freps) == 1  # Hard coded for one-step
    assert discount is not None or avg_reward is not None
    assert weights is not None
    assert weights[0] is not None
    if avg_reward is None:
        avg_reward = 0
    else:
        discount = 1
    data = {
        self.frep: freps,
        self.next_frep: next_freps,
        self.reward: rewards,
        self.discount: [discount],
        self.avg_reward: avg_reward,
        self.ph_grad_beta: self.grad_beta,
        self.imp_weight: weights[0]
    }
    if self.grid_inp:
        data[self.grid] = prep_data_grids(grids, self.grid_split)
        data[self.next_grid] = prep_data_grids(next_grids, self.grid_split)
    lr, td_err, _, _ = self.sess.run(
        [self.lr, self.loss_grad, self.do_train, self.update_weights],
        feed_dict=data,
        options=self.options,
        run_metadata=self.run_metadata)
    self.grad_beta *= self.grad_beta_decay
    td_err = td_err[0, 0]
    return td_err**2, lr, td_err

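# Both TD-error variants used above, as a standalone sketch. In the
# average-reward case the code folds the two forms together by setting
# discount = 1 and subtracting avg_reward from the reward.
def td_error(reward, value, next_value, discount=None, avg_reward=None):
    if avg_reward is None:
        # Discounted: r + gamma * V(s') - V(s)
        return reward + discount * next_value - value
    # Average-reward: r - rho + V(s') - V(s)
    return reward - avg_reward + next_value - value
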
def backward_gae(self, grids, cells, vals, chs, rewards, next_grid,
                 next_cell) -> float:
    """Generalized Advantage Estimation"""
    # Estimated value after trajectory, V(S_t+n)
    bootstrap_val = self.sess.run(
        self.value,
        feed_dict={
            self.grid: nutils.prep_data_grids(next_grid),
            self.cell: nutils.prep_data_cells(next_cell)
        })
    rewards_plus = np.asarray(rewards + [bootstrap_val])
    discounted_rewards = nutils.discount(rewards_plus, self.gamma)[:-1]
    value_plus = np.asarray(vals + [bootstrap_val])
    advantages = nutils.discount(
        rewards + self.gamma * value_plus[1:] - value_plus[:-1], self.gamma)
    data = {
        self.grid: nutils.prep_data_grids(np.array(grids)),
        self.cell: nutils.prep_data_cells(cells),
        self.value_target: discounted_rewards,
        self.action: chs,
        self.psi: advantages
    }
    return self._backward(data)

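# nutils.discount is presumably a discounted cumulative sum over a trajectory,
# as in common A3C/GAE implementations; a reference sketch under that
# assumption:
import numpy as np

def discount(xs, gamma):
    # out[t] = xs[t] + gamma * xs[t+1] + gamma^2 * xs[t+2] + ...
    out = np.zeros_like(np.asarray(xs, dtype=float))
    running = 0.0
    for t in reversed(range(len(xs))):
        running = xs[t] + gamma * running
        out[t] = running
    return out

# discount([1, 1, 1], 0.9) -> [2.71, 1.9, 1.0]
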
def backward_supervised(self, grids, cells, chs, q_targets, freps=None,
                        weights=None):
    data = {
        self.grids: prep_data_grids(grids, self.grid_split),
        self.oh_cells: prep_data_cells(cells),
        self.chs: chs,
        self.q_targets: q_targets,
    }
    if freps is not None:
        data[self.freps] = freps
    if weights is not None:
        data[self.weights] = weights
    return self._backward(data)

def backward(self, *, freps, grids, rewards, avg_reward, actions,
             action_probs):
    assert len(freps) == 1, (len(freps), type(freps), freps.shape)  # Hard coded for one-step
    data = {
        self.frep: freps,
        self.reward: rewards,
        self.avg_reward: avg_reward,
        self.act_ph: actions,
        self.prob_ph: action_probs
    }
    if self.grid_inp:
        data[self.grid] = prep_data_grids(grids, self.grid_split)
    _ = self.sess.run(
        [self.do_train],
        feed_dict=data,
        options=self.options,
        run_metadata=self.run_metadata)

def forward(self, grid, cell, ce_type, frep=None):
    data = {
        self.grids: prep_data_grids(grid, split=self.grid_split),
        self.oh_cells: prep_data_cells(cell),
        self.online_state_in: self.online_state
    }
    if frep is not None:
        data[self.freps] = [frep]
    if self.pp['dueling_qnet']:
        q_vals_op = self.advantages
    else:
        q_vals_op = self.online_q_vals
    q_vals, self.online_state = self.sess.run(
        [q_vals_op, self.online_state_out],
        data,
        options=self.options,
        run_metadata=self.run_metadata)
    q_vals = q_vals[0]
    assert q_vals.shape == (self.n_channels, ), f"{q_vals.shape}\n{q_vals}"
    return q_vals

def forward(self, grid, cell, ce_type, frep=None):
    data = {
        self.grids: prep_data_grids(grid, split=self.grid_split),
    }
    if self.pp['bighead']:
        if type(cell) == tuple:
            cell = [cell]
        data[self.cells] = cell
    else:
        data[self.cells] = prep_data_cells(cell)
    if frep is not None:
        data[self.freps] = [frep]
    if self.pp['dueling_qnet']:
        q_vals_op = self.online_advantages
    else:
        q_vals_op = self.online_q_vals
    q_vals = self.sess.run(
        q_vals_op,
        data,
        options=self.options,
        run_metadata=self.run_metadata)
    q_vals = q_vals[0]
    assert q_vals.shape == (self.n_channels, ), f"{q_vals.shape}\n{q_vals}"
    return q_vals

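# 'dueling_qnet' suggests the dueling architecture, where Q-values are
# assembled from a state value and per-action advantages:
# Q(s, a) = V(s) + A(s, a) - mean_a' A(s, a'). A NumPy sketch of that
# aggregation; the source does not show how self.online_advantages is built.
import numpy as np

def dueling_q(value, advantages):
    advantages = np.asarray(advantages, dtype=float)
    return value + advantages - advantages.mean()

# dueling_q(1.0, [0.5, -0.5, 0.0]) -> array([1.5, 0.5, 1.0])
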
def backward_supervised(self, grids, cells, chs, q_targets, freps=None,
                        weights=None):
    data = {
        self.grids: prep_data_grids(grids, self.grid_split),
        self.chs: chs,
        self.q_targets: q_targets,
    }
    if self.pp['bighead']:
        if type(cells) == tuple:
            cells = [cells]
        data[self.cells] = cells
    else:
        data[self.cells] = prep_data_cells(cells)
    if freps is not None:
        data[self.freps] = freps
    if weights is not None:
        data[self.weights] = weights
    return self._backward(data)

def backward_gae(self, grids, cells, chs, rewards, next_grid, next_cell,
                 gamma, next_ch=None) -> (float, float):
    """Generalized Advantage Estimation"""
    # Pass next_ch by keyword: the third positional parameter of
    # _double_q_target is 'freps', not 'target_chs'
    next_qvals = self._double_q_target(next_grid, next_cell,
                                       target_chs=next_ch)
    vals = self.sess.run(
        self.target_q_max, {
            self.grids: prep_data_grids(grids, self.grid_split),
            self.oh_cells: prep_data_cells(cells),
            self.chs: chs
        })
    value_plus = np.zeros((len(vals) + 1))
    value_plus[:len(vals)] = vals
    value_plus[-1] = next_qvals
    advantages = discount(rewards + gamma * value_plus[1:] - value_plus[:-1],
                          gamma)
    return self.backward_supervised(grids, cells, chs, q_targets=advantages)

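# In standard GAE notation, advantages are a discounted sum of TD errors:
# A_t = sum_k (gamma * lam)^k * delta_{t+k}. Using discount(deltas, gamma) as
# above corresponds to lam = 1. A standalone sketch of the delta vector the
# two backward_gae methods build:
import numpy as np

def gae_deltas(rewards, values, bootstrap_val, gamma):
    value_plus = np.append(np.asarray(values, dtype=float), bootstrap_val)
    # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    return np.asarray(rewards) + gamma * value_plus[1:] - value_plus[:-1]
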