def backward(self, grid, cell, ch, reward, next_grid, next_cell) -> float:
    # TODO Save and pass 'val' from earlier forward pass
    val = self.sess.run(
        self.value,
        feed_dict={
            self.grid: nutils.prep_data_grids(grid),
            self.cell: nutils.prep_data_cells(cell)
        })[0]
    next_val = self.sess.run(
        self.value,
        feed_dict={
            self.grid: nutils.prep_data_grids(next_grid),
            self.cell: nutils.prep_data_cells(next_cell)
        })[0]
    target_val = reward + self.gamma * next_val
    advantage = target_val - val
    data = {
        self.grid: nutils.prep_data_grids(np.array(grid)),
        self.cell: nutils.prep_data_cells(cell),
        self.value_target: target_val,
        self.action: [ch],
        self.psi: advantage
    }
    return self._backward(data)

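# A minimal, standalone sketch of the arithmetic backward() performs above:
# a one-step TD target, target = r + gamma * V(s'), and the advantage used as
# the policy-gradient weight, advantage = target - V(s). The helper name and
# the plain-float inputs are hypothetical; in the method the values come from
# the two sess.run() calls on self.value.
def one_step_advantage(reward, val, next_val, gamma):
    target_val = reward + gamma * next_val
    advantage = target_val - val
    return target_val, advantage

# Example: reward=1.0, V(s)=0.5, V(s')=0.8, gamma=0.9 -> target=1.72, adv=1.22
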
def _double_q_target(self, grids, cells, freps=None, target_chs=None) -> [float]:
    """Find bootstrap value, i.e. Q(Stn, A; Wt), where
    Stn: state at time t+n
    A: target_chs, if specified, else argmax(Q(Stn, a; Wo))
    n: usually 1, unless n-step Q-learning
    Wo/Wt: online/target network
    """
    data = {
        self.grids: prep_data_grids(grids, self.grid_split),
    }
    if self.pp['bighead']:
        if type(cells) == tuple:
            cells = [cells]
        data[self.cells] = cells
    else:
        data[self.cells] = prep_data_cells(cells)
    if self.pp['dueling_qnet']:
        target_q = self.target_value
    elif target_chs is None:
        # Greedy Q-learning
        target_q = self.target_q_max
    else:
        # SARSA or eligible Q-learning
        target_q = self.target_q_selected
        data[self.chs] = target_chs
    if freps is not None:
        data[self.freps] = freps
    qvals = self.sess.run(target_q, data)
    if self.pp['dueling_qnet']:
        qvals = qvals[0]
    return qvals

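# A minimal sketch of the double Q-learning bootstrap described in the
# docstring above, in plain NumPy and outside any class: actions are selected
# with the online network's Q-values but evaluated with the target network's.
# 'online_qvals' and 'target_qvals' are hypothetical arrays of shape
# [batch, n_actions]; in the method both live inside the TF graph.
import numpy as np

def double_q_bootstrap(online_qvals, target_qvals, target_chs=None):
    if target_chs is None:
        # Greedy: pick the argmax action under the online network
        target_chs = np.argmax(online_qvals, axis=1)
    # Evaluate the chosen actions under the target network
    return target_qvals[np.arange(len(target_chs)), target_chs]
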
def backward(self, step, buf, n_step):
    # TODO:
    # - collect nsteps of data. 16-128
    # - train noptepochs consecutive times on pg net. 4
    # next_values = self.sess.run(
    #     self.value, feed_dict={
    #         self.freps: [e.next_frep for e in buf]
    #     }).squeeze()
    value_target = step.reward - self.avg_reward + step.next_val
    loss, lr, err = self.backward_vf([step.frep], [value_target])
    if len(buf) != n_step:
        return loss, lr, err
    # np.random.shuffle(buf)
    next_values = np.array([e.next_val for e in buf])
    rewards = [e.reward for e in buf]
    value_targets = rewards + next_values - self.avg_reward
    freps = [e.frep for e in buf]
    cells = [e.cell for e in buf]
    neglogpacs = [e.neglogpac for e in buf]
    chs = [e.ch for e in buf]
    for _ in range(4):
        self.sess.run(
            [self.do_train_pg], {
                self.freps: freps,
                self.cells: prep_data_cells(cells),
                self.value_target: value_targets,
                self.old_neglogpac: neglogpacs,
                self.action: chs
            })
    return loss, lr, err

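# A minimal sketch of the average-reward value target computed at the top of
# backward() above: target = r - rho + V(s'), where rho is the running average
# reward (self.avg_reward) rather than a discount factor. The helper name and
# the literal numbers are illustrative only.
def avg_reward_target(reward, avg_reward, next_val):
    return reward - avg_reward + next_val

# e.g. reward=1.0, avg_reward=0.7, V(s')=2.0 -> target = 2.3
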
def _double_q_target(self, grids, cells, freps=None, target_chs=None) -> [float]:
    """Find bootstrap value, i.e. Q(Stn, A; Wt), where
    Stn: state at time t+n
    A: target_chs, if specified, else argmax(Q(Stn, a; Wo))
    n: usually 1, unless n-step Q-learning
    Wo/Wt: online/target network
    """
    data = {
        self.grids: prep_data_grids(grids, self.grid_split),
        self.oh_cells: prep_data_cells(cells)
    }
    data[self.online_state_in] = self.online_state
    if target_chs is None:
        # Greedy Q-learning
        target_q = self.online_q_max
    else:
        # SARSA or eligible Q-learning
        target_q = self.online_q_selected
        data[self.chs] = target_chs
    if freps is not None:
        data[self.freps] = freps
    qvals, self.online_state = self.sess.run(
        [target_q, self.online_state_out], data)
    return qvals

def get_neglogpac(self, frep, cell, ch):
    policy = self.sess.run(
        self.policy, {
            self.freps: [frep],
            self.cells: prep_data_cells(cell),
        })[0]
    neglogpac = self.sess.run(
        self.neglogpac_out, {
            self.policy_in: policy,
            self.action: [ch]
        })
    return neglogpac

def forward(self, grids, freps, cells):
    values = self.sess.run(
        self.online_q_vals,
        feed_dict={
            self.grids: grids,
            self.freps: freps,
            self.cells: cells,
            self.oh_cells: prep_data_cells(cells),
        },
        options=self.options,
        run_metadata=self.run_metadata)
    vals = np.squeeze(values, axis=1)
    return vals

def forward(self, grid, cell) -> Tuple[List[float], float]:
    a_dist, val = self.sess.run(
        [self.policy, self.value],
        feed_dict={
            self.grid: nutils.prep_data_grids(grid),
            self.cell: nutils.prep_data_cells(cell)
        },
        options=self.options,
        run_metadata=self.run_metadata)
    assert val.shape == (1, 1)
    assert a_dist.shape == (1, self.n_channels)
    return a_dist[0], val[0, 0]

def backward(self, *, grids, freps, cells, chs, value_targets, **kwargs):
    data = {
        # self.grid: grids,
        self.frep: freps,
        self.oh_cell: prep_data_cells(cells),
        self.ch: chs,
        self.q_target: value_targets
    }
    _, loss, lr, err = self.sess.run(
        [self.do_train, self.loss, self.lr, self.err],
        feed_dict=data,
        options=self.options,
        run_metadata=self.run_metadata)
    return loss, lr, err

def backward(self, grids, freps, cells, chs, rewards, next_grids, next_elig,
             next_freps, next_cells, discount):
    # next_value = self.sess.run(
    #     self.online_q_selected, {
    #         self.freps: next_freps,
    #         self.oh_cells: prep_data_cells(next_cells),
    #         self.chs: next_chs
    #     })[0]
    next_value = self.sess.run(
        self.q_target_out, {
            self.grids: next_grids,
            self.elig: next_elig,
            self.freps: next_freps,
            # self.cells: [next_cells],
            self.oh_cells: prep_data_cells(next_cells),
        })
    assert next_value.shape == (1, )
    value_target = rewards + discount * next_value[0]
    data = {
        self.grids: grids,
        self.freps: freps,
        self.cells: cells,
        self.oh_cells: prep_data_cells(cells),
        self.chs: chs,
        self.q_targets: [value_target]
    }
    _, loss, lr, err = self.sess.run(
        [self.do_train, self.loss, self.lr, self.err],
        feed_dict=data,
        options=self.options,
        run_metadata=self.run_metadata)
    return loss, lr, err

def backward_gae(self, grids, cells, vals, chs, rewards, next_grid,
                 next_cell) -> float:
    """Generalized Advantage Estimation"""
    # Estimated value after trajectory, V(S_t+n)
    bootstrap_val = self.sess.run(
        self.value,
        feed_dict={
            self.grid: nutils.prep_data_grids(next_grid),
            self.cell: nutils.prep_data_cells(next_cell)
        })
    rewards_plus = np.asarray(rewards + [bootstrap_val])
    discounted_rewards = nutils.discount(rewards_plus, self.gamma)[:-1]
    value_plus = np.asarray(vals + [bootstrap_val])
    advantages = nutils.discount(
        rewards + self.gamma * value_plus[1:] - value_plus[:-1], self.gamma)
    data = {
        self.grid: nutils.prep_data_grids(np.array(grids)),
        self.cell: nutils.prep_data_cells(cells),
        self.value_target: discounted_rewards,
        self.action: chs,
        self.psi: advantages
    }
    return self._backward(data)

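# A minimal, self-contained sketch of the quantities backward_gae() feeds the
# network above, assuming nutils.discount(x, gamma) computes the discounted
# cumulative sum sum_k gamma^k * x[t+k] (the usual A3C-style helper). Plain
# NumPy, no TF session; 'rewards' and 'vals' are 1-D arrays over the
# trajectory and 'bootstrap_val' stands in for V(S_{t+n}).
import numpy as np
from scipy.signal import lfilter

def discount(x, gamma):
    # Discounted cumulative sum, computed as a reverse-time linear filter
    return lfilter([1], [1, -gamma], x[::-1])[::-1]

def gae(rewards, vals, bootstrap_val, gamma):
    rewards_plus = np.append(rewards, bootstrap_val)
    value_targets = discount(rewards_plus, gamma)[:-1]
    value_plus = np.append(vals, bootstrap_val)
    # TD residuals, then discounted by gamma (i.e. GAE with lambda = 1)
    deltas = rewards + gamma * value_plus[1:] - value_plus[:-1]
    advantages = discount(deltas, gamma)
    return value_targets, advantages
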
def forward(self, grids, freps, cells):
    data = {
        self.frep: freps,
        # self.cell: cells,
        self.oh_cell: prep_data_cells(cells),
    }
    # if self.grid_inp:
    #     data[self.grid] = grids
    values = self.sess.run(
        self.online_q_vals,
        feed_dict=data,
        options=self.options,
        run_metadata=self.run_metadata)
    vals = np.reshape(values, [-1])
    return vals

def backward_supervised(self, grids, cells, chs, q_targets, freps=None, weights=None):
    data = {
        self.grids: prep_data_grids(grids, self.grid_split),
        self.oh_cells: prep_data_cells(cells),
        self.chs: chs,
        self.q_targets: q_targets,
    }
    if freps is not None:
        data[self.freps] = freps
    if weights is not None:
        data[self.weights] = weights
    return self._backward(data)

def forward(self, grid, cell, ce_type, frep=None):
    data = {
        self.grids: prep_data_grids(grid, split=self.grid_split),
        self.oh_cells: prep_data_cells(cell),
        self.online_state_in: self.online_state
    }
    if frep is not None:
        data[self.freps] = [frep]
    if self.pp['dueling_qnet']:
        q_vals_op = self.advantages
    else:
        q_vals_op = self.online_q_vals
    q_vals, self.online_state = self.sess.run(
        [q_vals_op, self.online_state_out],
        data,
        options=self.options,
        run_metadata=self.run_metadata)
    q_vals = q_vals[0]
    assert q_vals.shape == (self.n_channels, ), f"{q_vals.shape}\n{q_vals}"
    return q_vals

def forward_action(self, frep, cell, ce_type, chs):
    # u = tf.random_uniform(tf.shape(self.policy))
    # self.sample_action = tf.argmax(elig_policy - tf.log(-tf.log(u)), axis=-1)
    policy = self.sess.run(
        self.policy, {
            self.freps: [frep],
            self.cells: prep_data_cells(cell),
        })[0]
    # u = np.random.uniform(policy.shape)
    # policy_ent = policy - np.log(-np.log(u))
    # NOTE TODO should this be argmin for END?
    if ce_type == CEvent.END:
        idx = np.argmin(policy[chs])
    else:
        idx = np.argmax(policy[chs])
    ch = chs[idx]
    neglogpac = self.sess.run(
        self.neglogpac_out, {
            self.policy_in: policy,
            self.action: [ch]
        })
    return ch, neglogpac

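# A minimal sketch of the Gumbel-max trick hinted at by the commented-out lines
# in forward_action() above: adding Gumbel noise -log(-log(u)) to the logits
# and taking the argmax samples from the categorical distribution, which would
# replace the deterministic argmax/argmin selection. 'logits' is a hypothetical
# 1-D array of unnormalized log-probabilities over the eligible channels.
import numpy as np

def gumbel_max_sample(logits, rng=np.random):
    u = rng.uniform(size=logits.shape)
    return np.argmax(logits - np.log(-np.log(u)))
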
def backward_gae(self, grids, cells, chs, rewards, next_grid, next_cell,
                 gamma, next_ch=None) -> (float, float):
    """Generalized Advantage Estimation"""
    # Pass next_ch by keyword so it binds to 'target_chs', not 'freps'
    next_qvals = self._double_q_target(next_grid, next_cell, target_chs=next_ch)
    vals = self.sess.run(
        self.target_q_max, {
            self.grids: prep_data_grids(grids, self.grid_split),
            self.oh_cells: prep_data_cells(cells),
            self.chs: chs
        })
    value_plus = np.zeros((len(vals) + 1))
    value_plus[:len(vals)] = vals
    value_plus[-1] = next_qvals
    advantages = discount(rewards + gamma * value_plus[1:] - value_plus[:-1],
                          gamma)
    return self.backward_supervised(grids, cells, chs, q_targets=advantages)

def backward_supervised(self, grids, cells, chs, q_targets, freps=None, weights=None):
    data = {
        self.grids: prep_data_grids(grids, self.grid_split),
        self.chs: chs,
        self.q_targets: q_targets,
    }
    if self.pp['bighead']:
        if type(cells) == tuple:
            cells = [cells]
        data[self.cells] = cells
    else:
        data[self.cells] = prep_data_cells(cells)
    if freps is not None:
        data[self.freps] = freps
    if weights is not None:
        data[self.weights] = weights
    return self._backward(data)

def forward(self, grid, cell, ce_type, frep=None):
    data = {
        self.grids: prep_data_grids(grid, split=self.grid_split),
    }
    if self.pp['bighead']:
        if type(cell) == tuple:
            cell = [cell]
        data[self.cells] = cell
    else:
        data[self.cells] = prep_data_cells(cell)
    if frep is not None:
        data[self.freps] = [frep]
    if self.pp['dueling_qnet']:
        q_vals_op = self.online_advantages
    else:
        q_vals_op = self.online_q_vals
    q_vals = self.sess.run(
        q_vals_op,
        data,
        options=self.options,
        run_metadata=self.run_metadata)
    q_vals = q_vals[0]
    assert q_vals.shape == (self.n_channels, ), f"{q_vals.shape}\n{q_vals}"
    return q_vals