Example #1
    def backward(self, grid, cell, ch, reward, next_grid, next_cell) -> float:
        # TODO Save and pass 'val' from earlier forward pass
        val = self.sess.run(
            self.value,
            feed_dict={
                self.grid: nutils.prep_data_grids(grid),
                self.cell: nutils.prep_data_cells(cell)
            })[0]
        next_val = self.sess.run(
            self.value,
            feed_dict={
                self.grid: nutils.prep_data_grids(next_grid),
                self.cell: nutils.prep_data_cells(next_cell)
            })[0]
        target_val = reward + self.gamma * next_val
        advantage = target_val - val

        data = {
            self.grid: nutils.prep_data_grids(np.array(grid)),
            self.cell: nutils.prep_data_cells(cell),
            self.value_target: target_val,
            self.action: [ch],
            self.psi: advantage
        }
        return self._backward(data)
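A minimal sketch of the same target and advantage arithmetic outside TensorFlow; the function name and the example inputs are illustrative and not taken from the source:

def td_target_and_advantage(reward, val, next_val, gamma):
    # One-step bootstrapped value target: r + gamma * V(s')
    target_val = reward + gamma * next_val
    # Advantage (TD error) used to scale the policy gradient
    advantage = target_val - val
    return target_val, advantage

# Usage: td_target_and_advantage(reward=1.0, val=0.4, next_val=0.5, gamma=0.9)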
Example #2
 def _double_q_target(self,
                      grids,
                      cells,
                      freps=None,
                      target_chs=None) -> [float]:
     """Find bootstrap value, i.e. Q(Stn, A; Wt).
     where Stn: state at time t+n
           A: target_chs, if specified, else argmax(Q(Stn, a; Wo))
           n: usually 1, unless n-step Q-learning
           Wo/Wt: online/target network"""
     data = {
         self.grids: prep_data_grids(grids, self.grid_split),
     }
     if self.pp['bighead']:
         if type(cells) == tuple:
             cells = [cells]
         data[self.cells] = cells
     else:
         data[self.cells] = prep_data_cells(cells)
     if self.pp['dueling_qnet']:
         target_q = self.target_value
     elif target_chs is None:
         # Greedy Q-Learning
         target_q = self.target_q_max
     else:
         # SARSA or Eligible Q-learning
         target_q = self.target_q_selected
         data[self.chs] = target_chs
     if freps is not None:
         data[self.freps] = freps
     qvals = self.sess.run(target_q, data)
     if self.pp['dueling_qnet']:
         qvals = qvals[0]
     return qvals
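A hedged sketch of how the bootstrap values returned by _double_q_target are typically turned into one-step targets; the helper below is illustrative and not part of the source:

import numpy as np

def one_step_targets(rewards, next_qvals, gamma):
    # y_t = r_t + gamma * Q(S_{t+n}, A; W_target), computed per sample
    return np.asarray(rewards) + gamma * np.asarray(next_qvals)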
Example #3
 def backward(self, step, buf, n_step):
     # TODO:
     # - collect nsteps of data. 16-128
     # - train noptepochs consecutive times on pg net. 4
     # next_values = self.sess.run(
     #     self.value, feed_dict={
     #         self.freps: [e.next_frep for e in buf]
     #     }).squeeze()
     value_target = step.reward - self.avg_reward + step.next_val
     loss, lr, err = self.backward_vf([step.frep], [value_target])
     if len(buf) != n_step:
         return loss, lr, err
     # np.random.shuffle(buf)
     next_values = np.array([e.next_val for e in buf])
     rewards = [e.reward for e in buf]
     value_targets = rewards + next_values - self.avg_reward
     freps = [e.frep for e in buf]
     cells = [e.cell for e in buf]
     neglogpacs = [e.neglogpac for e in buf]
     chs = [e.ch for e in buf]
     for _ in range(4):
         self.sess.run(
             [self.do_train_pg], {
                 self.freps: freps,
                 self.cells: prep_data_cells(cells),
                 self.value_target: value_targets,
                 self.old_neglogpac: neglogpacs,
                 self.action: chs
             })
     return loss, lr, err
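The value targets above use the average-reward (differential) formulation, r - rho + V(s'), instead of the discounted r + gamma * V(s'). A minimal sketch of that batch computation, with illustrative names:

import numpy as np

def differential_targets(rewards, next_values, avg_reward):
    # Differential one-step targets: y_t = r_t - rho + V(s_{t+1})
    return np.asarray(rewards) - avg_reward + np.asarray(next_values)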
Example #4
 def _double_q_target(self,
                      grids,
                      cells,
                      freps=None,
                      target_chs=None) -> [float]:
     """Find bootstrap value, i.e. Q(Stn, A; Wt).
     where Stn: state at time t+n
           A: target_chs, if specified, else argmax(Q(Stn, a; Wo))
           n: usually 1, unless n-step Q-learning
           Wo/Wt: online/target network"""
     data = {
         self.grids: prep_data_grids(grids, self.grid_split),
         self.oh_cells: prep_data_cells(cells)
     }
     data[self.online_state_in] = self.online_state
     if target_chs is None:
         # Greedy Q-Learning
         target_q = self.online_q_max
     else:
         # SARSA or Eligible Q-learning
         target_q = self.online_q_selected
         data[self.chs] = target_chs
     if freps is not None:
         data[self.freps] = freps
     qvals, self.online_state = self.sess.run(
         [target_q, self.online_state_out], data)
     return qvals
Example #5
 def get_neglogpac(self, frep, cell, ch):
     policy = self.sess.run(self.policy, {
         self.freps: [frep],
         self.cells: prep_data_cells(cell),
     })[0]
     neglogpac = self.sess.run(self.neglogpac_out, {
         self.policy_in: policy,
         self.action: [ch]
     })
     return neglogpac
Example #6
 def forward(self, grids, freps, cells):
     values = self.sess.run(self.online_q_vals,
                            feed_dict={
                                self.grids: grids,
                                self.freps: freps,
                                self.cells: cells,
                                self.oh_cells: prep_data_cells(cells),
                            },
                            options=self.options,
                            run_metadata=self.run_metadata)
     vals = np.squeeze(values, axis=1)
     return vals
Example #7
 def forward(self, grid, cell) -> Tuple[List[float], float]:
     a_dist, val = self.sess.run(
         [self.policy, self.value],
         feed_dict={
             self.grid: nutils.prep_data_grids(grid),
             self.cell: nutils.prep_data_cells(cell)
         },
         options=self.options,
         run_metadata=self.run_metadata)
     assert val.shape == (1, 1)
     assert a_dist.shape == (1, self.n_channels)
     return a_dist[0], val[0, 0]
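A possible usage sketch for the returned policy distribution, assuming a_dist is a valid probability vector over channels; the helper name is illustrative:

import numpy as np

def sample_action(a_dist, rng=np.random):
    # Sample a channel index in proportion to the policy probabilities
    return rng.choice(len(a_dist), p=a_dist)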
Example #8
 def backward(self, *, grids, freps, cells, chs, value_targets, **kwargs):
     data = {
         # self.grid: grids,
         self.frep: freps,
         self.oh_cell: prep_data_cells(cells),
         self.ch: chs,
         self.q_target: value_targets
     }
     _, loss, lr, err = self.sess.run(
         [self.do_train, self.loss, self.lr, self.err],
         feed_dict=data,
         options=self.options,
         run_metadata=self.run_metadata)
     return loss, lr, err
Example #9
 def backward(self, grids, freps, cells, chs, rewards, next_grids,
              next_elig, next_freps, next_cells, discount):
     # next_value = self.sess.run(
     #     self.online_q_selected, {
     #         self.freps: next_freps,
     #         self.oh_cells: prep_data_cells(next_cells),
     #         self.chs: next_chs
     #     })[0]
      next_value = self.sess.run(
          self.q_target_out, {
              self.grids: next_grids,
              self.elig: next_elig,
              self.freps: next_freps,
              # self.cells: [next_cells],
              self.oh_cells: prep_data_cells(next_cells),
          })
     assert next_value.shape == (1, )
     value_target = rewards + discount * next_value[0]
     data = {
         self.grids: grids,
         self.freps: freps,
         self.cells: cells,
         self.oh_cells: prep_data_cells(cells),
         self.chs: chs,
         self.q_targets: [value_target]
     }
     _, loss, lr, err = self.sess.run(
         [self.do_train, self.loss, self.lr, self.err],
         feed_dict=data,
         options=self.options,
         run_metadata=self.run_metadata)
     return loss, lr, err
Example #10
    def backward_gae(self, grids, cells, vals, chs, rewards, next_grid,
                     next_cell) -> float:
        """Generalized Advantage Estimation"""
        # Estimated value after trajectory, V(S_t+n)
        bootstrap_val = self.sess.run(
            self.value,
            feed_dict={
                self.grid: nutils.prep_data_grids(next_grid),
                self.cell: nutils.prep_data_cells(next_cell)
            })[0, 0]  # scalar V(S_t+n); self.value yields shape (1, 1)
        rewards_plus = np.asarray(rewards + [bootstrap_val])
        discounted_rewards = nutils.discount(rewards_plus, self.gamma)[:-1]
        value_plus = np.asarray(vals + [bootstrap_val])
        advantages = nutils.discount(
            rewards + self.gamma * value_plus[1:] - value_plus[:-1], self.gamma)

        data = {
            self.grid: nutils.prep_data_grids(np.array(grids)),
            self.cell: nutils.prep_data_cells(cells),
            self.value_target: discounted_rewards,
            self.action: chs,
            self.psi: advantages
        }
        return self._backward(data)
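nutils.discount is used above as a discounted cumulative sum over the trajectory. A hedged, plain-numpy sketch of what such a helper computes (the implementation in the source may differ):

import numpy as np

def discount(x, gamma):
    # out[i] = x[i] + gamma * x[i+1] + gamma^2 * x[i+2] + ...
    out = np.zeros(len(x))
    running = 0.0
    for i in reversed(range(len(x))):
        running = x[i] + gamma * running
        out[i] = running
    return out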
Example #11
 def forward(self, grids, freps, cells):
     data = {
         self.frep: freps,
         # self.cell: cells,
         self.oh_cell: prep_data_cells(cells),
     }
     # if self.grid_inp:
     #     data[self.grid] = grids
     values = self.sess.run(
         self.online_q_vals,
         feed_dict=data,
         options=self.options,
         run_metadata=self.run_metadata)
     vals = np.reshape(values, [-1])
     return vals
Example #12
 def backward_supervised(self,
                         grids,
                         cells,
                         chs,
                         q_targets,
                         freps=None,
                         weights=None):
     data = {
         self.grids: prep_data_grids(grids, self.grid_split),
         self.oh_cells: prep_data_cells(cells),
         self.chs: chs,
         self.q_targets: q_targets,
     }
     if freps is not None:
         data[self.freps] = freps
     if weights is not None:
         data[self.weights] = weights
     return self._backward(data)
Example #13
 def forward(self, grid, cell, ce_type, frep=None):
     data = {
         self.grids: prep_data_grids(grid, split=self.grid_split),
         self.oh_cells: prep_data_cells(cell),
         self.online_state_in: self.online_state
     }
     if frep is not None:
         data[self.freps] = [frep]
     if self.pp['dueling_qnet']:
         q_vals_op = self.advantages
     else:
         q_vals_op = self.online_q_vals
     q_vals, self.online_state = self.sess.run(
         [q_vals_op, self.online_state_out],
         data,
         options=self.options,
         run_metadata=self.run_metadata)
     q_vals = q_vals[0]
     assert q_vals.shape == (self.n_channels, ), f"{q_vals.shape}\n{q_vals}"
     return q_vals
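With the dueling head selected above, action choice only needs the advantages, since argmax over Q equals argmax over A. If full Q-values are wanted, the standard combination is V plus mean-centered advantages; a hedged sketch follows (the source network may combine them differently):

import numpy as np

def dueling_q(value, advantages):
    # Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a))
    advantages = np.asarray(advantages)
    return value + (advantages - advantages.mean())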
Example #14
    def forward_action(self, frep, cell, ce_type, chs):
        # u = tf.random_uniform(tf.shape(self.policy))
        # self.sample_action = tf.argmax(elig_policy - tf.log(-tf.log(u)), axis=-1)
        policy = self.sess.run(self.policy, {
            self.freps: [frep],
            self.cells: prep_data_cells(cell),
        })[0]

        # u = np.random.uniform(policy.shape)
        # policy_ent = policy - np.log(-np.log(u))
        # NOTE TODO should this be argmin for END?
        if ce_type == CEvent.END:
            idx = np.argmin(policy[chs])
        else:
            idx = np.argmax(policy[chs])
        ch = chs[idx]
        neglogpac = self.sess.run(self.neglogpac_out, {
            self.policy_in: policy,
            self.action: [ch]
        })
        return ch, neglogpac
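A hedged sketch of what a neglogpac op (negative log-probability of the chosen action) computes, assuming `policy` holds unnormalized per-channel scores; if it already holds probabilities this reduces to -log(policy[action]):

import numpy as np

def neglogpac(logits, action):
    logits = np.asarray(logits, dtype=float)
    z = logits - np.max(logits)                  # shift for numerical stability
    log_probs = z - np.log(np.sum(np.exp(z)))    # log-softmax
    return -log_probs[action]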
Example #15
 def backward_gae(self,
                  grids,
                  cells,
                  chs,
                  rewards,
                  next_grid,
                  next_cell,
                  gamma,
                  next_ch=None) -> (float, float):
     """Generalized Advantage Estimation"""
     next_qvals = self._double_q_target(next_grid, next_cell, next_ch)
     vals = self.sess.run(
         self.target_q_max, {
             self.grids: prep_data_grids(grids, self.grid_split),
             self.oh_cells: prep_data_cells(cells),
             self.chs: chs
         })
     value_plus = np.zeros((len(vals) + 1))
     value_plus[:len(vals)] = vals
     value_plus[-1] = next_qvals
     advantages = discount(
         rewards + gamma * value_plus[1:] - value_plus[:-1], gamma)
      return self.backward_supervised(grids, cells, chs, q_targets=advantages)
Example #16
 def backward_supervised(self,
                         grids,
                         cells,
                         chs,
                         q_targets,
                         freps=None,
                         weights=None):
     data = {
         self.grids: prep_data_grids(grids, self.grid_split),
         self.chs: chs,
         self.q_targets: q_targets,
     }
     if self.pp['bighead']:
         if type(cells) == tuple:
             cells = [cells]
         data[self.cells] = cells
     else:
         data[self.cells] = prep_data_cells(cells)
     if freps is not None:
         data[self.freps] = freps
     if weights is not None:
         data[self.weights] = weights
     return self._backward(data)
Example #17
 def forward(self, grid, cell, ce_type, frep=None):
     data = {
         self.grids: prep_data_grids(grid, split=self.grid_split),
     }
     if self.pp['bighead']:
         if type(cell) == tuple:
             cell = [cell]
         data[self.cells] = cell
     else:
         data[self.cells] = prep_data_cells(cell)
     if frep is not None:
         data[self.freps] = [frep]
     if self.pp['dueling_qnet']:
         q_vals_op = self.online_advantages
     else:
         q_vals_op = self.online_q_vals
     q_vals = self.sess.run(q_vals_op,
                            data,
                            options=self.options,
                            run_metadata=self.run_metadata)
     q_vals = q_vals[0]
     assert q_vals.shape == (self.n_channels, ), f"{q_vals.shape}\n{q_vals}"
     return q_vals