Example #1
    def backward(self, grid, cell, ch, reward, next_grid, next_cell) -> float:
        # TODO: save 'val' from the earlier forward pass and pass it in, instead of recomputing it here
        val = self.sess.run(
            self.value,
            feed_dict={
                self.grid: nutils.prep_data_grids(grid),
                self.cell: nutils.prep_data_cells(cell)
            })[0]
        next_val = self.sess.run(
            self.value,
            feed_dict={
                self.grid: nutils.prep_data_grids(next_grid),
                self.cell: nutils.prep_data_cells(next_cell)
            })[0]
        target_val = reward + self.gamma * next_val
        advantage = target_val - val

        data = {
            self.grid: nutils.prep_data_grids(np.array(grid)),
            self.cell: nutils.prep_data_cells(cell),
            self.value_target: target_val,
            self.action: [ch],
            self.psi: advantage
        }
        return self._backward(data)
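The advantage computed above is the one-step TD(0) advantage. As a standalone reference, it reduces to a couple of lines; a minimal sketch (the helper name is hypothetical, not part of the project):

def one_step_advantage(val, next_val, reward, gamma):
    # TD(0) target r + gamma * V(s'), minus the baseline V(s)
    target_val = reward + gamma * next_val
    return target_val - val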
Example #2
    def backward(self,
                 *,
                 freps,
                 rewards,
                 next_freps,
                 discount,
                 weights,
                 avg_reward=None,
                 grids=None,
                 next_grids=None,
                 **kwargs):
        # NOTE: could possibly take in val and next_val here, as they're already known
        assert len(freps) == 1  # Hard coded for one-step
        data1 = {self.frep: freps}
        data2 = {self.frep: next_freps}
        if self.grid_inp:
            pgrids = prep_data_grids(grids, self.grid_split)
            pnext_grids = prep_data_grids(next_grids, self.grid_split)
            data1[self.grid] = pgrids
            data2[self.grid] = pnext_grids
        value = self.sess.run(self.value, feed_dict=data1)[0, 0]
        next_value = self.sess.run(self.value, feed_dict=data2)[0, 0]
        if avg_reward is None:
            td_err = rewards[0] + discount * next_value - value
        else:
            td_err = rewards[0] - avg_reward + next_value - value

        if self.grid_inp:
            # print(pgrids[0].shape, freps[0].shape)
            inp = np.dstack((pgrids[0], freps[0]))
            next_inp = np.dstack((pnext_grids[0], next_freps[0]))
            inp_colvec = np.reshape(inp, [-1, 1])
            next_inp_colvec = np.reshape(next_inp, [-1, 1])
        else:
            inp_colvec = np.reshape(freps[0], [-1, 1])
            next_inp_colvec = np.reshape(next_freps[0], [-1, 1])
        # Correction term built from the running matrix maintained below
        td_inp = td_err * inp_colvec
        dot = np.dot(inp_colvec.T, np.dot(self.weights, td_inp))
        nextv = np.dot(next_inp_colvec, dot)
        if avg_reward is None:
            grad = -2 * weights[0] * (td_inp - discount * nextv)
        else:
            grad = -2 * weights[0] * (td_inp + avg_reward - nextv)
        lr, _ = self.sess.run([self.lr, self.do_train],
                              feed_dict={self.grads[0][0]: grad},
                              options=self.options,
                              run_metadata=self.run_metadata)
        # up = np.dot(np.dot(self.weights, np.dot(inp_colvec, inp_colvec.T)), self.weights)
        # lo = 1 + np.dot(dot, inp_colvec)
        # self.weights -= up / lo
        # Sherman-Morrison rank-1 update of the running (inverse) matrix
        v = np.dot(self.weights.T, next_inp_colvec)
        lo = 1 + np.dot(v.T, inp_colvec)
        self.weights -= np.dot(np.dot(self.weights, inp_colvec), v.T) / lo
        return td_err**2, lr, td_err
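The final three lines above appear to maintain a running inverse matrix through a Sherman-Morrison rank-1 update. A standalone NumPy sketch of that identity (function name and shapes are assumptions for illustration, not the project's API):

import numpy as np

def sherman_morrison(W, x, y):
    # If A_new = A + x @ y.T and W = inv(A), then
    # inv(A_new) = W - (W @ x) @ (y.T @ W) / (1 + y.T @ W @ x)
    Wx = W @ x                    # (n, 1) column
    yW = y.T @ W                  # (1, n) row
    return W - (Wx @ yW) / (1.0 + float(y.T @ Wx))

With x = inp_colvec and y = next_inp_colvec, this reproduces the update applied to self.weights above.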
Example #3
File: qnet.py Project: namch29/dca
 def _double_q_target(self,
                      grids,
                      cells,
                      freps=None,
                      target_chs=None) -> [float]:
     """Find bootstrap value, i.e. Q(Stn, A; Wt).
     where Stn: state at time t+n
           A: target_chs, if specified, else argmax(Q(Stn, a; Wo))
           n: usually 1, unless n-step Q-learning
           Wo/Wt: online/target network"""
     data = {
         self.grids: prep_data_grids(grids, self.grid_split),
     }
     if self.pp['bighead']:
          if isinstance(cells, tuple):
             cells = [cells]
         data[self.cells] = cells
     else:
         data[self.cells] = prep_data_cells(cells)
     if self.pp['dueling_qnet']:
         target_q = self.target_value
     elif target_chs is None:
         # Greedy Q-Learning
         target_q = self.target_q_max
     else:
         # SARSA or Eligible Q-learning
         target_q = self.target_q_selected
         data[self.chs] = target_chs
     if freps is not None:
         data[self.freps] = freps
     qvals = self.sess.run(target_q, data)
     if self.pp['dueling_qnet']:
         qvals = qvals[0]
     return qvals
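Callers typically fold the returned bootstrap values into an n-step target before the supervised backward pass; a minimal sketch of that step for n = 1 (a hypothetical helper, not shown in the project code):

import numpy as np

def one_step_q_targets(rewards, next_qvals, gamma):
    # r_t + gamma * Q(S_{t+1}, A; W_target), vectorized over the batch
    return np.asarray(rewards) + gamma * np.asarray(next_qvals)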
Example #4
File: singh.py Project: namch29/dca
 def backward(self,
              *,
              freps,
              value_targets,
              grids=None,
              weights=[1],
              **kwargs):
     data = {
         self.frep: freps,
         self.value_target: value_targets,
         self.weight: weights
     }
     if self.grid_inp:
         data[self.grid] = prep_data_grids(grids, self.grid_split)
     _, loss, lr, errs = self.sess.run(
         [self.do_train, self.loss, self.lr, self.err],
         feed_dict=data,
         options=self.options,
         run_metadata=self.run_metadata)
     # if len(freps) > 1:
     #     print(loss, loss.shape, errs, errs.shape, freps.shape, value_targets,
     #           value_targets.shape, weights, weights.shape)
     errs = np.squeeze(errs, 1)
     assert errs.shape == np.array(value_targets).shape
     return loss, lr, errs
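Given the self.weight placeholder and the per-sample errors returned here, the loss is presumably an importance-weighted squared error; a plain NumPy sketch under that assumption (not the project's graph definition):

import numpy as np

def weighted_value_loss(values, value_targets, weights):
    # Per-sample TD errors and the weighted mean-squared loss
    errs = np.asarray(value_targets) - np.asarray(values)
    return np.mean(np.asarray(weights) * errs ** 2), errs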
Example #5
 def _double_q_target(self,
                      grids,
                      cells,
                      freps=None,
                      target_chs=None) -> [float]:
     """Find bootstrap value, i.e. Q(Stn, A; Wt).
     where Stn: state at time t+n
           A: target_chs, if specified, else argmax(Q(Stn, a; Wo))
           n: usually 1, unless n-step Q-learning
           Wo/Wt: online/target network"""
     data = {
         self.grids: prep_data_grids(grids, self.grid_split),
         self.oh_cells: prep_data_cells(cells)
     }
      # Thread the recurrent online state through successive sess.run calls
      data[self.online_state_in] = self.online_state
     if target_chs is None:
         # Greedy Q-Learning
         target_q = self.online_q_max
     else:
         # SARSA or Eligible Q-learning
         target_q = self.online_q_selected
         data[self.chs] = target_chs
     if freps is not None:
         data[self.freps] = freps
     qvals, self.online_state = self.sess.run(
         [target_q, self.online_state_out], data)
     return qvals
Example #6
 def backward(self, grid, reward, next_grid):
     next_value = self.sess.run(self.value,
                                feed_dict={
                                    self.grid:
                                    prep_data_grids(next_grid,
                                                    self.pp['empty_neg'])
                                })
     value_target = reward + self.gamma * next_value
     data = {
         self.grid: prep_data_grids(grid, self.pp['empty_neg']),
         self.value_target: value_target,
     }
     _, loss = self.sess.run([self.do_train, self.loss],
                             feed_dict=data,
                             options=self.options,
                             run_metadata=self.run_metadata)
     return loss
Example #7
 def forward(self, freps, grids):
     data = {self.frep: freps}
     if self.grid_inp:
         data[self.grid] = prep_data_grids(grids, self.grid_split)
     values = self.sess.run(self.value,
                            feed_dict=data,
                            options=self.options,
                            run_metadata=self.run_metadata)
     vals = np.reshape(values, [-1])
     return vals
Example #8
 def forward(self, grids):
     values = self.sess.run(self.value,
                            feed_dict={
                                self.grid:
                                prep_data_grids(
                                    grids, empty_neg=self.pp['empty_neg']),
                            },
                            options=self.options,
                            run_metadata=self.run_metadata)
     vals = np.reshape(values, [-1])
     return vals
Example #9
 def forward(self, grid, cell) -> Tuple[List[float], float]:
     a_dist, val = self.sess.run(
         [self.policy, self.value],
         feed_dict={
             self.grid: nutils.prep_data_grids(grid),
             self.cell: nutils.prep_data_cells(cell)
         },
         options=self.options,
         run_metadata=self.run_metadata)
     assert val.shape == (1, 1)
     assert a_dist.shape == (1, self.n_channels)
     return a_dist[0], val[0, 0]
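A typical caller samples a channel from the returned distribution; a usage sketch where net stands in for an instance of the class above (names are assumptions):

import numpy as np

a_dist, val = net.forward(grid, cell)
# Sample a channel index in proportion to the policy probabilities
ch = np.random.choice(len(a_dist), p=a_dist)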
Example #10
    def _get_vals_inps(self, freps, next_freps, grids, next_grids):
        data = {self.frep: freps, self.next_frep: next_freps}
        if self.grid_inp:
            pgrids = prep_data_grids(grids, self.grid_split)
            pnext_grids = prep_data_grids(next_grids, self.grid_split)
            data[self.grid] = pgrids
            data[self.next_grid] = pnext_grids
        val, next_val, lr = self.sess.run(
            [self.value, self.next_value, self.lr], feed_dict=data)
        value, next_value = val[0, 0], next_val[0, 0]

        if self.grid_inp:
            # print(pgrids[0].shape, freps[0].shape)
            inp = np.dstack((pgrids[0], freps[0]))
            next_inp = np.dstack((pnext_grids[0], next_freps[0]))
            inp_colvec = np.reshape(inp, [-1, 1])
            next_inp_colvec = np.reshape(next_inp, [-1, 1])
        else:
            inp_colvec = np.reshape(freps[0], [-1, 1])
            next_inp_colvec = np.reshape(next_freps[0], [-1, 1])
        return value, next_value, inp_colvec, next_inp_colvec, lr
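The depth-wise stacking above concatenates grid channels and feature-representation channels before flattening to a single column vector; a toy demonstration with assumed shapes:

import numpy as np

g = np.zeros((7, 7, 2))          # preprocessed grid channels (shape assumed)
f = np.zeros((7, 7, 3))          # feature-rep channels (shape assumed)
inp = np.dstack((g, f))          # -> (7, 7, 5)
col = np.reshape(inp, [-1, 1])   # -> (245, 1) column vector
assert col.shape == (7 * 7 * 5, 1)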
Example #11
 def forward(self, freps, grids):
     data = {self.frep: freps}
     if self.grid_inp:
         data[self.grid] = prep_data_grids(grids, self.grid_split)
     p, a = self.sess.run(
         [self.prob, self.act],
         feed_dict=data,
         options=self.options,
         run_metadata=self.run_metadata)
     p, a = p[0, 0], a[0, 0]
     # print(a, p)
     return a, p
Example #12
    def backward(self,
                 *,
                 freps,
                 rewards,
                 next_freps,
                 discount=None,
                 weights=[1],
                 avg_reward=None,
                 grids=None,
                 next_grids=None,
                 **kwargs):
        assert len(freps) == 1  # Hard coded for one-step
        assert discount is not None or avg_reward is not None
        assert weights is not None
        assert weights[0] is not None
        if avg_reward is None:
            avg_reward = 0
        else:
            discount = 1

        data = {
            self.frep: freps,
            self.next_frep: next_freps,
            self.reward: rewards,
            self.discount: [discount],
            self.avg_reward: avg_reward,
            self.ph_grad_beta: self.grad_beta,
            self.imp_weight: weights[0]
        }
        if self.grid_inp:
            data[self.grid] = prep_data_grids(grids, self.grid_split)
            data[self.next_grid] = prep_data_grids(next_grids, self.grid_split)
        lr, td_err, _, _ = self.sess.run(
            [self.lr, self.loss_grad, self.do_train, self.update_weights],
            feed_dict=data,
            options=self.options,
            run_metadata=self.run_metadata)
        self.grad_beta *= self.grad_beta_decay
        td_err = td_err[0, 0]
        return td_err**2, lr, td_err
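The feed above pins the discount to 1 in the average-reward case, so the graph's TD error reduces to one of two forms; a plain-Python restatement (a sketch, not the project's graph code):

def td_error(reward, value, next_value, discount=None, avg_reward=None):
    if avg_reward is None:
        # Discounted TD(0): r + discount * V(s') - V(s)
        return reward + discount * next_value - value
    # Average-reward TD: r - avg_reward + V(s') - V(s)
    return reward - avg_reward + next_value - value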
Example #13
    def backward_gae(self, grids, cells, vals, chs, rewards, next_grid,
                     next_cell) -> float:
        """Generalized Advantage Estimation"""
        # Estimated value after trajectory, V(S_t+n)
        bootstrap_val = self.sess.run(
            self.value,
            feed_dict={
                self.grid: nutils.prep_data_grids(next_grid),
                self.cell: nutils.prep_data_cells(next_cell)
            })[0, 0]  # extract the scalar V(S_t+n) from the (1, 1) run output
        rewards_plus = np.asarray(rewards + [bootstrap_val])
        discounted_rewards = nutils.discount(rewards_plus, self.gamma)[:-1]
        value_plus = np.asarray(vals + [bootstrap_val])
        advantages = nutils.discount(
            rewards + self.gamma * value_plus[1:] - value_plus[:-1], self.gamma)

        data = {
            self.grid: nutils.prep_data_grids(np.array(grids)),
            self.cell: nutils.prep_data_cells(cells),
            self.value_target: discounted_rewards,
            self.action: chs,
            self.psi: advantages
        }
        return self._backward(data)
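nutils.discount is not shown here; discounted-cumulative-sum helpers like it are commonly written with scipy.signal.lfilter. A sketch of the assumed behavior (not the project's verified source):

import numpy as np
from scipy.signal import lfilter

def discount(x, gamma):
    # y[t] = x[t] + gamma * y[t+1], computed right-to-left
    return lfilter([1], [1, -gamma], np.asarray(x, dtype=float)[::-1])[::-1]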
Example #14
 def backward_supervised(self,
                         grids,
                         cells,
                         chs,
                         q_targets,
                         freps=None,
                         weights=None):
     data = {
         self.grids: prep_data_grids(grids, self.grid_split),
         self.oh_cells: prep_data_cells(cells),
         self.chs: chs,
         self.q_targets: q_targets,
     }
     if freps is not None:
         data[self.freps] = freps
     if weights is not None:
         data[self.weights] = weights
     return self._backward(data)
Example #15
    def backward(self, *, freps, grids, rewards, avg_reward, actions, action_probs):
        assert len(freps) == 1, (len(freps), type(freps),
                                 freps.shape)  # Hard coded for one-step

        data = {
            self.frep: freps,
            self.reward: rewards,
            self.avg_reward: avg_reward,
            self.act_ph: actions,
            self.prob_ph: action_probs
        }
        if self.grid_inp:
            data[self.grid] = prep_data_grids(grids, self.grid_split)
        _ = self.sess.run(
            [self.do_train],
            feed_dict=data,
            options=self.options,
            run_metadata=self.run_metadata)
Example #16
 def forward(self, grid, cell, ce_type, frep=None):
     data = {
         self.grids: prep_data_grids(grid, split=self.grid_split),
         self.oh_cells: prep_data_cells(cell),
         self.online_state_in: self.online_state
     }
     if frep is not None:
         data[self.freps] = [frep]
     if self.pp['dueling_qnet']:
         q_vals_op = self.advantages
     else:
         q_vals_op = self.online_q_vals
     q_vals, self.online_state = self.sess.run(
         [q_vals_op, self.online_state_out],
         data,
         options=self.options,
         run_metadata=self.run_metadata)
     q_vals = q_vals[0]
     assert q_vals.shape == (self.n_channels, ), f"{q_vals.shape}\n{q_vals}"
     return q_vals
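Selecting actions from the advantage stream alone works because the state value shifts every action equally: argmax_a A(s, a) = argmax_a Q(s, a). A minimal sketch of the standard dueling aggregation (not the project's graph code):

import numpy as np

def dueling_q(value, advantages):
    # Q(s, a) = V(s) + A(s, a) - mean_a A(s, a); the V(s) term cancels
    # under argmax, so the advantages pick the same greedy action
    return value + np.asarray(advantages) - np.mean(advantages)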
Example #17
File: qnet.py Project: namch29/dca
 def forward(self, grid, cell, ce_type, frep=None):
     data = {
         self.grids: prep_data_grids(grid, split=self.grid_split),
     }
     if self.pp['bighead']:
          if isinstance(cell, tuple):
             cell = [cell]
         data[self.cells] = cell
     else:
         data[self.cells] = prep_data_cells(cell)
     if frep is not None:
         data[self.freps] = [frep]
     if self.pp['dueling_qnet']:
         q_vals_op = self.online_advantages
     else:
         q_vals_op = self.online_q_vals
     q_vals = self.sess.run(q_vals_op,
                            data,
                            options=self.options,
                            run_metadata=self.run_metadata)
     q_vals = q_vals[0]
     assert q_vals.shape == (self.n_channels, ), f"{q_vals.shape}\n{q_vals}"
     return q_vals
Example #18
File: qnet.py Project: namch29/dca
 def backward_supervised(self,
                         grids,
                         cells,
                         chs,
                         q_targets,
                         freps=None,
                         weights=None):
     data = {
         self.grids: prep_data_grids(grids, self.grid_split),
         self.chs: chs,
         self.q_targets: q_targets,
     }
     if self.pp['bighead']:
          if isinstance(cells, tuple):
             cells = [cells]
         data[self.cells] = cells
     else:
         data[self.cells] = prep_data_cells(cells)
     if freps is not None:
         data[self.freps] = freps
     if weights is not None:
         data[self.weights] = weights
     return self._backward(data)
Example #19
File: qnet.py Project: namch29/dca
 def backward_gae(self,
                  grids,
                  cells,
                  chs,
                  rewards,
                  next_grid,
                  next_cell,
                  gamma,
                  next_ch=None) -> (float, float):
     """Generalized Advantage Estimation"""
      # Pass next_ch by keyword; the third positional parameter is 'freps'
      next_qvals = self._double_q_target(next_grid, next_cell,
                                         target_chs=next_ch)
     vals = self.sess.run(
         self.target_q_max, {
             self.grids: prep_data_grids(grids, self.grid_split),
             self.oh_cells: prep_data_cells(cells),
             self.chs: chs
         })
     value_plus = np.zeros((len(vals) + 1))
     value_plus[:len(vals)] = vals
     value_plus[-1] = next_qvals
     advantages = discount(
         rewards + gamma * value_plus[1:] - value_plus[:-1], gamma)
      return self.backward_supervised(grids, cells, chs, q_targets=advantages)