Example 1
    def _load_model(self):
        # Restore the TensorFlow checkpoint saved at step RESTORE_STEP.
        self._net_saver.restore(self._sess, DQNSolver.PATH_NET % self._RESTORE_STEP)
        # Restore the JSON-serialized training variables (epsilon and beta).
        with open(DQNSolver.PATH_VAR % self._RESTORE_STEP, "r") as f:
            var = json.load(f)
        self._epsilon = var["epsilon"]
        self._beta = var["beta"]
        # Resume learning from the step after the restored one.
        self._learn_step = self._RESTORE_STEP + 1
        log("model loaded | RESTORE_STEP: %d | epsilon: %.6f | beta: %.6f"
            % (self._RESTORE_STEP, self._epsilon, self._beta))
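The var dictionary read back here is assumed to hold just the two serialized training variables; a matching JSON file would look roughly like this (the values are invented):

    {"epsilon": 0.103215, "beta": 0.871200}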
Example 2
    def _load_model(self):
        # Restore the TensorFlow checkpoint saved at step RESTORE_STEP.
        self._net_saver.restore(self._sess,
                                DQNSolver.PATH_NET % self._RESTORE_STEP)
        # Restore the JSON-serialized training variables (epsilon and beta).
        with open(DQNSolver.PATH_VAR % self._RESTORE_STEP, "r") as f:
            var = json.load(f)
        self._epsilon = var["epsilon"]
        self._beta = var["beta"]
        # Resume learning from the step after the restored one.
        self._learn_step = self._RESTORE_STEP + 1
        log("model loaded | RESTORE_STEP: %d | epsilon: %.6f | beta: %.6f" %
            (self._RESTORE_STEP, self._epsilon, self._beta))
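Examples 1 and 2 show the same _load_model with different line wrapping; both assume DQNSolver.PATH_NET and DQNSolver.PATH_VAR are printf-style path templates keyed by the step number. The class also needs a saving counterpart (Example 7 calls self._save_model()). A minimal sketch under the same assumptions; the body below is a guess, not code taken from the project:

    def _save_model(self):
        # Hypothetical counterpart of _load_model: write the checkpoint and the
        # JSON-serialized variables under the same printf-style path templates.
        self._net_saver.save(self._sess, DQNSolver.PATH_NET % self._learn_step)
        with open(DQNSolver.PATH_VAR % self._learn_step, "w") as f:
            json.dump({"epsilon": self._epsilon, "beta": self._beta}, f)
        log("model saved | learn_step: %d" % self._learn_step)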
Example 3
    def load(self, beg_step, end_step):
        # Reload all training statistics saved for the step range [beg_step, end_step].
        log("Loading history...")
        self._history_loss = np.load(History.PATH_DATA % ("loss", beg_step, end_step))
        self._history_avg_reward = np.load(History.PATH_DATA % ("avg-reward", beg_step, end_step))
        self._history_min_reward = np.load(History.PATH_DATA % ("min-reward", beg_step, end_step))
        self._history_max_reward = np.load(History.PATH_DATA % ("max-reward", beg_step, end_step))
        self._history_avg_len = np.load(History.PATH_DATA % ("avg-len", beg_step, end_step))
        self._history_min_len = np.load(History.PATH_DATA % ("min-len", beg_step, end_step))
        self._history_max_len = np.load(History.PATH_DATA % ("max-len", beg_step, end_step))
        self._history_avg_step = np.load(History.PATH_DATA % ("avg-step", beg_step, end_step))
        self._history_min_step = np.load(History.PATH_DATA % ("min-step", beg_step, end_step))
        self._history_max_step = np.load(History.PATH_DATA % ("max-step", beg_step, end_step))
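The exact History.PATH_DATA template is not shown; given the three values interpolated into it, a plausible definition is something like the following (an assumption, not the project's actual constant):

    # Hypothetical template: metric name, begin step, end step.
    PATH_DATA = "history/%s-%d-%d.npy"

With beg_step=1 and end_step=100000, the "loss" series above would then be read from history/loss-1-100000.npy.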
Example 4
    def save(self, beg_step, end_step):
        # Persist all training statistics for the step range [beg_step, end_step].
        log("Saving history...")
        np.save(History.PATH_DATA % ("loss", beg_step, end_step), self._history_loss)
        np.save(History.PATH_DATA % ("avg-reward", beg_step, end_step), self._history_avg_reward)
        np.save(History.PATH_DATA % ("min-reward", beg_step, end_step), self._history_min_reward)
        np.save(History.PATH_DATA % ("max-reward", beg_step, end_step), self._history_max_reward)
        np.save(History.PATH_DATA % ("avg-len", beg_step, end_step), self._history_avg_len)
        np.save(History.PATH_DATA % ("min-len", beg_step, end_step), self._history_min_len)
        np.save(History.PATH_DATA % ("max-len", beg_step, end_step), self._history_max_len)
        np.save(History.PATH_DATA % ("avg-step", beg_step, end_step), self._history_avg_step)
        np.save(History.PATH_DATA % ("min-step", beg_step, end_step), self._history_min_step)
        np.save(History.PATH_DATA % ("max-step", beg_step, end_step), self._history_max_step)
Example 5
    def train(self):
        # Observe the current state, pick an action, take one environment step,
        # and store the resulting transition in replay memory.
        state_cur = self._state()
        action = self._choose_action()
        reward, state_nxt, done = self._step(action)
        self._store_transition(state_cur, action, reward, state_nxt, done)
        self._history.add_snake_step(done, reward, self.snake)

        # Learn only once at least MEM_SIZE transitions have been collected,
        # and then only every FREQ_LEARN steps; until then just log progress.
        if self._mem_cnt >= self._MEM_SIZE:
            if self._mem_cnt % self._FREQ_LEARN == 0:
                self._learn()
        elif self._mem_cnt % self._FREQ_LOG == 0:
            log("mem_cnt: %d" % self._mem_cnt)

        # Training ends once the learning step counter exceeds MAX_LEARN_STEP.
        learn_end = self._learn_step > self._MAX_LEARN_STEP

        return done, learn_end
Example 6
    def train(self):
        # Observe the current state, pick an action, take one environment step,
        # and store the resulting transition in replay memory.
        state_cur = self._state()
        action = self._choose_action()
        reward, state_nxt, done = self._step(action)
        self._store_transition(state_cur, action, reward, state_nxt, done)
        self._history.add_snake_step(done, reward, self.snake)

        # Learn only once at least MEM_SIZE transitions have been collected,
        # and then only every FREQ_LEARN steps; until then just log progress.
        if self._mem_cnt >= self._MEM_SIZE:
            if self._mem_cnt % self._FREQ_LEARN == 0:
                self._learn()
        elif self._mem_cnt % self._FREQ_LOG == 0:
            log("mem_cnt: %d" % self._mem_cnt)

        # Training ends once the learning step counter exceeds MAX_LEARN_STEP.
        learn_end = self._learn_step > self._MAX_LEARN_STEP

        return done, learn_end
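train() returns a (done, learn_end) pair, so the caller is expected to start a new episode whenever done is true and to stop training once learn_end becomes true. A minimal driver loop under that reading; solver and game.new_round() are placeholder names, not identifiers from the project:

    # Hypothetical outer loop around train().
    learn_end = False
    while not learn_end:
        done, learn_end = solver.train()
        if done:
            game.new_round()  # reset the episode and keep training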
Example 7
    def _learn(self):
        log_msg = "step %d | mem_cnt: %d | epsilon: %.6f | beta: %.6f" % \
                  (self._learn_step, self._mem_cnt, self._epsilon, self._beta)

        # Compute average
        avg_reward, avg_len, avg_steps, new_max_avg = self._history.add_learn_step()
        log_msg += " | avg_reward: %.6f | avg_len: %.2f | avg_steps: %.2f" \
                   % (avg_reward, avg_len, avg_steps)

        # Save model
        saved = False
        if new_max_avg or self._learn_step % self._FREQ_SAVE == 0:
            self._save_model()
            saved = True
            log_msg += " | model saved"

        # Sample batch from memory
        batch, IS_weights, tree_indices = self._mem.sample(self._MEM_BATCH, self._beta)
        batch_state_cur = [x[0] for x in batch]
        batch_action = [x[1] for x in batch]
        batch_reward = [x[2] for x in batch]
        batch_state_nxt = [x[3] for x in batch]
        batch_done = [x[4] for x in batch]

        # Compute eval net output for next state (to compute q target)
        q_eval_all_nxt = self._sess.run(
            self._q_eval_all,
            feed_dict={
                self._state_eval: batch_state_nxt,
            }
        )

        # Learn
        _, loss, abs_errs = self._sess.run(
            [self._train, self._loss, self._td_err_abs],
            feed_dict={
                self._state_eval: batch_state_cur,
                self._state_target: batch_state_nxt,
                self._action: batch_action,
                self._reward: batch_reward,
                self._done: batch_done,
                self._q_eval_all_nxt: q_eval_all_nxt,
                self._IS_weights: IS_weights,
            }
        )
        self._history.add_loss(loss)
        log_msg += " | loss: %.6f" % loss

        # Update sum tree
        self._mem.update(tree_indices, abs_errs)

        # Replace target
        if self._learn_step == 1 or self._learn_step % self._FREQ_REPLACE == 0:
            self._sess.run(self._replace_target)
            log_msg += " | target net replaced"

        if saved or self._learn_step == 1 or self._learn_step % self._FREQ_LOG == 0:
            log(log_msg)

        self._learn_step += 1
        self._epsilon = max(self._EPSILON_MIN, self._epsilon - self._EPSILON_DEC)
        self._beta = min(1.0, self._beta + self._BETA_INC)
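_learn samples a batch with self._mem.sample(self._MEM_BATCH, self._beta) and later feeds the absolute TD errors back through self._mem.update(tree_indices, abs_errs). That is the usual contract of a prioritized experience replay buffer with importance-sampling weights. The sketch below illustrates that contract with a simple proportional-priority buffer (no sum tree, O(n) sampling); it is not the project's actual memory class:

    import numpy as np

    class SimpleProportionalMemory:
        """Illustrative prioritized replay buffer (proportional priorities, O(n) sampling)."""

        def __init__(self, capacity, alpha=0.6, eps=1e-6):
            self._capacity = capacity
            self._alpha = alpha   # how strongly priorities skew the sampling distribution
            self._eps = eps       # keeps every priority strictly positive
            self._data = []
            self._priorities = []
            self._next = 0

        def store(self, transition):
            # New transitions get the current maximum priority so they are sampled soon.
            p = max(self._priorities, default=1.0)
            if len(self._data) < self._capacity:
                self._data.append(transition)
                self._priorities.append(p)
            else:
                self._data[self._next] = transition
                self._priorities[self._next] = p
            self._next = (self._next + 1) % self._capacity

        def sample(self, batch_size, beta):
            prios = np.asarray(self._priorities) ** self._alpha
            probs = prios / prios.sum()
            indices = np.random.choice(len(self._data), batch_size, p=probs)
            batch = [self._data[i] for i in indices]
            # Importance-sampling weights, normalized so the largest weight is 1.
            weights = (len(self._data) * probs[indices]) ** (-beta)
            weights /= weights.max()
            return batch, weights, indices

        def update(self, indices, abs_errors):
            # Refresh priorities with the latest absolute TD errors.
            for i, err in zip(indices, abs_errors):
                self._priorities[i] = float(err) + self._eps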
Example 8
    def _learn(self):
        log_msg = "step %d | mem_cnt: %d | epsilon: %.6f | beta: %.6f" % \
                  (self._learn_step, self._mem_cnt, self._epsilon, self._beta)

        # Compute average
        avg_reward, avg_len, avg_steps, new_max_avg = self._history.add_learn_step(
        )
        log_msg += " | avg_reward: %.6f | avg_len: %.2f | avg_steps: %.2f" \
                   % (avg_reward, avg_len, avg_steps)

        # Save model
        saved = False
        if new_max_avg or self._learn_step % self._FREQ_SAVE == 0:
            self._save_model()
            saved = True
            log_msg += " | model saved"

        # Sample batch from memory
        batch, IS_weights, tree_indices = self._mem.sample(
            self._MEM_BATCH, self._beta)
        batch_state_cur = [x[0] for x in batch]
        batch_action = [x[1] for x in batch]
        batch_reward = [x[2] for x in batch]
        batch_state_nxt = [x[3] for x in batch]
        batch_done = [x[4] for x in batch]

        # Compute eval net output for next state (to compute q target)
        q_eval_all_nxt = self._sess.run(self._q_eval_all,
                                        feed_dict={
                                            self._state_eval: batch_state_nxt,
                                        })

        # Learn
        _, loss, abs_errs = self._sess.run(
            [self._train, self._loss, self._td_err_abs],
            feed_dict={
                self._state_eval: batch_state_cur,
                self._state_target: batch_state_nxt,
                self._action: batch_action,
                self._reward: batch_reward,
                self._done: batch_done,
                self._q_eval_all_nxt: q_eval_all_nxt,
                self._IS_weights: IS_weights,
            })
        self._history.add_loss(loss)
        log_msg += " | loss: %.6f" % loss

        # Update sum tree
        self._mem.update(tree_indices, abs_errs)

        # Replace target
        if self._learn_step == 1 or self._learn_step % self._FREQ_REPLACE == 0:
            self._sess.run(self._replace_target)
            log_msg += " | target net replaced"

        if saved or self._learn_step == 1 or self._learn_step % self._FREQ_LOG == 0:
            log(log_msg)

        self._learn_step += 1
        self._epsilon = max(self._EPSILON_MIN,
                            self._epsilon - self._EPSILON_DEC)
        self._beta = min(1.0, self._beta + self._BETA_INC)
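The last three lines of _learn anneal the two schedules once per learning step: epsilon decreases linearly by EPSILON_DEC until it reaches EPSILON_MIN, and beta increases linearly by BETA_INC until it is clipped at 1.0. A quick check of how long each schedule runs, with invented constants (the values below are assumptions, not taken from the code):

    # Hypothetical constants, chosen only to illustrate the schedule lengths.
    EPSILON_INIT, EPSILON_MIN, EPSILON_DEC = 1.0, 0.01, 1e-5
    BETA_INIT, BETA_INC = 0.4, 1e-5

    steps_to_min_epsilon = (EPSILON_INIT - EPSILON_MIN) / EPSILON_DEC  # 99000 learn steps
    steps_to_full_beta = (1.0 - BETA_INIT) / BETA_INC                  # 60000 learn steps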