Example #1
class DqnHalfPongSyr(PyGamePlayer):
    def __init__(self, playback_mode, mod=None, net_name='pong_syr'):
        self._playback_mode = playback_mode
        self._last_reward = 0
        super(DqnHalfPongSyr, self).__init__(force_game_fps=8,
                                             run_real_time=playback_mode)

        self._last_state = None
        self._last_action = np.zeros(DIM_ACTION)
        self._last_action[1] = 1

        self.sess = tf.Session()
        self.agent = DDQNAgent(self.sess,
                               DIM_STATE,
                               DIM_ACTION,
                               LR,
                               TAU,
                               net_name=net_name)
        self.sess.run(tf.global_variables_initializer())
        self.agent.update_target_paras()
        self.saver = tf.train.Saver()

        self.replay_buffer = ReplayBuffer(BUFFER_SIZE)
        self.explorer = Explorer(EPS_BEGIN, EPS_END, EPS_STEPS, playback_mode)
        self.summary = Summary(self.sess, DIR_SUM)

        self.summary.add_variable(tf.Variable(0.), 'reward')
        self.summary.add_variable(tf.Variable(0.), 'loss')
        self.summary.build()
        self.summary.write_variables(FLAGS)

        self._steps = 0
        self._sum_reward = [0]
        self._dif_reward = deque(maxlen=EP_STEPS)

        if mod and os.path.exists(FLAGS.dir_mod.format(mod)):
            checkpoint = tf.train.get_checkpoint_state(
                FLAGS.dir_mod.format(mod))
            self.saver.restore(self.sess,
                               save_path=checkpoint.model_checkpoint_path)
            print("Loaded checkpoints {0}".format(
                checkpoint.model_checkpoint_path))

    def get_keys_pressed(self, screen_array, feedback, terminal):
        # binarise the frame: convert to grayscale, then turn any non-black pixel white
        _, screen_binary = cv2.threshold(
            cv2.cvtColor(screen_array, cv2.COLOR_BGR2GRAY), 1, 255,
            cv2.THRESH_BINARY)

        if self._last_state is None:
            # first frame: bootstrap the state by stacking the same frame STATE_FRAMES times
            self._last_state = np.stack(tuple(screen_binary
                                              for _ in range(STATE_FRAMES)),
                                        axis=2)
            return DqnHalfPongSyr._key_presses_from_action(self._last_action)

        screen_binary = np.reshape(screen_binary,
                                   (SCREEN_WIDTH, SCREEN_HEIGHT, 1))
        # slide the frame window: drop the oldest frame and append the newest
        current_state = np.append(self._last_state[:, :, 1:],
                                  screen_binary,
                                  axis=2)

        if not self._playback_mode:
            self.replay_buffer.add(self._last_state, self._last_action,
                                   feedback, current_state, terminal)
            if len(self.replay_buffer) > OBV_STEPS:
                loss = self._train()
                self._sum_reward.append(feedback)
                if feedback != 0.0:
                    self._dif_reward.append(feedback)
                if not self._steps % EP_STEPS:
                    print(
                        '| Step: %i' % self._steps,
                        '| Epoch: %i' % (self._steps / EP_STEPS),
                        '| Sum_Reward: %i' % sum(self._sum_reward),
                        '| Dif_Reward: %.4f' %
                        (sum(self._dif_reward) / len(self._dif_reward)))
                    if not self._steps % (EP_STEPS * 10):
                        self.summary.run(feed_dict={
                            'loss': loss,
                            'reward': sum(self._sum_reward)
                        })
                    self._sum_reward = [0]

        self._last_state = current_state
        self._last_action = self._get_action()

        return DqnHalfPongSyr._key_presses_from_action(self._last_action)

    def _get_action(self):
        target_q = self.agent.predict([self._last_state])[0]
        return self.explorer.get_action(target_q)

    def _train(self):
        self._steps += 1
        batch_state, batch_action, batch_reward, batch_state_next, batch_done = \
            self.replay_buffer.sample_batch(MINI_BATCH)

        # Double DQN target: the online network selects the best next action,
        # the target network evaluates its value
        q_value = self.agent.predict(batch_state_next)
        max_q_value_index = np.argmax(q_value, axis=1)
        target_q_value = self.agent.predict_target(batch_state_next)
        double_q = target_q_value[range(len(target_q_value)),
                                  max_q_value_index]

        batch_y = []
        for r, q, d in zip(batch_reward, double_q, batch_done):
            if d:
                batch_y.append(r)
            else:
                batch_y.append(r + GAMMA * q)

        opt, loss = self.agent.train(batch_state, batch_action, batch_y)
        self.agent.update_target_paras()

        if not self._steps % CKP_STEP:
            self.saver.save(self.sess,
                            DIR_MOD + '/net',
                            global_step=self._steps)
            print('Mod saved!')

        return loss

    def get_feedback(self):
        from Env.games.half_pong import score

        # get the difference in score between this and the last run
        score_change = (score - self._last_reward)
        self._last_reward = score

        return float(score_change), score_change == -1

    @staticmethod
    def _key_presses_from_action(action_set):
        if action_set[0] == 1:
            return [K_DOWN]
        elif action_set[1] == 1:
            return []
        elif action_set[2] == 1:
            return [K_UP]
        raise Exception("Unexpected action")

    def start(self):
        super(DqnHalfPongSyr, self).start()

        from Env.games.half_pong import run
        run(screen_width=SCREEN_WIDTH, screen_height=SCREEN_HEIGHT)
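
A minimal launch script for this player might look like the sketch below. The playback_mode and mod arguments are the ones taken by the constructor above; the __main__ guard itself is an assumption, not part of the example.

if __name__ == '__main__':
    # train from scratch; pass playback_mode=True and a saved mod name to replay a trained model
    player = DqnHalfPongSyr(playback_mode=False)
    player.start()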
Example #2
class DdqnPong:

    def __init__(self, playback_mode, env, render=True, mod=None):
        self._playback_mode = playback_mode

        self._env = env
        self._render = render

        self._sess = tf.Session()
        self._agent = DDQNAgent(self._sess, DIM_STATE, DIM_ACTION, LR, TAU, net_name='flat')
        self._sess.run(tf.global_variables_initializer())
        self._agent.update_target_paras()

        self._saver = tf.train.Saver()
        self._replay_buffer = ReplayBuffer(BUFFER_SIZE)
        self._explorer = Explorer(EPS_BEGIN, EPS_END, EPS_STEPS, playback_mode)
        self.summary = Summary(self._sess, DIR_SUM)

        self.summary.add_variable(tf.Variable(0.), 'reward')
        self.summary.add_variable(tf.Variable(0.), 'loss')
        self.summary.add_variable(tf.Variable(0.), 'maxq')
        self.summary.build()
        self.summary.write_variables(FLAGS)

        self._steps = 0

        if mod and os.path.exists(FLAGS.dir_mod.format(mod)):
            checkpoint = tf.train.get_checkpoint_state(FLAGS.dir_mod.format(mod))
            self._saver.restore(self._sess, save_path=checkpoint.model_checkpoint_path)
            print("Loaded checkpoints {0}".format(checkpoint.model_checkpoint_path))

    def start(self):
        for ep in range(MAX_EP):

            sum_reward = 0
            last_state = []
            last_img = self._env.reset()
            last_img = pre_process_image(last_img, SCREEN_WIDTH, SCREEN_HEIGHT)

            for _ in range(STATE_FRAMES):
                last_state.append(last_img)
            last_state = np.dstack(last_state)

            for step in range(EP_STEPS):
                if self._render:
                    self._env.render()

                q_value = self._agent.predict([last_state])[0]
                last_max_qvalue = np.max(q_value)

                act_1_hot = self._explorer.get_action(q_value)
                act_index = np.argmax(act_1_hot)

                # Gym Pong action set: ['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE'];
                # the agent's 3 actions map via act_index + 1: 0 -> FIRE (no-op), 1 -> RIGHT (up), 2 -> LEFT (down)
                observation, reward, done, info = self._env.step(act_index + 1)
                if reward == 0:
                    reward += 0.1

                state = pre_process_image(observation, SCREEN_WIDTH, SCREEN_HEIGHT)
                state = np.reshape(state, (SCREEN_WIDTH, SCREEN_HEIGHT, 1))
                state = np.append(state, last_state[:, :, :3], axis=2)

                self._replay_buffer.add(last_state, act_1_hot, reward, state, done)

                loss = None
                if not self._playback_mode and len(self._replay_buffer) > OBV_STEPS:
                    loss = self._train()

                last_state = state
                sum_reward += reward
                self._steps += 1

                if done or step == EP_STEPS - 1:
                    print('| Step: %i' % self._steps,
                          '| Episode: %i' % ep,
                          '| Epoch: %i' % step,
                          '| qvalue: %.5f' % last_max_qvalue,
                          '| Sum_Reward: %i' % sum_reward)
                    if loss is not None:
                        self.summary.run(feed_dict={
                            'loss': loss,
                            'reward': sum_reward,
                            'maxq': last_max_qvalue})
                    break


    def _train(self):
        batch_state, batch_action, batch_reward, batch_state_next, batch_done = \
            self._replay_buffer.sample_batch(MINI_BATCH)


        q_value = self._agent.predict(batch_state_next)
        max_q_value_index = np.argmax(q_value, axis=1)
        target_q_value = self._agent.predict_target(batch_state_next)
        double_q = target_q_value[range(len(target_q_value)), max_q_value_index]

        batch_y = []
        for r, q, d in zip(batch_reward, double_q, batch_done):
            if d:
                batch_y.append(r)
            else:
                batch_y.append(r + GAMMA * q)


        opt, loss = self._agent.train(batch_state, batch_action, batch_y)
        self._agent.update_target_paras()

        if not self._steps % CKP_STEP:
            self._saver.save(self._sess, DIR_MOD + '/net', global_step=self._steps)
            print('Mod saved!')

        return loss
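
As a usage note, this class expects env to behave like an OpenAI Gym Atari Pong environment (reset, step, render), as the action comment above suggests. A hedged sketch of the wiring, where the gym.make id and the __main__ guard are assumptions:

import gym

if __name__ == '__main__':
    env = gym.make('Pong-v0')  # assumed Gym Atari Pong environment
    trainer = DdqnPong(playback_mode=False, env=env, render=False)
    trainer.start()            # runs up to MAX_EP episodes of up to EP_STEPS steps each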
Example #3
class DdqnBirdSyr:

    def __init__(self, playback_mode, mod=None):
        self._playback_mode = playback_mode

        env = FlappyBird(pipe_gap=200)
        self._ple = PLE(env, fps=30, display_screen=DISPLAY)
        self._ple.init()

        self._sess = tf.Session()
        self._agent = DDQNAgent(self._sess, DIM_STATE, DIM_ACTION, LR, TAU, net_name='cnn_bird')
        self._sess.run(tf.global_variables_initializer())
        self._agent.update_target_paras()

        self._saver = tf.train.Saver()
        self._replay_buffer = ReplayBuffer(BUFFER_SIZE)
        self._explorer = Explorer(EPS_BEGIN, EPS_END, EPS_STEPS, playback_mode)
        self.summary = Summary(self._sess, DIR_SUM)

        self.summary.add_variable(tf.Variable(0.), 'reward')
        self.summary.add_variable(tf.Variable(0.), 'loss')
        self.summary.add_variable(tf.Variable(0.), 'maxq')
        self.summary.build()
        self.summary.write_variables(FLAGS)

        self._steps = 0

        if mod and os.path.exists(FLAGS.dir_mod.format(mod)):
            checkpoint = tf.train.get_checkpoint_state(FLAGS.dir_mod.format(mod))
            self._saver.restore(self._sess, save_path=checkpoint.model_checkpoint_path)
            print("Loaded checkpoints {0}".format(checkpoint.model_checkpoint_path))

    def start(self):
        for ep in range(MAX_EP):
            sum_reward = 0
            last_state = []
            for _ in range(STATE_FRAMES):
                last_state.append(self._ple.getScreenGrayscale())
            last_state = np.dstack(last_state)

            last_max_qvalue = 0

            for step in range(EP_STEPS):
                time.sleep(0.01)
                if not step % STATE_FRAMES:
                    q_value = self._agent.predict([last_state])[0]
                    last_max_qvalue = np.max(q_value)

                    act_1_hot = self._explorer.get_action(q_value)
                    act_index = np.argmax(act_1_hot)
                else:
                    # do nothing
                    act_index = 1
                    act_1_hot = np.zeros(DIM_ACTION)
                    act_1_hot[act_index] = 1

                reward = self._ple.act(self._ple.getActionSet()[act_index])
                if reward == 0:
                    reward = 0.1
                elif reward == -5:
                    reward = -1

                state = np.reshape(self._ple.getScreenGrayscale(), (SCREEN_WIDTH, SCREEN_HEIGHT, 1))
                state = np.append(state, last_state[:, :, :3], axis=2)

                done = False
                if self._ple.game_over():
                    done = True

                self._replay_buffer.add(last_state, act_1_hot, reward, state, done)

                loss = None
                if not self._playback_mode and len(self._replay_buffer) > OBV_STEPS:
                    loss = self._train()

                last_state = state
                sum_reward += reward
                self._steps += 1

                if done or step == EP_STEPS - 1:
                    print('| Step: %i' % self._steps,
                          '| Episode: %i' % ep,
                          '| Epoch: %i' % step,
                          '| qvalue: %.5f' % last_max_qvalue,
                          '| Sum_Reward: %i' % sum_reward)
                    if loss is not None:
                        self.summary.run(feed_dict={
                            'loss': loss,
                            'reward': sum_reward,
                            'maxq': last_max_qvalue})
                    self._ple.reset_game()
                    break


    def _train(self):
        batch_state, batch_action, batch_reward, batch_state_next, batch_done = \
            self._replay_buffer.sample_batch(MINI_BATCH)

        q_value = self._agent.predict(batch_state_next)
        max_q_value_index = np.argmax(q_value, axis=1)
        target_q_value = self._agent.predict_target(batch_state_next)
        double_q = target_q_value[range(len(target_q_value)), max_q_value_index]

        batch_y = []
        for r, q, d in zip(batch_reward, double_q, batch_done):
            if d:
                batch_y.append(r)
            else:
                batch_y.append(r + GAMMA * q)

        opt, loss = self._agent.train(batch_state, batch_action, batch_y)
        self._agent.update_target_paras()

        if not self._steps % CKP_STEP:
            self._saver.save(self._sess, DIR_MOD + '/net', global_step=self._steps)
            print('Mod saved!')

        return loss
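
All three examples build the Double-DQN target y = r + GAMMA * Q_target(s', argmax_a Q(s', a)) with a Python loop over the batch. An equivalent vectorized NumPy form, shown only as a sketch and not taken from the examples above, would be:

import numpy as np

def double_dqn_targets(batch_reward, double_q, batch_done, gamma):
    # y = r for terminal transitions, otherwise y = r + gamma * Q_target(s', a*),
    # where a* is the action chosen by the online network (the double_q values above)
    rewards = np.asarray(batch_reward, dtype=np.float32)
    next_q = np.asarray(double_q, dtype=np.float32)
    not_done = 1.0 - np.asarray(batch_done, dtype=np.float32)
    return rewards + gamma * next_q * not_done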