Example #1
    def _learn(self, max_episode, save_as_file='TD_policy.dat'):
        env = Env()
        agents = [
            TD_agent(self.epsilon, self.alpha, self.decay_rate),
            TD_agent(self.epsilon, self.alpha, self.decay_rate)
        ]

        for _ in tqdm(range(max_episode)):
            # reset to the initial state; env keeps a counter for the current round
            # odd rounds place X, even rounds place O, and each placed piece
            # carries its round number as a subscript mark
            env.reset()
            for agent in agents:
                agent.decay_epsilon()

            while True:
                curr_qttt, mark = env.get_state()

                # dispatch to the agent whose mark is on turn
                agent = ProgramDriver.get_agent_by_mark(agents, mark)

                # enumerate legal moves for each possible collapse of the board
                free_qblock_id_lists, collapsed_qttts = env.get_valid_moves()

                collapsed_qttt, agent_move = agent.act(free_qblock_id_lists,
                                                       collapsed_qttts, mark)

                next_qttt, next_round, reward, done = env.step(
                    collapsed_qttt, agent_move, mark)

                # TD(0) update of the value of the state just left
                agent.bellman_backup(curr_qttt, next_qttt, reward, mark)

                if done:
                    # terminal states take the final reward as their value
                    GameTree.set_state_value(next_qttt.get_state(), reward)
                    break

        ProgramDriver.save_model(save_as_file, max_episode, self.epsilon,
                                 self.alpha, self.decay_rate)
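
For reference, driving this training routine might look like the sketch below. The ProgramDriver constructor signature is an assumption inferred from the attributes _learn reads (self.epsilon, self.alpha, self.decay_rate); it is not shown in these excerpts.

    # Hypothetical usage; the constructor signature is inferred, not confirmed.
    driver = ProgramDriver(epsilon=1.0, alpha=0.3, decay_rate=0.99)
    driver._learn(max_episode=50000, save_as_file='TD_policy.dat')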
Example #2
    def bellman_backup(self, qttt, next_qttt, reward, mark):
        """
        Bellman backup (tabular TD(0) update) for TD learning.

        :param Qttt qttt: current state of the qttt
        :param Qttt next_qttt: next state after the action is taken
        :param int reward: immediate reward for this round
        :param mark: mark of the agent performing the update
        :return: None
        """
        state_value = GameTree.get_state_val(qttt.get_state())
        next_state_value = GameTree.get_state_val(next_qttt.get_state())
        # V(s) <- V(s) + alpha * (reward + gamma * V(s') - V(s));
        # gamma is assumed to be a module-level discount constant
        updated_state_value = state_value + self.alpha * (
            reward + gamma * next_state_value - state_value)
        GameTree.set_state_value(qttt.get_state(), updated_state_value)
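
The update above is the standard tabular TD(0) rule, V(s) <- V(s) + alpha * (reward + gamma * V(s') - V(s)). A self-contained toy run of the same arithmetic, independent of GameTree (all names below are local to this sketch):

    # Minimal TD(0) illustration mirroring the backup above.
    alpha, gamma = 0.3, 0.9          # learning rate and discount factor
    values = {'s0': 0.0, 's1': 0.5}  # toy state-value table

    def td0_backup(state, next_state, reward):
        # V(s) <- V(s) + alpha * (reward + gamma * V(s') - V(s))
        td_target = reward + gamma * values[next_state]
        values[state] += alpha * (td_target - values[state])

    td0_backup('s0', 's1', reward=1.0)
    print(values['s0'])  # 0.3 * (1.0 + 0.9 * 0.5) = 0.435, up to float rounding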
Example #3
    def play_with_human(self, save_as_file='TD_human_policy.dat'):
        # resume from a previously saved policy
        ProgramDriver.load_model(save_as_file)
        env = Env()
        agents = [
            TD_agent(self.epsilon, self.alpha, self.decay_rate),
            HumanAgent(1),
        ]

        # play games until the human chooses to quit
        while True:
            env.reset()
            td_agent = agents[0]
            td_agent.decay_epsilon()
            env.render()

            while True:
                curr_qttt, mark = env.get_state()

                agent = ProgramDriver.get_agent_by_mark(agents, mark)

                free_qblock_id_lists, collapsed_qttts = env.get_valid_moves()

                collapsed_qttt, agent_move = agent.act(free_qblock_id_lists,
                                                       collapsed_qttts, mark)

                # act() returning None signals that the human chose to quit;
                # persist the policy before exiting (sys is assumed to be
                # imported at module level)
                if collapsed_qttt is None:
                    ProgramDriver.save_model(save_as_file, 0, self.epsilon,
                                             self.alpha, self.decay_rate)
                    print("Model saved.")
                    sys.exit()

                next_qttt, next_round, reward, done = env.step(
                    collapsed_qttt, agent_move, mark)

                print('')
                env.render()

                td_agent.bellman_backup(curr_qttt, next_qttt, reward, mark)

                if done:
                    GameTree.set_state_value(next_qttt.get_state(), reward)
                    next_qttt.show_result()
                    break
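
HumanAgent is referenced but never shown in these excerpts. Given that Example #3 treats a None board returned from act() as a quit-and-save signal, a plausible sketch might look like the following; the prompt format, the mark attribute, and the choice of collapse variant are all assumptions:

    class HumanAgent:
        # Hypothetical sketch consistent with how Example #3 calls act();
        # the real implementation is not shown in these excerpts.
        def __init__(self, mark):
            self.mark = mark

        def act(self, free_qblock_id_lists, collapsed_qttts, mark):
            choice = input('free block id (or q to quit): ')
            if choice.lower() == 'q':
                # (None, None) tells the caller to save the model and exit
                return None, None
            # play the chosen block on the first collapse variant (assumption)
            return collapsed_qttts[0], int(choice)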