def _learn(self, max_episode, save_as_file='TD_policy.dat'):
    """Train two TD agents against each other via self-play.

    Runs ``max_episode`` games. Each game both agents decay their
    exploration rate once, then alternate moves until the environment
    reports the episode is done; terminal states get their value pinned
    to the final reward. The learned policy is persisted afterwards.

    :param int max_episode: number of self-play games to run
    :param str save_as_file: path the trained policy is saved to
    :return: None
    """
    env = Env()
    agents = [
        TD_agent(self.epsilon, self.alpha, self.decay_rate),
        TD_agent(self.epsilon, self.alpha, self.decay_rate),
    ]
    for _ in tqdm(range(max_episode)):
        # Reset to the initial state. env keeps a counter for the current
        # round: odd round -> x, even round -> o, because each piece
        # carries a sub-mark on it.
        env.reset()
        for learner in agents:
            learner.decay_epsilon()
        done = False
        while not done:
            curr_qttt, mark = env.get_state()
            mover = ProgramDriver.get_agent_by_mark(agents, mark)
            free_qblock_id_lists, collapsed_qttts = env.get_valid_moves()
            collapsed_qttt, agent_move = mover.act(
                free_qblock_id_lists, collapsed_qttts, mark)
            next_qttt, next_round, reward, done = env.step(
                collapsed_qttt, agent_move, mark)
            # One-step TD backup toward the post-move state's value.
            mover.bellman_backup(curr_qttt, next_qttt, reward, mark)
            if done:
                # Terminal state value is the game outcome itself.
                GameTree.set_state_value(next_qttt.get_state(), reward)
    ProgramDriver.save_model(save_as_file, max_episode,
                             self.epsilon, self.alpha, self.decay_rate)
def bellman_backup(self, qttt, next_qttt, reward, mark):
    """TD(0) Bellman backup: nudge V(qttt) toward the one-step target
    ``reward + gamma * V(next_qttt)`` with learning rate ``self.alpha``.

    :param Qttt qttt: current state of the qttt before the move
    :param Qttt next_qttt: state of the qttt after the move is taken
    :param int reward: immediate reward for this round
    :param mark: mark of the acting agent (kept for interface symmetry;
        not used by the value update itself)
    :return: None
    """
    state_value = GameTree.get_state_val(qttt.get_state())
    next_state_value = GameTree.get_state_val(next_qttt.get_state())
    # NOTE(review): `gamma` is a module-level discount factor defined
    # elsewhere in this file — consider making it an agent attribute.
    td_target = reward + gamma * next_state_value
    updated_state_value = state_value + self.alpha * (td_target - state_value)
    GameTree.set_state_value(qttt.get_state(), updated_state_value)
def play_with_human(self, save_as_file='TD_human_policy.dat'):
    """Play interactive games between the TD agent and a human player.

    Loads a previously saved policy, then loops over games forever. The
    TD agent keeps learning online (bellman_backup after every move).
    When the human agent returns a ``None`` move, the model is saved to
    ``save_as_file`` and the program exits.

    :param str save_as_file: path the policy is loaded from / saved to
    :return: None (exits the process when the human quits)
    """
    ProgramDriver.load_model(save_as_file)
    env = Env()
    agents = [
        TD_agent(self.epsilon, self.alpha, self.decay_rate),
        HumanAgent(1),
    ]
    while True:
        env.reset()
        td_agent = agents[0]
        td_agent.decay_epsilon()
        env.render()
        done = False
        while not done:
            curr_qttt, mark = env.get_state()
            mover = ProgramDriver.get_agent_by_mark(agents, mark)
            free_qblock_id_lists, collapsed_qttts = env.get_valid_moves()
            collapsed_qttt, agent_move = mover.act(
                free_qblock_id_lists, collapsed_qttts, mark)
            if collapsed_qttt is None:
                # Human chose to quit: persist the current policy first.
                ProgramDriver.save_model(save_as_file, 0, self.epsilon,
                                         self.alpha, self.decay_rate)
                print("Model saved.")
                sys.exit()
            next_qttt, next_round, reward, done = env.step(
                collapsed_qttt, agent_move, mark)
            print('')
            env.render()
            # The TD agent learns from every move, including the human's.
            td_agent.bellman_backup(curr_qttt, next_qttt, reward, mark)
            if done:
                GameTree.set_state_value(next_qttt.get_state(), reward)
                next_qttt.show_result()