def play(total_score, total_test_case):
    thread_total_score = 0
    thread_test_case = NUM_OF_TEST_CASE // NUM_OF_THREAD
    for i in range(thread_test_case):
        game = Game(show=False)
        for j in range(ROUND_PER_EPISODE):
            ## pick an action from strategy
            if j % 3 == 0:
                ## re-root the search tree every 3 rounds
                gameboard = GameBoard(game.gameboard.board)
                num_available_choices = len(gameboard.get_available_choices())
                init_state = State(gameboard.board, 0, [], num_available_choices)
                root_node = Node(state=init_state)
                current_node = root_node
            current_node = monte_carlo_tree_search(current_node)
            x, y = current_node.state.get_choice()
            ##########################
            game.input_pos(x, y)
        thread_total_score += game.gameboard.score - ROUND_PER_EPISODE * PENALTY_PER_STEP
    lock.acquire()
    total_test_case.value += thread_test_case
    total_score.value += thread_total_score
    lock.release()
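## ---------------------------------------------------------------------------
## A minimal launch sketch for the threaded evaluation above, not part of the
## original code: it assumes `lock` is a module-level threading.Lock and that
## the shared counters are multiprocessing.Value objects. The helper name
## run_threads and the Value type codes are illustrative only.
import threading
from multiprocessing import Value

lock = threading.Lock()

def run_threads(num_threads):
    total_score = Value('d', 0.0)       # shared accumulated score across workers
    total_test_case = Value('i', 0)     # shared count of finished games
    threads = [threading.Thread(target=play, args=(total_score, total_test_case))
               for _ in range(num_threads)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return total_score.value / max(total_test_case.value, 1)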
def policy_play(agent):
    game = Game(show=False)
    while not game.termination():
        board = game.gameboard.board
        choice = agent.best_move(board)
        game.input_pos(choice[0], choice[1])
    return game.gameboard.score
def td_learning(mode):
    global epsilon
    max_score = 0
    if mode != "new":
        start_episode = int(mode)
        net.load_state_dict(torch.load(network_path + "net" + mode + ".pth"))
        target_net.load_state_dict(torch.load(network_path + "net" + mode + ".pth"))
        load_train_data(mode)
        epsilon = epsilon - epsilon_decay * start_episode
        print("load the network and train data " + mode)
    else:
        start_episode = 0

    for episode in range(start_episode, nEpisode):
        ## init state
        game = Game(show=False)
        round = 0
        while not game.termination():
            ## pick an action
            possible_actions = game.gameboard.get_available_choices()
            ## choice is a flattened action
            current_state = game.gameboard.board
            choice = greedy_policy(current_state, episode, possible_actions, net)
            choice2d = deflatten_action(choice)
            next_state, reward = game.input_pos(choice2d[0], choice2d[1])
            ## simulation
            last_state, total_reward = game_simulation(next_state)
            total_reward += reward
            if epsilon > final_epsilon and episode > observations_steps:
                epsilon -= epsilon_decay
            replay_memory.append((current_state, choice, total_reward, last_state))
            if episode > observations_steps:
                if round == 0:
                    batch_learning()
                if episode % target_model_period == 0 and game.gameboard.round_index == 1:
                    target_net.load_state_dict(net.state_dict())
                    print("Update the target net")
            if game.gameboard.score > max_score and episode > observations_steps:
                if game.gameboard.round_index == 1 and episode == observations_steps + 1:
                    print("Finish observations")
                max_score = game.gameboard.score
                print("max score is %d in episode %d" % (max_score, episode))
            ## note: batch_size // batch_size == 1, so round stays 0 and
            ## batch_learning() runs on every step after the observation phase
            round = (round + 1) % (batch_size // batch_size)
        if episode % save_model_period == 0:
            print("save model in episode %d" % (episode))
            save_net(net, episode)
            save_train_data(episode)

    print("save model in episode %d" % (nEpisode))
    save_net(net, nEpisode)
    save_train_data(nEpisode)
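## ---------------------------------------------------------------------------
## The epsilon-greedy selection used by td_learning lives in greedy_policy,
## which is not shown here. The sketch below is NOT the repository's version;
## it only illustrates the idea, assuming flatten_action maps an (x, y) choice
## to a flat index and that net returns one Q-value per flat action for a
## flattened float board tensor.
import random
import torch

def epsilon_greedy_sketch(state, possible_actions, net, epsilon):
    flat_actions = [flatten_action(a) for a in possible_actions]
    if random.random() < epsilon:
        return random.choice(flat_actions)              # explore
    with torch.no_grad():
        q_values = net(torch.tensor(state, dtype=torch.float32).flatten())
    return max(flat_actions, key=lambda a: q_values[a].item())  # exploit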
def mcts_play(agent):
    game = Game(show=False)
    mcts = MCTS(game.gameboard, agent)
    while not game.termination():
        current_node = mcts.get_next_node()
        choice = current_node.state.choice
        game.input_pos(choice[0], choice[1])
    return game.gameboard.score
def play(agent, show=False):
    game = Game(show=show)
    state = game.gameboard.board
    #game.gameboard.print_board()
    while not game.termination():
        choice = agent.get_action(game.gameboard.board)
        game.input_pos(choice[0], choice[1])
    return game.gameboard.score
def play(mode):
    game = Game(filepath)
    load_net(mode)
    state = game.gameboard.board
    while not game.termination():
        choice = get_action(state)
        choice2d = deflatten_action(choice)
        state, reward = game.input_pos(choice2d[0], choice2d[1])
def play(agent, show=False):
    game = Game(show=show)
    state = game.gameboard.board
    #game.gameboard.print_board()
    while not game.termination():
        choice = agent.best_move(game.gameboard.board, game.gameboard.get_available_choices())
        game.input_pos(choice[0], choice[1])
    return game.gameboard.score
def play(filename, number):
    fd = open("./output/" + filename, 'w')
    for i in range(number):
        game = Game(show=False)
        state = game.gameboard.board
        game.gameboard.print_board()
        while not game.termination():
            choice = game.random_player()
            game.input_pos(choice[0], choice[1])
        fd.write(str(game.gameboard.score) + "\n")
    fd.close()
def play():
    game = Game()
    state = game.gameboard.board
    #game.gameboard.print_board()
    while not game.termination():
        choice = get_action(state)
        choice2d = deflatten_action(choice)
        ### check that the action is available?
        state, reward = game.input_pos(choice2d[0], choice2d[1])
    return game.gameboard.score
def get_memory(replay_memory, agent, n=10):
    for _ in range(n):
        game = Game(show=False)
        state = game.gameboard.board
        #game.gameboard.print_board()
        while not game.termination():
            board = copy.deepcopy(game.gameboard.board)
            choice = agent.best_move(game.gameboard.board, game.gameboard.get_available_choices())
            next_board, reward = game.input_pos(choice[0], choice[1])
            next_board = copy.deepcopy(next_board)
            replay_memory.append((board, choice, reward, next_board))
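## ---------------------------------------------------------------------------
## A minimal usage sketch for the replay buffer filled by get_memory, not part
## of the original code: the buffer size, batch size, and the `agent` object
## are assumptions for illustration.
import random
from collections import deque

replay_memory = deque(maxlen=10000)
get_memory(replay_memory, agent, n=10)

if len(replay_memory) >= 32:
    batch = random.sample(replay_memory, 32)
    boards, choices, rewards, next_boards = zip(*batch)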
def play(mode, filename, number):
    load_net(mode)
    fd = open("./output/" + filename, 'w')
    for i in range(number):
        game = Game(show=False)
        state = game.gameboard.board
        game.gameboard.print_board()
        while not game.termination():
            choice = get_action(state)
            choice2d = deflatten_action(choice)
            state, reward = game.input_pos(choice2d[0], choice2d[1])
        fd.write(str(game.gameboard.score) + "\n")
    fd.close()
def run_episode(agent):
    train_data = []
    game = Game(show=False)
    while not game.termination():
        board = copy.deepcopy(game.gameboard.board)
        choice = agent.greedy_policy(board, game.gameboard.get_available_choices())
        _, reward = game.input_pos(choice[0], choice[1])
        train_data.append([board, reward, choice])
    ## correct the reward: back up discounted returns from later steps
    for i in reversed(range(len(train_data) - 1)):
        train_data[i][1] += GAMMA_RATE * train_data[i + 1][1]
    ## drop the last CUT steps, whose returns are not fully backed up
    return train_data[0:-CUT]
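## ---------------------------------------------------------------------------
## Standalone check of the reward backup in run_episode: the reversed loop
## turns each per-step reward into a discounted return. GAMMA_RATE = 0.9 and
## the three dummy rewards below are illustrative values only.
GAMMA_RATE = 0.9
train_data = [["s0", 1.0, None], ["s1", 2.0, None], ["s2", 3.0, None]]

for i in reversed(range(len(train_data) - 1)):
    train_data[i][1] += GAMMA_RATE * train_data[i + 1][1]

## s2 keeps 3.0, s1 becomes 2.0 + 0.9 * 3.0 = 4.7, s0 becomes 1.0 + 0.9 * 4.7 = 5.23
print([round(row[1], 2) for row in train_data])   # [5.23, 4.7, 3.0]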
def run_episode(agent):
    train_data = []
    game = Game(show=False)
    mcts = MCTS(game.gameboard, agent)
    while not game.termination():
        board = copy.deepcopy(game.gameboard.board)
        current_node = mcts.get_next_node()
        if current_node is None:
            break
        choice = current_node.state.get_choice()
        _, reward = game.input_pos(choice[0], choice[1])
        train_data.append([board, reward, choice])
    return train_data
def run_episode():
    train_data = []
    game = Game(show=False)
    current_node = init_first_node(game.gameboard)
    while not game.termination():
        current_node, pi = monte_carlo_tree_search(current_node)
        choice = current_node.state.get_choice()
        flat_choice = flatten_action(choice)
        net_index = action_index2net_index(flat_choice)
        one_data = [deepcopy(game.gameboard.board), net_index, pi, 0]
        state, reward = game.input_pos(choice[0], choice[1])
        one_data[3] = reward
        train_data.append(one_data)
    ## correct the reward
    for i in reversed(range(len(train_data) - 1)):
        train_data[i][3] += GAMMA_RATE * train_data[i + 1][3]
    return train_data
def run_episode(agent):
    train_data = []
    game = Game(show=False)
    mcts = MCTS(game.gameboard, agent)
    while not game.termination():
        board = copy.deepcopy(game.gameboard.board)
        current_node = mcts.get_next_node()
        if current_node is None:
            break
        choice = current_node.state.get_choice()
        _, reward = game.input_pos(choice[0], choice[1])
        train_data.append([board, reward, choice])
    ## correct the reward
    for i in reversed(range(len(train_data) - 1)):
        train_data[i][1] += GAMMA_RATE * train_data[i + 1][1]
    # for i in range(len(train_data) - 2):
    #     train_data[i][1] += train_data[i+1][1] + train_data[i+2][1]
    return train_data
    return best_next_node


def get_best_child(node):
    best_quality_value = 0
    best_child = None
    for child in node.child:
        if child.quality_value > best_quality_value:
            best_quality_value = child.quality_value
            best_child = child
    return best_child


if __name__ == "__main__":
    gameplay = Game()
    num_available_choices = len(gameplay.gameboard.get_available_choices())
    init_state = State(gameplay.gameboard.board, 0, [], num_available_choices)
    root_node = Node(state=init_state)
    current_node = root_node
    gameplay.gameboard.print_board()
    for i in range(20):
        if i % MAX_ROUND_NUMBER == 0:
            num_available_choices = len(gameplay.gameboard.get_available_choices())
            init_state = State(gameplay.gameboard.board, 0, [], num_available_choices)
            root_node = Node(state=init_state)
def init_game():
    game = Game(show=False)
    return game
        best_next_node = self.best_child(node, False)
        #print("quality value:", best_next_node.quality_value)
        return best_next_node

    def get_next_node(self):
        self.current_node = self.monte_carlo_tree_search(self.current_node)
        return self.current_node


if __name__ == "__main__":
    total_score = 0
    total_game = 1000
    score_list = []
    for game in range(total_game):
        gameplay = Game(show=False)
        mcts = MCTS(gameplay.gameboard)
        for i in range(15):
            current_node = mcts.get_next_node()
            choice = current_node.state.get_choice()
            #print("You have choosen : " + str(choice[0]) + " " + str(choice[1]))
            gameplay.input_pos(choice[0], choice[1])
        score_list.append(gameplay.gameboard.score)
        total_score += gameplay.gameboard.score
    ave_score = total_score / total_game
    print(ave_score)
    print(ave_score / 15)
    with open("./plot/mcts_alone.txt", "w") as fd: