def __init__(self, number_actions=9, policy_value_model=None):
    """Initialize empty MCTS search statistics and wrap the policy/value model.

    Q, N, P are per-state lookup tables filled in during tree search:
    Q holds state values, N holds visit counts, P holds prior policies.
    """
    # Search statistics, keyed by board state.
    self.Q = {}
    self.N = {}
    self.P = {}
    # Size of the action space (9 cells on a tic-tac-toe board).
    self.number_actions = number_actions
    # Game-rule helpers and the wrapped neural network.
    self.tictactoe_functions = tictactoe_methods()
    self.model = model_wrapper(policy_value_model=policy_value_model)
def pit(old_model, new_model, number_of_games=25):
    """Test the two models against each other.

    The caller accepts the new model if it wins enough (e.g. 55%) of the
    non-tie games.  The first half of the games has old_model moving first,
    the second half has new_model moving first.

    Returns a dict mapping 0 -> ties, 1 -> old_model wins,
    2 -> new_model wins.
    """
    tictactoe_functions = tictactoe_methods()
    winners = {0: 0, 1: 0, 2: 0}  # winners = {'tie':0,'old_model':0,'new_model':0}
    for game in range(number_of_games):
        board = tictactoe_functions.get_initial_board()
        # Bug fix: reset the turn for every game.  Previously `turn` was
        # initialized once before the loop, so whichever turn a game ended
        # on leaked into the next game and desynchronized player1/player2
        # from the side (1 or 2) they were actually playing.
        turn = 1
        if game < number_of_games // 2:
            # old_model goes first
            player1 = old_model
            player2 = new_model
        else:
            # new model goes first
            player1 = new_model
            player2 = old_model
        for player_turn in range(9):
            # Even steps belong to player1, odd steps to player2.
            if player_turn % 2 == 0:
                action_list = get_model_action(player1, board, turn)
            else:
                action_list = get_model_action(player2, board, turn)
            action = np.argmax(action_list)
            if player_turn == 0:
                # first move completely random, for game diversity
                action = np.random.choice(9, 1)[0]
            board = tictactoe_functions.get_next_board(board, action, turn)
            winner = tictactoe_functions.get_winner(board)
            if winner != -1:
                # Game over -- update the winners tally.
                if game >= number_of_games // 2:
                    # flip the winner so it corresponds with the correct model
                    if winner == 2:
                        winner = 1
                    elif winner == 1:
                        winner = 2
                winners[winner] += 1
                # clear both players' MCTS trees before the next game
                player1.clear_tree()
                player2.clear_tree()
                break
            turn = 2 if turn == 1 else 1
    return winners
def run_game(mcts_model, temp=1.0):
    """Self-play one game of tic-tac-toe and collect training experience.

    temp controls how likely the tree will explore: 1 for exploration
    during training, 0 (greedy) for the pit.

    Returns (winner, experience), where each experience entry is
    [board, action_probs, value] and value is a -9999 placeholder to be
    filled in by the caller once the winner is known.
    """
    tictactoe_functions = tictactoe_methods()
    board = tictactoe_functions.get_initial_board()
    experience = []
    turn = 1
    winner = -1
    for game_step in range(10):
        action_probs = get_game_action_probs(mcts_model, board, turn, temp=temp)
        # choose an action from these probabilities
        action = np.random.choice(9, 1, p=action_probs)[0]
        if game_step == 0:
            # truly random for first step
            action = np.random.choice(9, 1)[0]
        # The network always sees the board from player 1's perspective,
        # so flip the board for training when it is player 2's turn.
        training_board = board
        if turn == 2:
            training_board = tictactoe_functions.flip_board(board)
        # Quadruple our experience by rotating the board (and the matching
        # action probabilities) through all four symmetries.
        rotated_boards = tictactoe_functions.get_rotated_boards(training_board)
        rotated_action_probs = tictactoe_functions.get_rotated_boards(action_probs)
        for i in range(4):
            # -9999 is a placeholder value, replaced once the game ends
            experience.append([rotated_boards[i], rotated_action_probs[i], -9999])
        board = tictactoe_functions.get_next_board(board, action, turn)
        winner = tictactoe_functions.get_winner(board)
        if winner != -1:
            # (removed an unreachable `break` that followed this return)
            return winner, experience
        turn = 2 if turn == 1 else 1
    # Defensive fallback: a full board should always produce a non -1
    # result from get_winner, but return explicitly rather than falling
    # off the end and handing callers an implicit None to unpack.
    return winner, experience
def run_game_with_human(mcts_model, human_player_pos):
    """Play an interactive tic-tac-toe game between the MCTS model and a human.

    human_player_pos selects which parity of `turn` the human plays;
    the other parity is taken by the model.  Returns the winner code
    from get_winner once the game ends.
    """
    tictactoe_functions = tictactoe_methods()
    board = tictactoe_functions.get_initial_board()
    turn = 1
    for _ in range(11):
        print("Turn #%d:" % turn)
        tictactoe_functions.pretty_print(board)
        robot_to_move = (turn % 2 == human_player_pos)
        if robot_to_move:
            # Run the tree search (300 simulations) before picking a move.
            for _sim in range(300):
                mcts_model.simulate_step(board, turn)
            # The tree is indexed from player 1's perspective, so flip
            # the board when it is player 2's turn.
            searched_board = (tictactoe_functions.flip_board(board)
                              if turn == 2 else board)
            # Greedy move: the child with the highest visit count.
            visit_counts = list(mcts_model.get_N(searched_board))
            action = np.argmax(visit_counts)
            print("Visit counts:", visit_counts)
            print("value of each next state:", mcts_model.get_Q(searched_board))
            print("policy:", mcts_model.get_P(searched_board))
        else:
            # Human turn: read x y coordinates and map them to a cell index.
            col, row = map(int, input("input[0-2] x y: ").split())
            action = row * 3 + col
        board = tictactoe_functions.get_next_board(board, action, turn)
        winner = tictactoe_functions.get_winner(board)
        if winner != -1:
            tictactoe_functions.pretty_print(board)
            return winner
        turn = 2 if turn == 1 else 1