def __init__(self, learning_rate, epsilon, disc, batch_size, exp_buffer_size):
    """
    :param learning_rate:    learning rate for the neural network
    :param epsilon:          defines the exploration rate
    :param disc:             the discount factor
    :param batch_size:       the batch size to train the training network
    :param exp_buffer_size:  the size of the experience replay buffer
    """
    self.learningRate = learning_rate   # learning rate for the stochastic gradient descent
    self.epsilon = epsilon              # epsilon to choose the epsilon-greedy move
    self.disc = disc                    # the discount factor for the q update
    self.batch_size = batch_size        # the batch size used to train the training network

    self.training_network = Network(learning_rate)                  # used to be trained in every step
    self.target_network = copy.deepcopy(self.training_network)      # used to calculate the targets

    self.board = tic_tac_toe.BitBoard()                             # tic tac toe board
    self.experience_buffer = ExperienceBuffer(exp_buffer_size)      # buffer that saves all experiences

    # send the networks to the corresponding devices
    self.training_network = self.training_network.to(Globals.device)
    self.target_network = self.target_network.to(Globals.device)
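# The target network above only provides stable q-targets and is typically refreshed from the
# training network from time to time. The sketch below shows the periodic hard-update pattern;
# the helper name and the sync interval are assumptions for illustration, not taken from the source.
import torch.nn as nn

def sync_target_network(training_network: nn.Module, target_network: nn.Module):
    # hard update: copy the current training weights into the target network
    target_network.load_state_dict(training_network.state_dict())

# hypothetical usage inside the training loop, e.g. every 500 optimization steps:
#   if step % 500 == 0:
#       sync_target_network(self.training_network, self.target_network)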
def __init__(self, learning_rate, epsilon, disc, lambda_param, batch_size, exp_buffer_size):
    """
    :param learning_rate:    learning rate for the neural network
    :param epsilon:          defines the exploration rate
    :param disc:             the discount factor
    :param lambda_param:     the lambda parameter of TD(lambda)
    :param batch_size:       the experience buffer batch size to train the training network
    :param exp_buffer_size:  the size of the experience replay buffer
    """
    self.learningRate = learning_rate   # learning rate for the stochastic gradient descent
    self.epsilon = epsilon              # epsilon to choose the epsilon-greedy move
    self.disc = disc                    # the discount factor for the td update
    self.lambda_param = lambda_param    # the lambda parameter of TD(lambda)
    self.batch_size = batch_size        # the batch size used to train the network

    self.network = Network(learning_rate).to(Globals.device)        # the neural network to train
    self.board = tic_tac_toe.BitBoard()                             # tic tac toe board
    self.experience_buffer = ExperienceBuffer(exp_buffer_size)      # buffer that saves all experiences

    # to save the experience of one episode
    self.state_list = []
    self.action_index_list = []
    self.reward_list = []
    self.not_terminal_list = []
    self.succ_state_list = []
    self.succ_player_list = []
    self.legal_moves_list = []
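# The lists above hold one full episode, which is what a TD(lambda) target needs. The helper
# below is only an illustration of how the lambda-return could be computed from them; it is not
# part of the source, and succ_value_list (the network's value estimates for succ_state_list)
# is an assumed input.
def lambda_returns(reward_list, succ_value_list, not_terminal_list, disc, lambda_param):
    # backward recursion: G_t = r_t + disc * nt_t * ((1 - lambda) * V(s'_t) + lambda * G_{t+1})
    returns = [0.0] * len(reward_list)
    g = 0.0                                   # lambda-return of the step after the last one
    for t in reversed(range(len(reward_list))):
        target = (1.0 - lambda_param) * succ_value_list[t] + lambda_param * g
        g = reward_list[t] + disc * not_terminal_list[t] * target
        returns[t] = g
    return returns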
def reset_game(self):
    self.board = tic_tac_toe.BitBoard()

    # reset the experience lists
    self.state_list = []
    self.player_list = []
    self.reward_list = []
    self.not_terminal_list = []
    self.succ_state_list = []
    self.succ_player_list = []
def play_match(game_count, player1, player2):
    """
    lets the two passed players play against each other.
    the number of games needs to be even, otherwise the total number of games played will be
    the next lower even number. each player plays half the games as white and half as black.
    the players get the following scores:
    loss: 0
    draw: 0.5
    win:  1
    :param game_count:  the number of games per match
    :param player1:     player 1
    :param player2:     player 2
    :return:            average score of player1 between 0 and 1
    """
    half_game_count = game_count // 2
    score_player1 = 0
    for _ in range(half_game_count):
        # play half the games where player1 is white
        board = tic_tac_toe.BitBoard()
        while not board.terminal:
            if board.player == CONST.WHITE:
                player1.play_move(board)
            else:
                player2.play_move(board)
        score_player1 += board.white_score()

        # play half the games where player1 is black
        board = tic_tac_toe.BitBoard()
        while not board.terminal:
            if board.player == CONST.WHITE:
                player2.play_move(board)
            else:
                player1.play_move(board)
        score_player1 += board.black_score()

    return score_player1 / (2 * half_game_count)
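# Example of how play_match could be used to evaluate an agent. RandomPlayer is a hypothetical
# opponent written only for this illustration; it assumes the BitBoard exposes its legal moves,
# which is not shown in the source.
import random

class RandomPlayer:
    def play_move(self, board):
        move = random.choice(board.legal_moves())    # assumed accessor for the legal moves
        board.play_move(move)

# two random players should end up with an average score close to 0.5:
#   score = play_match(100, RandomPlayer(), RandomPlayer())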
def __self_play_worker__(net, mcts_sim_count, c_puct, temp_threshold, temp, alpha_dirich, game_count):
    """
    plays a number of self-play games
    :param net:             the alpha zero network
    :param mcts_sim_count:  the monte carlo simulation count
    :param c_puct:          constant that controls the exploration
    :param temp_threshold:  up to this move count the temperature will be temp, later it will be 0
    :param temp:            the temperature
    :param alpha_dirich:    dirichlet parameter alpha
    :param game_count:      the number of self-play games to play
    :return:                state_list, policy_list, value_list
    """
    state_list = []
    policy_list = []
    value_list = []
    position_cache = {}     # faster than a shared position dict

    for i in range(game_count):
        board = tic_tac_toe.BitBoard()
        mcts = MCTS(c_puct)     # reset the search tree

        # reset the players list
        player_list = []

        move_count = 0
        while not board.terminal:
            state, player = board.white_perspective()

            # use temperature 0 after the threshold move count without overwriting the passed temp
            current_temp = 0 if move_count >= temp_threshold else temp
            policy = mcts.policy_values(board, position_cache, net, mcts_sim_count, current_temp, alpha_dirich)

            # sample from the policy to determine the move to play
            move = np.random.choice(len(policy), p=policy)
            board.play_move(move)

            # save the training example
            state_list.append(state)
            player_list.append(player)
            policy_list.append(policy)

            move_count += 1

        # calculate the values from the perspective of the player whose move it is
        reward = board.reward()
        for player in player_list:
            value = reward if player == CONST.WHITE_MOVE else -reward
            value_list.append(value)

    return state_list, policy_list, value_list
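# One way the three returned lists might be turned into a training batch for the policy/value
# network. This helper is an assumption made for illustration, not part of the source; it only
# stacks the per-move examples into tensors.
import numpy as np
import torch

def to_training_batch(state_list, policy_list, value_list):
    states = torch.tensor(np.array(state_list), dtype=torch.float32)                 # network inputs
    policies = torch.tensor(np.array(policy_list), dtype=torch.float32)              # mcts policy targets
    values = torch.tensor(np.array(value_list), dtype=torch.float32).unsqueeze(1)    # game outcome targets
    return states, policies, values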
def fill_state_dict():
    """
    fills the state dict with the white score values
    :return:
    """
    if len(state_dict) > 0:
        return

    logger.debug("start to fill the minimax state dict")
    start_time = time.time()
    board = tic_tac_toe.BitBoard()

    # fill in the first state
    state = board.state_id()

    # go through the whole game
    score = minimax(board, board.player, True)
    state_dict[state] = score

    end_time = time.time()
    elapsed_time = end_time - start_time
    logger.debug("elapsed time to fill the state dict: {}".format(elapsed_time))
    logger.debug("size of the state dict: {}".format(len(state_dict)))
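# Conceptual sketch of the memoized minimax that fill_state_dict relies on. This is not the
# repo's minimax (whose signature is minimax(board, player, ...)); the legal_moves() accessor
# and branching the board with deepcopy are assumptions made only for illustration.
import copy

def minimax_sketch(board, score_dict):
    state = board.state_id()
    if state in score_dict:                     # position already evaluated
        return score_dict[state]

    if board.terminal:
        score = board.white_score()             # 0 / 0.5 / 1 from white's perspective
    else:
        child_scores = []
        for move in board.legal_moves():        # assumed accessor
            successor = copy.deepcopy(board)
            successor.play_move(move)
            child_scores.append(minimax_sketch(successor, score_dict))
        # white maximizes the white score, black minimizes it
        score = max(child_scores) if board.player == CONST.WHITE else min(child_scores)

    score_dict[state] = score
    return score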
def __init__(self, learning_rate, mcts_sim_count, c_puct, temp, batch_size, exp_buffer_size):
    """
    :param learning_rate:    learning rate for the neural network
    :param mcts_sim_count:   the number of simulations for the monte-carlo tree search
    :param c_puct:           the higher this constant the more the mcts explores
    :param temp:             the temperature, controls the policy value distribution
    :param batch_size:       the experience buffer batch size to train the training network
    :param exp_buffer_size:  the size of the experience replay buffer
    """
    self.learningRate = learning_rate       # learning rate for the stochastic gradient descent
    self.mcts_sim_count = mcts_sim_count    # the number of simulations for the monte-carlo tree search
    self.c_puct = c_puct                    # the higher this constant the more the mcts explores
    self.temp = temp                        # the temperature, controls the policy value distribution
    self.batch_size = batch_size            # the batch size used to train the network

    self.network = Network(learning_rate)                           # the network for the policy and value prediction
    self.board = tic_tac_toe.BitBoard()                             # tic tac toe board
    self.experience_buffer = ExperienceBuffer(exp_buffer_size)      # buffer that saves all experiences

    # send the network to the configured device
    self.network.to(Globals.device)
def reset_game(self):
    self.board = tic_tac_toe.BitBoard()