Code example #1
    def __init__(self, learning_rate, epsilon, disc, batch_size,
                 exp_buffer_size):
        """
        :param learning_rate:    learning rate for the neural network
        :param epsilon:          defines the exploration rate
        :param disc:             the discount factor
        :param batch_size:       the batch size to train the training network
        :param exp_buffer_size:  the size of the experience replay buffer
        """

        self.learningRate = learning_rate  # learning rate for the stochastic gradient descent
        self.epsilon = epsilon  # epsilon to choose the epsilon greedy move
        self.disc = disc  # the discount factor for the q update
        self.batch_size = batch_size  # the batch size used to train the training network
        self.training_network = Network(learning_rate)               # trained at every step
        self.target_network = copy.deepcopy(self.training_network)   # used to calculate the targets

        self.board = tic_tac_toe.BitBoard()  # tic tac toe board
        self.experience_buffer = ExperienceBuffer(
            exp_buffer_size)  # buffer that saves all experiences

        # send the networks to the corresponding devices
        self.training_network = self.training_network.to(Globals.device)
        self.target_network = self.target_network.to(Globals.device)
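
A detail the snippet above does not show is how the target network is kept in sync with the training network. The sketch below is only an illustration, assuming Network is a torch.nn.Module (as the .to(Globals.device) calls suggest); the method name sync_target_network is hypothetical:

    def sync_target_network(self):
        # periodically copy the training weights into the target network so the
        # q-value targets are computed with a frozen copy of the network
        self.target_network.load_state_dict(self.training_network.state_dict())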
Code example #2
    def __init__(self, learning_rate, epsilon, disc, lambda_param, batch_size, exp_buffer_size):
        """
        :param learning_rate:       learning rate for the neural network
        :param epsilon:             defines the exploration rate
        :param disc:                the discount factor
        :param lambda_param:        the lambda parameter of TD(lambda)
        :param batch_size:          the experience buffer batch size to train the training network
        :param exp_buffer_size:     the size of the experience replay buffer
        """

        self.learningRate = learning_rate                            # learning rate for the stochastic gradient descent
        self.epsilon = epsilon                                       # epsilon to choose the epsilon greedy move
        self.disc = disc                                             # the discount factor for the td update
        self.lambda_param = lambda_param                             # the lambda parameter of TD(lambda)
        self.batch_size = batch_size                                 # the batch size used to train the network
        self.network = Network(learning_rate).to(Globals.device)     # the neural network to train

        self.board = tic_tac_toe.BitBoard()                          # tic tac toe board
        self.experience_buffer = ExperienceBuffer(exp_buffer_size)   # buffer that saves all experiences

        # to save the experience of one episode
        self.state_list = []
        self.action_index_list = []
        self.reward_list = []
        self.not_terminal_list = []
        self.succ_state_list = []
        self.succ_player_list = []
        self.legal_moves_list = []
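
The per-episode lists above hold everything needed for a forward-view TD(lambda) update. The helper below is only an illustrative sketch, not code from the project: succ_values stands for hypothetical network value estimates of succ_state_list, and rewards / not_terminals / disc / lambda_param correspond to the attributes defined above.

def lambda_returns(rewards, succ_values, not_terminals, disc, lambda_param):
    """
    computes the forward-view lambda-return for every step of one finished episode:
    G_t = r_t + disc * not_terminal_t * ((1 - lambda) * V(s_t+1) + lambda * G_t+1)
    """
    g = 0.0
    returns = [0.0] * len(rewards)
    for t in reversed(range(len(rewards))):
        bootstrap = (1 - lambda_param) * succ_values[t] + lambda_param * g
        g = rewards[t] + disc * not_terminals[t] * bootstrap
        returns[t] = g
    return returns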
Code example #3
    def reset_game(self):
        self.board = tic_tac_toe.BitBoard()

        # reset the experience lists
        self.state_list = []
        self.player_list = []
        self.reward_list = []
        self.not_terminal_list = []
        self.succ_state_list = []
        self.succ_player_list = []
Code example #4
File: tournament.py    Project: 13rian/rl_tic_tac_toe
def play_match(game_count, player1, player2):
    """
    lets the two passed players play against each other. the number of games needs to be even,
    otherwise the total number of games played is rounded down to the next lower even number.
    each player plays half the games as white and half the games as black
    the players will get the following scores:
    loss:  0
    draw:  0.5
    win:   1
    :param game_count:  the number of games per match
    :param player1:     player 1
    :param player2:     player 2
    :return:            average score of the player1 between 0 and 1
    """
    half_game_count = int(game_count / 2)
    score_player1 = 0

    for _ in range(half_game_count):
        # play half the games where player1 is white
        board = tic_tac_toe.BitBoard()
        while not board.terminal:
            if board.player == CONST.WHITE:
                player1.play_move(board)
            else:
                player2.play_move(board)

        score_player1 += board.white_score()

        # play half the games where player1 is black
        board = tic_tac_toe.BitBoard()
        while not board.terminal:
            if board.player == CONST.WHITE:
                player2.play_move(board)
            else:
                player1.play_move(board)

        score_player1 += board.black_score()

    return score_player1 / (2 * half_game_count)
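
A short usage sketch; the two player classes are hypothetical, the only assumption being that they expose a play_move(board) method like the one called in the loop above.

# hypothetical agents; any object with a play_move(board) method works
player_a = RandomPlayer()
player_b = MinimaxPlayer()
score_a = play_match(100, player_a, player_b)   # average score of player_a, between 0 and 1
print("player_a scored {} on average".format(score_a))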
Code example #5
def __self_play_worker__(net, mcts_sim_count, c_puct, temp_threshold, temp, alpha_dirich, game_count):
    """
    plays a number of self play games
    :param net:                 the alpha zero network
    :param mcts_sim_count:      the monte carlo simulation count
    :param c_puct:              constant that controls the exploration
    :param temp_threshold:      up to this move count the temperature will be temp, later it will be 0
    :param temp:                the temperature
    :param alpha_dirich:        dirichlet parameter alpha
    :param game_count:          the number of self-play games to play
    :return:                    state_list, policy_list, value_list
    """

    state_list = []
    policy_list = []
    value_list = []
    position_cache = {}         # faster than a shared position dict

    for i in range(game_count):
        board = tic_tac_toe.BitBoard()
        mcts = MCTS(c_puct)  # reset the search tree

        # reset the players list
        player_list = []

        move_count = 0
        while not board.terminal:
            state, player = board.white_perspective()
            temp = 0 if move_count >= temp_threshold else temp
            policy = mcts.policy_values(board, position_cache, net, mcts_sim_count, temp, alpha_dirich)

            # sample from the policy to determine the move to play
            move = np.random.choice(len(policy), p=policy)
            board.play_move(move)

            # save the training example
            state_list.append(state)
            player_list.append(player)
            policy_list.append(policy)
            move_count += 1

        # calculate the values from the perspective of the player whose move it is
        reward = board.reward()
        for player in player_list:
            value = reward if player == CONST.WHITE_MOVE else -reward
            value_list.append(value)

    return state_list, policy_list, value_list
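
The three returned lists line up one-to-one per position, so they can be stacked into training tensors. The helper below is a minimal sketch, not project code: it assumes the states and policies are numpy arrays and that training is done with torch (consistent with the Globals.device usage elsewhere).

import numpy as np
import torch


def to_training_batch(state_list, policy_list, value_list):
    # stack the per-position self-play examples into dense training tensors
    states = torch.tensor(np.array(state_list), dtype=torch.float32)
    policies = torch.tensor(np.array(policy_list), dtype=torch.float32)
    values = torch.tensor(value_list, dtype=torch.float32).unsqueeze(1)
    return states, policies, values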
Code example #6
File: minimax.py    Project: 13rian/rl_tic_tac_toe
def fill_state_dict():
    """
    fills the state dict with the white score values
    :return:
    """
    if len(state_dict) > 0:
        return

    logger.debug("start to fill the minimax state dict")
    start_time = time.time()
    board = tic_tac_toe.BitBoard()

    # the state id of the initial (empty) board
    state = board.state_id()

    # go through the whole game tree with minimax and fill the dict with the scores
    score = minimax(board, board.player, True)
    state_dict[state] = score
    end_time = time.time()
    elapsed_time = end_time - start_time
    logger.debug("elapsed time to fill the state dict: {}".format(elapsed_time))
    logger.debug("size of the state dict: {}".format(len(state_dict)))
Code example #7
    def __init__(self, learning_rate, mcts_sim_count, c_puct, temp, batch_size, exp_buffer_size):
        """
        :param learning_rate:       learning rate for the neural network
        :param mcts_sim_count:      the number of simulations for the monte-carlo tree search
        :param c_puct:              the higher this constant the more the mcts explores
        :param temp:                the temperature, controls the policy value distribution
        :param batch_size:          the experience buffer batch size to train the training network
        :param exp_buffer_size:     the size of the experience replay buffer
        """

        self.learningRate = learning_rate                            # learning rate for the stochastic gradient descent
        self.mcts_sim_count = mcts_sim_count                         # the number of simulations for the monte-carlo tree search
        self.c_puct = c_puct                                         # the higher this constant the more the mcts explores
        self.temp = temp                                             # the temperature, controls the policy value distribution
        self.batch_size = batch_size                                 # the batch size used to train the network
        self.network = Network(learning_rate)                        # the network for the policy and value prediction

        self.board = tic_tac_toe.BitBoard()                          # tic tac toe board
        self.experience_buffer = ExperienceBuffer(exp_buffer_size)   # buffer that saves all experiences

        # send the network to the configured device
        self.network.to(Globals.device)
Code example #8
    def reset_game(self):
        self.board = tic_tac_toe.BitBoard()