def do(self, board_real, dice, actor_theta, player):
        commentary = False
        print_results = False
        for game in range(25):
            board = np.copy(board_real)
            old_state = np.copy(board_real)
            self.z = np.zeros(198)
            if len(board) == 0:
                break
            count = 0
            while not Backgammon.game_over(board) and not Backgammon.check_for_error(board):
                if commentary:
                    print("Simulationgame: lets go player ", player)

                dice = Backgammon.roll_dice()
                if commentary:
                    print("Simulationgame: rolled dices:", dice)

                # make a move (2 moves if the same number appears on the dice)
                for i in range(1 + int(dice[0] == dice[1])):
                    board_copy = np.copy(board)
                    if player == 1:
                        move, new_state = self.nextMove(board_copy, dice, player, actor_theta)
                    elif player == -1:
                        move = agentX.action(board_copy, dice, player, i)
                    if len(move) != 0:

                        for m in move:
                            board = Backgammon.update_board(board, m, player)
                        if player == 1 and count > 1:
                            new_state = np.copy(board)
                            if not Backgammon.game_over(new_state) and not Backgammon.check_for_error(new_state):
                                delta = 0 + self.getValue(new_state, actor_theta, player) - self.getValue(old_state, actor_theta, player)
                                self.theta = self.theta + (self.alpha * delta * self.z)
                                self.z = self.lamb * self.z + getFeatures(old_state, player)
                                old_state = new_state

                    if commentary:
                        print("Simulationgame: move from player", player, ":")
                        Backgammon.pretty_print(board)
                player = -player
                count = count + 1   
            if print_results:
                print("simulation game nr", game)
                Backgammon.pretty_print(board)
            delta = player * -1 + 0 - self.getValue(old_state, actor_theta, player)
            
            self.theta = self.theta + self.alpha * delta * self.z
            self.z = self.lamb * self.z + getFeatures(old_state, player)
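
The updates in this method are linear TD(lambda): the value of a board is theta . phi(board), the trace z decays by lambda while accumulating the feature vector of the previous afterstate, and theta moves along the trace scaled by the TD error. Below is a minimal, self-contained sketch of that rule in the textbook ordering (the method above applies the theta update before refreshing z), with random features standing in for the 198-dimensional getFeatures used above.

import numpy as np

def td_lambda_step(theta, z, phi_old, phi_new, reward=0.0, alpha=0.1, lamb=0.7, gamma=1.0):
    """One linear TD(lambda) step: trace update followed by the weight update."""
    delta = reward + gamma * theta @ phi_new - theta @ phi_old  # TD error
    z = gamma * lamb * z + phi_old                              # accumulating eligibility trace
    theta = theta + alpha * delta * z                           # move the weights along the trace
    return theta, z

# toy usage with random features standing in for getFeatures(board, player)
theta, z = np.zeros(198), np.zeros(198)
phi_old, phi_new = np.random.rand(198), np.random.rand(198)
theta, z = td_lambda_step(theta, z, phi_old, phi_new)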
Example 2
def play_a_game_random(commentary=False):
    board = BG.init_board()  # initialize the board
    player = np.random.randint(2) * 2 - 1  # which player begins?
    randomPlayer = -1
    while not BG.game_over(board) and not BG.check_for_error(board):
        if commentary: print("lets go player ", player)

        # roll dice
        dice = BG.roll_dice()
        if commentary: print("rolled dices:", dice)

        # make a move (2 moves if the same number appears on the dice)
        for i in range(1 + int(dice[0] == dice[1])):
            board_copy = np.copy(board)

            if player == randomPlayer:
                move = flipped_agent.action(board_copy, dice, player, i)
            else:
                move = action(board_copy, dice, player, i)

            # update the board
            if len(move) != 0:
                for m in move:
                    board = BG.update_board(board, m, player)

            # give status after every move:
            if commentary:
                print("move from player", player, ":")
                BG.pretty_print(board)

        # players take turns
        player = -player

    # return the winner
    return -1 * player
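
A common way to use this loop is to estimate a win rate over many games. A short sketch, assuming play_a_game_random is defined as above:

import numpy as np

def estimate_win_rate(n_games=100):
    """Play n_games of play_a_game_random and return the fraction won by player 1."""
    winners = [play_a_game_random(commentary=False) for _ in range(n_games)]
    return float(np.mean([w == 1 for w in winners]))

# e.g. print(estimate_win_rate(500))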
Example 3
    def PlayPubEval(self, test_games=1):
        wins = []

        for _ in range(test_games):

            env = backgammon()
            done = False

            while not done:
                dice = B.roll_dice()
                for _ in range(1 + int(dice[0] == dice[1])):

                    possible_moves, possible_boards = env.legal_moves(dice, 1)
                    n_actions = len(possible_moves)

                    if n_actions == 0:
                        break

                    action = self.sample_action(possible_boards)
                    old_board, new_board, reward, done = env.step(
                        possible_moves[action], player=1)

                    if done:
                        break

                if not done:
                    #env.swap_player()
                    dice = B.roll_dice()

                    for __ in range(1 + int(dice[0] == dice[1])):
                        action = pubeval.agent_pubeval(np.copy(env.board),
                                                       dice,
                                                       oplayer=-1)
                        old_board, new_board, reward, done = env.step(
                            action, player=-1)
                        if B.check_for_error(env.board):
                            raise RuntimeError("pubeval returned an illegal board state")
                        if done:
                            reward = 0
                            break
            #env.swap_player()
            wins.append(float(reward == 1))

        return np.mean(wins)
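
self.sample_action is not shown in this example. A plausible minimal version, assuming the agent scores each candidate afterstate with some value function and picks greedily, might look like the following; the names sample_action_greedy and value_fn are hypothetical, not from the original.

import numpy as np

def sample_action_greedy(possible_boards, value_fn):
    """Return the index of the afterstate with the highest estimated value.
    value_fn is any callable board -> float (hypothetical stand-in)."""
    values = np.array([value_fn(board) for board in possible_boards])
    return int(np.argmax(values))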
Example 4
def learnit(numgames, lam_w, lam_th, alpha_w, alpha_th):
    gamma = 1  # for completeness
    # play numgames games for training
    for games in range(0, numgames):
        I = 1
        board = BG.init_board()  # initialize the board
        player = np.random.randint(2) * 2 - 1  # which player begins?
        # now we initialize all the eligibility traces for the neural network
        Z_w1 = torch.zeros(w1.size(), device=device, dtype=torch.float)
        Z_b1 = torch.zeros(b1.size(), device=device, dtype=torch.float)
        Z_w2 = torch.zeros(w2.size(), device=device, dtype=torch.float)
        Z_b2 = torch.zeros(b2.size(), device=device, dtype=torch.float)

        Z_w1_flip = torch.zeros(w1.size(), device=device, dtype=torch.float)
        Z_b1_flip = torch.zeros(b1.size(), device=device, dtype=torch.float)
        Z_w2_flip = torch.zeros(w2.size(), device=device, dtype=torch.float)
        Z_b2_flip = torch.zeros(b2.size(), device=device, dtype=torch.float)

        if games % 100 == 0:
            print(games)

        count = 0
        while not BG.game_over(board) and not BG.check_for_error(board):
            dice = BG.roll_dice()
            for i in range(1 + int(dice[0] == dice[1])):
                # possibly take the mean of xtheta?
                move, xtheta = action(np.copy(board), dice, player, i, True)
                if len(move) != 0:
                    for m in move:
                        board = BG.update_board(board, m, player)
                # if the player rolls a double and wins the game on the first move
                if BG.game_over(board):
                    break

            if BG.game_over(board):
                winner = player
                break

            if player == -1:
                board = flip_board(np.copy(board))
            if (count > 1):
                if player == -1:
                    #One-hot encoding of the board
                    xflip = Variable(
                        torch.tensor(ice_hot_encoding(board),
                                     dtype=torch.float,
                                     device=device)).view(7 * (n - 1) * 2, 1)

                    #Feed forward w-nn for old and new
                    target, _ = feed_forward_w(xflip)
                    old_target, h_sigmoid = feed_forward_w(xflipold)
                    delta = 0 + gamma * target.detach().cpu().numpy(
                    ) - old_target.detach().cpu().numpy(
                    )  # this is the usual TD error
                    # use autograd and the computational graph constructed by PyTorch to compute all gradients
                    old_target.backward()
                    # update the eligibility traces using the gradients
                    Z_w2_flip, Z_b2_flip, Z_w1_flip, Z_b1_flip = update_eligibility_w(
                        gamma, lam_w, Z_w1_flip, Z_b1_flip, Z_w2_flip,
                        Z_b2_flip)
                    # zero the gradients
                    zero_gradients_critic()
                    # perform now the update for the weights
                    delta = torch.tensor(delta,
                                         dtype=torch.float,
                                         device=device)
                    w1.data = w1.data + alpha_w * delta * Z_w1_flip
                    b1.data = b1.data + alpha_w * delta * Z_b1_flip
                    w2.data = w2.data + alpha_w * delta * Z_w2_flip
                    b2.data = b2.data + alpha_w * delta * Z_b2_flip
                    #Update theta
                    grad_ln_pi = h_sigmoid - xtheta
                    theta.data = theta.data + alpha_th * delta * grad_ln_pi.view(
                        1, len(grad_ln_pi))
                    xthetaflipold = xtheta
                else:
                    #One-hot encoding of the board
                    x = Variable(
                        torch.tensor(ice_hot_encoding(board),
                                     dtype=torch.float,
                                     device=device)).view(7 * (n - 1) * 2, 1)

                    #Feed forward w-nn for old and new
                    target, _ = feed_forward_w(x)
                    old_target, h_sigmoid = feed_forward_w(xold)
                    delta = 0 + gamma * target.detach().cpu().numpy(
                    ) - old_target.detach().cpu().numpy(
                    )  # this is the usual TD error
                    # use autograd and the computational graph constructed by PyTorch to compute all gradients
                    old_target.backward()
                    # update the eligibility traces using the gradients
                    Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(
                        gamma, lam_w, Z_w1, Z_b1, Z_w2, Z_b2)
                    # zero the gradients
                    zero_gradients_critic()
                    # perform now the update for the weights
                    delta = torch.tensor(delta,
                                         dtype=torch.float,
                                         device=device)
                    w1.data = w1.data + alpha_w * delta * Z_w1
                    b1.data = b1.data + alpha_w * delta * Z_b1
                    w2.data = w2.data + alpha_w * delta * Z_w2
                    b2.data = b2.data + alpha_w * delta * Z_b2
                    #Update theta
                    grad_ln_pi = h_sigmoid - xtheta
                    theta.data = theta.data + alpha_th * delta * grad_ln_pi.view(
                        1, len(grad_ln_pi))
                    xthetaold = xtheta


            # we need to keep track of the last board state visited by the players
            if (count < 2):
                if player == -1:
                    xflipold = Variable(
                        torch.tensor(ice_hot_encoding(board),
                                     dtype=torch.float,
                                     device=device)).view(7 * (n - 1) * 2, 1)
                else:
                    xold = Variable(
                        torch.tensor(ice_hot_encoding(board),
                                     dtype=torch.float,
                                     device=device)).view(7 * (n - 1) * 2, 1)
            else:
                if player == -1:
                    xflipold = Variable(
                        torch.tensor(xflip, dtype=torch.float,
                                     device=device)).view(7 * (n - 1) * 2, 1)
                else:
                    xold = Variable(
                        torch.tensor(x, dtype=torch.float,
                                     device=device)).view(7 * (n - 1) * 2, 1)

            if player == -1:
                board = flip_board(np.copy(board))
            # swap players
            player = -player
            count += 1

        if winner == 1:
            reward = 1
            reward_flip = -1
            xthetaold = xtheta
        else:
            reward = -1
            reward_flip = 1
            xthetaflipold = xtheta

        # update for player 1
        #Feed forward old state using w-NN
        old_target, h_sigmoid = feed_forward_w(xold)
        delta = reward + 0 - old_target.detach().cpu().numpy(
        )  # this is the usual TD error
        # use autograd and the computational graph constructed by PyTorch to compute all gradients
        old_target.backward()
        # update the eligibility traces using the gradients
        delta = torch.tensor(delta, dtype=torch.float, device=device)
        Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(gamma, lam_w, Z_w1, Z_b1,
                                                      Z_w2, Z_b2)
        # zero the gradients
        zero_gradients_critic()
        # perform the update for the weights for the critic, w
        w1.data = w1.data + alpha_w * delta * Z_w1
        b1.data = b1.data + alpha_w * delta * Z_b1
        w2.data = w2.data + alpha_w * delta * Z_w2
        b2.data = b2.data + alpha_w * delta * Z_b2

        #Update theta
        grad_ln_pi = h_sigmoid - xthetaold
        theta.data = theta.data + alpha_th * delta * grad_ln_pi.view(
            1, len(grad_ln_pi))

        # update for the flipped player
        # and then for the neural network:
        #Feed forward w-NN

        #Feed forward old state using w-NN
        flip_target, h_sigmoid = feed_forward_w(xflipold)
        delta = reward_flip + 0 - flip_target.detach().cpu().numpy(
        )  # this is the usual TD error
        # use autograd and the computational graph constructed by PyTorch to compute all gradients
        flip_target.backward()
        # update the eligibility traces using the gradients
        delta = torch.tensor(delta, dtype=torch.float, device=device)
        Z_w2_flip, Z_b2_flip, Z_w1_flip, Z_b1_flip = update_eligibility_w(
            gamma, lam_w, Z_w1_flip, Z_b1_flip, Z_w2_flip, Z_b2_flip)
        # zero the gradients
        zero_gradients_critic()
        # perform the update for the weights for the critic, w
        w1.data = w1.data + alpha_w * delta * Z_w1_flip
        b1.data = b1.data + alpha_w * delta * Z_b1_flip
        w2.data = w2.data + alpha_w * delta * Z_w2_flip
        b2.data = b2.data + alpha_w * delta * Z_b2_flip

        #Update theta
        grad_ln_pi = h_sigmoid - xthetaflipold
        theta.data = theta.data + alpha_th * delta * grad_ln_pi.view(
            1, len(grad_ln_pi))
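
update_eligibility_w is called but never shown in these examples. Under the usual TD(lambda) convention it decays each trace by gamma * lambda and adds the gradients that the preceding backward() call left in the .grad fields. The sketch below is an assumption about its body, not the original code; it matches the four-tensor call signature used here (the Dyna example further down also passes a third layer pair) and relies on the same module-level tensors w1, b1, w2, b2 as the code above.

def update_eligibility_w(gamma, lam_w, Z_w1, Z_b1, Z_w2, Z_b2):
    # decay each trace and accumulate the freshly computed critic gradients
    Z_w1 = gamma * lam_w * Z_w1 + w1.grad.data
    Z_b1 = gamma * lam_w * Z_b1 + b1.grad.data
    Z_w2 = gamma * lam_w * Z_w2 + w2.grad.data
    Z_b2 = gamma * lam_w * Z_b2 + b2.grad.data
    return Z_w2, Z_b2, Z_w1, Z_b1  # returned in the order the callers unpack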
Example 5
def learnit(numgames, lam_w, alpha1, alpha2):
    gamma = 1  # for completeness
    # play numgames games for training
    for games in range(0, numgames):
        epsilon = 15000 / (15000 + games)
        I = 1
        board = BG.init_board()  # initialize the board
        player = np.random.randint(2) * 2 - 1  # which player begins?
        # now we initialize all the eligibility traces for the neural network
        Z_w1 = torch.zeros(w1.size(), device=device, dtype=torch.float)
        Z_b1 = torch.zeros(b1.size(), device=device, dtype=torch.float)
        Z_w2 = torch.zeros(w2.size(), device=device, dtype=torch.float)
        Z_b2 = torch.zeros(b2.size(), device=device, dtype=torch.float)

        count = 0
        if games % 1000 == 0:
            print(games)
        if games % 5000 == 0:
            print('Compete:')
            wins_for_player_1 = 0
            loss_for_player_1 = 0
            competition_games = 500
            for j in range(competition_games):
                winner = play_a_game_random(commentary=False)
                if (winner == 1):
                    wins_for_player_1 += 1.0
                else:
                    loss_for_player_1 += 1.0
            print(wins_for_player_1, loss_for_player_1)
        while not BG.game_over(board) and not BG.check_for_error(board):
            dice = BG.roll_dice()
            for i in range(1 + int(dice[0] == dice[1])):
                move = action(np.copy(board), epsilon, dice, player, i)

                if len(move) != 0:
                    for m in move:
                        board = BG.update_board(board, m, player)

            if BG.game_over(board):
                break
            if player == -1:
                board = flip_board(np.copy(board))
            if (count > 1):
                # One-hot encoding of the board
                x = Variable(
                    torch.tensor(ice_hot_encoding(board),
                                 dtype=torch.float,
                                 device=device)).view(7 * (n - 1) * 2, 1)

                #Feed forward w-nn
                target = feed_forward_w(x)
                #Feed forward old state
                old_target = feed_forward_w(xolder)

                delta2 = 0 + gamma * target.detach().cpu().numpy(
                ) - old_target.detach().cpu().numpy(
                )  # this is the usual TD error
                # use autograd and the computational graph constructed by PyTorch to compute all gradients
                old_target.backward()
                # update the eligibility traces using the gradients
                Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(
                    gamma, lam_w, Z_w1, Z_b1, Z_w2, Z_b2)
                # zero the gradients
                zero_gradients_critic()
                # perform now the update for the weights
                delta2 = torch.tensor(delta2, dtype=torch.float, device=device)
                w1.data = w1.data + alpha1 * delta2 * Z_w1
                b1.data = b1.data + alpha1 * delta2 * Z_b1
                w2.data = w2.data + alpha2 * delta2 * Z_w2
                b2.data = b2.data + alpha2 * delta2 * Z_b2
                # we need to keep track of the last board state visited by the players
            if (count > 0):
                xolder = xold

            if (not BG.game_over(board)):
                if (count < 2):
                    xold = Variable(
                        torch.tensor(ice_hot_encoding(board),
                                     dtype=torch.float,
                                     device=device)).view(7 * (n - 1) * 2, 1)
                else:
                    xold = x

            if player == -1:
                board = flip_board(np.copy(board))
            # swap players
            player = -player
            count += 1
        # The game episode has ended and we know the outcome, so we can assign the terminal rewards
        reward = 1
        # update for the winner
        # these are basically the same updates as in the inner loop but for the final-after-states (sold and xold)
        # and then for the neural network:
        win_target = feed_forward_w(xold)
        delta2 = reward + gamma * 0 - win_target.detach().cpu().numpy(
        )  # this is the usual TD error
        # use autograd and the computational graph constructed by PyTorch to compute all gradients
        win_target.backward()
        # update the eligibility traces
        Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(gamma, lam_w, Z_w1, Z_b1,
                                                      Z_w2, Z_b2)
        # zero the gradients
        zero_gradients_critic()
        # perform now the update of weights
        delta2 = torch.tensor(delta2, dtype=torch.float, device=device)
        w1.data = w1.data + alpha1 * delta2 * Z_w1
        b1.data = b1.data + alpha1 * delta2 * Z_b1
        w2.data = w2.data + alpha2 * delta2 * Z_w2
        b2.data = b2.data + alpha2 * delta2 * Z_b2

        # update for the loser
        reward = -1
        # these are basically the same updates as in the inner loop but for the final-after-states (sold and xold)
        # and then for the neural network:
        loser_target = feed_forward_w(x)  # squash the output
        delta2 = reward + gamma * 0 - loser_target.detach().cpu().numpy(
        )  # this is the usual TD error
        # use autograd and the computational graph constructed by PyTorch to compute all gradients
        loser_target.backward()
        # update the eligibility traces
        Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(gamma, lam_w, Z_w1, Z_b1,
                                                      Z_w2, Z_b2)
        # zero the gradients
        zero_gradients_critic()
        # perform now the update of weights
        delta2 = torch.tensor(delta2, dtype=torch.float, device=device)
        w1.data = w1.data + alpha1 * delta2 * Z_w1
        b1.data = b1.data + alpha1 * delta2 * Z_b1
        w2.data = w2.data + alpha2 * delta2 * Z_w2
        b2.data = b2.data + alpha2 * delta2 * Z_b2
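
The exploration schedule epsilon = 15000 / (15000 + games) starts at 1 and decays slowly toward 0; a few sample values make the pace concrete:

for g in (0, 5000, 15000, 45000, 150000):
    print(g, round(15000 / (15000 + g), 3))
# prints 1.0, 0.75, 0.5, 0.25 and 0.091 respectively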
Example 6
def learnit(numgames, lam_w, lam_th, alpha1, alpha2):
    gamma = 1  # for completeness
    # play numgames games for training
    for games in range(0, numgames):
        I = 1
        board = BG.init_board()  # initialize the board
        player = np.random.randint(2) * 2 - 1  # which player begins?

        # initialize all the eligibility traces for the NN for the critic w
        Z_w1 = torch.zeros(w1.size(), device=device, dtype=torch.float)
        Z_b1 = torch.zeros(b1.size(), device=device, dtype=torch.float)
        Z_w2 = torch.zeros(w2.size(), device=device, dtype=torch.float)
        Z_b2 = torch.zeros(b2.size(), device=device, dtype=torch.float)

        # initialize all the eligibility traces for the NN for the actor theta
        Z_theta_1 = torch.zeros(theta_1.size(),
                                device=device,
                                dtype=torch.float)
        Z_thetab1 = torch.zeros(thetab1.size(),
                                device=device,
                                dtype=torch.float)
        Z_theta_2 = torch.zeros(theta_2.size(),
                                device=device,
                                dtype=torch.float)
        Z_thetab2 = torch.zeros(thetab2.size(),
                                device=device,
                                dtype=torch.float)
        if games % 100 == 0:
            print(games)
        count = 0
        delta = 0
        # play a game
        while not BG.game_over(board) and not BG.check_for_error(board):

            dice = BG.roll_dice()
            for i in range(1 + int(dice[0] == dice[1])):
                move, prob, index = action(np.copy(board), dice, player, i,
                                           True)
                if len(move) != 0:
                    for m in move:
                        board = BG.update_board(board, m, player)
                # if the player rolls a double and wins the game on the first move
                if BG.game_over(board):
                    break

            # check to see if the game is over
            if BG.game_over(board):
                break

            if player == -1:
                board = flip_board(np.copy(board))

            # only update after the first two moves, because we are using afterstates
            # and both players have to make one move first.
            if (count > 1):

                # Ice-hot encoding of the board
                x = Variable(
                    torch.tensor(ice_hot_encoding(board),
                                 dtype=torch.float,
                                 device=device)).view(2 * (n - 1) * 7, 1)

                #Feed forward w-NN
                target = feed_forward_w(x)

                #Feed forward old state using w-NN
                old_target = feed_forward_w(xolder)
                delta = 0 + gamma * target.detach().cpu().numpy(
                ) - old_target.detach().cpu().numpy(
                )  # this is the usual TD error
                # use autograd and the computational graph constructed by PyTorch to compute all gradients
                old_target.backward()
                # update the eligibility traces using the gradients
                delta = torch.tensor(delta, dtype=torch.float, device=device)
                Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(
                    gamma, lam_w, Z_w1, Z_b1, Z_w2, Z_b2)
                # zero the gradients
                zero_gradients_critic()
                # perform the update for the weights for the critic, w
                w1.data = w1.data + alpha1 * delta * Z_w1
                b1.data = b1.data + alpha1 * delta * Z_b1
                w2.data = w2.data + alpha2 * delta * Z_w2
                b2.data = b2.data + alpha2 * delta * Z_b2

                #Update theta
                logTarget = torch.log(prob)
                logTarget.backward(retain_graph=True)

                # update the eligibility traces using the gradients
                Z_theta_2, Z_thetab2, Z_theta_1, Z_thetab1 = update_eligibility_th(
                    gamma, lam_th, Z_theta_1, Z_thetab1, Z_theta_2, Z_thetab2,
                    I)
                zero_gradients_actor()  # zero the gradients

                # perform the update for the weights for the actor, theta
                theta_1.data = theta_1.data + alpha1 * delta * Z_theta_1
                thetab1.data = thetab1.data + alpha1 * delta * Z_thetab1
                theta_2.data = theta_2.data + alpha2 * delta * Z_theta_2
                thetab2.data = thetab2.data + alpha2 * delta * Z_thetab2

                I = gamma * I

            # keep track of the last state the player was in
            if (count > 0):
                xolder = xold

            # keep track of the last state
            if (not BG.game_over(board)):
                if (count < 2):
                    xold = Variable(
                        torch.tensor(ice_hot_encoding(board),
                                     dtype=torch.float,
                                     device=device)).view(2 * (n - 1) * 7, 1)
                else:
                    xold = x

            # keep track of the old values from the NN to update the player who lost
            probold = prob
            indexold = index

            if player == -1:
                board = flip_board(np.copy(board))

            # swap players
            player = -player
            count += 1

        # The game episode has ended and we know the outcome of the game, and can find the terminal rewards
        reward = 1
        # update for the winner
        # these are basically the same updates as in the inner loop but for the final-after-states (x and xold)
        # and then for the neural network:
        win_target = feed_forward_w(xold)

        delta = reward + gamma * 0 - win_target.detach().cpu().numpy(
        )  # this is the usual TD error
        delta = torch.tensor(delta, dtype=torch.float, device=device)
        # use autograd and the computational graph constructed by PyTorch to compute all gradients
        win_target.backward()
        # update the eligibility traces
        Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(gamma, lam_w, Z_w1, Z_b1,
                                                      Z_w2, Z_b2)
        # zero the gradients
        zero_gradients_critic()
        # perform now the update of weights
        w1.data = w1.data + alpha1 * delta * Z_w1
        b1.data = b1.data + alpha1 * delta * Z_b1
        w2.data = w2.data + alpha2 * delta * Z_w2
        b2.data = b2.data + alpha2 * delta * Z_b2

        # Update theta
        logTarget = torch.log(prob)
        logTarget.backward()

        # update the eligibility traces using the gradients
        Z_theta_2, Z_thetab2, Z_theta_1, Z_thetab1 = update_eligibility_th(
            gamma, lam_th, Z_theta_1, Z_thetab1, Z_theta_2, Z_thetab2, I)
        # zero the gradients
        zero_gradients_actor()

        theta_1.data = theta_1.data + alpha1 * delta * Z_theta_1
        thetab1.data = thetab1.data + alpha1 * delta * Z_thetab1
        theta_2.data = theta_2.data + alpha2 * delta * Z_theta_2
        thetab2.data = thetab2.data + alpha2 * delta * Z_thetab2

        # update for the loser
        reward = -1
        # these are basically the same updates as in the inner loop but for the final-after-states (sold and xold)
        # and then for the neural network:
        loser_target = feed_forward_w(x)  # squash the output
        delta = reward + gamma * 0 - loser_target.detach().cpu().numpy(
        )  # this is the usual TD error
        delta = torch.tensor(delta, dtype=torch.float, device=device)
        # use autograd and the computational graph constructed by PyTorch to compute all gradients
        loser_target.backward()
        # update the eligibility traces
        Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(gamma, lam_w, Z_w1, Z_b1,
                                                      Z_w2, Z_b2)
        # zero the gradients
        zero_gradients_critic()
        # perform now the update of weights
        w1.data = w1.data + alpha1 * delta * Z_w1
        b1.data = b1.data + alpha1 * delta * Z_b1
        w2.data = w2.data + alpha2 * delta * Z_w2
        b2.data = b2.data + alpha2 * delta * Z_b2

        #Update theta
        logTarget = torch.log(probold)
        logTarget.backward()

        # update the eligibility traces using the gradients
        Z_theta_2, Z_thetab2, Z_theta_1, Z_thetab1 = update_eligibility_th(
            gamma, lam_th, Z_theta_1, Z_thetab1, Z_theta_2, Z_thetab2, I)
        # zero the gradients
        zero_gradients_actor()

        theta_1.data = theta_1.data + alpha1 * delta * Z_theta_1
        thetab1.data = thetab1.data + alpha1 * delta * Z_thetab1
        theta_2.data = theta_2.data + alpha2 * delta * Z_theta_2
        thetab2.data = thetab2.data + alpha2 * delta * Z_thetab2
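
update_eligibility_th is likewise assumed rather than shown. Given that it is called right after logTarget.backward(), it presumably applies the standard actor-critic trace update z <- gamma * lambda * z + I * grad(log pi) to the actor parameters. The sketch below is an assumption about its body, matching the call signature above and relying on the same module-level tensors theta_1, thetab1, theta_2, thetab2.

def update_eligibility_th(gamma, lam_th, Z_theta_1, Z_thetab1, Z_theta_2, Z_thetab2, I):
    # decay the actor traces and add I * grad(log pi) left by logTarget.backward()
    Z_theta_1 = gamma * lam_th * Z_theta_1 + I * theta_1.grad.data
    Z_thetab1 = gamma * lam_th * Z_thetab1 + I * thetab1.grad.data
    Z_theta_2 = gamma * lam_th * Z_theta_2 + I * theta_2.grad.data
    Z_thetab2 = gamma * lam_th * Z_thetab2 + I * thetab2.grad.data
    return Z_theta_2, Z_thetab2, Z_theta_1, Z_thetab1  # order matches the unpacking above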
Example 7
def learnitDyna(numgames, epsilon, lam_w, alpha_w, gamma, numthink):
    A = np.zeros(4)
    for games in range(0, numgames):
        board = BG.init_board()  # initialize the board
        player = np.random.randint(2) * 2 - 1  # which player begins?
        count = 0
        delta = 0
        # now we initialize all the eligibility traces for the neural network
        Z_w1 = torch.zeros(w1.size(), device=device, dtype=torch.float)
        Z_b1 = torch.zeros(b1.size(), device=device, dtype=torch.float)
        Z_w2 = torch.zeros(w2.size(), device=device, dtype=torch.float)
        Z_b2 = torch.zeros(b2.size(), device=device, dtype=torch.float)
        Z_w3 = torch.zeros(w3.size(), device=device, dtype=torch.float)
        Z_b3 = torch.zeros(b3.size(), device=device, dtype=torch.float)

        Z_w1_flip = torch.zeros(w1.size(), device=device, dtype=torch.float)
        Z_b1_flip = torch.zeros(b1.size(), device=device, dtype=torch.float)
        Z_w2_flip = torch.zeros(w2.size(), device=device, dtype=torch.float)
        Z_b2_flip = torch.zeros(b2.size(), device=device, dtype=torch.float)
        Z_w3_flip = torch.zeros(w3.size(), device=device, dtype=torch.float)
        Z_b3_flip = torch.zeros(b3.size(), device=device, dtype=torch.float)

        if games % 100 == 0:
            print(games)

        #play a game
        while not BG.game_over(board) and not BG.check_for_error(board):
            dice = BG.roll_dice()

            for i in range(1 + int(dice[0] == dice[1])):
                move = action(np.copy(board), epsilon, dice, player, i)

                if len(move) != 0:
                    for m in move:
                        board = BG.update_board(board, m, player)
                # if the player rolls a double and wins on the first move, break
                if BG.game_over(board):
                    break

            if BG.game_over(board):
                winner = player
                break

            if player == -1:
                board = flip_board(np.copy(board))

            if (count > 1):
                if player == -1:
                    #One-hot encoding of the board
                    move_fliptemp = move
                    x_fliptemp = ice_hot_encoding(board)
                    xflip = Variable(
                        torch.tensor(x_fliptemp,
                                     dtype=torch.float,
                                     device=device)).view(encSize, 1)

                    #Feed forward w-nn for old and new
                    target = feed_forward_w(xflip)
                    old_target = feed_forward_w(xflipold)
                    delta = 0 + gamma * target.detach().cpu().numpy(
                    ) - old_target.detach().cpu().numpy(
                    )  # this is the usual TD error
                    # use autograd and the computational graph constructed by PyTorch to compute all gradients
                    old_target.backward()
                    # update the eligibility traces using the gradients
                    Z_w3_flip, Z_b3_flip, Z_w2_flip, Z_b2_flip, Z_w1_flip, Z_b1_flip = update_eligibility_w(
                        gamma, lam_w, Z_w1_flip, Z_b1_flip, Z_w2_flip,
                        Z_b2_flip, Z_w3_flip, Z_b3_flip)
                    # zero the gradients
                    zero_gradients_critic()
                    # perform now the update for the weights
                    delta = torch.tensor(delta,
                                         dtype=torch.float,
                                         device=device)
                    w1.data = w1.data + alpha_w * delta * Z_w1_flip
                    b1.data = b1.data + alpha_w * delta * Z_b1_flip
                    w2.data = w2.data + alpha_w * delta * Z_w2_flip
                    b2.data = b2.data + alpha_w * delta * Z_b2_flip
                    w3.data = w3.data + alpha_w * delta * Z_w3_flip
                    b3.data = b3.data + alpha_w * delta * Z_b3_flip
                    # append to the model, for the first time we create A, else we just stack on it.
                    if count == 2 and games == 0:
                        A = np.array([[x_fliptempold], [move], [x_fliptemp],
                                      0])
                    else:
                        add_to_model = np.array([[x_fliptempold], [move],
                                                 [x_fliptemp], 0])
                        A = np.vstack((A, add_to_model))

                else:
                    #One-hot encoding of the board
                    move_temp = move
                    x_temp = ice_hot_encoding(board)
                    x = Variable(
                        torch.tensor(x_temp, dtype=torch.float,
                                     device=device)).view(encSize, 1)

                    #Feed forward w-nn for old and new
                    target = feed_forward_w(x)
                    old_target = feed_forward_w(xold)
                    delta = 0 + gamma * target.detach().cpu().numpy(
                    ) - old_target.detach().cpu().numpy(
                    )  # this is the usual TD error
                    # use autograd and the computational graph constructed by PyTorch to compute all gradients
                    old_target.backward()
                    # update the eligibility traces using the gradients
                    Z_w3, Z_b3, Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(
                        gamma, lam_w, Z_w1, Z_b1, Z_w2, Z_b2, Z_w3, Z_b3)
                    # zero the gradients
                    zero_gradients_critic()
                    # perform now the update for the weights
                    delta = torch.tensor(delta,
                                         dtype=torch.float,
                                         device=device)
                    w1.data = w1.data + alpha_w * delta * Z_w1
                    b1.data = b1.data + alpha_w * delta * Z_b1
                    w2.data = w2.data + alpha_w * delta * Z_w2
                    b2.data = b2.data + alpha_w * delta * Z_b2
                    w3.data = w3.data + alpha_w * delta * Z_w3
                    b3.data = b3.data + alpha_w * delta * Z_b3
                    # append to the model, for the first time we create A, else we just stack on it.
                    if count == 2 and games == 0:
                        A = np.array([[x_tempold], [move], [x_temp], 0])
                    else:
                        add_to_model = np.array([[x_tempold], [move], [x_temp],
                                                 0])
                        A = np.vstack((A, add_to_model))

                if count > 2:
                    for thought in range(0, numthink):
                        state_indx = np.random.choice(A.shape[0])
                        state, move_temp, statenew, rewardtemp = A[state_indx]

                        if statenew == 0:
                            #Feed forward old state
                            state = Variable(
                                torch.tensor(state,
                                             dtype=torch.float,
                                             device=device)).view(encSize, 1)
                            old_target1 = feed_forward_w(state)
                            delta2 = rewardtemp + 0 - old_target1.detach().cpu(
                            ).numpy()
                        else:
                            state = Variable(
                                torch.tensor(state,
                                             dtype=torch.float,
                                             device=device)).view(encSize, 1)
                            statenew = Variable(
                                torch.tensor(statenew,
                                             dtype=torch.float,
                                             device=device)).view(encSize, 1)
                            #Feed forward w-nn
                            target1 = feed_forward_w(statenew)
                            #Feed forward old state
                            old_target1 = feed_forward_w(state)
                            delta2 = 0 + gamma * target1.detach().cpu().numpy(
                            ) - old_target1.detach().cpu().numpy(
                            )  # this is the usual TD error

                        # use autograd and the computational graph constructed by PyTorch to compute all gradients
                        old_target1.backward()
                        # zero the gradients
                        zero_gradients_critic()
                        # perform now the update for the weights
                        delta2 = torch.tensor(delta2,
                                              dtype=torch.float,
                                              device=device)
                        w1.data = w1.data + alpha_w * delta2 * w1.grad.data
                        b1.data = b1.data + alpha_w * delta2 * b1.grad.data
                        w2.data = w2.data + alpha_w * delta2 * w2.grad.data
                        b2.data = b2.data + alpha_w * delta2 * b2.grad.data
                        w3.data = w3.data + alpha_w * delta2 * w3.grad.data
                        b3.data = b3.data + alpha_w * delta2 * b3.grad.data

            if (count < 2):
                if player == -1:
                    x_fliptempold = ice_hot_encoding(board)
                    xflipold = Variable(
                        torch.tensor(ice_hot_encoding(board),
                                     dtype=torch.float,
                                     device=device)).view(encSize, 1)
                else:
                    x_tempold = ice_hot_encoding(board)
                    xold = Variable(
                        torch.tensor(ice_hot_encoding(board),
                                     dtype=torch.float,
                                     device=device)).view(encSize, 1)
            else:
                if player == -1:
                    x_fliptempold = x_fliptemp
                    xflipold = Variable(
                        torch.tensor(xflip, dtype=torch.float,
                                     device=device)).view(encSize, 1)
                else:
                    x_tempold = x_temp
                    xold = Variable(
                        torch.tensor(x, dtype=torch.float,
                                     device=device)).view(encSize, 1)

            if player == -1:
                board = flip_board(np.copy(board))
            # swap players
            player = -player
            count += 1

        if winner == 1:
            reward = 1
            reward_flip = 0
            move_temp = move
        else:
            reward = 0
            reward_flip = 1
            move_fliptemp = move

        # update for player 1
        #Feed forward old state using w-NN
        old_target = feed_forward_w(xold)
        delta = reward + 0 - old_target.detach().cpu().numpy(
        )  # this is the usual TD error
        # use autograd and the computational graph constructed by PyTorch to compute all gradients
        old_target.backward()
        # update the eligibility traces using the gradients
        delta = torch.tensor(delta, dtype=torch.float, device=device)
        Z_w3, Z_b3, Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(
            gamma, lam_w, Z_w1, Z_b1, Z_w2, Z_b2, Z_w3, Z_b3)
        # zero the gradients
        zero_gradients_critic()
        # perform the update for the weights for the critic, w
        w1.data = w1.data + alpha_w * delta * Z_w1
        b1.data = b1.data + alpha_w * delta * Z_b1
        w2.data = w2.data + alpha_w * delta * Z_w2
        b2.data = b2.data + alpha_w * delta * Z_b2
        w3.data = w3.data + alpha_w * delta * Z_w3
        b3.data = b3.data + alpha_w * delta * Z_b3

        add_to_model = np.array([[x_tempold], [move_temp], 0, reward])
        A = np.vstack((A, add_to_model))

        #Feed forward old state using w-NN
        flip_target = feed_forward_w(xflipold)
        delta = reward_flip + 0 - flip_target.detach().cpu().numpy(
        )  # this is the usual TD error
        # use autograd and the computational graph constructed by PyTorch to compute all gradients
        flip_target.backward()
        # update the eligibility traces using the gradients
        delta = torch.tensor(delta, dtype=torch.float, device=device)
        Z_w3_flip, Z_b3_flip, Z_w2_flip, Z_b2_flip, Z_w1_flip, Z_b1_flip = update_eligibility_w(
            gamma, lam_w, Z_w1_flip, Z_b1_flip, Z_w2_flip, Z_b2_flip,
            Z_w3_flip, Z_b3_flip)
        # zero the gradients
        zero_gradients_critic()
        # perform the update for the weights for the critic, w
        w1.data = w1.data + alpha_w * delta * Z_w1_flip
        b1.data = b1.data + alpha_w * delta * Z_b1_flip
        w2.data = w2.data + alpha_w * delta * Z_w2_flip
        b2.data = b2.data + alpha_w * delta * Z_b2_flip
        w3.data = w3.data + alpha_w * delta * Z_w3_flip
        b3.data = b3.data + alpha_w * delta * Z_b3_flip

        add_to_model = np.array([[x_fliptempold], [move_fliptemp], 0,
                                 reward_flip])
        A = np.vstack((A, add_to_model))
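
The inner "thought" loop above is Dyna-style planning: real transitions (state, move, next_state, reward) are appended to the model A, and numthink of them are replayed per move with the same TD update, with 0 stored as the terminal sentinel in the next-state slot. A compact, illustrative sketch of that replay step with a generic value callable standing in for the network (the sketch uses None instead of 0 as the terminal marker):

import numpy as np

def dyna_replay(model, value, numthink, gamma=1.0):
    """Replay numthink stored transitions and return their TD errors.
    model: list of (state, move, next_state, reward); next_state is None if terminal.
    value: any callable mapping a state to a float estimate."""
    deltas = []
    for _ in range(numthink):
        state, _move, next_state, reward = model[np.random.choice(len(model))]
        if next_state is None:                            # terminal transition
            delta = reward - value(state)
        else:                                             # ordinary TD error
            delta = reward + gamma * value(next_state) - value(state)
        deltas.append(delta)
    return deltas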