Example #1
def play_a_game_random(commentary=False):
    board = BG.init_board()  # initialize the board
    player = np.random.randint(2) * 2 - 1  # which player begins?
    randomPlayer = -1
    while not BG.game_over(board) and not BG.check_for_error(board):
        if commentary: print("lets go player ", player)

        # roll dice
        dice = BG.roll_dice()
        if commentary: print("rolled dices:", dice)

        # make a move (2 moves if the same number appears on the dice)
        for i in range(1 + int(dice[0] == dice[1])):
            board_copy = np.copy(board)

            if player == randomPlayer:
                move = flipped_agent.action(board_copy, dice, player, i)
            else:
                move = action(board_copy, dice, player, i)

            # update the board
            if len(move) != 0:
                for m in move:
                    board = BG.update_board(board, m, player)

            # give status after every move:
            if commentary:
                print("move from player", player, ":")
                BG.pretty_print(board)

        # players take turns
        player = -player

    # return the winner
    return -1 * player
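# Hedged usage sketch (not part of the original example): estimate how often the
# "action" policy above beats the random flipped_agent by playing a batch of
# games with play_a_game_random(). The batch size of 100 is arbitrary.
def estimate_win_rate(n_games=100):
    wins = sum(play_a_game_random() == 1 for _ in range(n_games))
    print("agent won", wins, "out of", n_games, "games against the random player")
    return wins / n_games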
Example #2
 def ExamplePolicy(self):
     _, st = B.legal_moves(B.init_board(), B.roll_dice(), 1)
     st = np.vstack(st)
     st = st[:, 1:]
     out = np.round(
         self._s.run(self._actor_policy, ({
             self._possible_states: st
         })) * 100) / 100
     out = out.flatten()
     out.sort()
     return out[::-1]
def learnit(numGames, agent):
    numWins = []
    for g in tqdm(range(numGames)):
        if g % 1000 == 0:
            #print(agent.theta)
            wins = compete(agent)
            numWins.append(wins)

        board = Backgammon.init_board()

        agent.zero_el()
        if (0 == np.random.randint(2)):
            player = 1
        else:
            player = -1

        moveNr = 0
        isGameOver = False

        while (isGameOver == False):
            dice = Backgammon.roll_dice()
            for repeat in range(1 + int(dice[0] == dice[1])):
                action = agent.greedy_action(np.copy(board), dice, player,
                                             repeat)
                # apply every sub-move of the chosen action before the next repeat
                for i in range(0, len(action)):
                    board = Backgammon.update_board(board, action[i], player)

            R = 0
            if (1 == Backgammon.game_over(board)):
                if (player == 1):
                    R = 1.0
                else:
                    R = 0
                isGameOver = True
            if (1 < moveNr) and (len(action) > 0):
                agent.update(player, R, board, isGameOver)

            if (len(action) > 0):
                if player == 1:
                    agent.xold = board
                else:
                    agent.xoldF = flip_board(board)
            player = -player
            moveNr += 1

    x = np.arange(0, numGames, 1000)
    fig = plt.figure()
    #plt.figure(figsize=(30, 30))
    ax = fig.add_subplot(111)
    ax.set_xlabel("Number of games")
    ax.set_ylabel("Wins against a random player")
    ax.plot(x, numWins)
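# Hedged usage sketch: train with learnit() and display the learning curve it
# builds. "Agent" is a hypothetical class name; the code above only requires an
# object exposing zero_el, greedy_action, update and the xold / xoldF attributes,
# and it assumes matplotlib.pyplot is imported as plt.
if __name__ == "__main__":
    agent = Agent()        # hypothetical constructor, replace with your agent class
    learnit(20000, agent)  # records wins against a random player every 1000 games
    plt.show()             # show the figure created at the end of learnit()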
def compete(agent):
    winners = {}
    winners["1"] = 0
    winners["-1"] = 0
    for g in range(100):

        board = Backgammon.init_board()

        if (0 == np.random.randint(2)):
            player = 1
        else:
            player = -1

        isGameOver = False
        while (isGameOver == False):
            dice = Backgammon.roll_dice()
            for repeat in range(1 + int(dice[0] == dice[1])):
                if (player == -1):
                    action = Backgammon.random_agent(np.copy(board), dice,
                                                     player, repeat)
                else:
                    action = agent.greedy_action(np.copy(board), dice, player,
                                                 repeat)
                for i in range(0, len(action)):
                    board = Backgammon.update_board(board, action[i], player)
            if (1 == Backgammon.game_over(board)):
                winner = player
                isGameOver = True
                break
            player = -player
        winners[str(winner)] += 1

# numWins.append(winners["1"])
    print("Out of", 100, "games,")
    print("player", 1, "won", winners["1"], "times and")
    print("player", -1, "won", winners["-1"], "times")
    return winners["1"]
Example #5
    def __init__(self):
        self.device = torch.device('cuda')
        self.w1 = Variable(torch.randn(99,
                                       832,
                                       device=self.device,
                                       dtype=torch.float),
                           requires_grad=True)
        self.b1 = Variable(torch.zeros((99, 1),
                                       device=self.device,
                                       dtype=torch.float),
                           requires_grad=True)

        self.w2 = Variable(torch.randn(99,
                                       99,
                                       device=self.device,
                                       dtype=torch.float),
                           requires_grad=True)
        self.b2 = Variable(torch.zeros((99, 1),
                                       device=self.device,
                                       dtype=torch.float),
                           requires_grad=True)

        self.Z_w1 = torch.zeros(self.w1.size(),
                                device=self.device,
                                dtype=torch.float)
        self.Z_b1 = torch.zeros(self.b1.size(),
                                device=self.device,
                                dtype=torch.float)
        self.Z_w2 = torch.zeros(self.w2.size(),
                                device=self.device,
                                dtype=torch.float)
        self.Z_b2 = torch.zeros(self.b2.size(),
                                device=self.device,
                                dtype=torch.float)

        ###The critic
        self.W = Variable(torch.randn(1,
                                      99,
                                      device=self.device,
                                      dtype=torch.float),
                          requires_grad=True)
        self.B = Variable(torch.zeros((1, 1),
                                      device=self.device,
                                      dtype=torch.float),
                          requires_grad=True)
        self.Z_W = torch.zeros(self.W.size(),
                               device=self.device,
                               dtype=torch.float)
        self.Z_B = torch.zeros(self.B.size(),
                               device=self.device,
                               dtype=torch.float)

        ###The actor
        self.theta = Variable(torch.randn(1,
                                          99,
                                          device=self.device,
                                          dtype=torch.float),
                              requires_grad=True)
        self.Z_theta = torch.zeros(self.theta.size(),
                                   device=self.device,
                                   dtype=torch.float)

        ###Stuff we need
        self.target = 0
        self.oldtarget = 0
        self.y_sigmoid = 0
        self.hidden = 0

        ###Parameters
        self.alpha1 = 0.01
        self.alpha2 = 0.01
        self.alphaC = 0.01
        self.alphaA = 0.001

        self.lamC = 1
        self.lamA = 1

        self.gamma = 1

        self.xFlipOld = flip_board(np.copy(Backgammon.init_board()))
        self.xFlipNew = flip_board(np.copy(Backgammon.init_board()))
        self.xold = Backgammon.init_board()
        self.xnew = Backgammon.init_board()

        self.xtheta = None
        self.flipxtheta = 0

        self.firstMove = True
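
    # Hedged sketch (not from the original class): a critic forward pass that is
    # consistent with the weight shapes initialized above (w1: 99x832, w2: 99x99,
    # W: 1x99). It assumes x is an 832x1 float tensor on self.device; the actual
    # project may use a different encoding size or activation choices.
    def feed_forward_critic(self, x):
        h1 = torch.sigmoid(torch.mm(self.w1, x) + self.b1)    # first hidden layer, 99x1
        h2 = torch.sigmoid(torch.mm(self.w2, h1) + self.b2)   # second hidden layer, 99x1
        value = torch.sigmoid(torch.mm(self.W, h2) + self.B)  # scalar state value, 1x1
        return value, h2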
Example #6
 def reset_old_boards(self):
     self.xFlipOld = flip_board(np.copy(Backgammon.init_board()))
     self.xold = Backgammon.init_board()
Example #7
def learnit(numgames, lam_w, lam_th, alpha_w, alpha_th):
    gamma = 1  # for completeness
    # play numgames games for training
    for games in range(0, numgames):
        I = 1
        board = BG.init_board()  # initialize the board
        player = np.random.randint(2) * 2 - 1  # which player begins?
        # now we initialize all the eligibility traces for the neural network
        Z_w1 = torch.zeros(w1.size(), device=device, dtype=torch.float)
        Z_b1 = torch.zeros(b1.size(), device=device, dtype=torch.float)
        Z_w2 = torch.zeros(w2.size(), device=device, dtype=torch.float)
        Z_b2 = torch.zeros(b2.size(), device=device, dtype=torch.float)

        Z_w1_flip = torch.zeros(w1.size(), device=device, dtype=torch.float)
        Z_b1_flip = torch.zeros(b1.size(), device=device, dtype=torch.float)
        Z_w2_flip = torch.zeros(w2.size(), device=device, dtype=torch.float)
        Z_b2_flip = torch.zeros(b2.size(), device=device, dtype=torch.float)

        if games % 100 == 0:
            print(games)

        count = 0
        while not BG.game_over(board) and not BG.check_for_error(board):
            dice = BG.roll_dice()
            for i in range(1 + int(dice[0] == dice[1])):
                # possibly take the mean of xtheta?
                move, xtheta = action(np.copy(board), dice, player, i, True)
                if len(move) != 0:
                    for m in move:
                        board = BG.update_board(board, m, player)
                # if the player gets a double and wins the game in the first move.
                if BG.game_over(board):
                    break

            if BG.game_over(board):
                winner = player
                break

            if player == -1:
                board = flip_board(np.copy(board))
            if (count > 1):
                if player == -1:
                    #One-hot encoding of the board
                    xflip = Variable(
                        torch.tensor(ice_hot_encoding(board),
                                     dtype=torch.float,
                                     device=device)).view(7 * (n - 1) * 2, 1)

                    #Feed forward w-nn for old and new
                    target, _ = feed_forward_w(xflip)
                    old_target, h_sigmoid = feed_forward_w(xflipold)
                    # the usual TD error
                    delta = (0 + gamma * target.detach().cpu().numpy()
                             - old_target.detach().cpu().numpy())
                    # using autograd and the constructed computational graph in pytorch, compute all gradients
                    old_target.backward()
                    # update the eligibility traces using the gradients
                    Z_w2_flip, Z_b2_flip, Z_w1_flip, Z_b1_flip = update_eligibility_w(
                        gamma, lam_w, Z_w1_flip, Z_b1_flip, Z_w2_flip,
                        Z_b2_flip)
                    # zero the gradients
                    zero_gradients_critic()
                    # perform now the update for the weights
                    delta = torch.tensor(delta,
                                         dtype=torch.float,
                                         device=device)
                    w1.data = w1.data + alpha_w * delta * Z_w1_flip
                    b1.data = b1.data + alpha_w * delta * Z_b1_flip
                    w2.data = w2.data + alpha_w * delta * Z_w2_flip
                    b2.data = b2.data + alpha_w * delta * Z_b2_flip
                    #Update theta
                    grad_ln_pi = h_sigmoid - xtheta
                    theta.data = theta.data + alpha_th * delta * grad_ln_pi.view(
                        1, len(grad_ln_pi))
                    xthetaflipold = xtheta
                else:
                    #One-hot encoding of the board
                    x = Variable(
                        torch.tensor(ice_hot_encoding(board),
                                     dtype=torch.float,
                                     device=device)).view(7 * (n - 1) * 2, 1)

                    #Feed forward w-nn for old and new
                    target, _ = feed_forward_w(x)
                    old_target, h_sigmoid = feed_forward_w(xold)
                    # the usual TD error
                    delta = (0 + gamma * target.detach().cpu().numpy()
                             - old_target.detach().cpu().numpy())
                    # using autograd and the constructed computational graph in pytorch, compute all gradients
                    old_target.backward()
                    # update the eligibility traces using the gradients
                    Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(
                        gamma, lam_w, Z_w1, Z_b1, Z_w2, Z_b2)
                    # zero the gradients
                    zero_gradients_critic()
                    # perform now the update for the weights
                    delta = torch.tensor(delta,
                                         dtype=torch.float,
                                         device=device)
                    w1.data = w1.data + alpha_w * delta * Z_w1
                    b1.data = b1.data + alpha_w * delta * Z_b1
                    w2.data = w2.data + alpha_w * delta * Z_w2
                    b2.data = b2.data + alpha_w * delta * Z_b2
                    #Update theta
                    grad_ln_pi = h_sigmoid - xtheta
                    theta.data = theta.data + alpha_th * delta * grad_ln_pi.view(
                        1, len(grad_ln_pi))
                    xthetaold = xtheta


            # we need to keep track of the last board state visited by the players
            if (count < 2):
                if player == -1:
                    xflipold = Variable(
                        torch.tensor(ice_hot_encoding(board),
                                     dtype=torch.float,
                                     device=device)).view(7 * (n - 1) * 2, 1)
                else:
                    xold = Variable(
                        torch.tensor(ice_hot_encoding(board),
                                     dtype=torch.float,
                                     device=device)).view(7 * (n - 1) * 2, 1)
            else:
                if player == -1:
                    xflipold = Variable(
                        torch.tensor(xflip, dtype=torch.float,
                                     device=device)).view(7 * (n - 1) * 2, 1)
                else:
                    xold = Variable(
                        torch.tensor(x, dtype=torch.float,
                                     device=device)).view(7 * (n - 1) * 2, 1)

            if player == -1:
                board = flip_board(np.copy(board))
            # swap players
            player = -player
            count += 1

        if winner == 1:
            reward = 1
            reward_flip = -1
            xthetaold = xtheta
        else:
            reward = -1
            reward_flip = 1
            xthetaflipold = xtheta

        # update for player 1
        # Feed forward the old state using the w-NN
        old_target, h_sigmoid = feed_forward_w(xold)
        # the usual TD error
        delta = reward + 0 - old_target.detach().cpu().numpy()
        # using autograd and the constructed computational graph in pytorch, compute all gradients
        old_target.backward()
        # update the eligibility traces using the gradients
        delta = torch.tensor(delta, dtype=torch.float, device=device)
        Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(gamma, lam_w, Z_w1, Z_b1,
                                                      Z_w2, Z_b2)
        # zero the gradients
        zero_gradients_critic()
        # perform the update for the weights for the critic, w
        w1.data = w1.data + alpha_w * delta * Z_w1
        b1.data = b1.data + alpha_w * delta * Z_b1
        w2.data = w2.data + alpha_w * delta * Z_w2
        b2.data = b2.data + alpha_w * delta * Z_b2

        #Update theta
        grad_ln_pi = h_sigmoid - xthetaold
        theta.data = theta.data + alpha_th * delta * grad_ln_pi.view(
            1, len(grad_ln_pi))

        # update for the flipped player
        # and then for the neural network:
        # Feed forward w-NN

        # Feed forward the old flipped state using the w-NN
        flip_target, h_sigmoid = feed_forward_w(xflipold)
        # the usual TD error
        delta = reward_flip + 0 - flip_target.detach().cpu().numpy()
        # using autograd and the constructed computational graph in pytorch, compute all gradients
        flip_target.backward()
        # update the eligibility traces using the gradients
        delta = torch.tensor(delta, dtype=torch.float, device=device)
        Z_w2_flip, Z_b2_flip, Z_w1_flip, Z_b1_flip = update_eligibility_w(
            gamma, lam_w, Z_w1_flip, Z_b1_flip, Z_w2_flip, Z_b2_flip)
        # zero the gradients
        zero_gradients_critic()
        # perform the update for the weights for the critic, w
        w1.data = w1.data + alpha_w * delta * Z_w1_flip
        b1.data = b1.data + alpha_w * delta * Z_b1_flip
        w2.data = w2.data + alpha_w * delta * Z_w2_flip
        b2.data = b2.data + alpha_w * delta * Z_b2_flip

        #Update theta
        grad_ln_pi = h_sigmoid - xthetaflipold
        theta.data = theta.data + alpha_th * delta * grad_ln_pi.view(
            1, len(grad_ln_pi))
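# Hedged sketch of the helpers the example above assumes (update_eligibility_w,
# zero_gradients_critic). It mirrors the inline trace/gradient pattern used in
# the later examples: module-level tensors w1, b1, w2, b2 with requires_grad=True
# whose .grad fields were just filled by old_target.backward().
def update_eligibility_w(gamma, lam_w, Z_w1, Z_b1, Z_w2, Z_b2):
    # accumulating eligibility traces: Z <- gamma * lambda * Z + gradient
    Z_w1 = gamma * lam_w * Z_w1 + w1.grad.data
    Z_b1 = gamma * lam_w * Z_b1 + b1.grad.data
    Z_w2 = gamma * lam_w * Z_w2 + w2.grad.data
    Z_b2 = gamma * lam_w * Z_b2 + b2.grad.data
    return Z_w2, Z_b2, Z_w1, Z_b1


def zero_gradients_critic():
    # clear the critic gradients so the next backward() starts from zero
    for p in (w1, b1, w2, b2):
        p.grad.data.zero_()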
Example #8
 def reset(self):
     self.board = B.init_board()
     self.done = False
Example #9
 def __init__(self):
     self.board = B.init_board()
Example #10
def learnit(numgames, lam_w, alpha1, alpha2):
    gamma = 1  # for completeness
    # play numgames games for training
    for games in range(0, numgames):
        epsilon = 15000 / (15000 + games)
        I = 1
        board = BG.init_board()  # initialize the board
        player = np.random.randint(2) * 2 - 1  # which player begins?
        # now we initialize all the eligibility traces for the neural network
        Z_w1 = torch.zeros(w1.size(), device=device, dtype=torch.float)
        Z_b1 = torch.zeros(b1.size(), device=device, dtype=torch.float)
        Z_w2 = torch.zeros(w2.size(), device=device, dtype=torch.float)
        Z_b2 = torch.zeros(b2.size(), device=device, dtype=torch.float)

        count = 0
        if games % 1000 == 0:
            print(games)
        if games % 5000 == 0:
            print('Compete:')
            wins_for_player_1 = 0
            loss_for_player_1 = 0
            competition_games = 500
            for j in range(competition_games):
                winner = play_a_game_random(commentary=False)
                if (winner == 1):
                    wins_for_player_1 += 1.0
                else:
                    loss_for_player_1 += 1.0
            print(wins_for_player_1, loss_for_player_1)
        while not BG.game_over(board) and not BG.check_for_error(board):
            dice = BG.roll_dice()
            for i in range(1 + int(dice[0] == dice[1])):
                move = action(np.copy(board), epsilon, dice, player, i)

                if len(move) != 0:
                    for m in move:
                        board = BG.update_board(board, m, player)

            if BG.game_over(board):
                break
            if player == -1:
                board = flip_board(np.copy(board))
            if (count > 1):
                # One-hot encoding of the board
                x = Variable(
                    torch.tensor(ice_hot_encoding(board),
                                 dtype=torch.float,
                                 device=device)).view(7 * (n - 1) * 2, 1)

                #Feed forward w-nn
                target = feed_forward_w(x)
                #Feed forward old state
                old_target = feed_forward_w(xolder)

                # the usual TD error
                delta2 = (0 + gamma * target.detach().cpu().numpy()
                          - old_target.detach().cpu().numpy())
                # using autograd and the constructed computational graph in pytorch, compute all gradients
                old_target.backward()
                # update the eligibility traces using the gradients
                Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(
                    gamma, lam_w, Z_w1, Z_b1, Z_w2, Z_b2)
                # zero the gradients
                zero_gradients_critic()
                # perform now the update for the weights
                delta2 = torch.tensor(delta2, dtype=torch.float, device=device)
                w1.data = w1.data + alpha1 * delta2 * Z_w1
                b1.data = b1.data + alpha1 * delta2 * Z_b1
                w2.data = w2.data + alpha2 * delta2 * Z_w2
                b2.data = b2.data + alpha2 * delta2 * Z_b2
                # we need to keep track of the last board state visited by the players
            if (count > 0):
                xolder = xold

            if (not BG.game_over(board)):
                if (count < 2):
                    xold = Variable(
                        torch.tensor(ice_hot_encoding(board),
                                     dtype=torch.float,
                                     device=device)).view(7 * (n - 1) * 2, 1)
                else:
                    xold = x

            if player == -1:
                board = flip_board(np.copy(board))
            # swap players
            player = -player
            count += 1
        # The game episode has ended and we know the outcome of the game, and can find the terminal rewards
        reward = 1
        # update for the winner
        # these are basically the same updates as in the inner loop but for the final-after-states (sold and xold)
        # and then for the neural network:
        win_target = feed_forward_w(xold)
        # the usual TD error
        delta2 = reward + gamma * 0 - win_target.detach().cpu().numpy()
        # using autograd and the constructed computational graph in pytorch, compute all gradients
        win_target.backward()
        # update the eligibility traces
        Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(gamma, lam_w, Z_w1, Z_b1,
                                                      Z_w2, Z_b2)
        # zero the gradients
        zero_gradients_critic()
        # perform now the update of weights
        delta2 = torch.tensor(delta2, dtype=torch.float, device=device)
        w1.data = w1.data + alpha1 * delta2 * Z_w1
        b1.data = b1.data + alpha1 * delta2 * Z_b1
        w2.data = w2.data + alpha2 * delta2 * Z_w2
        b2.data = b2.data + alpha2 * delta2 * Z_b2

        # update for the loser
        reward = -1
        # these are basically the same updates as in the inner loop but for the final-after-states (sold and xold)
        # and then for the neural network:
        loser_target = feed_forward_w(x)  # value of the final afterstate for the losing side
        # the usual TD error
        delta2 = reward + gamma * 0 - loser_target.detach().cpu().numpy()
        # using autograd and the constructed computational graph in pytorch, compute all gradients
        loser_target.backward()
        # update the eligibility traces
        Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(gamma, lam_w, Z_w1, Z_b1,
                                                      Z_w2, Z_b2)
        # zero the gradients
        zero_gradients_critic()
        # perform now the update of weights
        delta2 = torch.tensor(delta2, dtype=torch.float, device=device)
        w1.data = w1.data + alpha1 * delta2 * Z_w1
        b1.data = b1.data + alpha1 * delta2 * Z_b1
        w2.data = w2.data + alpha2 * delta2 * Z_w2
        b2.data = b2.data + alpha2 * delta2 * Z_b2
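# Hedged sketch of the epsilon-greedy "action" helper called above. It mirrors
# the afterstate-evaluation pattern used elsewhere in these examples and assumes
# BG.legal_moves, ice_hot_encoding, feed_forward_w, flip_board, n and device all
# have the same meanings as above; the real helper may differ in detail.
def action(board_copy, epsilon, dice, player, i):
    possible_moves, possible_boards = BG.legal_moves(board_copy, dice, player)
    if len(possible_moves) == 0:
        return []  # no legal move for this roll
    if np.random.rand() < epsilon:  # explore with a random legal move
        return possible_moves[np.random.randint(len(possible_moves))]
    values = []
    for after_board in possible_boards:  # evaluate every afterstate with the critic
        if player == -1:  # the critic always sees the position from player +1's side
            after_board = flip_board(np.copy(after_board))
        x = Variable(
            torch.tensor(ice_hot_encoding(after_board),
                         dtype=torch.float,
                         device=device)).view(7 * (n - 1) * 2, 1)
        values.append(float(feed_forward_w(x)))
    return possible_moves[int(np.argmax(values))]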
Example #11
def learnit(numgames, lam_w, lam_th, alpha1, alpha2):
    gamma = 1  # for completeness
    # play numgames games for training
    for games in range(0, numgames):
        I = 1
        board = BG.init_board()  # initialize the board
        player = np.random.randint(2) * 2 - 1  # which player begins?

        # initialize all the eligibility traces for the NN for the critic w
        Z_w1 = torch.zeros(w1.size(), device=device, dtype=torch.float)
        Z_b1 = torch.zeros(b1.size(), device=device, dtype=torch.float)
        Z_w2 = torch.zeros(w2.size(), device=device, dtype=torch.float)
        Z_b2 = torch.zeros(b2.size(), device=device, dtype=torch.float)

        # initialize all the eligibility traces for the NN for the actor theta
        Z_theta_1 = torch.zeros(theta_1.size(),
                                device=device,
                                dtype=torch.float)
        Z_thetab1 = torch.zeros(thetab1.size(),
                                device=device,
                                dtype=torch.float)
        Z_theta_2 = torch.zeros(theta_2.size(),
                                device=device,
                                dtype=torch.float)
        Z_thetab2 = torch.zeros(thetab2.size(),
                                device=device,
                                dtype=torch.float)
        if games % 100 == 0:
            print(games)
        count = 0
        delta = 0
        # play a game
        while not BG.game_over(board) and not BG.check_for_error(board):

            dice = BG.roll_dice()
            for i in range(1 + int(dice[0] == dice[1])):
                move, prob, index = action(np.copy(board), dice, player, i,
                                           True)
                if len(move) != 0:
                    for m in move:
                        board = BG.update_board(board, m, player)
                # if the player gets a double and wins the game in the first move.
                if BG.game_over(board):
                    break

            # check to see if the game is over
            if BG.game_over(board):
                break

            if player == -1:
                board = flip_board(np.copy(board))

            # only update after the first two moves, because we are using afterstates
            # and both players have to make one move first.
            if (count > 1):

                # Ice-hot encoding of the board
                x = Variable(
                    torch.tensor(ice_hot_encoding(board),
                                 dtype=torch.float,
                                 device=device)).view(2 * (n - 1) * 7, 1)

                #Feed forward w-NN
                target = feed_forward_w(x)

                #Feed forward old state using w-NN
                old_target = feed_forward_w(xolder)
                # the usual TD error
                delta = (0 + gamma * target.detach().cpu().numpy()
                         - old_target.detach().cpu().numpy())
                # using autograd and the constructed computational graph in pytorch, compute all gradients
                old_target.backward()
                # update the eligibility traces using the gradients
                delta = torch.tensor(delta, dtype=torch.float, device=device)
                Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(
                    gamma, lam_w, Z_w1, Z_b1, Z_w2, Z_b2)
                # zero the gradients
                zero_gradients_critic()
                # perform the update for the weights for the critic, w
                w1.data = w1.data + alpha1 * delta * Z_w1
                b1.data = b1.data + alpha1 * delta * Z_b1
                w2.data = w2.data + alpha2 * delta * Z_w2
                b2.data = b2.data + alpha2 * delta * Z_b2

                #Update theta
                logTarget = torch.log(prob)
                logTarget.backward(retain_graph=True)

                # update the eligibility traces using the gradients
                Z_theta_2, Z_thetab2, Z_theta_1, Z_thetab1 = update_eligibility_th(
                    gamma, lam_th, Z_theta_1, Z_thetab1, Z_theta_2, Z_thetab2,
                    I)
                zero_gradients_actor()  # zero the gradients

                # perform the update for the weights for the actor, theta
                theta_1.data = theta_1.data + alpha1 * delta * Z_theta_1
                thetab1.data = thetab1.data + alpha1 * delta * Z_thetab1
                theta_2.data = theta_2.data + alpha2 * delta * Z_theta_2
                thetab2.data = thetab2.data + alpha2 * delta * Z_thetab2

                I = gamma * I

            # keep track of the last state the player was in
            if (count > 0):
                xolder = xold

            # keep track of the last state
            if (not BG.game_over(board)):
                if (count < 2):
                    xold = Variable(
                        torch.tensor(ice_hot_encoding(board),
                                     dtype=torch.float,
                                     device=device)).view(2 * (n - 1) * 7, 1)
                else:
                    xold = x

            # keep track of the old values from the NN to update the player who lost
            probold = prob
            indexold = index

            if player == -1:
                board = flip_board(np.copy(board))

            # swap players
            player = -player
            count += 1

        # The game episode has ended and we know the outcome of the game, and can find the terminal rewards
        reward = 1
        # update for the winner
        # these are basically the same updates as in the inner loop but for the final-after-states (x and xold)
        # and then for the neural network:
        win_target = feed_forward_w(xold)

        # the usual TD error
        delta = reward + gamma * 0 - win_target.detach().cpu().numpy()
        delta = torch.tensor(delta, dtype=torch.float, device=device)
        # using autograd and the constructed computational graph in pytorch, compute all gradients
        win_target.backward()
        # update the eligibility traces
        Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(gamma, lam_w, Z_w1, Z_b1,
                                                      Z_w2, Z_b2)
        # zero the gradients
        zero_gradients_critic()
        # perform now the update of weights
        w1.data = w1.data + alpha1 * delta * Z_w1
        b1.data = b1.data + alpha1 * delta * Z_b1
        w2.data = w2.data + alpha2 * delta * Z_w2
        b2.data = b2.data + alpha2 * delta * Z_b2

        # Update theta
        logTarget = torch.log(prob)
        logTarget.backward()

        # update the eligibility traces using the gradients
        Z_theta_2, Z_thetab2, Z_theta_1, Z_thetab1 = update_eligibility_th(
            gamma, lam_th, Z_theta_1, Z_thetab1, Z_theta_2, Z_thetab2, I)
        # zero the gradients
        zero_gradients_actor()

        theta_1.data = theta_1.data + alpha1 * delta * Z_theta_1
        thetab1.data = thetab1.data + alpha1 * delta * Z_thetab1
        theta_2.data = theta_2.data + alpha2 * delta * Z_theta_2
        thetab2.data = thetab2.data + alpha2 * delta * Z_thetab2

        # update for the loser
        reward = -1
        # these are basically the same updates as in the inner loop but for the final-after-states (sold and xold)
        # and then for the neural network:
        loser_target = feed_forward_w(x)  # value of the final afterstate for the losing side
        # the usual TD error
        delta = reward + gamma * 0 - loser_target.detach().cpu().numpy()
        delta = torch.tensor(delta, dtype=torch.float, device=device)
        # using autograd and the constructed computational graph in pytorch, compute all gradients
        loser_target.backward()
        # update the eligibility traces
        Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(gamma, lam_w, Z_w1, Z_b1,
                                                      Z_w2, Z_b2)
        # zero the gradients
        zero_gradients_critic()
        # perform now the update of weights
        w1.data = w1.data + alpha1 * delta * Z_w1
        b1.data = b1.data + alpha1 * delta * Z_b1
        w2.data = w2.data + alpha2 * delta * Z_w2
        b2.data = b2.data + alpha2 * delta * Z_b2

        #Update theta
        logTarget = torch.log(probold)
        logTarget.backward()

        # update the eligibility traces using the gradients
        Z_theta_2, Z_thetab2, Z_theta_1, Z_thetab1 = update_eligibility_th(
            gamma, lam_th, Z_theta_1, Z_thetab1, Z_theta_2, Z_thetab2, I)
        # zero the gradients
        zero_gradients_actor()

        theta_1.data = theta_1.data + alpha1 * delta * Z_theta_1
        thetab1.data = thetab1.data + alpha1 * delta * Z_thetab1
        theta_2.data = theta_2.data + alpha2 * delta * Z_theta_2
        thetab2.data = thetab2.data + alpha2 * delta * Z_thetab2
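# Hedged sketch of the actor helpers the example above assumes
# (update_eligibility_th, zero_gradients_actor), mirroring the critic-trace
# pattern: module-level actor tensors theta_1, thetab1, theta_2, thetab2 with
# requires_grad=True whose .grad fields were just filled by logTarget.backward();
# I is the accumulated discount factor.
def update_eligibility_th(gamma, lam_th, Z_theta_1, Z_thetab1, Z_theta_2, Z_thetab2, I):
    Z_theta_1 = gamma * lam_th * Z_theta_1 + I * theta_1.grad.data
    Z_thetab1 = gamma * lam_th * Z_thetab1 + I * thetab1.grad.data
    Z_theta_2 = gamma * lam_th * Z_theta_2 + I * theta_2.grad.data
    Z_thetab2 = gamma * lam_th * Z_thetab2 + I * thetab2.grad.data
    return Z_theta_2, Z_thetab2, Z_theta_1, Z_thetab1


def zero_gradients_actor():
    # clear the actor gradients after they have been folded into the traces
    for p in (theta_1, thetab1, theta_2, thetab2):
        p.grad.data.zero_()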
def learnit(numgames, epsilon, lam, alpha, V, alpha1, alpha2, w1, b1, w2, b2):
    gamma = 1  # for completeness
    global episode_number, xs, hs, logProp

    # play numgames games for training
    for games in range(0, numgames):
        board = Backgammon.init_board()  # initialize the board (empty)
        # we will use TD(lambda) and so we need to use eligibility traces
        S = []  # visited after-states for table V start out as an empty list
        E = np.array([])  # eligibility traces for table V
        # now we initialize all the eligibility traces for the neural network
        Z_w1 = torch.zeros(w1.size(), device=device, dtype=torch.float)
        Z_b1 = torch.zeros(b1.size(), device=device, dtype=torch.float)
        Z_w2 = torch.zeros(w2.size(), device=device, dtype=torch.float)
        Z_b2 = torch.zeros(b2.size(), device=device, dtype=torch.float)
        # player to start is "1" the other player is "-1"
        player = 1
        tableplayer = -1
        winner = 0  # this implies a draw
        # start taking turns playing the game
        dice = Backgammon.roll_dice()
        legal_moves = Backgammon.legal_moves(board, dice, player)
        for moveNumber in range(0, len(legal_moves)):
            # use a policy to find action
            if (player == tableplayer):  # this one is using the table V
                possible_moves, possible_boards = Backgammon.legal_moves(
                    board, dice, player)
                action = possible_moves[np.random.randint(len(possible_moves))]
            else:  # this one is using the neural-network to approximate the after-state value
                action = epsilon_nn_greedy(np.copy(board), dice, player,
                                           epsilon, w1, b1, w2, b2)
            # perform move and update board
            for i in range(0, len(action)):
                board = Backgammon.update_board(board, action[i], player)
            if (1 == Backgammon.game_over(board)):  # has this player won?
                winner = player
                break  # bail out of inner game loop
            # once both players have performed at least one move we can start doing updates
            if (1 < moveNumber):
                if tableplayer == player:  # here the table player updates the table V
                    s = hash_it(board)  # get index to table for this new board
                    delta = 0 + gamma * V[s] - V[sold]
                    # add a trace for this state (all new states are unique, else we would add 1)
                    E = np.append(E, 1)
                    S.append(sold)  # keep track of this state also
                    V[S] = V[S] + delta * alpha * E  # the usual tabular TD(lambda) update
                    E = gamma * lam * E
                else:  # here the other player updates the neural network (2-layer feed-forward with sigmoid units)
                    x = Variable(
                        torch.tensor(one_hot_encoding(board, player),
                                     dtype=torch.float,
                                     device=device)).view(2 * 9, 1)
                    # now do a forward pass to evaluate the new board's after-state value
                    # matrix-multiply x with the input weights w1 and add the bias
                    h = torch.mm(w1, x) + b1
                    h_sigmoid = h.sigmoid()  # squash this with a sigmoid function
                    # multiply with the output weights w2 and add the bias
                    y = torch.mm(w2, h_sigmoid) + b2
                    y_sigmoid = y.sigmoid()  # squash this with a sigmoid function
                    target = y_sigmoid.detach().cpu().numpy()
                    # let's also do a forward pass for the old board, this is the state we will update
                    h = torch.mm(w1, xold) + b1
                    h_sigmoid = h.sigmoid()
                    y = torch.mm(w2, h_sigmoid) + b2
                    y_sigmoid = y.sigmoid()  # squash the output
                    # the usual TD error
                    delta2 = 0 + gamma * target - y_sigmoid.detach().cpu().numpy()
                    # using autograd and the constructed computational graph in pytorch, compute all gradients
                    y_sigmoid.backward()
                    # update the eligibility traces using the gradients
                    Z_w2 = gamma * lam * Z_w2 + w2.grad.data
                    Z_b2 = gamma * lam * Z_b2 + b2.grad.data
                    Z_w1 = gamma * lam * Z_w1 + w1.grad.data
                    Z_b1 = gamma * lam * Z_b1 + b1.grad.data
                    # zero the gradients
                    w2.grad.data.zero_()
                    b2.grad.data.zero_()
                    w1.grad.data.zero_()
                    b1.grad.data.zero_()
                    # perform now the update for the weights
                    delta2 = torch.tensor(delta2,
                                          dtype=torch.float,
                                          device=device)
                    w1.data = w1.data + alpha1 * delta2 * Z_w1
                    b1.data = b1.data + alpha1 * delta2 * Z_b1
                    w2.data = w2.data + alpha2 * delta2 * Z_w2
                    b2.data = b2.data + alpha2 * delta2 * Z_b2

            # we need to keep track of the last board state visited by the players
            if tableplayer == player:
                sold = hash_it(board)
            else:
                xold = Variable(
                    torch.tensor(one_hot_encoding(board),
                                 dtype=torch.float,
                                 device=device)).view(28 * 2 * 6, 1)
            # swap players
            player = -player

        # The game episode has ended and we know the outcome of the game, and we can find the terminal rewards
        if winner == tableplayer:
            reward = 0
        elif winner == -tableplayer:
            reward = 1
        else:
            reward = 0.5

        episode_number += 1

        end_x = torch.stack(xs)
        end_h = torch.stack(hs)
        end_logProp = torch.stack(logProp)

        xs, hs, logProp = [], [], []

        # question: how should the reward be given, and does some kind of discount need to be applied?
        end_logProp *= reward

        grad = actor_policy_backward(end_x, end_h, end_logProp, w2)

        for k in model:
            grad_buffer[k] += grad[k]  #adding the grad to the batch

        if episode_number == batch_size:
            episode_number = 0
            for k, v in model.items():
                g = grad_buffer[k]
                rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (
                    1 - decay_rate) * np.power(g, 2)
                model[k] += learning_rate * g / np.sqrt(rmsprop_cache[k] +
                                                        1e-5)
                print('model[k]', model[k])
                print('model', model)
                grad_buffer[k] = np.zeros_like(v)

        # Now we perform the final update (terminal after-state value is zero)
        # these are basically the same updates as in the inner loop but for the final-after-states (sold and xold)
        # first for the table (note if reward is 0 this player actually won!):
        delta = (1.0 - reward) + gamma * 0 - V[sold]
        E = np.append(E, 1)  # add one to the trace (recall unique states)
        S.append(sold)

        V[S] = V[S] + delta * alpha * E  # vectorized tabular TD(lambda) update over all visited states
        # and then for the neural network:
        h = torch.mm(w1, xold) + b1  # matrix-multiply xold with the input weights w1 and add the bias
        h_sigmoid = h.sigmoid()  # squash this with a sigmoid function
        y = torch.mm(w2, h_sigmoid) + b2  # multiply with the output weights w2 and add the bias
        y_sigmoid = y.sigmoid()  # squash the output
        delta2 = reward + gamma * 0 - y_sigmoid.detach().cpu().numpy()  # the usual TD error
        # using autograd and the constructed computational graph in pytorch, compute all gradients
        y_sigmoid.backward()
        # update the eligibility traces
        Z_w2 = gamma * lam * Z_w2 + w2.grad.data
        Z_b2 = gamma * lam * Z_b2 + b2.grad.data
        Z_w1 = gamma * lam * Z_w1 + w1.grad.data
        Z_b1 = gamma * lam * Z_b1 + b1.grad.data
        # zero the gradients
        w2.grad.data.zero_()
        b2.grad.data.zero_()
        w1.grad.data.zero_()
        b1.grad.data.zero_()
        # perform now the update of weights
        delta2 = torch.tensor(delta2, dtype=torch.float, device=device)
        w1.data = w1.data + alpha1 * delta2 * Z_w1
        b1.data = b1.data + alpha1 * delta2 * Z_b1
        w2.data = w2.data + alpha2 * delta2 * Z_w2
        b2.data = b2.data + alpha2 * delta2 * Z_b2
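# Hedged sketch of a 28*2*6 "one_hot_encoding", consistent with the
# .view(28 * 2 * 6, 1) reshapes above: for each of the 28 board slots and each
# player, six indicators for holding 1, 2, 3, 4, 5 or 6-plus checkers there.
# The real project may bucket counts or order the features differently.
def one_hot_encoding(board):
    features = np.zeros((28, 2, 6))
    for pos in range(1, 29):                 # board[1..28]: points, bars and borne-off slots
        checkers = int(board[pos])
        if checkers != 0:
            side = 0 if checkers > 0 else 1  # player +1 vs player -1
            count = min(abs(checkers), 6)
            features[pos - 1, side, count - 1] = 1.0
    return features.flatten()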
Example #13
def learnit(numgames, epsilon, lam, alpha, alpha1, alpha2, w1, b1, w2, b2):
    gamma = 1 # for completeness
    # play numgames games for training
    for games in range(0, numgames):
        board = Backgammon.init_board()    # initialize the board (empty)
        # now we initialize all the eligibility traces for the neural network
        Z_w1 = torch.zeros(w1.size(), device = device, dtype = torch.float)
        Z_b1 = torch.zeros(b1.size(), device = device, dtype = torch.float)
        Z_w2 = torch.zeros(w2.size(), device = device, dtype = torch.float)
        Z_b2 = torch.zeros(b2.size(), device = device, dtype = torch.float)
        # player to start is "1" the other player is "-1"
        player = 1
        otherplayer = -1
        winner = 0 # this implies a draw
        isGameOver = False
        moveNumber = 0
        while (isGameOver == False):
            dice = Backgammon.roll_dice()
            # use a policy to find action
            # both are using the neural-network to approximate the after-state value
            if (player == otherplayer): # this one flips the board to find an action.
                possible_moves, possible_boards = Backgammon.legal_moves(flipped_agent.flip_board(np.copy(board)), dice, -player)
                action = epsilon_nn_greedy(flipped_agent.flip_board(np.copy(board)), dice, -player, epsilon, w1, b1, w2, b2,  possible_moves, possible_boards, False)
                action = flipped_agent.flip_move(action)
            else: # this one uses the original board.
                possible_moves, possible_boards = Backgammon.legal_moves(board, dice, player)
                action = epsilon_nn_greedy(np.copy(board), dice, player, epsilon, w1, b1, w2, b2, possible_moves, possible_boards, False)
            # perform move and update board
            for i in range(0,len(action)):
                board = Backgammon.update_board(board, action[i], player)
            if (1 == Backgammon.game_over(board)): # has this player won?
                winner = player
                isGameOver = True
                break # bail out of inner game loop
            # once both players have performed at least one move we can start doing updates
            if (1 < moveNumber):
                if otherplayer == player: # here player -1 updates the network using the flipped board
                    x_flipped = Variable(torch.tensor(one_hot_encoding(flipped_agent.flip_board(board)), dtype = torch.float, device = device)).view(28*2*6,1)
                    h = torch.mm(w1,x_flipped) + b1 # matrix-multiply x with input weight w1 and add bias
                    h_sigmoid = h.sigmoid() # squash this with a sigmoid function
                    y = torch.mm(w2,h_sigmoid) + b2 # multiply with the output weights w2 and add bias
                    y_sigmoid = y.sigmoid() # squash this with a sigmoid function
                    target = y_sigmoid.detach().cpu().numpy()
                    # let's also do a forward pass for the old board, this is the state we will update
                    h = torch.mm(w1,xold_flipped) + b1 # matrix-multiply x with input weight w1 and add bias
                    h_sigmoid = h.sigmoid() # squash this with a sigmoid function
                    y = torch.mm(w2,h_sigmoid) + b2 # multiply with the output weights w2 and add bias
                    y_sigmoid = y.sigmoid() # squash the output
                    delta2 = 0 + gamma * target - y_sigmoid.detach().cpu().numpy() # this is the usual TD error
                else: # here we have player 1 updating the neural-network (2 layer feed forward with Sigmoid units)
                    x = Variable(torch.tensor(one_hot_encoding(board), dtype = torch.float, device = device)).view(28*2*6,1)
                    # now do a forward pass to evaluate the new board's after-state value
                    h = torch.mm(w1,x) + b1 # matrix-multiply x with input weight w1 and add bias
                    h_sigmoid = h.sigmoid() # squash this with a sigmoid function
                    y = torch.mm(w2,h_sigmoid) + b2 # multiply with the output weights w2 and add bias
                    y_sigmoid = y.sigmoid() # squash this with a sigmoid function
                    target = y_sigmoid.detach().cpu().numpy()
                    # let's also do a forward pass for the old board, this is the state we will update
                    h = torch.mm(w1,xold) + b1 # matrix-multiply x with input weight w1 and add bias
                    h_sigmoid = h.sigmoid() # squash this with a sigmoid function
                    y = torch.mm(w2,h_sigmoid) + b2 # multiply with the output weights w2 and add bias
                    y_sigmoid = y.sigmoid() # squash the output
                    delta2 = 0 + gamma * target - y_sigmoid.detach().cpu().numpy() # this is the usual TD error
                # using autograd and the constructed computational graph in pytorch, compute all gradients
                y_sigmoid.backward()
                # update the eligibility traces using the gradients
                Z_w1 = gamma * lam * Z_w1 + w1.grad.data
                Z_b1 = gamma * lam * Z_b1 + b1.grad.data
                Z_w2 = gamma * lam * Z_w2 + w2.grad.data
                Z_b2 = gamma * lam * Z_b2 + b2.grad.data
                # zero the gradients
                w1.grad.data.zero_()
                b1.grad.data.zero_()
                w2.grad.data.zero_()
                b2.grad.data.zero_()
                # perform now the update for the weights
                delta2 =  torch.tensor(delta2, dtype = torch.float, device = device)
                w1.data = w1.data + alpha1 * delta2 * Z_w1
                b1.data = b1.data + alpha1 * delta2 * Z_b1
                w2.data = w2.data + alpha2 * delta2 * Z_w2
                b2.data = b2.data + alpha2 * delta2 * Z_b2

            # we need to keep track of the last board state visited by the players
            if otherplayer == player:
                xold_flipped = Variable(torch.tensor(one_hot_encoding(flipped_agent.flip_board(board)), dtype=torch.float, device = device)).view(28*2*6,1)
            else:
                xold = Variable(torch.tensor(one_hot_encoding(board), dtype=torch.float, device = device)).view(28*2*6,1)
            # swap players
            player = -player
            moveNumber = moveNumber + 1

        # The game episode has ended and we know the outcome of the game, and we can find the terminal rewards
        if winner == otherplayer:
            reward = 0
        elif winner == -otherplayer:
            reward = 1
        else:
            reward = 0.5
        # Now we perform the final update (terminal after-state value is zero)
        # these are basically the same updates as in the inner loop but for the final-after-states (xold and xold_flipped)
        
        # First we update the values for player -1
        h = torch.mm(w1,xold_flipped) + b1 # matrix-multiply x with input weight w1 and add bias
        h_sigmoid = h.sigmoid() # squash this with a sigmoid function
        y = torch.mm(w2,h_sigmoid) + b2 # multiply with the output weights w2 and add bias
        y_sigmoid = y.sigmoid() # squash the output
        delta = (1.0 - reward) + gamma * 0 - y_sigmoid.detach().cpu().numpy()
        # using autograd and the constructed computational graph in pytorch, compute all gradients
        y_sigmoid.backward()
        # update the eligibility traces
        Z_w1 = gamma * lam * Z_w1 + w1.grad.data
        Z_b1 = gamma * lam * Z_b1 + b1.grad.data
        Z_w2 = gamma * lam * Z_w2 + w2.grad.data
        Z_b2 = gamma * lam * Z_b2 + b2.grad.data
        # zero the gradients
        w1.grad.data.zero_()
        b1.grad.data.zero_()
        w2.grad.data.zero_()
        b2.grad.data.zero_()
        # perform now the update of weights
        delta =  torch.tensor(delta, dtype = torch.float, device = device)
        w1.data = w1.data + alpha1 * delta * Z_w1
        b1.data = b1.data + alpha1 * delta * Z_b1
        w2.data = w2.data + alpha2 * delta * Z_w2
        b2.data = b2.data + alpha2 * delta * Z_b2
        
        # Then we update the values for player 1
        h = torch.mm(w1,xold) + b1 # matrix-multiply x with input weight w1 and add bias
        h_sigmoid = h.sigmoid() # squash this with a sigmoid function
        y = torch.mm(w2,h_sigmoid) + b2 # multiply with the output weights w2 and add bias
        y_sigmoid = y.sigmoid() # squash the output
        delta2 = reward + gamma * 0 - y_sigmoid.detach().cpu().numpy()  # this is the usual TD error
        # using autograd and the constructed computational graph in pytorch, compute all gradients
        y_sigmoid.backward()
        # update the eligibility traces
        Z_w1 = gamma * lam * Z_w1 + w1.grad.data
        Z_b1 = gamma * lam * Z_b1 + b1.grad.data
        Z_w2 = gamma * lam * Z_w2 + w2.grad.data
        Z_b2 = gamma * lam * Z_b2 + b2.grad.data
        # zero the gradients
        w1.grad.data.zero_()
        b1.grad.data.zero_()
        w2.grad.data.zero_()
        b2.grad.data.zero_()
        # perform now the update of weights
        delta2 =  torch.tensor(delta2, dtype = torch.float, device = device)
        w1.data = w1.data + alpha1 * delta2 * Z_w1
        b1.data = b1.data + alpha1 * delta2 * Z_b1
        w2.data = w2.data + alpha2 * delta2 * Z_w2
        b2.data = b2.data + alpha2 * delta2 * Z_b2
Example #14
def learnitDyna(numgames, epsilon, lam_w, alpha_w, gamma, numthink):
    A = np.zeros(4)
    for games in range(0, numgames):
        board = BG.init_board()  # initialize the board
        player = np.random.randint(2) * 2 - 1  # which player begins?
        count = 0
        delta = 0
        # now we initialize all the eligibility traces for the neural network
        Z_w1 = torch.zeros(w1.size(), device=device, dtype=torch.float)
        Z_b1 = torch.zeros(b1.size(), device=device, dtype=torch.float)
        Z_w2 = torch.zeros(w2.size(), device=device, dtype=torch.float)
        Z_b2 = torch.zeros(b2.size(), device=device, dtype=torch.float)
        Z_w3 = torch.zeros(w3.size(), device=device, dtype=torch.float)
        Z_b3 = torch.zeros(b3.size(), device=device, dtype=torch.float)

        Z_w1_flip = torch.zeros(w1.size(), device=device, dtype=torch.float)
        Z_b1_flip = torch.zeros(b1.size(), device=device, dtype=torch.float)
        Z_w2_flip = torch.zeros(w2.size(), device=device, dtype=torch.float)
        Z_b2_flip = torch.zeros(b2.size(), device=device, dtype=torch.float)
        Z_w3_flip = torch.zeros(w3.size(), device=device, dtype=torch.float)
        Z_b3_flip = torch.zeros(b3.size(), device=device, dtype=torch.float)

        if games % 100 == 0:
            print(games)

        #play a game
        while not BG.game_over(board) and not BG.check_for_error(board):
            dice = BG.roll_dice()

            for i in range(1 + int(dice[0] == dice[1])):
                move = action(np.copy(board), epsilon, dice, player, i)

                if len(move) != 0:
                    for m in move:
                        board = BG.update_board(board, m, player)
                # rolled a double and the game was already won on the first move: break
                if BG.game_over(board):
                    break

            if BG.game_over(board):
                winner = player
                break

            if player == -1:
                board = flip_board(np.copy(board))

            if (count > 1):
                if player == -1:
                    #One-hot encoding of the board
                    move_fliptemp = move
                    x_fliptemp = ice_hot_encoding(board)
                    xflip = Variable(
                        torch.tensor(x_fliptemp,
                                     dtype=torch.float,
                                     device=device)).view(encSize, 1)

                    #Feed forward w-nn for old and new
                    target = feed_forward_w(xflip)
                    old_target = feed_forward_w(xflipold)
                    # the usual TD error
                    delta = (0 + gamma * target.detach().cpu().numpy()
                             - old_target.detach().cpu().numpy())
                    # using autograd and the constructed computational graph in pytorch, compute all gradients
                    old_target.backward()
                    # update the eligibility traces using the gradients
                    Z_w3_flip, Z_b3_flip, Z_w2_flip, Z_b2_flip, Z_w1_flip, Z_b1_flip = update_eligibility_w(
                        gamma, lam_w, Z_w1_flip, Z_b1_flip, Z_w2_flip,
                        Z_b2_flip, Z_w3_flip, Z_b3_flip)
                    # zero the gradients
                    zero_gradients_critic()
                    # perform now the update for the weights
                    delta = torch.tensor(delta,
                                         dtype=torch.float,
                                         device=device)
                    w1.data = w1.data + alpha_w * delta * Z_w1_flip
                    b1.data = b1.data + alpha_w * delta * Z_b1_flip
                    w2.data = w2.data + alpha_w * delta * Z_w2_flip
                    b2.data = b2.data + alpha_w * delta * Z_b2_flip
                    w3.data = w3.data + alpha_w * delta * Z_w3_flip
                    b3.data = b3.data + alpha_w * delta * Z_b3_flip
                    # append to the model, for the first time we create A, else we just stack on it.
                    if count == 2 and games == 0:
                        A = np.array([[x_fliptempold], [move], [x_fliptemp],
                                      0])
                    else:
                        add_to_model = np.array([[x_fliptempold], [move],
                                                 [x_fliptemp], 0])
                        A = np.vstack((A, add_to_model))

                else:
                    #One-hot encoding of the board
                    move_temp = move
                    x_temp = ice_hot_encoding(board)
                    x = Variable(
                        torch.tensor(x_temp, dtype=torch.float,
                                     device=device)).view(encSize, 1)

                    #Feed forward w-nn for old and new
                    target = feed_forward_w(x)
                    old_target = feed_forward_w(xold)
                    # the usual TD error
                    delta = (0 + gamma * target.detach().cpu().numpy()
                             - old_target.detach().cpu().numpy())
                    # using autograd and the constructed computational graph in pytorch, compute all gradients
                    old_target.backward()
                    # update the eligibility traces using the gradients
                    Z_w3, Z_b3, Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(
                        gamma, lam_w, Z_w1, Z_b1, Z_w2, Z_b2, Z_w3, Z_b3)
                    # zero the gradients
                    zero_gradients_critic()
                    # perform now the update for the weights
                    delta = torch.tensor(delta,
                                         dtype=torch.float,
                                         device=device)
                    w1.data = w1.data + alpha_w * delta * Z_w1
                    b1.data = b1.data + alpha_w * delta * Z_b1
                    w2.data = w2.data + alpha_w * delta * Z_w2
                    b2.data = b2.data + alpha_w * delta * Z_b2
                    w3.data = w3.data + alpha_w * delta * Z_w3
                    b3.data = b3.data + alpha_w * delta * Z_b3
                    # append to the model: the first time we create A, afterwards we just stack onto it
                    if count == 2 and games == 0:
                        A = np.array([[x_tempold], [move], [x_temp], 0],
                                     dtype=object)
                    else:
                        add_to_model = np.array([[x_tempold], [move], [x_temp], 0],
                                                dtype=object)
                        A = np.vstack((A, add_to_model))

                if count > 2:
                    for thought in range(0, numthink):
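                        # planning step: replay a randomly sampled transition from the
                        # model memory A (Dyna-style) and perform a one-step value update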
                        state_indx = np.random.choice(A.shape[0])
                        state, move_temp, statenew, rewardtemp = A[state_indx]

                        if statenew == 0:
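                            # a stored next state of 0 marks a terminal transition,
                            # so the target is just the stored reward (no bootstrapping)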
                            #Feed forward old state
                            state = Variable(
                                torch.tensor(state,
                                             dtype=torch.float,
                                             device=device)).view(encSize, 1)
                            old_target1 = feed_forward_w(state)
                            delta2 = rewardtemp + 0 - old_target1.detach().cpu().numpy()
                        else:
                            state = Variable(
                                torch.tensor(state,
                                             dtype=torch.float,
                                             device=device)).view(encSize, 1)
                            statenew = Variable(
                                torch.tensor(statenew,
                                             dtype=torch.float,
                                             device=device)).view(encSize, 1)
                            #Feed forward w-nn
                            target1 = feed_forward_w(statenew)
                            #Feed forward old state
                            old_target1 = feed_forward_w(state)
                            # the usual TD error: gamma * V(s') - V(s)
                            delta2 = 0 + gamma * target1.detach().cpu().numpy() \
                                - old_target1.detach().cpu().numpy()

                        # using autograd and the constructed computational graph in pytorch, compute all gradients
                        old_target1.backward()
                        # perform now the update for the weights (one-step update, no eligibility traces)
                        delta2 = torch.tensor(delta2, dtype=torch.float, device=device)
                        w1.data = w1.data + alpha_w * delta2 * w1.grad.data
                        b1.data = b1.data + alpha_w * delta2 * b1.grad.data
                        w2.data = w2.data + alpha_w * delta2 * w2.grad.data
                        b2.data = b2.data + alpha_w * delta2 * b2.grad.data
                        w3.data = w3.data + alpha_w * delta2 * w3.grad.data
                        b3.data = b3.data + alpha_w * delta2 * b3.grad.data
                        # zero the gradients only after they have been used for the update
                        zero_gradients_critic()

            if (count < 2):
                if player == -1:
                    x_fliptempold = ice_hot_encoding(board)
                    xflipold = Variable(
                        torch.tensor(ice_hot_encoding(board),
                                     dtype=torch.float,
                                     device=device)).view(encSize, 1)
                else:
                    x_tempold = ice_hot_encoding(board)
                    xold = Variable(
                        torch.tensor(ice_hot_encoding(board),
                                     dtype=torch.float,
                                     device=device)).view(encSize, 1)
            else:
                if player == -1:
                    x_fliptempold = x_fliptemp
                    xflipold = xflip.clone().detach().view(encSize, 1)
                else:
                    x_tempold = x_temp
                    xold = x.clone().detach().view(encSize, 1)
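            # in both branches above, the current position's encoding is cached as the
            # "old" state for this player's next TD update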

            if player == -1:
                board = flip_board(np.copy(board))
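                # presumably the board was flipped to the -1 perspective earlier in the
                # turn; flipping again here restores the shared orientation before the swap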
            # swap players
            player = -player
            count += 1

        if winner == 1:
            reward = 1
            reward_flip = 0
            move_temp = move
        else:
            reward = 0
            reward_flip = 1
            move_fliptemp = move

        # terminal update for player 1
        #Feed forward old state using w-NN
        old_target = feed_forward_w(xold)
        # TD error at the terminal state: no bootstrap term, delta = reward - V(s_old)
        delta = reward + 0 - old_target.detach().cpu().numpy()
        # using autograd and the constructed computational graph in pytorch, compute all gradients
        old_target.backward()
        # update the eligibility traces using the gradients
        delta = torch.tensor(delta, dtype=torch.float, device=device)
        Z_w3, Z_b3, Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(
            gamma, lam_w, Z_w1, Z_b1, Z_w2, Z_b2, Z_w3, Z_b3)
        # zero the gradients
        zero_gradients_critic()
        # perform the update for the weights for the critic, w
        w1.data = w1.data + alpha_w * delta * Z_w1
        b1.data = b1.data + alpha_w * delta * Z_b1
        w2.data = w2.data + alpha_w * delta * Z_w2
        b2.data = b2.data + alpha_w * delta * Z_b2
        w3.data = w3.data + alpha_w * delta * Z_w3
        b3.data = b3.data + alpha_w * delta * Z_b3

        # store the terminal transition for player 1 in the model memory
        add_to_model = np.array([[x_tempold], [move_temp], 0, reward], dtype=object)
        A = np.vstack((A, add_to_model))

        # terminal update for player -1 (flipped board representation)
        #Feed forward old state using w-NN
        flip_target = feed_forward_w(xflipold)
        # TD error at the terminal state: no bootstrap term, delta = reward_flip - V(s_old)
        delta = reward_flip + 0 - flip_target.detach().cpu().numpy()
        # using autograd and the constructed computational graph in pytorch, compute all gradients
        flip_target.backward()
        # update the eligibility traces using the gradients
        delta = torch.tensor(delta, dtype=torch.float, device=device)
        Z_w3_flip, Z_b3_flip, Z_w2_flip, Z_b2_flip, Z_w1_flip, Z_b1_flip = update_eligibility_w(
            gamma, lam_w, Z_w1_flip, Z_b1_flip, Z_w2_flip, Z_b2_flip,
            Z_w3_flip, Z_b3_flip)
        # zero the gradients
        zero_gradients_critic()
        # perform the update for the weights for the critic, w
        w1.data = w1.data + alpha_w * delta * Z_w1_flip
        b1.data = b1.data + alpha_w * delta * Z_b1_flip
        w2.data = w2.data + alpha_w * delta * Z_w2_flip
        b2.data = b2.data + alpha_w * delta * Z_b2_flip
        w3.data = w3.data + alpha_w * delta * Z_w3_flip
        b3.data = b3.data + alpha_w * delta * Z_b3_flip

        add_to_model = np.array([[x_fliptempold], [move_fliptemp], 0, reward_flip],
                                dtype=object)
        A = np.vstack((A, add_to_model))
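        # both terminal transitions are stored with a next-state placeholder of 0, so
        # the planning loop will treat them as terminal and use the stored reward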