def action(board, dice, oplayer, i = 0):
    flippedplayer = -1
    if (flippedplayer == oplayer): # view it from player 1 perspective
        board = flipped_agent.flip_board(np.copy(board))
        player = -oplayer # player now the other player +1
    else:
        player = oplayer
    possible_moves, possible_boards = Backgammon.legal_moves(board, dice, player)
    na = len(possible_boards)
    if (na == 0):
        return []
    xa = np.zeros((na, nx+1))
    va = np.zeros((na))
    for j in range(0, na):
        xa[j,:] = one_hot_encoding(possible_boards[j], i)
    x = Variable(torch.tensor(xa.transpose(), dtype = torch.float, device = device))
    # now do a forward pass to evaluate the boards' after-state values
    h = torch.mm(w1, x) + b1            # matrix-multiply x with input weights w1 and add bias
    h_sigmoid = h.sigmoid()             # squash this with a sigmoid function
    y = torch.mm(w2, h_sigmoid) + b2    # multiply with the output weights w2 and add bias
    va = y.sigmoid().detach().cpu()
    action = possible_moves[np.argmax(va)]
    if (flippedplayer == oplayer): # map this move back to the right view
        action = flipped_agent.flip_move(action)
    return action
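# The action function above relies on a one_hot_encoding(...) helper that is not shown
# in this section. The sketch below is a hypothetical reconstruction, assuming the
# length-29 board array used by Backgammon.py (positive counts for player 1, negative
# for player -1) and 6 count bins per position per player, which matches the 28*2*6
# input size used in learnit further down; the real helper (and its extra argument i)
# may well differ.
import numpy as np

def one_hot_encoding_sketch(board):
    # 28 board positions: the 24 points plus bar and borne-off slots for both players
    features = np.zeros((28, 2, 6))
    for pos in range(1, 29):
        n = int(board[pos])
        if n > 0:    # player 1 checkers on this position
            features[pos - 1, 0, min(n, 6) - 1] = 1
        elif n < 0:  # player -1 checkers on this position
            features[pos - 1, 1, min(-n, 6) - 1] = 1
    return features.flatten()  # 28 * 2 * 6 = 336 features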
def legal_moves(self, board, dice, player):
    if player == -1:
        board = FA.flip_board(np.copy(board))
    moves, boards = B.legal_moves(board=board, dice=dice, player=1)
    if len(boards) == 0:
        return [], []
    boards = np.vstack(boards)
    return moves, boards
def action(board_copy, dice, player, i):
    if player == -1:
        board_copy = FA.flip_board(np.copy(board_copy))
    possible_moves, possible_boards = B.legal_moves(board_copy, dice, 1)
    if len(possible_moves) == 0:
        return []
    action = AgentJ.sample_action(np.vstack(possible_boards))
    move = possible_moves[action]
    if player == -1:
        move = FA.flip_move(move)
    return move
def action(net, board_copy, dice, player, i):
    if player == -1:
        board_copy = flipped_agent.flip_board(board_copy) # flip the board
    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board_copy, dice, player=1)
    if len(possible_moves) == 0:
        return []
    # move selection is left unimplemented in this skeleton; an actual agent would
    # evaluate possible_boards with net here and pick the corresponding move
    move = []
    if player == -1:
        move = flipped_agent.flip_move(move) # flip the move back
    return move
def action(board, dice, oplayer):
    flippedplayer = -1
    if (flippedplayer == oplayer): # view it from player 1 perspective
        board = flipped_agent.flip_board(np.copy(board))
        player = -oplayer # player now the other player +1
    else:
        player = oplayer
    possible_moves, possible_boards = e_legal_moves(board, dice, 1)
    if len(possible_moves) == 0:
        return []
    # index = get_action(actor, possible_boards)
    index = epsilon_greedy(critic, possible_boards)
    action = possible_moves[index]
    if (flippedplayer == oplayer): # map this move back to the right view
        action = flipped_agent.flip_move(action)
    return action
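# epsilon_greedy(...) used above is defined elsewhere; below is a minimal sketch of the
# intended behaviour, assuming critic(board) returns a scalar after-state value estimate
# (this interface is an assumption; the real critic may expect an encoded feature vector
# or a whole batch of boards).
import numpy as np

def epsilon_greedy_sketch(critic, possible_boards, epsilon=0.1):
    if np.random.rand() < epsilon:
        # explore: pick a random legal after-state
        return np.random.randint(len(possible_boards))
    # exploit: pick the after-state the critic values highest
    values = np.array([critic(b) for b in possible_boards])
    return int(np.argmax(values))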
def action(board, dice, oplayer, nRoll = 0):
    flipped_player = -1
    if (flipped_player == oplayer):
        board = flipped_agent.flip_board(np.copy(board))
        player = -flipped_player
    else:
        player = oplayer
    race = c_int(israce(board))
    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board, dice, player)
    na = len(possible_moves)
    va = np.zeros(na)
    if (na == 0):
        return []
    for i in range(0, na):
        # evaluate each after-state with the pubeval benchmark function
        pe_board = pubeval_flip(possible_boards[i])
        pe_board = pe_board.astype(dtype = ctypes.c_int)
        va[i] = lib.pubeval(race, pe_board.ctypes.data_as(intp))
    action = possible_moves[np.argmax(va)]
    if (flipped_player == oplayer): # map this move back to the right view
        action = flipped_agent.flip_move(action)
    return action
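# The ctypes handles used above (lib, intp, c_int) are set up elsewhere, as are israce
# and pubeval_flip. A sketch of one possible setup, assuming Tesauro's pubeval.c has been
# compiled into a shared library (the filename is hypothetical):
import ctypes
from ctypes import c_int

lib = ctypes.CDLL("./pubeval.so")     # e.g. built with: gcc -shared -fPIC -o pubeval.so pubeval.c
lib.pubeval.restype = ctypes.c_float  # pubeval(race, pos[]) returns a float score
intp = ctypes.POINTER(ctypes.c_int)   # pointer type passed for the int board array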
def action(board, dice, oplayer, i=0):
    flippedplayer = -1
    if (flippedplayer == oplayer): # view it from player 1 perspective
        board = flipped_agent.flip_board(np.copy(board))
        player = -oplayer # player now the other player +1
    else:
        player = oplayer
    possible_moves, possible_boards = Backgammon.legal_moves(board, dice, player)
    # if there are no moves available
    if len(possible_moves) == 0:
        return []
    after_state, action = epsilon_nn_greedy(board, possible_moves, possible_boards, player)
    if (flippedplayer == oplayer): # map this move back to the right view
        action = flipped_agent.flip_move(action)
    return action
def swap_player(self):
    self.board = FA.flip_board(board_copy=np.copy(self.board))
def learnit(numgames, epsilon, lam, alpha, alpha1, alpha2, w1, b1, w2, b2):
    gamma = 1 # for completeness
    # play numgames games for training
    for games in range(0, numgames):
        board = Backgammon.init_board() # initialize the board
        # initialize all the eligibility traces for the neural network
        Z_w1 = torch.zeros(w1.size(), device = device, dtype = torch.float)
        Z_b1 = torch.zeros(b1.size(), device = device, dtype = torch.float)
        Z_w2 = torch.zeros(w2.size(), device = device, dtype = torch.float)
        Z_b2 = torch.zeros(b2.size(), device = device, dtype = torch.float)
        # the player to start is "1", the other player is "-1"
        player = 1
        otherplayer = -1
        winner = 0 # this implies a draw
        isGameOver = False
        moveNumber = 0
        while (isGameOver == False):
            dice = Backgammon.roll_dice()
            # use a policy to find an action; both players use the neural network
            # to approximate the after-state value
            if (player == otherplayer):
                # this player flips the board to find an action
                possible_moves, possible_boards = Backgammon.legal_moves(flipped_agent.flip_board(np.copy(board)), dice, -player)
                action = epsilon_nn_greedy(flipped_agent.flip_board(np.copy(board)), dice, -player, epsilon, w1, b1, w2, b2, possible_moves, possible_boards, False)
                action = flipped_agent.flip_move(action)
            else:
                # this player uses the original board
                possible_moves, possible_boards = Backgammon.legal_moves(board, dice, player)
                action = epsilon_nn_greedy(np.copy(board), dice, player, epsilon, w1, b1, w2, b2, possible_moves, possible_boards, False)
            # perform the move and update the board
            for i in range(0, len(action)):
                board = Backgammon.update_board(board, action[i], player)
            if (1 == Backgammon.game_over(board)): # has this player won?
                winner = player
                isGameOver = True
                break # bail out of the inner game loop
            # once both players have performed at least one move we can start doing updates
            if (1 < moveNumber):
                if otherplayer == player:
                    # here player -1 updates its value estimate (using the flipped board)
                    x_flipped = Variable(torch.tensor(one_hot_encoding(flipped_agent.flip_board(board)), dtype = torch.float, device = device)).view(28*2*6,1)
                    h = torch.mm(w1, x_flipped) + b1 # matrix-multiply x with input weights w1 and add bias
                    h_sigmoid = h.sigmoid()          # squash this with a sigmoid function
                    y = torch.mm(w2, h_sigmoid) + b2 # multiply with the output weights w2 and add bias
                    y_sigmoid = y.sigmoid()          # squash this with a sigmoid function
                    target = y_sigmoid.detach().cpu().numpy()
                    # also do a forward pass for the old board; this is the state we will update
                    h = torch.mm(w1, xold_flipped) + b1
                    h_sigmoid = h.sigmoid()
                    y = torch.mm(w2, h_sigmoid) + b2
                    y_sigmoid = y.sigmoid()
                    delta2 = 0 + gamma * target - y_sigmoid.detach().cpu().numpy() # the usual TD error
                else:
                    # here player 1 updates the neural network (2-layer feed-forward with sigmoid units)
                    x = Variable(torch.tensor(one_hot_encoding(board), dtype = torch.float, device = device)).view(28*2*6,1)
                    # do a forward pass to evaluate the new board's after-state value
                    h = torch.mm(w1, x) + b1         # matrix-multiply x with input weights w1 and add bias
                    h_sigmoid = h.sigmoid()          # squash this with a sigmoid function
                    y = torch.mm(w2, h_sigmoid) + b2 # multiply with the output weights w2 and add bias
                    y_sigmoid = y.sigmoid()          # squash this with a sigmoid function
                    target = y_sigmoid.detach().cpu().numpy()
                    # also do a forward pass for the old board; this is the state we will update
                    h = torch.mm(w1, xold) + b1
                    h_sigmoid = h.sigmoid()
                    y = torch.mm(w2, h_sigmoid) + b2
                    y_sigmoid = y.sigmoid()
                    delta2 = 0 + gamma * target - y_sigmoid.detach().cpu().numpy() # the usual TD error
                # using autograd and the constructed computational graph in PyTorch, compute all gradients
                y_sigmoid.backward()
                # update the eligibility traces using the gradients
                Z_w1 = gamma * lam * Z_w1 + w1.grad.data
                Z_b1 = gamma * lam * Z_b1 + b1.grad.data
                Z_w2 = gamma * lam * Z_w2 + w2.grad.data
                Z_b2 = gamma * lam * Z_b2 + b2.grad.data
                # zero the gradients
                w1.grad.data.zero_()
                b1.grad.data.zero_()
                w2.grad.data.zero_()
                b2.grad.data.zero_()
                # now perform the update of the weights
                delta2 = torch.tensor(delta2, dtype = torch.float, device = device)
                w1.data = w1.data + alpha1 * delta2 * Z_w1
                b1.data = b1.data + alpha1 * delta2 * Z_b1
                w2.data = w2.data + alpha2 * delta2 * Z_w2
                b2.data = b2.data + alpha2 * delta2 * Z_b2
            # keep track of the last board state visited by each player
            if otherplayer == player:
                xold_flipped = Variable(torch.tensor(one_hot_encoding(flipped_agent.flip_board(board)), dtype=torch.float, device = device)).view(28*2*6,1)
            else:
                xold = Variable(torch.tensor(one_hot_encoding(board), dtype=torch.float, device = device)).view(28*2*6,1)
            # swap players
            player = -player
            moveNumber = moveNumber + 1
        # the game episode has ended and we know its outcome, so we can compute the terminal rewards
        if winner == otherplayer:
            reward = 0
        elif winner == -otherplayer:
            reward = 1
        else:
            reward = 0.5
        # Now we perform the final updates (the terminal after-state value is zero).
        # These are basically the same updates as in the inner loop, but for the final
        # after-states (xold and xold_flipped).
        # First we update the value for player -1
        h = torch.mm(w1, xold_flipped) + b1 # matrix-multiply x with input weights w1 and add bias
        h_sigmoid = h.sigmoid()             # squash this with a sigmoid function
        y = torch.mm(w2, h_sigmoid) + b2    # multiply with the output weights w2 and add bias
        y_sigmoid = y.sigmoid()             # squash the output
        delta = (1.0 - reward) + gamma * 0 - y_sigmoid.detach().cpu().numpy()
        # using autograd and the constructed computational graph in PyTorch, compute all gradients
        y_sigmoid.backward()
        # update the eligibility traces
        Z_w1 = gamma * lam * Z_w1 + w1.grad.data
        Z_b1 = gamma * lam * Z_b1 + b1.grad.data
        Z_w2 = gamma * lam * Z_w2 + w2.grad.data
        Z_b2 = gamma * lam * Z_b2 + b2.grad.data
        # zero the gradients
        w1.grad.data.zero_()
        b1.grad.data.zero_()
        w2.grad.data.zero_()
        b2.grad.data.zero_()
        # now perform the update of the weights
        delta = torch.tensor(delta, dtype = torch.float, device = device)
        w1.data = w1.data + alpha1 * delta * Z_w1
        b1.data = b1.data + alpha1 * delta * Z_b1
        w2.data = w2.data + alpha2 * delta * Z_w2
        b2.data = b2.data + alpha2 * delta * Z_b2
        # Then we update the value for player 1
        h = torch.mm(w1, xold) + b1         # matrix-multiply x with input weights w1 and add bias
        h_sigmoid = h.sigmoid()             # squash this with a sigmoid function
        y = torch.mm(w2, h_sigmoid) + b2    # multiply with the output weights w2 and add bias
        y_sigmoid = y.sigmoid()             # squash the output
        delta2 = reward + gamma * 0 - y_sigmoid.detach().cpu().numpy() # the usual TD error
        # using autograd and the constructed computational graph in PyTorch, compute all gradients
        y_sigmoid.backward()
        # update the eligibility traces
        Z_w1 = gamma * lam * Z_w1 + w1.grad.data
        Z_b1 = gamma * lam * Z_b1 + b1.grad.data
        Z_w2 = gamma * lam * Z_w2 + w2.grad.data
        Z_b2 = gamma * lam * Z_b2 + b2.grad.data
        # zero the gradients
        w1.grad.data.zero_()
        b1.grad.data.zero_()
        w2.grad.data.zero_()
        b2.grad.data.zero_()
        # now perform the update of the weights
        delta2 = torch.tensor(delta2, dtype = torch.float, device = device)
        w1.data = w1.data + alpha1 * delta2 * Z_w1
        b1.data = b1.data + alpha1 * delta2 * Z_b1
        w2.data = w2.data + alpha2 * delta2 * Z_w2
        b2.data = b2.data + alpha2 * delta2 * Z_b2
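# One possible way to initialise the network parameters and run learnit. The sizes
# follow the 28*2*6 = 336 input encoding used above; the hidden-layer width, learning
# rates, epsilon, lambda and number of games are illustrative guesses, not the values
# used in the original experiments. This also assumes torch, Variable, device,
# one_hot_encoding and epsilon_nn_greedy are available at module level as in the listing.
nx, nh = 28 * 2 * 6, 40   # input size and (guessed) hidden-layer size
w1 = Variable(0.1 * torch.randn(nh, nx, device=device), requires_grad=True)
b1 = Variable(torch.zeros(nh, 1, device=device), requires_grad=True)
w2 = Variable(0.1 * torch.randn(1, nh, device=device), requires_grad=True)
b2 = Variable(torch.zeros(1, 1, device=device), requires_grad=True)
learnit(numgames=1000, epsilon=0.05, lam=0.7, alpha=0.01, alpha1=0.01, alpha2=0.01,
        w1=w1, b1=b1, w2=w2, b2=b2)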
def play_a_game(opponent, commentary = False):
    board = init_board() # initialize the board
    player = np.random.randint(2)*2 - 1 # which player begins?
    y_old = 0
    y_old_p2 = 0
    firstMove = True
    firstMove_p2 = True
    pickle_in = open("randommodel.pickle", "rb")
    model = pickle.load(pickle_in)
    pickle_in.close()
    model = model.cuda()
    # play on
    while not game_over(board) and not check_for_error(board):
        if commentary:
            print("let's go, player", player)
        # roll the dice
        dice = roll_dice()
        if commentary:
            print("rolled dice:", dice)
        # make a move (2 moves if the same number appears on both dice)
        for i in range(1 + int(dice[0] == dice[1])):
            board_copy = np.copy(board)
            # make the move (agent vs agent):
            if (opponent == "agent"):
                if player == 1:
                    move, y_old = agent.action(board_copy, dice, player, i, y_old, model, firstMove, True)
                    # update the board
                    if len(move) != 0:
                        for m in move:
                            board = update_board(board, m, player)
                    if (firstMove):
                        firstMove = False
                elif player == -1:
                    flipped_board = flipped_agent.flip_board(board_copy)
                    move, y_old_p2 = agent.action(flipped_board, dice, 1, i, y_old_p2, model, firstMove_p2, True)
                    if len(move) != 0:
                        for m in move:
                            flipped_board = update_board(flipped_board, m, 1)
                        board = flipped_agent.flip_board(flipped_board)
                    if (firstMove_p2):
                        firstMove_p2 = False
            # agent vs human:
            elif (opponent == "human"):
                pretty_print(board)
                if player == 1:
                    print("Computer's turn")
                    move, y_old = agent.action(board_copy, dice, player, i, y_old, model, firstMove, False)
                    print("Computer's move", move)
                elif player == -1:
                    print("Human's turn")
                    possible_moves, possible_boards = legal_moves(board_copy, dice, player)
                    print("dice:", dice)
                    printing.moves_to_string(possible_moves)
                    text = input("prompt")
                    move = possible_moves[int(text)]
                if len(move) != 0:
                    for m in move:
                        board = update_board(board, m, player)
            # agent vs the random agent:
            elif (opponent == "random"):
                if player == 1:
                    move, y_old = agent.action(board_copy, dice, player, i, y_old, model, firstMove, False)
                elif player == -1:
                    move = random_agent(board_copy, dice, player, i)
                if len(move) != 0:
                    for m in move:
                        board = update_board(board, m, player) # update the board
            # give status after every move:
            if commentary:
                print("move from player", player, ":")
                pretty_print(board)
        # players take turns
        player = -player
    # return the winner
    winner = -1 * player
    if (opponent == "agent"):
        if (winner == 1):
            agent.learn(y_old, model, board_copy, "yes")
            agent.learn(y_old_p2, model, board_copy, "no")
        else:
            agent.learn(y_old, model, board_copy, "no")
            agent.learn(y_old_p2, model, board_copy, "yes")
    # print("Winner is player", winner)
    pickle_out = open("randommodel.pickle", "wb")
    pickle.dump(model, pickle_out)
    pickle_out.close()
    return winner
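# A minimal driver sketch for play_a_game: a batch of self-play training games followed
# by an evaluation run against the random opponent, counting how often player 1 wins.
# The game counts are arbitrary; note that play_a_game itself loads and re-saves the
# model from randommodel.pickle on every call, exactly as written above.
n_train, n_eval = 100, 50
for _ in range(n_train):
    play_a_game("agent")
wins = sum(play_a_game("random") == 1 for _ in range(n_eval))
print("win rate vs random agent:", wins / n_eval)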