def action(net, board_copy, dice, player, i):
    # the champion to be
    # inputs are the board, the dice and which player is to move
    # outputs the chosen move according to its policy
    if player == -1:
        board_copy = flip_board(board_copy)  # flip the board so the agent always sees itself as player 1

    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board_copy, dice, player=1)

    # if there are no moves available
    if len(possible_moves) == 0:
        return []

    # create features for every after-state using the Tesauro-style encoding
    feature_boards = []
    for b in possible_boards:
        feature_boards.append(oneHot(b))

    # get the probability of each action via the actor's forward pass
    probs, log_probs = net.actor.forward(feature_boards)

    # index is a helper array for sampling; detach to numpy because
    # numpy.random.choice cannot consume torch tensors directly
    index = np.arange(0, len(possible_boards))
    probs = probs.detach().numpy()

    # sample the index of the chosen action
    i = choice(index, p=probs)
    move = possible_moves[i]       # pick the next move according to the sampled index
    newBoard = possible_boards[i]  # pick the next board according to the sampled index
    newBoardFeatures = oneHot(newBoard)

    # critic feed-forward
    target, oldtarget = net.critic.forward(newBoardFeatures, oneHot(board_copy))

    R = 0
    if Backgammon.game_over(newBoard):
        # did I win? If so the reward is +1 and the terminal state has value 0
        R = 1
        target = 0

    # now we update the neural network with the TD error
    delta = R + net.gamma * target - oldtarget

    # update the critic via backpropagation
    net.critic.backward(R, delta, net.gamma)
    # update the actor via backpropagation
    net.actor.backward(log_probs[i], delta, net.gamma)

    if player == -1:
        move = flip_move(move)  # flip the move back to the original perspective
    return move
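# Illustrative sketch only (not part of the original code): action() above assumes an
# actor whose forward() returns (probs, log_probs) over the candidate after-states.
# One minimal way such an actor could look, using a single linear scoring layer followed
# by a softmax, is shown below; the class name, sizes and the absence of a backward()
# are assumptions here, not the author's implementation.
import numpy as np
import torch

class TinyActorSketch:
    def __init__(self, n_features):
        # one weight vector that scores each encoded after-state
        self.theta = torch.zeros(n_features, dtype=torch.double, requires_grad=True)

    def forward(self, feature_boards):
        # feature_boards: list of 1-D numpy feature vectors, one per legal after-state
        x = torch.tensor(np.vstack(feature_boards), dtype=torch.double)  # (n_actions, n_features)
        scores = x @ self.theta                                          # (n_actions,)
        log_probs = torch.log_softmax(scores, dim=0)
        return torch.exp(log_probs), log_probs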
def ExamplePolicy(self):
    # evaluate the actor policy on the after-states of a single opening roll
    _, st = B.legal_moves(B.init_board(), B.roll_dice(), 1)
    st = np.vstack(st)
    st = st[:, 1:]
    # run the policy network on the encoded after-states and round to two decimals
    out = np.round(self._s.run(self._actor_policy, {self._possible_states: st}) * 100) / 100
    out = out.flatten()
    out.sort()
    # return the action probabilities in descending order
    return out[::-1]
def do(self, board_real, dice, actor_theta, player):
    commentary = False
    print_results = False
    for game in range(0, 25):
        board = np.copy(board_real)
        old_state = np.copy(board_real)
        self.z = np.zeros(198)
        if len(board) == 0:
            break
        count = 0
        while not Backgammon.game_over(board) and not Backgammon.check_for_error(board):
            if commentary:
                print("Simulation game: let's go player ", player)
            dice = Backgammon.roll_dice()
            if commentary:
                print("Simulation game: rolled dice:", dice)
            # make a move (2 moves if the same number appears on the dice)
            for i in range(1 + int(dice[0] == dice[1])):
                board_copy = np.copy(board)
                if player == 1:
                    move, new_state = self.nextMove(board_copy, dice, player, actor_theta)
                elif player == -1:
                    move = agentX.action(board_copy, dice, player, i)
                if len(move) != 0:
                    for m in move:
                        board = Backgammon.update_board(board, m, player)
            if player == 1 and count > 1:
                new_state = np.copy(board)
                if not Backgammon.game_over(new_state) and not Backgammon.check_for_error(new_state):
                    # TD(lambda) update of the critic weights theta with eligibility trace z
                    delta = 0 + self.getValue(new_state, actor_theta, player) - self.getValue(old_state, actor_theta, player)
                    self.theta = self.theta + (self.alpha * delta * self.z)
                    self.z = self.lamb * self.z + getFeatures(old_state, player)
                    old_state = new_state
            if commentary:
                print("Simulation game: move from player", player, ":")
                Backgammon.pretty_print(board)
            player = -player
            count = count + 1
        if print_results:
            print("simulation game nr", game)
            Backgammon.pretty_print(board)
        # terminal update: the reward is +1 or -1 depending on who is left to move
        delta = player * -1 + 0 - self.getValue(old_state, actor_theta, player)
        self.theta = np.add(self.theta, (self.alpha * delta * self.z))
        self.z = self.lamb * self.z + getFeatures(old_state, player)
def learnit(numGames, agent):
    numWins = []
    for g in tqdm(range(numGames)):
        if g % 1000 == 0:
            # periodically evaluate the agent against a random player
            wins = compete(agent)
            numWins.append(wins)
        board = Backgammon.init_board()
        agent.zero_el()
        # which player begins?
        if 0 == np.random.randint(2):
            player = 1
        else:
            player = -1
        moveNr = 0
        isGameOver = False
        while not isGameOver:
            dice = Backgammon.roll_dice()
            for repeat in range(1 + int(dice[0] == dice[1])):
                action = agent.greedy_action(np.copy(board), dice, player, repeat)
                for i in range(0, len(action)):
                    board = Backgammon.update_board(board, action[i], player)
            R = 0
            if 1 == Backgammon.game_over(board):
                R = 1.0 if player == 1 else 0
                isGameOver = True
            if (1 < moveNr) and (len(action) > 0):
                agent.update(player, R, board, isGameOver)
            if len(action) > 0:
                if player == 1:
                    agent.xold = board
                else:
                    agent.xoldF = flip_board(board)
            player = -player
            moveNr += 1
    # plot the evaluation results
    x = np.arange(0, numGames, 1000)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.set_xlabel("Number of games")
    ax.set_ylabel("Wins against a random player")
    ax.plot(x, numWins)
def action(board_copy, dice, player, i):
    # the champion to be
    # inputs are the board, the dice and which player is to move
    # outputs the chosen move according to its policy

    # starts by flipping the board so that the player always sees himself as player 1
    if player == -1:
        board_copy = flip_board(board_copy)

    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board_copy, dice, player=1)

    # if there are no moves available, return an empty move
    if len(possible_moves) == 0:
        return []

    # make the best move:
    # policy missing, returns a random move for the time being
    move = possible_moves[np.random.randint(len(possible_moves))]

    # if the table was flipped the move has to be flipped as well
    if player == -1:
        move = flip_move(move)
    return move
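# Hedged example (an assumption, not the author's code): one way the random placeholder
# above could be replaced once a value function is available. evaluate_board is a
# hypothetical callable that scores an after-state from player 1's point of view.
import numpy as np

def greedy_move_sketch(possible_moves, possible_boards, evaluate_board):
    # pick the move whose resulting board has the highest estimated value
    values = [evaluate_board(b) for b in possible_boards]
    return possible_moves[int(np.argmax(values))]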
def greedy_action(self, board, dice, player, i):
    if player == -1:
        board = flip_board(board)

    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board, dice, player=1)

    # if there are no moves available, return an empty move
    if len(possible_moves) == 0:
        return []

    # encode every after-state and evaluate them all in one forward pass
    na = len(possible_boards)
    enc = np.zeros((na, 312))
    for j in range(0, na):
        enc[j, :] = oneHot(possible_boards[j])
    x = Variable(torch.tensor(enc.transpose(), dtype=torch.double, device=self.device))
    h = torch.mm(self.w1, x) + self.b1
    h_sigmoid = h.sigmoid()
    y = torch.mm(self.W, h_sigmoid) + self.B
    va = y.sigmoid().detach().cpu()

    # pick the move with the highest estimated after-state value
    action = possible_moves[np.argmax(va)]
    if player == -1:
        action = flip_move(action)
    return action
def action(board, dice, oplayer, i=0):
    flippedplayer = -1
    if flippedplayer == oplayer:
        # view it from player 1's perspective
        board = flipped_agent.flip_board(np.copy(board))
        player = -oplayer  # player is now the other player, +1
    else:
        player = oplayer

    possible_moves, possible_boards = Backgammon.legal_moves(board, dice, player)
    na = len(possible_boards)
    if na == 0:
        return []

    xa = np.zeros((na, nx + 1))
    va = np.zeros(na)
    for j in range(0, na):
        xa[j, :] = one_hot_encoding(possible_boards[j], i)

    x = Variable(torch.tensor(xa.transpose(), dtype=torch.float, device=device))
    # now do a forward pass to evaluate the boards' after-state values
    h = torch.mm(w1, x) + b1          # matrix-multiply x with the input weights w1 and add the bias
    h_sigmoid = h.sigmoid()           # squash this with a sigmoid function
    y = torch.mm(w2, h_sigmoid) + b2  # multiply with the output weights w2 and add the bias
    va = y.sigmoid().detach().cpu()

    action = possible_moves[np.argmax(va)]
    if flippedplayer == oplayer:
        # map this move back to the original view
        action = flipped_agent.flip_move(action)
    return action
def action(board_copy, dice, player, i):
    # the champion to be
    # inputs are the board, the dice and which player is to move
    # outputs the chosen move according to its policy

    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board_copy, dice, player)

    # if there are no moves available
    if len(possible_moves) == 0:
        return []

    # make the best move according to the epsilon-greedy neural-network policy
    move = epsilon_nn_greedy(board_copy, player, epsilon, w1, b1, w2, b2, debug)
    return move
def main():
    winners = {}
    winners["1"] = 0
    winners["-1"] = 0

    # collecting stats of the games
    nGames = 1000  # how many games?
    arr = np.zeros(nGames)
    for g in range(nGames):
        winner = Backgammon.play_a_game(commentary=False, net=new_agent)
        winners[str(winner)] += 1
        arr[g] = winner
        if g % 50 == 0:
            print(new_agent.torch_nn_policy.theta)

    # save the raw results
    file = open('Failed.py', 'w')
    file.write(np.array_str(arr))
    file.close()

    print("Out of", nGames, "games,")
    print("player", 1, "won", winners["1"], "times and")
    print("player", -1, "won", winners["-1"], "times")
def action(board_copy, dice, player, i, model):
    global actionCount

    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board_copy, dice, player)

    # if there are no moves available
    if len(possible_moves) == 0:
        return []

    # pick an action epsilon-greedily with the neural network
    after_state, action = epsilon_nn_greedy(board_copy, possible_moves, possible_boards, player, model)

    # learn from the transition (skip the very first move of the game)
    if actionCount > 0:
        model.updateNeural(after_state)
    if actionCount > 1:
        model.dynaUpdate()
    actionCount += 1

    # remember the chosen after-state for the next update
    model.xold = Variable(
        torch.tensor(one_hot_encoding(after_state), dtype=torch.float,
                     device=model.device)).view((28 * 31, 1))
    return action
def action(board_copy, epsilon, dice, player, i):
    if player == -1:
        board_copy = flip_board(board_copy)

    possible_moves, possible_boards = BG.legal_moves(board_copy, dice, player=1)
    na = len(possible_moves)
    va = np.zeros(na)
    j = 0

    # if there are no moves available
    if na == 0:
        return []

    # with probability epsilon pick a random (explorative) move
    if np.random.uniform() < epsilon:
        move = possible_moves[randrange(na)]
        if player == -1:
            move = flip_move(move)
        return move

    for board in possible_boards:
        # encode the board to create the input
        x = Variable(
            torch.tensor(ice_hot_encoding(board), dtype=torch.float,
                         device=device)).view(encSize, 1)
        # now do a forward pass to evaluate the board's after-state value
        va[j] = feed_forward_w(x)
        j += 1

    move = possible_moves[np.argmax(va)]
    if player == -1:
        move = flip_move(move)
    return move
def action(board_copy, dice, player, i, model):
    global actionCount

    # starts by flipping the board so that the player always sees himself as player 1
    if player == -1:
        board_copy = flip_board(board_copy)

    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board_copy, dice, player=1)

    # if there are no moves available, return an empty move
    if len(possible_moves) == 0:
        return []

    # make the best move according to the epsilon-greedy neural-network policy
    after_state, action = epsilon_nn_greedy(board_copy, possible_moves, possible_boards, player, model)

    # learn from the transition (skip the very first move of the game)
    if actionCount > 0:
        model.updateNeural(after_state)
    if actionCount > 1:
        model.dynaUpdate()
    actionCount += 1

    # remember the chosen after-state for the next update
    model.xold = Variable(
        torch.tensor(one_hot_encoding(after_state), dtype=torch.float,
                     device=model.device)).view((28 * 31, 1))

    # if the table was flipped the move has to be flipped as well
    move = action
    if player == -1:
        move = flip_move(action)
    return move
def main():
    winners = {}
    winners["1"] = 0
    winners["-1"] = 0

    # collecting stats of the games
    nGames = 10000  # how many games?
    arr = np.zeros(nGames)
    for g in tqdm(range(nGames)):
        # zero the eligibility traces (as in the pseudocode)
        agent.actor.zero_el()
        agent.critic.zero_el()
        winner = Backgammon.play_a_game(commentary=False, net=agent)
        winners[str(winner)] += 1
        arr[g] = winner

    # save the agent
    file_net = open('saved_net_one_2', 'wb')
    pickle.dump(agent, file_net)
    file_net.close()

    print("Out of", nGames, "games,")
    print("player", 1, "won", winners["1"], "times and")
    print("player", -1, "won", winners["-1"], "times")
def main():
    ranges = 1
    winners = {}
    winners["1"] = 0
    winners["-1"] = 0

    # collecting stats of the games
    nGames = 1000  # how many games?
    arr = np.zeros(nGames)
    for g in tqdm(range(nGames)):
        winner = Backgammon.play_a_game(commentary=False, net=agent, train=train)
        winners[str(winner)] += 1
        arr[g] = winner
        if g % 10 == 0:
            print(agent.actor.theta)
            k = winners["1"]
            print("winrate is %f" % (k / (g + 0.00000001)))

    # save the agent
    if train is True:
        file_net = open('saved_net_one', 'wb')
        pickle.dump(agent.actor.theta, file_net)
        file_net.close()

    print("Out of", ranges, nGames, "games,")
    print("player", 1, "won", winners["1"], "times and")
    print("player", -1, "won", winners["-1"], "times")
def action(board_copy, dice, player, i):
    # the champion to be
    # inputs are the board, the dice and which player is to move
    # outputs the chosen move according to its policy

    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board_copy, dice, player)

    # if there are no moves available
    if len(possible_moves) == 0:
        return []

    # make the best move according to the trained epsilon-greedy policy
    epsilon = 0.1
    w1 = torch.load('./w1_trained.pth', map_location=lambda storage, loc: storage)
    w2 = torch.load('./w2_trained.pth', map_location=lambda storage, loc: storage)
    b1 = torch.load('./b1_trained.pth', map_location=lambda storage, loc: storage)
    b2 = torch.load('./b2_trained.pth', map_location=lambda storage, loc: storage)

    move = neural_network_agent.epsilon_nn_greedy(board_copy, dice, player, epsilon,
                                                  w1, b1, w2, b2,
                                                  possible_moves, possible_boards, False)
    return move
def action(board_copy, dice, player, i):
    global count
    # the champion to be
    # inputs are the board, the dice and which player is to move
    # outputs the chosen move according to its policy

    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board_copy, dice, player)

    # if there are no moves available
    if len(possible_moves) == 0:
        return []

    # make the best move according to the policy
    na = len(possible_moves)
    va = np.zeros(na)
    for j in range(0, na):
        board = possible_boards[j]
        # encode the board to create the input
        x = Variable(
            torch.tensor(one_hot_encoding(board), dtype=torch.float,
                         device=device)).view(29, 31)
        # now do a forward pass to evaluate the board's after-state value
        h = torch.mm(w1, x) + b1          # matrix-multiply x with the input weights w1 and add the bias
        h_sigmoid = h.sigmoid()           # squash this with a sigmoid function
        y = torch.mm(w2, h_sigmoid) + b2  # multiply with the output weights w2 and add the bias
        y_sigmoid = y.sigmoid()
        z = torch.mm(y_sigmoid, w3) + b3
        va[j] = z.sigmoid()

    count += 1
    best = np.argmax(va)
    if not Backgammon.game_over(possible_boards[best]):
        update(possible_boards[best])
    else:
        # terminal after-state: the reward is 1 if player 1 just won, 0 otherwise
        reward = 1 if player == 1 else 0
        update(possible_boards[best], reward)
    return possible_moves[best]
def legal_moves(self, dice, player):
    moves, boards = B.legal_moves(board=self.board, dice=dice, player=player)
    if len(boards) == 0:
        return [], []
    boards = np.vstack(boards)
    return moves, boards
def legal_moves(self, board, dice, player):
    if player == -1:
        board = FA.flip_board(np.copy(board))
    moves, boards = B.legal_moves(board=board, dice=dice, player=1)
    if len(boards) == 0:
        return [], []
    boards = np.vstack(boards)
    return moves, boards
def play_a_game_random(commentary=False):
    board = BG.init_board()                # initialize the board
    player = np.random.randint(2) * 2 - 1  # which player begins?
    randomPlayer = -1
    while not BG.game_over(board) and not BG.check_for_error(board):
        if commentary:
            print("let's go player ", player)

        # roll dice
        dice = BG.roll_dice()
        if commentary:
            print("rolled dice:", dice)

        # make a move (2 moves if the same number appears on the dice)
        for i in range(1 + int(dice[0] == dice[1])):
            board_copy = np.copy(board)
            if player == randomPlayer:
                move = flipped_agent.action(board_copy, dice, player, i)
            else:
                move = action(board_copy, dice, player, i)

            # update the board
            if len(move) != 0:
                for m in move:
                    board = BG.update_board(board, m, player)

            # give status after every move:
            if commentary:
                print("move from player", player, ":")
                BG.pretty_print(board)

        # players take turns
        player = -player

    # return the winner
    return -1 * player
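# Illustrative usage (an assumption, not part of the original code): tally winners over
# a batch of games played with play_a_game_random(), mirroring the stats-collection
# loops used elsewhere in this listing.
if __name__ == "__main__":
    winners = {"1": 0, "-1": 0}
    for _ in range(100):
        winners[str(play_a_game_random())] += 1
    print("trained agent wins:", winners["1"], "- random agent wins:", winners["-1"])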
def evaluate(agent, evaluation_agent, n_eval, n_games):
    wins = 0
    for i in range(n_eval):
        winner, board = Backgammon.play_a_game(agent, evaluation_agent)
        wins += int(winner == 1)
    winrate = round(wins / n_eval * 100, 3)
    print("Win-rate after training for " + str(n_games) + " games: " + str(winrate) + "%")
    return winrate
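# Possible call site (names here are illustrative assumptions, not the author's code):
# evaluate the agent against a fixed opponent after each block of training games.
# winrate_history = [evaluate(agent, evaluation_agent, n_eval=100, n_games=g) for g in checkpoints]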
def PlayPubEval(self, test_games=1):
    wins = []
    for _ in range(test_games):
        env = backgammon()
        done = False
        while not done:
            # our agent (player 1) moves first
            dice = B.roll_dice()
            for _ in range(1 + int(dice[0] == dice[1])):
                possible_moves, possible_boards = env.legal_moves(dice, 1)
                n_actions = len(possible_moves)
                if n_actions == 0:
                    break
                action = self.sample_action(possible_boards)
                old_board, new_board, reward, done = env.step(possible_moves[action], player=1)
                if done:
                    break
            if not done:
                # pubeval (player -1) replies
                dice = B.roll_dice()
                for __ in range(1 + int(dice[0] == dice[1])):
                    action = pubeval.agent_pubeval(np.copy(env.board), dice, oplayer=-1)
                    old_board, new_board, reward, done = env.step(action, player=-1)
                    if B.check_for_error(env.board):
                        raise RuntimeError("pubeval produced an illegal board")
                    if done:
                        reward = 0
                        break
        wins.append(float(reward == 1))
    return np.mean(wins)
def e_legal_moves(board, dice, player=1):
    moves, boards = B.legal_moves(board, dice=dice, player=player)
    if len(boards) == 0:
        return [], features(board, player)
    n_boards = np.shape(boards)[0]
    tesauro = np.zeros((n_boards, 198))
    for b in range(n_boards):
        tesauro[b, :] = features(boards[b], player)
    tesauro = np.array(tesauro)
    return moves, tesauro
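# Hedged sketch (an assumption, not the course code) of a Tesauro-style 198-feature
# encoding like the features() helper used above. It assumes the usual board layout of
# this project: board[1..24] hold the signed checker counts on the points, board[25] and
# board[26] the two bars, board[27] and board[28] the borne-off checkers. The actual
# helper may differ in layout and normalisation.
import numpy as np

def features_sketch(board, player):
    f = np.zeros(198)
    for point in range(1, 25):
        for sign, offset in ((1, 0), (-1, 96)):
            n = max(sign * board[point], 0)      # checkers of this colour on the point
            base = offset + (point - 1) * 4
            f[base + 0] = float(n >= 1)
            f[base + 1] = float(n >= 2)
            f[base + 2] = float(n >= 3)
            f[base + 3] = (n - 3) / 2.0 if n > 3 else 0.0
    f[192] = abs(board[25]) / 2.0                # player 1 checkers on the bar
    f[193] = abs(board[26]) / 2.0                # player -1 checkers on the bar
    f[194] = abs(board[27]) / 15.0               # player 1 checkers borne off
    f[195] = abs(board[28]) / 15.0               # player -1 checkers borne off
    f[196] = float(player == 1)                  # whose turn it is
    f[197] = float(player == -1)
    return f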
def action(net, board_copy, dice, player, i, learn=True):
    # the champion to be
    # inputs are the board, the dice and which player is to move
    # outputs the chosen move according to its policy
    if player == -1:
        board_copy = flip_board(board_copy)  # flip the board

    # keep track of the previous and current after-states for each colour
    if player == 1:
        xold = net.xold
        net.xnew = board_copy
    else:
        xold = net.xFlipOld
        net.xFlipNew = board_copy

    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board_copy, dice, player=1)

    # if there are no moves available
    if len(possible_moves) == 0:
        return []

    one_hot = []
    for b in possible_boards:
        one_hot.append(oneHot(b))

    # learn from the previous transition before choosing the next move
    if learn:
        if not net.firstMove:
            net.update(player)

    # the actor picks a move index and returns the feature statistics for the update
    m, xtheta = net.actor(one_hot, possible_moves)
    if player == 1:
        net.xtheta = xtheta
    else:
        net.flipxtheta = xtheta

    move = possible_moves[m]
    newBoard = possible_boards[m]

    if player == -1:
        move = flip_move(move)  # flip the move back

    # remember the board for the next update
    if player == 1:
        net.xold = board_copy
    else:
        net.xFlipOld = board_copy
    net.firstMove = False
    return move
def step(self, move, player=1):
    old_board = np.copy(self.board)
    if len(move) != 0:
        for m in move:
            self.board = B.update_board(board=self.board, move=m, player=player)
    reward = 0
    self.done = False
    if self.iswin():
        reward = player
        self.done = True
    return old_board, np.copy(self.board), reward, self.done
def action(board_copy, dice, player, i, learning=False):
    if player == -1:
        board_copy = flip_board(board_copy)

    # get every possible move and board
    xtheta_mean = torch.zeros((len(theta), 1))
    possible_moves, possible_boards = BG.legal_moves(board_copy, dice, player=1)
    na = len(possible_moves)
    one_hot_boards = np.zeros((2 * (n - 1) * 7, na))
    j = 0

    # if there are no moves available
    if len(possible_moves) == 0:
        x = Variable(
            torch.tensor(ice_hot_encoding(board_copy), dtype=torch.float,
                         device=device)).view(2 * (n - 1) * 7, 1)
        h_sigmoid = feed_forward_th(x)
        pi = torch.mm(theta, h_sigmoid).softmax(0)
        xtheta_mean = h_sigmoid * pi.item()
        if learning:
            return [], xtheta_mean
        else:
            return []

    for board in possible_boards:
        # encode the board to create the input for the NN
        x = Variable(
            torch.tensor(ice_hot_encoding(board), dtype=torch.float,
                         device=device)).view(2 * (n - 1) * 7, 1)
        one_hot_boards[:, j] = x[:, 0]
        j += 1

    # select the move from a distribution (softmax policy over the after-states)
    X = Variable(torch.tensor(one_hot_boards, dtype=torch.float, device=device))
    h = feed_forward_th(X)
    h_sigmoid = h.sigmoid()
    pi = torch.mm(theta, h_sigmoid).softmax(1)
    xtheta_mean = torch.sum(torch.mm(h_sigmoid, torch.diagflat(pi)), 1)
    xtheta_mean = torch.unsqueeze(xtheta_mean, 1)
    move_index = torch.multinomial(pi, num_samples=1)
    move = possible_moves[move_index]

    if player == -1:
        move = flip_move(move)
    if learning:
        return move, xtheta_mean
    return move
def PlayRandomAgent(self, test_games=1):
    wins = []
    for _ in range(test_games):
        env = backgammon()
        done = False
        while not done:
            # our agent (player 1) moves first
            dice = B.roll_dice()
            for __ in range(1 + int(dice[0] == dice[1])):
                possible_moves, possible_boards = env.legal_moves(dice, 1)
                n_actions = len(possible_moves)
                if n_actions == 0:
                    break
                action = self.sample_action(possible_boards)
                old_board, new_board, reward, done = env.step(possible_moves[action])
                if done:
                    break
            if not done:
                # the random opponent replies
                dice = B.roll_dice()
                for _ in range(1 + int(dice[0] == dice[1])):
                    old_board, new_board, reward, done = env.make_move(dice)
                    if done:
                        reward = 0
                        break
        wins.append(float(reward == 1))
    return np.mean(wins)
def action(board_copy, dice, player, i):
    # the champion to be
    # inputs are the board, the dice and which player is to move
    # outputs the chosen move according to its policy
    move = []

    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board_copy, dice)

    # make the best move according to the policy
    if len(possible_moves) != 0:
        move = policy(possible_moves, possible_boards, dice, i)
    return move
def action(board_copy, dice, player, i):
    if player == -1:
        board_copy = FA.flip_board(np.copy(board_copy))
    possible_moves, possible_boards = B.legal_moves(board_copy, dice, 1)
    if len(possible_moves) == 0:
        return []
    action = AgentJ.sample_action(np.vstack(possible_boards))
    move = possible_moves[action]
    if player == -1:
        move = FA.flip_move(move)
    return move
def compete(agent):
    winners = {}
    winners["1"] = 0
    winners["-1"] = 0
    for g in range(100):
        board = Backgammon.init_board()
        # which player begins?
        if 0 == np.random.randint(2):
            player = 1
        else:
            player = -1
        isGameOver = False
        while not isGameOver:
            dice = Backgammon.roll_dice()
            for repeat in range(1 + int(dice[0] == dice[1])):
                if player == -1:
                    action = Backgammon.random_agent(np.copy(board), dice, player, repeat)
                else:
                    action = agent.greedy_action(np.copy(board), dice, player, repeat)
                for i in range(0, len(action)):
                    board = Backgammon.update_board(board, action[i], player)
                if 1 == Backgammon.game_over(board):
                    winner = player
                    isGameOver = True
                    break
            player = -player
        winners[str(winner)] += 1

    print("Out of", 100, "games,")
    print("player", 1, "won", winners["1"], "times and")
    print("player", -1, "won", winners["-1"], "times")
    return winners["1"]
def epsilon_nn_greedy(board, player, epsilon, w1, b1, w2, b2, debug=False):
    moves = Backgammon.legal_moves(board)
    # with probability epsilon pick a random (explorative) move
    if np.random.uniform() < epsilon:
        if debug:
            print("explorative move")
        return np.random.choice(moves, 1)
    na = np.size(moves)
    va = np.zeros(na)
    for i in range(0, na):
        board[moves[i]] = player
        # encode the board to create the input (the features are x)
        # the after-state evaluation is left unfinished in the source:
        # va[i] = y.sigmoid()
    return moves[np.argmax(va)]
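# Hedged sketch (an assumption, not the author's code) of the evaluation step left
# unfinished above, following the two-layer forward pass the other agents in this
# listing use (hidden layer w1/b1, output layer w2/b2). encode is a hypothetical
# callable returning a 1-D numpy feature vector for a board.
import torch

def evaluate_after_state_sketch(board, encode, w1, b1, w2, b2):
    x = torch.tensor(encode(board), dtype=torch.float).view(-1, 1)
    h = torch.mm(w1, x) + b1          # hidden pre-activation
    h_sigmoid = h.sigmoid()           # squash with a sigmoid
    y = torch.mm(w2, h_sigmoid) + b2
    return y.sigmoid().item()         # scalar after-state value estimate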