def play_a_game_random(commentary=False):
    board = BG.init_board()                  # initialize the board
    player = np.random.randint(2) * 2 - 1    # which player begins?
    randomPlayer = -1
    while not BG.game_over(board) and not BG.check_for_error(board):
        if commentary:
            print("let's go player", player)
        # roll the dice
        dice = BG.roll_dice()
        if commentary:
            print("rolled dice:", dice)
        # make a move (2 moves if the same number appears on both dice)
        for i in range(1 + int(dice[0] == dice[1])):
            board_copy = np.copy(board)
            if player == randomPlayer:
                move = flipped_agent.action(board_copy, dice, player, i)
            else:
                move = action(board_copy, dice, player, i)
            # update the board
            if len(move) != 0:
                for m in move:
                    board = BG.update_board(board, m, player)
            # give a status report after every move
            if commentary:
                print("move from player", player, ":")
                BG.pretty_print(board)
        # players take turns
        player = -player
    # return the winner (the last player to move before the final swap)
    return -1 * player
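
# A minimal sketch of how play_a_game_random can be used to estimate the
# trained agent's strength against the flipped/random opponent (this wrapper
# is illustrative and not part of the original code):
def estimate_win_rate(num_games=100):
    wins = sum(play_a_game_random(commentary=False) == 1 for _ in range(num_games))
    return wins / num_games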
def do(self, board_real, dice, actor_theta, player):
    commentary = False
    print_results = False
    # run 25 simulated games from the current real position
    for i in range(0, 25):
        board = np.copy(board_real)
        old_state = np.copy(board_real)
        self.z = np.zeros(198)
        if len(board) == 0:
            break
        count = 0
        while not Backgammon.game_over(board) and not Backgammon.check_for_error(board):
            if commentary:
                print("Simulation game: let's go player", player)
            dice = Backgammon.roll_dice()
            if commentary:
                print("Simulation game: rolled dice:", dice)
            # make a move (2 moves if the same number appears on both dice);
            # note the inner index must not shadow the outer game index i
            for j in range(1 + int(dice[0] == dice[1])):
                board_copy = np.copy(board)
                if player == 1:
                    move, new_state = self.nextMove(board_copy, dice, player, actor_theta)
                elif player == -1:
                    move = agentX.action(board_copy, dice, player, j)
                if len(move) != 0:
                    for m in move:
                        board = Backgammon.update_board(board, m, player)
                if player == 1 and count > 1:
                    new_state = np.copy(board)
                    if not Backgammon.game_over(new_state) and not Backgammon.check_for_error(new_state):
                        # linear TD(lambda) update of the value weights
                        delta = 0 + self.getValue(new_state, actor_theta, player) - self.getValue(old_state, actor_theta, player)
                        self.theta = self.theta + (self.alpha * delta * self.z)
                        self.z = self.lamb * self.z + getFeatures(old_state, player)
                        old_state = new_state
                if commentary:
                    print("Simulation game: move from player", player, ":")
                    Backgammon.pretty_print(board)
            player = -player
            count = count + 1
        if print_results:
            print("simulation game nr", i)
            Backgammon.pretty_print(board)
        # terminal update: player was flipped after the last move, so the
        # reward for the winner is player * -1
        delta = player * -1 + 0 - self.getValue(old_state, actor_theta, player)
        self.theta = np.add(self.theta, (self.alpha * delta * self.z))
        self.z = self.lamb * self.z + getFeatures(old_state, player)
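
# The update inside do() is linear TD(lambda) with gamma = 1:
#   delta  = r + theta^T x(s') - theta^T x(s)
#   theta <- theta + alpha * delta * z
#   z     <- lambda * z + x(s)
# A standalone restatement of that rule (the names here are illustrative):
def td_lambda_step(theta, z, features_old, v_new, v_old, alpha, lamb, reward=0):
    delta = reward + v_new - v_old      # TD error
    theta = theta + alpha * delta * z   # move the weights along the trace
    z = lamb * z + features_old         # decay and accumulate the trace
    return theta, z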
def learnit(numGames, agent):
    numWins = []
    for g in tqdm(range(numGames)):
        # every 1000 games, measure performance against a random player
        if g % 1000 == 0:
            wins = compete(agent)
            numWins.append(wins)
        board = Backgammon.init_board()
        agent.zero_el()
        if 0 == np.random.randint(2):
            player = 1
        else:
            player = -1
        moveNr = 0
        isGameOver = False
        while not isGameOver:
            dice = Backgammon.roll_dice()
            for repeat in range(1 + int(dice[0] == dice[1])):
                action = agent.greedy_action(np.copy(board), dice, player, repeat)
                for i in range(0, len(action)):
                    board = Backgammon.update_board(board, action[i], player)
                R = 0
                if 1 == Backgammon.game_over(board):
                    if player == 1:
                        R = 1.0
                    else:
                        R = 0
                    isGameOver = True
                if (1 < moveNr) and (len(action) > 0):
                    agent.update(player, R, board, isGameOver)
                if len(action) > 0:
                    if player == 1:
                        agent.xold = board
                    else:
                        agent.xoldF = flip_board(board)
            player = -player
            moveNr += 1
    # plot the learning curve: wins against a random player, sampled every 1000 games
    x = np.arange(0, numGames, 1000)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.set_xlabel("Number of games")
    ax.set_ylabel("Wins against a random player")
    ax.plot(x, numWins)
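
# Example training run; the agent class is not part of this excerpt, so the
# constructor below is hypothetical -- any object exposing zero_el,
# greedy_action, update, xold and xoldF (as used above) would fit:
# agent = Agent(alpha=0.01, lamb=0.7)
# learnit(numGames=20000, agent=agent)
# plt.show()  # display the learning curve produced by learnit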
def step(self, move, player=1):
    old_board = np.copy(self.board)
    if len(move) != 0:
        for m in move:
            self.board = B.update_board(board=self.board, move=m, player=player)
    reward = 0
    self.done = False
    if self.iswin():
        reward = player
        self.done = True
    return old_board, np.copy(self.board), reward, self.done
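
# Illustrative episode loop against the environment above (the Env name and
# the policy function are assumptions; only step() is shown in this excerpt):
# env = Env()
# done = False
# while not done:
#     move = some_policy(np.copy(env.board))  # hypothetical move selection
#     old_board, new_board, reward, done = env.step(move, player=1)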
def compete(agent):
    winners = {"1": 0, "-1": 0}
    for g in range(100):
        board = Backgammon.init_board()
        if 0 == np.random.randint(2):
            player = 1
        else:
            player = -1
        isGameOver = False
        while not isGameOver:
            dice = Backgammon.roll_dice()
            for repeat in range(1 + int(dice[0] == dice[1])):
                if player == -1:
                    action = Backgammon.random_agent(np.copy(board), dice, player, repeat)
                else:
                    action = agent.greedy_action(np.copy(board), dice, player, repeat)
                for i in range(0, len(action)):
                    board = Backgammon.update_board(board, action[i], player)
                if 1 == Backgammon.game_over(board):
                    winner = player
                    isGameOver = True
                    break
            player = -player
        winners[str(winner)] += 1
    print("Out of", 100, "games,")
    print("player", 1, "won", winners["1"], "times and")
    print("player", -1, "won", winners["-1"], "times")
    return winners["1"]
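
# compete always plays 100 games and returns the number of wins for player 1,
# so a win-rate estimate is simply (a small convenience wrapper, not in the
# original code):
def win_rate(agent):
    return compete(agent) / 100.0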
def learnit(numgames, lam_w, lam_th, alpha_w, alpha_th):
    gamma = 1  # for completeness
    # play numgames games for training
    for games in range(0, numgames):
        I = 1
        board = BG.init_board()                # initialize the board
        player = np.random.randint(2) * 2 - 1  # which player begins?
        # initialize all the eligibility traces for the neural network
        Z_w1 = torch.zeros(w1.size(), device=device, dtype=torch.float)
        Z_b1 = torch.zeros(b1.size(), device=device, dtype=torch.float)
        Z_w2 = torch.zeros(w2.size(), device=device, dtype=torch.float)
        Z_b2 = torch.zeros(b2.size(), device=device, dtype=torch.float)
        Z_w1_flip = torch.zeros(w1.size(), device=device, dtype=torch.float)
        Z_b1_flip = torch.zeros(b1.size(), device=device, dtype=torch.float)
        Z_w2_flip = torch.zeros(w2.size(), device=device, dtype=torch.float)
        Z_b2_flip = torch.zeros(b2.size(), device=device, dtype=torch.float)
        if games % 100 == 0:
            print(games)
        count = 0
        while not BG.game_over(board) and not BG.check_for_error(board):
            dice = BG.roll_dice()
            for i in range(1 + int(dice[0] == dice[1])):
                # possibly take the mean of xtheta?
                move, xtheta = action(np.copy(board), dice, player, i, True)
                if len(move) != 0:
                    for m in move:
                        board = BG.update_board(board, m, player)
                # if the player rolls a double and wins the game on the first move
                if BG.game_over(board):
                    break
            if BG.game_over(board):
                winner = player
                break
            if player == -1:
                board = flip_board(np.copy(board))
            if count > 1:
                if player == -1:
                    # ice-hot encoding of the board
                    xflip = Variable(torch.tensor(ice_hot_encoding(board), dtype=torch.float, device=device)).view(7 * (n - 1) * 2, 1)
                    # feed the new and old states forward through the critic network
                    target, _ = feed_forward_w(xflip)
                    old_target, h_sigmoid = feed_forward_w(xflipold)
                    # the usual TD error
                    delta = 0 + gamma * target.detach().cpu().numpy() - old_target.detach().cpu().numpy()
                    # use autograd and the constructed computational graph in pytorch to compute all gradients
                    old_target.backward()
                    # update the eligibility traces using the gradients
                    Z_w2_flip, Z_b2_flip, Z_w1_flip, Z_b1_flip = update_eligibility_w(gamma, lam_w, Z_w1_flip, Z_b1_flip, Z_w2_flip, Z_b2_flip)
                    # zero the gradients
                    zero_gradients_critic()
                    # perform the update for the weights
                    delta = torch.tensor(delta, dtype=torch.float, device=device)
                    w1.data = w1.data + alpha_w * delta * Z_w1_flip
                    b1.data = b1.data + alpha_w * delta * Z_b1_flip
                    w2.data = w2.data + alpha_w * delta * Z_w2_flip
                    b2.data = b2.data + alpha_w * delta * Z_b2_flip
                    # update theta
                    grad_ln_pi = h_sigmoid - xtheta
                    theta.data = theta.data + alpha_th * delta * grad_ln_pi.view(1, len(grad_ln_pi))
                    xthetaflipold = xtheta
                else:
                    # ice-hot encoding of the board
                    x = Variable(torch.tensor(ice_hot_encoding(board), dtype=torch.float, device=device)).view(7 * (n - 1) * 2, 1)
                    # feed the new and old states forward through the critic network
                    target, _ = feed_forward_w(x)
                    old_target, h_sigmoid = feed_forward_w(xold)
                    delta = 0 + gamma * target.detach().cpu().numpy() - old_target.detach().cpu().numpy()  # the usual TD error
                    old_target.backward()
                    Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(gamma, lam_w, Z_w1, Z_b1, Z_w2, Z_b2)
                    zero_gradients_critic()
                    delta = torch.tensor(delta, dtype=torch.float, device=device)
                    w1.data = w1.data + alpha_w * delta * Z_w1
                    b1.data = b1.data + alpha_w * delta * Z_b1
                    w2.data = w2.data + alpha_w * delta * Z_w2
                    b2.data = b2.data + alpha_w * delta * Z_b2
                    # update theta
                    grad_ln_pi = h_sigmoid - xtheta
                    theta.data = theta.data + alpha_th * delta * grad_ln_pi.view(1, len(grad_ln_pi))
                    xthetaold = xtheta
            # we need to keep track of the last board state visited by each player
            if count < 2:
                if player == -1:
                    xflipold = Variable(torch.tensor(ice_hot_encoding(board), dtype=torch.float, device=device)).view(7 * (n - 1) * 2, 1)
                else:
                    xold = Variable(torch.tensor(ice_hot_encoding(board), dtype=torch.float, device=device)).view(7 * (n - 1) * 2, 1)
            else:
                if player == -1:
                    xflipold = Variable(torch.tensor(xflip, dtype=torch.float, device=device)).view(7 * (n - 1) * 2, 1)
                else:
                    xold = Variable(torch.tensor(x, dtype=torch.float, device=device)).view(7 * (n - 1) * 2, 1)
            if player == -1:
                board = flip_board(np.copy(board))
            # swap players
            player = -player
            count += 1
        # the game has ended; assign the terminal rewards
        if winner == 1:
            reward = 1
            reward_flip = -1
            xthetaold = xtheta
        else:
            reward = -1
            reward_flip = 1
            xthetaflipold = xtheta
        # terminal update for player 1:
        # feed the old state forward through the critic network
        old_target, h_sigmoid = feed_forward_w(xold)
        delta = reward + 0 - old_target.detach().cpu().numpy()  # the usual TD error
        # use autograd to compute all gradients
        old_target.backward()
        # update the eligibility traces using the gradients
        delta = torch.tensor(delta, dtype=torch.float, device=device)
        Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(gamma, lam_w, Z_w1, Z_b1, Z_w2, Z_b2)
        # zero the gradients
        zero_gradients_critic()
        # perform the update for the critic weights, w
        w1.data = w1.data + alpha_w * delta * Z_w1
        b1.data = b1.data + alpha_w * delta * Z_b1
        w2.data = w2.data + alpha_w * delta * Z_w2
        b2.data = b2.data + alpha_w * delta * Z_b2
        # update theta
        grad_ln_pi = h_sigmoid - xthetaold
        theta.data = theta.data + alpha_th * delta * grad_ln_pi.view(1, len(grad_ln_pi))
        # terminal update for the flipped player:
        # feed the old flipped state forward through the critic network
        flip_target, h_sigmoid = feed_forward_w(xflipold)
        delta = reward_flip + 0 - flip_target.detach().cpu().numpy()  # the usual TD error
        flip_target.backward()
        delta = torch.tensor(delta, dtype=torch.float, device=device)
        Z_w2_flip, Z_b2_flip, Z_w1_flip, Z_b1_flip = update_eligibility_w(gamma, lam_w, Z_w1_flip, Z_b1_flip, Z_w2_flip, Z_b2_flip)
        zero_gradients_critic()
        # perform the update for the critic weights, w
        w1.data = w1.data + alpha_w * delta * Z_w1_flip
        b1.data = b1.data + alpha_w * delta * Z_b1_flip
        w2.data = w2.data + alpha_w * delta * Z_w2_flip
        b2.data = b2.data + alpha_w * delta * Z_b2_flip
        # update theta
        grad_ln_pi = h_sigmoid - xthetaflipold
        theta.data = theta.data + alpha_th * delta * grad_ln_pi.view(1, len(grad_ln_pi))
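
# update_eligibility_w is not defined in this excerpt; a minimal sketch that
# is consistent with its call sites above (decay each trace by gamma*lambda
# and accumulate the autograd gradients of the critic, exactly as done inline
# in the tabular/NN variant further below). Note the return order matches how
# the callers unpack it; the three-layer Dyna variant below would need an
# extended form that also handles the w3/b3 traces:
def update_eligibility_w(gamma, lam_w, Z_w1, Z_b1, Z_w2, Z_b2):
    Z_w1 = gamma * lam_w * Z_w1 + w1.grad.data
    Z_b1 = gamma * lam_w * Z_b1 + b1.grad.data
    Z_w2 = gamma * lam_w * Z_w2 + w2.grad.data
    Z_b2 = gamma * lam_w * Z_b2 + b2.grad.data
    return Z_w2, Z_b2, Z_w1, Z_b1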
def learnit(numgames, lam_w, alpha1, alpha2):
    gamma = 1  # for completeness
    # play numgames games for training
    for games in range(0, numgames):
        epsilon = 15000 / (15000 + games)  # decaying exploration
        I = 1
        board = BG.init_board()                # initialize the board
        player = np.random.randint(2) * 2 - 1  # which player begins?
        # initialize all the eligibility traces for the neural network
        Z_w1 = torch.zeros(w1.size(), device=device, dtype=torch.float)
        Z_b1 = torch.zeros(b1.size(), device=device, dtype=torch.float)
        Z_w2 = torch.zeros(w2.size(), device=device, dtype=torch.float)
        Z_b2 = torch.zeros(b2.size(), device=device, dtype=torch.float)
        count = 0
        if games % 1000 == 0:
            print(games)
        if games % 5000 == 0:
            print('Compete:')
            wins_for_player_1 = 0
            loss_for_player_1 = 0
            competition_games = 500
            for j in range(competition_games):
                winner = play_a_game_random(commentary=False)
                if winner == 1:
                    wins_for_player_1 += 1.0
                else:
                    loss_for_player_1 += 1.0
            print(wins_for_player_1, loss_for_player_1)
        while not BG.game_over(board) and not BG.check_for_error(board):
            dice = BG.roll_dice()
            for i in range(1 + int(dice[0] == dice[1])):
                move = action(np.copy(board), epsilon, dice, player, i)
                if len(move) != 0:
                    for m in move:
                        board = BG.update_board(board, m, player)
                if BG.game_over(board):
                    break
            if player == -1:
                board = flip_board(np.copy(board))
            if count > 1:
                # ice-hot encoding of the board
                x = Variable(torch.tensor(ice_hot_encoding(board), dtype=torch.float, device=device)).view(7 * (n - 1) * 2, 1)
                # feed the new state forward through the critic network
                target = feed_forward_w(x)
                # feed the old state forward
                old_target = feed_forward_w(xolder)
                delta2 = 0 + gamma * target.detach().cpu().numpy() - old_target.detach().cpu().numpy()  # the usual TD error
                # use autograd and the constructed computational graph in pytorch to compute all gradients
                old_target.backward()
                # update the eligibility traces using the gradients
                Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(gamma, lam_w, Z_w1, Z_b1, Z_w2, Z_b2)
                # zero the gradients
                zero_gradients_critic()
                # perform the update for the weights
                delta2 = torch.tensor(delta2, dtype=torch.float, device=device)
                w1.data = w1.data + alpha1 * delta2 * Z_w1
                b1.data = b1.data + alpha1 * delta2 * Z_b1
                w2.data = w2.data + alpha2 * delta2 * Z_w2
                b2.data = b2.data + alpha2 * delta2 * Z_b2
            # we need to keep track of the last two board states visited by the players
            if count > 0:
                xolder = xold
            if not BG.game_over(board):
                if count < 2:
                    xold = Variable(torch.tensor(ice_hot_encoding(board), dtype=torch.float, device=device)).view(7 * (n - 1) * 2, 1)
                else:
                    xold = x
            if player == -1:
                board = flip_board(np.copy(board))
            # swap players
            player = -player
            count += 1
        # the game episode has ended and we know the outcome, so we can assign the terminal rewards
        reward = 1
        # terminal update for the winner;
        # these are basically the same updates as in the inner loop but for the final after-states
        win_target = feed_forward_w(xold)
        delta2 = reward + gamma * 0 - win_target.detach().cpu().numpy()  # the usual TD error
        # use autograd to compute all gradients
        win_target.backward()
        # update the eligibility traces
        Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(gamma, lam_w, Z_w1, Z_b1, Z_w2, Z_b2)
        # zero the gradients
        zero_gradients_critic()
        # perform the update of the weights
        delta2 = torch.tensor(delta2, dtype=torch.float, device=device)
        w1.data = w1.data + alpha1 * delta2 * Z_w1
        b1.data = b1.data + alpha1 * delta2 * Z_b1
        w2.data = w2.data + alpha2 * delta2 * Z_w2
        b2.data = b2.data + alpha2 * delta2 * Z_b2
        # terminal update for the loser
        reward = -1
        loser_target = feed_forward_w(x)
        delta2 = reward + gamma * 0 - loser_target.detach().cpu().numpy()  # the usual TD error
        loser_target.backward()
        Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(gamma, lam_w, Z_w1, Z_b1, Z_w2, Z_b2)
        zero_gradients_critic()
        delta2 = torch.tensor(delta2, dtype=torch.float, device=device)
        w1.data = w1.data + alpha1 * delta2 * Z_w1
        b1.data = b1.data + alpha1 * delta2 * Z_b1
        w2.data = w2.data + alpha2 * delta2 * Z_w2
        b2.data = b2.data + alpha2 * delta2 * Z_b2
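
# zero_gradients_critic is likewise not shown in this excerpt; a sketch that
# matches how the gradients are zeroed inline in the variants below (the
# three-layer Dyna variant would also zero w3.grad and b3.grad):
def zero_gradients_critic():
    w1.grad.data.zero_()
    b1.grad.data.zero_()
    w2.grad.data.zero_()
    b2.grad.data.zero_()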
def learnit(numgames, lam_w, lam_th, alpha1, alpha2):
    gamma = 1  # for completeness
    # play numgames games for training
    for games in range(0, numgames):
        I = 1
        board = BG.init_board()                # initialize the board
        player = np.random.randint(2) * 2 - 1  # which player begins?
        # initialize all the eligibility traces for the critic network, w
        Z_w1 = torch.zeros(w1.size(), device=device, dtype=torch.float)
        Z_b1 = torch.zeros(b1.size(), device=device, dtype=torch.float)
        Z_w2 = torch.zeros(w2.size(), device=device, dtype=torch.float)
        Z_b2 = torch.zeros(b2.size(), device=device, dtype=torch.float)
        # initialize all the eligibility traces for the actor network, theta
        Z_theta_1 = torch.zeros(theta_1.size(), device=device, dtype=torch.float)
        Z_thetab1 = torch.zeros(thetab1.size(), device=device, dtype=torch.float)
        Z_theta_2 = torch.zeros(theta_2.size(), device=device, dtype=torch.float)
        Z_thetab2 = torch.zeros(thetab2.size(), device=device, dtype=torch.float)
        if games % 100 == 0:
            print(games)
        count = 0
        delta = 0
        # play a game
        while not BG.game_over(board) and not BG.check_for_error(board):
            dice = BG.roll_dice()
            for i in range(1 + int(dice[0] == dice[1])):
                move, prob, index = action(np.copy(board), dice, player, i, True)
                if len(move) != 0:
                    for m in move:
                        board = BG.update_board(board, m, player)
                # if the player rolls a double and wins the game on the first move
                if BG.game_over(board):
                    break
            # check to see if the game is over
            if BG.game_over(board):
                break
            if player == -1:
                board = flip_board(np.copy(board))
            # only update after the first two moves, because we are using after-states
            # and both players have to make one move first
            if count > 1:
                # ice-hot encoding of the board
                x = Variable(torch.tensor(ice_hot_encoding(board), dtype=torch.float, device=device)).view(2 * (n - 1) * 7, 1)
                # feed the new state forward through the critic network, w
                target = feed_forward_w(x)
                # feed the old state forward through the critic network
                old_target = feed_forward_w(xolder)
                delta = 0 + gamma * target.detach().cpu().numpy() - old_target.detach().cpu().numpy()  # the usual TD error
                # use autograd and the constructed computational graph in pytorch to compute all gradients
                old_target.backward()
                # update the eligibility traces using the gradients
                delta = torch.tensor(delta, dtype=torch.float, device=device)
                Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(gamma, lam_w, Z_w1, Z_b1, Z_w2, Z_b2)
                # zero the gradients
                zero_gradients_critic()
                # perform the update for the critic weights, w
                w1.data = w1.data + alpha1 * delta * Z_w1
                b1.data = b1.data + alpha1 * delta * Z_b1
                w2.data = w2.data + alpha2 * delta * Z_w2
                b2.data = b2.data + alpha2 * delta * Z_b2
                # update theta
                logTarget = torch.log(prob)
                logTarget.backward(retain_graph=True)
                # update the eligibility traces using the gradients
                Z_theta_2, Z_thetab2, Z_theta_1, Z_thetab1 = update_eligibility_th(gamma, lam_w, Z_theta_1, Z_thetab1, Z_theta_2, Z_thetab2, I)
                zero_gradients_actor()  # zero the gradients
                # perform the update for the actor weights, theta
                theta_1.data = theta_1.data + alpha1 * delta * Z_theta_1
                thetab1.data = thetab1.data + alpha1 * delta * Z_thetab1
                theta_2.data = theta_2.data + alpha2 * delta * Z_theta_2
                thetab2.data = thetab2.data + alpha2 * delta * Z_thetab2
                I = gamma * I
            # keep track of the second-to-last state the player was in
            if count > 0:
                xolder = xold
            # keep track of the last state
            if not BG.game_over(board):
                if count < 2:
                    xold = Variable(torch.tensor(ice_hot_encoding(board), dtype=torch.float, device=device)).view(2 * (n - 1) * 7, 1)
                else:
                    xold = x
                # keep track of the old values from the NN to update the player who lost
                probold = prob
                indexold = index
            if player == -1:
                board = flip_board(np.copy(board))
            # swap players
            player = -player
            count += 1
        # the game episode has ended and we know the outcome, so we can assign the terminal rewards
        reward = 1
        # update for the winner;
        # these are basically the same updates as in the inner loop but for the final after-states (x and xold)
        win_target = feed_forward_w(xold)
        delta = reward + gamma * 0 - win_target.detach().cpu().numpy()  # the usual TD error
        delta = torch.tensor(delta, dtype=torch.float, device=device)
        # use autograd to compute all gradients
        win_target.backward()
        # update the eligibility traces
        Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(gamma, lam_w, Z_w1, Z_b1, Z_w2, Z_b2)
        # zero the gradients
        zero_gradients_critic()
        # perform the update of the critic weights
        w1.data = w1.data + alpha1 * delta * Z_w1
        b1.data = b1.data + alpha1 * delta * Z_b1
        w2.data = w2.data + alpha2 * delta * Z_w2
        b2.data = b2.data + alpha2 * delta * Z_b2
        # update theta
        logTarget = torch.log(prob)
        logTarget.backward()
        Z_theta_2, Z_thetab2, Z_theta_1, Z_thetab1 = update_eligibility_th(gamma, lam_w, Z_theta_1, Z_thetab1, Z_theta_2, Z_thetab2, I)
        zero_gradients_actor()
        theta_1.data = theta_1.data + alpha1 * delta * Z_theta_1
        thetab1.data = thetab1.data + alpha1 * delta * Z_thetab1
        theta_2.data = theta_2.data + alpha2 * delta * Z_theta_2
        thetab2.data = thetab2.data + alpha2 * delta * Z_thetab2
        # update for the loser;
        # these are basically the same updates as in the inner loop but for the final after-states
        reward = -1
        loser_target = feed_forward_w(x)
        delta = reward + gamma * 0 - loser_target.detach().cpu().numpy()  # the usual TD error
        delta = torch.tensor(delta, dtype=torch.float, device=device)
        loser_target.backward()
        Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(gamma, lam_w, Z_w1, Z_b1, Z_w2, Z_b2)
        zero_gradients_critic()
        w1.data = w1.data + alpha1 * delta * Z_w1
        b1.data = b1.data + alpha1 * delta * Z_b1
        w2.data = w2.data + alpha2 * delta * Z_w2
        b2.data = b2.data + alpha2 * delta * Z_b2
        # update theta
        logTarget = torch.log(probold)
        logTarget.backward()
        Z_theta_2, Z_thetab2, Z_theta_1, Z_thetab1 = update_eligibility_th(gamma, lam_w, Z_theta_1, Z_thetab1, Z_theta_2, Z_thetab2, I)
        zero_gradients_actor()
        theta_1.data = theta_1.data + alpha1 * delta * Z_theta_1
        thetab1.data = thetab1.data + alpha1 * delta * Z_thetab1
        theta_2.data = theta_2.data + alpha2 * delta * Z_theta_2
        thetab2.data = thetab2.data + alpha2 * delta * Z_thetab2
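
# update_eligibility_th is not defined in this excerpt; a sketch consistent
# with its call sites above, assuming the standard actor trace from
# actor-critic with eligibility traces: the gradient of log pi (computed by
# the logTarget.backward() calls above) is scaled by the discount-correction
# factor I before being accumulated:
def update_eligibility_th(gamma, lam_th, Z_theta_1, Z_thetab1, Z_theta_2, Z_thetab2, I):
    Z_theta_1 = gamma * lam_th * Z_theta_1 + I * theta_1.grad.data
    Z_thetab1 = gamma * lam_th * Z_thetab1 + I * thetab1.grad.data
    Z_theta_2 = gamma * lam_th * Z_theta_2 + I * theta_2.grad.data
    Z_thetab2 = gamma * lam_th * Z_thetab2 + I * thetab2.grad.data
    return Z_theta_2, Z_thetab2, Z_theta_1, Z_thetab1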
def learnit(numgames, epsilon, lam, alpha, V, alpha1, alpha2, w1, b1, w2, b2):
    gamma = 1  # for completeness
    global episode_number, xs, hs, logProp
    # play numgames games for training
    for games in range(0, numgames):
        board = Backgammon.init_board()  # initialize the board (empty)
        # we will use TD(lambda), so we need eligibility traces
        S = []  # the after-states visited by the table player, initially empty
        E = np.array([])  # eligibility traces for table V
        # initialize all the eligibility traces for the neural network
        Z_w1 = torch.zeros(w1.size(), device=device, dtype=torch.float)
        Z_b1 = torch.zeros(b1.size(), device=device, dtype=torch.float)
        Z_w2 = torch.zeros(w2.size(), device=device, dtype=torch.float)
        Z_b2 = torch.zeros(b2.size(), device=device, dtype=torch.float)
        # player to start is "1", the other player is "-1"
        player = 1
        tableplayer = -1
        winner = 0  # this implies a draw
        # start playing the game, turn by turn
        dice = Backgammon.roll_dice()
        legal_moves = Backgammon.legal_moves(board, dice, player)
        for moveNumber in range(0, len(legal_moves)):
            # use a policy to find an action
            if player == tableplayer:
                # this player is using the table V
                possible_moves, possible_boards = Backgammon.legal_moves(board, dice, player)
                action = possible_moves[np.random.randint(len(possible_moves))]
            else:
                # this player is using the neural network to approximate the after-state value
                action = epsilon_nn_greedy(np.copy(board), dice, player, epsilon, w1, b1, w2, b2)
            # perform the move and update the board
            for i in range(0, len(action)):
                board = Backgammon.update_board(board, action[i], player)
            if 1 == Backgammon.game_over(board):  # has this player won?
                winner = player
                break  # bail out of the inner game loop
            # once both players have performed at least one move we can start doing updates
            if 1 < moveNumber:
                if tableplayer == player:
                    # here the table player updates the table V
                    s = hash_it(board)  # get the index into the table for this new board
                    delta = 0 + gamma * V[s] - V[sold]
                    E = np.append(E, 1)  # add a trace to this state (note: all new states are unique, else we would +1)
                    S.append(sold)  # keep track of this state also
                    V[S] = V[S] + delta * alpha * E  # the usual tabular TD(lambda) update
                    E = gamma * lam * E
                else:
                    # here the other player updates the neural network (2-layer feed-forward with sigmoid units)
                    x = Variable(torch.tensor(one_hot_encoding(board), dtype=torch.float, device=device)).view(28 * 2 * 6, 1)
                    # do a forward pass to evaluate the new board's after-state value
                    h = torch.mm(w1, x) + b1          # matrix-multiply x with the input weights w1 and add the bias
                    h_sigmoid = h.sigmoid()           # squash this with a sigmoid function
                    y = torch.mm(w2, h_sigmoid) + b2  # multiply with the output weights w2 and add the bias
                    y_sigmoid = y.sigmoid()           # squash this with a sigmoid function
                    target = y_sigmoid.detach().cpu().numpy()
                    # also do a forward pass for the old board; this is the state we will update
                    h = torch.mm(w1, xold) + b1
                    h_sigmoid = h.sigmoid()
                    y = torch.mm(w2, h_sigmoid) + b2
                    y_sigmoid = y.sigmoid()  # squash the output
                    delta2 = 0 + gamma * target - y_sigmoid.detach().cpu().numpy()  # the usual TD error
                    # use autograd and the constructed computational graph in pytorch to compute all gradients
                    y_sigmoid.backward()
                    # update the eligibility traces using the gradients
                    Z_w2 = gamma * lam * Z_w2 + w2.grad.data
                    Z_b2 = gamma * lam * Z_b2 + b2.grad.data
                    Z_w1 = gamma * lam * Z_w1 + w1.grad.data
                    Z_b1 = gamma * lam * Z_b1 + b1.grad.data
                    # zero the gradients
                    w2.grad.data.zero_()
                    b2.grad.data.zero_()
                    w1.grad.data.zero_()
                    b1.grad.data.zero_()
                    # perform the update for the weights
                    delta2 = torch.tensor(delta2, dtype=torch.float, device=device)
                    w1.data = w1.data + alpha1 * delta2 * Z_w1
                    b1.data = b1.data + alpha1 * delta2 * Z_b1
                    w2.data = w2.data + alpha2 * delta2 * Z_w2
                    b2.data = b2.data + alpha2 * delta2 * Z_b2
            # we need to keep track of the last board state visited by each player
            if tableplayer == player:
                sold = hash_it(board)
            else:
                xold = Variable(torch.tensor(one_hot_encoding(board), dtype=torch.float, device=device)).view(28 * 2 * 6, 1)
            # swap players
            player = -player
        # the game episode has ended and we know the outcome, so we can assign the terminal rewards
        if winner == tableplayer:
            reward = 0
        elif winner == -tableplayer:
            reward = 1
        else:
            reward = 0.5
        episode_number += 1
        end_x = torch.stack(xs)
        end_h = torch.stack(hs)
        end_logProp = torch.stack(logProp)
        xs, hs, logProp = [], [], []
        # question: how should the reward be assigned; does it need some kind of discounting?
        end_logProp *= reward
        grad = actor_policy_backward(end_x, end_h, end_logProp, w2)
        for k in model:
            grad_buffer[k] += grad[k]  # add the gradient to the batch
        if episode_number == batch_size:
            episode_number = 0
            for k, v in model.items():
                g = grad_buffer[k]
                rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * np.power(g, 2)
                model[k] += learning_rate * g / np.sqrt(rmsprop_cache[k] + 1e-5)
                print('model[k]', model[k])
                print('model', model)
                grad_buffer[k] = np.zeros_like(v)
        # now we perform the final update (the terminal after-state value is zero);
        # these are basically the same updates as in the inner loop but for the final after-states (sold and xold)
        # first for the table (note: if the reward is 0 this player actually won!):
        delta = (1.0 - reward) + gamma * 0 - V[sold]
        E = np.append(E, 1)  # add one to the trace (recall that states are unique)
        S.append(sold)
        V[S] = V[S] + delta * alpha * E
        # and then for the neural network:
        h = torch.mm(w1, xold) + b1       # matrix-multiply xold with the input weights w1 and add the bias
        h_sigmoid = h.sigmoid()           # squash this with a sigmoid function
        y = torch.mm(w2, h_sigmoid) + b2  # multiply with the output weights w2 and add the bias
        y_sigmoid = y.sigmoid()           # squash the output
        delta2 = reward + gamma * 0 - y_sigmoid.detach().cpu().numpy()  # the usual TD error
        # use autograd and the constructed computational graph in pytorch to compute all gradients
        y_sigmoid.backward()
        # update the eligibility traces
        Z_w2 = gamma * lam * Z_w2 + w2.grad.data
        Z_b2 = gamma * lam * Z_b2 + b2.grad.data
        Z_w1 = gamma * lam * Z_w1 + w1.grad.data
        Z_b1 = gamma * lam * Z_b1 + b1.grad.data
        # zero the gradients
        w2.grad.data.zero_()
        b2.grad.data.zero_()
        w1.grad.data.zero_()
        b1.grad.data.zero_()
        # perform the update of the weights
        delta2 = torch.tensor(delta2, dtype=torch.float, device=device)
        w1.data = w1.data + alpha1 * delta2 * Z_w1
        b1.data = b1.data + alpha1 * delta2 * Z_b1
        w2.data = w2.data + alpha2 * delta2 * Z_w2
        b2.data = b2.data + alpha2 * delta2 * Z_b2
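
# Hypothetical initialization of the two-layer critic this variant expects as
# arguments; the hidden size (40) is illustrative and the input size 28*2*6
# matches the one-hot encoding used above. The real initialization is not
# part of this excerpt:
def init_critic_weights(n_hidden=40, n_input=28 * 2 * 6):
    w1 = torch.randn(n_hidden, n_input, device=device, dtype=torch.float, requires_grad=True)
    b1 = torch.zeros((n_hidden, 1), device=device, dtype=torch.float, requires_grad=True)
    w2 = torch.randn(1, n_hidden, device=device, dtype=torch.float, requires_grad=True)
    b2 = torch.zeros((1, 1), device=device, dtype=torch.float, requires_grad=True)
    return w1, b1, w2, b2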
def learnit(numgames, epsilon, lam, alpha, alpha1, alpha2, w1, b1, w2, b2):
    gamma = 1  # for completeness
    # play numgames games for training
    for games in range(0, numgames):
        board = Backgammon.init_board()  # initialize the board (empty)
        # initialize all the eligibility traces for the neural network
        Z_w1 = torch.zeros(w1.size(), device=device, dtype=torch.float)
        Z_b1 = torch.zeros(b1.size(), device=device, dtype=torch.float)
        Z_w2 = torch.zeros(w2.size(), device=device, dtype=torch.float)
        Z_b2 = torch.zeros(b2.size(), device=device, dtype=torch.float)
        # player to start is "1", the other player is "-1"
        player = 1
        otherplayer = -1
        winner = 0  # this implies a draw
        isGameOver = False
        moveNumber = 0
        while not isGameOver:
            dice = Backgammon.roll_dice()
            # use a policy to find an action;
            # both players use the neural network to approximate the after-state value
            if player == otherplayer:
                # this player flips the board to find an action
                possible_moves, possible_boards = Backgammon.legal_moves(flipped_agent.flip_board(np.copy(board)), dice, -player)
                action = epsilon_nn_greedy(flipped_agent.flip_board(np.copy(board)), dice, -player, epsilon, w1, b1, w2, b2, possible_moves, possible_boards, False)
                action = flipped_agent.flip_move(action)
            else:
                # this player uses the original board
                possible_moves, possible_boards = Backgammon.legal_moves(board, dice, player)
                action = epsilon_nn_greedy(np.copy(board), dice, player, epsilon, w1, b1, w2, b2, possible_moves, possible_boards, False)
            # perform the move and update the board
            for i in range(0, len(action)):
                board = Backgammon.update_board(board, action[i], player)
            if 1 == Backgammon.game_over(board):  # has this player won?
                winner = player
                isGameOver = True
                break  # bail out of the game loop
            # once both players have performed at least one move we can start doing updates
            if 1 < moveNumber:
                if otherplayer == player:
                    # here player -1 updates the network using the flipped board
                    x_flipped = Variable(torch.tensor(one_hot_encoding(flipped_agent.flip_board(board)), dtype=torch.float, device=device)).view(28 * 2 * 6, 1)
                    # do a forward pass to evaluate the new board's after-state value
                    h = torch.mm(w1, x_flipped) + b1  # matrix-multiply with the input weights w1 and add the bias
                    h_sigmoid = h.sigmoid()           # squash this with a sigmoid function
                    y = torch.mm(w2, h_sigmoid) + b2  # multiply with the output weights w2 and add the bias
                    y_sigmoid = y.sigmoid()           # squash this with a sigmoid function
                    target = y_sigmoid.detach().cpu().numpy()
                    # also do a forward pass for the old board; this is the state we will update
                    h = torch.mm(w1, xold_flipped) + b1
                    h_sigmoid = h.sigmoid()
                    y = torch.mm(w2, h_sigmoid) + b2
                    y_sigmoid = y.sigmoid()  # squash the output
                    delta2 = 0 + gamma * target - y_sigmoid.detach().cpu().numpy()  # the usual TD error
                else:
                    # here player 1 updates the neural network (2-layer feed-forward with sigmoid units)
                    x = Variable(torch.tensor(one_hot_encoding(board), dtype=torch.float, device=device)).view(28 * 2 * 6, 1)
                    # do a forward pass to evaluate the new board's after-state value
                    h = torch.mm(w1, x) + b1
                    h_sigmoid = h.sigmoid()
                    y = torch.mm(w2, h_sigmoid) + b2
                    y_sigmoid = y.sigmoid()
                    target = y_sigmoid.detach().cpu().numpy()
                    # also do a forward pass for the old board; this is the state we will update
                    h = torch.mm(w1, xold) + b1
                    h_sigmoid = h.sigmoid()
                    y = torch.mm(w2, h_sigmoid) + b2
                    y_sigmoid = y.sigmoid()  # squash the output
                    delta2 = 0 + gamma * target - y_sigmoid.detach().cpu().numpy()  # the usual TD error
                # use autograd and the constructed computational graph in pytorch to compute all gradients
                y_sigmoid.backward()
                # update the eligibility traces using the gradients
                Z_w1 = gamma * lam * Z_w1 + w1.grad.data
                Z_b1 = gamma * lam * Z_b1 + b1.grad.data
                Z_w2 = gamma * lam * Z_w2 + w2.grad.data
                Z_b2 = gamma * lam * Z_b2 + b2.grad.data
                # zero the gradients
                w1.grad.data.zero_()
                b1.grad.data.zero_()
                w2.grad.data.zero_()
                b2.grad.data.zero_()
                # perform the update for the weights
                delta2 = torch.tensor(delta2, dtype=torch.float, device=device)
                w1.data = w1.data + alpha1 * delta2 * Z_w1
                b1.data = b1.data + alpha1 * delta2 * Z_b1
                w2.data = w2.data + alpha2 * delta2 * Z_w2
                b2.data = b2.data + alpha2 * delta2 * Z_b2
            # we need to keep track of the last board state visited by each player
            if otherplayer == player:
                xold_flipped = Variable(torch.tensor(one_hot_encoding(flipped_agent.flip_board(board)), dtype=torch.float, device=device)).view(28 * 2 * 6, 1)
            else:
                xold = Variable(torch.tensor(one_hot_encoding(board), dtype=torch.float, device=device)).view(28 * 2 * 6, 1)
            # swap players
            player = -player
            moveNumber = moveNumber + 1
        # the game episode has ended and we know the outcome, so we can assign the terminal rewards
        if winner == otherplayer:
            reward = 0
        elif winner == -otherplayer:
            reward = 1
        else:
            reward = 0.5
        # now we perform the final update (the terminal after-state value is zero);
        # these are basically the same updates as in the inner loop but for the final after-states (xold and xold_flipped)
        # first we update the values for player -1
        h = torch.mm(w1, xold_flipped) + b1
        h_sigmoid = h.sigmoid()
        y = torch.mm(w2, h_sigmoid) + b2
        y_sigmoid = y.sigmoid()  # squash the output
        delta = (1.0 - reward) + gamma * 0 - y_sigmoid.detach().cpu().numpy()
        # use autograd to compute all gradients
        y_sigmoid.backward()
        # update the eligibility traces
        Z_w1 = gamma * lam * Z_w1 + w1.grad.data
        Z_b1 = gamma * lam * Z_b1 + b1.grad.data
        Z_w2 = gamma * lam * Z_w2 + w2.grad.data
        Z_b2 = gamma * lam * Z_b2 + b2.grad.data
        # zero the gradients
        w1.grad.data.zero_()
        b1.grad.data.zero_()
        w2.grad.data.zero_()
        b2.grad.data.zero_()
        # perform the update of the weights
        delta = torch.tensor(delta, dtype=torch.float, device=device)
        w1.data = w1.data + alpha1 * delta * Z_w1
        b1.data = b1.data + alpha1 * delta * Z_b1
        w2.data = w2.data + alpha2 * delta * Z_w2
        b2.data = b2.data + alpha2 * delta * Z_b2
        # then we update the values for player 1
        h = torch.mm(w1, xold) + b1
        h_sigmoid = h.sigmoid()
        y = torch.mm(w2, h_sigmoid) + b2
        y_sigmoid = y.sigmoid()  # squash the output
        delta2 = reward + gamma * 0 - y_sigmoid.detach().cpu().numpy()  # the usual TD error
        y_sigmoid.backward()
        Z_w1 = gamma * lam * Z_w1 + w1.grad.data
        Z_b1 = gamma * lam * Z_b1 + b1.grad.data
        Z_w2 = gamma * lam * Z_w2 + w2.grad.data
        Z_b2 = gamma * lam * Z_b2 + b2.grad.data
        w1.grad.data.zero_()
        b1.grad.data.zero_()
        w2.grad.data.zero_()
        b2.grad.data.zero_()
        delta2 = torch.tensor(delta2, dtype=torch.float, device=device)
        w1.data = w1.data + alpha1 * delta2 * Z_w1
        b1.data = b1.data + alpha1 * delta2 * Z_b1
        w2.data = w2.data + alpha2 * delta2 * Z_w2
        b2.data = b2.data + alpha2 * delta2 * Z_b2
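
# Illustrative call for the self-play variant above; the hyperparameter values
# are assumptions, not from the original (init_critic_weights is the
# hypothetical helper sketched earlier):
# w1, b1, w2, b2 = init_critic_weights()
# learnit(numgames=50000, epsilon=0.1, lam=0.7, alpha=0.01,
#         alpha1=0.01, alpha2=0.01, w1=w1, b1=b1, w2=w2, b2=b2)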
def learnitDyna(numgames, epsilon, lam_w, alpha_w, gamma, numthink):
    A = np.zeros(4)  # the transition model; replaced by real entries once filled
    for games in range(0, numgames):
        board = BG.init_board()                # initialize the board
        player = np.random.randint(2) * 2 - 1  # which player begins?
        count = 0
        delta = 0
        # initialize all the eligibility traces for the neural network
        Z_w1 = torch.zeros(w1.size(), device=device, dtype=torch.float)
        Z_b1 = torch.zeros(b1.size(), device=device, dtype=torch.float)
        Z_w2 = torch.zeros(w2.size(), device=device, dtype=torch.float)
        Z_b2 = torch.zeros(b2.size(), device=device, dtype=torch.float)
        Z_w3 = torch.zeros(w3.size(), device=device, dtype=torch.float)
        Z_b3 = torch.zeros(b3.size(), device=device, dtype=torch.float)
        Z_w1_flip = torch.zeros(w1.size(), device=device, dtype=torch.float)
        Z_b1_flip = torch.zeros(b1.size(), device=device, dtype=torch.float)
        Z_w2_flip = torch.zeros(w2.size(), device=device, dtype=torch.float)
        Z_b2_flip = torch.zeros(b2.size(), device=device, dtype=torch.float)
        Z_w3_flip = torch.zeros(w3.size(), device=device, dtype=torch.float)
        Z_b3_flip = torch.zeros(b3.size(), device=device, dtype=torch.float)
        if games % 100 == 0:
            print(games)
        # play a game
        while not BG.game_over(board) and not BG.check_for_error(board):
            dice = BG.roll_dice()
            for i in range(1 + int(dice[0] == dice[1])):
                move = action(np.copy(board), epsilon, dice, player, i)
                if len(move) != 0:
                    for m in move:
                        board = BG.update_board(board, m, player)
                # a double that wins the game on the first move: break!
                if BG.game_over(board):
                    break
            if BG.game_over(board):
                winner = player
                break
            if player == -1:
                board = flip_board(np.copy(board))
            if count > 1:
                if player == -1:
                    # ice-hot encoding of the board
                    move_fliptemp = move
                    x_fliptemp = ice_hot_encoding(board)
                    xflip = Variable(torch.tensor(x_fliptemp, dtype=torch.float, device=device)).view(encSize, 1)
                    # feed the new and old states forward through the critic network
                    target = feed_forward_w(xflip)
                    old_target = feed_forward_w(xflipold)
                    delta = 0 + gamma * target.detach().cpu().numpy() - old_target.detach().cpu().numpy()  # the usual TD error
                    # use autograd and the constructed computational graph in pytorch to compute all gradients
                    old_target.backward()
                    # update the eligibility traces using the gradients
                    Z_w3_flip, Z_b3_flip, Z_w2_flip, Z_b2_flip, Z_w1_flip, Z_b1_flip = update_eligibility_w(gamma, lam_w, Z_w1_flip, Z_b1_flip, Z_w2_flip, Z_b2_flip, Z_w3_flip, Z_b3_flip)
                    # zero the gradients
                    zero_gradients_critic()
                    # perform the update for the weights
                    delta = torch.tensor(delta, dtype=torch.float, device=device)
                    w1.data = w1.data + alpha_w * delta * Z_w1_flip
                    b1.data = b1.data + alpha_w * delta * Z_b1_flip
                    w2.data = w2.data + alpha_w * delta * Z_w2_flip
                    b2.data = b2.data + alpha_w * delta * Z_b2_flip
                    w3.data = w3.data + alpha_w * delta * Z_w3_flip
                    b3.data = b3.data + alpha_w * delta * Z_b3_flip
                    # append to the model; the first time we create A, afterwards we stack onto it
                    if count == 2 and games == 0:
                        A = np.array([[x_fliptempold], [move], [x_fliptemp], 0])
                    else:
                        add_to_model = np.array([[x_fliptempold], [move], [x_fliptemp], 0])
                        A = np.vstack((A, add_to_model))
                else:
                    # ice-hot encoding of the board
                    move_temp = move
                    x_temp = ice_hot_encoding(board)
                    x = Variable(torch.tensor(x_temp, dtype=torch.float, device=device)).view(encSize, 1)
                    # feed the new and old states forward through the critic network
                    target = feed_forward_w(x)
                    old_target = feed_forward_w(xold)
                    delta = 0 + gamma * target.detach().cpu().numpy() - old_target.detach().cpu().numpy()  # the usual TD error
                    old_target.backward()
                    Z_w3, Z_b3, Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(gamma, lam_w, Z_w1, Z_b1, Z_w2, Z_b2, Z_w3, Z_b3)
                    zero_gradients_critic()
                    delta = torch.tensor(delta, dtype=torch.float, device=device)
                    w1.data = w1.data + alpha_w * delta * Z_w1
                    b1.data = b1.data + alpha_w * delta * Z_b1
                    w2.data = w2.data + alpha_w * delta * Z_w2
                    b2.data = b2.data + alpha_w * delta * Z_b2
                    w3.data = w3.data + alpha_w * delta * Z_w3
                    b3.data = b3.data + alpha_w * delta * Z_b3
                    # append to the model; the first time we create A, afterwards we stack onto it
                    if count == 2 and games == 0:
                        A = np.array([[x_tempold], [move], [x_temp], 0])
                    else:
                        add_to_model = np.array([[x_tempold], [move], [x_temp], 0])
                        A = np.vstack((A, add_to_model))
            # planning: replay numthink transitions sampled from the model
            if count > 2:
                for thought in range(0, numthink):
                    state_indx = np.random.choice(A.shape[0])
                    state, move_temp, statenew, rewardtemp = A[state_indx]
                    if statenew == 0:
                        # a terminal transition: feed only the old state forward
                        state = Variable(torch.tensor(state, dtype=torch.float, device=device)).view(encSize, 1)
                        old_target1 = feed_forward_w(state)
                        delta2 = rewardtemp + 0 - old_target1.detach().cpu().numpy()
                    else:
                        state = Variable(torch.tensor(state, dtype=torch.float, device=device)).view(encSize, 1)
                        statenew = Variable(torch.tensor(statenew, dtype=torch.float, device=device)).view(encSize, 1)
                        # feed the new and old states forward through the critic network
                        target1 = feed_forward_w(statenew)
                        old_target1 = feed_forward_w(state)
                        delta2 = 0 + gamma * target1.detach().cpu().numpy() - old_target1.detach().cpu().numpy()  # the usual TD error
                    # use autograd to compute all gradients
                    old_target1.backward()
                    # perform the update for the weights (before zeroing the gradients!)
                    delta2 = torch.tensor(delta2, dtype=torch.float, device=device)
                    w1.data = w1.data + alpha_w * delta2 * w1.grad.data
                    b1.data = b1.data + alpha_w * delta2 * b1.grad.data
                    w2.data = w2.data + alpha_w * delta2 * w2.grad.data
                    b2.data = b2.data + alpha_w * delta2 * b2.grad.data
                    w3.data = w3.data + alpha_w * delta2 * w3.grad.data
                    b3.data = b3.data + alpha_w * delta2 * b3.grad.data
                    # zero the gradients
                    zero_gradients_critic()
            # keep track of the last board state visited by each player
            if count < 2:
                if player == -1:
                    x_fliptempold = ice_hot_encoding(board)
                    xflipold = Variable(torch.tensor(ice_hot_encoding(board), dtype=torch.float, device=device)).view(encSize, 1)
                else:
                    x_tempold = ice_hot_encoding(board)
                    xold = Variable(torch.tensor(ice_hot_encoding(board), dtype=torch.float, device=device)).view(encSize, 1)
            else:
                if player == -1:
                    x_fliptempold = x_fliptemp
                    xflipold = Variable(torch.tensor(xflip, dtype=torch.float, device=device)).view(encSize, 1)
                else:
                    x_tempold = x_temp
                    xold = Variable(torch.tensor(x, dtype=torch.float, device=device)).view(encSize, 1)
            if player == -1:
                board = flip_board(np.copy(board))
            # swap players
            player = -player
            count += 1
        # the game has ended; assign the terminal rewards
        if winner == 1:
            reward = 1
            reward_flip = 0
            move_temp = move
        else:
            reward = 0
            reward_flip = 1
            move_fliptemp = move
        # terminal update for player 1:
        # feed the old state forward through the critic network
        old_target = feed_forward_w(xold)
        delta = reward + 0 - old_target.detach().cpu().numpy()  # the usual TD error
        # use autograd to compute all gradients
        old_target.backward()
        # update the eligibility traces using the gradients
        delta = torch.tensor(delta, dtype=torch.float, device=device)
        Z_w3, Z_b3, Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(gamma, lam_w, Z_w1, Z_b1, Z_w2, Z_b2, Z_w3, Z_b3)
        # zero the gradients
        zero_gradients_critic()
        # perform the update for the critic weights, w
        w1.data = w1.data + alpha_w * delta * Z_w1
        b1.data = b1.data + alpha_w * delta * Z_b1
        w2.data = w2.data + alpha_w * delta * Z_w2
        b2.data = b2.data + alpha_w * delta * Z_b2
        w3.data = w3.data + alpha_w * delta * Z_w3
        b3.data = b3.data + alpha_w * delta * Z_b3
        # store the terminal transition for player 1 in the model
        add_to_model = np.array([[x_tempold], [move_temp], 0, reward])
        A = np.vstack((A, add_to_model))
        # terminal update for the flipped player:
        # feed the old flipped state forward through the critic network
        flip_target = feed_forward_w(xflipold)
        delta = reward_flip + 0 - flip_target.detach().cpu().numpy()  # the usual TD error
        flip_target.backward()
        delta = torch.tensor(delta, dtype=torch.float, device=device)
        Z_w3_flip, Z_b3_flip, Z_w2_flip, Z_b2_flip, Z_w1_flip, Z_b1_flip = update_eligibility_w(gamma, lam_w, Z_w1_flip, Z_b1_flip, Z_w2_flip, Z_b2_flip, Z_w3_flip, Z_b3_flip)
        zero_gradients_critic()
        w1.data = w1.data + alpha_w * delta * Z_w1_flip
        b1.data = b1.data + alpha_w * delta * Z_b1_flip
        w2.data = w2.data + alpha_w * delta * Z_w2_flip
        b2.data = b2.data + alpha_w * delta * Z_b2_flip
        w3.data = w3.data + alpha_w * delta * Z_w3_flip
        b3.data = b3.data + alpha_w * delta * Z_b3_flip
        # store the terminal transition for the flipped player in the model
        add_to_model = np.array([[x_fliptempold], [move_fliptemp], 0, reward_flip])
        A = np.vstack((A, add_to_model))
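
# Illustrative call for the Dyna variant above: each real move is followed by
# numthink planning updates replayed from the transition model A (the values
# below are assumptions, not from the original):
# learnitDyna(numgames=10000, epsilon=0.1, lam_w=0.7, alpha_w=0.001,
#             gamma=1, numthink=20)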