def do(self, board_real, dice, actor_theta, player):
    """Run 25 simulated games from ``board_real`` and TD-update ``self.theta``.

    Every simulation starts from a copy of ``board_real`` with a freshly
    zeroed eligibility trace ``self.z`` and is played until
    ``Backgammon.game_over`` or ``Backgammon.check_for_error`` is true.
    Player 1 moves through ``self.nextMove``; player -1 moves through
    ``agentX.action``.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source -- confirm it against the original layout.

    Args:
        board_real: Board to simulate from; only copies of it are modified.
        dice: Not used -- the dice are re-rolled before every turn.
        actor_theta: Forwarded to ``self.nextMove`` and ``self.getValue``.
        player: Side to move first (1 or -1).  The side to move is not reset
            between simulations, so it carries over from one to the next.
    """
    commentary = False
    print_results = False
    # The simulation counter has its own name: the inner dice loop uses ``i``
    # and previously clobbered it, so the results print showed the wrong game.
    for game_nr in range(0, 25):
        board = np.copy(board_real)
        old_state = np.copy(board_real)
        self.z = np.zeros(198)
        if len(board) == 0:
            break
        count = 0
        while (not Backgammon.game_over(board)
               and not Backgammon.check_for_error(board)):
            if commentary:
                print("Simulationgame: lets go player ", player)
            dice = Backgammon.roll_dice()
            if commentary:
                print("Simulationgame: rolled dices:", dice)
            # make a move (2 moves if the same number appears on the dice)
            for i in range(1 + int(dice[0] == dice[1])):
                board_copy = np.copy(board)
                if player == 1:
                    # the state returned alongside the move is never read
                    move, _ = self.nextMove(board_copy, dice, player,
                                            actor_theta)
                elif player == -1:
                    move = agentX.action(board_copy, dice, player, i)
                if len(move) != 0:
                    for m in move:
                        board = Backgammon.update_board(board, m, player)
                if player == 1 and count > 1:
                    new_state = np.copy(board)
                    # non-terminal step: reward 0, bootstrap from new_state
                    if (not Backgammon.game_over(new_state)
                            and not Backgammon.check_for_error(new_state)):
                        delta = (0
                                 + self.getValue(new_state, actor_theta, player)
                                 - self.getValue(old_state, actor_theta, player))
                        self.theta = self.theta + (self.alpha * delta * self.z)
                        self.z = (self.lamb * self.z
                                  + getFeatures(old_state, player))
                        old_state = new_state
                if commentary:
                    print("Simulationgame: move from player", player, ":")
                    Backgammon.pretty_print(board)
            player = -player
            count = count + 1
        if print_results:
            print("simulation game nr", game_nr)
            Backgammon.pretty_print(board)
        # Terminal step: ``player`` was flipped after the last move, so
        # ``player * -1`` is the side that moved last; it is used as reward.
        delta = player * -1 + 0 - self.getValue(old_state, actor_theta, player)
        self.theta = np.add(self.theta, (self.alpha * delta * self.z))
        self.z = self.lamb * self.z + getFeatures(old_state, player)
def play_a_game_random(commentary=False):
    """Play one game between ``action`` and ``flipped_agent.action``.

    Player -1 is driven by ``flipped_agent.action``; the other side is driven
    by the module-level ``action``.  The starting side is drawn at random.

    Args:
        commentary: When true, print the rolls and the board after each move.

    Returns:
        The negated value of ``player`` once the loop stops, i.e. the side
        that made the last move.
    """
    flipped_side = -1
    board = BG.init_board()
    # draw the side that moves first: maps {0, 1} onto {-1, 1}
    player = np.random.randint(2) * 2 - 1

    while True:
        if BG.game_over(board) or BG.check_for_error(board):
            break

        if commentary:
            print("lets go player ", player)

        dice = BG.roll_dice()
        if commentary:
            print("rolled dices:", dice)

        # a double grants a second move with the same dice
        moves_this_turn = 2 if dice[0] == dice[1] else 1
        for i in range(moves_this_turn):
            snapshot = np.copy(board)
            if player == flipped_side:
                chosen = flipped_agent.action(snapshot, dice, player, i)
            else:
                chosen = action(snapshot, dice, player, i)

            # apply every step of the chosen move to the real board
            if len(chosen) > 0:
                for step in chosen:
                    board = BG.update_board(board, step, player)

            if commentary:
                print("move from player", player, ":")
                BG.pretty_print(board)

        # hand the turn to the other side
        player = -player

    # the turn was already handed over, so flip back to get the last mover
    return -1 * player
def PlayPubEval(self, test_games=1):
    """Play ``test_games`` games as player 1 against pubeval; return the win rate.

    On its turn the agent picks among ``env.legal_moves`` with
    ``self.sample_action``; the opponent (player -1) moves with
    ``pubeval.agent_pubeval``.  A game counts as a win only when it ends on
    the agent's own move with ``reward == 1``; when it ends on an opponent
    move the reward is forced to 0.

    Args:
        test_games: Number of games to play.

    Returns:
        ``np.mean`` over the per-game win indicators (1.0 / 0.0).

    Raises:
        RuntimeError: If ``B.check_for_error`` flags the board after a
            pubeval move.  (Previously this path evaluated an undefined name
            and failed with an accidental ``NameError``.)
    """
    wins = []
    for _ in range(test_games):
        env = backgammon()
        done = False
        while not done:
            # Agent's turn; a double grants a second move.
            dice = B.roll_dice()
            for __ in range(1 + int(dice[0] == dice[1])):
                possible_moves, possible_boards = env.legal_moves(dice, 1)
                n_actions = len(possible_moves)
                if n_actions == 0:
                    break
                action = self.sample_action(possible_boards)
                old_board, new_board, reward, done = env.step(
                    possible_moves[action], player=1)
                if done:
                    break
            if not done:
                # Opponent's turn.
                dice = B.roll_dice()
                for __ in range(1 + int(dice[0] == dice[1])):
                    action = pubeval.agent_pubeval(np.copy(env.board),
                                                   dice,
                                                   oplayer=-1)
                    old_board, new_board, reward, done = env.step(action,
                                                                  player=-1)
                    if B.check_for_error(env.board):
                        raise RuntimeError(
                            "pubeval left the board in an invalid state")
                    if done:
                        # the game ended on the opponent's move: not a win
                        reward = 0
                        break
        wins.append(float(reward == 1))
    return (np.mean(wins))
def learnit(numgames, lam_w, lam_th, alpha_w, alpha_th):
    """Train the critic (w1, b1, w2, b2) and the actor weights theta by self-play.

    Both sides choose moves with ``action``.  Player -1 sees the board through
    ``flip_board`` and keeps its own eligibility traces (the ``*_flip``
    variables), but both sides update the same module-level weights in place.
    After each game both sides get a terminal update: +1 for the winner's
    last state and -1 for the other side's.

    NOTE(review): the nesting was reconstructed from a whitespace-collapsed
    source -- confirm it against the original layout.

    Args:
        numgames: Number of training games to play.
        lam_w: Passed to ``update_eligibility_w`` for the critic traces.
        lam_th: Not used in this function.
        alpha_w: Step size for the critic weights.
        alpha_th: Step size for ``theta``.
    """
    gamma = 1  # for completeness
    # play numgames games for training
    for games in range(0, numgames):
        I = 1
        board = BG.init_board()  # initialize the board
        player = np.random.randint(2) * 2 - 1  # which player begins?
        # initialize the eligibility traces for the neural network;
        # one set per side
        Z_w1 = torch.zeros(w1.size(), device=device, dtype=torch.float)
        Z_b1 = torch.zeros(b1.size(), device=device, dtype=torch.float)
        Z_w2 = torch.zeros(w2.size(), device=device, dtype=torch.float)
        Z_b2 = torch.zeros(b2.size(), device=device, dtype=torch.float)
        Z_w1_flip = torch.zeros(w1.size(), device=device, dtype=torch.float)
        Z_b1_flip = torch.zeros(b1.size(), device=device, dtype=torch.float)
        Z_w2_flip = torch.zeros(w2.size(), device=device, dtype=torch.float)
        Z_b2_flip = torch.zeros(b2.size(), device=device, dtype=torch.float)
        if games % 100 == 0:
            print(games)
        count = 0
        # NOTE(review): ``winner`` is only assigned on the game_over break
        # below; if the loop stops because check_for_error is true it is
        # unset (or stale from a previous game) -- confirm this cannot happen.
        while not BG.game_over(board) and not BG.check_for_error(board):
            dice = BG.roll_dice()
            for i in range(1 + int(dice[0] == dice[1])):
                # Possibly take the mean of xtheta??
                move, xtheta = action(np.copy(board), dice, player, i, True)
                if len(move) != 0:
                    for m in move:
                        board = BG.update_board(board, m, player)
                # if the player gets a double and wins the game in the first move.
                if BG.game_over(board):
                    break
            if BG.game_over(board):
                winner = player
                break
            # player -1 evaluates the position from the flipped perspective
            if player == -1:
                board = flip_board(np.copy(board))
            # both sides need one stored previous state before updating
            if (count > 1):
                if player == -1:
                    # One-hot encoding of the board
                    xflip = Variable(
                        torch.tensor(ice_hot_encoding(board),
                                     dtype=torch.float,
                                     device=device)).view(7 * (n - 1) * 2, 1)
                    # Feed forward w-nn for old and new
                    target, _ = feed_forward_w(xflip)
                    old_target, h_sigmoid = feed_forward_w(xflipold)
                    delta = 0 + gamma * target.detach().cpu().numpy(
                    ) - old_target.detach().cpu().numpy(
                    )  # this is the usual TD error
                    # using autograd and the constructed computational graph
                    # in pytorch compute all gradients
                    old_target.backward()
                    # update the eligibility traces using the gradients
                    Z_w2_flip, Z_b2_flip, Z_w1_flip, Z_b1_flip = update_eligibility_w(
                        gamma, lam_w, Z_w1_flip, Z_b1_flip, Z_w2_flip,
                        Z_b2_flip)
                    # zero the gradients
                    zero_gradients_critic()
                    # perform now the update for the weights
                    delta = torch.tensor(delta,
                                         dtype=torch.float,
                                         device=device)
                    w1.data = w1.data + alpha_w * delta * Z_w1_flip
                    b1.data = b1.data + alpha_w * delta * Z_b1_flip
                    w2.data = w2.data + alpha_w * delta * Z_w2_flip
                    b2.data = b2.data + alpha_w * delta * Z_b2_flip
                    # Update theta
                    grad_ln_pi = h_sigmoid - xtheta
                    theta.data = theta.data + alpha_th * delta * grad_ln_pi.view(
                        1, len(grad_ln_pi))
                    xthetaflipold = xtheta
                else:
                    # One-hot encoding of the board
                    x = Variable(
                        torch.tensor(ice_hot_encoding(board),
                                     dtype=torch.float,
                                     device=device)).view(7 * (n - 1) * 2, 1)
                    # Feed forward w-nn for old and new
                    target, _ = feed_forward_w(x)
                    old_target, h_sigmoid = feed_forward_w(xold)
                    delta = 0 + gamma * target.detach().cpu().numpy(
                    ) - old_target.detach().cpu().numpy(
                    )  # this is the usual TD error
                    # using autograd and the constructed computational graph
                    # in pytorch compute all gradients
                    old_target.backward()
                    # update the eligibility traces using the gradients
                    Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(
                        gamma, lam_w, Z_w1, Z_b1, Z_w2, Z_b2)
                    # zero the gradients
                    zero_gradients_critic()
                    # perform now the update for the weights
                    delta = torch.tensor(delta,
                                         dtype=torch.float,
                                         device=device)
                    w1.data = w1.data + alpha_w * delta * Z_w1
                    b1.data = b1.data + alpha_w * delta * Z_b1
                    w2.data = w2.data + alpha_w * delta * Z_w2
                    b2.data = b2.data + alpha_w * delta * Z_b2
                    # Update theta
                    grad_ln_pi = h_sigmoid - xtheta
                    theta.data = theta.data + alpha_th * delta * grad_ln_pi.view(
                        1, len(grad_ln_pi))
                    xthetaold = xtheta
            # we need to keep track of the last board state visited by the players
            if (count < 2):
                if player == -1:
                    xflipold = Variable(
                        torch.tensor(ice_hot_encoding(board),
                                     dtype=torch.float,
                                     device=device)).view(7 * (n - 1) * 2, 1)
                else:
                    xold = Variable(
                        torch.tensor(ice_hot_encoding(board),
                                     dtype=torch.float,
                                     device=device)).view(7 * (n - 1) * 2, 1)
            else:
                if player == -1:
                    xflipold = Variable(
                        torch.tensor(xflip, dtype=torch.float,
                                     device=device)).view(7 * (n - 1) * 2, 1)
                else:
                    xold = Variable(
                        torch.tensor(x, dtype=torch.float,
                                     device=device)).view(7 * (n - 1) * 2, 1)
            # undo the perspective flip before the other side moves
            if player == -1:
                board = flip_board(np.copy(board))
            # swap players
            player = -player
            count += 1
        # the winner's final xtheta comes from its last (winning) move
        if winner == 1:
            reward = 1
            reward_flip = -1
            xthetaold = xtheta
        else:
            reward = -1
            reward_flip = 1
            xthetaflipold = xtheta
        # update for player 1
        # Feed forward old state using w-NN
        old_target, h_sigmoid = feed_forward_w(xold)
        delta = reward + 0 - old_target.detach().cpu().numpy(
        )  # this is the usual TD error
        # using autograd and the constructed computational graph in pytorch
        # compute all gradients
        old_target.backward()
        # update the eligibility traces using the gradients
        delta = torch.tensor(delta, dtype=torch.float, device=device)
        Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(gamma, lam_w, Z_w1, Z_b1,
                                                      Z_w2, Z_b2)
        # zero the gradients
        zero_gradients_critic()
        # perform the update for the weights for the critic, w
        w1.data = w1.data + alpha_w * delta * Z_w1
        b1.data = b1.data + alpha_w * delta * Z_b1
        w2.data = w2.data + alpha_w * delta * Z_w2
        b2.data = b2.data + alpha_w * delta * Z_b2
        # Update theta
        grad_ln_pi = h_sigmoid - xthetaold
        theta.data = theta.data + alpha_th * delta * grad_ln_pi.view(
            1, len(grad_ln_pi))
        # update for the flipped player
        # and then for the neural network:
        # Feed forward old state using w-NN
        flip_target, h_sigmoid = feed_forward_w(xflipold)
        delta = reward_flip + 0 - flip_target.detach().cpu().numpy(
        )  # this is the usual TD error
        # using autograd and the constructed computational graph in pytorch
        # compute all gradients
        flip_target.backward()
        # update the eligibility traces using the gradients
        delta = torch.tensor(delta, dtype=torch.float, device=device)
        Z_w2_flip, Z_b2_flip, Z_w1_flip, Z_b1_flip = update_eligibility_w(
            gamma, lam_w, Z_w1_flip, Z_b1_flip, Z_w2_flip, Z_b2_flip)
        # zero the gradients
        zero_gradients_critic()
        # perform the update for the weights for the critic, w
        w1.data = w1.data + alpha_w * delta * Z_w1_flip
        b1.data = b1.data + alpha_w * delta * Z_b1_flip
        w2.data = w2.data + alpha_w * delta * Z_w2_flip
        b2.data = b2.data + alpha_w * delta * Z_b2_flip
        # Update theta
        grad_ln_pi = h_sigmoid - xthetaflipold
        theta.data = theta.data + alpha_th * delta * grad_ln_pi.view(
            1, len(grad_ln_pi))
def learnit(numgames, lam_w, alpha1, alpha2):
    """Train the critic weights (w1, b1, w2, b2) by self-play.

    Both sides pick moves with ``action`` using ``epsilon = 15000 / (15000 +
    games)``, and share one set of eligibility traces.  Player -1 sees the
    board through ``flip_board``.  Each update compares the current encoding
    with the one from two plies earlier (``xolder``).  Every 5000 games,
    500 games are played with ``play_a_game_random`` and the win/loss counts
    are printed.

    NOTE(review): the nesting was reconstructed from a whitespace-collapsed
    source -- confirm it against the original layout.

    Args:
        numgames: Number of training games to play.
        lam_w: Passed to ``update_eligibility_w``.
        alpha1: Step size for ``w1`` and ``b1``.
        alpha2: Step size for ``w2`` and ``b2``.
    """
    gamma = 1  # for completeness
    # play numgames games for training
    for games in range(0, numgames):
        epsilon = 15000 / (15000 + games)
        I = 1
        board = BG.init_board()  # initialize the board
        player = np.random.randint(2) * 2 - 1  # which player begins?
        # initialize all the eligibility traces for the neural network
        Z_w1 = torch.zeros(w1.size(), device=device, dtype=torch.float)
        Z_b1 = torch.zeros(b1.size(), device=device, dtype=torch.float)
        Z_w2 = torch.zeros(w2.size(), device=device, dtype=torch.float)
        Z_b2 = torch.zeros(b2.size(), device=device, dtype=torch.float)
        count = 0
        if games % 1000 == 0:
            print(games)
        # periodic evaluation: count wins and losses over 500 games
        if games % 5000 == 0:
            print('Compete:')
            wins_for_player_1 = 0
            loss_for_player_1 = 0
            competition_games = 500
            for j in range(competition_games):
                winner = play_a_game_random(commentary=False)
                if (winner == 1):
                    wins_for_player_1 += 1.0
                else:
                    loss_for_player_1 += 1.0
            print(wins_for_player_1, loss_for_player_1)
        while not BG.game_over(board) and not BG.check_for_error(board):
            dice = BG.roll_dice()
            for i in range(1 + int(dice[0] == dice[1])):
                move = action(np.copy(board), epsilon, dice, player, i)
                if len(move) != 0:
                    for m in move:
                        board = BG.update_board(board, m, player)
                # stop the second move of a double if the first one won
                if BG.game_over(board):
                    break
            # player -1 evaluates the position from the flipped perspective
            if player == -1:
                board = flip_board(np.copy(board))
            # both sides need one stored previous state before updating
            if (count > 1):
                # One-hot encoding of the board
                x = Variable(
                    torch.tensor(ice_hot_encoding(board),
                                 dtype=torch.float,
                                 device=device)).view(7 * (n - 1) * 2, 1)
                # Feed forward w-nn
                target = feed_forward_w(x)
                # Feed forward old state
                old_target = feed_forward_w(xolder)
                delta2 = 0 + gamma * target.detach().cpu().numpy(
                ) - old_target.detach().cpu().numpy(
                )  # this is the usual TD error
                # using autograd and the constructed computational graph in
                # pytorch compute all gradients
                old_target.backward()
                # update the eligibility traces using the gradients
                Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(
                    gamma, lam_w, Z_w1, Z_b1, Z_w2, Z_b2)
                # zero the gradients
                zero_gradients_critic()
                # perform now the update for the weights
                delta2 = torch.tensor(delta2, dtype=torch.float, device=device)
                w1.data = w1.data + alpha1 * delta2 * Z_w1
                b1.data = b1.data + alpha1 * delta2 * Z_b1
                w2.data = w2.data + alpha2 * delta2 * Z_w2
                b2.data = b2.data + alpha2 * delta2 * Z_b2
            # we need to keep track of the last board state visited by the players
            if (count > 0):
                xolder = xold
            if (not BG.game_over(board)):
                if (count < 2):
                    xold = Variable(
                        torch.tensor(ice_hot_encoding(board),
                                     dtype=torch.float,
                                     device=device)).view(7 * (n - 1) * 2, 1)
                else:
                    xold = x
            # undo the perspective flip before the other side moves
            if player == -1:
                board = flip_board(np.copy(board))
            # swap players
            player = -player
            count += 1
        # The game episode has ended and we know the outcome of the game,
        # and can find the terminal rewards
        reward = 1
        # update for the winner
        # these are basically the same updates as in the inner loop but for
        # the final after-states (x and xold)
        # and then for the neural network:
        win_target = feed_forward_w(xold)
        delta2 = reward + gamma * 0 - win_target.detach().cpu().numpy(
        )  # this is the usual TD error
        # using autograd and the constructed computational graph in pytorch
        # compute all gradients
        win_target.backward()
        # update the eligibility traces
        Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(gamma, lam_w, Z_w1, Z_b1,
                                                      Z_w2, Z_b2)
        # zero the gradients
        zero_gradients_critic()
        # perform now the update of weights
        delta2 = torch.tensor(delta2, dtype=torch.float, device=device)
        w1.data = w1.data + alpha1 * delta2 * Z_w1
        b1.data = b1.data + alpha1 * delta2 * Z_b1
        w2.data = w2.data + alpha2 * delta2 * Z_w2
        b2.data = b2.data + alpha2 * delta2 * Z_b2
        # update for the loser
        reward = -1
        # these are basically the same updates as in the inner loop but for
        # the final after-states (x and xold)
        # and then for the neural network:
        loser_target = feed_forward_w(x)  # squash the output
        delta2 = reward + gamma * 0 - loser_target.detach().cpu().numpy(
        )  # this is the usual TD error
        # using autograd and the constructed computational graph in pytorch
        # compute all gradients
        loser_target.backward()
        # update the eligibility traces
        Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(gamma, lam_w, Z_w1, Z_b1,
                                                      Z_w2, Z_b2)
        # zero the gradients
        zero_gradients_critic()
        # perform now the update of weights
        delta2 = torch.tensor(delta2, dtype=torch.float, device=device)
        w1.data = w1.data + alpha1 * delta2 * Z_w1
        b1.data = b1.data + alpha1 * delta2 * Z_b1
        w2.data = w2.data + alpha2 * delta2 * Z_w2
        b2.data = b2.data + alpha2 * delta2 * Z_b2
def learnit(numgames, lam_w, lam_th, alpha1, alpha2):
    """Train a critic network (w) and an actor network (theta) by self-play.

    Both sides pick moves with ``action`` and share one set of critic traces
    and one set of actor traces.  Player -1 sees the board through
    ``flip_board``.  Each in-game update compares the current encoding with
    the one from two plies earlier (``xolder``).  After the game, a +1
    update is applied for ``xold`` and a -1 update for ``x``.

    Fix: the actor traces are now decayed with ``lam_th``.  The original
    passed ``lam_w`` to every ``update_eligibility_th`` call, which left the
    ``lam_th`` parameter unused.

    NOTE(review): the nesting was reconstructed from a whitespace-collapsed
    source -- confirm it against the original layout.

    Args:
        numgames: Number of training games to play.
        lam_w: Passed to ``update_eligibility_w`` (critic traces).
        lam_th: Passed to ``update_eligibility_th`` (actor traces).
        alpha1: Step size for the first-layer weights of both networks.
        alpha2: Step size for the second-layer weights of both networks.
    """
    gamma = 1  # for completeness
    # play numgames games for training
    for games in range(0, numgames):
        I = 1
        board = BG.init_board()  # initialize the board
        player = np.random.randint(2) * 2 - 1  # which player begins?
        # initialize all the eligibility traces for the NN for critic w
        Z_w1 = torch.zeros(w1.size(), device=device, dtype=torch.float)
        Z_b1 = torch.zeros(b1.size(), device=device, dtype=torch.float)
        Z_w2 = torch.zeros(w2.size(), device=device, dtype=torch.float)
        Z_b2 = torch.zeros(b2.size(), device=device, dtype=torch.float)
        # initialize all the eligibility traces for the NN for actor theta
        Z_theta_1 = torch.zeros(theta_1.size(),
                                device=device,
                                dtype=torch.float)
        Z_thetab1 = torch.zeros(thetab1.size(),
                                device=device,
                                dtype=torch.float)
        Z_theta_2 = torch.zeros(theta_2.size(),
                                device=device,
                                dtype=torch.float)
        Z_thetab2 = torch.zeros(thetab2.size(),
                                device=device,
                                dtype=torch.float)
        if games % 100 == 0:
            print(games)
        count = 0
        delta = 0
        # play a game
        while not BG.game_over(board) and not BG.check_for_error(board):
            dice = BG.roll_dice()
            for i in range(1 + int(dice[0] == dice[1])):
                move, prob, index = action(np.copy(board), dice, player, i,
                                           True)
                if len(move) != 0:
                    for m in move:
                        board = BG.update_board(board, m, player)
                # if the player gets a double and wins the game in the first move.
                if BG.game_over(board):
                    break
            # check to see if the game is over
            if BG.game_over(board):
                break
            # player -1 evaluates the position from the flipped perspective
            if player == -1:
                board = flip_board(np.copy(board))
            # only update after the first two moves, because we are using
            # afterstates and both players have to make one move first.
            if (count > 1):
                # Ice-hot encoding of the board
                x = Variable(
                    torch.tensor(ice_hot_encoding(board),
                                 dtype=torch.float,
                                 device=device)).view(2 * (n - 1) * 7, 1)
                # Feed forward w-NN
                target = feed_forward_w(x)
                # Feed forward old state using w-NN
                old_target = feed_forward_w(xolder)
                delta = 0 + gamma * target.detach().cpu().numpy(
                ) - old_target.detach().cpu().numpy(
                )  # this is the usual TD error
                # using autograd and the constructed computational graph in
                # pytorch compute all gradients
                old_target.backward()
                # update the eligibility traces using the gradients
                delta = torch.tensor(delta, dtype=torch.float, device=device)
                Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(
                    gamma, lam_w, Z_w1, Z_b1, Z_w2, Z_b2)
                # zero the gradients
                zero_gradients_critic()
                # perform the update for the weights for the critic, w
                w1.data = w1.data + alpha1 * delta * Z_w1
                b1.data = b1.data + alpha1 * delta * Z_b1
                w2.data = w2.data + alpha2 * delta * Z_w2
                b2.data = b2.data + alpha2 * delta * Z_b2
                # Update theta
                logTarget = torch.log(prob)
                logTarget.backward(retain_graph=True)
                # update the actor traces using the gradients; decay with
                # the actor's own lambda
                Z_theta_2, Z_thetab2, Z_theta_1, Z_thetab1 = update_eligibility_th(
                    gamma, lam_th, Z_theta_1, Z_thetab1, Z_theta_2, Z_thetab2,
                    I)
                zero_gradients_actor()  # zero the gradients
                # perform the update for the weights for the actor, theta
                theta_1.data = theta_1.data + alpha1 * delta * Z_theta_1
                thetab1.data = thetab1.data + alpha1 * delta * Z_thetab1
                theta_2.data = theta_2.data + alpha2 * delta * Z_theta_2
                thetab2.data = thetab2.data + alpha2 * delta * Z_thetab2
                I = gamma * I
            # keep track of the last state the player was in
            if (count > 0):
                xolder = xold
            # keep track of the last state
            if (not BG.game_over(board)):
                if (count < 2):
                    xold = Variable(
                        torch.tensor(ice_hot_encoding(board),
                                     dtype=torch.float,
                                     device=device)).view(2 * (n - 1) * 7, 1)
                else:
                    xold = x
            # keep track of the old values from the NN to update the player
            # who lost
            probold = prob
            indexold = index
            # undo the perspective flip before the other side moves
            if player == -1:
                board = flip_board(np.copy(board))
            # swap players
            player = -player
            count += 1
        # The game episode has ended and we know the outcome of the game,
        # and can find the terminal rewards
        reward = 1
        # update for the winner
        # these are basically the same updates as in the inner loop but for
        # the final after-states (x and xold)
        # and then for the neural network:
        win_target = feed_forward_w(xold)
        delta = reward + gamma * 0 - win_target.detach().cpu().numpy(
        )  # this is the usual TD error
        delta = torch.tensor(delta, dtype=torch.float, device=device)
        # using autograd and the constructed computational graph in pytorch
        # compute all gradients
        win_target.backward()
        # update the eligibility traces
        Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(gamma, lam_w, Z_w1, Z_b1,
                                                      Z_w2, Z_b2)
        # zero the gradients
        zero_gradients_critic()
        # perform now the update of weights
        w1.data = w1.data + alpha1 * delta * Z_w1
        b1.data = b1.data + alpha1 * delta * Z_b1
        w2.data = w2.data + alpha2 * delta * Z_w2
        b2.data = b2.data + alpha2 * delta * Z_b2
        # Update theta; ``prob`` belongs to the move that ended the game
        logTarget = torch.log(prob)
        logTarget.backward()
        # update the actor traces using the gradients
        Z_theta_2, Z_thetab2, Z_theta_1, Z_thetab1 = update_eligibility_th(
            gamma, lam_th, Z_theta_1, Z_thetab1, Z_theta_2, Z_thetab2, I)
        # zero the gradients
        zero_gradients_actor()
        theta_1.data = theta_1.data + alpha1 * delta * Z_theta_1
        thetab1.data = thetab1.data + alpha1 * delta * Z_thetab1
        theta_2.data = theta_2.data + alpha2 * delta * Z_theta_2
        thetab2.data = thetab2.data + alpha2 * delta * Z_thetab2
        # update for the loser
        reward = -1
        # these are basically the same updates as in the inner loop but for
        # the final after-states (x and xold)
        # and then for the neural network:
        loser_target = feed_forward_w(x)  # squash the output
        delta = reward + gamma * 0 - loser_target.detach().cpu().numpy(
        )  # this is the usual TD error
        delta = torch.tensor(delta, dtype=torch.float, device=device)
        # using autograd and the constructed computational graph in pytorch
        # compute all gradients
        loser_target.backward()
        # update the eligibility traces
        Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(gamma, lam_w, Z_w1, Z_b1,
                                                      Z_w2, Z_b2)
        # zero the gradients
        zero_gradients_critic()
        # perform now the update of weights
        w1.data = w1.data + alpha1 * delta * Z_w1
        b1.data = b1.data + alpha1 * delta * Z_b1
        w2.data = w2.data + alpha2 * delta * Z_w2
        b2.data = b2.data + alpha2 * delta * Z_b2
        # Update theta; ``probold`` is from the turn before the final move
        logTarget = torch.log(probold)
        logTarget.backward()
        # update the actor traces using the gradients
        Z_theta_2, Z_thetab2, Z_theta_1, Z_thetab1 = update_eligibility_th(
            gamma, lam_th, Z_theta_1, Z_thetab1, Z_theta_2, Z_thetab2, I)
        # zero the gradients
        zero_gradients_actor()
        theta_1.data = theta_1.data + alpha1 * delta * Z_theta_1
        thetab1.data = thetab1.data + alpha1 * delta * Z_thetab1
        theta_2.data = theta_2.data + alpha2 * delta * Z_theta_2
        thetab2.data = thetab2.data + alpha2 * delta * Z_thetab2
def learnitDyna(numgames, epsilon, lam_w, alpha_w, gamma, numthink):
    """Train the three-layer critic (w1..b3) by self-play with Dyna-style replay.

    Both sides pick moves with ``action``; player -1 sees the board through
    ``flip_board`` and keeps its own eligibility traces (``*_flip``).  Every
    observed transition is appended to the model ``A`` as a row
    ``[[state], [move], [next_state] or 0, reward]``, where a next state of
    ``0`` marks a terminal transition.  After each in-game update (once
    ``count > 2``) ``numthink`` rows are sampled from ``A`` and replayed as
    one-step TD updates.

    Fixes in the planning loop:
      * the gradients are now zeroed after the weight update; they used to be
        zeroed first, so the update read the just-cleared ``.grad`` values
        (assumes ``zero_gradients_critic`` clears ``.grad`` -- confirm);
      * ``w3``/``b3`` are scaled by the planning error ``delta2``, not the
        stale in-game ``delta``;
      * the sampled row's move no longer overwrites ``move_temp``, which is
        stored in the model at the end of the game.

    NOTE(review): the nesting was reconstructed from a whitespace-collapsed
    source -- confirm it against the original layout.

    Args:
        numgames: Number of training games to play.
        epsilon: Forwarded to ``action``.
        lam_w: Passed to ``update_eligibility_w``.
        alpha_w: Step size for all critic weights.
        gamma: Discount factor used in the TD errors and trace updates.
        numthink: Number of planning updates per in-game step.
    """
    A = np.zeros(4)
    for games in range(0, numgames):
        board = BG.init_board()  # initialize the board
        player = np.random.randint(2) * 2 - 1  # which player begins?
        count = 0
        delta = 0
        # initialize all the eligibility traces for the neural network;
        # one set per side
        Z_w1 = torch.zeros(w1.size(), device=device, dtype=torch.float)
        Z_b1 = torch.zeros(b1.size(), device=device, dtype=torch.float)
        Z_w2 = torch.zeros(w2.size(), device=device, dtype=torch.float)
        Z_b2 = torch.zeros(b2.size(), device=device, dtype=torch.float)
        Z_w3 = torch.zeros(w3.size(), device=device, dtype=torch.float)
        Z_b3 = torch.zeros(b3.size(), device=device, dtype=torch.float)
        Z_w1_flip = torch.zeros(w1.size(), device=device, dtype=torch.float)
        Z_b1_flip = torch.zeros(b1.size(), device=device, dtype=torch.float)
        Z_w2_flip = torch.zeros(w2.size(), device=device, dtype=torch.float)
        Z_b2_flip = torch.zeros(b2.size(), device=device, dtype=torch.float)
        Z_w3_flip = torch.zeros(w3.size(), device=device, dtype=torch.float)
        Z_b3_flip = torch.zeros(b3.size(), device=device, dtype=torch.float)
        if games % 100 == 0:
            print(games)
        # play a game
        # NOTE(review): ``winner`` is only assigned on the game_over break
        # below; if the loop stops because check_for_error is true it is
        # unset (or stale from a previous game) -- confirm this cannot happen.
        while not BG.game_over(board) and not BG.check_for_error(board):
            dice = BG.roll_dice()
            for i in range(1 + int(dice[0] == dice[1])):
                move = action(np.copy(board), epsilon, dice, player, i)
                if len(move) != 0:
                    for m in move:
                        board = BG.update_board(board, m, player)
                # a double, and the game is won on the first move: break
                if BG.game_over(board):
                    break
            if BG.game_over(board):
                winner = player
                break
            # player -1 evaluates the position from the flipped perspective
            if player == -1:
                board = flip_board(np.copy(board))
            # both sides need one stored previous state before updating
            if (count > 1):
                # NOTE(review): the model rows below mix lists and scalars;
                # newer NumPy versions may need dtype=object for such arrays
                # -- confirm against the NumPy version in use.
                if player == -1:
                    # One-hot encoding of the board
                    move_fliptemp = move
                    x_fliptemp = ice_hot_encoding(board)
                    xflip = Variable(
                        torch.tensor(x_fliptemp,
                                     dtype=torch.float,
                                     device=device)).view(encSize, 1)
                    # Feed forward w-nn for old and new
                    target = feed_forward_w(xflip)
                    old_target = feed_forward_w(xflipold)
                    delta = 0 + gamma * target.detach().cpu().numpy(
                    ) - old_target.detach().cpu().numpy(
                    )  # this is the usual TD error
                    # using autograd and the constructed computational graph
                    # in pytorch compute all gradients
                    old_target.backward()
                    # update the eligibility traces using the gradients
                    Z_w3_flip, Z_b3_flip, Z_w2_flip, Z_b2_flip, Z_w1_flip, Z_b1_flip = update_eligibility_w(
                        gamma, lam_w, Z_w1_flip, Z_b1_flip, Z_w2_flip,
                        Z_b2_flip, Z_w3_flip, Z_b3_flip)
                    # zero the gradients
                    zero_gradients_critic()
                    # perform now the update for the weights
                    delta = torch.tensor(delta,
                                         dtype=torch.float,
                                         device=device)
                    w1.data = w1.data + alpha_w * delta * Z_w1_flip
                    b1.data = b1.data + alpha_w * delta * Z_b1_flip
                    w2.data = w2.data + alpha_w * delta * Z_w2_flip
                    b2.data = b2.data + alpha_w * delta * Z_b2_flip
                    w3.data = w3.data + alpha_w * delta * Z_w3_flip
                    b3.data = b3.data + alpha_w * delta * Z_b3_flip
                    # append to the model, for the first time we create A,
                    # else we just stack on it.
                    if count == 2 and games == 0:
                        A = np.array([[x_fliptempold], [move], [x_fliptemp],
                                      0])
                    else:
                        add_to_model = np.array([[x_fliptempold], [move],
                                                 [x_fliptemp], 0])
                        A = np.vstack((A, add_to_model))
                else:
                    # One-hot encoding of the board
                    move_temp = move
                    x_temp = ice_hot_encoding(board)
                    x = Variable(
                        torch.tensor(x_temp, dtype=torch.float,
                                     device=device)).view(encSize, 1)
                    # Feed forward w-nn for old and new
                    target = feed_forward_w(x)
                    old_target = feed_forward_w(xold)
                    delta = 0 + gamma * target.detach().cpu().numpy(
                    ) - old_target.detach().cpu().numpy(
                    )  # this is the usual TD error
                    # using autograd and the constructed computational graph
                    # in pytorch compute all gradients
                    old_target.backward()
                    # update the eligibility traces using the gradients
                    Z_w3, Z_b3, Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(
                        gamma, lam_w, Z_w1, Z_b1, Z_w2, Z_b2, Z_w3, Z_b3)
                    # zero the gradients
                    zero_gradients_critic()
                    # perform now the update for the weights
                    delta = torch.tensor(delta,
                                         dtype=torch.float,
                                         device=device)
                    w1.data = w1.data + alpha_w * delta * Z_w1
                    b1.data = b1.data + alpha_w * delta * Z_b1
                    w2.data = w2.data + alpha_w * delta * Z_w2
                    b2.data = b2.data + alpha_w * delta * Z_b2
                    w3.data = w3.data + alpha_w * delta * Z_w3
                    b3.data = b3.data + alpha_w * delta * Z_b3
                    # append to the model, for the first time we create A,
                    # else we just stack on it.
                    if count == 2 and games == 0:
                        A = np.array([[x_tempold], [move], [x_temp], 0])
                    else:
                        add_to_model = np.array([[x_tempold], [move],
                                                 [x_temp], 0])
                        A = np.vstack((A, add_to_model))
                # planning: replay transitions sampled from the model
                if count > 2:
                    for thought in range(0, numthink):
                        state_indx = np.random.choice(A.shape[0])
                        # the sampled move is not needed; do not clobber
                        # move_temp, which is stored at the end of the game
                        state, _sampled_move, statenew, rewardtemp = A[
                            state_indx]
                        if statenew == 0:
                            # terminal transition: no bootstrap term
                            # Feed forward old state
                            state = Variable(
                                torch.tensor(state,
                                             dtype=torch.float,
                                             device=device)).view(encSize, 1)
                            old_target1 = feed_forward_w(state)
                            delta2 = rewardtemp + 0 - old_target1.detach().cpu(
                            ).numpy()
                        else:
                            state = Variable(
                                torch.tensor(state,
                                             dtype=torch.float,
                                             device=device)).view(encSize, 1)
                            statenew = Variable(
                                torch.tensor(statenew,
                                             dtype=torch.float,
                                             device=device)).view(encSize, 1)
                            # Feed forward w-nn
                            target1 = feed_forward_w(statenew)
                            # Feed forward old state
                            old_target1 = feed_forward_w(state)
                            delta2 = 0 + gamma * target1.detach().cpu().numpy(
                            ) - old_target1.detach().cpu().numpy(
                            )  # this is the usual TD error
                        # using autograd and the constructed computational
                        # graph in pytorch compute all gradients
                        old_target1.backward()
                        # perform now the update for the weights, using the
                        # fresh gradients and the planning error delta2
                        delta2 = torch.tensor(delta2,
                                              dtype=torch.float,
                                              device=device)
                        w1.data = w1.data + alpha_w * delta2 * w1.grad.data
                        b1.data = b1.data + alpha_w * delta2 * b1.grad.data
                        w2.data = w2.data + alpha_w * delta2 * w2.grad.data
                        b2.data = b2.data + alpha_w * delta2 * b2.grad.data
                        w3.data = w3.data + alpha_w * delta2 * w3.grad.data
                        b3.data = b3.data + alpha_w * delta2 * b3.grad.data
                        # zero the gradients only after they have been used
                        zero_gradients_critic()
            # keep track of the last board state visited by each player
            if (count < 2):
                if player == -1:
                    x_fliptempold = ice_hot_encoding(board)
                    xflipold = Variable(
                        torch.tensor(ice_hot_encoding(board),
                                     dtype=torch.float,
                                     device=device)).view(encSize, 1)
                else:
                    x_tempold = ice_hot_encoding(board)
                    xold = Variable(
                        torch.tensor(ice_hot_encoding(board),
                                     dtype=torch.float,
                                     device=device)).view(encSize, 1)
            else:
                if player == -1:
                    x_fliptempold = x_fliptemp
                    xflipold = Variable(
                        torch.tensor(xflip, dtype=torch.float,
                                     device=device)).view(encSize, 1)
                else:
                    x_tempold = x_temp
                    xold = Variable(
                        torch.tensor(x, dtype=torch.float,
                                     device=device)).view(encSize, 1)
            # undo the perspective flip before the other side moves
            if player == -1:
                board = flip_board(np.copy(board))
            # swap players
            player = -player
            count += 1
        # the winner's stored move is the one that ended the game
        if winner == 1:
            reward = 1
            reward_flip = 0
            move_temp = move
        else:
            reward = 0
            reward_flip = 1
            move_fliptemp = move
        # update for player 1
        # Feed forward old state using w-NN
        old_target = feed_forward_w(xold)
        delta = reward + 0 - old_target.detach().cpu().numpy(
        )  # this is the usual TD error
        # using autograd and the constructed computational graph in pytorch
        # compute all gradients
        old_target.backward()
        # update the eligibility traces using the gradients
        delta = torch.tensor(delta, dtype=torch.float, device=device)
        Z_w3, Z_b3, Z_w2, Z_b2, Z_w1, Z_b1 = update_eligibility_w(
            gamma, lam_w, Z_w1, Z_b1, Z_w2, Z_b2, Z_w3, Z_b3)
        # zero the gradients
        zero_gradients_critic()
        # perform the update for the weights for the critic, w
        w1.data = w1.data + alpha_w * delta * Z_w1
        b1.data = b1.data + alpha_w * delta * Z_b1
        w2.data = w2.data + alpha_w * delta * Z_w2
        b2.data = b2.data + alpha_w * delta * Z_b2
        w3.data = w3.data + alpha_w * delta * Z_w3
        b3.data = b3.data + alpha_w * delta * Z_b3
        # store player 1's terminal transition (next state 0 marks terminal)
        add_to_model = np.array([[x_tempold], [move_temp], 0, reward])
        A = np.vstack((A, add_to_model))
        # update for the flipped player
        # Feed forward old state using w-NN
        flip_target = feed_forward_w(xflipold)
        delta = reward_flip + 0 - flip_target.detach().cpu().numpy(
        )  # this is the usual TD error
        # using autograd and the constructed computational graph in pytorch
        # compute all gradients
        flip_target.backward()
        # update the eligibility traces using the gradients
        delta = torch.tensor(delta, dtype=torch.float, device=device)
        Z_w3_flip, Z_b3_flip, Z_w2_flip, Z_b2_flip, Z_w1_flip, Z_b1_flip = update_eligibility_w(
            gamma, lam_w, Z_w1_flip, Z_b1_flip, Z_w2_flip, Z_b2_flip,
            Z_w3_flip, Z_b3_flip)
        # zero the gradients
        zero_gradients_critic()
        # perform the update for the weights for the critic, w
        w1.data = w1.data + alpha_w * delta * Z_w1_flip
        b1.data = b1.data + alpha_w * delta * Z_b1_flip
        w2.data = w2.data + alpha_w * delta * Z_w2_flip
        b2.data = b2.data + alpha_w * delta * Z_b2_flip
        w3.data = w3.data + alpha_w * delta * Z_w3_flip
        b3.data = b3.data + alpha_w * delta * Z_b3_flip
        # store the flipped player's terminal transition
        add_to_model = np.array([[x_fliptempold], [move_fliptemp], 0,
                                 reward_flip])
        A = np.vstack((A, add_to_model))