def action(board_copy, dice, player, i):
    # the champion to be
    # inputs are the board, the dice and which player is to move
    # outputs the chosen move according to its policy

    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board_copy, dice, player)

    # if there are no moves available
    if len(possible_moves) == 0:
        return []

    # make the best move according to the policy
    # note: the trained weights are reloaded from disk on every call
    epsilon = 0.1
    w1 = torch.load('./w1_trained.pth', map_location=lambda storage, loc: storage)
    w2 = torch.load('./w2_trained.pth', map_location=lambda storage, loc: storage)
    b1 = torch.load('./b1_trained.pth', map_location=lambda storage, loc: storage)
    b2 = torch.load('./b2_trained.pth', map_location=lambda storage, loc: storage)

    move = neural_network_agent.epsilon_nn_greedy(board_copy, dice, player, epsilon,
                                                  w1, b1, w2, b2,
                                                  possible_moves, possible_boards, False)
    return move
def action(board, dice, oplayer, i=0):
    flippedplayer = -1
    if flippedplayer == oplayer:  # view it from player 1's perspective
        board = flipped_agent.flip_board(np.copy(board))
        player = -oplayer  # player is now the other player, +1
    else:
        player = oplayer

    possible_moves, possible_boards = Backgammon.legal_moves(board, dice, player)
    na = len(possible_boards)
    if na == 0:
        return []

    xa = np.zeros((na, nx + 1))
    for j in range(na):
        xa[j, :] = one_hot_encoding(possible_boards[j], i)
    x = Variable(torch.tensor(xa.transpose(), dtype=torch.float, device=device))

    # now do a forward pass to evaluate each board's after-state value
    h = torch.mm(w1, x) + b1          # matrix-multiply x with input weights w1 and add bias
    h_sigmoid = h.sigmoid()           # squash this with a sigmoid function
    y = torch.mm(w2, h_sigmoid) + b2  # multiply with the output weights w2 and add bias
    va = y.sigmoid().detach().cpu()

    action = possible_moves[np.argmax(va)]
    if flippedplayer == oplayer:  # map this move back to the original view
        action = flipped_agent.flip_move(action)
    return action
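# A minimal usage sketch for the agent above, assuming the course's Backgammon
# module (init_board and roll_dice are also used further down in this file) and
# that w1, b1, w2, b2, nx and device are already defined at module level:
if __name__ == "__main__":
    board = Backgammon.init_board()
    dice = Backgammon.roll_dice()
    move = action(board, dice, oplayer=1)
    print("chosen move:", move)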
def action(board_copy, dice, player, i):
    # the champion to be
    # inputs are the board, the dice and which player is to move
    # outputs the chosen move according to its policy

    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board_copy, dice, player)

    # if there are no moves available
    if len(possible_moves) == 0:
        return []

    # make the best move according to the policy
    # (epsilon, w1, b1, w2, b2 and debug are assumed to be module-level globals)
    move = epsilon_nn_greedy(board_copy, player, epsilon, w1, b1, w2, b2, debug)
    return move
def action(board_copy, dice, player, i, model):
    global actionCount

    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board_copy, dice, player)

    # if there are no moves available
    if len(possible_moves) == 0:
        return []

    after_state, action = epsilon_nn_greedy(board_copy, possible_moves,
                                            possible_boards, player, model)

    if actionCount > 0:
        model.updateNeural(after_state)
    if actionCount > 1:
        model.dynaUpdate()
    actionCount += 1

    model.xold = Variable(
        torch.tensor(one_hot_encoding(after_state),
                     dtype=torch.float, device=model.device)).view((28 * 31, 1))
    return action
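# The one_hot_encoding helper is assumed rather than shown. Given the
# view((28 * 31, 1)) above, one hypothetical encoding consistent with that
# shape is a 31-way one-hot of the signed checker count (-15..15) on each of
# the 28 used board positions:
import numpy as np

def one_hot_encoding_sketch(board):
    # board: length-29 array, index 0 unused, signed checker counts in -15..15
    enc = np.zeros((28, 31))
    for pos in range(1, 29):
        enc[pos - 1, int(board[pos]) + 15] = 1  # shift counts into 0..30
    return enc.flatten()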
def action(board_copy, dice, player, i, model):
    global actionCount

    # start by flipping the board so that the player always sees himself as player 1
    if player == -1:
        board_copy = flip_board(board_copy)

    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board_copy, dice, player=1)

    # if there are no moves available, return an empty move
    if len(possible_moves) == 0:
        return []

    # make the best move
    after_state, action = epsilon_nn_greedy(board_copy, possible_moves,
                                            possible_boards, player, model)

    if actionCount > 0:
        model.updateNeural(after_state)
    if actionCount > 1:
        model.dynaUpdate()
    actionCount += 1

    model.xold = Variable(
        torch.tensor(one_hot_encoding(after_state),
                     dtype=torch.float, device=model.device)).view((28 * 31, 1))

    # if the board was flipped the move has to be flipped as well
    move = flip_move(action) if player == -1 else action
    return move
def action(board_copy, epsilon, dice, player, i):
    if player == -1:
        board_copy = flip_board(board_copy)

    possible_moves, possible_boards = BG.legal_moves(board_copy, dice, player=1)
    na = len(possible_moves)
    va = np.zeros(na)

    # if there are no moves available
    if na == 0:
        return []

    # with probability epsilon, explore with a random move
    if np.random.uniform() < epsilon:
        move = possible_moves[randrange(na)]
        if player == -1:
            move = flip_move(move)
        return move

    for j, board in enumerate(possible_boards):
        # encode the board to create the input
        x = Variable(
            torch.tensor(ice_hot_encoding(board),
                         dtype=torch.float, device=device)).view(encSize, 1)
        # now do a forward pass to evaluate the board's after-state value
        va[j] = feed_forward_w(x)

    move = possible_moves[np.argmax(va)]
    if player == -1:
        move = flip_move(move)
    return move
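# The ice_hot_encoding helper (encSize = 2 * (n - 1) * 7 inputs elsewhere in
# this file) is assumed rather than shown. A hypothetical version with one
# 7-unit thermometer group per position per player (units for 1-6 checkers
# plus a scaled overflow unit):
import numpy as np

def ice_hot_encoding_sketch(board, n=25):
    enc = np.zeros(2 * (n - 1) * 7)
    for pos in range(1, n):
        for sign, offset in ((1, 0), (-1, (n - 1) * 7)):  # player +1, player -1
            count = sign * board[pos]
            if count > 0:
                base = offset + (pos - 1) * 7
                k = min(int(count), 6)
                enc[base:base + k] = 1                    # thermometer units
                if count > 6:
                    enc[base + 6] = (count - 6) / 2.0     # overflow unit
    return enc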
def action(net, board_copy, dice, player, i):
    # the champion to be
    # inputs are the board, the dice and which player is to move
    # outputs the chosen move according to its policy
    if player == -1:
        board_copy = flip_board(board_copy)  # flip the board

    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board_copy, dice, player=1)

    # if there are no moves available
    if len(possible_moves) == 0:
        return []

    # create new features using Tesauro's encoding
    feature_boards = []
    for b in possible_boards:
        feature_boards.append(oneHot(b))

    # get the probability of each action via the actor's forward pass
    probs, log_probs = net.actor.forward(feature_boards)
    # index is a helper array used to pick the action
    index = np.arange(0, len(possible_boards))
    # numpy cannot sample with a torch tensor as the distribution
    probs = probs.detach().numpy()
    # the index of the chosen action
    idx = choice(index, p=probs)
    move = possible_moves[idx]       # pick the next move according to the selected index
    newBoard = possible_boards[idx]  # pick the next board according to the selected index
    newBoardFeatures = oneHot(newBoard)

    # critic feedforward
    target, oldtarget = net.critic.forward(newBoardFeatures, oneHot(board_copy))
    R = 0
    if Backgammon.game_over(newBoard):  # did I win? if so the reward is +1
        R = 1
        target = 0  # the terminal state has value 0

    # now we update the neural network
    delta = R + net.gamma * target - oldtarget
    # update the critic via backpropagation
    net.critic.backward(R, delta, net.gamma)
    # update the actor via backpropagation
    net.actor.backward(log_probs[idx], delta, net.gamma)

    if player == -1:
        move = flip_move(move)  # flip the move
    return move
def greedy_action(self, board, dice, player, i):
    if player == -1:
        board = flip_board(board)

    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board, dice, player=1)

    # if there are no moves available, return an empty move
    if len(possible_moves) == 0:
        return []

    na = len(possible_boards)
    enc = np.zeros((na, 312))
    for j in range(na):
        enc[j, :] = oneHot(possible_boards[j])

    x = Variable(torch.tensor(enc.transpose(), dtype=torch.double, device=self.device))
    h = torch.mm(self.w1, x) + self.b1
    h_sigmoid = h.sigmoid()
    y = torch.mm(self.W, h_sigmoid) + self.B
    va = y.sigmoid().detach().cpu()

    action = possible_moves[np.argmax(va)]
    if player == -1:
        action = flip_move(action)
    return action
def action(board_copy, dice, player, i):
    # the champion to be
    # inputs are the board, the dice and which player is to move
    # outputs the chosen move according to its policy

    # start by flipping the board so that the player always sees himself as player 1
    if player == -1:
        board_copy = flip_board(board_copy)

    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board_copy, dice, player=1)

    # if there are no moves available, return an empty move
    if len(possible_moves) == 0:
        return []

    # make the best move:
    # policy missing, returns a random move for the time being
    move = possible_moves[np.random.randint(len(possible_moves))]

    # if the board was flipped the move has to be flipped as well
    if player == -1:
        move = flip_move(move)
    return move
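# flip_board and flip_move come from the course's flipped_agent module. A
# sketch of what they typically do, assuming the 29-entry board representation
# (points 1-24, bar at 25/26, borne-off at 27/28): reverse the point order,
# swap the bar/off slots, and negate the signs so each player sees himself as +1.
import numpy as np

flip_idx = np.array([0, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13,
                     12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 26, 25, 28, 27])

def flip_board_sketch(board):
    return -np.copy(board)[flip_idx]

def flip_move_sketch(move):
    return [[flip_idx[m[0]], flip_idx[m[1]]] for m in move]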
def legal_moves(self, dice, player):
    moves, boards = B.legal_moves(board=self.board, dice=dice, player=player)
    if len(boards) == 0:
        return [], []
    boards = np.vstack(boards)
    return moves, boards
def legal_moves(self, board, dice, player):
    if player == -1:
        board = FA.flip_board(np.copy(board))
    moves, boards = B.legal_moves(board=board, dice=dice, player=1)
    if len(boards) == 0:
        return [], []
    boards = np.vstack(boards)
    return moves, boards
def e_legal_moves(board, dice, player=1):
    moves, boards = B.legal_moves(board, dice=dice, player=player)
    if len(boards) == 0:
        return [], features(board, player)
    n_boards = np.shape(boards)[0]
    tesauro = np.zeros((n_boards, 198))
    for b in range(n_boards):
        tesauro[b, :] = features(boards[b], player)
    return moves, tesauro
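# The features helper above is assumed to produce Tesauro's classic 198-unit
# TD-Gammon encoding: 4 units per point per player (96 + 96), 2 units for the
# bar counts, 2 for the borne-off counts, and 2 for whose turn it is. A sketch,
# assuming the 29-entry board representation used elsewhere in this file and
# that negative counts belong to player -1:
import numpy as np

def tesauro_features_sketch(board, player):
    f = np.zeros(198)
    for pos in range(1, 25):
        for sign, offset in ((1, 0), (-1, 96)):
            count = sign * board[pos]
            base = offset + (pos - 1) * 4
            if count > 0:
                f[base] = 1                      # at least one checker (blot)
            if count > 1:
                f[base + 1] = 1                  # made point
            if count > 2:
                f[base + 2] = 1                  # three checkers
            if count > 3:
                f[base + 3] = (count - 3) / 2.0  # scaled extras
    f[192] = board[25] / 2.0                     # player +1 on the bar
    f[193] = -board[26] / 2.0                    # player -1 on the bar
    f[194] = board[27] / 15.0                    # player +1 borne off
    f[195] = -board[28] / 15.0                   # player -1 borne off
    f[196] = 1 if player == 1 else 0             # turn indicator
    f[197] = 1 if player == -1 else 0
    return f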
def ExamplePolicy(self):
    _, st = B.legal_moves(B.init_board(), B.roll_dice(), 1)
    st = np.vstack(st)
    st = st[:, 1:]
    out = np.round(
        self._s.run(self._actor_policy, {self._possible_states: st}) * 100) / 100
    out = out.flatten()
    out.sort()
    return out[::-1]  # action probabilities in descending order
def action(net, board_copy, dice, player, i, learn=True):
    # the champion to be
    # inputs are the board, the dice and which player is to move
    # outputs the chosen move according to its policy
    if player == -1:
        board_copy = flip_board(board_copy)  # flip the board

    # keep separate traces for each player's view
    if player == 1:
        xold = net.xold
        net.xnew = board_copy
    else:
        xold = net.xFlipOld
        net.xFlipNew = board_copy

    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board_copy, dice, player=1)

    # if there are no moves available
    if len(possible_moves) == 0:
        return []

    one_hot = []
    for b in possible_boards:
        one_hot.append(oneHot(b))

    if learn and not net.firstMove:
        net.update(player)

    m, xtheta = net.actor(one_hot, possible_moves)
    if player == 1:
        net.xtheta = xtheta
    else:
        net.flipxtheta = xtheta

    move = possible_moves[m]
    newBoard = possible_boards[m]

    if player == -1:
        move = flip_move(move)  # flip the move

    if player == 1:
        net.xold = board_copy
    else:
        net.xFlipOld = board_copy
    net.firstMove = False
    return move
def action(board_copy, dice, player, i, learning=False):
    if player == -1:
        board_copy = flip_board(board_copy)

    # get every possible move and board
    xtheta_mean = torch.zeros((len(theta), 1))
    possible_moves, possible_boards = BG.legal_moves(board_copy, dice, player=1)
    na = len(possible_moves)
    one_hot_boards = np.zeros((2 * (n - 1) * 7, na))

    # if there are no moves available
    if na == 0:
        x = Variable(
            torch.tensor(ice_hot_encoding(board_copy),
                         dtype=torch.float, device=device)).view(2 * (n - 1) * 7, 1)
        h_sigmoid = feed_forward_th(x)
        pi = torch.mm(theta, h_sigmoid).softmax(0)
        xtheta_mean = h_sigmoid * pi.item()
        if learning:
            return [], xtheta_mean
        return []

    for j, board in enumerate(possible_boards):
        # encode the board to create the input for the NN
        x = Variable(
            torch.tensor(ice_hot_encoding(board),
                         dtype=torch.float, device=device)).view(2 * (n - 1) * 7, 1)
        one_hot_boards[:, j] = x[:, 0]

    # select the move from the policy distribution
    X = Variable(torch.tensor(one_hot_boards, dtype=torch.float, device=device))
    h = feed_forward_th(X)
    h_sigmoid = h.sigmoid()
    pi = torch.mm(theta, h_sigmoid).softmax(1)
    xtheta_mean = torch.sum(torch.mm(h_sigmoid, torch.diagflat(pi)), 1)
    xtheta_mean = torch.unsqueeze(xtheta_mean, 1)
    move_index = torch.multinomial(pi, num_samples=1)
    move = possible_moves[move_index]

    if player == -1:
        move = flip_move(move)

    if learning:
        return move, xtheta_mean
    return move
def action(board_copy, dice, player, i):
    # the champion to be
    # inputs are the board, the dice and which player is to move
    # outputs the chosen move according to its policy
    move = []

    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board_copy, dice, player)

    # make the best move according to the policy
    if len(possible_moves) != 0:
        move = policy(possible_moves, possible_boards, dice, i)
    return move
def action(board_copy, dice, player, i):
    if player == -1:
        board_copy = FA.flip_board(np.copy(board_copy))

    possible_moves, possible_boards = B.legal_moves(board_copy, dice, 1)
    if len(possible_moves) == 0:
        return []

    action = AgentJ.sample_action(np.vstack(possible_boards))
    move = possible_moves[action]
    if player == -1:
        move = FA.flip_move(move)
    return move
def nextMove(self, board, dice, player, actor_theta):
    possible_moves, possible_boards = Backgammon.legal_moves(board, dice, player)
    if len(possible_moves) == 0:
        return [], []

    board_vals = np.zeros(len(possible_boards))
    for k in range(len(possible_boards)):
        board_vals[k] = self.getValue(possible_boards[k], actor_theta, player)

    # sample a move from the softmax distribution over board values
    pi_vals = softmax(board_vals)
    index = np.arange(0, len(possible_boards))
    i = choice(index, p=pi_vals)
    move = possible_moves[i]
    newBoard = possible_boards[i]
    return move, newBoard
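# The softmax helper used by nextMove is assumed; a minimal sketch of the
# standard numerically stable version over a vector of board values:
import numpy as np

def softmax_sketch(x):
    e = np.exp(x - np.max(x))  # subtract the max for numerical stability
    return e / e.sum()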
def epsilon_nn_greedy(board, player, epsilon, w1, b1, w2, b2, debug=False):
    moves = Backgammon.legal_moves(board)
    if np.random.uniform() < epsilon:
        if debug is True:
            print("explorative move")
        return moves[np.random.randint(len(moves))]

    na = len(moves)
    va = np.zeros(na)
    for i in range(na):
        board[moves[i]] = player
        # encode the board to create the input
        # (the feature encoding and forward pass were left unfinished here;
        #  the other agents in this file compute va[i] = y.sigmoid() from it)
    return moves[np.argmax(va)]
def action(net, board_copy, dice, player, i):
    if player == -1:
        board_copy = flipped_agent.flip_board(board_copy)  # flip the board

    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board_copy, dice, player=1)
    if len(possible_moves) == 0:
        return []

    # policy missing: returns an empty move for the time being
    move = []
    if player == -1:
        move = flipped_agent.flip_move(move)  # flip the move
    return move
def action(board_copy, dice, player, i):
    global count
    # the champion to be
    # inputs are the board, the dice and which player is to move
    # outputs the chosen move according to its policy

    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board_copy, dice, player)

    # if there are no moves available
    if len(possible_moves) == 0:
        return []

    # make the best move according to the policy
    na = len(possible_moves)
    va = np.zeros(na)
    for j in range(na):
        board = possible_boards[j]
        # encode the board to create the input
        x = Variable(
            torch.tensor(one_hot_encoding(board),
                         dtype=torch.float, device=device)).view(29, 31)
        # now do a forward pass to evaluate the board's after-state value
        h = torch.mm(w1, x) + b1          # matrix-multiply x with input weights w1 and add bias
        h_sigmoid = h.sigmoid()           # squash this with a sigmoid function
        y = torch.mm(w2, h_sigmoid) + b2  # multiply with the hidden weights w2 and add bias
        y_sigmoid = y.sigmoid()
        z = torch.mm(y_sigmoid, w3) + b3  # output layer
        va[j] = z.sigmoid()

    count += 1
    best_board = possible_boards[np.argmax(va)]
    if not Backgammon.game_over(best_board):
        update(best_board)
    else:
        reward = 1 if player == 1 else 0
        update(best_board, reward)
    return possible_moves[np.argmax(va)]
def action(board_copy, dice, player, i, y_old, model, firstMove, training):
    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board_copy, dice, player)

    # if there are no moves available
    if len(possible_moves) == 0:
        return [], y_old

    boards = []
    for board in possible_boards:
        boards.append(getinputboard(board))

    if not firstMove and training:
        # learn from the previous prediction
        learn(y_old, model, boards, "")

    # take the greedy action according to the model
    action, y_new = greedy(boards, model)
    move = possible_moves[action]  # make the best move according to the policy
    return move, y_new
def action(board, dice, oplayer, nRoll=0):
    flipped_player = -1
    if flipped_player == oplayer:  # view it from player 1's perspective
        board = flipped_agent.flip_board(np.copy(board))
        player = -flipped_player
    else:
        player = oplayer

    # check out the legal moves available for the throw
    race = c_int(israce(board))
    possible_moves, possible_boards = Backgammon.legal_moves(board, dice, player)
    na = len(possible_moves)
    va = np.zeros(na)
    if na == 0:
        return []

    for j in range(na):
        b = pubeval_flip(possible_boards[j])
        b = b.astype(dtype=ctypes.c_int)
        va[j] = lib.pubeval(race, b.ctypes.data_as(intp))

    action = possible_moves[np.argmax(va)]
    if flipped_player == oplayer:  # map this move back to the original view
        action = flipped_agent.flip_move(action)
    return action
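# The pubeval call above assumes Tesauro's pubeval benchmark compiled to a
# shared library and loaded via ctypes. A minimal loading sketch; the .so
# filename and the israce/pubeval_flip helpers are assumptions taken from the
# calling code:
import ctypes
from ctypes import c_int

lib = ctypes.CDLL('./pubeval.so')     # compiled from Tesauro's pubeval.c
intp = ctypes.POINTER(ctypes.c_int)   # pointer type passed for the board array
lib.pubeval.restype = ctypes.c_float  # pubeval(race, pos[]) returns a float score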
def action(board, dice, oplayer, i=0):
    flippedplayer = -1
    if flippedplayer == oplayer:  # view it from player 1's perspective
        board = flipped_agent.flip_board(np.copy(board))
        player = -oplayer  # player is now the other player, +1
    else:
        player = oplayer

    possible_moves, possible_boards = Backgammon.legal_moves(board, dice, player)

    # if there are no moves available
    if len(possible_moves) == 0:
        return []

    after_state, action = epsilon_nn_greedy(board, possible_moves,
                                            possible_boards, player)

    if flippedplayer == oplayer:  # map this move back to the original view
        action = flipped_agent.flip_move(action)
    return action
def softMax(self, board, dice, player, i):
    if player == -1:
        board = flip_board(board)

    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board, dice, player=1)

    # if there are no moves available, return an empty move
    if len(possible_moves) == 0:
        return []

    na = len(possible_boards)
    enc = np.zeros((na, 312))
    for j in range(na):
        enc[j, :] = oneHot(possible_boards[j])

    x = Variable(torch.tensor(enc.transpose(), dtype=torch.double, device=self.device))
    h = torch.mm(self.w1, x) + self.b1
    h_sigmoid = h.sigmoid()
    pi = torch.mm(self.theta, h_sigmoid).softmax(1)
    xtheta_mean = torch.sum(torch.mm(h_sigmoid, torch.diagflat(pi)), 1)
    xtheta_mean = torch.unsqueeze(xtheta_mean, 1)

    # keep separate copies for each player's view
    if player == 1:
        self.xtheta = xtheta_mean
    else:
        self.xthetaF = xtheta_mean
        self.xtheta = xtheta_mean

    m = torch.multinomial(pi, 1)
    action = possible_moves[m]
    if player == -1:
        action = flip_move(action)
    return action
def nextMove(self, board, dice, player, search_theta):
    if player == -1:
        board = flip_board(board)

    possible_moves, possible_boards = Backgammon.legal_moves(board, dice, player=1)
    if len(possible_moves) == 0:
        return [], []

    board_vals = np.zeros(len(possible_boards))
    for k in range(len(possible_boards)):
        board_vals[k] = self.getValue(possible_boards[k], search_theta)

    # break ties between equally valued boards at random
    idx = np.where(board_vals == max(board_vals))
    if len(idx[0]) > 1:
        idx = choice(idx[0])
    else:
        idx = idx[0][0]

    move = possible_moves[idx]       # pick the next move according to the index selected
    newBoard = possible_boards[idx]  # pick the next board according to the index selected

    if player == -1:
        move = flip_move(move)
    return move, newBoard
def action(board_copy, dice, player, i, net=None):
    # the champion to be
    # inputs are the board, the dice and which player is to move
    # outputs the chosen move according to its policy

    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board_copy, dice, player)

    # if there are no moves available
    if len(possible_moves) == 0:
        return []

    ret_arr, softmax_deriv = softmax(possible_moves, possible_boards,
                                     board_copy, player, net)
    s_prime = ret_arr[0]

    # one-step actor-critic update: delta = r + gamma * v(s') - v(s), with r = 0
    delta = 0 + net.gamma * net.torch_nn.forward(getFeatures(s_prime, player)) \
        - net.torch_nn.forward(getFeatures(board_copy, player))
    net.torch_nn.backward(net.gamma, delta)

    # policy weights are kept in a separate parameter set (theta)
    net.torch_nn_policy.theta = net.torch_nn_policy.theta \
        + net.torch_nn_policy.alpha_theta * net.i * delta * softmax_deriv
    net.i = net.gamma * net.i
    return ret_arr[1]
def epsilon_nn_greedy(board, dice, player, epsilon, w1, b1, w2, b2, debug=False):
    possible_moves, possible_boards = Backgammon.legal_moves(board, dice, player)

    # with probability epsilon, explore with a random move
    if np.random.uniform() < epsilon:
        if debug:
            print("explorative move")
        return possible_moves[np.random.randint(len(possible_moves))]

    na = len(possible_boards)
    va = np.zeros(na)
    for i in range(na):
        # encode the board to create the input
        x = Variable(
            torch.tensor(one_hot_encoding(possible_boards[i]),
                         dtype=torch.float, device=device)).view(28 * 2 * 6, 1)
        p, h = actor_policy_forward(x, w1, b1, w2, b2)
        # keep this information for the policy-gradient backpropagation
        # (xs, hs and logProp are assumed to be module-level lists)
        xs.append(x)  # inputs
        hs.append(h)  # hidden states
        # now do a forward pass to evaluate the board's after-state value
        h = torch.mm(w1, x) + b1          # matrix-multiply x with input weights w1 and add bias
        h_sigmoid = h.sigmoid()           # squash this with a sigmoid function
        y = torch.mm(w2, h_sigmoid) + b2  # multiply with the output weights w2 and add bias
        logProp.append(y - p)
        va[i] = y.sigmoid()
    return possible_moves[np.argmax(va)]
def action(board_copy, dice, player, i, train=False, train_config=None):
    """
    Inputs are the board, the dice and which player is to move;
    outputs the chosen move according to its policy.
    """
    # global variables
    global counter
    global bearing_off_counter

    # start by flipping the board so that the player always sees himself as player 1
    if player == -1:
        board_copy = flip_board(board_copy)

    # check out the legal moves available for the throw
    possible_moves, possible_boards = Backgammon.legal_moves(board_copy, dice, player=1)

    # if there are no moves available, return an empty move
    if len(possible_moves) == 0:
        return []

    # use a separate network and replay buffer for the bearing-off phase
    if not bearing_off(board_copy):
        model = DQN
        buffer = D
    else:
        model = DQN_bearing_off
        buffer = D_bearing_off
        bearing_off_counter += 1

    # current state and Q value, possible next states
    S = np.array([board_2_state(board_copy, i == 2)])
    Q = model(S)
    first_of_2 = 1 + (dice[0] == dice[1]) - i
    S_primes = np.array([board_2_state(b, first_of_2) for b in possible_boards])

    # find the best action and its Q-value, epsilon-greedy when training
    Q_primes = model(S_primes)  # TODO: only evaluate unique boards
    action = np.argmax(Q_primes)
    if train and np.random.rand() < config.eps:
        action = np.random.randint(len(possible_moves))
    # TODO: Fix the 16-piece bug (1 hour)

    if train:
        # next state
        S_prime = np.array([board_2_state(possible_boards[action], first_of_2)])

        # target update, again split by game phase
        if not bearing_off(possible_boards[action]):
            target_model = DQN_target
        else:
            target_model = DQN_bearing_off_target
        Q_max = target_model(S_prime)
        r = game_won(possible_boards[action])
        target = Q + config.lr * (r + config.gamma * Q_max - Q)
        buffer.push(S, None, r, S_prime, target, done=True)

        # update the target network every C steps
        if counter % config.C == 0:
            target_model.set_weights(model.get_weights())

        # train the models from the replay buffers
        if counter % config.batch_size == 0 and bearing_off_counter > config.batch_size:
            state_batch, action_batch, reward_batch, next_state_batch, target_batch, done_batch = \
                D.sample(config.batch_size)
            DQN.train_on_batch(np.array(state_batch), np.array(target_batch))
            state_batch, action_batch, reward_batch, next_state_batch, target_batch, done_batch = \
                D_bearing_off.sample(config.batch_size)
            DQN_bearing_off.train_on_batch(np.array(state_batch), np.array(target_batch))

        # save both networks every 10_000_000 training moves
        if counter % 10_000_000 == 0 and counter not in saved_models and counter != 0:
            filepath = "./kotra_weights/DQN_" + str(counter)
            print("saving weights in file: " + filepath)
            DQN.save(filepath, overwrite=True, include_optimizer=True)
            filepath += "bearing_off"
            print("saving bearing-off weights in file: " + filepath)
            DQN_bearing_off.save(filepath, overwrite=True, include_optimizer=True)
            saved_models.append(counter)

    counter += 1

    # if the board was flipped the move has to be flipped as well
    move = possible_moves[action]
    if player == -1:
        move = flip_move(move)
    return move
def action(board_copy, dice, player, i, learning=False):
    if player == -1:
        board_copy = flip_board(board_copy)

    # get every possible move and board
    possible_moves, possible_boards = BG.legal_moves(board_copy, dice, player=1)
    na = len(possible_moves)

    # stores the encoded boards fed to the NN
    if na == 0:
        values = Variable(torch.zeros((7 * (n - 1) * 2), device=device,
                                      dtype=torch.float), requires_grad=False)
    else:
        values = Variable(torch.zeros((7 * (n - 1) * 2, na), device=device,
                                      dtype=torch.float), requires_grad=False)

    # if there are no moves available
    if na == 0:
        x = Variable(
            torch.tensor(ice_hot_encoding(board_copy),
                         dtype=torch.float, device=device)).view(2 * (n - 1) * 7, 1)
        prob_temp = feed_forward_th(x)
        prob_temp = prob_temp.softmax(dim=0)
        prob_nomove = torch.tensor([prob_temp], dtype=torch.float,
                                   device=device, requires_grad=True)
        move_index = torch.tensor([0], device=device)
        if learning:
            return [], prob_nomove, move_index
        return []

    for j, board in enumerate(possible_boards):
        # encode the board to create the input for the NN
        x = Variable(
            torch.tensor(ice_hot_encoding(board),
                         dtype=torch.float, device=device)).view(2 * (n - 1) * 7, 1)
        values[:, j] = x[:, 0]

    # forward pass to evaluate all of the boards' after-state values using the NN
    prob = feed_forward_th(values)
    # squash the after-state values with softmax
    prob = prob.softmax(dim=-1)
    prob_temp = torch.tensor(prob[0, :], dtype=torch.float,
                             device=device, requires_grad=True)

    # select the move from the distribution
    move_index = torch.multinomial(prob_temp, num_samples=1)
    move_index = Variable(move_index, requires_grad=False)
    move = possible_moves[move_index]

    if player == -1:
        move = flip_move(move)

    if learning:
        return move, prob_temp[move_index], move_index
    return move