def sim_once(self, s0):
    """Descend the search tree from the root over a deep copy of ``s0``.

    Returns:
        (leaf_node, state) when a leaf of the tree is reached, or
        (None, None) when the walk hits a terminal position (no legal moves).
    """
    state = copy.deepcopy(s0)
    node = self._root
    while True:
        successors, to_move, _ = Game.possible_moves(state)
        # Terminal position: nothing left to simulate from here.
        if len(successors) == 0:
            return None, None
        # Reached the search frontier; hand the leaf back to the caller.
        if node.is_leaf():
            return node, state
        chosen_move, node = node.select()
        state = self.make_a_move(state, chosen_move, to_move)
def _evaluate_rollout(self, state, limit):
    """Play out from ``state`` with the fast rollout policy for at most
    ``limit`` moves and score the outcome.

    The score is taken from the perspective of the player to move at the
    rollout's starting position.

    Returns:
        +1 if that player wins, -1 if they lose, 0 for a draw or when the
        move limit is reached without a decision.
    """
    winner = 0
    player = None  # player to move at the starting position
    for _ in range(limit):
        legal_states, to_move, legal_moves = Game.possible_moves(state)
        if player is None:
            player = to_move
        if len(legal_states) == 0:
            break  # no legal continuation: stop the rollout
        probs = self._rollout(state, legal_moves)
        # Mask illegal moves with a value strictly below any probability so
        # argmax can only ever pick a legal move.
        mask = np.full_like(probs, -0.01)
        mask[:, legal_moves] = probs[:, legal_moves]
        best_move = np.argmax(mask, 1)[0]
        idx = np.where(legal_moves == best_move)[0]
        # Explicit check instead of `assert`: asserts vanish under `-O`,
        # and a silent illegal move would corrupt the rollout.
        if idx.size != 1:
            raise RuntimeError(
                'rollout selected a non-legal move: %s' % best_move)
        next_state = legal_states[idx[0]]
        over, winner, _ = next_state.is_over(state)
        if over:
            break
        state = next_state
    else:
        # Loop exhausted `limit` moves without reaching a terminal state.
        print("WARNING: rollout reached move limit")
    if winner == 0:
        return 0
    return 1 if winner == player else -1
def _playout(self, state, leaf_depth):
    """Run one playout: descend the tree at most ``leaf_depth`` plies,
    expanding leaves with the policy network, then evaluate the reached
    state and back the blended value up the visited path.

    Mutates the tree statistics via ``node.update_recursive``; ``state``
    is consumed along the way.
    """
    # Fixed: the original assigned `start_time = time.time()` but never
    # used it (the timing printout was commented out) — dropped.
    node = self._root
    print('exploit')
    for depth in range(leaf_depth):
        legal_states, _, legal_moves = Game.possible_moves(state)
        if len(legal_states) == 0:
            break  # terminal position: stop the descent
        # Expand a leaf once with the policy network's move priors.
        if node.is_leaf():
            action_probs = self._policy(state)
            if len(action_probs) == 0:
                break  # policy sees no moves: treat as terminal
            node.expand(action_probs)
        # Greedily descend to the most promising child.
        best_move, node = node.select()
        idx = np.where(legal_moves == best_move)[0]
        if idx.size != 1:
            # Diagnostic dump before failing: the selected move is not
            # (uniquely) among the legal moves.
            print('depth:', depth, idx)
            print('best move:', best_move)
            parent = node.parent
            for action, child in parent.children.items():
                print(' ', action, child.get_value())
            # Explicit raise instead of `assert` (asserts vanish under -O).
            raise RuntimeError(
                'selected move %s is not a legal move' % best_move)
        state = legal_states[idx[0]]
    print('rollout...')
    # Blend the value-network estimate and the rollout outcome by lambda;
    # skip whichever evaluator lambda makes irrelevant.
    v = self._value(state) if self._lmbda < 1 else 0
    z = self._evaluate_rollout(
        state, self._rollout_limit) if self._lmbda > 0 else 0
    leaf_value = (1 - self._lmbda) * v + self._lmbda * z
    node.update_recursive(leaf_value, self._c_puct)
def _playout(self, state, leaf_depth):
    """Single playout: walk down the tree for at most ``leaf_depth`` plies
    (expanding leaves with the policy network), then score the reached
    state with a lambda-weighted mix of value network and rollout, and
    propagate that score back up the visited nodes."""
    node = self._root
    print('exploit')
    depth = 0
    while depth < leaf_depth:
        next_states, _, moves = Game.possible_moves(state)
        if len(next_states) == 0:
            break  # terminal position
        if node.is_leaf():
            priors = self._policy(state)
            if len(priors) == 0:
                break  # policy has no moves to offer
            node.expand(priors)
        # Pick the highest-value child and step into it.
        chosen, node = node.select()
        hits = np.where(moves == chosen)[0]
        if hits.size == 0:
            # Debug dump: the chosen move was not found among legal moves.
            print('depth:', depth, hits)
            print('best move:', chosen)
            parent = node.parent
            for action, child in parent.children.items():
                print(' ', action, child.get_value())
        assert hits.size == 1
        state = next_states[hits[0]]
        depth += 1
    print('rollout...')
    value_est = 0 if self._lmbda >= 1 else self._value(state)
    rollout_est = 0 if self._lmbda <= 0 else self._evaluate_rollout(
        state, self._rollout_limit)
    leaf_value = (1 - self._lmbda) * value_est + self._lmbda * rollout_est
    node.update_recursive(leaf_value, self._c_puct)
def sim(self, board):
    """Simulate one game starting from ``board`` and train the network on
    the (wins, plays) targets gathered along the visited path."""
    path = []
    state = board
    winner = Board.STONE_EMPTY
    for _ in range(self.max_moves):
        candidates, player, _ = Game.possible_moves(state)
        chosen, chosen_val = self.get_best(state, candidates, player)
        path.append((player, state, chosen, chosen_val))
        finished, winner, _ = chosen.is_over(state)
        if finished:
            break
        state = chosen
    self.total_sim += 1
    # Build one supervised sample per visited transition.
    dataset = SupervisedDataSet(self.features_num, 2)
    for player, prev_state, next_state, val in path:
        plays = val[1] * self.total_sim + 1
        wins = val[0] * self.total_sim + (1 if player == winner else 0)
        dataset.addSample(
            self.get_input_values(prev_state, next_state, player),
            (wins, plays))
    self.trainer.trainOnDataset(dataset)
def sim(self, board):
    """Simulate one game starting from ``board`` and train the network on
    the (wins, plays) targets gathered along the visited path.

    Side effects: increments ``self.total_sim`` and runs one training
    pass via ``self.trainer``.
    """
    visited_path = []
    state = board
    winner = Board.STONE_EMPTY
    for _ in range(1, self.max_moves + 1):
        # BUG FIX: Game.possible_moves returns THREE values
        # (states, player, moves) everywhere else in this file; the
        # original two-name unpack raised ValueError at runtime.
        moves, player, _ = Game.possible_moves(state)
        state_new, state_new_val = self.get_best(state, moves, player)
        visited_path.append((player, state, state_new, state_new_val))
        over, winner, _ = state_new.is_over(state)
        if over:
            break
        state = state_new
    self.total_sim += 1
    # One supervised sample per visited transition: scaled prior
    # statistics plus the outcome of this simulation.
    ds = SupervisedDataSet(self.features_num, 2)
    for player, state, new, val in visited_path:
        plays = val[1] * self.total_sim + 1
        wins = val[0] * self.total_sim
        if player == winner:
            wins += 1
        ds.addSample(self.get_input_values(state, new, player), (wins, plays))
    self.trainer.trainOnDataset(ds)
def _policy_fn(self, board):
    """Return (move, prior-probability) pairs for every legal move on
    ``board``, as scored by the policy network."""
    _, _, legal_moves = Game.possible_moves(board)
    net_input, _ = self.get_input_values(board.stones)
    # Keep only the probabilities at the legal-move positions.
    move_probs = self.brain.get_move_probs(net_input)[0, legal_moves]
    return list(zip(legal_moves, move_probs))