def _expand(self, state, current_player):
    """Expand a leaf node: initialize its statistics from the network."""
    s = self.state_to_str(state, current_player)

    with tf.device("/cpu:0"):
        nn_policy, nn_value = self.network.predict(
            othello.encode_state(state, current_player))

    nn_policy = nn_policy.numpy().tolist()[0]
    nn_value = nn_value.numpy()[0][0]

    self.P[s] = nn_policy
    self.N[s] = [0] * othello.ACTION_SPACE
    self.W[s] = [0] * othello.ACTION_SPACE

    valid_actions = othello.get_valid_actions(state, current_player)

    #: cache valid actions and next states to save computation
    self.next_states[s] = [
        othello.step(state, action, current_player)[0]
        if (action in valid_actions) else None
        for action in range(othello.ACTION_SPACE)
    ]

    return nn_value
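#: ----------------------------------------------------------------------
#: For reference: _expand assumes the tree statistics are plain dicts
#: keyed by state_to_str(state, player). The constructor is not part of
#: this section, so the sketch below is an assumption (the hyperparameter
#: defaults are illustrative, not the original values).
class MCTS:

    def __init__(self, network, alpha=0.35, c_puct=1.0, eps=0.25):
        self.network = network    #: policy-value network queried in _expand
        self.alpha = alpha        #: Dirichlet concentration for root noise
        self.c_puct = c_puct      #: exploration constant in the PUCT score
        self.eps = eps            #: mixing weight of the root noise

        #: per-state statistics, all keyed by the string-encoded state
        self.P = {}               #: prior policy from the network
        self.N = {}               #: per-action visit counts
        self.W = {}               #: per-action accumulated values
        self.next_states = {}     #: cached successor states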
def npc_action(self):
    """Let the NPC play one move according to self.npc_type."""
    print("NPC action")
    valid_actions = othello.get_valid_actions(self.state, self.npc)

    if self.npc_type == "random":
        action = random.choice(valid_actions)
        self.state, done = othello.step(self.state, action, self.npc)
        self.refresh()
        self.update_label()
        if done:
            self.update_label(game_end=True)
            return

    elif self.npc_type == "eps-greedy":
        if random.random() > self.epsilon:
            #: greedy: play the move that maximizes the stone count;
            #: best_score starts at -1 so a valid action is always selected
            best_action, best_score = None, -1
            for action in valid_actions:
                next_state, done = othello.step(self.state, action, self.npc)
                _, score = othello.count_stone(next_state)
                if score > best_score:
                    best_score = score
                    best_action = action
            self.state, done = othello.step(self.state, best_action, self.npc)
        else:
            action = random.choice(valid_actions)
            self.state, done = othello.step(self.state, action, self.npc)
        self.refresh()
        self.update_label()
        if done:
            self.update_label(game_end=True)
            return

    elif self.npc_type == "alphazero":
        mcts_policy = self.mcts.search(root_state=self.state,
                                       current_player=self.npc,
                                       num_simulations=50)
        print(np.array(mcts_policy[:-1]).reshape(6, 6))
        action = np.argmax(mcts_policy)
        self.state, done = othello.step(self.state, action, self.npc)
        self.refresh()
        self.update_label()
        if done:
            self.update_label(game_end=True)
            return

    else:
        raise NotImplementedError()
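#: ----------------------------------------------------------------------
#: The eps-greedy branch above inlines a one-step heuristic: play the move
#: that maximizes the stone count after stepping. A standalone sketch of
#: the same idea (the function name is mine, not part of the original):
def greedy_action(state, player, valid_actions):
    best_action, best_score = valid_actions[0], -1
    for action in valid_actions:
        next_state, _ = othello.step(state, action, player)
        #: mirrors the original, which reads the second value of count_stone
        _, score = othello.count_stone(next_state)
        if score > best_score:
            best_action, best_score = action, score
    return best_action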
def search(self, root_state, current_player, num_simulations):
    """Run MCTS simulations from the root and return the visit-count policy."""
    s = self.state_to_str(root_state, current_player)

    if s not in self.P:
        _ = self._expand(root_state, current_player)

    valid_actions = othello.get_valid_actions(root_state, current_player)

    #: Add Dirichlet noise to the prior probabilities in the root node
    if self.alpha is not None:
        dirichlet_noise = np.random.dirichlet(
            alpha=[self.alpha] * len(valid_actions))
        for a, noise in zip(valid_actions, dirichlet_noise):
            self.P[s][a] = (1 - self.eps) * self.P[s][a] + self.eps * noise

    #: MCTS simulations
    for _ in range(num_simulations):
        U = [
            self.c_puct * self.P[s][a] * math.sqrt(sum(self.N[s])) / (1 + self.N[s][a])
            for a in range(othello.ACTION_SPACE)
        ]
        Q = [w / n if n != 0 else 0 for w, n in zip(self.W[s], self.N[s])]
        assert len(U) == len(Q) == othello.ACTION_SPACE

        scores = [u + q for u, q in zip(U, Q)]

        #: Mask invalid actions
        scores = np.array([
            score if action in valid_actions else -np.inf
            for action, score in enumerate(scores)
        ])

        #: break ties randomly, since np.argmax always picks the first max
        action = random.choice(np.where(scores == scores.max())[0])

        next_state = self.next_states[s][action]

        #: negate the child's value: it is from the opponent's perspective
        v = -self._evaluate(next_state, -current_player)

        self.W[s][action] += v
        self.N[s][action] += 1

    mcts_policy = [n / sum(self.N[s]) for n in self.N[s]]

    return mcts_policy
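#: ----------------------------------------------------------------------
#: Usage sketch: search returns a policy proportional to root visit counts.
#: Greedy play matches the "alphazero" branch of npc_action; sampling by
#: visit counts is a common self-play alternative and is an assumption
#: here, not taken from the original code. Assumes `network` is the
#: trained policy-value model and (state, player) is the position to play.
mcts = MCTS(network)
mcts_policy = mcts.search(root_state=state, current_player=player,
                          num_simulations=50)

action = int(np.argmax(mcts_policy))                            #: greedy play
action = np.random.choice(othello.ACTION_SPACE, p=mcts_policy)  #: sampled play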
def _evaluate(self, state, current_player):
    """Recursively evaluate a node, backing up values along the path."""
    s = self.state_to_str(state, current_player)

    if othello.is_done(state, current_player):
        #: terminal node: use the game result as the value
        reward_first, reward_second = othello.get_result(state)
        reward = reward_first if current_player == 1 else reward_second
        return reward

    elif s not in self.P:
        #: non-terminal leaf node: expand it with the network
        nn_value = self._expand(state, current_player)
        return nn_value

    else:
        #: internal node: select a child by PUCT and evaluate it recursively
        U = [
            self.c_puct * self.P[s][a] * math.sqrt(sum(self.N[s])) / (1 + self.N[s][a])
            for a in range(othello.ACTION_SPACE)
        ]
        Q = [w / n if n != 0 else 0 for w, n in zip(self.W[s], self.N[s])]
        assert len(U) == len(Q) == othello.ACTION_SPACE

        valid_actions = othello.get_valid_actions(state, current_player)

        scores = [u + q for u, q in zip(U, Q)]
        scores = np.array([
            score if action in valid_actions else -np.inf
            for action, score in enumerate(scores)
        ])
        best_action = random.choice(np.where(scores == scores.max())[0])

        next_state = self.next_states[s][best_action]
        v = -self._evaluate(next_state, -current_player)

        self.W[s][best_action] += v
        self.N[s][best_action] += 1

        return v
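#: ----------------------------------------------------------------------
#: Both search and _evaluate rank actions by the same PUCT score
#:   score(a) = Q(s, a) + c_puct * P(s, a) * sqrt(sum_b N(s, b)) / (1 + N(s, a))
#: A toy example with made-up numbers, just to show how visit counts shift
#: the balance from the exploration term U to the value estimate Q:
import math

P = [0.5, 0.3, 0.2]    #: network priors (toy values)
N = [10, 2, 0]         #: visit counts
W = [6.0, -0.5, 0.0]   #: accumulated values
c_puct = 1.0

U = [c_puct * p * math.sqrt(sum(N)) / (1 + n) for p, n in zip(P, N)]
Q = [w / n if n != 0 else 0 for w, n in zip(W, N)]
scores = [u + q for u, q in zip(U, Q)]
#: scores ~ [0.76, 0.10, 0.69]: the well-visited action 0 is carried by its
#: Q value, while the unvisited action 2 stays attractive through U alone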
def player_action(self, event):
    """Handle a mouse click on the board by the human player."""
    if not self.is_player_turn or self.is_gameend:
        return
    else:
        self.is_player_turn = False

    print("Player action")

    #: map the click position to a board cell (100 px per cell)
    row = event.y // 100
    col = event.x // 100
    action = othello.xy_to_idx(row, col)

    valid_actions = othello.get_valid_actions(self.state, self.human)

    #: the player must pass when no stone can be placed
    if valid_actions == [othello.ACTION_NOOP]:
        action = othello.ACTION_NOOP

    if action in valid_actions:
        self.state, done = othello.step(self.state, action, self.human)
        self.refresh()
        self.update_label()
        if done:
            self.update_label(game_end=True)
            return
        time.sleep(0.3)
        self.npc_action()
        if self.is_gameend:
            return
    else:
        print("Invalid action")

    self.is_player_turn = True
    return
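#: ----------------------------------------------------------------------
#: player_action divides event.x/event.y by 100, i.e. it assumes a Tkinter
#: canvas drawn at 100 px per cell (a 6x6 board, per the reshape(6, 6) in
#: npc_action). A minimal sketch of that wiring; the widget names and the
#: stub handler are assumptions, not the original GUI code.
import tkinter as tk

class App:

    def __init__(self):
        self.root = tk.Tk()
        self.canvas = tk.Canvas(self.root, width=600, height=600)
        self.canvas.pack()
        #: a left click delivers an event carrying pixel coordinates .x/.y
        self.canvas.bind("<Button-1>", self.player_action)

    def player_action(self, event):
        #: stand-in for the real handler defined above
        print(event.y // 100, event.x // 100)

if __name__ == "__main__":
    App().root.mainloop()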