import os
import sys
import time

import numpy as np

# Project-level names assumed importable from the surrounding codebase:
# State, state_copy, actionid2str, Tree, CNNAI, TAU_MIN.


def evaluate(AIs, play_num, return_draw=False):
    wins = 0.
    draw_num = 0
    for i in range(play_num):
        state = State()
        AIs[0].init_prev()
        AIs[1].init_prev()
        # alternate which AI moves first
        AIs[i % 2].color = 0
        AIs[1 - i % 2].color = 1
        while True:
            s, pi = AIs[i % 2].act_and_get_pi(state)
            a = actionid2str(state, s)
            while not state.accept_action_str(a):
                print("this action is impossible")
                s, pi = AIs[i % 2].act_and_get_pi(state)
                a = actionid2str(state, s)
            AIs[1 - i % 2].prev_action = s
            if state.terminate:
                break
            s, pi = AIs[1 - i % 2].act_and_get_pi(state)
            a = actionid2str(state, s)
            while not state.accept_action_str(a):
                print("this action is impossible")
                s, pi = AIs[1 - i % 2].act_and_get_pi(state)
                a = actionid2str(state, s)
            AIs[i % 2].prev_action = s
            if state.terminate:
                break
        # score for AIs[0]: a win counts 1, a draw 0.5
        if i % 2 == 0 and state.reward == 1:
            wins += 1.
        elif i % 2 == 1 and state.reward == -1:
            wins += 1.
        elif state.reward == 0:
            wins += 0.5
            draw_num += 1
        # the progress line reports AIs[1]'s score so far / games played
        sys.stderr.write('\r\033[K {}win/{}'.format(i + 1 - wins, i + 1))
        sys.stderr.flush()
    print("")
    AIs[0].color = 0
    AIs[1].color = 1
    if return_draw:
        return wins, draw_num
    else:
        return wins
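# Usage sketch (hypothetical, not from the original source). `ai_a` and
# `ai_b` stand in for two trained agents exposing the interface used above
# (act_and_get_pi, init_prev, color, prev_action); their construction is not
# shown in this excerpt:
#
#     wins, draws = evaluate([ai_a, ai_b], play_num=100, return_draw=True)
#     print("ai_a scored {} of 100 games ({} draws)".format(wins, draws))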
def normal_play(agents):
    state = State()
    while True:
        state.display_cui()
        start = time.time()
        s = agents[0].act(state, showNQ=True)
        end = time.time()
        print(end - start)
        if isinstance(s, int):
            a = actionid2str(state, s)
        else:
            a = s
        while not state.accept_action_str(a):
            print(a)
            print("this action is impossible")
            s = agents[0].act(state, showNQ=True)
            if isinstance(s, int):
                a = actionid2str(state, s)
            else:
                a = s
        agents[1].prev_action = s
        if state.terminate:
            break
        #time.sleep(0.1)
        state.display_cui()
        s = agents[1].act(state, showNQ=True)
        if isinstance(s, int):
            a = actionid2str(state, s)
        else:
            a = s
        while not state.accept_action_str(a):
            print(a)
            print("this action is impossible")
            s = agents[1].act(state, showNQ=True)
            if isinstance(s, int):
                a = actionid2str(state, s)
            else:
                a = s
        agents[0].prev_action = s
        #time.sleep(0.1)
        if state.terminate:
            break
    state.display_cui()
    print("The game finished. reward=({}, {})".format(state.reward, -state.reward))
def get_graphviz_tree(tree, g, count=0, threshold=5):
    if len(tree.children) == 0:
        g.node(str(count), label="0")
    else:
        parent_count = count
        # label each node with its visit total and mean value W/N
        g.node(str(parent_count),
               label=str(int(np.sum(tree.N))) + os.linesep
               + "{:.3f}".format(np.sum(tree.W) / np.sum(tree.N)))
        count += 1
        for key, value in tree.children.items():
            # prune subtrees visited fewer than `threshold` times
            if int(tree.N[key]) >= threshold:
                g.edge(str(parent_count), str(count),
                       label=actionid2str(tree.s, key) + os.linesep
                       + str(int(tree.N[key])))
                get_graphviz_tree(value, g, count)
                count += int(np.sum(value.N)) + 1
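# Usage sketch (assumes the third-party `graphviz` package; `root` is a
# hypothetical search tree carrying the children/N/W/s fields used above):
#
#     from graphviz import Digraph
#     g = Digraph(format="png")
#     get_graphviz_tree(root, g, threshold=5)
#     g.render("mcts_tree")  # writes "mcts_tree" (DOT source) and "mcts_tree.png"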
def oneturn(self, color):
    global touched
    s = self.agents[color].act(self.state)
    if isinstance(self.agents[color], CNNAI):
        # dump this turn's search tree for later inspection
        g = self.agents[color].get_tree_for_graphviz()
        g.render(os.path.join("game_trees", "game_tree{}".format(self.state.turn)))
    if s == -1:
        return
    if isinstance(s, int):
        a = actionid2str(self.state, s)
    else:
        a = s
    if not self.state.accept_action_str(a):
        print(a)
        print("this action is impossible")
        return
    self.agents[1 - color].prev_action = s
    self.state.display_cui()
    print(self.state.get_player_dist_from_goal())
    touched = False
def MCTS(self, state, max_node, C_puct, tau, showNQ=False, noise=0., random_flip=False):
    p = self.p(state)
    illegal = (p == 0.)
    old_p = p
    # blend in uniform noise over the legal actions, then renormalize
    p = (1. - noise) * p + noise * np.random.rand(len(p))
    p[illegal] = 0.
    p = p / sum(p)
    # reuse the subtree reached by the previous action when possible
    root_tree = self.prev_tree
    root_tree.s = state
    if self.prev_action is not None:
        if self.prev_action in root_tree.children.keys():
            root_tree = root_tree.children[self.prev_action]
        else:
            root_tree = Tree(state, p)
    root_tree.P = p
    node_num = np.sum(root_tree.N)
    while node_num < max_node:
        # select up to n_parallel leaves, marking each path with a virtual loss
        nodess = []
        actionss = []
        for j in range(min(self.n_parallel, max_node)):
            _, _, nodes, actions = self.select(root_tree, C_puct)
            if nodes is None:
                break
            nodess.append(nodes)
            actionss.append(actions)
            # virtual loss
            for node, action in zip(nodes, actions):
                node.N[action] += self.virtual_loss_n
                node.W[action] -= self.virtual_loss_n
                node.Q[action] = node.W[action] / node.N[action]
        # undo the virtual losses
        for nodes, actions in zip(nodess, actionss):
            for node, action in zip(nodes, actions):
                node.N[action] -= self.virtual_loss_n
                node.W[action] += self.virtual_loss_n
                if node.N[action] == 0:
                    node.Q[action] = 0.
                else:
                    node.Q[action] = node.W[action] / node.N[action]
        # play out the selected action at each leaf
        states = []
        for nodes, actions in zip(nodess, actionss):
            s = state_copy(nodes[-1].s)
            s.accept_action_str(actionid2str(s, actions[-1]))
            states.append(s)
        node_num += len(states)
        # evaluate all new leaves in one network batch
        p = self.p_array(states, random_flip=random_flip)
        v = self.v_array(states, random_flip=random_flip)
        # expand
        count = 0
        for s, nodes, actions in zip(states, nodess, actionss):
            if not s.terminate:
                t = nodes[-1]
                a = actions[-1]
                if a not in t.children.keys():
                    t.children[a] = Tree(s, p[count])
            count += 1
        # backup
        count = 0
        for nodes, actions in zip(nodess, actionss):
            for node, action in zip(nodes, actions):
                node.N[action] += 1
                node.W[action] += v[count]
                node.Q[action] = node.W[action] / node.N[action]
            count += 1
    if showNQ:
        print("p=")
        self.display_parameter(np.asarray(old_p * 1000, dtype="int32"))
        print("N=")
        self.display_parameter(np.asarray(root_tree.N, dtype="int32"))
        print("Q=")
        self.display_parameter(np.asarray(root_tree.Q * 1000, dtype="int32"))
        print("v={}".format(self.v(root_tree.s)))
    if tau == 0:
        action = np.argmax(root_tree.N)
    else:
        N2 = np.power(np.asarray(root_tree.N, dtype="float64"), 1. / tau)
        pi = N2 / np.sum(N2)
        action = np.random.choice(len(pi), p=pi)
    # only an immediately winning action can lack a child node here
    if action in root_tree.children.keys():
        self.prev_tree = root_tree.children[action]
    # note: pi_ret is built but unused; the visit distribution is returned
    action2 = np.argmax(root_tree.N)
    pi_ret = np.zeros((137, ))
    pi_ret[action2] = 1.
    return action, root_tree.N / np.sum(root_tree.N)
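# The select() method called by MCTS above is not shown in this excerpt. A
# minimal, self-contained sketch of the PUCT scoring rule it is assumed to
# implement (AlphaZero-style); P, N, Q mirror the Tree fields used above:
def puct_scores(P, N, Q, C_puct):
    # exploration bonus: large for high-prior, rarely visited actions
    U = C_puct * P * np.sqrt(np.sum(N) + 1.) / (1. + N)
    return Q + U

# Example: the unvisited action wins on its exploration bonus alone.
# print(np.argmax(puct_scores(np.array([0.5, 0.3, 0.2]),
#                             np.array([10., 5., 0.]),
#                             np.array([0.1, 0.2, 0.]), C_puct=1.)))  # -> 2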
def generate_data(AIs, play_num, noise=0.1, display=False, equal_draw=True):
    data = []
    for i in range(play_num):
        state = State()
        AIs[0].init_prev()
        AIs[1].init_prev()
        # four feature variants per position: identity plus the three board flips
        # (note: the inner loop variable is k, not i, to avoid shadowing the game index)
        featuress = [[], [], [], []]
        for k, b1, b2 in [(0, False, False), (1, True, False), (2, False, True), (3, True, True)]:
            featuress[k].append(state.feature_CNN(b1, b2))
        pis = []
        states = [state_copy(state)]
        while True:
            # sample the temperature each turn, then pin it to TAU_MIN after turn 20
            AIs[0].tau = np.random.rand() * (1. - TAU_MIN) + TAU_MIN
            AIs[1].tau = np.random.rand() * (1. - TAU_MIN) + TAU_MIN
            if state.turn >= 20:
                AIs[0].tau = TAU_MIN
                AIs[1].tau = TAU_MIN
            s, pi = AIs[0].act_and_get_pi(state, noise=noise)
            a = actionid2str(state, s)
            while not state.accept_action_str(a):
                print("this action is impossible")
                s, pi = AIs[0].act_and_get_pi(state)
                a = actionid2str(state, s)
            AIs[1].prev_action = s
            pis.append(pi)
            if display:
                state.display_cui()
            # treat a repeated position as a draw and stop the game
            end = False
            for state2 in states:
                if equal_draw and state == state2:
                    end = True
                    break
            if end:
                break
            states.append(state_copy(state))
            if state.terminate:
                break
            for k, b1, b2 in [(0, False, False), (1, True, False), (2, False, True), (3, True, True)]:
                featuress[k].append(state.feature_CNN(b1, b2))
            s, pi = AIs[1].act_and_get_pi(state, noise=noise)
            a = actionid2str(state, s)
            while not state.accept_action_str(a):
                print("this action is impossible")
                s, pi = AIs[1].act_and_get_pi(state)
                a = actionid2str(state, s)
            AIs[0].prev_action = s
            pis.append(pi)
            if display:
                state.display_cui()
            end = False
            for state2 in states:
                if equal_draw and state == state2:
                    end = True
                    break
            if end:
                break
            states.append(state_copy(state))
            if state.terminate:
                break
            for k, b1, b2 in [(0, False, False), (1, True, False), (2, False, True), (3, True, True)]:
                featuress[k].append(state.feature_CNN(b1, b2))
        del states
        # drawn games produce no training targets
        if state.reward == 0:
            continue
        for feature1, feature2, feature3, feature4, pi in zip(featuress[0], featuress[1], featuress[2], featuress[3], pis):
            # identity
            data.append((feature1, pi, state.reward))
            # flip along axis 0: wall planes flip rows, the move block flips x
            a = np.flip(pi[:64].reshape((8, 8)), 0).flatten()
            b = np.flip(pi[64:128].reshape((8, 8)), 0).flatten()
            mvarray1 = pi[128:].reshape((3, 3))
            mvarray2 = np.zeros((3, 3))
            for y in [-1, 0, 1]:
                for x in [-1, 0, 1]:
                    mvarray2[x, y] = mvarray1[-x, y]
            c = mvarray2.flatten()
            data.append((feature2, np.concatenate([a, b, c]), state.reward))
            # flip along axis 1: swaps the players, so the reward sign flips
            a = np.flip(pi[:64].reshape((8, 8)), 1).flatten()
            b = np.flip(pi[64:128].reshape((8, 8)), 1).flatten()
            mvarray1 = pi[128:].reshape((3, 3))
            mvarray2 = np.zeros((3, 3))
            for y in [-1, 0, 1]:
                for x in [-1, 0, 1]:
                    mvarray2[x, y] = mvarray1[x, -y]
            c = mvarray2.flatten()
            data.append((feature3, np.concatenate([a, b, c]), -state.reward))
            # both flips (180-degree rotation): the reward sign also flips
            a = np.flip(np.flip(pi[:64].reshape((8, 8)), 1), 0).flatten()
            b = np.flip(np.flip(pi[64:128].reshape((8, 8)), 1), 0).flatten()
            mvarray1 = pi[128:].reshape((3, 3))
            mvarray2 = np.zeros((3, 3))
            for y in [-1, 0, 1]:
                for x in [-1, 0, 1]:
                    mvarray2[x, y] = mvarray1[-x, -y]
            c = mvarray2.flatten()
            data.append((feature4, np.concatenate([a, b, c]), -state.reward))
    return data
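# The augmentation above assumes a 137-entry action vector: two 8x8 blocks of
# wall placements followed by a 3x3 block of pawn moves (64 + 64 + 9 = 137,
# matching the pi_ret shape used in MCTS). A quick self-check that the axis-0
# flip used above is an involution, i.e. applying it twice restores pi:
def _flip0(pi):
    a = np.flip(pi[:64].reshape((8, 8)), 0).flatten()
    b = np.flip(pi[64:128].reshape((8, 8)), 0).flatten()
    mv1 = pi[128:].reshape((3, 3))
    mv2 = np.zeros((3, 3))
    for y in [-1, 0, 1]:
        for x in [-1, 0, 1]:
            mv2[x, y] = mv1[-x, y]
    return np.concatenate([a, b, mv2.flatten()])

# _pi = np.random.rand(137)
# assert np.allclose(_flip0(_flip0(_pi)), _pi)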
def MCTS(self, state, max_node, C_puct, tau, showNQ=False, noise=0., random_flip=False):
    # When neither player has walls left and there is no branching, the search
    # could be skipped; disabled for now because it would still change behaviour
    # through prev_tree and related state.
    #search_node_num = max_node
    #if state.black_walls == 0 and state.white_walls == 0:
    #    x, y = state.color_p(state.turn % 2)
    #    if int(np.sum(state.movable_array(x, y, shortest_only=True))) == 1:
    #        search_node_num = 1
    p = self.p(state)
    illegal = (p == 0.)
    old_p = p
    # blend in uniform noise over the legal actions, then renormalize
    p = (1. - noise) * p + noise * np.random.rand(len(p))
    p[illegal] = 0.
    p = p / sum(p)
    # reuse the subtree reached by the previous action when possible
    root_tree = self.prev_tree
    root_tree.s = state
    if self.prev_action is not None:
        if self.prev_action in root_tree.children.keys():
            root_tree = root_tree.children[self.prev_action]
        else:
            root_tree = Tree(state, p)
    root_tree.P = p
    node_num = np.sum(root_tree.N)
    while node_num < max_node:
        # select
        nodess = []
        actionss = []
        for j in range(min(self.n_parallel, max_node)):
            _, _, nodes, actions = self.select(root_tree, C_puct)
            if nodes is None:
                break
            nodess.append(nodes)
            actionss.append(actions)
            # virtual loss; the sign accounts for Q flipping between the players
            for node, action in zip(nodes, actions):
                node.N[action] += self.virtual_loss_n
                if self.color == node.s.turn % 2:
                    node.W[action] -= self.virtual_loss_n
                else:
                    node.W[action] += self.virtual_loss_n
                node.Q[action] = node.W[action] / node.N[action]
        # undo the virtual losses
        for nodes, actions in zip(nodess, actionss):
            for node, action in zip(nodes, actions):
                node.N[action] -= self.virtual_loss_n
                if self.color == node.s.turn % 2:
                    node.W[action] += self.virtual_loss_n
                else:
                    node.W[action] -= self.virtual_loss_n
                if node.N[action] == 0:
                    node.Q[action] = 0.
                else:
                    node.Q[action] = node.W[action] / node.N[action]
        # play out the selected action at each leaf
        states = []
        for nodes, actions in zip(nodess, actionss):
            s = state_copy(nodes[-1].s)
            s.accept_action_str(actionid2str(s, actions[-1]))
            states.append(s)
        node_num += len(states)
        # only values are evaluated here; new children start with P=None
        #p = self.p_array(states, random_flip=random_flip)
        v = self.v_array(states, random_flip=random_flip)
        # expand
        count = 0
        for s, nodes, actions in zip(states, nodess, actionss):
            if not s.terminate:
                t = nodes[-1]
                a = actions[-1]
                if a not in t.children.keys():
                    t.children[a] = Tree(s, None)
            count += 1
        # backup
        count = 0
        for nodes, actions in zip(nodess, actionss):
            for node, action in zip(nodes, actions):
                node.N[action] += 1
                node.W[action] += v[count]
                node.Q[action] = node.W[action] / node.N[action]
            count += 1
    if showNQ:
        print("p=")
        self.display_parameter(np.asarray(old_p * 1000, dtype="int32"))
        print("N=")
        self.display_parameter(np.asarray(root_tree.N, dtype="int32"))
        print("Q=")
        self.display_parameter(np.asarray(root_tree.Q * 1000, dtype="int32"))
        print("v={}".format(self.v(root_tree.s)))
    if tau == 0:
        # uniform choice among the most-visited actions
        N2 = root_tree.N * (root_tree.N == np.max(root_tree.N))
    else:
        N2 = np.power(np.asarray(root_tree.N, dtype="float64"), 1. / tau)
    pi = N2 / np.sum(N2)
    action = np.random.choice(len(pi), p=pi)
    # only an immediately winning action can lack a child node here
    if action in root_tree.children.keys():
        self.prev_tree = root_tree.children[action]
    # note: pi_ret is built but unused; the visit distribution is returned
    action2 = np.argmax(root_tree.N)
    pi_ret = np.zeros((137, ))
    pi_ret[action2] = 1.
    self.tree_for_visualize = root_tree
    return action, root_tree.N / np.sum(root_tree.N)
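# The temperature step at the end of both MCTS variants, extracted for
# clarity: tau == 0 plays greedily over visit counts, while larger tau
# samples closer to the raw visit distribution.
def visits_to_pi(N, tau):
    if tau == 0:
        N2 = N * (N == np.max(N))  # mass only on the most-visited action(s)
    else:
        N2 = np.power(np.asarray(N, dtype="float64"), 1. / tau)
    return N2 / np.sum(N2)

# visits_to_pi(np.array([90., 8., 2.]), 1.)   # -> [0.9, 0.08, 0.02]
# visits_to_pi(np.array([90., 8., 2.]), 0.5)  # sharper than the raw visits
# visits_to_pi(np.array([90., 8., 2.]), 0)    # -> [1., 0., 0.]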