def expansion(self, parent, dict_prob):
    """
    Expand the parent node: add one child per available move, initialised
    with the prior probability predicted by the policy network.
    :param parent: id of the node to expand
    :param dict_prob: mapping from move (x, y) to its prior probability
    :return: None
    """
    state = self.digraph.node[parent]['state']
    side = self.digraph.node[parent]['side']
    for key, value in dict_prob.items():
        new_state = tt.apply_move(np.copy(state), key, -side)
        self.digraph.add_node(self.node_counter,
                              num_visit=0,
                              Q=0,
                              u=0,
                              P=value,
                              side=-side,
                              action=self.board_size * key[1] + key[0],
                              state=np.copy(new_state))
        self.digraph.add_edge(parent, self.node_counter)
        logger.debug('Add node %s -> %s', parent, self.node_counter)
        logger.debug('Node %s -> %s', self.node_counter, new_state.tolist())
        self.node_counter += 1
def get_move_probs(self, state, temp=1e-3):
    """
    Run all playouts sequentially and return the available actions and
    their corresponding probabilities.
    Arguments:
    state -- the current state, including both game state and the current player.
    temp -- temperature parameter in (0, 1] that controls the level of exploration
    Returns:
    the available actions and the corresponding probabilities
    """
    logger.debug('get_move_probs, state: %s', state.tolist())

    # Find the tree node that corresponds to the given board state.
    for node in self.digraph.nodes():
        if np.array_equal(self.digraph.node[node]['state'], state):
            current_node = node
            break
    else:
        raise Exception('Cannot find the board state!')
    logger.debug('get_move_probs, root node: %s', current_node)

    # Run the playouts from that node.
    for n in range(self._n_play_out):
        self.play_out(current_node)

    # Convert the children's visit counts into move probabilities.
    children = self.digraph.successors(current_node)
    nodes = []
    visits = []
    for child_node in children:
        nodes.append(child_node)
        visits.append(self.digraph.node[child_node]['num_visit'])
    node_probs = ut.softmax(1.0 / temp * np.log(visits))
    return nodes, node_probs
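# --- Illustrative sketch, not part of the original code.  It assumes ut.softmax
# is a standard numerically-stable softmax and reuses the module-level numpy
# import (np) like the surrounding code.  It only shows how temperature-scaled
# log visit counts become move probabilities, and why a small temp is nearly
# greedy.
def _visit_counts_to_probs_sketch(visits, temp=1e-3):
    """Hypothetical helper: softmax over temperature-scaled log visit counts."""
    x = np.log(np.asarray(visits, dtype=np.float64) + 1e-10) / temp
    x -= np.max(x)                       # subtract the max for numerical stability
    e = np.exp(x)
    return e / np.sum(e)
# Example: _visit_counts_to_probs_sketch([10, 5, 1], temp=1.0) is roughly
# [0.625, 0.3125, 0.0625], while temp=1e-3 is essentially one-hot on the
# most-visited move.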
def get_action(self, state, is_self_play=True):
    available = list(tt.available_moves(state))
    if len(available) > 0:
        nodes, probs = self.get_move_probs(state)
        logger.debug('Prob: %s', probs)
        if is_self_play:
            # During self-play, mix the MCTS probabilities with Dirichlet
            # noise so that exploration does not collapse too early.
            node = np.random.choice(
                nodes,
                p=0.75 * probs + 0.25 * np.random.dirichlet(0.3 * np.ones(len(probs))))
        else:
            # With the default temp=1e-3, this is almost equivalent to
            # choosing the move with the highest probability.
            node = np.random.choice(nodes, p=probs)
        return node
    else:
        logger.error('WARNING: the board is full')
        return None
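# --- Illustrative sketch, not part of the original code.  The self-play branch
# above mixes the MCTS probabilities with Dirichlet noise (AlphaZero-style) so
# that self-play keeps exploring low-probability moves; the helper below just
# restates that mix with named parameters.
def _mix_with_dirichlet_noise_sketch(probs, epsilon=0.25, alpha=0.3):
    """Hypothetical helper mirroring 0.75 * probs + 0.25 * Dirichlet(0.3) above."""
    noise = np.random.dirichlet(alpha * np.ones(len(probs)))
    return (1.0 - epsilon) * np.asarray(probs) + epsilon * noise
# Both terms sum to 1, so the mix is still a valid distribution and can be
# passed directly as `p` to np.random.choice.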
def play_out(self, node):
    """
    Run a single playout: select a leaf, evaluate it with the current model,
    expand it if the game is not over, then back the value up the tree.
    :param node: id of the node the playout starts from
    :return: None
    """
    # Selection: walk down the tree until a leaf node is reached.
    while True:
        is_leaf, selected_node = self.selection(node)
        node = selected_node
        if is_leaf:
            break
    logger.debug('Leaf_node: %s', node)

    # Evaluation: ask the model of the side to move for a value and move priors.
    done = tt.has_winner(self.digraph.node[node]['state'], self.winning_length)
    if self.digraph.node[node]['side'] == 1:
        # actions, value, _, prob = self.model2.step(np.copy(self.digraph.node[node]['state']), [], done)
        actions, value, _, prob = self.model2.step(
            np.copy(self.digraph.node[node]['state']),
            np.array(np.ones(1) * 0.01, dtype='float32'), [], [])
    else:
        # actions, value, _, prob = self.model.step(np.copy(self.digraph.node[node]['state']), [], done)
        actions, value, _, prob = self.model.step(
            np.copy(self.digraph.node[node]['state']),
            np.array(np.ones(1) * 0.01, dtype='float32'), [], [])
    dict_prob = tt.get_available_moves_with_prob(self.digraph.node[node]['state'], prob)
    logger.debug('dict_prob %s', dict_prob)

    # Expansion: if the game has ended use the true outcome as the value,
    # if the board is full it is a draw, otherwise expand the leaf.
    if done[0]:
        value = [self.digraph.node[node]['side']]
    elif len(list(tt.available_moves(self.digraph.node[node]['state']))) == 0:
        value = [0.0]
    else:
        self.expansion(node, dict_prob)

    # Back-up: update value and visit count of the nodes in this traversal.
    self.update_recursive(node, value[0])
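# --- Illustrative sketch, not part of the original code.  selection() is not
# shown in this file; the per-node attributes Q, u, P and num_visit merely
# suggest a PUCT-style rule, whose generic form is sketched below.  c_puct and
# the exact bookkeeping are assumptions, not necessarily what selection() does.
def _puct_score_sketch(Q, P, parent_visits, child_visits, c_puct=5.0):
    """Hypothetical PUCT score: exploitation term Q plus exploration bonus u."""
    u = c_puct * P * np.sqrt(parent_visits) / (1.0 + child_visits)
    return Q + u
# Selection would repeatedly follow the child with the highest Q + u until it
# reaches a leaf, which is then evaluated and expanded as above.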
def run(self):
    logger.debug('- ' * 20 + 'run' + ' -' * 20)
    node, reward = self.mcts.self_play()
    plus_state, minus_state, plus_action, minus_action = self.mcts.get_state(node)

    policy_loss, value_loss, policy_entropy = [], [], []
    policy_loss_2, value_loss_2, policy_entropy_2 = [], [], []

    # Train on the original (unrotated) samples.
    p1, v1, e1 = self.pad_training_data(plus_state, plus_action, reward, self.model)
    p2_2, v2_2, e2_2 = self.pad_training_data(minus_state, minus_action, -reward, self.model2)
    policy_loss.append(p1)
    policy_loss_2.append(p2_2)
    value_loss.append(v1)
    value_loss_2.append(v2_2)
    policy_entropy.append(e1)
    policy_entropy_2.append(e2_2)

    logger.debug('Samples: %s plus, %s minus', len(plus_state), len(minus_state))

    # Data augmentation: train on the same samples rotated counter-clockwise
    # by 180, 90 and 270 degrees.  The board plane and the action index are
    # rotated together so that each (state, action) pair stays consistent.
    for k in (2, 1, 3):
        rot_matrix = np.rot90(self.matrix_actions, k)
        # Deep-copy the states so the rotated samples do not write through
        # shared arrays into the original (unrotated) training data.
        rot_plus_state = copy.deepcopy(plus_state)
        rot_minus_state = copy.deepcopy(minus_state)
        rot_plus_action = copy.copy(plus_action)
        rot_minus_action = copy.copy(minus_action)
        for i in range(len(plus_action)):
            rot_plus_state[i][0, :, :, 0] = np.rot90(plus_state[i][0, :, :, 0], k)
            rot_plus_action[i] = rot_matrix[int(plus_action[i] % self.nh),
                                            int(plus_action[i] / self.nh)]
        for i in range(len(minus_action)):
            rot_minus_state[i][0, :, :, 0] = np.rot90(minus_state[i][0, :, :, 0], k)
            rot_minus_action[i] = rot_matrix[int(minus_action[i] % self.nh),
                                             int(minus_action[i] / self.nh)]

        p1, v1, e1 = self.pad_training_data(rot_plus_state, rot_plus_action, reward, self.model)
        p2_2, v2_2, e2_2 = self.pad_training_data(rot_minus_state, rot_minus_action, -reward, self.model2)
        policy_loss.append(p1)
        policy_loss_2.append(p2_2)
        value_loss.append(v1)
        value_loss_2.append(v2_2)
        policy_entropy.append(e1)
        policy_entropy_2.append(e2_2)

    self.exchange_models()
    self.mcts.exchange_models()
    logger.debug('* ' * 20 + 'run' + ' *' * 20)
    return (mean(policy_loss), mean(value_loss), mean(policy_entropy),
            mean(policy_loss_2), mean(value_loss_2), mean(policy_entropy_2))
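# --- Illustrative sketch, not part of the original code.  It shows, for a small
# board with a plain row-major flat index (which need not match the codebase's
# own encoding), how rotating the board and the action together keeps the
# (state, action) pair consistent, which is what the augmentation loop above
# relies on.  Uses the module-level numpy import (np) like the surrounding code.
def _rotate_sample_sketch(board, flat_action, k, size=3):
    """Hypothetical helper: rotate a (size, size) board and its flat action together."""
    idx = np.arange(size * size).reshape(size, size)      # flat index of each cell
    rot_board = np.rot90(board, k)                        # rotate the board CCW by k * 90 degrees
    r, c = np.argwhere(np.rot90(idx, k) == flat_action)[0]
    return rot_board, int(r * size + c)                   # the action follows the same rotation
# For any board and k, rot_board.flat[new_action] == board.flat[flat_action],
# i.e. the rotated action still points at the stone that was just played.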