Code example #1
    def expansion(self, parent, dict_prob):
        """
        Expand node

        :param parent:
        :return:
        """

        state = self.digraph.node[parent]['state']
        side = self.digraph.node[parent]['side']

        # add one child per candidate move, storing the prior probability P from the policy network
        for key, value in dict_prob.items():
            new_state = tt.apply_move(np.copy(state), key, -side)
            self.digraph.add_node(self.node_counter,
                                  num_visit=0,
                                  Q=0,
                                  u=0,
                                  P=value,
                                  side=-side,
                                  action=self.board_size * key[1] + key[0],
                                  state=np.copy(new_state))
            self.digraph.add_edge(parent, self.node_counter)
            logger.debug('Add node %s -> %s', parent, self.node_counter)
            logger.debug('Node %s -> %s', self.node_counter, new_state.tolist())
            self.node_counter += 1
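
For reference, the `action` attribute stored above flattens a move `key = (x, y)` into the single index `board_size * y + x`. A minimal standalone sketch of that mapping and its inverse, using a hypothetical 3x3 board (the real `board_size` comes from the class):

    board_size = 3            # hypothetical board width
    key = (2, 1)              # move at column x=2, row y=1

    # same flattening as in expansion(): action = board_size * y + x
    action = board_size * key[1] + key[0]             # -> 5

    # inverse mapping: recover (x, y) from the flat index
    x, y = action % board_size, action // board_size
    assert (x, y) == key
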
Code example #2
    def get_move_probs(self, state, temp=1e-3):
        """Runs all playouts sequentially and returns the available actions and their corresponding probabilities
        Arguments:
        state -- the current state, including both game state and the current player.
        temp -- temperature parameter in (0, 1] that controls the level of exploration
        Returns:
        the available actions and the corresponding probabilities
        """

        logger.debug('get_move_probs, state: %s', state.tolist())
        for node in self.digraph.nodes():
            if np.array_equal(self.digraph.node[node]['state'], state):
                current_node = node
                break
        else:
            raise Exception('Cannot find the board state!')

        logger.debug('get_move_probs, root node: %s', current_node)
        for n in range(self._n_play_out):
            self.play_out(current_node)

        children = self.digraph.successors(current_node)
        nodes = []
        visits = []
        for child_node in children:
            nodes.append(child_node)
            visits.append(self.digraph.node[child_node]['num_visit'])

        # softmax(log(N) / temp) is proportional to N ** (1 / temp); the small
        # epsilon guards against log(0) for unvisited children
        node_probs = ut.softmax(1.0 / temp * np.log(np.array(visits) + 1e-10))
        return nodes, node_probs
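
get_move_probs converts visit counts to probabilities with a temperature-scaled softmax: softmax(log(N) / temp) is proportional to N ** (1 / temp), so a small temp is nearly greedy while temp = 1 reproduces the raw visit distribution. A short standalone sketch with made-up visit counts and a plain softmax standing in for `ut.softmax` (assumed here to be a standard softmax):

    import numpy as np

    def softmax(x):
        # numerically stable softmax, stand-in for ut.softmax
        e = np.exp(x - np.max(x))
        return e / e.sum()

    visits = np.array([40, 10, 1])                   # hypothetical visit counts of three children
    temp = 1e-3
    probs = softmax(np.log(visits + 1e-10) / temp)   # proportional to visits ** (1 / temp)
    print(probs)                                     # ~[1., 0., 0.]: nearly greedy at low temp
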
Code example #3
    def get_action(self, state, is_self_play=True):
        available = list(tt.available_moves(state))

        if len(available) > 0:
            nodes, probs = self.get_move_probs(state)
            logger.debug('Prob: %s', probs)

            if is_self_play:
                # mix in Dirichlet noise for exploration during self-play
                noise = np.random.dirichlet(0.3 * np.ones(len(probs)))
                node = np.random.choice(nodes, p=0.75 * probs + 0.25 * noise)
            else:
                # with the default temp=1e-3, this is almost equivalent to choosing the move with the highest prob
                node = np.random.choice(nodes, p=probs)

            return node
        else:
            logger.error("WARNING: the board is full")
            return None
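
In the self-play branch the search probabilities are mixed with Dirichlet noise for extra exploration, AlphaZero-style. A standalone sketch of the same mixing; the 0.75/0.25 weights and alpha = 0.3 mirror the call above, while the three-move distribution is made up:

    import numpy as np

    probs = np.array([0.7, 0.2, 0.1])                    # hypothetical search probabilities
    noise = np.random.dirichlet(0.3 * np.ones(len(probs)))
    mixed = 0.75 * probs + 0.25 * noise                  # still sums to 1
    move = np.random.choice(len(probs), p=mixed)         # noisy, exploratory pick
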
Code example #4
    def play_out(self, node):
        """
        Play out algorithm

        :return:
        """

        while True:
            is_leaf, selected_node = self.selection(node)
            node = selected_node
            if is_leaf:
                break

        logger.debug('Leaf_node: %s', node)
        done = tt.has_winner(self.digraph.node[node]['state'], self.winning_length)

        if self.digraph.node[node]['side'] == 1:
            # choose the network according to the side stored on the leaf node
            actions, value, _, prob = self.model2.step(np.copy(self.digraph.node[node]['state']),
                                                       np.array(np.ones(1) * 0.01, dtype='float32'), [], [])
        else:
            actions, value, _, prob = self.model.step(np.copy(self.digraph.node[node]['state']),
                                                      np.array(np.ones(1) * 0.01, dtype='float32'), [], [])

        dict_prob = tt.get_available_moves_with_prob(self.digraph.node[node]['state'], prob)
        logger.debug('dict_prob %s', dict_prob)

        if done[0]:
            # terminal leaf: the side stored on the node (the player who just moved) has won
            value = [self.digraph.node[node]['side']]
        elif len(list(tt.available_moves(self.digraph.node[node]['state']))) == 0:
            # draw: the board is full and nobody has won
            value = [0.0]
        else:
            # non-terminal leaf: expand it and back up the network's value estimate
            self.expansion(node, dict_prob)

        # Update value and visit count of nodes in this traversal.
        self.update_recursive(node, value[0])
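
`update_recursive` itself is not shown in these excerpts. A typical MCTS backup increments the visit count and keeps Q as a running mean of the values backed up through a node; the following is a purely illustrative sketch of that rule for a single node, reusing the `num_visit` and `Q` attributes created in `expansion`:

    node = {'num_visit': 3, 'Q': 0.5}   # hypothetical node attributes
    leaf_value = 1.0                    # value backed up from the evaluated leaf

    node['num_visit'] += 1
    # incremental running mean: Q <- Q + (v - Q) / N
    node['Q'] += (leaf_value - node['Q']) / node['num_visit']
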
Code example #5
    def run(self):
        logger.debug('- ' * 20 + 'run' + ' -' * 20)
        node, reward = self.mcts.self_play()
        plus_state, minus_state, plus_action, minus_action = self.mcts.get_state(
            node)
        policy_loss, value_loss, policy_entropy = [], [], []
        policy_loss_2, value_loss_2, policy_entropy_2 = [], [], []

        # Train on the original (unrotated) positions
        p1, v1, e1 = self.pad_training_data(plus_state, plus_action, reward,
                                            self.model)
        p2_2, v2_2, e2_2 = self.pad_training_data(minus_state, minus_action,
                                                  -reward, self.model2)

        policy_loss.append(p1)
        policy_loss_2.append(p2_2)
        value_loss.append(v1)
        value_loss_2.append(v2_2)
        policy_entropy.append(e1)
        policy_entropy_2.append(e2_2)

        # Rotation CCW 180

        rot_plus_action = copy.copy(plus_action)
        rot_minus_action = copy.copy(minus_action)

        rot_matrix = np.rot90(self.matrix_actions, 2)
        # deep-copy the states so the in-place rotations below do not touch the
        # original (unrotated) training samples
        rot_plus_state = copy.deepcopy(plus_state)
        rot_minus_state = copy.deepcopy(minus_state)
        logger.debug('Training samples: %s %s', len(plus_state), len(minus_state))
        for i in range(len(plus_action)):
            rot_plus_state[i][0, :, :, 0] = np.rot90(plus_state[i][0, :, :, 0],
                                                     2)
            rot_plus_action[i] = rot_matrix[int(plus_action[i] % self.nh),
                                            int(plus_action[i] / self.nh)]

        for i in range(len(minus_action)):
            rot_minus_state[i][0, :, :, 0] = np.rot90(minus_state[i][0, :, :, 0],
                                                      2)
            rot_minus_action[i] = rot_matrix[int(minus_action[i] % self.nh),
                                             int(minus_action[i] / self.nh)]

        p1, v1, e1 = self.pad_training_data(rot_plus_state, rot_plus_action,
                                            reward, self.model)
        p2_2, v2_2, e2_2 = self.pad_training_data(rot_minus_state,
                                                  rot_minus_action, -reward,
                                                  self.model2)

        policy_loss.append(p1)
        policy_loss_2.append(p2_2)
        value_loss.append(v1)
        value_loss_2.append(v2_2)
        policy_entropy.append(e1)
        policy_entropy_2.append(e2_2)

        # Rotation CCW 90
        rot_matrix = np.rot90(self.matrix_actions, 1)

        for i in range(len(plus_action)):
            rot_plus_state[i][0, :, :, 0] = np.rot90(plus_state[i][0, :, :, 0],
                                                     1)
            rot_plus_action[i] = rot_matrix[int(plus_action[i] % self.nh),
                                            int(plus_action[i] / self.nh)]

        for i in range(len(minus_action)):
            rot_minus_state[i][0, :, :, 0] = np.rot90(
                minus_state[i][0, :, :, 0], 1)
            rot_minus_action[i] = rot_matrix[int(minus_action[i] % self.nh),
                                             int(minus_action[i] / self.nh)]

        p1, v1, e1 = self.pad_training_data(rot_plus_state, rot_plus_action,
                                            reward, self.model)
        p2_2, v2_2, e2_2 = self.pad_training_data(rot_minus_state,
                                                  rot_minus_action, -reward,
                                                  self.model2)

        policy_loss.append(p1)
        policy_loss_2.append(p2_2)
        value_loss.append(v1)
        value_loss_2.append(v2_2)
        policy_entropy.append(e1)
        policy_entropy_2.append(e2_2)

        # Rotation CCW 270
        rot_matrix = np.rot90(self.matrix_actions, 3)

        for i in range(len(plus_action)):
            rot_plus_state[i][0, :, :, 0] = np.rot90(plus_state[i][0, :, :, 0],
                                                     3)
            rot_plus_action[i] = rot_matrix[int(plus_action[i] % self.nh),
                                            int(plus_action[i] / self.nh)]

        for i in range(len(minus_action)):
            rot_minus_state[i][0, :, :, 0] = np.rot90(
                minus_state[i][0, :, :, 0], 3)
            rot_minus_action[i] = rot_matrix[int(minus_action[i] % self.nh),
                                             int(minus_action[i] / self.nh)]

        p1, v1, e1 = self.pad_training_data(rot_plus_state, rot_plus_action,
                                            reward, self.model)
        p2_2, v2_2, e2_2 = self.pad_training_data(rot_minus_state,
                                                  rot_minus_action, -reward,
                                                  self.model2)

        policy_loss.append(p1)
        policy_loss_2.append(p2_2)
        value_loss.append(v1)
        value_loss_2.append(v2_2)
        policy_entropy.append(e1)
        policy_entropy_2.append(e2_2)
        self.exchange_models()
        self.mcts.exchange_models()

        logger.debug('* ' * 20 + 'run' + ' *' * 20)
        return mean(policy_loss), mean(value_loss), mean(policy_entropy), mean(
            policy_loss_2), mean(value_loss_2), mean(policy_entropy_2)
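
The rotation blocks above augment every self-play position with rotated copies while remapping the flat action index through `self.matrix_actions`. The following self-contained sketch shows why the remapping stays consistent with the rotated board, under two assumptions that the excerpts do not show: `matrix_actions[x, y] == nh * y + x` (the same flattening as in `expansion`) and a `board[y, x]` layout for the spatial plane.

    import numpy as np

    nh = 3
    # hypothetical action-lookup table: matrix_actions[x, y] == nh * y + x
    matrix_actions = np.arange(nh * nh).reshape(nh, nh).T

    board = np.zeros((nh, nh))
    action = 5                                 # (x, y) = (2, 1)
    x, y = action % nh, action // nh
    board[y, x] = 1

    k = 2                                      # 180-degree rotation, as in the first block of run()
    rot_board = np.rot90(board, k)
    rot_matrix = np.rot90(matrix_actions, k)
    rot_action = rot_matrix[x, y]              # same lookup as in run()

    rx, ry = rot_action % nh, rot_action // nh
    assert rot_board[ry, rx] == 1              # the stone and its action index stay aligned
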