Esempio n. 1
0
    def prune(self, X_train, Y_train, X_val, Y_val, n_iterations=10):
        """Shrinks the training set by repeatedly discarding irrelevant nodes.

        The classifier is first fitted on the full training set and run over
        the validation set. Then, for `n_iterations` rounds, every training
        sample whose subgraph node is marked as `c.IRRELEVANT` is dropped, the
        classifier is re-fitted on the remaining samples and its validation
        accuracy is logged. The final pruning ratio is logged as well.

        Args:
            X_train (np.array): Array of training features.
            Y_train (np.array): Array of training labels.
            X_val (np.array): Array of validation features.
            Y_val (np.array): Array of validation labels.
            n_iterations (int): Number of pruning rounds to perform.

        """

        logger.info('Pruning classifier ...')

        # Initial fit and validation pass over the complete training set
        self.fit(X_train, Y_train)
        self.predict(X_val)

        # Reference size used later to compute the pruning ratio
        initial_nodes = self.subgraph.n_nodes

        for t in range(n_iterations):
            logger.info(f'Running iteration {t+1}/{n_iterations} ...')

            # Positions of the nodes that must survive this round
            kept = [
                idx for idx, node in enumerate(self.subgraph.nodes)
                if node.relevant != c.IRRELEVANT
            ]

            # Rebuilds the training set out of the surviving samples only
            X_train = np.asarray([X_train[idx, :] for idx in kept])
            Y_train = np.asarray([Y_train[idx] for idx in kept])

            # Re-fits on the reduced set and evaluates over the validation set
            self.fit(X_train, Y_train)
            preds = self.predict(X_val)
            acc = g.opf_accuracy(Y_val, preds)

            logger.info(f'Current accuracy: {acc}.')

        # Fraction of the initial nodes that has been removed
        prune_ratio = 1 - self.subgraph.n_nodes / initial_nodes

        logger.info(f'Prune ratio: {prune_ratio}.')
Esempio n. 2
0
    def learn(self, X_train, Y_train, X_val, Y_val, n_iterations=10):
        """Learns the best classifier over a validation set.

        At every iteration the classifier is fitted on the training set and
        evaluated on the validation set. Each misclassified validation sample
        is then exchanged with a randomly drawn non-prototype training sample.
        The loop stops once the accuracy changes by less than 1e-4 between two
        consecutive iterations or after `n_iterations` iterations, and the
        state of the most accurate classifier found is restored into this
        instance.

        Note that `X_train`, Y_train`, `X_val` and `Y_val` are modified
        in-place by the sample exchanges.

        Args:
            X_train (np.array): Array of training features.
            Y_train (np.array): Array of training labels.
            X_val (np.array): Array of validation features.
            Y_val (np.array): Array of validation labels.
            n_iterations (int): Number of iterations.

        """

        # Best accuracy found so far and accuracy of the previous iteration
        max_acc = 0
        previous_acc = 0

        # Snapshot of the best classifier (none has been taken yet)
        best_opf = None

        # Iterations counter
        t = 0

        while True:
            # Fits training data into the classifier
            self.fit(X_train, Y_train)

            # Predicts the validation data and calculates its accuracy
            preds = self.predict(X_val)
            acc = g.opf_accuracy(Y_val, preds)

            # The first iteration is always snapshotted, so there is a state
            # to restore even if the accuracy never gets above zero
            if best_opf is None or acc > max_acc:
                max_acc = acc

                # The deep copy is not affected by the swaps performed below
                best_opf = copy.deepcopy(self)

            # Gathers the (flat) indexes of the misclassified samples
            errors = np.flatnonzero(Y_val != preds)

            # Counts how many nodes are not prototypes
            non_prototypes = sum(
                1 for n in self.subgraph.nodes if n.status != c.PROTOTYPE)

            # For every possible error
            for e in errors:
                # Tries at most `non_prototypes` random draws per error
                for _ in range(non_prototypes):
                    # Generates a random index
                    j = int(r.generate_uniform_random_number(0, len(X_train)))

                    # Prototypes are never swapped, thus draws again
                    if self.subgraph.nodes[j].status == c.PROTOTYPE:
                        continue

                    # `X_train[j, :]` is a view, hence it must be copied before
                    # being overwritten; a plain tuple swap would lose the
                    # training row and leave the validation row untouched
                    train_features = X_train[j, :].copy()
                    X_train[j, :] = X_val[e, :]
                    X_val[e, :] = train_features

                    # Labels are scalars, so a tuple swap is safe
                    Y_train[j], Y_val[e] = Y_val[e], Y_train[j]

                    # One less non-prototype available for the next errors
                    non_prototypes -= 1

                    break

            # Calculating difference between current accuracy and previous one
            delta = np.fabs(acc - previous_acc)

            # Replacing the previous accuracy as current accuracy
            previous_acc = acc

            # Incrementing the counter
            t += 1

            # If the difference is smaller than 1e-4 or iterations are finished
            if delta < 0.0001 or t >= n_iterations:
                # Rebinding `self` would only change the local name, so the
                # best state is copied into the instance instead.
                # NOTE(review): assumes the instance state lives in `__dict__`
                # (i.e., the class does not define `__slots__`) - confirm.
                vars(self).update(vars(best_opf))

                break
Esempio n. 3
0
    def _learn(self, X_train, Y_train, X_val, Y_val):
        """Learns the best `k` value over the validation set.

        A new subgraph is built from the training data and, for every `k` in
        `[1, self.max_k]`, its arcs and p.d.f. are computed, the subgraph is
        clustered and the validation accuracy is measured. Ties are resolved
        in favor of the smallest `k`. The arcs are destroyed after each
        evaluation and `self.subgraph.best_k` is left at the last evaluated
        `k`, hence the caller has to apply the returned value.

        Args:
            X_train (np.array): Array of training features.
            Y_train (np.array): Array of training labels.
            X_val (np.array): Array of validation features.
            Y_val (np.array): Array of validation labels.

        Returns:
            The best `k` value found over the validation set.

        Raises:
            e.BuildError: If pre-computed distances are used and the matrix
                is not sized `n_nodes x n_nodes`.
            ValueError: If `self.max_k` is smaller than 1, i.e., there is no
                `k` value to be evaluated.

        """

        logger.info('Learning best `k` value ...')

        # Creating a subgraph
        self.subgraph = KNNSubgraph(X_train, Y_train)

        # Checks if it is supposed to use pre-computed distances
        if self.pre_computed_distance:
            n_nodes = self.subgraph.n_nodes

            # Comparing the leading dimensions at once also rejects matrices
            # with less than two dimensions
            if self.pre_distances.shape[:2] != (n_nodes, n_nodes):
                # If not, raises an error
                raise e.BuildError(
                    'Pre-computed distance matrix should have the size of `n_nodes x n_nodes`'
                )

        # Defining initial maximum accuracy as 0 and no best `k` yet
        max_acc = 0.0
        best_k = None

        # For every possible `k` value
        for k in range(1, self.max_k + 1):
            # Gathers current `k` as subgraph's best `k`
            self.subgraph.best_k = k

            # Calculate the arcs using the current `k` value
            self.subgraph.create_arcs(k, self.distance_fn,
                                      self.pre_computed_distance,
                                      self.pre_distances)

            # Calculate the p.d.f. using the current `k` value
            self.subgraph.calculate_pdf(k, self.distance_fn,
                                        self.pre_computed_distance,
                                        self.pre_distances)

            # Clusters the subgraph
            self._clustering()

            # Calculate the predictions over the validation set
            preds = self.predict(X_val)

            # Calculating the accuracy
            acc = g.opf_accuracy(Y_val, preds)

            # The first `k` is always accepted, so a value is still returned
            # when no `k` achieves an accuracy above zero
            if best_k is None or acc > max_acc:
                # Replaces the maximum accuracy value
                max_acc = acc

                # Defines current `k` as the best `k` value
                best_k = k

            logger.info(f'Accuracy over k = {k}: {acc}')

            # Destroy the arcs
            self.subgraph.destroy_arcs()

        # No `k` has been evaluated at all
        if best_k is None:
            raise ValueError(
                f'`max_k` should be >= 1 to learn a `k` value, got {self.max_k}')

        return best_k