Esempio n. 1
0
def test_build_error():
    """Checks that a raised `BuildError` is caught by a matching `except` clause."""

    # The instance is built outside the `try`, so a failing constructor is not swallowed
    build_error = exception.BuildError('error')

    try:
        raise build_error
    except exception.BuildError:
        # Reaching this branch is the expected outcome
        return None
Esempio n. 2
0
    def fit(self, X_train, Y_train=None):
        """Clusters the training data and flags the subgraph as trained.

        A `KNNSubgraph` is built from the samples, the optional pre-computed
        distance matrix is validated against the subgraph size, and then
        `_best_minimum_cut` (with `min_k` / `max_k`) and `_clustering`
        (with the subgraph's `best_k`) are invoked in that order.

        Args:
            X_train (np.array): Array of training features.
            Y_train (np.array): Array of training labels.

        Raises:
            BuildError: If pre-computed distances are enabled and the matrix
                is not `n_nodes x n_nodes`.

        """

        logger.info('Clustering with classifier ...')

        # Timestamp used to report the elapsed clustering time
        started_at = time.time()

        self.subgraph = KNNSubgraph(X_train, Y_train)

        if self.pre_computed_distance:
            expected = self.subgraph.n_nodes

            # Both dimensions must match the amount of nodes
            is_square = (self.pre_distances.shape[0] == expected
                         and self.pre_distances.shape[1] == expected)

            if not is_square:
                raise e.BuildError(
                    'Pre-computed distance matrix should have the size of `n_nodes x n_nodes`'
                )

        # Searches for the best cut within the allowed range of `k`
        self._best_minimum_cut(self.min_k, self.max_k)

        # Final clustering pass using the `k` chosen above
        self._clustering(self.subgraph.best_k)

        # From here on, `predict` accepts this subgraph
        self.subgraph.trained = True

        elapsed = time.time() - started_at

        logger.info('Classifier has been clustered with.')
        logger.info(f'Number of clusters: {self.subgraph.n_clusters}.')
        logger.info(f'Clustering time: {elapsed} seconds.')
Esempio n. 3
0
    def _learn(self, X_train, Y_train, I_train, X_val, Y_val, I_val):
        """Learns the best `k` value over the validation set.

        Every `k` in `[1, max_k]` is tried in turn: arcs and p.d.f. are
        computed for that `k`, the subgraph is clustered, the validation set
        is predicted and scored with `g.opf_accuracy`. The first `k` reaching
        the highest accuracy is stored in `self.subgraph.best_k`.

        Args:
            X_train (np.array): Array of training features.
            Y_train (np.array): Array of training labels.
            I_train (np.array): Array of training indexes.
            X_val (np.array): Array of validation features.
            Y_val (np.array): Array of validation labels.
            I_val (np.array): Array of validation indexes.

        Raises:
            BuildError: If the pre-computed distance matrix is not
                `n_nodes x n_nodes`, or if `max_k` is smaller than 1 so that
                no `k` could be evaluated.

        """

        logger.info('Learning best `k` value ...')

        # Creating a subgraph
        self.subgraph = KNNSubgraph(X_train, Y_train, I_train)

        # Checks if it is supposed to use pre-computed distances
        if self.pre_computed_distance:
            # Checks if its size is the same as the subgraph's amount of nodes
            if self.pre_distances.shape[
                    0] != self.subgraph.n_nodes or self.pre_distances.shape[
                        1] != self.subgraph.n_nodes:
                # If not, raises an error
                raise e.BuildError(
                    'Pre-computed distance matrix should have the size of `n_nodes x n_nodes`'
                )

        # Defining initial maximum accuracy as 0
        max_acc = 0.0

        # Stays `None` until some `k` scores above `max_acc`; without this
        # binding, an all-zero accuracy run would hit an unbound local below
        best_k = None

        # For every possible `k` value
        for k in range(1, self.max_k + 1):
            # Gathers current `k` as subgraph's best `k`
            self.subgraph.best_k = k

            # Calculate the arcs using the current `k` value
            self.subgraph.create_arcs(k, self.distance_fn,
                                      self.pre_computed_distance,
                                      self.pre_distances)

            # Calculate the p.d.f. using the current `k` value
            self.subgraph.calculate_pdf(k, self.distance_fn,
                                        self.pre_computed_distance,
                                        self.pre_distances)

            # Clusters the subgraph
            self._clustering()

            # Calculate the predictions over the validation set
            preds = self.predict(X_val, I_val)

            # Calculating the accuracy
            acc = g.opf_accuracy(Y_val, preds)

            # Strict comparison keeps the smallest `k` among ties
            if acc > max_acc:
                max_acc = acc
                best_k = k

            logger.info('Accuracy over k = %d: %s', k, acc)

            # Arcs are rebuilt from scratch for the next `k`
            self.subgraph.destroy_arcs()

        if best_k is None:
            # An empty search range means nothing was evaluated at all
            if self.max_k < 1:
                raise e.BuildError(
                    '`max_k` should be at least 1 to learn the best `k` value')

            # No `k` scored above zero: fall back to the smallest candidate,
            # consistent with the tie-breaking of the loop above
            best_k = 1

        # Applying the best k to the subgraph's property
        self.subgraph.best_k = best_k
Esempio n. 4
0
    def predict(self, X_val, I_val=None):
        """Assigns a predicted label and a cluster label to every sample.

        For each sample, the `best_k` closest training nodes are kept in an
        insertion-sorted buffer, a density is estimated from their distances,
        and the labels of the neighbour with the highest
        `min(neighbour cost, density)` are copied to the sample.

        Args:
            X_val (np.array): Array of validation features.
            I_val (np.array): Array of validation indexes.

        Returns:
            A tuple `(preds, clusters)` holding the predicted labels and the
            cluster labels, one entry per record of the data.

        Raises:
            BuildError: If there is no subgraph or it is not flagged as trained.

        """

        if not self.subgraph:
            raise e.BuildError('KNNSubgraph has not been properly created')

        if not self.subgraph.trained:
            raise e.BuildError('Classifier has not been properly clustered')

        logger.info('Predicting data ...')

        # Timestamp used to report the elapsed prediction time
        started_at = time.time()

        pred_subgraph = KNNSubgraph(X_val, I=I_val)

        best_k = self.subgraph.best_k

        # Slots `0..best_k-1` hold the sorted neighbours; the extra slot at
        # index `best_k` stages each new candidate before it is sorted in
        distances = np.zeros(best_k + 1)
        neighbours_idx = np.zeros(best_k + 1)

        for i in range(pred_subgraph.n_nodes):
            sample = pred_subgraph.nodes[i]

            # Any finite candidate cost will beat this starting value
            best_cost = -c.FLOAT_MAX

            # Resets the buffer; `neighbours_idx` is not reset, stale entries
            # are filtered later through their `FLOAT_MAX` distance
            distances.fill(c.FLOAT_MAX)

            for j in range(self.subgraph.n_nodes):
                # NOTE(review): `i` indexes the prediction subgraph while `j`
                # indexes the trained one, so this skips an unrelated
                # training node - confirm whether that is intended
                if j == i:
                    continue

                candidate = self.subgraph.nodes[j]

                if self.pre_computed_distance:
                    staged = self.pre_distances[sample.idx][candidate.idx]
                else:
                    staged = self.distance_fn(sample.features,
                                              candidate.features)

                # Stages the candidate in the spare slot
                distances[best_k] = staged
                neighbours_idx[best_k] = j

                # Bubbles the candidate towards the front while it is
                # strictly closer than the entry before it
                slot = best_k
                while slot > 0 and distances[slot] < distances[slot - 1]:
                    previous = slot - 1

                    distances[slot], distances[previous] = (
                        distances[previous], distances[slot])

                    neighbours_idx[slot], neighbours_idx[previous] = (
                        neighbours_idx[previous], neighbours_idx[slot])

                    slot = previous

            # Mean of the exponential kernel over the kept distances
            density = sum((np.exp(-distances[k] / self.subgraph.constant)
                           for k in range(best_k)), 0.0)
            density /= best_k

            # Scale the density between minimum and maximum values
            scaled = (c.MAX_DENSITY - 1) * (density - self.subgraph.min_density)
            spread = (self.subgraph.max_density - self.subgraph.min_density +
                      c.EPSILON)
            density = scaled / spread + 1

            for k in range(best_k):
                # Slots still holding the fill value never received a neighbour
                if distances[k] == c.FLOAT_MAX:
                    continue

                # Indexes are stored as floats in the buffer
                winner = self.subgraph.nodes[int(neighbours_idx[k])]

                candidate_cost = np.minimum(winner.cost, density)

                if candidate_cost > best_cost:
                    best_cost = candidate_cost

                    # Both labels are propagated from the winning neighbour
                    sample.predicted_label = winner.predicted_label
                    sample.cluster_label = winner.cluster_label

        preds = [node.predicted_label for node in pred_subgraph.nodes]
        clusters = [node.cluster_label for node in pred_subgraph.nodes]

        elapsed = time.time() - started_at

        logger.info('Data has been predicted.')
        logger.info('Prediction time: %s seconds.', elapsed)

        return preds, clusters
Esempio n. 5
0
    def predict(self, X_val, I_val=None):
        """Labels every sample with the label of the training node that conquers it.

        Training nodes are visited following `self.subgraph.idx_nodes`. The
        cost offered by a node is the maximum between its own cost and its
        distance to the sample; the node offering the smallest cost gives its
        predicted label to the sample. Whenever a node other than the first
        one wins, it is passed to `self.subgraph.mark_nodes`.

        Args:
            X_val (np.array): Array of validation or test features.
            I_val (np.array): Array of validation or test indexes.

        Returns:
            A list of predictions for each record of the data.

        Raises:
            BuildError: If there is no subgraph or it is not flagged as trained.

        """

        if not self.subgraph:
            raise e.BuildError('Subgraph has not been properly created')

        if not self.subgraph.trained:
            raise e.BuildError('Classifier has not been properly fitted')

        logger.info('Predicting data ...')

        started_at = time.time()

        pred_subgraph = Subgraph(X_val, I=I_val)

        def arc_weight(train_node, sample):
            # Distance between a training node and a sample, either looked
            # up in the pre-computed matrix or calculated on the fly
            if self.pre_computed_distance:
                return self.pre_distances[train_node.idx][sample.idx]

            return self.distance_fn(train_node.features, sample.features)

        for i in range(pred_subgraph.n_nodes):
            sample = pred_subgraph.nodes[i]

            # Stays at -1 while the first node of the ordered list is the winner
            conqueror = -1

            # Position inside the ordered list of training nodes
            position = 0

            first = self.subgraph.nodes[self.subgraph.idx_nodes[position]]

            weight = arc_weight(first, sample)

            # Cost offered by a node: the larger of its own cost and the arc
            min_cost = np.maximum(first.cost, weight)
            current_label = first.predicted_label

            while position < (self.subgraph.n_nodes - 1):
                challenger_id = self.subgraph.idx_nodes[position + 1]
                challenger = self.subgraph.nodes[challenger_id]

                # Stops as soon as the next node's own cost is not below the
                # best offer (presumably `idx_nodes` is ordered by cost,
                # which is what would make this early exit valid - confirm)
                if not (min_cost > challenger.cost):
                    break

                weight = arc_weight(challenger, sample)

                offered_cost = np.maximum(challenger.cost, weight)

                if offered_cost < min_cost:
                    min_cost = offered_cost
                    conqueror = challenger_id
                    current_label = challenger.predicted_label

                position += 1

            sample.predicted_label = current_label

            # Marks the conqueror node and its path
            if conqueror > -1:
                self.subgraph.mark_nodes(conqueror)

        preds = [node.predicted_label for node in pred_subgraph.nodes]

        elapsed = time.time() - started_at

        logger.info('Data has been predicted.')
        logger.info('Prediction time: %s seconds.', elapsed)

        return preds
Esempio n. 6
0
    def _learn(self, X_train, Y_train, X_val, Y_val):
        """Learns the best `k` value over the validation set.

        Every `k` in `[1, max_k]` is tried in turn: arcs and p.d.f. are
        computed for that `k`, the subgraph is clustered, the validation set
        is predicted and scored with `g.opf_accuracy`. The first `k` reaching
        the highest accuracy is stored in `self.subgraph.best_k`.

        Args:
            X_train (np.array): Array of training features.
            Y_train (np.array): Array of training labels.
            X_val (np.array): Array of validation features.
            Y_val (np.array): Array of validation labels.

        Raises:
            BuildError: If the pre-computed distance matrix is not
                `n_nodes x n_nodes`, or if `max_k` is smaller than 1 so that
                no `k` could be evaluated.

        """

        logger.info('Learning best `k` value ...')

        # Creating a subgraph
        self.subgraph = KNNSubgraph(X_train, Y_train)

        # Checks if it is supposed to use pre-computed distances
        if self.pre_computed_distance:
            # Checks if its size is the same as the subgraph's amount of nodes
            if self.pre_distances.shape[
                    0] != self.subgraph.n_nodes or self.pre_distances.shape[
                        1] != self.subgraph.n_nodes:
                # If not, raises an error
                raise e.BuildError(
                    'Pre-computed distance matrix should have the size of `n_nodes x n_nodes`'
                )

        # Defining initial maximum accuracy as 0
        max_acc = 0.0

        # Stays `None` until some `k` scores above `max_acc`; without this
        # binding, an all-zero accuracy run would hit an unbound local below
        best_k = None

        # For every possible `k` value
        for k in range(1, self.max_k + 1):
            # Gathers current `k` as subgraph's best `k`
            self.subgraph.best_k = k

            # Calculate the arcs using the current `k` value
            self.subgraph.create_arcs(k, self.distance_fn,
                                      self.pre_computed_distance,
                                      self.pre_distances)

            # Calculate the p.d.f. using the current `k` value
            self.subgraph.calculate_pdf(k, self.distance_fn,
                                        self.pre_computed_distance,
                                        self.pre_distances)

            # Clusters the subgraph
            self._clustering()

            # Calculate the predictions over the validation set
            preds = self.predict(X_val)

            # Calculating the accuracy
            acc = g.opf_accuracy(Y_val, preds)

            # Strict comparison keeps the smallest `k` among ties
            if acc > max_acc:
                # Replaces the maximum accuracy value
                max_acc = acc

                # Defines current `k` as the best `k` value
                best_k = k

            logger.info(f'Accuracy over k = {k}: {acc}')

            # Arcs are rebuilt from scratch for the next `k`
            self.subgraph.destroy_arcs()

        if best_k is None:
            # An empty search range means nothing was evaluated at all
            if self.max_k < 1:
                raise e.BuildError(
                    '`max_k` should be at least 1 to learn the best `k` value')

            # No `k` scored above zero: fall back to the smallest candidate,
            # consistent with the tie-breaking of the loop above
            best_k = 1

        # Applying the best k to the subgraph's property
        self.subgraph.best_k = best_k
Esempio n. 7
0
    def fit(self, X_train, Y_train, X_unlabeled):
        """Fits the semi-supervised classifier on labeled and unlabeled samples.

        A `Subgraph` is built from the labeled data and `_find_prototypes` is
        called on it; the unlabeled samples are then appended as extra nodes.
        Starting from the prototypes, costs are propagated through a heap:
        each node removed from it offers `max(its cost, arc weight)` to every
        costlier node, and a node accepting the offer inherits the
        predecessor and the predicted label (also written to its `label`).

        Args:
            X_train (np.array): Array of training features.
            Y_train (np.array): Array of training labels.
            X_unlabeled (np.array): Array of unlabeled features.

        Raises:
            BuildError: If pre-computed distances are enabled and the matrix
                is not `n_nodes x n_nodes`.

        """

        logger.info('Fitting semi-supervised classifier ...')

        # Timestamp used to report the elapsed training time
        started_at = time.time()

        self.subgraph = Subgraph(X_train, Y_train)

        # Prototypes are searched before the unlabeled nodes are added
        self._find_prototypes()

        # Unlabeled nodes are numbered right after the labeled ones
        first_new_idx = self.subgraph.n_nodes

        for offset, feature in enumerate(X_unlabeled):
            self.subgraph.nodes.append(Node(first_new_idx + offset, 1, feature))

        if self.pre_computed_distance:
            expected = self.subgraph.n_nodes

            # Both dimensions must match the amount of nodes
            is_square = (self.pre_distances.shape[0] == expected
                         and self.pre_distances.shape[1] == expected)

            if not is_square:
                raise e.BuildError(
                    'Pre-computed distance matrix should have the size of `n_nodes x n_nodes`'
                )

        heap = Heap(size=self.subgraph.n_nodes)

        for i in range(self.subgraph.n_nodes):
            node = self.subgraph.nodes[i]

            # Non-prototypes start at the maximum cost and are not inserted yet
            if node.status != c.PROTOTYPE:
                heap.cost[i] = c.FLOAT_MAX
                continue

            # Prototypes are roots: no predecessor, own label, zero cost
            node.pred = c.NIL
            node.predicted_label = node.label
            heap.cost[i] = 0
            heap.insert(i)

        while not heap.is_empty():
            p = heap.remove()

            # Records the removal order of the nodes
            self.subgraph.idx_nodes.append(p)

            source = self.subgraph.nodes[p]
            source.cost = heap.cost[p]

            for q in range(self.subgraph.n_nodes):
                # Only other nodes that are currently costlier than `p`
                # can be improved through it
                if p == q or not (heap.cost[p] < heap.cost[q]):
                    continue

                target = self.subgraph.nodes[q]

                if self.pre_computed_distance:
                    weight = self.pre_distances[source.idx][target.idx]
                else:
                    weight = self.distance_fn(source.features, target.features)

                # Cost offered to `q`: the larger of `p`'s cost and the arc
                offered_cost = np.maximum(heap.cost[p], weight)

                if offered_cost < heap.cost[q]:
                    target.pred = p
                    target.predicted_label = source.predicted_label

                    # Unlabeled nodes get their label from the prediction
                    target.label = target.predicted_label

                    heap.update(q, offered_cost)

        # From here on, the subgraph is flagged as trained
        self.subgraph.trained = True

        elapsed = time.time() - started_at

        logger.info('Semi-supervised classifier has been fitted.')
        logger.info(f'Training time: {elapsed} seconds.')
Esempio n. 8
0
    def predict(self, X_val, I_val=None):
        """Assigns a predicted label and a cluster label to every sample.

        For each sample, `self.ann_search.query` supplies `best_k` neighbour
        indexes and distances; a density is estimated from those distances
        and the labels of the neighbour with the highest
        `min(neighbour cost, density)` are copied to the sample. The elapsed
        time is stored in `self.pred_time`.

        Args:
            X_val (np.array): Array of validation features.
            I_val (np.array): Array of validation indexes.

        Returns:
            A tuple `(preds, clusters)` holding the predicted labels and the
            cluster labels, one entry per record of the data.

        Raises:
            BuildError: If there is no subgraph or it is not flagged as trained.

        """

        if not self.subgraph:
            raise e.BuildError('ANNSubgraph has not been properly created')

        if not self.subgraph.trained:
            raise e.BuildError('Classifier has not been properly clustered')

        logger.info('Predicting data ...')

        # Timestamp used to report the elapsed prediction time
        started_at = time.time()

        pred_subgraph = ANNSubgraph(X_val, I=I_val)

        best_k = self.subgraph.best_k

        for i in range(pred_subgraph.n_nodes):
            sample = pred_subgraph.nodes[i]

            # Neighbour lookup is delegated to the search structure
            neighbors_idx, distances = self.ann_search.query(
                sample.features, best_k)

            # Mean of the exponential kernel over the returned distances
            kernel = np.exp(-np.array(distances) / self.subgraph.constant)
            density = np.sum(kernel) / best_k

            # Scale the density between minimum and maximum values
            scaled = (c.MAX_DENSITY - 1) * (density - self.subgraph.min_density)
            spread = (self.subgraph.max_density - self.subgraph.min_density +
                      c.EPSILON)
            density = scaled / spread + 1

            neighbor_costs = [
                self.subgraph.nodes[idx].cost for idx in neighbors_idx
            ]

            # Each neighbour's offer is capped by the sample's density
            offers = np.minimum(neighbor_costs, [density])

            # The neighbour with the largest offer wins (first one on ties)
            winner = self.subgraph.nodes[int(neighbors_idx[np.argmax(offers)])]

            # Both labels are propagated from the winning neighbour
            sample.predicted_label = winner.predicted_label
            sample.cluster_label = winner.cluster_label

        preds = [node.predicted_label for node in pred_subgraph.nodes]
        clusters = [node.cluster_label for node in pred_subgraph.nodes]

        # Kept on the instance, not just logged
        self.pred_time = time.time() - started_at

        logger.info('Data has been predicted.')
        logger.info(f'Prediction time: {self.pred_time: .4f} seconds.')

        return preds, clusters