def prune(self, X_train, Y_train, X_val, Y_val, n_iterations=10):
    """Prunes a classifier over a validation set.

    Args:
        X_train (np.array): Array of training features.
        Y_train (np.array): Array of training labels.
        X_val (np.array): Array of validation features.
        Y_val (np.array): Array of validation labels.
        n_iterations (int): Maximum number of iterations.

    """

    logger.info('Pruning classifier ...')

    # Initial fit/predict pass so every training node gets a relevance mark
    self.fit(X_train, Y_train)
    self.predict(X_val)

    # Node count before any pruning, used for the final ratio
    initial_nodes = self.subgraph.n_nodes

    for t in range(n_iterations):
        logger.info(f'Running iteration {t+1}/{n_iterations} ...')

        # Keeps only the samples whose corresponding nodes are still relevant
        kept = [(X_train[idx, :], Y_train[idx])
                for idx, node in enumerate(self.subgraph.nodes)
                if node.relevant != c.IRRELEVANT]

        # Rebuilds the training arrays from the surviving samples
        X_train = np.asarray([features for features, _ in kept])
        Y_train = np.asarray([label for _, label in kept])

        # Re-fits on the pruned data and re-evaluates over the validation set
        self.fit(X_train, Y_train)
        preds = self.predict(X_val)

        # Accuracy after this pruning round
        acc = g.opf_accuracy(Y_val, preds)

        logger.info(f'Current accuracy: {acc}.')

    # Node count after pruning
    final_nodes = self.subgraph.n_nodes

    # Fraction of nodes removed from the original subgraph
    prune_ratio = 1 - final_nodes / initial_nodes

    logger.info(f'Prune ratio: {prune_ratio}.')
def learn(self, X_train, Y_train, X_val, Y_val, n_iterations=10):
    """Learns the best classifier over a validation set.

    Misclassified validation samples are swapped with random non-prototype
    training samples between iterations; the best classifier state seen is
    copied back into this instance before returning.

    Args:
        X_train (np.array): Array of training features.
        Y_train (np.array): Array of training labels.
        X_val (np.array): Array of validation features.
        Y_val (np.array): Array of validation labels.
        n_iterations (int): Number of iterations.

    """

    # Best and previous accuracies seen so far
    max_acc = 0
    previous_acc = 0

    # Snapshot of the best classifier found so far
    best_opf = None

    # Iterations counter
    t = 0

    while True:
        # Fits training data into the classifier
        self.fit(X_train, Y_train)

        # Predicts over the validation set
        preds = self.predict(X_val)

        # Calculating accuracy
        acc = g.opf_accuracy(Y_val, preds)

        # Keeps a deep copy of the best classifier seen so far
        if acc > max_acc:
            max_acc = acc
            best_opf = copy.deepcopy(self)

        # Indices of the misclassified validation samples
        errors = np.argwhere(Y_val != preds)

        # Counts the current non-prototype training nodes
        non_prototypes = 0
        for n in self.subgraph.nodes:
            if n.status != c.PROTOTYPE:
                non_prototypes += 1

        # Swaps each misclassified validation sample with a randomly
        # chosen non-prototype training sample
        for err in errors:
            # Counter bounds the random search over training nodes
            ctr = non_prototypes

            while ctr > 0:
                # Generates a random training index
                j = int(r.generate_uniform_random_number(0, len(X_train)))

                if self.subgraph.nodes[j].status != c.PROTOTYPE:
                    # `X_train[j, :]` is a NumPy *view*: a plain tuple swap
                    # would overwrite it before `X_val` reads it, losing the
                    # training row — copy it so the swap is a true exchange
                    X_train[j, :], X_val[err, :] = X_val[err, :], X_train[j, :].copy()
                    Y_train[j], Y_val[err] = Y_val[err], Y_train[j]

                    # One fewer non-prototype available; stop searching
                    non_prototypes -= 1
                    ctr = 0
                else:
                    # Prototype hit: try another random index
                    ctr -= 1

    # Calculating difference between current accuracy and previous one
        delta = np.fabs(acc - previous_acc)

        # Replacing the previous accuracy as current accuracy
        previous_acc = acc

        # Incrementing the counter
        t += 1

        # Stops on convergence (delta < 1e-4) or when iterations are exhausted
        if delta < 0.0001 or t == n_iterations:
            # `self = best_opf` would only rebind the local name and leave the
            # caller's instance untouched; copy the best classifier's state
            # into this instance instead
            if best_opf is not None:
                self.__dict__.update(best_opf.__dict__)

            break
def _learn(self, X_train, Y_train, X_val, Y_val):
    """Learns the best `k` value over the validation set.

    Args:
        X_train (np.array): Array of training features.
        Y_train (np.array): Array of training labels.
        X_val (np.array): Array of validation features.
        Y_val (np.array): Array of validation labels.

    Returns:
        The best `k` value found over the validation set.

    """

    logger.info('Learning best `k` value ...')

    # Creating a subgraph from the training data
    self.subgraph = KNNSubgraph(X_train, Y_train)

    # When using pre-computed distances, the matrix must be `n_nodes x n_nodes`
    if self.pre_computed_distance:
        if self.pre_distances.shape[0] != self.subgraph.n_nodes or \
                self.pre_distances.shape[1] != self.subgraph.n_nodes:
            raise e.BuildError(
                'Pre-computed distance matrix should have the size of `n_nodes x n_nodes`'
            )

    # Defining initial maximum accuracy as 0
    max_acc = 0.0

    # Initialized so a valid `k` is returned even if no accuracy ever
    # exceeds `max_acc` (the original left `best_k` unbound in that case)
    best_k = 1

    # For every possible `k` value
    for k in range(1, self.max_k + 1):
        # Gathers current `k` as subgraph's best `k`
        self.subgraph.best_k = k

        # Builds the arcs and the p.d.f. using the current `k` value
        self.subgraph.create_arcs(k, self.distance_fn,
                                  self.pre_computed_distance,
                                  self.pre_distances)
        self.subgraph.calculate_pdf(k, self.distance_fn,
                                    self.pre_computed_distance,
                                    self.pre_distances)

        # Clusters the subgraph and evaluates over the validation set
        self._clustering()
        preds = self.predict(X_val)

        # Calculating the accuracy
        acc = g.opf_accuracy(Y_val, preds)

        # Tracks the `k` that produced the best accuracy
        if acc > max_acc:
            max_acc = acc
            best_k = k

        logger.info(f'Accuracy over k = {k}: {acc}')

        # Destroys the arcs so the next `k` starts from a clean subgraph
        self.subgraph.destroy_arcs()

    return best_k