def fit(self, X_train, Y_train=None):
    """Clusters the training data and flags the subgraph as trained.

    A `KNNSubgraph` is built from the data and stored in `self.subgraph`.
    When pre-computed distances are enabled, the distance matrix is
    validated against the number of nodes. Afterwards
    `_best_minimum_cut(self.min_k, self.max_k)` is executed and the
    subgraph is clustered using the value left in `self.subgraph.best_k`.

    Args:
        X_train (np.array): Array of training features.
        Y_train (np.array): Array of training labels (defaults to None).

    Raises:
        e.BuildError: If pre-computed distances are enabled and the matrix
            is not `n_nodes x n_nodes`.

    """

    logger.info('Clustering with classifier ...')

    # Wall-clock reference used for the final timing report
    start = time.time()

    # Builds the k-nn subgraph that holds the training samples
    self.subgraph = KNNSubgraph(X_train, Y_train)

    if self.pre_computed_distance:
        # The matrix must be square and match the amount of nodes
        matrix_shape = self.pre_distances.shape
        n_nodes = self.subgraph.n_nodes

        if not (matrix_shape[0] == n_nodes and matrix_shape[1] == n_nodes):
            raise e.BuildError(
                'Pre-computed distance matrix should have the size of `n_nodes x n_nodes`'
            )

    # Searches `k` in the [min_k, max_k] interval through the minimum cut
    self._best_minimum_cut(self.min_k, self.max_k)

    # Clusters the subgraph with the `k` stored by the previous step
    self._clustering(self.subgraph.best_k)

    # From now on, `predict` is allowed to use this subgraph
    self.subgraph.trained = True

    # Elapsed time of the whole clustering task
    train_time = time.time() - start

    logger.info('Classifier has been clustered with.')
    logger.info(f'Number of clusters: {self.subgraph.n_clusters}.')
    logger.info(f'Clustering time: {train_time} seconds.')
def predict(self, X_val):
    """Predicts new data using the pre-trained classifier.

    Every prediction node walks through the trained nodes following
    `self.subgraph.idx_nodes` and keeps the one offering the smallest
    `max(node cost, arc weight)`. The walk stops as soon as the next
    trained node's own cost is not smaller than the best cost found so far.

    Args:
        X_val (np.array): Array of validation or test features.

    Returns:
        A tuple `(preds, conqs)`: the list of predicted labels and the list
        holding, for each record, the `idx_sample` of the trained node that
        conquered it.

    Raises:
        e.BuildError: If there is no subgraph or it has not been trained.

    """

    trained_graph = self.subgraph

    if not trained_graph:
        raise e.BuildError('Subgraph has not been properly created')

    if not trained_graph.trained:
        raise e.BuildError('Classifier has not been properly fitted')

    def arc_weight(train_node, query_node):
        # Arc weight between a trained node and a prediction node, either
        # looked up in the pre-computed matrix or calculated on the fly
        if self.pre_computed_distance:
            return self.pre_distances[train_node.idx][query_node.idx]

        return self.distance_fn(train_node.features, query_node.features)

    # Initializing the timer
    start = time.time()

    # Subgraph that wraps the samples to be predicted
    pred_subgraph = Subgraph(X_val)

    for i in range(pred_subgraph.n_nodes):
        query = pred_subgraph.nodes[i]
        ordered = trained_graph.idx_nodes

        # Identifier of the conquering node (-1 means "none recorded")
        # NOTE(review): when the first ordered node wins, `conqueror` stays
        # at -1 and `mark_nodes` is skipped - confirm this is intended
        conqueror = -1

        # The first node of the ordered list is the initial winner
        first = trained_graph.nodes[ordered[0]]
        weight = arc_weight(first, query)
        min_cost = np.maximum(first.cost, weight)
        current_label = first.predicted_label
        current_conqueror = first.idx_sample

        # Visits the remaining nodes in order
        for rank in range(1, trained_graph.n_nodes):
            candidate_id = ordered[rank]
            candidate = trained_graph.nodes[candidate_id]

            # A candidate whose own cost does not beat the current best
            # ends the search for this prediction node
            if not (min_cost > candidate.cost):
                break

            weight = arc_weight(candidate, query)

            # Cost offered by the candidate: maximum between its cost and the arc
            temp_min_cost = np.maximum(candidate.cost, weight)

            if temp_min_cost < min_cost:
                # The candidate becomes the new winner
                min_cost = temp_min_cost
                conqueror = candidate_id
                current_label = candidate.predicted_label
                current_conqueror = candidate.idx_sample

        # Stores the winning label and the winning sample's index
        query.predicted_label = current_label
        query._idx_sample_conqueror = current_conqueror

        if conqueror > -1:
            # Marks the conqueror node and its path
            self.subgraph.mark_nodes(conqueror)

    # Collects the outputs in the same order as the input records
    preds = [node.predicted_label for node in pred_subgraph.nodes]
    conqs = [node._idx_sample_conqueror for node in pred_subgraph.nodes]

    # Ending timer (the elapsed time is computed but not reported)
    end = time.time()
    predict_time = end - start

    return preds, conqs
def fit(self, X_train, Y_train, idx_train):
    """Fits data in the classifier.

    Builds a `Subgraph`, finds its prototypes and then propagates costs
    from the prototypes to every other node with a minimum heap, where the
    cost offered to a node is the maximum between the offering node's cost
    and the arc weight between them.

    Args:
        X_train (np.array): Array of training features.
        Y_train (np.array): Array of training labels.
        idx_train: Forwarded as the third argument of `Subgraph`
            (presumably the samples' original indexes - confirm against
            the `Subgraph` constructor).

    Raises:
        e.BuildError: If pre-computed distances are enabled and the matrix
            is not `n_nodes x n_nodes`.

    """

    # Initializing the timer
    start = time.time()

    # Subgraph holding the training samples
    self.subgraph = Subgraph(X_train, Y_train, idx_train)

    if self.pre_computed_distance:
        # The matrix must be square and match the amount of nodes
        matrix_shape = self.pre_distances.shape
        n_nodes = self.subgraph.n_nodes

        if not (matrix_shape[0] == n_nodes and matrix_shape[1] == n_nodes):
            raise e.BuildError(
                'Pre-computed distance matrix should have the size of `n_nodes x n_nodes`'
            )

    # Finding prototypes
    self._find_prototypes()

    graph = self.subgraph

    # Minimum heap that drives the cost propagation
    h = Heap(size=graph.n_nodes)

    for i in range(graph.n_nodes):
        node = graph.nodes[i]

        if node.status != c.PROTOTYPE:
            # Regular nodes start with the maximum possible cost
            h.cost[i] = c.FLOAT_MAX
            continue

        # Prototypes have no predecessor, keep their true label,
        # start with zero cost and are the heap's initial content
        node.pred = c.NIL
        node.predicted_label = node.label
        h.cost[i] = 0
        h.insert(i)

    while not h.is_empty():
        # Removes a node from the heap and registers the removal order
        p = h.remove()
        graph.idx_nodes.append(p)

        # Its cost is now final
        source = graph.nodes[p]
        source.cost = h.cost[p]

        for q in range(graph.n_nodes):
            # Only different nodes with a higher cost may be conquered
            if q == p or not (h.cost[p] < h.cost[q]):
                continue

            target = graph.nodes[q]

            # Arc weight: looked up or calculated
            if self.pre_computed_distance:
                weight = self.pre_distances[source.idx][target.idx]
            else:
                weight = self.distance_fn(source.features, target.features)

            # Cost offered by `p`: maximum between its cost and the arc
            current_cost = np.maximum(h.cost[p], weight)

            if current_cost < h.cost[q]:
                # `p` conquers `q`: predecessor and label are propagated
                target.pred = p
                target.predicted_label = source.predicted_label

                # Updates the heap `q` node and the current cost
                h.update(q, current_cost)

    # The subgraph has been properly trained
    self.subgraph.trained = True

    # Ending timer (the elapsed time is computed but not reported)
    end = time.time()
    train_time = end - start
def _learn(self, X_train, Y_train, X_val, Y_val):
    """Learns the best `k` value over the validation set.

    For every `k` in `[1, self.max_k]` the subgraph's arcs and p.d.f. are
    rebuilt, the subgraph is clustered, the validation set is predicted and
    its accuracy is measured. The first `k` reaching the highest accuracy
    is returned.

    Args:
        X_train (np.array): Array of training features.
        Y_train (np.array): Array of training labels.
        X_val (np.array): Array of validation features.
        Y_val (np.array): Array of validation labels.

    Returns:
        The best `k` value found over the validation set. If no `k`
        achieves an accuracy above zero, the smallest `k` (1) is returned.

    Raises:
        e.BuildError: If pre-computed distances are enabled and the matrix
            is not `n_nodes x n_nodes`, or if `self.max_k` is smaller
            than 1 (no `k` can be evaluated).

    """

    logger.info('Learning best `k` value ...')

    # Creating a subgraph
    self.subgraph = KNNSubgraph(X_train, Y_train)

    # Checks if it is supposed to use pre-computed distances
    if self.pre_computed_distance:
        # Checks if its size is the same as the subgraph's amount of nodes
        if self.pre_distances.shape[0] != self.subgraph.n_nodes or \
                self.pre_distances.shape[1] != self.subgraph.n_nodes:
            # If not, raises an error
            raise e.BuildError(
                'Pre-computed distance matrix should have the size of `n_nodes x n_nodes`'
            )

    # Defining initial maximum accuracy as 0
    max_acc = 0.0

    # No `k` has been selected yet; binding the name up-front avoids an
    # `UnboundLocalError` when no accuracy ever exceeds `max_acc`
    best_k = None

    # For every possible `k` value
    for k in range(1, self.max_k + 1):
        # Gathers current `k` as subgraph's best `k`
        self.subgraph.best_k = k

        # Calculate the arcs using the current `k` value
        self.subgraph.create_arcs(k, self.distance_fn,
                                  self.pre_computed_distance,
                                  self.pre_distances)

        # Calculate the p.d.f. using the current `k` value
        self.subgraph.calculate_pdf(k, self.distance_fn,
                                    self.pre_computed_distance,
                                    self.pre_distances)

        # Clusters the subgraph
        self._clustering()

        # Calculate the predictions over the validation set
        preds = self.predict(X_val)

        # Calculating the accuracy
        acc = g.opf_accuracy(Y_val, preds)

        # Strict comparison: ties keep the first (smallest) `k`
        if acc > max_acc:
            # Replaces the maximum accuracy value
            max_acc = acc

            # Defines current `k` as the best `k` value
            best_k = k

        logger.info(f'Accuracy over k = {k}: {acc}')

        # Destroy the arcs
        self.subgraph.destroy_arcs()

    # Handles the cases in which the loop never selected a `k`
    if best_k is None:
        # An empty search interval cannot produce any answer
        if self.max_k < 1:
            raise e.BuildError(
                '`max_k` should be at least 1 to learn the best `k` value')

        # Every evaluated `k` scored no accuracy above zero, so they are
        # all equivalent; falls back to the smallest one
        logger.info('No `k` achieved an accuracy above 0, using k = 1')
        best_k = 1

    return best_k
def predict(self, X_val):
    """Predicts new data using the pre-trained classifier.

    For each prediction node, the `best_k` nearest trained nodes are
    gathered, a density is estimated from their distances and scaled, and
    the labels of the neighbour offering the highest
    `min(neighbour cost, density)` are propagated to the node.

    Args:
        X_val (np.array): Array of validation features.

    Returns:
        A tuple `(preds, clusters)` with the predicted label and the cluster
        label of each record of the data.

    Raises:
        e.BuildError: If there is no subgraph or it has not been trained.

    """

    if not self.subgraph:
        raise e.BuildError('KNNSubgraph has not been properly created')

    if not self.subgraph.trained:
        raise e.BuildError('Classifier has not been properly clustered')

    # Initializing the timer
    start = time.time()

    # Subgraph that wraps the samples to be predicted
    pred_subgraph = KNNSubgraph(X_val)

    trained = self.subgraph
    best_k = trained.best_k

    # Buffers with `best_k + 1` positions: the extra (last) slot receives
    # each incoming candidate before it is sorted into place
    distances = np.zeros(best_k + 1)
    neighbours_idx = np.zeros(best_k + 1)

    for i in range(pred_subgraph.n_nodes):
        query = pred_subgraph.nodes[i]

        # Best cost found so far for this node
        cost = -c.FLOAT_MAX

        # Resets the distances so stale neighbours are never considered
        distances.fill(c.FLOAT_MAX)

        for j in range(trained.n_nodes):
            # NOTE(review): `i` indexes prediction nodes while `j` indexes
            # trained nodes, so this skips the trained node that merely
            # shares the same position - confirm this is intended
            if j == i:
                continue

            candidate = trained.nodes[j]

            # Places the candidate's distance in the extra slot
            if self.pre_computed_distance:
                distances[best_k] = self.pre_distances[query.idx][candidate.idx]
            else:
                distances[best_k] = self.distance_fn(query.features,
                                                     candidate.features)

            neighbours_idx[best_k] = j

            # Bubbles the candidate towards the front while it is closer
            # than the entry that precedes it
            slot = best_k
            while slot > 0 and distances[slot] < distances[slot - 1]:
                prev = slot - 1

                distances[slot], distances[prev] = \
                    distances[prev], distances[slot]
                neighbours_idx[slot], neighbours_idx[prev] = \
                    neighbours_idx[prev], neighbours_idx[slot]

                slot = prev

        # Mean of the exponential kernel over the `best_k` nearest distances
        density = 0.0
        for rank in range(best_k):
            density += np.exp(-distances[rank] / trained.constant)
        density /= best_k

        # Scale the density between minimum and maximum values
        density_range = trained.max_density - trained.min_density + c.EPSILON
        density = (c.MAX_DENSITY - 1) * (density - trained.min_density) / density_range + 1

        for rank in range(best_k):
            # Slots still holding the maximum value were never filled
            if distances[rank] == c.FLOAT_MAX:
                continue

            # Indexes are stored as floats, hence the cast
            neighbour = trained.nodes[int(neighbours_idx[rank])]

            # Cost offered by the neighbour
            temp_cost = np.minimum(neighbour.cost, density)

            if temp_cost > cost:
                # The neighbour conquers the node and propagates its labels
                cost = temp_cost
                query.predicted_label = neighbour.predicted_label
                query.cluster_label = neighbour.cluster_label

    # Collects the outputs in the same order as the input records
    preds = [node.predicted_label for node in pred_subgraph.nodes]
    clusters = [node.cluster_label for node in pred_subgraph.nodes]

    # Ending timer (the elapsed time is computed but not reported)
    end = time.time()
    predict_time = end - start

    return preds, clusters