def _clustering(self, force_prototype=False):
    """Clusters the subgraph through a density-based competition between nodes.

    The method first makes plateau arcs symmetric (equal-density neighbours are
    linked in both directions) and then lets nodes conquer their neighbours in
    decreasing order of cost, using a maximum heap.

    Args:
        force_prototype (bool): Whether every class should be forced to have at least one prototype.

    """

    nodes = self.subgraph.nodes
    n_nodes = self.subgraph.n_nodes

    # Step 1: whenever `src` and one of its neighbours share the same density,
    # guarantee that `src` also appears in that neighbour's adjacency list
    for src in range(n_nodes):
        for neighbour in nodes[src].adjacency:
            # Adjacency entries may not be plain integers
            dst = int(neighbour)

            # Only plateaus (identical densities) are made symmetric
            if nodes[src].density != nodes[dst].density:
                continue

            # Inserts `src` at the front only when it is not already adjacent to `dst`
            if src not in [int(entry) for entry in nodes[dst].adjacency]:
                nodes[dst].adjacency.insert(0, src)

    # Step 2: every node enters a maximum heap as its own root
    heap = Heap(size=n_nodes, policy='max')

    for idx in range(n_nodes):
        # The heap starts from the node's current cost
        heap.cost[idx] = nodes[idx].cost

        # No predecessor yet, and the node is its own root
        nodes[idx].pred = c.NIL
        nodes[idx].root = idx

        heap.insert(idx)

    # Step 3: competition process
    while not heap.is_empty():
        p = heap.remove()

        # Keeps track of the order in which nodes left the heap
        self.subgraph.idx_nodes.append(p)

        winner = nodes[p]

        # A node leaving the heap without a predecessor is a root:
        # its cost becomes its density and it keeps its true label
        if winner.pred == c.NIL:
            heap.cost[p] = winner.density
            winner.predicted_label = winner.label

        # The node's final cost is whatever the heap holds for it
        winner.cost = heap.cost[p]

        for neighbour in winner.adjacency:
            q = int(neighbour)

            # Nodes already removed from the heap cannot be conquered again
            if heap.color[q] == c.BLACK:
                continue

            # Cost offered to `q`: the bottleneck between `p` cost and `q` density
            offered_cost = np.minimum(heap.cost[p], nodes[q].density)

            # When forcing prototypes, `p` must never conquer a node of another class
            if force_prototype and winner.label != nodes[q].label:
                offered_cost = -c.FLOAT_MAX

            # `p` conquers `q` only if it offers a strictly higher cost
            if offered_cost > heap.cost[q]:
                nodes[q].pred = p
                nodes[q].root = winner.root
                nodes[q].predicted_label = winner.predicted_label

                # Repositions `q` in the heap with its new cost
                heap.update(q, offered_cost)
def _clustering(self, n_neighbours):
    """Clusters the subgraph using a `k` value (number of neighbours).

    Only the first `n_neighbours` adjacency entries of each node are scanned
    while making plateau arcs symmetric; nodes inserted that way are counted in
    `n_plateaus`, so the competition later visits `n_plateaus + n_neighbours`
    entries per node.

    Args:
        n_neighbours (int): Number of neighbours to be used.

    """

    graph = self.subgraph
    nodes = graph.nodes

    # Step 1: makes equal-density arcs symmetric
    for i in range(graph.n_nodes):
        for k in range(n_neighbours):
            # `k`-th adjacent node of `i` (indexed live, as the lists may grow)
            j = int(nodes[i].adjacency[k])

            # Only plateaus (identical densities) are of interest
            if nodes[i].density != nodes[j].density:
                continue

            # Looks for `i` among the first `n_neighbours` entries of `j`
            leading_entries = [
                int(nodes[j].adjacency[pos]) for pos in range(n_neighbours)
            ]

            if i in leading_entries:
                continue

            # Inserts `i` at the front of `j` adjacency and counts the new plateau arc
            nodes[j].adjacency.insert(0, i)
            nodes[j].n_plateaus += 1

    # Step 2: every node enters a maximum heap as its own root
    heap = Heap(size=graph.n_nodes, policy='max')

    for i in range(graph.n_nodes):
        heap.cost[i] = nodes[i].cost

        # No predecessor yet, and the node is its own root
        nodes[i].pred = c.NIL
        nodes[i].root = i

        heap.insert(i)

    # Identifier that will be handed to the next discovered cluster
    next_cluster = 0

    # Step 3: competition process
    while not heap.is_empty():
        p = heap.remove()

        # Keeps track of the order in which nodes left the heap
        graph.idx_nodes.append(p)

        # A node leaving the heap without a predecessor starts a new cluster
        if nodes[p].pred == c.NIL:
            heap.cost[p] = nodes[p].density
            nodes[p].cluster_label = next_cluster
            next_cluster += 1

        # The node's final cost is whatever the heap holds for it
        nodes[p].cost = heap.cost[p]

        # Plateau insertions come on top of the `n_neighbours` original arcs
        n_adjacents = nodes[p].n_plateaus + n_neighbours

        for k in range(n_adjacents):
            q = int(nodes[p].adjacency[k])

            # Nodes already removed from the heap cannot be conquered again
            if heap.color[q] == c.BLACK:
                continue

            # Cost offered to `q`: the bottleneck between `p` cost and `q` density
            offered_cost = np.minimum(heap.cost[p], nodes[q].density)

            # `p` conquers `q` only if it offers a strictly higher cost
            if offered_cost > heap.cost[q]:
                nodes[q].pred = p
                nodes[q].root = nodes[p].root
                nodes[q].cluster_label = nodes[p].cluster_label

                # Repositions `q` in the heap with its new cost
                heap.update(q, offered_cost)

    # Every root found during the competition accounts for one cluster
    graph.n_clusters = next_cluster
def _find_prototypes(self):
    """Finds prototype nodes using the Minimum Spanning Tree (MST) approach.

    The tree is grown from node 0; whenever a node joins the tree through a
    predecessor holding a different label, both endpoints are marked as prototypes.

    """

    logger.debug('Finding prototypes ...')

    nodes = self.subgraph.nodes
    n_nodes = self.subgraph.n_nodes

    # Heap with one slot per node
    heap = Heap(n_nodes)

    # The tree starts at the first node, which has no predecessor
    nodes[0].pred = c.NIL
    heap.insert(0)

    # Identifiers of the prototypes, in discovery order
    prototypes = []

    while not heap.is_empty():
        p = heap.remove()

        # The node's cost is the one it had on the heap when removed
        nodes[p].cost = heap.cost[p]

        pred = nodes[p].pred

        # An arc connecting different labels turns both of its ends into prototypes
        if pred != c.NIL and nodes[p].label != nodes[pred].label:
            for endpoint in (p, pred):
                if nodes[endpoint].status != c.PROTOTYPE:
                    nodes[endpoint].status = c.PROTOTYPE
                    prototypes.append(endpoint)

        # Tries to reach every node that is still outside the tree
        for q in range(n_nodes):
            if heap.color[q] == c.BLACK or p == q:
                continue

            # Arc weight comes from the distance matrix or from the distance function
            if self.pre_computed_distance:
                weight = self.pre_distances[nodes[p].idx][nodes[q].idx]
            else:
                weight = self.distance_fn(nodes[p].features, nodes[q].features)

            # A lighter arc makes `p` the new predecessor of `q`
            if weight < heap.cost[q]:
                nodes[q].pred = p
                heap.update(q, weight)

    logger.debug('Prototypes: %s.', prototypes)
def fit(self, X_train, Y_train, I_train=None):
    """Fits data in the classifier.

    Builds a subgraph from the training data, finds its prototypes and runs the
    competition process in which prototypes propagate their labels to the other nodes.

    Args:
        X_train (np.array): Array of training features.
        Y_train (np.array): Array of training labels.
        I_train (np.array): Array of training indexes.

    """

    logger.info('Fitting classifier ...')

    start = time.time()

    # Builds the training subgraph and marks its prototypes
    self.subgraph = Subgraph(X_train, Y_train, I=I_train)
    self._find_prototypes()

    nodes = self.subgraph.nodes
    n_nodes = self.subgraph.n_nodes

    # Minimum heap used throughout the competition
    heap = Heap(size=n_nodes)

    for i in range(n_nodes):
        # Regular nodes start at the maximum possible cost and stay out of the heap
        if nodes[i].status != c.PROTOTYPE:
            heap.cost[i] = c.FLOAT_MAX
            continue

        # Prototypes have no predecessor, keep their true label and cost nothing
        nodes[i].pred = c.NIL
        nodes[i].predicted_label = nodes[i].label
        heap.cost[i] = 0
        heap.insert(i)

    while not heap.is_empty():
        p = heap.remove()

        # Keeps track of the order in which nodes left the heap
        self.subgraph.idx_nodes.append(p)

        # The node's final cost is whatever the heap holds for it
        nodes[p].cost = heap.cost[p]

        for q in range(n_nodes):
            # Skips the node itself and any node that `p` cannot possibly improve
            if p == q or not heap.cost[p] < heap.cost[q]:
                continue

            # Arc weight comes from the distance matrix or from the distance function
            if self.pre_computed_distance:
                weight = self.pre_distances[nodes[p].idx][nodes[q].idx]
            else:
                weight = self.distance_fn(nodes[p].features, nodes[q].features)

            # Path cost through `p` is the largest value between its cost and the arc
            offered_cost = np.maximum(heap.cost[p], weight)

            # `p` conquers `q` only if it offers a strictly lower cost
            if offered_cost < heap.cost[q]:
                nodes[q].pred = p
                nodes[q].predicted_label = nodes[p].predicted_label

                # Repositions `q` in the heap with its new cost
                heap.update(q, offered_cost)

    # Flags the subgraph as trained
    self.subgraph.trained = True

    train_time = time.time() - start

    logger.info('Classifier has been fitted.')
    logger.info('Training time: %s seconds.', train_time)
def fit(self, X_train, Y_train, X_unlabeled, I_train=None):
    """Fits data in the semi-supervised classifier.

    Prototypes are found on the labeled samples only; unlabeled samples are then
    appended to the subgraph as extra nodes and take part in the competition,
    receiving the label of whichever node conquers them.

    Args:
        X_train (np.array): Array of training features.
        Y_train (np.array): Array of training labels.
        X_unlabeled (np.array): Array of unlabeled features.
        I_train (np.array): Array of training indexes.

    """

    logger.info('Fitting semi-supervised classifier ...')

    start = time.time()

    # Builds the labeled subgraph and marks its prototypes
    self.subgraph = Subgraph(X_train, Y_train, I_train)
    self._find_prototypes()

    # Identifiers of the unlabeled nodes continue after the labeled ones
    first_unlabeled_idx = self.subgraph.n_nodes

    for offset, feature in enumerate(X_unlabeled):
        # Unlabeled nodes are created with `1` as their label argument
        unlabeled_node = Node(first_unlabeled_idx + offset, 1, feature)
        self.subgraph.nodes.append(unlabeled_node)

    nodes = self.subgraph.nodes

    # Read only after the unlabeled nodes were appended
    n_nodes = self.subgraph.n_nodes

    # Minimum heap used throughout the competition
    heap = Heap(size=n_nodes)

    for i in range(n_nodes):
        # Regular nodes start at the maximum possible cost and stay out of the heap
        if nodes[i].status != c.PROTOTYPE:
            heap.cost[i] = c.FLOAT_MAX
            continue

        # Prototypes have no predecessor, keep their true label and cost nothing
        nodes[i].pred = c.NIL
        nodes[i].predicted_label = nodes[i].label
        heap.cost[i] = 0
        heap.insert(i)

    while not heap.is_empty():
        p = heap.remove()

        # Keeps track of the order in which nodes left the heap
        self.subgraph.idx_nodes.append(p)

        # The node's final cost is whatever the heap holds for it
        nodes[p].cost = heap.cost[p]

        for q in range(n_nodes):
            # Skips the node itself and any node that `p` cannot possibly improve
            if p == q or not heap.cost[p] < heap.cost[q]:
                continue

            # Arc weight comes from the distance matrix or from the distance function
            if self.pre_computed_distance:
                weight = self.pre_distances[nodes[p].idx][nodes[q].idx]
            else:
                weight = self.distance_fn(nodes[p].features, nodes[q].features)

            # Path cost through `p` is the largest value between its cost and the arc
            offered_cost = np.maximum(heap.cost[p], weight)

            # `p` conquers `q` only if it offers a strictly lower cost
            if offered_cost < heap.cost[q]:
                nodes[q].pred = p
                nodes[q].predicted_label = nodes[p].predicted_label

                # As `q` may be unlabeled, its label follows its predicted label
                nodes[q].label = nodes[q].predicted_label

                # Repositions `q` in the heap with its new cost
                heap.update(q, offered_cost)

    # Flags the subgraph as trained
    self.subgraph.trained = True

    train_time = time.time() - start

    logger.info('Semi-supervised classifier has been fitted.')
    logger.info('Training time: %s seconds.', train_time)
def _find_prototypes(self):
    """Finds prototype nodes using the Minimum Spanning Tree (MST) approach.

    Starting from node 0, nodes are attached to the tree through their lightest
    arc. Each time a node and its predecessor carry different labels, the two of
    them become prototypes.

    """

    logger.debug('Finding prototypes ...')

    subgraph = self.subgraph

    # Heap sized to hold every node of the subgraph
    heap = Heap(subgraph.n_nodes)

    # Node 0 seeds the tree and therefore has no predecessor
    subgraph.nodes[0].pred = c.NIL
    heap.insert(0)

    # Collects prototype identifiers as they are found
    prototypes = []

    while not heap.is_empty():
        p = heap.remove()
        node_p = subgraph.nodes[p]

        # Stores the cost the node had on the heap when it was removed
        node_p.cost = heap.cost[p]

        pred = node_p.pred

        if pred != c.NIL:
            node_pred = subgraph.nodes[pred]

            # A label change along the tree arc marks a class boundary
            if node_p.label != node_pred.label:
                # Marks the current node, unless it already is a prototype
                if node_p.status != c.PROTOTYPE:
                    node_p.status = c.PROTOTYPE
                    prototypes.append(p)

                # Marks the predecessor, unless it already is a prototype
                if node_pred.status != c.PROTOTYPE:
                    node_pred.status = c.PROTOTYPE
                    prototypes.append(pred)

        for q in range(subgraph.n_nodes):
            # Nodes already removed from the heap are part of the tree
            if heap.color[q] == c.BLACK:
                continue

            # A node is never connected to itself
            if p == q:
                continue

            node_q = subgraph.nodes[q]

            # Uses the pre-computed matrix when available, the distance function otherwise
            if self.pre_computed_distance:
                weight = self.pre_distances[node_p.idx][node_q.idx]
            else:
                weight = self.distance_fn(node_p.features, node_q.features)

            # Keeps the lightest arc seen so far reaching `q`
            if weight < heap.cost[q]:
                node_q.pred = p
                heap.update(q, weight)

    logger.debug('Prototypes: %s.', prototypes)