from pathlib import Path

import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import shortest_path
# validate_graph is a private SciPy helper; its exact location may vary across SciPy versions.
from scipy.sparse.csgraph._validation import validate_graph


def get_dist_matrix(similarity_graph, base_path, image_count, similar_count):
    """Compute (or load a cached copy of) the all-pairs shortest-path matrix."""
    dist_matrix_file = Path(base_path) / "dist_matrix{0}{1}.npy".format(image_count, similar_count)
    if dist_matrix_file.is_file():
        # Reuse the result cached by a previous run.
        dist_matrix = np.load(dist_matrix_file)
    else:
        # Symmetrize the graph, convert it to CSR, and validate it.
        similarity_graph = np.maximum(similarity_graph, similarity_graph.T)
        similarity_graph_sparse = validate_graph(csr_matrix(similarity_graph), False, np.float64)
        # Johnson's algorithm ("J") for all-pairs shortest paths on the undirected graph.
        dist_matrix = shortest_path(similarity_graph_sparse, method="J", directed=False)
        np.save(dist_matrix_file, dist_matrix)
    return dist_matrix
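
# Illustrative usage sketch: exercising the .npy cache in get_dist_matrix on a
# tiny random similarity graph.  The sizes, temporary directory, and random
# data below are assumptions chosen only for demonstration.
def _example_dist_matrix_cache():
    import tempfile

    rng = np.random.default_rng(0)
    sim = rng.random((5, 5))
    np.fill_diagonal(sim, 0)

    with tempfile.TemporaryDirectory() as tmp:
        # First call computes the matrix and writes dist_matrix53.npy ...
        d1 = get_dist_matrix(sim, tmp, image_count=5, similar_count=3)
        # ... second call loads it straight from the cache file.
        d2 = get_dist_matrix(sim, tmp, image_count=5, similar_count=3)
        assert np.allclose(d1, d2)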
def minimum_spanning_tree_K(csgraph, k=1, overwrite=False):
    """Kruskal-style minimum spanning tree with an extra ``k`` parameter
    forwarded to the compiled kernel."""
    csgraph = validate_graph(csgraph, True, np.float64,
                             dense_output=False, copy_if_sparse=not overwrite)
    N = csgraph.shape[0]

    data = csgraph.data
    indices = csgraph.indices
    indptr = csgraph.indptr

    # Union-find state for Kruskal's algorithm: every node starts as its own tree.
    rank = np.zeros(N, dtype=np.int32)
    predecessors = np.arange(N, dtype=np.int32)

    # Visit edges in order of increasing weight.
    i_sort = np.argsort(data).astype(np.int32)
    row_indices = np.zeros(len(data), dtype=np.int32)

    # min_spanning_tree_K is assumed to be a compiled (Cython) kernel that zeroes
    # out, in place, the entries of ``data`` that do not belong to the tree.
    min_spanning_tree_K(data, indices, indptr, i_sort,
                        row_indices, predecessors, rank, k)

    sp_tree = csr_matrix((data, indices, indptr), (N, N))
    sp_tree.eliminate_zeros()
    return sp_tree
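
# Reference sketch (an assumption about intent): with the default k=1 the
# routine above is expected to agree with SciPy's standard minimum spanning
# tree, shown here on a small example graph for comparison.
def _example_scipy_mst_baseline():
    from scipy.sparse.csgraph import minimum_spanning_tree

    G = csr_matrix(np.array([[0, 8, 0, 3],
                             [0, 0, 2, 5],
                             [0, 0, 0, 6],
                             [0, 0, 0, 0]], dtype=float))
    print(minimum_spanning_tree(G).toarray())
    # [[0. 0. 0. 3.]
    #  [0. 0. 2. 5.]
    #  [0. 0. 0. 0.]
    #  [0. 0. 0. 0.]]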
def fit(self, X, y=None):
    """Fit the clustering model.

    Parameters
    ----------
    X : array_like
        the data to be clustered: shape = [n_samples, n_features]
    """
    if self.cutoff is None and self.cutoff_scale is None:
        raise ValueError("Must specify either cutoff or cutoff_scale")

    # Compute the distance-based graph G from the points in X
    if self.metric == 'precomputed':
        # Input is already a graph. Copy if sparse
        # so we can overwrite for efficiency below.
        self.X_fit_ = None
        G = validate_graph(X, directed=True,
                           csr_output=True, dense_output=False,
                           copy_if_sparse=True, null_value_in=np.inf)
    elif not self.approximate:
        X = check_array(X)
        self.X_fit_ = X
        kwds = self.metric_params or {}
        G = pairwise_distances(X, metric=self.metric, **kwds)
        G = validate_graph(G, directed=True,
                           csr_output=True, dense_output=False,
                           copy_if_sparse=True, null_value_in=np.inf)
    else:
        # generate a sparse graph using n_neighbors of each point
        X = check_array(X)
        self.X_fit_ = X
        n_neighbors = min(self.n_neighbors, X.shape[0] - 1)
        G = kneighbors_graph(X, n_neighbors=n_neighbors,
                             mode='distance',
                             metric=self.metric,
                             metric_params=self.metric_params)

    # HACK to keep explicit zeros (minimum spanning tree removes them)
    zero_fillin = G.data[G.data > 0].min() * 1E-8
    G.data[G.data == 0] = zero_fillin

    # Compute the minimum spanning tree of this graph
    self.full_tree_ = minimum_spanning_tree(G, overwrite=True)

    # undo the hack to bring back explicit zeros
    self.full_tree_[self.full_tree_ == zero_fillin] = 0

    # Partition the data by the cutoff
    N = G.shape[0] - 1
    if self.cutoff is None:
        i_cut = N
    elif 0 <= self.cutoff < 1:
        i_cut = int((1 - self.cutoff) * N)
    elif self.cutoff >= 1:
        i_cut = int(N - self.cutoff)
    else:
        raise ValueError('self.cutoff must be positive, not {0}'
                         ''.format(self.cutoff))

    # create the mask; we zero-out values where the mask is True
    N = len(self.full_tree_.data)
    if i_cut < 0:
        mask = np.ones(N, dtype=bool)
    elif i_cut >= N:
        mask = np.zeros(N, dtype=bool)
    else:
        mask = np.ones(N, dtype=bool)
        part = np.argpartition(self.full_tree_.data, i_cut)
        mask[part[:i_cut]] = False

    # additionally cut values above the ``cutoff_scale``
    if self.cutoff_scale is not None:
        mask |= (self.full_tree_.data > self.cutoff_scale)

    # Trim the tree
    cluster_graph = self.full_tree_.copy()

    # Eliminate zeros from cluster_graph for efficiency.
    # We want to do this:
    #     cluster_graph.data[mask] = 0
    #     cluster_graph.eliminate_zeros()
    # but there could be explicit zeros in our data!
    # So we call eliminate_zeros() with a stand-in data array,
    # then replace the data when we're finished.
    original_data = cluster_graph.data
    cluster_graph.data = np.arange(1, len(cluster_graph.data) + 1)
    cluster_graph.data[mask] = 0
    cluster_graph.eliminate_zeros()
    cluster_graph.data = original_data[cluster_graph.data.astype(int) - 1]

    # find connected components
    n_components, labels = connected_components(cluster_graph,
                                                directed=False)

    # remove clusters with fewer than min_cluster_size counts
    counts = np.bincount(labels)
    to_remove = np.where(counts < self.min_cluster_size)[0]
    if len(to_remove) > 0:
        for i in to_remove:
            labels[labels == i] = -1
        _, labels = np.unique(labels, return_inverse=True)
        labels -= 1  # keep -1 labels the same

    # update cluster_graph by eliminating non-clusters
    # operationally, this means zeroing-out rows & columns where
    # the label is negative
    I = sparse.eye(len(labels))
    I.data[0, labels < 0] = 0

    # we could just do this:
    #     cluster_graph = I * cluster_graph * I
    # but we want to be able to eliminate the zeros, so we use
    # the same indexing trick as above
    original_data = cluster_graph.data
    cluster_graph.data = np.arange(1, len(cluster_graph.data) + 1)
    cluster_graph = I * cluster_graph * I
    cluster_graph.eliminate_zeros()
    cluster_graph.data = original_data[cluster_graph.data.astype(int) - 1]

    self.labels_ = labels
    self.cluster_graph_ = cluster_graph
    return self
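
# Standalone sketch of the "stand-in data" trick used in fit() above: drop the
# masked entries of a CSR matrix via eliminate_zeros() without also losing
# edges whose weight is legitimately zero.  The toy matrix and mask are
# illustrative assumptions.
def _example_standin_data_trick():
    from scipy.sparse import csr_matrix

    g = csr_matrix(np.array([[0., 1., 0.],
                             [0., 0., 2.],
                             [3., 0., 0.]]))
    g.data[1] = 0.0                          # a stored edge whose weight is exactly 0
    mask = np.array([False, False, True])    # we only want to drop the third stored edge

    original_data = g.data
    g.data = np.arange(1, len(g.data) + 1)   # unique nonzero stand-ins per stored entry
    g.data[mask] = 0
    g.eliminate_zeros()                      # removes only the masked entry
    g.data = original_data[g.data.astype(int) - 1]

    print(g.nnz, g.data)                     # 2 [1. 0.]  -- the explicit zero survived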
def fit(self, X, y=None):
    """Fit the clustering model.

    The algorithm used to threshold edge lengths in the MST is selected
    by ``self.threshold`` ('hermite' or the histogram-based default).

    Parameters
    ----------
    X : array_like
        the data to be clustered: shape = [n_samples, n_features]
    """
    # Compute the distance-based graph G from the points in X
    if self.metric == 'precomputed':
        # Input is already a graph. Copy if sparse
        # so we can overwrite for efficiency below.
        self.X_fit_ = None
        G = validate_graph(X, directed=True,
                           csr_output=True, dense_output=False,
                           copy_if_sparse=True, null_value_in=np.inf)
    elif not self.approximate:
        X = check_array(X)
        self.X_fit_ = X
        kwds = self.metric_params or {}
        G = pairwise_distances(X, metric=self.metric, **kwds)
        G = validate_graph(G, directed=True,
                           csr_output=True, dense_output=False,
                           copy_if_sparse=True, null_value_in=np.inf)
    else:
        # generate a sparse graph using n_neighbors of each point
        X = check_array(X)
        self.X_fit_ = X
        n_neighbors = min(self.n_neighbors, X.shape[0] - 1)
        G = kneighbors_graph(X, n_neighbors=n_neighbors,
                             mode='distance',
                             metric=self.metric,
                             metric_params=self.metric_params)

    # HACK to keep explicit zeros (minimum spanning tree removes them)
    zero_fillin = G.data[G.data > 0].min() * 1E-8
    G.data[G.data == 0] = zero_fillin

    # Compute the minimum spanning tree of this graph
    self.full_tree_ = minimum_spanning_tree(G, overwrite=True)

    # undo the hack to bring back explicit zeros
    self.full_tree_[self.full_tree_ == zero_fillin] = 0

    # Choose the maximum allowed edge length, then mask longer edges
    if self.threshold == 'hermite':
        max_edge = self._hermite_threshold(self.full_tree_)
    else:
        max_edge = self._histogram_threshold(self.full_tree_)
    mask = self.full_tree_.data > max_edge

    # Remove the masked edges using the same stand-in-data trick as above,
    # so that legitimate explicit zeros survive eliminate_zeros()
    cluster_graph = self.full_tree_.copy()
    original_data = cluster_graph.data
    cluster_graph.data = np.arange(1, len(cluster_graph.data) + 1)
    cluster_graph.data[mask] = 0
    cluster_graph.eliminate_zeros()
    cluster_graph.data = original_data[cluster_graph.data.astype(int) - 1]

    self.cluster_graph_ = cluster_graph
    self.n_components_, self.labels_ = connected_components(cluster_graph,
                                                            directed=False)
    return self
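
# Minimal end-to-end sketch of the trimming step above on toy 1-D data, with a
# fixed cut-off standing in for the _hermite_threshold / _histogram_threshold
# helpers (which are not shown here).  Cutting the single long MST edge leaves
# two connected components.
def _example_threshold_and_label():
    from scipy.sparse.csgraph import minimum_spanning_tree, connected_components
    from sklearn.metrics import pairwise_distances

    X = np.array([[0.0], [0.1], [0.2], [5.0], [5.1]])   # two obvious groups
    tree = minimum_spanning_tree(pairwise_distances(X))

    max_edge = 1.0                       # stand-in for the learned threshold
    mask = tree.data > max_edge
    tree.data[mask] = 0
    tree.eliminate_zeros()

    n_components, labels = connected_components(tree, directed=False)
    print(n_components, labels)          # 2 [0 0 0 1 1]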