def fit(self, X):
    """Cluster X by cutting the heaviest edges of its Euclidean MST.

    A minimum spanning tree is built over the samples and its edges are
    visited from heaviest to lightest.  An edge is removed unless removing
    it would leave a component consisting of a single sample.  Cutting
    stops once only ``n_samples - n_clusters`` edges remain.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Input data.

    Returns
    -------
    self
        ``self.nearest_neighbors_`` holds the fitted neighbors estimator
        and ``self.labels_`` the connected-component label of each sample.

    Raises
    ------
    IndexError
        If every edge has been tried and the requested number of clusters
        still cannot be reached without creating single-sample clusters.
    """
    # ``sparse.cs_graph_components`` no longer exists in SciPy; this is its
    # replacement.  Imported locally so the block does not depend on the
    # file-level imports.
    from scipy.sparse.csgraph import connected_components

    self.nearest_neighbors_ = NearestNeighbors(
        algorithm=self.nearest_neighbor_algorithm)
    self.nearest_neighbors_.fit(X)
    forest = euclidean_mst(X, self.nearest_neighbors_)

    # Copy the weights: ``forest.data`` may alias the matrix storage, in
    # which case zeroing an entry below would also zero the saved weight
    # and a rejected cut could not be undone.
    weights = forest.data.copy()
    # Edge indices sorted from heaviest to lightest.
    order = np.argsort(weights)[::-1]
    # NOTE(review): assumes ``forest.data`` and ``forest.nonzero()`` list
    # the edges in the same order (no explicitly stored zeros) - confirm
    # against euclidean_mst.
    edges = np.vstack(forest.nonzero()).T
    # A spanning tree over n nodes has n - 1 edges.
    n_samples = len(edges) + 1
    n_edges_to_keep = n_samples - self.n_clusters

    i = 0
    while len(forest.nonzero()[0]) > n_edges_to_keep:
        if i >= len(order):
            raise IndexError(
                "Ran out of edges to cut before reaching %d clusters."
                % self.n_clusters)
        edge_index = order[i]
        row, col = edges[edge_index]
        forest[row, col] = 0
        _, labels = connected_components(forest + forest.T, directed=False)
        if np.bincount(labels).min() < 2:
            # Disallow single-sample clusters: undo the cut, restoring the
            # weight of *this* edge (indexed through ``order``).
            forest[row, col] = weights[edge_index]
        i += 1

    self.labels_ = connected_components(forest + forest.T,
                                        directed=False)[1]
    return self
def fit(self, X):
    """Cluster X by greedily splitting its Euclidean minimum spanning tree.

    Starting from one spanning tree over all samples, the cluster whose
    best binary cut (as proposed by ``itm_binary``) yields the largest
    improvement is split in two, until ``n_clusters`` clusters exist.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Input data.

    Returns
    -------
    self
        Sets ``nearest_neighbors_`` (fitted neighbors estimator),
        ``labels_`` (cluster index per sample) and ``tree_information_``
        (``tree_information_sparse`` of the final forest divided by
        ``n_samples``).
    """
    n_samples, n_features = X.shape

    self.nearest_neighbors_ = NearestNeighbors(
        algorithm=self.nearest_neighbor_algorithm)
    if self.verbose:
        print("Fitting neighbors data structure.")
    self.nearest_neighbors_.fit(X)
    if self.verbose:
        print("Datastructure used: %s"
              % self.nearest_neighbors_._fit_method)
    if self.verbose:
        print("Building minimum spanning tree.")
    forest = euclidean_mst(X, self.nearest_neighbors_,
                           verbose=self.verbose)

    # the dimensionality of the space can at most be n_samples
    if self.infer_dimensionality:
        if self.verbose:
            print("Estimating dimensionality.")
        intrinsic_dimensionality = estimate_dimension(
            X, neighbors_estimator=self.nearest_neighbors_)
        if self.verbose > 0:
            print("Estimated dimensionality: %d"
                  % intrinsic_dimensionality)
    elif n_samples < n_features:
        warnings.warn("Got dataset with n_samples < n_features. Setting "
                      "intrinsic dimensionality to n_samples. This is most "
                      "likely too high, leading to uneven clusters. It "
                      "is recommended to set infer_dimensionality=True.")
        intrinsic_dimensionality = n_samples
    else:
        intrinsic_dimensionality = n_features

    if self.verbose:
        print("Cutting spanning tree.")

    # Three parallel lists, one entry per current cluster:
    #   clusters        -> (sub-tree, indices of its nodes in X)
    #   cut_improvement -> result of itm_binary for that sub-tree
    #   cluster_infos   -> tree_information_sparse of that sub-tree
    clusters = [(forest, np.arange(n_samples))]
    cut_improvement = [
        itm_binary(forest.copy(), intrinsic_dimensionality,
                   return_edge=True)
    ]
    # The initial value is irrelevant: with a single component it only
    # shifts the one candidate score that argmax looks at.
    cluster_infos = [0]

    # keep all possible next splits, pick the one with highest gain.
    while len(clusters) < self.n_clusters:
        if self.verbose > 1:
            print("Finding for split %d." % len(clusters))
        possible_improvements = (
            np.array([cut_i[1] * cut_i[0].shape[0]
                      for cut_i in cut_improvement])
            - np.array(cluster_infos))
        i_to_split = np.argmax(possible_improvements)

        # Retire the chosen cluster from all three lists; the node indices
        # are needed to map the halves back to rows of X.
        split, _info, _edge = cut_improvement.pop(i_to_split)
        cluster_infos.pop(i_to_split)
        _, old_inds = clusters.pop(i_to_split)

        n_split_components, split_components_indicator = \
            connected_components(split + split.T)
        assert n_split_components == 2

        for component in range(n_split_components):
            inds = np.where(split_components_indicator == component)[0]
            sub_tree = split[inds, :][:, inds]
            clusters.append((sub_tree, old_inds[inds]))
            cluster_infos.append(
                tree_information_sparse(sub_tree, intrinsic_dimensionality))
            cut_improvement.append(
                itm_binary(sub_tree.copy(), intrinsic_dimensionality,
                           return_edge=True))

    # correspondence of nodes to datapoints not present in sparse matrices
    # but we saved the indices.
    c_inds = [c[1] for c in clusters]
    assert len(np.hstack(c_inds)) == n_samples
    # ``np.int`` was removed from NumPy; the builtin ``int`` is the dtype
    # it used to alias.
    y = np.empty(n_samples, dtype=int)
    for label, members in enumerate(c_inds):
        y[members] = label

    # for computing the objective, we don't care about the indices
    result = block_diag([c[0] for c in clusters], format='csr')
    self.labels_ = y
    self.tree_information_ = (
        tree_information_sparse(result, intrinsic_dimensionality)
        / n_samples)
    return self
def fit(self, X):
    """Cluster X by greedily splitting its Euclidean minimum spanning tree.

    Starting from one spanning tree over all samples, the cluster whose
    best binary cut (as proposed by ``itm_binary``) yields the largest
    improvement is split in two, until ``n_clusters`` clusters exist.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Input data.

    Returns
    -------
    self
        Sets ``nearest_neighbors_`` (fitted neighbors estimator),
        ``labels_`` (cluster index per sample) and ``tree_information_``
        (``tree_information_sparse`` of the final forest divided by
        ``n_samples``).
    """
    # ``sparse.cs_graph_components`` no longer exists in SciPy; this is its
    # replacement.  Imported locally so the block does not depend on the
    # file-level imports.
    from scipy.sparse.csgraph import connected_components

    n_samples, n_features = X.shape

    self.nearest_neighbors_ = NearestNeighbors(
        algorithm=self.nearest_neighbor_algorithm)
    if self.verbose:
        print("Fitting neighbors data structure.")
    self.nearest_neighbors_.fit(X)
    if self.verbose:
        print("Datastructure used: %s"
              % self.nearest_neighbors_._fit_method)
    if self.verbose:
        print("Building minimum spanning tree.")
    forest = euclidean_mst(X, self.nearest_neighbors_,
                           verbose=self.verbose)

    # the dimensionality of the space can at most be n_samples
    if self.infer_dimensionality:
        if self.verbose:
            print("Estimating dimensionality.")
        intrinsic_dimensionality = estimate_dimension(
            X, neighbors_estimator=self.nearest_neighbors_)
        if self.verbose > 0:
            print("Estimated dimensionality: %d"
                  % intrinsic_dimensionality)
    elif n_samples < n_features:
        warnings.warn("Got dataset with n_samples < n_features. Setting "
                      "intrinsic dimensionality to n_samples. This is most "
                      "likely too high, leading to uneven clusters. "
                      "It is recommended to set infer_dimensionality=True.")
        intrinsic_dimensionality = n_samples
    else:
        intrinsic_dimensionality = n_features

    if self.verbose:
        print("Cutting spanning tree.")

    # Three parallel lists, one entry per current cluster:
    #   clusters        -> (sub-tree, indices of its nodes in X)
    #   cut_improvement -> result of itm_binary for that sub-tree
    #   cluster_infos   -> tree_information_sparse of that sub-tree
    clusters = [(forest, np.arange(n_samples))]
    cut_improvement = [itm_binary(forest.copy(), intrinsic_dimensionality,
                                  return_edge=True)]
    # The initial value is irrelevant: with a single component it only
    # shifts the one candidate score that argmax looks at.
    cluster_infos = [0]

    # keep all possible next splits, pick the one with highest gain.
    while len(clusters) < self.n_clusters:
        if self.verbose > 1:
            print("Finding for split %d." % len(clusters))
        possible_improvements = (
            np.array([cut_i[1] * cut_i[0].shape[0]
                      for cut_i in cut_improvement])
            - np.array(cluster_infos))
        i_to_split = np.argmax(possible_improvements)

        # Retire the chosen cluster from all three lists; the node indices
        # are needed to map the halves back to rows of X.
        split, _info, _edge = cut_improvement.pop(i_to_split)
        cluster_infos.pop(i_to_split)
        _, old_inds = clusters.pop(i_to_split)

        n_split_components, split_components_indicator = \
            connected_components(split + split.T, directed=False)
        assert n_split_components == 2

        # ``range`` instead of the Python-2-only ``xrange``.
        for component in range(n_split_components):
            inds = np.where(split_components_indicator == component)[0]
            # Rows first, then columns: this extracts the square sub-tree
            # over ``inds``.  Indexing with ``inds[np.newaxis, :], inds``
            # broadcasts to a single row of diagonal entries instead.
            sub_tree = split[inds, :][:, inds]
            clusters.append((sub_tree, old_inds[inds]))
            cluster_infos.append(
                tree_information_sparse(sub_tree, intrinsic_dimensionality))
            cut_improvement.append(
                itm_binary(sub_tree.copy(), intrinsic_dimensionality,
                           return_edge=True))

    # correspondence of nodes to datapoints not present in sparse matrices
    # but we saved the indices.
    c_inds = [c[1] for c in clusters]
    assert len(np.hstack(c_inds)) == n_samples
    # ``np.int`` was removed from NumPy; the builtin ``int`` is the dtype
    # it used to alias.
    y = np.empty(n_samples, dtype=int)
    for label, members in enumerate(c_inds):
        y[members] = label

    # for computing the objective, we don't care about the indices
    result = block_diag([c[0] for c in clusters], format='csr')
    self.labels_ = y
    self.tree_information_ = (
        tree_information_sparse(result, intrinsic_dimensionality)
        / n_samples)
    return self