def fit(self, X):
        self.nearest_neighbors_ = NearestNeighbors(algorithm=self.nearest_neighbor_algorithm)
        self.nearest_neighbors_.fit(X)
        forest = euclidean_mst(X, self.nearest_neighbors_)
        weights = forest.data
        inds = np.argsort(weights)[::-1]  # heaviest edges first
        edges = np.vstack(forest.nonzero()).T
        # a spanning tree over n_samples points has n_samples - 1 edges
        n_samples = len(edges) + 1
        i = 0
        # cut the heaviest remaining edge; if the cut leaves a component
        # with fewer than two points, put the edge back and move on
        while len(forest.nonzero()[0]) > n_samples - self.n_clusters:
            e = edges[inds[i]]
            forest[e[0], e[1]] = 0
            labels = connected_components(forest + forest.T)[1]
            if np.min(np.bincount(labels)) < 2:
                # disallow singleton clusters: restore the edge
                forest[e[0], e[1]] = weights[inds[i]]
            i += 1
        self.labels_ = connected_components(forest + forest.T)[1]
        return self
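The method above deletes the heaviest edges of a Euclidean minimum spanning tree until only n_clusters connected components remain, restoring any edge whose removal would strand a component of fewer than two points. A minimal self-contained sketch of the same edge-cutting idea, using only numpy and scipy; mst_cut_labels is a hypothetical helper name, and the small-cluster guard is omitted:

import numpy as np
from scipy.sparse import coo_matrix
from scipy.sparse.csgraph import connected_components, minimum_spanning_tree
from scipy.spatial.distance import pdist, squareform

def mst_cut_labels(X, n_clusters):
    # MST of the dense Euclidean distance graph.
    mst = minimum_spanning_tree(squareform(pdist(X))).tocoo()
    # Drop the n_clusters - 1 heaviest edges; the remaining forest
    # decomposes into exactly n_clusters connected components.
    keep = np.ones(len(mst.data), dtype=bool)
    keep[np.argsort(mst.data)[::-1][:n_clusters - 1]] = False
    forest = coo_matrix((mst.data[keep], (mst.row[keep], mst.col[keep])),
                        shape=mst.shape)
    return connected_components(forest, directed=False)[1]

Unlike the guarded loop above, this one-shot cut can produce singleton clusters on noisy data, which is exactly what the restore-the-edge check in the fit method prevents.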
Example #2
    def fit(self, X):
        """
        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Input data.

        Returns
        -------
        self
        """
        n_samples, n_features = X.shape

        self.nearest_neighbors_ = NearestNeighbors(
            algorithm=self.nearest_neighbor_algorithm)
        if self.verbose:
            print("Fitting neighbors data structure.")
        self.nearest_neighbors_.fit(X)
        if self.verbose:
            print("Datastructure used: %s" %
                  self.nearest_neighbors_._fit_method)
        if self.verbose:
            print("Bulding minimum spanning tree.")
        forest = euclidean_mst(X,
                               self.nearest_neighbors_,
                               verbose=self.verbose)

        # the dimensionality of the space can at most be n_samples
        if self.infer_dimensionality:
            if self.verbose:
                print("Estimating dimensionality.")
            intrinsic_dimensionality = estimate_dimension(
                X, neighbors_estimator=self.nearest_neighbors_)
            if self.verbose > 0:
                print("Estimated dimensionality: %d" %
                      intrinsic_dimensionality)
        elif n_samples < n_features:
            warnings.warn("Got dataset with n_samples < n_features. Setting"
                          "intrinsic dimensionality to n_samples. This is most"
                          " likely to high, leading to uneven clusters. It "
                          "is recommendet to set infer_dimensionality=True.")
            intrinsic_dimensionality = n_samples
        else:
            intrinsic_dimensionality = n_features

        if self.verbose:
            print("Cutting spanning tree.")
        clusters = [(forest, np.arange(n_samples))]
        cut_improvement = [
            itm_binary(forest.copy(),
                       intrinsic_dimensionality,
                       return_edge=True)
        ]
        # init cluster_infos to anything.
        # doesn't matter any way as there is only one component
        cluster_infos = [0]
        removed_edges = []
        # keep all possible next splits, pick the one with highest gain.
        while len(clusters) < self.n_clusters:
            if self.verbose > 1:
                print("Finding for split %d." % len(clusters))
            possible_improvements = (np.array(
                [cut_i[1] * cut_i[0].shape[0]
                 for cut_i in cut_improvement]) - np.array(cluster_infos))
            i_to_split = np.argmax(possible_improvements)
            split, info, edge = cut_improvement.pop(i_to_split)
            # get rid of old cluster
            cluster_infos.pop(i_to_split)
            # need the indices of the nodes in the cluster to keep track
            # of where our datapoint went
            _, old_inds = clusters.pop(i_to_split)
            removed_edges.append((old_inds[list(edge[:2])], edge[2]))

            n_split_components, split_components_indicator = \
                connected_components(split + split.T)
            assert (n_split_components == 2)
            assert (len(np.unique(split_components_indicator)) == 2)

            for i in range(n_split_components):
                inds = np.where(split_components_indicator == i)[0]
                clusters.append((split[inds, :][:, inds], old_inds[inds]))
                mi = tree_information_sparse(clusters[-1][0],
                                             intrinsic_dimensionality)
                cluster_infos.append(mi)
                imp = itm_binary(clusters[-1][0].copy(),
                                 intrinsic_dimensionality,
                                 return_edge=True)
                cut_improvement.append(imp)

        # correspondence of nodes to datapoints not present in sparse matrices
        # but we saved the indices.
        c_inds = [c[1] for c in clusters]
        y = np.empty(n_samples, dtype=int)
        assert len(np.hstack(c_inds)) == n_samples

        for i, c in enumerate(c_inds):
            y[c] = i

        # for computing the objective, we don't care about the indices
        result = block_diag([c[0] for c in clusters], format='csr')
        self.labels_ = y
        self.tree_information_ = (
            tree_information_sparse(result, intrinsic_dimensionality) /
            n_samples)
        return self
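The infer_dimensionality branch above relies on estimate_dimension, a project helper that is not shown. One standard way to estimate intrinsic dimensionality from nearest-neighbor distances is the Levina-Bickel maximum-likelihood estimator; the following is a rough sketch under that assumption (the function name and the k default are placeholders, and duplicate points are assumed absent so no neighbor distance is zero):

import numpy as np
from sklearn.neighbors import NearestNeighbors

def mle_intrinsic_dimension(X, k=10):
    # Levina-Bickel MLE: compare each point's distance to its k-th
    # neighbor with the distances to the closer neighbors, then average
    # the per-point dimension estimates.
    dist, _ = NearestNeighbors(n_neighbors=k + 1).fit(X).kneighbors(X)
    dist = dist[:, 1:]  # drop each point's zero distance to itself
    log_ratios = np.log(dist[:, -1:] / dist[:, :-1])
    return np.mean((k - 1) / log_ratios.sum(axis=1))

On data sampled from a low-dimensional manifold embedded in a higher-dimensional space this tends to return a value near the manifold dimension rather than n_features, which avoids the overestimate that the warning above says leads to uneven clusters.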
Example #3
    def fit(self, X):
        """
        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Input data.

        Returns
        -------
        self
        """
        n_samples, n_features = X.shape

        self.nearest_neighbors_ = NearestNeighbors(algorithm=self.nearest_neighbor_algorithm)
        if self.verbose:
            print("Fitting neighbors data structure.")
        self.nearest_neighbors_.fit(X)
        if self.verbose:
            print("Datastructure used: %s" % self.nearest_neighbors_._fit_method)
        if self.verbose:
            print("Bulding minimum spanning tree.")
        forest = euclidean_mst(X, self.nearest_neighbors_, verbose=self.verbose)

        # the dimensionality of the space can at most be n_samples
        if self.infer_dimensionality:
            if self.verbose:
                print("Estimating dimensionality.")
            intrinsic_dimensionality = estimate_dimension(
                X, neighbors_estimator=self.nearest_neighbors_)
            if self.verbose > 0:
                print("Estimated dimensionality: %d" % intrinsic_dimensionality)
        elif n_samples < n_features:
            warnings.warn("Got dataset with n_samples < n_features. Setting"
                          "intrinsic dimensionality to n_samples. This is most"
                          " likely to high, leading to uneven clusters."
                          " It is recommendet to set infer_dimensionality=True.")
            intrinsic_dimensionality = n_samples
        else:
            intrinsic_dimensionality = n_features

        if self.verbose:
            print("Cutting spanning tree.")
        clusters = [(forest, np.arange(n_samples))]
        cut_improvement = [itm_binary(forest.copy(), intrinsic_dimensionality,
                                      return_edge=True)]
        # init cluster_infos to anything.
        # doesn't matter any way as there is only one component
        cluster_infos = [0]
        removed_edges = []
        # keep all possible next splits, pick the one with highest gain.
        while len(clusters) < self.n_clusters:
            if self.verbose > 1:
                print("Finding for split %d." % len(clusters))
            possible_improvements = (np.array([cut_i[1] * cut_i[0].shape[0] for
                                               cut_i in cut_improvement]) -
                                     np.array(cluster_infos))
            i_to_split = np.argmax(possible_improvements)
            split, info, edge = cut_improvement.pop(i_to_split)
            # get rid of old cluster
            cluster_infos.pop(i_to_split)
            # need the indices of the nodes in the cluster to keep track
            # of where our datapoint went
            _, old_inds = clusters.pop(i_to_split)
            removed_edges.append((old_inds[list(edge[:2])], edge[2]))

            n_split_components, split_components_indicator = \
                connected_components(split + split.T)
            assert(n_split_components == 2)
            assert(len(np.unique(split_components_indicator)) == 2)

            for i in range(n_split_components):
                inds = np.where(split_components_indicator == i)[0]
                clusters.append((split[inds, :][:, inds],
                                 old_inds[inds]))
                mi = tree_information_sparse(clusters[-1][0], intrinsic_dimensionality)
                cluster_infos.append(mi)
                imp = itm_binary(clusters[-1][0].copy(), intrinsic_dimensionality,
                                 return_edge=True)
                cut_improvement.append(imp)

        # correspondence of nodes to datapoints not present in sparse matrices
        # but we saved the indices.
        c_inds = [c[1] for c in clusters]
        y = np.empty(n_samples, dtype=int)
        assert len(np.hstack(c_inds)) == n_samples

        for i, c in enumerate(c_inds):
            y[c] = i

        # for computing the objective, we don't care about the indices
        result = block_diag([c[0] for c in clusters], format='csr')
        self.labels_ = y
        self.tree_information_ = (tree_information_sparse(result, intrinsic_dimensionality) /
                                  n_samples)
        return self
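All three variants follow the scikit-learn estimator convention: fit returns self and the clustering ends up in labels_. Usage would therefore look roughly like the following; the class name ITM and its import path are assumptions, since only the fit methods appear above:

import numpy as np
from sklearn.datasets import make_blobs

from itm import ITM  # hypothetical import; only fit() is shown above

X, _ = make_blobs(n_samples=300, centers=3, random_state=0)
model = ITM(n_clusters=3, infer_dimensionality=True).fit(X)
print(np.bincount(model.labels_))  # cluster sizes
print(model.tree_information_)     # per-sample tree information objective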