Example 1
    def fit(self, X, y=None):
        """Fit the hierarchical clustering on the data

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The samples a.k.a. observations.

        Returns
        -------
        self
        """
        X = check_array(X, ensure_min_samples=2, estimator=self)
        memory = self.memory
        if isinstance(memory, six.string_types):
            memory = Memory(cachedir=memory, verbose=0)

        if self.n_clusters <= 0:
            raise ValueError("n_clusters should be an integer greater than 0."
                             " %s was provided." % str(self.n_clusters))

        if self.linkage == "ward" and self.affinity != "euclidean":
            raise ValueError("%s was provided as affinity. Ward can only "
                             "work with euclidean distances." %
                             (self.affinity, ))

        if self.linkage not in _TREE_BUILDERS:
            raise ValueError("Unknown linkage type %s."
                             "Valid options are %s" %
                             (self.linkage, _TREE_BUILDERS.keys()))
        tree_builder = _TREE_BUILDERS[self.linkage]

        connectivity = self.connectivity
        if self.connectivity is not None:
            if callable(self.connectivity):
                connectivity = self.connectivity(X)
            connectivity = check_array(connectivity,
                                       accept_sparse=['csr', 'coo', 'lil'])

        n_samples = len(X)
        compute_full_tree = self.compute_full_tree
        if self.connectivity is None:
            compute_full_tree = True
        if compute_full_tree == 'auto':
            # Early stopping is likely to give a speed up only for
            # a large number of clusters. The actual threshold
            # implemented here is heuristic
            compute_full_tree = self.n_clusters < max(100, .02 * n_samples)
        n_clusters = self.n_clusters
        if compute_full_tree:
            n_clusters = None

        # Construct the tree
        kwargs = {}
        if self.linkage != 'ward':
            kwargs['linkage'] = self.linkage
            kwargs['affinity'] = self.affinity
        if self.return_distance:
            self.children_, self.n_components_, self.n_leaves_, parents, \
                self.distances = \
                memory.cache(tree_builder)(X, connectivity,
                                           n_components=self.n_components,
                                           n_clusters=n_clusters,
                                           return_distance=True,
                                           **kwargs)
        else:
            self.children_, self.n_components_, self.n_leaves_, parents = \
                memory.cache(tree_builder)(X, connectivity,
                                           n_components=self.n_components,
                                           n_clusters=n_clusters,
                                           **kwargs)
        # Cut the tree
        if compute_full_tree:
            self.labels_ = _hc_cut(self.n_clusters, self.children_,
                                   self.n_leaves_)
        else:
            labels = _hierarchical.hc_get_heads(parents, copy=False)
            # copy to avoid holding a reference on the original array
            labels = np.copy(labels[:n_samples])
            # Reassign cluster numbers
            self.labels_ = np.searchsorted(np.unique(labels), labels)
        return self
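
The `compute_full_tree='auto'` heuristic above is easy to miss inside the fit logic. Below is a minimal standalone sketch of the same decision rule; the function name and signature are illustrative, not part of the original code:

def should_compute_full_tree(n_clusters, n_samples, connectivity=None):
    # Early stopping is only possible with an explicit connectivity matrix.
    if connectivity is None:
        return True
    # Early stopping only pays off for a large number of clusters, so the
    # full tree is kept whenever n_clusters is small relative to n_samples.
    return n_clusters < max(100, .02 * n_samples)

For example, with n_samples=10000 the threshold is max(100, 200) = 200, so any n_clusters below 200 triggers a full-tree build.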
Example 2
    def fit(self, X):
        """Fit the hierarchical clustering on the data

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The samples a.k.a. observations.

        Returns
        -------
        self
        """
        memory = self.memory
        X = array2d(X)
        if isinstance(memory, basestring):
            memory = Memory(cachedir=memory, verbose=0)

        if self.connectivity is not None:
            if not sparse.issparse(self.connectivity):
                raise TypeError("`connectivity` should be a sparse matrix or "
                                "None, got: %r" % type(self.connectivity))

            if (self.connectivity.shape[0] != X.shape[0] or
                    self.connectivity.shape[1] != X.shape[0]):
                raise ValueError("`connectivity` does not have shape "
                                 "(n_samples, n_samples)")

        n_samples = len(X)
        compute_full_tree = self.compute_full_tree
        if self.connectivity is None:
            compute_full_tree = True
        if compute_full_tree == 'auto':
            # Early stopping is likely to give a speed up only for
            # a large number of clusters. The actual threshold
            # implemented here is heuristic
            compute_full_tree = self.n_clusters < max(100, .02 * n_samples)
        n_clusters = self.n_clusters
        if compute_full_tree:
            n_clusters = None

        self.children_, self.n_components_, self.n_leaves_, self.parent_ = \
            memory.cache(ward_tree)(X, self.connectivity,
                                    n_components=self.n_components,
                                    copy=self.copy, n_clusters=n_clusters)
        # Cut the tree
        parents = self.parent_
        if compute_full_tree:
            self.labels_ = _hc_cut(self.n_clusters, self.children_,
                                   self.n_leaves_)
        else:
            labels = _hierarchical.hc_get_heads(parents, copy=False)
            # copy to avoid holding a reference on the original array
            labels = np.copy(labels[:n_samples])
            # Reassign cluster numbers
            self.labels_ = np.searchsorted(np.unique(labels), labels)
        #---------------------------------------------------------------------
        # Added 04-04-2013 to choose the best cut of the tree.
        # each node we need height of that node, and alpha_min, alpha_max
        # compute height of each node, only in the case of compute full tree
        '''
        New Attributes
        ----------
        `height_` : int, array [n_nodes + n_leaves]
            List of the height of each node of the tree (including the
            leaves, which have height 0).

        `alpha_` : int, array-like, shape = [n_nodes + n_leaves, 2]
            List of alpha_min, alpha_max of each node (including the leaves,
            which have alpha_min 0). This is not the true alpha, just the
            height of that node --> to get the true alpha: height / height_max.

        `R_alpha_` : float, array-like, shape = [height_max, 3],
            corresponding to A, B and C, where
            height_max = height_[n_nodes + n_leaves - 1].
            List of A, B, C of the R_alpha function:
            R_alpha(C) = 1/2 * (alpha_max(C) - alpha_min(C))
                         + 2 * (alpha_max(C) - alpha) * (alpha - alpha_min(C))
                           / (alpha_max(C) - alpha_min(C))
            R(alpha) = 1/n * sum(|C| * R_alpha(C)) for all C in P_alpha
            ref: Pascal Pons 2011, Post-processing hierarchical community
            structures: Quality improvements and multi-scale view.
        '''
        n_nodes = self.n_leaves_ + len(self.children_)
        self.height_ = np.zeros(n_nodes, dtype=np.int)
        self.alpha_ = []
        if self.compute_full_tree is True:
            for k in range(len(self.children_)):
                i = k + self.n_leaves_
                if self.height_[i] == 0:
                    self.height_[i] = self.height(i)
            for i in range(n_nodes):
                alpha_min = self.height_[i]
                alpha_max = self.height_[self.parent_[i]]
                self.alpha_.append([alpha_min, alpha_max])

            self.compute_R_alpha()
            self.best_cut_ = []
        return self
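
The R(alpha) quality function described in the docstring above can be written out directly. This is a hedged reading of that formula (after Pons 2011); the function names and the (size, alpha_min, alpha_max) cluster representation are assumptions, not part of the original code:

def r_alpha_cluster(alpha, alpha_min, alpha_max):
    # R_alpha(C) exactly as stated in the docstring; undefined when
    # alpha_min == alpha_max.
    span = float(alpha_max - alpha_min)
    return .5 * span + 2. * (alpha_max - alpha) * (alpha - alpha_min) / span

def r_alpha(alpha, clusters, n):
    # clusters: iterable of (|C|, alpha_min(C), alpha_max(C)) for C in P_alpha
    return sum(size * r_alpha_cluster(alpha, lo, hi)
               for size, lo, hi in clusters) / float(n)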
Example 3
def linkage_tree(X, connectivity=None, n_components=None,
                 n_clusters=None, linkage='complete', affinity="euclidean",
                 return_distance=False, max_size=sys.maxint,
                 means=None, variances=None):
    """Linkage agglomerative clustering based on a Feature matrix.
    The inertia matrix uses a Heapq-based representation.
    This is the structured version, that takes into account some topological
    structure between samples.
    Read more in the :ref:`User Guide <hierarchical_clustering>`.
    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        feature matrix representing n_samples samples to be clustered
    connectivity : sparse matrix (optional).
        connectivity matrix. Defines for each sample the neighboring samples
        following a given structure of the data. The matrix is assumed to
        be symmetric and only the upper triangular half is used.
        Default is None, i.e, the Ward algorithm is unstructured.
    n_components : int (optional)
        Number of connected components. If None the number of connected
        components is estimated from the connectivity matrix.
        NOTE: This parameter is now directly determined from the
        connectivity matrix and will be removed in 0.18.
    n_clusters : int (optional)
        Stop early the construction of the tree at n_clusters. This is
        useful to decrease computation time if the number of clusters is
        not small compared to the number of samples. In this case, the
        complete tree is not computed, thus the 'children' output is of
        limited use, and the 'parents' output should rather be used.
        This option is valid only when specifying a connectivity matrix.
    linkage : {"average", "complete"}, optional, default: "complete"
        Which linkage criterion to use. The linkage criterion determines which
        distance to use between sets of observations.
            - average uses the average of the distances of each observation of
              the two sets
            - complete or maximum linkage uses the maximum distances between
              all observations of the two sets.
    affinity : string or callable, optional, default: "euclidean".
        Which metric to use. Can be "euclidean", "manhattan", or any
        distance known to paired distances (see metric.pairwise).
    return_distance : bool, default False
        whether or not to return the distances between the clusters.
    Returns
    -------
    children : 2D array, shape (n_nodes-1, 2)
        The children of each non-leaf node. Values less than `n_samples`
        correspond to leaves of the tree which are the original samples.
        A node `i` greater than or equal to `n_samples` is a non-leaf
        node and has children `children_[i - n_samples]`. Alternatively
        at the i-th iteration, children[i][0] and children[i][1]
        are merged to form node `n_samples + i`
    n_components : int
        The number of connected components in the graph.
    n_leaves : int
        The number of leaves in the tree.
    parents : 1D array, shape (n_nodes, ) or None
        The parent of each node. Only returned when a connectivity matrix
        is specified, otherwise 'None' is returned.
    distances : ndarray, shape (n_nodes-1,)
        Returned when return_distance is set to True.
        distances[i] refers to the distance between children[i][0] and
        children[i][1] when they are merged.
    See also
    --------
    ward_tree : hierarchical clustering with ward linkage
    """
    X = np.asarray(X)
    if X.ndim == 1:
        X = np.reshape(X, (-1, 1))
    n_samples, n_features = X.shape

    linkage_choices = {'complete': _hierarchical.max_merge,
                       'average': _hierarchical.average_merge}
    try:
        join_func = linkage_choices[linkage]
    except KeyError:
        raise ValueError(
            'Unknown linkage option, linkage should be one '
            'of %s, but %s was given' % (linkage_choices.keys(), linkage))

    if connectivity is None:
        from scipy.cluster import hierarchy     # imports PIL

        if n_clusters is not None:
            warnings.warn('Partial build of the tree is implemented '
                          'only for structured clustering (i.e. with '
                          'explicit connectivity). The algorithm '
                          'will build the full tree and only '
                          'retain the lower branches required '
                          'for the specified number of clusters',
                          stacklevel=2)

        if affinity == 'precomputed':
            # for the linkage function of hierarchy to work on precomputed
            # data, provide as first argument an ndarray of the shape returned
            # by pdist: it is a flat array containing the upper triangular of
            # the distance matrix.
            i, j = np.triu_indices(X.shape[0], k=1)
            X = X[i, j]
        elif affinity == 'l2':
            # Translate to something understood by scipy
            affinity = 'euclidean'
        elif affinity in ('l1', 'manhattan'):
            affinity = 'cityblock'
        elif callable(affinity):
            X = affinity(X)
            i, j = np.triu_indices(X.shape[0], k=1)
            X = X[i, j]
        out = hierarchy.linkage(X, method=linkage, metric=affinity)
        children_ = out[:, :2].astype(np.int)

        # JESSE: removed return distance and converted return to yield
        # if return_distance:
        #     distances = out[:, 2]
        #     return children_, 1, n_samples, None, distances
        yield children_, 1, n_samples, None
        # The unstructured case ends here; without this return the code
        # below would run with connectivity=None and fail.
        return

    if n_components is not None:
        warnings.warn(
            "n_components is now directly calculated from the connectivity "
            "matrix and will be removed in 0.18",
            DeprecationWarning)
    connectivity, n_components = _fix_connectivity(X, connectivity)

    connectivity = connectivity.tocoo()
    # Put the diagonal to zero
    diag_mask = (connectivity.row != connectivity.col)
    connectivity.row = connectivity.row[diag_mask]
    connectivity.col = connectivity.col[diag_mask]
    connectivity.data = connectivity.data[diag_mask]
    del diag_mask

    # if affinity == 'precomputed':
    #     distances = X[connectivity.row, connectivity.col]
    # else:
        # FIXME We compute all the distances, while we could have only computed
        # the "interesting" distances
    #     distances = paired_distances(X[connectivity.row],
    #                                  X[connectivity.col],
    #                                  metric=affinity)
    # connectivity.data = distances

    if n_clusters is None:
        n_nodes = 2 * n_samples - 1
    else:
        assert n_clusters <= n_samples
        n_nodes = 2 * n_samples - n_clusters

    # if return_distance:
    #     distances = np.empty(n_nodes - n_samples)
    # create inertia heap and connection matrix
    A = np.empty(n_nodes, dtype=object)
    inertia = list()

    # LIL seems to be the best format to access the rows quickly,
    # without the numpy overhead of slicing CSR indices and data.
    connectivity = connectivity.tolil()
    # We are storing the graph in a list of IntFloatDict
    for ind, (data, row) in enumerate(zip(connectivity.data,
                                          connectivity.rows)):
        A[ind] = IntFloatDict(np.asarray(row, dtype=np.intp),
                              np.asarray(data, dtype=np.float64))
        # We keep only the upper triangular for the heap
        # Generator expressions are faster than arrays on the following
        inertia.extend(_hierarchical.WeightedEdge(d, ind, r)
                       for r, d in zip(row, data) if r < ind)
    del connectivity

    heapify(inertia)

    # prepare the main fields
    parent = np.arange(n_nodes, dtype=np.intp)
    used_node = np.ones(n_nodes, dtype=np.intp)
    children = []

    # recursive merge loop
    # print "n_samples: " + str(n_samples)  # DEBUG
    # print "n_nodes: " + str(n_nodes)  # DEBUG
    for k in xrange(n_samples, n_nodes):
        # print "k: " + str(k)  # DEBUG

        # Jesse: get labels in order to count cluster sizes
        labels = _hierarchical.hc_get_heads(parent, copy=False)
        labels = np.asarray(labels)

        # identify the merge
        while True:
            edge = heappop(inertia)

            if used_node[edge.a] and used_node[edge.b]:

                # Jesse: cancel the merge if it would put us over the max_size cluster limit
                if (len(np.where(labels == labels[edge.a])[0]) +
                        len(np.where(labels == labels[edge.b])[0]) > max_size):
                    continue

                break

        i = edge.a
        j = edge.b
        print "k: " + str(k) + ", merging (" + str(i) + ", " + str(j) + ") with dist " + str(edge.weight)  # DEBUG

        # if return_distance:
        #     # store distances
        #     distances[k - n_samples] = edge.weight

        parent[i] = parent[j] = k
        children.append((i, j))
        # Keep track of the number of elements per cluster
        n_i = used_node[i]
        n_j = used_node[j]
        used_node[k] = n_i + n_j
        used_node[i] = used_node[j] = False

        # update the structure matrix A and the inertia matrix
        # a clever 'min', or 'max' operation between A[i] and A[j]
        # Jesse: modified from cython documentation of this function:
        """Merge two IntFloatDicts with the average strategy: when the
        same key is present in the two dicts, the weighted average of the two
        values is used.
        Parameters
        ==========
        A[i], A[j] : IntFloatDict object
            The IntFloatDicts to merge
        used_node : ndarray array of dtype integer and of dimension 1
            a mask for keys to ignore: if not used_node[key] the corresponding key
            is skipped in the output dictionary
        n_i, n_j : float
            n_i and n_j are weights for i and j for the merge strategy.
            They are used for a weighted mean.
        Returns
        =======
        out : IntFloatDict object
            The IntFloatDict resulting from the merge
        """
        if variances is None:
            coord_col = join_func(A[i], A[j], used_node, n_i, n_j)

        # Jesse: in order to implement kl-divergence-based clustering, we need to override
        # Jesse: this function with one that computes new distances based on the merged clusters'
        # Jesse: new means and variance vectors; no average/maximum weighting is necessary since
        # Jesse: average mean vectors and average variance vectors are instead used to calculate
        # Jesse: the new O(n) distances to the remaining cluster centroids
        else:
            # Jesse: update means and variances structures with merge
            means = np.vstack([means, np.add(means[i] * n_i, means[j] * n_j) / (n_i + n_j)])
            variances = np.vstack([variances, np.add(variances[i] * n_i, variances[j] * n_j) / (n_i + n_j)])

            # Jesse: calculate the O(n) kl distances between merged cluster and previous clusters
            # Jesse: testing alternative where all distances are re-calculated on narsil-1 tmux 2
            coord_col = {}  # need to calculate distances for every key; nothing is preserved in kl merge
            for key, dist in A[i]:
                if used_node[key]:
                    coord_col[key] = dist
            for key, dist in A[j]:
                if used_node[key]:
                    if key not in coord_col:
                        coord_col[key] = dist
                    else:
                        # compute new kl distance between entries
                        kld = multivariate_kl_distance(means[k], variances[k], means[key], variances[key])
                        if not np.isfinite(kld):
                            kld = float(sys.maxint)
                        coord_col[key] = kld
            coord_col = IntFloatDict(
                np.asarray(coord_col.keys(), dtype=np.intp),
                np.asarray([coord_col[kidx] for kidx in coord_col.keys()],
                           dtype=np.float64))

        for l, d in coord_col:
            A[l].append(k, d)
            # Here we use the information from coord_col (containing the
            # distances) to update the heap
            # print "pushing " + str(k) + ", " + str(l) + " onto heap with distance " + str(d)  # DEBUG
            heappush(inertia, _hierarchical.WeightedEdge(d, k, l))
        A[k] = coord_col
        # Clear A[i] and A[j] to save memory
        A[i] = A[j] = 0

        # Jesse: yield after every merge
        n_leaves = n_samples
        r_children = np.array(children)[:, ::-1]
        yield r_children, n_components, n_leaves, parent
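
Two points in this example are worth spelling out. First, multivariate_kl_distance is referenced but never shown; a plausible sketch for the diagonal-covariance Gaussians implied by the means/variances vectors is the symmetrised KL divergence below. This is an assumption about the helper, not its actual definition. Second, because the function now yields after every merge, callers must iterate over it and keep the last yielded tuple rather than calling it once.

import sys
import numpy as np

def multivariate_kl_distance(mean_a, var_a, mean_b, var_b):
    # Hypothetical sketch: KL(N_a || N_b) for diagonal Gaussians,
    # summed over dimensions.
    def kl(m0, v0, m1, v1):
        return .5 * np.sum(np.log(v1 / v0) + (v0 + (m0 - m1) ** 2) / v1 - 1.)
    # Symmetrise so the merge criterion is independent of argument order.
    return kl(mean_a, var_a, mean_b, var_b) + kl(mean_b, var_b, mean_a, var_a)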
Example 4
    def fit(self, X, y=None):
        """Fit the hierarchical clustering on the data

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training data. Shape [n_samples, n_features], or [n_samples,
            n_samples] if affinity=='precomputed'.

        y : Ignored

        Returns
        -------
        self
        """
        if (self.pooling_func != 'deprecated'
                and not isinstance(self, AgglomerationTransform)):
            warnings.warn(
                'Agglomerative "pooling_func" parameter is not used.'
                ' It has been deprecated in version 0.20 and will be'
                ' removed in 0.22', DeprecationWarning)
        X = check_array(X, ensure_min_samples=2, estimator=self)
        memory = check_memory(self.memory)

        if self.n_clusters is not None and self.n_clusters <= 0:
            raise ValueError("n_clusters should be an integer greater than 0."
                             " %s was provided." % str(self.n_clusters))

        if not ((self.n_clusters is None) ^ (self.distance_threshold is None)):
            raise ValueError("Exactly one of n_clusters and "
                             "distance_threshold has to be set, and the other "
                             "needs to be None.")

        if (self.distance_threshold is not None
                and not self.compute_full_tree):
            raise ValueError("compute_full_tree must be True if "
                             "distance_threshold is set.")

        if self.linkage == "ward" and self.affinity != "euclidean":
            raise ValueError("%s was provided as affinity. Ward can only "
                             "work with euclidean distances." %
                             (self.affinity, ))

        if self.linkage not in _TREE_BUILDERS:
            raise ValueError("Unknown linkage type %s. "
                             "Valid options are %s" %
                             (self.linkage, _TREE_BUILDERS.keys()))
        tree_builder = _TREE_BUILDERS[self.linkage]

        connectivity = self.connectivity
        if self.connectivity is not None:
            if callable(self.connectivity):
                connectivity = self.connectivity(X)
            connectivity = check_array(connectivity,
                                       accept_sparse=['csr', 'coo', 'lil'])

        n_samples = len(X)
        compute_full_tree = self.compute_full_tree
        if self.connectivity is None:
            compute_full_tree = True
        if compute_full_tree == 'auto':
            if self.distance_threshold is not None:
                compute_full_tree = True
            else:
                # Early stopping is likely to give a speed up only for
                # a large number of clusters. The actual threshold
                # implemented here is heuristic
                compute_full_tree = self.n_clusters < max(100, .02 * n_samples)
        n_clusters = self.n_clusters
        if compute_full_tree:
            n_clusters = None

        # Construct the tree
        kwargs = {}
        if self.linkage != 'ward':
            kwargs['linkage'] = self.linkage
            kwargs['affinity'] = self.affinity

        distance_threshold = self.distance_threshold

        return_distance = distance_threshold is not None
        out = memory.cache(tree_builder)(X,
                                         connectivity,
                                         n_clusters=n_clusters,
                                         return_distance=return_distance,
                                         **kwargs)
        (self.children_, self.n_connected_components_, self.n_leaves_,
         parents) = out[:4]

        if distance_threshold is not None:
            distances = out[-1]
            self.distances_ = distances
            self.n_clusters_ = np.count_nonzero(
                distances >= distance_threshold) + 1
        else:
            self.n_clusters_ = self.n_clusters

        # Cut the tree
        if compute_full_tree:
            self.labels_ = _hc_cut(self.n_clusters_, self.children_,
                                   self.n_leaves_)
        else:
            labels = _hierarchical.hc_get_heads(parents, copy=False)
            # copy to avoid holding a reference on the original array
            labels = np.copy(labels[:n_samples])
            # Reassign cluster numbers
            self.labels_ = np.searchsorted(np.unique(labels), labels)
        return self
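
The distance_threshold handling above relies on a simple counting argument: undoing k merges of a full tree leaves k + 1 clusters, so every merge whose distance reaches the threshold is counted. A small worked example (the values are hypothetical):

import numpy as np

distances = np.array([0.3, 0.5, 1.2, 2.0])   # merge distances, ascending
distance_threshold = 1.0
n_clusters_ = np.count_nonzero(distances >= distance_threshold) + 1
# The merges at 1.2 and 2.0 are rejected, so n_clusters_ == 3.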
Example 5
    def fit(self, X, y=None):
        """Fit the hierarchical clustering on the data

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The samples a.k.a. observations.

        Returns
        -------
        self
        """
        X = check_array(X, ensure_min_samples=2, estimator=self)
        memory = self.memory
        if isinstance(memory, six.string_types):
            memory = Memory(cachedir=memory, verbose=0)

        if self.n_clusters <= 0:
            raise ValueError("n_clusters should be an integer greater than 0."
                             " %s was provided." % str(self.n_clusters))

        if self.linkage == "ward" and self.affinity != "euclidean":
            raise ValueError("%s was provided as affinity. Ward can only "
                             "work with euclidean distances." %
                             (self.affinity, ))

        if self.linkage not in _TREE_BUILDERS:
            raise ValueError("Unknown linkage type %s."
                             "Valid options are %s" % (self.linkage,
                                                       _TREE_BUILDERS.keys()))
        tree_builder = _TREE_BUILDERS[self.linkage]

        connectivity = self.connectivity
        if self.connectivity is not None:
            if callable(self.connectivity):
                connectivity = self.connectivity(X)
            connectivity = check_array(
                connectivity, accept_sparse=['csr', 'coo', 'lil'])

        n_samples = len(X)
        compute_full_tree = self.compute_full_tree
        if self.connectivity is None:
            compute_full_tree = True
        if compute_full_tree == 'auto':
            # Early stopping is likely to give a speed up only for
            # a large number of clusters. The actual threshold
            # implemented here is heuristic
            compute_full_tree = self.n_clusters < max(100, .02 * n_samples)
        n_clusters = self.n_clusters
        if compute_full_tree:
            n_clusters = None

        # Construct the tree
        kwargs = {}
        if self.linkage != 'ward':
            kwargs['linkage'] = self.linkage
            kwargs['affinity'] = self.affinity
        if self.return_distance:
            self.children_, self.n_components_, self.n_leaves_, parents, \
                self.distances = \
                memory.cache(tree_builder)(X, connectivity,
                                           n_components=self.n_components,
                                           n_clusters=n_clusters,
                                           return_distance=True,
                                           **kwargs)
        else:
            self.children_, self.n_components_, self.n_leaves_, parents = \
                memory.cache(tree_builder)(X, connectivity,
                                           n_components=self.n_components,
                                           n_clusters=n_clusters,
                                           **kwargs)
        # Cut the tree
        if compute_full_tree:
            self.labels_ = _hc_cut(self.n_clusters, self.children_,
                                   self.n_leaves_)
        else:
            labels = _hierarchical.hc_get_heads(parents, copy=False)
            # copy to avoid holding a reference on the original array
            labels = np.copy(labels[:n_samples])
            # Reassign cluster numbers
            self.labels_ = np.searchsorted(np.unique(labels), labels)
        return self
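
The relabeling idiom that closes each of these fit methods is worth isolating: np.unique returns the sorted distinct cluster heads, and np.searchsorted maps each head back to its rank, producing consecutive labels starting at 0.

import numpy as np

labels = np.array([7, 3, 7, 12, 3])                  # raw cluster heads
print(np.searchsorted(np.unique(labels), labels))    # -> [1 0 1 2 0]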