Example #1
def test_unstructured_ward_tree():
    """
    Check that we obtain the correct solution for the unstructured ward tree.
    """
    np.random.seed(0)
    X = np.random.randn(50, 100)
    children, n_components, n_leaves = ward_tree(X.T)
    n_nodes = 2 * X.shape[1] - 1
    assert len(children) + n_leaves == n_nodes
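For reference, the identity the assertion checks follows from a counting argument: the columns of X (passed as rows via X.T) are the leaves of the tree, each merge removes one active cluster, so a full agglomeration needs n_leaves - 1 merges and the tree holds 2 * n_leaves - 1 nodes in total. A minimal sketch of that arithmetic (plain Python, nothing assumed beyond the example above):

# Hedged sketch: the counting argument behind the assertion above.
n_leaves = 100                      # X.shape[1] features are the tree leaves
n_merges = n_leaves - 1             # each merge removes one active cluster
n_nodes = n_leaves + n_merges       # leaves plus internal (merge) nodes
assert n_nodes == 2 * n_leaves - 1  # the identity the test asserts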
Example #2
def test_height_ward_tree():
    """
    Check that the heights of the ward tree are sorted.
    """
    np.random.seed(0)
    mask = np.ones([10, 10], dtype=bool)
    X = np.random.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    children, n_components, n_leaves = ward_tree(X.T, connectivity)
    n_nodes = 2 * X.shape[1] - 1
    assert len(children) + n_leaves == n_nodes
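The structured variants rely on grid_to_graph (from sklearn.feature_extraction.image) to turn a pixel grid into a sparse connectivity matrix that ward_tree then respects. A hedged sketch on a tiny 3x3 grid, illustrating the shape of the returned matrix with and without a mask:

# Hedged sketch of the connectivity used above, on a tiny 3x3 grid.
import numpy as np
from sklearn.feature_extraction.image import grid_to_graph

conn = grid_to_graph(n_x=3, n_y=3)   # one node per pixel
assert conn.shape == (9, 9)
dense = conn.toarray()
assert (dense == dense.T).all()      # grid adjacency is symmetric

# With a boolean mask, only the selected pixels become graph nodes.
mask3 = np.zeros((3, 3), dtype=bool)
mask3[:2, :2] = True
conn_masked = grid_to_graph(n_x=3, n_y=3, mask=mask3)
assert conn_masked.shape == (4, 4)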
Example #3
def test_structured_ward_tree():
    """
    Check that we obtain the correct solution for the structured ward tree.
    """
    np.random.seed(0)
    mask = np.ones([10, 10], dtype=bool)
    X = np.random.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    children, n_components, n_leaves = ward_tree(X.T, connectivity)
    n_nodes = 2 * X.shape[1] - 1
    assert len(children) + n_leaves == n_nodes
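In all three tests, `children` encodes the merge history using scikit-learn's usual convention: leaves are numbered 0..n_leaves-1 and the i-th merge creates node n_leaves + i from the two ids stored in children[i]. A small hand-built sketch that expands a node back into its leaves (toy data, not taken from the tests above):

# Hedged sketch: how the `children` merge list encodes the tree.
n_leaves = 4
children_toy = [[0, 1], [2, 3], [4, 5]]   # node 4=(0,1), node 5=(2,3), node 6=(4,5)

def leaves_of(node, children, n_leaves):
    # Expand a node id into the original leaf ids it contains.
    if node < n_leaves:
        return [node]
    left, right = children[node - n_leaves]
    return leaves_of(left, children, n_leaves) + leaves_of(right, children, n_leaves)

root = n_leaves + len(children_toy) - 1   # the last merge is the root here
assert sorted(leaves_of(root, children_toy, n_leaves)) == [0, 1, 2, 3]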
Example #4
def test_scikit_vs_scipy():
    """Test scikit ward with full connectivity (i.e. unstructured) vs scipy
    """
    from scipy.sparse import lil_matrix
    n, p, k = 10, 5, 3

    connectivity = lil_matrix(np.ones((n, n)))
    for i in range(5):
        X = .1*np.random.normal(size=(n, p))
        X -= 4*np.arange(n)[:, np.newaxis]
        X -= X.mean(axis=1)[:, np.newaxis]

        out = hierarchy.ward(X)

        children_ = out[:, :2].astype(int)
        children, _, n_leaves = ward_tree(X, connectivity)

        cut = _hc_cut(k, children, n_leaves)
        cut_ = _hc_cut(k, children_, n_leaves)
        assess_same_labelling(cut, cut_)
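assess_same_labelling here is a helper from the scikit-learn test suite; the underlying check is that two label vectors describe the same partition even if the label values are permuted. A hedged stand-alone equivalent (the real helper's name and implementation may differ):

import numpy as np

def same_labelling(cut1, cut2):
    # Two label vectors define the same partition iff their pairwise
    # co-membership matrices are identical (label values may be permuted).
    cut1, cut2 = np.asarray(cut1), np.asarray(cut2)
    co1 = cut1[:, None] == cut1[None, :]
    co2 = cut2[:, None] == cut2[None, :]
    return bool((co1 == co2).all())

assert same_labelling([0, 0, 1, 2], [2, 2, 0, 1])      # same partition
assert not same_labelling([0, 0, 1, 2], [0, 1, 1, 2])  # different partition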
    def fit(self, X, y, n_iterations=25, verbose=0):
        """
        Fit the hierarchical clustering and compute the parcel labels.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Training data.

        y : ndarray of shape (n_samples,)
            Target values.

        n_iterations : int
            Maximum number of iterations, i.e. the maximum number of
            parcels we allow.

        verbose : int, optional
            Verbosity level; higher values print progress messages.

        Returns
        -------
        tab : ndarray of shape (n_features,)
            Array of labels: two features have the same label if and only
            if they belong to the same parcel.  Smaller labels correspond
            to smaller parcels.

        Notes
        -----
        The connectivity matrix is taken from ``self.A``.
        """
        # Computing the ward tree
        children, n_components, n_leaves = ward_tree(X.T,
                connectivity=self.A, n_components=self.n_components_A)
        # Converting children from numpy array to list (faster)
        children = children.tolist()
        # Computing the parcel_based_signal for each parcel
        avg_signals = average_signals(X, children, n_leaves)
        # The first parcellation is the list of the tree roots
        parcellation = tree_roots(children, n_components, n_leaves)
        parcellations = [parcellation]  # List of the best parcellations
        self.scores = []
        if verbose:
            print "\n# First parcellation (=tree roots) : %s" % parcellations

        for i in range(1, n_iterations+1):  # for verbose mode
            if verbose:
                print "# Iteration %d" % i
            # Computing all the parcellations obtainable by splitting a parcel
            # of the current parcellation
            iteration_parcellations = split_parcellation(parcellation,
                    children, n_leaves)

            if len(iteration_parcellations) == 0:
                # No parcellation can be split any further
                print " UserWarning: n_iterations is too large;"
                print " stopping at iteration %d." % i
                break

            # Selecting the best parcellation for current iteration
            parcellation, score = select_best_parcellation(
                    iteration_parcellations, self.clf, avg_signals, y,
                    self.n_jobs, verbose)

            parcellations.append(parcellation)
            self.scores.append(score)

        # Select the best parcellation among those pre-selected
        #parcellation = select_best_parcellation(parcellations,
        #    self.clf, avg_signals, y, self.n_jobs, verbose)
        if self.scores != []: # Otherwise, max is not defined
            parcellation = parcellations[self.scores.index(max(self.scores))]
        # Sort the parcellation so that the smaller label corresponds
        # to the smaller parcel
        parcellation.sort()
        # Computing the corresponding labels array
        self.tab = parcellation_to_label(parcellation, children, n_leaves)
        self.clf.fit(avg_signals[:, parcellation], y)
        if hasattr(self.clf, 'coef_'):
            self.coef_ = self.clf.coef_
        return self.tab
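parcellation_to_label, used at the end of fit, maps a parcellation (a list of node ids in the ward tree) to one label per feature by descending through `children`: every leaf under the k-th parcel gets label k + 1. A hedged sketch of that mapping under the same children encoding as above; the real helper's label ordering (smallest label for the smallest parcel) may differ from this simplified version:

import numpy as np

def parcellation_to_label_sketch(parcellation, children, n_leaves):
    # Assign label k + 1 to every leaf (feature) contained in parcellation[k];
    # label 0 would mean "not covered by any parcel".
    labels = np.zeros(n_leaves, dtype=int)
    for k, node in enumerate(parcellation):
        stack = [node]
        while stack:
            current = stack.pop()
            if current < n_leaves:
                labels[current] = k + 1
            else:
                stack.extend(children[current - n_leaves])
    return labels

# Toy tree over 4 leaves: node 4 = (0, 1), node 5 = (2, 3)
assert parcellation_to_label_sketch([4, 5], [[0, 1], [2, 3], [4, 5]], 4).tolist() == [1, 1, 2, 2]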
roi_size = 2
X = np.zeros(size**2)
X2 = X
# Generate two convex parts
mask = np.zeros((size, size), dtype=bool)
mask[0:roi_size, 0:roi_size] = True
mask[-roi_size:, -roi_size:] = True
mask = mask.reshape(size**2)
X = X[mask]
# Replicate across n_samples samples
X2 = X2 + np.zeros((n_samples, 1))
X = X + np.arange(n_samples).reshape((n_samples, 1))
Y = np.arange(n_samples)
# Generating the connectivity grids and ward trees
A = grid_to_graph(n_x=size, n_y=size, mask=mask)
children, n_components, n_leaves = ward_tree(X.T, connectivity=A,
        n_components=2)
children = children.tolist()
A2 = grid_to_graph(n_x=size, n_y=size)
children2, n_components2, n_leaves2 = ward_tree(X2.T, connectivity=A2,
        n_components=1)
children2 = children2.tolist()
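The fixture above builds the ward tree on a mask with two disconnected regions, which is why n_components=2 is passed and the resulting forest has two roots. tree_roots can be understood as collecting the node ids that never appear as a child in a later merge; a hedged sketch of that derivation (the actual supervised_clustering.tree_roots may be implemented differently):

def tree_roots_sketch(children, n_leaves):
    # Node ids: leaves are 0..n_leaves-1, merge i creates node n_leaves + i.
    all_nodes = set(range(n_leaves + len(children)))
    merged = set()
    for left, right in children:
        merged.update((left, right))
    # Roots are the nodes that were never merged into a parent.
    return sorted(all_nodes - merged)

# A forest over 4 leaves where only (0, 1) and (2, 3) were merged
# has two roots: nodes 4 and 5.
assert tree_roots_sketch([[0, 1], [2, 3]], 4) == [4, 5]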


###############################################################################
# Test functions
def test_tree_roots():
    """
    Tests that the function returns the right roots.
    """
    roots1 = supervised_clustering.tree_roots(children,
            n_components, n_leaves)