Exemple #1
0
def _build_naive_divisive_tree_helper(boxed_Z, ordered_labels, early_termination=False):
    """
    Recursively build a tree by repeatedly splitting the rows of a data matrix.
    Try to be somewhat memory efficient because Z can be huge:
    the matrix arrives inside a one-element list so that this function
    can drop the caller's reference to it before recursing.
    @param boxed_Z: a standardized data matrix, boxed so it can be deleted
    @param ordered_labels: integer labels conformant to rows of Z
    @param early_termination: True iff clustering stops when a split is degenerate
    @return: the root of a tree
    @raise ValueError: if the box does not hold exactly one matrix or if
    the number of labels differs from the number of matrix rows
    """
    if len(boxed_Z) != 1:
        raise ValueError('expected the input matrix to be boxed for deletion')
    Z = boxed_Z[0]
    if len(Z) != len(ordered_labels):
        raise ValueError('the input labels are incompatible with the input matrix')
    p = len(ordered_labels)
    # define the root
    root = mtree.Node()
    # a single row cannot be split, so the root itself is the labeled leaf
    if p == 1:
        root.label = ordered_labels[0]
        return root
    # get the eigenvector whose loadings will be used to split the matrix
    Z = util.get_column_centered_matrix(Z)
    U, S, VT = np.linalg.svd(Z, full_matrices=False)
    v = khorr.get_dominant_vector(U, S)
    # only v is needed from here on, so free the svd factors right away
    del U
    del VT
    # split the matrix
    index_split = splitbuilder.eigenvector_to_split(v)
    # if we are doing early termination and the split is degenerate then we are done
    if early_termination and min(len(x) for x in index_split) < 2:
        # attach every row as a leaf, in order of increasing loading
        for _loading, row_index in sorted((x, i) for i, x in enumerate(v)):
            child = mtree.Node()
            child.label = ordered_labels[row_index]
            root.add_child(child)
        return root
    stack = []
    for selection_set in index_split:
        selection = sorted(selection_set)
        # Define the next matrix from the selected rows of the current matrix.
        # np.vstack requires a real sequence of arrays;
        # passing a generator is rejected by current numpy versions.
        next_matrix = np.vstack([Z[i] for i in selection])
        # define the next ordered labels, in the same row order as the matrix
        next_ordered_labels = [ordered_labels[i] for i in selection]
        # add to the stack
        stack.append([next_matrix, next_ordered_labels])
    # we no longer need the Z matrix
    del boxed_Z[0]
    del Z
    # build the tree
    while stack:
        next_matrix, next_ordered_labels = stack.pop()
        # re-box the matrix so that the recursive call can free it
        next_boxed_Z = [next_matrix]
        del next_matrix
        child = _build_naive_divisive_tree_helper(next_boxed_Z, next_ordered_labels, early_termination)
        root.add_child(child)
    return root
Exemple #2
0
def build_tree_helper(boxed_U_in, S_in, ordered_labels, tree_data):
    """
    Get the root of an mtree reconstructed from the transformed data.
    The input matrix U will be freed (deleted) by this function
    whenever an informative split is found and the function recurses.
    @param boxed_U_in: part of the laplacian sqrt obtained by svd
    @param S_in: another part of the laplacian sqrt obtained by svd
    @param ordered_labels: a list of labels conformant with rows of U
    @param tree_data: state whose scope is the construction of the tree
    @return: an mtree rooted at a degree 2 vertex unless the input matrix has 3 rows
    @raise ValueError: if the box does not hold exactly one 2d array
    or if that array has fewer than three rows or columns
    """
    # take U_in out of the box
    if len(boxed_U_in) != 1:
        raise ValueError('expected a 2d array as the only element of a list')
    U_in = boxed_U_in[0]
    shape = U_in.shape
    if len(shape) != 2:
        raise ValueError('expected a 2d array as the only element of a list')
    p, n = shape
    if p < 3 or n < 3:
        raise ValueError('expected the input matrix to have at least three rows and columns')
    # look for an informative split
    index_split = None
    if p > 3:
        # the signs of v match the signs of the fiedler vector
        v = khorr.get_fiedler_vector(U_in, S_in)
        index_split = splitbuilder.eigenvector_to_split(v)
        # if the split is degenerate then don't use it
        if min(len(x) for x in index_split) < 2:
            index_split = None
    # if no informative split was found then create a degenerate tree
    if not index_split:
        root = mtree.create_tree(ordered_labels)
        # register every labeled node so it can be looked up by label later
        for node in root.preorder():
            if node.has_label():
                tree_data.add_node(node)
        return root
    # get the sorted row indices on each side of the split
    a, b = (sorted(x) for x in index_split)
    # Create two new matrices.
    # Be somewhat careful to not create lots of intermediate matrices.
    # Each matrix has one extra row that is reserved for its outgroup.
    A = np.zeros((len(a)+1, n))
    B = np.zeros((len(b)+1, n))
    for i, index in enumerate(a):
        A[i] = U_in[index] * S_in
    for i, index in enumerate(b):
        B[i] = U_in[index] * S_in
    # The outgroup row of each side is the column sum of the other side.
    # Both sums are taken while the reserved last rows are still zero.
    A_outgroup = np.sum(B, 0)
    B_outgroup = np.sum(A, 0)
    A[-1] = A_outgroup
    B[-1] = B_outgroup
    # delete the two references to the old matrix
    del U_in
    del boxed_U_in[0]
    # recursively construct the subtrees
    subtrees = []
    # the stack is popped from the end, so the A side is processed first
    stack = [[b, a, B], [a, b, A]]
    # delete non-stack references to partial matrices
    del A
    del B
    # process the partial matrices
    while stack:
        selection, _complement, summed_L_sqrt = stack.pop()
        # record the outgroup label for this subtree
        outgroup_label = tree_data.decrement_outgroup_label()
        # create the ordered list of labels corresponding to leaves of the subtree
        next_ordered_labels = [ordered_labels[i] for i in selection]
        next_ordered_labels.append(outgroup_label)
        # get the criterion matrix for the next iteration
        U, S, VT = np.linalg.svd(summed_L_sqrt, full_matrices=False)
        del VT
        # delete matrices that are no longer useful
        del summed_L_sqrt
        # build the tree recursively
        boxed_U = [U]
        del U
        root = build_tree_helper(boxed_U, S, next_ordered_labels, tree_data)
        # If the root is degree 2 then remove the root node.
        # The return value is rebound below; the call is made for its effect on the tree.
        if root.degree() == 2:
            root = root.remove()
        # root the tree at the outgroup node
        root = tree_data.label_to_node[outgroup_label]
        root.reroot()
        # we don't need the outgroup label anymore
        tree_data.remove_node(root)
        # we can also remove the label from the outgroup node itself
        root.label = None
        # save the properly rooted subtree
        subtrees.append(root)
    # connect the two subtrees at their roots
    left_root, right_root = subtrees
    right_root = right_root.remove()
    left_root.add_child(right_root)
    return left_root