def _build_naive_divisive_tree_helper(boxed_Z, ordered_labels, early_termination=False):
    """
    Recursively build a tree by splitting the rows of a data matrix.
    Try to be somewhat memory efficient because Z can be huge.
    The matrix is removed from its box once the sub-matrices have been
    extracted; it is left in the box when the function returns early
    (single row, or a degenerate split with early termination).
    @param boxed_Z: a standardized data matrix, boxed so it can be deleted
    @param ordered_labels: integer labels conformant to rows of Z
    @param early_termination: True iff clustering stops when a split is degenerate
    @return: the root of a tree
    @raise ValueError: if the box does not hold exactly one element or
    the number of labels differs from the number of matrix rows
    """
    if len(boxed_Z) != 1:
        raise ValueError('expected the input matrix to be boxed for deletion')
    Z = boxed_Z[0]
    if len(Z) != len(ordered_labels):
        raise ValueError('the input labels are incompatible with the input matrix')
    p = len(ordered_labels)
    # define the root
    root = mtree.Node()
    # a single row cannot be split, so the root itself becomes the leaf
    if p == 1:
        root.label = ordered_labels[0]
        return root
    # get the eigenvector whose loadings will be used to split the matrix
    Z = util.get_column_centered_matrix(Z)
    U, S, VT = np.linalg.svd(Z, full_matrices=0)
    v = khorr.get_dominant_vector(U, S)
    del U
    del VT
    # split the matrix
    stack = []
    index_split = splitbuilder.eigenvector_to_split(v)
    # if we are doing early termination and the split is degenerate then we are done
    if early_termination and min(len(x) for x in index_split) < 2:
        # attach every row as a direct child of the root, ordered by loading
        for _loading, row_index in sorted((x, i) for i, x in enumerate(v)):
            child = mtree.Node()
            child.label = ordered_labels[row_index]
            root.add_child(child)
        return root
    for selection_set in index_split:
        selection = sorted(selection_set)
        # Gather the selected rows in ascending index order.
        # The rows come from the column centered matrix; the sub-matrix is
        # re-centered on its own columns by the recursive call.
        # A list (not a generator) is passed because np.vstack requires
        # a sequence of arrays.
        next_matrix = np.vstack([Z[i] for i in selection])
        # define the next ordered labels
        next_ordered_labels = [ordered_labels[i] for i in selection]
        # add to the stack
        stack.append([next_matrix, next_ordered_labels])
    # we no longer need the Z matrix
    del boxed_Z[0]
    del Z
    # build the tree; the stack is popped so the last split part is attached first
    while stack:
        next_matrix, next_ordered_labels = stack.pop()
        # re-box the matrix so the recursive call holds the only reference
        next_boxed_Z = [next_matrix]
        del next_matrix
        child = _build_naive_divisive_tree_helper(
            next_boxed_Z, next_ordered_labels, early_termination)
        root.add_child(child)
    return root
def build_tree_helper(boxed_U_in, S_in, ordered_labels, tree_data):
    """
    Get the root of an mtree reconstructed from the transformed data.
    The input matrix U is removed from its box (freed) by this function
    when an informative split is found.
    @param boxed_U_in: part of the laplacian sqrt obtained by svd
    @param S_in: another part of the laplacian sqrt obtained by svd
    @param ordered_labels: a list of labels conformant with rows of U
    @param tree_data: state whose scope is the construction of the tree
    @return: an mtree rooted at a degree 2 vertex unless the input matrix has 3 rows
    @raise ValueError: if the box does not hold exactly one element, if
    that element is not two dimensional, or if it has fewer than three
    rows or columns
    """
    # take U_in out of the box
    if len(boxed_U_in) != 1:
        raise ValueError('expected a 2d array as the only element of a list')
    U_in = boxed_U_in[0]
    shape = U_in.shape
    if len(shape) != 2:
        raise ValueError('expected a 2d array as the only element of a list')
    p, n = shape
    if p < 3 or n < 3:
        raise ValueError('expected the input matrix to have at least three rows and columns')
    # look for an informative split
    index_split = None
    if p > 3:
        # the signs of v match the signs of the fiedler vector
        v = khorr.get_fiedler_vector(U_in, S_in)
        index_split = splitbuilder.eigenvector_to_split(v)
        # if the split is degenerate then don't use it
        if min(len(x) for x in index_split) < 2:
            index_split = None
    # if no informative split was found then create a degenerate tree
    if not index_split:
        root = mtree.create_tree(ordered_labels)
        # register every labeled node with the shared construction state
        for node in root.preorder():
            if node.has_label():
                tree_data.add_node(node)
        return root
    # get the sorted row indices on each side of the split
    a, b = (sorted(x) for x in index_split)
    # Create two new matrices, each with one extra row for its outgroup.
    # Be somewhat careful to not create lots of intermediate matrices
    A = np.zeros((len(a) + 1, n))
    B = np.zeros((len(b) + 1, n))
    for i, index in enumerate(a):
        A[i] = U_in[index] * S_in
    for i, index in enumerate(b):
        B[i] = U_in[index] * S_in
    # The last row of each matrix is still zero here, so each column sum
    # covers exactly the rows of the opposite side of the split.
    A_outgroup = np.sum(B, 0)
    B_outgroup = np.sum(A, 0)
    A[-1] = A_outgroup
    B[-1] = B_outgroup
    # delete the two references to the old matrix
    del U_in
    del boxed_U_in[0]
    # recursively construct the subtrees; the stack is popped, so the
    # 'a' side is processed first
    subtrees = []
    stack = [[b, a, B], [a, b, A]]
    # delete non-stack references to partial matrices
    del A
    del B
    # process the partial matrices
    while stack:
        selection, _complement, summed_L_sqrt = stack.pop()
        # record the outgroup label for this subtree
        outgroup_label = tree_data.decrement_outgroup_label()
        # create the ordered list of labels corresponding to leaves of the subtree
        next_ordered_labels = [ordered_labels[i] for i in selection]
        next_ordered_labels.append(outgroup_label)
        # get the criterion matrix for the next iteration
        U, S, VT = np.linalg.svd(summed_L_sqrt, full_matrices=0)
        del VT
        # delete matrices that are no longer useful
        del summed_L_sqrt
        # build the tree recursively
        boxed_U = [U]
        del U
        root = build_tree_helper(boxed_U, S, next_ordered_labels, tree_data)
        # if the root is degree 2 then remove the root node
        if root.degree() == 2:
            root = root.remove()
        # root the tree at the outgroup node
        root = tree_data.label_to_node[outgroup_label]
        root.reroot()
        # we don't need the outgroup label anymore
        tree_data.remove_node(root)
        # we can also remove the label from the outgroup node itself
        root.label = None
        # save the properly rooted subtree
        subtrees.append(root)
    # connect the two subtrees at their roots
    left_root, right_root = subtrees
    right_root = right_root.remove()
    left_root.add_child(right_root)
    return left_root