Beispiel #1
0
def build_naive_divisive_tree(X, early_termination=False):
    """
    Build a tree by recursively splitting gene sets, ignoring outgrouping.
    In terms of naivety this sits between building a tree from a single
    split (more naive) and splitting with outgrouping (less naive).
    The data matrix is standardized and then augmented before the
    recursive helper is invoked on all of its row indices.
    @param X: a data matrix, preferably with more rows than columns
    @param early_termination: True iff clustering stops when a split is degenerate
    @return: the root of a tree
    """
    # The two-way unpack requires X to be two dimensional;
    # only the row count is needed afterwards.
    row_count, _ = X.shape
    # Hand the transformed matrix over inside a one-element list so that
    # no local name in this frame stays bound to the matrix itself.
    # NOTE(review): presumably this lets the helper release the matrix early -- confirm
    boxed_Z = [
            khorr.standardized_to_augmented_C(
                khorr.get_standardized_matrix(X))]
    return _build_naive_divisive_tree_helper(
            boxed_Z, range(row_count), early_termination)
Beispiel #2
0
def build_single_split_tree(X, use_squared_correlation=True):
    """
    Build a two-level mtree from one split of the transformed data.
    The rows are partitioned by the sign of their loading in the dominant
    vector of the singular value decomposition; only that one vector is
    required, which may be faster to get than the entire SVD.
    This method is naive compared to build_tree.
    With the use_squared_correlation option disabled, it is even more naive.
    @param X: a data matrix, preferably with more rows than columns
    @param use_squared_correlation: True for squared correlation, False for correlation
    @return: the root of a tree
    """
    # Transform the data: standardize, optionally augment, then center columns.
    logging.debug('creating the standardized matrix')
    transformed = khorr.get_standardized_matrix(X)
    if use_squared_correlation:
        logging.debug('creating the augmented matrix')
        transformed = khorr.standardized_to_augmented_C(transformed)
    logging.debug('creating the column centered matrix')
    centered = util.get_column_centered_matrix(transformed)
    logging.debug('manually cleaning up old matrices')
    del transformed
    # Decompose; the right singular vectors are not used.
    logging.debug('doing a singular value decomposition')
    left_vectors, singular_values, _ = np.linalg.svd(
            centered, full_matrices=False)
    logging.debug('getting the dominant eigenvector')
    dominant = khorr.get_dominant_vector(left_vectors, singular_values)
    # Snap loadings that are near zero to exactly zero,
    # using the same criterion as in splitbuilder.
    threshold = 1e-14
    loadings = []
    for value in dominant:
        if abs(value) < threshold:
            loadings.append(0.0)
        else:
            loadings.append(value)
    # The root gets two children: the negative side first, then the positive side.
    root = mtree.Node()
    neg_child = mtree.Node()
    pos_child = mtree.Node()
    for side in (neg_child, pos_child):
        root.add_child(side)
    # Attach one labeled leaf per row, in increasing order of loading
    # with ties broken by row index.
    ranked = [(value, index) for index, value in enumerate(loadings)]
    ranked.sort()
    for value, index in ranked:
        leaf = mtree.Node()
        leaf.label = index
        # strictly positive loadings go to the positive side; zero goes negative
        parent = pos_child if value > 0 else neg_child
        parent.add_child(leaf)
    return root