Ejemplo n.º 1
0
def get_child_messages(L, eigensplit, ordered_tip_names, m_to_string, scaling_factor):
    """
    Report on the child tree defined by each side of a split.
    For each side, the indices on the other side are passed to
    SchurAlgebra.mmerge, and the resulting matrix is shown together with
    the entries of its Fiedler vector.
    @param L: the laplacian corresponding to tips of the tree
    @param eigensplit: the split induced by the fiedler vector; each side is a set of indices into L
    @param ordered_tip_names: names of the tips of the tree conformant to L
    @param m_to_string: a function that converts a matrix to a string
    @param scaling_factor: show the Laplacian scaled by this factor
    @return: a multi-line string
    """
    report = StringIO()
    ntips = len(L)
    universe = set(range(ntips))
    # before any merging each vertex carries only its own label
    singleton_label_sets = [set([label]) for label in range(ntips)]
    for position, side in enumerate(eigensplit):
        # children are numbered from one in the report
        child_number = position + 1
        outside = universe - side
        # the scaling affects only the displayed matrix
        child_laplacian = SchurAlgebra.mmerge(L, outside)
        print >> report, 'the Schur complement in the Laplacian of child tree', child_number, 'scaled by', scaling_factor
        print >> report, m_to_string(scaling_factor * child_laplacian)
        print >> report
        # pair each entry of the child Fiedler vector with the labels of its vertex
        merged_label_sets = SchurAlgebra.vmerge(singleton_label_sets, outside)
        child_fiedler = BuildTreeTopology.laplacian_to_fiedler(child_laplacian)
        print >> report, 'the Fiedler split of the Schur complement in the Laplacian of child tree', child_number
        for labels, entry in zip(merged_label_sets, child_fiedler):
            print >> report, label_set_to_string(labels, ordered_tip_names), ':', entry
        print >> report
    # strip the surrounding whitespace, including the final blank line
    return report.getvalue().strip()
Ejemplo n.º 2
0
def analyze_matrix(M, block_size):
    """
    Compare the two orders of composing a merge and a Schur complement.
    The first result applies SchurAlgebra.mmerge and then SchurAlgebra.mschur;
    the second applies them in the opposite order.
    Each block is square with block_size rows.
    @param M: a matrix
    @param block_size: the number of rows in blocks of the partitioned matrix
    @return: a string of results
    """
    report = StringIO()
    # first composition: merge the first two blocks, then complement out a block
    merged_first = SchurAlgebra.mmerge(M, set(range(2 * block_size)))
    # the block to remove is addressed by its position in the merged matrix
    shifted_block = set(range(1 + block_size, 1 + 2 * block_size))
    merge_then_schur = SchurAlgebra.mschur(merged_first, shifted_block)
    print >> report, merge_then_schur
    # second composition: complement out the fourth block, then merge
    fourth_block = set(range(3 * block_size, 4 * block_size))
    schur_first = SchurAlgebra.mschur(M, fourth_block)
    schur_then_merge = SchurAlgebra.mmerge(schur_first, set(range(2 * block_size)))
    print >> report, schur_then_merge
    # report whether the two compositions agree up to numerical tolerance
    if np.allclose(merge_then_schur, schur_then_merge):
        verdict = 'the matrices are similar'
    else:
        verdict = 'the matrices are different'
    print >> report, verdict
    return report.getvalue().strip()
Ejemplo n.º 3
0
def analyze_matrix(M, block_size):
    """
    Get results for both orders of composing the merge and Schur operations.
    One result is mschur applied to mmerge of M, and the other is
    mmerge applied to mschur of M.
    Each block is square with block_size rows.
    @param M: a matrix
    @param block_size: the number of rows in blocks of the partitioned matrix
    @return: a string of results
    """
    response = StringIO()
    leading_rows = 2 * block_size
    # merge first, then take the Schur complement
    after_merge = SchurAlgebra.mmerge(M, set(range(leading_rows)))
    first_result = SchurAlgebra.mschur(
            after_merge,
            set(range(block_size + 1, block_size + 1 + block_size)))
    print >> response, first_result
    # take the Schur complement first, then merge
    after_schur = SchurAlgebra.mschur(
            M, set(range(3 * block_size, 3 * block_size + block_size)))
    second_result = SchurAlgebra.mmerge(after_schur, set(range(leading_rows)))
    print >> response, second_result
    # compare the two results up to numerical tolerance
    matrices_agree = np.allclose(first_result, second_result)
    if not matrices_agree:
        print >> response, 'the matrices are different'
    else:
        print >> response, 'the matrices are similar'
    return response.getvalue().strip()
Ejemplo n.º 4
0
def get_child_messages(L, eigensplit, ordered_tip_names, m_to_string,
                       scaling_factor):
    """
    Describe the child trees on the two sides of a Fiedler split.
    The indices outside each side are merged with SchurAlgebra.mmerge;
    the merged matrix and its Fiedler vector entries are then listed.
    @param L: the laplacian corresponding to tips of the tree
    @param eigensplit: the split induced by the fiedler vector; each side is a set of indices into L
    @param ordered_tip_names: names of the tips of the tree conformant to L
    @param m_to_string: a function that converts a matrix to a string
    @param scaling_factor: show the Laplacian scaled by this factor
    @return: a multi-line string
    """
    buf = StringIO()
    tip_count = len(L)
    every_label = set(range(tip_count))
    # initially every vertex is labeled by its own index only
    initial_label_sets = [set([index]) for index in range(tip_count)]
    for offset, side in enumerate(eigensplit):
        ordinal = offset + 1
        other_side = every_label - side
        # show the merged matrix, scaled only for display
        L_side = SchurAlgebra.mmerge(L, other_side)
        print >> buf, 'the Schur complement in the Laplacian of child tree', ordinal, 'scaled by', scaling_factor
        print >> buf, m_to_string(scaling_factor * L_side)
        print >> buf
        # list the Fiedler vector entries beside the merged label sets
        side_label_sets = SchurAlgebra.vmerge(initial_label_sets, other_side)
        fiedler_entries = BuildTreeTopology.laplacian_to_fiedler(L_side)
        print >> buf, 'the Fiedler split of the Schur complement in the Laplacian of child tree', ordinal
        for member_labels, fiedler_value in zip(side_label_sets, fiedler_entries):
            description = label_set_to_string(member_labels, ordered_tip_names)
            print >> buf, description, ':', fiedler_value
        print >> buf
    # surrounding whitespace, including the last blank line, is removed
    return buf.getvalue().strip()
Ejemplo n.º 5
0
def get_splits(initial_distance_matrix,
               split_function,
               update_function,
               on_label_split=None):
    """
    This is the most external of the functions in this module.
    Get the set of splits implied by the tree that would be reconstructed.
    Matrices with fewer than four rows are not split any further.
    When the split function raises DegenerateSplitException for a matrix
    with more than four rows, the offending index is removed by Schur
    complementation and the smaller matrix is processed instead.
    @param initial_distance_matrix: a distance matrix
    @param split_function: takes a distance matrix and returns an index split
    @param update_function: takes a distance matrix and an index subset and returns a distance matrix
    @param on_label_split: notifies the caller of the label split induced by an index split
    @return: a set of splits
    """
    n = len(initial_distance_matrix)
    # keep a stack of (label_set_per_vertex, distance_matrix) pairs
    initial_state = ([set([i]) for i in range(n)], initial_distance_matrix)
    stack = [initial_state]
    # process the stack in a depth first manner, building the split set
    label_split_set = set()
    while stack:
        label_sets, D = stack.pop()
        # if the matrix is small then we are done
        if len(D) < 4:
            continue
        # split the indices using the specified function
        try:
            index_split = split_function(D)
            # convert the index split to a label split
            label_split = index_split_to_label_split(index_split, label_sets)
            # notify the caller if a callback is requested
            if on_label_split:
                on_label_split(label_split)
            # add the split to the master set of label splits
            label_split_set.add(label_split)
            # for large matrices create the new label sets and the new conformant distance matrices
            a, b = index_split
            for index_selection, index_complement in ((a, b), (b, a)):
                if len(index_complement) > 2:
                    next_label_sets = SchurAlgebra.vmerge(
                        label_sets, index_selection)
                    next_D = update_function(D, index_selection)
                    next_state = (next_label_sets, next_D)
                    stack.append(next_state)
        except DegenerateSplitException as e:
            # we cannot recover from a degenerate split unless there are more than four indices
            if len(D) <= 4:
                continue
            # with more than four indices we can fall back to partial splits
            index_set = set([e.index])
            # get the next label sets
            next_label_sets = SchurAlgebra.vdelete(label_sets, index_set)
            # get the next conformant distance matrix by schur complementing out the offending index
            L = Euclid.edm_to_laplacian(D)
            L_small = SchurAlgebra.mschur(L, index_set)
            next_D = Euclid.laplacian_to_edm(L_small)
            next_state = (next_label_sets, next_D)
            stack.append(next_state)
    # hand the accumulated splits back to the caller, as the docstring promises
    return label_split_set
Ejemplo n.º 6
0
def get_splits(initial_distance_matrix, split_function, update_function, on_label_split=None):
    """
    This is the most external of the functions in this module.
    Get the set of splits implied by the tree that would be reconstructed.
    Matrices with fewer than four rows are not split any further.
    When the split function raises DegenerateSplitException for a matrix
    with more than four rows, the offending index is removed by Schur
    complementation and the smaller matrix is processed instead.
    @param initial_distance_matrix: a distance matrix
    @param split_function: takes a distance matrix and returns an index split
    @param update_function: takes a distance matrix and an index subset and returns a distance matrix
    @param on_label_split: notifies the caller of the label split induced by an index split
    @return: a set of splits
    """
    n = len(initial_distance_matrix)
    # keep a stack of (label_set_per_vertex, distance_matrix) pairs
    initial_state = ([set([i]) for i in range(n)], initial_distance_matrix)
    stack = [initial_state]
    # process the stack in a depth first manner, building the split set
    label_split_set = set()
    while stack:
        label_sets, D = stack.pop()
        # if the matrix is small then we are done
        if len(D) < 4:
            continue
        # split the indices using the specified function
        try:
            index_split = split_function(D)
            # convert the index split to a label split
            label_split = index_split_to_label_split(index_split, label_sets)
            # notify the caller if a callback is requested
            if on_label_split:
                on_label_split(label_split)
            # add the split to the master set of label splits
            label_split_set.add(label_split)
            # for large matrices create the new label sets and the new conformant distance matrices
            a, b = index_split
            for index_selection, index_complement in ((a, b), (b, a)):
                if len(index_complement) > 2:
                    next_label_sets = SchurAlgebra.vmerge(label_sets, index_selection)
                    next_D = update_function(D, index_selection)
                    next_state = (next_label_sets, next_D)
                    stack.append(next_state)
        except DegenerateSplitException as e:
            # we cannot recover from a degenerate split unless there are more than four indices
            if len(D) <= 4:
                continue
            # with more than four indices we can fall back to partial splits
            index_set = set([e.index])
            # get the next label sets
            next_label_sets = SchurAlgebra.vdelete(label_sets, index_set)
            # get the next conformant distance matrix by schur complementing out the offending index
            L = Euclid.edm_to_laplacian(D)
            L_small = SchurAlgebra.mschur(L, index_set)
            next_D = Euclid.laplacian_to_edm(L_small)
            next_state = (next_label_sets, next_D)
            stack.append(next_state)
    # hand the accumulated splits back to the caller, as the docstring promises
    return label_split_set
Ejemplo n.º 7
0
 def _do_analysis(self, use_generalized_nj):
     """
     Do some splits of the tree.
     The first split is required to put all bacteria on one side, and the
     second split is required to put all eukaryota on one side.
     The results are stored in self.first_split_object and
     self.second_split_object.
     @param use_generalized_nj: True if we use an old method of outgrouping
     @raise HandlingError: if bacteria appear on both sides or on neither side of the first split, or if eukaryota appear on both sides of the second split
     """
     # define the distance matrix
     D = np.array(self.pruned_tree.get_distance_matrix(self.pruned_names))
     # get the primary split of the criterion matrix
     L = Euclid.edm_to_laplacian(D)
     v = BuildTreeTopology.laplacian_to_fiedler(L)
     eigensplit = BuildTreeTopology.eigenvector_to_split(v)
     # assert that the first split cleanly separates the bacteria from the rest
     left_indices, right_indices = eigensplit
     left_domains = self._get_domains([self.pruned_names[x] for x in left_indices])
     right_domains = self._get_domains([self.pruned_names[x] for x in right_indices])
     if ('bacteria' in left_domains) and ('bacteria' in right_domains):
         raise HandlingError('bacteria were not defined by the first split')
     # now we have enough info to define the first supplementary csv file
     self.first_split_object = SupplementarySpreadsheetObject(self.pruned_names, L, v)
     # define the bacteria indices vs the non-bacteria indices for the second split
     if 'bacteria' in left_domains:
         bacteria_indices = left_indices
     elif 'bacteria' in right_domains:
         bacteria_indices = right_indices
     else:
         # without this branch the code below would fail on an unbound name
         raise HandlingError('bacteria were not found on either side of the first split')
     # get the secondary split of interest
     if use_generalized_nj:
         D_secondary = BuildTreeTopology.update_generalized_nj(D, bacteria_indices)
         L_secondary = Euclid.edm_to_laplacian(D_secondary)
     else:
         L_secondary = SchurAlgebra.mmerge(L, bacteria_indices)
     full_label_sets = [set([i]) for i in range(len(self.pruned_names))]
     next_label_sets = SchurAlgebra.vmerge(full_label_sets, bacteria_indices)
     v_secondary = BuildTreeTopology.laplacian_to_fiedler(L_secondary)
     eigensplit_secondary = BuildTreeTopology.eigenvector_to_split(v_secondary)
     left_subindices, right_subindices = eigensplit_secondary
     # name each remaining vertex; a vertex with several labels is the merged bacteria group
     pruned_names_secondary = []
     for label_set in next_label_sets:
         if len(label_set) == 1:
             label = list(label_set)[0]
             pruned_names_secondary.append(self.pruned_names[label])
         else:
             pruned_names_secondary.append('all-bacteria')
     # assert that the second split cleanly separates the eukaryota from the rest
     left_subdomains = self._get_domains([pruned_names_secondary[x] for x in left_subindices])
     right_subdomains = self._get_domains([pruned_names_secondary[x] for x in right_subindices])
     if ('eukaryota' in left_subdomains) and ('eukaryota' in right_subdomains):
         raise HandlingError('eukaryota were not defined by the second split')
     # now we have enough info to define the second supplementary csv file
     self.second_split_object = SupplementarySpreadsheetObject(pruned_names_secondary, L_secondary, v_secondary)
Ejemplo n.º 8
0
 def get_verbose_summary(self):
     """
     Summarize the taxa, the pruned tree, the primary split of its distance
     matrix, and the subsplit found after merging each side in turn.
     @return: a multiline string
     """
     report = StringIO()
     # show the number of taxa in various domains
     print >> report, self._get_name_summary()
     print >> report
     # show the pruned full tree
     narrow_newick = NewickIO.get_narrow_newick_string(self.pruned_tree, 120)
     print >> report, 'this is the tree that represents all clades but for which redundant nodes have been pruned:'
     print >> report, narrow_newick
     print >> report
     # find the primary split of the distance matrix
     D = np.array(self.pruned_tree.get_distance_matrix(self.pruned_names))
     L = Euclid.edm_to_laplacian(D)
     primary_fiedler = BuildTreeTopology.laplacian_to_fiedler(L)
     eigensplit = BuildTreeTopology.eigenvector_to_split(primary_fiedler)
     # report the eigendecomposition
     print >> report, get_eigendecomposition_report(D)
     print >> report
     # report the domains found on each side of the primary split
     names_by_side = []
     for side in eigensplit:
         names_by_side.append(set(self.pruned_names[i] for i in side))
     print >> report, 'domains represented by each side of the primary split:'
     print >> report, 'the left side has:\t', ', '.join(self._get_domains(names_by_side[0]))
     print >> report, 'the right side has:\t', ', '.join(self._get_domains(names_by_side[1]))
     print >> report
     # prepare to do the secondary splits
     left_indices, right_indices = eigensplit
     full_label_sets = [set([i]) for i in range(len(self.pruned_names))]
     # merge the right side first and then the left side
     for merged_side in (right_indices, left_indices):
         L_secondary = SchurAlgebra.mmerge(L, merged_side)
         next_label_sets = SchurAlgebra.vmerge(full_label_sets, merged_side)
         secondary_fiedler = BuildTreeTopology.laplacian_to_fiedler(L_secondary)
         subsplit = BuildTreeTopology.eigenvector_to_split(secondary_fiedler)
         left_subindices, right_subindices = subsplit
         # expand each side of the subsplit back to the original labels
         left_sublabels = set(
             label for i in left_subindices for label in next_label_sets[i])
         right_sublabels = set(
             label for i in right_subindices for label in next_label_sets[i])
         left_subnames = set(self.pruned_names[label] for label in left_sublabels)
         right_subnames = set(self.pruned_names[label] for label in right_sublabels)
         print >> report, 'domains represented by a subsplit:'
         print >> report, 'the left side has:\t', ', '.join(self._get_domains(left_subnames))
         print >> report, 'the right side has:\t', ', '.join(self._get_domains(right_subnames))
         print >> report
     # return the multiline string without surrounding whitespace
     return report.getvalue().strip()
Ejemplo n.º 9
0
def get_response_content(fs):
    """
    Split a distance matrix into two smaller matrices using the selected labels.
    The matrix for each group is computed with
    BuildTreeTopology.update_generalized_nj, passing the indices of the
    other group.
    @param fs: an object whose matrix, labels, and selection attributes are read
    @return: a multi-line string
    @raise HandlingError: if the number of labels differs from the number of rows
    """
    # read the matrix and the label lists
    D = np.array(fs.matrix)
    n = len(D)
    ordered_labels = Util.get_stripped_lines(StringIO(fs.labels))
    selected_labels = Util.get_stripped_lines(StringIO(fs.selection))
    # validate the input
    if len(ordered_labels) != n:
        raise HandlingError(
            'the number of taxon labels should match the number of rows in the distance matrix'
        )
    # group A holds the selected indices and group B holds the rest
    index_set_A = set()
    for index, label in enumerate(ordered_labels):
        if label in selected_labels:
            index_set_A.add(index)
    index_set_B = set(range(n)) - index_set_A
    # get internal values related to the split; the first value is not reported
    _, alpha, beta, gamma = get_R_alpha_beta_gamma(D, index_set_B)
    # get the two new distance matrices
    D_A = BuildTreeTopology.update_generalized_nj(D, index_set_B)
    D_B = BuildTreeTopology.update_generalized_nj(D, index_set_A)
    # get the names associated with the indices of the new distance matrices
    all_names = [set([name]) for name in ordered_labels]
    D_A_names = []
    for name_set in SchurAlgebra.vmerge(all_names, index_set_B):
        D_A_names.append(set_to_string(name_set))
    D_B_names = []
    for name_set in SchurAlgebra.vmerge(all_names, index_set_A):
        D_B_names.append(set_to_string(name_set))
    # show the results
    out = StringIO()
    print >> out, 'alpha:', alpha
    print >> out, 'beta:', beta
    print >> out, 'gamma:', gamma
    print >> out
    sections = (
        ('new distance matrix corresponding to the selected names:', D_A, D_A_names),
        ('new distance matrix corresponding to the non-selected names:', D_B, D_B_names),
    )
    for position, (title, matrix, names) in enumerate(sections):
        # a blank line separates the sections but does not follow the last one
        if position:
            print >> out
        print >> out, title
        print >> out, MatrixUtil.m_to_string(matrix)
        print >> out
        print >> out, 'ordered labels corresponding to this matrix:'
        for name in names:
            print >> out, name
    # return the response
    return out.getvalue()
Ejemplo n.º 10
0
def get_response_content(fs):
    """
    Merge the selected taxa of a distance matrix into a single taxon.
    The distance matrix is converted to a Laplacian, the selected indices
    are merged with SchurAlgebra.mmerge, and the result is converted back.
    @param fs: an object whose matrix, labels, and selection attributes are read
    @return: a multi-line string showing the new matrix and its taxon labels
    @raise HandlingError: if the labels, the selection, or the matrix size are invalid
    """
    # read the matrix
    D = fs.matrix
    # read the ordered labels
    ordered_labels = Util.get_stripped_lines(StringIO(fs.labels))
    if not ordered_labels:
        raise HandlingError('no ordered taxa were provided')
    if len(ordered_labels) != len(set(ordered_labels)):
        raise HandlingError('the ordered taxa should be unique')
    # get the label selection and its complement
    min_selected_labels = 2
    min_unselected_labels = 1
    selected_labels = set(Util.get_stripped_lines(StringIO(fs.selection)))
    if len(selected_labels) < min_selected_labels:
        raise HandlingError('at least %d taxa should be selected to be grouped' % min_selected_labels)
    # get the set of labels in the complement
    unselected_labels = set(ordered_labels) - selected_labels
    if len(unselected_labels) < min_unselected_labels:
        raise HandlingError('at least %d taxa should remain outside the selected group' % min_unselected_labels)
    # assert that no bizarre labels were selected
    weird_labels = selected_labels - set(ordered_labels)
    if weird_labels:
        raise HandlingError('some selected taxa are invalid: ' + str(weird_labels))
    # assert that the size of the distance matrix is compatible with the number of ordered labels
    if len(D) != len(ordered_labels):
        raise HandlingError('the number of listed taxa does not match the number of rows in the distance matrix')
    # get the set of selected indices
    n = len(D)
    index_selection = set(i for i, label in enumerate(ordered_labels) if label in selected_labels)
    # begin the response
    out = StringIO()
    # get the ordered list of sets of indices to merge
    merged_indices = SchurAlgebra.vmerge([set([x]) for x in range(n)], index_selection)
    # calculate the new distance matrix
    L = Euclid.edm_to_laplacian(D)
    L_merged = SchurAlgebra.mmerge(L, index_selection)
    D_merged = Euclid.laplacian_to_edm(L_merged)
    # name the merged taxon by listing its members in the order the taxa were given,
    # so that the output does not depend on the iteration order of a set
    merged_label = '{' + ', '.join(
        label for label in ordered_labels if label in selected_labels) + '}'
    # print the output distance matrix and the labels of its rows
    print >> out, 'new distance matrix:'
    print >> out, MatrixUtil.m_to_string(D_merged)
    print >> out
    print >> out, 'new taxon labels:'
    for merged_index_set in merged_indices:
        if len(merged_index_set) == 1:
            # read the only index without mutating the set
            print >> out, ordered_labels[list(merged_index_set)[0]]
        else:
            print >> out, merged_label
    # write the response
    return out.getvalue()
Ejemplo n.º 11
0
def update_generalized_nj(D, index_set):
    """
    Create a new distance matrix according to a neighbor-joining-like criterion.
    Do this according to the explanation in our tree reconstruction manuscript.
    The indices in index_set are replaced by a single outgroup, and the
    length of the branch defined by the split is divided evenly
    between the two successor distance matrices.
    @param D: the distance matrix, indexable as D[i, j]
    @param index_set: the subset of indices that will be removed from the updated distance matrix
    @return: a new distance matrix
    @raise ValueError: if either side of the split has fewer than two indices
    """
    size = len(D)
    kept = set(range(size)) - set(index_set)
    removed = set(index_set)
    n_kept = len(kept)
    n_removed = len(removed)
    if min(n_kept, n_removed) < 2:
        raise ValueError(
            'expected each side of the split to have at least two elements')
    # The split into kept and removed indices defines a single internal branch.
    # alpha is the average distance from the kept indices to the branch,
    # beta is the average distance from the removed indices to the branch,
    # and gamma is the length of the branch itself.
    # cross_mean[i] is the mean distance from i to the indices of the other group.
    cross_mean = {}
    for i in kept:
        cross_mean[i] = sum(D[i, b] for b in removed) / float(n_removed)
    for j in removed:
        cross_mean[j] = sum(D[a, j] for a in kept) / float(n_kept)

    def pair_excess(i, j):
        # how far the two cross means exceed the direct distance between i and j
        return cross_mean[i] + cross_mean[j] - D[i, j]

    gamma_plus_beta = 0.5 * min(
        pair_excess(i, j) for i, j in itertools.combinations(kept, 2))
    alpha_plus_gamma = 0.5 * min(
        pair_excess(i, j) for i, j in itertools.combinations(removed, 2))
    cross_total = sum(D[i, j] for i in kept for j in removed)
    alpha_plus_gamma_plus_beta = cross_total / float(n_kept * n_removed)
    gamma = alpha_plus_gamma + gamma_plus_beta - alpha_plus_gamma_plus_beta
    beta = gamma_plus_beta - gamma
    # Initialize the new distance matrix.
    D_out = SchurAlgebra.mmerge(D, index_set)
    # The outgroup follows every kept index smaller than the smallest removed index.
    first_removed = min(removed)
    outgroup_index = sum(1 for a in kept if a < first_removed)
    D_out[outgroup_index, outgroup_index] = 0
    # Rewrite the outgroup row and column with distances to the branch midpoint.
    label_sets = SchurAlgebra.vmerge([set([i]) for i in range(size)], index_set)
    for row, labels in enumerate(label_sets):
        if row == outgroup_index:
            continue
        member = iterutils.get_only(labels)
        to_outgroup = cross_mean[member] - beta - 0.5 * gamma
        D_out[row, outgroup_index] = to_outgroup
        D_out[outgroup_index, row] = to_outgroup
    return D_out
Ejemplo n.º 12
0
def get_response_content(fs):
    """
    Show a graph layout, its adjacency and laplacian matrices, and the
    layout derived from SchurAlgebra.schur_helper applied to the weighted laplacian.
    The points and edges come from get_locations() and get_edges().
    @param fs: not used by this function
    @return: a multi-line string
    """
    locations = get_locations()
    np_locs = [np.array(point) for point in locations]
    edges = get_edges()
    npoints = len(locations)
    # start writing the response
    np.set_printoptions(linewidth=200)
    out = StringIO()
    # print the layout data
    print >> out, 'POINTS'
    for index, (x, y) in enumerate(locations):
        print >> out, index, x, y
    print >> out, 'EDGES'
    for source, sink in edges:
        print >> out, source, sink
    print >> out
    # show the unweighted adjacency matrix, which is symmetric
    UA = np.zeros((npoints, npoints))
    for source, sink in edges:
        UA[source, sink] = UA[sink, source] = 1
    print >> out, 'unweighted adjacency matrix:'
    print >> out, UA
    print >> out
    # show the unweighted laplacian matrix
    UL = Euclid.adjacency_to_laplacian(UA)
    print >> out, 'unweighted laplacian matrix:'
    print >> out, UL
    print >> out
    # show the weighted adjacency matrix;
    # an edge weight is the reciprocal of its length divided by sqrt(2)
    root_two = math.sqrt(2.0)
    WA = np.zeros((npoints, npoints))
    for source, sink in edges:
        scaled_length = np.linalg.norm(np_locs[source] - np_locs[sink]) / root_two
        WA[source, sink] = WA[sink, source] = 1.0 / scaled_length
    print >> out, 'weighted adjacency matrix:'
    print >> out, WA
    print >> out
    # show the weighted laplacian matrix
    WL = Euclid.adjacency_to_laplacian(WA)
    print >> out, 'weighted laplacian matrix:'
    print >> out, WL
    print >> out
    # remove the two internal nodes by schur complementation
    ntips = 4
    schur_L = SchurAlgebra.schur_helper(WL, 2)
    X = Euclid.dccov_to_points(np.linalg.pinv(schur_L))
    print >> out, 'schur graph layout:'
    print >> out, 'POINTS'
    for index, point in enumerate(X):
        print >> out, index, point[0], point[1]
    # every pair of tips is listed as an edge of the schur graph
    tip_pairs = [(i, j) for i in range(ntips) for j in range(i + 1, ntips)]
    print >> out, 'EDGES'
    for first, second in tip_pairs:
        print >> out, first, second
    # return the response
    return out.getvalue()
Ejemplo n.º 13
0
def get_response_content(fs):
    """
    Report a graph layout with its unweighted and weighted matrices,
    followed by the layout obtained from SchurAlgebra.schur_helper.
    The points and edges come from get_locations() and get_edges().
    @param fs: not used by this function
    @return: a multi-line string
    """
    locations = get_locations()
    np_locs = [np.array(p) for p in locations]
    edges = get_edges()
    npoints = len(locations)
    shape = (npoints, npoints)
    # start writing the response
    np.set_printoptions(linewidth=200)
    report = StringIO()
    # print the layout data
    print >> report, 'POINTS'
    for point_index, (x, y) in enumerate(locations):
        print >> report, point_index, x, y
    print >> report, 'EDGES'
    for tail, head in edges:
        print >> report, tail, head
    print >> report
    # show the unweighted adjacency matrix with one entry per edge direction
    UA = np.zeros(shape)
    for tail, head in edges:
        UA[tail, head] = 1
        UA[head, tail] = 1
    print >> report, 'unweighted adjacency matrix:'
    print >> report, UA
    print >> report
    # show the unweighted laplacian matrix
    UL = Euclid.adjacency_to_laplacian(UA)
    print >> report, 'unweighted laplacian matrix:'
    print >> report, UL
    print >> report
    # show the weighted adjacency matrix;
    # the weight is the reciprocal of the edge length divided by sqrt(2)
    WA = np.zeros(shape)
    for tail, head in edges:
        displacement = np_locs[tail] - np_locs[head]
        weight = 1.0 / (np.linalg.norm(displacement) / math.sqrt(2.0))
        WA[tail, head] = weight
        WA[head, tail] = weight
    print >> report, 'weighted adjacency matrix:'
    print >> report, WA
    print >> report
    # show the weighted laplacian matrix
    WL = Euclid.adjacency_to_laplacian(WA)
    print >> report, 'weighted laplacian matrix:'
    print >> report, WL
    print >> report
    # remove the two internal nodes by schur complementation
    ntips = 4
    schur_L = SchurAlgebra.schur_helper(WL, 2)
    X = Euclid.dccov_to_points(np.linalg.pinv(schur_L))
    print >> report, 'schur graph layout:'
    print >> report, 'POINTS'
    for point_index, coords in enumerate(X):
        print >> report, point_index, coords[0], coords[1]
    # list each unordered pair of tips once
    print >> report, 'EDGES'
    for first in range(ntips):
        for second in range(first + 1, ntips):
            print >> report, first, second
    # return the response
    return report.getvalue()
Ejemplo n.º 14
0
def get_response_content(fs):
    """
    Compare the Schur complement of a tree Laplacian with an augmented Laplacian.
    For each of the two matrices the report shows the scaled Fiedler vector
    and the eigendecomposition of the pseudoinverse.
    @param fs: an object whose tree_string, ndups, and strength attributes are read
    @return: a multi-line string
    """
    # build the newick tree from the string
    tree = NewickIO.parse(fs.tree_string, FelTree.NewickTree)
    nvertices = len(list(tree.preorder()))
    nleaves = len(list(tree.gen_tips()))
    # get ordered ids with the leaves first
    ordered_ids = get_ordered_ids(tree)
    # get the adjacency matrix and the augmented adjacency matrix
    A = np.array(tree.get_affinity_matrix(ordered_ids))
    A_aug = get_augmented_adjacency(A, nleaves, fs.ndups, fs.strength)
    # get the laplacian matrices
    L = Euclid.adjacency_to_laplacian(A)
    L_aug = Euclid.adjacency_to_laplacian(A_aug)
    # get the schur complement with respect to the indices after the leaves
    internal_indices = set(range(nleaves, nvertices))
    R = SchurAlgebra.mschur(L, internal_indices)
    R_pinv = np.linalg.pinv(R)
    vals, vecs = EigUtil.eigh(R_pinv)
    # get the scaled Fiedler vector for the Schur complement
    w, v = EigUtil.principal_eigh(R_pinv)
    fiedler = v * math.sqrt(w)
    # get the eigendecomposition of the augmented Laplacian
    L_aug_pinv = np.linalg.pinv(L_aug)
    vals_aug, vecs_aug = EigUtil.eigh(L_aug_pinv)
    # get the scaled Fiedler vector for the augmented Laplacian
    w_aug, v_aug = EigUtil.principal_eigh(L_aug_pinv)
    fiedler_aug = v_aug * math.sqrt(w_aug)
    # the values are converted to text when printed, after the options are set
    np.set_printoptions(linewidth=300)
    schur_sections = (
        ('Laplacian matrix:', L),
        ('Schur complement of Laplacian matrix:', R),
        ('scaled Fiedler vector of Schur complement:', fiedler),
        ('eigenvalues of pinv of Schur complement:', vals),
        ('corresponding eigenvectors of pinv of Schur complement:', np.array(vecs).T),
    )
    augmented_sections = (
        ('augmented Laplacian matrix:', L_aug),
        ('scaled Fiedler vector of augmented Laplacian:', fiedler_aug),
        ('eigenvalues of pinv of augmented Laplacian:', vals_aug),
        ('rows are eigenvectors of pinv of augmented Laplacian:', np.array(vecs_aug)),
    )
    # report the results
    out = StringIO()
    for title, value in schur_sections:
        print >> out, title
        print >> out, value
        print >> out
    # an extra blank line separates the two halves of the report
    print >> out
    for position, (title, value) in enumerate(augmented_sections):
        # blank lines separate these sections but none follows the last one
        if position:
            print >> out
        print >> out, title
        print >> out, value
    return out.getvalue()
Ejemplo n.º 15
0
def get_response_content(fs):
    """
    Report the Schur complement of a tree Laplacian next to an augmented Laplacian.
    The scaled Fiedler vector and the eigendecomposition of the
    pseudoinverse are shown for each of the two matrices.
    @param fs: an object whose tree_string, ndups, and strength attributes are read
    @return: a multi-line string
    """
    # build the newick tree from the string
    tree = NewickIO.parse(fs.tree_string, FelTree.NewickTree)
    vertex_count = len(list(tree.preorder()))
    leaf_count = len(list(tree.gen_tips()))
    # get ordered ids with the leaves first
    ordered_ids = get_ordered_ids(tree)
    # get the adjacency matrix and the augmented adjacency matrix
    adjacency = np.array(tree.get_affinity_matrix(ordered_ids))
    adjacency_aug = get_augmented_adjacency(
        adjacency, leaf_count, fs.ndups, fs.strength)
    # get the laplacian matrices
    laplacian = Euclid.adjacency_to_laplacian(adjacency)
    laplacian_aug = Euclid.adjacency_to_laplacian(adjacency_aug)
    # get the schur complement with respect to the indices after the leaves
    schur = SchurAlgebra.mschur(laplacian, set(range(leaf_count, vertex_count)))
    schur_pinv = np.linalg.pinv(schur)
    schur_vals, schur_vecs = EigUtil.eigh(schur_pinv)
    # get the scaled Fiedler vector for the Schur complement
    top_val, top_vec = EigUtil.principal_eigh(schur_pinv)
    fiedler = top_vec * math.sqrt(top_val)
    # get the eigendecomposition of the augmented Laplacian
    aug_pinv = np.linalg.pinv(laplacian_aug)
    aug_vals, aug_vecs = EigUtil.eigh(aug_pinv)
    # get the scaled Fiedler vector for the augmented Laplacian
    top_val_aug, top_vec_aug = EigUtil.principal_eigh(aug_pinv)
    fiedler_aug = top_vec_aug * math.sqrt(top_val_aug)
    # report the results; the print options apply when each value is printed
    np.set_printoptions(linewidth=300)
    out = StringIO()
    first_half = [
        ('Laplacian matrix:', laplacian),
        ('Schur complement of Laplacian matrix:', schur),
        ('scaled Fiedler vector of Schur complement:', fiedler),
        ('eigenvalues of pinv of Schur complement:', schur_vals),
        ('corresponding eigenvectors of pinv of Schur complement:', np.array(schur_vecs).T),
    ]
    second_half = [
        ('augmented Laplacian matrix:', laplacian_aug),
        ('scaled Fiedler vector of augmented Laplacian:', fiedler_aug),
        ('eigenvalues of pinv of augmented Laplacian:', aug_vals),
    ]
    for heading, content in first_half:
        print >> out, heading
        print >> out, content
        print >> out
    # the two halves are separated by an additional blank line
    print >> out
    for heading, content in second_half:
        print >> out, heading
        print >> out, content
        print >> out
    # the last section is not followed by a blank line
    print >> out, 'rows are eigenvectors of pinv of augmented Laplacian:'
    print >> out, np.array(aug_vecs)
    return out.getvalue()
Ejemplo n.º 16
0
def get_response_content(fs):
    """
    Report spectral summaries of a tree and of its augmented distance matrix.
    The Laplacian of the tree is reduced by Schur complementation over the
    indices that follow the leaves, and the eigendecomposition of its
    pseudoinverse is shown next to that of Euclid.edm_to_dccov applied
    to the augmented distance matrix.
    @param fs: provides the tree_string and ndups attributes
    @return: a multi-line string
    """
    # parse the newick string and count the vertices and the tips
    tree = NewickIO.parse(fs.tree_string, FelTree.NewickTree)
    nvertices = sum(1 for node in tree.preorder())
    nleaves = sum(1 for tip in tree.gen_tips())
    # the ids are ordered so that the leaves come first
    ordered_ids = get_ordered_ids(tree)
    # the distance matrix and its augmented counterpart
    D = np.array(tree.get_partial_distance_matrix(ordered_ids))
    D_aug = get_augmented_distance(D, nleaves, fs.ndups)
    # the Laplacian and its Schur complement
    L = Euclid.edm_to_laplacian(D)
    non_leaf_indices = set(range(nleaves, nvertices))
    R = SchurAlgebra.mschur(L, non_leaf_indices)
    # full and principal eigendecompositions for the Schur complement
    R_pinv = np.linalg.pinv(R)
    vals, vecs = EigUtil.eigh(R_pinv)
    w, v = EigUtil.principal_eigh(R_pinv)
    fiedler = v * math.sqrt(w)
    # full and principal eigendecompositions for the augmented matrix
    L_aug_pinv = Euclid.edm_to_dccov(D_aug)
    vals_aug, vecs_aug = EigUtil.eigh(L_aug_pinv)
    w_aug, v_aug = EigUtil.principal_eigh(L_aug_pinv)
    fiedler_aug = v_aug * math.sqrt(w_aug)
    # each report entry is (title, value, number of trailing blank lines)
    np.set_printoptions(linewidth=300, threshold=10000)
    report = [
        ("Laplacian matrix:", L, 1),
        ("Schur complement of Laplacian matrix:", R, 1),
        ("scaled Fiedler vector of Schur complement:", fiedler, 1),
        ("eigenvalues of pinv of Schur complement:", vals, 1),
        ("corresponding eigenvectors of pinv of Schur complement:",
            np.array(vecs).T, 2),
        ("augmented distance matrix:", D_aug, 1),
        ("scaled Fiedler vector of augmented Laplacian limit:",
            fiedler_aug, 1),
        ("eigenvalues of pinv of augmented Laplacian limit:", vals_aug, 1),
        ("rows are eigenvectors of pinv of augmented Laplacian limit:",
            np.array(vecs_aug), 0),
    ]
    out = StringIO()
    for title, value, nblank in report:
        print >> out, title
        print >> out, value
        for blank in range(nblank):
            print >> out
    return out.getvalue()
Ejemplo n.º 17
0
def get_response_content(fs):
    """
    Compare the scaled Fiedler vectors of a tree and of its augmented graph.
    Both Laplacians are reduced by Schur complementation over the indices
    that follow the leaves before the principal eigenpair of the
    pseudoinverse is extracted.
    @param fs: provides the tree_string and strength attributes
    @return: a multi-line string
    """
    # parse the newick string and count the vertices and the tips
    tree = NewickIO.parse(fs.tree_string, FelTree.NewickTree)
    nvertices = sum(1 for node in tree.preorder())
    nleaves = sum(1 for tip in tree.gen_tips())
    # the ids are ordered so that the leaves come first
    ordered_ids = get_ordered_ids(tree)
    # plain and augmented adjacency matrices
    A = np.array(tree.get_affinity_matrix(ordered_ids))
    A_aug = get_augmented_adjacency(A, nleaves, fs.strength)
    # plain and augmented Laplacians
    L = Euclid.adjacency_to_laplacian(A)
    L_aug = Euclid.adjacency_to_laplacian(A_aug)
    # Schur complements; a fresh index set is built for each call
    non_leaf_indices = range(nleaves, nvertices)
    R = SchurAlgebra.mschur(L, set(non_leaf_indices))
    R_aug = SchurAlgebra.mschur(L_aug, set(non_leaf_indices))
    # scaled Fiedler vectors from the pseudoinverses
    w, v = EigUtil.principal_eigh(np.linalg.pinv(R))
    fiedler = v * math.sqrt(w)
    w_aug, v_aug = EigUtil.principal_eigh(np.linalg.pinv(R_aug))
    fiedler_aug = v_aug * math.sqrt(w_aug)
    # every report entry is followed by a single blank line
    np.set_printoptions(linewidth=200)
    report = [
        ("Laplacian matrix:", L),
        ("Schur complement of Laplacian matrix:", R),
        ("scaled Fiedler vector:", fiedler),
        ("augmented Laplacian matrix:", L_aug),
        ("Schur complement of augmented Laplacian matrix:", R_aug),
        ("scaled Fiedler vector of augmented matrix:", fiedler_aug),
    ]
    out = StringIO()
    for title, value in report:
        print >> out, title
        print >> out, value
        print >> out
    return out.getvalue()
Ejemplo n.º 18
0
def update_generalized_nj(D, index_set):
    """
    Build a reduced distance matrix using a neighbor-joining-like criterion.
    The indices in index_set are collapsed into a single outgroup entry.
    The length of the branch defined by the split is divided evenly
    between the two successor distance matrices.
    @param D: the distance matrix, indexed as D[i, j]
    @param index_set: the subset of indices that will be removed from the updated distance matrix
    @return: a new distance matrix
    @raise ValueError: when either side of the split has fewer than two indices
    """
    n = len(D)
    A = set(range(n)) - set(index_set)
    B = set(index_set)
    nA, nB = len(A), len(B)
    if min(nA, nB) < 2:
        raise ValueError("expected each side of the split to have at least two elements")
    # The split of the indices into A and B defines a single internal branch.
    # alpha: the average distance from A to the branch.
    # beta: the average distance from B to the branch.
    # gamma: the length of the branch.
    # R[i]: the expected distance from i to a taxon in the other group.
    R = {}
    for i in A:
        R[i] = sum(D[i, b] for b in B) / float(nB)
    for j in B:
        R[j] = sum(D[a, j] for a in A) / float(nA)

    def half_min_excess(group):
        # half of the smallest R[i] + R[j] - D[i, j] over pairs in the group
        return 0.5 * min(
                R[i] + R[j] - D[i, j]
                for i, j in itertools.combinations(group, 2))

    gamma_plus_beta = half_min_excess(A)
    alpha_plus_gamma = half_min_excess(B)
    cross_total = sum(D[i, j] for i, j in itertools.product(A, B))
    alpha_plus_gamma_plus_beta = cross_total / float(nA * nB)
    gamma = alpha_plus_gamma + gamma_plus_beta - alpha_plus_gamma_plus_beta
    beta = gamma_plus_beta - gamma
    # Start from the merged matrix.
    D_out = SchurAlgebra.mmerge(D, index_set)
    # The outgroup index is the number of kept indices below the smallest removed index.
    smallest_removed = min(B)
    outgroup_index = sum(1 for a in A if a < smallest_removed)
    D_out[outgroup_index, outgroup_index] = 0
    # Rewrite the outgroup row and column with the distances to the outgroup.
    label_sets = SchurAlgebra.vmerge([set([i]) for i in range(n)], index_set)
    for i, labels in enumerate(label_sets):
        if i == outgroup_index:
            continue
        a = iterutils.get_only(labels)
        d = R[a] - beta - 0.5 * gamma
        D_out[i, outgroup_index] = D_out[outgroup_index, i] = d
    return D_out
Ejemplo n.º 19
0
def get_response_content(fs):
    """
    Show the Laplacian, its Schur complement, and the scaled Fiedler vector
    for a tree and for the augmented version of the same tree.
    @param fs: provides the tree_string and strength attributes
    @return: a multi-line string
    """
    # parse the newick string and count the vertices and the tips
    tree = NewickIO.parse(fs.tree_string, FelTree.NewickTree)
    nvertices = sum(1 for node in tree.preorder())
    nleaves = sum(1 for tip in tree.gen_tips())
    # the ids are ordered so that the leaves come first
    ordered_ids = get_ordered_ids(tree)
    # plain and augmented adjacency matrices
    A = np.array(tree.get_affinity_matrix(ordered_ids))
    A_aug = get_augmented_adjacency(A, nleaves, fs.strength)
    # plain and augmented Laplacians
    L = Euclid.adjacency_to_laplacian(A)
    L_aug = Euclid.adjacency_to_laplacian(A_aug)
    # Schur complements; a fresh index set is built for each call
    non_leaf_indices = range(nleaves, nvertices)
    R = SchurAlgebra.mschur(L, set(non_leaf_indices))
    R_aug = SchurAlgebra.mschur(L_aug, set(non_leaf_indices))
    # scaled Fiedler vectors from the pseudoinverses
    w, v = EigUtil.principal_eigh(np.linalg.pinv(R))
    fiedler = v * math.sqrt(w)
    w_aug, v_aug = EigUtil.principal_eigh(np.linalg.pinv(R_aug))
    fiedler_aug = v_aug * math.sqrt(w_aug)
    # titles and values are paired up in the order they are reported
    np.set_printoptions(linewidth=200)
    titles = (
        'Laplacian matrix:',
        'Schur complement of Laplacian matrix:',
        'scaled Fiedler vector:',
        'augmented Laplacian matrix:',
        'Schur complement of augmented Laplacian matrix:',
        'scaled Fiedler vector of augmented matrix:',
    )
    values = (L, R, fiedler, L_aug, R_aug, fiedler_aug)
    out = StringIO()
    for title, value in zip(titles, values):
        print >> out, title
        print >> out, value
        print >> out
    return out.getvalue()
Ejemplo n.º 20
0
def get_response_content(fs):
    """
    Split a distance matrix into two successor matrices.
    The selected labels define one side of the split and every other label
    defines the other side; each side is reduced with
    BuildTreeTopology.update_generalized_nj.
    @param fs: provides the matrix, labels, and selection attributes
    @return: a multi-line string
    @raise HandlingError: when the label count does not match the matrix
    """
    # read the matrix and both lists of labels
    D = np.array(fs.matrix)
    n = len(D)
    ordered_labels = Util.get_stripped_lines(StringIO(fs.labels))
    selected_labels = Util.get_stripped_lines(StringIO(fs.selection))
    # validate the input
    if len(ordered_labels) != n:
        raise HandlingError("the number of taxon labels should match the number of rows in the distance matrix")
    # side A holds the selected indices and side B holds the rest
    index_set_A = set(
            i for i, label in enumerate(ordered_labels)
            if label in selected_labels)
    index_set_B = set(range(n)) - index_set_A
    # internal values related to the split; R itself is not reported
    R, alpha, beta, gamma = get_R_alpha_beta_gamma(D, index_set_B)
    # each new matrix is obtained by removing the opposite side
    D_A = BuildTreeTopology.update_generalized_nj(D, index_set_B)
    D_B = BuildTreeTopology.update_generalized_nj(D, index_set_A)
    # names associated with the rows of the new matrices
    all_names = [set([name]) for name in ordered_labels]
    merged_A = SchurAlgebra.vmerge(all_names, index_set_B)
    D_A_names = [set_to_string(x) for x in merged_A]
    merged_B = SchurAlgebra.vmerge(all_names, index_set_A)
    D_B_names = [set_to_string(x) for x in merged_B]
    # show the branch related values
    out = StringIO()
    for caption, value in (
            ("alpha:", alpha), ("beta:", beta), ("gamma:", gamma)):
        print >> out, caption, value
    # show each matrix with its labels; a blank line precedes each section
    sections = (
        ("new distance matrix corresponding to the selected names:",
            D_A, D_A_names),
        ("new distance matrix corresponding to the non-selected names:",
            D_B, D_B_names),
    )
    for heading, matrix, names in sections:
        print >> out
        print >> out, heading
        print >> out, MatrixUtil.m_to_string(matrix)
        print >> out
        print >> out, "ordered labels corresponding to this matrix:"
        for name in names:
            print >> out, name
    return out.getvalue()
Ejemplo n.º 21
0
def get_response_content(fs):
    """
    Remove the selected taxa from a Laplacian matrix by Schur complementation.
    The reduced Laplacian, the distance matrix derived from it, and the
    labels of the remaining taxa are reported.
    @param fs: provides the laplacian, labels, and selection attributes
    @return: a multi-line string
    @raise HandlingError: when the labels or the selection are not usable
    """
    L = fs.laplacian
    # the ordered labels must be present and unique
    ordered_labels = Util.get_stripped_lines(StringIO(fs.labels))
    if not ordered_labels:
        raise HandlingError('no ordered taxa were provided')
    known_labels = set(ordered_labels)
    if len(known_labels) != len(ordered_labels):
        raise HandlingError('the ordered taxa should be unique')
    # enough taxa must be selected
    min_selected_labels = 2
    min_unselected_labels = 1
    selected_labels = set(Util.get_stripped_lines(StringIO(fs.selection)))
    if len(selected_labels) < min_selected_labels:
        message = 'at least %d taxa should be selected to be grouped'
        raise HandlingError(message % min_selected_labels)
    # enough taxa must remain unselected
    unselected_labels = known_labels - selected_labels
    if len(unselected_labels) < min_unselected_labels:
        message = 'at least %d taxa should remain outside the selected group'
        raise HandlingError(message % min_unselected_labels)
    # every selected label must be one of the ordered labels
    weird_labels = selected_labels - known_labels
    if weird_labels:
        raise HandlingError(
            'some selected taxa are invalid: ' + str(weird_labels))
    # the matrix must have one row per ordered label
    n = len(L)
    if n != len(ordered_labels):
        raise HandlingError(
            'the number of listed taxa does not match the number of rows in the distance matrix')
    # indices of the selected taxa, and the labels that survive in order
    index_selection = set(
        i for i, label in enumerate(ordered_labels)
        if label in selected_labels)
    remaining_labels = [
        ordered_labels[i] for i in range(n) if i not in index_selection]
    # reduce the Laplacian and derive the distance matrix
    L_small = SchurAlgebra.mschur(L, index_selection)
    D_small = Euclid.laplacian_to_edm(L_small)
    # write the matrices followed by the labels of their rows
    out = StringIO()
    print >> out, 'new laplacian matrix:'
    print >> out, MatrixUtil.m_to_string(L_small)
    print >> out
    print >> out, 'new distance matrix:'
    print >> out, MatrixUtil.m_to_string(D_small)
    print >> out
    print >> out, 'new taxon labels:'
    for label in remaining_labels:
        print >> out, label
    return out.getvalue()
Ejemplo n.º 22
0
def update_using_laplacian(D, index_set):
    """
    Reduce a distance matrix by merging indices of its Laplacian.
    The distance matrix is converted to a Laplacian, the given indices
    are merged with SchurAlgebra.mmerge, and the result is converted back.
    @param D: the distance matrix
    @param index_set: the set of indices that will be removed from the updated distance matrix
    @return: an updated distance matrix
    """
    laplacian = Euclid.edm_to_laplacian(D)
    merged_laplacian = SchurAlgebra.mmerge(laplacian, index_set)
    return Euclid.laplacian_to_edm(merged_laplacian)
Ejemplo n.º 23
0
def update_using_laplacian(D, index_set):
    """
    Merge the given indices in Laplacian space and return the new distances.
    @param D: the distance matrix
    @param index_set: the set of indices that will be removed from the updated distance matrix
    @return: an updated distance matrix
    """
    # distance matrix -> Laplacian -> merged Laplacian -> distance matrix
    return Euclid.laplacian_to_edm(
            SchurAlgebra.mmerge(Euclid.edm_to_laplacian(D), index_set))
Ejemplo n.º 24
0
def get_response_content(fs):
    """
    Eliminate the selected taxa from a Laplacian by Schur complementation
    and report the reduced Laplacian, its distance matrix, and the labels
    of the taxa that remain.
    @param fs: provides the laplacian, labels, and selection attributes
    @return: a multi-line string
    @raise HandlingError: when the labels or the selection are not usable
    """
    # read the matrix and the ordered labels
    L = fs.laplacian
    ordered_labels = Util.get_stripped_lines(StringIO(fs.labels))
    if not ordered_labels:
        raise HandlingError('no ordered taxa were provided')
    label_universe = set(ordered_labels)
    if len(ordered_labels) != len(label_universe):
        raise HandlingError('the ordered taxa should be unique')
    # check the sizes of the selection and of its complement
    min_selected_labels = 2
    min_unselected_labels = 1
    selected_labels = set(Util.get_stripped_lines(StringIO(fs.selection)))
    if len(selected_labels) < min_selected_labels:
        raise HandlingError('at least %d taxa should be selected to be grouped' % min_selected_labels)
    unselected_labels = label_universe - selected_labels
    if len(unselected_labels) < min_unselected_labels:
        raise HandlingError('at least %d taxa should remain outside the selected group' % min_unselected_labels)
    # reject selected labels that are not among the ordered labels
    weird_labels = selected_labels - label_universe
    if weird_labels:
        raise HandlingError('some selected taxa are invalid: ' + str(weird_labels))
    # the matrix must have one row per ordered label
    n = len(L)
    if len(ordered_labels) != n:
        raise HandlingError('the number of listed taxa does not match the number of rows in the distance matrix')
    # partition the indices into the selection and its complement
    index_selection = set()
    index_complement = []
    for i, label in enumerate(ordered_labels):
        if label in selected_labels:
            index_selection.add(i)
        else:
            index_complement.append(i)
    # calculate the new laplacian matrix and its distance matrix
    out = StringIO()
    L_small = SchurAlgebra.mschur(L, index_selection)
    D_small = Euclid.laplacian_to_edm(L_small)
    # print each matrix followed by a blank line
    for title, matrix in (
            ('new laplacian matrix:', L_small),
            ('new distance matrix:', D_small)):
        print >> out, title
        print >> out, MatrixUtil.m_to_string(matrix)
        print >> out
    # print the labels of the remaining rows in their original order
    print >> out, 'new taxon labels:'
    for index in index_complement:
        print >> out, ordered_labels[index]
    return out.getvalue()
Ejemplo n.º 25
0
def process(npoints, nseconds):
    """
    Search for the sampled point set whose merged Laplacian has the lowest eigenvalue.
    Point sets are sampled repeatedly until the time budget is spent.
    If the budget allows no sample at all then only the count line is returned.
    @param npoints: attempt to form each counterexample from this many points
    @param nseconds: allow this many seconds to run
    @return: a multi-line string that summarizes the results
    """
    start_time = time.time()
    best_result = None
    nchecked = 0
    while time.time() - start_time < nseconds:
        # look for a counterexample
        points = sample_points(npoints)
        D = points_to_edm(points)
        L = Euclid.edm_to_laplacian(D)
        # merge the first two indices of the Laplacian
        L_small = SchurAlgebra.mmerge(L, set([0, 1]))
        w = np.linalg.eigvalsh(L_small)
        D_small = Euclid.laplacian_to_edm(L_small)
        result = Counterexample(points, D, w, D_small)
        # keep the result whose smallest eigenvalue is lowest
        if best_result is None:
            best_result = result
        elif min(result.L_eigenvalues) < min(best_result.L_eigenvalues):
            best_result = result
        nchecked += 1
    out = StringIO()
    print >> out, 'checked', nchecked, 'matrices each formed from', npoints, 'points'
    if best_result is None:
        # The loop never ran, so there is no result to describe;
        # previously this case failed with an AttributeError on None.
        return out.getvalue().strip()
    print >> out
    print >> out, 'eigenvalues of the induced matrix with lowest eigenvalue:'
    for value in reversed(sorted(best_result.L_eigenvalues)):
        print >> out, value
    print >> out
    print >> out, 'corresponding induced distance matrix:'
    print >> out, MatrixUtil.m_to_string(best_result.D_small)
    print >> out
    print >> out, 'the original distance matrix corresponding to this matrix:'
    print >> out, MatrixUtil.m_to_string(best_result.D)
    print >> out
    print >> out, 'the points that formed the original distance matrix:'
    for point in best_result.points:
        print >> out, '\t'.join(str(x) for x in point)
    return out.getvalue().strip()
Ejemplo n.º 26
0
def process(npoints, nseconds):
    """
    Sample point sets for a limited time and report the most extreme one.
    For each sample the first two indices of the Laplacian are merged, and
    the sample whose merged Laplacian has the lowest eigenvalue is kept.
    When no sample fits in the time budget only the count line is returned.
    @param npoints: attempt to form each counterexample from this many points
    @param nseconds: allow this many seconds to run
    @return: a multi-line string that summarizes the results
    """
    start_time = time.time()
    best_result = None
    nchecked = 0
    while time.time() - start_time < nseconds:
        # look for a counterexample
        points = sample_points(npoints)
        D = points_to_edm(points)
        L = Euclid.edm_to_laplacian(D)
        L_small = SchurAlgebra.mmerge(L, set([0, 1]))
        w = np.linalg.eigvalsh(L_small)
        D_small = Euclid.laplacian_to_edm(L_small)
        result = Counterexample(points, D, w, D_small)
        # see if the counterexample is more extreme than the best so far
        if best_result is None:
            best_result = result
        elif min(result.L_eigenvalues) < min(best_result.L_eigenvalues):
            best_result = result
        nchecked += 1
    out = StringIO()
    print >> out, 'checked', nchecked, 'matrices each formed from', npoints, 'points'
    if best_result is None:
        # Nothing was sampled (for example nseconds is zero), so stop here
        # instead of dereferencing None while writing the details.
        return out.getvalue().strip()
    print >> out
    print >> out, 'eigenvalues of the induced matrix with lowest eigenvalue:'
    for value in reversed(sorted(best_result.L_eigenvalues)):
        print >> out, value
    print >> out
    print >> out, 'corresponding induced distance matrix:'
    print >> out, MatrixUtil.m_to_string(best_result.D_small)
    print >> out
    print >> out, 'the original distance matrix corresponding to this matrix:'
    print >> out, MatrixUtil.m_to_string(best_result.D)
    print >> out
    print >> out, 'the points that formed the original distance matrix:'
    for point in best_result.points:
        print >> out, '\t'.join(str(x) for x in point)
    return out.getvalue().strip()
Ejemplo n.º 27
0
def get_subtree_messages(D, eigensplit, ordered_tip_names):
    """
    Describe the Fiedler valuations within each side of a split.
    Each side is analyzed using the principal submatrix of D
    restricted to the indices of that side.
    @param D: the matrix of pairwise distances among tips of the tree
    @param eigensplit: the split induced by the fiedler vector
    @param ordered_tip_names: names of the tips of the tree conformant to v and D
    @return: a multi-line string
    """
    out = StringIO()
    n = len(D)
    all_labels = set(range(n))
    singleton_label_sets = [set([index]) for index in range(n)]
    for subtree_number, child in enumerate(eigensplit, 1):
        # drop everything that is not on this side of the split
        complement = all_labels - child
        D_child = MatrixUtil.get_principal_submatrix(D, sorted(child))
        child_label_sets = SchurAlgebra.vdelete(
                singleton_label_sets, complement)
        v_child = BuildTreeTopology.edm_to_fiedler(D_child)
        # one line per remaining label set
        print >> out, 'the Fiedler split of Schur complements of subtree', subtree_number
        for label_set, value in zip(child_label_sets, v_child):
            print >> out, label_set_to_string(label_set, ordered_tip_names), ':', value
        print >> out
    return out.getvalue().strip()
Ejemplo n.º 28
0
def get_subtree_messages(D, eigensplit, ordered_tip_names):
    """
    Report the Fiedler valuation of the tips on each side of a split.
    @param D: the matrix of pairwise distances among tips of the tree
    @param eigensplit: the split induced by the fiedler vector
    @param ordered_tip_names: names of the tips of the tree conformant to v and D
    @return: a multi-line string
    """
    n = len(D)
    every_index = set(range(n))
    index_singletons = [set([k]) for k in range(n)]
    out = StringIO()
    for position, side in enumerate(eigensplit):
        # restrict the distances and the labels to this side of the split
        other_side = every_index - side
        side_indices = sorted(side)
        D_side = MatrixUtil.get_principal_submatrix(D, side_indices)
        side_label_sets = SchurAlgebra.vdelete(index_singletons, other_side)
        v_side = BuildTreeTopology.edm_to_fiedler(D_side)
        print >> out, 'the Fiedler split of Schur complements of subtree', position + 1
        for label_set, loading in zip(side_label_sets, v_side):
            label_text = label_set_to_string(label_set, ordered_tip_names)
            print >> out, label_text, ':', loading
        print >> out
    return out.getvalue().strip()
Ejemplo n.º 29
0
def get_response_content(fs):
    """
    Draw a graph embedded by the pseudoinverse of its Laplacian.
    Nodes are placed using the first two coordinates returned by
    Euclid.dccov_to_points and are colored by the first coordinate.
    @param fs: provides the image, axis, weighting, and schur options
    @return: the value returned by get_image_string
    """
    # use a default border; actually this is ignored
    border_info = BorderInfo(10, 10)
    # collect the axis options
    axis_info = AxisInfo(fs.flip_x, fs.flip_y, fs.show_x, fs.show_y)
    # read the points and edges
    points, edges = read_points_and_edges(fs.graph_data)
    # an edge weight is the reciprocal edge length, or one if unweighted
    if not fs.weighted:
        weights = [1.0 for edge in edges]
    else:
        locations = [np.array(point) for point in points]
        lengths = [
                np.linalg.norm(locations[target] - locations[source])
                for source, target in edges]
        weights = [1.0 / length for length in lengths]
    # create the full laplacian
    L = edges_to_laplacian(edges, weights)
    if fs.schur:
        # remove the nodes of degree greater than one by schur complementation
        internal_indices = set()
        for index, degree in enumerate(edges_to_node_degrees(edges)):
            if degree > 1:
                internal_indices.add(index)
        L_final = SchurAlgebra.mschur(L, internal_indices)
    else:
        L_final = L
    # embed the nodes and color them by their first coordinate
    X = Euclid.dccov_to_points(np.linalg.pinv(L_final))
    xs, ys = zip(*[(row[0], row[1]) for row in X])
    colors = valuations_to_colors(xs)
    # draw the image
    ext = Form.g_imageformat_to_ext[fs.imageformat]
    image_info = ImageInfo(
            fs.width, fs.height, fs.black, fs.show_labels,
            axis_info, border_info, ext)
    return get_image_string(xs, ys, colors, edges, image_info, fs.scale)
Ejemplo n.º 30
0
 def test_commutativity(self):
     """
     Check that Schur complementation and merging commute.
     The same small Laplacian must be reached from the big Laplacian
     regardless of the order in which the two operations are applied.
     """
     # entries are reciprocals of the adjacency weights, with zero meaning no edge
     big_rows = [
             [0, 0, 0, 0, 0, 0, 0, 2, 0, 0],
             [0, 0, 0, 0, 0, 0, 2, 0, 0, 0],
             [0, 0, 0, 0, 0, 0, 9, 0, 0, 0],
             [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
             [0, 0, 0, 0, 0, 0, 0, 0, 3, 0],
             [0, 0, 0, 0, 0, 0, 0, 0, 0, 2],
             [0, 2, 9, 0, 0, 0, 0, 4, 0, 0],
             [2, 0, 0, 0, 0, 0, 4, 0, 0, 1],
             [0, 0, 0, 1, 3, 0, 0, 0, 0, 7],
             [0, 0, 0, 0, 0, 2, 0, 1, 7, 0]]
     big_reciprocals = np.array(big_rows, dtype=float)
     L_big = adjacency_to_laplacian(self.nonzero_reciprocal(big_reciprocals))
     # define the pruned branch length
     p = 101.0 / 39.0
     small_rows = [
             [0, 0, 0, 0, 0, 2],
             [0, 0, 0, 0, 2, 0],
             [0, 0, 0, 0, 9, 0],
             [0, 0, 0, 0, 0, p],
             [0, 2, 9, 0, 0, 4],
             [2, 0, 0, p, 4, 0]]
     small_reciprocals = np.array(small_rows)
     L_small = adjacency_to_laplacian(self.nonzero_reciprocal(small_reciprocals))
     # schur complementation followed by merging
     complemented = SchurAlgebra.mschur(L_big, set([8, 9]))
     schur_then_merge = SchurAlgebra.mmerge(complemented, set([3, 4, 5]))
     self.assertTrue(np.allclose(L_small, schur_then_merge))
     # merging followed by schur complementation
     merged = SchurAlgebra.mmerge(L_big, set([3, 4, 5]))
     merge_then_schur = SchurAlgebra.mschur(merged, set([6, 7]))
     self.assertTrue(np.allclose(L_small, merge_then_schur))
     # the 4x4 laplacian reached from the big matrix and from the small matrix
     fully_complemented = SchurAlgebra.mschur(L_big, set([6, 7, 8, 9]))
     from_big = SchurAlgebra.mmerge(fully_complemented, set([3, 4, 5]))
     from_small = SchurAlgebra.mschur(L_small, set([4, 5]))
     self.assertTrue(np.allclose(from_big, from_small))
Ejemplo n.º 31
0
class TreeSearch:
    """
    This is a virtual base class.
    A search repeatedly samples branch lengths for a fixed tree until the
    spectral splits of the resulting distance matrix meet the requirements.
    The boolean requirements and the search options start as None and must
    all be assigned before do_search is called.
    """

    def __init__(self):
        # boolean requirements defined by the user
        self.informative_children = None
        self.force_difference = None
        self.informative_full_split = None
        self.invalid_dendrogram = None
        # search options defined by the subclass
        self.tree = None
        self.desired_primary_split = None
        self.id_to_index = None
        # initialize the counts that are tracked for bookkeeping
        self.aug_split_collision_count = 0
        self.aug_split_degenerate_count = 0
        self.error_primary_split_count = 0
        self.invalid_primary_split_count = 0
        self.degenerate_primary_split_count = 0
        self.undesired_primary_split_count = 0
        self.desired_primary_split_count = 0
        self.uninformative_child_count = 0
        self.informative_child_count = 0
        self.valid_dendrogram_count = 0
        self.success_count = 0

    def is_initialized(self):
        """
        @return: True if every requirement and search option has been set
        """
        required_data = [
                self.informative_children,
                self.force_difference,
                self.informative_full_split,
                self.invalid_dendrogram,
                self.tree,
                self.desired_primary_split,
                self.id_to_index]
        # Identity tests avoid calling __eq__ on the stored objects.
        return all(value is not None for value in required_data)

    def get_result_text(self):
        """
        Summarize the bookkeeping counts gathered by do_search.
        Sections are shown only for the requirements that are enabled.
        @return: a multi-line string of text
        """
        out = StringIO()
        # each count is written with a trailing comma so that
        # its description follows on the same line
        if self.force_difference or self.informative_full_split:
            print >> out, 'full graph split stats:'
            print >> out, self.aug_split_collision_count,
            print >> out, 'full graph splits collided with the desired primary split'
            print >> out, self.aug_split_degenerate_count,
            print >> out, 'full graph splits were degenerate'
            print >> out
        print >> out, 'primary split stats:'
        print >> out, self.error_primary_split_count,
        print >> out, 'errors in finding the primary split (should be 0)'
        print >> out, self.invalid_primary_split_count,
        print >> out, 'invalid primary splits (should be 0)'
        print >> out, self.degenerate_primary_split_count,
        print >> out, 'degenerate primary splits'
        print >> out, self.undesired_primary_split_count,
        print >> out, 'primary splits were not the target split'
        print >> out, self.desired_primary_split_count,
        print >> out, 'primary splits were the target split'
        print >> out
        if self.informative_children:
            print >> out, 'secondary split stats:'
            print >> out, self.uninformative_child_count,
            print >> out, 'samples had at least one uninformative child tree'
            print >> out, self.informative_child_count,
            print >> out, 'samples had two informative child trees'
            print >> out
        if self.invalid_dendrogram:
            print >> out, 'naive dendrogram stats:'
            print >> out, self.valid_dendrogram_count,
            print >> out, 'naive dendrograms were valid'
            print >> out
        return out.getvalue().strip()

    def do_search(self, nseconds, sampling_function):
        """
        Sample branch lengths until a tree meets all enabled requirements.
        @param nseconds: allowed search time or None
        @param sampling_function: a function that samples a branch length
        @return: True if a tree was found that met the criteria
        @raise RuntimeError: if the search is not initialized or if the
        primary split is not a nontrivial split of the tree
        """
        if not self.is_initialized():
            raise RuntimeError('the search was not sufficiently initialized')
        true_splits = self.tree.get_nontrivial_splits()
        start_time = time.time()
        while True:
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                return False
            # assign new sampled branch lengths
            for branch in self.tree.get_branches():
                branch.length = sampling_function()
            # get the distance matrix so we can use a library function to get the split
            D = np.array(self.tree.get_distance_matrix())
            ntips = len(D)
            # get the Laplacian matrix of the full tree and the corresponding Fiedler split of the leaves
            if self.force_difference or self.informative_full_split:
                A_aug = np.array(self.tree.get_weighted_adjacency_matrix(self.id_to_index))
                L_aug = Euclid.adjacency_to_laplacian(A_aug)
                v_aug = BuildTreeTopology.laplacian_to_fiedler(L_aug)
                left_aug, right_aug = BuildTreeTopology.eigenvector_to_split(v_aug)
                # keep only the indices below ntips on each side of the split
                tip_indices = set(range(ntips))
                left = [x for x in left_aug if x in tip_indices]
                right = [x for x in right_aug if x in tip_indices]
                leaf_eigensplit_aug = BuildTreeTopology.make_split(left, right)
                if self.force_difference:
                    if leaf_eigensplit_aug == self.desired_primary_split:
                        self.aug_split_collision_count += 1
                        continue
                if self.informative_full_split:
                    if min(len(s) for s in leaf_eigensplit_aug) < 2:
                        self.aug_split_degenerate_count += 1
                        continue
            # get the eigensplit
            try:
                eigensplit = BuildTreeTopology.split_using_eigenvector(D)
            except BuildTreeTopology.DegenerateSplitException:
                self.degenerate_primary_split_count += 1
                continue
            except BuildTreeTopology.InvalidSpectralSplitException:
                self.error_primary_split_count += 1
                continue
            if eigensplit not in true_splits:
                # Count the event, then report the tree held by this search;
                # the bare name 'tree' is not defined in this scope.
                self.invalid_primary_split_count += 1
                raise RuntimeError('INVALID SPLIT:' + self.tree.get_newick_string())
            if eigensplit != self.desired_primary_split:
                self.undesired_primary_split_count += 1
                continue
            self.desired_primary_split_count += 1
            # check the splits of the two child trees
            degenerate_subsplit_count = 0
            L = Euclid.edm_to_laplacian(D)
            for side in eigensplit:
                L_child = SchurAlgebra.mmerge(L, side)
                v = BuildTreeTopology.laplacian_to_fiedler(L_child)
                child_eigensplit = BuildTreeTopology.eigenvector_to_split(v)
                if min(len(s) for s in child_eigensplit) < 2:
                    degenerate_subsplit_count += 1
            if degenerate_subsplit_count:
                self.uninformative_child_count += 1
            else:
                self.informative_child_count += 1
            if self.informative_children:
                if degenerate_subsplit_count:
                    continue
            # check the dendrogram; a sample is rejected when the naive
            # dendrogram recovers exactly the true splits
            if self.invalid_dendrogram:
                labels = range(len(D))
                hierarchy = Dendrogram.get_hierarchy(D, Dendrogram.spectral_split, labels)
                dendrogram_splits = set(Dendrogram.hierarchy_to_nontrivial_splits(hierarchy))
                if dendrogram_splits == true_splits:
                    self.valid_dendrogram_count += 1
                    continue
            # the tree has met all of the requirements
            return True
Ejemplo n.º 32
0
def get_standard_response(fs):
    """
    Report the primary spectral split of the pruned tree and both subsplits.
    The cgi arguments are accepted but are not read by this function.
    @param fs: a FieldStorage object containing the cgi arguments
    @return: a (response_headers, response_text) pair
    """
    out = StringIO()
    # each clade (a collection of names defined outside this function)
    # is paired with the label used for it in the report
    clade_name_pairs = ((archaea_names, 'archaea'), (bacteria_names, 'bacteria'), (eukaryota_names, 'eukaryota'))
    # summarize the names before pruning
    print >> out, 'data summary before removing branches with zero length:'
    for names, description in (
            (archaea_names, 'archaea names in the original tree'),
            (bacteria_names, 'bacteria names in the original tree'),
            (eukaryota_names, 'eukaryota names in the original tree'),
            (all_names, 'total names in the original tree')):
        print >> out, len(names), description
    print >> out
    # prune the full tree and list its tip names in traversal order
    pruned_full_tree = get_pruned_tree(full_tree)
    ordered_names = [tip.get_name() for tip in pruned_full_tree.gen_tips()]
    tip_count = len(ordered_names)
    # summarize the names after pruning
    print >> out, 'data summary after removing branches with zero length:'
    print >> out, tip_count, 'total names in the processed non-degenerate tree'
    print >> out
    # show the pruned tree as a wrapped newick string
    print >> out, 'this is the tree that represents all clades but for which redundant nodes have been pruned:'
    print >> out, NewickIO.get_narrow_newick_string(pruned_full_tree, 120)
    print >> out
    # distance matrix -> laplacian -> fiedler vector -> primary split
    D = np.array(pruned_full_tree.get_distance_matrix(ordered_names))
    L = Euclid.edm_to_laplacian(D)
    fiedler = BuildTreeTopology.laplacian_to_fiedler(L)
    eigensplit = BuildTreeTopology.eigenvector_to_split(fiedler)
    print >> out, get_eigendecomposition_report(D)
    # say which clades appear on each side of the primary split
    side_names = [set(ordered_names[i] for i in side) for side in eigensplit]
    print >> out, 'clade intersections with each side of the split:'
    for side_name, side in zip(('left', 'right'), side_names):
        for clade, clade_name in clade_name_pairs:
            if clade & side:
                print >> out, 'the', side_name, 'side intersects', clade_name
    print >> out
    # each secondary split merges one side of the primary split
    # and then splits what remains
    left_indices, right_indices = eigensplit
    full_label_sets = [set([i]) for i in range(tip_count)]
    for index_complement in (right_indices, left_indices):
        L_s1 = SchurAlgebra.mmerge(L, index_complement)
        next_label_sets = SchurAlgebra.vmerge(full_label_sets, index_complement)
        subfiedler = BuildTreeTopology.laplacian_to_fiedler(L_s1)
        left_subindices, right_subindices = BuildTreeTopology.eigenvector_to_split(subfiedler)
        # expand the merged indices back to the names of the original tips
        left_subnames = set(
                ordered_names[label]
                for i in left_subindices for label in next_label_sets[i])
        right_subnames = set(
                ordered_names[label]
                for i in right_subindices for label in next_label_sets[i])
        print >> out, 'clade intersections with a subsplit:'
        for message, subnames in (
                ('the left side intersects', left_subnames),
                ('the right side intersects', right_subnames)):
            for clade, clade_name in clade_name_pairs:
                if clade & subnames:
                    print >> out, message, clade_name
        print >> out
    # dump the sorted names of each clade as debug info
    for header, names in (
            ('archaea names:', archaea_names),
            ('bacteria names:', bacteria_names),
            ('eukaryota names:', eukaryota_names)):
        print >> out, header
        print >> out, '\n'.join(sorted(names))
        print >> out
    # package the report as plain text
    response_headers = [('Content-Type', 'text/plain')]
    return response_headers, out.getvalue().strip()
Ejemplo n.º 33
0
def get_standard_response(fs):
    """
    Write a plain-text report about spectral splits of the pruned full tree.
    The report covers the primary split and the two secondary splits,
    listing which clades intersect each side.
    The fs argument is not read by this function.
    @param fs: a FieldStorage object containing the cgi arguments
    @return: a (response_headers, response_text) pair
    """
    out = StringIO()
    # counts of names in the original tree
    print >> out, 'data summary before removing branches with zero length:'
    summary_rows = [
        (archaea_names, 'archaea names in the original tree'),
        (bacteria_names, 'bacteria names in the original tree'),
        (eukaryota_names, 'eukaryota names in the original tree'),
        (all_names, 'total names in the original tree'),
    ]
    for name_collection, caption in summary_rows:
        print >> out, len(name_collection), caption
    print >> out
    # the pruned tree and the names of its tips
    pruned_full_tree = get_pruned_tree(full_tree)
    ordered_names = []
    for node in pruned_full_tree.gen_tips():
        ordered_names.append(node.get_name())
    # counts of names in the pruned tree
    print >> out, 'data summary after removing branches with zero length:'
    print >> out, len(
        ordered_names), 'total names in the processed non-degenerate tree'
    print >> out
    # the pruned tree itself, wrapped to 120 columns
    narrow_newick = NewickIO.get_narrow_newick_string(pruned_full_tree, 120)
    print >> out, 'this is the tree that represents all clades but for which redundant nodes have been pruned:'
    print >> out, narrow_newick
    print >> out
    # the primary split comes from the fiedler vector of the laplacian
    D = np.array(pruned_full_tree.get_distance_matrix(ordered_names))
    L = Euclid.edm_to_laplacian(D)
    eigensplit = BuildTreeTopology.eigenvector_to_split(
        BuildTreeTopology.laplacian_to_fiedler(L))
    print >> out, get_eigendecomposition_report(D)
    # which clades intersect which side of the primary split
    domains = (
        (archaea_names, 'archaea'),
        (bacteria_names, 'bacteria'),
        (eukaryota_names, 'eukaryota'),
    )
    side_names = []
    for side in eigensplit:
        side_names.append(set(ordered_names[i] for i in side))
    print >> out, 'clade intersections with each side of the split:'
    for side, side_name in zip(side_names, ('left', 'right')):
        hits = [label for clade, label in domains if clade & side]
        for clade_name in hits:
            print >> out, 'the', side_name, 'side intersects', clade_name
    print >> out
    # for each secondary split, one side of the primary split is merged
    left_indices, right_indices = eigensplit
    full_label_sets = [set([i]) for i in range(len(ordered_names))]
    for index_complement in (right_indices, left_indices):
        L_s1 = SchurAlgebra.mmerge(L, index_complement)
        next_label_sets = SchurAlgebra.vmerge(full_label_sets,
                                              index_complement)
        v = BuildTreeTopology.laplacian_to_fiedler(L_s1)
        left_subindices, right_subindices = BuildTreeTopology.eigenvector_to_split(
            v)
        # translate each side of the subsplit back to original tip names
        subnames_per_side = []
        for subindices in (left_subindices, right_subindices):
            sublabels = set()
            for i in subindices:
                sublabels |= set(next_label_sets[i])
            subnames_per_side.append(
                set(ordered_names[i] for i in sublabels))
        left_subnames, right_subnames = subnames_per_side
        print >> out, 'clade intersections with a subsplit:'
        left_hits = [label for clade, label in domains
                     if clade & left_subnames]
        right_hits = [label for clade, label in domains
                      if clade & right_subnames]
        for clade_name in left_hits:
            print >> out, 'the left side intersects', clade_name
        for clade_name in right_hits:
            print >> out, 'the right side intersects', clade_name
        print >> out
    # debug info: the sorted names of each clade
    debug_sections = (
        ('archaea names:', archaea_names),
        ('bacteria names:', bacteria_names),
        ('eukaryota names:', eukaryota_names),
    )
    for title, name_collection in debug_sections:
        print >> out, title
        print >> out, '\n'.join(sorted(name_collection))
        print >> out
    # the response is plain text
    return [('Content-Type', 'text/plain')], out.getvalue().strip()
Ejemplo n.º 34
0
 def get_verbose_summary(self):
     """
     Describe the pruned tree, its primary spectral split, and both subsplits.
     @return: a multiline string
     """
     out = StringIO()
     # the number of taxa in various domains
     print >> out, self._get_name_summary()
     print >> out
     # the pruned tree as a newick string wrapped to 120 columns
     print >> out, 'this is the tree that represents all clades but for which redundant nodes have been pruned:'
     print >> out, NewickIO.get_narrow_newick_string(self.pruned_tree, 120)
     print >> out
     # distance matrix -> laplacian -> fiedler vector -> primary split
     D = np.array(self.pruned_tree.get_distance_matrix(self.pruned_names))
     L = Euclid.edm_to_laplacian(D)
     fiedler = BuildTreeTopology.laplacian_to_fiedler(L)
     eigensplit = BuildTreeTopology.eigenvector_to_split(fiedler)
     print >> out, get_eigendecomposition_report(D)
     print >> out
     # the domains found on each side of the primary split
     side_names = []
     for side in eigensplit:
         side_names.append(set(self.pruned_names[i] for i in side))
     left_domains = self._get_domains(side_names[0])
     right_domains = self._get_domains(side_names[1])
     print >> out, 'domains represented by each side of the primary split:'
     print >> out, 'the left side has:\t', ', '.join(left_domains)
     print >> out, 'the right side has:\t', ', '.join(right_domains)
     print >> out
     # each secondary split merges one side of the primary split
     # and then splits what remains
     left_indices, right_indices = eigensplit
     full_label_sets = [set([i]) for i in range(len(self.pruned_names))]
     for index_complement in (right_indices, left_indices):
         L_secondary = SchurAlgebra.mmerge(L, index_complement)
         next_label_sets = SchurAlgebra.vmerge(
             full_label_sets, index_complement)
         subfiedler = BuildTreeTopology.laplacian_to_fiedler(L_secondary)
         left_subindices, right_subindices = (
             BuildTreeTopology.eigenvector_to_split(subfiedler))
         # expand the merged indices back to the names of the original tips
         left_subnames = set(
             self.pruned_names[label]
             for i in left_subindices for label in next_label_sets[i])
         right_subnames = set(
             self.pruned_names[label]
             for i in right_subindices for label in next_label_sets[i])
         left_subdomains = self._get_domains(left_subnames)
         right_subdomains = self._get_domains(right_subnames)
         print >> out, 'domains represented by a subsplit:'
         print >> out, 'the left side has:\t', ', '.join(left_subdomains)
         print >> out, 'the right side has:\t', ', '.join(right_subdomains)
         print >> out
     # strip the trailing blank line from the report
     return out.getvalue().strip()
Ejemplo n.º 35
0
 def _do_analysis(self, use_generalized_nj):
     """
     Do some splits of the tree.
     The primary split is taken from the Fiedler vector of the Laplacian
     of the pruned tree.  The bacteria side of that split is then merged
     and the remainder is split again.
     The results are stored in self.first_split_object
     and self.second_split_object.
     @param use_generalized_nj: True if we use an old method of outgrouping
     @raise HandlingError: if bacteria are on both sides or on neither side
     of the first split, or if eukaryota are on both sides of the second split
     """
     # define the distance matrix
     D = np.array(self.pruned_tree.get_distance_matrix(self.pruned_names))
     # get the primary split of the criterion matrix
     L = Euclid.edm_to_laplacian(D)
     v = BuildTreeTopology.laplacian_to_fiedler(L)
     eigensplit = BuildTreeTopology.eigenvector_to_split(v)
     # assert that the first split cleanly separates the bacteria from the rest
     left_indices, right_indices = eigensplit
     left_domains = self._get_domains(
         [self.pruned_names[x] for x in left_indices])
     right_domains = self._get_domains(
         [self.pruned_names[x] for x in right_indices])
     if ('bacteria' in left_domains) and ('bacteria' in right_domains):
         raise HandlingError('bacteria were not defined by the first split')
     # now we have enough info to define the first supplementary csv file
     self.first_split_object = SupplementarySpreadsheetObject(
         self.pruned_names, L, v)
     # define the bacteria indices vs the non-bacteria indices for the second split
     if 'bacteria' in left_domains:
         bacteria_indices = left_indices
         non_bacteria_indices = right_indices
     elif 'bacteria' in right_domains:
         bacteria_indices = right_indices
         non_bacteria_indices = left_indices
     else:
         # without this branch bacteria_indices would be unbound below
         # and the failure would surface as an UnboundLocalError
         raise HandlingError(
             'bacteria were not found on either side of the first split')
     # get the secondary split of interest
     if use_generalized_nj:
         # NOTE(review): presumably the rows of the updated distance matrix
         # are ordered like the merged label sets computed below -- confirm
         D_secondary = BuildTreeTopology.update_generalized_nj(
             D, bacteria_indices)
         L_secondary = Euclid.edm_to_laplacian(D_secondary)
     else:
         L_secondary = SchurAlgebra.mmerge(L, bacteria_indices)
     full_label_sets = [set([i]) for i in range(len(self.pruned_names))]
     next_label_sets = SchurAlgebra.vmerge(full_label_sets,
                                           bacteria_indices)
     v_secondary = BuildTreeTopology.laplacian_to_fiedler(L_secondary)
     eigensplit_secondary = BuildTreeTopology.eigenvector_to_split(
         v_secondary)
     left_subindices, right_subindices = eigensplit_secondary
     # name each row of the secondary matrix:
     # a singleton label set keeps its tip name
     # and any larger label set is named 'all-bacteria'
     pruned_names_secondary = []
     for label_set in next_label_sets:
         if len(label_set) == 1:
             label = list(label_set)[0]
             pruned_names_secondary.append(self.pruned_names[label])
         else:
             pruned_names_secondary.append('all-bacteria')
     # assert that the second split cleanly separates the eukaryota from the rest
     left_subdomains = self._get_domains(
         [pruned_names_secondary[x] for x in left_subindices])
     right_subdomains = self._get_domains(
         [pruned_names_secondary[x] for x in right_subindices])
     if ('eukaryota' in left_subdomains) and ('eukaryota'
                                              in right_subdomains):
         raise HandlingError(
             'eukaryota were not defined by the second split')
     # now we have enough info to define the second supplementary csv file
     self.second_split_object = SupplementarySpreadsheetObject(
         pruned_names_secondary, L_secondary, v_secondary)