forked from argriffing/xgcode
-
Notifications
You must be signed in to change notification settings - Fork 0
/
20090929a.py
166 lines (154 loc) · 6.23 KB
/
20090929a.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
"""
Examine iterative projection onto axes of the leaf Steiner ellipsoid.
Examine steps of a projection onto axes
of the Steiner ellipsoid of the leaves.
Look at each step of the projection onto the
principal axes of the Steiner ellipsoid of the leaves.
Because this really is a projection it can obviously be done in one step,
but I haven't worked this out yet.
"""
from StringIO import StringIO
import random
import time
import argparse
import numpy as np
from SnippetUtil import HandlingError
import SnippetUtil
import Form
import FormOut
import NewickIO
import FelTree
import Euclid
import TreeSampler
class CounterexampleError(Exception):
    """
    An exception type for reporting a counterexample.
    This was previously declared with 'def', which made it a function
    that could be neither raised nor caught.
    """
    pass
def get_form():
    """
    Build the form objects for this snippet.
    @return: a list of form objects
    """
    # the single input is the number of taxa,
    # defaulting to 5 with low=3 and high=20
    ntaxa_field = Form.Integer(
            'ntaxa', 'number of taxa',
            5, low=3, high=20)
    return [ntaxa_field]
def get_form_out():
    """
    @return: a newly constructed FormOut.Report object
    """
    report_format = FormOut.Report()
    return report_format
def process(ntaxa):
    """
    Sample a random tree and report several projections of its vertices.
    The tree topology comes from TreeSampler.sample_agglomerated_tree
    and each branch length is drawn from an exponential distribution.
    Note that this function changes the global numpy print options.
    @param ntaxa: the number of taxa passed to the tree sampler
    @return: the multi-line report with surrounding whitespace stripped
    """
    np.set_printoptions(linewidth=200)
    out = StringIO()
    # sample an xtree topology
    xtree = TreeSampler.sample_agglomerated_tree(ntaxa)
    # sample an xtree with exponentially distributed branch lengths
    # (expovariate takes the rate, so the mean branch length is mu)
    mu = 2.0
    for branch in xtree.get_branches():
        branch.length = random.expovariate(1/mu)
    # convert the xtree to a FelTree so we can use the internal vertices
    tree_string = xtree.get_newick_string()
    tree = NewickIO.parse(tree_string, FelTree.NewickTree)
    # get the ordered ids (leaves first) and the number of leaves
    ordered_ids = get_ordered_ids(tree)
    nleaves = len(list(tree.gen_tips()))
    # get the distance matrix relating all of the points
    D_full = np.array(tree.get_full_distance_matrix(ordered_ids))
    # Now do the projection so that the resulting points are in the subspace
    # whose basis vectors are the axes of the leaf ellipsoid.
    # First get the points such that the n rows in X
    # are points in n-1 dimensional space.
    X = Euclid.edm_to_points(D_full)
    print >> out, 'points with centroid at origin:'
    print >> out, X
    print >> out
    # Translate all of the points
    # so that the origin is at the centroid of the leaves.
    # This in-place update of the whole of X is intentional.
    X -= np.mean(X[:nleaves], 0)
    print >> out, 'points with centroid of leaves at origin:'
    print >> out, X
    print >> out
    # Extract the subset of points that define the leaves.
    L = X[:nleaves]
    # Find the orthogonal transformation of the leaves onto their MDS axes.
    # According to the svd documentation,
    # singular values are sorted most important to least important.
    # Only the right singular vectors are needed here.
    Vt = np.linalg.svd(L)[2]
    # Transform all of the points (including the internal vertices)
    # according to this orthogonal transformation.
    # The axes are now the axes
    # of the Steiner circumscribed ellipsoid of the leaf vertices.
    Z = np.dot(X, Vt.T)
    print >> out, 'orthogonally transformed points (call this Z):'
    print >> out, Z
    print >> out
    # I am using M.T[:k].T to get the first k columns of M.
    Y = Z.T[:(nleaves-1)].T
    print >> out, 'projection of the points onto the axes of the leaf ellipsoid,'
    print >> out, '(these are the first columns of Z; call this projected matrix Y):'
    print >> out, Y
    print >> out
    # Show the inner products.
    inner_products_of_columns = np.dot(Y.T, Y)
    print >> out, "pairwise inner products of the columns of Y (that is, Y'Y)"
    print >> out, inner_products_of_columns
    print >> out
    # Show other inner products.
    # NOTE(review): Y[:5] selects the first five rows of Y
    # whereas the label below mentions columns;
    # possibly Y[:nleaves] was intended -- confirm with the author.
    inner_products_of_columns = np.dot(Y[:5].T, Y[:5])
    print >> out, "pairwise inner products of the first few columns of Y"
    print >> out, inner_products_of_columns
    print >> out
    # Extract the subset of points that define the points of articulation.
    # Note that the origin is the centroid of the leaves.
    R = X[nleaves:]
    Y_leaves = Y[:nleaves]
    W = np.dot(np.linalg.pinv(L), Y_leaves)
    print >> out, 'leaf projection using pseudoinverse (first few rows of Y):'
    print >> out, np.dot(L, W)
    print >> out
    print >> out, 'projection of points of articulation using pseudoinverse (remaining rows of Y):'
    print >> out, np.dot(R, W)
    print >> out
    # Get all of the points in high dimensional space.
    X = Euclid.edm_to_points(D_full)
    # Get the MDS onto the lower dimensional space.
    X = X.T[:(nleaves-1)].T
    assert np.allclose(sum(X, 0), 0)
    print >> out, 'all points projected onto the first principal axes of the full ellipsoid:'
    print >> out, X
    print >> out
    # Look at only the leaves in this space.
    # Center a copy of the leaf rows rather than the slice itself:
    # X[:nleaves] is a view, so an in-place subtraction would also shift
    # the leaf rows of X, and X is reused below for the full point set.
    L = X[:nleaves] - np.mean(X[:nleaves], 0)
    print >> out, 'leaves projected onto the first principal axes of the full ellipsoid and then centered:'
    print >> out, L
    print >> out
    # Re-project the leaves onto the axes of leaf ellipsoid.
    D_leaves = Euclid.dccov_to_edm(np.dot(L, L.T))
    Y = Euclid.edm_to_points(D_leaves)
    print >> out, 'leaves further projected onto principal axes of their own ellipsoid:'
    print >> out, Y
    print >> out
    # Try something else
    D_all = Euclid.dccov_to_edm(np.dot(X, X.T))
    Y = Euclid.edm_to_points(D_all).T[:(nleaves-1)].T
    print >> out, 'all points further projected onto their own principal axes of inertia:'
    print >> out, Y
    print >> out
    # Try the same thing some more
    D_again = Euclid.dccov_to_edm(np.dot(Y, Y.T))
    Z = Euclid.edm_to_points(D_again).T[:(nleaves-1)].T
    print >> out, 'all points further projected onto their own principal axes of inertia (second iteration):'
    print >> out, Z
    print >> out
    return out.getvalue().strip()
def get_response_content(fs):
    """
    @param fs: an object whose ntaxa attribute is passed to process
    @return: the report text followed by a single newline
    """
    report = process(fs.ntaxa)
    return report + '\n'
def get_ordered_ids(tree):
    """
    List the ids of the tree nodes with the tips before the internal nodes.
    A postorder traversal might be usable here instead.
    @param tree: an object providing gen_tips() and gen_internal_nodes()
    @return: a list of id() values beginning with the leaves
    """
    # the tips are enumerated completely before the internal nodes
    tip_ids = [id(tip) for tip in tree.gen_tips()]
    internal_ids = [id(vertex) for vertex in tree.gen_internal_nodes()]
    return tip_ids + internal_ids
def main(args):
    """
    Print the report to stdout.
    @param args: parsed command line arguments with an ntaxa attribute
    """
    report = process(args.ntaxa)
    # a single parenthesized value prints the same text
    # as the bare print statement
    print(report)
if __name__ == '__main__':
    # the module docstring doubles as the command line description
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
            '--ntaxa',
            type=int,
            default=5,
            help='number of taxa in each sampled tree topology')
    args = parser.parse_args()
    main(args)