#!/usr/bin/env python2
"""TSTE: t-Distributed Stochastic Triplet Embedding
X = tste(triplets, no_dims=2, lambda=0, alpha=no_dims-1, use_log=True, verbose=True)
where 'triplets' is an integer array with shape (k, 3) containing the
known triplet relationships between points. Each point is refered to
by its index: consider a set of points and some (possibly nonmetric)
distance function d.
For each t in triplets' rows,
assume d(point[t[0]], point[t[1]])
< d(point[t[0]], point[t[2]])
The function implements t-distributed stochastic triplet embedding
(t-STE) based on the specified triplets, to construct an embedding
with no_dims dimensions. The parameter lambda specifies the amount of
L2- regularization (default = 0), whereas alpha sets the number of
degrees of freedom of the Student-t distribution (default = 1). The
variable use_log determines whether the sum of the log-probabilities
or the sum of the probabilities is maximized (default = true).
Note: This function directly learns the embedding.
Original MATLAB implementation: (C) Laurens van der Maaten, 2012, Delft University of Technology
Python and Theano port: (C) Michael Wilber, 2013, UC San Diego
"""
raise ValueError("This gradient calculation is incorrect for dimensionality >2! Please switch to https://github.com/gcr/cython_tste, which is nicer anyway")
USING_THEANO = True
import numpy as np
try:
    import theano as t
    import theano.tensor as T
except ImportError:
    USING_THEANO = False
    import warnings
    warnings.warn("Please go install Theano for a ~2-3x speedup. "
                  "Until then, I'm falling back to the slower pure Python "
                  "implementation.", UserWarning)
def tste(triplets, no_dims=2, lamb=0, alpha=None, use_log=True, verbose=True,
         max_iter=1000, save_each_iteration=False, initial_X=None,
         static_points=np.array([]), normalize_gradient=False,
         ignore_zeroindexed_error=True):
    """Learn the triplet embedding for the given triplets.

    Returns an array with shape (max(triplets)+1, no_dims). The i-th
    row in this array corresponds to the no_dims-dimensional
    coordinate of the point.
    """
    if alpha is None:
        alpha = no_dims - 1

    N = np.max(triplets) + 1
    assert -1 not in triplets

    # A warning to Matlab users:
    if not ignore_zeroindexed_error:
        assert 0 in triplets, "Triplets should be 0-indexed, not 1-indexed!"
        # Technically, this is allowed I guessss... if your triplets don't
        # refer to some points you need... Just don't say I didn't warn
        # you. Remove this assertion at your own peril!
    n_triplets = len(triplets)

    # Initialize some variables
    if initial_X is None:
        X = np.random.randn(N, no_dims) * 0.0001
    else:
        X = initial_X
    C = np.Inf
    tol = 1e-7           # convergence tolerance
    eta = 2.             # learning rate
    best_C = np.Inf      # best error obtained so far
    best_X = X           # best embedding found so far
    iteration_Xs = []    # for debugging ;) *shhhh*

    # Perform main iterations
    iter = 0
    no_incr = 0
    while iter < max_iter and no_incr < 5:
        old_C = C
        # Calculate gradient descent and cost
        if USING_THEANO:
            if use_log:
                C, G = tste_grad_theano_log(X, N, no_dims, triplets, lamb, alpha)
            else:
                C, G = tste_grad_theano(X, N, no_dims, triplets, lamb, alpha)
        else:
            C, G = tste_grad_python(X, N, no_dims, triplets, lamb, alpha, use_log)

        if C < best_C:
            best_C = C
            best_X = X

        # Perform gradient update
        if save_each_iteration:
            iteration_Xs.append(X.copy())

        # (NEW:) Optionally normalize each point's gradient by the
        # number of times that the point appears in the triplets
        if normalize_gradient:
            prior = np.bincount(triplets.ravel(), minlength=N)
            # if prior[i] == 0, the point has no gradient anyway
            prior[prior == 0] = 1
            G = G / (prior / np.linalg.norm(prior))[:, np.newaxis]

        X = X - (float(eta) / n_triplets * N) * G
        if len(static_points):
            X[static_points] = initial_X[static_points]

        # Update learning rate
        if old_C > C + tol:
            no_incr = 0
            eta *= 1.01
        else:
            no_incr = no_incr + 1
            eta *= 0.5

        # Print out progress
        iter += 1
        if verbose and iter % 10 == 0:
            sum_X = np.sum(X**2, axis=1)
            # D is the matrix of squared Euclidean distances between points
            D = -2 * (X.dot(X.T)) + sum_X[np.newaxis, :] + sum_X[:, np.newaxis]
            no_viol = np.sum(D[triplets[:, 0], triplets[:, 1]] >
                             D[triplets[:, 0], triplets[:, 2]])
            print "Iteration %d: error is %f, fraction of violated triplets: %f" % (
                iter, C, float(no_viol) / n_triplets)
    if save_each_iteration:
        return iteration_Xs
    return best_X
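# A small helper sketch (an addition, not part of the original port): it
# reports the fraction of triplets an embedding violates, mirroring the
# diagnostic printed every tenth iteration above. The name
# triplet_violations is illustrative.
def triplet_violations(X, triplets):
    """Return the fraction of triplets (a, b, c) with d(a, b) > d(a, c)."""
    sum_X = np.sum(X**2, axis=1)
    # Matrix of squared Euclidean distances between embedded points
    D = -2 * (X.dot(X.T)) + sum_X[np.newaxis, :] + sum_X[:, np.newaxis]
    no_viol = np.sum(D[triplets[:, 0], triplets[:, 1]] >
                     D[triplets[:, 0], triplets[:, 2]])
    return float(no_viol) / len(triplets)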
def tste_grad_python(X, N, no_dims, triplets, lamb, alpha, use_log):
    """Computes the cost function and corresponding gradient of t-STE. A
    purely Python implementation."""
    triplets_A = triplets[:, 0].copy()
    triplets_B = triplets[:, 1].copy()
    triplets_C = triplets[:, 2].copy()

    # Compute Student-t kernel
    sum_X = np.sum(X**2, axis=1)
    a = -2 * (X.dot(X.T))
    # b is the matrix of squared Euclidean distances between points
    b = a + sum_X[np.newaxis, :] + sum_X[:, np.newaxis]
    # Note the float division: under Python 2, (alpha+1)/-2 with an
    # integer alpha silently truncates whenever alpha+1 is odd.
    K = (1 + b / alpha) ** ((alpha + 1.0) / -2)
    # The gradient below needs the factor (1 + d^2/alpha)^-1 for each
    # pair, which coincides with K only when alpha == 1 (no_dims == 2).
    # The original port used K here, which is the "incorrect for
    # dimensionality > 2" bug flagged at the top of this file.
    Q = (1 + b / alpha) ** -1

    # Compute value of cost function
    P = K[triplets_A, triplets_B] / (
        K[triplets_A, triplets_B] +
        K[triplets_A, triplets_C])
    if use_log:
        # Pp = P.copy()
        # Pp[P <= 0] = np.finfo(np.float64).tiny
        C = -np.sum(np.log(P)) + lamb * np.sum(X**2)
    else:
        C = -np.sum(P) + lamb * np.sum(X**2)

    # Compute gradient for each point
    const = (alpha + 1.0) / alpha
    dC = np.zeros((N, no_dims))
    for i in xrange(no_dims):
        # For i = each dimension to use
        if use_log:
            A_to_B = (1 - P) * Q[triplets_A, triplets_B] * (X[triplets_A, i] - X[triplets_B, i])
            A_to_C = (1 - P) * Q[triplets_A, triplets_C] * (X[triplets_A, i] - X[triplets_C, i])
        else:
            A_to_B = P * (1 - P) * Q[triplets_A, triplets_B] * (X[triplets_A, i] - X[triplets_B, i])
            A_to_C = P * (1 - P) * Q[triplets_A, triplets_C] * (X[triplets_A, i] - X[triplets_C, i])
        this_dim = -const * np.array([A_to_B - A_to_C,
                                      -A_to_B,
                                      A_to_C]).T
        # Group up each point into our gradient
        # for n_triplet in xrange(N):
        #     dC[n_triplet, i] = np.sum(this_dim*(triplets == n_triplet))
        # Doing it this way avoids python for loops.
        dC[:, i] = np.bincount(triplets.ravel(),
                               minlength=N,
                               weights=this_dim.ravel())
    dC = -dC + 2 * lamb * X
    return C, dC
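# A minimal gradient-check sketch (an addition, not part of the original
# port): compares the analytic gradient from tste_grad_python against
# central finite differences on a tiny random problem, which is how the
# dimensionality bug mentioned above can be reproduced or ruled out. The
# name check_gradient and all parameter values are illustrative.
def check_gradient(no_dims=3, N=6, n_triplets=20, eps=1e-6, seed=0):
    rng = np.random.RandomState(seed)
    triplets = rng.randint(0, N, (n_triplets, 3))
    X = rng.randn(N, no_dims) * 0.1
    alpha = no_dims - 1
    _, G = tste_grad_python(X, N, no_dims, triplets, 0.0, alpha, True)
    G_num = np.zeros_like(X)
    for i in xrange(N):
        for d in xrange(no_dims):
            X_plus = X.copy()
            X_plus[i, d] += eps
            X_minus = X.copy()
            X_minus[i, d] -= eps
            C_plus, _ = tste_grad_python(X_plus, N, no_dims, triplets, 0.0, alpha, True)
            C_minus, _ = tste_grad_python(X_minus, N, no_dims, triplets, 0.0, alpha, True)
            G_num[i, d] = (C_plus - C_minus) / (2 * eps)
    # Largest absolute disagreement; should be ~0 for a correct gradient
    return np.max(np.abs(G - G_num))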
# Theano-flavored tste_grad
if USING_THEANO:
    def make_theano_evaluator(use_log):
        """This returns a function(!) that calculates the gradient and cost. Heh."""
        X = T.dmatrix('X')
        triplets = T.imatrix('triplets')
        alpha = T.dscalar('alpha')
        lamb = T.dscalar('lambda')
        no_dims = T.iscalar('no_dims')
        N = T.iscalar('N')
        triplets_A = triplets[:, 0]
        triplets_B = triplets[:, 1]
        triplets_C = triplets[:, 2]

        # Compute Student-t kernel. Look familiar?
        sum_X = T.sum(X**2, axis=1)
        a = -2 * (X.dot(X.T))
        b = a + sum_X[np.newaxis, :] + sum_X[:, np.newaxis]
        K = (1 + b / alpha) ** ((alpha + 1) / -2)
        # As in the pure Python version, the gradient needs the factor
        # (1 + d^2/alpha)^-1, not K itself (they agree only for alpha == 1).
        Q = (1 + b / alpha) ** -1

        # Compute value of cost function
        P = K[triplets_A, triplets_B] / (
            K[triplets_A, triplets_B] +
            K[triplets_A, triplets_C])
        if use_log:
            C = -T.sum(T.log(P)) + lamb * T.sum(X**2)
        else:
            C = -T.sum(P) + lamb * T.sum(X**2)

        # Compute gradient, for each dimension
        const = (alpha + 1) / alpha
        def each_dim(dim):
            if use_log:
                A_to_B = (1 - P) * Q[triplets_A, triplets_B] * (X[triplets_A][:, dim] - X[triplets_B][:, dim])
                A_to_C = (1 - P) * Q[triplets_A, triplets_C] * (X[triplets_A][:, dim] - X[triplets_C][:, dim])
            else:
                A_to_B = P * (1 - P) * Q[triplets_A, triplets_B] * (X[triplets_A][:, dim] - X[triplets_B][:, dim])
                A_to_C = P * (1 - P) * Q[triplets_A, triplets_C] * (X[triplets_A][:, dim] - X[triplets_C][:, dim])
            this_dim = (-const * T.stack(A_to_B - A_to_C, -A_to_B, A_to_C)).T
            dC = T.extra_ops.bincount(triplets.ravel(),
                                      weights=this_dim.ravel(),
                                      # minlength=N
                                      )
            return -dC + 2 * lamb * X[:, dim]

        # loop across all dimensions... theano loops are weird, yes...
        all_dims = (t.scan(each_dim,
                           # non_sequences=N,
                           sequences=T.arange(no_dims))
                    )[0].T
        return t.function([X, N, no_dims, triplets, lamb, alpha],
                          [C, all_dims],
                          on_unused_input='ignore')

    tste_grad_theano_log = make_theano_evaluator(use_log=True)
    tste_grad_theano = make_theano_evaluator(use_log=False)
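# A hypothetical consistency check (not in the original): when Theano is
# available, the compiled evaluators and the pure Python fallback should
# agree on both cost and gradient for the same inputs, e.g.
#
#     C1, G1 = tste_grad_theano_log(X, N, no_dims,
#                                   triplets.astype('int32'), 0.0, 1.0)
#     C2, G2 = tste_grad_python(X, N, no_dims, triplets, 0.0, 1.0, True)
#     assert np.allclose(C1, C2) and np.allclose(G1, G2)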