/
hdda.py
557 lines (491 loc) · 23 KB
/
hdda.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
# -*- coding: utf-8 -*-
import numpy as np
import scipy as sp
from joblib import Parallel, delayed
from scipy import linalg
from scipy.linalg.blas import dsyrk, dsymm
from sklearn.cluster import KMeans
# TODO: Clean the output of predict when out='proba' and add the posterior probabilities
# TODO: Work on ni rather than n when selecting the number of eigenvalues -> requires redefining the check on the values of pi
# TODO: Work on the return values used to report errors
# TODO: Check the prediction part
# TODO: Add "d*sp.log(2*sp.pi)" as a constant of the method, at initialization
# TODO: Check the in-place modification of T
## Numerical precision - Some constants
# NumPy is used directly: the NumPy aliases that used to live in the top-level
# ``scipy`` namespace (``sp.finfo``, ``sp.log``, ...) are deprecated.
EPS = np.finfo(np.float64).eps  # Machine epsilon for float64
MIN = np.finfo(np.float64).min  # Most negative finite float64
MAX = np.finfo(np.float64).max  # Largest finite float64
E_MAX = np.log(MAX)  # Largest exponent that np.exp can evaluate without overflow
## HDDA Class
class HDGMM():
    """
    High-dimensional Gaussian mixture model (HDDA / HDDC).

    This class implements the HDDA models proposed by Charles Bouveyron and
    Stephane Girard. Details about the methods can be found here:
    http://w3.mi.parisdescartes.fr/~cbouveyr/

    The model can be fitted in a supervised way (``fit(x, y)``) or in an
    unsupervised way with an EM algorithm (``fit(x, param=...)``). Class
    labels start at 1.
    """

    def __init__(self, model='M1'):
        """
        Initialize the HDDA structure.

        :param model: the model used. Only 'M1' to 'M8' are available:
            - M1 = aijbiQidi
            - M2 = aijbiQid
            - M3 = aijbQidi
            - M4 = aijbQid
            - M5 = aibiQidi
            - M6 = aibiQid
            - M7 = aibQidi
            - M8 = aibQid
            (M9 = abiQidi, M10 = abiQid, M11 = abQidi and M12 = abQid are
            not implemented.)
        :type model: string
        :raises ValueError: if ``model`` is not one of 'M1' ... 'M8'.
        """
        # Raise instead of calling exit(): a library must not terminate the
        # interpreter of its caller.
        if model not in ('M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8'):
            raise ValueError(
                "Model parameter {} is not available".format(model))
        self.model = model  # Name of the model

        self.ni = []  # Number of samples of each class
        self.prop = []  # Proportion of each class
        self.mean = []  # Mean vector
        self.pi = []  # Signal subspace size
        self.L = []  # Eigenvalues of covariance matrices
        self.Q = []  # Eigenvectors of covariance matrices
        self.trace = []  # Trace of the covariance matrices
        self.a = []  # Eigenvalues of signal subspaces
        self.b = []  # Values of the noise
        self.logdet = []  # Pre-computed logdet of the covariance matrices
        self.q = []  # Number of parameters of the full models
        self.bic = []  # bic values of the model
        self.icl = []  # icl values of the model
        self.niter = None  # Number of iterations
        self.X = []  # Matrices used to project samples when n < d
        self.T = []  # Probabilities of each sample to belong to each class

    def free(self, full=False):
        """
        Reset some parameters of the model.

        With ``full=False`` only the parsimonious part computed by
        ``fit_update`` is reset, so that ``fit_update`` can be run again on
        the same empirical estimates (e.g. during a cross validation).

        :param full: reset only the parsimonious part (False) or also the
            empirical estimates computed by ``fit_init`` (True)
        :type full: bool
        """
        self.pi = []
        self.a = []
        self.b = []
        self.logdet = []
        self.q = []
        if full:
            self.ni = []
            self.prop = []
            self.mean = []
            self.L = []
            self.Q = []
            self.trace = []
            self.X = []

    def fit(self, x, y=None, param=None, yi=None):
        """
        Fit the HDDA model.

        :param x: The sample matrix, of size n \\times d where n is the number of samples and d is the number of variables
        :param y: The vector of corresponding labels (starting at 1), of size n in the supervised case, otherwise None
        :param param: A dictionary of parameters, merged over the defaults
            ``{'th': 0.9, 'init': 'kmeans', 'itermax': 100, 'tol': 0.001,
            'C': 4, 'population': 2, 'random_state': 0}``. The dictionary
            given by the caller is not modified.
        :param yi: Initial labels (starting at 1), only read when ``param['init'] == 'user'``
        :type x: float
        :type y: int
        :type param: dictionary
        :return: None in the supervised case. In the unsupervised case:
            1 on success, -1 if k-means produced a cluster with fewer than
            two samples, -2 if ``param['init']`` is unknown, -3 if the
            population of a class dropped below ``param['population']``.
        :raises ValueError: if ``param['init'] == 'user'`` and ``yi`` is None.
        """
        EM = False
        n = x.shape[0]

        # Merge the user parameters over the defaults in a fresh dict: the
        # original signature used a mutable default that was filled in place.
        settings = {'th': 0.9, 'init': 'kmeans', 'itermax': 100, 'tol': 0.001,
                    'C': 4, 'population': 2, 'random_state': 0}
        if param is not None:
            settings.update(param)
        param = settings

        # Unsupervised case: initialisation of the class membership.
        # "is None" is required: "y == None" is element-wise on arrays.
        if y is None:
            EM, ITER, ITERMAX, TOL, LL = (True, 0, param['itermax'],
                                          param['tol'], [])
            if param['C'] == 1:
                y = np.ones((n, 1))
            else:
                init = param['init']
                if init == 'kmeans':
                    # n_jobs is no longer passed: the argument was removed
                    # from scikit-learn's KMeans.
                    y = KMeans(n_clusters=param['C'], n_init=5,
                               random_state=param['random_state']
                               ).fit_predict(x)
                    # Check for minimal size of cluster
                    nc = np.bincount(y, minlength=param['C'])
                    if np.any(nc < 2):
                        self.LL, self.bic, self.icl, self.niter = (
                            LL, MIN, MIN, (ITER + 1))
                        return -1  # Kmeans failed
                    y = y + 1  # Label starts at one
                elif init == 'random':
                    np.random.seed(param['random_state'])
                    y = np.random.randint(1, high=param['C'] + 1, size=n)
                elif init == 'user':
                    if yi is None:
                        raise ValueError(
                            "yi must be provided when param['init'] is 'user'")
                    if param['C'] != yi.max():
                        print("The number of class does not match is param['C'] and yi")
                    y = yi
                else:
                    print("Initialization should be kmeans or random or user")
                    return -2  # Bad init values

        # Initialization of the parameters. Reset first so that fitting the
        # same instance twice does not accumulate the per-class lists.
        self.free(full=True)
        self.fit_init(x, y)
        self.fit_update(param)

        if EM:  # Unsupervised case, needs iteration
            ll, t = self.loglike(x)
            LL.append(ll)
            while ITER < ITERMAX:
                # E step - Use the precomputed t
                # Check for empty classes
                if np.any(t.sum(axis=0) < param['population']):
                    self.LL, self.T, self.bic, self.icl, self.niter = (
                        LL, t, MIN, MIN, (ITER + 1))
                    return -3  # population empty

                # M step
                self.free(full=True)
                self.fit_init(x, t)
                self.fit_update(param)

                # Compute the log-likelihood and do the E step
                # (t is updated in place by loglike)
                ll = self.loglike(x, T=t)
                LL.append(ll)
                if abs((LL[-1] - LL[-2]) / LL[-2]) < TOL:
                    break
                ITER += 1

            # Store the class membership and some parameters of the optimization
            self.LL = LL
            self.T = t
            self.bic = 2 * LL[-1] - self.q * np.log(n)
            # Add small constant to prevent numerical issues
            self.icl = self.bic + 2 * np.log(t.max(axis=1) + EPS).sum()
            self.niter = ITER + 1
            return 1

    def fit_init(self, x, y):
        """
        Compute the empirical estimators of the mean vector, the covariance
        matrix and the proportion of each class.

        The results are appended to the per-class lists of the instance
        (``ni``, ``prop``, ``mean``, ``L``, ``Q``, ``trace`` and, when
        n < d, ``X``), so these lists must be empty on entry.

        :param x: The sample matrix, of size n \\times d where n is the number of samples and d is the number of variables
        :param y: The vector of labels (starting at 1) of size n in the supervised case, or the n \\times C membership matrix in the unsupervised case
        :type x: float
        :type y: int
        :raises ValueError: if x and y do not have the same number of rows.
        """
        y = np.asarray(y)
        n, d = x.shape  # Number of samples and number of variables
        if n != y.shape[0]:
            raise ValueError("size of x and y should match")
        supervised = (y.ndim == 1)
        C = int(y.max(0)) if supervised else y.shape[1]  # Number of classes

        # Constant term of the Gaussian log-density
        self.cst = d * np.log(2 * np.pi)

        # dsyrk computes the symmetric product X^{t}X (d x d) or, when there
        # are fewer samples than variables, XX^{t}. X is transposed to be in
        # fortran order.
        small_sample = n < d

        # Compute the whole covariance matrix (common dimension models)
        if self.model in ('M2', 'M4', 'M6', 'M8'):
            X = x - np.mean(x, axis=0)
            self.W = dsyrk(1.0 / n, X.T, trans=small_sample)
            X = None

        # Learn the empirical estimates of the model for each class
        for c in range(C):
            if supervised:
                j = np.where(y == (c + 1))[0]
                self.ni.append(j.size)
                self.prop.append(float(self.ni[c]) / n)
                self.mean.append(np.mean(x[j, :], axis=0))
                X = x[j, :] - self.mean[c]
            else:
                # Weighted estimates: column c of y weights every sample
                self.ni.append(y[:, c].sum())
                self.prop.append(float(self.ni[c]) / n)
                self.mean.append(np.average(x, weights=y[:, c], axis=0))
                X = (x - self.mean[c]) * np.sqrt(y[:, c]).reshape(n, 1)

            cov = dsyrk(1.0 / (self.ni[c] - 1), X.T, trans=small_sample)
            if small_sample:
                # Kept to map the eigenvectors back to the variable space
                self.X.append(X)
            X = None

            # Only the upper part of cov is initialized by dsyrk
            L, Q = linalg.eigh(cov, lower=False)
            idx = L.argsort()[::-1]  # Decreasing eigenvalues
            L, Q = L[idx], Q[:, idx]
            L[L < EPS] = EPS  # Check for numerical errors
            self.L.append(L)
            self.Q.append(Q)
            self.trace.append(cov.trace())

    @staticmethod
    def _scree_test(eigvals, th):
        """Return the subspace size selected by the scree test on the
        decreasing eigenvalues ``eigvals`` with threshold ``th``."""
        gaps = np.absolute(np.diff(eigvals))
        gaps /= gaps.max()
        p = 1
        while np.any(gaps[p:] > th):
            p += 1
        return p

    def fit_update(self, param):
        """
        Compute the parsimonious HDDA model from the empirical estimates
        obtained with fit_init.

        :param param: dictionary of parameters; only ``param['th']`` (the
            threshold of the scree test) is read
        :type param: dictionary
        """
        # Get parameters
        C = len(self.ni)
        # In the unsupervised case ni holds sums of memberships (floats):
        # round instead of truncating so that the test "n >= d" below agrees
        # with the one done in fit_init on the true number of samples.
        n = int(round(float(np.sum(self.ni))))
        d = self.mean[0].size
        th = param['th']

        # Estimation of the signal subspace
        if self.model in ('M2', 'M4', 'M6', 'M8'):
            # Common size: intrinsic dimension computed on the whole data set
            L = linalg.eigh(self.W, eigvals_only=True, lower=False)
            L = np.sort(L)[::-1]
            L[L < EPS] = EPS  # Check for numerical errors
            p = self._scree_test(L, th)
            minDim = int(min(min(self.ni), d))
            # Check if (p >= ni-1 or d-1) and p > 0
            if p >= minDim - 1:
                p = max(minDim - 2, 1)
            self.pi = [p] * C
        elif self.model in ('M1', 'M3', 'M5', 'M7'):
            # Specific size: one scree test per class
            sizes = [self._scree_test(sL, th) for sL in self.L]
            # Check if (pi >= ni-1 or d-1) and pi > 0
            self.pi = [sPI if sPI < int(min(sNI, d) - 1)
                       else max(int(min(sNI, d) - 2), 1)
                       for sPI, sNI in zip(sizes, self.ni)]

        # Estimation of the signal part
        self.a = [sL[:sPI] for sL, sPI in zip(self.L, self.pi)]
        if self.model in ('M5', 'M6', 'M7', 'M8'):
            # One common value per class
            self.a = [np.repeat(sA.mean(), sA.size) for sA in self.a]

        # Estimation of the noise term
        if self.model in ('M1', 'M2', 'M5', 'M6'):  # One noise value per class
            noise = [(sT - sA.sum()) / (d - sPI)
                     for sT, sA, sPI in zip(self.trace, self.a, self.pi)]
            # Check for very small value of b
            self.b = [sB if sB > EPS else EPS for sB in noise]
        elif self.model in ('M3', 'M4', 'M7', 'M8'):  # Noise common to all classes
            denom = d - np.sum([sPR * sPI
                                for sPR, sPI in zip(self.prop, self.pi)])
            num = np.sum([sPR * (sT - sA.sum())
                          for sPR, sT, sA
                          in zip(self.prop, self.trace, self.a)])
            # Check for very small values
            if num < EPS:
                common = EPS
            elif denom < EPS:
                common = 1 / EPS
            else:
                common = num / denom
            self.b = [common] * C

        # Precompute logdet
        self.logdet = [np.log(sA).sum() + (d - sPI) * np.log(sB)
                       for sA, sPI, sB in zip(self.a, self.pi, self.b)]

        # Update the Q matrices
        if n >= d:
            self.Q = [sQ[:, :sPI] for sQ, sPI in zip(self.Q, self.pi)]
        else:
            # Q holds the eigenvectors v of XX^{t}/(ni-1). X^{t}v is an
            # eigenvector of X^{t}X/(ni-1) and its squared norm is
            # v^{t}XX^{t}v = (ni-1)*L, so both factors are needed to get
            # unit-norm columns (the projection in predict relies on it).
            self.Q = [np.dot(sX.T, sQ[:, :sPI])
                      / np.sqrt((sNI - 1) * sL[:sPI])
                      for sX, sQ, sPI, sL, sNI
                      in zip(self.X, self.Q, self.pi, self.L, self.ni)]

        # Compute the number of parameters of the model. "2.0" forces a true
        # division on every Python version.
        self.q = C * d + (C - 1) + sum(
            [sPI * (d - (sPI + 1) / 2.0) for sPI in self.pi])
        if self.model in ('M1', 'M3', 'M5', 'M7'):  # Specific subspace sizes
            self.q += C
        elif self.model in ('M2', 'M4', 'M6', 'M8'):  # Common subspace size
            self.q += 1
        if self.model in ('M1', 'M2'):  # Signal and noise values
            self.q += sum(self.pi) + C
        elif self.model in ('M3', 'M4'):
            self.q += sum(self.pi) + 1
        elif self.model in ('M5', 'M6'):
            self.q += 2 * C
        elif self.model in ('M7', 'M8'):
            self.q += C + 1

    def predict(self, xt, out=None):
        """
        Compute the decision of the fitted HD model.

        :param xt: The matrix of testing samples, of size nt \\times d
        :param out: Selects what is returned:
            - None: the predicted labels (starting at 1);
            - 'proba': the tuple (labels, L) where L is -0.5 times the
              decision function with the class prior term removed;
            - 'ki': the nt \\times C decision function (the lower, the better);
            - 'post': the nt \\times C posterior probabilities.
        :type xt: float
        :type out: string
        :return yp: The predicted labels, or the quantities described above.
        :raises ValueError: if ``out`` is not one of the values above.
        """
        nt = xt.shape[0]
        C = len(self.a)
        K = np.empty((nt, C))

        # Decision function of each class
        for c in range(C):
            Q = self.Q[c]
            # Remove the mean
            xtc = xt - self.mean[c]
            # Projection on the signal subspace. Multiplying by Q first
            # avoids building the d x d matrix Q Q^{t}.
            Px = np.dot(np.dot(xtc, Q), Q.T)
            temp = np.dot(Px, Q / np.sqrt(self.a[c]))
            K[:, c] = (self.logdet[c] - 2 * np.log(self.prop[c]) + self.cst
                       + np.sum(temp ** 2, axis=1)
                       + np.sum((xtc - Px) ** 2, axis=1) / self.b[c])

        if out is None:
            # Assign the label to the minimum value of K
            return np.argmin(K, 1) + 1
        if out == 'proba':
            # The labels must be computed before the prior is removed
            # (the original code returned an undefined name here).
            yp = np.argmin(K, 1) + 1
            for c in range(C):
                K[:, c] += 2 * np.log(self.prop[c])
            K *= -0.5
            return yp, K
        if out == 'ki':
            return K
        if out == 'post':
            K *= -0.5
            # Shift by the row maximum before exponentiating: clamping the
            # exponents instead makes all classes equal when every class is
            # far from the sample.
            K -= K.max(axis=1).reshape(nt, 1)
            np.exp(K, out=K)
            K /= K.sum(axis=1).reshape(nt, 1)
            return K
        raise ValueError(
            "out should be None, 'proba', 'ki' or 'post', got {!r}".format(out))

    # NOTE: a commented-out draft of a cross-validation method (CV), relying
    # on names that are not imported in this module, was removed from here.

    def loglike(self, x, T=None):
        """
        Compute the log-likelihood given a set of samples.

        :param x: The sample matrix, of size n \\times d where n is the number of samples and d is the number of variables
        :param T: Optional n \\times C array that is overwritten in place with the posterior probabilities
        :return: ``(LL, T)`` when T is None (a new array is allocated for the posteriors), otherwise only ``LL``
        """
        allocate = T is None
        n = x.shape[0]

        # Compute the membership function
        K = self.predict(x, out='ki')
        K *= (-0.5)

        # Compute the log-likelihood with the logsumexp trick
        Km = K.max(axis=1).reshape(n, 1)
        LL = (np.log(np.exp(K - Km).sum(axis=1)).reshape(n, 1) + Km).sum()

        # Compute the posterior
        if allocate:
            T = np.empty_like(K)
        # An overflow yields inf, hence a posterior equal to 0
        with np.errstate(over='ignore'):
            for i in range(K.shape[1]):
                T[:, i] = 1 / np.exp(K - K[:, i][:, np.newaxis]).sum(axis=1)

        if allocate:
            return LL, T
        return LL

    # NOTE: a commented-out draft of a "posterior" method was removed from
    # here; predict(..., out='post') returns the posterior probabilities.

    def fit_all(self, x, MODEL=['M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8'], th=[0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.3], C=[1, 2, 3, 4, 5, 6, 7, 8], VERBOSE=False, random_state=0, criteria='icl'):
        """
        Fit all the models given the thresholds th and the numbers of
        classes C, then refit this instance with the best combination in
        terms of BIC or ICL (the higher, the better).

        :param x: The sample matrix, of size n \\times d
        :param MODEL: names of the models to try (not modified)
        :param th: thresholds of the scree test to try (not modified)
        :param C: numbers of classes to try (not modified)
        :param VERBOSE: print the best setting of each model
        :param random_state: seed given to k-means
        :param criteria: 'bic' or 'icl'
        :raises ValueError: if ``criteria`` is neither 'bic' nor 'icl'.
        """
        if criteria not in ('bic', 'icl'):
            raise ValueError(
                "criteria should be 'bic' or 'icl', got {!r}".format(criteria))

        CRIT = np.zeros((len(MODEL), len(C), len(th)))
        param = {'init': 'user', 'tol': 0.00001, 'random_state': random_state}
        for i, c_ in enumerate(C):
            param['C'] = c_
            # Kmeans initialization, shared by all the models
            yi = KMeans(n_clusters=c_, n_init=10,
                        random_state=random_state).fit_predict(x)
            # Check for minimal size of cluster
            nc = np.bincount(yi, minlength=c_)
            if np.any(nc < 2):
                CRIT[:, i, :] = MIN
                continue
            yi = yi + 1  # Label starts at one
            for m, model_ in enumerate(MODEL):  # Loop over the models
                for j, th_ in enumerate(th):  # Loop over the threshold
                    param['th'] = th_
                    model = HDGMM(model=model_)
                    model.fit(x, param=param, yi=yi)
                    CRIT[m, i, j] = (model.bic if criteria == 'bic'
                                     else model.icl)

        # A NaN criterion would hide the maximum: treat it as a failure
        CRIT[np.isnan(CRIT)] = MIN

        if VERBOSE:
            print("Models \t C \t th \t {0} ".format(criteria))
            for m in range(len(MODEL)):
                ic, it = np.unravel_index(np.argmax(CRIT[m]), CRIT[m].shape)
                print(MODEL[m] + " \t " + str(C[ic]) + " \t " + str(th[it])
                      + " \t " + str(CRIT[m, :, :].max()))
        # First occurrence of the maximum
        bm, bc, bt = np.unravel_index(np.argmax(CRIT), CRIT.shape)
        if VERBOSE:
            print("\nBest model is {}".format(MODEL[bm]))

        # Refit this instance with the best setting
        param['init'] = 'kmeans'
        param['C'] = C[bc]
        param['th'] = th[bt]
        self.model = MODEL[bm]
        self.fit(x, param=param)

    def fit_best_init(self, x, C, n_init=10, M="M2", th=0.1, criteria='icl', n_jobs=-1):
        """
        Fit the given model, given the number of clusters and the parameter
        th, for n_init k-means initializations (random_state = 0 ...
        n_init-1), then refit this instance with the initialization that
        gives the best BIC or ICL (the higher, the better).

        :param x: input data
        :param C: number of clusters
        :param n_init: number of initializations (default 10)
        :param M: model name (default M2)
        :param th: threshold parameter (default 0.1)
        :param criteria: model selection criteria, 'bic' or 'icl' (default: icl)
        :param n_jobs: number of jobs to run in parallel (default -1)
        :type n_jobs: int
        :raises ValueError: if ``criteria`` is neither 'bic' nor 'icl'.
        """
        if criteria not in ('bic', 'icl'):
            raise ValueError(
                "criteria should be 'bic' or 'icl', got {!r}".format(criteria))

        param = {'init': 'kmeans', 'tol': 0.00001, 'C': C, 'th': th}
        CRIT = Parallel(n_jobs=n_jobs, verbose=False)(
            delayed(worker_init)(i, x, M, param, criteria)
            for i in range(n_init))
        # Both criteria are maximized: bic is 2*LL - q*log(n) and failed
        # runs are set to MIN, so argmin would select a failed run.
        best = int(np.argmax(CRIT))

        # Refit this instance with the best initialization, using the model
        # that was evaluated by the workers.
        param['init'] = 'kmeans'
        param['random_state'] = best
        self.model = M
        self.fit(x, param=param)
def worker_init(i, x, M, param, criteria):
    """Fit one model with the initialization number ``i`` and return its
    model selection criterion. Meant to be run in parallel for several
    values of ``i``.

    Input:
    -i (int): initialization, used as 'random_state'
    -x: input data
    -M: model name
    -param (dict): parameters of the model (not modified)
    -criteria: model selection criteria ('bic' or 'icl')
    Return:
    - crit (float): value of the model selection criteria
    Raise:
    - ValueError: if criteria is neither 'bic' nor 'icl'
    """
    # Fail before the (costly) fit rather than after it
    if criteria not in ('bic', 'icl'):
        raise ValueError(
            "criteria should be 'bic' or 'icl', got {!r}".format(criteria))
    # Work on a copy: the same dict is handed to every worker, so writing
    # 'random_state' into it would leak from one run to the others when the
    # workers share memory.
    local_param = dict(param)
    local_param['random_state'] = i
    model = HDGMM(model=M)
    model.fit(x, param=local_param)
    return model.bic if criteria == 'bic' else model.icl