def computeCovar(bed, shrinkMethod, fitIndividuals):
    eigen = dict([])
    if (shrinkMethod in ['lw', 'oas', 'l1', 'cv']):
        import sklearn.covariance as cov
        import sklearn.grid_search  # needed by the 'cv' branch below
        t0 = time.time()
        print 'Estimating shrunk covariance using', shrinkMethod, 'estimator...'
        if (shrinkMethod == 'lw'):
            covEstimator = cov.LedoitWolf(assume_centered=True, block_size=5*bed.val.shape[0])
        elif (shrinkMethod == 'oas'):
            covEstimator = cov.OAS(assume_centered=True)
        elif (shrinkMethod == 'l1'):
            covEstimator = cov.GraphLassoCV(assume_centered=True, verbose=True)
        elif (shrinkMethod == 'cv'):
            shrunkEstimator = cov.ShrunkCovariance(assume_centered=True)
            param_grid = {'shrinkage': [0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 0.99]}
            covEstimator = sklearn.grid_search.GridSearchCV(shrunkEstimator, param_grid)
        else:
            raise Exception('unknown covariance regularizer')

        covEstimator.fit(bed.val[fitIndividuals, :].T)
        if (shrinkMethod == 'l1'):
            alpha = covEstimator.alpha_
            print 'l1 alpha chosen:', alpha
            covEstimator2 = cov.GraphLasso(alpha=alpha, assume_centered=True, verbose=True)
        else:
            if (shrinkMethod == 'cv'):
                # bug fix: the original referenced an undefined name 'clf' here
                shrinkEstimator = covEstimator.best_params_['shrinkage']
            else:
                shrinkEstimator = covEstimator.shrinkage_
            print 'shrinkage estimator:', shrinkEstimator
            covEstimator2 = cov.ShrunkCovariance(shrinkage=shrinkEstimator, assume_centered=True)
        covEstimator2.fit(bed.val.T)
        XXT = covEstimator2.covariance_ * bed.val.shape[1]
        print 'Done in %0.2f'%(time.time()-t0), 'seconds'
    else:
        print 'Computing kinship matrix...'
        t0 = time.time()
        XXT = symmetrize(blas.dsyrk(1.0, bed.val, lower=1))
        print 'Done in %0.2f'%(time.time()-t0), 'seconds'
        try:
            shrinkParam = float(shrinkMethod)
        except:
            shrinkParam = -1
        if (shrinkMethod == 'mylw'):
            XXT_fit = XXT[np.ix_(fitIndividuals, fitIndividuals)]
            sE2R = (np.sum(XXT_fit**2) - np.sum(np.diag(XXT_fit)**2)) / (bed.val.shape[1]**2)
            # temp = (bed.val**2).dot((bed.val.T)**2)
            temp = symmetrize(blas.dsyrk(1.0, bed.val[fitIndividuals, :]**2, lower=1))
            sER2 = (temp.sum() - np.diag(temp).sum()) / bed.val.shape[1]
            shrinkParam = (sER2 - sE2R) / (sE2R * (bed.val.shape[1]-1))
        if (shrinkParam > 0):
            print 'shrinkage estimator:', 1-shrinkParam
            XXT = (1-shrinkParam)*XXT + bed.val.shape[1]*shrinkParam*np.eye(XXT.shape[0])
    return XXT

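# --- Added sketch (not from the original source): a minimal check that the
# dsyrk + symmetrize pattern above reproduces the naive kinship X @ X.T.
# 'symmetrize' here is a hypothetical stand-in for the helper these snippets
# import; it mirrors the filled lower triangle into the upper one.
import numpy as np
from scipy.linalg import blas

def symmetrize(a):
    # dsyrk left the upper triangle at zero, so mirroring is a simple add
    return a + a.T - np.diag(a.diagonal())

X = np.random.randn(50, 200)                   # 50 individuals x 200 SNPs
XXT = symmetrize(blas.dsyrk(1.0, X, lower=1))
assert np.allclose(XXT, X.dot(X.T))
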
def __init__(self, X):
    Kernel.__init__(self)
    self.X_scaled = X / X.shape[1]
    if (X.shape[1] >= X.shape[0]):
        self.XXT = gpUtils.symmetrize(blas.dsyrk(1.0/X.shape[1], X, lower=0))
    else:
        self.XXT = None
    self.X = X

def run_dsyrk(N, l):
    A = randn(N, N).astype('float64', order='F')
    C = zeros((N, N), dtype='float64', order='F')
    start = time.time()
    for i in range(0, l):
        blas.dsyrk(1.0, A, c=C, overwrite_c=True)
    end = time.time()
    timediff = (end - start)
    mflops = (N * N * N) * l / timediff
    mflops *= 1e-6
    size = "%dx%d" % (N, N)
    print("%14s :\t%20f MFlops\t%20f sec" % (size, mflops, timediff))

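# --- Added usage note (not from the original source; assumes the imports
# the benchmark above relies on). dsyrk fills only one triangle, so it does
# about N*(N+1)/2 length-N dot products, roughly N**3 flops -- the count the
# benchmark uses; a full A @ A.T via dgemm would be ~2*N**3 flops.
import time
from numpy import zeros
from numpy.random import randn
from scipy.linalg import blas

run_dsyrk(1000, 3)   # prints e.g. "1000x1000 : ... MFlops ... sec"
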
def tdot_blas(mat, out=None):
    """returns np.dot(mat, mat.T), but faster for large 2D arrays of doubles."""
    if (mat.dtype != 'float64') or (len(mat.shape) != 2):
        return np.dot(mat, mat.T)
    nn = mat.shape[0]
    if out is None:
        out = np.zeros((nn, nn))
    else:
        assert (out.dtype == 'float64')
        assert (out.shape == (nn, nn))
        # FIXME: should allow non-contiguous out, and copy output into it:
        assert (8 in out.strides)
        # zeroing needed because of dumb way I copy across triangular answer
        out[:] = 0.0

    # Call to DSYRK from BLAS
    mat = np.asfortranarray(mat)
    out = blas.dsyrk(alpha=1.0, a=mat, beta=0.0, c=out, overwrite_c=1,
                     trans=0, lower=0)
    symmetrify(out, upper=True)
    return np.ascontiguousarray(out)

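# --- Added verification sketch (not from the original source) ---
# 'symmetrify' is the in-place helper tdot_blas expects, which copies the
# filled triangle across the diagonal; a NumPy stand-in is defined here so
# the check below actually runs.
import numpy as np
from scipy.linalg import blas

def symmetrify(a, upper=False):
    i, j = np.triu_indices_from(a, k=1) if upper else np.tril_indices_from(a, k=1)
    a[j, i] = a[i, j]   # mirror the filled triangle into the empty one

mat = np.random.randn(300, 40)
assert np.allclose(tdot_blas(mat), np.dot(mat, mat.T))
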
def syrk(A, B, MKLProc):
    from scipy.linalg.blas import dsyrk
    import os
    # Note: MKL typically reads this variable when its threading layer
    # initializes, so setting it this late may have no effect.
    os.environ['MKL_NUM_THREADS'] = str(MKLProc)
    alpha = -1.0
    beta = 1.0
    # Symmetric rank-k update: the lower triangle of B becomes B - A @ A.T
    B = dsyrk(alpha, A, c=B, beta=beta, lower=True)
    return B

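# --- Added usage sketch (not from the original source) ---
import numpy as np

A = np.random.randn(4, 2)
B = np.eye(4)
out = syrk(A, B, MKLProc=1)
ref = np.eye(4) - A.dot(A.T)
# Only the lower triangle is written; the upper one keeps B's old values
assert np.allclose(np.tril(out), np.tril(ref))
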
def create_standard_kinship(self):
    """
    compute kinship matrix ( X * X.transpose() ) / (number of sites)
    returns matrix of dimensions nXn
    """
    logging.info("Creating a standard kinship...")
    X = self.data
    if not self.standardize:
        X = tools.standardize(X, axis=0)
    # compute kinship matrix ( X * X.transpose() ) / (number of sites)
    return tools.symmetrize(blas.dsyrk(1.0, X, lower=1)) / X.shape[1]

def _XXT(XT):
    """
    [Added 30/9/2018]
    Computes X @ XT much faster than naive X @ XT. Notice X @ XT is
    symmetric, hence instead of doing the full matrix multiplication
    X @ XT, which takes O(pn^2) time, compute only the upper triangular
    part, which takes slightly less time and memory.
    """
    if XT.dtype == float64:
        return dsyrk(1, XT, trans=1).T
    return ssyrk(1, XT, trans=1).T

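# --- Added verification sketch (not from the original source) ---
# With trans=1, dsyrk forms XT.T @ XT, i.e. X @ X.T when XT = X.T; the
# trailing .T above moves the filled triangle from upper to lower.
import numpy as np
from numpy import float64
from scipy.linalg.blas import dsyrk, ssyrk

X = np.random.randn(30, 100)
G = _XXT(X.T)
assert np.allclose(np.tril(G), np.tril(X.dot(X.T)))
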
def findRelated(bed, cutoff):
    print 'Computing kinship matrix...'
    t0 = time.time()
    XXT = symmetrize(blas.dsyrk(1.0, bed.val, lower=1) / bed.val.shape[1])
    print 'Done in %0.2f'%(time.time()-t0), 'seconds'

    # Find related individuals
    removeSet = set(np.sort(vc.VertexCut().work(XXT, cutoff)))  # indexes of the IIDs to remove
    print 'Marking', len(removeSet), 'individuals to be removed due to high relatedness'

    # keepArr = np.array([(1 if iid in keepSet else 0) for iid in bed.iid], dtype=bool)
    keepArr = np.ones(bed.iid.shape[0], dtype=bool)
    for i in removeSet:
        keepArr[i] = False
    return keepArr

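# --- Added sketch (not from the original source): vc.VertexCut appears to
# pick individuals to drop so that no remaining pair has kinship above the
# cutoff; a naive greedy stand-in with the same contract:
import numpy as np

def greedy_vertex_cut(K, cutoff):
    A = np.abs(K) > cutoff            # adjacency of "too related" pairs
    np.fill_diagonal(A, False)
    removed = []
    while A.any():
        i = int(A.sum(axis=0).argmax())   # drop the most-connected individual
        removed.append(i)
        A[i, :] = False
        A[:, i] = False
    return removed
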
def eigenDecompose(bed, outFile=None):
    bed = leapUtils._fixupBed(bed)

    # Compute kinship matrix
    t0 = time.time()
    print 'Computing kinship matrix...'
    XXT = leapUtils.symmetrize(blas.dsyrk(1.0, bed.val, lower=1)) / bed.val.shape[1]
    print 'Done in %0.2f'%(time.time()-t0), 'seconds'

    # Compute eigendecomposition
    S, U = leapUtils.eigenDecompose(XXT)
    if (outFile is not None):
        np.savez_compressed(outFile, arr_0=U, arr_1=S, XXT=XXT)

    eigen = dict([])
    eigen['XXT'] = XXT
    eigen['arr_0'] = U
    eigen['arr_1'] = S
    return eigen

def eigenDecompose(bed, kinshipFile=None, outFile=None, ignore_neig=False):
    if (kinshipFile is None):
        # Compute kinship matrix
        bed = leapUtils._fixupBed(bed)
        t0 = time.time()
        print('Computing kinship matrix...')
        XXT = leapUtils.symmetrize(blas.dsyrk(1.0, bed.val, lower=1)) / bed.val.shape[1]
        print('Done in %0.2f' % (time.time() - t0), 'seconds')
    else:
        XXT = np.loadtxt(kinshipFile)

    # Compute eigendecomposition
    S, U = leapUtils.eigenDecompose(XXT, ignore_neig)
    if (outFile is not None):
        np.savez_compressed(outFile, arr_0=U, arr_1=S, XXT=XXT)

    eigen = dict([])
    eigen['XXT'] = XXT
    eigen['arr_0'] = U
    eigen['arr_1'] = S
    return eigen

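# --- Added sketch (not from the original source): the kinship-then-
# eigendecomposition pattern above, with NumPy stand-ins for the leapUtils
# helpers.
import numpy as np
from scipy.linalg import blas

def symmetrize(a):
    return a + a.T - np.diag(a.diagonal())

vals = np.random.randn(20, 500)                    # individuals x SNPs
XXT = symmetrize(blas.dsyrk(1.0, vals, lower=1)) / vals.shape[1]
S, U = np.linalg.eigh(XXT)                         # stand-in for leapUtils.eigenDecompose
assert np.allclose((U * S).dot(U.T), XXT)          # U diag(S) U.T reconstructs XXT
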
def removeTopPCs(X, numRemovePCs):
    t0 = time.time()
    X_mean = X.mean(axis=0)
    X -= X_mean
    XXT = symmetrize(blas.dsyrk(1.0, X, lower=0))
    s, U = la.eigh(XXT)
    if (np.min(s) < -1e-4):
        raise Exception('Negative eigenvalues found')
    s[s < 0] = 0
    ind = np.argsort(s)[::-1]
    U = U[:, ind]
    s = s[ind]
    s = np.sqrt(s)

    # remove null PCs
    ind = (s > 1e-6)
    U = U[:, ind]
    s = s[ind]

    V = X.T.dot(U / s)
    # print 'max diff:', np.max(((U*s).dot(V.T) - X)**2)
    X = (U[:, numRemovePCs:] * s[numRemovePCs:]).dot((V.T)[numRemovePCs:, :])
    X += X_mean
    return X

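# --- Added verification sketch (not from the original source) ---
# removeTopPCs rebuilds X from the eigendecomposition of X @ X.T with the
# leading components dropped; this cross-checks it against a direct SVD,
# assuming the symmetrize/la/blas/time names it references are in scope.
import numpy as np

X = np.random.randn(40, 300)
Xc = X - X.mean(axis=0)
U_, s_, Vt = np.linalg.svd(Xc, full_matrices=False)
expected = (U_[:, 2:] * s_[2:]).dot(Vt[2:]) + X.mean(axis=0)
assert np.allclose(removeTopPCs(X.copy(), 2), expected)  # .copy(): X is modified in place
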
def fit(self, X, y=None):
    """Estimate the model parameters with the EM algorithm

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        List of n_features-dimensional data points.
        Each row corresponds to a single data point.

    Returns
    -------
    self
    """
    # Initialization
    n, d = X.shape
    self.n = n
    self.d = d
    LL = []
    ITER = 0
    X = check_array(X, copy=False, order='C', dtype=sp.float64)

    # Compute constant
    self.cst = self.d*sp.log(2*sp.pi)

    # Set minimum cluster size
    # Rule of thumb for minimal size with pi = 1:
    # one mean vector (d) + one eigenvalue/vector (1 + d)
    # + noise term (1) ~ 2(d+1)
    if self.population is None:
        self.population = 2*(self.d+1)

    if self.population > self.n/self.C:
        print("Number of classes too high w.r.t. the number of samples: "
              "C should be decreased")
        return - 2

    # Initialization of the clustering
    if self.C == 1:
        self.T = sp.ones((self.n, 1))
    else:
        if self.init == 'kmeans':
            label = KMeans(n_clusters=self.C, n_init=1, n_jobs=-1,
                           random_state=self.random_state).fit(X).labels_
            label += 1  # Label starts at one
        elif self.init == 'random':
            sp.random.seed(self.random_state)
            label = sp.random.randint(1, high=self.C+1, size=n)
        elif self.init == 'user':
            if self.C != y.max():
                print("The number of classes does not "
                      "match between self.C and y")
            label = y
        else:
            print("Initialization should be kmeans or random or user")
            return - 2  # Bad init values

        # Convert label to membership
        self.T = sp.zeros((self.n, self.C))
        self.T[sp.arange(self.n), label-1] = 1

    # Compute the whole covariance matrix and its eigenvalues if needed
    if self.model in ('M2', 'M4', 'M6', 'M8'):
        X_ = (X - sp.mean(X, axis=0))
        # Use dsyrk to benefit from the symmetry of the products
        # X^{t}X or XX^{t}; transpose to put in fortran order
        if self.n >= self.d:
            W = dsyrk(1.0/self.n, X_.T, trans=False)
        else:
            W = dsyrk(1.0/self.n, X_.T, trans=True)
        del X_
        # Compute intrinsic dimension on the whole data set
        L = linalg.eigh(W, eigvals_only=True, lower=False)
        idx = L.argsort()[::-1]
        L = L[idx]
        # Check for numerical errors
        L[L < EPS] = EPS
        self.dL = sp.absolute(sp.diff(L))
        self.dL /= self.dL.max()
        del W, L

    # Initialization of the parameters
    self.m_step(X)
    ll = self.e_step(X)
    LL.append(ll)

    # Main while loop
    while(ITER < self.itermax):
        # M step
        self.free()
        self.m_step(X)

        # E step
        ll = self.e_step(X)
        LL.append(ll)

        if (abs((LL[-1]-LL[-2])/LL[-2]) < self.tol) and \
           (self.C_[-2] == self.C_[-1]):
            break
        else:
            ITER += 1

    # Return the class membership and some parameters of the optimization
    self.LL = LL
    self.bic = - 2*LL[-1] + self.q*sp.log(self.n)
    self.aic = - 2*LL[-1] + 2*self.q
    # Add small constant to ICL to prevent numerical issues
    self.icl = self.bic - 2*sp.log(self.T.max(axis=1)+EPS).sum()
    self.niter = ITER + 1

    # Remove temporary variables
    self.T = None
    self.X = None

    return self

def fit_init(self, x, y):
    """This function computes the empirical estimators of the mean vector,
    the covariance matrix and the proportion of each class.

    :param x: The sample matrix, of size n \times d, where n is the number
        of samples and d is the number of variables
    :param y: The vector of corresponding labels, of size n \times 1 in the
        supervised case and n \times C in the unsupervised case
    :type x: float
    :type y: int
    """
    # Get information from the data
    n, d = x.shape  # Number of samples and number of variables
    if y.ndim == 1:  # Number of classes
        C = int(y.max(0))
    else:
        C = y.shape[1]

    if n != y.shape[0]:
        print("size of x and y should match")
        exit()

    # Compute constant
    self.cst = d * sp.log(2 * sp.pi)

    # Compute the whole covariance matrix
    if self.model in ('M2', 'M4', 'M6', 'M8'):
        X = (x - sp.mean(x, axis=0))
        # Use dsyrk to benefit from the symmetry of the products
        # X^{t}X or XX^{t}; transpose to put in fortran order
        if n >= d:
            self.W = dsyrk(1.0 / n, X.T, trans=False)
        else:
            self.W = dsyrk(1.0 / n, X.T, trans=True)
        X = None

    # Learn the empirical estimators of the model for each class
    for c in xrange(C):
        if y.ndim == 1:  # Supervised case
            j = sp.where(y == (c + 1))[0]
            self.ni.append(j.size)
            self.prop.append(float(self.ni[c]) / n)
            self.mean.append(sp.mean(x[j, :], axis=0))
            X = (x[j, :] - self.mean[c])
        else:  # Unsupervised case
            self.ni.append(y[:, c].sum())
            self.prop.append(float(self.ni[c]) / n)
            self.mean.append(sp.average(x, weights=y[:, c], axis=0))
            X = (x - self.mean[c]) * sp.sqrt(y[:, c]).reshape(n, 1)

        # Use dsyrk to benefit from the symmetry of the products
        # X^{t}X or XX^{t}; transpose to put in fortran order
        if n >= d:
            cov = dsyrk(1.0 / (self.ni[c] - 1), X.T, trans=False)
        else:
            cov = dsyrk(1.0 / (self.ni[c] - 1), X.T, trans=True)
        self.X.append(X)
        X = None

        # Only the upper part of cov is initialized -> dsyrk
        L, Q = linalg.eigh(cov, lower=False)
        idx = L.argsort()[::-1]
        L, Q = L[idx], Q[:, idx]
        L[L < EPS] = EPS  # Check for numerical errors
        self.L.append(L)
        self.Q.append(Q)
        self.trace.append(cov.trace())

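# --- Added sketch (not from the original source): why the code above
# branches on n >= d. dsyrk(1/n, X.T, trans=False) yields the d x d matrix
# X.T @ X / n, while trans=True yields the n x n Gram matrix X @ X.T / n;
# the two share their nonzero eigenvalues, so the smaller one can be
# eigendecomposed. dsyrk fills only the upper triangle, hence lower=False
# in the eigh calls.
import numpy as np
from scipy import linalg
from scipy.linalg.blas import dsyrk

X = np.random.randn(10, 50)                        # n < d
small = dsyrk(1.0 / X.shape[0], X.T, trans=True)   # n x n
big = dsyrk(1.0 / X.shape[0], X.T, trans=False)    # d x d
Ls = linalg.eigh(small, eigvals_only=True, lower=False)
Lb = linalg.eigh(big, eigvals_only=True, lower=False)
assert np.allclose(np.sort(Ls)[::-1], np.sort(Lb)[::-1][:X.shape[0]])
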
def m_step(self, X):
    """M step of the algorithm

    This function computes the empirical estimators of the mean vector,
    the covariance matrix and the proportion of each class.
    """
    # Learn the model for each class
    C_ = self.C
    c_delete = []
    for c in xrange(self.C):
        ni = self.T[:, c].sum()
        # Check if empty
        if self.check_empty and ni < self.population:
            C_ -= 1
            c_delete.append(c)
        else:
            self.ni.append(ni)
            self.prop.append(float(self.ni[-1])/self.n)
            self.mean.append(sp.dot(self.T[:, c].T, X)/self.ni[-1])
            X_ = (X-self.mean[-1])*(sp.sqrt(self.T[:, c])[:, sp.newaxis])

            # Use dsyrk to take benefit of symmetric matrices
            if self.n >= self.d:
                cov = dsyrk(1.0/(self.ni[-1]-1), X_.T, trans=False)
            else:
                cov = dsyrk(1.0/(self.ni[-1]-1), X_.T, trans=True)
            self.X.append(X_)
            X_ = None

            # Only the upper part of cov is initialized -> dsyrk
            L, Q = linalg.eigh(cov, lower=False)

            # Check for numerical errors
            L[L < EPS] = EPS

            if self.check_empty and (L.max() - L.min()) < EPS:
                # In that case all eigenvalues are equal
                # and this does not match the model
                C_ -= 1
                c_delete.append(c)
                del self.ni[-1]
                del self.prop[-1]
                del self.mean[-1]
                if self.n < self.d:
                    del self.X[-1]
            else:
                idx = L.argsort()[::-1]
                L, Q = L[idx], Q[:, idx]
                self.L.append(L)
                self.Q.append(Q)
                self.trace.append(cov.trace())

    # Update T
    if c_delete:
        self.T = sp.delete(self.T, c_delete, axis=1)
    # Update the number of clusters
    self.C_.append(C_)
    self.C = C_

    # Estimation of the signal subspace for specific size subspace models
    if self.model in ('M1', 'M3', 'M5', 'M7'):
        for c in xrange(self.C):
            # Scree test
            dL, pi = sp.absolute(sp.diff(self.L[c])), 1
            dL /= dL.max()
            while sp.any(dL[pi:] > self.th):
                pi += 1
            if (pi < (min(self.ni[c], self.d) - 1)) and (pi > 0):
                self.pi.append(pi)
            else:
                self.pi.append(1)
    elif self.model in ('M2', 'M4', 'M6', 'M8'):
        dL, p = self.dL, 1
        while sp.any(dL[p:] > self.th):
            p += 1
        min_dim = int(min(min(self.ni), self.d))
        # Check if (p >= ni-1 or d-1) and p > 0
        if p < (min_dim - 1):
            self.pi = [p for c in xrange(self.C)]
        else:
            self.pi = [max((min_dim-2), 1) for c in xrange(self.C)]
        del dL, p, idx

    # Estimate signal part
    self.a = [sL[:sPI] for sL, sPI in zip(self.L, self.pi)]
    if self.model in ('M5', 'M6', 'M7', 'M8'):
        self.a = [sp.repeat(sA[:].mean(), sA.size) for sA in self.a]

    # Estimate noise term
    if self.model in ('M1', 'M2', 'M5', 'M6'):
        # Noise free
        self.b = [(sT-sA.sum())/(self.d-sPI)
                  for sT, sA, sPI in zip(self.trace, self.a, self.pi)]
        # Check for very small values of b
        self.b = [b if b > EPS else EPS for b in self.b]
    elif self.model in ('M3', 'M4', 'M7', 'M8'):
        # Noise common
        denom = self.d - sp.sum([sPR*sPI
                                 for sPR, sPI in zip(self.prop, self.pi)])
        num = sp.sum([sPR*(sT-sA.sum())
                      for sPR, sT, sA in zip(self.prop, self.trace, self.a)])
        # Check for very small values
        if num < EPS:
            self.b = [EPS for i in xrange(self.C)]
        elif denom < EPS:
            self.b = [1/EPS for i in xrange(self.C)]
        else:
            self.b = [num/denom for i in xrange(self.C)]

    # Compute remaining parameters
    # Precompute logdet
    self.logdet = [(sp.log(sA).sum() + (self.d-sPI)*sp.log(sB))
                   for sA, sPI, sB in zip(self.a, self.pi, self.b)]

    # Update the Q matrices
    if self.n >= self.d:
        self.Q = [sQ[:, :sPI] for sQ, sPI in zip(self.Q, self.pi)]
    else:
        self.Q = [sp.dot(sX.T, sQ[:, :sPI])/sp.sqrt(sL[:sPI])
                  for sX, sQ, sPI, sL in zip(self.X, self.Q, self.pi, self.L)]

    # Compute the number of parameters of the model
    self.q = self.C*self.d + (self.C-1) + sum([sPI*(self.d-(sPI+1)/2)
                                               for sPI in self.pi])
    # Number of noise subspaces
    if self.model in ('M1', 'M3', 'M5', 'M7'):
        self.q += self.C
    elif self.model in ('M2', 'M4', 'M6', 'M8'):
        self.q += 1
    # Size of signal subspaces
    if self.model in ('M1', 'M2'):
        self.q += sum(self.pi) + self.C
    elif self.model in ('M3', 'M4'):
        self.q += sum(self.pi) + 1
    elif self.model in ('M5', 'M6'):
        self.q += 2*self.C
    elif self.model in ('M7', 'M8'):
        self.q += self.C+1
