def testarg1(self):
    # kmeans should cope gracefully with non-integer and negative k
    X = nr.randn(10, 2)
    A = np.vstack((np.ones((7, 2)), np.zeros((3, 2))))
    X = X + 3 * A
    C, L, J = fc.kmeans(X, 2.0)
    C, L, J = fc.kmeans(X, 0.5)
    C, L, J = fc.kmeans(X, -42)
    self.assert_(True)

def testarg2(self):
    # kmeans should cope with float-valued and out-of-range initial labels
    X = nr.randn(10, 2)
    A = np.vstack((np.ones((7, 2)), np.zeros((3, 2))))
    L = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
    C, L, J = fc.kmeans(X, 2, L)
    L = np.array([0.0, 0, 0, 0, 0, 1, 1, 1, 1, 1.42])
    C, L, J = fc.kmeans(X, 2, L)
    L = np.array([0.0, 0, 0, 0, 0, 1, 1, 1, 1, -1])
    C, L, J = fc.kmeans(X, 2, L)
    self.assert_(True)

def testarg3(self):
    # kmeans should cope with integer data, single-sample data, and k > n
    A = np.vstack((np.ones((7, 2)), np.zeros((3, 2))))
    X = (nr.randn(10, 2) * 100).astype(np.int)
    C, L, J = fc.kmeans(X, 2)
    C, L, J = fc.kmeans(X, 2)
    X = nr.randn(1, 2)
    C, L, J = fc.kmeans(X, 2)
    X = nr.randn(1, 2)
    C, L, J = fc.kmeans(X, 2)
    X = nr.randn(2, 5)
    C, L, J = fc.kmeans(X, 30)
    self.assert_(True)

def initialize(self, x):
    """
    This function initializes self according to a certain dataset x:
    1. sets the regularizing hyper-parameters
    2. initializes z using a k-means algorithm, then
    3. updates the parameters

    Parameters
    ----------
    x, array of shape (n_samples, self.dim)
       the data used in the estimation process
    """
    import nipy.neurospin.clustering.clustering as fc
    n = x.shape[0]

    # 1. set the priors
    self.guess_regularizing(x, bcheck=1)

    # 2. initialize the memberships
    if self.k > 1:
        cent, z, J = fc.kmeans(x, self.k)
    else:
        z = np.zeros(n).astype(np.int)
    l = np.zeros((n, self.k))
    l[np.arange(n), z] = 1

    # 3. update the parameters
    self.update(x, l)

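# A minimal standalone sketch of the membership construction used in
# initialize() above: hard k-means labels z become a one-hot (n, k)
# membership array before the parameter update. The helper name is
# illustrative, not part of this module.
def _one_hot_memberships(z, k):
    import numpy as np
    n = z.shape[0]
    l = np.zeros((n, k))
    l[np.arange(n), z] = 1  # row i carries a single 1 in column z[i]
    return l
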
def testkmeans1(self):
    # on well-separated blobs the two halves should receive distinct labels
    X = nr.randn(10, 2)
    A = np.concatenate([np.ones((7, 2)), np.zeros((3, 2))])
    X = X + 3 * A
    L = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
    C, L, J = fc.kmeans(X, 2, L)
    self.assert_(np.mean(L[:7]) < 0.5)

def VB_estimate(self, x, niter=100, delta=0.0001):
    """
    Estimation of the BGMM using a Variational Bayes approach

    Parameters
    ----------
    x: array of shape (nbitems, dim)
       the input data
    niter=100, the maximal number of iterations of the VB algorithm
    delta=0.0001, the increment in log-likelihood to declare convergence

    Returns
    -------
    label: array of shape (nbitems): resulting MAP labelling
    """
    x = self.check_data(x)

    # pre-cluster the data (this improves convergence...)
    label = np.zeros(x.shape[0])
    nit = 10
    mean, label, J = fc.kmeans(x, self.k, label, nit)

    label, mean, meansc, prec, we, dof, Li = fc.bayesian_gmm(
        x, self.prior_means, self.prior_precisions, self.prior_shrinkage,
        self.prior_weights, self.prior_dof, label, niter, delta)

    self.estimated = 1
    self.means = mean
    self.shrinkage = meansc
    self.precisions = prec
    self.weights = we
    self.dof = dof
    return label

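# Sketch of the pre-clustering step used by VB_estimate: a short k-means run
# supplies the initial labelling for the variational loop. It relies only on
# the fc.kmeans signature already used above; the helper name is illustrative.
def _precluster(x, k, nit=10):
    import numpy as np
    import nipy.neurospin.clustering.clustering as fc
    label = np.zeros(x.shape[0])
    mean, label, J = fc.kmeans(x, k, label, nit)
    return label
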
def estimate(self, data, Labels=None, maxiter=300, delta=0.001, ninit=1):
    """
    Estimation of the GMM based on data and an EM algorithm

    Parameters
    ----------
    data: (n, p) feature array, n = nb items, p = feature dimension
    Labels=None: prior labelling of the data (this may improve convergence)
    maxiter=300: max number of iterations of the EM algorithm
    delta=0.001: criterion on the log-likelihood increments
                 to declare convergence
    ninit=1: number of restarts of the GMM estimation

    Returns
    -------
    Labels: array of shape (n), type np.int:
            discrete labelling of the data items into clusters
    LL: (float) average log-likelihood of the data
    bic: (float) associated bic criterion
    """
    data = self.check_x(data)
    if Labels is None:
        Labels = np.zeros(data.shape[0], np.int)
        nit = 10
        C, Labels, J = fc.kmeans(data, self.k, Labels, nit)
    if self.k > data.shape[0] - 1:
        print "too many clusters"
        self.k = data.shape[0] - 1
    if self.prec_type == 'full':
        prec_type = 0
    if self.prec_type == 'diag':
        prec_type = 1
    C, P, W, Labels, bll = fc.gmm(data, self.k, Labels, prec_type,
                                  maxiter, delta)
    self.means = C
    if self.prec_type == 'diag':
        self.precisions = P
    if self.prec_type == 'full':
        self.precisions = np.reshape(P, (self.k, self.dim, self.dim))
    self.weights = W
    self.check()

    # optional restarts: keep the solution with the best log-likelihood
    for i in range(ninit - 1):
        Labels = np.zeros(data.shape[0])
        C, P, W, labels, ll = fc.gmm(data, self.k, Labels, prec_type,
                                     maxiter, delta)
        if ll > bll:
            self.means = C
            if self.prec_type == 'diag':
                self.precisions = P
            if self.prec_type == 'full':
                self.precisions = np.reshape(P, (self.k, self.dim, self.dim))
            self.weights = W
            self.check()
            bll = ll
            Labels = labels
    return Labels, bll, self.bic_from_all(bll, data.shape[0])

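# A generic sketch of the multi-start pattern that estimate() implements:
# run the EM fit ninit times and keep the parameters with the best
# log-likelihood. fit_once is a stand-in for a single fc.gmm call.
def _best_of_restarts(fit_once, ninit=1):
    import numpy as np
    best_params, best_ll = None, -np.inf
    for _ in range(ninit):
        params, ll = fit_once()
        if best_params is None or ll > best_ll:
            best_params, best_ll = params, ll
    return best_params, best_ll
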
def optimize_with_bic(self, data, kvals=None, maxiter=300, delta=0.001,
                      ninit=1, verbose=0):
    """
    Find the optimal GMM using the bic criterion.
    The method is run for all the values of k in kvals.

    Parameters
    ----------
    data: (n, p) feature array, n = nb items, p = feature dimension
    kvals=None: range of values for k; if kvals is None, self.k is used
    maxiter=300: max number of iterations of the EM algorithm
    delta=0.001: criterion on the log-likelihood increments
                 to declare convergence
    ninit=1: number of restarts of the GMM estimation
    verbose=0: verbosity mode

    Returns
    -------
    Labels: array of shape (n), type np.int,
            discrete labelling of the data items into clusters
    LL: array of shape (n): log-likelihood of the data
    bic: (float) associated bic criterion
    """
    data = self.check_x(data)
    if kvals is None:
        Labels, LogLike, bic = self.estimate(data, None, maxiter,
                                             delta, ninit)
        return Labels, LogLike, bic

    bic_ref = -np.infty
    for k in kvals:
        self.k = k
        nit = 10
        mean, label, J = fc.kmeans(data, k, Labels=None)
        Lab, LL, bic = self.estimate(data, label, maxiter, delta, ninit)
        if bic > bic_ref:
            kopt = k
            C = self.means.copy()
            P = self.precisions.copy()
            W = self.weights.copy()
            bic_ref = bic
        if verbose:
            print k, LL, bic, kopt

    self.means = C
    self.precisions = P
    self.weights = W
    self.k = kopt
    if self.prec_type == 'full':
        precisions = np.reshape(self.precisions,
                                (self.k, self.dim * self.dim))
    else:
        precisions = self.precisions
    Labels, LogLike = fc.gmm_partition(data, self.means, precisions,
                                       self.weights)
    return Labels, LogLike, self.bic_from_ll(LogLike)

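# Sketch of the model-selection loop performed by optimize_with_bic: fit one
# model per candidate k and keep the k with the highest BIC. fit_and_bic is a
# stand-in for the estimate() call above.
def _select_k_by_bic(fit_and_bic, kvals):
    best_k, best_bic = None, -float('inf')
    for k in kvals:
        bic = fit_and_bic(k)
        if best_k is None or bic > best_bic:
            best_k, best_bic = k, bic
    return best_k
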
def testkmeans2(self):
    # with 10000 points the two blobs must be recovered almost perfectly
    X = nr.randn(10000, 2)
    A = np.concatenate([np.ones((7000, 2)), np.zeros((3000, 2))])
    X = X + 3 * A
    L = np.concatenate([np.ones(5000), np.zeros(5000)]).astype(np.int)
    C, L, J = fc.kmeans(X, 2, L)
    l = L[:7000].astype(np.float)
    self.assert_(np.mean(l) > 0.9)

def initialize(self, x):
    """
    Initialize z using a k-means algorithm, then update the parameters

    Parameters
    ----------
    x: array of shape (nb_samples, self.dim)
       the data used in the estimation process
    """
    if self.k > 1:
        cent, z, J = fc.kmeans(x, self.k)
    else:
        z = np.zeros(x.shape[0]).astype(np.int)
    self.update(x, z)

def initialize(self, x):
    """
    Initialize z using a k-means algorithm, then update the parameters

    Parameters
    ----------
    x: array of shape (nb_samples, self.dim)
       the data used in the estimation process
    """
    n = x.shape[0]
    if self.k > 1:
        cent, z, J = fc.kmeans(x, self.k)
    else:
        z = np.zeros(x.shape[0]).astype(np.int)
    l = np.zeros((n, self.k))
    l[np.arange(n), z] = 1
    self._Mstep(x, l)

def VB_estimate_and_sample(self, x, niter=1000, delta=0.0001, gd=None,
                           verbose=0):
    """
    Estimation of the BGMM using a Variational Bayes approach,
    and sampling of the model on test points in order to have
    an estimate of the posterior on these points

    Parameters
    ----------
    x: array of shape (nbitems, dim)
       the input data
    niter=1000, the maximal number of iterations of the VB algorithm
    delta=0.0001, the increment in log-likelihood to declare convergence
    gd=None: a grid descriptor, i.e. the grid on which the model is sampled;
             if gd is None, x is used as grid
    verbose=0: the verbosity mode

    Returns
    -------
    Li: array of shape (nbnodes): the average log-posterior
    label: array of shape (nbitems): resulting MAP labelling
    """
    x = self.check_data(x)

    # pre-cluster the data (this improves convergence...)
    label = np.zeros(x.shape[0])
    nit = 10
    mean, label, J = fc.kmeans(x, self.k, label, nit)

    if gd is None:
        grid = x
    else:
        grid = gd.make_grid()

    label, mean, meansc, prec, we, dof, Li = fc.bayesian_gmm(
        x, self.prior_means, self.prior_precisions, self.prior_shrinkage,
        self.prior_weights, self.prior_dof, label, niter, delta, grid)

    self.estimated = 1
    self.means = mean
    self.shrinkage = meansc
    self.precisions = prec
    self.weights = we
    self.dof = dof
    if verbose:
        self.show(x, gd, np.exp(Li))
    return Li, label

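# A minimal sketch of the MAP labelling that VB_estimate_and_sample returns:
# given per-component posteriors on the sampled points, each item is assigned
# to its most probable component (illustrative helper, not this module's API).
def _map_labels(posteriors):
    import numpy as np
    # posteriors: array of shape (n_items, k)
    return np.argmax(posteriors, 1)
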
def mask_parcellation(mask_images, nb_parcel, output_image=None):
    """
    Performs the parcellation of a certain mask

    Parameters
    ----------
    mask_images: list of strings,
                 paths of the mask images that define the common space
    nb_parcel: int, number of desired parcels
    output_image: string, optional
                  path of the output image

    Returns
    -------
    wim: Nifti1Image instance, the resulting parcellation
    """
    from ..mask import intersect_masks

    # compute the group mask
    affine = load(mask_images[0]).get_affine()
    shape = load(mask_images[0]).get_shape()
    mask = intersect_masks(mask_images, threshold=0) > 0
    ijk = np.where(mask)
    ijk = np.array(ijk).T
    nvox = ijk.shape[0]

    # get and cluster the coordinates (voxel -> mm via the affine)
    ijk = np.hstack((ijk, np.ones((nvox, 1))))
    coord = np.dot(ijk, affine.T)[:, :3]
    cent, tlabs, J = kmeans(coord, nb_parcel)

    # write the results
    label = -np.ones(shape)
    label[mask] = tlabs
    wim = Nifti1Image(label, affine)
    wim.get_header()['descrip'] = 'Label image in %d parcels' % nb_parcel
    if output_image is not None:
        save(wim, output_image)
    return wim

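# Standalone sketch of the voxel-to-mm conversion used in mask_parcellation:
# voxel indices are made homogeneous and mapped through the image affine.
# The helper name is illustrative.
def _vox_to_mm(ijk, affine):
    import numpy as np
    nvox = ijk.shape[0]
    homog = np.hstack((ijk, np.ones((nvox, 1))))  # append the (i, j, k, 1) column
    return np.dot(homog, affine.T)[:, :3]         # drop the homogeneous coordinate
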
def one_subj_parcellation(MaskImage, betas, nbparcel, nn=6, method='ward',
                          write_dir=None, mu=10., verbose=0, fullpath=None):
    """
    Parcellation of a one-subject dataset
    Returns a tuple (Parcellation instance, parcellation labels)

    Parameters
    ----------
    MaskImage: path to the mask-defining image of the subject
    betas: list of paths to activation images from the subject
    nbparcel, int: number of desired parcels
    nn=6: number of nearest neighbors to define the image topology
          (6, 18 or 26)
    method='ward': clustering method used, to be chosen among
                   'ward', 'gkm', 'ward_and_gkm', 'kmeans'
                   'ward': Ward's clustering algorithm
                   'gkm': Geodesic k-means algorithm, random initialization
                   'ward_and_gkm': idem, initialized by Ward's clustering
    write_dir=None: write directory. If fullpath is None too,
                    then no file output.
    mu=10., float: the relative weight of anatomical information
    verbose=0: verbosity mode
    fullpath=None, string, path of the output image.
        If write_dir and fullpath are None then no file output.
        If only fullpath is None then it is the write dir + a name
        depending on the method.

    Note
    ----
    Ward's method takes time (about 6 minutes for a 60K voxels dataset)
    Geodesic k-means is 'quick and dirty'
    Ward's + GKM is expensive but quite good
    To reduce CPU time, rather use nn=6 (especially with Ward)
    """
    import nipy.neurospin.graph as fg
    import nipy.neurospin.graph.field as ff

    if method not in ['ward', 'gkm', 'ward_and_gkm', 'kmeans']:
        raise ValueError('unknown method')
    if nn not in [6, 18, 26]:
        raise ValueError('nn should be 6, 18 or 26')
    nbeta = len(betas)

    # step 1: load the data
    # 1.1 the mask image
    nim = load(MaskImage)
    ref_dim = nim.get_shape()
    affine = nim.get_affine()
    mask = nim.get_data()
    xyz = np.array(np.where(mask > 0)).T
    nvox = xyz.shape[0]
    lmask = mask > 0  # default mask, refined below when a graph is built

    if method != 'kmeans':
        # 1.2 get the main cc of the graph
        # to remove the small connected components
        g = fg.WeightedGraph(nvox)
        g.from_3d_grid(xyz.astype(np.int), nn)
        aux = np.zeros(g.V).astype('bool')
        imc = g.main_cc()
        aux[imc] = True
        if np.sum(aux) == 0:
            raise ValueError('empty mask. Cannot proceed')
Cannot proceed" g = g.subgraph(aux) lmask = np.zeros(ref_dim) lmask[xyz[:,0],xyz[:,1],xyz[:,2]]=aux xyz = xyz[aux,:] nvox = xyz.shape[0] # 1.3 from vox to mm xyz2 = np.hstack((xyz,np.ones((nvox,1)))) coord = np.dot(xyz2, affine.T)[:,:3] # 1.4 read the functional data beta = [] for b in range(nbeta): rbeta = load(betas[b]) lbeta = rbeta.get_data() lbeta = lbeta[lmask>0] beta.append(lbeta) beta = np.array(beta).T #step 2: parcel the data --------------------------- feature = np.hstack((beta, mu*coord/np.std(coord))) if method is not 'kmeans': g = ff.Field(nvox, g.edges, g.weights, feature) if method=='kmeans': cent, u, J = kmeans(feature, nbparcel) if method=='ward': u, J0 = g.ward(nbparcel) if method=='gkm': seeds = np.argsort(np.random.rand(g.V))[:nbparcel] seeds, u, J1 = g.geodesic_kmeans(seeds) if method=='ward_and_gkm': w,J0 = g.ward(nbparcel) seeds, u, J1 = g.geodesic_kmeans(label=w) lpa = Parcellation(nbparcel, xyz, np.reshape(u,(nvox,1))) if verbose: pi = np.reshape(lpa.population(), nbparcel) vi = np.sum(lpa.var_feature_intra([beta])[0], 1) vf = np.dot(pi,vi)/nvox va = np.dot(pi,np.sum(lpa.var_feature_intra([coord])[0],1))/nvox print nbparcel, "functional variance", vf, "anatomical variance",va # step3: write the resulting label image Label = -np.ones(ref_dim,'int16') Label[lmask>0] = u if fullpath is not None: LabelImage = fullpath elif write_dir is not None: if method=='kmeans': LabelImage = os.path.join(write_dir,"parcel_kmeans.nii") if method=='ward': LabelImage = os.path.join(write_dir,"parcel_wards.nii") elif method=='gkm': LabelImage = os.path.join(write_dir,"parcel_gkmeans.nii") elif method=='ward_and_gkm': LabelImage = os.path.join(write_dir,"parcel_wgkmeans.nii") else: LabelImage = None if LabelImage is not None: wim = Nifti1Image(Label, affine) hdr = wim.get_header() hdr['descrip'] = 'Intra-subject parcellation image' save(wim, LabelImage) print "Wrote the parcellation images as %s" %LabelImage return lpa, Label
def optim_hparcel(Ranat, RFeature, Feature, Pa, Gs, anat_coord, lamb=1.,
                  dmax=10., chunksize=1.e5, niter=5, verbose=0):
    """
    Core function of the hierarchical parcellation procedure.

    Parameters
    ----------
    Ranat: array of shape (n, 3): set of positions sampled from the data
    RFeature: array of shape (n, f): associated feature
    Feature: list of subject-related feature arrays
    Pa: parcellation instance that is updated
    Gs: graph that represents the topology of the parcellation
    anat_coord: array of shape (nvox, 3) space-defining set of coordinates
    lamb=1.0: parameter to weight position and feature impact
              on the algorithm
    dmax=10: locality parameter (in the space of anat_coord)
             to limit the search volume (CPU save)
    chunksize=1.e5: not used here (to be removed)
    niter=5: number of iterations in the algorithm
    verbose=0: verbosity level

    Returns
    -------
    U: list of arrays of length nsubj, subject-dependent parcellations
    Proto_anat: array of shape (nvox),
                labelling of the common space (template parcellation)
    """
    Sess = Pa.nb_subj

    # a1. perform a rough clustering of the data to make prototypes
    proto, Labs, J = fc.kmeans(RFeature, Pa.k, Labels=None, maxiter=10)
    proto_anat = [np.mean(Ranat[Labs == k], 0) for k in range(Pa.k)]
    proto_anat = np.array(proto_anat)
    proto = [np.mean(RFeature[Labs == k], 0) for k in range(Pa.k)]
    proto = np.array(proto)
    proto_anat_old = proto_anat.copy()

    # a2. topological model of the parcellation: group-level part
    spatial_proto = ff.Field(Pa.k)
    spatial_proto.set_field(proto_anat)
    spatial_proto.Voronoi_diagram(proto_anat, anat_coord)
    spatial_proto.set_gaussian(proto_anat)
    spatial_proto.normalize()

    for git in range(niter):
        LP = []
        LPA = []
        U = []
        Energy = 0
        for s in range(Sess):
            # b. subject-specific instances of the model
            # b.0 subject-specific information
            Fs = Feature[s]
            lac = anat_coord[Pa.label[:, s] > -1]
            target = proto_anat.copy()
            for nit in range(1):
                lseeds = np.zeros(Pa.k, 'i')
                aux = np.argsort(rand(Pa.k))
                tata = 0
                toto = np.zeros(lac.shape[0])
                for j in range(Pa.k):
                    # b.1 speed-up: only take a small ball
                    i = aux[j]
                    dX = lac - target[i, :]
                    iz = np.nonzero(np.sum(dX ** 2, 1) < dmax ** 2)
                    iz = np.reshape(iz, np.size(iz))
                    if np.size(iz) == 0:
                        iz = np.array([np.argmin(np.sum(dX ** 2, 1))])

                    # b.2: anatomical constraints
                    lanat = np.reshape(lac[iz, :],
                                       (np.size(iz), anat_coord.shape[1]))
                    pot = np.zeros(np.size(iz))
                    JM, rmin = _exclusion_map(i, spatial_proto, target, lanat)
                    pot[JM < 0] = np.infty
                    pot[JM >= 0] = -JM[JM >= 0]

                    # b.3: add feature discrepancy
                    dF = Fs[iz] - proto[i]
                    dF = np.reshape(dF, (np.size(iz), proto.shape[1]))
                    pot += lamb * np.sum(dF ** 2, 1)

                    # b.4: solution
                    pb = 0
                    if np.sum(np.isinf(pot)) == np.size(pot):
                        pot = np.sum(dX[iz, :] ** 2, 1)
                        tata += 1
                        pb = 1
                    sol = iz[np.argmin(pot)]
                    target[i] = lac[sol]
                    if toto[sol] == 1:
                        print "pb", pb
                        ln = spatial_proto.list_of_neighbors()
                        argtoto = np.squeeze(np.nonzero(lseeds == sol))
                        print i, argtoto, ln[i]
                        if np.size(argtoto) == 1:
                            print [ln[argtoto]]
                            print target[argtoto]
                        print target[i]
                        print rmin
                        print JM[iz == lseeds[argtoto]]
                        print pot[iz == lseeds[argtoto]]
                    lseeds[i] = sol
                    toto[sol] = 1
                if verbose > 1:
                    jm = _Field_Gradient_Jac(spatial_proto, target)
                    print jm.min(), jm.max(), np.sum(toto > 0), tata

            # c. subject-specific parcellation
            g = Gs[s]
            f = ff.Field(g.V, g.edges, g.weights, Fs)
            u = f.constrained_voronoi(lseeds)
            U.append(u)
            Energy += np.sum((Fs - proto[u]) * (Fs - proto[u])) / \
                np.sum(Pa.label[:, s] > -1)

            # recompute the prototypes (average in subject s)
            lproto = [np.mean(Fs[u == k], 0) for k in range(Pa.k)]
            lproto = np.array(lproto)
            lproto[np.isnan(lproto)] = proto[np.isnan(lproto)]
            lproto_anat = [np.mean(lac[u == k], 0) for k in range(Pa.k)]
            lproto_anat = np.array(lproto_anat)
            lproto_anat[np.isnan(lproto_anat)] = \
                proto_anat[np.isnan(lproto_anat)]
            LP.append(lproto)
            LPA.append(lproto_anat)

        # recompute the prototypes across subjects
        proto_mem = proto.copy()
        proto_anat_mem = proto_anat.copy()
        proto = np.mean(np.array(LP), 0)
        proto_anat = np.mean(np.array(LPA), 0)
        displ = np.sqrt(np.sum((proto_mem - proto) ** 2, 1).max())
        if verbose:
            print 'energy', Energy, 'displacement', displ

        # recompute the topological model
        spatial_proto.set_field(proto_anat)
        spatial_proto.Voronoi_diagram(proto_anat, anat_coord)
        spatial_proto.set_gaussian(proto_anat)
        spatial_proto.normalize()

        if displ < 1.e-4 * dmax:
            break
    return U, proto_anat

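# Standalone sketch of the prototype back-fill used in the loop above:
# per-subject prototypes of empty parcels come out of np.mean as NaN and are
# replaced by the current group prototypes (illustrative helper).
def _backfill_empty_prototypes(lproto, proto):
    import numpy as np
    lproto = lproto.copy()
    lproto[np.isnan(lproto)] = proto[np.isnan(lproto)]
    return lproto
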
def test_parcel_one_subj_4():
    nbparcel = 10
    g = make_data_field()
    _, u, _ = fc.kmeans(g.field, nbparcel)
    assert((np.unique(u) == np.arange(nbparcel)).all())