def _fit_dpgmm(self, x):
    """Fit a DPGMM to `x` once per repeat and record the results of each run.

    The model is always built with ``n_components = max(self.crange)`` and
    ``covariance_type = self.cvtype``. Optional settings are taken from
    ``self.clus_kwargs`` and renamed to the DPGMM constructor's keywords:
    ``alpha`` -> ``alpha``, ``conv_thresh`` -> ``thresh``,
    ``max_iter`` -> ``n_iter``.

    For every run ``r`` in ``0 .. self.repeats - 1`` the following slots are
    written:

    * ``self._labels[r]``     -- ``model.predict(x)``
    * ``self._parameters[r]`` -- ``model.means_``
    * ``self._ll[r]``         -- ``model.score(x).sum()`` (presumably the total
      log-likelihood of `x`; confirm against the DPGMM API in use)
    * ``self._gof[r]``        -- ``model.aic(x)`` or ``model.bic(x)`` depending
      on ``self.gof_type``; left untouched for any other ``gof_type``.

    :param x: data passed unchanged to ``fit``/``predict``/``score``.
    """
    # Local import so the debug output below works without the Python-2-only
    # print statement; the emitted text is unchanged.
    import sys

    # clustering
    k = max(self.crange)

    # Translate the user-facing option names into DPGMM keyword arguments.
    # This does not depend on the run index, so build it once up front.
    option_names = (
        ('alpha', 'alpha'),
        ('conv_thresh', 'thresh'),
        ('max_iter', 'n_iter'),
    )
    model_kwargs = dict(
        (model_name, self.clus_kwargs[option])
        for option, model_name in option_names
        if option in self.clus_kwargs
    )

    # `range` iterates identically to `xrange` here and exists on Python 2 and 3.
    for r in range(self.repeats):
        # info: start of the debug line, deliberately without a newline so the
        # results of this run are appended to the same line further down
        if self.debug is True:
            sys.stdout.write(
                '\t[%s][c:%d][r:%d] ' % (self.clus_type, k, r + 1))

        # fit and evaluate model
        model = DPGMM(n_components=k, covariance_type=self.cvtype,
                      **model_kwargs)
        model.fit(x)
        self._labels[r] = model.predict(x)
        self._parameters[r] = model.means_
        self._ll[r] = model.score(x).sum()

        # evaluate goodness of fit for this run
        # NOTE(review): a generic `self.gof(x, self._ll[r], k)` call was
        # commented out here; the model's own criteria are used instead.
        if self.gof_type == 'aic':
            self._gof[r] = model.aic(x)
        elif self.gof_type == 'bic':
            self._gof[r] = model.bic(x)

        # debug: finish the line started above
        if self.debug is True:
            sys.stdout.write('%s %s %s\n' % (
                self._gof[r], model.n_components, model.weights_.shape[0]))
# Fit a DPGMM on progressively larger random subsets of X_unlabeled and report
# the fit time, AIC/BIC and the number of components that actually received
# points.
for chunks in np.arange(1, opts.size, step=3):
    # Sample the specified number of points from X_unlabeled: the subset size
    # is the total of the first `chunks` entries of chunk_sizes.
    size = np.sum(chunk_sizes[:chunks])
    # Random subset without replacement. If `size` exceeds the number of rows,
    # the slice silently yields every row.
    indices = np.random.permutation(X_unlabeled.shape[0])[:size]
    X = X_unlabeled[indices]

    # Fit a Dirichlet process mixture of Gaussians using up to ten components
    dpgmm = DPGMM(n_components=10, alpha=10.0, covariance_type='full')
    print("fitting a model with", size, "data points")
    # NOTE(review): only the fit is placed inside the timing block; the
    # original nesting of the following print could not be recovered.
    with timeit():
        dpgmm.fit(X)
    print("Done!")

    print("AIC for this model & data: ", dpgmm.aic(X))
    print("BIC for this model & data: ", dpgmm.bic(X))

    Y_hat = dpgmm.predict(X)
    # Count the distinct labels that were assigned. The largest label
    # (np.max) is an index, not a count, and over-reports whenever some
    # components receive no points.
    print("Model assigned points to", np.unique(Y_hat).size, "components")

    # How can I best check this out?
    # (unfinished plotting sketch, kept for reference)
    # color_iter = itertools.cycle(['r', 'g', 'b', 'c', 'm'])
    # for i, (clf, title) in enumerate([(gmm, 'GMM'),
    #                                   (dpgmm, 'Dirichlet Process GMM')]):
    #     splot = plt.subplot(2, 1, 1 + i)
    #     Y_ = clf.predict(X)
    #     for i, (mean, covar, color) in enumerate(zip(
    #             clf.means_, clf._get_covars(), color_iter)):
    #         v, w = linalg.eigh(covar)
    #         u = w[0] / linalg.norm(w[0])