def _fit_dpgmm(self, x): # clustering k = max(self.crange) for r in xrange(self.repeats): # info if self.debug is True: print '\t[%s][c:%d][r:%d]' % (self.clus_type, k, r + 1), # fit and evaluate model model_kwargs = {} if 'alpha' in self.clus_kwargs: model_kwargs.update(alpha=self.clus_kwargs['alpha']) if 'conv_thresh' in self.clus_kwargs: model_kwargs.update(thresh=self.clus_kwargs['conv_thresh']) if 'max_iter' in self.clus_kwargs: model_kwargs.update(n_iter=self.clus_kwargs['max_iter']) model = DPGMM(n_components=k, covariance_type=self.cvtype, **model_kwargs) model.fit(x) self._labels[r] = model.predict(x) self._parameters[r] = model.means_ self._ll[r] = model.score(x).sum() # evaluate goodness of fit for this run #self._gof[r] = self.gof(x, self._ll[r], k) if self.gof_type == 'aic': self._gof[r] = model.aic(x) if self.gof_type == 'bic': self._gof[r] = model.bic(x) # debug if self.debug is True: print self._gof[r], model.n_components, model.weights_.shape[0]
def main():
    """Search a fixed grid of ``alpha`` values for a DPGMM on one input file.

    Command line: ``<infile> <N> <num_random>``

    * ``infile``     -- tab-separated numeric file; the last seven columns
      of every row are dropped before fitting.
    * ``N``          -- ``n_components`` passed to every ``DPGMM``.
    * ``num_random`` -- number of rows drawn (with replacement) for fitting.

    Columns are standardised with the mean and standard deviation of the
    full data set (zero deviations are replaced by 1.0 to avoid division
    by zero).  For each alpha in ``[0.01, 0.1, 1, 10]`` a diagonal
    covariance ``DPGMM`` is fitted and scored by the mean of
    ``igmm.score`` over the sample.  The progress lines are printed and
    also appended to ``parameter_search_results.txt``.

    Returns:
        0 on success, 1 when the argument count is wrong (after printing
        the module docstring).
    """
    if len(sys.argv) != 4:
        print(__doc__)
        return 1

    infile = sys.argv[1]
    N = int(sys.argv[2])
    num_random = int(sys.argv[3])

    print("Reading in", infile)
    fullarr = np.loadtxt(fileinput.input(infile), delimiter='\t')[:, :-7]

    # Column statistics come from the full data set, before subsampling.
    means = fullarr.mean(axis=0)
    stds = fullarr.std(axis=0)
    stds[stds == 0] = 1.0

    rows = np.random.choice(fullarr.shape[0], num_random, replace=True)
    fullarr = (fullarr[rows, :] - means) / stds

    report = []
    print("Parameter searching...")

    # Start from -inf so that the first finite score always wins; a fixed
    # floor such as -100000 would leave no alpha selected whenever every
    # mean log-likelihood happens to fall below it.
    best_score = float('-inf')
    best_alpha = -1  # stays -1 if no score compares greater (e.g. all NaN)

    for alpha in [0.01, 0.1, 1, 10]:
        header = "Learning infinite GMM with N={}, alpha={}".format(N, alpha)
        print(header)
        report.append(header)

        igmm = DPGMM(covariance_type='diag', n_components=N, alpha=alpha,
                     init_params='wmc')
        igmm.fit(fullarr)
        score = np.mean(igmm.score(fullarr))

        result = '{}: {} with {} clusters'.format(
            alpha, score, igmm.n_components)
        print(result)
        report.append(result)

        if score > best_score:
            best_score = score
            best_alpha = alpha

    summary = 'Best alpha={}, score={}'.format(best_alpha, best_score)
    print(summary)
    report.append(summary)

    # Append so that results of successive searches accumulate in one file.
    with open('parameter_search_results.txt', 'a') as outf:
        outf.write('\n'.join(report) + '\n')
    return 0
def get_best_dpgmm(X, num_c, cv_type, alpha, iters, n_init, rand_state=None):
    """Fit ``n_init`` DPGMMs on ``X`` and return the one with the lowest BIC.

    Args:
        X: data array; ``X.shape[0]`` is used as the number of samples.
        num_c: ``n_components`` passed to ``DPGMM``.
        cv_type: ``covariance_type`` passed to ``DPGMM``.
        alpha: ``alpha`` passed to ``DPGMM``.
        iters: accepted for interface compatibility; not used.
        n_init: number of models to fit.
        rand_state: ``random_state`` passed to every ``DPGMM``.

    Returns:
        ``[labels, probabilities, bic, log_prob]`` of the best model, where
        ``labels`` is ``predict(X)``, ``probabilities`` is
        ``predict_proba(X)``, ``bic`` is ``bic(X)`` and ``log_prob`` is
        ``np.sum(score(X))``.  When no model improves on an infinite BIC
        (in particular when ``n_init`` is 0) the placeholders are returned
        instead: two zero vectors of length ``X.shape[0]`` and ``None``
        for both ``bic`` and ``log_prob``.
    """
    # NOTE(review): ``iters`` is never forwarded to DPGMM -- confirm whether
    # ``n_iter=iters`` was intended.
    # NOTE(review): the same ``rand_state`` is handed to every restart; if
    # it is a fixed seed the restarts are presumably identical -- confirm.
    best_bic = np.inf
    bic_dpgmm = None
    lbl_vec = np.zeros(X.shape[0])
    prob_vec = np.zeros(X.shape[0])
    log_prob_dpgmm = None

    for _ in range(n_init):
        dpgmm = DPGMM(n_components=num_c, covariance_type=cv_type,
                      alpha=alpha, random_state=rand_state)
        dpgmm.fit(X)
        b = dpgmm.bic(X)
        if b < best_bic:
            # Remember the new threshold; without this update every run
            # beats the initial infinity and the last run always wins.
            best_bic = b
            bic_dpgmm = b
            lbl_vec = dpgmm.predict(X)
            prob_vec = dpgmm.predict_proba(X)
            log_prob_dpgmm = np.sum(dpgmm.score(X))

    return [lbl_vec, prob_vec, bic_dpgmm, log_prob_dpgmm]
def main():
    """Train a DPGMM on a random sample of the input files and pickle it.

    Command line: ``<glob-pattern> <outfile> <N> <alpha>``

    * ``glob-pattern`` -- pattern expanded with ``glob``; all matching
      tab-separated numeric files are read as one array, and the last
      seven columns of every row are dropped.
    * ``outfile``      -- prefix of the pickle file; the sample size is
      appended, giving ``<outfile>_<num_lines>``.
    * ``N``            -- ``n_components`` of the ``DPGMM``.
    * ``alpha``        -- ``alpha`` of the ``DPGMM``.

    Columns are standardised with the mean and standard deviation of the
    full data set (zero deviations are replaced by 1.0 to avoid division
    by zero); 10000 rows are then drawn with replacement for fitting.

    Returns:
        0 on success, 1 when the argument count is wrong (after printing
        the module docstring) or when the pattern matches no file.
    """
    if len(sys.argv) != 5:
        print(__doc__)
        return 1

    pattern = sys.argv[1]
    infiles = glob(pattern)
    outfile = sys.argv[2]
    N = int(sys.argv[3])
    alpha = float(sys.argv[4])

    if not infiles:
        # fileinput.input([]) would silently fall back to reading stdin.
        sys.stderr.write("No input files match {!r}\n".format(pattern))
        return 1

    print("Reading in", len(infiles), "files")
    fullarr = np.loadtxt(fileinput.input(infiles), delimiter='\t')[:, :-7]

    # Column statistics come from the full data set, before subsampling.
    means = fullarr.mean(axis=0)
    stds = fullarr.std(axis=0)
    stds[stds == 0] = 1.0

    num_lines = 10000
    rows = np.random.choice(fullarr.shape[0], num_lines, replace=True)
    fullarr = (fullarr[rows, :] - means) / stds

    print("Learning infinite GMM with N={}, alpha={}".format(N, alpha))
    igmm = DPGMM(covariance_type='diag', n_components=N, alpha=alpha,
                 init_params='wmc')
    igmm.fit(fullarr)

    print("Infinite GMM trained, saving")
    # num_lines is an int, so it has to be formatted into the file name;
    # plain ``+`` concatenation raises TypeError and loses the model.
    with open('{}_{}'.format(outfile, num_lines), 'wb') as out_model:
        pickle.dump(igmm, out_model)

    print("Score:", igmm.score(fullarr))
    print("Num Components:", igmm.n_components)
    return 0