def _fit_dpgmm(self, x): # clustering k = max(self.crange) for r in xrange(self.repeats): # info if self.debug is True: print '\t[%s][c:%d][r:%d]' % (self.clus_type, k, r + 1), # fit and evaluate model model_kwargs = {} if 'alpha' in self.clus_kwargs: model_kwargs.update(alpha=self.clus_kwargs['alpha']) if 'conv_thresh' in self.clus_kwargs: model_kwargs.update(thresh=self.clus_kwargs['conv_thresh']) if 'max_iter' in self.clus_kwargs: model_kwargs.update(n_iter=self.clus_kwargs['max_iter']) model = DPGMM(n_components=k, covariance_type=self.cvtype, **model_kwargs) model.fit(x) self._labels[r] = model.predict(x) self._parameters[r] = model.means_ self._ll[r] = model.score(x).sum() # evaluate goodness of fit for this run #self._gof[r] = self.gof(x, self._ll[r], k) if self.gof_type == 'aic': self._gof[r] = model.aic(x) if self.gof_type == 'bic': self._gof[r] = model.bic(x) # debug if self.debug is True: print self._gof[r], model.n_components, model.weights_.shape[0]
def fit_vel_profile_dpgmm(vel_profile, n_comps=5, dp=False):
    """Fit a (DP-)GMM to a 1-D velocity profile.

    The profile is treated as an unnormalised histogram over the time axis
    ``[0, 1]``: bin ``i`` contributes ``int(vel_profile[i] / sum * 1000)``
    copies of its time stamp, and the mixture is fitted to those samples.

    Args:
        vel_profile: 1-D sequence of velocity magnitudes.
        n_comps: number of mixture components (upper bound when ``dp``).
        dp: fit ``DPGMM`` when true, plain ``GMM`` otherwise.

    Returns:
        The fitted mixture model.

    Raises:
        ValueError: if the profile sums to zero, so it cannot be normalised.
    """
    n_total = 1000  # 1000 samples to fit

    # Work in floats: an integer profile would floor-divide to all zeros
    # under Python 2.
    profile = np.asarray(vel_profile, dtype=float)
    integral = np.sum(profile)
    if integral == 0:
        raise ValueError("vel_profile sums to zero; cannot build samples")

    # vel_profile is a 1D array, convert it to samples on the time axis
    t = np.linspace(0, 1, len(profile))

    # Sample counts must be integers; truncate like the historical
    # np.ones(float) behaviour and drop bins that contribute nothing.
    counts = (profile / integral * n_total).astype(int)
    counts[counts < 0] = 0

    # One vectorised repeat replaces the per-bin concatenate loop.
    data = np.repeat(t, counts)
    fit_data = data.reshape(-1, 1)

    # fit Dirichlet-Process Gaussian Mixture Model,
    # something wrong with the module? The clusters seem merged...
    if dp:
        model = DPGMM(n_components=n_comps, n_iter=1000, alpha=10)
    else:
        model = GMM(n_components=n_comps)
    model.fit(fit_data)
    return model
def cluster(self, dim, method='dpgmm', max_n_clusters=80, max_iter=300, refresh=True):
    '''
    Cluster the feature columns selected by ``dim`` with a Dirichlet-process
    Gaussian mixture and store the labels on ``self.clu``.

    dim is the dim index for clustering. ``method`` is accepted but not
    consulted by this implementation. Returns the predicted label array.
    '''
    print('clustering DPGMM')
    from sklearn.mixture import BayesianGaussianMixture as DPGMM

    mixture_options = dict(
        n_components=max_n_clusters,
        covariance_type='full',
        weight_concentration_prior=1e-3,
        weight_concentration_prior_type='dirichlet_process',
        init_params="kmeans",  # init can be "kmeans" or "random"
        max_iter=max_iter,
        random_state=0,
        verbose=1,
        verbose_interval=10,
    )
    mixture = DPGMM(**mixture_options)

    mixture.fit(self.fet[:, dim])
    label = mixture.predict(self.fet[:, dim])

    # publish the new membership and notify listeners
    self.clu.membership = label
    self.clu.__construct__()
    self.clu.emit('cluster')

    if refresh is True:
        self.set_data(self.fet, self.clu)
    return label
def clustering_algorithm(lengths, covs, kmers, algorithm='dirichlet', K=300,
                         max_epoch=25, t=0, seed=None, mu_pkl=None):
    """Cluster the input with the selected algorithm and return the labels.

    The data matrix is built from ``lengths``, ``covs`` and ``kmers``,
    reduced in dimension, and then handed to one of:

    * ``'sk-gmm'`` / ``'sk-dpgmm'`` -- scikit-learn GMM / DPGMM with ``K``
      full-covariance components;
    * ``'dirichlet'`` -- ``dirichlet_em`` followed by ``agglomerative``
      merging of the resulting centres (optionally pickling the raw result
      to ``mu_pkl``);
    * ``'ard'`` -- ``variational_em``.

    Raises ValueError for any other ``algorithm``.
    """
    # create matrix for clustering
    logging.info('Creating data matrix')
    data = create_matrix(lengths, covs, kmers)

    # project down dimension
    logging.info('Performing dimensionality reduction')
    data = reduce_dimensionality(data)

    # do the clustering
    logging.info('Starting clustering algorithm')

    if algorithm in ('sk-gmm', 'sk-dpgmm'):
        mixture_cls = GMM if algorithm == 'sk-gmm' else DPGMM
        mixture = mixture_cls(n_components=K, covariance_type='full', n_iter=500)
        mixture.fit(data)
        return mixture.predict(data)

    if algorithm == 'dirichlet':
        n_points = data.shape[0]
        mu_pred_dem, Sigma_pred_dem, asgn_dem, llik = dirichlet_em(
            data.T, K=K, n_minibatch=n_points, max_epoch=max_epoch, seed=seed)
        if mu_pkl:
            with open(mu_pkl, 'wb') as handle:
                pickle.dump((mu_pred_dem, asgn_dem), handle)

        # further compress stuff: map every raw cluster to its merged cluster
        merged = agglomerative(mu_pred_dem.T, t=t)
        relabel = dict(enumerate(merged))
        return np.array([relabel[raw] for raw in asgn_dem])

    if algorithm == 'ard':
        n_points = data.shape[0]
        _, _, assignments, _ = variational_em(
            data.T, K=K, n_minibatch=n_points, max_epoch=max_epoch)
        return assignments

    raise ValueError("Invalid algorithm name")
def Dirichlet(cluster_data, identification, iteration_number=1): print "In Dirichlet" for i in range(0, iteration_number): print "On iteration number ", i dirichlet = DPGMM(n_components=len(cluster_data)).fit(cluster_data) #paremeters= dirichlet.get_params #returns parameters of the algorithm as a whole from the fit predict = dirichlet.predict(cluster_data) n_clusters_ = len(set(predict)) - (1 if -1 in predict else 0) print('Estimated number of clusters with Dirichlet: %d' % n_clusters_) return _make_final_list(identification, predict)
def dpgmm(self, k=10, alpha=1.0):
    """Fit a DPGMM on ``self.X`` and cache the model, labels and centers.

    ``k`` is the maximum number of components the model may use. Returns
    ``self`` so calls can be chained.
    """
    mixture = DPGMM(n_components=k,
                    alpha=alpha,
                    random_state=self.random_seed)
    self.h = mixture.fit(self.X)
    self.Y = self.h.predict(self.X)
    self.k = k  # this is the max number of components in dpgmm
    self.centers = self.getCenters()  # TODO
    return self
def _dpgmm(fet, n_comp=8, max_iter=400):
    """Label the rows of ``fet`` with a Dirichlet-process Gaussian mixture.

    Args:
        fet: feature matrix passed to ``fit`` and ``predict``.
        n_comp: upper bound on the number of mixture components.
        max_iter: maximum number of iterations for the fit. Previously this
            argument was ignored and a hard-coded 100 was used.

    Returns:
        The predicted component label of every row in ``fet``.
    """
    from sklearn.mixture import BayesianGaussianMixture as DPGMM

    dpgmm = DPGMM(
        n_components=n_comp,
        covariance_type='full',
        weight_concentration_prior=1e-3,
        weight_concentration_prior_type='dirichlet_process',
        init_params="kmeans",  # init can be "kmeans" or "random"
        max_iter=max_iter,
        random_state=0,
        verbose=0,
        verbose_interval=10,
    )
    dpgmm.fit(fet)
    return dpgmm.predict(fet)
def test1(): print 'test1' model = VDPGMM(T=10, alpha=1, max_iter=50) X, Y = getXY('iris') model.fit(X) y = model.predict(X) print 'VDPGMM' print len(np.unique(y)), np.unique(y) print[np.sum(y == label) for label in np.unique(y)] from sklearn.mixture import DPGMM model = DPGMM(n_components=10, alpha=1, n_iter=50) model.fit(X) y = model.predict(X) print 'DPGMM' print len(np.unique(y)), np.unique(y) print[np.sum(y == label) for label in np.unique(y)]
def dpgmm_cluster(self, max_n_clusters=30, max_iter=300, verbose=False):
    """Cluster ``self.fet`` with a Dirichlet-process Gaussian mixture.

    The predicted labels are written to ``self.clu.membership``, the
    cluster object is rebuilt and a 'cluster' event is emitted. Returns the
    fitted mixture model.
    """
    from sklearn.mixture import BayesianGaussianMixture as DPGMM

    mixture_options = dict(
        n_components=max_n_clusters,
        covariance_type='full',
        weight_concentration_prior=1e-3,
        weight_concentration_prior_type='dirichlet_process',
        init_params="kmeans",  # init can be "kmeans" or "random"
        max_iter=max_iter,
        random_state=0,
        verbose=verbose,
        verbose_interval=10,
    )
    mixture = DPGMM(**mixture_options)
    mixture.fit(self.fet)

    self.clu.membership = mixture.predict(self.fet)
    self.clu.__construct__()
    self.clu.emit('cluster')
    return mixture
def get_best_dpgmm(X, num_c, cv_type, alpha, iters, n_init, rand_state=None):
    """Fit ``n_init`` DPGMMs and return the results of the lowest-BIC fit.

    Args:
        X: data matrix; ``X.shape[0]`` is the number of samples.
        num_c: maximum number of mixture components.
        cv_type: covariance type passed to ``DPGMM``.
        alpha: concentration parameter passed to ``DPGMM``.
        iters: number of iterations per fit (``n_iter``), as in the sibling
            plotting helpers. Previously accepted but never forwarded.
        n_init: number of independent fits to try.
        rand_state: random state passed to ``DPGMM``.

    Returns:
        ``[labels, probabilities, bic, log_prob]`` of the best fit. When no
        fit produced a finite improvement (for example ``n_init == 0``) the
        labels and probabilities are zero vectors and ``bic`` / ``log_prob``
        are ``None``.
    """
    best_bic = np.inf
    bic_dpgmm = None
    lbl_vec_dpgmm = np.zeros(X.shape[0])
    prob_vec_dpgmm = np.zeros(X.shape[0])
    log_prob_dpgmm = None

    for _ in xrange(n_init):
        dpgmm = DPGMM(n_components=num_c, covariance_type=cv_type,
                      alpha=alpha, n_iter=iters, random_state=rand_state)
        dpgmm.fit(X)
        b = dpgmm.bic(X)
        if b < best_bic:
            # Remember the new best score; without this every later fit
            # would overwrite the result regardless of its quality.
            best_bic = b
            bic_dpgmm = b
            lbl_vec_dpgmm = dpgmm.predict(X)
            prob_vec_dpgmm = dpgmm.predict_proba(X)
            log_prob_dpgmm = np.sum(dpgmm.score(X))

    return [lbl_vec_dpgmm, prob_vec_dpgmm, bic_dpgmm, log_prob_dpgmm]
def train_DPGMM(d, max_n_comp=100, max_n_iter=500): '''Imports Data, Trains a DPGMM, Generates predictions testing''' print "Training Model..." gmm = DPGMM(max_n_comp, n_iter=max_n_iter) start = timeit.default_timer() gmm.fit(d) end = timeit.default_timer() print "Training completed in %f seconds" % (end-start) print print "Converged: " print gmm.converged_ print return gmm
def select_model(model_key):
    """Return ``(model, model_key)`` for a short model identifier.

    Known keys: 'b', 'svc', 'nusvc', 'r', 'e', 'nn' and 'gmm'. For any other
    key the model is ``None``. The key is always echoed back unchanged.
    """
    def make_nusvc():
        print('selecting NuSVC')
        return NuSVC(probability=True)

    # (key, factory) pairs; factories are only invoked for the matching key
    registry = (
        ('b', lambda: GradientBoostingClassifier()),
        ('svc', lambda: SVC(probability=True, gamma='auto')),
        ('nusvc', make_nusvc),
        ('r', lambda: RandomForestClassifier(
            class_weight={'buy': 1, 'stay': .75})),
        ('e', lambda: ExtraTreesClassifier()),
        ('nn', lambda: KNeighborsClassifier()),
        ('gmm', lambda: DPGMM()),
    )

    model = None
    for key, factory in registry:
        if model_key == key:
            model = factory()
            break
    return model, model_key
def _Dirichlet(cluster_data, identification): print "In Dirichlet" for i in range(0, 3): print "i is ", i dirichlet = DPGMM(n_components=len(cluster_data)).fit(cluster_data) #paremeters= dirichlet.get_params #returns parameters of the algorithm as a whole from the fit predict = dirichlet.predict(cluster_data) n_clusters_ = len(set(predict)) - (1 if -1 in predict else 0) print('Estimated number of clusters with Dirichlet: %d' % n_clusters_) final = [] for x in range(0, len(identification)): final.append([identification[x], predict[x]]) print "this is what final sort of looked like" print final[:3] return final
def plot_num_iters_dpgmm(X, num_c, cv_type, alpha, max_iters, n_init):
    """Plot the best BIC of a DPGMM against the number of fit iterations.

    For every iteration count in ``1 .. max_iters - 1`` the model is fitted
    ``n_init`` times and the lowest BIC is kept.

    Args:
        X: data matrix to fit.
        num_c: maximum number of mixture components.
        cv_type: covariance type passed to ``DPGMM``.
        alpha: concentration parameter passed to ``DPGMM``.
        max_iters: exclusive upper bound of the iteration counts tried.
        n_init: number of fits per iteration count.

    Returns:
        The matplotlib figure holding the curve.
    """
    iteration_counts = np.arange(1, max_iters)
    bic = []
    for iters in iteration_counts:
        best_bic = np.inf
        for _ in xrange(n_init):
            # The original referenced the undefined names `comp` and `a`
            # here; the parameters are `num_c` and `alpha`.
            dpgmm = DPGMM(n_components=num_c, covariance_type=cv_type,
                          alpha=alpha, n_iter=iters)
            dpgmm.fit(X)
            b = dpgmm.bic(X)
            if b < best_bic:
                best_bic = b
        bic.append(best_bic)

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.plot(iteration_counts, bic)
    ax.set_title('BIC vs. Number of Iterations DPGMM')
    ax.set_xlabel('Number of iterations')
    ax.set_ylabel('BIC score')
    return fig
def plot_alpha_dpgmm(X, num_c, cv_type, alphas, iters, n_init):
    """Plot the best BIC of a DPGMM against the concentration ``alpha``.

    Every value in ``alphas`` is fitted ``n_init`` times with ``iters``
    iterations; the lowest BIC per value is plotted. Returns the figure.
    """
    def fit_and_score(concentration):
        # one independent fit, scored by BIC on the training data
        model = DPGMM(n_components=num_c, covariance_type=cv_type,
                      alpha=concentration, n_iter=iters)
        model.fit(X)
        return model.bic(X)

    bic = []
    for a in alphas:
        lowest = np.inf
        for _ in xrange(n_init):
            score = fit_and_score(a)
            if score < lowest:
                lowest = score
        bic.append(lowest)

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.plot(alphas, bic, 'bo-', lw=2)
    ax.set_title('BIC vs. Alpha DPGMM')
    ax.set_xlabel('Alpha')
    ax.set_ylabel('BIC score')
    return fig
def plotClustering(fullpath, order=1, sr=4, cutoff=.1, n_singv=3,
                   feature='chroma', dim_red='SVD', round_to=0, normalize=1,
                   scale=1, length=4, clustering='KMEANS'):
    """Plot a feature, its filtered/reduced variants and a learned codebook.

    The feature is extracted for ``fullpath``, low-pass filtered and reduced
    in dimension in several combinations. For each variant, windows of
    ``length`` consecutive frames are clustered (K-means with an
    elbow-selected K, or a DP-GMM) and three kinds of panels are drawn: the
    data with the cluster changes marked, the codebook, and -- for K-means
    only -- the BIC / inertia / silhouette curves used to choose K.

    The figure is written next to the input file, once at full size and
    once at a reduced size (``..._small.png``).

    Args:
        fullpath: path of the input file, extension included.
        order, sr, cutoff: parameters forwarded to ``lpf``.
        n_singv: number of components forwarded to ``dim_red_fn``.
        feature: name of the feature to extract.
        dim_red: name of the dimensionality reduction.
        round_to, normalize, scale: forwarded to ``extractFeature``.
        length: number of consecutive frames per clustering window.
        clustering: ``'KMEANS'`` or ``'DPGMM'``.

    Raises:
        ValueError: if ``clustering`` is not one of the supported names.
    """
    if clustering not in ('KMEANS', 'DPGMM'):
        # Previously this surfaced as a NameError deep inside the loop.
        raise ValueError(
            "clustering must be 'KMEANS' or 'DPGMM', got {!r}".format(
                clustering))

    feat = {}
    print(
        'Analyzing {} with feature {}, order {}, sr {}, cutoff {}, '
        'n_singv {}, scale {} normalize {}, round_to {}'.format(
            fullpath, feature, order, sr, cutoff, n_singv, scale, normalize,
            round_to))

    # extract filename, filepath and beat aligned feature
    filename, file_ext = os.path.splitext(fullpath)

    # extract filter and apply pre-processing
    feat[feature], beat_times = extractFeature(
        filename, file_ext, feature, scale, round_to, normalize,
        beat_sync=True, save=True)
    feat['LPF'] = lpf(feat[feature], cutoff, sr, order)
    feat[dim_red] = dim_red_fn(dim_red, feat[feature], n_singv)
    feat['{}(LPF)'.format(dim_red)] = dim_red_fn(dim_red, feat['LPF'], n_singv)
    feat['LPF({})'.format(dim_red)] = lpf(feat[dim_red], cutoff, sr, order)
    feat['{}-LPF'.format(feature)] = feat[feature] - feat['LPF']
    feat['LPF({}-LPF)'.format(feature)] = lpf(
        feat['{}-LPF'.format(feature)], cutoff, sr, order)
    feat['{}(LPF({}-LPF))'.format(dim_red, feature)] = dim_red_fn(
        dim_red, feat['LPF({}-LPF)'.format(feature)], n_singv)

    # create vars for plotting
    ts = np.arange(0, len(feat[feature]))
    step_size = max(1, int(len(ts) * .01))
    fig = plt.figure(figsize=(98, 64))
    fig.suptitle('feature {} order {}, cutoff {}, sr {}'.format(
        feature, order, cutoff, sr))
    # two grid rows per plotted variant (6 variants -> 12 rows)
    gs = mpl.gridspec.GridSpec(12, 4, width_ratios=[1, 1, 1, 1])

    # range of K explored by the K-means model selection
    K_MIN, K_MAX = 2, 16

    i = 0
    print("\tPlot data and pre-processing")
    for name in (feature,
                 '{}-LPF'.format(feature),
                 '{}(LPF)'.format(dim_red),
                 'LPF({})'.format(dim_red),
                 'LPF({}-LPF)'.format(feature),
                 '{}(LPF({}-LPF))'.format(dim_red, feature)):
        data = feat[name]

        # stack `length` consecutive frames into one flat vector per window
        data_wide = np.array([
            feat[name][m:m + length, :]
            for m in xrange(len(feat[name]) - length)
        ])
        data_wide = data_wide.reshape(
            data_wide.shape[0], data_wide.shape[1] * data_wide.shape[2])

        # build codebook using kmeans or DP-GMM
        if clustering == 'KMEANS':
            KM = [
                KMeans(n_clusters=n_k, init='k-means++').fit(data_wide)
                for n_k in xrange(K_MIN, K_MAX + 1)
            ]

            # compute scores to assess fit
            scores_bic = [computeBic(km, data_wide) for km in KM]
            scores_inertia = [km.inertia_ for km in KM]
            scores_silhouette = [
                silhouette_score(data_wide, km.labels_, metric='euclidean')
                for km in KM
            ]

            # get best clusters
            idx_best_bic = findElbow(
                np.dstack((xrange(K_MIN, K_MAX + 1), scores_bic))[0])
            idx_best_inertia = findElbow(
                np.dstack((xrange(K_MIN, K_MAX + 1), scores_inertia))[0])
            idx_best_silhouette = findElbow(
                np.dstack((xrange(K_MIN, K_MAX + 1), scores_silhouette))[0])
            idx_best = int(
                np.median(
                    (idx_best_bic, idx_best_inertia,
                     idx_best_silhouette))) + 1

            # get clusters and cluster allocations given best K
            k_best = idx_best + K_MIN
            centroids = KM[idx_best].cluster_centers_
            centroid_idx = KM[idx_best].labels_
        else:  # clustering == 'DPGMM'
            n_components = 12
            dpgmm = DPGMM(n_components=n_components, tol=1e-3, n_iter=32,
                          alpha=1000, covariance_type='diag', verbose=True)
            dpgmm.fit(data_wide)

            # The original also called silhouette_score() on `centroids`
            # here, before `centroids` was assigned, and then discarded the
            # result; that dead call is removed.

            # get clusters and cluster allocations
            k_best = dpgmm.means_.shape[0]
            centroids = dpgmm.means_
            centroid_idx = np.argmax(dpgmm.predict_proba(data_wide), axis=1)

        # plot data
        if data.shape[1] == 3:
            data = data.reshape(1, data.shape[0], data.shape[1])
        else:
            data = data.T
        ax = fig.add_subplot(gs[i, :])
        ax.set_title(name)
        ax.imshow(data, interpolation='nearest', origin='low', aspect='auto',
                  cmap=plt.cm.Oranges)
        xlabels = [
            "{}:{}".format(int(x / 60), int(x % 60))
            for x in beat_times[::step_size]
        ]
        ax.set_xticks(ts[::step_size])
        ax.set_xticklabels(xlabels, rotation=60)
        ax.grid(False)

        # plot clustering on raw feature: mark the frames where the cluster
        # label changes, keeping only the last of two adjacent changes
        changes = np.hstack(([True], centroid_idx[:-1] != centroid_idx[1:]))
        for c in xrange(changes.shape[0] - 1):
            if changes[c] and changes[c + 1]:
                changes[c] = False
        ax_twin = ax.twiny()
        ax_twin.set_xlim(ax.get_xlim())
        ax_twin.set_xticks(np.argwhere(changes)[:, 0])
        ax_twin.set_xticklabels(centroid_idx[changes])
        ax_twin.grid(False)

        # plot codebook (centroids)
        ax = fig.add_subplot(gs[i + 1, 0])
        ax.set_title(name)
        # `//` keeps the reshape dimensions integral (same result as the
        # Python 2 integer `/` the original relied on)
        if centroids.shape[1] == 3:
            centroids = centroids.reshape(1, centroids.shape[0],
                                          centroids.shape[1])
        elif centroids.shape[1] == n_singv * length:
            centroids = centroids.reshape(1, centroids.shape[0] * length,
                                          centroids.shape[1] // length)
        else:
            centroids = centroids.reshape(centroids.shape[0] * length,
                                          centroids.shape[1] // length).T
        ax.imshow(centroids, interpolation='nearest', origin='low',
                  aspect='auto', cmap=plt.cm.Oranges)
        ax.set_xticks(xrange(0, centroids.shape[1], 4))
        ax.set_xticklabels(xrange(k_best))
        ax.grid(False)

        # plot elbow curve -- only K-means performs a sweep over K; the
        # DP-GMM branch defines none of the per-K scores, so drawing these
        # panels for it used to fail with a NameError
        if clustering == 'KMEANS':
            curves = (('BIC', scores_bic, idx_best_bic),
                      ('INERTIA', scores_inertia, idx_best_inertia),
                      ('SILHOUETTE', scores_silhouette, idx_best_silhouette))
            for col, (k, v, idx) in enumerate(curves, 1):
                ax = fig.add_subplot(gs[i + 1, col])
                ax.set_title('{}, {} best K {}'.format(name, k, idx + K_MIN))
                ax.plot(xrange(K_MIN, K_MAX + 1), v, 'b*-')
                ax.set_xlim((K_MIN, K_MAX + 1))
                ax.set_xlabel('Number of clusters')
                ax.set_ylabel('Score')
                ax.grid(True)
                ax.axvline(idx + K_MIN, color='r')
        i += 2
        # NOTE: a disabled scikit-image (slic / quickshift) segmentation
        # experiment that lived here as a bare string literal was removed.

    # save with large size (the original wrote this same file twice)
    plt.tight_layout()
    plt.savefig("{}_clustering_{}_{}_r_{}_n_{}_s_{}_l_{}_{}.png".format(
        filename, feature, cutoff, round_to, normalize, scale, length,
        dim_red))

    # save with smaller size
    fig.set_figwidth(36)
    fig.set_figheight(24)
    plt.tight_layout()
    plt.savefig("{}_clustering_{}_{}_r_{}_n_{}_s_{}_l_{}_{}_small.png".format(
        filename, feature, cutoff, round_to, normalize, scale, length,
        dim_red))
    plt.close(fig)
aspect='auto', origin='low', interpolation='nearest', cmap=plt.cm.plasma) axes[2].imshow(feats_log_normed, aspect='auto', origin='low', interpolation='nearest', cmap=plt.cm.plasma) fig.tight_layout() # Clustering with DP-GMM n_components = 32 dpgmm = DPGMM(n_components=n_components, tol=1e-3, n_iter=32, alpha=1000, covariance_type='diag', verbose=True) dpgmm.fit(feats_log.T) preds_proba = dpgmm.predict_proba(feats_log.T) preds = np.argmax(preds_proba, axis=1) np.unique(preds) # resynthesis by sampling from clusters resynthesis = dpgmm.means_[preds.astype(int), :] fig, axes = plt.subplots(4, 1, figsize=(18, 8)) axes[0].set_title(feature) axes[1].set_title('Prediction Probability') axes[2].set_title('Resynthesis') axes[3].set_title('Max(Prediction Probability)')
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis from sklearn.mixture import GMM, DPGMM, BayesianGaussianMixture, VBGMM from sklearn.svm import NuSVC, SVC # Useful for seeing all sklearn estimators that have `predict_prob` attribute estimators = all_estimators() for name, class_ in estimators: if hasattr(class_, 'predict_proba'): print(name) # Now pick and choose the ones you like estimators = { AdaBoostClassifier(): 'AdaBoost', BayesianGaussianMixture(): 'BayesianGaussianMixture', BernoulliNB(): 'BernoulliNB', DPGMM(): 'DPGMM', ExtraTreesClassifier(): 'ExtraTreesClassifier', GMM(): 'GMM', GaussianNB(): 'GaussianNB', GaussianProcessClassifier(): 'GaussianProcessClassifier', GradientBoostingClassifier(): 'GradientBoostingClassifier', KNeighborsClassifier(): 'KNeighborsClassifier', LabelPropagation(): 'LabelPropagation', LabelSpreading(): 'LabelSpreading', LinearDiscriminantAnalysis(): 'LinearDiscriminantAnalysis', LogisticRegression(): 'LogisticRegression', MLPClassifier(): 'MLPClassifier', NuSVC(): 'NuSVC', QuadraticDiscriminantAnalysis(): 'QuadraticDiscriminantAnalysis', RandomForestClassifier(): 'RandomForestClassifier', SGDClassifier(): 'SGDClassifier',
# Choose a max number of components for the algorithm max_components = 8 # Count the number of clusters the DPGMM chooses num_clusters = [] size_sample = [] # Try clustering at different sample sizes for iteration in range(int(np.floor(len(gaussian_data) / 10)) - 2): # Number of samples to use max_sample_value = ((iteration + 2) * 10) sample_set = gaussian_data[0:max_sample_value] size_sample.append(max_sample_value - 0) # Fit Dirichlet Process Gaussian Mixture Model dpgmm_model = DPGMM(n_components = max_components, n_iter=1000, alpha=1.0) fitted_dpgmm = dpgmm_model.fit(sample_set) dpgmm_predictions = fitted_dpgmm.predict(gaussian_data) num_clusters.append(len(set(dpgmm_predictions))) # Append predicted labels to dataframe gaussian_data['predicted'] = dpgmm_predictions # Give a unique color to each category unique_categories = list(set(gaussian_data['predicted'])) color_labels = ['r', 'y', 'g', 'b', 'c', 'm', 'k', 'w'] colors = [color_labels[unique_categories.index(i)] for i in gaussian_data['predicted']] # Plot predicted data plt.scatter(gaussian_data['x'], gaussian_data['y'], c=colors) plt.xlim([-12,12])
data_cluster_train = query_features(training, 15, 10, 23, data) data_cluster_test = query_features(testing, 15, 10, 23, data) data_cluster_train_ds = data_cluster_train """if you want clustering on the dissimilarity space uncomment below and change accordingly""" # print 'Calculating dissimilarity space for training queries...' # data_cluster_train_ds = sc.pdist(data_cluster_train, 'euclidean') # data_cluster_train_ds = sc.squareform(data_cluster_train_ds) # # plt.figure(1) # # plt.imshow(data_cluster_train_ds) # # plt.colorbar() # # plt.title('Initial dissimilarity') print 'Training a Dirichlet Process Gaussian Mixture model...' dpgmm = DPGMM(alpha=1.0, n_iter=100, n_components=50) dpgmm.fit(data_cluster_train_ds) prediction = dpgmm.predict(data_cluster_train_ds) clusters = np.unique(prediction) print 'Found %i clusters!' % clusters.shape[0] print clusters """create the reordered input data according to the clusters it is only needed if you want to visuallize the clustering afterwards""" #data_cluster = np.zeros((1, data_cluster_train.shape[1])) # each cluster is a list of lists that contains the indices # of the queries for each cluster each_cluster = [] for i in xrange(clusters.shape[0]):
def fit_dirichlet_gmm_to_points(points, n_components, mdl, ps=None,
                                num_iter=100, covariance_type='full',
                                mass_multiplier=1.0):
    """Fit a Dirichlet-process GMM to points and set up IMP Gaussians.

    One particle per mixture component is decorated as ``core::Gaussian``,
    ``atom::Mass`` and ``core::XYZR``. Missing particles are created in
    ``mdl`` and appended to ``ps``.

    points:           list of coordinates (python)
    n_components:     number of gaussians to create
    mdl:              IMP Model
    ps:               list of particles to be decorated. Particles are
                      appended to it when it holds fewer than
                      ``n_components``. ``None`` (the default) starts from a
                      fresh list on every call.
    num_iter:         maximum number of fitting iterations
    covariance_type:  covariance type for the gaussians, passed through to
                      scikit-learn
    mass_multiplier:  multiply the weights of all the gaussians by this value

    Returns the list of decorated particles (``ps`` itself when one was
    supplied).
    """
    # A literal `[]` default would be shared between calls and keep the
    # particles of earlier fits, so the list is created per call instead.
    if ps is None:
        ps = []

    # BayesianGaussianMixture replaces DPGMM in newer scikit-learn releases
    new_sklearn = True
    try:
        from sklearn.mixture import BayesianGaussianMixture
    except ImportError:
        from sklearn.mixture import DPGMM
        new_sklearn = False

    # create and fit GMM
    print('using dirichlet prior')
    if new_sklearn:
        gmm = BayesianGaussianMixture(
            weight_concentration_prior_type='dirichlet_process',
            n_components=n_components,
            max_iter=num_iter,
            covariance_type=covariance_type)
    else:
        gmm = DPGMM(n_components=n_components,
                    n_iter=num_iter,
                    covariance_type=covariance_type)
    gmm.fit(points)

    # convert format to core::Gaussian
    if not new_sklearn:
        # old API exposes the precisions under a different attribute name
        gmm.precisions_ = gmm.precs_

    for ng in range(n_components):
        invcovar = gmm.precisions_[ng]
        covar = np.linalg.inv(invcovar)
        if covar.size == 3:
            covar = np.diag(covar).tolist()
        else:
            covar = covar.tolist()
        center = list(gmm.means_[ng])
        weight = mass_multiplier * gmm.weights_[ng]

        if ng >= len(ps):
            ps.append(IMP.Particle(mdl))
        shape = IMP.algebra.get_gaussian_from_covariance(
            covar, IMP.algebra.Vector3D(center))
        g = IMP.core.Gaussian.setup_particle(ps[ng], shape)
        IMP.atom.Mass.setup_particle(ps[ng], weight)
        # radius taken from the largest variance of the gaussian
        IMP.core.XYZR.setup_particle(ps[ng], sqrt(max(g.get_variances())))
    return ps
def dpgmm_simple(X, init_numC, random_state):
    """Cluster ``X`` with a DPGMM and report how many labels were used.

    ``init_numC`` is the component bound given to the model. Returns
    ``(number_of_distinct_labels, labels)``.
    """
    mixture = DPGMM(n_components=init_numC,
                    n_iter=100,
                    tol=0.000001,
                    random_state=random_state)
    mixture.fit(X)
    assignments = mixture.predict(X)
    return len(np.unique(assignments)), assignments
# In[4]: train_dataset = train.values X = train_dataset[:, 2:] y = train_dataset[:, 1] y = y.astype('int') test_dataset = test.values X_test = test_dataset[:, 2:] print(type(X_test)) print('X.shape, y.shape, X_test.shape', X.shape, y.shape, X_test.shape) # In[5]: df = pd.DataFrame({"SK_ID_CURR": df['SK_ID_CURR']}) print('dirichlet process gaussian mixture begins****************') dpgmm = DPGMM(n_components=3) print('fitting****************') dpgmm_train = dpgmm.fit(X, y) print('predicting on train****************') dpgmm_X_prediction = dpgmm.predict_proba(X)[:, 1] print('predicting on test****************') dpgmm_X_test_prediction = dpgmm.predict_proba(X_test)[:, 1] tr_te_concatenated = np.concatenate( [dpgmm_X_prediction, dpgmm_X_test_prediction]) df['dirichlet_process_gaussian_mixture'] = tr_te_concatenated print('final tr_te shape', df.shape) print(df.head()) df.to_csv('dirichlet_process_gaussian_mixture_tr_te.csv', index=False)
def run_all_classifiers(X_train, X_test, y_train, y_test,
                        print_output_scores_to_csv=False,
                        output_scores_csv_file_suffix='',
                        print_only_table=False):
    """Fit and score a fixed list of scikit-learn estimators.

    NOTE: a second function with the same name is defined further down and
    replaces this one when the module is loaded.

    Args:
        X_train, X_test, y_train, y_test: The train and test datasets
            (DataFrames for X, Series for y).
        print_output_scores_to_csv: If True the output scores are written
            to ``output_scores<suffix>.csv``.
        output_scores_csv_file_suffix: Suffix to be added to the csv file
            name just before the .csv extension. Normally describing the
            run that is being performed.
        print_only_table: forwarded to ``fit_predict_plot``.

    Returns:
        dataset: Returns output scores dataset.
    """
    assert isinstance(X_train, pd.core.frame.DataFrame)
    assert isinstance(X_test, pd.core.frame.DataFrame)
    assert isinstance(y_train, pd.core.frame.Series)
    assert isinstance(y_test, pd.core.frame.Series)
    assert isinstance(print_output_scores_to_csv, bool)
    assert isinstance(output_scores_csv_file_suffix, object)

    import time

    # The list below was generated with sklearn's all_estimators(), see
    # https://stackoverflow.com/questions/42160313/how-to-list-all-classification-regression-clustering-algorithms-in-scikit-learn
    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.ensemble import BaggingClassifier
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.gaussian_process import GaussianProcessClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.linear_model import SGDClassifier
    from sklearn.mixture import BayesianGaussianMixture
    from sklearn.mixture import DPGMM
    from sklearn.mixture import GaussianMixture
    from sklearn.mixture import GMM
    from sklearn.mixture import VBGMM
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.naive_bayes import GaussianNB
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.neural_network import MLPClassifier
    from sklearn.semi_supervised import LabelPropagation
    from sklearn.semi_supervised import LabelSpreading
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier

    # (name, estimator) pairs in the original order. MultinomialNB, NuSVC
    # and XGBClassifier were commented out of the original list.
    models = [
        ('AdaBoostClassifier', AdaBoostClassifier()),
        ('BaggingClassifier', BaggingClassifier()),
        ('BayesianGaussianMixture', BayesianGaussianMixture()),
        ('BernoulliNB', BernoulliNB()),
        ('CalibratedClassifierCV', CalibratedClassifierCV()),
        ('DPGMM', DPGMM()),
        ('DecisionTreeClassifier', DecisionTreeClassifier(random_state=SEED)),
        ('ExtraTreesClassifier', ExtraTreesClassifier(random_state=SEED)),
        ('GMM', GMM()),
        ('GaussianMixture', GaussianMixture()),
        ('GaussianNB', GaussianNB()),
        ('GaussianProcessClassifier', GaussianProcessClassifier()),
        ('GradientBoostingClassifier', GradientBoostingClassifier()),
        ('KNeighborsClassifier', KNeighborsClassifier()),
        ('LabelPropagation', LabelPropagation()),
        ('LabelSpreading', LabelSpreading()),
        ('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis()),
        ('LogisticRegression', LogisticRegression()),
        ('LogisticRegressionCV', LogisticRegressionCV()),
        ('MLPClassifier', MLPClassifier()),
        ('QuadraticDiscriminantAnalysis', QuadraticDiscriminantAnalysis()),
        ('RandomForestClassifier', RandomForestClassifier(random_state=SEED)),
        ('SGDClassifier', SGDClassifier()),
        ('SVC', SVC()),
        ('VBGMM', VBGMM()),
    ]

    output_scores_df = fit_predict_plot(X_train, X_test, y_train, y_test,
                                        models, print_only_table)

    if print_output_scores_to_csv:
        # The closing parenthesis of to_csv() was missing here, which made
        # the whole module a syntax error.
        # NOTE(review): strftime() only changes the name when the suffix
        # contains '%' directives -- confirm whether a timestamp was meant.
        output_scores_df.to_csv(
            time.strftime('output_scores'
                          + str(output_scores_csv_file_suffix) + '.csv'))

    return output_scores_df


def run_all_classifiers(X_train, X_test, y_train, y_test, print_details=True):
    """Run all classifiers of sklearn.

    Every estimator returned by ``all_estimators(type_filter='classifier')``
    is fitted on the train set and scored on both sets, except
    MultinomialNB, NuSVC, RadiusNeighborsClassifier and
    GaussianProcessClassifier, which are skipped.

    Args:
        X_train, X_test, y_train, y_test: The train and test datasets.
        print_details: if true, print details of all models and save the
            csv table; if false, only the summary table is built.

    Returns:
        dataset: Returns output scores dataset.
    """
    assert isinstance(X_train, pd.core.frame.DataFrame)
    assert isinstance(X_test, pd.core.frame.DataFrame)
    assert isinstance(y_train, pd.core.frame.Series)
    assert isinstance(y_test, pd.core.frame.Series)
    assert isinstance(print_details, bool)
    log_method_execution_time(log_funcname())

    from sklearn.utils.testing import all_estimators
    import sklearn.metrics
    import time
    # NOTE(review): RANDOM_SEED is imported but the module-level SEED is
    # what gets applied below -- confirm which one is intended.
    from src.util.acq_util import RANDOM_SEED

    # https://stackoverflow.com/questions/42160313/how-to-list-all-classification-regression-clustering-algorithms-in-scikit-learn
    models = all_estimators(type_filter='classifier')

    skipped = ('MultinomialNB', 'NuSVC', 'RadiusNeighborsClassifier',
               'GaussianProcessClassifier')
    separator = ('------------------------------------------------------'
                 '------------------------')

    # one column per estimator name, one row per reported metric
    output_scores_dataset = pd.DataFrame(
        index=['Precision 0', 'Recall 0', 'F1-Score 0', 'Support 0',
               'Precision 1', 'Recall 1', 'F1-Score 1', 'Support 1'],
        columns=list(zip(*models))[0])

    for name, model in models:
        # the header is printed for skipped estimators as well, as before
        if print_details is True:
            print(separator)
            print(name)
            print(separator)

        if name in skipped:
            continue

        model = model()
        if 'random_state' in model.get_params():
            model.random_state = SEED

        # Fitting the model.
        model.fit(X_train, y_train)

        # Measuring accuracy.
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
        output_scores_dataset = class_compute_accuracy(
            y_train, y_train_pred, output_scores_dataset,
            ['Accuracy on the train set', name], print_details)
        output_scores_dataset = class_compute_accuracy(
            y_test, y_test_pred, output_scores_dataset,
            ['Accuracy on the test set', name], print_details)

        # Plotting confusion matrix.
        output_scores_dataset = class_compute_plot_confusion_matrix(
            y_test, y_test_pred, output_scores_dataset, name, print_details)

        # Showing classification report.
        if print_details is True:
            print(sklearn.metrics.classification_report(y_test, y_test_pred))

        # Printing scores to output dataset.
        output_scores_dataset = class_compute_recall_precision_f1(
            y_test, y_test_pred, output_scores_dataset, name)

    # Can use idxmax with axis=1 to find the column with the greatest value
    # on each row.
    output_scores_dataset['Max Value'] = output_scores_dataset.apply(max, axis=1)

    if print_details is True:
        output_scores_dataset.to_csv('output_scores' + '.csv')

    return output_scores_dataset


def train_test_split_for_classification(dataset, label, test_size,
                                        random_state=SEED):
    """Split ``dataset`` into stratified train/test features and target.

    Selects X and y, considering that y has been renamed to ``label``: X is
    every column except ``label`` and y is the ``label`` column.

    Args:
        dataset: DataFrame holding the features and the target column.
        label: name of the target column.
        test_size: fraction of rows for the test set (float).
        random_state: seed for the split.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    from sklearn.model_selection import train_test_split

    assert isinstance(dataset, pd.core.frame.DataFrame)
    assert isinstance(test_size, float)
    assert isinstance(random_state, int)

    X = dataset.loc[:, dataset.columns != label]
    # Use the same column that was excluded from X; the original read an
    # unrelated name `g_label` here instead of the `label` parameter.
    y = dataset[label]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y)

    log_print('X_train: {}'.format(X_train.shape))
    log_print('y_train: {}'.format(y_train.shape))
    log_print('X_test: {}'.format(X_test.shape))
    log_print('y_test: {}'.format(y_test.shape))

    return (X_train, X_test, y_train, y_test)
# print(pca.n_components_) # print(pca.explained_variance_ratio_[0:3]) print(pca.reconstruction_err_) fig = plt.figure(figsize=(20, 20)) ax = fig.add_subplot(111, projection='3d') plt.scatter(datat[:, 0], datat[:, 1], zs=datat[:, 2], c=labels, marker='o') plt.show() plt.close() #km = KMeans(n_clusters=5) km = DPGMM(n_components=7, covariance_type='tied') clabels = km.fit_predict(datat) # for ex, lab in zip(exid, clabels): # print(ex, lab) fig = plt.figure(figsize=(20, 20)) ax = fig.add_subplot(111, projection='3d') plt.scatter(datat[:, 0], datat[:, 1], zs=datat[:, 2], c=clabels, marker='o') plt.show() plt.close()
angle=angle, color='m', alpha=0.5, clip_box=ax.bbox) ax.add_artist(e) ax1_min, ax1_max, ax2_min, ax2_max = plt.axis() plt.xlim((x1_min, x1_max)) plt.ylim((x2_min, x2_max)) plt.title(u'GMM', fontsize=20) plt.grid(True) # DPGMM n_components = 3 dpgmm = DPGMM(n_components=n_components, alpha=1, covariance_type='full', random_state=0) dpgmm.fit(x) centers = dpgmm.means_ covs = dpgmm._get_covars() print 'DPGMM均值 = \n', centers print 'DPGMM方差 = \n', covs y_hat = dpgmm.predict(x) # print y_hat ax = plt.subplot(212) grid_hat = dpgmm.predict(grid_test) grid_hat = grid_hat.reshape(x1.shape) plt.pcolormesh(x1, x2, grid_hat, cmap=cm) plt.scatter(x[:, 0], x[:, 1], s=30, c=y, cmap=cm, marker='o')
# Left panel: the unprocessed input image
plt.figure()
plt.subplot(1, 2, 1)
plt.imshow(img)
plt.title('Original Image')
# For the "th" ordering, add a leading batch axis and move the last
# (channel) axis to position 1.
# NOTE(review): only the moveaxis line is assumed to be inside this `if`;
# the nesting was reconstructed from a whitespace-collapsed source.
if K.image_dim_ordering() == "th":
    img = np.moveaxis( img.reshape((1, img.shape[0], img.shape[1], img.shape[2])), -1, 1)
img = vgg16.preprocess_input(img.astype('float32'))
""" Scaling activations to fit random initialization scheme"""
actvs = get_activations(model, layer, img).squeeze()
# divide by a tenth of the maximum, i.e. the largest activation becomes 10
actvs /= np.max(actvs) * 0.1
""" Clustering with dirichlet process Gaussian Mixture Model"""
dpgmm = DPGMM(n_components=50, alpha=1, verbose=2, tol=0.01, n_iter=250, min_covar=1e-6)
#dpgmm = BayesianGaussianMixture(n_components=50, covariance_type="diag", reg_covar = 1e-6,
# weight_concentration_prior_type="dirichlet_process",
# weight_concentration_prior=1, verbose=2,
# tol=0.01, max_iter=250, init_params='random',
# mean_precision_prior=actvs.std(),
# mean_prior=np.repeat(actvs.max()/5,actvs.shape[0]))
# Flatten the last two axes and transpose, so each row handed to the model
# is one position and each column one entry of the first axis
# (presumably positions x channels -- TODO confirm the activation layout).
dpgmm.fit( np.transpose(actvs.reshape(actvs.shape[0], actvs.shape[1] * actvs.shape[2])))
labels = dpgmm.predict( np.transpose(actvs.reshape(actvs.shape[0], actvs.shape[1] * actvs.shape[2])))
def __init__(self,
             cluster_method=2,
             cluter_tag=False,
             train_path=None,
             event_info_path=None,
             city_id=None):
    """Set the model hyper-parameters and optionally rebuild POI clusters.

    With ``cluter_tag`` left at ``False`` only the attributes are
    initialised.  With ``cluter_tag=True`` the locations of all events that
    occur in the training file are collected, clustered with the method
    selected by ``cluster_method`` and the result is pickled to the path
    configured in ``settings``.

    Args:
        cluster_method: 0 = DPGMM, 1 = GMM, 2 = K-means.
        cluter_tag: when equal to ``True``, run the clustering step.
        train_path: CSV file; column 1 of each row is an event id.
        event_info_path: CSV file; column 0 is the event id and column 3
            the space-separated coordinates of the event.
        city_id: passed to ``checkGeoScope`` to validate each location.

    The process exits with status 1 (``sys.exit``) when a location fails
    ``checkGeoScope`` or when ``cluster_method`` is not 0, 1 or 2.
    """
    self.loss_choice = 0  # 0:reg; 1:pairwise ranking
    self.ndim = 20
    self.tr_method = 0  # 0:SGD1; 1:SGD2
    self.cluster_method = cluster_method  # 0:DPGMM; 1:GMM; 2:K-means
    self.n_components = 20
    self.city_id = city_id

    # SGD
    self.niters1 = 10
    self.lr1 = 0.01
    self.lambda1 = 0.001
    self.neg_num1 = 5
    self.beta1 = 1
    self.alpha1 = 1
    self.ins_weight = [self.beta1, self.alpha1]

    pois = []
    # Kept as an equality test on purpose: truthy values other than
    # True/1 (for example a non-empty string) must not trigger clustering.
    if cluter_tag == True:
        # Both input files are closed as soon as they have been consumed,
        # including when sys.exit() is raised inside the loop.
        with open(train_path, "r") as train_fd:
            events = set(entry[1] for entry in csv.reader(train_fd))
        with open(event_info_path, "r") as info_fd:
            for entry in csv.reader(info_fd):
                event = entry[0]
                if event in events:
                    # A real list (not a lazy map object) so that the
                    # value can be stored and inspected more than once.
                    poi = [float(v) for v in entry[3].split(" ")]
                    pois.append(poi)
                    if not checkGeoScope(poi, self.city_id):
                        print('Invalic location')
                        sys.exit(1)

        if self.cluster_method == 0:
            cluster = DPGMM(n_components=500,
                            covariance_type='diag',
                            alpha=1,
                            n_iter=50)
            cluster.fit(pois)
            centers = removeDup(cluster.means_)
            outputCenterforVis(centers)
            self.n_components = len(centers)
            with open(settings["DPGMM_CLUSTER"], "wb") as cluster_fd:
                pickle.dump([centers, None], cluster_fd)
            self.model_path = settings["GEOMF"]
            # NOTE(review): second call with the same centers, kept to
            # preserve the original side effects - looks redundant.
            outputCenterforVis(centers)
        elif self.cluster_method == 1:
            cluster = GMM(n_components=self.n_components,
                          covariance_type='diag',
                          min_covar=1e-7,
                          n_init=10,
                          random_state=0,
                          n_iter=100)
            cluster.fit(pois)
            outputCenterforVis(cluster.means_)
            labels = deterClusterRel(pois, cluster.means_)
            #showNumInEachCluster(labels, self.n_components)
            dis_variances = calDisVariance(self.n_components, labels, pois)
            dis_variances = smoothVar(dis_variances)
            covars = smoothVar(cluster.covars_)
            with open(settings["GMM_CLUSTER"], "wb") as cluster_fd:
                pickle.dump([cluster.means_, covars, dis_variances],
                            cluster_fd)
        elif self.cluster_method == 2:
            cluster = KMeans(n_clusters=self.n_components,
                             max_iter=300,
                             init='k-means++')
            cluster.fit(pois)
            means, variances = calCenterCov(self.n_components,
                                            cluster.labels_, pois)
            outputCenterforVis(means)
            dis_variances = calDisVariance(self.n_components,
                                           cluster.labels_, pois)
            variances = smoothVar(variances)
            dis_variances = smoothVar(dis_variances)
            with open(settings["KMEANS_CLUSTER"], "wb") as cluster_fd:
                pickle.dump([means, variances, dis_variances], cluster_fd)
        else:
            print('Invalid choice of clustering method')
            sys.exit(1)
def em_stereo(self,n_component=1,dp=True,thresh_hold=0.4):
    """Snap every per-task parameter onto a component of a fitted mixture.

    For each entry of ``self.merge_var`` (indexed ``[var_idx][x_v][y_v]``,
    or ``[var_idx][x_v]`` when the entry has no second index) 200 normal
    samples are drawn per task, with the task's value as mean and
    ``log(1 + exp(uncertainty))`` as scale.  A mixture model is fitted on
    the pooled samples; each task's value is then replaced by the mean of
    the selected component and its uncertainty by
    ``log(exp(covariance) - 1)``.

    Args:
        n_component: number of mixture components to fit.
        dp: use ``DPGMM`` when true, ``GMM`` otherwise.
        thresh_hold: a component is only accepted when its mixture weight
            is strictly greater than this value.

    Side effects: overwrites ``self.merge_var`` and
    ``self.merge_uncertainty`` in place and accumulates the number of
    rejected components in ``self.num_params``.
    """
    self.num_params = 0 #The range of len(params)
    _step = 0
    for var_idx in tqdm(range(len(self.merge_var[0]))):
        for x_v in range(len(self.merge_var[0][var_idx])):
            print('Step %d'%_step,end='\r')
            _step += 1
            # len() of an entry without a second index raises TypeError,
            # which routes to the handler below.  Any other TypeError
            # raised inside the loop body is routed there as well.
            try:
                for y_v in range(len(self.merge_var[0][var_idx][x_v])):
                    #print('cluster weights ....%d'%var_idx)
                    # Pool 200 samples per task into a single column.
                    dist = []
                    for task_idx in range(len(self.merge_var)):
                        nor = np.random.normal(self.merge_var[task_idx][var_idx][x_v][y_v],np.log(1.0+np.exp(self.merge_uncertainty[task_idx][var_idx][x_v][y_v])),200)
                        dist.append(nor)
                    dist = np.array(np.asmatrix(np.concatenate(dist)).T)
                    if dp:
                        print('Initializing DPGMM%d ... '%_step,end='\r')
                        gmm = DPGMM( max_iter=1000, n_components=n_component, covariance_type='spherical')
                    else:
                        gmm = GMM( max_iter=200, n_components=n_component, covariance_type='spherical')
                    gmm.fit(dist)
                    new_idx_list = []
                    for task_idx in range(len(self.merge_var)):
                        #if dp:
                        #Strategy 1. Set threshold
                        predict_probability = gmm.predict_proba(np.array(self.merge_var[task_idx][var_idx][x_v][y_v]).reshape(-1,1))
                        # Walk the components from most to least probable
                        # and keep the first whose weight passes the
                        # threshold; rejected ones are zeroed out.
                        # NOTE(review): f_ is only cleared in the accepting
                        # branch, so this does not terminate when no
                        # component with weight > thresh_hold is reached.
                        f_ = True
                        while f_:
                            #if gmm.weights_[np.argmax(predict_probability)] > ( 1 / len(self.merge_var)):
                            if gmm.weights_[np.argmax(predict_probability)] > thresh_hold:
                                new_idx = np.argmax(predict_probability)
                                f_ = False
                            else:
                                predict_probability[0][np.argmax(predict_probability)] = 0.0
                                self.num_params += 1
                        #else:
                        #    new_idx = gmm.predict(np.array(self.merge_var[task_idx][var_idx][x_v][y_v]).reshape(-1,1))
                        #    if new_idx in new_idx_list: self.num_params += 1
                        # NOTE(review): the flattened source does not show
                        # whether the increment on the line above was
                        # commented out; it is treated as part of the
                        # disabled strategy, as in the handler below -
                        # confirm against version control.
                        new_idx_list.append(new_idx)
                        self.merge_var[task_idx][var_idx][x_v][y_v] = gmm.means_[new_idx]
                        self.merge_uncertainty[task_idx][var_idx][x_v][y_v] = np.log(np.exp(gmm.covariances_[new_idx]) - 1.0)
            except TypeError:
                # Same procedure for an entry addressed by [x_v] only.
                dist = []
                for task_idx in range(len(self.merge_var)):
                    nor = np.random.normal(self.merge_var[task_idx][var_idx][x_v],np.log(1.0+np.exp(self.merge_uncertainty[task_idx][var_idx][x_v])),200)
                    dist.append(nor)
                dist = np.array(np.asmatrix(np.concatenate(dist)).T)
                if dp:
                    print('Initializing DPGMM%d ... '%_step,end='\r')
                    # This branch caps the DPGMM at 200 iterations, while
                    # the branch above uses 1000.
                    gmm = DPGMM( max_iter=200, n_components=n_component, covariance_type='spherical')
                else:
                    gmm = GMM( max_iter=200, n_components=n_component, covariance_type='spherical')
                gmm.fit(dist)
                new_idx_list = []
                for task_idx in range(len(self.merge_var)):
                    #if dp:
                    #Strategy 1. Set threshold
                    predict_probability = gmm.predict_proba(np.array(self.merge_var[task_idx][var_idx][x_v]).reshape(-1,1))
                    # Same threshold-based selection (and the same
                    # termination caveat) as in the branch above.
                    f_ = True
                    while f_:
                        #if gmm.weights_[np.argmax(predict_probability)] > ( 1 / len(self.merge_var)):
                        if gmm.weights_[np.argmax(predict_probability)] > thresh_hold:
                            new_idx = np.argmax(predict_probability)
                            f_ = False
                        else:
                            predict_probability[0][np.argmax(predict_probability)] = 0.0
                            self.num_params += 1
                    #else:
                    #    new_idx = gmm.predict(np.array(self.merge_var[task_idx][var_idx][x_v]).reshape(-1,1))
                    #    if new_idx in new_idx_list:
                    #        self.num_params += 1
                    new_idx_list.append(new_idx)
                    self.merge_var[task_idx][var_idx][x_v] = gmm.means_[new_idx]
                    self.merge_uncertainty[task_idx][var_idx][x_v] = np.log(np.exp(gmm.covariances_[new_idx]) - 1.0)
def _st_smooth(self, var_idx, x_v, y_v=None, n_component=1, thresh_hold=0.3, dp=False): mixture_dist = [] for task_idx in range(self.num_task): if y_v is not None: mean = self.params_mean[task_idx][var_idx][x_v][y_v] var = self.transform_var( self.params_var[task_idx][var_idx][x_v][y_v]) else: mean = self.params_mean[task_idx][var_idx][x_v] var = self.transform_var( self.params_var[task_idx][var_idx][x_v]) mixture_dist.append({'kwargs': {'loc': mean, 'scale': var}}) alpha = 0.3 alpha_list = [(1 - alpha) / (self.num_task - 1)] * (self.num_task - 1) alpha_list.append(alpha) sample = create_mixture(mixture_dist, alpha_list=alpha_list) if dp: gmm = DPGMM(max_iter=1000, n_components=n_component, covariance_type='spherical') else: gmm = GMM(max_iter=500, n_components=n_component, covariance_type='spherical') gmm.fit(sample) new_idx_list = [] for task_idx in range(self.num_task): if y_v is not None: predict_probability = gmm.predict_proba( np.array( self.params_mean[task_idx][var_idx][x_v][y_v]).reshape( -1, 1)) else: predict_probability = gmm.predict_proba( np.array(self.params_mean[task_idx][var_idx][x_v]).reshape( -1, 1)) f_ = True while f_: if gmm.weights_[np.argmax(predict_probability)] > thresh_hold: new_idx = np.argmax(predict_probability) f_ = False else: predict_probability[0][np.argmax( predict_probability)] = 0.0 #self.num_merged_params += 1 if new_idx in new_idx_list: self.num_merged_params += 1 new_idx_list.append(new_idx) if y_v is not None: self.params_mean[task_idx][var_idx][x_v][y_v] = gmm.means_[ new_idx] self.params_var[task_idx][var_idx][x_v][ y_v] = self.retransform_var(gmm.covariances_[new_idx]) else: self.params_mean[task_idx][var_idx][x_v] = gmm.means_[new_idx] self.params_var[task_idx][var_idx][x_v] = self.retransform_var( gmm.covariances_[new_idx]) """