def calculate_GMM(self, df, coeff, tsne_results):
    """Cluster ``coeff`` with a Bayesian Gaussian mixture and annotate ``df``.

    Two columns are added to ``df``: ``clusters`` (the predicted component of
    each row) and ``probas`` (the per-row vector of component membership
    probabilities).  ``self.apply_P_AUC_V`` is then applied to every row, and
    the row with the highest ``Score`` in each cluster is handed to
    ``self.calculate_correlation``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with one row per sample of ``coeff``.  The ``clusters`` and
        ``probas`` columns are written onto the object passed in.
    coeff : array-like
        Feature matrix the mixture is fitted on.
    tsne_results : object
        Not used by this method; kept so the signature stays unchanged.

    Returns
    -------
    pandas.DataFrame
        The result of ``df.apply(self.apply_P_AUC_V, axis=1)``.  The output of
        ``self.calculate_correlation`` is not returned.
    """
    clf = mixture.BayesianGaussianMixture(
        n_components=self.dict_c['clusters'], covariance_type='full')
    clf.fit(coeff)
    clusters = clf.predict(coeff)
    probas = clf.predict_proba(coeff)

    df['clusters'] = clusters
    # Assign the whole column in one go (one probability vector per row).
    # Chained ``df['probas'].iloc[i] = ...`` writes through a temporary and is
    # not guaranteed to reach ``df``.
    df['probas'] = list(probas)

    df = df.apply(self.apply_P_AUC_V, axis=1)

    # Collect the best-scoring row of every cluster, then build the frame once.
    # ``DataFrame.append`` no longer exists in pandas >= 2.0.
    best_rows = []
    for _, members in df.groupby('clusters'):
        best_pos = np.argmax(np.array(members['Score']))
        best_rows.append(members.iloc[best_pos])
    df_groups = pd.DataFrame(best_rows, columns=df.columns)

    # The return value was only ever bound to locals; the call is kept for
    # whatever side effects it has.
    self.calculate_correlation(df_groups)
    return df
def _estimate_density(self, ds):
    """Fit a Gaussian mixture density model on the training input of ``ds``.

    The estimator class depends on ``self.variational``:
    ``BayesianGaussianMixture`` when truthy, ``GaussianMixture`` otherwise.
    Both are constructed with ``**self.opts``.

    Returns the fitted estimator.
    """
    training_input = self._get_training_input(ds)
    estimator_cls = (mixture.BayesianGaussianMixture if self.variational
                     else mixture.GaussianMixture)
    estimator = estimator_cls(**self.opts)
    # The input is moved to host memory and converted to a NumPy array
    # before fitting.
    estimator.fit(training_input.cpu().numpy())
    return estimator
def gmm_cluster(dec_vars, files_root_directory):
    """Fit a Bayesian Gaussian mixture to ``dec_vars`` and plot the result.

    Parameters
    ----------
    dec_vars : array-like
        Data the mixture is fitted on and whose labels are plotted.
    files_root_directory : object
        Passed through unchanged as the last argument of ``plot_results``.
    """
    # Parenthesised form prints the same text on Python 2 and is valid on
    # Python 3.
    print('Running clustering.')

    # ``n_components`` is not passed, so the estimator's default is used.
    dpgmm = mixture.BayesianGaussianMixture(covariance_type='full').fit(dec_vars)
    plot_results(dec_vars, dpgmm.predict(dec_vars), dpgmm.means_,
                 dpgmm.covariances_, 1, files_root_directory)
def cluster_datatrace(dt, n_components=10, n_init=1, excludes='_'):
    """Cluster the columns of ``dt`` selected by a regex and label each row.

    Columns whose name starts with the pattern ``excludes`` are left out of
    the fit (negative look-ahead anchored at the start of the name).  The
    predicted component of every row is written to ``dt['_cluster']``.
    Nothing is returned; ``dt`` is modified in place.
    """
    column_pattern = ''.join(['^(?!', excludes, ')'])
    selected = dt.filter(regex=column_pattern)

    model = mixture.BayesianGaussianMixture(
        n_components=n_components,
        covariance_type='full',
        max_iter=1000,
        n_init=n_init,
    )
    model.fit(selected)
    dt['_cluster'] = model.predict(selected)
def fit_gmm(path_times_list, num_components=2):
    """Fit a Bayesian Gaussian mixture to a flat collection of values.

    ``path_times_list`` is reshaped into a single-column matrix (one sample
    per row) before fitting.  Returns the fitted estimator.
    """
    samples = np.reshape(path_times_list, (-1, 1))
    estimator = mixture.BayesianGaussianMixture(n_components=num_components)
    return estimator.fit(samples)
def subCluster(clust_sub):
    """Cluster the features of ``clust_sub`` with a Dirichlet-process GMM.

    Features are obtained from ``featArray(clust_sub)``.  The predicted
    labels are reshaped to rows of 1024 entries before being returned, so the
    number of samples must be a multiple of 1024.
    """
    # Parenthesised form prints the same text on Python 2 and is valid on
    # Python 3.
    print('Loading subcluster...')

    X = featArray(clust_sub)
    dpgmm = mixture.BayesianGaussianMixture(
        n_components=5,
        covariance_type='full',
        n_init=1,
        max_iter=1000,
        init_params='kmeans',
        weight_concentration_prior_type='dirichlet_process',
    ).fit(X)

    labels = dpgmm.predict(X)
    return labels.reshape(-1, 1024)
def Fit(self, Y):
    """Fit a Bayesian Gaussian mixture to the one-dimensional data ``Y``.

    ``Y`` is reshaped to a ``(len(Y), 1)`` column and passed through
    ``self.preprocess`` before fitting.  The mixture uses
    ``self.init_compo`` components.  Returns the fitted estimator.
    """
    column = np.reshape(Y, (len(Y), 1))
    prepared = self.preprocess(column)
    estimator = mixture.BayesianGaussianMixture(
        n_components=self.init_compo,
        weight_concentration_prior=1,
        max_iter=100,
        tol=1e-8,
    )
    return estimator.fit(prepared)
def clusters(transcriptFile, demonstrations=None, transcripts=None): #for i in range (demonstrations.shape[0]): traj = demonstrations temporal_window = 2 traj = generate_transition_features(traj, temporal_window) print traj.shape n_components_range = range(5, 6) cv_types = ['full'] for cv_type in cv_types: for n_components in n_components_range: gmm = mixture.BayesianGaussianMixture(n_components=15, covariance_type='full', max_iter=1000, tol=1e-5, random_state=00) #gmm = cluster.AgglomerativeClustering(linkage = 'average', n_clusters = 6) start = time.time() results = gmm.fit(traj) end = time.time() #gmm.predict(traj[0].reshape(1,-1)) print "time taken: {}".format(start - end) #gmm = mixture.DPGMM(n_components = 7, covariance_type='diag', n_iter = 10000, tol= 1e-4) #gmm = mixture.GaussianMixture(n_components=n_components, max_iter = 10000,covariance_type=cv_type, tol = 1e-5, random_state = 500) results = gmm.predict(traj) best_gmm = gmm score = 0 cp_times = [] prev = 0 time_stamp = np.zeros((results.shape[0], 1)) for i in range(len(time_stamp)): time_stamp[i][0] = i results = results.reshape(-1, 1) results = np.concatenate((results, time_stamp), axis=1) new_segments = np.concatenate((results, traj), axis=1) new_segments = np.sort(new_segments, axis=0) print new_segments[1][1] current_label = new_segments[0][0] cluster_array = [] for i in range(new_segments.shape[0]): if (new_segments[i][0] != current_label): current_label = new_segments[i][0] subClusters(cluster_array) cluster_array = [] else: cluster_array.append(new_segments[i][1:]) #results = results.reshape(-1,1) #print "checking results {} {}" .format(results.shape, traj.shape) #traj = np.concatenate((traj, results), axis = 1) #gmm = mixture.GaussianMixture(n_components=5, max_iter = 10000,covariance_type='full', tol = 1e-5, random_state = 00) #gmm.fit(traj) #results = gmm.predict(traj) '''transition_points = []
def remove_dc_from_spad_gmm(h, n_components=4, weight_concentration_prior=1e0,
                            depth_values=None, axs=None):
    """Threshold and offset-correct a 1-D histogram using a Bayesian GMM.

    The strictly positive entries of ``h`` are clustered on their logarithm
    (optionally paired with the matching ``depth_values``).  The cluster with
    the smallest mean count is treated as noise.  A cutoff is placed midway
    between the largest noise entry and the smallest non-noise entry; entries
    at or below the cutoff are zeroed and the noise mean is subtracted from
    the remaining ones.

    Parameters
    ----------
    h : numpy.ndarray
        One-dimensional histogram (asserted).
    n_components, weight_concentration_prior
        Forwarded to ``BayesianGaussianMixture``.
    depth_values : numpy.ndarray, optional
        If given, indexed with the same ``h > 0`` mask and used as a second
        feature.
    axs : sequence of axes, optional
        If given, ``axs[0]`` gets the histogram with the cutoff line and
        ``axs[1]`` gets histograms of the noise / signal log-counts.

    Returns
    -------
    numpy.ndarray
        Float copy of ``h`` after thresholding and offset removal.

    Notes
    -----
    Prints the per-class means, the selected noise class and its size.
    """
    assert len(h.shape) == 1
    h_denoised = h.copy().astype('float')
    positive = h > 0
    counts = h[positive].copy()

    model = skmix.BayesianGaussianMixture(
        n_components=n_components,
        weight_concentration_prior=weight_concentration_prior)

    log_counts = np.log(counts)
    if depth_values is None:
        features = log_counts.reshape(-1, 1)
    else:
        features = np.stack([log_counts, depth_values[positive]], axis=-1)
    classes = model.fit_predict(features)

    # (mean count, class id) pairs; the smallest pair identifies the noise.
    class_means = [(np.mean(counts[classes == label]), label)
                   for label in np.unique(classes)]
    print(class_means)
    noise_class = min(class_means)[1]
    print(noise_class)

    is_noise = classes == noise_class
    noise_counts = counts[is_noise]
    signal_counts = counts[~is_noise]
    print(len(noise_counts))

    cutoff = (np.max(noise_counts) + np.min(signal_counts)) / 2

    if axs is not None:
        axs[0].bar(range(len(h)), h, log=True)
        axs[0].axhline(y=cutoff, color='r', linewidth=0.5)
        log_range = (np.min(log_counts), np.max(log_counts))
        h_noise, _ = np.histogram(np.log(noise_counts), bins=200,
                                  range=log_range)
        h_signal, _ = np.histogram(np.log(signal_counts), bins=200,
                                   range=log_range)
        axs[1].bar(range(len(h_noise)), h_noise)
        axs[1].bar(range(len(h_signal)), h_signal)

    h_denoised[h_denoised <= cutoff] = 0.
    dc = np.mean(noise_counts)
    h_denoised[h_denoised > cutoff] -= dc
    return h_denoised
def run_ES_SCOREplus(W, k, c=0.1):
    """Spectral clustering of the matrix ``W`` followed by a Bayesian GMM.

    Steps, as implemented:

    1. Row sums of ``W`` are regularised by ``c * max(row sum)`` and used to
       scale ``W`` symmetrically by the inverse square root.
    2. ``k + 1`` eigenpairs of the scaled matrix are computed with ``eigsh``
       (``which='LM'``).
    3. A feature matrix is built whose first ``k`` columns are the
       eigenvectors divided entrywise by the ``(k + 1)``-th eigenvector.  The
       eigenvalue-weighted ``(k + 1)``-th eigenvector is kept as an extra
       column only when ``eig_val[k] / eig_val[k - 1] < 1.2``.
    4. If ``(eig_val[0] - eig_val[1]) / eig_val[1] < c``, the feature matrix is
       restricted to columns ``1 .. k - 1``.
    5. A full-covariance ``BayesianGaussianMixture`` with ``k + 1`` components
       is fitted on the features.

    Returns
    -------
    tuple
        ``(labels, elapsed_seconds)``; the elapsed time covers everything up
        to and including the mixture fit, but not the final ``predict`` call.
    """
    start = time.time()
    r = k + 1

    degree = np.sum(W, axis=1)
    delta = c * max(degree)
    inv_sqrt = np.diag(1. / np.sqrt(np.add(delta, degree)))
    L = np.dot(np.dot(inv_sqrt, W), inv_sqrt)

    eig_val, eig_vect = eigsh(L, r, which='LM')
    tao = 0.2
    ratio = eig_val[k] / eig_val[k - 1]

    F = np.dot(eig_vect[:, :r], np.diag(eig_val[:r]))
    # Both original branches differ only in how many columns are retained.
    kept_columns = r if ratio < 1 + tao else r - 1
    F = F[:, :kept_columns]

    # Multiply by the reciprocal (rather than divide) to keep the exact same
    # floating-point results as the per-column loop.
    reciprocal_last = 1. / eig_vect[:, r - 1]
    for i in range(r - 1):
        F[:, i] = np.multiply(eig_vect[:, i], reciprocal_last)

    temp = (eig_val[0] - eig_val[1]) / eig_val[1]
    if temp < c:
        F = F[:, 1:(r - 1)]

    sp_kmeans = mixture.BayesianGaussianMixture(
        n_components=k + 1, covariance_type='full').fit(F)

    end = time.time()
    return sp_kmeans.predict(F), end - start
def vbgmm_clustering(dataset, parameters):
    """Cluster ``dataset`` with a variational Bayesian GMM and time the run.

    ``parameters["max_n_components"]`` is the number of mixture components.

    Returns
    -------
    tuple
        ``(labels, cpu_seconds, n_clusters)`` where ``cpu_seconds`` is the
        process CPU time spent on fit + predict and ``n_clusters`` comes from
        ``determine_n_clusters(labels)``.
    """
    cpu_before = time.process_time()
    model = mixture.BayesianGaussianMixture(
        n_components=parameters["max_n_components"])
    model.fit(dataset)
    labels = model.predict(dataset)
    cpu_after = time.process_time()

    cluster_count = determine_n_clusters(labels)
    return labels, cpu_after - cpu_before, cluster_count
def dpmm_calc_scores(model, train_dataset, eval_normal_dataset, eval_abn_dataset=None, args=None,
                     ret_metadata=False, dpmm_components=10, dpmm_downsample_fac=10, pt_dpmm_path=None):
    """
    Wrapper for extracting features for DNS experiment, given a trained DCEC model,
    a normal training dataset and two datasets for evaluation, a "normal" one and an "abnormal" one.

    :param model: A trained model
    :param train_dataset: "normal" training dataset, used to fit the DPMM
    :param eval_normal_dataset: "normal" or "mixed" evaluation dataset
    :param eval_abn_dataset: "abnormal" evaluation dataset (optional)
    :param args: command line arguments; ``args.ae_fn`` and ``args.ckpt_dir`` are read
                 when the fitted mixture is saved
    :param ret_metadata: if True, ``calc_p`` is asked for metadata of the normal evaluation
                         set and that metadata is returned as a third element
    :param dpmm_components: Truncation parameter for DPMM
    :param dpmm_downsample_fac: Downsampling factor (stride over training features) for DPMM fitting
    :param pt_dpmm_path: Path to a pretrained DPMM model; when given, no fitting is done
    :return: ``(dpmm_scores, gt)`` or ``(dpmm_scores, gt, metadata)``. ``gt`` is 1 for samples
             of the normal evaluation set and 0 for samples of the abnormal one.
    """
    # Feature extraction
    train_p = calc_p(model, train_dataset, args, ret_metadata=False)
    eval_p_ret = calc_p(model, eval_normal_dataset, args, ret_metadata=ret_metadata)
    if ret_metadata:
        eval_p_normal, metadata = eval_p_ret
    else:
        eval_p_normal = eval_p_ret

    p_vec = eval_p_normal
    eval_p_abn = None
    if eval_abn_dataset:
        # NOTE(review): with ret_metadata=True this call presumably returns a
        # (features, metadata) pair like the call above, which the
        # concatenation below does not unpack -- confirm intended handling.
        eval_p_abn = calc_p(model, eval_abn_dataset, args, ret_metadata=ret_metadata)
        p_vec = np.concatenate([eval_p_normal, eval_p_abn])

    print("Started fitting DPMM")
    if pt_dpmm_path is None:
        dpmm_mix = mixture.BayesianGaussianMixture(n_components=dpmm_components,
                                                   max_iter=500, verbose=1, n_init=1)
        dpmm_mix.fit(train_p[::dpmm_downsample_fac])
    else:
        dpmm_mix = load(pt_dpmm_path)

    dpmm_scores = dpmm_mix.score_samples(p_vec)

    # ``np.int`` was only an alias of the builtin ``int`` and no longer exists
    # in current NumPy; the builtin gives the same dtype.
    if eval_p_abn is not None:
        gt = np.concatenate([np.ones(eval_p_normal.shape[0], dtype=int),
                             np.zeros(eval_p_abn.shape[0], dtype=int)])
    else:
        gt = np.ones_like(dpmm_scores, dtype=int)

    try:  # Model persistence
        dpmm_fn = args.ae_fn.split('.')[0] + '_dpgmm.pkl'
        dpmm_path = os.path.join(args.ckpt_dir, dpmm_fn)
        dump(dpmm_mix, dpmm_path)
    except ModuleNotFoundError:
        print("Joblib missing, DPMM not saved")

    if ret_metadata:
        return dpmm_scores, gt, metadata
    else:
        return dpmm_scores, gt
def cluster(self, X_train):
    """Cluster ``X_train`` with a full-covariance Bayesian GMM.

    The model is first fitted with ``max_iter=400``.  While it reports
    non-convergence, it is refitted from scratch with twice the number of
    iterations the previous fit used.

    Returns
    -------
    tuple
        ``(members, labels)`` where ``labels`` is the predicted component of
        each sample and ``members`` maps ``str(i)`` to the list of sample
        indices assigned to component ``i`` for ``i`` in
        ``range(self.n_cluster)``.
    """
    def fit_with(iterations):
        return mixture.BayesianGaussianMixture(
            n_components=self.n_cluster,
            covariance_type='full',
            max_iter=iterations).fit(X_train)

    dpgmm = fit_with(400)
    while not dpgmm.converged_:
        max_iter = dpgmm.n_iter_ * 2
        print("increase the number of iteration to {} to converge".format(
            max_iter))
        dpgmm = fit_with(max_iter)

    labels = dpgmm.predict(X_train)
    members = {
        str(component): np.where(labels == component)[0].tolist()
        for component in range(self.n_cluster)
    }
    return members, labels
def predictBayesian(data2D):
    """Fit a 7-component diagonal-covariance Bayesian GMM to ``data2D``.

    Returns ``(fitted_model, labels)`` with ``labels`` being the predicted
    component of every sample in ``data2D``.
    """
    model = mixture.BayesianGaussianMixture(
        n_components=7,
        covariance_type='diag',
        weight_concentration_prior=1e-5,
        max_iter=10000,
        random_state=1,
    )
    model.fit(data2D)
    return model, model.predict(data2D)
def dirichlet_gmm(self, seed=1, gmm_cmpts=10, prior=1e-3, plot_clusters=False):
    """
    Cluster ``self.data`` with a Dirichlet Process Gaussian Mixture.

    An infinite mixture model is approximated with a finite one using the
    stick-breaking process, via scikit-learn's BayesianGaussianMixture.

    Method developed from:
    http://scikit-learn.org/stable/auto_examples/mixture/plot_gmm_sin.html#sphx-glr-auto-examples-mixture-plot-gmm-sin-py

    Parameters:
    - seed, int, the random number seed (for repeatability)
    - gmm_cmpts, int, the max number of Gaussian distributions to use
    - prior, float, used both as the Dirichlet weight concentration prior
      (usually referred to as gamma) and as the mean precision prior
    - plot_clusters, boolean, whether to plot the output.
      USE WITH CAUTION!! Reads the 'x' and 'y' columns, so the data must be 2D.

    Return:
    - dataframe indexed by 'subreddit' with one 'cluster' (int) column
    """
    model = mixture.BayesianGaussianMixture(
        n_components=gmm_cmpts,
        covariance_type='full',
        n_init=10,  # ten initialisations; the best one is kept
        weight_concentration_prior=prior,
        weight_concentration_prior_type='dirichlet_process',
        mean_precision_prior=prior,
        init_params="random",
        random_state=seed)
    model.fit(self.data)

    clusts = model.predict(self.data)
    n_found = len(set(clusts))
    print("Estimated number of DP-GMM clusters: " + str(n_found))

    # Optional 2D scatter plot with one ellipse per Gaussian component.
    if plot_clusters:
        points = np.array(self.data.reset_index()[['x', 'y']])
        self.plot_dpgmms(points, clusts, model.means_, model.covariances_, 1,
                         "Bayesian GMM with a Dirichlet process prior")

    assignments = pd.DataFrame({
        'subreddit': self.data.index.values,
        'cluster': clusts
    })
    return assignments.set_index('subreddit')
def getProposalFromSamples(self, iterNO):
    """Build a normalised proposal distribution from the SMC samples.

    A Bayesian Gaussian mixture is fitted to the samples of iteration
    ``iterNO``; the mixture density is evaluated at those same samples and
    normalised to sum to one.

    Parameters
    ----------
    iterNO : int
        Index into the collection returned by ``self.getSmcSamples()``.

    Returns
    -------
    numpy.ndarray
        One proposal weight per sample.

    Raises
    ------
    RuntimeError
        If ``self.getSmcSamples()`` is falsy (samples not loaded).
    """
    samples = self.getSmcSamples()
    if not samples:
        # The exception used to be constructed but never raised, so the
        # method silently returned None.
        raise RuntimeError("SMC samples not yet loaded...")

    gmm = mixture.BayesianGaussianMixture(
        n_components=self.__maxNumComponents,
        weight_concentration_prior=self.__priorWeight,
        covariance_type='full',
        tol=1e-5,
        max_iter=int(1e5),
        n_init=100)
    gmm.fit(samples[iterNO])

    # score_samples returns log-densities; exponentiate before normalising.
    proposal = np.exp(gmm.score_samples(samples[iterNO]))
    return proposal / sum(proposal)
def Cluster(X, ncomps):
    """Return Dirichlet-process GMM labels for the samples in ``X``.

    The mixture has at most ``ncomps`` full-covariance components, is
    initialised with k-means and is restarted 20 times.
    """
    model = mixture.BayesianGaussianMixture(
        n_components=ncomps,
        covariance_type='full',
        n_init=20,
        max_iter=10000,
        init_params='kmeans',
        weight_concentration_prior_type='dirichlet_process',
    )
    model.fit(X)
    return model.predict(X)
def unsup_cluster(self, data_idx):
    """Cluster selected points of the point cloud and keep plane-like clusters.

    The rows of the tabulated point cloud selected by ``data_idx`` are
    clustered with a 50-component Dirichlet-process Gaussian mixture.  For
    every cluster with more than 10 points a plane is fitted and the mean
    squared point-to-plane distance is computed.  A plane is kept when that
    distance is below 0.001, the cluster holds more than
    ``round(len(data_idx) / 50)`` points, and ``|a| + |b|`` of its
    coefficients is below 0.2.  Each kept plane is also printed.

    Returns
    -------
    list of dict
        One entry per kept plane with keys ``'dist'`` (mean squared
        distance), ``'eq'`` (plane coefficients) and ``'idx'`` (point indices
        taken from ``data_idx``).
    """
    # http://scikit-learn.org/stable/modules/generated/sklearn.mixture.BayesianGaussianMixture.html
    import sklearn.mixture as skm

    max_mean_dist = 0.001
    max_sum_xy = 0.2
    min_n_p = round(len(data_idx) / 50)

    pc_tab = self.__mat2tab(self.point_cloud)
    data = pc_tab[data_idx, :]

    bgm = skm.BayesianGaussianMixture(
        n_components=50,
        covariance_type='full',
        tol=1e-3,
        reg_covar=1e-6,
        max_iter=200,
        n_init=2,
        init_params='kmeans',
        weight_concentration_prior_type='dirichlet_process',
        weight_concentration_prior=None,
        mean_precision_prior=None,
        mean_prior=None,
        degrees_of_freedom_prior=None,
        covariance_prior=None,
        random_state=None,
        warm_start=False,
        verbose=0,
        verbose_interval=10)
    bgm.fit(data)
    predicted = bgm.predict(data)

    planes = []
    for k in range(np.min(predicted), np.max(predicted) + 1):
        positions = [pos for pos, label in enumerate(predicted) if label == k]
        if len(positions) <= 10:
            continue  # too few points to bother fitting a plane

        idx = [data_idx[pos] for pos in positions]
        le = len(idx)
        plane_coeff = self.__find_plane(pc_tab[idx, :])
        distances = self.point_plane_dist(idx, plane_coeff)
        mean_dist = np.mean([dist**2 for dist in distances])

        close_enough = mean_dist < max_mean_dist
        big_enough = le > min_n_p
        flat_enough = abs(plane_coeff[0]) + abs(plane_coeff[1]) < max_sum_xy
        if close_enough and big_enough and flat_enough:
            planes.append({
                'dist': mean_dist,
                'eq': plane_coeff,
                'idx': idx
            })
            print(
                '[a,b,c,d]=[{0[0]:.2f},{0[1]:.2f},{0[2]:.2f},{0[3]:.2f}]; '
                .format(plane_coeff) + str(le) + ' points; ' +
                'dist: {:2.2E}'.format(mean_dist))
    return planes
def dpgmm_(df, n_components):
    """Cluster ``df`` with a Bayesian GMM, plot it and let the user pick.

    The frame is converted to a NumPy array, a full-covariance mixture with
    ``n_components`` components is fitted, the clustering is plotted and
    shown, and the predicted labels are passed to ``user_select``.

    Returns whatever ``user_select(df, labels)`` returns.
    """
    X = df.to_numpy()
    model = mixture.BayesianGaussianMixture(n_components,
                                            covariance_type='full')
    model.fit(X)

    labels = model.predict(X)
    plot_results(X, labels, model.means_, model.covariances_,
                 'Bayesian Gaussian Mixture with a Dirichlet process prior')
    plt.show()

    return user_select(df, labels)
def train_gmm(self, nComponents=5, nPoints=1e4):
    """Fit a Bayesian GMM to a random subsample of ``self.X``.

    Up to ``nPoints`` values are drawn without replacement from the flattened
    ``self.X`` and passed through ``col`` before fitting.  The fitted
    weights, means and covariances are sorted by ascending mean and stored on
    the instance.

    Parameters
    ----------
    nComponents : int
        Number of mixture components.
    nPoints : int or float
        Requested subsample size; capped at the number of available values.

    Sets
    ----
    self.phi, self.mu, self.var
        Component weights, means and covariances, ordered by mean.
    """
    values = self.X.ravel()
    # Sampling without replacement cannot draw more values than exist.
    sample_size = min(int(nPoints), values.size)
    X = np.random.choice(values, size=sample_size, replace=False)

    gmm = mixture.BayesianGaussianMixture(n_components=nComponents,
                                          covariance_type='full').fit(col(X))

    # ``squeeze`` collapses to 0-d arrays when there is a single component;
    # ``atleast_1d`` keeps them indexable and is a no-op otherwise.
    phi = np.atleast_1d(gmm.weights_.squeeze())
    mu = np.atleast_1d(gmm.means_.squeeze())
    var = np.atleast_1d(gmm.covariances_.squeeze())

    # sort components by their mean
    ix = mu.argsort()
    self.phi = phi[ix]
    self.mu = mu[ix]
    self.var = var[ix]
def get_gmm_sample_data(self, incoming_df, column_list, sample_size):
    """
    Generate synthetic rows by sampling from a Bayesian Gaussian mixture.

    A two-component full-covariance mixture (100 initialisations, fixed
    random state 42) is fitted to ``incoming_df``; ``sample_size`` points are
    then drawn from it and returned as a DataFrame whose columns are
    ``column_list``.
    """
    model = mixture.BayesianGaussianMixture(
        n_components=2,
        covariance_type="full",
        n_init=100,
        random_state=42,
    )
    model.fit(incoming_df)

    # ``sample`` returns a pair; only its first element (the points) is used.
    drawn = model.sample(sample_size)
    return pd.DataFrame(drawn[0], columns=column_list)
def variational_gmm_beta(x, alpha, beta):
    """Mean and variance of a Beta distribution updated from a 2-component GMM.

    A two-component Bayesian Gaussian mixture with a Dirichlet-distribution
    weight prior is fitted to ``x``.  With ``S`` the sum over samples of the
    membership probability of component 0, the Beta parameters are
    ``alpha + S`` and ``beta + n - S`` where ``n = x.shape[0]``.

    Returns ``(mean, variance)`` of that Beta distribution.
    """
    n_samples = x.shape[0]
    model = mixture.BayesianGaussianMixture(
        n_components=2,
        covariance_type='full',
        weight_concentration_prior_type='dirichlet_distribution',
    )
    model.fit(x)

    responsibilities = model.predict_proba(x)
    soft_count = np.sum(responsibilities[:, 0])

    a_post = alpha + soft_count
    b_post = beta + n_samples - soft_count
    return stats.beta.mean(a_post, b_post), stats.beta.var(a_post, b_post)
def get_bgmm():
    """Return a new, unfitted ``BayesianGaussianMixture``.

    ``k``, ``weight_concentration_prior``, ``reg_covar``, ``n_init``,
    ``max_iter`` and ``random_state`` are taken from the enclosing scope.
    The estimator uses full covariances, random initialisation and verbose
    output (level 3, every 10 iterations).
    """
    settings = dict(
        n_components=k,
        weight_concentration_prior=weight_concentration_prior,
        reg_covar=reg_covar,
        covariance_type='full',
        n_init=n_init,
        max_iter=max_iter,
        random_state=random_state,
        init_params='random',
        verbose=3,
        verbose_interval=10,
    )
    return mixture.BayesianGaussianMixture(**settings)
def cluster_validation(n_digit, X):
    """Score three clustering models on ``X`` via ``get_score``.

    K-means, a Gaussian mixture and a Bayesian Gaussian mixture are created
    with ``n_digit`` clusters/components each and passed, in that order, to
    ``get_score(name, estimator, X)`` after a header line is printed.
    """
    print("methods silhouette_score calinski_harabasz_score davies_bouldin_score")

    # Settings shared by all three estimators.
    shared = dict(max_iter=3000, tol=1e-4, random_state=0)

    # K-means clustering model
    kmeans = KMeans(init='k-means++', n_clusters=n_digit, n_init=10, **shared)
    get_score("kmeans", kmeans, X)

    # Gaussian mixture (EM) with n_digit spherical components
    gmm = mixture.GaussianMixture(n_components=n_digit,
                                  covariance_type='spherical', **shared)
    get_score("GMM", gmm, X)

    # Bayesian Gaussian mixture with n_digit spherical components
    dpgmm = mixture.BayesianGaussianMixture(n_components=n_digit,
                                            covariance_type='spherical',
                                            **shared)
    get_score("DPGMM", dpgmm, X)
def fit(self, data):
    """Create ``self.model`` from the ``self.dp_gmm`` settings and fit it.

    ``self.dp_gmm`` is read positionally: ``[0]`` number of components,
    ``[1]`` maximum iterations, ``[2]`` weight concentration prior,
    ``[3]`` covariance type, ``[4]`` covariance prior value (repeated once
    per column of ``data``).  The mean prior is zero for every column.

    Returns the result of ``self.model.fit(data)``.
    """
    n_features = data.shape[1]
    covariance_prior = [self.dp_gmm[4]] * n_features
    zero_mean_prior = [0] * n_features

    self.model = mix.BayesianGaussianMixture(
        n_components=self.dp_gmm[0],
        max_iter=self.dp_gmm[1],
        weight_concentration_prior=self.dp_gmm[2],
        covariance_type=self.dp_gmm[3],
        covariance_prior=covariance_prior,
        mean_prior=zero_mean_prior)
    return self.model.fit(data)
def create_bayesian_gaussian_mixture(
        data,
        component_count=1,
        covariance_type='full',
        max_iteration_count=DEFAULT_MAX_ITERATION_COUNT):
    """Fit a Bayesian Gaussian mixture to ``data`` and return it.

    ``component_count`` is the number of mixture components,
    ``covariance_type`` the covariance parameterisation and
    ``max_iteration_count`` the iteration limit.  All other estimator
    settings are left at the library defaults.
    """
    settings = {
        'n_components': component_count,
        'covariance_type': covariance_type,
        'max_iter': max_iteration_count,
    }
    estimator = mixture.BayesianGaussianMixture(**settings)
    return estimator.fit(data)
def trainGMM(X):
    """Fit a 6-component Bayesian GMM to ``X`` and pickle the model.

    The fitted estimator is written to ``gmmModel.p`` in the current working
    directory.

    returns : gmmPredLabels, gmmPredScores
        Predicted component per sample and per-sample membership
        probabilities.
    """
    model = mixture.BayesianGaussianMixture(
        n_components=6, covariance_type='full', max_iter=1000)
    model.fit(X)

    gmmPredLabels = model.predict(X)
    gmmPredScores = model.predict_proba(X)

    with open('gmmModel.p', 'wb') as model_file:
        pickle.dump(model, model_file)

    return gmmPredLabels, gmmPredScores
def main():
    """Cluster the averaged research results and write them out per cluster.

    The results parsed from ``result_file_path`` are averaged into a frame,
    the ``AC`` column is dropped and the numeric columns are projected onto
    two principal components.  A mixture model is fitted on that projection,
    each row is labelled with its cluster (``Cluster`` column), the clustering
    is plotted, and the rows are written to ``output_file_path`` grouped by
    cluster.
    """
    researchResults = parse_csv_files(result_file_path)
    df = create_mean_results_dataframe(researchResults)
    # ``axis`` must be passed by keyword on current pandas.
    df = df.drop('AC', axis=1)
    numeric_df = df._get_numeric_data()

    pca_2 = PCA(2)
    plot_columns = pca_2.fit_transform(numeric_df)

    is_gmm = True
    optimal_cluster_number = find_best_cluster_number(
        plot_columns, 4, ceil(sqrt(len(df.index))))
    if is_gmm:
        cluster_number = optimal_cluster_number
        gmm = create_em_mixture(plot_columns, optimal_cluster_number)
    else:
        cluster_number = 4
        gmm = mixture.BayesianGaussianMixture(
            n_components=4,
            covariance_type='full',
            weight_concentration_prior_type='dirichlet_process').fit(
                plot_columns)

    # NOTE(review): fitted but not used anywhere below.
    ward = AgglomerativeClustering(n_clusters=cluster_number,
                                   linkage='ward').fit(plot_columns)

    predicted = gmm.predict(plot_columns)
    df['Cluster'] = predicted
    clustering_plot(plot_columns, gmm.means_, gmm.covariances_,
                    df['Operator'], predicted)

    # One (possibly empty) bucket per row; rows are filed under their label.
    clusters = [ClusterData(i + 1) for i in range(len(predicted))]
    for _, row in df.iterrows():
        clusters[row['Cluster']].elements.append(row)

    # The context manager closes the file even if writing fails.
    with open(output_file_path, "w") as out:
        out.write('Type ')
        for bucket in clusters:
            if not bucket.elements:
                continue
            out.write(str(bucket.number))
            out.write('\n')
            for row in bucket.elements:
                # ``Series.items`` replaces the removed ``Series.iteritems``.
                for _, cell in row.items():
                    out.write(str(cell))
                    out.write(' ')
                out.write('\n')
            out.write('\n')

    plt.show()
def epsilloids(X, n, covariance_type='full'):
    """Fit and plot a Gaussian mixture and a Bayesian Gaussian mixture on ``X``.

    Both models use ``n`` components and the given ``covariance_type``.  The
    plain mixture is drawn with plot index 0, the Bayesian one with index 1,
    and the figure is shown at the end.
    """
    # Gaussian mixture fitted with EM
    em_model = mixture.GaussianMixture(n_components=n,
                                       covariance_type=covariance_type)
    em_model.fit(X)
    plot_results(X, em_model.predict(X), em_model.means_,
                 em_model.covariances_, 0, 'Gaussian Mixture')

    # Bayesian Gaussian mixture
    bayes_model = mixture.BayesianGaussianMixture(
        n_components=n, covariance_type=covariance_type)
    bayes_model.fit(X)
    plot_results(X, bayes_model.predict(X), bayes_model.means_,
                 bayes_model.covariances_, 1,
                 'Bayesian Gaussian Mixture with a Dirichlet process prior')

    plt.show()
def vbgmm_clustering(dataset, parameters):
    """Cluster ``dataset`` with a variational Bayesian GMM.

    The number of components is ``parameters["max_n_components"]``, capped at
    one less than the number of rows in ``dataset``.

    Returns
    -------
    tuple
        ``(labels, n_clusters)`` where ``n_clusters`` comes from
        ``determine_n_clusters(labels)``.
    """
    # Never ask for more components than (number of samples - 1).
    n_components = min(parameters["max_n_components"], dataset.shape[0] - 1)

    model = mixture.BayesianGaussianMixture(n_components=n_components)
    model.fit(dataset)
    labels = model.predict(dataset)

    return labels, determine_n_clusters(labels)